-rw-r--r--  dom/plugins/ipc/PluginInstanceParent.cpp | 1
-rw-r--r--  dom/plugins/ipc/PluginInstanceParent.h | 5
-rwxr-xr-x  intl/unicharutil/tools/genUnicodePropertyData.pl | 472
-rw-r--r--  intl/unicharutil/util/nsUnicodeProperties.h | 1
-rw-r--r--  intl/unicharutil/util/nsUnicodePropertyData.cpp | 96
-rw-r--r--  intl/unicharutil/util/nsUnicodeScriptCodes.h | 48
-rw-r--r--  ipc/chromium/src/base/condition_variable_posix.cc | 3
-rw-r--r--  media/libjpeg/1050342.diff | 121
-rw-r--r--  media/libjpeg/ChangeLog.md | 1839
-rw-r--r--  media/libjpeg/LICENSE.md | 54
-rw-r--r--  media/libjpeg/MOZCHANGES | 28
-rw-r--r--  media/libjpeg/README.ijg | 99
-rw-r--r-- [-rwxr-xr-x]  media/libjpeg/README.md | 100
-rw-r--r--  media/libjpeg/jaricom.c | 13
-rw-r--r--  media/libjpeg/jcapimin.c | 50
-rw-r--r--  media/libjpeg/jcapistd.c | 26
-rw-r--r--  media/libjpeg/jcarith.c | 114
-rw-r--r--  media/libjpeg/jccoefct.c | 102
-rw-r--r--  media/libjpeg/jccolext.c | 64
-rw-r--r--  media/libjpeg/jccolor.c | 389
-rw-r--r--  media/libjpeg/jcdctmgr.c | 157
-rw-r--r--  media/libjpeg/jchuff.c | 620
-rw-r--r--  media/libjpeg/jchuff.h | 13
-rw-r--r--  media/libjpeg/jcicc.c | 105
-rw-r--r--  media/libjpeg/jcinit.c | 15
-rw-r--r--  media/libjpeg/jcmainct.c | 40
-rw-r--r--  media/libjpeg/jcmarker.c | 93
-rw-r--r--  media/libjpeg/jcmaster.c | 137
-rw-r--r--  media/libjpeg/jcomapi.c | 12
-rw-r--r--  media/libjpeg/jconfigint.h | 46
-rw-r--r--  media/libjpeg/jcparam.c | 87
-rw-r--r--  media/libjpeg/jcphuff.c | 688
-rw-r--r--  media/libjpeg/jcprepct.c | 100
-rw-r--r--  media/libjpeg/jcsample.c | 173
-rw-r--r--  media/libjpeg/jctrans.c | 77
-rw-r--r--  media/libjpeg/jdapimin.c | 51
-rw-r--r--  media/libjpeg/jdapistd.c | 194
-rw-r--r--  media/libjpeg/jdarith.c | 156
-rw-r--r--  media/libjpeg/jdatadst.c | 62
-rw-r--r--  media/libjpeg/jdatasrc.c | 56
-rw-r--r--  media/libjpeg/jdcoefct.c | 505
-rw-r--r--  media/libjpeg/jdcoefct.h | 9
-rw-r--r--  media/libjpeg/jdcol565.c | 184
-rw-r--r--  media/libjpeg/jdcolext.c | 42
-rw-r--r--  media/libjpeg/jdcolor.c | 493
-rw-r--r--  media/libjpeg/jdct.h | 148
-rw-r--r--  media/libjpeg/jddctmgr.c | 36
-rw-r--r--  media/libjpeg/jdhuff.c | 223
-rw-r--r--  media/libjpeg/jdhuff.h | 109
-rw-r--r--  media/libjpeg/jdicc.c | 167
-rw-r--r--  media/libjpeg/jdinput.c | 101
-rw-r--r--  media/libjpeg/jdmainct.c | 123
-rw-r--r--  media/libjpeg/jdmainct.h | 12
-rw-r--r--  media/libjpeg/jdmarker.c | 357
-rw-r--r--  media/libjpeg/jdmaster.c | 190
-rw-r--r--  media/libjpeg/jdmerge.c | 407
-rw-r--r--  media/libjpeg/jdmerge.h | 47
-rw-r--r--  media/libjpeg/jdmrg565.c | 202
-rw-r--r--  media/libjpeg/jdmrgext.c | 86
-rw-r--r--  media/libjpeg/jdphuff.c | 179
-rw-r--r--  media/libjpeg/jdpostct.c | 122
-rw-r--r--  media/libjpeg/jdsample.c | 169
-rw-r--r--  media/libjpeg/jdtrans.c | 19
-rw-r--r--  media/libjpeg/jerror.c | 32
-rw-r--r--  media/libjpeg/jerror.h | 116
-rw-r--r--  media/libjpeg/jfdctflt.c | 58
-rw-r--r--  media/libjpeg/jfdctfst.c | 50
-rw-r--r--  media/libjpeg/jfdctint.c | 122
-rw-r--r--  media/libjpeg/jidctflt.c | 112
-rw-r--r--  media/libjpeg/jidctfst.c | 156
-rw-r--r--  media/libjpeg/jidctint.c | 1734
-rw-r--r--  media/libjpeg/jidctred.c | 256
-rw-r--r--  media/libjpeg/jinclude.h | 129
-rw-r--r--  media/libjpeg/jmemmgr.c | 320
-rw-r--r--  media/libjpeg/jmemnobs.c | 45
-rw-r--r--  media/libjpeg/jmemsys.h | 28
-rw-r--r--  media/libjpeg/jmorecfg.h | 107
-rw-r--r--  media/libjpeg/jpegcomp.h | 35
-rw-r--r--  media/libjpeg/jpegint.h | 159
-rw-r--r--  media/libjpeg/jpeglib.h | 216
-rw-r--r--  media/libjpeg/jquant1.c | 205
-rw-r--r--  media/libjpeg/jquant2.c | 401
-rw-r--r--  media/libjpeg/jsimd.h | 188
-rw-r--r--  media/libjpeg/jsimd_none.c | 257
-rw-r--r--  media/libjpeg/jsimddct.h | 104
-rw-r--r--  media/libjpeg/jstdhuff.c | 135
-rw-r--r--  media/libjpeg/jutils.c | 30
-rw-r--r--  media/libjpeg/jversion.h | 35
-rw-r--r--  media/libjpeg/moz.build | 227
-rw-r--r--  media/libjpeg/mozilla.diff | 67
-rw-r--r--  media/libjpeg/simd/arm/aarch32/jccolext-neon.c | 148
-rw-r--r--  media/libjpeg/simd/arm/aarch32/jchuff-neon.c | 334
-rw-r--r--  media/libjpeg/simd/arm/aarch32/jsimd.c | 980
-rw-r--r--  media/libjpeg/simd/arm/aarch32/jsimd_neon.S | 1200
-rw-r--r--  media/libjpeg/simd/arm/aarch64/jccolext-neon.c | 316
-rw-r--r--  media/libjpeg/simd/arm/aarch64/jchuff-neon.c | 411
-rw-r--r--  media/libjpeg/simd/arm/aarch64/jsimd.c | 1058
-rw-r--r--  media/libjpeg/simd/arm/aarch64/jsimd_neon.S (renamed from media/libjpeg/simd/jsimd_arm64_neon.S) | 1835
-rw-r--r--  media/libjpeg/simd/arm/align.h | 28
-rw-r--r--  media/libjpeg/simd/arm/jccolor-neon.c | 160
-rw-r--r--  media/libjpeg/simd/arm/jcgray-neon.c | 120
-rw-r--r--  media/libjpeg/simd/arm/jcgryext-neon.c | 106
-rw-r--r--  media/libjpeg/simd/arm/jchuff.h | 131
-rw-r--r--  media/libjpeg/simd/arm/jcphuff-neon.c | 622
-rw-r--r--  media/libjpeg/simd/arm/jcsample-neon.c | 192
-rw-r--r--  media/libjpeg/simd/arm/jdcolext-neon.c | 374
-rw-r--r--  media/libjpeg/simd/arm/jdcolor-neon.c | 142
-rw-r--r--  media/libjpeg/simd/arm/jdmerge-neon.c | 145
-rw-r--r--  media/libjpeg/simd/arm/jdmrgext-neon.c | 723
-rw-r--r--  media/libjpeg/simd/arm/jdsample-neon.c | 569
-rw-r--r--  media/libjpeg/simd/arm/jfdctfst-neon.c | 214
-rw-r--r--  media/libjpeg/simd/arm/jfdctint-neon.c | 376
-rw-r--r--  media/libjpeg/simd/arm/jidctfst-neon.c | 472
-rw-r--r--  media/libjpeg/simd/arm/jidctint-neon.c | 802
-rw-r--r--  media/libjpeg/simd/arm/jidctred-neon.c | 486
-rw-r--r--  media/libjpeg/simd/arm/jquanti-neon.c | 193
-rw-r--r--  media/libjpeg/simd/arm/neon-compat.h | 33
-rw-r--r--  media/libjpeg/simd/i386/jccolext-avx2.asm | 578
-rw-r--r--  media/libjpeg/simd/i386/jccolext-mmx.asm | 476
-rw-r--r--  media/libjpeg/simd/i386/jccolext-sse2.asm | 503
-rw-r--r--  media/libjpeg/simd/i386/jccolor-avx2.asm | 121
-rw-r--r--  media/libjpeg/simd/i386/jccolor-mmx.asm | 121
-rw-r--r--  media/libjpeg/simd/i386/jccolor-sse2.asm | 120
-rw-r--r--  media/libjpeg/simd/i386/jcgray-avx2.asm | 113
-rw-r--r--  media/libjpeg/simd/i386/jcgray-mmx.asm | 113
-rw-r--r--  media/libjpeg/simd/i386/jcgray-sse2.asm | 112
-rw-r--r--  media/libjpeg/simd/i386/jcgryext-avx2.asm | 457
-rw-r--r--  media/libjpeg/simd/i386/jcgryext-mmx.asm | 355
-rw-r--r--  media/libjpeg/simd/i386/jcgryext-sse2.asm | 382
-rw-r--r--  media/libjpeg/simd/i386/jchuff-sse2.asm | 761
-rw-r--r--  media/libjpeg/simd/i386/jcphuff-sse2.asm | 662
-rw-r--r--  media/libjpeg/simd/i386/jcsample-avx2.asm | 388
-rw-r--r--  media/libjpeg/simd/i386/jcsample-mmx.asm | 324
-rw-r--r--  media/libjpeg/simd/i386/jcsample-sse2.asm | 351
-rw-r--r--  media/libjpeg/simd/i386/jdcolext-avx2.asm | 515
-rw-r--r--  media/libjpeg/simd/i386/jdcolext-mmx.asm | 404
-rw-r--r--  media/libjpeg/simd/i386/jdcolext-sse2.asm | 458
-rw-r--r--  media/libjpeg/simd/i386/jdcolor-avx2.asm | 118
-rw-r--r--  media/libjpeg/simd/i386/jdcolor-mmx.asm | 117
-rw-r--r--  media/libjpeg/simd/i386/jdcolor-sse2.asm | 117
-rw-r--r--  media/libjpeg/simd/i386/jdmerge-avx2.asm | 136
-rw-r--r--  media/libjpeg/simd/i386/jdmerge-mmx.asm | 123
-rw-r--r--  media/libjpeg/simd/i386/jdmerge-sse2.asm | 135
-rw-r--r--  media/libjpeg/simd/i386/jdmrgext-avx2.asm | 575
-rw-r--r--  media/libjpeg/simd/i386/jdmrgext-mmx.asm | 460
-rw-r--r--  media/libjpeg/simd/i386/jdmrgext-sse2.asm | 517
-rw-r--r--  media/libjpeg/simd/i386/jdsample-avx2.asm | 760
-rw-r--r--  media/libjpeg/simd/i386/jdsample-mmx.asm | 731
-rw-r--r--  media/libjpeg/simd/i386/jdsample-sse2.asm | 724
-rw-r--r--  media/libjpeg/simd/i386/jfdctflt-3dn.asm | 318
-rw-r--r--  media/libjpeg/simd/i386/jfdctflt-sse.asm | 369
-rw-r--r--  media/libjpeg/simd/i386/jfdctfst-mmx.asm | 395
-rw-r--r--  media/libjpeg/simd/i386/jfdctfst-sse2.asm | 403
-rw-r--r--  media/libjpeg/simd/i386/jfdctint-avx2.asm | 331
-rw-r--r--  media/libjpeg/simd/i386/jfdctint-mmx.asm | 620
-rw-r--r--  media/libjpeg/simd/i386/jfdctint-sse2.asm | 633
-rw-r--r--  media/libjpeg/simd/i386/jidctflt-3dn.asm | 451
-rw-r--r--  media/libjpeg/simd/i386/jidctflt-sse.asm | 571
-rw-r--r--  media/libjpeg/simd/i386/jidctflt-sse2.asm | 497
-rw-r--r--  media/libjpeg/simd/i386/jidctfst-mmx.asm | 499
-rw-r--r--  media/libjpeg/simd/i386/jidctfst-sse2.asm | 501
-rw-r--r--  media/libjpeg/simd/i386/jidctint-avx2.asm | 453
-rw-r--r--  media/libjpeg/simd/i386/jidctint-mmx.asm | 851
-rw-r--r--  media/libjpeg/simd/i386/jidctint-sse2.asm | 858
-rw-r--r--  media/libjpeg/simd/i386/jidctred-mmx.asm | 704
-rw-r--r--  media/libjpeg/simd/i386/jidctred-sse2.asm | 592
-rw-r--r--  media/libjpeg/simd/i386/jquant-3dn.asm | 230
-rw-r--r--  media/libjpeg/simd/i386/jquant-mmx.asm | 276
-rw-r--r--  media/libjpeg/simd/i386/jquant-sse.asm | 208
-rw-r--r--  media/libjpeg/simd/i386/jquantf-sse2.asm | 168
-rw-r--r--  media/libjpeg/simd/i386/jquanti-avx2.asm | 188
-rw-r--r--  media/libjpeg/simd/i386/jquanti-sse2.asm | 201
-rw-r--r--  media/libjpeg/simd/i386/jsimd.c | 1246
-rw-r--r--  media/libjpeg/simd/i386/jsimdcpu.asm | 135
-rw-r--r--  media/libjpeg/simd/jccolext-mmx.asm | 476
-rw-r--r--  media/libjpeg/simd/jccolext-sse2-64.asm | 486
-rw-r--r--  media/libjpeg/simd/jccolext-sse2.asm | 503
-rw-r--r--  media/libjpeg/simd/jccolor-altivec.c | 104
-rw-r--r--  media/libjpeg/simd/jccolor-mmx.asm | 122
-rw-r--r--  media/libjpeg/simd/jccolor-sse2-64.asm | 121
-rw-r--r--  media/libjpeg/simd/jccolor-sse2.asm | 121
-rw-r--r--  media/libjpeg/simd/jcgray-altivec.c | 99
-rw-r--r--  media/libjpeg/simd/jcgray-mmx.asm | 115
-rw-r--r--  media/libjpeg/simd/jcgray-sse2-64.asm | 114
-rw-r--r--  media/libjpeg/simd/jcgray-sse2.asm | 114
-rw-r--r--  media/libjpeg/simd/jcgryext-mmx.asm | 356
-rw-r--r--  media/libjpeg/simd/jcgryext-sse2-64.asm | 365
-rw-r--r--  media/libjpeg/simd/jcgryext-sse2.asm | 384
-rw-r--r--  media/libjpeg/simd/jchuff-sse2-64.asm | 360
-rw-r--r--  media/libjpeg/simd/jchuff-sse2.asm | 426
-rw-r--r--  media/libjpeg/simd/jcolsamp.inc | 104
-rw-r--r--  media/libjpeg/simd/jcsample-mmx.asm | 323
-rw-r--r--  media/libjpeg/simd/jcsample-sse2-64.asm | 329
-rw-r--r--  media/libjpeg/simd/jcsample-sse2.asm | 350
-rw-r--r--  media/libjpeg/simd/jdcolext-mmx.asm | 404
-rw-r--r--  media/libjpeg/simd/jdcolext-sse2-64.asm | 440
-rw-r--r--  media/libjpeg/simd/jdcolext-sse2.asm | 459
-rw-r--r--  media/libjpeg/simd/jdcolor-altivec.c | 96
-rw-r--r--  media/libjpeg/simd/jdcolor-mmx.asm | 119
-rw-r--r--  media/libjpeg/simd/jdcolor-sse2-64.asm | 119
-rw-r--r--  media/libjpeg/simd/jdcolor-sse2.asm | 119
-rw-r--r--  media/libjpeg/simd/jdmerge-altivec.c | 108
-rw-r--r--  media/libjpeg/simd/jdmerge-mmx.asm | 125
-rw-r--r--  media/libjpeg/simd/jdmerge-sse2-64.asm | 125
-rw-r--r--  media/libjpeg/simd/jdmerge-sse2.asm | 125
-rw-r--r--  media/libjpeg/simd/jdmrgext-mmx.asm | 463
-rw-r--r--  media/libjpeg/simd/jdmrgext-sse2-64.asm | 537
-rw-r--r--  media/libjpeg/simd/jdmrgext-sse2.asm | 518
-rw-r--r--  media/libjpeg/simd/jdsample-mmx.asm | 736
-rw-r--r--  media/libjpeg/simd/jdsample-sse2-64.asm | 670
-rw-r--r--  media/libjpeg/simd/jdsample-sse2.asm | 728
-rw-r--r--  media/libjpeg/simd/jfdctflt-3dn.asm | 319
-rw-r--r--  media/libjpeg/simd/jfdctflt-sse-64.asm | 357
-rw-r--r--  media/libjpeg/simd/jfdctflt-sse.asm | 369
-rw-r--r--  media/libjpeg/simd/jfdctfst-mmx.asm | 396
-rw-r--r--  media/libjpeg/simd/jfdctfst-sse2-64.asm | 391
-rw-r--r--  media/libjpeg/simd/jfdctfst-sse2.asm | 403
-rw-r--r--  media/libjpeg/simd/jfdctint-altivec.c | 262
-rw-r--r--  media/libjpeg/simd/jfdctint-mmx.asm | 621
-rw-r--r--  media/libjpeg/simd/jfdctint-sse2-64.asm | 621
-rw-r--r--  media/libjpeg/simd/jfdctint-sse2.asm | 633
-rw-r--r--  media/libjpeg/simd/jidctflt-3dn.asm | 451
-rw-r--r--  media/libjpeg/simd/jidctflt-sse.asm | 571
-rw-r--r--  media/libjpeg/simd/jidctflt-sse2-64.asm | 482
-rw-r--r--  media/libjpeg/simd/jidctflt-sse2.asm | 497
-rw-r--r--  media/libjpeg/simd/jidctfst-mmx.asm | 499
-rw-r--r--  media/libjpeg/simd/jidctfst-sse2-64.asm | 491
-rw-r--r--  media/libjpeg/simd/jidctfst-sse2.asm | 501
-rw-r--r--  media/libjpeg/simd/jidctint-mmx.asm | 851
-rw-r--r--  media/libjpeg/simd/jidctint-sse2-64.asm | 847
-rw-r--r--  media/libjpeg/simd/jidctint-sse2.asm | 858
-rw-r--r--  media/libjpeg/simd/jidctred-mmx.asm | 705
-rw-r--r--  media/libjpeg/simd/jidctred-sse2-64.asm | 575
-rw-r--r--  media/libjpeg/simd/jidctred-sse2.asm | 593
-rw-r--r--  media/libjpeg/simd/jpeg_nbits_table.inc | 4097
-rw-r--r--  media/libjpeg/simd/jquant-3dn.asm | 232
-rw-r--r--  media/libjpeg/simd/jquant-mmx.asm | 273
-rw-r--r--  media/libjpeg/simd/jquant-sse.asm | 210
-rw-r--r--  media/libjpeg/simd/jquantf-sse2-64.asm | 157
-rw-r--r--  media/libjpeg/simd/jquantf-sse2.asm | 170
-rw-r--r--  media/libjpeg/simd/jquanti-sse2-64.asm | 186
-rw-r--r--  media/libjpeg/simd/jquanti-sse2.asm | 199
-rw-r--r--  media/libjpeg/simd/jsimd.h | 1501
-rw-r--r--  media/libjpeg/simd/jsimd_arm.c | 727
-rw-r--r--  media/libjpeg/simd/jsimd_arm64.c | 802
-rw-r--r--  media/libjpeg/simd/jsimd_arm_neon.S | 2878
-rw-r--r--  media/libjpeg/simd/jsimd_i386.c | 1091
-rw-r--r--  media/libjpeg/simd/jsimd_mips.c | 1138
-rw-r--r--  media/libjpeg/simd/jsimd_mips_dspr2.S | 4487
-rw-r--r--  media/libjpeg/simd/jsimd_mips_dspr2_asm.h | 285
-rw-r--r--  media/libjpeg/simd/jsimd_powerpc.c | 828
-rw-r--r--  media/libjpeg/simd/jsimd_x86_64.c | 887
-rw-r--r--  media/libjpeg/simd/jsimdcpu.asm | 104
-rw-r--r--  media/libjpeg/simd/jsimdext.inc | 375
-rw-r--r--  media/libjpeg/simd/mips/jsimd.c | 1147
-rw-r--r--  media/libjpeg/simd/mips/jsimd_dspr2.S | 4543
-rw-r--r--  media/libjpeg/simd/mips/jsimd_dspr2_asm.h | 292
-rw-r--r--  media/libjpeg/simd/mips64/jccolext-mmi.c | 455
-rw-r--r--  media/libjpeg/simd/mips64/jccolor-mmi.c | 148
-rw-r--r--  media/libjpeg/simd/mips64/jcgray-mmi.c | 132
-rw-r--r--  media/libjpeg/simd/mips64/jcgryext-mmi.c | 374
-rw-r--r--  media/libjpeg/simd/mips64/jcsample-mmi.c | 98
-rw-r--r--  media/libjpeg/simd/mips64/jcsample.h (renamed from media/libjpeg/simd/jcsample.h) | 8
-rw-r--r--  media/libjpeg/simd/mips64/jdcolext-mmi.c | 415
-rw-r--r--  media/libjpeg/simd/mips64/jdcolor-mmi.c | 139
-rw-r--r--  media/libjpeg/simd/mips64/jdmerge-mmi.c | 149
-rw-r--r--  media/libjpeg/simd/mips64/jdmrgext-mmi.c | 615
-rw-r--r--  media/libjpeg/simd/mips64/jdsample-mmi.c | 304
-rw-r--r--  media/libjpeg/simd/mips64/jfdctfst-mmi.c | 255
-rw-r--r--  media/libjpeg/simd/mips64/jfdctint-mmi.c | 398
-rw-r--r--  media/libjpeg/simd/mips64/jidctfst-mmi.c | 395
-rw-r--r--  media/libjpeg/simd/mips64/jidctint-mmi.c | 571
-rw-r--r--  media/libjpeg/simd/mips64/jquanti-mmi.c | 124
-rw-r--r--  media/libjpeg/simd/mips64/jsimd.c | 870
-rw-r--r--  media/libjpeg/simd/mips64/jsimd_mmi.h | 69
-rw-r--r--  media/libjpeg/simd/mips64/loongson-mmintrin.h | 1334
-rw-r--r--  media/libjpeg/simd/nasm/jcolsamp.inc | 135
-rw-r--r--  media/libjpeg/simd/nasm/jdct.inc (renamed from media/libjpeg/simd/jdct.inc) | 18
-rw-r--r-- [-rwxr-xr-x]  media/libjpeg/simd/nasm/jsimdcfg.inc (renamed from media/libjpeg/simd/jsimdcfg.inc) | 3
-rw-r--r--  media/libjpeg/simd/nasm/jsimdcfg.inc.h | 133
-rw-r--r--  media/libjpeg/simd/nasm/jsimdext.inc | 520
-rw-r--r--  media/libjpeg/simd/powerpc/jccolext-altivec.c (renamed from media/libjpeg/simd/jccolext-altivec.c) | 18
-rw-r--r--  media/libjpeg/simd/powerpc/jccolor-altivec.c | 116
-rw-r--r--  media/libjpeg/simd/powerpc/jcgray-altivec.c | 111
-rw-r--r--  media/libjpeg/simd/powerpc/jcgryext-altivec.c (renamed from media/libjpeg/simd/jcgryext-altivec.c) | 19
-rw-r--r--  media/libjpeg/simd/powerpc/jcsample-altivec.c (renamed from media/libjpeg/simd/jcsample-altivec.c) | 27
-rw-r--r--  media/libjpeg/simd/powerpc/jcsample.h | 28
-rw-r--r--  media/libjpeg/simd/powerpc/jdcolext-altivec.c (renamed from media/libjpeg/simd/jdcolext-altivec.c) | 12
-rw-r--r--  media/libjpeg/simd/powerpc/jdcolor-altivec.c | 106
-rw-r--r--  media/libjpeg/simd/powerpc/jdmerge-altivec.c | 130
-rw-r--r--  media/libjpeg/simd/powerpc/jdmrgext-altivec.c (renamed from media/libjpeg/simd/jdmrgext-altivec.c) | 34
-rw-r--r--  media/libjpeg/simd/powerpc/jdsample-altivec.c (renamed from media/libjpeg/simd/jdsample-altivec.c) | 78
-rw-r--r--  media/libjpeg/simd/powerpc/jfdctfst-altivec.c (renamed from media/libjpeg/simd/jfdctfst-altivec.c) | 80
-rw-r--r--  media/libjpeg/simd/powerpc/jfdctint-altivec.c | 258
-rw-r--r--  media/libjpeg/simd/powerpc/jidctfst-altivec.c (renamed from media/libjpeg/simd/jidctfst-altivec.c) | 124
-rw-r--r--  media/libjpeg/simd/powerpc/jidctint-altivec.c (renamed from media/libjpeg/simd/jidctint-altivec.c) | 302
-rw-r--r--  media/libjpeg/simd/powerpc/jquanti-altivec.c (renamed from media/libjpeg/simd/jquanti-altivec.c) | 48
-rw-r--r--  media/libjpeg/simd/powerpc/jsimd.c | 881
-rw-r--r--  media/libjpeg/simd/powerpc/jsimd_altivec.h (renamed from media/libjpeg/simd/jsimd_altivec.h) | 63
-rw-r--r--  media/libjpeg/simd/x86_64/jccolext-avx2.asm | 559
-rw-r--r--  media/libjpeg/simd/x86_64/jccolext-sse2.asm | 484
-rw-r--r--  media/libjpeg/simd/x86_64/jccolor-avx2.asm | 121
-rw-r--r--  media/libjpeg/simd/x86_64/jccolor-sse2.asm | 120
-rw-r--r--  media/libjpeg/simd/x86_64/jcgray-avx2.asm | 113
-rw-r--r--  media/libjpeg/simd/x86_64/jcgray-sse2.asm | 112
-rw-r--r--  media/libjpeg/simd/x86_64/jcgryext-avx2.asm | 438
-rw-r--r--  media/libjpeg/simd/x86_64/jcgryext-sse2.asm | 363
-rw-r--r--  media/libjpeg/simd/x86_64/jchuff-sse2.asm | 583
-rw-r--r--  media/libjpeg/simd/x86_64/jcphuff-sse2.asm | 639
-rw-r--r--  media/libjpeg/simd/x86_64/jcsample-avx2.asm | 367
-rw-r--r--  media/libjpeg/simd/x86_64/jcsample-sse2.asm | 330
-rw-r--r--  media/libjpeg/simd/x86_64/jdcolext-avx2.asm | 496
-rw-r--r--  media/libjpeg/simd/x86_64/jdcolext-sse2.asm | 439
-rw-r--r--  media/libjpeg/simd/x86_64/jdcolor-avx2.asm | 118
-rw-r--r--  media/libjpeg/simd/x86_64/jdcolor-sse2.asm | 117
-rw-r--r--  media/libjpeg/simd/x86_64/jdmerge-avx2.asm | 136
-rw-r--r--  media/libjpeg/simd/x86_64/jdmerge-sse2.asm | 135
-rw-r--r--  media/libjpeg/simd/x86_64/jdmrgext-avx2.asm | 596
-rw-r--r--  media/libjpeg/simd/x86_64/jdmrgext-sse2.asm | 538
-rw-r--r--  media/libjpeg/simd/x86_64/jdsample-avx2.asm | 696
-rw-r--r--  media/libjpeg/simd/x86_64/jdsample-sse2.asm | 665
-rw-r--r--  media/libjpeg/simd/x86_64/jfdctflt-sse.asm | 355
-rw-r--r--  media/libjpeg/simd/x86_64/jfdctfst-sse2.asm | 389
-rw-r--r--  media/libjpeg/simd/x86_64/jfdctint-avx2.asm | 320
-rw-r--r--  media/libjpeg/simd/x86_64/jfdctint-sse2.asm | 619
-rw-r--r--  media/libjpeg/simd/x86_64/jidctflt-sse2.asm | 482
-rw-r--r--  media/libjpeg/simd/x86_64/jidctfst-sse2.asm | 491
-rw-r--r--  media/libjpeg/simd/x86_64/jidctint-avx2.asm | 418
-rw-r--r--  media/libjpeg/simd/x86_64/jidctint-sse2.asm | 847
-rw-r--r--  media/libjpeg/simd/x86_64/jidctred-sse2.asm | 574
-rw-r--r--  media/libjpeg/simd/x86_64/jquantf-sse2.asm | 155
-rw-r--r--  media/libjpeg/simd/x86_64/jquanti-avx2.asm | 163
-rw-r--r--  media/libjpeg/simd/x86_64/jquanti-sse2.asm | 188
-rw-r--r--  media/libjpeg/simd/x86_64/jsimd.c | 1068
-rw-r--r--  media/libjpeg/simd/x86_64/jsimdcpu.asm | 86
-rw-r--r--  media/libogg/CHANGES | 26
-rw-r--r--  media/libogg/README | 97
-rw-r--r--  media/libogg/README.md | 160
-rw-r--r--  media/libogg/README_MOZILLA | 2
-rw-r--r--  media/libogg/include/crctable.h | 278
-rw-r--r--  media/libogg/include/ogg/config_types.h | 13
-rw-r--r--  media/libogg/include/ogg/ogg.h | 1
-rw-r--r--  media/libogg/include/ogg/os_types.h | 51
-rw-r--r--  media/libogg/moz.build | 4
-rw-r--r--  media/libogg/src/ogg_bitwise.c | 5
-rw-r--r--  media/libogg/src/ogg_framing.c | 215
-rwxr-xr-x [-rw-r--r--]  media/libogg/update.sh | 6
-rw-r--r--  media/libvorbis/CHANGES | 185
-rw-r--r--  media/libvorbis/COPYING | 2
-rw-r--r--  media/libvorbis/README | 134
-rw-r--r--  media/libvorbis/README.md | 147
-rw-r--r--  media/libvorbis/README_MOZILLA | 6
-rw-r--r--  media/libvorbis/include/vorbis/codec.h | 3
-rw-r--r--  media/libvorbis/include/vorbis/vorbisenc.h | 3
-rw-r--r--  media/libvorbis/lib/backends.h | 3
-rw-r--r--  media/libvorbis/lib/bitrate.h | 3
-rw-r--r--  media/libvorbis/lib/books/coupled/res_books_51.h | 3
-rw-r--r--  media/libvorbis/lib/books/coupled/res_books_stereo.h | 3
-rw-r--r--  media/libvorbis/lib/books/floor/floor_books.h | 3
-rw-r--r--  media/libvorbis/lib/books/uncoupled/res_books_uncoupled.h | 3
-rw-r--r--  media/libvorbis/lib/codebook.h | 3
-rw-r--r--  media/libvorbis/lib/codec_internal.h | 3
-rw-r--r--  media/libvorbis/lib/envelope.h | 3
-rw-r--r--  media/libvorbis/lib/highlevel.h | 3
-rw-r--r--  media/libvorbis/lib/lookup.h | 3
-rw-r--r--  media/libvorbis/lib/lookup_data.h | 3
-rw-r--r--  media/libvorbis/lib/lpc.h | 3
-rw-r--r--  media/libvorbis/lib/lsp.h | 3
-rw-r--r--  media/libvorbis/lib/masking.h | 3
-rw-r--r--  media/libvorbis/lib/mdct.h | 3
-rw-r--r--  media/libvorbis/lib/misc.h | 3
-rw-r--r--  media/libvorbis/lib/modes/floor_all.h | 3
-rw-r--r--  media/libvorbis/lib/modes/psych_11.h | 3
-rw-r--r--  media/libvorbis/lib/modes/psych_16.h | 3
-rw-r--r--  media/libvorbis/lib/modes/psych_44.h | 3
-rw-r--r--  media/libvorbis/lib/modes/psych_8.h | 3
-rw-r--r--  media/libvorbis/lib/modes/residue_16.h | 3
-rw-r--r--  media/libvorbis/lib/modes/residue_44.h | 3
-rw-r--r--  media/libvorbis/lib/modes/residue_44p51.h | 3
-rw-r--r--  media/libvorbis/lib/modes/residue_44u.h | 3
-rw-r--r--  media/libvorbis/lib/modes/residue_8.h | 3
-rw-r--r--  media/libvorbis/lib/modes/setup_11.h | 3
-rw-r--r--  media/libvorbis/lib/modes/setup_16.h | 3
-rw-r--r--  media/libvorbis/lib/modes/setup_22.h | 3
-rw-r--r--  media/libvorbis/lib/modes/setup_32.h | 3
-rw-r--r--  media/libvorbis/lib/modes/setup_44.h | 3
-rw-r--r--  media/libvorbis/lib/modes/setup_44p51.h | 3
-rw-r--r--  media/libvorbis/lib/modes/setup_44u.h | 3
-rw-r--r--  media/libvorbis/lib/modes/setup_8.h | 3
-rw-r--r--  media/libvorbis/lib/modes/setup_X.h | 3
-rw-r--r--  media/libvorbis/lib/os.h | 16
-rw-r--r--  media/libvorbis/lib/psy.h | 3
-rw-r--r--  media/libvorbis/lib/registry.h | 3
-rw-r--r--  media/libvorbis/lib/scales.h | 3
-rw-r--r--  media/libvorbis/lib/smallft.h | 3
-rw-r--r--  media/libvorbis/lib/vorbis_analysis.c | 3
-rw-r--r--  media/libvorbis/lib/vorbis_bitrate.c | 3
-rw-r--r--  media/libvorbis/lib/vorbis_block.c | 3
-rw-r--r--  media/libvorbis/lib/vorbis_codebook.c | 3
-rw-r--r--  media/libvorbis/lib/vorbis_envelope.c | 3
-rw-r--r--  media/libvorbis/lib/vorbis_floor0.c | 3
-rw-r--r--  media/libvorbis/lib/vorbis_floor1.c | 3
-rw-r--r--  media/libvorbis/lib/vorbis_info.c | 52
-rw-r--r--  media/libvorbis/lib/vorbis_lookup.c | 3
-rw-r--r--  media/libvorbis/lib/vorbis_lpc.c | 3
-rw-r--r--  media/libvorbis/lib/vorbis_lsp.c | 6
-rw-r--r--  media/libvorbis/lib/vorbis_mapping0.c | 4
-rw-r--r--  media/libvorbis/lib/vorbis_mdct.c | 3
-rw-r--r--  media/libvorbis/lib/vorbis_psy.c | 27
-rw-r--r--  media/libvorbis/lib/vorbis_registry.c | 3
-rw-r--r--  media/libvorbis/lib/vorbis_res0.c | 6
-rw-r--r--  media/libvorbis/lib/vorbis_sharedbook.c | 36
-rw-r--r--  media/libvorbis/lib/vorbis_smallft.c | 3
-rw-r--r--  media/libvorbis/lib/vorbis_synthesis.c | 5
-rw-r--r--  media/libvorbis/lib/vorbis_window.c | 3
-rw-r--r--  media/libvorbis/lib/vorbisenc.c | 6
-rw-r--r--  media/libvorbis/lib/window.h | 3
-rw-r--r--  media/libvorbis/todo.txt | 22
-rwxr-xr-x [-rw-r--r--]  media/libvorbis/update.sh | 12
-rw-r--r--  media/libwebp/AUTHORS | 10
-rw-r--r--  media/libwebp/NEWS | 55
-rw-r--r--  media/libwebp/README | 32
-rw-r--r--  media/libwebp/README.mux | 16
-rw-r--r--  media/libwebp/UXPCHANGES | 1
-rw-r--r--  media/libwebp/dec/alpha_dec.c | 2
-rw-r--r--  media/libwebp/dec/buffer_dec.c | 10
-rw-r--r--  media/libwebp/dec/frame_dec.c | 4
-rw-r--r--  media/libwebp/dec/idec_dec.c | 11
-rw-r--r--  media/libwebp/dec/io_dec.c | 151
-rw-r--r--  media/libwebp/dec/quant_dec.c | 17
-rw-r--r--  media/libwebp/dec/tree_dec.c | 57
-rw-r--r--  media/libwebp/dec/vp8_dec.c | 97
-rw-r--r--  media/libwebp/dec/vp8i_dec.h | 2
-rw-r--r--  media/libwebp/dec/vp8l_dec.c | 156
-rw-r--r--  media/libwebp/dec/vp8li_dec.h | 20
-rw-r--r--  media/libwebp/dec/webp_dec.c | 17
-rw-r--r--  media/libwebp/dec/webpi_dec.h | 4
-rw-r--r--  media/libwebp/demux/demux.c | 23
-rw-r--r--  media/libwebp/dsp/alpha_processing.c | 73
-rw-r--r--  media/libwebp/dsp/alpha_processing_mips_dsp_r2.c | 228
-rw-r--r--  media/libwebp/dsp/alpha_processing_neon.c | 21
-rw-r--r--  media/libwebp/dsp/alpha_processing_sse2.c | 46
-rw-r--r--  media/libwebp/dsp/alpha_processing_sse41.c | 6
-rw-r--r--  media/libwebp/dsp/cost.c | 411
-rw-r--r--  media/libwebp/dsp/cost_mips32.c | 154
-rw-r--r--  media/libwebp/dsp/cost_mips_dsp_r2.c | 107
-rw-r--r--  media/libwebp/dsp/cost_neon.c | 122
-rw-r--r--  media/libwebp/dsp/cost_sse2.c | 119
-rw-r--r--  media/libwebp/dsp/cpu.c | 253
-rw-r--r--  media/libwebp/dsp/dec.c | 6
-rw-r--r--  media/libwebp/dsp/dec_mips32.c | 587
-rw-r--r--  media/libwebp/dsp/dec_mips_dsp_r2.c | 994
-rw-r--r--  media/libwebp/dsp/dec_msa.c | 1020
-rw-r--r--  media/libwebp/dsp/dec_neon.c | 73
-rw-r--r--  media/libwebp/dsp/dec_sse2.c | 14
-rw-r--r--  media/libwebp/dsp/dsp.h | 117
-rw-r--r--  media/libwebp/dsp/enc.c | 830
-rw-r--r--  media/libwebp/dsp/enc_mips32.c | 677
-rw-r--r--  media/libwebp/dsp/enc_mips_dsp_r2.c | 1517
-rw-r--r--  media/libwebp/dsp/enc_msa.c | 896
-rw-r--r--  media/libwebp/dsp/enc_neon.c | 938
-rw-r--r--  media/libwebp/dsp/enc_sse2.c | 1381
-rw-r--r--  media/libwebp/dsp/enc_sse41.c | 339
-rw-r--r--  media/libwebp/dsp/filters.c | 16
-rw-r--r--  media/libwebp/dsp/filters_mips_dsp_r2.c | 402
-rw-r--r--  media/libwebp/dsp/filters_msa.c | 202
-rw-r--r--  media/libwebp/dsp/filters_sse2.c | 21
-rw-r--r--  media/libwebp/dsp/lossless.c | 111
-rw-r--r--  media/libwebp/dsp/lossless.h | 34
-rw-r--r--  media/libwebp/dsp/lossless_common.h | 15
-rw-r--r--  media/libwebp/dsp/lossless_enc.c | 948
-rw-r--r--  media/libwebp/dsp/lossless_enc_mips32.c | 397
-rw-r--r--  media/libwebp/dsp/lossless_enc_mips_dsp_r2.c | 281
-rw-r--r--  media/libwebp/dsp/lossless_enc_msa.c | 148
-rw-r--r--  media/libwebp/dsp/lossless_enc_neon.c | 144
-rw-r--r--  media/libwebp/dsp/lossless_enc_sse2.c | 669
-rw-r--r--  media/libwebp/dsp/lossless_enc_sse41.c | 155
-rw-r--r--  media/libwebp/dsp/lossless_mips_dsp_r2.c | 701
-rw-r--r--  media/libwebp/dsp/lossless_msa.c | 356
-rw-r--r--  media/libwebp/dsp/lossless_neon.c | 20
-rw-r--r--  media/libwebp/dsp/lossless_sse2.c | 45
-rw-r--r--  media/libwebp/dsp/lossless_sse41.c | 132
-rw-r--r--  media/libwebp/dsp/moz.build | 123
-rw-r--r--  media/libwebp/dsp/msa_macro.h | 5
-rw-r--r--  media/libwebp/dsp/neon.h | 7
-rw-r--r--  media/libwebp/dsp/quant.h | 15
-rw-r--r--  media/libwebp/dsp/rescaler.c | 27
-rw-r--r--  media/libwebp/dsp/rescaler_mips32.c | 295
-rw-r--r--  media/libwebp/dsp/rescaler_mips_dsp_r2.c | 314
-rw-r--r--  media/libwebp/dsp/rescaler_msa.c | 443
-rw-r--r--  media/libwebp/dsp/rescaler_neon.c | 32
-rw-r--r--  media/libwebp/dsp/rescaler_sse2.c | 60
-rw-r--r--  media/libwebp/dsp/ssim.c | 159
-rw-r--r--  media/libwebp/dsp/ssim_sse2.c | 165
-rw-r--r--  media/libwebp/dsp/upsampling.c | 10
-rw-r--r--  media/libwebp/dsp/upsampling_mips_dsp_r2.c | 291
-rw-r--r--  media/libwebp/dsp/upsampling_msa.c | 688
-rw-r--r--  media/libwebp/dsp/upsampling_neon.c | 14
-rw-r--r--  media/libwebp/dsp/yuv.c | 20
-rw-r--r--  media/libwebp/dsp/yuv.h | 2
-rw-r--r--  media/libwebp/dsp/yuv_mips32.c | 103
-rw-r--r--  media/libwebp/dsp/yuv_mips_dsp_r2.c | 134
-rw-r--r--  media/libwebp/enc/alpha_enc.c | 443
-rw-r--r--  media/libwebp/enc/analysis_enc.c | 475
-rw-r--r--  media/libwebp/enc/backward_references_cost_enc.c | 790
-rw-r--r--  media/libwebp/enc/backward_references_enc.c | 1030
-rw-r--r--  media/libwebp/enc/backward_references_enc.h | 20
-rw-r--r--  media/libwebp/enc/config_enc.c | 157
-rw-r--r--  media/libwebp/enc/cost_enc.c | 342
-rw-r--r--  media/libwebp/enc/delta_palettization_enc.h | 25
-rw-r--r--  media/libwebp/enc/filter_enc.c | 235
-rw-r--r--  media/libwebp/enc/frame_enc.c | 899
-rw-r--r--  media/libwebp/enc/histogram_enc.c | 1252
-rw-r--r--  media/libwebp/enc/histogram_enc.h | 6
-rw-r--r--  media/libwebp/enc/iterator_enc.c | 459
-rw-r--r--  media/libwebp/enc/moz.build | 39
-rw-r--r--  media/libwebp/enc/near_lossless_enc.c | 151
-rw-r--r--  media/libwebp/enc/picture_csp_enc.c | 1210
-rw-r--r--  media/libwebp/enc/picture_enc.c | 296
-rw-r--r--  media/libwebp/enc/picture_psnr_enc.c | 258
-rw-r--r--  media/libwebp/enc/picture_rescale_enc.c | 316
-rw-r--r--  media/libwebp/enc/picture_tools_enc.c | 273
-rw-r--r--  media/libwebp/enc/predictor_enc.c | 772
-rw-r--r--  media/libwebp/enc/quant_enc.c | 1388
-rw-r--r--  media/libwebp/enc/syntax_enc.c | 388
-rw-r--r--  media/libwebp/enc/token_enc.c | 262
-rw-r--r--  media/libwebp/enc/tree_enc.c | 504
-rw-r--r--  media/libwebp/enc/vp8i_enc.h | 13
-rw-r--r--  media/libwebp/enc/vp8l_enc.c | 2138
-rw-r--r--  media/libwebp/enc/vp8li_enc.h | 4
-rw-r--r--  media/libwebp/enc/webp_enc.c | 410
-rw-r--r--  media/libwebp/moz.build | 3
-rw-r--r--  media/libwebp/moz/cpu.cpp | 2
-rwxr-xr-x [-rw-r--r--]  media/libwebp/update.sh | 50
-rw-r--r--  media/libwebp/utils/bit_reader_inl_utils.h | 14
-rw-r--r--  media/libwebp/utils/bit_reader_utils.c | 86
-rw-r--r--  media/libwebp/utils/bit_reader_utils.h | 33
-rw-r--r--  media/libwebp/utils/bit_writer_utils.c | 347
-rw-r--r--  media/libwebp/utils/color_cache_utils.c | 22
-rw-r--r--  media/libwebp/utils/color_cache_utils.h | 10
-rw-r--r--  media/libwebp/utils/huffman_encode_utils.c | 416
-rw-r--r--  media/libwebp/utils/huffman_encode_utils.h | 2
-rw-r--r--  media/libwebp/utils/huffman_utils.c | 26
-rw-r--r--  media/libwebp/utils/huffman_utils.h | 2
-rw-r--r--  media/libwebp/utils/moz.build | 2
-rw-r--r--  media/libwebp/utils/quant_levels_dec_utils.c | 2
-rw-r--r--  media/libwebp/utils/rescaler_utils.c | 122
-rw-r--r--  media/libwebp/utils/rescaler_utils.h | 13
-rw-r--r--  media/libwebp/utils/thread_utils.c | 14
-rw-r--r--  media/libwebp/utils/utils.c | 22
-rw-r--r--  media/libwebp/utils/utils.h | 34
-rw-r--r--  media/libwebp/webp/config.h | 37
-rw-r--r--  media/libwebp/webp/decode.h | 9
-rw-r--r--  media/libwebp/webp/encode.h | 21
-rw-r--r--  media/libwebp/webp/mux.h | 12
-rw-r--r--  media/libwebp/webp/mux_types.h | 10
-rw-r--r--  media/libwebp/webp/types.h | 18
-rw-r--r-- [-rwxr-xr-x]  media/update-libjpeg.sh | 6
-rw-r--r--  netwerk/cache2/OldWrappers.cpp | 2
-rw-r--r--  netwerk/cache2/OldWrappers.h | 2
-rw-r--r--  netwerk/dns/nsIDNService.cpp | 5
561 files changed, 109603 insertions, 53344 deletions
diff --git a/dom/plugins/ipc/PluginInstanceParent.cpp b/dom/plugins/ipc/PluginInstanceParent.cpp
index 372a7a238b..ee88819d56 100644
--- a/dom/plugins/ipc/PluginInstanceParent.cpp
+++ b/dom/plugins/ipc/PluginInstanceParent.cpp
@@ -125,7 +125,6 @@ PluginInstanceParent::PluginInstanceParent(PluginModuleParent* parent,
, mNPNIface(npniface)
, mWindowType(NPWindowTypeWindow)
, mDrawingModel(kDefaultDrawingModel)
- , mLastRecordedDrawingModel(-1)
, mFrameID(0)
#if defined(OS_WIN)
, mPluginHWND(nullptr)
diff --git a/dom/plugins/ipc/PluginInstanceParent.h b/dom/plugins/ipc/PluginInstanceParent.h
index cb85378db6..637f82dcc1 100644
--- a/dom/plugins/ipc/PluginInstanceParent.h
+++ b/dom/plugins/ipc/PluginInstanceParent.h
@@ -399,11 +399,6 @@ private:
NPWindowType mWindowType;
int16_t mDrawingModel;
- // Since plugins may request different drawing models to find a compatible
- // one, we only record the drawing model after a SetWindow call and if the
- // drawing model has changed.
- int mLastRecordedDrawingModel;
-
nsDataHashtable<nsPtrHashKey<NPObject>, PluginScriptableObjectParent*> mScriptableObjects;
// This is used to tell the compositor that it should invalidate the ImageLayer.
diff --git a/intl/unicharutil/tools/genUnicodePropertyData.pl b/intl/unicharutil/tools/genUnicodePropertyData.pl
index 6107737b38..8c7437f82d 100755
--- a/intl/unicharutil/tools/genUnicodePropertyData.pl
+++ b/intl/unicharutil/tools/genUnicodePropertyData.pl
@@ -9,6 +9,10 @@
# read from the Unicode Character Database and compiled into multi-level arrays
# for efficient lookup.
#
+# Note that for most properties, we now rely on ICU; this tool and the tables
+# it generates are used only for a couple of properties not readily exposed
+# via ICU APIs.
+#
# To regenerate the tables in nsUnicodePropertyData.cpp:
#
# (1) Download the current Unicode data files from
@@ -17,13 +21,6 @@
#
# NB: not all the files are actually needed; currently, we require
# - UnicodeData.txt
-# - Scripts.txt
-# - BidiMirroring.txt
-# - BidiBrackets.txt
-# - HangulSyllableType.txt
-# - LineBreak.txt
-# - EastAsianWidth.txt
-# - DerivedCoreProperties.txt
# - ReadMe.txt (to record version/date of the UCD)
# - Unihan_Variants.txt (from Unihan.zip)
# though this may change if we find a need for additional properties.
@@ -32,12 +29,11 @@
#
# We also require the file
# http://www.unicode.org/Public/security/latest/IdentifierStatus.txt
-# http://www.unicode.org/Public/security/latest/IdentifierType.txt
# This file should be in a sub-directory "security" immediately below the
# directory containing the other Unicode data files.
#
-# We also require the latest data file for UTR50, currently revision-16:
-# http://www.unicode.org/Public/vertical/revision-16/VerticalOrientation-16.txt
+# We also require the latest data file for UTR50, currently revision-17:
+# http://www.unicode.org/Public/vertical/revision-17/VerticalOrientation-17.txt
# This file should be in a sub-directory "vertical" immediately below the
# directory containing the other Unicode data files.
#
@@ -45,7 +41,6 @@
# (2) Run this tool using a command line of the form
#
# perl genUnicodePropertyData.pl \
-# /path/to/harfbuzz/src \
# /path/to/icu/common/unicode \
# /path/to/UCD-directory
#
@@ -59,17 +54,15 @@
use strict;
use List::Util qw(first);
-if ($#ARGV != 2) {
+if ($#ARGV != 1) {
print <<__EOT;
# Run this tool using a command line of the form
#
# perl genUnicodePropertyData.pl \\
-# /path/to/harfbuzz/src \\
# /path/to/icu/common/unicode \\
# /path/to/UCD-directory
#
-# where harfbuzz/src is the directory containing harfbuzz .cc and .hh files,
-# icu/common/unicode is the directory containing ICU 'common' public headers,
+# where icu/common/unicode is the directory containing ICU 'common' headers,
# and UCD-directory is a directory containing the current Unicode Character
# Database files (UnicodeData.txt, etc), available from
# http://www.unicode.org/Public/UNIDATA/, with additional resources as
@@ -85,35 +78,11 @@ __EOT
exit 0;
}
-my $HARFBUZZ = $ARGV[0];
-my $ICU = $ARGV[1];
-my $UNICODE = $ARGV[2];
-
-# load HB_Category constants
-
-my $cc = -1;
-my %catCode;
-
-sub readHarfBuzzHeader
-{
- my $file = shift;
- open FH, "< $HARFBUZZ/$file" or die "can't open harfbuzz header $HARFBUZZ/$file\n";
- while (<FH>) {
- if (m/HB_UNICODE_GENERAL_CATEGORY_([A-Z_]+)/) {
- $cc++;
- $catCode{$1} = $cc;
- }
- }
- close FH;
-}
+my $ICU = $ARGV[0];
+my $UNICODE = $ARGV[1];
-&readHarfBuzzHeader("hb-unicode.h");
-
-die "didn't find HarfBuzz category codes\n" if $cc == -1;
-
-my %scriptCode;
-my @scriptCodeToTag;
my @scriptCodeToName;
+my @idtype;
my $sc = -1;
@@ -130,8 +99,6 @@ sub readIcuHeader
s/SIGN_WRITING/SIGNWRITING/;
if (m|USCRIPT_([A-Z_]+)\s*=\s*([0-9]+),\s*/\*\s*([A-Z][a-z]{3})\s*\*/|) {
$sc = $2;
- $scriptCode{$1} = $sc;
- $scriptCodeToTag[$sc] = $3;
$scriptCodeToName[$sc] = $1;
}
}
@@ -168,35 +135,7 @@ my %idType = (
# These match the IdentifierType enum in nsUnicodeProperties.h.
my %mappedIdType = (
"Restricted" => 0,
- "Allowed" => 1,
- "Aspirational" => 2 # for Aspirational characters that are not excluded
- # by another attribute.
-);
-
-my %bidicategoryCode = (
- "L" => 0, # Left-to-Right
- "R" => 1, # Right-to-Left
- "EN" => 2, # European Number
- "ES" => 3, # European Number Separator
- "ET" => 4, # European Number Terminator
- "AN" => 5, # Arabic Number
- "CS" => 6, # Common Number Separator
- "B" => 7, # Paragraph Separator
- "S" => 8, # Segment Separator
- "WS" => 9, # Whitespace
- "ON" => 10, # Other Neutrals
- "LRE" => 11, # Left-to-Right Embedding
- "LRO" => 12, # Left-to-Right Override
- "AL" => 13, # Right-to-Left Arabic
- "RLE" => 14, # Right-to-Left Embedding
- "RLO" => 15, # Right-to-Left Override
- "PDF" => 16, # Pop Directional Format
- "NSM" => 17, # Non-Spacing Mark
- "BN" => 18, # Boundary Neutral
- "FSI" => 19, # First Strong Isolate
- "LRI" => 20, # Left-to-Right Isolate
- "RLI" => 21, # Right-to-left Isolate
- "PDI" => 22 # Pop Direcitonal Isolate
+ "Allowed" => 1
);
my %verticalOrientationCode = (
@@ -206,141 +145,18 @@ my %verticalOrientationCode = (
'Tr' => 3 # Tr - Transformed typographically, with fallback to Rotated
);
-my %lineBreakCode = ( # ordering matches ICU's ULineBreak enum
- "XX" => 0,
- "AI" => 1,
- "AL" => 2,
- "B2" => 3,
- "BA" => 4,
- "BB" => 5,
- "BK" => 6,
- "CB" => 7,
- "CL" => 8,
- "CM" => 9,
- "CR" => 10,
- "EX" => 11,
- "GL" => 12,
- "HY" => 13,
- "ID" => 14,
- "IN" => 15,
- "IS" => 16,
- "LF" => 17,
- "NS" => 18,
- "NU" => 19,
- "OP" => 20,
- "PO" => 21,
- "PR" => 22,
- "QU" => 23,
- "SA" => 24,
- "SG" => 25,
- "SP" => 26,
- "SY" => 27,
- "ZW" => 28,
- "NL" => 29,
- "WJ" => 30,
- "H2" => 31,
- "H3" => 32,
- "JL" => 33,
- "JT" => 34,
- "JV" => 35,
- "CP" => 36,
- "CJ" => 37,
- "HL" => 38,
- "RI" => 39,
- "EB" => 40,
- "EM" => 41,
- "ZWJ" => 42
-);
-
-my %eastAsianWidthCode = (
- "N" => 0,
- "A" => 1,
- "H" => 2,
- "W" => 3,
- "F" => 4,
- "Na" => 5
-);
-
# initialize default properties
-my @script;
-my @category;
-my @combining;
-my @mirror;
-my @pairedBracketType;
-my @hangul;
-my @casemap;
-my @idtype;
-my @numericvalue;
my @hanVariant;
-my @bidicategory;
my @fullWidth;
my @fullWidthInverse;
my @verticalOrientation;
-my @lineBreak;
-my @eastAsianWidthFWH;
-my @defaultIgnorable;
for (my $i = 0; $i < 0x110000; ++$i) {
- $script[$i] = $scriptCode{"UNKNOWN"};
- $category[$i] = $catCode{"UNASSIGNED"};
- $combining[$i] = 0;
- $pairedBracketType[$i] = 0;
- $casemap[$i] = 0;
- $idtype[$i] = $mappedIdType{'Restricted'};
- $numericvalue[$i] = -1;
$hanVariant[$i] = 0;
- $bidicategory[$i] = $bidicategoryCode{"L"};
$fullWidth[$i] = 0;
$fullWidthInverse[$i] = 0;
$verticalOrientation[$i] = 1; # default for unlisted codepoints is 'R'
- $lineBreak[$i] = $lineBreakCode{"XX"};
- $eastAsianWidthFWH[$i] = 0;
- $defaultIgnorable[$i] = 0;
}
-# blocks where the default for bidi category is not L
-for my $i (0x0600..0x07BF, 0x08A0..0x08FF, 0xFB50..0xFDCF, 0xFDF0..0xFDFF, 0xFE70..0xFEFF, 0x1EE00..0x0001EEFF) {
- $bidicategory[$i] = $bidicategoryCode{"AL"};
-}
-for my $i (0x0590..0x05FF, 0x07C0..0x089F, 0xFB1D..0xFB4F, 0x00010800..0x00010FFF, 0x0001E800..0x0001EDFF, 0x0001EF00..0x0001EFFF) {
- $bidicategory[$i] = $bidicategoryCode{"R"};
-}
-for my $i (0x20A0..0x20CF) {
- $bidicategory[$i] = $bidicategoryCode{"ET"};
-}
-
-my %ucd2hb = (
-'Cc' => 'CONTROL',
-'Cf' => 'FORMAT',
-'Cn' => 'UNASSIGNED',
-'Co' => 'PRIVATE_USE',
-'Cs' => 'SURROGATE',
-'Ll' => 'LOWERCASE_LETTER',
-'Lm' => 'MODIFIER_LETTER',
-'Lo' => 'OTHER_LETTER',
-'Lt' => 'TITLECASE_LETTER',
-'Lu' => 'UPPERCASE_LETTER',
-'Mc' => 'SPACING_MARK',
-'Me' => 'ENCLOSING_MARK',
-'Mn' => 'NON_SPACING_MARK',
-'Nd' => 'DECIMAL_NUMBER',
-'Nl' => 'LETTER_NUMBER',
-'No' => 'OTHER_NUMBER',
-'Pc' => 'CONNECT_PUNCTUATION',
-'Pd' => 'DASH_PUNCTUATION',
-'Pe' => 'CLOSE_PUNCTUATION',
-'Pf' => 'FINAL_PUNCTUATION',
-'Pi' => 'INITIAL_PUNCTUATION',
-'Po' => 'OTHER_PUNCTUATION',
-'Ps' => 'OPEN_PUNCTUATION',
-'Sc' => 'CURRENCY_SYMBOL',
-'Sk' => 'MODIFIER_SYMBOL',
-'Sm' => 'MATH_SYMBOL',
-'So' => 'OTHER_SYMBOL',
-'Zl' => 'LINE_SEPARATOR',
-'Zp' => 'PARAGRAPH_SEPARATOR',
-'Zs' => 'SPACE_SEPARATOR'
-);
-
# read ReadMe.txt
my @versionInfo;
open FH, "< $UNICODE/ReadMe.txt" or die "can't open Unicode ReadMe.txt file\n";
@@ -350,12 +166,6 @@ while (<FH>) {
}
close FH;
-my $kTitleToUpper = 0x80000000;
-my $kUpperToLower = 0x40000000;
-my $kLowerToTitle = 0x20000000;
-my $kLowerToUpper = 0x10000000;
-my $kCaseMapCharMask = 0x001fffff;
-
# read UnicodeData.txt
open FH, "< $UNICODE/UnicodeData.txt" or die "can't open UCD file UnicodeData.txt\n";
while (<FH>) {
@@ -368,12 +178,6 @@ while (<FH>) {
if ($fields[1] =~ /Last/) {
my $last = hex "0x$fields[0]";
do {
- $category[$first] = $catCode{$ucd2hb{$fields[2]}};
- $combining[$first] = $fields[3];
- $bidicategory[$first] = $bidicategoryCode{$fields[4]};
- unless (length($fields[7]) == 0) {
- $numericvalue[$first] = $fields[7];
- }
if ($fields[1] =~ /CJK/) {
@hanVariant[$first] = 3;
}
@@ -384,33 +188,6 @@ while (<FH>) {
}
} else {
my $usv = hex "0x$fields[0]";
- $category[$usv] = $catCode{$ucd2hb{$fields[2]}};
- $combining[$usv] = $fields[3];
- my $upper = hex $fields[12];
- my $lower = hex $fields[13];
- my $title = hex $fields[14];
- # we only store one mapping for each character,
- # but also record what kind of mapping it is
- if ($upper && $lower) {
- $casemap[$usv] |= $kTitleToUpper;
- $casemap[$usv] |= ($usv ^ $upper);
- }
- elsif ($lower) {
- $casemap[$usv] |= $kUpperToLower;
- $casemap[$usv] |= ($usv ^ $lower);
- }
- elsif ($title && ($title != $upper)) {
- $casemap[$usv] |= $kLowerToTitle;
- $casemap[$usv] |= ($usv ^ $title);
- }
- elsif ($upper) {
- $casemap[$usv] |= $kLowerToUpper;
- $casemap[$usv] |= ($usv ^ $upper);
- }
- $bidicategory[$usv] = $bidicategoryCode{$fields[4]};
- unless (length($fields[7]) == 0) {
- $numericvalue[$usv] = $fields[7];
- }
if ($fields[1] =~ /CJK/) {
@hanVariant[$usv] = 3;
}
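[Aside: the packed casemap words that the removed code above generated store one mapping per character. The high bits (kTitleToUpper, kUpperToLower, kLowerToTitle, kLowerToUpper) record which kind of mapping is stored, and the low 21 bits hold the code point XORed with its mapped form, so identity mappings pack to zero. A minimal C++ sketch of the decode step, with a hypothetical helper name, mirroring the removed Perl rather than any current Gecko API:

    #include <cstdint>
    // Constants as defined by the removed Perl above.
    const uint32_t kUpperToLower    = 0x40000000;  // flag: word stores an upper->lower mapping
    const uint32_t kCaseMapCharMask = 0x001fffff;  // low 21 bits: usv XOR mapped char
    // Hypothetical decoder: recover the mapped character from the packed word.
    uint32_t MappedChar(uint32_t usv, uint32_t packedCaseMap) {
      // Identity mappings pack to 0, so unmapped characters decode to usv itself.
      return usv ^ (packedCaseMap & kCaseMapCharMask);
    }
]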
@@ -430,176 +207,6 @@ while (<FH>) {
}
close FH;
-# read Scripts.txt
-open FH, "< $UNICODE/Scripts.txt" or die "can't open UCD file Scripts.txt\n";
-push @versionInfo, "";
-while (<FH>) {
- chomp;
- push @versionInfo, $_;
- last if /Date:/;
-}
-while (<FH>) {
- if (m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))*\s+;\s+([^ ]+)/) {
- my $script = uc($3);
- warn "unknown ICU script $script" unless exists $scriptCode{$script};
- my $script = $scriptCode{$script};
- my $start = hex "0x$1";
- my $end = (defined $2) ? hex "0x$2" : $start;
- for (my $i = $start; $i <= $end; ++$i) {
- $script[$i] = $script;
- }
- }
-}
-close FH;
-
-# read BidiMirroring.txt
-my @offsets = ();
-push @offsets, 0;
-
-open FH, "< $UNICODE/BidiMirroring.txt" or die "can't open UCD file BidiMirroring.txt\n";
-push @versionInfo, "";
-while (<FH>) {
- chomp;
- push @versionInfo, $_;
- last if /Date:/;
-}
-while (<FH>) {
- s/#.*//;
- if (m/([0-9A-F]{4,6});\s*([0-9A-F]{4,6})/) {
- my $mirrorOffset = hex("0x$2") - hex("0x$1");
- my $offsetIndex = first { $offsets[$_] eq $mirrorOffset } 0..$#offsets;
- if ($offsetIndex == undef) {
- die "too many offset codes\n" if scalar @offsets == 31;
- push @offsets, $mirrorOffset;
- $offsetIndex = $#offsets;
- }
- $mirror[hex "0x$1"] = $offsetIndex;
- }
-}
-close FH;
-
-# read BidiBrackets.txt
-my %pairedBracketTypeCode = (
- 'N' => 0,
- 'O' => 1,
- 'C' => 2
-);
-open FH, "< $UNICODE/BidiBrackets.txt" or die "can't open UCD file BidiBrackets.txt\n";
-push @versionInfo, "";
-while (<FH>) {
- chomp;
- push @versionInfo, $_;
- last if /Date:/;
-}
-while (<FH>) {
- s/#.*//;
- if (m/([0-9A-F]{4,6});\s*([0-9A-F]{4,6});\s*(.)/) {
- my $mirroredChar = $offsets[$mirror[hex "0x$1"]] + hex "0x$1";
- die "bidi bracket does not match mirrored char\n" unless $mirroredChar == hex "0x$2";
- my $pbt = uc($3);
- warn "unknown Bidi Bracket type" unless exists $pairedBracketTypeCode{$pbt};
- $pairedBracketType[hex "0x$1"] = $pairedBracketTypeCode{$pbt};
- }
-}
-close FH;
-
-# read HangulSyllableType.txt
-my %hangulType = (
- 'L' => 0x01,
- 'V' => 0x02,
- 'T' => 0x04,
- 'LV' => 0x03,
- 'LVT' => 0x07
-);
-open FH, "< $UNICODE/HangulSyllableType.txt" or die "can't open UCD file HangulSyllableType.txt\n";
-push @versionInfo, "";
-while (<FH>) {
- chomp;
- push @versionInfo, $_;
- last if /Date:/;
-}
-while (<FH>) {
- s/#.*//;
- if (m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))*\s*;\s*([^ ]+)/) {
- my $hangul = uc($3);
- warn "unknown Hangul syllable type" unless exists $hangulType{$hangul};
- $hangul = $hangulType{$hangul};
- my $start = hex "0x$1";
- my $end = (defined $2) ? hex "0x$2" : $start;
- for (my $i = $start; $i <= $end; ++$i) {
- $hangul[$i] = $hangul;
- }
- }
-}
-close FH;
-
-# read LineBreak.txt
-open FH, "< $UNICODE/LineBreak.txt" or die "can't open UCD file LineBreak.txt\n";
-push @versionInfo, "";
-while (<FH>) {
- chomp;
- push @versionInfo, $_;
- last if /Date:/;
-}
-while (<FH>) {
- s/#.*//;
- if (m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))*\s*;\s*([^ ]+)/) {
- my $lb = uc($3);
- warn "unknown LineBreak class" unless exists $lineBreakCode{$lb};
- $lb = $lineBreakCode{$lb};
- my $start = hex "0x$1";
- my $end = (defined $2) ? hex "0x$2" : $start;
- for (my $i = $start; $i <= $end; ++$i) {
- $lineBreak[$i] = $lb;
- }
- }
-}
-close FH;
-
-# read EastAsianWidth.txt
-open FH, "< $UNICODE/EastAsianWidth.txt" or die "can't open UCD file EastAsianWidth.txt\n";
-push @versionInfo, "";
-while (<FH>) {
- chomp;
- push @versionInfo, $_;
- last if /Date:/;
-}
-while (<FH>) {
- s/#.*//;
- if (m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))*\s*;\s*([^ ]+)/) {
- my $start = hex "0x$1";
- my $end = (defined $2) ? hex "0x$2" : $start;
- my $eaw = $3;
- warn "unknown EastAsianWidth class" unless exists $eastAsianWidthCode{$eaw};
- my $isFWH = ($eaw =~ m/^[FWH]$/) ? 1 : 0;
- for (my $i = $start; $i <= $end; ++$i) {
- $eastAsianWidthFWH[$i] = $isFWH;
- }
- }
-}
-close FH;
-
-# read DerivedCoreProperties.txt (for Default-Ignorables)
-open FH, "< $UNICODE/DerivedCoreProperties.txt" or die "can't open UCD file DerivedCoreProperties.txt\n";
-push @versionInfo, "";
-
-while (<FH>) {
- chomp;
- push @versionInfo, $_;
- last if /Date:/;
-}
-while (<FH>) {
- s/#.*//;
- if (m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))*\s*;\s*Default_Ignorable_Code_Point/) {
- my $start = hex "0x$1";
- my $end = (defined $2) ? hex "0x$2" : $start;
- for (my $i = $start; $i <= $end; ++$i) {
- $defaultIgnorable[$i] = 1;
- }
- }
-}
-close FH;
-
# read IdentifierStatus.txt
open FH, "< $UNICODE/security/IdentifierStatus.txt" or die "can't open UCD file IdentifierStatus.txt\n";
push @versionInfo, "";
@@ -623,33 +230,6 @@ while (<FH>) {
}
close FH;
-# read IdentifierType.txt, to find Aspirational characters
-open FH, "< $UNICODE/security/IdentifierType.txt" or die "can't open UCD file IdentifierType.txt\n";
-push @versionInfo, "";
-while (<FH>) {
- chomp;
- s/\xef\xbb\xbf//;
- push @versionInfo, $_;
- last if /Date:/;
-}
-while (<FH>) {
- if (m/([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))*\s+;\s+([^#]+)/) {
- my $idtype = $3;
- foreach (split(/ /, $idtype)) {
- warn "unknown Identifier Type $_" unless exists $idType{$_};
- }
- my $start = hex "0x$1";
- my $end = (defined $2) ? hex "0x$2" : $start;
- if ($idtype =~ /Aspirational/ and (not $idtype =~ /Exclusion|Not_XID|Not_NFKC/)) {
-
- for (my $i = $start; $i <= $end; ++$i) {
- $idtype[$i] = $mappedIdType{'Aspirational'};
- }
- }
- }
-}
-close FH;
-
open FH, "< $UNICODE/Unihan_Variants.txt" or die "can't open UCD file Unihan_Variants.txt (from Unihan.zip)\n";
push @versionInfo, "";
while (<FH>) {
@@ -686,8 +266,8 @@ while (<FH>) {
}
close FH;
-# read VerticalOrientation-16.txt
-open FH, "< $UNICODE/vertical/VerticalOrientation-16.txt" or die "can't open UTR50 data file VerticalOrientation-16.txt\n";
+# read VerticalOrientation-17.txt
+open FH, "< $UNICODE/vertical/VerticalOrientation-17.txt" or die "can't open UTR50 data file VerticalOrientation-17.txt\n";
push @versionInfo, "";
while (<FH>) {
chomp;
@@ -785,8 +365,7 @@ struct nsCharProps2 {
unsigned char mIdType:2;
};
|;
-&genTables("", "",
- "CharProp2", $type, "nsCharProps2", 9, 7, \&sprintCharProps2, 16, 1, 1);
+&genTables("CharProp2", $type, "nsCharProps2", 9, 7, \&sprintCharProps2, 16, 1, 1);
print HEADER "#pragma pack()\n\n";
@@ -802,42 +381,32 @@ sub sprintHanVariants
return sprintf("0x%02x,", $val);
}
## Han Variant data currently unused but may be needed in future, see bug 857481
-## &genTables("", "", "HanVariant", "", "uint8_t", 9, 7, \&sprintHanVariants, 2, 1, 4);
+## &genTables("HanVariant", "", "uint8_t", 9, 7, \&sprintHanVariants, 2, 1, 4);
sub sprintFullWidth
{
my $usv = shift;
return sprintf("0x%04x,", $fullWidth[$usv]);
}
-&genTables("", "", "FullWidth", "", "uint16_t", 10, 6, \&sprintFullWidth, 0, 2, 1);
+&genTables("FullWidth", "", "uint16_t", 10, 6, \&sprintFullWidth, 0, 2, 1);
sub sprintFullWidthInverse
{
my $usv = shift;
return sprintf("0x%04x,", $fullWidthInverse[$usv]);
}
-&genTables("", "", "FullWidthInverse", "", "uint16_t", 10, 6, \&sprintFullWidthInverse, 0, 2, 1);
+&genTables("FullWidthInverse", "", "uint16_t", 10, 6, \&sprintFullWidthInverse, 0, 2, 1);
print STDERR "Total data = $totalData\n";
-printf DATA_TABLES "const uint32_t kTitleToUpper = 0x%08x;\n", $kTitleToUpper;
-printf DATA_TABLES "const uint32_t kUpperToLower = 0x%08x;\n", $kUpperToLower;
-printf DATA_TABLES "const uint32_t kLowerToTitle = 0x%08x;\n", $kLowerToTitle;
-printf DATA_TABLES "const uint32_t kLowerToUpper = 0x%08x;\n", $kLowerToUpper;
-printf DATA_TABLES "const uint32_t kCaseMapCharMask = 0x%08x;\n\n", $kCaseMapCharMask;
-
sub genTables
{
- my ($guardBegin, $guardEnd,
- $prefix, $typedef, $type, $indexBits, $charBits, $func, $maxPlane, $bytesPerEntry, $charsPerEntry) = @_;
+ my ($prefix, $typedef, $type, $indexBits, $charBits, $func, $maxPlane, $bytesPerEntry, $charsPerEntry) = @_;
if ($typedef ne '') {
- print HEADER "$guardBegin\n";
print HEADER "$typedef\n";
- print HEADER "$guardEnd\n\n";
}
- print DATA_TABLES "\n$guardBegin\n";
print DATA_TABLES "#define k${prefix}MaxPlane $maxPlane\n";
print DATA_TABLES "#define k${prefix}IndexBits $indexBits\n";
print DATA_TABLES "#define k${prefix}CharBits $charBits\n";
@@ -906,7 +475,6 @@ sub genTables
print DATA_TABLES $i < $#char ? "},\n" : "}\n";
}
print DATA_TABLES "};\n";
- print DATA_TABLES "$guardEnd\n";
my $dataSize = $pmCount * $indexLen * $pmBits/8 +
$chCount * $pageLen * $bytesPerEntry +
@@ -926,7 +494,7 @@ close DATA_TABLES;
print HEADER "namespace mozilla {\n";
print HEADER "namespace unicode {\n";
-print HEADER "enum class Script {\n";
+print HEADER "enum class Script : int16_t {\n";
for (my $i = 0; $i < scalar @scriptCodeToName; ++$i) {
print HEADER " ", $scriptCodeToName[$i], " = ", $i, ",\n";
}
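[Aside on the generated table layout: genTables() with indexBits = 9 and charBits = 7 (the CharProp2 case) emits a small trie over code points: a 16-entry plane map, 512-entry page indexes, and 128-entry value pages. A hedged C++ sketch of the lookup such tables support; the function name and the sCharProp2Values table are assumptions inferred from the generated arrays shown later in this diff, and the real accessor lives in nsUnicodeProperties:

    // Assumes the kCharProp2* constants and the sCharProp2Planes/Pages/Values
    // arrays emitted by this script; the return type is the nsCharProps2
    // struct declared above.
    nsCharProps2 GetCharProps2Sketch(uint32_t ch) {
      // BMP code points use page group 0; supplementary planes 1..16 are
      // first mapped through the 16-entry plane table.
      unsigned group = (ch < 0x10000) ? 0 : sCharProp2Planes[(ch >> 16) - 1];
      // The top 9 of the low 16 bits select a page...
      unsigned page = sCharProp2Pages[group][(ch & 0xffff) >> kCharProp2CharBits];
      // ...and the low 7 bits select the entry within that 128-entry page.
      return sCharProp2Values[page][ch & ((1 << kCharProp2CharBits) - 1)];
    }
]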
diff --git a/intl/unicharutil/util/nsUnicodeProperties.h b/intl/unicharutil/util/nsUnicodeProperties.h
index 2ff69d19a5..d3c4717b9a 100644
--- a/intl/unicharutil/util/nsUnicodeProperties.h
+++ b/intl/unicharutil/util/nsUnicodeProperties.h
@@ -44,7 +44,6 @@ enum PairedBracketType {
enum IdentifierType {
IDTYPE_RESTRICTED = 0,
IDTYPE_ALLOWED = 1,
- IDTYPE_ASPIRATIONAL = 2,
};
enum EmojiPresentation {
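[With IDTYPE_ASPIRATIONAL removed, identifier-safety checks reduce to a binary test against the two remaining values. A hedged sketch of such a caller, reusing the lookup sketched above; IsAllowedIdentifierChar is a hypothetical name, and the actual consumer is nsIDNService.cpp, also touched in this commit:

    // Hypothetical helper: only Restricted (0) and Allowed (1) remain.
    bool IsAllowedIdentifierChar(uint32_t ch) {
      return GetCharProps2Sketch(ch).mIdType == IDTYPE_ALLOWED;
    }
]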
diff --git a/intl/unicharutil/util/nsUnicodePropertyData.cpp b/intl/unicharutil/util/nsUnicodePropertyData.cpp
index fc730ac5b5..dccf14bcd2 100644
--- a/intl/unicharutil/util/nsUnicodePropertyData.cpp
+++ b/intl/unicharutil/util/nsUnicodePropertyData.cpp
@@ -11,13 +11,12 @@
*/
/*
- * Created on Wed Jun 22 10:09:48 2022 from UCD data files with version info:
+ * Created on Thu Jun 23 07:44:34 2022 from UCD data files with version info:
*
# Unicode Character Database
-# Date: 2016-06-20, 14:59:00 GMT [KW]
-# © 2016 Unicode®, Inc.
-# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
+# Date: 2017-06-18, 23:32:00 GMT [KW]
+# © 2017 Unicode®, Inc.
# For terms of use, see http://www.unicode.org/terms_of_use.html
#
# For documentation, see the following:
@@ -25,44 +24,20 @@
# UAX #38, "Unicode Han Database (Unihan)"
# UAX #44, "Unicode Character Database."
#
-# The UAXes can be accessed at http://www.unicode.org/versions/Unicode9.0.0/
+# The UAXes can be accessed at http://www.unicode.org/versions/Unicode10.0.0/
This directory contains the final data files
-for the Unicode Character Database, for Version 9.0.0 of the Unicode Standard.
-
-# Scripts-9.0.0.txt
-# Date: 2016-06-01, 10:34:37 GMT
-
-# BidiMirroring-9.0.0.txt
-# Date: 2016-01-21, 22:00:00 GMT [KW, LI]
-
-# BidiBrackets-9.0.0.txt
-# Date: 2016-06-07, 22:30:00 GMT [AG, LI, KW]
-
-# HangulSyllableType-9.0.0.txt
-# Date: 2016-03-02, 18:55:01 GMT
-
-# LineBreak-9.0.0.txt
-# Date: 2016-05-26, 01:00:00 GMT [KW, LI]
-
-# EastAsianWidth-9.0.0.txt
-# Date: 2016-05-27, 17:00:00 GMT [KW, LI]
-
-# DerivedCoreProperties-9.0.0.txt
-# Date: 2016-06-01, 10:34:24 GMT
+for the Unicode Character Database, for Version 10.0.0 of the Unicode Standard.
# IdentifierStatus.txt
-# Date: 2016-06-16, 13:41:30 GMT
-
-# IdentifierType.txt
-# Date: 2016-06-16, 13:41:30 GMT
+# Date: 2017-04-08, 16:13:41 GMT
#
# Unihan_Variants.txt
-# Date: 2016-06-01 07:01:48 GMT [JHJ]
+# Date: 2017-05-14 07:01:48 GMT [JHJ]
-# VerticalOrientation-16.txt
-# Date: 2016-07-23, 01:00:00 GMT [EM, KI, LI]
+# VerticalOrientation-17.txt
+# Date: 2016-10-20, 07:00:00 GMT [EM, KI, LI]
*
* * * * * This file contains MACHINE-GENERATED DATA, do not edit! * * * * *
@@ -71,22 +46,20 @@ for the Unicode Character Database, for Version 9.0.0 of the Unicode Standard.
#include <stdint.h>
#include "harfbuzz/hb.h"
-
-
#define kCharProp2MaxPlane 16
#define kCharProp2IndexBits 9
#define kCharProp2CharBits 7
static const uint8_t sCharProp2Planes[16] = {1,2,3,4,4,4,4,4,4,4,4,4,4,4,3,3};
static const uint8_t sCharProp2Pages[5][512] = {
- {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,34,35,36,37,38,39,40,40,40,41,16,16,42,43,44,16,16,16,16,16,16,16,45,16,16,46,47,48,49,50,51,52,53,54,16,55,56,57,34,16,58,59,34,60,61,16,16,16,16,16,16,62,63,16,16,64,65,16,34,34,34,66,67,68,69,34,34,70,34,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,72,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,73,40,40,40,40,40,40,40,40,40,74,16,16,75,76,77,78,16,16,79,80,81,16,82,16,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,83,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,84,34,16,16,16,16,16,16,85,16,86,87},
- {16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,88,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,34,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,34,34,34,34,34,34,34,34,89,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,34,34,34,34,34,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,90,91,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,34,34,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,34,34,34,34,16,16,34,16,16,16,16,16,16,16,16,16,34,34,34,34,34,89,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,34,34,34,34,92,34,34,34,34,34,34,34,34,34,34,34,16,16,34,34,16,16,16,16,16,16,16,16,16,16,16,16},
- {71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,93,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,94,71,95,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,96,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,97},
- {34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,97},
+ {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,34,35,36,37,38,39,34,34,34,34,16,16,40,16,41,16,16,16,16,16,16,16,42,16,16,43,44,45,46,47,48,49,50,51,16,52,53,54,34,16,55,56,34,57,58,16,16,16,16,16,16,59,60,16,16,61,62,16,34,34,34,63,64,65,66,34,34,67,34,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,69,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,70,34,34,34,34,34,34,34,34,34,71,16,16,72,73,74,75,16,16,76,77,78,16,79,16,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,80,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,81,34,16,16,16,16,16,16,82,16,83,84},
+ {16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,85,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,34,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,34,34,34,34,34,34,34,34,86,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,34,34,34,34,34,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,76,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,87,68,88,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,34,34,34,34,16,16,34,16,16,16,16,16,16,16,16,16,34,34,34,34,34,86,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,34,34,34,34,89,34,34,34,34,34,34,34,34,34,34,34,16,16,34,34,16,16,16,16,16,16,16,16,16,16,16,16},
+ {68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,90,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,91,68,92,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,93,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,68,94,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,95},
+ {34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34,95},
{16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16}
};
-static const nsCharProps2 sCharProp2Values[98][128] = {
+static const nsCharProps2 sCharProp2Values[96][128] = {
{{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0}},
{{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{0,0},{1,0},{0,0},{1,0},{1,0},{1,0},{1,0},{0,0},{1,0},{1,0},{0,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,0},{1,0},{1,0},{1,0},{0,0},{0,0},{0,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{0,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{0,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1}},
{{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0}},
@@ -106,14 +79,14 @@ static const nsCharProps2 sCharProp2Values[98][128] = {
{{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0}},
{{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0}},
{{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1}},
- {{1,0},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0}},
+ {{1,0},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,0},{1,0},{1,0}},
{{1,0},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,0},{1,0},{1,1},{1,0},{1,0},{1,1},{1,1},{1,0},{1,0},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0}},
- {{1,0},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,0},{1,0},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0}},
+ {{1,0},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,0},{1,0},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1}},
{{1,0},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0}},
{{1,0},{1,0},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,1},{1,1},{1,0},{1,1},{1,0},{1,1},{1,1},{1,0},{1,0},{1,0},{1,1},{1,1},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0}},
{{1,0},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0}},
{{1,1},{1,0},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0}},
- {{1,0},{1,0},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1}},
+ {{1,1},{1,0},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1}},
{{1,0},{1,0},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,1},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0}},
{{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0}},
{{1,0},{1,1},{1,1},{1,0},{1,1},{1,0},{1,0},{1,1},{1,1},{1,0},{1,1},{1,0},{1,0},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,0},{1,1},{1,0},{1,1},{1,0},{1,0},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0}},
@@ -126,12 +99,9 @@ static const nsCharProps2 sCharProp2Values[98][128] = {
{{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1}},
{{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0}},
{{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0}},
- {{1,0},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2}},
- {{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2}},
- {{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,0},{0,0},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2}},
+ {{1,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}},
{{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,0},{1,0},{1,0},{1,0},{1,1},{1,0},{1,0},{1,0},{1,0},{1,1},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0}},
- {{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0}},
- {{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,0},{1,0},{1,0},{1,0},{1,0},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}},
+ {{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}},
{{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0}},
{{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1}},
{{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0}},
@@ -151,17 +121,17 @@ static const nsCharProps2 sCharProp2Values[98][128] = {
{{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0}},
{{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0}},
{{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{1,0},{1,0},{1,0},{1,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}},
- {{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,0},{1,0},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,2}},
+ {{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0}},
{{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0}},
{{0,0},{2,0},{2,0},{0,0},{0,0},{0,1},{0,1},{0,1},{3,0},{3,0},{3,0},{3,0},{3,0},{3,0},{3,0},{3,0},{3,0},{3,0},{0,0},{0,0},{3,0},{3,0},{3,0},{3,0},{3,0},{3,0},{3,0},{3,0},{3,0},{3,0},{3,0},{3,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{3,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{2,1},{0,1},{2,1},{0,1},{2,1},{0,1},{2,1},{0,1},{2,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{2,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1}},
{{0,1},{0,1},{0,1},{2,1},{0,1},{2,1},{0,1},{2,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{2,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{2,1},{2,1},{0,0},{0,0},{0,1},{0,1},{2,0},{2,0},{0,1},{0,1},{0,0},{3,1},{2,1},{0,1},{2,1},{0,1},{2,1},{0,1},{2,1},{0,1},{2,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{2,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{2,1},{0,1},{2,1},{0,1},{2,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{2,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{2,1},{2,1},{0,1},{0,1},{0,1},{0,1},{0,1},{3,1},{0,1},{0,1},{0,0}},
- {{0,0},{0,0},{0,0},{0,0},{0,0},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{2,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}},
+ {{0,0},{0,0},{0,0},{0,0},{0,0},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{2,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}},
{{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0}},
{{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{2,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{2,0},{2,0},{2,0},{2,0},{2,0}},
{{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1}},
{{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}},
- {{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}},
- {{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,2},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0}},
+ {{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}},
+ {{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0}},
{{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,1}},
{{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0}},
{{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0}},
@@ -177,18 +147,16 @@ static const nsCharProps2 sCharProp2Values[98][128] = {
{{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{0,0},{0,0},{0,0},{3,0},{0,0},{0,0},{0,0},{0,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{1,0},{1,0},{1,0},{0,0},{0,0},{1,0},{1,0}},
{{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0}},
{{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0}},
- {{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,0}},
- {{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,2},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}},
+ {{0,0},{0,0},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1}},
+ {{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,1},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0}},
{{2,0},{2,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}},
{{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}},
{{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1}},
{{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,0},{0,0},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1}},
- {{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}},
+ {{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1}},
+ {{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}},
{{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{1,0},{1,0}}
};
-
-
-
#define kFullWidthMaxPlane 0
#define kFullWidthIndexBits 10
#define kFullWidthCharBits 6
@@ -207,9 +175,6 @@ static const uint16_t sFullWidthValues[9][64] = {
{0x30bf,0x30c1,0x30c4,0x30c6,0x30c8,0x30ca,0x30cb,0x30cc,0x30cd,0x30ce,0x30cf,0x30d2,0x30d5,0x30d8,0x30db,0x30de,0x30df,0x30e0,0x30e1,0x30e2,0x30e4,0x30e6,0x30e8,0x30e9,0x30ea,0x30eb,0x30ec,0x30ed,0x30ef,0x30f3,0x3099,0x309a,0x3164,0x3131,0x3132,0x3133,0x3134,0x3135,0x3136,0x3137,0x3138,0x3139,0x313a,0x313b,0x313c,0x313d,0x313e,0x313f,0x3140,0x3141,0x3142,0x3143,0x3144,0x3145,0x3146,0x3147,0x3148,0x3149,0x314a,0x314b,0x314c,0x314d,0x314e,0x0000},
{0x0000,0x0000,0x314f,0x3150,0x3151,0x3152,0x3153,0x3154,0x0000,0x0000,0x3155,0x3156,0x3157,0x3158,0x3159,0x315a,0x0000,0x0000,0x315b,0x315c,0x315d,0x315e,0x315f,0x3160,0x0000,0x0000,0x3161,0x3162,0x3163,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x2502,0x2190,0x2191,0x2192,0x2193,0x25a0,0x25cb,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000}
};
-
-
-
#define kFullWidthInverseMaxPlane 0
#define kFullWidthInverseIndexBits 10
#define kFullWidthInverseCharBits 6
@@ -232,13 +197,6 @@ static const uint16_t sFullWidthInverseValues[13][64] = {
{0x0060,0x0061,0x0062,0x0063,0x0064,0x0065,0x0066,0x0067,0x0068,0x0069,0x006a,0x006b,0x006c,0x006d,0x006e,0x006f,0x0070,0x0071,0x0072,0x0073,0x0074,0x0075,0x0076,0x0077,0x0078,0x0079,0x007a,0x007b,0x007c,0x007d,0x007e,0x2985,0x2986,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000},
{0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x00a2,0x00a3,0x00ac,0x00af,0x00a6,0x00a5,0x20a9,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000}
};
-
-const uint32_t kTitleToUpper = 0x80000000;
-const uint32_t kUpperToLower = 0x40000000;
-const uint32_t kLowerToTitle = 0x20000000;
-const uint32_t kLowerToUpper = 0x10000000;
-const uint32_t kCaseMapCharMask = 0x001fffff;
-
/*
* * * * * This file contains MACHINE-GENERATED DATA, do not edit! * * * * *
*/
diff --git a/intl/unicharutil/util/nsUnicodeScriptCodes.h b/intl/unicharutil/util/nsUnicodeScriptCodes.h
index 8cbc2b6a4a..b69148386e 100644
--- a/intl/unicharutil/util/nsUnicodeScriptCodes.h
+++ b/intl/unicharutil/util/nsUnicodeScriptCodes.h
@@ -11,13 +11,12 @@
*/
/*
- * Created on Wed Jun 22 10:09:48 2022 from UCD data files with version info:
+ * Created on Thu Jun 23 07:44:34 2022 from UCD data files with version info:
*
# Unicode Character Database
-# Date: 2016-06-20, 14:59:00 GMT [KW]
-# © 2016 Unicode®, Inc.
-# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
+# Date: 2017-06-18, 23:32:00 GMT [KW]
+# © 2017 Unicode®, Inc.
# For terms of use, see http://www.unicode.org/terms_of_use.html
#
# For documentation, see the following:
@@ -25,44 +24,20 @@
# UAX #38, "Unicode Han Database (Unihan)"
# UAX #44, "Unicode Character Database."
#
-# The UAXes can be accessed at http://www.unicode.org/versions/Unicode9.0.0/
+# The UAXes can be accessed at http://www.unicode.org/versions/Unicode10.0.0/
This directory contains the final data files
-for the Unicode Character Database, for Version 9.0.0 of the Unicode Standard.
-
-# Scripts-9.0.0.txt
-# Date: 2016-06-01, 10:34:37 GMT
-
-# BidiMirroring-9.0.0.txt
-# Date: 2016-01-21, 22:00:00 GMT [KW, LI]
-
-# BidiBrackets-9.0.0.txt
-# Date: 2016-06-07, 22:30:00 GMT [AG, LI, KW]
-
-# HangulSyllableType-9.0.0.txt
-# Date: 2016-03-02, 18:55:01 GMT
-
-# LineBreak-9.0.0.txt
-# Date: 2016-05-26, 01:00:00 GMT [KW, LI]
-
-# EastAsianWidth-9.0.0.txt
-# Date: 2016-05-27, 17:00:00 GMT [KW, LI]
-
-# DerivedCoreProperties-9.0.0.txt
-# Date: 2016-06-01, 10:34:24 GMT
+for the Unicode Character Database, for Version 10.0.0 of the Unicode Standard.
# IdentifierStatus.txt
-# Date: 2016-06-16, 13:41:30 GMT
-
-# IdentifierType.txt
-# Date: 2016-06-16, 13:41:30 GMT
+# Date: 2017-04-08, 16:13:41 GMT
#
# Unihan_Variants.txt
-# Date: 2016-06-01 07:01:48 GMT [JHJ]
+# Date: 2017-05-14 07:01:48 GMT [JHJ]
-# VerticalOrientation-16.txt
-# Date: 2016-07-23, 01:00:00 GMT [EM, KI, LI]
+# VerticalOrientation-17.txt
+# Date: 2016-10-20, 07:00:00 GMT [EM, KI, LI]
*
* * * * * This file contains MACHINE-GENERATED DATA, do not edit! * * * * *
@@ -74,7 +49,6 @@ for the Unicode Character Database, for Version 9.0.0 of the Unicode Standard.
#pragma pack(1)
-
struct nsCharProps2 {
// Currently only 4 bits are defined here, so 4 more could be added without
// affecting the storage requirements for this struct. Or we could pack two
@@ -83,13 +57,11 @@ struct nsCharProps2 {
unsigned char mIdType:2;
};
-
-
#pragma pack()
namespace mozilla {
namespace unicode {
-enum class Script {
+enum class Script : int16_t {
COMMON = 0,
INHERITED = 1,
ARABIC = 2,
diff --git a/ipc/chromium/src/base/condition_variable_posix.cc b/ipc/chromium/src/base/condition_variable_posix.cc
index 58565541e8..4a8024c2bc 100644
--- a/ipc/chromium/src/base/condition_variable_posix.cc
+++ b/ipc/chromium/src/base/condition_variable_posix.cc
@@ -19,8 +19,7 @@ using base::TimeDelta;
ConditionVariable::ConditionVariable(Lock* user_lock)
: user_mutex_(user_lock->lock_impl()->os_lock()) {
int rv = 0;
-#if !defined(OS_MACOSX) && \
- defined(HAVE_PTHREAD_COND_TIMEDWAIT_MONOTONIC)
+#if !defined(OS_MACOSX)
pthread_condattr_t attrs;
rv = pthread_condattr_init(&attrs);
DCHECK_EQ(0, rv);
diff --git a/media/libjpeg/1050342.diff b/media/libjpeg/1050342.diff
deleted file mode 100644
index a409ca2c47..0000000000
--- a/media/libjpeg/1050342.diff
+++ /dev/null
@@ -1,121 +0,0 @@
-Bug 1050342. Fix a case where the fast huffman decoder in libjpeg-turbo can produce different results depending on how data is fed to it.
-
-This change comes from the blink repo https://codereview.appspot.com/229430043/ and is unlikely to be accepted upstream into libjpeg-turbo.
-
-diff --git jdhuff.c jdhuff.c
---- jdhuff.c
-+++ jdhuff.c
-@@ -664,17 +664,17 @@ decode_mcu_fast (j_decompress_ptr cinfo,
- ASSIGN_STATE(state, entropy->saved);
-
- for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
- JBLOCKROW block = MCU_data ? MCU_data[blkn] : NULL;
- d_derived_tbl *dctbl = entropy->dc_cur_tbls[blkn];
- d_derived_tbl *actbl = entropy->ac_cur_tbls[blkn];
- register int s, k, r, l;
-
-- HUFF_DECODE_FAST(s, l, dctbl);
-+ HUFF_DECODE_FAST(s, l, dctbl, slow_decode_mcu);
- if (s) {
- FILL_BIT_BUFFER_FAST
- r = GET_BITS(s);
- s = HUFF_EXTEND(r, s);
- }
-
- if (entropy->dc_needed[blkn]) {
- int ci = cinfo->MCU_membership[blkn];
-@@ -682,17 +682,17 @@ decode_mcu_fast (j_decompress_ptr cinfo,
- state.last_dc_val[ci] = s;
- if (block)
- (*block)[0] = (JCOEF) s;
- }
-
- if (entropy->ac_needed[blkn] && block) {
-
- for (k = 1; k < DCTSIZE2; k++) {
-- HUFF_DECODE_FAST(s, l, actbl);
-+ HUFF_DECODE_FAST(s, l, actbl, slow_decode_mcu);
- r = s >> 4;
- s &= 15;
-
- if (s) {
- k += r;
- FILL_BIT_BUFFER_FAST
- r = GET_BITS(s);
- s = HUFF_EXTEND(r, s);
-@@ -701,33 +701,34 @@ decode_mcu_fast (j_decompress_ptr cinfo,
- if (r != 15) break;
- k += 15;
- }
- }
-
- } else {
-
- for (k = 1; k < DCTSIZE2; k++) {
-- HUFF_DECODE_FAST(s, l, actbl);
-+ HUFF_DECODE_FAST(s, l, actbl, slow_decode_mcu);
- r = s >> 4;
- s &= 15;
-
- if (s) {
- k += r;
- FILL_BIT_BUFFER_FAST
- DROP_BITS(s);
- } else {
- if (r != 15) break;
- k += 15;
- }
- }
- }
- }
-
- if (cinfo->unread_marker != 0) {
-+slow_decode_mcu:
- cinfo->unread_marker = 0;
- return FALSE;
- }
-
- br_state.bytes_in_buffer -= (buffer - br_state.next_input_byte);
- br_state.next_input_byte = buffer;
- BITREAD_SAVE_STATE(cinfo,entropy->bitstate);
- ASSIGN_STATE(entropy->saved, state);
-diff --git jdhuff.h jdhuff.h
---- jdhuff.h
-+++ jdhuff.h
-@@ -203,32 +203,34 @@ EXTERN(boolean) jpeg_fill_bit_buffer
- } else { \
- slowlabel: \
- if ((result=jpeg_huff_decode(&state,get_buffer,bits_left,htbl,nb)) < 0) \
- { failaction; } \
- get_buffer = state.get_buffer; bits_left = state.bits_left; \
- } \
- }
-
--#define HUFF_DECODE_FAST(s,nb,htbl) \
-+#define HUFF_DECODE_FAST(s,nb,htbl,slowlabel) \
- FILL_BIT_BUFFER_FAST; \
- s = PEEK_BITS(HUFF_LOOKAHEAD); \
- s = htbl->lookup[s]; \
- nb = s >> HUFF_LOOKAHEAD; \
- /* Pre-execute the common case of nb <= HUFF_LOOKAHEAD */ \
- DROP_BITS(nb); \
- s = s & ((1 << HUFF_LOOKAHEAD) - 1); \
- if (nb > HUFF_LOOKAHEAD) { \
- /* Equivalent of jpeg_huff_decode() */ \
- /* Don't use GET_BITS() here because we don't want to modify bits_left */ \
- s = (get_buffer >> bits_left) & ((1 << (nb)) - 1); \
- while (s > htbl->maxcode[nb]) { \
- s <<= 1; \
- s |= GET_BITS(1); \
- nb++; \
- } \
-- s = htbl->pub->huffval[ (int) (s + htbl->valoffset[nb]) & 0xFF ]; \
-+ if (nb > 16) \
-+ goto slowlabel; \
-+ s = htbl->pub->huffval[ (int) (s + htbl->valoffset[nb]) ]; \
- }
-
- /* Out-of-line case for Huffman code fetching */
- EXTERN(int) jpeg_huff_decode
- (bitread_working_state *state, register bit_buf_type get_buffer,
- register int bits_left, d_derived_tbl *htbl, int min_bits);
diff --git a/media/libjpeg/ChangeLog.md b/media/libjpeg/ChangeLog.md
new file mode 100644
index 0000000000..e6700c3c27
--- /dev/null
+++ b/media/libjpeg/ChangeLog.md
@@ -0,0 +1,1839 @@
+2.1.3
+=====
+
+### Significant changes relative to 2.1.2
+
+1. Fixed a regression introduced by 2.0 beta1[7] whereby cjpeg compressed PGM
+input files into full-color JPEG images unless the `-grayscale` option was
+used.
+
+2. cjpeg now automatically compresses GIF and 8-bit BMP input files into
+grayscale JPEG images if the input files contain only shades of gray.
+
+3. The build system now enables the intrinsics implementation of the AArch64
+(Arm 64-bit) Neon SIMD extensions by default when using GCC 12 or later.
+
+4. Fixed a segfault that occurred while decompressing a 4:2:0 JPEG image using
+the merged (non-fancy) upsampling algorithms (that is, with
+`cinfo.do_fancy_upsampling` set to `FALSE`) along with `jpeg_crop_scanline()`.
+Specifically, the segfault occurred if the number of bytes remaining in the
+output buffer was less than the number of bytes required to represent one
+uncropped scanline of the output image. For that reason, the issue could only
+be reproduced using the libjpeg API, not using djpeg.
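+
+ A minimal sketch of the libjpeg API combination involved (not code from
+this repository; "in.jpg" is a placeholder and error handling is omitted):
+
+    #include <stdio.h>
+    #include "jpeglib.h"
+
+    void decode_cropped(void)
+    {
+      struct jpeg_decompress_struct cinfo;
+      struct jpeg_error_mgr jerr;
+      FILE *infile = fopen("in.jpg", "rb");
+
+      cinfo.err = jpeg_std_error(&jerr);
+      jpeg_create_decompress(&cinfo);
+      jpeg_stdio_src(&cinfo, infile);
+      jpeg_read_header(&cinfo, TRUE);
+
+      cinfo.do_fancy_upsampling = FALSE;  /* merged (non-fancy) upsampling */
+      jpeg_start_decompress(&cinfo);
+
+      JDIMENSION xoff = 16, width = 64;
+      jpeg_crop_scanline(&cinfo, &xoff, &width);  /* may widen the region */
+
+      /* Each row buffer must hold the full (possibly widened) cropped
+         width, i.e. cinfo.output_width * cinfo.output_components bytes. */
+      JSAMPARRAY buf = (*cinfo.mem->alloc_sarray)(
+          (j_common_ptr)&cinfo, JPOOL_IMAGE,
+          cinfo.output_width * cinfo.output_components, 1);
+      while (cinfo.output_scanline < cinfo.output_height)
+        jpeg_read_scanlines(&cinfo, buf, 1);
+
+      jpeg_finish_decompress(&cinfo);
+      jpeg_destroy_decompress(&cinfo);
+      fclose(infile);
+    }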
+
+
+2.1.2
+=====
+
+### Significant changes relative to 2.1.1
+
+1. Fixed a regression introduced by 2.1 beta1[13] that caused the remaining
+GAS implementations of AArch64 (Arm 64-bit) Neon SIMD functions (which are used
+by default with GCC for performance reasons) to be placed in the `.rodata`
+section rather than in the `.text` section. This caused the GNU linker to
+automatically place the `.rodata` section in an executable segment, which
+prevented libjpeg-turbo from working properly with other linkers and also
+represented a potential security risk.
+
+2. Fixed an issue whereby the `tjTransform()` function incorrectly computed the
+MCU block size for 4:4:4 JPEG images with non-unary sampling factors and thus
+unduly rejected some cropping regions, even though those regions aligned with
+8x8 MCU block boundaries.
+
+3. Fixed a regression introduced by 2.1 beta1[13] that caused the build system
+to enable the Arm Neon SIMD extensions when targeting Armv6 and other legacy
+architectures that do not support Neon instructions.
+
+4. libjpeg-turbo now performs run-time detection of AltiVec instructions on
+FreeBSD/PowerPC systems if AltiVec instructions are not enabled at compile
+time. This allows both AltiVec-equipped and non-AltiVec-equipped CPUs to be
+supported using the same build of libjpeg-turbo.
+
+5. cjpeg now accepts a `-strict` argument similar to that of djpeg and
+jpegtran, which causes the compressor to abort if an LZW-compressed GIF input
+image contains incomplete or corrupt image data.
+
+
+2.1.1
+=====
+
+### Significant changes relative to 2.1.0
+
+1. Fixed a regression introduced in 2.1.0 that caused build failures with
+non-GCC-compatible compilers for Un*x/Arm platforms.
+
+2. Fixed a regression introduced by 2.1 beta1[13] that prevented the Arm 32-bit
+(AArch32) Neon SIMD extensions from building unless the C compiler flags
+included `-mfloat-abi=softfp` or `-mfloat-abi=hard`.
+
+3. Fixed an issue in the AArch32 Neon SIMD Huffman encoder whereby reliance on
+undefined C compiler behavior led to crashes ("SIGBUS: illegal alignment") on
+Android systems when running AArch32/Thumb builds of libjpeg-turbo built with
+recent versions of Clang.
+
+4. Added a command-line argument (`-copy icc`) to jpegtran that causes it to
+copy only the ICC profile markers from the source file and discard any other
+metadata.
+
+5. libjpeg-turbo should now build and run on CHERI-enabled architectures, which
+use capability pointers that are larger than the size of `size_t`.
+
+6. Fixed a regression (CVE-2021-37972) introduced by 2.1 beta1[5] that caused a
+segfault in the 64-bit SSE2 Huffman encoder when attempting to losslessly
+transform a specially-crafted malformed JPEG image.
+
+
+2.1.0
+=====
+
+### Significant changes relative to 2.1 beta1
+
+1. Fixed a regression introduced by 2.1 beta1[6(b)] whereby attempting to
+decompress certain progressive JPEG images with one or more component planes of
+width 8 or less caused a buffer overrun.
+
+2. Fixed a regression introduced by 2.1 beta1[6(b)] whereby attempting to
+decompress a specially-crafted malformed progressive JPEG image caused the
+block smoothing algorithm to read from uninitialized memory.
+
+3. Fixed an issue in the Arm Neon SIMD Huffman encoders that caused the
+encoders to generate incorrect results when using the Clang compiler with
+Visual Studio.
+
+4. Fixed a floating point exception (CVE-2021-20205) that occurred when
+attempting to compress a specially-crafted malformed GIF image with a specified
+image width of 0 using cjpeg.
+
+5. Fixed a regression introduced by 2.0 beta1[15] whereby attempting to
+generate a progressive JPEG image on an SSE2-capable CPU using a scan script
+containing one or more scans with lengths divisible by 32 and non-zero
+successive approximation low bit positions would, under certain circumstances,
+result in an error ("Missing Huffman code table entry") and an invalid JPEG
+image.
+
+6. Introduced a new flag (`TJFLAG_LIMITSCANS` in the TurboJPEG C API and
+`TJ.FLAG_LIMIT_SCANS` in the TurboJPEG Java API) and a corresponding TJBench
+command-line argument (`-limitscans`) that causes the TurboJPEG decompression
+and transform functions/operations to return/throw an error if a progressive
+JPEG image contains an unreasonably large number of scans. This allows
+applications that use the TurboJPEG API to guard against an exploit of the
+progressive JPEG format described in the report
+["Two Issues with the JPEG Standard"](https://libjpeg-turbo.org/pmwiki/uploads/About/TwoIssueswiththeJPEGStandard.pdf).
+
+7. The PPM reader now throws an error, rather than segfaulting (due to a buffer
+overrun) or generating incorrect pixels, if an application attempts to use the
+`tjLoadImage()` function to load a 16-bit binary PPM file (a binary PPM file
+with a maximum value greater than 255) into a grayscale image buffer or to load
+a 16-bit binary PGM file into an RGB image buffer.
+
+8. Fixed an issue in the PPM reader that caused incorrect pixels to be
+generated when using the `tjLoadImage()` function to load a 16-bit binary PPM
+file into an extended RGB image buffer.
+
+9. Fixed an issue whereby, if a JPEG buffer was automatically re-allocated by
+one of the TurboJPEG compression or transform functions and an error
+subsequently occurred during compression or transformation, the JPEG buffer
+pointer passed by the application was not updated when the function returned.
+
+
+2.0.90 (2.1 beta1)
+==================
+
+### Significant changes relative to 2.0.6:
+
+1. The build system, x86-64 SIMD extensions, and accelerated Huffman codec now
+support the x32 ABI on Linux, which allows for using x86-64 instructions with
+32-bit pointers. The x32 ABI is generally enabled by adding `-mx32` to the
+compiler flags.
+
+ Caveats:
+ - CMake 3.9.0 or later is required in order for the build system to
+automatically detect an x32 build.
+ - Java does not support the x32 ABI, and thus the TurboJPEG Java API will
+automatically be disabled with x32 builds.
+
+2. Added Loongson MMI SIMD implementations of the RGB-to-grayscale, 4:2:2 fancy
+chroma upsampling, 4:2:2 and 4:2:0 merged chroma upsampling/color conversion,
+and fast integer DCT/IDCT algorithms. Relative to libjpeg-turbo 2.0.x, this
+speeds up:
+
+ - the compression of RGB source images into grayscale JPEG images by
+approximately 20%
+ - the decompression of 4:2:2 JPEG images by approximately 40-60% when
+using fancy upsampling
+ - the decompression of 4:2:2 and 4:2:0 JPEG images by approximately
+15-20% when using merged upsampling
+ - the compression of RGB source images by approximately 30-45% when using
+the fast integer DCT
+ - the decompression of JPEG images into RGB destination images by
+approximately 2x when using the fast integer IDCT
+
+ The overall decompression speedup for RGB images is now approximately
+2.3-3.7x (compared to 2-3.5x with libjpeg-turbo 2.0.x.)
+
+3. 32-bit (Armv7 or Armv7s) iOS builds of libjpeg-turbo are no longer
+supported, and the libjpeg-turbo build system can no longer be used to package
+such builds. 32-bit iOS apps cannot run in iOS 11 and later, and the App Store
+no longer allows them.
+
+4. 32-bit (i386) OS X/macOS builds of libjpeg-turbo are no longer supported,
+and the libjpeg-turbo build system can no longer be used to package such
+builds. 32-bit Mac applications cannot run in macOS 10.15 "Catalina" and
+later, and the App Store no longer allows them.
+
+5. The SSE2 (x86 SIMD) and C Huffman encoding algorithms have been
+significantly optimized, resulting in a measured average overall compression
+speedup of 12-28% for 64-bit code and 22-52% for 32-bit code on various Intel
+and AMD CPUs, as well as a measured average overall compression speedup of
+0-23% on platforms that do not have a SIMD-accelerated Huffman encoding
+implementation.
+
+6. The block smoothing algorithm that is applied by default when decompressing
+progressive Huffman-encoded JPEG images has been improved in the following
+ways:
+
+ - The algorithm is now more fault-tolerant. Previously, if a particular
+scan was incomplete, then the smoothing parameters for the incomplete scan
+would be applied to the entire output image, including the parts of the image
+that were generated by the prior (complete) scan. Visually, this had the
+effect of removing block smoothing from lower-frequency scans if they were
+followed by an incomplete higher-frequency scan. libjpeg-turbo now applies
+block smoothing parameters to each iMCU row based on which scan generated the
+pixels in that row, rather than always using the block smoothing parameters for
+the most recent scan.
+ - When applying block smoothing to DC scans, a Gaussian-like kernel with a
+5x5 window is used to reduce the "blocky" appearance.
+
+7. Added SIMD acceleration for progressive Huffman encoding on Arm platforms.
+This speeds up the compression of full-color progressive JPEGs by about 30-40%
+on average (relative to libjpeg-turbo 2.0.x) when using modern Arm CPUs.
+
+8. Added configure-time and run-time auto-detection of Loongson MMI SIMD
+instructions, so that the Loongson MMI SIMD extensions can be included in any
+MIPS64 libjpeg-turbo build.
+
+9. Added fault tolerance features to djpeg and jpegtran, mainly to demonstrate
+methods by which applications can guard against the exploits of the JPEG format
+described in the report
+["Two Issues with the JPEG Standard"](https://libjpeg-turbo.org/pmwiki/uploads/About/TwoIssueswiththeJPEGStandard.pdf).
+
+ - Both programs now accept a `-maxscans` argument, which can be used to
+limit the number of allowable scans in the input file.
+ - Both programs now accept a `-strict` argument, which can be used to
+treat all warnings as fatal.
+
+10. CMake package config files are now included for both the libjpeg and
+TurboJPEG API libraries. This facilitates using libjpeg-turbo with CMake's
+`find_package()` function. For example:
+
+ find_package(libjpeg-turbo CONFIG REQUIRED)
+
+ add_executable(libjpeg_program libjpeg_program.c)
+ target_link_libraries(libjpeg_program PUBLIC libjpeg-turbo::jpeg)
+
+ add_executable(libjpeg_program_static libjpeg_program.c)
+ target_link_libraries(libjpeg_program_static PUBLIC
+ libjpeg-turbo::jpeg-static)
+
+ add_executable(turbojpeg_program turbojpeg_program.c)
+ target_link_libraries(turbojpeg_program PUBLIC
+ libjpeg-turbo::turbojpeg)
+
+ add_executable(turbojpeg_program_static turbojpeg_program.c)
+ target_link_libraries(turbojpeg_program_static PUBLIC
+ libjpeg-turbo::turbojpeg-static)
+
+11. Since the Unisys LZW patent has long expired, cjpeg and djpeg can now
+read/write both LZW-compressed and uncompressed GIF files (feature ported from
+jpeg-6a and jpeg-9d.)
+
+12. jpegtran now includes the `-wipe` and `-drop` options from jpeg-9a and
+jpeg-9d, as well as the ability to expand the image size using the `-crop`
+option. Refer to jpegtran.1 or usage.txt for more details.
+
+13. Added a complete intrinsics implementation of the Arm Neon SIMD extensions,
+thus providing SIMD acceleration on Arm platforms for all of the algorithms
+that are SIMD-accelerated on x86 platforms. This new implementation is
+significantly faster in some cases than the old GAS implementation--
+depending on the algorithms used, the type of CPU core, and the compiler. GCC,
+as of this writing, does not provide a full or optimal set of Neon intrinsics,
+so for performance reasons, the default when building libjpeg-turbo with GCC is
+to continue using the GAS implementation of the following algorithms:
+
+ - 32-bit RGB-to-YCbCr color conversion
+ - 32-bit fast and accurate inverse DCT
+ - 64-bit RGB-to-YCbCr and YCbCr-to-RGB color conversion
+ - 64-bit accurate forward and inverse DCT
+ - 64-bit Huffman encoding
+
+ A new CMake variable (`NEON_INTRINSICS`) can be used to override this
+default.
+
+ Since the new intrinsics implementation includes SIMD acceleration
+for merged upsampling/color conversion, 1.5.1[5] is no longer necessary and has
+been reverted.
+
+14. The Arm Neon SIMD extensions can now be built using Visual Studio.
+
+15. The build system can now be used to generate a universal x86-64 + Armv8
+libjpeg-turbo SDK package for both iOS and macOS.
+
+
+2.0.6
+=====
+
+### Significant changes relative to 2.0.5:
+
+1. Fixed "using JNI after critical get" errors that occurred on Android
+platforms when using any of the YUV encoding/compression/decompression/decoding
+methods in the TurboJPEG Java API.
+
+2. Fixed or worked around multiple issues with `jpeg_skip_scanlines()` (a
+usage sketch follows this list):
+
+ - Fixed segfaults or "Corrupt JPEG data: premature end of data segment"
+errors in `jpeg_skip_scanlines()` that occurred when decompressing 4:2:2 or
+4:2:0 JPEG images using merged (non-fancy) upsampling/color conversion (that
+is, when setting `cinfo.do_fancy_upsampling` to `FALSE`.) 2.0.0[6] was a
+similar fix, but it did not cover all cases.
+ - `jpeg_skip_scanlines()` now throws an error if two-pass color
+quantization is enabled. Two-pass color quantization never worked properly
+with `jpeg_skip_scanlines()`, and the issues could not readily be fixed.
+ - Fixed an issue whereby `jpeg_skip_scanlines()` always returned 0 when
+skipping past the end of an image.
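+
+ A sketch of the basic `jpeg_skip_scanlines()` pattern (assumes a started
+decompress object `cinfo` and a row pointer `row` of type `JSAMPROW`; error
+handling omitted):
+
+    /* Skip the top 100 rows, then decode the rest normally.  The return
+       value is the number of rows actually skipped, which may differ from
+       the request. */
+    JDIMENSION skipped = jpeg_skip_scanlines(&cinfo, 100);
+    (void)skipped;
+
+    while (cinfo.output_scanline < cinfo.output_height)
+      jpeg_read_scanlines(&cinfo, &row, 1);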
+
+3. The Arm 64-bit (Armv8) Neon SIMD extensions can now be built using MinGW
+toolchains targeting Arm64 (AArch64) Windows binaries.
+
+4. Fixed unexpected visual artifacts that occurred when using
+`jpeg_crop_scanline()` and interblock smoothing while decompressing only the DC
+scan of a progressive JPEG image.
+
+5. Fixed an issue whereby libjpeg-turbo would not build if 12-bit-per-component
+JPEG support (`WITH_12BIT`) was enabled along with libjpeg v7 or libjpeg v8
+API/ABI emulation (`WITH_JPEG7` or `WITH_JPEG8`.)
+
+
+2.0.5
+=====
+
+### Significant changes relative to 2.0.4:
+
+1. Worked around issues in the MIPS DSPr2 SIMD extensions that caused failures
+in the libjpeg-turbo regression tests. Specifically, the
+`jsimd_h2v1_downsample_dspr2()` and `jsimd_h2v2_downsample_dspr2()` functions
+in the MIPS DSPr2 SIMD extensions are now disabled until/unless they can be
+fixed, and other functions that are incompatible with big endian MIPS CPUs are
+disabled when building libjpeg-turbo for such CPUs.
+
+2. Fixed an oversight in the `TJCompressor.compress(int)` method in the
+TurboJPEG Java API that caused an error ("java.lang.IllegalStateException: No
+source image is associated with this instance") when attempting to use that
+method to compress a YUV image.
+
+3. Fixed an issue (CVE-2020-13790) in the PPM reader that caused a buffer
+overrun in cjpeg, TJBench, or the `tjLoadImage()` function if one of the values
+in a binary PPM/PGM input file exceeded the maximum value defined in the file's
+header and that maximum value was less than 255. libjpeg-turbo 1.5.0 already
+included a similar fix for binary PPM/PGM files with maximum values greater
+than 255.
+
+4. The TurboJPEG API library's global error handler, which is used in functions
+such as `tjBufSize()` and `tjLoadImage()` that do not require a TurboJPEG
+instance handle, is now thread-safe on platforms that support thread-local
+storage.
+
+
+2.0.4
+=====
+
+### Significant changes relative to 2.0.3:
+
+1. Fixed a regression in the Windows packaging system (introduced by
+2.0 beta1[2]) whereby, if both the 64-bit libjpeg-turbo SDK for GCC and the
+64-bit libjpeg-turbo SDK for Visual C++ were installed on the same system, only
+one of them could be uninstalled.
+
+2. Fixed a signed integer overflow and subsequent segfault that occurred when
+attempting to decompress images with more than 715827882 pixels using the
+64-bit C version of TJBench.
+
+3. Fixed an out-of-bounds write in `tjDecompressToYUV2()` and
+`tjDecompressToYUVPlanes()` (sometimes manifesting as a double free) that
+occurred when attempting to decompress grayscale JPEG images that were
+compressed with a sampling factor other than 1 (for instance, with
+`cjpeg -grayscale -sample 2x2`).
+
+4. Fixed a regression introduced by 2.0.2[5] that caused the TurboJPEG API to
+incorrectly identify some JPEG images with unusual sampling factors as 4:4:4
+JPEG images. This was known to cause a buffer overflow when attempting to
+decompress some such images using `tjDecompressToYUV2()` or
+`tjDecompressToYUVPlanes()`.
+
+5. Fixed an issue (CVE-2020-17541), detected by ASan, whereby attempting to
+losslessly transform a specially-crafted malformed JPEG image containing an
+extremely-high-frequency coefficient block (junk image data that could never be
+generated by a legitimate JPEG compressor) could cause the Huffman encoder's
+local buffer to be overrun. (Refer to 1.4.0[9] and 1.4beta1[15].) Given that
+the buffer overrun was fully contained within the stack and did not cause a
+segfault or other user-visible errant behavior, and given that the lossless
+transformer (unlike the decompressor) is not generally exposed to arbitrary
+data exploits, this issue did not likely pose a security risk.
+
+6. The Arm 64-bit (Armv8) Neon SIMD assembly code now stores constants in a
+separate read-only data section rather than in the text section, to support
+execute-only memory layouts.
+
+
+2.0.3
+=====
+
+### Significant changes relative to 2.0.2:
+
+1. Fixed "using JNI after critical get" errors that occurred on Android
+platforms when passing invalid arguments to certain methods in the TurboJPEG
+Java API.
+
+2. Fixed a regression in the SIMD feature detection code, introduced by
+the AVX2 SIMD extensions (2.0 beta1[1]), that was known to cause an illegal
+instruction exception, in rare cases, on CPUs that lack support for CPUID leaf
+07H (or on which the maximum CPUID leaf has been limited by way of a BIOS
+setting.)
+
+3. The 4:4:0 (h1v2) fancy (smooth) chroma upsampling algorithm in the
+decompressor now uses a similar bias pattern to that of the 4:2:2 (h2v1) fancy
+chroma upsampling algorithm, rounding up or down the upsampled result for
+alternate pixels rather than always rounding down. This ensures that,
+regardless of whether a 4:2:2 JPEG image is rotated or transposed prior to
+decompression (in the frequency domain) or after decompression (in the spatial
+domain), the final image will be similar.
+
+4. Fixed an integer overflow and subsequent segfault that occurred when
+attempting to compress or decompress images with more than 1 billion pixels
+using the TurboJPEG API.
+
+5. Fixed a regression introduced by 2.0 beta1[15] whereby attempting to
+generate a progressive JPEG image on an SSE2-capable CPU using a scan script
+containing one or more scans with lengths divisible by 16 would result in an
+error ("Missing Huffman code table entry") and an invalid JPEG image.
+
+6. Fixed an issue whereby `tjDecodeYUV()` and `tjDecodeYUVPlanes()` would throw
+an error ("Invalid progressive parameters") or a warning ("Inconsistent
+progression sequence") if passed a TurboJPEG instance that was previously used
+to decompress a progressive JPEG image.
+
+
+2.0.2
+=====
+
+### Significant changes relative to 2.0.1:
+
+1. Fixed a regression introduced by 2.0.1[5] that prevented a runtime search
+path (rpath) from being embedded in the libjpeg-turbo shared libraries and
+executables for macOS and iOS. This caused a fatal error of the form
+"dyld: Library not loaded" when attempting to use one of the executables,
+unless `DYLD_LIBRARY_PATH` was explicitly set to the location of the
+libjpeg-turbo shared libraries.
+
+2. Fixed an integer overflow and subsequent segfault (CVE-2018-20330) that
+occurred when attempting to load a BMP file with more than 1 billion pixels
+using the `tjLoadImage()` function.
+
+3. Fixed a buffer overrun (CVE-2018-19664) that occurred when attempting to
+decompress a specially-crafted malformed JPEG image to a 256-color BMP using
+djpeg.
+
+4. Fixed a floating point exception that occurred when attempting to
+decompress a specially-crafted malformed JPEG image with a specified image
+width or height of 0 using the C version of TJBench.
+
+5. The TurboJPEG API will now decompress 4:4:4 JPEG images with 2x1, 1x2, 3x1,
+or 1x3 luminance and chrominance sampling factors. This is a non-standard way
+of specifying 1x subsampling (normally 4:4:4 JPEGs have 1x1 luminance and
+chrominance sampling factors), but the JPEG format and the libjpeg API both
+allow it.
+
+6. Fixed a regression introduced by 2.0 beta1[7] that caused djpeg to generate
+incorrect PPM images when used with the `-colors` option.
+
+7. Fixed an issue whereby a static build of libjpeg-turbo (a build in which
+`ENABLE_SHARED` is `0`) could not be installed using the Visual Studio IDE.
+
+8. Fixed a severe performance issue in the Loongson MMI SIMD extensions that
+occurred when compressing RGB images whose image rows were not 64-bit-aligned.
+
+
+2.0.1
+=====
+
+### Significant changes relative to 2.0.0:
+
+1. Fixed a regression introduced with the new CMake-based Un*x build system,
+whereby jconfig.h could cause compiler warnings of the form
+`"HAVE_*_H" redefined` if it was included by downstream Autotools-based
+projects that used `AC_CHECK_HEADERS()` to check for the existence of locale.h,
+stddef.h, or stdlib.h.
+
+2. The `jsimd_quantize_float_dspr2()` and `jsimd_convsamp_float_dspr2()`
+functions in the MIPS DSPr2 SIMD extensions are now disabled at compile time
+if the soft float ABI is enabled. Those functions use instructions that are
+incompatible with the soft float ABI.
+
+3. Fixed a regression in the SIMD feature detection code, introduced by
+the AVX2 SIMD extensions (2.0 beta1[1]), that caused libjpeg-turbo to crash on
+Windows 7 if Service Pack 1 was not installed.
+
+4. Fixed an out-of-bounds read in cjpeg that occurred when attempting to
+compress
+a specially-crafted malformed color-index (8-bit-per-sample) Targa file in
+which some of the samples (color indices) exceeded the bounds of the Targa
+file's color table.
+
+5. Fixed an issue whereby installing a fully static build of libjpeg-turbo
+(a build in which `CFLAGS` contains `-static` and `ENABLE_SHARED` is `0`) would
+fail with "No valid ELF RPATH or RUNPATH entry exists in the file."
+
+
+2.0.0
+=====
+
+### Significant changes relative to 2.0 beta1:
+
+1. The TurboJPEG API can now decompress CMYK JPEG images that have subsampled M
+and Y components (not to be confused with YCCK JPEG images, in which the C/M/Y
+components have been transformed into luma and chroma.) Previously, an error
+was generated ("Could not determine subsampling type for JPEG image") when such
+an image was passed to `tjDecompressHeader3()`, `tjTransform()`,
+`tjDecompressToYUVPlanes()`, `tjDecompressToYUV2()`, or the equivalent Java
+methods.
+
+2. Fixed an issue (CVE-2018-11813) whereby a specially-crafted malformed input
+file (specifically, a file with a valid Targa header but incomplete pixel data)
+would cause cjpeg to generate a JPEG file that was potentially thousands of
+times larger than the input file. The Targa reader in cjpeg was not properly
+detecting that the end of the input file had been reached prematurely, so after
+all valid pixels had been read from the input, the reader injected dummy pixels
+with values of 255 into the JPEG compressor until the number of pixels
+specified in the Targa header had been compressed. The Targa reader in cjpeg
+now behaves like the PPM reader and aborts compression if the end of the input
+file is reached prematurely. Because this issue only affected cjpeg and not
+the underlying library, and because it did not involve any out-of-bounds reads
+or other exploitable behaviors, it was not believed to represent a security
+threat.
+
+3. Fixed an issue whereby the `tjLoadImage()` and `tjSaveImage()` functions
+would produce a "Bogus message code" error message if the underlying bitmap and
+PPM readers/writers threw an error that was specific to the readers/writers
+(as opposed to a general libjpeg API error.)
+
+4. Fixed an issue (CVE-2018-1152) whereby a specially-crafted malformed BMP
+file, one in which the header specified an image width of 1073741824 pixels,
+would trigger a floating point exception (division by zero) in the
+`tjLoadImage()` function when attempting to load the BMP file into a
+4-component image buffer.
+
+5. Fixed an issue whereby certain combinations of calls to
+`jpeg_skip_scanlines()` and `jpeg_read_scanlines()` could trigger an infinite
+loop when decompressing progressive JPEG images that use vertical chroma
+subsampling (for instance, 4:2:0 or 4:4:0.)
+
+6. Fixed a segfault in `jpeg_skip_scanlines()` that occurred when decompressing
+a 4:2:2 or 4:2:0 JPEG image using the merged (non-fancy) upsampling algorithms
+(that is, when setting `cinfo.do_fancy_upsampling` to `FALSE`.)
+
+7. The new CMake-based build system will now disable the MIPS DSPr2 SIMD
+extensions if it detects that the compiler does not support DSPr2 instructions.
+
+8. Fixed an out-of-bounds read in cjpeg (CVE-2018-14498) that occurred when
+attempting to compress a specially-crafted malformed color-index
+(8-bit-per-sample) BMP file in which some of the samples (color indices)
+exceeded the bounds of the BMP file's color table.
+
+9. Fixed a signed integer overflow in the progressive Huffman decoder, detected
+by the Clang and GCC undefined behavior sanitizers, that could be triggered by
+attempting to decompress a specially-crafted malformed JPEG image. This issue
+did not pose a security threat, but removing the warning made it easier to
+detect actual security issues, should they arise in the future.
+
+
+1.5.90 (2.0 beta1)
+==================
+
+### Significant changes relative to 1.5.3:
+
+1. Added AVX2 SIMD implementations of the colorspace conversion, chroma
+downsampling and upsampling, integer quantization and sample conversion, and
+accurate integer DCT/IDCT algorithms. When using the accurate integer DCT/IDCT
+algorithms on AVX2-equipped CPUs, the compression of RGB images is
+approximately 13-36% (avg. 22%) faster (relative to libjpeg-turbo 1.5.x) with
+64-bit code and 11-21% (avg. 17%) faster with 32-bit code, and the
+decompression of RGB images is approximately 9-35% (avg. 17%) faster with
+64-bit code and 7-17% (avg. 12%) faster with 32-bit code. (As tested on a
+3 GHz Intel Core i7. Actual mileage may vary.)
+
+2. Overhauled the build system to use CMake on all platforms, and removed the
+autotools-based build system. This decision resulted from extensive
+discussions within the libjpeg-turbo community. libjpeg-turbo traditionally
+used CMake only for Windows builds, but there was an increasing amount of
+demand to extend CMake support to other platforms. However, because of the
+unique nature of our code base (the need to support different assemblers on
+each platform, the need for Java support, etc.), providing dual build systems
+as other OSS imaging libraries do (including libpng and libtiff) would have
+created a maintenance burden. The use of CMake greatly simplifies some aspects
+of our build system, owing to CMake's built-in support for various assemblers,
+Java, and unit testing, as well as generally fewer quirks that have to be
+worked around in order to implement our packaging system. Eliminating
+autotools puts our project slightly at odds with the traditional practices of
+the OSS community, since most "system libraries" tend to be built with
+autotools, but it is believed that the benefits of this move outweigh the
+risks. In addition to providing a unified build environment, switching to
+CMake allows for the use of various build tools and IDEs that aren't supported
+under autotools, including Xcode, Ninja, and Eclipse. It also eliminates the
+need to install autotools via MacPorts/Homebrew on OS X and allows
+libjpeg-turbo to be configured without the use of a terminal/command prompt.
+Extensive testing was conducted to ensure that all features provided by the
+autotools-based build system are provided by the new build system.
+
+3. The libjpeg API in this version of libjpeg-turbo now includes two additional
+functions, `jpeg_read_icc_profile()` and `jpeg_write_icc_profile()`, that can
+be used to extract ICC profile data from a JPEG file while decompressing or to
+embed ICC profile data in a JPEG file while compressing or transforming. This
+eliminates the need for downstream projects, such as color management libraries
+and browsers, to include their own glueware for accomplishing this.
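+
+ A minimal extraction sketch (`cinfo` is a decompress object with a data
+source already assigned; error handling omitted, and `<stdlib.h>` is assumed
+for `free()`):
+
+    JOCTET *icc_data = NULL;
+    unsigned int icc_len = 0;
+
+    /* Save APP2 markers before reading the header so that the ICC
+       segments are retained for jpeg_read_icc_profile(). */
+    jpeg_save_markers(&cinfo, JPEG_APP0 + 2, 0xFFFF);
+    jpeg_read_header(&cinfo, TRUE);
+
+    if (jpeg_read_icc_profile(&cinfo, &icc_data, &icc_len)) {
+      /* ... hand icc_data/icc_len to a color management library ... */
+      free(icc_data);  /* the caller owns the returned buffer */
+    }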
+
+4. Improved error handling in the TurboJPEG API library (a sketch follows
+this list):
+
+ - Introduced a new function (`tjGetErrorStr2()`) in the TurboJPEG C API
+that allows compression/decompression/transform error messages to be retrieved
+in a thread-safe manner. Retrieving error messages from global functions, such
+as `tjInitCompress()` or `tjBufSize()`, is still thread-unsafe, but since those
+functions will only throw errors if passed an invalid argument or if a memory
+allocation failure occurs, thread safety is not as much of a concern.
+ - Introduced a new function (`tjGetErrorCode()`) in the TurboJPEG C API
+and a new method (`TJException.getErrorCode()`) in the TurboJPEG Java API that
+can be used to determine the severity of the last
+compression/decompression/transform error. This allows applications to
+choose whether to ignore warnings (non-fatal errors) from the underlying
+libjpeg API or to treat them as fatal.
+ - Introduced a new flag (`TJFLAG_STOPONWARNING` in the TurboJPEG C API and
+`TJ.FLAG_STOPONWARNING` in the TurboJPEG Java API) that causes the library to
+immediately halt a compression/decompression/transform operation if it
+encounters a warning from the underlying libjpeg API (the default behavior is
+to allow the operation to complete unless a fatal error is encountered.)
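+
+ A sketch of the new flow in the C API (`handle`, `srcBuf`, `width`, and
+`height` are placeholders):
+
+    unsigned char *jpegBuf = NULL;  /* tjCompress2() allocates on demand */
+    unsigned long jpegSize = 0;
+
+    if (tjCompress2(handle, srcBuf, width, 0 /* pitch */, height, TJPF_RGB,
+                    &jpegBuf, &jpegSize, TJSAMP_420, 90 /* quality */,
+                    TJFLAG_STOPONWARNING) < 0) {
+      /* Per-handle, thread-safe error reporting */
+      int code = tjGetErrorCode(handle);  /* TJERR_WARNING or TJERR_FATAL */
+      fprintf(stderr, "%s error: %s\n",
+              code == TJERR_WARNING ? "non-fatal" : "fatal",
+              tjGetErrorStr2(handle));
+    }
+
+    tjFree(jpegBuf);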
+
+5. Introduced a new flag in the TurboJPEG C and Java APIs (`TJFLAG_PROGRESSIVE`
+and `TJ.FLAG_PROGRESSIVE`, respectively) that causes the library to use
+progressive entropy coding in JPEG images generated by compression and
+transform operations. Additionally, a new transform option
+(`TJXOPT_PROGRESSIVE` in the C API and `TJTransform.OPT_PROGRESSIVE` in the
+Java API) has been introduced, allowing progressive entropy coding to be
+enabled for selected transforms in a multi-transform operation.
+
+6. Introduced a new transform option in the TurboJPEG API (`TJXOPT_COPYNONE` in
+the C API and `TJTransform.OPT_COPYNONE` in the Java API) that allows the
+copying of markers (including EXIF and ICC profile data) to be disabled for a
+particular transform.
+
+7. Added two functions to the TurboJPEG C API (`tjLoadImage()` and
+`tjSaveImage()`) that can be used to load/save a BMP or PPM/PGM image to/from a
+memory buffer with a specified pixel format and layout. These functions
+replace the project-private (and slow) bmp API, which was previously used by
+TJBench, and they also provide a convenient way for first-time users of
+libjpeg-turbo to quickly develop a complete JPEG compression/decompression
+program.
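+
+ A minimal load/re-save sketch ("in.bmp" and "out.ppm" are placeholders;
+error handling omitted):
+
+    int w, h, pf = TJPF_UNKNOWN;  /* let the reader pick the file's format */
+    unsigned char *pixels = tjLoadImage("in.bmp", &w, 1 /* align */, &h,
+                                        &pf, 0);
+
+    if (pixels) {
+      tjSaveImage("out.ppm", pixels, w, 0 /* pitch */, h, pf, 0);
+      tjFree(pixels);  /* buffers from tjLoadImage() are freed with tjFree() */
+    }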
+
+8. The TurboJPEG C API now includes a new convenience array (`tjAlphaOffset[]`)
+that contains the alpha component index for each pixel format (or -1 if the
+pixel format lacks an alpha component.) The TurboJPEG Java API now includes a
+new method (`TJ.getAlphaOffset()`) that returns the same value. In addition,
+the `tjRedOffset[]`, `tjGreenOffset[]`, and `tjBlueOffset[]` arrays-- and the
+corresponding `TJ.getRedOffset()`, `TJ.getGreenOffset()`, and
+`TJ.getBlueOffset()` methods-- now return -1 for `TJPF_GRAY`/`TJ.PF_GRAY`
+rather than 0. This allows programs to easily determine whether a pixel format
+has red, green, blue, and alpha components.
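+
+ For instance (a sketch; the commented values follow the pixel format
+definitions in turbojpeg.h):
+
+    int alpha = tjAlphaOffset[TJPF_BGRA];  /* 3 */
+    int red   = tjRedOffset[TJPF_GRAY];    /* -1: no red component */
+
+    if (alpha >= 0) {
+      /* this pixel format carries an alpha channel at byte offset 'alpha' */
+    }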
+
+9. Added a new example (tjexample.c) that demonstrates the basic usage of the
+TurboJPEG C API. This example mirrors the functionality of TJExample.java.
+Both files are now included in the libjpeg-turbo documentation.
+
+10. Fixed two signed integer overflows in the arithmetic decoder, detected by
+the Clang undefined behavior sanitizer, that could be triggered by attempting
+to decompress a specially-crafted malformed JPEG image. These issues did not
+pose a security threat, but removing the warnings makes it easier to detect
+actual security issues, should they arise in the future.
+
+11. Fixed a bug in the merged 4:2:0 upsampling/dithered RGB565 color conversion
+algorithm that caused incorrect dithering in the output image. This algorithm
+now produces bitwise-identical results to the unmerged algorithms.
+
+12. The SIMD function symbols for x86[-64]/ELF, MIPS/ELF, macOS/x86[-64] (if
+libjpeg-turbo is built with Yasm), and iOS/Arm[64] builds are now private.
+This prevents those symbols from being exposed in applications or shared
+libraries that link statically with libjpeg-turbo.
+
+13. Added Loongson MMI SIMD implementations of the RGB-to-YCbCr and
+YCbCr-to-RGB colorspace conversion, 4:2:0 chroma downsampling, 4:2:0 fancy
+chroma upsampling, integer quantization, and accurate integer DCT/IDCT
+algorithms. When using the accurate integer DCT/IDCT, this speeds up the
+compression of RGB images by approximately 70-100% and the decompression of RGB
+images by approximately 2-3.5x.
+
+14. Fixed a build error when building with older MinGW releases (regression
+caused by 1.5.1[7].)
+
+15. Added SIMD acceleration for progressive Huffman encoding on SSE2-capable
+x86 and x86-64 platforms. This speeds up the compression of full-color
+progressive JPEGs by about 85-90% on average (relative to libjpeg-turbo 1.5.x)
+when using modern Intel and AMD CPUs.
+
+
+1.5.3
+=====
+
+### Significant changes relative to 1.5.2:
+
+1. Fixed a NullPointerException in the TurboJPEG Java wrapper that occurred
+when using the YUVImage constructor that creates an instance backed by separate
+image planes and allocates memory for the image planes.
+
+2. Fixed an issue whereby the Java version of TJUnitTest would fail when
+testing BufferedImage encoding/decoding on big endian systems.
+
+3. Fixed a segfault in djpeg that would occur if an output format other than
+PPM/PGM was selected along with the `-crop` option. The `-crop` option now
+works with the GIF and Targa formats as well (unfortunately, it cannot be made
+to work with the BMP and RLE formats, because those output engines
+write scanlines in bottom-up order.) djpeg will now exit gracefully if an
+output format other than PPM/PGM, GIF, or Targa is selected along with the
+`-crop` option.
+
+4. Fixed an issue (CVE-2017-15232) whereby `jpeg_skip_scanlines()` would
+segfault if color quantization was enabled.
+
+5. TJBench (both C and Java versions) will now display usage information if any
+command-line argument is unrecognized. This prevents the program from silently
+ignoring typos.
+
+6. Fixed an access violation in tjbench.exe (Windows) that occurred when the
+program was used to decompress an existing JPEG image.
+
+7. Fixed an ArrayIndexOutOfBoundsException in the TJExample Java program that
+occurred when attempting to decompress a JPEG image that had been compressed
+with 4:1:1 chrominance subsampling.
+
+8. Fixed an issue whereby, when using `jpeg_skip_scanlines()` to skip to the
+end of a single-scan (non-progressive) image, subsequent calls to
+`jpeg_consume_input()` would return `JPEG_SUSPENDED` rather than
+`JPEG_REACHED_EOI`.
+
+9. `jpeg_crop_scanline()` now works correctly when decompressing grayscale JPEG
+images that were compressed with a sampling factor other than 1 (for instance,
+with `cjpeg -grayscale -sample 2x2`).
+
+
+1.5.2
+=====
+
+### Significant changes relative to 1.5.1:
+
+1. Fixed a regression introduced by 1.5.1[7] that prevented libjpeg-turbo from
+building with Android NDK platforms prior to android-21 (5.0).
+
+2. Fixed a regression introduced by 1.5.1[1] that prevented the MIPS DSPr2 SIMD
+code in libjpeg-turbo from building.
+
+3. Fixed a regression introduced by 1.5 beta1[11] that prevented the Java
+version of TJBench from outputting any reference images (the `-nowrite` switch
+was accidentally enabled by default.)
+
+4. libjpeg-turbo should now build and run with full AltiVec SIMD acceleration
+on PowerPC-based AmigaOS 4 and OpenBSD systems.
+
+5. Fixed build and runtime errors on Windows that occurred when building
+libjpeg-turbo with libjpeg v7 API/ABI emulation and the in-memory
+source/destination managers. Due to an oversight, the `jpeg_skip_scanlines()`
+and `jpeg_crop_scanline()` functions were not being included in jpeg7.dll when
+libjpeg-turbo was built with `-DWITH_JPEG7=1` and `-DWITH_MEMSRCDST=1`.
+
+6. Fixed "Bogus virtual array access" error that occurred when using the
+lossless crop feature in jpegtran or the TurboJPEG API, if libjpeg-turbo was
+built with libjpeg v7 API/ABI emulation. This was apparently a long-standing
+bug that has existed since the introduction of libjpeg v7/v8 API/ABI emulation
+in libjpeg-turbo v1.1.
+
+7. The lossless transform features in jpegtran and the TurboJPEG API will now
+always attempt to adjust the EXIF image width and height tags if the image size
+changed as a result of the transform. This behavior has always existed when
+using libjpeg v8 API/ABI emulation. It was supposed to be available with
+libjpeg v7 API/ABI emulation as well but did not work properly due to a bug.
+Furthermore, there was never any good reason not to enable it with libjpeg v6b
+API/ABI emulation, since the behavior is entirely internal. Note that
+`-copy all` must be passed to jpegtran in order to transfer the EXIF tags from
+the source image to the destination image.
+
+8. Fixed several memory leaks in the TurboJPEG API library that could occur
+if the library was built with certain compilers and optimization levels
+(known to occur with GCC 4.x and clang with `-O1` and higher but not with
+GCC 5.x or 6.x) and one of the underlying libjpeg API functions threw an error
+after a TurboJPEG API function allocated a local buffer.
+
+9. The libjpeg-turbo memory manager will now honor the `max_memory_to_use`
+structure member in jpeg\_memory\_mgr, which can be set to the maximum amount
+of memory (in bytes) that libjpeg-turbo should use during decompression or
+multi-pass (including progressive) compression. This limit can also be set
+using the `JPEGMEM` environment variable or using the `-maxmemory` switch in
+cjpeg/djpeg/jpegtran (refer to the respective man pages for more details.)
+This has been a documented feature of libjpeg since v5, but the
+`malloc()`/`free()` implementation of the memory manager (jmemnobs.c) never
+implemented the feature. Restricting libjpeg-turbo's memory usage is useful
+for two reasons: it allows testers to more easily work around the 2 GB limit
+in libFuzzer, and it allows developers of security-sensitive applications to
+more easily defend against one of the progressive JPEG exploits (LJT-01-004)
+identified in
+[this report](http://www.libjpeg-turbo.org/pmwiki/uploads/About/TwoIssueswiththeJPEGStandard.pdf).
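+
+ For example, to cap memory usage at 32 MB from C (a sketch; `cinfo` is a
+created compress or decompress object):
+
+    cinfo.mem->max_memory_to_use = 32L * 1024L * 1024L;  /* bytes */
+
+ The same limit can be set externally with `JPEGMEM=32M` or
+`-maxmemory 32M`.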
+
+10. TJBench will now run each benchmark for 1 second prior to starting the
+timer, in order to improve the consistency of the results. Furthermore, the
+`-warmup` option is now used to specify the amount of warmup time rather than
+the number of warmup iterations.
+
+11. Fixed an error (`short jump is out of range`) that occurred when assembling
+the 32-bit x86 SIMD extensions with NASM versions prior to 2.04. This was a
+regression introduced by 1.5 beta1[12].
+
+
+1.5.1
+=====
+
+### Significant changes relative to 1.5.0:
+
+1. Previously, the undocumented `JSIMD_FORCE*` environment variables could be
+used to force-enable a particular SIMD instruction set if multiple instruction
+sets were available on a particular platform. On x86 platforms, where CPU
+feature detection is bulletproof and multiple SIMD instruction sets are
+available, it makes sense for those environment variables to allow forcing the
+use of an instruction set only if that instruction set is available. However,
+since the ARM implementations of libjpeg-turbo can only use one SIMD
+instruction set, and since their feature detection code is less bulletproof
+(parsing /proc/cpuinfo), it makes sense for the `JSIMD_FORCENEON` environment
+variable to bypass the feature detection code and really force the use of NEON
+instructions. A new environment variable (`JSIMD_FORCEDSPR2`) was introduced
+in the MIPS implementation for the same reasons, and the existing
+`JSIMD_FORCENONE` environment variable was extended to that implementation.
+These environment variables provide a workaround for those attempting to test
+ARM and MIPS builds of libjpeg-turbo in QEMU, which passes through
+/proc/cpuinfo from the host system.
+
+2. libjpeg-turbo previously assumed that AltiVec instructions were always
+available on PowerPC platforms, which led to "illegal instruction" errors when
+running on PowerPC chips that lack AltiVec support (such as the older 7xx/G3
+and newer e5500 series.) libjpeg-turbo now examines /proc/cpuinfo on
+Linux/Android systems and enables AltiVec instructions only if the CPU supports
+them. It also now provides two environment variables, `JSIMD_FORCEALTIVEC` and
+`JSIMD_FORCENONE`, to force-enable and force-disable AltiVec instructions in
+environments where /proc/cpuinfo is an unreliable means of CPU feature
+detection (such as when running in QEMU.) On OS X, libjpeg-turbo continues to
+assume that AltiVec support is always available, which means that libjpeg-turbo
+cannot be used with G3 Macs unless you set the environment variable
+`JSIMD_FORCENONE` to `1`.
+
+3. Fixed an issue whereby 64-bit ARM (AArch64) builds of libjpeg-turbo would
+crash when built with recent releases of the Clang/LLVM compiler. This was
+caused by an ABI conformance issue in some of libjpeg-turbo's 64-bit NEON SIMD
+routines. Those routines were incorrectly using 64-bit instructions to
+transfer a 32-bit JDIMENSION argument, whereas the ABI allows the upper
+(unused) 32 bits of a 32-bit argument's register to be undefined. The new
+Clang/LLVM optimizer uses load combining to transfer multiple adjacent 32-bit
+structure members into a single 64-bit register, and this exposed the ABI
+conformance issue.
+
+4. Fancy upsampling is now supported when decompressing JPEG images that use
+4:4:0 (h1v2) chroma subsampling. These images are generated when losslessly
+rotating or transposing JPEG images that use 4:2:2 (h2v1) chroma subsampling.
+The h1v2 fancy upsampling algorithm is not currently SIMD-accelerated.
+
+5. If merged upsampling isn't SIMD-accelerated but YCbCr-to-RGB conversion is,
+then libjpeg-turbo will now disable merged upsampling when decompressing YCbCr
+JPEG images into RGB or extended RGB output images. This significantly speeds
+up the decompression of 4:2:0 and 4:2:2 JPEGs on ARM platforms if fancy
+upsampling is not used (for example, if the `-nosmooth` option to djpeg is
+specified.)
+
+6. The TurboJPEG API will now decompress 4:2:2 and 4:4:0 JPEG images with
+2x2 luminance sampling factors and 2x1 or 1x2 chrominance sampling factors.
+This is a non-standard way of specifying 2x subsampling (normally 4:2:2 JPEGs
+have 2x1 luminance and 1x1 chrominance sampling factors, and 4:4:0 JPEGs have
+1x2 luminance and 1x1 chrominance sampling factors), but the JPEG format and
+the libjpeg API both allow it.
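+
+Such a file could be produced through the libjpeg API with something like the
+following sketch (function name illustrative; error handling omitted):
+
+```c
+#include <stdio.h>
+#include <jpeglib.h>
+
+/* Sketch: configure a compressor for "non-standard" 4:2:2, i.e. 2x2
+   luminance and 1x2 chrominance sampling factors (the same 2x
+   horizontal subsampling ratio as the usual 2x1/1x1 combination). */
+void configure_nonstandard_422(j_compress_ptr cinfo)
+{
+  jpeg_set_defaults(cinfo);               /* requires in_color_space set */
+  cinfo->comp_info[0].h_samp_factor = 2;  /* Y  */
+  cinfo->comp_info[0].v_samp_factor = 2;
+  cinfo->comp_info[1].h_samp_factor = 1;  /* Cb */
+  cinfo->comp_info[1].v_samp_factor = 2;
+  cinfo->comp_info[2].h_samp_factor = 1;  /* Cr */
+  cinfo->comp_info[2].v_samp_factor = 2;
+}
+```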
+
+7. Fixed an unsigned integer overflow in the libjpeg memory manager, detected
+by the Clang undefined behavior sanitizer, that could be triggered by
+attempting to decompress a specially-crafted malformed JPEG image. This issue
+affected only 32-bit code and did not pose a security threat, but removing the
+warning makes it easier to detect actual security issues, should they arise in
+the future.
+
+8. Fixed additional negative left shifts and other issues reported by the GCC
+and Clang undefined behavior sanitizers when attempting to decompress
+specially-crafted malformed JPEG images. None of these issues posed a security
+threat, but removing the warnings makes it easier to detect actual security
+issues, should they arise in the future.
+
+9. Fixed an out-of-bounds array reference, introduced by 1.4.90[2] (partial
+image decompression) and detected by the Clang undefined behavior sanitizer,
+that could be triggered by a specially-crafted malformed JPEG image with more
+than four components. Because the out-of-bounds reference was still within the
+same structure, it was not known to pose a security threat, but removing the
+warning makes it easier to detect actual security issues, should they arise in
+the future.
+
+10. Fixed another ABI conformance issue in the 64-bit ARM (AArch64) NEON SIMD
+code. Some of the routines were incorrectly reading and storing data below the
+stack pointer, which caused segfaults in certain applications under specific
+circumstances.
+
+
+1.5.0
+=====
+
+### Significant changes relative to 1.5 beta1:
+
+1. Fixed an issue whereby a malformed motion-JPEG frame could cause the "fast
+path" of libjpeg-turbo's Huffman decoder to read from uninitialized memory.
+
+2. Added libjpeg-turbo version and build information to the global string table
+of the libjpeg and TurboJPEG API libraries. This is a common practice in other
+infrastructure libraries, such as OpenSSL and libpng, because it makes it easy
+to examine an application binary and determine which version of the library the
+application was linked against.
+
+3. Fixed a couple of issues in the PPM reader that would cause buffer overruns
+in cjpeg if one of the values in a binary PPM/PGM input file exceeded the
+maximum value defined in the file's header and that maximum value was greater
+than 255. libjpeg-turbo 1.4.2 already included a similar fix for ASCII PPM/PGM
+files. Note that these issues were not security bugs, since they were confined
+to the cjpeg program and did not affect any of the libjpeg-turbo libraries.
+
+4. Fixed an issue whereby attempting to decompress a JPEG file with a corrupt
+header using the `tjDecompressToYUV2()` function would cause the function to
+abort without returning an error and, under certain circumstances, corrupt the
+stack. This only occurred if `tjDecompressToYUV2()` was called prior to
+calling `tjDecompressHeader3()`, or if the return value from
+`tjDecompressHeader3()` was ignored (both cases represent incorrect usage of
+the TurboJPEG API.)
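+
+The correct calling sequence, for reference, is shown in this minimal sketch
+(function name illustrative; error handling abbreviated):
+
+```c
+#include <stdio.h>
+#include <stdlib.h>
+#include <turbojpeg.h>
+
+/* Sketch: check the return value of tjDecompressHeader3() before
+   decompressing to YUV. */
+int decompress_to_yuv(const unsigned char *jpegBuf, unsigned long jpegSize)
+{
+  tjhandle handle = tjInitDecompress();
+  int width, height, subsamp, colorspace;
+  unsigned char *yuvBuf;
+
+  if (!handle) return -1;
+  if (tjDecompressHeader3(handle, jpegBuf, jpegSize, &width, &height,
+                          &subsamp, &colorspace) < 0) {
+    fprintf(stderr, "Header error: %s\n", tjGetErrorStr());
+    tjDestroy(handle);
+    return -1;
+  }
+  yuvBuf = (unsigned char *)malloc(tjBufSizeYUV2(width, 1, height, subsamp));
+  if (!yuvBuf || tjDecompressToYUV2(handle, jpegBuf, jpegSize, yuvBuf,
+                                    width, 1, height, 0) < 0) {
+    fprintf(stderr, "Decompress error: %s\n", tjGetErrorStr());
+    free(yuvBuf);
+    tjDestroy(handle);
+    return -1;
+  }
+  /* ... use yuvBuf ... */
+  free(yuvBuf);
+  tjDestroy(handle);
+  return 0;
+}
+```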
+
+5. Fixed an issue in the ARM 32-bit SIMD-accelerated Huffman encoder that
+prevented the code from assembling properly with clang.
+
+6. The `jpeg_stdio_src()`, `jpeg_mem_src()`, `jpeg_stdio_dest()`, and
+`jpeg_mem_dest()` functions in the libjpeg API will now throw an error if a
+source/destination manager has already been assigned to the compress or
+decompress object by a different function or by the calling program. This
+prevents these functions from attempting to reuse a source/destination manager
+structure that was allocated elsewhere, because there is no way to ensure that
+it would be big enough to accommodate the new source/destination manager.
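+
+In other words, each object should have its source or destination manager
+assigned by exactly one mechanism, as in this sketch (function name
+illustrative):
+
+```c
+#include <stdio.h>
+#include <jpeglib.h>
+
+/* Sketch: assign exactly one source manager per decompress object.
+   Calling jpeg_stdio_src() after jpeg_mem_src() (or vice versa) on the
+   same object now raises an error instead of silently reusing the
+   previously allocated manager structure. */
+void decode_from_memory(const unsigned char *buf, unsigned long size)
+{
+  struct jpeg_decompress_struct cinfo;
+  struct jpeg_error_mgr jerr;
+
+  cinfo.err = jpeg_std_error(&jerr);
+  jpeg_create_decompress(&cinfo);
+  jpeg_mem_src(&cinfo, buf, size);  /* sole source manager */
+  jpeg_read_header(&cinfo, TRUE);
+  /* ... decompress ... */
+  jpeg_destroy_decompress(&cinfo);
+}
+```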
+
+
+1.4.90 (1.5 beta1)
+==================
+
+### Significant changes relative to 1.4.2:
+
+1. Added full SIMD acceleration for PowerPC platforms using AltiVec VMX
+(128-bit SIMD) instructions. Although the performance of libjpeg-turbo on
+PowerPC was already good, due to the increased number of registers available
+to the compiler vs. x86, it was still possible to speed up compression by about
+3-4x and decompression by about 2-2.5x (relative to libjpeg v6b) through the
+use of AltiVec instructions.
+
+2. Added two new libjpeg API functions (`jpeg_skip_scanlines()` and
+`jpeg_crop_scanline()`) that can be used to partially decode a JPEG image. See
+[libjpeg.txt](libjpeg.txt) for more details.
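+
+A sketch of the intended calling sequence (see libjpeg.txt for the
+authoritative description; note that `jpeg_crop_scanline()` may widen the
+requested region to an iMCU boundary, so the adjusted values must be read
+back):
+
+```c
+#include <stdio.h>
+#include <jpeglib.h>
+
+/* Sketch: decode only a rectangular region of interest. */
+void decode_region(j_decompress_ptr cinfo, JDIMENSION xoffset,
+                   JDIMENSION yoffset, JDIMENSION width, JDIMENSION height,
+                   JSAMPARRAY rows)
+{
+  JDIMENSION y;
+
+  jpeg_start_decompress(cinfo);
+  jpeg_crop_scanline(cinfo, &xoffset, &width);  /* adjusts both values */
+  jpeg_skip_scanlines(cinfo, yoffset);          /* skip rows above the ROI */
+  for (y = 0; y < height; y++)
+    jpeg_read_scanlines(cinfo, &rows[y], 1);
+  jpeg_skip_scanlines(cinfo, cinfo->output_height - yoffset - height);
+  jpeg_finish_decompress(cinfo);
+}
+```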
+
+3. The TJCompressor and TJDecompressor classes in the TurboJPEG Java API now
+implement the Closeable interface, so those classes can be used with a
+try-with-resources statement.
+
+4. The TurboJPEG Java classes now throw unchecked idiomatic exceptions
+(IllegalArgumentException, IllegalStateException) for unrecoverable errors
+caused by incorrect API usage, and those classes throw a new checked exception
+type (TJException) for errors that are passed through from the C library.
+
+5. Source buffers for the TurboJPEG C API functions, as well as the
+`jpeg_mem_src()` function in the libjpeg API, are now declared as const
+pointers. This facilitates passing read-only buffers to those functions and
+ensures the caller that the source buffer will not be modified. This should
+not create any backward API or ABI incompatibilities with prior libjpeg-turbo
+releases.
+
+6. The MIPS DSPr2 SIMD code can now be compiled to support either FR=0 or FR=1
+FPUs.
+
+7. Fixed additional negative left shifts and other issues reported by the GCC
+and Clang undefined behavior sanitizers. Most of these issues affected only
+32-bit code, and none of them was known to pose a security threat, but removing
+the warnings makes it easier to detect actual security issues, should they
+arise in the future.
+
+8. Removed the unnecessary `.arch` directive from the ARM64 NEON SIMD code.
+This directive was preventing the code from assembling using the clang
+integrated assembler.
+
+9. Fixed a regression caused by 1.4.1[6] that prevented 32-bit and 64-bit
+libjpeg-turbo RPMs from being installed simultaneously on recent Red Hat/Fedora
+distributions. This was due to the addition of a macro in jconfig.h that
+allows the Huffman codec to determine the word size at compile time. Since
+that macro differs between 32-bit and 64-bit builds, this caused a conflict
+between the i386 and x86_64 RPMs (any differing files, other than executables,
+are not allowed when 32-bit and 64-bit RPMs are installed simultaneously.)
+Since the macro is used only internally, it has been moved into jconfigint.h.
+
+10. The x86-64 SIMD code can now be disabled at run time by setting the
+`JSIMD_FORCENONE` environment variable to `1` (the other SIMD implementations
+already had this capability.)
+
+11. Added a new command-line argument to TJBench (`-nowrite`) that prevents the
+benchmark from outputting any images. This removes any potential operating
+system overhead that might be caused by lazy writes to disk and thus improves
+the consistency of the performance measurements.
+
+12. Added SIMD acceleration for Huffman encoding on SSE2-capable x86 and x86-64
+platforms. This speeds up the compression of full-color JPEGs by about 10-15%
+on average (relative to libjpeg-turbo 1.4.x) when using modern Intel and AMD
+CPUs. Additionally, this works around an issue in the clang optimizer that
+prevents it (as of this writing) from achieving the same performance as GCC
+when compiling the C version of the Huffman encoder
+(<https://llvm.org/bugs/show_bug.cgi?id=16035>). For the purposes of
+benchmarking or regression testing, SIMD-accelerated Huffman encoding can be
+disabled by setting the `JSIMD_NOHUFFENC` environment variable to `1`.
+
+13. Added ARM 64-bit (ARMv8) NEON SIMD implementations of the commonly-used
+compression algorithms (including the accurate integer forward DCT and h2v2 &
+h2v1 downsampling algorithms, which are not accelerated in the 32-bit NEON
+implementation.) This speeds up the compression of full-color JPEGs by about
+75% on average on a Cavium ThunderX processor and by about 2-2.5x on average on
+Cortex-A53 and Cortex-A57 cores.
+
+14. Added SIMD acceleration for Huffman encoding on NEON-capable ARM 32-bit
+and 64-bit platforms.
+
+ For 32-bit code, this speeds up the compression of full-color JPEGs by
+about 30% on average on a typical iOS device (iPhone 4S, Cortex-A9) and by
+about 6-7% on average on a typical Android device (Nexus 5X, Cortex-A53 and
+Cortex-A57), relative to libjpeg-turbo 1.4.x. Note that the larger speedup
+under iOS is due to the fact that iOS builds use LLVM, which does not optimize
+the C Huffman encoder as well as GCC does.
+
+ For 64-bit code, NEON-accelerated Huffman encoding speeds up the
+compression of full-color JPEGs by about 40% on average on a typical iOS device
+(iPhone 5S, Apple A7) and by about 7-8% on average on a typical Android device
+(Nexus 5X, Cortex-A53 and Cortex-A57), in addition to the speedup described in
+[13] above.
+
+ For the purposes of benchmarking or regression testing, SIMD-accelerated
+Huffman encoding can be disabled by setting the `JSIMD_NOHUFFENC` environment
+variable to `1`.
+
+15. pkg-config (.pc) scripts are now included for both the libjpeg and
+TurboJPEG API libraries on Un*x systems. Note that if a project's build system
+relies on these scripts, then it will not be possible to build that project
+with libjpeg or with a prior version of libjpeg-turbo.
+
+16. Optimized the ARM 64-bit (ARMv8) NEON SIMD decompression routines to
+improve performance on CPUs with in-order pipelines. This speeds up the
+decompression of full-color JPEGs by nearly 2x on average on a Cavium ThunderX
+processor and by about 15% on average on a Cortex-A53 core.
+
+17. Fixed an issue in the accelerated Huffman decoder that could have caused
+the decoder to read past the end of the input buffer when a malformed,
+specially-crafted JPEG image was being decompressed. In prior versions of
+libjpeg-turbo, the accelerated Huffman decoder was invoked (in most cases) only
+if there were > 128 bytes of data in the input buffer. However, it is possible
+to construct a JPEG image in which a single Huffman block is over 430 bytes
+long, so this version of libjpeg-turbo activates the accelerated Huffman
+decoder only if there are > 512 bytes of data in the input buffer.
+
+18. Fixed a memory leak in tjunittest encountered when running the program
+with the `-yuv` option.
+
+
+1.4.2
+=====
+
+### Significant changes relative to 1.4.1:
+
+1. Fixed an issue whereby cjpeg would segfault if a Windows bitmap with a
+negative width or height was used as an input image (Windows bitmaps can have
+a negative height if they are stored in top-down order, but such files are
+rare and not supported by libjpeg-turbo.)
+
+2. Fixed an issue whereby, under certain circumstances, libjpeg-turbo would
+incorrectly encode certain JPEG images when quality=100 and the fast integer
+forward DCT were used. This was known to cause `make test` to fail when the
+library was built with `-march=haswell` on x86 systems.
+
+3. Fixed an issue whereby libjpeg-turbo would crash when built with the latest
+& greatest development version of the Clang/LLVM compiler. This was caused by
+an x86-64 ABI conformance issue in some of libjpeg-turbo's 64-bit SSE2 SIMD
+routines. Those routines were incorrectly using a 64-bit `mov` instruction to
+transfer a 32-bit JDIMENSION argument, whereas the x86-64 ABI allows the upper
+(unused) 32 bits of a 32-bit argument's register to be undefined. The new
+Clang/LLVM optimizer uses load combining to transfer multiple adjacent 32-bit
+structure members into a single 64-bit register, and this exposed the ABI
+conformance issue.
+
+4. Fixed a bug in the MIPS DSPr2 4:2:0 "plain" (non-fancy and non-merged)
+upsampling routine that caused a buffer overflow (and subsequent segfault) when
+decompressing a 4:2:0 JPEG image whose scaled output width was less than 16
+pixels. The "plain" upsampling routines are normally only used when
+decompressing a non-YCbCr JPEG image, but they are also used when decompressing
+a JPEG image whose scaled output height is 1.
+
+5. Fixed various negative left shifts and other issues reported by the GCC and
+Clang undefined behavior sanitizers. None of these was known to pose a
+security threat, but removing the warnings makes it easier to detect actual
+security issues, should they arise in the future.
+
+
+1.4.1
+=====
+
+### Significant changes relative to 1.4.0:
+
+1. tjbench now properly handles CMYK/YCCK JPEG files. Passing an argument of
+`-cmyk` (instead of, for instance, `-rgb`) will cause tjbench to internally
+convert the source bitmap to CMYK prior to compression, to generate YCCK JPEG
+files, and to internally convert the decompressed CMYK pixels back to RGB after
+decompression (the latter is done automatically if a CMYK or YCCK JPEG is
+passed to tjbench as a source image.) The CMYK<->RGB conversion operation is
+not benchmarked. NOTE: The quick & dirty CMYK<->RGB conversions that tjbench
+uses are suitable for testing only. Proper conversion between CMYK and RGB
+requires a color management system.
+
+2. `make test` now performs additional bitwise regression tests using tjbench,
+mainly for the purpose of testing compression from/decompression to a subregion
+of a larger image buffer.
+
+3. `make test` no longer tests the regression of the floating point DCT/IDCT
+by default, since the results of those tests can vary if the algorithms in
+question are not implemented using SIMD instructions on a particular platform.
+See the comments in [Makefile.am](Makefile.am) for information on how to
+re-enable the tests and to specify an expected result for them based on the
+particulars of your platform.
+
+4. The NULL color conversion routines have been significantly optimized,
+which speeds up the compression of RGB and CMYK JPEGs by 5-20% when using
+64-bit code and 0-3% when using 32-bit code, and the decompression of those
+images by 10-30% when using 64-bit code and 3-12% when using 32-bit code.
+
+5. Fixed an "illegal instruction" error that occurred when djpeg from a
+SIMD-enabled libjpeg-turbo MIPS build was executed with the `-nosmooth` option
+on a MIPS machine that lacked DSPr2 support. The MIPS SIMD routines for h2v1
+and h2v2 merged upsampling were not properly checking for the existence of
+DSPr2.
+
+6. Performance has been improved significantly on 64-bit non-Linux and
+non-Windows platforms (generally 10-20% faster compression and 5-10% faster
+decompression.) Due to an oversight, the 64-bit version of the accelerated
+Huffman codec was not being compiled in when libjpeg-turbo was built on
+platforms other than Windows or Linux. Oops.
+
+7. Fixed an extremely rare bug in the Huffman encoder that caused 64-bit
+builds of libjpeg-turbo to incorrectly encode a few specific test images when
+quality=98, an optimized Huffman table, and the accurate integer forward DCT
+were used.
+
+8. The Windows (CMake) build system now supports building only static or only
+shared libraries. This is accomplished by adding either `-DENABLE_STATIC=0` or
+`-DENABLE_SHARED=0` to the CMake command line.
+
+9. TurboJPEG API functions will now return an error code if a warning is
+triggered in the underlying libjpeg API. For instance, if a JPEG file is
+corrupt, the TurboJPEG decompression functions will attempt to decompress
+as much of the image as possible, but those functions will now return -1 to
+indicate that the decompression was not entirely successful.
+
+10. Fixed a bug in the MIPS DSPr2 4:2:2 fancy upsampling routine that caused a
+buffer overflow (and subsequent segfault) when decompressing a 4:2:2 JPEG image
+in which the right-most MCU was 5 or 6 pixels wide.
+
+
+1.4.0
+=====
+
+### Significant changes relative to 1.4 beta1:
+
+1. Fixed a build issue on OS X PowerPC platforms (md5cmp failed to build
+because OS X does not provide the `le32toh()` and `htole32()` functions.)
+
+2. The non-SIMD RGB565 color conversion code did not work correctly on big
+endian machines. This has been fixed.
+
+3. Fixed an issue in `tjPlaneSizeYUV()` whereby it would erroneously return 1
+instead of -1 if `componentID` was > 0 and `subsamp` was `TJSAMP_GRAY`.
+
+4. Fixed an issue in `tjBufSizeYUV2()` whereby it would erroneously return 0
+instead of -1 if `width` was < 1.
+
+5. The Huffman encoder now uses `clz` and `bsr` instructions for bit counting
+on ARM64 platforms (see 1.4 beta1[5].)
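+
+In C terms, the technique corresponds to roughly the following sketch
+(function name illustrative):
+
+```c
+/* Sketch: bit length of a nonzero coefficient magnitude.  GCC/Clang
+   lower __builtin_clz() to 'clz' on ARM and 'bsr' on x86, avoiding
+   the 64 KB lookup table. */
+static int jpeg_nbits(unsigned int x)  /* x must be nonzero */
+{
+  return 32 - __builtin_clz(x);
+}
+```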
+
+6. The `close()` method in the TJCompressor and TJDecompressor Java classes is
+now idempotent. Previously, that method would call the native `tjDestroy()`
+function even if the TurboJPEG instance had already been destroyed. This
+caused an exception to be thrown during finalization, if the `close()` method
+had already been called. The exception was caught, but it was still an
+expensive operation.
+
+7. The TurboJPEG API previously generated an error (`Could not determine
+subsampling type for JPEG image`) when attempting to decompress grayscale JPEG
+images that were compressed with a sampling factor other than 1 (for instance,
+with `cjpeg -grayscale -sample 2x2`). Subsampling technically has no meaning
+with grayscale JPEGs, and thus the horizontal and vertical sampling factors
+for such images are ignored by the decompressor. However, the TurboJPEG API
+was being too rigid and was expecting the sampling factors to be equal to 1
+before it treated the image as a grayscale JPEG.
+
+8. cjpeg, djpeg, and jpegtran now accept an argument of `-version`, which will
+print the library version and exit.
+
+9. Referring to 1.4 beta1[15], another extremely rare circumstance was
+discovered under which the Huffman encoder's local buffer can be overrun
+when a buffered destination manager is being used and an
+extremely-high-frequency block (basically junk image data) is being encoded.
+Even though the Huffman local buffer was increased from 128 bytes to 136 bytes
+to address the previous issue, the new issue caused even the larger buffer to
+be overrun. Further analysis reveals that, in the absolute worst case (such as
+setting alternating AC coefficients to 32767 and -32768 in the JPEG scanning
+order), the Huffman encoder can produce encoded blocks that approach double the
+size of the unencoded blocks. Thus, the Huffman local buffer was increased to
+256 bytes, which should prevent any such issue from re-occurring in the future.
+
+10. The new `tjPlaneSizeYUV()`, `tjPlaneWidth()`, and `tjPlaneHeight()`
+functions were not actually usable on any platform except OS X and Windows,
+because those functions were not included in the libturbojpeg mapfile. This
+has been fixed.
+
+11. Restored the `JPP()`, `JMETHOD()`, and `FAR` macros in the libjpeg-turbo
+header files. The `JPP()` and `JMETHOD()` macros were originally implemented
+in libjpeg as a way of supporting non-ANSI compilers that lacked support for
+prototype parameters. libjpeg-turbo has never supported such compilers, but
+some software packages still use the macros to define their own prototypes.
+Similarly, libjpeg-turbo has never supported MS-DOS and other platforms that
+have far symbols, but some software packages still use the `FAR` macro. A
+pretty good argument can be made that this is a bad practice on the part of the
+software in question, but since this affects more than one package, it's just
+easier to fix it here.
+
+12. Fixed issues that were preventing the ARM 64-bit SIMD code from compiling
+for iOS, and included an ARMv8 architecture in all of the binaries installed by
+the "official" libjpeg-turbo SDK for OS X.
+
+
+1.3.90 (1.4 beta1)
+==================
+
+### Significant changes relative to 1.3.1:
+
+1. New features in the TurboJPEG API:
+
+ - YUV planar images can now be generated with an arbitrary line padding
+(previously only 4-byte padding, which was compatible with X Video, was
+supported.)
+ - The decompress-to-YUV function has been extended to support image
+scaling.
+ - JPEG images can now be compressed from YUV planar source images.
+ - YUV planar images can now be decoded into RGB or grayscale images.
+ - 4:1:1 subsampling is now supported. This is mainly included for
+compatibility, since 4:1:1 is not fully accelerated in libjpeg-turbo and has no
+significant advantages relative to 4:2:0.
+ - CMYK images are now supported. This feature allows CMYK source images
+to be compressed to YCCK JPEGs and YCCK or CMYK JPEGs to be decompressed to
+CMYK destination images. Conversion between CMYK/YCCK and RGB or YUV images is
+not supported. Such conversion requires a color management system and is thus
+out of scope for a codec library.
+ - The handling of YUV images in the Java API has been significantly
+refactored and should now be much more intuitive.
+ - The Java API now supports encoding a YUV image from an arbitrary
+position in a large image buffer.
+ - All of the YUV functions now have a corresponding function that operates
+on separate image planes instead of a unified image buffer. This allows for
+compressing/decoding from or decompressing/encoding to a subregion of a larger
+YUV image. It also allows for handling YUV formats that swap the order of the
+U and V planes. (See the sketch below.)
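+
+For reference, the per-plane geometry for such buffers can be computed with
+the new helper functions, as in this minimal sketch:
+
+```c
+#include <stdio.h>
+#include <turbojpeg.h>
+
+/* Sketch: per-plane geometry for a 640x480 4:2:0 YUV image (a stride
+   of 0 selects the plane width). */
+int main(void)
+{
+  int width = 640, height = 480, i;
+
+  for (i = 0; i < 3; i++)   /* 0 = Y, 1 = U/Cb, 2 = V/Cr */
+    printf("plane %d: %dx%d, %lu bytes\n", i,
+           tjPlaneWidth(i, width, TJSAMP_420),
+           tjPlaneHeight(i, height, TJSAMP_420),
+           tjPlaneSizeYUV(i, width, 0, height, TJSAMP_420));
+  return 0;
+}
+```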
+
+2. Added SIMD acceleration for DSPr2-capable MIPS platforms. This speeds up
+the compression of full-color JPEGs by 70-80% on such platforms and
+decompression by 25-35%.
+
+3. If an application attempts to decompress a Huffman-coded JPEG image whose
+header does not contain Huffman tables, libjpeg-turbo will now insert the
+default Huffman tables. In order to save space, many motion JPEG video frames
+are encoded without the default Huffman tables, so these frames can now be
+successfully decompressed by libjpeg-turbo without additional work on the part
+of the application. An application can still override the Huffman tables, for
+instance to re-use tables from a previous frame of the same video.
+
+4. The Mac packaging system now uses pkgbuild and productbuild rather than
+PackageMaker (which is obsolete and no longer supported.) This means that
+OS X 10.6 "Snow Leopard" or later must be used when packaging libjpeg-turbo,
+although the packages produced can be installed on OS X 10.5 "Leopard" or
+later. OS X 10.4 "Tiger" is no longer supported.
+
+5. The Huffman encoder now uses `clz` and `bsr` instructions for bit counting
+on ARM platforms rather than a lookup table. This reduces the memory footprint
+by 64k, which may be important for some mobile applications. Out of four
+Android devices that were tested, two demonstrated a small overall performance
+loss (~3-4% on average) with ARMv6 code and a small gain (also ~3-4%) with
+ARMv7 code when enabling this new feature, but the other two devices
+demonstrated a significant overall performance gain with both ARMv6 and ARMv7
+code (~10-20%) when enabling the feature. Actual mileage may vary.
+
+6. Worked around an issue with Visual C++ 2010 and later that caused incorrect
+pixels to be generated when decompressing a JPEG image to a 256-color bitmap,
+if compiler optimization was enabled when libjpeg-turbo was built. This caused
+the regression tests to fail when doing a release build under Visual C++ 2010
+and later.
+
+7. Improved the accuracy and performance of the non-SIMD implementation of the
+floating point inverse DCT (using code borrowed from libjpeg v8a and later.)
+The accuracy of this implementation now matches the accuracy of the SSE/SSE2
+implementation. Note, however, that the floating point DCT/IDCT algorithms are
+mainly a legacy feature. They generally do not produce significantly better
+accuracy than the accurate integer DCT/IDCT algorithms, and they are quite a
+bit slower.
+
+8. Added a new output colorspace (`JCS_RGB565`) to the libjpeg API that allows
+for decompressing JPEG images into RGB565 (16-bit) pixels. If dithering is not
+used, then this code path is SIMD-accelerated on ARM platforms.
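+
+Requesting the new colorspace is a one-field change in a libjpeg-based
+decoder, as in this sketch (function name illustrative):
+
+```c
+#include <stdio.h>
+#include <jpeglib.h>
+
+/* Sketch: request 16-bit RGB565 output.  Call after jpeg_read_header()
+   and before jpeg_start_decompress(); each output pixel is then one
+   16-bit RRRRRGGGGGGBBBBB value. */
+void request_rgb565(j_decompress_ptr cinfo)
+{
+  cinfo->out_color_space = JCS_RGB565;
+}
+```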
+
+9. Numerous obsolete features, such as support for non-ANSI compilers and
+support for the MS-DOS memory model, were removed from the libjpeg code,
+greatly improving its readability and making it easier to maintain and extend.
+
+10. Fixed a segfault that occurred when calling `output_message()` with
+`msg_code` set to `JMSG_COPYRIGHT`.
+
+11. Fixed an issue whereby wrjpgcom was allowing comments longer than 65k
+characters to be passed on the command line, which was causing it to generate
+incorrect JPEG files.
+
+12. Fixed a bug in the build system that was causing the Windows version of
+wrjpgcom to be built using the rdjpgcom source code.
+
+13. Restored 12-bit-per-component JPEG support. A 12-bit version of
+libjpeg-turbo can now be built by passing an argument of `--with-12bit` to
+configure (Unix) or `-DWITH_12BIT=1` to cmake (Windows.) 12-bit JPEG support
+is included only for convenience. Enabling this feature disables all of the
+performance features in libjpeg-turbo, as well as arithmetic coding and the
+TurboJPEG API. The resulting library still contains the other libjpeg-turbo
+features (such as the colorspace extensions), but in general, it performs no
+faster than libjpeg v6b.
+
+14. Added ARM 64-bit SIMD acceleration for the YCC-to-RGB color conversion
+and IDCT algorithms (both are used during JPEG decompression.) For unknown
+reasons (probably related to clang), this code cannot currently be compiled for
+iOS.
+
+15. Fixed an extremely rare bug (CVE-2014-9092) that could cause the Huffman
+encoder's local buffer to overrun when a very high-frequency MCU is compressed
+using quality 100 and no subsampling, and when the JPEG output buffer is being
+dynamically resized by the destination manager. This issue was so rare that,
+even with a test program specifically designed to make the bug occur (by
+injecting random high-frequency YUV data into the compressor), it was
+reproducible only once in about every 25 million iterations.
+
+16. Fixed an oversight in the TurboJPEG C wrapper: if any of the JPEG
+compression functions was called repeatedly with the same
+automatically-allocated destination buffer, then TurboJPEG would erroneously
+assume that the `jpegSize` parameter was equal to the size of the buffer, when
+in fact that parameter was probably equal to the size of the most recently
+compressed JPEG image. If the size of the previous JPEG image was not as large
+as the current JPEG image, then TurboJPEG would unnecessarily reallocate the
+destination buffer.
+
+
+1.3.1
+=====
+
+### Significant changes relative to 1.3.0:
+
+1. On Un*x systems, `make install` now installs the libjpeg-turbo libraries
+into /opt/libjpeg-turbo/lib32 by default on any 32-bit system, not just x86,
+and into /opt/libjpeg-turbo/lib64 by default on any 64-bit system, not just
+x86-64. You can override this by overriding either the `prefix` or `libdir`
+configure variables.
+
+2. The Windows installer now places a copy of the TurboJPEG DLLs in the same
+directory as the rest of the libjpeg-turbo binaries. This was mainly done
+to support TurboVNC 1.3, which bundles the DLLs in its Windows installation.
+When using a 32-bit version of CMake on 64-bit Windows, it is impossible to
+access the c:\WINDOWS\system32 directory, which made it impossible for the
+TurboVNC build scripts to bundle the 64-bit TurboJPEG DLL.
+
+3. Fixed a bug whereby attempting to encode a progressive JPEG with arithmetic
+entropy coding (by passing arguments of `-progressive -arithmetic` to cjpeg or
+jpegtran, for instance) would result in an error, `Requested feature was
+omitted at compile time`.
+
+4. Fixed a couple of issues (CVE-2013-6629 and CVE-2013-6630) whereby malformed
+JPEG images would cause libjpeg-turbo to use uninitialized memory during
+decompression.
+
+5. Fixed an error (`Buffer passed to JPEG library is too small`) that occurred
+when calling the TurboJPEG YUV encoding function with a very small (< 5x5)
+source image, and added a unit test to check for this error.
+
+6. The Java classes should now build properly under Visual Studio 2010 and
+later.
+
+7. Fixed an issue that prevented SRPMs generated using the in-tree packaging
+tools from being rebuilt on certain newer Linux distributions.
+
+8. Numerous minor fixes to eliminate compilation and build/packaging system
+warnings, fix cosmetic issues, improve documentation clarity, and other general
+source cleanup.
+
+
+1.3.0
+=====
+
+### Significant changes relative to 1.3 beta1:
+
+1. `make test` now works properly on FreeBSD, and it no longer requires the
+md5sum executable to be present on other Un*x platforms.
+
+2. Overhauled the packaging system:
+
+ - To avoid conflict with vendor-supplied libjpeg-turbo packages, the
+official RPMs and DEBs for libjpeg-turbo have been renamed to
+"libjpeg-turbo-official".
+ - The TurboJPEG libraries are now located under /opt/libjpeg-turbo in the
+official Linux and Mac packages, to avoid conflict with vendor-supplied
+packages and also to streamline the packaging system.
+ - Release packages are now created with the directory structure defined
+by the configure variables `prefix`, `bindir`, `libdir`, etc. (Un\*x) or by the
+`CMAKE_INSTALL_PREFIX` variable (Windows.) The exception is that the docs are
+always located under the system default documentation directory on Un\*x and
+Mac systems, and on Windows, the TurboJPEG DLL is always located in the Windows
+system directory.
+ - To avoid confusion, official libjpeg-turbo packages on Linux/Unix
+platforms (except for Mac) will always install the 32-bit libraries in
+/opt/libjpeg-turbo/lib32 and the 64-bit libraries in /opt/libjpeg-turbo/lib64.
+ - Fixed an issue whereby, in some cases, the libjpeg-turbo executables on
+Un*x systems were not properly linking with the shared libraries installed by
+the same package.
+ - Fixed an issue whereby building the "installer" target on Windows when
+`WITH_JAVA=1` would fail if the TurboJPEG JAR had not been previously built.
+ - Building the "install" target on Windows now installs files into the
+same places that the installer does.
+
+3. Fixed a Huffman encoder bug that prevented I/O suspension from working
+properly.
+
+
+1.2.90 (1.3 beta1)
+==================
+
+### Significant changes relative to 1.2.1:
+
+1. Added support for additional scaling factors (3/8, 5/8, 3/4, 7/8, 9/8, 5/4,
+11/8, 3/2, 13/8, 7/4, 15/8, and 2) when decompressing. Note that the IDCT will
+not be SIMD-accelerated when using any of these new scaling factors.
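+
+A decompressor selects one of the new factors through the existing
+`scale_num`/`scale_denom` fields, as in this sketch (function name
+illustrative):
+
+```c
+#include <stdio.h>
+#include <jpeglib.h>
+
+/* Sketch: request 7/8 scaling.  Set after jpeg_read_header() and
+   before jpeg_start_decompress(); the scaled output dimensions can
+   then be obtained with jpeg_calc_output_dimensions(). */
+void request_7_8_scaling(j_decompress_ptr cinfo)
+{
+  cinfo->scale_num = 7;
+  cinfo->scale_denom = 8;
+}
+```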
+
+2. The TurboJPEG dynamic library is now versioned. It was not strictly
+necessary to do so, because TurboJPEG uses versioned symbols, and if a function
+changes in an ABI-incompatible way, that function is renamed and a legacy
+function is provided to maintain backward compatibility. However, certain
+Linux distro maintainers have a policy against accepting any library that isn't
+versioned.
+
+3. Extended the TurboJPEG Java API so that it can be used to compress a JPEG
+image from and decompress a JPEG image to an arbitrary position in a large
+image buffer.
+
+4. The `tjDecompressToYUV()` function now supports the `TJFLAG_FASTDCT` flag.
+
+5. The 32-bit supplementary package for amd64 Debian systems now provides
+symlinks in /usr/lib/i386-linux-gnu for the TurboJPEG libraries in /usr/lib32.
+This allows those libraries to be used on MultiArch-compatible systems (such as
+Ubuntu 11 and later) without setting the linker path.
+
+6. The TurboJPEG Java wrapper should now find the JNI library on Mac systems
+without having to pass `-Djava.library.path=/usr/lib` to java.
+
+7. TJBench has been ported to Java to provide a convenient way of validating
+the performance of the TurboJPEG Java API. It can be run with
+`java -cp turbojpeg.jar TJBench`.
+
+8. cjpeg can now be used to generate JPEG files with the RGB colorspace
+(feature ported from jpeg-8d.)
+
+9. The width and height in the `-crop` argument passed to jpegtran can now be
+suffixed with `f` to indicate that, when the upper left corner of the cropping
+region is automatically moved to the nearest iMCU boundary, the bottom right
+corner should be moved by the same amount. In other words, this feature causes
+jpegtran to strictly honor the specified width/height rather than the specified
+bottom right corner (feature ported from jpeg-8d.)
+
+10. JPEG files using the RGB colorspace can now be decompressed into grayscale
+images (feature ported from jpeg-8d.)
+
+11. Fixed a regression caused by 1.2.1[7] whereby the build would fail with
+multiple "Mismatch in operand sizes" errors when attempting to build the x86
+SIMD code with NASM 0.98.
+
+12. The in-memory source/destination managers (`jpeg_mem_src()` and
+`jpeg_mem_dest()`) are now included by default when building libjpeg-turbo with
+libjpeg v6b or v7 emulation, so that programs can take advantage of these
+functions without requiring the use of the backward-incompatible libjpeg v8
+ABI. The "age number" of the libjpeg-turbo library on Un*x systems has been
+incremented by 1 to reflect this. You can disable this feature with a
+configure/CMake switch in order to retain strict API/ABI compatibility with the
+libjpeg v6b or v7 API/ABI (or with previous versions of libjpeg-turbo.) See
+[README.md](README.md) for more details.
+
+13. Added ARMv7s architecture to libjpeg.a and libturbojpeg.a in the official
+libjpeg-turbo binary package for OS X, so that those libraries can be used to
+build applications that leverage the faster CPUs in the iPhone 5 and iPad 4.
+
+
+1.2.1
+=====
+
+### Significant changes relative to 1.2.0:
+
+1. Creating or decoding a JPEG file that uses the RGB colorspace should now
+properly work when the input or output colorspace is one of the libjpeg-turbo
+colorspace extensions.
+
+2. When libjpeg-turbo was built without SIMD support and merged (non-fancy)
+upsampling was used along with an alpha-enabled colorspace during
+decompression, the unused byte of the decompressed pixels was not being set to
+0xFF. This has been fixed. TJUnitTest has also been extended to test for the
+correct behavior of the colorspace extensions when merged upsampling is used.
+
+3. Fixed a bug whereby the libjpeg-turbo SSE2 SIMD code would not preserve the
+upper 64 bits of xmm6 and xmm7 on Win64 platforms, which violated the Win64
+calling conventions.
+
+4. Fixed a regression (CVE-2012-2806) caused by 1.2.0[6] whereby decompressing
+corrupt JPEG images (specifically, images in which the component count was
+erroneously set to a large value) would cause libjpeg-turbo to segfault.
+
+5. Worked around a severe performance issue with "Bobcat" (AMD Embedded APU)
+processors. The `MASKMOVDQU` instruction, which was used by the libjpeg-turbo
+SSE2 SIMD code, is apparently implemented in microcode on AMD processors, and
+it is painfully slow on Bobcat processors in particular. Eliminating the use
+of this instruction improved performance by an order of magnitude on Bobcat
+processors and by a small amount (typically 5%) on AMD desktop processors.
+
+6. Added SIMD acceleration for performing 4:2:2 upsampling on NEON-capable ARM
+platforms. This speeds up the decompression of 4:2:2 JPEGs by 20-25% on such
+platforms.
+
+7. Fixed a regression caused by 1.2.0[2] whereby, on Linux/x86 platforms
+running the 32-bit SSE2 SIMD code in libjpeg-turbo, decompressing a 4:2:0 or
+4:2:2 JPEG image into a 32-bit (RGBX, BGRX, etc.) buffer without using fancy
+upsampling would produce several incorrect columns of pixels at the right-hand
+side of the output image if each row in the output image was not evenly
+divisible by 16 bytes.
+
+8. Fixed an issue whereby attempting to build the SIMD extensions with Xcode
+4.3 on OS X platforms would cause NASM to return numerous errors of the form
+"'%define' expects a macro identifier".
+
+9. Added flags to the TurboJPEG API that allow the caller to force the use of
+either the fast or the accurate DCT/IDCT algorithms in the underlying codec.
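+
+For example, a caller might pass one of the new flags to `tjCompress2()` as
+in this sketch (function name illustrative):
+
+```c
+#include <turbojpeg.h>
+
+/* Sketch: force the accurate integer DCT for compression by passing
+   TJFLAG_ACCURATEDCT in the flags argument. */
+int compress_accurate(tjhandle handle, unsigned char *srcBuf,
+                      int width, int height,
+                      unsigned char **jpegBuf, unsigned long *jpegSize)
+{
+  return tjCompress2(handle, srcBuf, width, 0 /* pitch */, height,
+                     TJPF_RGB, jpegBuf, jpegSize, TJSAMP_420, 95,
+                     TJFLAG_ACCURATEDCT);
+}
+```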
+
+
+1.2.0
+=====
+
+### Significant changes relative to 1.2 beta1:
+
+1. Fixed build issue with Yasm on Unix systems (the libjpeg-turbo build system
+was not adding the current directory to the assembler include path, so Yasm
+was not able to find jsimdcfg.inc.)
+
+2. Fixed out-of-bounds read in SSE2 SIMD code that occurred when decompressing
+a JPEG image to a bitmap buffer whose size was not a multiple of 16 bytes.
+This was more of an annoyance than an actual bug, since it did not cause any
+actual run-time problems, but the issue showed up when running libjpeg-turbo in
+valgrind. See <http://crbug.com/72399> for more information.
+
+3. Added a compile-time macro (`LIBJPEG_TURBO_VERSION`) that can be used to
+check the version of libjpeg-turbo against which an application was compiled.
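+
+Because the macro expands to a bare version number (e.g. 1.2.0) rather than a
+string literal, printing it requires stringification, as in this sketch
+(macro names illustrative):
+
+```c
+#include <stdio.h>
+#include <jpeglib.h>  /* pulls in jconfig.h, which defines the macro */
+
+#define LJT_STR2(v) #v
+#define LJT_STR(v) LJT_STR2(v)
+
+int main(void)
+{
+  printf("Built against libjpeg-turbo %s\n", LJT_STR(LIBJPEG_TURBO_VERSION));
+  return 0;
+}
+```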
+
+4. Added new RGBA/BGRA/ABGR/ARGB colorspace extension constants (libjpeg API)
+and pixel formats (TurboJPEG API), which allow applications to specify that,
+when decompressing to a 4-component RGB buffer, the unused byte should be set
+to 0xFF so that it can be interpreted as an opaque alpha channel.
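+
+On the libjpeg API side, for example, opting in is a single field (sketch;
+function name illustrative):
+
+```c
+#include <stdio.h>
+#include <jpeglib.h>
+
+/* Sketch: decompress into 4-byte RGBA pixels; the unused fourth byte
+   of each pixel is set to 0xFF (opaque alpha).  Set after
+   jpeg_read_header() and before jpeg_start_decompress(). */
+void request_rgba(j_decompress_ptr cinfo)
+{
+  cinfo->out_color_space = JCS_EXT_RGBA;
+}
+```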
+
+5. Fixed regression issue whereby DevIL failed to build against libjpeg-turbo
+because libjpeg-turbo's distributed version of jconfig.h contained an `INLINE`
+macro, which conflicted with a similar macro in DevIL. This macro is used only
+internally when building libjpeg-turbo, so it was moved into config.h.
+
+6. libjpeg-turbo will now correctly decompress erroneous CMYK/YCCK JPEGs whose
+K component is assigned a component ID of 1 instead of 4. Although these files
+are in violation of the spec, other JPEG implementations handle them
+correctly.
+
+7. Added ARMv6 and ARMv7 architectures to libjpeg.a and libturbojpeg.a in
+the official libjpeg-turbo binary package for OS X, so that those libraries can
+be used to build both OS X and iOS applications.
+
+
+1.1.90 (1.2 beta1)
+==================
+
+### Significant changes relative to 1.1.1:
+
+1. Added a Java wrapper for the TurboJPEG API. See [java/README](java/README)
+for more details.
+
+2. The TurboJPEG API can now be used to scale down images during
+decompression.
+
+3. Added SIMD routines for RGB-to-grayscale color conversion, which
+significantly improves the performance of grayscale JPEG compression from an
+RGB source image.
+
+4. Improved the performance of the C color conversion routines, which are used
+on platforms for which SIMD acceleration is not available.
+
+5. Added a function to the TurboJPEG API that performs lossless transforms.
+This function is implemented using the same back end as jpegtran, but it
+performs transcoding entirely in memory and allows multiple transforms and/or
+crop operations to be batched together, so the source coefficients only need to
+be read once. This is useful when generating image tiles from a single source
+JPEG.
+
+6. Added tests for the new TurboJPEG scaled decompression and lossless
+transform features to tjbench (the TurboJPEG benchmark, formerly called
+"jpgtest".)
+
+7. Added support for 4:4:0 (transposed 4:2:2) subsampling in TurboJPEG, which
+was necessary in order for it to read 4:2:2 JPEG files that had been losslessly
+transposed or rotated 90 degrees.
+
+8. All legacy VirtualGL code has been re-factored, and this has allowed
+libjpeg-turbo, in its entirety, to be re-licensed under a BSD-style license.
+
+9. libjpeg-turbo can now be built with Yasm.
+
+10. Added SIMD acceleration for ARM Linux and iOS platforms that support
+NEON instructions.
+
+11. Refactored the TurboJPEG C API and documented it using Doxygen. The
+TurboJPEG 1.2 API uses pixel formats to define the size and component order of
+the uncompressed source/destination images, and it includes a more efficient
+version of `TJBUFSIZE()` that computes a worst-case JPEG size based on the
+level of chrominance subsampling. The refactored implementation of the
+TurboJPEG API now uses the libjpeg memory source and destination managers,
+which allows the TurboJPEG compressor to grow the JPEG buffer as necessary.
+
+12. Eliminated errors in the output of jpegtran on Windows that occurred when
+the application was invoked using I/O redirection
+(`jpegtran <input.jpg >output.jpg`.)
+
+13. The inclusion of libjpeg v7 and v8 emulation as well as arithmetic coding
+support in libjpeg-turbo v1.1.0 introduced several new error constants in
+jerror.h, and these were mistakenly enabled for all emulation modes, causing
+the error enum in libjpeg-turbo to sometimes have different values than the
+same enum in libjpeg. This represents an ABI incompatibility, and it caused
+problems with rare applications that took specific action based on a particular
+error value. The fix was to include the new error constants conditionally
+based on whether libjpeg v7 or v8 emulation was enabled.
+
+14. Fixed an issue whereby Windows applications that used libjpeg-turbo would
+fail to compile if the Windows system headers were included before jpeglib.h.
+This issue was caused by a conflict in the definition of the INT32 type.
+
+15. Fixed 32-bit supplementary package for amd64 Debian systems, which was
+broken by enhancements to the packaging system in 1.1.
+
+16. When decompressing a JPEG image using an output colorspace of
+`JCS_EXT_RGBX`, `JCS_EXT_BGRX`, `JCS_EXT_XBGR`, or `JCS_EXT_XRGB`,
+libjpeg-turbo will now set the unused byte to 0xFF, which allows applications
+to interpret that byte as an alpha channel (0xFF = opaque).
+
+
+1.1.1
+=====
+
+### Significant changes relative to 1.1.0:
+
+1. Fixed a 1-pixel error in row 0, column 21 of the luminance plane generated
+by `tjEncodeYUV()`.
+
+2. libjpeg-turbo's accelerated Huffman decoder previously ignored unexpected
+markers found in the middle of the JPEG data stream during decompression. It
+will now hand off decoding of a particular block to the unaccelerated Huffman
+decoder if an unexpected marker is found, so that the unaccelerated Huffman
+decoder can generate an appropriate warning.
+
+3. Older versions of MinGW64 prefixed symbol names with underscores by
+default, which differed from the behavior of 64-bit Visual C++. MinGW64 1.0
+has adopted the behavior of 64-bit Visual C++ as the default, so to accommodate
+this, the libjpeg-turbo SIMD function names are no longer prefixed with an
+underscore when building with MinGW64. This means that, when building
+libjpeg-turbo with older versions of MinGW64, you will now have to add
+`-fno-leading-underscore` to the `CFLAGS`.
+
+4. Fixed a regression bug in the NSIS script that caused the Windows installer
+build to fail when using the Visual Studio IDE.
+
+5. Fixed a bug in `jpeg_read_coefficients()` whereby it would not initialize
+`cinfo->image_width` and `cinfo->image_height` if libjpeg v7 or v8 emulation
+was enabled. This specifically caused the jpegoptim program to fail if it was
+linked against a version of libjpeg-turbo that was built with libjpeg v7 or v8
+emulation.
+
+6. Eliminated excessive I/O overhead that occurred when reading BMP files in
+cjpeg.
+
+7. Eliminated errors in the output of cjpeg on Windows that occurred when the
+application was invoked using I/O redirection (`cjpeg <inputfile >output.jpg`.)
+
+
+1.1.0
+=====
+
+### Significant changes relative to 1.1 beta1:
+
+1. The algorithm used by the SIMD quantization function cannot produce correct
+results when the JPEG quality is >= 98 and the fast integer forward DCT is
+used. Thus, the non-SIMD quantization function is now used for those cases,
+and libjpeg-turbo should now produce identical output to libjpeg v6b in all
+cases.
+
+2. Despite the above, the fast integer forward DCT still degrades somewhat for
+JPEG qualities greater than 95, so the TurboJPEG wrapper will now automatically
+use the accurate integer forward DCT when generating JPEG images of quality 96
+or greater. This reduces compression performance by as much as 15% for these
+high-quality images but is necessary to ensure that the images are perceptually
+lossless. It also ensures that the library can avoid the performance pitfall
+created by [1].
+
+3. Ported jpgtest.cxx to pure C to avoid the need for a C++ compiler.
+
+4. Fixed visual artifacts in grayscale JPEG compression caused by a typo in
+the RGB-to-luminance lookup tables.
+
+5. The Windows distribution packages now include the libjpeg run-time programs
+(cjpeg, etc.)
+
+6. All packages now include jpgtest.
+
+7. The TurboJPEG dynamic library now uses versioned symbols.
+
+8. Added two new TurboJPEG API functions, `tjEncodeYUV()` and
+`tjDecompressToYUV()`, to replace the somewhat hackish `TJ_YUV` flag.
+
+
+1.0.90 (1.1 beta1)
+==================
+
+### Significant changes relative to 1.0.1:
+
+1. Added emulation of the libjpeg v7 and v8 APIs and ABIs. See
+[README.md](README.md) for more details. This feature was sponsored by
+CamTrace SAS.
+
+2. Created a new CMake-based build system for the Visual C++ and MinGW builds.
+
+3. Grayscale bitmaps can now be compressed from/decompressed to using the
+TurboJPEG API.
+
+4. jpgtest can now be used to test decompression performance with existing
+JPEG images.
+
+5. If the default install prefix (/opt/libjpeg-turbo) is used, then
+`make install` now creates /opt/libjpeg-turbo/lib32 and
+/opt/libjpeg-turbo/lib64 sym links to duplicate the behavior of the binary
+packages.
+
+6. All symbols in the libjpeg-turbo dynamic library are now versioned, even
+when the library is built with libjpeg v6b emulation.
+
+7. Added arithmetic encoding and decoding support (can be disabled with
+configure or CMake options.)
+
+8. Added a `TJ_YUV` flag to the TurboJPEG API, which causes both the compressor
+and decompressor to output planar YUV images.
+
+9. Added an extended version of `tjDecompressHeader()` to the TurboJPEG API,
+which allows the caller to determine the type of subsampling used in a JPEG
+image.
+
+10. Added further protections against invalid Huffman codes.
+
+
+1.0.1
+=====
+
+### Significant changes relative to 1.0.0:
+
+1. The Huffman decoder will now handle erroneous Huffman codes (for instance,
+from a corrupt JPEG image.) Previously, these would cause libjpeg-turbo to
+crash under certain circumstances.
+
+2. Fixed typo in SIMD dispatch routines that was causing 4:2:2 upsampling to
+be used instead of 4:2:0 when decompressing JPEG images using SSE2 code.
+
+3. The configure script will now automatically determine whether the
+`INCOMPLETE_TYPES_BROKEN` macro should be defined.
+
+
+1.0.0
+=====
+
+### Significant changes relative to 0.0.93:
+
+1. 2983700: Further FreeBSD build tweaks (no longer necessary to specify
+`--host` when configuring on a 64-bit system.)
+
+2. Created symlinks in the Unix/Linux packages so that the TurboJPEG
+include file can always be found in /opt/libjpeg-turbo/include, the 32-bit
+static libraries can always be found in /opt/libjpeg-turbo/lib32, and the
+64-bit static libraries can always be found in /opt/libjpeg-turbo/lib64.
+
+3. The Unix/Linux distribution packages now include the libjpeg run-time
+programs (cjpeg, etc.) and man pages.
+
+4. Created a 32-bit supplementary package for amd64 Debian systems, which
+contains just the 32-bit libjpeg-turbo libraries.
+
+5. Moved the libraries from */lib32 to */lib in the i386 Debian package.
+
+6. Included a distribution package for Cygwin.
+
+7. No longer necessary to specify `--without-simd` on non-x86 architectures,
+and unit tests now work on those architectures.
+
+
+0.0.93
+======
+
+### Significant changes since 0.0.91:
+
+1. 2982659: Fixed x86-64 build on FreeBSD systems
+
+2. 2988188: Added support for Windows 64-bit systems
+
+
+0.0.91
+======
+
+### Significant changes relative to 0.0.90:
+
+1. Added documentation to .deb packages
+
+2. 2968313: Fixed data corruption issues when decompressing large JPEG images
+and/or using buffered I/O with the libjpeg-turbo decompressor
+
+
+0.0.90
+======
+
+Initial release
diff --git a/media/libjpeg/LICENSE.md b/media/libjpeg/LICENSE.md
index 4623e29425..d753e1d76a 100644
--- a/media/libjpeg/LICENSE.md
+++ b/media/libjpeg/LICENSE.md
@@ -9,12 +9,12 @@ libjpeg-turbo is covered by three compatible BSD-style open source licenses:
This license applies to the libjpeg API library and associated programs
(any code inherited from libjpeg, and any modifications to that code.)
-- The Modified (3-clause) BSD License, which is listed in
- [turbojpeg.c](turbojpeg.c)
+- The Modified (3-clause) BSD License, which is listed below
- This license covers the TurboJPEG API library and associated programs.
+ This license covers the TurboJPEG API library and associated programs, as
+ well as the build system.
-- The zlib License, which is listed in [simd/jsimdext.inc](simd/jsimdext.inc)
+- The [zlib License](https://opensource.org/licenses/Zlib)
This license is a subset of the other two, and it covers the libjpeg-turbo
SIMD extensions.
@@ -66,7 +66,7 @@ best of our understanding.
2. If your binary distribution includes or uses the TurboJPEG API, then
your product documentation must include the text of the Modified BSD
- License.
+ License (see below.)
**Origin**
- Clause 2 of the Modified BSD License
@@ -86,3 +86,47 @@ best of our understanding.
- IJG License
- Modified BSD License
- zlib License
+
+
+The Modified (3-clause) BSD License
+===================================
+
+Copyright (C)2009-2022 D. R. Commander. All Rights Reserved.<br>
+Copyright (C)2015 Viktor Szathmáry. All Rights Reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+- Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+- Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+- Neither the name of the libjpeg-turbo Project nor the names of its
+ contributors may be used to endorse or promote products derived from this
+ software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS",
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+
+Why Three Licenses?
+===================
+
+The zlib License could have been used instead of the Modified (3-clause) BSD
+License, and since the IJG License effectively subsumes the distribution
+conditions of the zlib License, this would have effectively placed
+libjpeg-turbo binary distributions under the IJG License. However, the IJG
+License specifically refers to the Independent JPEG Group and does not extend
+attribution and endorsement protections to other entities. Thus, it was
+desirable to choose a license that granted us the same protections for new code
+that were granted to the IJG for code derived from their software.
diff --git a/media/libjpeg/MOZCHANGES b/media/libjpeg/MOZCHANGES
index 6e7824cdd7..4e65df2222 100644
--- a/media/libjpeg/MOZCHANGES
+++ b/media/libjpeg/MOZCHANGES
@@ -48,6 +48,34 @@ To upgrade to a new revision of libjpeg-turbo, do the following:
$ hg addremove
+== February 28, 2022 (libjpeg-turbo v2.1.3 c5f269eb9665435271c05fbcaf8721fa58e9eafa 2022-02-25) ==
+
+* Updated to v2.1.3 release.
+
+== September 9, 2021 (libjpeg-turbo v2.1.1 0a9b9721782d3a60a5c16c8c9a7abf3d4b1ecd42 2021-08-10) ==
+
+* Updated to v2.1.1 release.
+
+== November 19, 2020 (libjpeg-turbo v2.0.6 10ba6ed3365615ed5c2995fe2d240cb2d5000173 2020-11-16) ==
+
+* Updated to v2.0.6 release.
+
+== January 6, 2020 (libjpeg-turbo v2.0.4 166e34213e4f4e2363ce058a7bcc69fd03e38b76 2019-12-31) ==
+
+* Updated to v2.0.4 release.
+
+== September 5, 2019 (libjpeg-turbo v2.0.3 5db6a6819d0f904e0b58f34ae928fea234adb1a0 2019-09-04) ==
+
+* Updated to v2.0.3 release.
+
+== October 4, 2018 (libjpeg-turbo v2.0.0 574f3a772c96dc9db2c98ef24706feb3f6dbda9a 2018-06-27) ==
+
+* Updated to v2.0.0 release.
+
+== July 13, 2017 (libjpeg-turbo v1.5.2 e5c1613ccdfeffcd060fd94248b7c8ac7c0cfb0f 2017-08-09) ==
+
+* Updated to v1.5.2 release.
+
== September 22, 2016 (libjpeg-turbo v1.5.1 cb88e5da8003afcdc443b787fdcb77285e5a8a02 2016-09-20) ==
* Updated to v1.5.1 release.
diff --git a/media/libjpeg/README.ijg b/media/libjpeg/README.ijg
index 9c450ceb07..9453c19501 100644
--- a/media/libjpeg/README.ijg
+++ b/media/libjpeg/README.ijg
@@ -43,7 +43,7 @@ User documentation:
change.log Version-to-version change highlights.
Programmer and internal documentation:
libjpeg.txt How to use the JPEG library in your own programs.
- example.c Sample code for calling the JPEG library.
+ example.txt Sample code for calling the JPEG library.
structure.txt Overview of the JPEG library's internal structure.
coderules.txt Coding style rules --- please read if you contribute code.
@@ -128,7 +128,7 @@ with respect to this software, its quality, accuracy, merchantability, or
fitness for a particular purpose. This software is provided "AS IS", and you,
its user, assume the entire risk as to its quality and accuracy.
-This software is copyright (C) 1991-2016, Thomas G. Lane, Guido Vollbeding.
+This software is copyright (C) 1991-2020, Thomas G. Lane, Guido Vollbeding.
All Rights Reserved except as specified below.
Permission is hereby granted to use, copy, modify, and distribute this
@@ -159,25 +159,6 @@ commercial products, provided that all warranty or liability claims are
assumed by the product vendor.
-The Unix configuration script "configure" was produced with GNU Autoconf.
-It is copyright by the Free Software Foundation but is freely distributable.
-The same holds for its supporting scripts (config.guess, config.sub,
-ltmain.sh). Another support script, install-sh, is copyright by X Consortium
-but is also freely distributable.
-
-The IJG distribution formerly included code to read and write GIF files.
-To avoid entanglement with the Unisys LZW patent (now expired), GIF reading
-support has been removed altogether, and the GIF writer has been simplified
-to produce "uncompressed GIFs". This technique does not use the LZW
-algorithm; the resulting GIF files are larger than usual, but are readable
-by all standard GIF decoders.
-
-We are required to state that
- "The Graphics Interchange Format(c) is the Copyright property of
- CompuServe Incorporated. GIF(sm) is a Service Mark property of
- CompuServe Incorporated."
-
-
REFERENCES
==========
@@ -185,8 +166,8 @@ We recommend reading one or more of these references before trying to
understand the innards of the JPEG software.
The best short technical introduction to the JPEG compression algorithm is
- Wallace, Gregory K. "The JPEG Still Picture Compression Standard",
- Communications of the ACM, April 1991 (vol. 34 no. 4), pp. 30-44.
+ Wallace, Gregory K. "The JPEG Still Picture Compression Standard",
+ Communications of the ACM, April 1991 (vol. 34 no. 4), pp. 30-44.
(Adjacent articles in that issue discuss MPEG motion picture compression,
applications of JPEG, and related topics.) If you don't have the CACM issue
handy, a PDF file containing a revised version of Wallace's article is
@@ -220,21 +201,21 @@ Continuous-tone Still Images, Part 2: Compliance testing" and has document
numbers ISO/IEC IS 10918-2, ITU-T T.83.
The JPEG standard does not specify all details of an interchangeable file
-format. For the omitted details we follow the "JFIF" conventions, revision
-1.02. JFIF 1.02 has been adopted as an Ecma International Technical Report
-and thus received a formal publication status. It is available as a free
-download in PDF format from
-http://www.ecma-international.org/publications/techreports/E-TR-098.htm.
-A PostScript version of the JFIF document is available at
-http://www.ijg.org/files/jfif.ps.gz. There is also a plain text version at
-http://www.ijg.org/files/jfif.txt.gz, but it is missing the figures.
-
-The TIFF 6.0 file format specification can be obtained by FTP from
-ftp://ftp.sgi.com/graphics/tiff/TIFF6.ps.gz. The JPEG incorporation scheme
-found in the TIFF 6.0 spec of 3-June-92 has a number of serious problems.
-IJG does not recommend use of the TIFF 6.0 design (TIFF Compression tag 6).
-Instead, we recommend the JPEG design proposed by TIFF Technical Note #2
-(Compression tag 7). Copies of this Note can be obtained from
+format. For the omitted details, we follow the "JFIF" conventions, revision
+1.02. JFIF version 1 has been adopted as ISO/IEC 10918-5 (05/2013) and
+Recommendation ITU-T T.871 (05/2011): Information technology - Digital
+compression and coding of continuous-tone still images: JPEG File Interchange
+Format (JFIF). It is available as a free download in PDF file format from
+https://www.iso.org/standard/54989.html and http://www.itu.int/rec/T-REC-T.871.
+A PDF file of the older JFIF 1.02 specification is available at
+http://www.w3.org/Graphics/JPEG/jfif3.pdf.
+
+The TIFF 6.0 file format specification can be obtained from
+http://mirrors.ctan.org/graphics/tiff/TIFF6.ps.gz. The JPEG incorporation
+scheme found in the TIFF 6.0 spec of 3-June-92 has a number of serious
+problems. IJG does not recommend use of the TIFF 6.0 design (TIFF Compression
+tag 6). Instead, we recommend the JPEG design proposed by TIFF Technical Note
+#2 (Compression tag 7). Copies of this Note can be obtained from
http://www.ijg.org/files/. It is expected that the next revision
of the TIFF spec will replace the 6.0 JPEG design with the Note's design.
Although IJG's own code does not support TIFF/JPEG, the free libtiff library
@@ -249,28 +230,26 @@ The most recent released version can always be found there in
directory "files".
The JPEG FAQ (Frequently Asked Questions) article is a source of some
-general information about JPEG.
-It is available on the World Wide Web at http://www.faqs.org/faqs/jpeg-faq/
-and other news.answers archive sites, including the official news.answers
-archive at rtfm.mit.edu: ftp://rtfm.mit.edu/pub/usenet/news.answers/jpeg-faq/.
-If you don't have Web or FTP access, send e-mail to mail-server@rtfm.mit.edu
-with body
- send usenet/news.answers/jpeg-faq/part1
- send usenet/news.answers/jpeg-faq/part2
-
-
-FILE FORMAT WARS
-================
-
-The ISO/IEC JTC1/SC29/WG1 standards committee (also known as JPEG, together
-with ITU-T SG16) currently promotes different formats containing the name
-"JPEG" which are incompatible with original DCT-based JPEG. IJG therefore does
-not support these formats (see REFERENCES). Indeed, one of the original
-reasons for developing this free software was to help force convergence on
-common, interoperable format standards for JPEG files.
-Don't use an incompatible file format!
-(In any case, our decoder will remain capable of reading existing JPEG
-image files indefinitely.)
+general information about JPEG. It is available at
+http://www.faqs.org/faqs/jpeg-faq.
+
+
+FILE FORMAT COMPATIBILITY
+=========================
+
+This software implements ITU T.81 | ISO/IEC 10918 with some extensions from
+ITU T.871 | ISO/IEC 10918-5 (JPEG File Interchange Format -- see REFERENCES).
+Informally, the term "JPEG image" or "JPEG file" most often refers to JFIF or
+a subset thereof, but there are other formats containing the name "JPEG" that
+are incompatible with the DCT-based JPEG standard or with JFIF (for instance,
+JPEG 2000 and JPEG XR). This software therefore does not support these
+formats. Indeed, one of the original reasons for developing this free software
+was to help force convergence on a common, interoperable format standard for
+JPEG files.
+
+JFIF is a minimal or "low end" representation. TIFF/JPEG (TIFF revision 6.0 as
+modified by TIFF Technical Note #2) can be used for "high end" applications
+that need to record a lot of additional data about an image.
TO DO
diff --git a/media/libjpeg/README.md b/media/libjpeg/README.md
index ca8866e06c..01e391ea7c 100755..100644
--- a/media/libjpeg/README.md
+++ b/media/libjpeg/README.md
@@ -1,13 +1,14 @@
Background
==========
-libjpeg-turbo is a JPEG image codec that uses SIMD instructions (MMX, SSE2,
-NEON, AltiVec) to accelerate baseline JPEG compression and decompression on
-x86, x86-64, ARM, and PowerPC systems. On such systems, libjpeg-turbo is
-generally 2-6x as fast as libjpeg, all else being equal. On other types of
-systems, libjpeg-turbo can still outperform libjpeg by a significant amount, by
-virtue of its highly-optimized Huffman coding routines. In many cases, the
-performance of libjpeg-turbo rivals that of proprietary high-speed JPEG codecs.
+libjpeg-turbo is a JPEG image codec that uses SIMD instructions to accelerate
+baseline JPEG compression and decompression on x86, x86-64, Arm, PowerPC, and
+MIPS systems, as well as progressive JPEG compression on x86, x86-64, and Arm
+systems. On such systems, libjpeg-turbo is generally 2-6x as fast as libjpeg,
+all else being equal. On other types of systems, libjpeg-turbo can still
+outperform libjpeg by a significant amount, by virtue of its highly-optimized
+Huffman coding routines. In many cases, the performance of libjpeg-turbo
+rivals that of proprietary high-speed JPEG codecs.
libjpeg-turbo implements both the traditional libjpeg API as well as the less
powerful but more straightforward TurboJPEG API. libjpeg-turbo also features
@@ -42,21 +43,25 @@ Using libjpeg-turbo
libjpeg-turbo includes two APIs that can be used to compress and decompress
JPEG images:
-- **TurboJPEG API**
+- **TurboJPEG API**<br>
This API provides an easy-to-use interface for compressing and decompressing
JPEG images in memory. It also provides some functionality that would not be
straightforward to achieve using the underlying libjpeg API, such as
generating planar YUV images and performing multiple simultaneous lossless
transforms on an image. The Java interface for libjpeg-turbo is written on
- top of the TurboJPEG API.
+ top of the TurboJPEG API. The TurboJPEG API is recommended for first-time
+ users of libjpeg-turbo. Refer to [tjexample.c](tjexample.c) and
+ [TJExample.java](java/TJExample.java) for examples of its usage and to
+ <http://libjpeg-turbo.org/Documentation/Documentation> for API documentation.
-- **libjpeg API**
+- **libjpeg API**<br>
This is the de facto industry-standard API for compressing and decompressing
JPEG images. It is more difficult to use than the TurboJPEG API but also
more powerful. The libjpeg API implementation in libjpeg-turbo is both
API/ABI-compatible and mathematically compatible with libjpeg v6b. It can
also optionally be configured to be API/ABI-compatible with libjpeg v7 and v8
- (see below.)
+ (see below.) Refer to [cjpeg.c](cjpeg.c) and [djpeg.c](djpeg.c) for examples
+ of its usage and to [libjpeg.txt](libjpeg.txt) for API documentation.
There is no significant performance advantage to either API when both are used
to perform similar operations.
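As an illustrative sketch of the TurboJPEG API described above (editorial example, not part of this patch; it uses the documented tjInitCompress()/tjCompress2() interface), compressing an RGB buffer to an in-memory JPEG image looks roughly like this:

```c
/* Sketch: compress an RGB buffer with the TurboJPEG API.
 * Error handling is abbreviated for clarity. */
#include <stdio.h>
#include <turbojpeg.h>

int compress_rgb(const unsigned char *rgb, int width, int height,
                 unsigned char **jpegBuf, unsigned long *jpegSize)
{
  tjhandle handle = tjInitCompress();

  if (handle == NULL)
    return -1;
  /* Passing *jpegBuf == NULL asks TurboJPEG to allocate the output buffer;
   * the caller must release it with tjFree(). */
  *jpegBuf = NULL;
  if (tjCompress2(handle, rgb, width, 0 /* pitch: 0 = width * 3 for RGB */,
                  height, TJPF_RGB, jpegBuf, jpegSize, TJSAMP_420,
                  90 /* quality */, 0 /* flags */) < 0) {
    fprintf(stderr, "tjCompress2(): %s\n", tjGetErrorStr());
    tjDestroy(handle);
    return -1;
  }
  tjDestroy(handle);
  return 0;
}
```

Decompression is symmetrical (tjInitDecompress(), then tjDecompressHeader3() and tjDecompress2()).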
@@ -130,28 +135,27 @@ without recompiling. libjpeg-turbo does not claim to support all of the
libjpeg v7+ features, nor to produce identical output to libjpeg v7+ in all
cases (see below.)
-By passing an argument of `--with-jpeg7` or `--with-jpeg8` to `configure`, or
-an argument of `-DWITH_JPEG7=1` or `-DWITH_JPEG8=1` to `cmake`, you can build a
-version of libjpeg-turbo that emulates the libjpeg v7 or v8 ABI, so that
-programs that are built against libjpeg v7 or v8 can be run with libjpeg-turbo.
-The following section describes which libjpeg v7+ features are supported and
-which aren't.
+By passing an argument of `-DWITH_JPEG7=1` or `-DWITH_JPEG8=1` to `cmake`, you
+can build a version of libjpeg-turbo that emulates the libjpeg v7 or v8 ABI, so
+that programs that are built against libjpeg v7 or v8 can be run with
+libjpeg-turbo. The following section describes which libjpeg v7+ features are
+supported and which aren't.
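For illustration (an editorial sketch, not part of this patch): a program can verify at compile time which ABI its libjpeg headers declare, because jconfig.h/jpeglib.h define `JPEG_LIB_VERSION` as 62, 70, or 80 for v6b, v7, and v8 emulation respectively:

```c
#include <stdio.h>
#include <jpeglib.h>

/* Report which libjpeg API/ABI generation this binary was compiled against. */
void print_jpeg_abi(void)
{
#if JPEG_LIB_VERSION >= 80
  printf("libjpeg v8 API/ABI (JPEG_LIB_VERSION = %d)\n", JPEG_LIB_VERSION);
#elif JPEG_LIB_VERSION >= 70
  printf("libjpeg v7 API/ABI (JPEG_LIB_VERSION = %d)\n", JPEG_LIB_VERSION);
#else
  printf("libjpeg v6b API/ABI (JPEG_LIB_VERSION = %d)\n", JPEG_LIB_VERSION);
#endif
}
```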
### Support for libjpeg v7 and v8 Features
#### Fully supported
-- **libjpeg: IDCT scaling extensions in decompressor**
+- **libjpeg API: IDCT scaling extensions in decompressor**<br>
libjpeg-turbo supports IDCT scaling with scaling factors of 1/8, 1/4, 3/8,
1/2, 5/8, 3/4, 7/8, 9/8, 5/4, 11/8, 3/2, 13/8, 7/4, 15/8, and 2/1 (only 1/4
and 1/2 are SIMD-accelerated.)
-- **libjpeg: Arithmetic coding**
+- **libjpeg API: Arithmetic coding**
-- **libjpeg: In-memory source and destination managers**
+- **libjpeg API: In-memory source and destination managers**<br>
See notes below.
-- **cjpeg: Separate quality settings for luminance and chrominance**
+- **cjpeg: Separate quality settings for luminance and chrominance**<br>
Note that the libjpeg v7+ API was extended to accommodate this feature only
for convenience purposes. It has always been possible to implement this
feature with libjpeg v6b (see rdswitch.c for an example.)
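The IDCT scaling extension in the "Fully supported" list above can be exercised as follows (an editorial sketch, not part of this patch; `scale_num` and `scale_denom` are standard fields of `jpeg_decompress_struct`):

```c
#include <stdio.h>
#include <jpeglib.h>

/* Request 1/2-scale output.  Must be called after jpeg_read_header() and
 * before jpeg_start_decompress(), which then computes the scaled
 * output_width/output_height. */
void request_half_scale(struct jpeg_decompress_struct *dinfo)
{
  dinfo->scale_num = 1;
  dinfo->scale_denom = 2;   /* one of the supported scaling factors */
  jpeg_start_decompress(dinfo);
}
```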
@@ -175,19 +179,19 @@ which aren't.
NOTE: As of this writing, extensive research has been conducted into the
usefulness of DCT scaling as a means of data reduction and SmartScale as a
-means of quality improvement. The reader is invited to peruse the research at
-<http://www.libjpeg-turbo.org/About/SmartScale> and draw his/her own conclusions,
+means of quality improvement. Readers are invited to peruse the research at
+<http://www.libjpeg-turbo.org/About/SmartScale> and draw their own conclusions,
but it is the general belief of our project that these features have not
demonstrated sufficient usefulness to justify inclusion in libjpeg-turbo.
-- **libjpeg: DCT scaling in compressor**
+- **libjpeg API: DCT scaling in compressor**<br>
`cinfo.scale_num` and `cinfo.scale_denom` are silently ignored.
There is no technical reason why DCT scaling could not be supported when
emulating the libjpeg v7+ API/ABI, but without the SmartScale extension (see
below), only scaling factors of 1/2, 8/15, 4/7, 8/13, 2/3, 8/11, 4/5, and
8/9 would be available, which is of limited usefulness.
-- **libjpeg: SmartScale**
+- **libjpeg API: SmartScale**<br>
`cinfo.block_size` is silently ignored.
SmartScale is an extension to the JPEG format that allows for DCT block
sizes other than 8x8. Providing support for this new format would be
@@ -200,15 +204,15 @@ demonstrated sufficient usefulness to justify inclusion in libjpeg-turbo.
interest in providing this feature would be as a means of supporting
additional DCT scaling factors.
-- **libjpeg: Fancy downsampling in compressor**
+- **libjpeg API: Fancy downsampling in compressor**<br>
`cinfo.do_fancy_downsampling` is silently ignored.
This requires the DCT scaling feature, which is not supported.
-- **jpegtran: Scaling**
+- **jpegtran: Scaling**<br>
This requires both the DCT scaling and SmartScale features, which are not
supported.
-- **Lossless RGB JPEG files**
+- **Lossless RGB JPEG files**<br>
This requires the SmartScale feature, which is not supported.
### What About libjpeg v9?
@@ -226,7 +230,7 @@ generally accomplish anything that can't already be accomplished better with
existing, standard lossless formats. Therefore, at this time it is our belief
that there is not sufficient technical justification for software projects to
upgrade from libjpeg v8 to libjpeg v9, and thus there is not sufficient
-echnical justification for us to emulate the libjpeg v9 ABI.
+technical justification for us to emulate the libjpeg v9 ABI.
In-Memory Source/Destination Managers
-------------------------------------
@@ -242,15 +246,14 @@ don't, and it allows those functions to be provided in the "official"
libjpeg-turbo binaries.
Those who are concerned about maintaining strict conformance with the libjpeg
-v6b or v7 API can pass an argument of `--without-mem-srcdst` to `configure` or
-an argument of `-DWITH_MEM_SRCDST=0` to `cmake` prior to building
-libjpeg-turbo. This will restore the pre-1.3 behavior, in which
+v6b or v7 API can pass an argument of `-DWITH_MEM_SRCDST=0` to `cmake` prior to
+building libjpeg-turbo. This will restore the pre-1.3 behavior, in which
`jpeg_mem_src()` and `jpeg_mem_dest()` are only included when emulating the
libjpeg v8 API/ABI.
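A minimal sketch of the in-memory destination manager discussed here (editorial example, not part of this patch), using the documented `jpeg_mem_dest()` signature:

```c
/* Sketch: compress to a malloc()'d buffer with the in-memory
 * destination manager.  Error handling is abbreviated. */
#include <stdio.h>
#include <stdlib.h>
#include <jpeglib.h>

void compress_to_memory(JSAMPARRAY rows, int width, int height)
{
  struct jpeg_compress_struct cinfo;
  struct jpeg_error_mgr jerr;
  unsigned char *outbuffer = NULL;  /* jpeg_mem_dest() allocates/grows it */
  unsigned long outsize = 0;

  cinfo.err = jpeg_std_error(&jerr);
  jpeg_create_compress(&cinfo);
  jpeg_mem_dest(&cinfo, &outbuffer, &outsize);

  cinfo.image_width = width;
  cinfo.image_height = height;
  cinfo.input_components = 3;
  cinfo.in_color_space = JCS_RGB;
  jpeg_set_defaults(&cinfo);

  jpeg_start_compress(&cinfo, TRUE);
  while (cinfo.next_scanline < cinfo.image_height)
    jpeg_write_scanlines(&cinfo, &rows[cinfo.next_scanline], 1);
  jpeg_finish_compress(&cinfo);

  /* outbuffer now holds outsize bytes of JPEG data. */
  jpeg_destroy_compress(&cinfo);
  free(outbuffer);
}
```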
On Un*x systems, including the in-memory source/destination managers changes
-the dynamic library version from 62.0.0 to 62.1.0 if using libjpeg v6b API/ABI
-emulation and from 7.0.0 to 7.1.0 if using libjpeg v7 API/ABI emulation.
+the dynamic library version from 62.2.0 to 62.3.0 if using libjpeg v6b API/ABI
+emulation and from 7.2.0 to 7.3.0 if using libjpeg v7 API/ABI emulation.
Note that, on most Un*x systems, the dynamic linker will not look for a
function in a library until that function is actually used. Thus, if a program
@@ -284,12 +287,13 @@ following reasons:
(and slightly faster) floating point IDCT algorithm introduced in libjpeg
v8a as opposed to the algorithm used in libjpeg v6b. It should be noted,
however, that this algorithm basically brings the accuracy of the floating
- point IDCT in line with the accuracy of the slow integer IDCT. The floating
- point DCT/IDCT algorithms are mainly a legacy feature, and they do not
- produce significantly more accuracy than the slow integer algorithms (to put
- numbers on this, the typical difference in PNSR between the two algorithms
- is less than 0.10 dB, whereas changing the quality level by 1 in the upper
- range of the quality scale is typically more like a 1.0 dB difference.)
+ point IDCT in line with the accuracy of the accurate integer IDCT. The
+ floating point DCT/IDCT algorithms are mainly a legacy feature, and they do
+ not produce significantly more accuracy than the accurate integer algorithms
+ (to put numbers on this, the typical difference in PSNR between the two
+ algorithms is less than 0.10 dB, whereas changing the quality level by 1 in
+ the upper range of the quality scale is typically more like a 1.0 dB
+ difference.)
- If the floating point algorithms in libjpeg-turbo are not implemented using
SIMD instructions on a particular platform, then the accuracy of the
@@ -326,7 +330,7 @@ in a way that makes the rest of the libjpeg infrastructure happy, so it is
necessary to use the slow Huffman decoder when decompressing a JPEG image that
has restart markers. This can cause the decompression performance to drop by
as much as 20%, but the performance will still be much greater than that of
-libjpeg. Many consumer packages, such as PhotoShop, use restart markers when
+libjpeg. Many consumer packages, such as Photoshop, use restart markers when
generating JPEG images, so images generated by those programs will experience
this issue.
@@ -337,5 +341,17 @@ The algorithm used by the SIMD-accelerated quantization function cannot produce
correct results whenever the fast integer forward DCT is used along with a JPEG
quality of 98-100. Thus, libjpeg-turbo must use the non-SIMD quantization
function in those cases. This causes performance to drop by as much as 40%.
-It is therefore strongly advised that you use the slow integer forward DCT
+It is therefore strongly advised that you use the accurate integer forward DCT
whenever encoding images with a JPEG quality of 98 or higher.
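A small sketch of that advice (editorial example, not part of this patch). `JDCT_ISLOW` selects the accurate integer DCT and is the library default, so this only matters if the application has previously opted into `JDCT_IFAST`:

```c
#include <stdio.h>
#include <jpeglib.h>

/* Configure an already-created compress object for quality >= 98.
 * Assumes cinfo->in_color_space was set before this call, as
 * jpeg_set_defaults() requires. */
void set_high_quality(j_compress_ptr cinfo, int quality)
{
  jpeg_set_defaults(cinfo);
  cinfo->dct_method = JDCT_ISLOW;          /* accurate integer DCT */
  jpeg_set_quality(cinfo, quality, TRUE);  /* e.g. quality = 98 */
}
```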
+
+
+Memory Debugger Pitfalls
+========================
+
+Valgrind and Memory Sanitizer (MSan) can generate false positives
+(specifically, incorrect reports of uninitialized memory accesses) when used
+with libjpeg-turbo's SIMD extensions. It is generally recommended that the
+SIMD extensions be disabled, either by passing an argument of `-DWITH_SIMD=0`
+to `cmake` when configuring the build or by setting the environment variable
+`JSIMD_FORCENONE` to `1` at run time, when testing libjpeg-turbo with Valgrind,
+MSan, or other memory debuggers.
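As a run-time alternative to rebuilding with `-DWITH_SIMD=0` (an editorial sketch, not part of this patch, assuming a POSIX `setenv()`), a test harness can set the variable from C before the first libjpeg-turbo call:

```c
#include <stdlib.h>

/* Disable the SIMD code paths for a Valgrind/MSan run.  This must happen
 * before the first compress/decompress object is created, because the SIMD
 * support flags are detected once and then cached. */
void disable_simd_for_memory_debugging(void)
{
  setenv("JSIMD_FORCENONE", "1", 1 /* overwrite */);
}
```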
diff --git a/media/libjpeg/jaricom.c b/media/libjpeg/jaricom.c
index 3bb557f7a3..215640cc44 100644
--- a/media/libjpeg/jaricom.c
+++ b/media/libjpeg/jaricom.c
@@ -4,16 +4,16 @@
* This file was part of the Independent JPEG Group's software:
* Developed 1997-2009 by Guido Vollbeding.
* libjpeg-turbo Modifications:
- * Copyright (C) 2015, D. R. Commander.
+ * Copyright (C) 2015, 2018, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
* This file contains probability estimation tables for common use in
* arithmetic entropy encoding and decoding routines.
*
- * This data represents Table D.2 in the JPEG spec (ISO/IEC IS 10918-1
- * and CCITT Recommendation ITU-T T.81) and Table 24 in the JBIG spec
- * (ISO/IEC IS 11544 and CCITT Recommendation ITU-T T.82).
+ * This data represents Table D.2 in
+ * Recommendation ITU-T T.81 (1992) | ISO/IEC 10918-1:1994 and Table 24 in
+ * Recommendation ITU-T T.82 (1993) | ISO/IEC 11544:1993.
*/
#define JPEG_INTERNALS
@@ -29,9 +29,10 @@
* implementation (jbig_tab.c).
*/
-#define V(i,a,b,c,d) (((JLONG)a << 16) | ((JLONG)c << 8) | ((JLONG)d << 7) | b)
+#define V(i, a, b, c, d) \
+ (((JLONG)a << 16) | ((JLONG)c << 8) | ((JLONG)d << 7) | b)
-const JLONG jpeg_aritab[113+1] = {
+const JLONG jpeg_aritab[113 + 1] = {
/*
* Index, Qe_Value, Next_Index_LPS, Next_Index_MPS, Switch_MPS
*/
diff --git a/media/libjpeg/jcapimin.c b/media/libjpeg/jcapimin.c
index 15674be54a..84e7ecc9a7 100644
--- a/media/libjpeg/jcapimin.c
+++ b/media/libjpeg/jcapimin.c
@@ -4,8 +4,8 @@
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1994-1998, Thomas G. Lane.
* Modified 2003-2010 by Guido Vollbeding.
- * It was modified by The libjpeg-turbo Project to include only code relevant
- * to libjpeg-turbo.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2022, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@@ -31,7 +31,7 @@
*/
GLOBAL(void)
-jpeg_CreateCompress (j_compress_ptr cinfo, int version, size_t structsize)
+jpeg_CreateCompress(j_compress_ptr cinfo, int version, size_t structsize)
{
int i;
@@ -41,7 +41,7 @@ jpeg_CreateCompress (j_compress_ptr cinfo, int version, size_t structsize)
ERREXIT2(cinfo, JERR_BAD_LIB_VERSION, JPEG_LIB_VERSION, version);
if (structsize != sizeof(struct jpeg_compress_struct))
ERREXIT2(cinfo, JERR_BAD_STRUCT_SIZE,
- (int) sizeof(struct jpeg_compress_struct), (int) structsize);
+ (int)sizeof(struct jpeg_compress_struct), (int)structsize);
/* For debugging purposes, we zero the whole master structure.
* But the application has already set the err pointer, and may have set
@@ -52,14 +52,14 @@ jpeg_CreateCompress (j_compress_ptr cinfo, int version, size_t structsize)
{
struct jpeg_error_mgr *err = cinfo->err;
void *client_data = cinfo->client_data; /* ignore Purify complaint here */
- MEMZERO(cinfo, sizeof(struct jpeg_compress_struct));
+ memset(cinfo, 0, sizeof(struct jpeg_compress_struct));
cinfo->err = err;
cinfo->client_data = client_data;
}
cinfo->is_decompressor = FALSE;
/* Initialize a memory manager instance for this object */
- jinit_memory_mgr((j_common_ptr) cinfo);
+ jinit_memory_mgr((j_common_ptr)cinfo);
/* Zero out pointers to permanent structures. */
cinfo->progress = NULL;
@@ -83,7 +83,7 @@ jpeg_CreateCompress (j_compress_ptr cinfo, int version, size_t structsize)
/* Must do it here for emit_dqt in case jpeg_write_tables is used */
cinfo->block_size = DCTSIZE;
cinfo->natural_order = jpeg_natural_order;
- cinfo->lim_Se = DCTSIZE2-1;
+ cinfo->lim_Se = DCTSIZE2 - 1;
#endif
cinfo->script_space = NULL;
@@ -100,9 +100,9 @@ jpeg_CreateCompress (j_compress_ptr cinfo, int version, size_t structsize)
*/
GLOBAL(void)
-jpeg_destroy_compress (j_compress_ptr cinfo)
+jpeg_destroy_compress(j_compress_ptr cinfo)
{
- jpeg_destroy((j_common_ptr) cinfo); /* use common routine */
+ jpeg_destroy((j_common_ptr)cinfo); /* use common routine */
}
@@ -112,9 +112,9 @@ jpeg_destroy_compress (j_compress_ptr cinfo)
*/
GLOBAL(void)
-jpeg_abort_compress (j_compress_ptr cinfo)
+jpeg_abort_compress(j_compress_ptr cinfo)
{
- jpeg_abort((j_common_ptr) cinfo); /* use common routine */
+ jpeg_abort((j_common_ptr)cinfo); /* use common routine */
}
@@ -131,7 +131,7 @@ jpeg_abort_compress (j_compress_ptr cinfo)
*/
GLOBAL(void)
-jpeg_suppress_tables (j_compress_ptr cinfo, boolean suppress)
+jpeg_suppress_tables(j_compress_ptr cinfo, boolean suppress)
{
int i;
JQUANT_TBL *qtbl;
@@ -159,7 +159,7 @@ jpeg_suppress_tables (j_compress_ptr cinfo, boolean suppress)
*/
GLOBAL(void)
-jpeg_finish_compress (j_compress_ptr cinfo)
+jpeg_finish_compress(j_compress_ptr cinfo)
{
JDIMENSION iMCU_row;
@@ -172,18 +172,18 @@ jpeg_finish_compress (j_compress_ptr cinfo)
} else if (cinfo->global_state != CSTATE_WRCOEFS)
ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
/* Perform any remaining passes */
- while (! cinfo->master->is_last_pass) {
+ while (!cinfo->master->is_last_pass) {
(*cinfo->master->prepare_for_pass) (cinfo);
for (iMCU_row = 0; iMCU_row < cinfo->total_iMCU_rows; iMCU_row++) {
if (cinfo->progress != NULL) {
- cinfo->progress->pass_counter = (long) iMCU_row;
- cinfo->progress->pass_limit = (long) cinfo->total_iMCU_rows;
- (*cinfo->progress->progress_monitor) ((j_common_ptr) cinfo);
+ cinfo->progress->pass_counter = (long)iMCU_row;
+ cinfo->progress->pass_limit = (long)cinfo->total_iMCU_rows;
+ (*cinfo->progress->progress_monitor) ((j_common_ptr)cinfo);
}
/* We bypass the main controller and invoke coef controller directly;
* all work is being done from the coefficient buffer.
*/
- if (! (*cinfo->coef->compress_data) (cinfo, (JSAMPIMAGE) NULL))
+ if (!(*cinfo->coef->compress_data) (cinfo, (JSAMPIMAGE)NULL))
ERREXIT(cinfo, JERR_CANT_SUSPEND);
}
(*cinfo->master->finish_pass) (cinfo);
@@ -192,7 +192,7 @@ jpeg_finish_compress (j_compress_ptr cinfo)
(*cinfo->marker->write_file_trailer) (cinfo);
(*cinfo->dest->term_destination) (cinfo);
/* We can use jpeg_abort to release memory and reset global_state */
- jpeg_abort((j_common_ptr) cinfo);
+ jpeg_abort((j_common_ptr)cinfo);
}
@@ -204,8 +204,8 @@ jpeg_finish_compress (j_compress_ptr cinfo)
*/
GLOBAL(void)
-jpeg_write_marker (j_compress_ptr cinfo, int marker,
- const JOCTET *dataptr, unsigned int datalen)
+jpeg_write_marker(j_compress_ptr cinfo, int marker, const JOCTET *dataptr,
+ unsigned int datalen)
{
void (*write_marker_byte) (j_compress_ptr info, int val);
@@ -226,7 +226,7 @@ jpeg_write_marker (j_compress_ptr cinfo, int marker,
/* Same, but piecemeal. */
GLOBAL(void)
-jpeg_write_m_header (j_compress_ptr cinfo, int marker, unsigned int datalen)
+jpeg_write_m_header(j_compress_ptr cinfo, int marker, unsigned int datalen)
{
if (cinfo->next_scanline != 0 ||
(cinfo->global_state != CSTATE_SCANNING &&
@@ -238,7 +238,7 @@ jpeg_write_m_header (j_compress_ptr cinfo, int marker, unsigned int datalen)
}
GLOBAL(void)
-jpeg_write_m_byte (j_compress_ptr cinfo, int val)
+jpeg_write_m_byte(j_compress_ptr cinfo, int val)
{
(*cinfo->marker->write_marker_byte) (cinfo, val);
}
@@ -266,13 +266,13 @@ jpeg_write_m_byte (j_compress_ptr cinfo, int val)
*/
GLOBAL(void)
-jpeg_write_tables (j_compress_ptr cinfo)
+jpeg_write_tables(j_compress_ptr cinfo)
{
if (cinfo->global_state != CSTATE_START)
ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
/* (Re)initialize error mgr and destination modules */
- (*cinfo->err->reset_error_mgr) ((j_common_ptr) cinfo);
+ (*cinfo->err->reset_error_mgr) ((j_common_ptr)cinfo);
(*cinfo->dest->init_destination) (cinfo);
/* Initialize the marker writer ... bit of a crock to do it here. */
jinit_marker_writer(cinfo);
diff --git a/media/libjpeg/jcapistd.c b/media/libjpeg/jcapistd.c
index 5c6d0be255..aa2aad9f66 100644
--- a/media/libjpeg/jcapistd.c
+++ b/media/libjpeg/jcapistd.c
@@ -36,7 +36,7 @@
*/
GLOBAL(void)
-jpeg_start_compress (j_compress_ptr cinfo, boolean write_all_tables)
+jpeg_start_compress(j_compress_ptr cinfo, boolean write_all_tables)
{
if (cinfo->global_state != CSTATE_START)
ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
@@ -45,7 +45,7 @@ jpeg_start_compress (j_compress_ptr cinfo, boolean write_all_tables)
jpeg_suppress_tables(cinfo, FALSE); /* mark all tables to be written */
/* (Re)initialize error mgr and destination modules */
- (*cinfo->err->reset_error_mgr) ((j_common_ptr) cinfo);
+ (*cinfo->err->reset_error_mgr) ((j_common_ptr)cinfo);
(*cinfo->dest->init_destination) (cinfo);
/* Perform master selection of active modules */
jinit_compress_master(cinfo);
@@ -75,8 +75,8 @@ jpeg_start_compress (j_compress_ptr cinfo, boolean write_all_tables)
*/
GLOBAL(JDIMENSION)
-jpeg_write_scanlines (j_compress_ptr cinfo, JSAMPARRAY scanlines,
- JDIMENSION num_lines)
+jpeg_write_scanlines(j_compress_ptr cinfo, JSAMPARRAY scanlines,
+ JDIMENSION num_lines)
{
JDIMENSION row_ctr, rows_left;
@@ -87,9 +87,9 @@ jpeg_write_scanlines (j_compress_ptr cinfo, JSAMPARRAY scanlines,
/* Call progress monitor hook if present */
if (cinfo->progress != NULL) {
- cinfo->progress->pass_counter = (long) cinfo->next_scanline;
- cinfo->progress->pass_limit = (long) cinfo->image_height;
- (*cinfo->progress->progress_monitor) ((j_common_ptr) cinfo);
+ cinfo->progress->pass_counter = (long)cinfo->next_scanline;
+ cinfo->progress->pass_limit = (long)cinfo->image_height;
+ (*cinfo->progress->progress_monitor) ((j_common_ptr)cinfo);
}
/* Give master control module another chance if this is first call to
@@ -118,8 +118,8 @@ jpeg_write_scanlines (j_compress_ptr cinfo, JSAMPARRAY scanlines,
*/
GLOBAL(JDIMENSION)
-jpeg_write_raw_data (j_compress_ptr cinfo, JSAMPIMAGE data,
- JDIMENSION num_lines)
+jpeg_write_raw_data(j_compress_ptr cinfo, JSAMPIMAGE data,
+ JDIMENSION num_lines)
{
JDIMENSION lines_per_iMCU_row;
@@ -132,9 +132,9 @@ jpeg_write_raw_data (j_compress_ptr cinfo, JSAMPIMAGE data,
/* Call progress monitor hook if present */
if (cinfo->progress != NULL) {
- cinfo->progress->pass_counter = (long) cinfo->next_scanline;
- cinfo->progress->pass_limit = (long) cinfo->image_height;
- (*cinfo->progress->progress_monitor) ((j_common_ptr) cinfo);
+ cinfo->progress->pass_counter = (long)cinfo->next_scanline;
+ cinfo->progress->pass_limit = (long)cinfo->image_height;
+ (*cinfo->progress->progress_monitor) ((j_common_ptr)cinfo);
}
/* Give master control module another chance if this is first call to
@@ -151,7 +151,7 @@ jpeg_write_raw_data (j_compress_ptr cinfo, JSAMPIMAGE data,
ERREXIT(cinfo, JERR_BUFFER_SIZE);
/* Directly compress the row. */
- if (! (*cinfo->coef->compress_data) (cinfo, data)) {
+ if (!(*cinfo->coef->compress_data) (cinfo, data)) {
/* If compressor did not consume the whole row, suspend processing. */
return 0;
}
diff --git a/media/libjpeg/jcarith.c b/media/libjpeg/jcarith.c
index 6d3b8af5b4..b1720521bf 100644
--- a/media/libjpeg/jcarith.c
+++ b/media/libjpeg/jcarith.c
@@ -4,16 +4,19 @@
* This file was part of the Independent JPEG Group's software:
* Developed 1997-2009 by Guido Vollbeding.
* libjpeg-turbo Modifications:
- * Copyright (C) 2015, D. R. Commander.
+ * Copyright (C) 2015, 2018, 2021-2022, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
* This file contains portable arithmetic entropy encoding routines for JPEG
- * (implementing the ISO/IEC IS 10918-1 and CCITT Recommendation ITU-T T.81).
+ * (implementing Recommendation ITU-T T.81 | ISO/IEC 10918-1).
*
* Both sequential and progressive modes are supported in this single module.
*
* Suspension is not currently supported in this module.
+ *
+ * NOTE: All referenced figures are from
+ * Recommendation ITU-T T.81 (1992) | ISO/IEC 10918-1:1994.
*/
#define JPEG_INTERNALS
@@ -63,8 +66,8 @@ typedef arith_entropy_encoder *arith_entropy_ptr;
* in the lower bits (mask 0x7F).
*/
-#define DC_STAT_BINS 64
-#define AC_STAT_BINS 256
+#define DC_STAT_BINS 64
+#define AC_STAT_BINS 256
/* NOTE: Uncomment the following #define if you want to use the
* given formula for calculating the AC conditioning parameter Kx
@@ -105,25 +108,25 @@ typedef arith_entropy_encoder *arith_entropy_ptr;
#ifdef RIGHT_SHIFT_IS_UNSIGNED
#define ISHIFT_TEMPS int ishift_temp;
-#define IRIGHT_SHIFT(x,shft) \
- ((ishift_temp = (x)) < 0 ? \
- (ishift_temp >> (shft)) | ((~0) << (16-(shft))) : \
- (ishift_temp >> (shft)))
+#define IRIGHT_SHIFT(x, shft) \
+ ((ishift_temp = (x)) < 0 ? \
+ (ishift_temp >> (shft)) | ((~0) << (16 - (shft))) : \
+ (ishift_temp >> (shft)))
#else
#define ISHIFT_TEMPS
-#define IRIGHT_SHIFT(x,shft) ((x) >> (shft))
+#define IRIGHT_SHIFT(x, shft) ((x) >> (shft))
#endif
LOCAL(void)
-emit_byte (int val, j_compress_ptr cinfo)
+emit_byte(int val, j_compress_ptr cinfo)
/* Write next output byte; we do not support suspension in this module. */
{
struct jpeg_destination_mgr *dest = cinfo->dest;
- *dest->next_output_byte++ = (JOCTET) val;
+ *dest->next_output_byte++ = (JOCTET)val;
if (--dest->free_in_buffer == 0)
- if (! (*dest->empty_output_buffer) (cinfo))
+ if (!(*dest->empty_output_buffer) (cinfo))
ERREXIT(cinfo, JERR_CANT_SUSPEND);
}
@@ -133,22 +136,22 @@ emit_byte (int val, j_compress_ptr cinfo)
*/
METHODDEF(void)
-finish_pass (j_compress_ptr cinfo)
+finish_pass(j_compress_ptr cinfo)
{
- arith_entropy_ptr e = (arith_entropy_ptr) cinfo->entropy;
+ arith_entropy_ptr e = (arith_entropy_ptr)cinfo->entropy;
JLONG temp;
/* Section D.1.8: Termination of encoding */
/* Find the e->c in the coding interval with the largest
* number of trailing zero bits */
- if ((temp = (e->a - 1 + e->c) & 0xFFFF0000L) < e->c)
+ if ((temp = (e->a - 1 + e->c) & 0xFFFF0000UL) < e->c)
e->c = temp + 0x8000L;
else
e->c = temp;
/* Send remaining bytes to output */
e->c <<= e->ct;
- if (e->c & 0xF8000000L) {
+ if (e->c & 0xF8000000UL) {
/* One final overflow has to be handled */
if (e->buffer >= 0) {
if (e->zc)
@@ -219,9 +222,9 @@ finish_pass (j_compress_ptr cinfo)
*/
LOCAL(void)
-arith_encode (j_compress_ptr cinfo, unsigned char *st, int val)
+arith_encode(j_compress_ptr cinfo, unsigned char *st, int val)
{
- register arith_entropy_ptr e = (arith_entropy_ptr) cinfo->entropy;
+ register arith_entropy_ptr e = (arith_entropy_ptr)cinfo->entropy;
register unsigned char nl, nm;
register JLONG qe, temp;
register int sv;
@@ -231,8 +234,8 @@ arith_encode (j_compress_ptr cinfo, unsigned char *st, int val)
*/
sv = *st;
qe = jpeg_aritab[sv & 0x7F]; /* => Qe_Value */
- nl = qe & 0xFF; qe >>= 8; /* Next_Index_LPS + Switch_MPS */
- nm = qe & 0xFF; qe >>= 8; /* Next_Index_MPS */
+ nl = qe & 0xFF; qe >>= 8; /* Next_Index_LPS + Switch_MPS */
+ nm = qe & 0xFF; qe >>= 8; /* Next_Index_MPS */
/* Encode & estimation procedures per sections D.1.4 & D.1.5 */
e->a -= qe;
@@ -319,9 +322,9 @@ arith_encode (j_compress_ptr cinfo, unsigned char *st, int val)
*/
LOCAL(void)
-emit_restart (j_compress_ptr cinfo, int restart_num)
+emit_restart(j_compress_ptr cinfo, int restart_num)
{
- arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+ arith_entropy_ptr entropy = (arith_entropy_ptr)cinfo->entropy;
int ci;
jpeg_component_info *compptr;
@@ -335,14 +338,14 @@ emit_restart (j_compress_ptr cinfo, int restart_num)
compptr = cinfo->cur_comp_info[ci];
/* DC needs no table for refinement scan */
if (cinfo->progressive_mode == 0 || (cinfo->Ss == 0 && cinfo->Ah == 0)) {
- MEMZERO(entropy->dc_stats[compptr->dc_tbl_no], DC_STAT_BINS);
+ memset(entropy->dc_stats[compptr->dc_tbl_no], 0, DC_STAT_BINS);
/* Reset DC predictions to 0 */
entropy->last_dc_val[ci] = 0;
entropy->dc_context[ci] = 0;
}
/* AC needs no table when not present */
if (cinfo->progressive_mode == 0 || cinfo->Se) {
- MEMZERO(entropy->ac_stats[compptr->ac_tbl_no], AC_STAT_BINS);
+ memset(entropy->ac_stats[compptr->ac_tbl_no], 0, AC_STAT_BINS);
}
}
@@ -362,9 +365,9 @@ emit_restart (j_compress_ptr cinfo, int restart_num)
*/
METHODDEF(boolean)
-encode_mcu_DC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+encode_mcu_DC_first(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
{
- arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+ arith_entropy_ptr entropy = (arith_entropy_ptr)cinfo->entropy;
JBLOCKROW block;
unsigned char *st;
int blkn, ci, tbl;
@@ -391,7 +394,7 @@ encode_mcu_DC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
/* Compute the DC value after the required point transform by Al.
* This is simply an arithmetic right shift.
*/
- m = IRIGHT_SHIFT((int) ((*block)[0]), cinfo->Al);
+ m = IRIGHT_SHIFT((int)((*block)[0]), cinfo->Al);
/* Sections F.1.4.1 & F.1.4.4.1: Encoding of DC coefficients */
@@ -432,9 +435,9 @@ encode_mcu_DC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
}
arith_encode(cinfo, st, 0);
/* Section F.1.4.4.1.2: Establish dc_context conditioning category */
- if (m < (int) ((1L << cinfo->arith_dc_L[tbl]) >> 1))
+ if (m < (int)((1L << cinfo->arith_dc_L[tbl]) >> 1))
entropy->dc_context[ci] = 0; /* zero diff category */
- else if (m > (int) ((1L << cinfo->arith_dc_U[tbl]) >> 1))
+ else if (m > (int)((1L << cinfo->arith_dc_U[tbl]) >> 1))
entropy->dc_context[ci] += 8; /* large diff category */
/* Figure F.9: Encoding the magnitude bit pattern of v */
st += 14;
@@ -453,9 +456,9 @@ encode_mcu_DC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
*/
METHODDEF(boolean)
-encode_mcu_AC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+encode_mcu_AC_first(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
{
- arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+ arith_entropy_ptr entropy = (arith_entropy_ptr)cinfo->entropy;
JBLOCKROW block;
unsigned char *st;
int tbl, k, ke;
@@ -510,7 +513,7 @@ encode_mcu_AC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
break;
}
}
- arith_encode(cinfo, st + 1, 0); st += 3; k++;
+ arith_encode(cinfo, st + 1, 0); st += 3; k++;
}
st += 2;
/* Figure F.8: Encoding the magnitude category of v */
@@ -552,9 +555,9 @@ encode_mcu_AC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
*/
METHODDEF(boolean)
-encode_mcu_DC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+encode_mcu_DC_refine(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
{
- arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+ arith_entropy_ptr entropy = (arith_entropy_ptr)cinfo->entropy;
unsigned char *st;
int Al, blkn;
@@ -587,9 +590,9 @@ encode_mcu_DC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
*/
METHODDEF(boolean)
-encode_mcu_AC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+encode_mcu_AC_refine(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
{
- arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+ arith_entropy_ptr entropy = (arith_entropy_ptr)cinfo->entropy;
JBLOCKROW block;
unsigned char *st;
int tbl, k, ke, kex;
@@ -662,7 +665,7 @@ encode_mcu_AC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
break;
}
}
- arith_encode(cinfo, st + 1, 0); st += 3; k++;
+ arith_encode(cinfo, st + 1, 0); st += 3; k++;
}
}
/* Encode EOB decision only if k <= cinfo->Se */
@@ -680,9 +683,9 @@ encode_mcu_AC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
*/
METHODDEF(boolean)
-encode_mcu (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+encode_mcu(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
{
- arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+ arith_entropy_ptr entropy = (arith_entropy_ptr)cinfo->entropy;
jpeg_component_info *compptr;
JBLOCKROW block;
unsigned char *st;
@@ -747,9 +750,9 @@ encode_mcu (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
}
arith_encode(cinfo, st, 0);
/* Section F.1.4.4.1.2: Establish dc_context conditioning category */
- if (m < (int) ((1L << cinfo->arith_dc_L[tbl]) >> 1))
+ if (m < (int)((1L << cinfo->arith_dc_L[tbl]) >> 1))
entropy->dc_context[ci] = 0; /* zero diff category */
- else if (m > (int) ((1L << cinfo->arith_dc_U[tbl]) >> 1))
+ else if (m > (int)((1L << cinfo->arith_dc_U[tbl]) >> 1))
entropy->dc_context[ci] += 8; /* large diff category */
/* Figure F.9: Encoding the magnitude bit pattern of v */
st += 14;
@@ -770,7 +773,7 @@ encode_mcu (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
st = entropy->ac_stats[tbl] + 3 * (k - 1);
arith_encode(cinfo, st, 0); /* EOB decision */
while ((v = (*block)[jpeg_natural_order[k]]) == 0) {
- arith_encode(cinfo, st + 1, 0); st += 3; k++;
+ arith_encode(cinfo, st + 1, 0); st += 3; k++;
}
arith_encode(cinfo, st + 1, 1);
/* Figure F.6: Encoding nonzero value v */
@@ -822,9 +825,9 @@ encode_mcu (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
*/
METHODDEF(void)
-start_pass (j_compress_ptr cinfo, boolean gather_statistics)
+start_pass(j_compress_ptr cinfo, boolean gather_statistics)
{
- arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+ arith_entropy_ptr entropy = (arith_entropy_ptr)cinfo->entropy;
int ci, tbl;
jpeg_component_info *compptr;
@@ -833,7 +836,7 @@ start_pass (j_compress_ptr cinfo, boolean gather_statistics)
* We are fully adaptive here and need no extra
* statistics gathering pass!
*/
- ERREXIT(cinfo, JERR_NOT_COMPILED);
+ ERREXIT(cinfo, JERR_NOTIMPL);
/* We assume jcmaster.c already validated the progressive scan parameters. */
@@ -862,9 +865,9 @@ start_pass (j_compress_ptr cinfo, boolean gather_statistics)
if (tbl < 0 || tbl >= NUM_ARITH_TBLS)
ERREXIT1(cinfo, JERR_NO_ARITH_TABLE, tbl);
if (entropy->dc_stats[tbl] == NULL)
- entropy->dc_stats[tbl] = (unsigned char *) (*cinfo->mem->alloc_small)
- ((j_common_ptr) cinfo, JPOOL_IMAGE, DC_STAT_BINS);
- MEMZERO(entropy->dc_stats[tbl], DC_STAT_BINS);
+ entropy->dc_stats[tbl] = (unsigned char *)(*cinfo->mem->alloc_small)
+ ((j_common_ptr)cinfo, JPOOL_IMAGE, DC_STAT_BINS);
+ memset(entropy->dc_stats[tbl], 0, DC_STAT_BINS);
/* Initialize DC predictions to 0 */
entropy->last_dc_val[ci] = 0;
entropy->dc_context[ci] = 0;
@@ -875,13 +878,14 @@ start_pass (j_compress_ptr cinfo, boolean gather_statistics)
if (tbl < 0 || tbl >= NUM_ARITH_TBLS)
ERREXIT1(cinfo, JERR_NO_ARITH_TABLE, tbl);
if (entropy->ac_stats[tbl] == NULL)
- entropy->ac_stats[tbl] = (unsigned char *) (*cinfo->mem->alloc_small)
- ((j_common_ptr) cinfo, JPOOL_IMAGE, AC_STAT_BINS);
- MEMZERO(entropy->ac_stats[tbl], AC_STAT_BINS);
+ entropy->ac_stats[tbl] = (unsigned char *)(*cinfo->mem->alloc_small)
+ ((j_common_ptr)cinfo, JPOOL_IMAGE, AC_STAT_BINS);
+ memset(entropy->ac_stats[tbl], 0, AC_STAT_BINS);
#ifdef CALCULATE_SPECTRAL_CONDITIONING
if (cinfo->progressive_mode)
/* Section G.1.3.2: Set appropriate arithmetic conditioning value Kx */
- cinfo->arith_ac_K[tbl] = cinfo->Ss + ((8 + cinfo->Se - cinfo->Ss) >> 4);
+ cinfo->arith_ac_K[tbl] = cinfo->Ss +
+ ((8 + cinfo->Se - cinfo->Ss) >> 4);
#endif
}
}
@@ -905,15 +909,15 @@ start_pass (j_compress_ptr cinfo, boolean gather_statistics)
*/
GLOBAL(void)
-jinit_arith_encoder (j_compress_ptr cinfo)
+jinit_arith_encoder(j_compress_ptr cinfo)
{
arith_entropy_ptr entropy;
int i;
entropy = (arith_entropy_ptr)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
sizeof(arith_entropy_encoder));
- cinfo->entropy = (struct jpeg_entropy_encoder *) entropy;
+ cinfo->entropy = (struct jpeg_entropy_encoder *)entropy;
entropy->pub.start_pass = start_pass;
entropy->pub.finish_pass = finish_pass;
diff --git a/media/libjpeg/jccoefct.c b/media/libjpeg/jccoefct.c
index a08d6e3230..068232a527 100644
--- a/media/libjpeg/jccoefct.c
+++ b/media/libjpeg/jccoefct.c
@@ -58,21 +58,19 @@ typedef my_coef_controller *my_coef_ptr;
/* Forward declarations */
-METHODDEF(boolean) compress_data
- (j_compress_ptr cinfo, JSAMPIMAGE input_buf);
+METHODDEF(boolean) compress_data(j_compress_ptr cinfo, JSAMPIMAGE input_buf);
#ifdef FULL_COEF_BUFFER_SUPPORTED
-METHODDEF(boolean) compress_first_pass
- (j_compress_ptr cinfo, JSAMPIMAGE input_buf);
-METHODDEF(boolean) compress_output
- (j_compress_ptr cinfo, JSAMPIMAGE input_buf);
+METHODDEF(boolean) compress_first_pass(j_compress_ptr cinfo,
+ JSAMPIMAGE input_buf);
+METHODDEF(boolean) compress_output(j_compress_ptr cinfo, JSAMPIMAGE input_buf);
#endif
LOCAL(void)
-start_iMCU_row (j_compress_ptr cinfo)
+start_iMCU_row(j_compress_ptr cinfo)
/* Reset within-iMCU-row counters for a new row */
{
- my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
+ my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
/* In an interleaved scan, an MCU row is the same as an iMCU row.
* In a noninterleaved scan, an iMCU row has v_samp_factor MCU rows.
@@ -81,7 +79,7 @@ start_iMCU_row (j_compress_ptr cinfo)
if (cinfo->comps_in_scan > 1) {
coef->MCU_rows_per_iMCU_row = 1;
} else {
- if (coef->iMCU_row_num < (cinfo->total_iMCU_rows-1))
+ if (coef->iMCU_row_num < (cinfo->total_iMCU_rows - 1))
coef->MCU_rows_per_iMCU_row = cinfo->cur_comp_info[0]->v_samp_factor;
else
coef->MCU_rows_per_iMCU_row = cinfo->cur_comp_info[0]->last_row_height;
@@ -97,9 +95,9 @@ start_iMCU_row (j_compress_ptr cinfo)
*/
METHODDEF(void)
-start_pass_coef (j_compress_ptr cinfo, J_BUF_MODE pass_mode)
+start_pass_coef(j_compress_ptr cinfo, J_BUF_MODE pass_mode)
{
- my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
+ my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
coef->iMCU_row_num = 0;
start_iMCU_row(cinfo);
@@ -140,9 +138,9 @@ start_pass_coef (j_compress_ptr cinfo, J_BUF_MODE pass_mode)
*/
METHODDEF(boolean)
-compress_data (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
+compress_data(j_compress_ptr cinfo, JSAMPIMAGE input_buf)
{
- my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
+ my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
JDIMENSION MCU_col_num; /* index of current MCU within row */
JDIMENSION last_MCU_col = cinfo->MCUs_per_row - 1;
JDIMENSION last_iMCU_row = cinfo->total_iMCU_rows - 1;
@@ -167,31 +165,33 @@ compress_data (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
blkn = 0;
for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
compptr = cinfo->cur_comp_info[ci];
- blockcnt = (MCU_col_num < last_MCU_col) ? compptr->MCU_width
- : compptr->last_col_width;
+ blockcnt = (MCU_col_num < last_MCU_col) ? compptr->MCU_width :
+ compptr->last_col_width;
xpos = MCU_col_num * compptr->MCU_sample_width;
ypos = yoffset * DCTSIZE; /* ypos == (yoffset+yindex) * DCTSIZE */
for (yindex = 0; yindex < compptr->MCU_height; yindex++) {
if (coef->iMCU_row_num < last_iMCU_row ||
- yoffset+yindex < compptr->last_row_height) {
+ yoffset + yindex < compptr->last_row_height) {
(*cinfo->fdct->forward_DCT) (cinfo, compptr,
input_buf[compptr->component_index],
coef->MCU_buffer[blkn],
- ypos, xpos, (JDIMENSION) blockcnt);
+ ypos, xpos, (JDIMENSION)blockcnt);
if (blockcnt < compptr->MCU_width) {
/* Create some dummy blocks at the right edge of the image. */
- jzero_far((void *) coef->MCU_buffer[blkn + blockcnt],
+ jzero_far((void *)coef->MCU_buffer[blkn + blockcnt],
(compptr->MCU_width - blockcnt) * sizeof(JBLOCK));
for (bi = blockcnt; bi < compptr->MCU_width; bi++) {
- coef->MCU_buffer[blkn+bi][0][0] = coef->MCU_buffer[blkn+bi-1][0][0];
+ coef->MCU_buffer[blkn + bi][0][0] =
+ coef->MCU_buffer[blkn + bi - 1][0][0];
}
}
} else {
/* Create a row of dummy blocks at the bottom of the image. */
- jzero_far((void *) coef->MCU_buffer[blkn],
+ jzero_far((void *)coef->MCU_buffer[blkn],
compptr->MCU_width * sizeof(JBLOCK));
for (bi = 0; bi < compptr->MCU_width; bi++) {
- coef->MCU_buffer[blkn+bi][0][0] = coef->MCU_buffer[blkn-1][0][0];
+ coef->MCU_buffer[blkn + bi][0][0] =
+ coef->MCU_buffer[blkn - 1][0][0];
}
}
blkn += compptr->MCU_width;
@@ -201,7 +201,7 @@ compress_data (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
/* Try to write the MCU. In event of a suspension failure, we will
* re-DCT the MCU on restart (a bit inefficient, could be fixed...)
*/
- if (! (*cinfo->entropy->encode_mcu) (cinfo, coef->MCU_buffer)) {
+ if (!(*cinfo->entropy->encode_mcu) (cinfo, coef->MCU_buffer)) {
/* Suspension forced; update state counters and exit */
coef->MCU_vert_offset = yoffset;
coef->mcu_ctr = MCU_col_num;
@@ -242,9 +242,9 @@ compress_data (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
*/
METHODDEF(boolean)
-compress_first_pass (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
+compress_first_pass(j_compress_ptr cinfo, JSAMPIMAGE input_buf)
{
- my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
+ my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
JDIMENSION last_iMCU_row = cinfo->total_iMCU_rows - 1;
JDIMENSION blocks_across, MCUs_across, MCUindex;
int bi, ci, h_samp_factor, block_row, block_rows, ndummy;
@@ -257,21 +257,21 @@ compress_first_pass (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
ci++, compptr++) {
/* Align the virtual buffer for this component. */
buffer = (*cinfo->mem->access_virt_barray)
- ((j_common_ptr) cinfo, coef->whole_image[ci],
+ ((j_common_ptr)cinfo, coef->whole_image[ci],
coef->iMCU_row_num * compptr->v_samp_factor,
- (JDIMENSION) compptr->v_samp_factor, TRUE);
+ (JDIMENSION)compptr->v_samp_factor, TRUE);
/* Count non-dummy DCT block rows in this iMCU row. */
if (coef->iMCU_row_num < last_iMCU_row)
block_rows = compptr->v_samp_factor;
else {
/* NB: can't use last_row_height here, since may not be set! */
- block_rows = (int) (compptr->height_in_blocks % compptr->v_samp_factor);
+ block_rows = (int)(compptr->height_in_blocks % compptr->v_samp_factor);
if (block_rows == 0) block_rows = compptr->v_samp_factor;
}
blocks_across = compptr->width_in_blocks;
h_samp_factor = compptr->h_samp_factor;
/* Count number of dummy blocks to be added at the right margin. */
- ndummy = (int) (blocks_across % h_samp_factor);
+ ndummy = (int)(blocks_across % h_samp_factor);
if (ndummy > 0)
ndummy = h_samp_factor - ndummy;
/* Perform DCT for all non-dummy blocks in this iMCU row. Each call
@@ -281,12 +281,12 @@ compress_first_pass (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
thisblockrow = buffer[block_row];
(*cinfo->fdct->forward_DCT) (cinfo, compptr,
input_buf[ci], thisblockrow,
- (JDIMENSION) (block_row * DCTSIZE),
- (JDIMENSION) 0, blocks_across);
+ (JDIMENSION)(block_row * DCTSIZE),
+ (JDIMENSION)0, blocks_across);
if (ndummy > 0) {
/* Create dummy blocks at the right edge of the image. */
thisblockrow += blocks_across; /* => first dummy block */
- jzero_far((void *) thisblockrow, ndummy * sizeof(JBLOCK));
+ jzero_far((void *)thisblockrow, ndummy * sizeof(JBLOCK));
lastDC = thisblockrow[-1][0];
for (bi = 0; bi < ndummy; bi++) {
thisblockrow[bi][0] = lastDC;
@@ -304,11 +304,11 @@ compress_first_pass (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
for (block_row = block_rows; block_row < compptr->v_samp_factor;
block_row++) {
thisblockrow = buffer[block_row];
- lastblockrow = buffer[block_row-1];
- jzero_far((void *) thisblockrow,
- (size_t) (blocks_across * sizeof(JBLOCK)));
+ lastblockrow = buffer[block_row - 1];
+ jzero_far((void *)thisblockrow,
+ (size_t)(blocks_across * sizeof(JBLOCK)));
for (MCUindex = 0; MCUindex < MCUs_across; MCUindex++) {
- lastDC = lastblockrow[h_samp_factor-1][0];
+ lastDC = lastblockrow[h_samp_factor - 1][0];
for (bi = 0; bi < h_samp_factor; bi++) {
thisblockrow[bi][0] = lastDC;
}
@@ -338,9 +338,9 @@ compress_first_pass (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
*/
METHODDEF(boolean)
-compress_output (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
+compress_output(j_compress_ptr cinfo, JSAMPIMAGE input_buf)
{
- my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
+ my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
JDIMENSION MCU_col_num; /* index of current MCU within row */
int blkn, ci, xindex, yindex, yoffset;
JDIMENSION start_col;
@@ -355,9 +355,9 @@ compress_output (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
compptr = cinfo->cur_comp_info[ci];
buffer[ci] = (*cinfo->mem->access_virt_barray)
- ((j_common_ptr) cinfo, coef->whole_image[compptr->component_index],
+ ((j_common_ptr)cinfo, coef->whole_image[compptr->component_index],
coef->iMCU_row_num * compptr->v_samp_factor,
- (JDIMENSION) compptr->v_samp_factor, FALSE);
+ (JDIMENSION)compptr->v_samp_factor, FALSE);
}
/* Loop to process one whole iMCU row */
@@ -371,14 +371,14 @@ compress_output (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
compptr = cinfo->cur_comp_info[ci];
start_col = MCU_col_num * compptr->MCU_width;
for (yindex = 0; yindex < compptr->MCU_height; yindex++) {
- buffer_ptr = buffer[ci][yindex+yoffset] + start_col;
+ buffer_ptr = buffer[ci][yindex + yoffset] + start_col;
for (xindex = 0; xindex < compptr->MCU_width; xindex++) {
coef->MCU_buffer[blkn++] = buffer_ptr++;
}
}
}
/* Try to write the MCU. */
- if (! (*cinfo->entropy->encode_mcu) (cinfo, coef->MCU_buffer)) {
+ if (!(*cinfo->entropy->encode_mcu) (cinfo, coef->MCU_buffer)) {
/* Suspension forced; update state counters and exit */
coef->MCU_vert_offset = yoffset;
coef->mcu_ctr = MCU_col_num;
@@ -402,14 +402,14 @@ compress_output (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
*/
GLOBAL(void)
-jinit_c_coef_controller (j_compress_ptr cinfo, boolean need_full_buffer)
+jinit_c_coef_controller(j_compress_ptr cinfo, boolean need_full_buffer)
{
my_coef_ptr coef;
coef = (my_coef_ptr)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
sizeof(my_coef_controller));
- cinfo->coef = (struct jpeg_c_coef_controller *) coef;
+ cinfo->coef = (struct jpeg_c_coef_controller *)coef;
coef->pub.start_pass = start_pass_coef;
/* Create the coefficient buffer. */
@@ -423,12 +423,12 @@ jinit_c_coef_controller (j_compress_ptr cinfo, boolean need_full_buffer)
for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
ci++, compptr++) {
coef->whole_image[ci] = (*cinfo->mem->request_virt_barray)
- ((j_common_ptr) cinfo, JPOOL_IMAGE, FALSE,
- (JDIMENSION) jround_up((long) compptr->width_in_blocks,
- (long) compptr->h_samp_factor),
- (JDIMENSION) jround_up((long) compptr->height_in_blocks,
- (long) compptr->v_samp_factor),
- (JDIMENSION) compptr->v_samp_factor);
+ ((j_common_ptr)cinfo, JPOOL_IMAGE, FALSE,
+ (JDIMENSION)jround_up((long)compptr->width_in_blocks,
+ (long)compptr->h_samp_factor),
+ (JDIMENSION)jround_up((long)compptr->height_in_blocks,
+ (long)compptr->v_samp_factor),
+ (JDIMENSION)compptr->v_samp_factor);
}
#else
ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
@@ -439,7 +439,7 @@ jinit_c_coef_controller (j_compress_ptr cinfo, boolean need_full_buffer)
int i;
buffer = (JBLOCKROW)
- (*cinfo->mem->alloc_large) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_large) ((j_common_ptr)cinfo, JPOOL_IMAGE,
C_MAX_BLOCKS_IN_MCU * sizeof(JBLOCK));
for (i = 0; i < C_MAX_BLOCKS_IN_MCU; i++) {
coef->MCU_buffer[i] = buffer + i;
diff --git a/media/libjpeg/jccolext.c b/media/libjpeg/jccolext.c
index 479b320446..303b322ce6 100644
--- a/media/libjpeg/jccolext.c
+++ b/media/libjpeg/jccolext.c
@@ -29,13 +29,13 @@
INLINE
LOCAL(void)
-rgb_ycc_convert_internal (j_compress_ptr cinfo,
- JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows)
+rgb_ycc_convert_internal(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
{
- my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
+ my_cconvert_ptr cconvert = (my_cconvert_ptr)cinfo->cconvert;
register int r, g, b;
- register JLONG * ctab = cconvert->rgb_ycc_tab;
+ register JLONG *ctab = cconvert->rgb_ycc_tab;
register JSAMPROW inptr;
register JSAMPROW outptr0, outptr1, outptr2;
register JDIMENSION col;
@@ -48,9 +48,9 @@ rgb_ycc_convert_internal (j_compress_ptr cinfo,
outptr2 = output_buf[2][output_row];
output_row++;
for (col = 0; col < num_cols; col++) {
- r = GETJSAMPLE(inptr[RGB_RED]);
- g = GETJSAMPLE(inptr[RGB_GREEN]);
- b = GETJSAMPLE(inptr[RGB_BLUE]);
+ r = inptr[RGB_RED];
+ g = inptr[RGB_GREEN];
+ b = inptr[RGB_BLUE];
inptr += RGB_PIXELSIZE;
/* If the inputs are 0..MAXJSAMPLE, the outputs of these equations
* must be too; we do not need an explicit range-limiting operation.
@@ -58,17 +58,14 @@ rgb_ycc_convert_internal (j_compress_ptr cinfo,
* need the general RIGHT_SHIFT macro.
*/
/* Y */
- outptr0[col] = (JSAMPLE)
- ((ctab[r+R_Y_OFF] + ctab[g+G_Y_OFF] + ctab[b+B_Y_OFF])
- >> SCALEBITS);
+ outptr0[col] = (JSAMPLE)((ctab[r + R_Y_OFF] + ctab[g + G_Y_OFF] +
+ ctab[b + B_Y_OFF]) >> SCALEBITS);
/* Cb */
- outptr1[col] = (JSAMPLE)
- ((ctab[r+R_CB_OFF] + ctab[g+G_CB_OFF] + ctab[b+B_CB_OFF])
- >> SCALEBITS);
+ outptr1[col] = (JSAMPLE)((ctab[r + R_CB_OFF] + ctab[g + G_CB_OFF] +
+ ctab[b + B_CB_OFF]) >> SCALEBITS);
/* Cr */
- outptr2[col] = (JSAMPLE)
- ((ctab[r+R_CR_OFF] + ctab[g+G_CR_OFF] + ctab[b+B_CR_OFF])
- >> SCALEBITS);
+ outptr2[col] = (JSAMPLE)((ctab[r + R_CR_OFF] + ctab[g + G_CR_OFF] +
+ ctab[b + B_CR_OFF]) >> SCALEBITS);
}
}
}
@@ -86,13 +83,13 @@ rgb_ycc_convert_internal (j_compress_ptr cinfo,
INLINE
LOCAL(void)
-rgb_gray_convert_internal (j_compress_ptr cinfo,
- JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows)
+rgb_gray_convert_internal(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
{
- my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
+ my_cconvert_ptr cconvert = (my_cconvert_ptr)cinfo->cconvert;
register int r, g, b;
- register JLONG * ctab = cconvert->rgb_ycc_tab;
+ register JLONG *ctab = cconvert->rgb_ycc_tab;
register JSAMPROW inptr;
register JSAMPROW outptr;
register JDIMENSION col;
@@ -103,14 +100,13 @@ rgb_gray_convert_internal (j_compress_ptr cinfo,
outptr = output_buf[0][output_row];
output_row++;
for (col = 0; col < num_cols; col++) {
- r = GETJSAMPLE(inptr[RGB_RED]);
- g = GETJSAMPLE(inptr[RGB_GREEN]);
- b = GETJSAMPLE(inptr[RGB_BLUE]);
+ r = inptr[RGB_RED];
+ g = inptr[RGB_GREEN];
+ b = inptr[RGB_BLUE];
inptr += RGB_PIXELSIZE;
/* Y */
- outptr[col] = (JSAMPLE)
- ((ctab[r+R_Y_OFF] + ctab[g+G_Y_OFF] + ctab[b+B_Y_OFF])
- >> SCALEBITS);
+ outptr[col] = (JSAMPLE)((ctab[r + R_Y_OFF] + ctab[g + G_Y_OFF] +
+ ctab[b + B_Y_OFF]) >> SCALEBITS);
}
}
}
@@ -123,9 +119,9 @@ rgb_gray_convert_internal (j_compress_ptr cinfo,
INLINE
LOCAL(void)
-rgb_rgb_convert_internal (j_compress_ptr cinfo,
- JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows)
+rgb_rgb_convert_internal(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
{
register JSAMPROW inptr;
register JSAMPROW outptr0, outptr1, outptr2;
@@ -139,9 +135,9 @@ rgb_rgb_convert_internal (j_compress_ptr cinfo,
outptr2 = output_buf[2][output_row];
output_row++;
for (col = 0; col < num_cols; col++) {
- outptr0[col] = GETJSAMPLE(inptr[RGB_RED]);
- outptr1[col] = GETJSAMPLE(inptr[RGB_GREEN]);
- outptr2[col] = GETJSAMPLE(inptr[RGB_BLUE]);
+ outptr0[col] = inptr[RGB_RED];
+ outptr1[col] = inptr[RGB_GREEN];
+ outptr2[col] = inptr[RGB_BLUE];
inptr += RGB_PIXELSIZE;
}
}
diff --git a/media/libjpeg/jccolor.c b/media/libjpeg/jccolor.c
index b973d101d6..bdc563c723 100644
--- a/media/libjpeg/jccolor.c
+++ b/media/libjpeg/jccolor.c
@@ -63,9 +63,9 @@ typedef my_color_converter *my_cconvert_ptr;
*/
#define SCALEBITS 16 /* speediest right-shift on some machines */
-#define CBCR_OFFSET ((JLONG) CENTERJSAMPLE << SCALEBITS)
-#define ONE_HALF ((JLONG) 1 << (SCALEBITS-1))
-#define FIX(x) ((JLONG) ((x) * (1L<<SCALEBITS) + 0.5))
+#define CBCR_OFFSET ((JLONG)CENTERJSAMPLE << SCALEBITS)
+#define ONE_HALF ((JLONG)1 << (SCALEBITS - 1))
+#define FIX(x) ((JLONG)((x) * (1L << SCALEBITS) + 0.5))
/* We allocate one big table and divide it up into eight parts, instead of
* doing eight alloc_small requests. This lets us use a single table base
@@ -74,15 +74,15 @@ typedef my_color_converter *my_cconvert_ptr;
*/
#define R_Y_OFF 0 /* offset to R => Y section */
-#define G_Y_OFF (1*(MAXJSAMPLE+1)) /* offset to G => Y section */
-#define B_Y_OFF (2*(MAXJSAMPLE+1)) /* etc. */
-#define R_CB_OFF (3*(MAXJSAMPLE+1))
-#define G_CB_OFF (4*(MAXJSAMPLE+1))
-#define B_CB_OFF (5*(MAXJSAMPLE+1))
+#define G_Y_OFF (1 * (MAXJSAMPLE + 1)) /* offset to G => Y section */
+#define B_Y_OFF (2 * (MAXJSAMPLE + 1)) /* etc. */
+#define R_CB_OFF (3 * (MAXJSAMPLE + 1))
+#define G_CB_OFF (4 * (MAXJSAMPLE + 1))
+#define B_CB_OFF (5 * (MAXJSAMPLE + 1))
#define R_CR_OFF B_CB_OFF /* B=>Cb, R=>Cr are the same */
-#define G_CR_OFF (6*(MAXJSAMPLE+1))
-#define B_CR_OFF (7*(MAXJSAMPLE+1))
-#define TABLE_SIZE (8*(MAXJSAMPLE+1))
+#define G_CR_OFF (6 * (MAXJSAMPLE + 1))
+#define B_CR_OFF (7 * (MAXJSAMPLE + 1))
+#define TABLE_SIZE (8 * (MAXJSAMPLE + 1))
/* Include inline routines for colorspace extensions */
@@ -93,13 +93,13 @@ typedef my_color_converter *my_cconvert_ptr;
#undef RGB_BLUE
#undef RGB_PIXELSIZE
-#define RGB_RED EXT_RGB_RED
-#define RGB_GREEN EXT_RGB_GREEN
-#define RGB_BLUE EXT_RGB_BLUE
-#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-#define rgb_ycc_convert_internal extrgb_ycc_convert_internal
-#define rgb_gray_convert_internal extrgb_gray_convert_internal
-#define rgb_rgb_convert_internal extrgb_rgb_convert_internal
+#define RGB_RED EXT_RGB_RED
+#define RGB_GREEN EXT_RGB_GREEN
+#define RGB_BLUE EXT_RGB_BLUE
+#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+#define rgb_ycc_convert_internal extrgb_ycc_convert_internal
+#define rgb_gray_convert_internal extrgb_gray_convert_internal
+#define rgb_rgb_convert_internal extrgb_rgb_convert_internal
#include "jccolext.c"
#undef RGB_RED
#undef RGB_GREEN
@@ -109,13 +109,13 @@ typedef my_color_converter *my_cconvert_ptr;
#undef rgb_gray_convert_internal
#undef rgb_rgb_convert_internal
-#define RGB_RED EXT_RGBX_RED
-#define RGB_GREEN EXT_RGBX_GREEN
-#define RGB_BLUE EXT_RGBX_BLUE
-#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-#define rgb_ycc_convert_internal extrgbx_ycc_convert_internal
-#define rgb_gray_convert_internal extrgbx_gray_convert_internal
-#define rgb_rgb_convert_internal extrgbx_rgb_convert_internal
+#define RGB_RED EXT_RGBX_RED
+#define RGB_GREEN EXT_RGBX_GREEN
+#define RGB_BLUE EXT_RGBX_BLUE
+#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+#define rgb_ycc_convert_internal extrgbx_ycc_convert_internal
+#define rgb_gray_convert_internal extrgbx_gray_convert_internal
+#define rgb_rgb_convert_internal extrgbx_rgb_convert_internal
#include "jccolext.c"
#undef RGB_RED
#undef RGB_GREEN
@@ -125,13 +125,13 @@ typedef my_color_converter *my_cconvert_ptr;
#undef rgb_gray_convert_internal
#undef rgb_rgb_convert_internal
-#define RGB_RED EXT_BGR_RED
-#define RGB_GREEN EXT_BGR_GREEN
-#define RGB_BLUE EXT_BGR_BLUE
-#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-#define rgb_ycc_convert_internal extbgr_ycc_convert_internal
-#define rgb_gray_convert_internal extbgr_gray_convert_internal
-#define rgb_rgb_convert_internal extbgr_rgb_convert_internal
+#define RGB_RED EXT_BGR_RED
+#define RGB_GREEN EXT_BGR_GREEN
+#define RGB_BLUE EXT_BGR_BLUE
+#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+#define rgb_ycc_convert_internal extbgr_ycc_convert_internal
+#define rgb_gray_convert_internal extbgr_gray_convert_internal
+#define rgb_rgb_convert_internal extbgr_rgb_convert_internal
#include "jccolext.c"
#undef RGB_RED
#undef RGB_GREEN
@@ -141,13 +141,13 @@ typedef my_color_converter *my_cconvert_ptr;
#undef rgb_gray_convert_internal
#undef rgb_rgb_convert_internal
-#define RGB_RED EXT_BGRX_RED
-#define RGB_GREEN EXT_BGRX_GREEN
-#define RGB_BLUE EXT_BGRX_BLUE
-#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-#define rgb_ycc_convert_internal extbgrx_ycc_convert_internal
-#define rgb_gray_convert_internal extbgrx_gray_convert_internal
-#define rgb_rgb_convert_internal extbgrx_rgb_convert_internal
+#define RGB_RED EXT_BGRX_RED
+#define RGB_GREEN EXT_BGRX_GREEN
+#define RGB_BLUE EXT_BGRX_BLUE
+#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+#define rgb_ycc_convert_internal extbgrx_ycc_convert_internal
+#define rgb_gray_convert_internal extbgrx_gray_convert_internal
+#define rgb_rgb_convert_internal extbgrx_rgb_convert_internal
#include "jccolext.c"
#undef RGB_RED
#undef RGB_GREEN
@@ -157,13 +157,13 @@ typedef my_color_converter *my_cconvert_ptr;
#undef rgb_gray_convert_internal
#undef rgb_rgb_convert_internal
-#define RGB_RED EXT_XBGR_RED
-#define RGB_GREEN EXT_XBGR_GREEN
-#define RGB_BLUE EXT_XBGR_BLUE
-#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-#define rgb_ycc_convert_internal extxbgr_ycc_convert_internal
-#define rgb_gray_convert_internal extxbgr_gray_convert_internal
-#define rgb_rgb_convert_internal extxbgr_rgb_convert_internal
+#define RGB_RED EXT_XBGR_RED
+#define RGB_GREEN EXT_XBGR_GREEN
+#define RGB_BLUE EXT_XBGR_BLUE
+#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+#define rgb_ycc_convert_internal extxbgr_ycc_convert_internal
+#define rgb_gray_convert_internal extxbgr_gray_convert_internal
+#define rgb_rgb_convert_internal extxbgr_rgb_convert_internal
#include "jccolext.c"
#undef RGB_RED
#undef RGB_GREEN
@@ -173,13 +173,13 @@ typedef my_color_converter *my_cconvert_ptr;
#undef rgb_gray_convert_internal
#undef rgb_rgb_convert_internal
-#define RGB_RED EXT_XRGB_RED
-#define RGB_GREEN EXT_XRGB_GREEN
-#define RGB_BLUE EXT_XRGB_BLUE
-#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-#define rgb_ycc_convert_internal extxrgb_ycc_convert_internal
-#define rgb_gray_convert_internal extxrgb_gray_convert_internal
-#define rgb_rgb_convert_internal extxrgb_rgb_convert_internal
+#define RGB_RED EXT_XRGB_RED
+#define RGB_GREEN EXT_XRGB_GREEN
+#define RGB_BLUE EXT_XRGB_BLUE
+#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+#define rgb_ycc_convert_internal extxrgb_ycc_convert_internal
+#define rgb_gray_convert_internal extxrgb_gray_convert_internal
+#define rgb_rgb_convert_internal extxrgb_rgb_convert_internal
#include "jccolext.c"
#undef RGB_RED
#undef RGB_GREEN
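
The six define/include/undef stanzas above are C's substitute for templates: jccolext.c is compiled once per pixel layout, with the RGB_* offsets and the *_internal names rebound each time, so each extended colorspace gets a fully specialized converter that is selected later by a switch in rgb_ycc_convert() and friends. A self-contained sketch of the same specialization idea (using a macro instead of a second file, purely for illustration):

#include <stdio.h>

/* One body, stamped out once per byte order -- the effect jccolor.c gets by
 * re-#including jccolext.c with different RGB_RED/RGB_GREEN/RGB_BLUE values.
 */
#define DEFINE_PACK(FN_NAME, PIX_R, PIX_G, PIX_B) \
  static int FN_NAME(const unsigned char *p)      \
  { return 100 * p[PIX_R] + 10 * p[PIX_G] + p[PIX_B]; }

DEFINE_PACK(pack_rgb, 0, 1, 2)   /* RGB byte order */
DEFINE_PACK(pack_bgr, 2, 1, 0)   /* BGR byte order */

int main(void)
{
  const unsigned char px[3] = { 1, 2, 3 };
  printf("%d %d\n", pack_rgb(px), pack_bgr(px));   /* 123 321 */
  return 0;
}
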
@@ -195,33 +195,33 @@ typedef my_color_converter *my_cconvert_ptr;
*/
METHODDEF(void)
-rgb_ycc_start (j_compress_ptr cinfo)
+rgb_ycc_start(j_compress_ptr cinfo)
{
- my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
+ my_cconvert_ptr cconvert = (my_cconvert_ptr)cinfo->cconvert;
JLONG *rgb_ycc_tab;
JLONG i;
/* Allocate and fill in the conversion tables. */
cconvert->rgb_ycc_tab = rgb_ycc_tab = (JLONG *)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
(TABLE_SIZE * sizeof(JLONG)));
for (i = 0; i <= MAXJSAMPLE; i++) {
- rgb_ycc_tab[i+R_Y_OFF] = FIX(0.29900) * i;
- rgb_ycc_tab[i+G_Y_OFF] = FIX(0.58700) * i;
- rgb_ycc_tab[i+B_Y_OFF] = FIX(0.11400) * i + ONE_HALF;
- rgb_ycc_tab[i+R_CB_OFF] = (-FIX(0.16874)) * i;
- rgb_ycc_tab[i+G_CB_OFF] = (-FIX(0.33126)) * i;
+ rgb_ycc_tab[i + R_Y_OFF] = FIX(0.29900) * i;
+ rgb_ycc_tab[i + G_Y_OFF] = FIX(0.58700) * i;
+ rgb_ycc_tab[i + B_Y_OFF] = FIX(0.11400) * i + ONE_HALF;
+ rgb_ycc_tab[i + R_CB_OFF] = (-FIX(0.16874)) * i;
+ rgb_ycc_tab[i + G_CB_OFF] = (-FIX(0.33126)) * i;
/* We use a rounding fudge-factor of 0.5-epsilon for Cb and Cr.
* This ensures that the maximum output will round to MAXJSAMPLE
* not MAXJSAMPLE+1, and thus that we don't have to range-limit.
*/
- rgb_ycc_tab[i+B_CB_OFF] = FIX(0.50000) * i + CBCR_OFFSET + ONE_HALF-1;
+ rgb_ycc_tab[i + B_CB_OFF] = FIX(0.50000) * i + CBCR_OFFSET + ONE_HALF - 1;
/* B=>Cb and R=>Cr tables are the same
- rgb_ycc_tab[i+R_CR_OFF] = FIX(0.50000) * i + CBCR_OFFSET + ONE_HALF-1;
+ rgb_ycc_tab[i + R_CR_OFF] = FIX(0.50000) * i + CBCR_OFFSET + ONE_HALF - 1;
*/
- rgb_ycc_tab[i+G_CR_OFF] = (-FIX(0.41869)) * i;
- rgb_ycc_tab[i+B_CR_OFF] = (-FIX(0.08131)) * i;
+ rgb_ycc_tab[i + G_CR_OFF] = (-FIX(0.41869)) * i;
+ rgb_ycc_tab[i + B_CR_OFF] = (-FIX(0.08131)) * i;
}
}
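
The ONE_HALF - 1 fudge factor above is worth a worked number. Without the -1, a pure-blue pixel would push Cb to exactly 256 after the shift and force a range-limiting step; with it, the worst case lands on 255. A standalone check (8-bit samples assumed):

#include <stdio.h>

#define SCALEBITS     16
#define ONE_HALF      ((long)1 << (SCALEBITS - 1))
#define FIX(x)        ((long)((x) * (1L << SCALEBITS) + 0.5))
#define CENTERJSAMPLE 128

int main(void)
{
  long cbcr_offset = (long)CENTERJSAMPLE << SCALEBITS;
  /* worst case for Cb: r = g = 0, b = 255 */
  long cb       = (FIX(0.5) * 255 + cbcr_offset + ONE_HALF - 1) >> SCALEBITS;
  long cb_naive = (FIX(0.5) * 255 + cbcr_offset + ONE_HALF) >> SCALEBITS;
  printf("%ld %ld\n", cb, cb_naive);   /* 255 256 */
  return 0;
}
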
@@ -231,43 +231,42 @@ rgb_ycc_start (j_compress_ptr cinfo)
*/
METHODDEF(void)
-rgb_ycc_convert (j_compress_ptr cinfo,
- JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows)
+rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row, int num_rows)
{
switch (cinfo->in_color_space) {
- case JCS_EXT_RGB:
- extrgb_ycc_convert_internal(cinfo, input_buf, output_buf, output_row,
- num_rows);
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- extrgbx_ycc_convert_internal(cinfo, input_buf, output_buf, output_row,
- num_rows);
- break;
- case JCS_EXT_BGR:
- extbgr_ycc_convert_internal(cinfo, input_buf, output_buf, output_row,
- num_rows);
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- extbgrx_ycc_convert_internal(cinfo, input_buf, output_buf, output_row,
- num_rows);
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- extxbgr_ycc_convert_internal(cinfo, input_buf, output_buf, output_row,
- num_rows);
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- extxrgb_ycc_convert_internal(cinfo, input_buf, output_buf, output_row,
- num_rows);
- break;
- default:
- rgb_ycc_convert_internal(cinfo, input_buf, output_buf, output_row,
- num_rows);
- break;
+ case JCS_EXT_RGB:
+ extrgb_ycc_convert_internal(cinfo, input_buf, output_buf, output_row,
+ num_rows);
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ extrgbx_ycc_convert_internal(cinfo, input_buf, output_buf, output_row,
+ num_rows);
+ break;
+ case JCS_EXT_BGR:
+ extbgr_ycc_convert_internal(cinfo, input_buf, output_buf, output_row,
+ num_rows);
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ extbgrx_ycc_convert_internal(cinfo, input_buf, output_buf, output_row,
+ num_rows);
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ extxbgr_ycc_convert_internal(cinfo, input_buf, output_buf, output_row,
+ num_rows);
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ extxrgb_ycc_convert_internal(cinfo, input_buf, output_buf, output_row,
+ num_rows);
+ break;
+ default:
+ rgb_ycc_convert_internal(cinfo, input_buf, output_buf, output_row,
+ num_rows);
+ break;
}
}
@@ -280,43 +279,42 @@ rgb_ycc_convert (j_compress_ptr cinfo,
*/
METHODDEF(void)
-rgb_gray_convert (j_compress_ptr cinfo,
- JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows)
+rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row, int num_rows)
{
switch (cinfo->in_color_space) {
- case JCS_EXT_RGB:
- extrgb_gray_convert_internal(cinfo, input_buf, output_buf, output_row,
- num_rows);
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- extrgbx_gray_convert_internal(cinfo, input_buf, output_buf, output_row,
- num_rows);
- break;
- case JCS_EXT_BGR:
- extbgr_gray_convert_internal(cinfo, input_buf, output_buf, output_row,
- num_rows);
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- extbgrx_gray_convert_internal(cinfo, input_buf, output_buf, output_row,
- num_rows);
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- extxbgr_gray_convert_internal(cinfo, input_buf, output_buf, output_row,
- num_rows);
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- extxrgb_gray_convert_internal(cinfo, input_buf, output_buf, output_row,
- num_rows);
- break;
- default:
- rgb_gray_convert_internal(cinfo, input_buf, output_buf, output_row,
- num_rows);
- break;
+ case JCS_EXT_RGB:
+ extrgb_gray_convert_internal(cinfo, input_buf, output_buf, output_row,
+ num_rows);
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ extrgbx_gray_convert_internal(cinfo, input_buf, output_buf, output_row,
+ num_rows);
+ break;
+ case JCS_EXT_BGR:
+ extbgr_gray_convert_internal(cinfo, input_buf, output_buf, output_row,
+ num_rows);
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ extbgrx_gray_convert_internal(cinfo, input_buf, output_buf, output_row,
+ num_rows);
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ extxbgr_gray_convert_internal(cinfo, input_buf, output_buf, output_row,
+ num_rows);
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ extxrgb_gray_convert_internal(cinfo, input_buf, output_buf, output_row,
+ num_rows);
+ break;
+ default:
+ rgb_gray_convert_internal(cinfo, input_buf, output_buf, output_row,
+ num_rows);
+ break;
}
}
@@ -326,43 +324,42 @@ rgb_gray_convert (j_compress_ptr cinfo,
*/
METHODDEF(void)
-rgb_rgb_convert (j_compress_ptr cinfo,
- JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows)
+rgb_rgb_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row, int num_rows)
{
switch (cinfo->in_color_space) {
- case JCS_EXT_RGB:
- extrgb_rgb_convert_internal(cinfo, input_buf, output_buf, output_row,
- num_rows);
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- extrgbx_rgb_convert_internal(cinfo, input_buf, output_buf, output_row,
- num_rows);
- break;
- case JCS_EXT_BGR:
- extbgr_rgb_convert_internal(cinfo, input_buf, output_buf, output_row,
- num_rows);
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- extbgrx_rgb_convert_internal(cinfo, input_buf, output_buf, output_row,
- num_rows);
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- extxbgr_rgb_convert_internal(cinfo, input_buf, output_buf, output_row,
- num_rows);
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- extxrgb_rgb_convert_internal(cinfo, input_buf, output_buf, output_row,
- num_rows);
- break;
- default:
- rgb_rgb_convert_internal(cinfo, input_buf, output_buf, output_row,
- num_rows);
- break;
+ case JCS_EXT_RGB:
+ extrgb_rgb_convert_internal(cinfo, input_buf, output_buf, output_row,
+ num_rows);
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ extrgbx_rgb_convert_internal(cinfo, input_buf, output_buf, output_row,
+ num_rows);
+ break;
+ case JCS_EXT_BGR:
+ extbgr_rgb_convert_internal(cinfo, input_buf, output_buf, output_row,
+ num_rows);
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ extbgrx_rgb_convert_internal(cinfo, input_buf, output_buf, output_row,
+ num_rows);
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ extxbgr_rgb_convert_internal(cinfo, input_buf, output_buf, output_row,
+ num_rows);
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ extxrgb_rgb_convert_internal(cinfo, input_buf, output_buf, output_row,
+ num_rows);
+ break;
+ default:
+ rgb_rgb_convert_internal(cinfo, input_buf, output_buf, output_row,
+ num_rows);
+ break;
}
}
@@ -376,11 +373,10 @@ rgb_rgb_convert (j_compress_ptr cinfo,
*/
METHODDEF(void)
-cmyk_ycck_convert (j_compress_ptr cinfo,
- JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows)
+cmyk_ycck_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row, int num_rows)
{
- my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
+ my_cconvert_ptr cconvert = (my_cconvert_ptr)cinfo->cconvert;
register int r, g, b;
register JLONG *ctab = cconvert->rgb_ycc_tab;
register JSAMPROW inptr;
@@ -396,11 +392,11 @@ cmyk_ycck_convert (j_compress_ptr cinfo,
outptr3 = output_buf[3][output_row];
output_row++;
for (col = 0; col < num_cols; col++) {
- r = MAXJSAMPLE - GETJSAMPLE(inptr[0]);
- g = MAXJSAMPLE - GETJSAMPLE(inptr[1]);
- b = MAXJSAMPLE - GETJSAMPLE(inptr[2]);
+ r = MAXJSAMPLE - inptr[0];
+ g = MAXJSAMPLE - inptr[1];
+ b = MAXJSAMPLE - inptr[2];
/* K passes through as-is */
- outptr3[col] = inptr[3]; /* don't need GETJSAMPLE here */
+ outptr3[col] = inptr[3];
inptr += 4;
/* If the inputs are 0..MAXJSAMPLE, the outputs of these equations
* must be too; we do not need an explicit range-limiting operation.
@@ -408,17 +404,14 @@ cmyk_ycck_convert (j_compress_ptr cinfo,
* need the general RIGHT_SHIFT macro.
*/
/* Y */
- outptr0[col] = (JSAMPLE)
- ((ctab[r+R_Y_OFF] + ctab[g+G_Y_OFF] + ctab[b+B_Y_OFF])
- >> SCALEBITS);
+ outptr0[col] = (JSAMPLE)((ctab[r + R_Y_OFF] + ctab[g + G_Y_OFF] +
+ ctab[b + B_Y_OFF]) >> SCALEBITS);
/* Cb */
- outptr1[col] = (JSAMPLE)
- ((ctab[r+R_CB_OFF] + ctab[g+G_CB_OFF] + ctab[b+B_CB_OFF])
- >> SCALEBITS);
+ outptr1[col] = (JSAMPLE)((ctab[r + R_CB_OFF] + ctab[g + G_CB_OFF] +
+ ctab[b + B_CB_OFF]) >> SCALEBITS);
/* Cr */
- outptr2[col] = (JSAMPLE)
- ((ctab[r+R_CR_OFF] + ctab[g+G_CR_OFF] + ctab[b+B_CR_OFF])
- >> SCALEBITS);
+ outptr2[col] = (JSAMPLE)((ctab[r + R_CR_OFF] + ctab[g + G_CR_OFF] +
+ ctab[b + B_CR_OFF]) >> SCALEBITS);
}
}
}
@@ -431,9 +424,8 @@ cmyk_ycck_convert (j_compress_ptr cinfo,
*/
METHODDEF(void)
-grayscale_convert (j_compress_ptr cinfo,
- JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows)
+grayscale_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row, int num_rows)
{
register JSAMPROW inptr;
register JSAMPROW outptr;
@@ -446,7 +438,7 @@ grayscale_convert (j_compress_ptr cinfo,
outptr = output_buf[0][output_row];
output_row++;
for (col = 0; col < num_cols; col++) {
- outptr[col] = inptr[0]; /* don't need GETJSAMPLE() here */
+ outptr[col] = inptr[0];
inptr += instride;
}
}
@@ -460,9 +452,8 @@ grayscale_convert (j_compress_ptr cinfo,
*/
METHODDEF(void)
-null_convert (j_compress_ptr cinfo,
- JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows)
+null_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows)
{
register JSAMPROW inptr;
register JSAMPROW outptr, outptr0, outptr1, outptr2, outptr3;
@@ -506,7 +497,7 @@ null_convert (j_compress_ptr cinfo,
inptr = *input_buf;
outptr = output_buf[ci][output_row];
for (col = 0; col < num_cols; col++) {
- outptr[col] = inptr[ci]; /* don't need GETJSAMPLE() here */
+ outptr[col] = inptr[ci];
inptr += nc;
}
}
@@ -522,7 +513,7 @@ null_convert (j_compress_ptr cinfo,
*/
METHODDEF(void)
-null_method (j_compress_ptr cinfo)
+null_method(j_compress_ptr cinfo)
{
/* no work needed */
}
@@ -533,14 +524,14 @@ null_method (j_compress_ptr cinfo)
*/
GLOBAL(void)
-jinit_color_converter (j_compress_ptr cinfo)
+jinit_color_converter(j_compress_ptr cinfo)
{
my_cconvert_ptr cconvert;
cconvert = (my_cconvert_ptr)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
sizeof(my_color_converter));
- cinfo->cconvert = (struct jpeg_color_converter *) cconvert;
+ cinfo->cconvert = (struct jpeg_color_converter *)cconvert;
/* set start_pass to null method until we find out differently */
cconvert->pub.start_pass = null_method;
diff --git a/media/libjpeg/jcdctmgr.c b/media/libjpeg/jcdctmgr.c
index aef8517f9c..7dae17a6e1 100644
--- a/media/libjpeg/jcdctmgr.c
+++ b/media/libjpeg/jcdctmgr.c
@@ -41,7 +41,7 @@ typedef void (*float_quantize_method_ptr) (JCOEFPTR coef_block,
FAST_FLOAT *divisors,
FAST_FLOAT *workspace);
-METHODDEF(void) quantize (JCOEFPTR, DCTELEM *, DCTELEM *);
+METHODDEF(void) quantize(JCOEFPTR, DCTELEM *, DCTELEM *);
typedef struct {
struct jpeg_forward_dct pub; /* public fields */
@@ -80,7 +80,7 @@ typedef my_fdct_controller *my_fdct_ptr;
*/
LOCAL(int)
-flss (UINT16 val)
+flss(UINT16 val)
{
int bit;
@@ -170,7 +170,7 @@ flss (UINT16 val)
*/
LOCAL(int)
-compute_reciprocal (UINT16 divisor, DCTELEM *dtbl)
+compute_reciprocal(UINT16 divisor, DCTELEM *dtbl)
{
UDCTELEM2 fq, fr;
UDCTELEM c;
@@ -182,10 +182,10 @@ compute_reciprocal (UINT16 divisor, DCTELEM *dtbl)
* identity function. Since only the C quantization algorithm is used in
* these cases, the scale value is irrelevant.
*/
- dtbl[DCTSIZE2 * 0] = (DCTELEM) 1; /* reciprocal */
- dtbl[DCTSIZE2 * 1] = (DCTELEM) 0; /* correction */
- dtbl[DCTSIZE2 * 2] = (DCTELEM) 1; /* scale */
- dtbl[DCTSIZE2 * 3] = -(DCTELEM) (sizeof(DCTELEM) * 8); /* shift */
+ dtbl[DCTSIZE2 * 0] = (DCTELEM)1; /* reciprocal */
+ dtbl[DCTSIZE2 * 1] = (DCTELEM)0; /* correction */
+ dtbl[DCTSIZE2 * 2] = (DCTELEM)1; /* scale */
+ dtbl[DCTSIZE2 * 3] = -(DCTELEM)(sizeof(DCTELEM) * 8); /* shift */
return 0;
}
@@ -195,28 +195,28 @@ compute_reciprocal (UINT16 divisor, DCTELEM *dtbl)
fq = ((UDCTELEM2)1 << r) / divisor;
fr = ((UDCTELEM2)1 << r) % divisor;
- c = divisor / 2; /* for rounding */
+ c = divisor / 2; /* for rounding */
- if (fr == 0) { /* divisor is power of two */
+ if (fr == 0) { /* divisor is power of two */
/* fq will be one bit too large to fit in DCTELEM, so adjust */
fq >>= 1;
r--;
- } else if (fr <= (divisor / 2U)) { /* fractional part is < 0.5 */
+ } else if (fr <= (divisor / 2U)) { /* fractional part is < 0.5 */
c++;
- } else { /* fractional part is > 0.5 */
+ } else { /* fractional part is > 0.5 */
fq++;
}
- dtbl[DCTSIZE2 * 0] = (DCTELEM) fq; /* reciprocal */
- dtbl[DCTSIZE2 * 1] = (DCTELEM) c; /* correction + roundfactor */
+ dtbl[DCTSIZE2 * 0] = (DCTELEM)fq; /* reciprocal */
+ dtbl[DCTSIZE2 * 1] = (DCTELEM)c; /* correction + roundfactor */
#ifdef WITH_SIMD
- dtbl[DCTSIZE2 * 2] = (DCTELEM) (1 << (sizeof(DCTELEM)*8*2 - r)); /* scale */
+ dtbl[DCTSIZE2 * 2] = (DCTELEM)(1 << (sizeof(DCTELEM) * 8 * 2 - r)); /* scale */
#else
dtbl[DCTSIZE2 * 2] = 1;
#endif
- dtbl[DCTSIZE2 * 3] = (DCTELEM) r - sizeof(DCTELEM)*8; /* shift */
+ dtbl[DCTSIZE2 * 3] = (DCTELEM)r - sizeof(DCTELEM) * 8; /* shift */
- if(r <= 16) return 0;
+ if (r <= 16) return 0;
else return 1;
}
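
compute_reciprocal() is the classic replacement of division by a multiply-and-shift with a precomputed fixed-point reciprocal; the comparison on fr decides whether the rounding error is absorbed by the correction term c or by bumping the reciprocal itself. A self-contained illustration of the idea (simplified: r is hard-coded here rather than derived from flss()):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
  uint16_t divisor = 13;
  int r = 16 + 3;                        /* sizeof(DCTELEM)*8 + (flss(13)-1) */
  uint32_t fq = ((uint32_t)1 << r) / divisor;
  uint32_t fr = ((uint32_t)1 << r) % divisor;
  uint32_t c  = divisor / 2;             /* rounding correction */
  if (fr == 0) { fq >>= 1; r--; }        /* divisor is a power of two */
  else if (fr <= divisor / 2U) c++;      /* fractional part < 0.5 */
  else fq++;                             /* fractional part > 0.5 */

  uint16_t x = 1000;
  uint32_t q = (uint32_t)(((uint64_t)(x + c) * fq) >> r);
  printf("%u %u\n", q, (x + divisor / 2) / divisor);   /* both print 77 */
  return 0;
}
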
@@ -233,9 +233,9 @@ compute_reciprocal (UINT16 divisor, DCTELEM *dtbl)
*/
METHODDEF(void)
-start_pass_fdctmgr (j_compress_ptr cinfo)
+start_pass_fdctmgr(j_compress_ptr cinfo)
{
- my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct;
+ my_fdct_ptr fdct = (my_fdct_ptr)cinfo->fdct;
int ci, qtblno, i;
jpeg_component_info *compptr;
JQUANT_TBL *qtbl;
@@ -259,7 +259,7 @@ start_pass_fdctmgr (j_compress_ptr cinfo)
*/
if (fdct->divisors[qtblno] == NULL) {
fdct->divisors[qtblno] = (DCTELEM *)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
(DCTSIZE2 * 4) * sizeof(DCTELEM));
}
dtbl = fdct->divisors[qtblno];
@@ -269,7 +269,7 @@ start_pass_fdctmgr (j_compress_ptr cinfo)
fdct->quantize == jsimd_quantize)
fdct->quantize = quantize;
#else
- dtbl[i] = ((DCTELEM) qtbl->quantval[i]) << 3;
+ dtbl[i] = ((DCTELEM)qtbl->quantval[i]) << 3;
#endif
}
break;
@@ -283,7 +283,7 @@ start_pass_fdctmgr (j_compress_ptr cinfo)
* scalefactor[k] = cos(k*PI/16) * sqrt(2) for k=1..7
* We apply a further scale factor of 8.
*/
-#define CONST_BITS 14
+#define CONST_BITS 14
static const INT16 aanscales[DCTSIZE2] = {
/* precomputed values scaled up by 14 bits */
16384, 22725, 21407, 19266, 16384, 12873, 8867, 4520,
@@ -299,23 +299,23 @@ start_pass_fdctmgr (j_compress_ptr cinfo)
if (fdct->divisors[qtblno] == NULL) {
fdct->divisors[qtblno] = (DCTELEM *)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
(DCTSIZE2 * 4) * sizeof(DCTELEM));
}
dtbl = fdct->divisors[qtblno];
for (i = 0; i < DCTSIZE2; i++) {
#if BITS_IN_JSAMPLE == 8
if (!compute_reciprocal(
- DESCALE(MULTIPLY16V16((JLONG) qtbl->quantval[i],
- (JLONG) aanscales[i]),
- CONST_BITS-3), &dtbl[i]) &&
+ DESCALE(MULTIPLY16V16((JLONG)qtbl->quantval[i],
+ (JLONG)aanscales[i]),
+ CONST_BITS - 3), &dtbl[i]) &&
fdct->quantize == jsimd_quantize)
fdct->quantize = quantize;
#else
- dtbl[i] = (DCTELEM)
- DESCALE(MULTIPLY16V16((JLONG) qtbl->quantval[i],
- (JLONG) aanscales[i]),
- CONST_BITS-3);
+ dtbl[i] = (DCTELEM)
+ DESCALE(MULTIPLY16V16((JLONG)qtbl->quantval[i],
+ (JLONG)aanscales[i]),
+ CONST_BITS - 3);
#endif
}
}
@@ -341,7 +341,7 @@ start_pass_fdctmgr (j_compress_ptr cinfo)
if (fdct->float_divisors[qtblno] == NULL) {
fdct->float_divisors[qtblno] = (FAST_FLOAT *)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
DCTSIZE2 * sizeof(FAST_FLOAT));
}
fdtbl = fdct->float_divisors[qtblno];
@@ -349,7 +349,7 @@ start_pass_fdctmgr (j_compress_ptr cinfo)
for (row = 0; row < DCTSIZE; row++) {
for (col = 0; col < DCTSIZE; col++) {
fdtbl[i] = (FAST_FLOAT)
- (1.0 / (((double) qtbl->quantval[i] *
+ (1.0 / (((double)qtbl->quantval[i] *
aanscalefactor[row] * aanscalefactor[col] * 8.0)));
i++;
}
@@ -370,7 +370,7 @@ start_pass_fdctmgr (j_compress_ptr cinfo)
*/
METHODDEF(void)
-convsamp (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace)
+convsamp(JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace)
{
register DCTELEM *workspaceptr;
register JSAMPROW elemptr;
@@ -381,19 +381,19 @@ convsamp (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace)
elemptr = sample_data[elemr] + start_col;
#if DCTSIZE == 8 /* unroll the inner loop */
- *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
- *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
- *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
- *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
- *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
- *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
- *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
- *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
+ *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
+ *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
+ *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
+ *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
+ *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
+ *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
+ *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
+ *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
#else
{
register int elemc;
for (elemc = DCTSIZE; elemc > 0; elemc--)
- *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
+ *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
}
#endif
}
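
convsamp() does nothing more than level-shift unsigned samples into the signed range the forward DCT expects; the unrolled body is purely an optimization. The whole per-sample transformation, in a trivial standalone form:

#include <stdio.h>

#define CENTERJSAMPLE 128

int main(void)
{
  unsigned char samples[4] = { 0, 127, 128, 255 };
  for (int i = 0; i < 4; i++)                 /* 0..255 maps to -128..127 */
    printf("%d ", samples[i] - CENTERJSAMPLE);
  printf("\n");                               /* -128 -1 0 127 */
  return 0;
}
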
@@ -405,7 +405,7 @@ convsamp (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace)
*/
METHODDEF(void)
-quantize (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
+quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
{
int i;
DCTELEM temp;
@@ -426,15 +426,15 @@ quantize (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
if (temp < 0) {
temp = -temp;
product = (UDCTELEM2)(temp + corr) * recip;
- product >>= shift + sizeof(DCTELEM)*8;
+ product >>= shift + sizeof(DCTELEM) * 8;
temp = (DCTELEM)product;
temp = -temp;
} else {
product = (UDCTELEM2)(temp + corr) * recip;
- product >>= shift + sizeof(DCTELEM)*8;
+ product >>= shift + sizeof(DCTELEM) * 8;
temp = (DCTELEM)product;
}
- output_ptr[i] = (JCOEF) temp;
+ output_ptr[i] = (JCOEF)temp;
}
#else
@@ -457,20 +457,20 @@ quantize (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
* If your machine's division is fast enough, define FAST_DIVIDE.
*/
#ifdef FAST_DIVIDE
-#define DIVIDE_BY(a,b) a /= b
+#define DIVIDE_BY(a, b) a /= b
#else
-#define DIVIDE_BY(a,b) if (a >= b) a /= b; else a = 0
+#define DIVIDE_BY(a, b) if (a >= b) a /= b; else a = 0
#endif
if (temp < 0) {
temp = -temp;
- temp += qval>>1; /* for rounding */
+ temp += qval >> 1; /* for rounding */
DIVIDE_BY(temp, qval);
temp = -temp;
} else {
- temp += qval>>1; /* for rounding */
+ temp += qval >> 1; /* for rounding */
DIVIDE_BY(temp, qval);
}
- output_ptr[i] = (JCOEF) temp;
+ output_ptr[i] = (JCOEF)temp;
}
#endif
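
The non-reciprocal quantization path above is a guarded rounded division: add half the quantizer, then divide, with DIVIDE_BY() skipping the division entirely when the biased coefficient is smaller than the divisor (which in that case yields the correct quotient of 0 anyway, sparing a slow divide on some machines). A hedged sketch with a hypothetical helper:

#include <stdio.h>

#define DIVIDE_BY(a, b) if (a >= b) a /= b; else a = 0

static int quantize_one(int coef, int qval)   /* hypothetical helper */
{
  int temp = coef;
  if (temp < 0) {
    temp = -temp;
    temp += qval >> 1;          /* round |coef| / qval to nearest */
    DIVIDE_BY(temp, qval);
    temp = -temp;
  } else {
    temp += qval >> 1;
    DIVIDE_BY(temp, qval);
  }
  return temp;
}

int main(void)
{
  printf("%d %d %d\n", quantize_one(-25, 16), quantize_one(7, 16),
         quantize_one(200, 16));   /* -2 0 13 */
  return 0;
}
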
@@ -487,14 +487,13 @@ quantize (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
*/
METHODDEF(void)
-forward_DCT (j_compress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
- JDIMENSION start_row, JDIMENSION start_col,
- JDIMENSION num_blocks)
+forward_DCT(j_compress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
+ JDIMENSION start_row, JDIMENSION start_col, JDIMENSION num_blocks)
/* This version is used for integer DCT implementations. */
{
/* This routine is heavily used, so it's worth coding it tightly. */
- my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct;
+ my_fdct_ptr fdct = (my_fdct_ptr)cinfo->fdct;
DCTELEM *divisors = fdct->divisors[compptr->quant_tbl_no];
DCTELEM *workspace;
JDIMENSION bi;
@@ -522,9 +521,9 @@ forward_DCT (j_compress_ptr cinfo, jpeg_component_info *compptr,
#ifdef DCT_FLOAT_SUPPORTED
-
METHODDEF(void)
-convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace)
+convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
+ FAST_FLOAT *workspace)
{
register FAST_FLOAT *workspaceptr;
register JSAMPROW elemptr;
@@ -534,20 +533,19 @@ convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *worksp
for (elemr = 0; elemr < DCTSIZE; elemr++) {
elemptr = sample_data[elemr] + start_col;
#if DCTSIZE == 8 /* unroll the inner loop */
- *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
- *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
- *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
- *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
- *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
- *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
- *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
- *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
+ *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
+ *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
+ *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
+ *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
+ *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
+ *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
+ *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
+ *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
#else
{
register int elemc;
for (elemc = DCTSIZE; elemc > 0; elemc--)
- *workspaceptr++ = (FAST_FLOAT)
- (GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
+ *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
}
#endif
}
@@ -555,7 +553,8 @@ convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *worksp
METHODDEF(void)
-quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace)
+quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+ FAST_FLOAT *workspace)
{
register FAST_FLOAT temp;
register int i;
@@ -571,20 +570,20 @@ quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace
* The maximum coefficient size is +-16K (for 12-bit data), so this
* code should work for either 16-bit or 32-bit ints.
*/
- output_ptr[i] = (JCOEF) ((int) (temp + (FAST_FLOAT) 16384.5) - 16384);
+ output_ptr[i] = (JCOEF)((int)(temp + (FAST_FLOAT)16384.5) - 16384);
}
}
METHODDEF(void)
-forward_DCT_float (j_compress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
- JDIMENSION start_row, JDIMENSION start_col,
- JDIMENSION num_blocks)
+forward_DCT_float(j_compress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
+ JDIMENSION start_row, JDIMENSION start_col,
+ JDIMENSION num_blocks)
/* This version is used for floating-point DCT implementations. */
{
/* This routine is heavily used, so it's worth coding it tightly. */
- my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct;
+ my_fdct_ptr fdct = (my_fdct_ptr)cinfo->fdct;
FAST_FLOAT *divisors = fdct->float_divisors[compptr->quant_tbl_no];
FAST_FLOAT *workspace;
JDIMENSION bi;
@@ -618,15 +617,15 @@ forward_DCT_float (j_compress_ptr cinfo, jpeg_component_info *compptr,
*/
GLOBAL(void)
-jinit_forward_dct (j_compress_ptr cinfo)
+jinit_forward_dct(j_compress_ptr cinfo)
{
my_fdct_ptr fdct;
int i;
fdct = (my_fdct_ptr)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
sizeof(my_fdct_controller));
- cinfo->fdct = (struct jpeg_forward_dct *) fdct;
+ cinfo->fdct = (struct jpeg_forward_dct *)fdct;
fdct->pub.start_pass = start_pass_fdctmgr;
/* First determine the DCT... */
@@ -703,12 +702,12 @@ jinit_forward_dct (j_compress_ptr cinfo)
#ifdef DCT_FLOAT_SUPPORTED
if (cinfo->dct_method == JDCT_FLOAT)
fdct->float_workspace = (FAST_FLOAT *)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
sizeof(FAST_FLOAT) * DCTSIZE2);
else
#endif
fdct->workspace = (DCTELEM *)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
sizeof(DCTELEM) * DCTSIZE2);
/* Mark divisor tables unallocated */
diff --git a/media/libjpeg/jchuff.c b/media/libjpeg/jchuff.c
index fffaacebce..f4dfa1cb54 100644
--- a/media/libjpeg/jchuff.c
+++ b/media/libjpeg/jchuff.c
@@ -4,8 +4,10 @@
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1991-1997, Thomas G. Lane.
* libjpeg-turbo Modifications:
- * Copyright (C) 2009-2011, 2014-2016, D. R. Commander.
+ * Copyright (C) 2009-2011, 2014-2016, 2018-2022, D. R. Commander.
* Copyright (C) 2015, Matthieu Darbois.
+ * Copyright (C) 2018, Matthias Räncker.
+ * Copyright (C) 2020, Arm Limited.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@@ -16,6 +18,9 @@
* back up to the start of the current MCU. To do this, we copy state
* variables into local working storage, and update them back to the
* permanent JPEG objects only upon successful completion of an MCU.
+ *
+ * NOTE: All referenced figures are from
+ * Recommendation ITU-T T.81 (1992) | ISO/IEC 10918-1:1994.
*/
#define JPEG_INTERNALS
@@ -31,32 +36,33 @@
* memory footprint by 64k, which is important for some mobile applications
* that create many isolated instances of libjpeg-turbo (web browsers, for
* instance.) This may improve performance on some mobile platforms as well.
- * This feature is enabled by default only on ARM processors, because some x86
+ * This feature is enabled by default only on Arm processors, because some x86
* chips have a slow implementation of bsr, and the use of clz/bsr cannot be
* shown to have a significant performance impact even on the x86 chips that
- * have a fast implementation of it. When building for ARMv6, you can
+ * have a fast implementation of it. When building for Armv6, you can
* explicitly disable the use of clz/bsr by adding -mthumb to the compiler
* flags (this defines __thumb__).
*/
/* NOTE: Both GCC and Clang define __GNUC__ */
-#if defined __GNUC__ && (defined __arm__ || defined __aarch64__)
-#if !defined __thumb__ || defined __thumb2__
+#if (defined(__GNUC__) && (defined(__arm__) || defined(__aarch64__))) || \
+ defined(_M_ARM) || defined(_M_ARM64)
+#if !defined(__thumb__) || defined(__thumb2__)
#define USE_CLZ_INTRINSIC
#endif
#endif
#ifdef USE_CLZ_INTRINSIC
-#define JPEG_NBITS_NONZERO(x) (32 - __builtin_clz(x))
-#define JPEG_NBITS(x) (x ? JPEG_NBITS_NONZERO(x) : 0)
+#if defined(_MSC_VER) && !defined(__clang__)
+#define JPEG_NBITS_NONZERO(x) (32 - _CountLeadingZeros(x))
#else
-#include "jpeg_nbits_table.h"
-#define JPEG_NBITS(x) (jpeg_nbits_table[x])
-#define JPEG_NBITS_NONZERO(x) JPEG_NBITS(x)
+#define JPEG_NBITS_NONZERO(x) (32 - __builtin_clz(x))
#endif
-
-#ifndef min
- #define min(a,b) ((a)<(b)?(a):(b))
+#define JPEG_NBITS(x) (x ? JPEG_NBITS_NONZERO(x) : 0)
+#else
+#include "jpeg_nbits_table.h"
+#define JPEG_NBITS(x) (jpeg_nbits_table[x])
+#define JPEG_NBITS_NONZERO(x) JPEG_NBITS(x)
#endif
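
JPEG_NBITS(x) computes the bit length of x (one plus the index of its highest set bit), which Huffman encoding needs for every coefficient; the reorganized block above adds an MSVC spelling of the count-leading-zeros intrinsic and keeps the 64 KB lookup table as the fallback for chips with slow bsr. A GCC/Clang-only sketch of the intrinsic path:

#include <stdio.h>

/* bit length of x: 0 -> 0, 1 -> 1, 255 -> 8, 256 -> 9 (32-bit unsigned) */
static int jpeg_nbits(unsigned x)
{
  return x ? 32 - __builtin_clz(x) : 0;   /* clz(0) is undefined, so guard */
}

int main(void)
{
  printf("%d %d %d %d\n", jpeg_nbits(0), jpeg_nbits(1), jpeg_nbits(255),
         jpeg_nbits(256));                /* 0 1 8 9 */
  return 0;
}
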
@@ -66,31 +72,42 @@
* but must not be updated permanently until we complete the MCU.
*/
-typedef struct {
- size_t put_buffer; /* current bit-accumulation buffer */
- int put_bits; /* # of bits now in it */
- int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */
-} savable_state;
+#if defined(__x86_64__) && defined(__ILP32__)
+typedef unsigned long long bit_buf_type;
+#else
+typedef size_t bit_buf_type;
+#endif
-/* This macro is to work around compilers with missing or broken
- * structure assignment. You'll need to fix this code if you have
- * such a compiler and you change MAX_COMPS_IN_SCAN.
+/* NOTE: The more optimal Huffman encoding algorithm is only used by the
+ * intrinsics implementation of the Arm Neon SIMD extensions, which is why we
+ * retain the old Huffman encoder behavior when using the GAS implementation.
*/
-
-#ifndef NO_STRUCT_ASSIGN
-#define ASSIGN_STATE(dest,src) ((dest) = (src))
+#if defined(WITH_SIMD) && !(defined(__arm__) || defined(__aarch64__) || \
+ defined(_M_ARM) || defined(_M_ARM64))
+typedef unsigned long long simd_bit_buf_type;
#else
-#if MAX_COMPS_IN_SCAN == 4
-#define ASSIGN_STATE(dest,src) \
- ((dest).put_buffer = (src).put_buffer, \
- (dest).put_bits = (src).put_bits, \
- (dest).last_dc_val[0] = (src).last_dc_val[0], \
- (dest).last_dc_val[1] = (src).last_dc_val[1], \
- (dest).last_dc_val[2] = (src).last_dc_val[2], \
- (dest).last_dc_val[3] = (src).last_dc_val[3])
+typedef bit_buf_type simd_bit_buf_type;
#endif
+
+#if (defined(SIZEOF_SIZE_T) && SIZEOF_SIZE_T == 8) || defined(_WIN64) || \
+ (defined(__x86_64__) && defined(__ILP32__))
+#define BIT_BUF_SIZE 64
+#elif (defined(SIZEOF_SIZE_T) && SIZEOF_SIZE_T == 4) || defined(_WIN32)
+#define BIT_BUF_SIZE 32
+#else
+#error Cannot determine word size
#endif
+#define SIMD_BIT_BUF_SIZE (sizeof(simd_bit_buf_type) * 8)
+typedef struct {
+ union {
+ bit_buf_type c;
+ simd_bit_buf_type simd;
+ } put_buffer; /* current bit accumulation buffer */
+ int free_bits; /* # of bits available in it */
+ /* (Neon GAS: # of bits now in it) */
+ int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */
+} savable_state;
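
Note the inversion in the new savable_state: the buffer now tracks free_bits (bits still available) rather than put_bits (bits consumed), so the must-flush test after inserting a code becomes a simple sign check instead of a comparison against the buffer width. A minimal sketch of that bookkeeping (32-bit buffer for brevity; names are illustrative):

#include <stdio.h>

int main(void)
{
  unsigned put_buffer = 0;
  int free_bits = 32;                   /* whole buffer available */
  unsigned code = 0x5; int size = 3;    /* 3-bit code to insert */

  free_bits -= size;
  if (free_bits < 0)
    printf("flush first\n");            /* PUT_AND_FLUSH() in the patch */
  else
    put_buffer = (put_buffer << size) | code;
  printf("free_bits=%d buffer=0x%X\n", free_bits, put_buffer); /* 29, 0x5 */
  return 0;
}
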
typedef struct {
struct jpeg_entropy_encoder pub; /* public fields */
@@ -124,16 +141,17 @@ typedef struct {
size_t free_in_buffer; /* # of byte spaces remaining in buffer */
savable_state cur; /* Current bit buffer & DC state */
j_compress_ptr cinfo; /* dump_buffer needs access to this */
+ int simd;
} working_state;
/* Forward declarations */
-METHODDEF(boolean) encode_mcu_huff (j_compress_ptr cinfo, JBLOCKROW *MCU_data);
-METHODDEF(void) finish_pass_huff (j_compress_ptr cinfo);
+METHODDEF(boolean) encode_mcu_huff(j_compress_ptr cinfo, JBLOCKROW *MCU_data);
+METHODDEF(void) finish_pass_huff(j_compress_ptr cinfo);
#ifdef ENTROPY_OPT_SUPPORTED
-METHODDEF(boolean) encode_mcu_gather (j_compress_ptr cinfo,
- JBLOCKROW *MCU_data);
-METHODDEF(void) finish_pass_gather (j_compress_ptr cinfo);
+METHODDEF(boolean) encode_mcu_gather(j_compress_ptr cinfo,
+ JBLOCKROW *MCU_data);
+METHODDEF(void) finish_pass_gather(j_compress_ptr cinfo);
#endif
@@ -144,9 +162,9 @@ METHODDEF(void) finish_pass_gather (j_compress_ptr cinfo);
*/
METHODDEF(void)
-start_pass_huff (j_compress_ptr cinfo, boolean gather_statistics)
+start_pass_huff(j_compress_ptr cinfo, boolean gather_statistics)
{
- huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+ huff_entropy_ptr entropy = (huff_entropy_ptr)cinfo->entropy;
int ci, dctbl, actbl;
jpeg_component_info *compptr;
@@ -180,30 +198,39 @@ start_pass_huff (j_compress_ptr cinfo, boolean gather_statistics)
/* Note that jpeg_gen_optimal_table expects 257 entries in each table! */
if (entropy->dc_count_ptrs[dctbl] == NULL)
entropy->dc_count_ptrs[dctbl] = (long *)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
257 * sizeof(long));
- MEMZERO(entropy->dc_count_ptrs[dctbl], 257 * sizeof(long));
+ memset(entropy->dc_count_ptrs[dctbl], 0, 257 * sizeof(long));
if (entropy->ac_count_ptrs[actbl] == NULL)
entropy->ac_count_ptrs[actbl] = (long *)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
257 * sizeof(long));
- MEMZERO(entropy->ac_count_ptrs[actbl], 257 * sizeof(long));
+ memset(entropy->ac_count_ptrs[actbl], 0, 257 * sizeof(long));
#endif
} else {
/* Compute derived values for Huffman tables */
/* We may do this more than once for a table, but it's not expensive */
jpeg_make_c_derived_tbl(cinfo, TRUE, dctbl,
- & entropy->dc_derived_tbls[dctbl]);
+ &entropy->dc_derived_tbls[dctbl]);
jpeg_make_c_derived_tbl(cinfo, FALSE, actbl,
- & entropy->ac_derived_tbls[actbl]);
+ &entropy->ac_derived_tbls[actbl]);
}
/* Initialize DC predictions to 0 */
entropy->saved.last_dc_val[ci] = 0;
}
/* Initialize bit buffer to empty */
- entropy->saved.put_buffer = 0;
- entropy->saved.put_bits = 0;
+ if (entropy->simd) {
+ entropy->saved.put_buffer.simd = 0;
+#if defined(__aarch64__) && !defined(NEON_INTRINSICS)
+ entropy->saved.free_bits = 0;
+#else
+ entropy->saved.free_bits = SIMD_BIT_BUF_SIZE;
+#endif
+ } else {
+ entropy->saved.put_buffer.c = 0;
+ entropy->saved.free_bits = BIT_BUF_SIZE;
+ }
/* Initialize restart stuff */
entropy->restarts_to_go = cinfo->restart_interval;
@@ -219,8 +246,8 @@ start_pass_huff (j_compress_ptr cinfo, boolean gather_statistics)
*/
GLOBAL(void)
-jpeg_make_c_derived_tbl (j_compress_ptr cinfo, boolean isDC, int tblno,
- c_derived_tbl **pdtbl)
+jpeg_make_c_derived_tbl(j_compress_ptr cinfo, boolean isDC, int tblno,
+ c_derived_tbl **pdtbl)
{
JHUFF_TBL *htbl;
c_derived_tbl *dtbl;
@@ -244,7 +271,7 @@ jpeg_make_c_derived_tbl (j_compress_ptr cinfo, boolean isDC, int tblno,
/* Allocate a workspace if we haven't already done so. */
if (*pdtbl == NULL)
*pdtbl = (c_derived_tbl *)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
sizeof(c_derived_tbl));
dtbl = *pdtbl;
@@ -252,11 +279,11 @@ jpeg_make_c_derived_tbl (j_compress_ptr cinfo, boolean isDC, int tblno,
p = 0;
for (l = 1; l <= 16; l++) {
- i = (int) htbl->bits[l];
+ i = (int)htbl->bits[l];
if (i < 0 || p + i > 256) /* protect against table overrun */
ERREXIT(cinfo, JERR_BAD_HUFF_TABLE);
while (i--)
- huffsize[p++] = (char) l;
+ huffsize[p++] = (char)l;
}
huffsize[p] = 0;
lastp = p;
@@ -268,14 +295,14 @@ jpeg_make_c_derived_tbl (j_compress_ptr cinfo, boolean isDC, int tblno,
si = huffsize[0];
p = 0;
while (huffsize[p]) {
- while (((int) huffsize[p]) == si) {
+ while (((int)huffsize[p]) == si) {
huffcode[p++] = code;
code++;
}
/* code is now 1 more than the last code used for codelength si; but
* it must still fit in si bits, since no code is allowed to be all ones.
*/
- if (((JLONG) code) >= (((JLONG) 1) << si))
+ if (((JLONG)code) >= (((JLONG)1) << si))
ERREXIT(cinfo, JERR_BAD_HUFF_TABLE);
code <<= 1;
si++;
@@ -288,7 +315,8 @@ jpeg_make_c_derived_tbl (j_compress_ptr cinfo, boolean isDC, int tblno,
* this lets us detect duplicate VAL entries here, and later
* allows emit_bits to detect any attempt to emit such symbols.
*/
- MEMZERO(dtbl->ehufsi, sizeof(dtbl->ehufsi));
+ memset(dtbl->ehufco, 0, sizeof(dtbl->ehufco));
+ memset(dtbl->ehufsi, 0, sizeof(dtbl->ehufsi));
/* This is also a convenient place to check for out-of-range
* and duplicated VAL entries. We allow 0..255 for AC symbols
@@ -310,20 +338,21 @@ jpeg_make_c_derived_tbl (j_compress_ptr cinfo, boolean isDC, int tblno,
/* Outputting bytes to the file */
/* Emit a byte, taking 'action' if must suspend. */
-#define emit_byte(state,val,action) \
- { *(state)->next_output_byte++ = (JOCTET) (val); \
- if (--(state)->free_in_buffer == 0) \
- if (! dump_buffer(state)) \
- { action; } }
+#define emit_byte(state, val, action) { \
+ *(state)->next_output_byte++ = (JOCTET)(val); \
+ if (--(state)->free_in_buffer == 0) \
+ if (!dump_buffer(state)) \
+ { action; } \
+}
LOCAL(boolean)
-dump_buffer (working_state *state)
+dump_buffer(working_state *state)
/* Empty the output buffer; return TRUE if successful, FALSE if must suspend */
{
struct jpeg_destination_mgr *dest = state->cinfo->dest;
- if (! (*dest->empty_output_buffer) (state->cinfo))
+ if (!(*dest->empty_output_buffer) (state->cinfo))
return FALSE;
/* After a successful buffer dump, must reset buffer pointers */
state->next_output_byte = dest->next_output_byte;
@@ -334,89 +363,93 @@ dump_buffer (working_state *state)
/* Outputting bits to the file */
-/* These macros perform the same task as the emit_bits() function in the
- * original libjpeg code. In addition to reducing overhead by explicitly
- * inlining the code, additional performance is achieved by taking into
- * account the size of the bit buffer and waiting until it is almost full
- * before emptying it. This mostly benefits 64-bit platforms, since 6
- * bytes can be stored in a 64-bit bit buffer before it has to be emptied.
+/* Output byte b and, speculatively, an additional 0 byte. 0xFF must be
+ * encoded as 0xFF 0x00, so the output buffer pointer is advanced by 2 if the
+ * byte is 0xFF. Otherwise, the output buffer pointer is advanced by 1, and
+ * the speculative 0 byte will be overwritten by the next byte.
*/
-
-#define EMIT_BYTE() { \
- JOCTET c; \
- put_bits -= 8; \
- c = (JOCTET)GETJOCTET(put_buffer >> put_bits); \
- *buffer++ = c; \
- if (c == 0xFF) /* need to stuff a zero byte? */ \
- *buffer++ = 0; \
- }
-
-#define PUT_BITS(code, size) { \
- put_bits += size; \
- put_buffer = (put_buffer << size) | code; \
+#define EMIT_BYTE(b) { \
+ buffer[0] = (JOCTET)(b); \
+ buffer[1] = 0; \
+ buffer -= -2 + ((JOCTET)(b) < 0xFF); \
}
-#define CHECKBUF15() { \
- if (put_bits > 15) { \
- EMIT_BYTE() \
- EMIT_BYTE() \
+/* Output the entire bit buffer. If there are no 0xFF bytes in it, then write
+ * directly to the output buffer. Otherwise, use the EMIT_BYTE() macro to
+ * encode 0xFF as 0xFF 0x00.
+ */
+#if BIT_BUF_SIZE == 64
+
+#define FLUSH() { \
+ if (put_buffer & 0x8080808080808080 & ~(put_buffer + 0x0101010101010101)) { \
+ EMIT_BYTE(put_buffer >> 56) \
+ EMIT_BYTE(put_buffer >> 48) \
+ EMIT_BYTE(put_buffer >> 40) \
+ EMIT_BYTE(put_buffer >> 32) \
+ EMIT_BYTE(put_buffer >> 24) \
+ EMIT_BYTE(put_buffer >> 16) \
+ EMIT_BYTE(put_buffer >> 8) \
+ EMIT_BYTE(put_buffer ) \
+ } else { \
+ buffer[0] = (JOCTET)(put_buffer >> 56); \
+ buffer[1] = (JOCTET)(put_buffer >> 48); \
+ buffer[2] = (JOCTET)(put_buffer >> 40); \
+ buffer[3] = (JOCTET)(put_buffer >> 32); \
+ buffer[4] = (JOCTET)(put_buffer >> 24); \
+ buffer[5] = (JOCTET)(put_buffer >> 16); \
+ buffer[6] = (JOCTET)(put_buffer >> 8); \
+ buffer[7] = (JOCTET)(put_buffer); \
+ buffer += 8; \
} \
}
-#define CHECKBUF31() { \
- if (put_bits > 31) { \
- EMIT_BYTE() \
- EMIT_BYTE() \
- EMIT_BYTE() \
- EMIT_BYTE() \
- } \
-}
+#else
-#define CHECKBUF47() { \
- if (put_bits > 47) { \
- EMIT_BYTE() \
- EMIT_BYTE() \
- EMIT_BYTE() \
- EMIT_BYTE() \
- EMIT_BYTE() \
- EMIT_BYTE() \
+#define FLUSH() { \
+ if (put_buffer & 0x80808080 & ~(put_buffer + 0x01010101)) { \
+ EMIT_BYTE(put_buffer >> 24) \
+ EMIT_BYTE(put_buffer >> 16) \
+ EMIT_BYTE(put_buffer >> 8) \
+ EMIT_BYTE(put_buffer ) \
+ } else { \
+ buffer[0] = (JOCTET)(put_buffer >> 24); \
+ buffer[1] = (JOCTET)(put_buffer >> 16); \
+ buffer[2] = (JOCTET)(put_buffer >> 8); \
+ buffer[3] = (JOCTET)(put_buffer); \
+ buffer += 4; \
} \
}
-#if !defined(_WIN32) && !defined(SIZEOF_SIZE_T)
-#error Cannot determine word size
#endif
-#if SIZEOF_SIZE_T==8 || defined(_WIN64)
-
-#define EMIT_BITS(code, size) { \
- CHECKBUF47() \
- PUT_BITS(code, size) \
+/* Fill the bit buffer to capacity with the leading bits from code, then output
+ * the bit buffer and put the remaining bits from code into the bit buffer.
+ */
+#define PUT_AND_FLUSH(code, size) { \
+ put_buffer = (put_buffer << (size + free_bits)) | (code >> -free_bits); \
+ FLUSH() \
+ free_bits += BIT_BUF_SIZE; \
+ put_buffer = code; \
}
-#define EMIT_CODE(code, size) { \
- temp2 &= (((JLONG) 1)<<nbits) - 1; \
- CHECKBUF31() \
- PUT_BITS(code, size) \
- PUT_BITS(temp2, nbits) \
- }
-
-#else
-
-#define EMIT_BITS(code, size) { \
- PUT_BITS(code, size) \
- CHECKBUF15() \
+/* Insert code into the bit buffer and output the bit buffer if needed.
+ * NOTE: We can't flush with free_bits == 0, since the left shift in
+ * PUT_AND_FLUSH() would have undefined behavior.
+ */
+#define PUT_BITS(code, size) { \
+ free_bits -= size; \
+ if (free_bits < 0) \
+ PUT_AND_FLUSH(code, size) \
+ else \
+ put_buffer = (put_buffer << size) | code; \
}
-#define EMIT_CODE(code, size) { \
- temp2 &= (((JLONG) 1)<<nbits) - 1; \
- PUT_BITS(code, size) \
- CHECKBUF15() \
- PUT_BITS(temp2, nbits) \
- CHECKBUF15() \
- }
-
-#endif
+#define PUT_CODE(code, size) { \
+ temp &= (((JLONG)1) << nbits) - 1; \
+ temp |= code << nbits; \
+ nbits += size; \
+ PUT_BITS(temp, nbits) \
+}
/* Although it is exceedingly rare, it is possible for a Huffman-encoded
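Two tricks in the rewritten output path deserve unpacking. EMIT_BYTE() stores the byte plus a speculative 0x00 and then advances the pointer with buffer -= -2 + (b < 0xFF), i.e. by 2 after an 0xFF (keeping the stuffed zero) and by 1 otherwise, with no branch. FLUSH() first asks whether any byte in the word equals 0xFF using lane-parallel arithmetic: per byte, b & 0x80 & ~(b + 1) is nonzero only when b == 0xFF, and carries between lanes can only cause false positives, which merely fall back to the byte-by-byte path. A standalone check of the detector (64-bit word assumed):

#include <stdio.h>

static int has_ff_byte(unsigned long long w)
{
  return (w & 0x8080808080808080ULL & ~(w + 0x0101010101010101ULL)) != 0;
}

int main(void)
{
  printf("%d\n", has_ff_byte(0x0011223344556677ULL));  /* 0: no 0xFF byte */
  printf("%d\n", has_ff_byte(0x0011FF3344556677ULL));  /* 1: 0xFF present */
  return 0;
}
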
@@ -428,55 +461,81 @@ dump_buffer (working_state *state)
* scanning order-- 1, 8, 16, etc.), then this will produce an encoded block
* larger than 200 bytes.
*/
-#define BUFSIZE (DCTSIZE2 * 4)
+#define BUFSIZE (DCTSIZE2 * 8)
#define LOAD_BUFFER() { \
if (state->free_in_buffer < BUFSIZE) { \
localbuf = 1; \
buffer = _buffer; \
- } \
- else buffer = state->next_output_byte; \
- }
+ } else \
+ buffer = state->next_output_byte; \
+}
#define STORE_BUFFER() { \
if (localbuf) { \
+ size_t bytes, bytestocopy; \
bytes = buffer - _buffer; \
buffer = _buffer; \
while (bytes > 0) { \
- bytestocopy = min(bytes, state->free_in_buffer); \
- MEMCOPY(state->next_output_byte, buffer, bytestocopy); \
+ bytestocopy = MIN(bytes, state->free_in_buffer); \
+ memcpy(state->next_output_byte, buffer, bytestocopy); \
state->next_output_byte += bytestocopy; \
buffer += bytestocopy; \
state->free_in_buffer -= bytestocopy; \
if (state->free_in_buffer == 0) \
- if (! dump_buffer(state)) return FALSE; \
+ if (!dump_buffer(state)) return FALSE; \
bytes -= bytestocopy; \
} \
- } \
- else { \
+ } else { \
state->free_in_buffer -= (buffer - state->next_output_byte); \
state->next_output_byte = buffer; \
} \
- }
+}
LOCAL(boolean)
-flush_bits (working_state *state)
+flush_bits(working_state *state)
{
- JOCTET _buffer[BUFSIZE], *buffer;
- size_t put_buffer; int put_bits;
- size_t bytes, bytestocopy; int localbuf = 0;
+ JOCTET _buffer[BUFSIZE], *buffer, temp;
+ simd_bit_buf_type put_buffer; int put_bits;
+ int localbuf = 0;
+
+ if (state->simd) {
+#if defined(__aarch64__) && !defined(NEON_INTRINSICS)
+ put_bits = state->cur.free_bits;
+#else
+ put_bits = SIMD_BIT_BUF_SIZE - state->cur.free_bits;
+#endif
+ put_buffer = state->cur.put_buffer.simd;
+ } else {
+ put_bits = BIT_BUF_SIZE - state->cur.free_bits;
+ put_buffer = state->cur.put_buffer.c;
+ }
- put_buffer = state->cur.put_buffer;
- put_bits = state->cur.put_bits;
LOAD_BUFFER()
- /* fill any partial byte with ones */
- PUT_BITS(0x7F, 7)
- while (put_bits >= 8) EMIT_BYTE()
+ while (put_bits >= 8) {
+ put_bits -= 8;
+ temp = (JOCTET)(put_buffer >> put_bits);
+ EMIT_BYTE(temp)
+ }
+ if (put_bits) {
+ /* fill partial byte with ones */
+ temp = (JOCTET)((put_buffer << (8 - put_bits)) | (0xFF >> put_bits));
+ EMIT_BYTE(temp)
+ }
- state->cur.put_buffer = 0; /* and reset bit-buffer to empty */
- state->cur.put_bits = 0;
+ if (state->simd) { /* and reset bit buffer to empty */
+ state->cur.put_buffer.simd = 0;
+#if defined(__aarch64__) && !defined(NEON_INTRINSICS)
+ state->cur.free_bits = 0;
+#else
+ state->cur.free_bits = SIMD_BIT_BUF_SIZE;
+#endif
+ } else {
+ state->cur.put_buffer.c = 0;
+ state->cur.free_bits = BIT_BUF_SIZE;
+ }
STORE_BUFFER()
return TRUE;
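
flush_bits() now pads the final partial byte with 1 bits in a single expression instead of pushing a 7-bit 0x7F through the bit buffer; padding with ones keeps the tail from mimicking a valid code, since no Huffman code is allowed to be all ones (see the comment near the end of this file). A worked check:

#include <stdio.h>

int main(void)
{
  unsigned put_buffer = 0x5;  /* 3 pending bits: 101 */
  int put_bits = 3;
  unsigned char b = (unsigned char)((put_buffer << (8 - put_bits)) |
                                    (0xFF >> put_bits));
  printf("0x%02X\n", b);      /* 0xBF: 101 followed by five 1 padding bits */
  return 0;
}
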
@@ -486,11 +545,11 @@ flush_bits (working_state *state)
/* Encode a single block's worth of coefficients */
LOCAL(boolean)
-encode_one_block_simd (working_state *state, JCOEFPTR block, int last_dc_val,
- c_derived_tbl *dctbl, c_derived_tbl *actbl)
+encode_one_block_simd(working_state *state, JCOEFPTR block, int last_dc_val,
+ c_derived_tbl *dctbl, c_derived_tbl *actbl)
{
JOCTET _buffer[BUFSIZE], *buffer;
- size_t bytes, bytestocopy; int localbuf = 0;
+ int localbuf = 0;
LOAD_BUFFER()
@@ -503,108 +562,91 @@ encode_one_block_simd (working_state *state, JCOEFPTR block, int last_dc_val,
}
LOCAL(boolean)
-encode_one_block (working_state *state, JCOEFPTR block, int last_dc_val,
- c_derived_tbl *dctbl, c_derived_tbl *actbl)
+encode_one_block(working_state *state, JCOEFPTR block, int last_dc_val,
+ c_derived_tbl *dctbl, c_derived_tbl *actbl)
{
- int temp, temp2, temp3;
- int nbits;
- int r, code, size;
+ int temp, nbits, free_bits;
+ bit_buf_type put_buffer;
JOCTET _buffer[BUFSIZE], *buffer;
- size_t put_buffer; int put_bits;
- int code_0xf0 = actbl->ehufco[0xf0], size_0xf0 = actbl->ehufsi[0xf0];
- size_t bytes, bytestocopy; int localbuf = 0;
+ int localbuf = 0;
- put_buffer = state->cur.put_buffer;
- put_bits = state->cur.put_bits;
+ free_bits = state->cur.free_bits;
+ put_buffer = state->cur.put_buffer.c;
LOAD_BUFFER()
/* Encode the DC coefficient difference per section F.1.2.1 */
- temp = temp2 = block[0] - last_dc_val;
-
- /* This is a well-known technique for obtaining the absolute value without a
- * branch. It is derived from an assembly language technique presented in
- * "How to Optimize for the Pentium Processors", Copyright (c) 1996, 1997 by
- * Agner Fog.
- */
- temp3 = temp >> (CHAR_BIT * sizeof(int) - 1);
- temp ^= temp3;
- temp -= temp3;
+ temp = block[0] - last_dc_val;
- /* For a negative input, want temp2 = bitwise complement of abs(input) */
- /* This code assumes we are on a two's complement machine */
- temp2 += temp3;
+ /* This is a well-known technique for obtaining the absolute value without a
+ * branch. It is derived from an assembly language technique presented in
+ * "How to Optimize for the Pentium Processors", Copyright (c) 1996, 1997 by
+ * Agner Fog. This code assumes we are on a two's complement machine.
+ */
+ nbits = temp >> (CHAR_BIT * sizeof(int) - 1);
+ temp += nbits;
+ nbits ^= temp;
/* Find the number of bits needed for the magnitude of the coefficient */
- nbits = JPEG_NBITS(temp);
-
- /* Emit the Huffman-coded symbol for the number of bits */
- code = dctbl->ehufco[nbits];
- size = dctbl->ehufsi[nbits];
- EMIT_BITS(code, size)
+ nbits = JPEG_NBITS(nbits);
- /* Mask off any extra bits in code */
- temp2 &= (((JLONG) 1)<<nbits) - 1;
-
- /* Emit that number of bits of the value, if positive, */
- /* or the complement of its magnitude, if negative. */
- EMIT_BITS(temp2, nbits)
+ /* Emit the Huffman-coded symbol for the number of bits.
+ * Emit that number of bits of the value, if positive,
+ * or the complement of its magnitude, if negative.
+ */
+ PUT_CODE(dctbl->ehufco[nbits], dctbl->ehufsi[nbits])
/* Encode the AC coefficients per section F.1.2.2 */
- r = 0; /* r = run length of zeros */
+ {
+ int r = 0; /* r = run length of zeros */
/* Manually unroll the k loop to eliminate the counter variable. This
* improves performance greatly on systems with a limited number of
* registers (such as x86.)
*/
-#define kloop(jpeg_natural_order_of_k) { \
+#define kloop(jpeg_natural_order_of_k) { \
if ((temp = block[jpeg_natural_order_of_k]) == 0) { \
- r++; \
+ r += 16; \
} else { \
- temp2 = temp; \
/* Branch-less absolute value, bitwise complement, etc., same as above */ \
- temp3 = temp >> (CHAR_BIT * sizeof(int) - 1); \
- temp ^= temp3; \
- temp -= temp3; \
- temp2 += temp3; \
- nbits = JPEG_NBITS_NONZERO(temp); \
+ nbits = temp >> (CHAR_BIT * sizeof(int) - 1); \
+ temp += nbits; \
+ nbits ^= temp; \
+ nbits = JPEG_NBITS_NONZERO(nbits); \
/* if run length > 15, must emit special run-length-16 codes (0xF0) */ \
- while (r > 15) { \
- EMIT_BITS(code_0xf0, size_0xf0) \
- r -= 16; \
+ while (r >= 16 * 16) { \
+ r -= 16 * 16; \
+ PUT_BITS(actbl->ehufco[0xf0], actbl->ehufsi[0xf0]) \
} \
/* Emit Huffman symbol for run length / number of bits */ \
- temp3 = (r << 4) + nbits; \
- code = actbl->ehufco[temp3]; \
- size = actbl->ehufsi[temp3]; \
- EMIT_CODE(code, size) \
- r = 0; \
+ r += nbits; \
+ PUT_CODE(actbl->ehufco[r], actbl->ehufsi[r]) \
+ r = 0; \
} \
}
- /* One iteration for each value in jpeg_natural_order[] */
- kloop(1); kloop(8); kloop(16); kloop(9); kloop(2); kloop(3);
- kloop(10); kloop(17); kloop(24); kloop(32); kloop(25); kloop(18);
- kloop(11); kloop(4); kloop(5); kloop(12); kloop(19); kloop(26);
- kloop(33); kloop(40); kloop(48); kloop(41); kloop(34); kloop(27);
- kloop(20); kloop(13); kloop(6); kloop(7); kloop(14); kloop(21);
- kloop(28); kloop(35); kloop(42); kloop(49); kloop(56); kloop(57);
- kloop(50); kloop(43); kloop(36); kloop(29); kloop(22); kloop(15);
- kloop(23); kloop(30); kloop(37); kloop(44); kloop(51); kloop(58);
- kloop(59); kloop(52); kloop(45); kloop(38); kloop(31); kloop(39);
- kloop(46); kloop(53); kloop(60); kloop(61); kloop(54); kloop(47);
- kloop(55); kloop(62); kloop(63);
-
- /* If the last coef(s) were zero, emit an end-of-block code */
- if (r > 0) {
- code = actbl->ehufco[0];
- size = actbl->ehufsi[0];
- EMIT_BITS(code, size)
+ /* One iteration for each value in jpeg_natural_order[] */
+ kloop(1); kloop(8); kloop(16); kloop(9); kloop(2); kloop(3);
+ kloop(10); kloop(17); kloop(24); kloop(32); kloop(25); kloop(18);
+ kloop(11); kloop(4); kloop(5); kloop(12); kloop(19); kloop(26);
+ kloop(33); kloop(40); kloop(48); kloop(41); kloop(34); kloop(27);
+ kloop(20); kloop(13); kloop(6); kloop(7); kloop(14); kloop(21);
+ kloop(28); kloop(35); kloop(42); kloop(49); kloop(56); kloop(57);
+ kloop(50); kloop(43); kloop(36); kloop(29); kloop(22); kloop(15);
+ kloop(23); kloop(30); kloop(37); kloop(44); kloop(51); kloop(58);
+ kloop(59); kloop(52); kloop(45); kloop(38); kloop(31); kloop(39);
+ kloop(46); kloop(53); kloop(60); kloop(61); kloop(54); kloop(47);
+ kloop(55); kloop(62); kloop(63);
+
+ /* If the last coef(s) were zero, emit an end-of-block code */
+ if (r > 0) {
+ PUT_BITS(actbl->ehufco[0], actbl->ehufsi[0])
+ }
}
- state->cur.put_buffer = put_buffer;
- state->cur.put_bits = put_bits;
+ state->cur.put_buffer.c = put_buffer;
+ state->cur.free_bits = free_bits;
STORE_BUFFER()
return TRUE;
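
Two things changed inside encode_one_block(). First, the branch-free absolute value now doubles as the complement computation: with m = v >> 31, after temp += m; m ^= temp, the variable m holds |v| (whose bit length indexes the Huffman table) while temp, masked to nbits bits, is exactly the extra-bits pattern the spec wants for both signs. Second, the zero-run counter advances by 16 per zero, so r is already the high nibble of the run/size symbol and the emit-0xF0 threshold becomes r >= 256 instead of r > 15. A standalone check of the first trick (two's complement assumed, as the original comment notes):

#include <limits.h>
#include <stdio.h>

int main(void)
{
  int v = -5;                                    /* DC difference, say */
  int mask = v >> (CHAR_BIT * sizeof(int) - 1);  /* -1 if negative, else 0 */
  int temp = v + mask;                           /* v - 1 for negatives */
  int mag  = mask ^ temp;                        /* |v|, branch-free */
  int nbits = 0;
  while ((1 << nbits) <= mag)                    /* stand-in for JPEG_NBITS */
    nbits++;
  printf("mag=%d nbits=%d bits=%d\n", mag, nbits,
         temp & ((1 << nbits) - 1));             /* mag=5 nbits=3 bits=2 */
  return 0;
}

The emitted pattern 010 is the bitwise complement of |-5| = 101 in three bits, matching section F.1.2.1.
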
@@ -616,11 +658,11 @@ encode_one_block (working_state *state, JCOEFPTR block, int last_dc_val,
*/
LOCAL(boolean)
-emit_restart (working_state *state, int restart_num)
+emit_restart(working_state *state, int restart_num)
{
int ci;
- if (! flush_bits(state))
+ if (!flush_bits(state))
return FALSE;
emit_byte(state, 0xFF, return FALSE);
@@ -641,9 +683,9 @@ emit_restart (working_state *state, int restart_num)
*/
METHODDEF(boolean)
-encode_mcu_huff (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+encode_mcu_huff(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
{
- huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+ huff_entropy_ptr entropy = (huff_entropy_ptr)cinfo->entropy;
working_state state;
int blkn, ci;
jpeg_component_info *compptr;
@@ -651,13 +693,14 @@ encode_mcu_huff (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
/* Load up working state */
state.next_output_byte = cinfo->dest->next_output_byte;
state.free_in_buffer = cinfo->dest->free_in_buffer;
- ASSIGN_STATE(state.cur, entropy->saved);
+ state.cur = entropy->saved;
state.cinfo = cinfo;
+ state.simd = entropy->simd;
/* Emit restart marker if needed */
if (cinfo->restart_interval) {
if (entropy->restarts_to_go == 0)
- if (! emit_restart(&state, entropy->next_restart_num))
+ if (!emit_restart(&state, entropy->next_restart_num))
return FALSE;
}
@@ -666,10 +709,10 @@ encode_mcu_huff (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
ci = cinfo->MCU_membership[blkn];
compptr = cinfo->cur_comp_info[ci];
- if (! encode_one_block_simd(&state,
- MCU_data[blkn][0], state.cur.last_dc_val[ci],
- entropy->dc_derived_tbls[compptr->dc_tbl_no],
- entropy->ac_derived_tbls[compptr->ac_tbl_no]))
+ if (!encode_one_block_simd(&state,
+ MCU_data[blkn][0], state.cur.last_dc_val[ci],
+ entropy->dc_derived_tbls[compptr->dc_tbl_no],
+ entropy->ac_derived_tbls[compptr->ac_tbl_no]))
return FALSE;
/* Update last_dc_val */
state.cur.last_dc_val[ci] = MCU_data[blkn][0][0];
@@ -678,10 +721,10 @@ encode_mcu_huff (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
ci = cinfo->MCU_membership[blkn];
compptr = cinfo->cur_comp_info[ci];
- if (! encode_one_block(&state,
- MCU_data[blkn][0], state.cur.last_dc_val[ci],
- entropy->dc_derived_tbls[compptr->dc_tbl_no],
- entropy->ac_derived_tbls[compptr->ac_tbl_no]))
+ if (!encode_one_block(&state,
+ MCU_data[blkn][0], state.cur.last_dc_val[ci],
+ entropy->dc_derived_tbls[compptr->dc_tbl_no],
+ entropy->ac_derived_tbls[compptr->ac_tbl_no]))
return FALSE;
/* Update last_dc_val */
state.cur.last_dc_val[ci] = MCU_data[blkn][0][0];
@@ -691,7 +734,7 @@ encode_mcu_huff (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
/* Completed MCU, so update state */
cinfo->dest->next_output_byte = state.next_output_byte;
cinfo->dest->free_in_buffer = state.free_in_buffer;
- ASSIGN_STATE(entropy->saved, state.cur);
+ entropy->saved = state.cur;
/* Update restart-interval state too */
if (cinfo->restart_interval) {
@@ -712,25 +755,26 @@ encode_mcu_huff (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
*/
METHODDEF(void)
-finish_pass_huff (j_compress_ptr cinfo)
+finish_pass_huff(j_compress_ptr cinfo)
{
- huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+ huff_entropy_ptr entropy = (huff_entropy_ptr)cinfo->entropy;
working_state state;
/* Load up working state ... flush_bits needs it */
state.next_output_byte = cinfo->dest->next_output_byte;
state.free_in_buffer = cinfo->dest->free_in_buffer;
- ASSIGN_STATE(state.cur, entropy->saved);
+ state.cur = entropy->saved;
state.cinfo = cinfo;
+ state.simd = entropy->simd;
/* Flush out the last data */
- if (! flush_bits(&state))
+ if (!flush_bits(&state))
ERREXIT(cinfo, JERR_CANT_SUSPEND);
/* Update state */
cinfo->dest->next_output_byte = state.next_output_byte;
cinfo->dest->free_in_buffer = state.free_in_buffer;
- ASSIGN_STATE(entropy->saved, state.cur);
+ entropy->saved = state.cur;
}
@@ -751,8 +795,8 @@ finish_pass_huff (j_compress_ptr cinfo)
/* Process a single block's worth of coefficients */
LOCAL(void)
-htest_one_block (j_compress_ptr cinfo, JCOEFPTR block, int last_dc_val,
- long dc_counts[], long ac_counts[])
+htest_one_block(j_compress_ptr cinfo, JCOEFPTR block, int last_dc_val,
+ long dc_counts[], long ac_counts[])
{
register int temp;
register int nbits;
@@ -773,7 +817,7 @@ htest_one_block (j_compress_ptr cinfo, JCOEFPTR block, int last_dc_val,
/* Check for out-of-range coefficient values.
* Since we're encoding a difference, the range limit is twice as much.
*/
- if (nbits > MAX_COEF_BITS+1)
+ if (nbits > MAX_COEF_BITS + 1)
ERREXIT(cinfo, JERR_BAD_DCT_COEF);
/* Count the Huffman symbol for the number of bits */
@@ -824,9 +868,9 @@ htest_one_block (j_compress_ptr cinfo, JCOEFPTR block, int last_dc_val,
*/
METHODDEF(boolean)
-encode_mcu_gather (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+encode_mcu_gather(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
{
- huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+ huff_entropy_ptr entropy = (huff_entropy_ptr)cinfo->entropy;
int blkn, ci;
jpeg_component_info *compptr;
@@ -863,13 +907,14 @@ encode_mcu_gather (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
* one bits (so that padding bits added at the end of a compressed segment
* can't look like a valid code). Because of the canonical ordering of
* codewords, this just means that there must be an unused slot in the
- * longest codeword length category. Section K.2 of the JPEG spec suggests
- * reserving such a slot by pretending that symbol 256 is a valid symbol
- * with count 1. In theory that's not optimal; giving it count zero but
- * including it in the symbol set anyway should give a better Huffman code.
- * But the theoretically better code actually seems to come out worse in
- * practice, because it produces more all-ones bytes (which incur stuffed
- * zero bytes in the final file). In any case the difference is tiny.
+ * longest codeword length category. Annex K (Clause K.2) of
+ * Rec. ITU-T T.81 (1992) | ISO/IEC 10918-1:1994 suggests reserving such a slot
+ * by pretending that symbol 256 is a valid symbol with count 1. In theory
+ * that's not optimal; giving it count zero but including it in the symbol set
+ * anyway should give a better Huffman code. But the theoretically better code
+ * actually seems to come out worse in practice, because it produces more
+ * all-ones bytes (which incur stuffed zero bytes in the final file). In any
+ * case the difference is tiny.
*
* The JPEG standard requires Huffman codes to be no more than 16 bits long.
* If some symbols have a very small but nonzero probability, the Huffman tree
@@ -884,10 +929,10 @@ encode_mcu_gather (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
*/
GLOBAL(void)
-jpeg_gen_optimal_table (j_compress_ptr cinfo, JHUFF_TBL *htbl, long freq[])
+jpeg_gen_optimal_table(j_compress_ptr cinfo, JHUFF_TBL *htbl, long freq[])
{
-#define MAX_CLEN 32 /* assumed maximum initial code length */
- UINT8 bits[MAX_CLEN+1]; /* bits[k] = # of symbols with code length k */
+#define MAX_CLEN 32 /* assumed maximum initial code length */
+ UINT8 bits[MAX_CLEN + 1]; /* bits[k] = # of symbols with code length k */
int codesize[257]; /* codesize[k] = code length of symbol k */
int others[257]; /* next symbol in current branch of tree */
int c1, c2;
@@ -896,8 +941,8 @@ jpeg_gen_optimal_table (j_compress_ptr cinfo, JHUFF_TBL *htbl, long freq[])
/* This algorithm is explained in section K.2 of the JPEG standard */
- MEMZERO(bits, sizeof(bits));
- MEMZERO(codesize, sizeof(codesize));
+ memset(bits, 0, sizeof(bits));
+ memset(codesize, 0, sizeof(codesize));
for (i = 0; i < 257; i++)
others[i] = -1; /* init links to empty */
@@ -971,13 +1016,13 @@ jpeg_gen_optimal_table (j_compress_ptr cinfo, JHUFF_TBL *htbl, long freq[])
/* JPEG doesn't allow symbols with code lengths over 16 bits, so if the pure
* Huffman procedure assigned any such lengths, we must adjust the coding.
- * Here is what the JPEG spec says about how this next bit works:
- * Since symbols are paired for the longest Huffman code, the symbols are
- * removed from this length category two at a time. The prefix for the pair
- * (which is one bit shorter) is allocated to one of the pair; then,
- * skipping the BITS entry for that prefix length, a code word from the next
- * shortest nonzero BITS entry is converted into a prefix for two code words
- * one bit longer.
+ * Here is what Rec. ITU-T T.81 | ISO/IEC 10918-1 says about how this next
+ * bit works: Since symbols are paired for the longest Huffman code, the
+ * symbols are removed from this length category two at a time. The prefix
+ * for the pair (which is one bit shorter) is allocated to one of the pair;
+ * then, skipping the BITS entry for that prefix length, a code word from the
+ * next shortest nonzero BITS entry is converted into a prefix for two code
+ * words one bit longer.
*/
for (i = MAX_CLEN; i > 16; i--) {
@@ -987,8 +1032,8 @@ jpeg_gen_optimal_table (j_compress_ptr cinfo, JHUFF_TBL *htbl, long freq[])
j--;
bits[i] -= 2; /* remove two symbols */
- bits[i-1]++; /* one goes in this length */
- bits[j+1] += 2; /* two new symbols in this length */
+ bits[i - 1]++; /* one goes in this length */
+ bits[j + 1] += 2; /* two new symbols in this length */
bits[j]--; /* symbol of this length is now a prefix */
}
}
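
A hypothetical worked example of the adjustment loop above: suppose
bits[17] == 2 and the next shorter nonzero entry is bits[15] (so j == 15).
One pass of the loop then performs:

    bits[17] -= 2;  /* remove the pair of 17-bit symbols */
    bits[16]++;     /* one of the pair takes over the 16-bit prefix */
    bits[16] += 2;  /* a 15-bit code word splits into two 16-bit code words */
    bits[15]--;     /* ...so that 15-bit code word is now a prefix */

The total symbol count is unchanged (-2 + 1 + 2 - 1 == 0); only the code
lengths move, which is exactly the Annex K procedure quoted above.
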
@@ -999,17 +1044,18 @@ jpeg_gen_optimal_table (j_compress_ptr cinfo, JHUFF_TBL *htbl, long freq[])
bits[i]--;
/* Return final symbol counts (only for lengths 0..16) */
- MEMCOPY(htbl->bits, bits, sizeof(htbl->bits));
+ memcpy(htbl->bits, bits, sizeof(htbl->bits));
/* Return a list of the symbols sorted by code length */
/* It's not real clear to me why we don't need to consider the codelength
- * changes made above, but the JPEG spec seems to think this works.
+ * changes made above, but Rec. ITU-T T.81 | ISO/IEC 10918-1 seems to think
+ * this works.
*/
p = 0;
for (i = 1; i <= MAX_CLEN; i++) {
for (j = 0; j <= 255; j++) {
if (codesize[j] == i) {
- htbl->huffval[p] = (UINT8) j;
+ htbl->huffval[p] = (UINT8)j;
p++;
}
}
@@ -1025,9 +1071,9 @@ jpeg_gen_optimal_table (j_compress_ptr cinfo, JHUFF_TBL *htbl, long freq[])
*/
METHODDEF(void)
-finish_pass_gather (j_compress_ptr cinfo)
+finish_pass_gather(j_compress_ptr cinfo)
{
- huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+ huff_entropy_ptr entropy = (huff_entropy_ptr)cinfo->entropy;
int ci, dctbl, actbl;
jpeg_component_info *compptr;
JHUFF_TBL **htblptr;
@@ -1037,24 +1083,24 @@ finish_pass_gather (j_compress_ptr cinfo)
/* It's important not to apply jpeg_gen_optimal_table more than once
* per table, because it clobbers the input frequency counts!
*/
- MEMZERO(did_dc, sizeof(did_dc));
- MEMZERO(did_ac, sizeof(did_ac));
+ memset(did_dc, 0, sizeof(did_dc));
+ memset(did_ac, 0, sizeof(did_ac));
for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
compptr = cinfo->cur_comp_info[ci];
dctbl = compptr->dc_tbl_no;
actbl = compptr->ac_tbl_no;
- if (! did_dc[dctbl]) {
- htblptr = & cinfo->dc_huff_tbl_ptrs[dctbl];
+ if (!did_dc[dctbl]) {
+ htblptr = &cinfo->dc_huff_tbl_ptrs[dctbl];
if (*htblptr == NULL)
- *htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo);
+ *htblptr = jpeg_alloc_huff_table((j_common_ptr)cinfo);
jpeg_gen_optimal_table(cinfo, *htblptr, entropy->dc_count_ptrs[dctbl]);
did_dc[dctbl] = TRUE;
}
- if (! did_ac[actbl]) {
- htblptr = & cinfo->ac_huff_tbl_ptrs[actbl];
+ if (!did_ac[actbl]) {
+ htblptr = &cinfo->ac_huff_tbl_ptrs[actbl];
if (*htblptr == NULL)
- *htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo);
+ *htblptr = jpeg_alloc_huff_table((j_common_ptr)cinfo);
jpeg_gen_optimal_table(cinfo, *htblptr, entropy->ac_count_ptrs[actbl]);
did_ac[actbl] = TRUE;
}
@@ -1070,15 +1116,15 @@ finish_pass_gather (j_compress_ptr cinfo)
*/
GLOBAL(void)
-jinit_huff_encoder (j_compress_ptr cinfo)
+jinit_huff_encoder(j_compress_ptr cinfo)
{
huff_entropy_ptr entropy;
int i;
entropy = (huff_entropy_ptr)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
sizeof(huff_entropy_encoder));
- cinfo->entropy = (struct jpeg_entropy_encoder *) entropy;
+ cinfo->entropy = (struct jpeg_entropy_encoder *)entropy;
entropy->pub.start_pass = start_pass_huff;
/* Mark tables unallocated */
diff --git a/media/libjpeg/jchuff.h b/media/libjpeg/jchuff.h
index 4236089adc..314a2325c9 100644
--- a/media/libjpeg/jchuff.h
+++ b/media/libjpeg/jchuff.h
@@ -20,9 +20,9 @@
*/
#if BITS_IN_JSAMPLE == 8
-#define MAX_COEF_BITS 10
+#define MAX_COEF_BITS 10
#else
-#define MAX_COEF_BITS 14
+#define MAX_COEF_BITS 14
#endif
/* Derived data constructed for each Huffman table */
@@ -34,10 +34,9 @@ typedef struct {
} c_derived_tbl;
/* Expand a Huffman table definition into the derived format */
-EXTERN(void) jpeg_make_c_derived_tbl
- (j_compress_ptr cinfo, boolean isDC, int tblno,
- c_derived_tbl ** pdtbl);
+EXTERN(void) jpeg_make_c_derived_tbl(j_compress_ptr cinfo, boolean isDC,
+ int tblno, c_derived_tbl **pdtbl);
/* Generate an optimal table definition given the specified counts */
-EXTERN(void) jpeg_gen_optimal_table
- (j_compress_ptr cinfo, JHUFF_TBL *htbl, long freq[]);
+EXTERN(void) jpeg_gen_optimal_table(j_compress_ptr cinfo, JHUFF_TBL *htbl,
+ long freq[]);
diff --git a/media/libjpeg/jcicc.c b/media/libjpeg/jcicc.c
new file mode 100644
index 0000000000..11037ff694
--- /dev/null
+++ b/media/libjpeg/jcicc.c
@@ -0,0 +1,105 @@
+/*
+ * jcicc.c
+ *
+ * Copyright (C) 1997-1998, Thomas G. Lane, Todd Newman.
+ * Copyright (C) 2017, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file provides code to write International Color Consortium (ICC) device
+ * profiles embedded in JFIF JPEG image files. The ICC has defined a standard
+ * for including such data in JPEG "APP2" markers. The code given here does
+ * not know anything about the internal structure of the ICC profile data; it
+ * just knows how to embed the profile data in a JPEG file while writing it.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+#include "jerror.h"
+
+
+/*
+ * Since an ICC profile can be larger than the maximum size of a JPEG marker
+ * (64K), we need provisions to split it into multiple markers. The format
+ * defined by the ICC specifies one or more APP2 markers containing the
+ * following data:
+ *      Identifying string      ASCII "ICC_PROFILE\0" (12 bytes)
+ *      Marker sequence number  1 for first APP2, 2 for next, etc (1 byte)
+ *      Number of markers       Total number of APP2's used (1 byte)
+ *      Profile data            (remainder of APP2 data)
+ * Decoders should use the marker sequence numbers to reassemble the profile,
+ * rather than assuming that the APP2 markers appear in the correct sequence.
+ */
+
+#define ICC_MARKER (JPEG_APP0 + 2) /* JPEG marker code for ICC */
+#define ICC_OVERHEAD_LEN 14 /* size of non-profile data in APP2 */
+#define MAX_BYTES_IN_MARKER 65533 /* maximum data len of a JPEG marker */
+#define MAX_DATA_BYTES_IN_MARKER (MAX_BYTES_IN_MARKER - ICC_OVERHEAD_LEN)
+
+
+/*
+ * This routine writes the given ICC profile data into a JPEG file. It *must*
+ * be called AFTER calling jpeg_start_compress() and BEFORE the first call to
+ * jpeg_write_scanlines(). (This ordering ensures that the APP2 marker(s) will
+ * appear after the SOI and JFIF or Adobe markers, but before all else.)
+ */
+
+GLOBAL(void)
+jpeg_write_icc_profile(j_compress_ptr cinfo, const JOCTET *icc_data_ptr,
+ unsigned int icc_data_len)
+{
+ unsigned int num_markers; /* total number of markers we'll write */
+ int cur_marker = 1; /* per spec, counting starts at 1 */
+ unsigned int length; /* number of bytes to write in this marker */
+
+ if (icc_data_ptr == NULL || icc_data_len == 0)
+ ERREXIT(cinfo, JERR_BUFFER_SIZE);
+ if (cinfo->global_state < CSTATE_SCANNING)
+ ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
+
+ /* Calculate the number of markers we'll need, rounding up of course */
+ num_markers = icc_data_len / MAX_DATA_BYTES_IN_MARKER;
+ if (num_markers * MAX_DATA_BYTES_IN_MARKER != icc_data_len)
+ num_markers++;
+
+ while (icc_data_len > 0) {
+ /* length of profile to put in this marker */
+ length = icc_data_len;
+ if (length > MAX_DATA_BYTES_IN_MARKER)
+ length = MAX_DATA_BYTES_IN_MARKER;
+ icc_data_len -= length;
+
+ /* Write the JPEG marker header (APP2 code and marker length) */
+ jpeg_write_m_header(cinfo, ICC_MARKER,
+ (unsigned int)(length + ICC_OVERHEAD_LEN));
+
+ /* Write the marker identifying string "ICC_PROFILE" (null-terminated). We
+ * code it in this less-than-transparent way so that the code works even if
+ * the local character set is not ASCII.
+ */
+ jpeg_write_m_byte(cinfo, 0x49);
+ jpeg_write_m_byte(cinfo, 0x43);
+ jpeg_write_m_byte(cinfo, 0x43);
+ jpeg_write_m_byte(cinfo, 0x5F);
+ jpeg_write_m_byte(cinfo, 0x50);
+ jpeg_write_m_byte(cinfo, 0x52);
+ jpeg_write_m_byte(cinfo, 0x4F);
+ jpeg_write_m_byte(cinfo, 0x46);
+ jpeg_write_m_byte(cinfo, 0x49);
+ jpeg_write_m_byte(cinfo, 0x4C);
+ jpeg_write_m_byte(cinfo, 0x45);
+ jpeg_write_m_byte(cinfo, 0x0);
+
+ /* Add the sequencing info */
+ jpeg_write_m_byte(cinfo, cur_marker);
+ jpeg_write_m_byte(cinfo, (int)num_markers);
+
+ /* Add the profile data */
+ while (length--) {
+ jpeg_write_m_byte(cinfo, *icc_data_ptr);
+ icc_data_ptr++;
+ }
+ cur_marker++;
+ }
+}
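
A minimal caller-side sketch of the new entry point (icc_buf and icc_len are
hypothetical names for a profile obtained elsewhere). Per the comment above,
the call must land between jpeg_start_compress() and the first
jpeg_write_scanlines(); a 100,000-byte profile would be split into two APP2
markers carrying 65519 and 34481 profile bytes (65533 minus 14 bytes of
overhead each):

    jpeg_start_compress(&cinfo, TRUE);
    jpeg_write_icc_profile(&cinfo, icc_buf, icc_len);
    /* ... per-scanline jpeg_write_scanlines() loop, then
       jpeg_finish_compress() ... */
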
diff --git a/media/libjpeg/jcinit.c b/media/libjpeg/jcinit.c
index 463bd8c6dd..157353a22e 100644
--- a/media/libjpeg/jcinit.c
+++ b/media/libjpeg/jcinit.c
@@ -1,8 +1,10 @@
/*
* jcinit.c
*
+ * This file was part of the Independent JPEG Group's software:
* Copyright (C) 1991-1997, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2020, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@@ -19,6 +21,7 @@
#define JPEG_INTERNALS
#include "jinclude.h"
#include "jpeglib.h"
+#include "jpegcomp.h"
/*
@@ -28,13 +31,13 @@
*/
GLOBAL(void)
-jinit_compress_master (j_compress_ptr cinfo)
+jinit_compress_master(j_compress_ptr cinfo)
{
/* Initialize master control (includes parameter checking/processing) */
jinit_c_master_control(cinfo, FALSE /* full compression */);
/* Preprocessing */
- if (! cinfo->raw_data_in) {
+ if (!cinfo->raw_data_in) {
jinit_color_converter(cinfo);
jinit_downsampler(cinfo);
jinit_c_prep_controller(cinfo, FALSE /* never need full buffer here */);
@@ -60,14 +63,14 @@ jinit_compress_master (j_compress_ptr cinfo)
}
/* Need a full-image coefficient buffer in any multi-pass mode. */
- jinit_c_coef_controller(cinfo,
- (boolean) (cinfo->num_scans > 1 || cinfo->optimize_coding));
+ jinit_c_coef_controller(cinfo, (boolean)(cinfo->num_scans > 1 ||
+ cinfo->optimize_coding));
jinit_c_main_controller(cinfo, FALSE /* never need full buffer here */);
jinit_marker_writer(cinfo);
/* We can now tell the memory manager to allocate virtual arrays. */
- (*cinfo->mem->realize_virt_arrays) ((j_common_ptr) cinfo);
+ (*cinfo->mem->realize_virt_arrays) ((j_common_ptr)cinfo);
/* Write the datastream header (SOI) immediately.
* Frame and scan headers are postponed till later.
diff --git a/media/libjpeg/jcmainct.c b/media/libjpeg/jcmainct.c
index d01f46364b..3f23028c46 100644
--- a/media/libjpeg/jcmainct.c
+++ b/media/libjpeg/jcmainct.c
@@ -39,9 +39,10 @@ typedef my_main_controller *my_main_ptr;
/* Forward declarations */
-METHODDEF(void) process_data_simple_main
- (j_compress_ptr cinfo, JSAMPARRAY input_buf, JDIMENSION *in_row_ctr,
- JDIMENSION in_rows_avail);
+METHODDEF(void) process_data_simple_main(j_compress_ptr cinfo,
+ JSAMPARRAY input_buf,
+ JDIMENSION *in_row_ctr,
+ JDIMENSION in_rows_avail);
/*
@@ -49,9 +50,9 @@ METHODDEF(void) process_data_simple_main
*/
METHODDEF(void)
-start_pass_main (j_compress_ptr cinfo, J_BUF_MODE pass_mode)
+start_pass_main(j_compress_ptr cinfo, J_BUF_MODE pass_mode)
{
- my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
+ my_main_ptr main_ptr = (my_main_ptr)cinfo->main;
/* Do nothing in raw-data mode. */
if (cinfo->raw_data_in)
@@ -75,19 +76,18 @@ start_pass_main (j_compress_ptr cinfo, J_BUF_MODE pass_mode)
*/
METHODDEF(void)
-process_data_simple_main (j_compress_ptr cinfo,
- JSAMPARRAY input_buf, JDIMENSION *in_row_ctr,
- JDIMENSION in_rows_avail)
+process_data_simple_main(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JDIMENSION *in_row_ctr, JDIMENSION in_rows_avail)
{
- my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
+ my_main_ptr main_ptr = (my_main_ptr)cinfo->main;
while (main_ptr->cur_iMCU_row < cinfo->total_iMCU_rows) {
/* Read input data if we haven't filled the main buffer yet */
if (main_ptr->rowgroup_ctr < DCTSIZE)
- (*cinfo->prep->pre_process_data) (cinfo,
- input_buf, in_row_ctr, in_rows_avail,
- main_ptr->buffer, &main_ptr->rowgroup_ctr,
- (JDIMENSION) DCTSIZE);
+ (*cinfo->prep->pre_process_data) (cinfo, input_buf, in_row_ctr,
+ in_rows_avail, main_ptr->buffer,
+ &main_ptr->rowgroup_ctr,
+ (JDIMENSION)DCTSIZE);
/* If we don't have a full iMCU row buffered, return to application for
* more data. Note that preprocessor will always pad to fill the iMCU row
@@ -97,14 +97,14 @@ process_data_simple_main (j_compress_ptr cinfo,
return;
/* Send the completed row to the compressor */
- if (! (*cinfo->coef->compress_data) (cinfo, main_ptr->buffer)) {
+ if (!(*cinfo->coef->compress_data) (cinfo, main_ptr->buffer)) {
/* If compressor did not consume the whole row, then we must need to
* suspend processing and return to the application. In this situation
* we pretend we didn't yet consume the last input row; otherwise, if
* it happened to be the last row of the image, the application would
* think we were done.
*/
- if (! main_ptr->suspended) {
+ if (!main_ptr->suspended) {
(*in_row_ctr)--;
main_ptr->suspended = TRUE;
}
@@ -128,16 +128,16 @@ process_data_simple_main (j_compress_ptr cinfo,
*/
GLOBAL(void)
-jinit_c_main_controller (j_compress_ptr cinfo, boolean need_full_buffer)
+jinit_c_main_controller(j_compress_ptr cinfo, boolean need_full_buffer)
{
my_main_ptr main_ptr;
int ci;
jpeg_component_info *compptr;
main_ptr = (my_main_ptr)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
sizeof(my_main_controller));
- cinfo->main = (struct jpeg_c_main_controller *) main_ptr;
+ cinfo->main = (struct jpeg_c_main_controller *)main_ptr;
main_ptr->pub.start_pass = start_pass_main;
/* We don't need to create a buffer in raw-data mode. */
@@ -154,9 +154,9 @@ jinit_c_main_controller (j_compress_ptr cinfo, boolean need_full_buffer)
for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
ci++, compptr++) {
main_ptr->buffer[ci] = (*cinfo->mem->alloc_sarray)
- ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ ((j_common_ptr)cinfo, JPOOL_IMAGE,
compptr->width_in_blocks * DCTSIZE,
- (JDIMENSION) (compptr->v_samp_factor * DCTSIZE));
+ (JDIMENSION)(compptr->v_samp_factor * DCTSIZE));
}
}
}
diff --git a/media/libjpeg/jcmarker.c b/media/libjpeg/jcmarker.c
index 463f665927..801fbab4ef 100644
--- a/media/libjpeg/jcmarker.c
+++ b/media/libjpeg/jcmarker.c
@@ -110,30 +110,30 @@ typedef my_marker_writer *my_marker_ptr;
*/
LOCAL(void)
-emit_byte (j_compress_ptr cinfo, int val)
+emit_byte(j_compress_ptr cinfo, int val)
/* Emit a byte */
{
struct jpeg_destination_mgr *dest = cinfo->dest;
- *(dest->next_output_byte)++ = (JOCTET) val;
+ *(dest->next_output_byte)++ = (JOCTET)val;
if (--dest->free_in_buffer == 0) {
- if (! (*dest->empty_output_buffer) (cinfo))
+ if (!(*dest->empty_output_buffer) (cinfo))
ERREXIT(cinfo, JERR_CANT_SUSPEND);
}
}
LOCAL(void)
-emit_marker (j_compress_ptr cinfo, JPEG_MARKER mark)
+emit_marker(j_compress_ptr cinfo, JPEG_MARKER mark)
/* Emit a marker code */
{
emit_byte(cinfo, 0xFF);
- emit_byte(cinfo, (int) mark);
+ emit_byte(cinfo, (int)mark);
}
LOCAL(void)
-emit_2bytes (j_compress_ptr cinfo, int value)
+emit_2bytes(j_compress_ptr cinfo, int value)
/* Emit a 2-byte integer; these are always MSB first in JPEG files */
{
emit_byte(cinfo, (value >> 8) & 0xFF);
@@ -146,7 +146,7 @@ emit_2bytes (j_compress_ptr cinfo, int value)
*/
LOCAL(int)
-emit_dqt (j_compress_ptr cinfo, int index)
+emit_dqt(j_compress_ptr cinfo, int index)
/* Emit a DQT marker */
/* Returns the precision used (0 = 8bits, 1 = 16bits) for baseline checking */
{
@@ -163,19 +163,19 @@ emit_dqt (j_compress_ptr cinfo, int index)
prec = 1;
}
- if (! qtbl->sent_table) {
+ if (!qtbl->sent_table) {
emit_marker(cinfo, M_DQT);
- emit_2bytes(cinfo, prec ? DCTSIZE2*2 + 1 + 2 : DCTSIZE2 + 1 + 2);
+ emit_2bytes(cinfo, prec ? DCTSIZE2 * 2 + 1 + 2 : DCTSIZE2 + 1 + 2);
- emit_byte(cinfo, index + (prec<<4));
+ emit_byte(cinfo, index + (prec << 4));
for (i = 0; i < DCTSIZE2; i++) {
/* The table entries must be emitted in zigzag order. */
unsigned int qval = qtbl->quantval[jpeg_natural_order[i]];
if (prec)
- emit_byte(cinfo, (int) (qval >> 8));
- emit_byte(cinfo, (int) (qval & 0xFF));
+ emit_byte(cinfo, (int)(qval >> 8));
+ emit_byte(cinfo, (int)(qval & 0xFF));
}
qtbl->sent_table = TRUE;
@@ -186,7 +186,7 @@ emit_dqt (j_compress_ptr cinfo, int index)
LOCAL(void)
-emit_dht (j_compress_ptr cinfo, int index, boolean is_ac)
+emit_dht(j_compress_ptr cinfo, int index, boolean is_ac)
/* Emit a DHT marker */
{
JHUFF_TBL *htbl;
@@ -202,7 +202,7 @@ emit_dht (j_compress_ptr cinfo, int index, boolean is_ac)
if (htbl == NULL)
ERREXIT1(cinfo, JERR_NO_HUFF_TABLE, index);
- if (! htbl->sent_table) {
+ if (!htbl->sent_table) {
emit_marker(cinfo, M_DHT);
length = 0;
@@ -224,7 +224,7 @@ emit_dht (j_compress_ptr cinfo, int index, boolean is_ac)
LOCAL(void)
-emit_dac (j_compress_ptr cinfo)
+emit_dac(j_compress_ptr cinfo)
/* Emit a DAC marker */
/* Since the useful info is so small, we want to emit all the tables in */
/* one DAC marker. Therefore this routine does its own scan of the table. */
@@ -255,12 +255,12 @@ emit_dac (j_compress_ptr cinfo)
if (length) {
emit_marker(cinfo, M_DAC);
- emit_2bytes(cinfo, length*2 + 2);
+ emit_2bytes(cinfo, length * 2 + 2);
for (i = 0; i < NUM_ARITH_TBLS; i++) {
if (dc_in_use[i]) {
emit_byte(cinfo, i);
- emit_byte(cinfo, cinfo->arith_dc_L[i] + (cinfo->arith_dc_U[i]<<4));
+ emit_byte(cinfo, cinfo->arith_dc_L[i] + (cinfo->arith_dc_U[i] << 4));
}
if (ac_in_use[i]) {
emit_byte(cinfo, i + 0x10);
@@ -273,19 +273,19 @@ emit_dac (j_compress_ptr cinfo)
LOCAL(void)
-emit_dri (j_compress_ptr cinfo)
+emit_dri(j_compress_ptr cinfo)
/* Emit a DRI marker */
{
emit_marker(cinfo, M_DRI);
emit_2bytes(cinfo, 4); /* fixed length */
- emit_2bytes(cinfo, (int) cinfo->restart_interval);
+ emit_2bytes(cinfo, (int)cinfo->restart_interval);
}
LOCAL(void)
-emit_sof (j_compress_ptr cinfo, JPEG_MARKER code)
+emit_sof(j_compress_ptr cinfo, JPEG_MARKER code)
/* Emit a SOF marker */
{
int ci;
@@ -296,13 +296,12 @@ emit_sof (j_compress_ptr cinfo, JPEG_MARKER code)
emit_2bytes(cinfo, 3 * cinfo->num_components + 2 + 5 + 1); /* length */
/* Make sure image isn't bigger than SOF field can handle */
- if ((long) cinfo->_jpeg_height > 65535L ||
- (long) cinfo->_jpeg_width > 65535L)
- ERREXIT1(cinfo, JERR_IMAGE_TOO_BIG, (unsigned int) 65535);
+ if ((long)cinfo->_jpeg_height > 65535L || (long)cinfo->_jpeg_width > 65535L)
+ ERREXIT1(cinfo, JERR_IMAGE_TOO_BIG, (unsigned int)65535);
emit_byte(cinfo, cinfo->data_precision);
- emit_2bytes(cinfo, (int) cinfo->_jpeg_height);
- emit_2bytes(cinfo, (int) cinfo->_jpeg_width);
+ emit_2bytes(cinfo, (int)cinfo->_jpeg_height);
+ emit_2bytes(cinfo, (int)cinfo->_jpeg_width);
emit_byte(cinfo, cinfo->num_components);
@@ -316,7 +315,7 @@ emit_sof (j_compress_ptr cinfo, JPEG_MARKER code)
LOCAL(void)
-emit_sos (j_compress_ptr cinfo)
+emit_sos(j_compress_ptr cinfo)
/* Emit a SOS marker */
{
int i, td, ta;
@@ -351,7 +350,7 @@ emit_sos (j_compress_ptr cinfo)
LOCAL(void)
-emit_jfif_app0 (j_compress_ptr cinfo)
+emit_jfif_app0(j_compress_ptr cinfo)
/* Emit a JFIF-compliant APP0 marker */
{
/*
@@ -378,15 +377,15 @@ emit_jfif_app0 (j_compress_ptr cinfo)
emit_byte(cinfo, cinfo->JFIF_major_version); /* Version fields */
emit_byte(cinfo, cinfo->JFIF_minor_version);
emit_byte(cinfo, cinfo->density_unit); /* Pixel size information */
- emit_2bytes(cinfo, (int) cinfo->X_density);
- emit_2bytes(cinfo, (int) cinfo->Y_density);
+ emit_2bytes(cinfo, (int)cinfo->X_density);
+ emit_2bytes(cinfo, (int)cinfo->Y_density);
emit_byte(cinfo, 0); /* No thumbnail image */
emit_byte(cinfo, 0);
}
LOCAL(void)
-emit_adobe_app14 (j_compress_ptr cinfo)
+emit_adobe_app14(j_compress_ptr cinfo)
/* Emit an Adobe APP14 marker */
{
/*
@@ -440,19 +439,19 @@ emit_adobe_app14 (j_compress_ptr cinfo)
*/
METHODDEF(void)
-write_marker_header (j_compress_ptr cinfo, int marker, unsigned int datalen)
+write_marker_header(j_compress_ptr cinfo, int marker, unsigned int datalen)
/* Emit an arbitrary marker header */
{
- if (datalen > (unsigned int) 65533) /* safety check */
+ if (datalen > (unsigned int)65533) /* safety check */
ERREXIT(cinfo, JERR_BAD_LENGTH);
- emit_marker(cinfo, (JPEG_MARKER) marker);
+ emit_marker(cinfo, (JPEG_MARKER)marker);
- emit_2bytes(cinfo, (int) (datalen + 2)); /* total length */
+ emit_2bytes(cinfo, (int)(datalen + 2)); /* total length */
}
METHODDEF(void)
-write_marker_byte (j_compress_ptr cinfo, int val)
+write_marker_byte(j_compress_ptr cinfo, int val)
/* Emit one byte of marker parameters following write_marker_header */
{
emit_byte(cinfo, val);
@@ -471,9 +470,9 @@ write_marker_byte (j_compress_ptr cinfo, int val)
*/
METHODDEF(void)
-write_file_header (j_compress_ptr cinfo)
+write_file_header(j_compress_ptr cinfo)
{
- my_marker_ptr marker = (my_marker_ptr) cinfo->marker;
+ my_marker_ptr marker = (my_marker_ptr)cinfo->marker;
emit_marker(cinfo, M_SOI); /* first the SOI */
@@ -496,7 +495,7 @@ write_file_header (j_compress_ptr cinfo)
*/
METHODDEF(void)
-write_frame_header (j_compress_ptr cinfo)
+write_frame_header(j_compress_ptr cinfo)
{
int ci, prec;
boolean is_baseline;
@@ -556,9 +555,9 @@ write_frame_header (j_compress_ptr cinfo)
*/
METHODDEF(void)
-write_scan_header (j_compress_ptr cinfo)
+write_scan_header(j_compress_ptr cinfo)
{
- my_marker_ptr marker = (my_marker_ptr) cinfo->marker;
+ my_marker_ptr marker = (my_marker_ptr)cinfo->marker;
int i;
jpeg_component_info *compptr;
@@ -600,7 +599,7 @@ write_scan_header (j_compress_ptr cinfo)
*/
METHODDEF(void)
-write_file_trailer (j_compress_ptr cinfo)
+write_file_trailer(j_compress_ptr cinfo)
{
emit_marker(cinfo, M_EOI);
}
@@ -614,7 +613,7 @@ write_file_trailer (j_compress_ptr cinfo)
*/
METHODDEF(void)
-write_tables_only (j_compress_ptr cinfo)
+write_tables_only(j_compress_ptr cinfo)
{
int i;
@@ -622,10 +621,10 @@ write_tables_only (j_compress_ptr cinfo)
for (i = 0; i < NUM_QUANT_TBLS; i++) {
if (cinfo->quant_tbl_ptrs[i] != NULL)
- (void) emit_dqt(cinfo, i);
+ (void)emit_dqt(cinfo, i);
}
- if (! cinfo->arith_code) {
+ if (!cinfo->arith_code) {
for (i = 0; i < NUM_HUFF_TBLS; i++) {
if (cinfo->dc_huff_tbl_ptrs[i] != NULL)
emit_dht(cinfo, i, FALSE);
@@ -643,15 +642,15 @@ write_tables_only (j_compress_ptr cinfo)
*/
GLOBAL(void)
-jinit_marker_writer (j_compress_ptr cinfo)
+jinit_marker_writer(j_compress_ptr cinfo)
{
my_marker_ptr marker;
/* Create the subobject */
marker = (my_marker_ptr)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
sizeof(my_marker_writer));
- cinfo->marker = (struct jpeg_marker_writer *) marker;
+ cinfo->marker = (struct jpeg_marker_writer *)marker;
/* Initialize method pointers */
marker->pub.write_file_header = write_file_header;
marker->pub.write_frame_header = write_frame_header;
diff --git a/media/libjpeg/jcmaster.c b/media/libjpeg/jcmaster.c
index 03a8b40ea9..c2b2600031 100644
--- a/media/libjpeg/jcmaster.c
+++ b/media/libjpeg/jcmaster.c
@@ -5,7 +5,7 @@
* Copyright (C) 1991-1997, Thomas G. Lane.
* Modified 2003-2010 by Guido Vollbeding.
* libjpeg-turbo Modifications:
- * Copyright (C) 2010, 2016, D. R. Commander.
+ * Copyright (C) 2010, 2016, 2018, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@@ -25,9 +25,9 @@
/* Private state */
typedef enum {
- main_pass, /* input data, also do first output step */
- huff_opt_pass, /* Huffman code optimization pass */
- output_pass /* data output pass */
+ main_pass, /* input data, also do first output step */
+ huff_opt_pass, /* Huffman code optimization pass */
+ output_pass /* data output pass */
} c_pass_type;
typedef struct {
@@ -66,7 +66,7 @@ typedef my_comp_master *my_master_ptr;
*/
GLOBAL(void)
-jpeg_calc_jpeg_dimensions (j_compress_ptr cinfo)
+jpeg_calc_jpeg_dimensions(j_compress_ptr cinfo)
/* Do computations that are needed before master selection phase */
{
/* Hardwire it to "no scaling" */
@@ -79,7 +79,7 @@ jpeg_calc_jpeg_dimensions (j_compress_ptr cinfo)
LOCAL(void)
-initial_setup (j_compress_ptr cinfo, boolean transcode_only)
+initial_setup(j_compress_ptr cinfo, boolean transcode_only)
/* Do computations that are needed before master selection phase */
{
int ci;
@@ -95,19 +95,19 @@ initial_setup (j_compress_ptr cinfo, boolean transcode_only)
#endif
/* Sanity check on image dimensions */
- if (cinfo->_jpeg_height <= 0 || cinfo->_jpeg_width <= 0
- || cinfo->num_components <= 0 || cinfo->input_components <= 0)
+ if (cinfo->_jpeg_height <= 0 || cinfo->_jpeg_width <= 0 ||
+ cinfo->num_components <= 0 || cinfo->input_components <= 0)
ERREXIT(cinfo, JERR_EMPTY_IMAGE);
/* Make sure image isn't bigger than I can handle */
- if ((long) cinfo->_jpeg_height > (long) JPEG_MAX_DIMENSION ||
- (long) cinfo->_jpeg_width > (long) JPEG_MAX_DIMENSION)
- ERREXIT1(cinfo, JERR_IMAGE_TOO_BIG, (unsigned int) JPEG_MAX_DIMENSION);
+ if ((long)cinfo->_jpeg_height > (long)JPEG_MAX_DIMENSION ||
+ (long)cinfo->_jpeg_width > (long)JPEG_MAX_DIMENSION)
+ ERREXIT1(cinfo, JERR_IMAGE_TOO_BIG, (unsigned int)JPEG_MAX_DIMENSION);
/* Width of an input scanline must be representable as JDIMENSION. */
- samplesperrow = (long) cinfo->image_width * (long) cinfo->input_components;
- jd_samplesperrow = (JDIMENSION) samplesperrow;
- if ((long) jd_samplesperrow != samplesperrow)
+ samplesperrow = (long)cinfo->image_width * (long)cinfo->input_components;
+ jd_samplesperrow = (JDIMENSION)samplesperrow;
+ if ((long)jd_samplesperrow != samplesperrow)
ERREXIT(cinfo, JERR_WIDTH_OVERFLOW);
/* For now, precision must match compiled-in value... */
@@ -124,8 +124,10 @@ initial_setup (j_compress_ptr cinfo, boolean transcode_only)
cinfo->max_v_samp_factor = 1;
for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
ci++, compptr++) {
- if (compptr->h_samp_factor<=0 || compptr->h_samp_factor>MAX_SAMP_FACTOR ||
- compptr->v_samp_factor<=0 || compptr->v_samp_factor>MAX_SAMP_FACTOR)
+ if (compptr->h_samp_factor <= 0 ||
+ compptr->h_samp_factor > MAX_SAMP_FACTOR ||
+ compptr->v_samp_factor <= 0 ||
+ compptr->v_samp_factor > MAX_SAMP_FACTOR)
ERREXIT(cinfo, JERR_BAD_SAMPLING);
cinfo->max_h_samp_factor = MAX(cinfo->max_h_samp_factor,
compptr->h_samp_factor);
@@ -146,18 +148,18 @@ initial_setup (j_compress_ptr cinfo, boolean transcode_only)
#endif
/* Size in DCT blocks */
compptr->width_in_blocks = (JDIMENSION)
- jdiv_round_up((long) cinfo->_jpeg_width * (long) compptr->h_samp_factor,
- (long) (cinfo->max_h_samp_factor * DCTSIZE));
+ jdiv_round_up((long)cinfo->_jpeg_width * (long)compptr->h_samp_factor,
+ (long)(cinfo->max_h_samp_factor * DCTSIZE));
compptr->height_in_blocks = (JDIMENSION)
- jdiv_round_up((long) cinfo->_jpeg_height * (long) compptr->v_samp_factor,
- (long) (cinfo->max_v_samp_factor * DCTSIZE));
+ jdiv_round_up((long)cinfo->_jpeg_height * (long)compptr->v_samp_factor,
+ (long)(cinfo->max_v_samp_factor * DCTSIZE));
/* Size in samples */
compptr->downsampled_width = (JDIMENSION)
- jdiv_round_up((long) cinfo->_jpeg_width * (long) compptr->h_samp_factor,
- (long) cinfo->max_h_samp_factor);
+ jdiv_round_up((long)cinfo->_jpeg_width * (long)compptr->h_samp_factor,
+ (long)cinfo->max_h_samp_factor);
compptr->downsampled_height = (JDIMENSION)
- jdiv_round_up((long) cinfo->_jpeg_height * (long) compptr->v_samp_factor,
- (long) cinfo->max_v_samp_factor);
+ jdiv_round_up((long)cinfo->_jpeg_height * (long)compptr->v_samp_factor,
+ (long)cinfo->max_v_samp_factor);
/* Mark component needed (this flag isn't actually used for compression) */
compptr->component_needed = TRUE;
}
@@ -166,15 +168,15 @@ initial_setup (j_compress_ptr cinfo, boolean transcode_only)
* main controller will call coefficient controller).
*/
cinfo->total_iMCU_rows = (JDIMENSION)
- jdiv_round_up((long) cinfo->_jpeg_height,
- (long) (cinfo->max_v_samp_factor*DCTSIZE));
+ jdiv_round_up((long)cinfo->_jpeg_height,
+ (long)(cinfo->max_v_samp_factor * DCTSIZE));
}
#ifdef C_MULTISCAN_FILES_SUPPORTED
LOCAL(void)
-validate_script (j_compress_ptr cinfo)
+validate_script(j_compress_ptr cinfo)
/* Verify that the scan script in cinfo->scan_info[] is valid; also
* determine whether it uses progressive JPEG, and set cinfo->progressive_mode.
*/
@@ -196,10 +198,10 @@ validate_script (j_compress_ptr cinfo)
* for progressive JPEG, no scan can have this.
*/
scanptr = cinfo->scan_info;
- if (scanptr->Ss != 0 || scanptr->Se != DCTSIZE2-1) {
+ if (scanptr->Ss != 0 || scanptr->Se != DCTSIZE2 - 1) {
#ifdef C_PROGRESSIVE_SUPPORTED
cinfo->progressive_mode = TRUE;
- last_bitpos_ptr = & last_bitpos[0][0];
+ last_bitpos_ptr = &last_bitpos[0][0];
for (ci = 0; ci < cinfo->num_components; ci++)
for (coefi = 0; coefi < DCTSIZE2; coefi++)
*last_bitpos_ptr++ = -1;
@@ -222,7 +224,7 @@ validate_script (j_compress_ptr cinfo)
if (thisi < 0 || thisi >= cinfo->num_components)
ERREXIT1(cinfo, JERR_BAD_SCAN_SCRIPT, scanno);
/* Components must appear in SOF order within each scan */
- if (ci > 0 && thisi <= scanptr->component_index[ci-1])
+ if (ci > 0 && thisi <= scanptr->component_index[ci - 1])
ERREXIT1(cinfo, JERR_BAD_SCAN_SCRIPT, scanno);
}
/* Validate progression parameters */
@@ -232,17 +234,17 @@ validate_script (j_compress_ptr cinfo)
Al = scanptr->Al;
if (cinfo->progressive_mode) {
#ifdef C_PROGRESSIVE_SUPPORTED
- /* The JPEG spec simply gives the ranges 0..13 for Ah and Al, but that
- * seems wrong: the upper bound ought to depend on data precision.
- * Perhaps they really meant 0..N+1 for N-bit precision.
+ /* Rec. ITU-T T.81 | ISO/IEC 10918-1 simply gives the ranges 0..13 for Ah
+ * and Al, but that seems wrong: the upper bound ought to depend on data
+ * precision. Perhaps they really meant 0..N+1 for N-bit precision.
* Here we allow 0..10 for 8-bit data; Al larger than 10 results in
* out-of-range reconstructed DC values during the first DC scan,
* which might cause problems for some decoders.
*/
#if BITS_IN_JSAMPLE == 8
-#define MAX_AH_AL 10
+#define MAX_AH_AL 10
#else
-#define MAX_AH_AL 13
+#define MAX_AH_AL 13
#endif
if (Ss < 0 || Ss >= DCTSIZE2 || Se < Ss || Se >= DCTSIZE2 ||
Ah < 0 || Ah > MAX_AH_AL || Al < 0 || Al > MAX_AH_AL)
@@ -255,7 +257,7 @@ validate_script (j_compress_ptr cinfo)
ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno);
}
for (ci = 0; ci < ncomps; ci++) {
- last_bitpos_ptr = & last_bitpos[scanptr->component_index[ci]][0];
+ last_bitpos_ptr = &last_bitpos[scanptr->component_index[ci]][0];
if (Ss != 0 && last_bitpos_ptr[0] < 0) /* AC without prior DC scan */
ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno);
for (coefi = Ss; coefi <= Se; coefi++) {
@@ -265,7 +267,7 @@ validate_script (j_compress_ptr cinfo)
ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno);
} else {
/* not first scan */
- if (Ah != last_bitpos_ptr[coefi] || Al != Ah-1)
+ if (Ah != last_bitpos_ptr[coefi] || Al != Ah - 1)
ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno);
}
last_bitpos_ptr[coefi] = Al;
@@ -274,7 +276,7 @@ validate_script (j_compress_ptr cinfo)
#endif
} else {
/* For sequential JPEG, all progression parameters must be these: */
- if (Ss != 0 || Se != DCTSIZE2-1 || Ah != 0 || Al != 0)
+ if (Ss != 0 || Se != DCTSIZE2 - 1 || Ah != 0 || Al != 0)
ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno);
/* Make sure components are not sent twice */
for (ci = 0; ci < ncomps; ci++) {
@@ -301,7 +303,7 @@ validate_script (j_compress_ptr cinfo)
#endif
} else {
for (ci = 0; ci < cinfo->num_components; ci++) {
- if (! component_sent[ci])
+ if (!component_sent[ci])
ERREXIT(cinfo, JERR_MISSING_DATA);
}
}
@@ -311,7 +313,7 @@ validate_script (j_compress_ptr cinfo)
LOCAL(void)
-select_scan_parameters (j_compress_ptr cinfo)
+select_scan_parameters(j_compress_ptr cinfo)
/* Set up the scan parameters for the current scan */
{
int ci;
@@ -319,7 +321,7 @@ select_scan_parameters (j_compress_ptr cinfo)
#ifdef C_MULTISCAN_FILES_SUPPORTED
if (cinfo->scan_info != NULL) {
/* Prepare for current scan --- the script is already validated */
- my_master_ptr master = (my_master_ptr) cinfo->master;
+ my_master_ptr master = (my_master_ptr)cinfo->master;
const jpeg_scan_info *scanptr = cinfo->scan_info + master->scan_number;
cinfo->comps_in_scan = scanptr->comps_in_scan;
@@ -331,8 +333,7 @@ select_scan_parameters (j_compress_ptr cinfo)
cinfo->Se = scanptr->Se;
cinfo->Ah = scanptr->Ah;
cinfo->Al = scanptr->Al;
- }
- else
+ } else
#endif
{
/* Prepare for single sequential-JPEG scan containing all components */
@@ -344,7 +345,7 @@ select_scan_parameters (j_compress_ptr cinfo)
cinfo->cur_comp_info[ci] = &cinfo->comp_info[ci];
}
cinfo->Ss = 0;
- cinfo->Se = DCTSIZE2-1;
+ cinfo->Se = DCTSIZE2 - 1;
cinfo->Ah = 0;
cinfo->Al = 0;
}
@@ -352,7 +353,7 @@ select_scan_parameters (j_compress_ptr cinfo)
LOCAL(void)
-per_scan_setup (j_compress_ptr cinfo)
+per_scan_setup(j_compress_ptr cinfo)
/* Do computations that are needed before processing a JPEG scan */
/* cinfo->comps_in_scan and cinfo->cur_comp_info[] are already set */
{
@@ -377,7 +378,7 @@ per_scan_setup (j_compress_ptr cinfo)
/* For noninterleaved scans, it is convenient to define last_row_height
* as the number of block rows present in the last iMCU row.
*/
- tmp = (int) (compptr->height_in_blocks % compptr->v_samp_factor);
+ tmp = (int)(compptr->height_in_blocks % compptr->v_samp_factor);
if (tmp == 0) tmp = compptr->v_samp_factor;
compptr->last_row_height = tmp;
@@ -394,11 +395,11 @@ per_scan_setup (j_compress_ptr cinfo)
/* Overall image size in MCUs */
cinfo->MCUs_per_row = (JDIMENSION)
- jdiv_round_up((long) cinfo->_jpeg_width,
- (long) (cinfo->max_h_samp_factor*DCTSIZE));
+ jdiv_round_up((long)cinfo->_jpeg_width,
+ (long)(cinfo->max_h_samp_factor * DCTSIZE));
cinfo->MCU_rows_in_scan = (JDIMENSION)
- jdiv_round_up((long) cinfo->_jpeg_height,
- (long) (cinfo->max_v_samp_factor*DCTSIZE));
+ jdiv_round_up((long)cinfo->_jpeg_height,
+ (long)(cinfo->max_v_samp_factor * DCTSIZE));
cinfo->blocks_in_MCU = 0;
@@ -410,10 +411,10 @@ per_scan_setup (j_compress_ptr cinfo)
compptr->MCU_blocks = compptr->MCU_width * compptr->MCU_height;
compptr->MCU_sample_width = compptr->MCU_width * DCTSIZE;
/* Figure number of non-dummy blocks in last MCU column & row */
- tmp = (int) (compptr->width_in_blocks % compptr->MCU_width);
+ tmp = (int)(compptr->width_in_blocks % compptr->MCU_width);
if (tmp == 0) tmp = compptr->MCU_width;
compptr->last_col_width = tmp;
- tmp = (int) (compptr->height_in_blocks % compptr->MCU_height);
+ tmp = (int)(compptr->height_in_blocks % compptr->MCU_height);
if (tmp == 0) tmp = compptr->MCU_height;
compptr->last_row_height = tmp;
/* Prepare array describing MCU composition */
@@ -430,8 +431,8 @@ per_scan_setup (j_compress_ptr cinfo)
/* Convert restart specified in rows to actual MCU count. */
/* Note that count must fit in 16 bits, so we provide limiting. */
if (cinfo->restart_in_rows > 0) {
- long nominal = (long) cinfo->restart_in_rows * (long) cinfo->MCUs_per_row;
- cinfo->restart_interval = (unsigned int) MIN(nominal, 65535L);
+ long nominal = (long)cinfo->restart_in_rows * (long)cinfo->MCUs_per_row;
+ cinfo->restart_interval = (unsigned int)MIN(nominal, 65535L);
}
}
@@ -445,9 +446,9 @@ per_scan_setup (j_compress_ptr cinfo)
*/
METHODDEF(void)
-prepare_for_pass (j_compress_ptr cinfo)
+prepare_for_pass(j_compress_ptr cinfo)
{
- my_master_ptr master = (my_master_ptr) cinfo->master;
+ my_master_ptr master = (my_master_ptr)cinfo->master;
switch (master->pass_type) {
case main_pass:
@@ -456,7 +457,7 @@ prepare_for_pass (j_compress_ptr cinfo)
*/
select_scan_parameters(cinfo);
per_scan_setup(cinfo);
- if (! cinfo->raw_data_in) {
+ if (!cinfo->raw_data_in) {
(*cinfo->cconvert->start_pass) (cinfo);
(*cinfo->downsample->start_pass) (cinfo);
(*cinfo->prep->start_pass) (cinfo, JBUF_PASS_THRU);
@@ -491,12 +492,12 @@ prepare_for_pass (j_compress_ptr cinfo)
*/
master->pass_type = output_pass;
master->pass_number++;
- /*FALLTHROUGH*/
#endif
+ FALLTHROUGH /*FALLTHROUGH*/
case output_pass:
/* Do a data-output pass. */
/* We need not repeat per-scan setup if prior optimization pass did it. */
- if (! cinfo->optimize_coding) {
+ if (!cinfo->optimize_coding) {
select_scan_parameters(cinfo);
per_scan_setup(cinfo);
}
@@ -512,7 +513,7 @@ prepare_for_pass (j_compress_ptr cinfo)
ERREXIT(cinfo, JERR_NOT_COMPILED);
}
- master->pub.is_last_pass = (master->pass_number == master->total_passes-1);
+ master->pub.is_last_pass = (master->pass_number == master->total_passes - 1);
/* Set up progress monitor's pass info if present */
if (cinfo->progress != NULL) {
@@ -533,7 +534,7 @@ prepare_for_pass (j_compress_ptr cinfo)
*/
METHODDEF(void)
-pass_startup (j_compress_ptr cinfo)
+pass_startup(j_compress_ptr cinfo)
{
cinfo->master->call_pass_startup = FALSE; /* reset flag so call only once */
@@ -547,9 +548,9 @@ pass_startup (j_compress_ptr cinfo)
*/
METHODDEF(void)
-finish_pass_master (j_compress_ptr cinfo)
+finish_pass_master(j_compress_ptr cinfo)
{
- my_master_ptr master = (my_master_ptr) cinfo->master;
+ my_master_ptr master = (my_master_ptr)cinfo->master;
/* The entropy coder always needs an end-of-pass call,
* either to analyze statistics or to flush its output buffer.
@@ -563,7 +564,7 @@ finish_pass_master (j_compress_ptr cinfo)
* or output of scan 1 (if no optimization).
*/
master->pass_type = output_pass;
- if (! cinfo->optimize_coding)
+ if (!cinfo->optimize_coding)
master->scan_number++;
break;
case huff_opt_pass:
@@ -587,14 +588,14 @@ finish_pass_master (j_compress_ptr cinfo)
*/
GLOBAL(void)
-jinit_c_master_control (j_compress_ptr cinfo, boolean transcode_only)
+jinit_c_master_control(j_compress_ptr cinfo, boolean transcode_only)
{
my_master_ptr master;
master = (my_master_ptr)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
- sizeof(my_comp_master));
- cinfo->master = (struct jpeg_comp_master *) master;
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ sizeof(my_comp_master));
+ cinfo->master = (struct jpeg_comp_master *)master;
master->pub.prepare_for_pass = prepare_for_pass;
master->pub.pass_startup = pass_startup;
master->pub.finish_pass = finish_pass_master;
diff --git a/media/libjpeg/jcomapi.c b/media/libjpeg/jcomapi.c
index 6e5bf3dba9..efbb8357b0 100644
--- a/media/libjpeg/jcomapi.c
+++ b/media/libjpeg/jcomapi.c
@@ -29,7 +29,7 @@
*/
GLOBAL(void)
-jpeg_abort (j_common_ptr cinfo)
+jpeg_abort(j_common_ptr cinfo)
{
int pool;
@@ -40,7 +40,7 @@ jpeg_abort (j_common_ptr cinfo)
/* Releasing pools in reverse order might help avoid fragmentation
* with some (brain-damaged) malloc libraries.
*/
- for (pool = JPOOL_NUMPOOLS-1; pool > JPOOL_PERMANENT; pool--) {
+ for (pool = JPOOL_NUMPOOLS - 1; pool > JPOOL_PERMANENT; pool--) {
(*cinfo->mem->free_pool) (cinfo, pool);
}
@@ -50,7 +50,7 @@ jpeg_abort (j_common_ptr cinfo)
/* Try to keep application from accessing now-deleted marker list.
* A bit kludgy to do it here, but this is the most central place.
*/
- ((j_decompress_ptr) cinfo)->marker_list = NULL;
+ ((j_decompress_ptr)cinfo)->marker_list = NULL;
} else {
cinfo->global_state = CSTATE_START;
}
@@ -69,7 +69,7 @@ jpeg_abort (j_common_ptr cinfo)
*/
GLOBAL(void)
-jpeg_destroy (j_common_ptr cinfo)
+jpeg_destroy(j_common_ptr cinfo)
{
/* We need only tell the memory manager to release everything. */
/* NB: mem pointer is NULL if memory mgr failed to initialize. */
@@ -86,7 +86,7 @@ jpeg_destroy (j_common_ptr cinfo)
*/
GLOBAL(JQUANT_TBL *)
-jpeg_alloc_quant_table (j_common_ptr cinfo)
+jpeg_alloc_quant_table(j_common_ptr cinfo)
{
JQUANT_TBL *tbl;
@@ -98,7 +98,7 @@ jpeg_alloc_quant_table (j_common_ptr cinfo)
GLOBAL(JHUFF_TBL *)
-jpeg_alloc_huff_table (j_common_ptr cinfo)
+jpeg_alloc_huff_table(j_common_ptr cinfo)
{
JHUFF_TBL *tbl;
diff --git a/media/libjpeg/jconfigint.h b/media/libjpeg/jconfigint.h
index 1289399f94..02be97857f 100644
--- a/media/libjpeg/jconfigint.h
+++ b/media/libjpeg/jconfigint.h
@@ -1,7 +1,47 @@
-#define VERSION "1.5.1"
-#define BUILD "2016-09-20"
-#define PACKAGE_NAME "libjpeg-turbo"
+/* libjpeg-turbo build number */
+#define BUILD "20220624"
/* Need to use Mozilla-specific function inlining. */
#include "mozilla/Attributes.h"
#define INLINE MOZ_ALWAYS_INLINE
+
+/* Define to the full name of this package. */
+#define PACKAGE_NAME "libjpeg-turbo"
+
+/* Version number of package */
+#define VERSION "2.1.3"
+
+/* The size of `size_t', as computed by sizeof. */
+#ifdef HAVE_64BIT_BUILD
+#define SIZEOF_SIZE_T 8
+#else
+#define SIZEOF_SIZE_T 4
+#endif
+
+/* Define if your compiler has __builtin_ctzl() and sizeof(unsigned long) == sizeof(size_t). */
+#ifndef _MSC_VER
+#define HAVE_BUILTIN_CTZL 1
+#endif
+
+/* Define to 1 if you have the <intrin.h> header file. */
+#ifdef _MSC_VER
+#define HAVE_INTRIN_H 1
+#endif
+
+#if defined(_MSC_VER) && defined(HAVE_INTRIN_H)
+#if (SIZEOF_SIZE_T == 8)
+#define HAVE_BITSCANFORWARD64
+#elif (SIZEOF_SIZE_T == 4)
+#define HAVE_BITSCANFORWARD
+#endif
+#endif
+
+#if defined(__has_attribute)
+#if __has_attribute(fallthrough)
+#define FALLTHROUGH __attribute__((fallthrough));
+#else
+#define FALLTHROUGH
+#endif
+#else
+#define FALLTHROUGH
+#endif
diff --git a/media/libjpeg/jcparam.c b/media/libjpeg/jcparam.c
index 18b2d487ae..5bc7174dcb 100644
--- a/media/libjpeg/jcparam.c
+++ b/media/libjpeg/jcparam.c
@@ -5,7 +5,7 @@
* Copyright (C) 1991-1998, Thomas G. Lane.
* Modified 2003-2008 by Guido Vollbeding.
* libjpeg-turbo Modifications:
- * Copyright (C) 2009-2011, D. R. Commander.
+ * Copyright (C) 2009-2011, 2018, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@@ -25,9 +25,9 @@
*/
GLOBAL(void)
-jpeg_add_quant_table (j_compress_ptr cinfo, int which_tbl,
- const unsigned int *basic_table,
- int scale_factor, boolean force_baseline)
+jpeg_add_quant_table(j_compress_ptr cinfo, int which_tbl,
+ const unsigned int *basic_table, int scale_factor,
+ boolean force_baseline)
/* Define a quantization table equal to the basic_table times
* a scale factor (given as a percentage).
* If force_baseline is TRUE, the computed quantization table entries
@@ -45,19 +45,19 @@ jpeg_add_quant_table (j_compress_ptr cinfo, int which_tbl,
if (which_tbl < 0 || which_tbl >= NUM_QUANT_TBLS)
ERREXIT1(cinfo, JERR_DQT_INDEX, which_tbl);
- qtblptr = & cinfo->quant_tbl_ptrs[which_tbl];
+ qtblptr = &cinfo->quant_tbl_ptrs[which_tbl];
if (*qtblptr == NULL)
- *qtblptr = jpeg_alloc_quant_table((j_common_ptr) cinfo);
+ *qtblptr = jpeg_alloc_quant_table((j_common_ptr)cinfo);
for (i = 0; i < DCTSIZE2; i++) {
- temp = ((long) basic_table[i] * scale_factor + 50L) / 100L;
+ temp = ((long)basic_table[i] * scale_factor + 50L) / 100L;
/* limit the values to the valid range */
if (temp <= 0L) temp = 1L;
if (temp > 32767L) temp = 32767L; /* max quantizer needed for 12 bits */
if (force_baseline && temp > 255L)
temp = 255L; /* limit to baseline range if requested */
- (*qtblptr)->quantval[i] = (UINT16) temp;
+ (*qtblptr)->quantval[i] = (UINT16)temp;
}
/* Initialize sent_table FALSE so table will be written to JPEG file. */
@@ -65,7 +65,8 @@ jpeg_add_quant_table (j_compress_ptr cinfo, int which_tbl,
}
-/* These are the sample quantization tables given in JPEG spec section K.1.
+/* These are the sample quantization tables given in Annex K (Clause K.1) of
+ * Recommendation ITU-T T.81 (1992) | ISO/IEC 10918-1:1994.
* The spec says that the values given produce "good" quality, and
* when divided by 2, "very good" quality.
*/
@@ -93,7 +94,7 @@ static const unsigned int std_chrominance_quant_tbl[DCTSIZE2] = {
#if JPEG_LIB_VERSION >= 70
GLOBAL(void)
-jpeg_default_qtables (j_compress_ptr cinfo, boolean force_baseline)
+jpeg_default_qtables(j_compress_ptr cinfo, boolean force_baseline)
/* Set or change the 'quality' (quantization) setting, using default tables
* and straight percentage-scaling quality scales.
* This entry point allows different scalings for luminance and chrominance.
@@ -109,8 +110,8 @@ jpeg_default_qtables (j_compress_ptr cinfo, boolean force_baseline)
GLOBAL(void)
-jpeg_set_linear_quality (j_compress_ptr cinfo, int scale_factor,
- boolean force_baseline)
+jpeg_set_linear_quality(j_compress_ptr cinfo, int scale_factor,
+ boolean force_baseline)
/* Set or change the 'quality' (quantization) setting, using default tables
* and a straight percentage-scaling quality scale. In most cases it's better
* to use jpeg_set_quality (below); this entry point is provided for
@@ -126,7 +127,7 @@ jpeg_set_linear_quality (j_compress_ptr cinfo, int scale_factor,
GLOBAL(int)
-jpeg_quality_scaling (int quality)
+jpeg_quality_scaling(int quality)
/* Convert a user-specified quality rating to a percentage scaling factor
* for an underlying quantization table, using our recommended scaling curve.
* The input 'quality' factor should be 0 (terrible) to 100 (very good).
@@ -145,14 +146,14 @@ jpeg_quality_scaling (int quality)
if (quality < 50)
quality = 5000 / quality;
else
- quality = 200 - quality*2;
+ quality = 200 - quality * 2;
return quality;
}
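
Worked examples of the quality-to-scaling curve above (the result is the
percentage applied to the Annex K base tables):

    quality  25  ->  5000 / 25      == 200   (tables doubled)
    quality  50  ->  200 - 50 * 2   == 100   (the base tables as-is)
    quality  75  ->  200 - 75 * 2   ==  50   (tables halved: "very good")
    quality 100  ->  200 - 100 * 2  ==   0   (every entry then clamps to 1
                                              in jpeg_add_quant_table)
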
GLOBAL(void)
-jpeg_set_quality (j_compress_ptr cinfo, int quality, boolean force_baseline)
+jpeg_set_quality(j_compress_ptr cinfo, int quality, boolean force_baseline)
/* Set or change the 'quality' (quantization) setting, using default tables.
* This is the standard quality-adjusting entry point for typical user
* interfaces; only those who want detailed control over quantization tables
@@ -178,7 +179,7 @@ jpeg_set_quality (j_compress_ptr cinfo, int quality, boolean force_baseline)
*/
GLOBAL(void)
-jpeg_set_defaults (j_compress_ptr cinfo)
+jpeg_set_defaults(j_compress_ptr cinfo)
{
int i;
@@ -192,7 +193,7 @@ jpeg_set_defaults (j_compress_ptr cinfo)
*/
if (cinfo->comp_info == NULL)
cinfo->comp_info = (jpeg_component_info *)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_PERMANENT,
MAX_COMPONENTS * sizeof(jpeg_component_info));
/* Initialize everything not dependent on the color space */
@@ -205,7 +206,7 @@ jpeg_set_defaults (j_compress_ptr cinfo)
/* Set up two quantization tables using default quality of 75 */
jpeg_set_quality(cinfo, 75, TRUE);
/* Set up two Huffman tables */
- std_huff_tables((j_common_ptr) cinfo);
+ std_huff_tables((j_common_ptr)cinfo);
/* Initialize default arithmetic coding conditioning */
for (i = 0; i < NUM_ARITH_TBLS; i++) {
@@ -278,7 +279,7 @@ jpeg_set_defaults (j_compress_ptr cinfo)
*/
GLOBAL(void)
-jpeg_default_colorspace (j_compress_ptr cinfo)
+jpeg_default_colorspace(j_compress_ptr cinfo)
{
switch (cinfo->in_color_space) {
case JCS_GRAYSCALE:
@@ -320,12 +321,12 @@ jpeg_default_colorspace (j_compress_ptr cinfo)
*/
GLOBAL(void)
-jpeg_set_colorspace (j_compress_ptr cinfo, J_COLOR_SPACE colorspace)
+jpeg_set_colorspace(j_compress_ptr cinfo, J_COLOR_SPACE colorspace)
{
jpeg_component_info *compptr;
int ci;
-#define SET_COMP(index,id,hsamp,vsamp,quant,dctbl,actbl) \
+#define SET_COMP(index, id, hsamp, vsamp, quant, dctbl, actbl) \
(compptr = &cinfo->comp_info[index], \
compptr->component_id = (id), \
compptr->h_samp_factor = (hsamp), \
@@ -352,39 +353,39 @@ jpeg_set_colorspace (j_compress_ptr cinfo, J_COLOR_SPACE colorspace)
cinfo->write_JFIF_header = TRUE; /* Write a JFIF marker */
cinfo->num_components = 1;
/* JFIF specifies component ID 1 */
- SET_COMP(0, 1, 1,1, 0, 0,0);
+ SET_COMP(0, 1, 1, 1, 0, 0, 0);
break;
case JCS_RGB:
cinfo->write_Adobe_marker = TRUE; /* write Adobe marker to flag RGB */
cinfo->num_components = 3;
- SET_COMP(0, 0x52 /* 'R' */, 1,1, 0, 0,0);
- SET_COMP(1, 0x47 /* 'G' */, 1,1, 0, 0,0);
- SET_COMP(2, 0x42 /* 'B' */, 1,1, 0, 0,0);
+ SET_COMP(0, 0x52 /* 'R' */, 1, 1, 0, 0, 0);
+ SET_COMP(1, 0x47 /* 'G' */, 1, 1, 0, 0, 0);
+ SET_COMP(2, 0x42 /* 'B' */, 1, 1, 0, 0, 0);
break;
case JCS_YCbCr:
cinfo->write_JFIF_header = TRUE; /* Write a JFIF marker */
cinfo->num_components = 3;
/* JFIF specifies component IDs 1,2,3 */
/* We default to 2x2 subsamples of chrominance */
- SET_COMP(0, 1, 2,2, 0, 0,0);
- SET_COMP(1, 2, 1,1, 1, 1,1);
- SET_COMP(2, 3, 1,1, 1, 1,1);
+ SET_COMP(0, 1, 2, 2, 0, 0, 0);
+ SET_COMP(1, 2, 1, 1, 1, 1, 1);
+ SET_COMP(2, 3, 1, 1, 1, 1, 1);
break;
case JCS_CMYK:
cinfo->write_Adobe_marker = TRUE; /* write Adobe marker to flag CMYK */
cinfo->num_components = 4;
- SET_COMP(0, 0x43 /* 'C' */, 1,1, 0, 0,0);
- SET_COMP(1, 0x4D /* 'M' */, 1,1, 0, 0,0);
- SET_COMP(2, 0x59 /* 'Y' */, 1,1, 0, 0,0);
- SET_COMP(3, 0x4B /* 'K' */, 1,1, 0, 0,0);
+ SET_COMP(0, 0x43 /* 'C' */, 1, 1, 0, 0, 0);
+ SET_COMP(1, 0x4D /* 'M' */, 1, 1, 0, 0, 0);
+ SET_COMP(2, 0x59 /* 'Y' */, 1, 1, 0, 0, 0);
+ SET_COMP(3, 0x4B /* 'K' */, 1, 1, 0, 0, 0);
break;
case JCS_YCCK:
cinfo->write_Adobe_marker = TRUE; /* write Adobe marker to flag YCCK */
cinfo->num_components = 4;
- SET_COMP(0, 1, 2,2, 0, 0,0);
- SET_COMP(1, 2, 1,1, 1, 1,1);
- SET_COMP(2, 3, 1,1, 1, 1,1);
- SET_COMP(3, 4, 2,2, 0, 0,0);
+ SET_COMP(0, 1, 2, 2, 0, 0, 0);
+ SET_COMP(1, 2, 1, 1, 1, 1, 1);
+ SET_COMP(2, 3, 1, 1, 1, 1, 1);
+ SET_COMP(3, 4, 2, 2, 0, 0, 0);
break;
case JCS_UNKNOWN:
cinfo->num_components = cinfo->input_components;
@@ -392,7 +393,7 @@ jpeg_set_colorspace (j_compress_ptr cinfo, J_COLOR_SPACE colorspace)
ERREXIT2(cinfo, JERR_COMPONENT_COUNT, cinfo->num_components,
MAX_COMPONENTS);
for (ci = 0; ci < cinfo->num_components; ci++) {
- SET_COMP(ci, ci, 1,1, 0, 0,0);
+ SET_COMP(ci, ci, 1, 1, 0, 0, 0);
}
break;
default:
@@ -404,8 +405,7 @@ jpeg_set_colorspace (j_compress_ptr cinfo, J_COLOR_SPACE colorspace)
#ifdef C_PROGRESSIVE_SUPPORTED
LOCAL(jpeg_scan_info *)
-fill_a_scan (jpeg_scan_info *scanptr, int ci,
- int Ss, int Se, int Ah, int Al)
+fill_a_scan(jpeg_scan_info *scanptr, int ci, int Ss, int Se, int Ah, int Al)
/* Support routine: generate one scan for specified component */
{
scanptr->comps_in_scan = 1;
@@ -419,8 +419,7 @@ fill_a_scan (jpeg_scan_info *scanptr, int ci,
}
LOCAL(jpeg_scan_info *)
-fill_scans (jpeg_scan_info *scanptr, int ncomps,
- int Ss, int Se, int Ah, int Al)
+fill_scans(jpeg_scan_info *scanptr, int ncomps, int Ss, int Se, int Ah, int Al)
/* Support routine: generate one scan for each component */
{
int ci;
@@ -438,7 +437,7 @@ fill_scans (jpeg_scan_info *scanptr, int ncomps,
}
LOCAL(jpeg_scan_info *)
-fill_dc_scans (jpeg_scan_info *scanptr, int ncomps, int Ah, int Al)
+fill_dc_scans(jpeg_scan_info *scanptr, int ncomps, int Ah, int Al)
/* Support routine: generate interleaved DC scan if possible, else N scans */
{
int ci;
@@ -466,7 +465,7 @@ fill_dc_scans (jpeg_scan_info *scanptr, int ncomps, int Ah, int Al)
*/
GLOBAL(void)
-jpeg_simple_progression (j_compress_ptr cinfo)
+jpeg_simple_progression(j_compress_ptr cinfo)
{
int ncomps = cinfo->num_components;
int nscans;
@@ -498,7 +497,7 @@ jpeg_simple_progression (j_compress_ptr cinfo)
if (cinfo->script_space == NULL || cinfo->script_space_size < nscans) {
cinfo->script_space_size = MAX(nscans, 10);
cinfo->script_space = (jpeg_scan_info *)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_PERMANENT,
cinfo->script_space_size * sizeof(jpeg_scan_info));
}
scanptr = cinfo->script_space;
diff --git a/media/libjpeg/jcphuff.c b/media/libjpeg/jcphuff.c
index 046e2e18d4..872e570bff 100644
--- a/media/libjpeg/jcphuff.c
+++ b/media/libjpeg/jcphuff.c
@@ -4,7 +4,10 @@
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1995-1997, Thomas G. Lane.
* libjpeg-turbo Modifications:
- * Copyright (C) 2015, D. R. Commander.
+ * Copyright (C) 2011, 2015, 2018, 2021-2022, D. R. Commander.
+ * Copyright (C) 2016, 2018, Matthieu Darbois.
+ * Copyright (C) 2020, Arm Limited.
+ * Copyright (C) 2021, Alex Richardson.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@@ -18,15 +21,74 @@
#define JPEG_INTERNALS
#include "jinclude.h"
#include "jpeglib.h"
-#include "jchuff.h" /* Declarations shared with jchuff.c */
+#include "jsimd.h"
+#include "jconfigint.h"
+#include <limits.h>
+
+#ifdef HAVE_INTRIN_H
+#include <intrin.h>
+#ifdef _MSC_VER
+#ifdef HAVE_BITSCANFORWARD64
+#pragma intrinsic(_BitScanForward64)
+#endif
+#ifdef HAVE_BITSCANFORWARD
+#pragma intrinsic(_BitScanForward)
+#endif
+#endif
+#endif
#ifdef C_PROGRESSIVE_SUPPORTED
+/*
+ * NOTE: If USE_CLZ_INTRINSIC is defined, then clz/bsr instructions will be
+ * used for bit counting rather than the lookup table. This will reduce the
+ * memory footprint by 64k, which is important for some mobile applications
+ * that create many isolated instances of libjpeg-turbo (web browsers, for
+ * instance.) This may improve performance on some mobile platforms as well.
+ * This feature is enabled by default only on Arm processors, because some x86
+ * chips have a slow implementation of bsr, and the use of clz/bsr cannot be
+ * shown to have a significant performance impact even on the x86 chips that
+ * have a fast implementation of it. When building for Armv6, you can
+ * explicitly disable the use of clz/bsr by adding -mthumb to the compiler
+ * flags (this defines __thumb__).
+ */
+
+/* NOTE: Both GCC and Clang define __GNUC__ */
+#if (defined(__GNUC__) && (defined(__arm__) || defined(__aarch64__))) || \
+ defined(_M_ARM) || defined(_M_ARM64)
+#if !defined(__thumb__) || defined(__thumb2__)
+#define USE_CLZ_INTRINSIC
+#endif
+#endif
+
+#ifdef USE_CLZ_INTRINSIC
+#if defined(_MSC_VER) && !defined(__clang__)
+#define JPEG_NBITS_NONZERO(x) (32 - _CountLeadingZeros(x))
+#else
+#define JPEG_NBITS_NONZERO(x) (32 - __builtin_clz(x))
+#endif
+#define JPEG_NBITS(x) (x ? JPEG_NBITS_NONZERO(x) : 0)
+#else
+#include "jpeg_nbits_table.h"
+#define JPEG_NBITS(x) (jpeg_nbits_table[x])
+#define JPEG_NBITS_NONZERO(x) JPEG_NBITS(x)
+#endif
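
Both definitions above agree on the contract of JPEG_NBITS(): the number of bits needed to represent a nonnegative value. A minimal standalone sketch of that contract against a portable loop (illustrative only, not part of the patch):

    #include <assert.h>

    /* Portable equivalent of JPEG_NBITS: bit length of x (0 for x == 0). */
    static int nbits_portable(unsigned int x)
    {
      int n = 0;
      while (x) { n++; x >>= 1; }
      return n;
    }

    int main(void)
    {
      /* With the clz form: 32 - __builtin_clz(5) == 32 - 29 == 3 */
      assert(nbits_portable(5) == 3);
      assert(nbits_portable(0) == 0);   /* the "x ? ... : 0" guard above */
      return 0;
    }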
+
+
/* Expanded entropy encoder object for progressive Huffman encoding. */
typedef struct {
struct jpeg_entropy_encoder pub; /* public fields */
+ /* Pointer to routine to prepare data for encode_mcu_AC_first() */
+ void (*AC_first_prepare) (const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, JCOEF *values, size_t *zerobits);
+ /* Pointer to routine to prepare data for encode_mcu_AC_refine() */
+ int (*AC_refine_prepare) (const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, JCOEF *absvalues, size_t *bits);
+
/* Mode flag: TRUE for optimization, FALSE for actual data output */
boolean gather_statistics;
@@ -79,26 +141,62 @@ typedef phuff_entropy_encoder *phuff_entropy_ptr;
#ifdef RIGHT_SHIFT_IS_UNSIGNED
#define ISHIFT_TEMPS int ishift_temp;
-#define IRIGHT_SHIFT(x,shft) \
- ((ishift_temp = (x)) < 0 ? \
- (ishift_temp >> (shft)) | ((~0) << (16-(shft))) : \
- (ishift_temp >> (shft)))
+#define IRIGHT_SHIFT(x, shft) \
+ ((ishift_temp = (x)) < 0 ? \
+ (ishift_temp >> (shft)) | ((~0) << (16 - (shft))) : \
+ (ishift_temp >> (shft)))
#else
#define ISHIFT_TEMPS
-#define IRIGHT_SHIFT(x,shft) ((x) >> (shft))
+#define IRIGHT_SHIFT(x, shft) ((x) >> (shft))
#endif
+#define PAD(v, p) ((v + (p) - 1) & (~((p) - 1)))
+
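The new PAD() macro rounds v up to the next multiple of the power-of-two p; the AC encoders below use it to 16-byte-align their local coefficient buffers for SIMD loads. A quick sketch of its behavior (illustrative only):

    #include <assert.h>
    #define PAD(v, p) ((v + (p) - 1) & (~((p) - 1)))

    int main(void)
    {
      assert(PAD(13, 16) == 16);
      assert(PAD(16, 16) == 16);   /* already-aligned values are unchanged */
      assert(PAD(17, 16) == 32);   /* otherwise round up to the next boundary */
      return 0;
    }
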
/* Forward declarations */
-METHODDEF(boolean) encode_mcu_DC_first (j_compress_ptr cinfo,
+METHODDEF(boolean) encode_mcu_DC_first(j_compress_ptr cinfo,
+ JBLOCKROW *MCU_data);
+METHODDEF(void) encode_mcu_AC_first_prepare
+ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+ JCOEF *values, size_t *zerobits);
+METHODDEF(boolean) encode_mcu_AC_first(j_compress_ptr cinfo,
+ JBLOCKROW *MCU_data);
+METHODDEF(boolean) encode_mcu_DC_refine(j_compress_ptr cinfo,
JBLOCKROW *MCU_data);
-METHODDEF(boolean) encode_mcu_AC_first (j_compress_ptr cinfo,
+METHODDEF(int) encode_mcu_AC_refine_prepare
+ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+ JCOEF *absvalues, size_t *bits);
+METHODDEF(boolean) encode_mcu_AC_refine(j_compress_ptr cinfo,
JBLOCKROW *MCU_data);
-METHODDEF(boolean) encode_mcu_DC_refine (j_compress_ptr cinfo,
- JBLOCKROW *MCU_data);
-METHODDEF(boolean) encode_mcu_AC_refine (j_compress_ptr cinfo,
- JBLOCKROW *MCU_data);
-METHODDEF(void) finish_pass_phuff (j_compress_ptr cinfo);
-METHODDEF(void) finish_pass_gather_phuff (j_compress_ptr cinfo);
+METHODDEF(void) finish_pass_phuff(j_compress_ptr cinfo);
+METHODDEF(void) finish_pass_gather_phuff(j_compress_ptr cinfo);
+
+
+/* Count the trailing zero bits of *x and shift them out */
+INLINE
+METHODDEF(int)
+count_zeroes(size_t *x)
+{
+#if defined(HAVE_BUILTIN_CTZL)
+ int result;
+ result = __builtin_ctzl(*x);
+ *x >>= result;
+#elif defined(HAVE_BITSCANFORWARD64)
+ unsigned long result;
+ _BitScanForward64(&result, *x);
+ *x >>= result;
+#elif defined(HAVE_BITSCANFORWARD)
+ unsigned long result;
+ _BitScanForward(&result, *x);
+ *x >>= result;
+#else
+ int result = 0;
+ while ((*x & 1) == 0) {
+ ++result;
+ *x >>= 1;
+ }
+#endif
+ return (int)result;
+}
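
count_zeroes() both returns the number of trailing zero bits and shifts them out of *x. The ENCODE_COEFS_* macros below use it to jump directly from one set bit of the zerobits bitmap to the next, i.e. from one nonzero coefficient to the next. A sketch of that access pattern using the portable fallback branch (illustrative only):

    #include <stdio.h>
    #include <stddef.h>

    static int count_zeroes_portable(size_t *x)   /* the #else branch above */
    {
      int result = 0;
      while ((*x & 1) == 0) { result++; *x >>= 1; }
      return result;
    }

    int main(void)
    {
      size_t zerobits = 0x105;   /* bits 0, 2, and 8 set */
      int k = 0;
      while (zerobits) {
        k += count_zeroes_portable(&zerobits);
        printf("nonzero coefficient at k = %d\n", k);   /* prints 0, 2, 8 */
        zerobits >>= 1;
        k++;
      }
      return 0;
    }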
/*
@@ -106,9 +204,9 @@ METHODDEF(void) finish_pass_gather_phuff (j_compress_ptr cinfo);
*/
METHODDEF(void)
-start_pass_phuff (j_compress_ptr cinfo, boolean gather_statistics)
+start_pass_phuff(j_compress_ptr cinfo, boolean gather_statistics)
{
- phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
+ phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
boolean is_DC_band;
int ci, tbl;
jpeg_component_info *compptr;
@@ -126,15 +224,23 @@ start_pass_phuff (j_compress_ptr cinfo, boolean gather_statistics)
entropy->pub.encode_mcu = encode_mcu_DC_first;
else
entropy->pub.encode_mcu = encode_mcu_AC_first;
+ if (jsimd_can_encode_mcu_AC_first_prepare())
+ entropy->AC_first_prepare = jsimd_encode_mcu_AC_first_prepare;
+ else
+ entropy->AC_first_prepare = encode_mcu_AC_first_prepare;
} else {
if (is_DC_band)
entropy->pub.encode_mcu = encode_mcu_DC_refine;
else {
entropy->pub.encode_mcu = encode_mcu_AC_refine;
+ if (jsimd_can_encode_mcu_AC_refine_prepare())
+ entropy->AC_refine_prepare = jsimd_encode_mcu_AC_refine_prepare;
+ else
+ entropy->AC_refine_prepare = encode_mcu_AC_refine_prepare;
/* AC refinement needs a correction bit buffer */
if (entropy->bit_buffer == NULL)
entropy->bit_buffer = (char *)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
MAX_CORR_BITS * sizeof(char));
}
}
@@ -167,14 +273,14 @@ start_pass_phuff (j_compress_ptr cinfo, boolean gather_statistics)
/* Note that jpeg_gen_optimal_table expects 257 entries in each table! */
if (entropy->count_ptrs[tbl] == NULL)
entropy->count_ptrs[tbl] = (long *)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
257 * sizeof(long));
- MEMZERO(entropy->count_ptrs[tbl], 257 * sizeof(long));
+ memset(entropy->count_ptrs[tbl], 0, 257 * sizeof(long));
} else {
/* Compute derived values for Huffman table */
/* We may do this more than once for a table, but it's not expensive */
jpeg_make_c_derived_tbl(cinfo, is_DC_band, tbl,
- & entropy->derived_tbls[tbl]);
+ &entropy->derived_tbls[tbl]);
}
}
@@ -198,19 +304,20 @@ start_pass_phuff (j_compress_ptr cinfo, boolean gather_statistics)
*/
/* Emit a byte */
-#define emit_byte(entropy,val) \
- { *(entropy)->next_output_byte++ = (JOCTET) (val); \
- if (--(entropy)->free_in_buffer == 0) \
- dump_buffer(entropy); }
+#define emit_byte(entropy, val) { \
+ *(entropy)->next_output_byte++ = (JOCTET)(val); \
+ if (--(entropy)->free_in_buffer == 0) \
+ dump_buffer(entropy); \
+}
LOCAL(void)
-dump_buffer (phuff_entropy_ptr entropy)
+dump_buffer(phuff_entropy_ptr entropy)
/* Empty the output buffer; we do not support suspension in this module. */
{
struct jpeg_destination_mgr *dest = entropy->cinfo->dest;
- if (! (*dest->empty_output_buffer) (entropy->cinfo))
+ if (!(*dest->empty_output_buffer) (entropy->cinfo))
ERREXIT(entropy->cinfo, JERR_CANT_SUSPEND);
/* After a successful buffer dump, must reset buffer pointers */
entropy->next_output_byte = dest->next_output_byte;
@@ -227,11 +334,11 @@ dump_buffer (phuff_entropy_ptr entropy)
*/
LOCAL(void)
-emit_bits (phuff_entropy_ptr entropy, unsigned int code, int size)
+emit_bits(phuff_entropy_ptr entropy, unsigned int code, int size)
/* Emit some bits, unless we are in gather mode */
{
/* This routine is heavily used, so it's worth coding tightly. */
- register size_t put_buffer = (size_t) code;
+ register size_t put_buffer = (size_t)code;
register int put_bits = entropy->put_bits;
/* if size is 0, caller used an invalid Huffman table entry */
@@ -241,7 +348,7 @@ emit_bits (phuff_entropy_ptr entropy, unsigned int code, int size)
if (entropy->gather_statistics)
return; /* do nothing if we're only getting stats */
- put_buffer &= (((size_t) 1)<<size) - 1; /* mask off any extra bits in code */
+ put_buffer &= (((size_t)1) << size) - 1; /* mask off any extra bits in code */
put_bits += size; /* new number of bits in buffer */
@@ -250,7 +357,7 @@ emit_bits (phuff_entropy_ptr entropy, unsigned int code, int size)
put_buffer |= entropy->put_buffer; /* and merge with old buffer contents */
while (put_bits >= 8) {
- int c = (int) ((put_buffer >> 16) & 0xFF);
+ int c = (int)((put_buffer >> 16) & 0xFF);
emit_byte(entropy, c);
if (c == 0xFF) { /* need to stuff a zero byte? */
@@ -266,7 +373,7 @@ emit_bits (phuff_entropy_ptr entropy, unsigned int code, int size)
LOCAL(void)
-flush_bits (phuff_entropy_ptr entropy)
+flush_bits(phuff_entropy_ptr entropy)
{
emit_bits(entropy, 0x7F, 7); /* fill any partial byte with ones */
entropy->put_buffer = 0; /* and reset bit-buffer to empty */
@@ -279,7 +386,7 @@ flush_bits (phuff_entropy_ptr entropy)
*/
LOCAL(void)
-emit_symbol (phuff_entropy_ptr entropy, int tbl_no, int symbol)
+emit_symbol(phuff_entropy_ptr entropy, int tbl_no, int symbol)
{
if (entropy->gather_statistics)
entropy->count_ptrs[tbl_no][symbol]++;
@@ -295,14 +402,14 @@ emit_symbol (phuff_entropy_ptr entropy, int tbl_no, int symbol)
*/
LOCAL(void)
-emit_buffered_bits (phuff_entropy_ptr entropy, char *bufstart,
- unsigned int nbits)
+emit_buffered_bits(phuff_entropy_ptr entropy, char *bufstart,
+ unsigned int nbits)
{
if (entropy->gather_statistics)
return; /* no real work */
while (nbits > 0) {
- emit_bits(entropy, (unsigned int) (*bufstart), 1);
+ emit_bits(entropy, (unsigned int)(*bufstart), 1);
bufstart++;
nbits--;
}
@@ -314,15 +421,13 @@ emit_buffered_bits (phuff_entropy_ptr entropy, char *bufstart,
*/
LOCAL(void)
-emit_eobrun (phuff_entropy_ptr entropy)
+emit_eobrun(phuff_entropy_ptr entropy)
{
register int temp, nbits;
if (entropy->EOBRUN > 0) { /* if there is any pending EOBRUN */
temp = entropy->EOBRUN;
- nbits = 0;
- while ((temp >>= 1))
- nbits++;
+ nbits = JPEG_NBITS_NONZERO(temp) - 1;
/* safety check: shouldn't happen given limited correction-bit buffer */
if (nbits > 14)
ERREXIT(entropy->cinfo, JERR_HUFF_MISSING_CODE);
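
The replaced loop and the new JPEG_NBITS_NONZERO(temp) - 1 expression both compute floor(log2(EOBRUN)), i.e. the EOBn category whose extension bits encode the run length. A sketch of the equivalence (illustrative only):

    #include <assert.h>

    static int nbits_loop(int temp)   /* the deleted form */
    {
      int nbits = 0;
      while ((temp >>= 1))
        nbits++;
      return nbits;
    }

    int main(void)
    {
      /* JPEG_NBITS_NONZERO(5) - 1 == 3 - 1 == 2, matching the loop: */
      assert(nbits_loop(5) == 2);
      assert(nbits_loop(1) == 0);    /* EOB run of 1 -> EOB0, no extra bits */
      assert(nbits_loop(16) == 4);   /* run of 16 -> EOB4, 4 extension bits */
      return 0;
    }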
@@ -345,13 +450,13 @@ emit_eobrun (phuff_entropy_ptr entropy)
*/
LOCAL(void)
-emit_restart (phuff_entropy_ptr entropy, int restart_num)
+emit_restart(phuff_entropy_ptr entropy, int restart_num)
{
int ci;
emit_eobrun(entropy);
- if (! entropy->gather_statistics) {
+ if (!entropy->gather_statistics) {
flush_bits(entropy);
emit_byte(entropy, 0xFF);
emit_byte(entropy, JPEG_RST0 + restart_num);
@@ -375,10 +480,10 @@ emit_restart (phuff_entropy_ptr entropy, int restart_num)
*/
METHODDEF(boolean)
-encode_mcu_DC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+encode_mcu_DC_first(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
{
- phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
- register int temp, temp2;
+ phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
+ register int temp, temp2, temp3;
register int nbits;
int blkn, ci;
int Al = cinfo->Al;
@@ -403,31 +508,31 @@ encode_mcu_DC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
/* Compute the DC value after the required point transform by Al.
* This is simply an arithmetic right shift.
*/
- temp2 = IRIGHT_SHIFT((int) ((*block)[0]), Al);
+ temp2 = IRIGHT_SHIFT((int)((*block)[0]), Al);
/* DC differences are figured on the point-transformed values. */
temp = temp2 - entropy->last_dc_val[ci];
entropy->last_dc_val[ci] = temp2;
/* Encode the DC coefficient difference per section G.1.2.1 */
- temp2 = temp;
- if (temp < 0) {
- temp = -temp; /* temp is abs value of input */
- /* For a negative input, want temp2 = bitwise complement of abs(input) */
- /* This code assumes we are on a two's complement machine */
- temp2--;
- }
+
+ /* This is a well-known technique for obtaining the absolute value without
+ * a branch. It is derived from an assembly language technique presented
+ * in "How to Optimize for the Pentium Processors", Copyright (c) 1996,
+ * 1997 by Agner Fog.
+ */
+ temp3 = temp >> (CHAR_BIT * sizeof(int) - 1);
+ temp ^= temp3;
+ temp -= temp3; /* temp is abs value of input */
+ /* For a negative input, want temp2 = bitwise complement of abs(input) */
+ temp2 = temp ^ temp3;
/* Find the number of bits needed for the magnitude of the coefficient */
- nbits = 0;
- while (temp) {
- nbits++;
- temp >>= 1;
- }
+ nbits = JPEG_NBITS(temp);
/* Check for out-of-range coefficient values.
* Since we're encoding a difference, the range limit is twice as much.
*/
- if (nbits > MAX_COEF_BITS+1)
+ if (nbits > MAX_COEF_BITS + 1)
ERREXIT(cinfo, JERR_BAD_DCT_COEF);
/* Count/emit the Huffman-coded symbol for the number of bits */
@@ -436,7 +541,7 @@ encode_mcu_DC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
/* Emit that number of bits of the value, if positive, */
/* or the complement of its magnitude, if negative. */
if (nbits) /* emit_bits rejects calls with size 0 */
- emit_bits(entropy, (unsigned int) temp2, nbits);
+ emit_bits(entropy, (unsigned int)temp2, nbits);
}
cinfo->dest->next_output_byte = entropy->next_output_byte;
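
The branchless sequence above relies on two's complement arithmetic, as the deleted code already did: shifting temp right by the width minus one yields 0 for nonnegative inputs and all-ones for negative ones, so XOR-and-subtract produces abs(temp) and a final XOR produces its bitwise complement for negative inputs. A standalone sketch, assuming an arithmetic right shift of negative ints (illustrative only):

    #include <assert.h>
    #include <limits.h>

    int main(void)
    {
      int temp = -5, temp2, temp3;
      temp3 = temp >> (CHAR_BIT * sizeof(int) - 1);   /* -1, since temp < 0 */
      temp ^= temp3;
      temp -= temp3;          /* temp == 5, the absolute value */
      temp2 = temp ^ temp3;   /* temp2 == ~5, complement of abs(input) */
      assert(temp == 5 && temp2 == ~5);
      return 0;
    }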
@@ -457,20 +562,115 @@ encode_mcu_DC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
/*
+ * Data preparation for encode_mcu_AC_first().
+ */
+
+#define COMPUTE_ABSVALUES_AC_FIRST(Sl) { \
+ for (k = 0; k < Sl; k++) { \
+ temp = block[jpeg_natural_order_start[k]]; \
+ if (temp == 0) \
+ continue; \
+ /* We must apply the point transform by Al. For AC coefficients this \
+ * is an integer division with rounding towards 0. To do this portably \
+ * in C, we shift after obtaining the absolute value; so the code is \
+ * interwoven with finding the abs value (temp) and output bits (temp2). \
+ */ \
+ temp2 = temp >> (CHAR_BIT * sizeof(int) - 1); \
+ temp ^= temp2; \
+ temp -= temp2; /* temp is abs value of input */ \
+ temp >>= Al; /* apply the point transform */ \
+ /* Watch out for case that nonzero coef is zero after point transform */ \
+ if (temp == 0) \
+ continue; \
+ /* For a negative coef, want temp2 = bitwise complement of abs(coef) */ \
+ temp2 ^= temp; \
+ values[k] = (JCOEF)temp; \
+ values[k + DCTSIZE2] = (JCOEF)temp2; \
+ zerobits |= ((size_t)1U) << k; \
+ } \
+}
+
+METHODDEF(void)
+encode_mcu_AC_first_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, JCOEF *values, size_t *bits)
+{
+ register int k, temp, temp2;
+ size_t zerobits = 0U;
+ int Sl0 = Sl;
+
+#if SIZEOF_SIZE_T == 4
+ if (Sl0 > 32)
+ Sl0 = 32;
+#endif
+
+ COMPUTE_ABSVALUES_AC_FIRST(Sl0);
+
+ bits[0] = zerobits;
+#if SIZEOF_SIZE_T == 4
+ zerobits = 0U;
+
+ if (Sl > 32) {
+ Sl -= 32;
+ jpeg_natural_order_start += 32;
+ values += 32;
+
+ COMPUTE_ABSVALUES_AC_FIRST(Sl);
+ }
+ bits[1] = zerobits;
+#endif
+}
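
The prepare step packs its results for the encoding loop: values[k] holds the post-transform magnitude, values[k + DCTSIZE2] holds the bit pattern to emit (the magnitude for a positive coefficient, its complement for a negative one), and each nonzero position sets a bit in zerobits. On 32-bit builds the 64 positions are split across bits[0] (k = 0..31) and bits[1] (k = 32..63). A sketch of the values[] convention (illustrative only):

    #include <stdio.h>
    #define DCTSIZE2 64

    int main(void)
    {
      short values[2 * DCTSIZE2] = { 0 };
      /* Suppose coefficient k = 3 is -7 after the point transform: */
      int k = 3, temp = 7, temp2 = ~7;
      values[k] = (short)temp;               /* magnitude */
      values[k + DCTSIZE2] = (short)temp2;   /* bits emitted for a negative coef */
      printf("%d %d\n", values[3], values[3 + DCTSIZE2]);   /* prints: 7 -8 */
      return 0;
    }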
+
+/*
* MCU encoding for AC initial scan (either spectral selection,
* or first pass of successive approximation).
*/
+#define ENCODE_COEFS_AC_FIRST(label) { \
+ while (zerobits) { \
+ r = count_zeroes(&zerobits); \
+ cvalue += r; \
+label \
+ temp = cvalue[0]; \
+ temp2 = cvalue[DCTSIZE2]; \
+ \
+ /* if run length > 15, must emit special run-length-16 codes (0xF0) */ \
+ while (r > 15) { \
+ emit_symbol(entropy, entropy->ac_tbl_no, 0xF0); \
+ r -= 16; \
+ } \
+ \
+ /* Find the number of bits needed for the magnitude of the coefficient */ \
+ nbits = JPEG_NBITS_NONZERO(temp); /* there must be at least one 1 bit */ \
+ /* Check for out-of-range coefficient values */ \
+ if (nbits > MAX_COEF_BITS) \
+ ERREXIT(cinfo, JERR_BAD_DCT_COEF); \
+ \
+ /* Count/emit Huffman symbol for run length / number of bits */ \
+ emit_symbol(entropy, entropy->ac_tbl_no, (r << 4) + nbits); \
+ \
+ /* Emit that number of bits of the value, if positive, */ \
+ /* or the complement of its magnitude, if negative. */ \
+ emit_bits(entropy, (unsigned int)temp2, nbits); \
+ \
+ cvalue++; \
+ zerobits >>= 1; \
+ } \
+}
+
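As in the baseline encoder, each coded AC coefficient is a Huffman symbol packing (run length << 4) | nbits, and runs longer than 15 zeroes are split off as ZRL (0xF0) symbols first. A sketch of the packing (illustrative only):

    #include <assert.h>

    int main(void)
    {
      int r = 6, nbits = 3, zrl = 0;   /* 6 zeroes, then a magnitude-5 coef */
      assert(((r << 4) + nbits) == 0x63);

      r = 20;                          /* needs one ZRL, leaving r = 4 */
      while (r > 15) { zrl++; r -= 16; }
      assert(zrl == 1 && r == 4);
      return 0;
    }
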
METHODDEF(boolean)
-encode_mcu_AC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+encode_mcu_AC_first(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
{
- phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
+ phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
register int temp, temp2;
- register int nbits;
- register int r, k;
- int Se = cinfo->Se;
+ register int nbits, r;
+ int Sl = cinfo->Se - cinfo->Ss + 1;
int Al = cinfo->Al;
- JBLOCKROW block;
+ JCOEF values_unaligned[2 * DCTSIZE2 + 15];
+ JCOEF *values;
+ const JCOEF *cvalue;
+ size_t zerobits;
+ size_t bits[8 / SIZEOF_SIZE_T];
entropy->next_output_byte = cinfo->dest->next_output_byte;
entropy->free_in_buffer = cinfo->dest->free_in_buffer;
@@ -480,66 +680,48 @@ encode_mcu_AC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
if (entropy->restarts_to_go == 0)
emit_restart(entropy, entropy->next_restart_num);
- /* Encode the MCU data block */
- block = MCU_data[0];
-
- /* Encode the AC coefficients per section G.1.2.2, fig. G.3 */
+#ifdef WITH_SIMD
+ cvalue = values = (JCOEF *)PAD((JUINTPTR)values_unaligned, 16);
+#else
+ /* Not using SIMD, so alignment is not needed */
+ cvalue = values = values_unaligned;
+#endif
- r = 0; /* r = run length of zeros */
+ /* Prepare data */
+ entropy->AC_first_prepare(MCU_data[0][0], jpeg_natural_order + cinfo->Ss,
+ Sl, Al, values, bits);
- for (k = cinfo->Ss; k <= Se; k++) {
- if ((temp = (*block)[jpeg_natural_order[k]]) == 0) {
- r++;
- continue;
- }
- /* We must apply the point transform by Al. For AC coefficients this
- * is an integer division with rounding towards 0. To do this portably
- * in C, we shift after obtaining the absolute value; so the code is
- * interwoven with finding the abs value (temp) and output bits (temp2).
- */
- if (temp < 0) {
- temp = -temp; /* temp is abs value of input */
- temp >>= Al; /* apply the point transform */
- /* For a negative coef, want temp2 = bitwise complement of abs(coef) */
- temp2 = ~temp;
- } else {
- temp >>= Al; /* apply the point transform */
- temp2 = temp;
- }
- /* Watch out for case that nonzero coef is zero after point transform */
- if (temp == 0) {
- r++;
- continue;
- }
+ zerobits = bits[0];
+#if SIZEOF_SIZE_T == 4
+ zerobits |= bits[1];
+#endif
- /* Emit any pending EOBRUN */
- if (entropy->EOBRUN > 0)
- emit_eobrun(entropy);
- /* if run length > 15, must emit special run-length-16 codes (0xF0) */
- while (r > 15) {
- emit_symbol(entropy, entropy->ac_tbl_no, 0xF0);
- r -= 16;
- }
+ /* Emit any pending EOBRUN */
+ if (zerobits && (entropy->EOBRUN > 0))
+ emit_eobrun(entropy);
- /* Find the number of bits needed for the magnitude of the coefficient */
- nbits = 1; /* there must be at least one 1 bit */
- while ((temp >>= 1))
- nbits++;
- /* Check for out-of-range coefficient values */
- if (nbits > MAX_COEF_BITS)
- ERREXIT(cinfo, JERR_BAD_DCT_COEF);
+#if SIZEOF_SIZE_T == 4
+ zerobits = bits[0];
+#endif
- /* Count/emit Huffman symbol for run length / number of bits */
- emit_symbol(entropy, entropy->ac_tbl_no, (r << 4) + nbits);
+ /* Encode the AC coefficients per section G.1.2.2, fig. G.3 */
- /* Emit that number of bits of the value, if positive, */
- /* or the complement of its magnitude, if negative. */
- emit_bits(entropy, (unsigned int) temp2, nbits);
+ ENCODE_COEFS_AC_FIRST((void)0;);
- r = 0; /* reset zero run length */
+#if SIZEOF_SIZE_T == 4
+ zerobits = bits[1];
+ if (zerobits) {
+ int diff = ((values + DCTSIZE2 / 2) - cvalue);
+ r = count_zeroes(&zerobits);
+ r += diff;
+ cvalue += r;
+ goto first_iter_ac_first;
}
- if (r > 0) { /* If there are trailing zeroes, */
+ ENCODE_COEFS_AC_FIRST(first_iter_ac_first:);
+#endif
+
+ if (cvalue < (values + Sl)) { /* If there are trailing zeroes, */
entropy->EOBRUN++; /* count an EOB */
if (entropy->EOBRUN == 0x7FFF)
emit_eobrun(entropy); /* force it out to avoid overflow */
@@ -569,9 +751,9 @@ encode_mcu_AC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
*/
METHODDEF(boolean)
-encode_mcu_DC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+encode_mcu_DC_refine(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
{
- phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
+ phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
register int temp;
int blkn;
int Al = cinfo->Al;
@@ -591,7 +773,7 @@ encode_mcu_DC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
/* We simply emit the Al'th bit of the DC coefficient value. */
temp = (*block)[0];
- emit_bits(entropy, (unsigned int) (temp >> Al), 1);
+ emit_bits(entropy, (unsigned int)(temp >> Al), 1);
}
cinfo->dest->next_output_byte = entropy->next_output_byte;
@@ -612,22 +794,148 @@ encode_mcu_DC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
/*
+ * Data preparation for encode_mcu_AC_refine().
+ */
+
+#define COMPUTE_ABSVALUES_AC_REFINE(Sl, koffset) { \
+ /* It is convenient to make a pre-pass to determine the transformed \
+ * coefficients' absolute values and the EOB position. \
+ */ \
+ for (k = 0; k < Sl; k++) { \
+ temp = block[jpeg_natural_order_start[k]]; \
+ /* We must apply the point transform by Al. For AC coefficients this \
+ * is an integer division with rounding towards 0. To do this portably \
+ * in C, we shift after obtaining the absolute value. \
+ */ \
+ temp2 = temp >> (CHAR_BIT * sizeof(int) - 1); \
+ temp ^= temp2; \
+ temp -= temp2; /* temp is abs value of input */ \
+ temp >>= Al; /* apply the point transform */ \
+ if (temp != 0) { \
+ zerobits |= ((size_t)1U) << k; \
+ signbits |= ((size_t)(temp2 + 1)) << k; \
+ } \
+ absvalues[k] = (JCOEF)temp; /* save abs value for main pass */ \
+ if (temp == 1) \
+ EOB = k + koffset; /* EOB = index of last newly-nonzero coef */ \
+ } \
+}
+
+METHODDEF(int)
+encode_mcu_AC_refine_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, JCOEF *absvalues, size_t *bits)
+{
+ register int k, temp, temp2;
+ int EOB = 0;
+ size_t zerobits = 0U, signbits = 0U;
+ int Sl0 = Sl;
+
+#if SIZEOF_SIZE_T == 4
+ if (Sl0 > 32)
+ Sl0 = 32;
+#endif
+
+ COMPUTE_ABSVALUES_AC_REFINE(Sl0, 0);
+
+ bits[0] = zerobits;
+#if SIZEOF_SIZE_T == 8
+ bits[1] = signbits;
+#else
+ bits[2] = signbits;
+
+ zerobits = 0U;
+ signbits = 0U;
+
+ if (Sl > 32) {
+ Sl -= 32;
+ jpeg_natural_order_start += 32;
+ absvalues += 32;
+
+ COMPUTE_ABSVALUES_AC_REFINE(Sl, 32);
+ }
+
+ bits[1] = zerobits;
+ bits[3] = signbits;
+#endif
+
+ return EOB;
+}
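
The returned bitmaps are laid out as bits[0] = zerobits and bits[1] = signbits on 64-bit builds; on 32-bit builds, bits[0]/bits[1] hold zerobits for k = 0..31 and 32..63, and bits[2]/bits[3] hold signbits likewise. The (temp2 + 1) term stores 0 for a negative coefficient and 1 otherwise, so the refinement loop can emit signbits & 1 directly. A sketch of that sign convention (illustrative only):

    #include <assert.h>
    #include <limits.h>

    int main(void)
    {
      int coef = -9;
      int temp2 = coef >> (CHAR_BIT * sizeof(int) - 1);   /* -1: negative */
      assert(temp2 + 1 == 0);   /* negative coef -> sign bit 0 */

      coef = 9;
      temp2 = coef >> (CHAR_BIT * sizeof(int) - 1);       /* 0: nonnegative */
      assert(temp2 + 1 == 1);   /* nonnegative coef -> sign bit 1 */
      return 0;
    }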
+
+
+/*
* MCU encoding for AC successive approximation refinement scan.
*/
+#define ENCODE_COEFS_AC_REFINE(label) { \
+ while (zerobits) { \
+ idx = count_zeroes(&zerobits); \
+ r += idx; \
+ cabsvalue += idx; \
+ signbits >>= idx; \
+label \
+ /* Emit any required ZRLs, but not if they can be folded into EOB */ \
+ while (r > 15 && (cabsvalue <= EOBPTR)) { \
+ /* emit any pending EOBRUN and the BE correction bits */ \
+ emit_eobrun(entropy); \
+ /* Emit ZRL */ \
+ emit_symbol(entropy, entropy->ac_tbl_no, 0xF0); \
+ r -= 16; \
+ /* Emit buffered correction bits that must be associated with ZRL */ \
+ emit_buffered_bits(entropy, BR_buffer, BR); \
+ BR_buffer = entropy->bit_buffer; /* BE bits are gone now */ \
+ BR = 0; \
+ } \
+ \
+ temp = *cabsvalue++; \
+ \
+ /* If the coef was previously nonzero, it only needs a correction bit. \
+ * NOTE: a straight translation of the spec's figure G.7 would suggest \
+ * that we also need to test r > 15. But if r > 15, we can only get here \
+ * if k > EOB, which implies that this coefficient is not 1. \
+ */ \
+ if (temp > 1) { \
+ /* The correction bit is the next bit of the absolute value. */ \
+ BR_buffer[BR++] = (char)(temp & 1); \
+ signbits >>= 1; \
+ zerobits >>= 1; \
+ continue; \
+ } \
+ \
+ /* Emit any pending EOBRUN and the BE correction bits */ \
+ emit_eobrun(entropy); \
+ \
+ /* Count/emit Huffman symbol for run length / number of bits */ \
+ emit_symbol(entropy, entropy->ac_tbl_no, (r << 4) + 1); \
+ \
+ /* Emit output bit for newly-nonzero coef */ \
+ temp = signbits & 1; /* ((*block)[jpeg_natural_order_start[k]] < 0) ? 0 : 1 */ \
+ emit_bits(entropy, (unsigned int)temp, 1); \
+ \
+ /* Emit buffered correction bits that must be associated with this code */ \
+ emit_buffered_bits(entropy, BR_buffer, BR); \
+ BR_buffer = entropy->bit_buffer; /* BE bits are gone now */ \
+ BR = 0; \
+ r = 0; /* reset zero run length */ \
+ signbits >>= 1; \
+ zerobits >>= 1; \
+ } \
+}
+
METHODDEF(boolean)
-encode_mcu_AC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+encode_mcu_AC_refine(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
{
- phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
- register int temp;
- register int r, k;
- int EOB;
+ phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
+ register int temp, r, idx;
char *BR_buffer;
unsigned int BR;
- int Se = cinfo->Se;
+ int Sl = cinfo->Se - cinfo->Ss + 1;
int Al = cinfo->Al;
- JBLOCKROW block;
- int absvalues[DCTSIZE2];
+ JCOEF absvalues_unaligned[DCTSIZE2 + 15];
+ JCOEF *absvalues;
+ const JCOEF *cabsvalue, *EOBPTR;
+ size_t zerobits, signbits;
+ size_t bits[16 / SIZEOF_SIZE_T];
entropy->next_output_byte = cinfo->dest->next_output_byte;
entropy->free_in_buffer = cinfo->dest->free_in_buffer;
@@ -637,26 +945,17 @@ encode_mcu_AC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
if (entropy->restarts_to_go == 0)
emit_restart(entropy, entropy->next_restart_num);
- /* Encode the MCU data block */
- block = MCU_data[0];
+#ifdef WITH_SIMD
+ cabsvalue = absvalues = (JCOEF *)PAD((JUINTPTR)absvalues_unaligned, 16);
+#else
+ /* Not using SIMD, so alignment is not needed */
+ cabsvalue = absvalues = absvalues_unaligned;
+#endif
- /* It is convenient to make a pre-pass to determine the transformed
- * coefficients' absolute values and the EOB position.
- */
- EOB = 0;
- for (k = cinfo->Ss; k <= Se; k++) {
- temp = (*block)[jpeg_natural_order[k]];
- /* We must apply the point transform by Al. For AC coefficients this
- * is an integer division with rounding towards 0. To do this portably
- * in C, we shift after obtaining the absolute value.
- */
- if (temp < 0)
- temp = -temp; /* temp is abs value of input */
- temp >>= Al; /* apply the point transform */
- absvalues[k] = temp; /* save abs value for main pass */
- if (temp == 1)
- EOB = k; /* EOB = index of last newly-nonzero coef */
- }
+ /* Prepare data */
+ EOBPTR = absvalues +
+ entropy->AC_refine_prepare(MCU_data[0][0], jpeg_natural_order + cinfo->Ss,
+ Sl, Al, absvalues, bits);
/* Encode the AC coefficients per section G.1.2.3, fig. G.7 */
@@ -664,52 +963,32 @@ encode_mcu_AC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
BR = 0; /* BR = count of buffered bits added now */
BR_buffer = entropy->bit_buffer + entropy->BE; /* Append bits to buffer */
- for (k = cinfo->Ss; k <= Se; k++) {
- if ((temp = absvalues[k]) == 0) {
- r++;
- continue;
- }
-
- /* Emit any required ZRLs, but not if they can be folded into EOB */
- while (r > 15 && k <= EOB) {
- /* emit any pending EOBRUN and the BE correction bits */
- emit_eobrun(entropy);
- /* Emit ZRL */
- emit_symbol(entropy, entropy->ac_tbl_no, 0xF0);
- r -= 16;
- /* Emit buffered correction bits that must be associated with ZRL */
- emit_buffered_bits(entropy, BR_buffer, BR);
- BR_buffer = entropy->bit_buffer; /* BE bits are gone now */
- BR = 0;
- }
-
- /* If the coef was previously nonzero, it only needs a correction bit.
- * NOTE: a straight translation of the spec's figure G.7 would suggest
- * that we also need to test r > 15. But if r > 15, we can only get here
- * if k > EOB, which implies that this coefficient is not 1.
- */
- if (temp > 1) {
- /* The correction bit is the next bit of the absolute value. */
- BR_buffer[BR++] = (char) (temp & 1);
- continue;
- }
-
- /* Emit any pending EOBRUN and the BE correction bits */
- emit_eobrun(entropy);
-
- /* Count/emit Huffman symbol for run length / number of bits */
- emit_symbol(entropy, entropy->ac_tbl_no, (r << 4) + 1);
+ zerobits = bits[0];
+#if SIZEOF_SIZE_T == 8
+ signbits = bits[1];
+#else
+ signbits = bits[2];
+#endif
+ ENCODE_COEFS_AC_REFINE((void)0;);
+
+#if SIZEOF_SIZE_T == 4
+ zerobits = bits[1];
+ signbits = bits[3];
+
+ if (zerobits) {
+ int diff = ((absvalues + DCTSIZE2 / 2) - cabsvalue);
+ idx = count_zeroes(&zerobits);
+ signbits >>= idx;
+ idx += diff;
+ r += idx;
+ cabsvalue += idx;
+ goto first_iter_ac_refine;
+ }
- /* Emit output bit for newly-nonzero coef */
- temp = ((*block)[jpeg_natural_order[k]] < 0) ? 0 : 1;
- emit_bits(entropy, (unsigned int) temp, 1);
+ ENCODE_COEFS_AC_REFINE(first_iter_ac_refine:);
+#endif
- /* Emit buffered correction bits that must be associated with this code */
- emit_buffered_bits(entropy, BR_buffer, BR);
- BR_buffer = entropy->bit_buffer; /* BE bits are gone now */
- BR = 0;
- r = 0; /* reset zero run length */
- }
+ r |= (int)((absvalues + Sl) - cabsvalue);
if (r > 0 || BR > 0) { /* If there are trailing zeroes, */
entropy->EOBRUN++; /* count an EOB */
@@ -718,7 +997,8 @@ encode_mcu_AC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
* 1. overflow of the EOB counter;
* 2. overflow of the correction bit buffer during the next MCU.
*/
- if (entropy->EOBRUN == 0x7FFF || entropy->BE > (MAX_CORR_BITS-DCTSIZE2+1))
+ if (entropy->EOBRUN == 0x7FFF ||
+ entropy->BE > (MAX_CORR_BITS - DCTSIZE2 + 1))
emit_eobrun(entropy);
}
@@ -744,9 +1024,9 @@ encode_mcu_AC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
*/
METHODDEF(void)
-finish_pass_phuff (j_compress_ptr cinfo)
+finish_pass_phuff(j_compress_ptr cinfo)
{
- phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
+ phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
entropy->next_output_byte = cinfo->dest->next_output_byte;
entropy->free_in_buffer = cinfo->dest->free_in_buffer;
@@ -765,9 +1045,9 @@ finish_pass_phuff (j_compress_ptr cinfo)
*/
METHODDEF(void)
-finish_pass_gather_phuff (j_compress_ptr cinfo)
+finish_pass_gather_phuff(j_compress_ptr cinfo)
{
- phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
+ phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
boolean is_DC_band;
int ci, tbl;
jpeg_component_info *compptr;
@@ -782,7 +1062,7 @@ finish_pass_gather_phuff (j_compress_ptr cinfo)
/* It's important not to apply jpeg_gen_optimal_table more than once
* per table, because it clobbers the input frequency counts!
*/
- MEMZERO(did, sizeof(did));
+ memset(did, 0, sizeof(did));
for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
compptr = cinfo->cur_comp_info[ci];
@@ -793,13 +1073,13 @@ finish_pass_gather_phuff (j_compress_ptr cinfo)
} else {
tbl = compptr->ac_tbl_no;
}
- if (! did[tbl]) {
+ if (!did[tbl]) {
if (is_DC_band)
- htblptr = & cinfo->dc_huff_tbl_ptrs[tbl];
+ htblptr = &cinfo->dc_huff_tbl_ptrs[tbl];
else
- htblptr = & cinfo->ac_huff_tbl_ptrs[tbl];
+ htblptr = &cinfo->ac_huff_tbl_ptrs[tbl];
if (*htblptr == NULL)
- *htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo);
+ *htblptr = jpeg_alloc_huff_table((j_common_ptr)cinfo);
jpeg_gen_optimal_table(cinfo, *htblptr, entropy->count_ptrs[tbl]);
did[tbl] = TRUE;
}
@@ -812,15 +1092,15 @@ finish_pass_gather_phuff (j_compress_ptr cinfo)
*/
GLOBAL(void)
-jinit_phuff_encoder (j_compress_ptr cinfo)
+jinit_phuff_encoder(j_compress_ptr cinfo)
{
phuff_entropy_ptr entropy;
int i;
entropy = (phuff_entropy_ptr)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
sizeof(phuff_entropy_encoder));
- cinfo->entropy = (struct jpeg_entropy_encoder *) entropy;
+ cinfo->entropy = (struct jpeg_entropy_encoder *)entropy;
entropy->pub.start_pass = start_pass_phuff;
/* Mark tables unallocated */
diff --git a/media/libjpeg/jcprepct.c b/media/libjpeg/jcprepct.c
index e72ebd87d2..f27cc34507 100644
--- a/media/libjpeg/jcprepct.c
+++ b/media/libjpeg/jcprepct.c
@@ -3,8 +3,8 @@
*
* This file is part of the Independent JPEG Group's software:
* Copyright (C) 1994-1996, Thomas G. Lane.
- * It was modified by The libjpeg-turbo Project to include only code relevant
- * to libjpeg-turbo.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2022, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@@ -78,9 +78,9 @@ typedef my_prep_controller *my_prep_ptr;
*/
METHODDEF(void)
-start_pass_prep (j_compress_ptr cinfo, J_BUF_MODE pass_mode)
+start_pass_prep(j_compress_ptr cinfo, J_BUF_MODE pass_mode)
{
- my_prep_ptr prep = (my_prep_ptr) cinfo->prep;
+ my_prep_ptr prep = (my_prep_ptr)cinfo->prep;
if (pass_mode != JBUF_PASS_THRU)
ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
@@ -106,14 +106,14 @@ start_pass_prep (j_compress_ptr cinfo, J_BUF_MODE pass_mode)
*/
LOCAL(void)
-expand_bottom_edge (JSAMPARRAY image_data, JDIMENSION num_cols,
- int input_rows, int output_rows)
+expand_bottom_edge(JSAMPARRAY image_data, JDIMENSION num_cols, int input_rows,
+ int output_rows)
{
register int row;
for (row = input_rows; row < output_rows; row++) {
- jcopy_sample_rows(image_data, input_rows-1, image_data, row,
- 1, num_cols);
+ jcopy_sample_rows(image_data, input_rows - 1, image_data, row, 1,
+ num_cols);
}
}
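
expand_bottom_edge() pads a partial iMCU by replicating the last real row, so downsampling and the DCT see well-defined samples below the image. A toy sketch (illustrative only):

    #include <stdio.h>

    int main(void)
    {
      int rows[4][2] = { { 1, 2 }, { 3, 4 }, { 0, 0 }, { 0, 0 } };
      int input_rows = 2, output_rows = 4, row, col;
      for (row = input_rows; row < output_rows; row++)
        for (col = 0; col < 2; col++)
          rows[row][col] = rows[input_rows - 1][col];   /* replicate last row */
      printf("%d %d\n", rows[3][0], rows[3][1]);        /* prints: 3 4 */
      return 0;
    }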
@@ -128,13 +128,12 @@ expand_bottom_edge (JSAMPARRAY image_data, JDIMENSION num_cols,
*/
METHODDEF(void)
-pre_process_data (j_compress_ptr cinfo,
- JSAMPARRAY input_buf, JDIMENSION *in_row_ctr,
- JDIMENSION in_rows_avail,
- JSAMPIMAGE output_buf, JDIMENSION *out_row_group_ctr,
- JDIMENSION out_row_groups_avail)
+pre_process_data(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JDIMENSION *in_row_ctr, JDIMENSION in_rows_avail,
+ JSAMPIMAGE output_buf, JDIMENSION *out_row_group_ctr,
+ JDIMENSION out_row_groups_avail)
{
- my_prep_ptr prep = (my_prep_ptr) cinfo->prep;
+ my_prep_ptr prep = (my_prep_ptr)cinfo->prep;
int numrows, ci;
JDIMENSION inrows;
jpeg_component_info *compptr;
@@ -144,10 +143,10 @@ pre_process_data (j_compress_ptr cinfo,
/* Do color conversion to fill the conversion buffer. */
inrows = in_rows_avail - *in_row_ctr;
numrows = cinfo->max_v_samp_factor - prep->next_buf_row;
- numrows = (int) MIN((JDIMENSION) numrows, inrows);
+ numrows = (int)MIN((JDIMENSION)numrows, inrows);
(*cinfo->cconvert->color_convert) (cinfo, input_buf + *in_row_ctr,
prep->color_buf,
- (JDIMENSION) prep->next_buf_row,
+ (JDIMENSION)prep->next_buf_row,
numrows);
*in_row_ctr += numrows;
prep->next_buf_row += numrows;
@@ -164,7 +163,7 @@ pre_process_data (j_compress_ptr cinfo,
/* If we've filled the conversion buffer, empty it. */
if (prep->next_buf_row == cinfo->max_v_samp_factor) {
(*cinfo->downsample->downsample) (cinfo,
- prep->color_buf, (JDIMENSION) 0,
+ prep->color_buf, (JDIMENSION)0,
output_buf, *out_row_group_ctr);
prep->next_buf_row = 0;
(*out_row_group_ctr)++;
@@ -172,14 +171,12 @@ pre_process_data (j_compress_ptr cinfo,
/* If at bottom of image, pad the output to a full iMCU height.
* Note we assume the caller is providing a one-iMCU-height output buffer!
*/
- if (prep->rows_to_go == 0 &&
- *out_row_group_ctr < out_row_groups_avail) {
+ if (prep->rows_to_go == 0 && *out_row_group_ctr < out_row_groups_avail) {
for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
ci++, compptr++) {
- expand_bottom_edge(output_buf[ci],
- compptr->width_in_blocks * DCTSIZE,
- (int) (*out_row_group_ctr * compptr->v_samp_factor),
- (int) (out_row_groups_avail * compptr->v_samp_factor));
+ expand_bottom_edge(output_buf[ci], compptr->width_in_blocks * DCTSIZE,
+ (int)(*out_row_group_ctr * compptr->v_samp_factor),
+ (int)(out_row_groups_avail * compptr->v_samp_factor));
}
*out_row_group_ctr = out_row_groups_avail;
break; /* can exit outer loop without test */
@@ -195,13 +192,12 @@ pre_process_data (j_compress_ptr cinfo,
*/
METHODDEF(void)
-pre_process_context (j_compress_ptr cinfo,
- JSAMPARRAY input_buf, JDIMENSION *in_row_ctr,
- JDIMENSION in_rows_avail,
- JSAMPIMAGE output_buf, JDIMENSION *out_row_group_ctr,
- JDIMENSION out_row_groups_avail)
+pre_process_context(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JDIMENSION *in_row_ctr, JDIMENSION in_rows_avail,
+ JSAMPIMAGE output_buf, JDIMENSION *out_row_group_ctr,
+ JDIMENSION out_row_groups_avail)
{
- my_prep_ptr prep = (my_prep_ptr) cinfo->prep;
+ my_prep_ptr prep = (my_prep_ptr)cinfo->prep;
int numrows, ci;
int buf_height = cinfo->max_v_samp_factor * 3;
JDIMENSION inrows;
@@ -211,19 +207,18 @@ pre_process_context (j_compress_ptr cinfo,
/* Do color conversion to fill the conversion buffer. */
inrows = in_rows_avail - *in_row_ctr;
numrows = prep->next_buf_stop - prep->next_buf_row;
- numrows = (int) MIN((JDIMENSION) numrows, inrows);
+ numrows = (int)MIN((JDIMENSION)numrows, inrows);
(*cinfo->cconvert->color_convert) (cinfo, input_buf + *in_row_ctr,
prep->color_buf,
- (JDIMENSION) prep->next_buf_row,
+ (JDIMENSION)prep->next_buf_row,
numrows);
/* Pad at top of image, if first time through */
if (prep->rows_to_go == cinfo->image_height) {
for (ci = 0; ci < cinfo->num_components; ci++) {
int row;
for (row = 1; row <= cinfo->max_v_samp_factor; row++) {
- jcopy_sample_rows(prep->color_buf[ci], 0,
- prep->color_buf[ci], -row,
- 1, cinfo->image_width);
+ jcopy_sample_rows(prep->color_buf[ci], 0, prep->color_buf[ci],
+ -row, 1, cinfo->image_width);
}
}
}
@@ -245,9 +240,8 @@ pre_process_context (j_compress_ptr cinfo,
}
/* If we've gotten enough data, downsample a row group. */
if (prep->next_buf_row == prep->next_buf_stop) {
- (*cinfo->downsample->downsample) (cinfo,
- prep->color_buf,
- (JDIMENSION) prep->this_row_group,
+ (*cinfo->downsample->downsample) (cinfo, prep->color_buf,
+ (JDIMENSION)prep->this_row_group,
output_buf, *out_row_group_ctr);
(*out_row_group_ctr)++;
/* Advance pointers with wraparound as necessary. */
@@ -267,9 +261,9 @@ pre_process_context (j_compress_ptr cinfo,
*/
LOCAL(void)
-create_context_buffer (j_compress_ptr cinfo)
+create_context_buffer(j_compress_ptr cinfo)
{
- my_prep_ptr prep = (my_prep_ptr) cinfo->prep;
+ my_prep_ptr prep = (my_prep_ptr)cinfo->prep;
int rgroup_height = cinfo->max_v_samp_factor;
int ci, i;
jpeg_component_info *compptr;
@@ -279,7 +273,7 @@ create_context_buffer (j_compress_ptr cinfo)
* we need five row groups' worth of pointers for each component.
*/
fake_buffer = (JSAMPARRAY)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
(cinfo->num_components * 5 * rgroup_height) *
sizeof(JSAMPROW));
@@ -290,13 +284,13 @@ create_context_buffer (j_compress_ptr cinfo)
* horizontally within the buffer, if it so chooses.
*/
true_buffer = (*cinfo->mem->alloc_sarray)
- ((j_common_ptr) cinfo, JPOOL_IMAGE,
- (JDIMENSION) (((long) compptr->width_in_blocks * DCTSIZE *
- cinfo->max_h_samp_factor) / compptr->h_samp_factor),
- (JDIMENSION) (3 * rgroup_height));
+ ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ (JDIMENSION)(((long)compptr->width_in_blocks * DCTSIZE *
+ cinfo->max_h_samp_factor) / compptr->h_samp_factor),
+ (JDIMENSION)(3 * rgroup_height));
/* Copy true buffer row pointers into the middle of the fake row array */
- MEMCOPY(fake_buffer + rgroup_height, true_buffer,
- 3 * rgroup_height * sizeof(JSAMPROW));
+ memcpy(fake_buffer + rgroup_height, true_buffer,
+ 3 * rgroup_height * sizeof(JSAMPROW));
/* Fill in the above and below wraparound pointers */
for (i = 0; i < rgroup_height; i++) {
fake_buffer[i] = true_buffer[2 * rgroup_height + i];
@@ -315,7 +309,7 @@ create_context_buffer (j_compress_ptr cinfo)
*/
GLOBAL(void)
-jinit_c_prep_controller (j_compress_ptr cinfo, boolean need_full_buffer)
+jinit_c_prep_controller(j_compress_ptr cinfo, boolean need_full_buffer)
{
my_prep_ptr prep;
int ci;
@@ -325,9 +319,9 @@ jinit_c_prep_controller (j_compress_ptr cinfo, boolean need_full_buffer)
ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
prep = (my_prep_ptr)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
sizeof(my_prep_controller));
- cinfo->prep = (struct jpeg_c_prep_controller *) prep;
+ cinfo->prep = (struct jpeg_c_prep_controller *)prep;
prep->pub.start_pass = start_pass_prep;
/* Allocate the color conversion buffer.
@@ -348,10 +342,10 @@ jinit_c_prep_controller (j_compress_ptr cinfo, boolean need_full_buffer)
for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
ci++, compptr++) {
prep->color_buf[ci] = (*cinfo->mem->alloc_sarray)
- ((j_common_ptr) cinfo, JPOOL_IMAGE,
- (JDIMENSION) (((long) compptr->width_in_blocks * DCTSIZE *
- cinfo->max_h_samp_factor) / compptr->h_samp_factor),
- (JDIMENSION) cinfo->max_v_samp_factor);
+ ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ (JDIMENSION)(((long)compptr->width_in_blocks * DCTSIZE *
+ cinfo->max_h_samp_factor) / compptr->h_samp_factor),
+ (JDIMENSION)cinfo->max_v_samp_factor);
}
}
}
diff --git a/media/libjpeg/jcsample.c b/media/libjpeg/jcsample.c
index c4b4991487..e8515ebf0f 100644
--- a/media/libjpeg/jcsample.c
+++ b/media/libjpeg/jcsample.c
@@ -6,7 +6,7 @@
* libjpeg-turbo Modifications:
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
* Copyright (C) 2014, MIPS Technologies, Inc., California.
- * Copyright (C) 2015, D. R. Commander.
+ * Copyright (C) 2015, 2019, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@@ -79,7 +79,7 @@ typedef my_downsampler *my_downsample_ptr;
*/
METHODDEF(void)
-start_pass_downsample (j_compress_ptr cinfo)
+start_pass_downsample(j_compress_ptr cinfo)
{
/* no work for now */
}
@@ -91,19 +91,19 @@ start_pass_downsample (j_compress_ptr cinfo)
*/
LOCAL(void)
-expand_right_edge (JSAMPARRAY image_data, int num_rows,
- JDIMENSION input_cols, JDIMENSION output_cols)
+expand_right_edge(JSAMPARRAY image_data, int num_rows, JDIMENSION input_cols,
+ JDIMENSION output_cols)
{
register JSAMPROW ptr;
register JSAMPLE pixval;
register int count;
int row;
- int numcols = (int) (output_cols - input_cols);
+ int numcols = (int)(output_cols - input_cols);
if (numcols > 0) {
for (row = 0; row < num_rows; row++) {
ptr = image_data[row] + input_cols;
- pixval = ptr[-1]; /* don't need GETJSAMPLE() here */
+ pixval = ptr[-1];
for (count = numcols; count > 0; count--)
*ptr++ = pixval;
}
@@ -118,11 +118,11 @@ expand_right_edge (JSAMPARRAY image_data, int num_rows,
*/
METHODDEF(void)
-sep_downsample (j_compress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION in_row_index,
- JSAMPIMAGE output_buf, JDIMENSION out_row_group_index)
+sep_downsample(j_compress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_index, JSAMPIMAGE output_buf,
+ JDIMENSION out_row_group_index)
{
- my_downsample_ptr downsample = (my_downsample_ptr) cinfo->downsample;
+ my_downsample_ptr downsample = (my_downsample_ptr)cinfo->downsample;
int ci;
jpeg_component_info *compptr;
JSAMPARRAY in_ptr, out_ptr;
@@ -144,8 +144,8 @@ sep_downsample (j_compress_ptr cinfo,
*/
METHODDEF(void)
-int_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY output_data)
+int_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
{
int inrow, outrow, h_expand, v_expand, numpix, numpix2, h, v;
JDIMENSION outcol, outcol_h; /* outcol_h == outcol*h_expand */
@@ -156,14 +156,14 @@ int_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
h_expand = cinfo->max_h_samp_factor / compptr->h_samp_factor;
v_expand = cinfo->max_v_samp_factor / compptr->v_samp_factor;
numpix = h_expand * v_expand;
- numpix2 = numpix/2;
+ numpix2 = numpix / 2;
/* Expand input data enough to let all the output samples be generated
* by the standard loop. Special-casing padded output would be more
* efficient.
*/
- expand_right_edge(input_data, cinfo->max_v_samp_factor,
- cinfo->image_width, output_cols * h_expand);
+ expand_right_edge(input_data, cinfo->max_v_samp_factor, cinfo->image_width,
+ output_cols * h_expand);
inrow = 0;
for (outrow = 0; outrow < compptr->v_samp_factor; outrow++) {
@@ -172,12 +172,12 @@ int_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
outcol++, outcol_h += h_expand) {
outvalue = 0;
for (v = 0; v < v_expand; v++) {
- inptr = input_data[inrow+v] + outcol_h;
+ inptr = input_data[inrow + v] + outcol_h;
for (h = 0; h < h_expand; h++) {
- outvalue += (JLONG) GETJSAMPLE(*inptr++);
+ outvalue += (JLONG)(*inptr++);
}
}
- *outptr++ = (JSAMPLE) ((outvalue + numpix2) / numpix);
+ *outptr++ = (JSAMPLE)((outvalue + numpix2) / numpix);
}
inrow += v_expand;
}
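
Adding numpix2 = numpix / 2 before the division rounds the averaged pixel to nearest rather than truncating. A sketch (illustrative only):

    #include <assert.h>

    int main(void)
    {
      int numpix = 4, numpix2 = numpix / 2;   /* e.g. 2x2 -> 1 downsampling */
      assert((9 + numpix2) / numpix == 2);    /* average 2.25 rounds to 2 */
      assert((11 + numpix2) / numpix == 3);   /* average 2.75 rounds to 3 */
      return 0;
    }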
@@ -191,15 +191,15 @@ int_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
*/
METHODDEF(void)
-fullsize_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY output_data)
+fullsize_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
{
/* Copy the data */
- jcopy_sample_rows(input_data, 0, output_data, 0,
- cinfo->max_v_samp_factor, cinfo->image_width);
+ jcopy_sample_rows(input_data, 0, output_data, 0, cinfo->max_v_samp_factor,
+ cinfo->image_width);
/* Edge-expand */
- expand_right_edge(output_data, cinfo->max_v_samp_factor,
- cinfo->image_width, compptr->width_in_blocks * DCTSIZE);
+ expand_right_edge(output_data, cinfo->max_v_samp_factor, cinfo->image_width,
+ compptr->width_in_blocks * DCTSIZE);
}
@@ -216,8 +216,8 @@ fullsize_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
*/
METHODDEF(void)
-h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY output_data)
+h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
{
int outrow;
JDIMENSION outcol;
@@ -229,16 +229,15 @@ h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
* by the standard loop. Special-casing padded output would be more
* efficient.
*/
- expand_right_edge(input_data, cinfo->max_v_samp_factor,
- cinfo->image_width, output_cols * 2);
+ expand_right_edge(input_data, cinfo->max_v_samp_factor, cinfo->image_width,
+ output_cols * 2);
for (outrow = 0; outrow < compptr->v_samp_factor; outrow++) {
outptr = output_data[outrow];
inptr = input_data[outrow];
bias = 0; /* bias = 0,1,0,1,... for successive samples */
for (outcol = 0; outcol < output_cols; outcol++) {
- *outptr++ = (JSAMPLE) ((GETJSAMPLE(*inptr) + GETJSAMPLE(inptr[1])
- + bias) >> 1);
+ *outptr++ = (JSAMPLE)((inptr[0] + inptr[1] + bias) >> 1);
bias ^= 1; /* 0=>1, 1=>0 */
inptr += 2;
}
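
The alternating bias makes (a + b + bias) >> 1 round down and up on alternate columns, so the half-sample rounding error cancels on average instead of accumulating across the row. A sketch (illustrative only):

    #include <stdio.h>

    int main(void)
    {
      int a = 10, b = 11;   /* true average 10.5 */
      int bias = 0;
      printf("%d\n", (a + b + bias) >> 1);   /* 10: rounds down */
      bias ^= 1;
      printf("%d\n", (a + b + bias) >> 1);   /* 11: rounds up */
      return 0;
    }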
@@ -253,8 +252,8 @@ h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
*/
METHODDEF(void)
-h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY output_data)
+h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
{
int inrow, outrow;
JDIMENSION outcol;
@@ -266,21 +265,20 @@ h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
* by the standard loop. Special-casing padded output would be more
* efficient.
*/
- expand_right_edge(input_data, cinfo->max_v_samp_factor,
- cinfo->image_width, output_cols * 2);
+ expand_right_edge(input_data, cinfo->max_v_samp_factor, cinfo->image_width,
+ output_cols * 2);
inrow = 0;
for (outrow = 0; outrow < compptr->v_samp_factor; outrow++) {
outptr = output_data[outrow];
inptr0 = input_data[inrow];
- inptr1 = input_data[inrow+1];
+ inptr1 = input_data[inrow + 1];
bias = 1; /* bias = 1,2,1,2,... for successive samples */
for (outcol = 0; outcol < output_cols; outcol++) {
- *outptr++ = (JSAMPLE) ((GETJSAMPLE(*inptr0) + GETJSAMPLE(inptr0[1]) +
- GETJSAMPLE(*inptr1) + GETJSAMPLE(inptr1[1])
- + bias) >> 2);
+ *outptr++ =
+ (JSAMPLE)((inptr0[0] + inptr0[1] + inptr1[0] + inptr1[1] + bias) >> 2);
bias ^= 3; /* 1=>2, 2=>1 */
- inptr0 += 2; inptr1 += 2;
+ inptr0 += 2; inptr1 += 2;
}
inrow += 2;
}
@@ -296,8 +294,8 @@ h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
*/
METHODDEF(void)
-h2v2_smooth_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY output_data)
+h2v2_smooth_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
{
int inrow, outrow;
JDIMENSION colctr;
@@ -332,57 +330,45 @@ h2v2_smooth_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
for (outrow = 0; outrow < compptr->v_samp_factor; outrow++) {
outptr = output_data[outrow];
inptr0 = input_data[inrow];
- inptr1 = input_data[inrow+1];
- above_ptr = input_data[inrow-1];
- below_ptr = input_data[inrow+2];
+ inptr1 = input_data[inrow + 1];
+ above_ptr = input_data[inrow - 1];
+ below_ptr = input_data[inrow + 2];
/* Special case for first column: pretend column -1 is same as column 0 */
- membersum = GETJSAMPLE(*inptr0) + GETJSAMPLE(inptr0[1]) +
- GETJSAMPLE(*inptr1) + GETJSAMPLE(inptr1[1]);
- neighsum = GETJSAMPLE(*above_ptr) + GETJSAMPLE(above_ptr[1]) +
- GETJSAMPLE(*below_ptr) + GETJSAMPLE(below_ptr[1]) +
- GETJSAMPLE(*inptr0) + GETJSAMPLE(inptr0[2]) +
- GETJSAMPLE(*inptr1) + GETJSAMPLE(inptr1[2]);
+ membersum = inptr0[0] + inptr0[1] + inptr1[0] + inptr1[1];
+ neighsum = above_ptr[0] + above_ptr[1] + below_ptr[0] + below_ptr[1] +
+ inptr0[0] + inptr0[2] + inptr1[0] + inptr1[2];
neighsum += neighsum;
- neighsum += GETJSAMPLE(*above_ptr) + GETJSAMPLE(above_ptr[2]) +
- GETJSAMPLE(*below_ptr) + GETJSAMPLE(below_ptr[2]);
+ neighsum += above_ptr[0] + above_ptr[2] + below_ptr[0] + below_ptr[2];
membersum = membersum * memberscale + neighsum * neighscale;
- *outptr++ = (JSAMPLE) ((membersum + 32768) >> 16);
- inptr0 += 2; inptr1 += 2; above_ptr += 2; below_ptr += 2;
+ *outptr++ = (JSAMPLE)((membersum + 32768) >> 16);
+ inptr0 += 2; inptr1 += 2; above_ptr += 2; below_ptr += 2;
for (colctr = output_cols - 2; colctr > 0; colctr--) {
/* sum of pixels directly mapped to this output element */
- membersum = GETJSAMPLE(*inptr0) + GETJSAMPLE(inptr0[1]) +
- GETJSAMPLE(*inptr1) + GETJSAMPLE(inptr1[1]);
+ membersum = inptr0[0] + inptr0[1] + inptr1[0] + inptr1[1];
/* sum of edge-neighbor pixels */
- neighsum = GETJSAMPLE(*above_ptr) + GETJSAMPLE(above_ptr[1]) +
- GETJSAMPLE(*below_ptr) + GETJSAMPLE(below_ptr[1]) +
- GETJSAMPLE(inptr0[-1]) + GETJSAMPLE(inptr0[2]) +
- GETJSAMPLE(inptr1[-1]) + GETJSAMPLE(inptr1[2]);
+ neighsum = above_ptr[0] + above_ptr[1] + below_ptr[0] + below_ptr[1] +
+ inptr0[-1] + inptr0[2] + inptr1[-1] + inptr1[2];
/* The edge-neighbors count twice as much as corner-neighbors */
neighsum += neighsum;
/* Add in the corner-neighbors */
- neighsum += GETJSAMPLE(above_ptr[-1]) + GETJSAMPLE(above_ptr[2]) +
- GETJSAMPLE(below_ptr[-1]) + GETJSAMPLE(below_ptr[2]);
+ neighsum += above_ptr[-1] + above_ptr[2] + below_ptr[-1] + below_ptr[2];
/* form final output scaled up by 2^16 */
membersum = membersum * memberscale + neighsum * neighscale;
/* round, descale and output it */
- *outptr++ = (JSAMPLE) ((membersum + 32768) >> 16);
- inptr0 += 2; inptr1 += 2; above_ptr += 2; below_ptr += 2;
+ *outptr++ = (JSAMPLE)((membersum + 32768) >> 16);
+ inptr0 += 2; inptr1 += 2; above_ptr += 2; below_ptr += 2;
}
/* Special case for last column */
- membersum = GETJSAMPLE(*inptr0) + GETJSAMPLE(inptr0[1]) +
- GETJSAMPLE(*inptr1) + GETJSAMPLE(inptr1[1]);
- neighsum = GETJSAMPLE(*above_ptr) + GETJSAMPLE(above_ptr[1]) +
- GETJSAMPLE(*below_ptr) + GETJSAMPLE(below_ptr[1]) +
- GETJSAMPLE(inptr0[-1]) + GETJSAMPLE(inptr0[1]) +
- GETJSAMPLE(inptr1[-1]) + GETJSAMPLE(inptr1[1]);
+ membersum = inptr0[0] + inptr0[1] + inptr1[0] + inptr1[1];
+ neighsum = above_ptr[0] + above_ptr[1] + below_ptr[0] + below_ptr[1] +
+ inptr0[-1] + inptr0[1] + inptr1[-1] + inptr1[1];
neighsum += neighsum;
- neighsum += GETJSAMPLE(above_ptr[-1]) + GETJSAMPLE(above_ptr[1]) +
- GETJSAMPLE(below_ptr[-1]) + GETJSAMPLE(below_ptr[1]);
+ neighsum += above_ptr[-1] + above_ptr[1] + below_ptr[-1] + below_ptr[1];
membersum = membersum * memberscale + neighsum * neighscale;
- *outptr = (JSAMPLE) ((membersum + 32768) >> 16);
+ *outptr = (JSAMPLE)((membersum + 32768) >> 16);
inrow += 2;
}
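
The smoothing filter works in 16.16 fixed point. Assuming the scale factors set earlier in this file (memberscale = 16384 - smoothing_factor * 80 and neighscale = smoothing_factor * 16, not shown in this hunk), the four member pixels plus the doubled edge neighbors and four corners give weights summing to exactly 65536, so adding 32768 and shifting right by 16 rounds the weighted average to nearest. A sketch under those assumptions (illustrative only):

    #include <assert.h>

    int main(void)
    {
      int SF = 50;   /* cinfo->smoothing_factor, 0..100 */
      long memberscale = 16384 - SF * 80;
      long neighscale = SF * 16;
      /* Flat region of value v: membersum = 4*v; neighsum counts the 8 edge
       * neighbors twice plus the 4 corners, i.e. 20*v in total. */
      int v = 100;
      long sum = 4L * v * memberscale + 20L * v * neighscale;
      assert(((sum + 32768) >> 16) == v);   /* total weight is exactly 65536 */
      return 0;
    }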
@@ -396,8 +382,8 @@ h2v2_smooth_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
*/
METHODDEF(void)
-fullsize_smooth_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY output_data)
+fullsize_smooth_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
{
int outrow;
JDIMENSION colctr;
@@ -425,36 +411,33 @@ fullsize_smooth_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
for (outrow = 0; outrow < compptr->v_samp_factor; outrow++) {
outptr = output_data[outrow];
inptr = input_data[outrow];
- above_ptr = input_data[outrow-1];
- below_ptr = input_data[outrow+1];
+ above_ptr = input_data[outrow - 1];
+ below_ptr = input_data[outrow + 1];
/* Special case for first column */
- colsum = GETJSAMPLE(*above_ptr++) + GETJSAMPLE(*below_ptr++) +
- GETJSAMPLE(*inptr);
- membersum = GETJSAMPLE(*inptr++);
- nextcolsum = GETJSAMPLE(*above_ptr) + GETJSAMPLE(*below_ptr) +
- GETJSAMPLE(*inptr);
+ colsum = (*above_ptr++) + (*below_ptr++) + inptr[0];
+ membersum = *inptr++;
+ nextcolsum = above_ptr[0] + below_ptr[0] + inptr[0];
neighsum = colsum + (colsum - membersum) + nextcolsum;
membersum = membersum * memberscale + neighsum * neighscale;
- *outptr++ = (JSAMPLE) ((membersum + 32768) >> 16);
- lastcolsum = colsum; colsum = nextcolsum;
+ *outptr++ = (JSAMPLE)((membersum + 32768) >> 16);
+ lastcolsum = colsum; colsum = nextcolsum;
for (colctr = output_cols - 2; colctr > 0; colctr--) {
- membersum = GETJSAMPLE(*inptr++);
- above_ptr++; below_ptr++;
- nextcolsum = GETJSAMPLE(*above_ptr) + GETJSAMPLE(*below_ptr) +
- GETJSAMPLE(*inptr);
+ membersum = *inptr++;
+ above_ptr++; below_ptr++;
+ nextcolsum = above_ptr[0] + below_ptr[0] + inptr[0];
neighsum = lastcolsum + (colsum - membersum) + nextcolsum;
membersum = membersum * memberscale + neighsum * neighscale;
- *outptr++ = (JSAMPLE) ((membersum + 32768) >> 16);
- lastcolsum = colsum; colsum = nextcolsum;
+ *outptr++ = (JSAMPLE)((membersum + 32768) >> 16);
+ lastcolsum = colsum; colsum = nextcolsum;
}
/* Special case for last column */
- membersum = GETJSAMPLE(*inptr);
+ membersum = *inptr;
neighsum = lastcolsum + (colsum - membersum) + colsum;
membersum = membersum * memberscale + neighsum * neighscale;
- *outptr = (JSAMPLE) ((membersum + 32768) >> 16);
+ *outptr = (JSAMPLE)((membersum + 32768) >> 16);
}
}
@@ -468,7 +451,7 @@ fullsize_smooth_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
*/
GLOBAL(void)
-jinit_downsampler (j_compress_ptr cinfo)
+jinit_downsampler(j_compress_ptr cinfo)
{
my_downsample_ptr downsample;
int ci;
@@ -476,9 +459,9 @@ jinit_downsampler (j_compress_ptr cinfo)
boolean smoothok = TRUE;
downsample = (my_downsample_ptr)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
sizeof(my_downsampler));
- cinfo->downsample = (struct jpeg_downsampler *) downsample;
+ cinfo->downsample = (struct jpeg_downsampler *)downsample;
downsample->pub.start_pass = start_pass_downsample;
downsample->pub.downsample = sep_downsample;
downsample->pub.need_context_rows = FALSE;
diff --git a/media/libjpeg/jctrans.c b/media/libjpeg/jctrans.c
index 6f16b052cf..e121028ec7 100644
--- a/media/libjpeg/jctrans.c
+++ b/media/libjpeg/jctrans.c
@@ -4,8 +4,8 @@
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1995-1998, Thomas G. Lane.
* Modified 2000-2009 by Guido Vollbeding.
- * It was modified by The libjpeg-turbo Project to include only code relevant
- * to libjpeg-turbo.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2020, 2022, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@@ -17,13 +17,14 @@
#define JPEG_INTERNALS
#include "jinclude.h"
#include "jpeglib.h"
+#include "jpegcomp.h"
/* Forward declarations */
-LOCAL(void) transencode_master_selection
- (j_compress_ptr cinfo, jvirt_barray_ptr *coef_arrays);
-LOCAL(void) transencode_coef_controller
- (j_compress_ptr cinfo, jvirt_barray_ptr *coef_arrays);
+LOCAL(void) transencode_master_selection(j_compress_ptr cinfo,
+ jvirt_barray_ptr *coef_arrays);
+LOCAL(void) transencode_coef_controller(j_compress_ptr cinfo,
+ jvirt_barray_ptr *coef_arrays);
/*
@@ -39,14 +40,14 @@ LOCAL(void) transencode_coef_controller
*/
GLOBAL(void)
-jpeg_write_coefficients (j_compress_ptr cinfo, jvirt_barray_ptr *coef_arrays)
+jpeg_write_coefficients(j_compress_ptr cinfo, jvirt_barray_ptr *coef_arrays)
{
if (cinfo->global_state != CSTATE_START)
ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
/* Mark all tables to be written */
jpeg_suppress_tables(cinfo, FALSE);
/* (Re)initialize error mgr and destination modules */
- (*cinfo->err->reset_error_mgr) ((j_common_ptr) cinfo);
+ (*cinfo->err->reset_error_mgr) ((j_common_ptr)cinfo);
(*cinfo->dest->init_destination) (cinfo);
/* Perform master selection of active modules */
transencode_master_selection(cinfo, coef_arrays);
@@ -64,8 +65,7 @@ jpeg_write_coefficients (j_compress_ptr cinfo, jvirt_barray_ptr *coef_arrays)
*/
GLOBAL(void)
-jpeg_copy_critical_parameters (j_decompress_ptr srcinfo,
- j_compress_ptr dstinfo)
+jpeg_copy_critical_parameters(j_decompress_ptr srcinfo, j_compress_ptr dstinfo)
{
JQUANT_TBL **qtblptr;
jpeg_component_info *incomp, *outcomp;
@@ -97,12 +97,11 @@ jpeg_copy_critical_parameters (j_decompress_ptr srcinfo,
/* Copy the source's quantization tables. */
for (tblno = 0; tblno < NUM_QUANT_TBLS; tblno++) {
if (srcinfo->quant_tbl_ptrs[tblno] != NULL) {
- qtblptr = & dstinfo->quant_tbl_ptrs[tblno];
+ qtblptr = &dstinfo->quant_tbl_ptrs[tblno];
if (*qtblptr == NULL)
- *qtblptr = jpeg_alloc_quant_table((j_common_ptr) dstinfo);
- MEMCOPY((*qtblptr)->quantval,
- srcinfo->quant_tbl_ptrs[tblno]->quantval,
- sizeof((*qtblptr)->quantval));
+ *qtblptr = jpeg_alloc_quant_table((j_common_ptr)dstinfo);
+ memcpy((*qtblptr)->quantval, srcinfo->quant_tbl_ptrs[tblno]->quantval,
+ sizeof((*qtblptr)->quantval));
(*qtblptr)->sent_table = FALSE;
}
}
@@ -165,8 +164,8 @@ jpeg_copy_critical_parameters (j_decompress_ptr srcinfo,
*/
LOCAL(void)
-transencode_master_selection (j_compress_ptr cinfo,
- jvirt_barray_ptr *coef_arrays)
+transencode_master_selection(j_compress_ptr cinfo,
+ jvirt_barray_ptr *coef_arrays)
{
/* Although we don't actually use input_components for transcoding,
* jcmaster.c's initial_setup will complain if input_components is 0.
@@ -199,7 +198,7 @@ transencode_master_selection (j_compress_ptr cinfo,
jinit_marker_writer(cinfo);
/* We can now tell the memory manager to allocate virtual arrays. */
- (*cinfo->mem->realize_virt_arrays) ((j_common_ptr) cinfo);
+ (*cinfo->mem->realize_virt_arrays) ((j_common_ptr)cinfo);
/* Write the datastream header (SOI, JFIF) immediately.
* Frame and scan headers are postponed till later.
@@ -238,10 +237,10 @@ typedef my_coef_controller *my_coef_ptr;
LOCAL(void)
-start_iMCU_row (j_compress_ptr cinfo)
+start_iMCU_row(j_compress_ptr cinfo)
/* Reset within-iMCU-row counters for a new row */
{
- my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
+ my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
/* In an interleaved scan, an MCU row is the same as an iMCU row.
* In a noninterleaved scan, an iMCU row has v_samp_factor MCU rows.
@@ -250,7 +249,7 @@ start_iMCU_row (j_compress_ptr cinfo)
if (cinfo->comps_in_scan > 1) {
coef->MCU_rows_per_iMCU_row = 1;
} else {
- if (coef->iMCU_row_num < (cinfo->total_iMCU_rows-1))
+ if (coef->iMCU_row_num < (cinfo->total_iMCU_rows - 1))
coef->MCU_rows_per_iMCU_row = cinfo->cur_comp_info[0]->v_samp_factor;
else
coef->MCU_rows_per_iMCU_row = cinfo->cur_comp_info[0]->last_row_height;
@@ -266,9 +265,9 @@ start_iMCU_row (j_compress_ptr cinfo)
*/
METHODDEF(void)
-start_pass_coef (j_compress_ptr cinfo, J_BUF_MODE pass_mode)
+start_pass_coef(j_compress_ptr cinfo, J_BUF_MODE pass_mode)
{
- my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
+ my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
if (pass_mode != JBUF_CRANK_DEST)
ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
@@ -289,9 +288,9 @@ start_pass_coef (j_compress_ptr cinfo, J_BUF_MODE pass_mode)
*/
METHODDEF(boolean)
-compress_output (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
+compress_output(j_compress_ptr cinfo, JSAMPIMAGE input_buf)
{
- my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
+ my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
JDIMENSION MCU_col_num; /* index of current MCU within row */
JDIMENSION last_MCU_col = cinfo->MCUs_per_row - 1;
JDIMENSION last_iMCU_row = cinfo->total_iMCU_rows - 1;
@@ -306,9 +305,9 @@ compress_output (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
compptr = cinfo->cur_comp_info[ci];
buffer[ci] = (*cinfo->mem->access_virt_barray)
- ((j_common_ptr) cinfo, coef->whole_image[compptr->component_index],
+ ((j_common_ptr)cinfo, coef->whole_image[compptr->component_index],
coef->iMCU_row_num * compptr->v_samp_factor,
- (JDIMENSION) compptr->v_samp_factor, FALSE);
+ (JDIMENSION)compptr->v_samp_factor, FALSE);
}
/* Loop to process one whole iMCU row */
@@ -321,13 +320,13 @@ compress_output (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
compptr = cinfo->cur_comp_info[ci];
start_col = MCU_col_num * compptr->MCU_width;
- blockcnt = (MCU_col_num < last_MCU_col) ? compptr->MCU_width
- : compptr->last_col_width;
+ blockcnt = (MCU_col_num < last_MCU_col) ? compptr->MCU_width :
+ compptr->last_col_width;
for (yindex = 0; yindex < compptr->MCU_height; yindex++) {
if (coef->iMCU_row_num < last_iMCU_row ||
- yindex+yoffset < compptr->last_row_height) {
+ yindex + yoffset < compptr->last_row_height) {
/* Fill in pointers to real blocks in this row */
- buffer_ptr = buffer[ci][yindex+yoffset] + start_col;
+ buffer_ptr = buffer[ci][yindex + yoffset] + start_col;
for (xindex = 0; xindex < blockcnt; xindex++)
MCU_buffer[blkn++] = buffer_ptr++;
} else {
@@ -342,13 +341,13 @@ compress_output (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
*/
for (; xindex < compptr->MCU_width; xindex++) {
MCU_buffer[blkn] = coef->dummy_buffer[blkn];
- MCU_buffer[blkn][0][0] = MCU_buffer[blkn-1][0][0];
+ MCU_buffer[blkn][0][0] = MCU_buffer[blkn - 1][0][0];
blkn++;
}
}
}
/* Try to write the MCU. */
- if (! (*cinfo->entropy->encode_mcu) (cinfo, MCU_buffer)) {
+ if (!(*cinfo->entropy->encode_mcu) (cinfo, MCU_buffer)) {
/* Suspension forced; update state counters and exit */
coef->MCU_vert_offset = yoffset;
coef->mcu_ctr = MCU_col_num;
@@ -374,17 +373,17 @@ compress_output (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
*/
LOCAL(void)
-transencode_coef_controller (j_compress_ptr cinfo,
- jvirt_barray_ptr *coef_arrays)
+transencode_coef_controller(j_compress_ptr cinfo,
+ jvirt_barray_ptr *coef_arrays)
{
my_coef_ptr coef;
JBLOCKROW buffer;
int i;
coef = (my_coef_ptr)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
sizeof(my_coef_controller));
- cinfo->coef = (struct jpeg_c_coef_controller *) coef;
+ cinfo->coef = (struct jpeg_c_coef_controller *)coef;
coef->pub.start_pass = start_pass_coef;
coef->pub.compress_data = compress_output;
@@ -393,9 +392,9 @@ transencode_coef_controller (j_compress_ptr cinfo,
/* Allocate and pre-zero space for dummy DCT blocks. */
buffer = (JBLOCKROW)
- (*cinfo->mem->alloc_large) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_large) ((j_common_ptr)cinfo, JPOOL_IMAGE,
C_MAX_BLOCKS_IN_MCU * sizeof(JBLOCK));
- jzero_far((void *) buffer, C_MAX_BLOCKS_IN_MCU * sizeof(JBLOCK));
+ jzero_far((void *)buffer, C_MAX_BLOCKS_IN_MCU * sizeof(JBLOCK));
for (i = 0; i < C_MAX_BLOCKS_IN_MCU; i++) {
coef->dummy_buffer[i] = buffer + i;
}
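
The two entry points touched above, jpeg_write_coefficients() and jpeg_copy_critical_parameters(), form the lossless transcoding API (this is what jpegtran is built on). A hedged sketch of a straight DCT-domain copy, with error handling and file setup elided:

#include <stdio.h>
#include "jpeglib.h"

void transcode(FILE *infile, FILE *outfile)
{
  struct jpeg_decompress_struct srcinfo;
  struct jpeg_compress_struct dstinfo;
  struct jpeg_error_mgr jsrcerr, jdsterr;
  jvirt_barray_ptr *coef_arrays;

  srcinfo.err = jpeg_std_error(&jsrcerr);
  jpeg_create_decompress(&srcinfo);
  dstinfo.err = jpeg_std_error(&jdsterr);
  jpeg_create_compress(&dstinfo);

  jpeg_stdio_src(&srcinfo, infile);
  (void)jpeg_read_header(&srcinfo, TRUE);
  coef_arrays = jpeg_read_coefficients(&srcinfo);  /* stay in the DCT domain */

  jpeg_copy_critical_parameters(&srcinfo, &dstinfo);
  jpeg_stdio_dest(&dstinfo, outfile);
  jpeg_write_coefficients(&dstinfo, coef_arrays);  /* re-entropy-code only */

  jpeg_finish_compress(&dstinfo);                  /* before releasing source */
  jpeg_destroy_compress(&dstinfo);
  (void)jpeg_finish_decompress(&srcinfo);
  jpeg_destroy_decompress(&srcinfo);
}
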
diff --git a/media/libjpeg/jdapimin.c b/media/libjpeg/jdapimin.c
index f80a14667f..f50c27edc3 100644
--- a/media/libjpeg/jdapimin.c
+++ b/media/libjpeg/jdapimin.c
@@ -4,7 +4,7 @@
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1994-1998, Thomas G. Lane.
* libjpeg-turbo Modifications:
- * Copyright (C) 2016, D. R. Commander.
+ * Copyright (C) 2016, 2022, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@@ -23,6 +23,7 @@
#include "jinclude.h"
#include "jpeglib.h"
#include "jdmaster.h"
+#include "jconfigint.h"
/*
@@ -31,7 +32,7 @@
*/
GLOBAL(void)
-jpeg_CreateDecompress (j_decompress_ptr cinfo, int version, size_t structsize)
+jpeg_CreateDecompress(j_decompress_ptr cinfo, int version, size_t structsize)
{
int i;
@@ -41,7 +42,7 @@ jpeg_CreateDecompress (j_decompress_ptr cinfo, int version, size_t structsize)
ERREXIT2(cinfo, JERR_BAD_LIB_VERSION, JPEG_LIB_VERSION, version);
if (structsize != sizeof(struct jpeg_decompress_struct))
ERREXIT2(cinfo, JERR_BAD_STRUCT_SIZE,
- (int) sizeof(struct jpeg_decompress_struct), (int) structsize);
+ (int)sizeof(struct jpeg_decompress_struct), (int)structsize);
/* For debugging purposes, we zero the whole master structure.
* But the application has already set the err pointer, and may have set
@@ -50,16 +51,16 @@ jpeg_CreateDecompress (j_decompress_ptr cinfo, int version, size_t structsize)
* complain here.
*/
{
- struct jpeg_error_mgr * err = cinfo->err;
- void * client_data = cinfo->client_data; /* ignore Purify complaint here */
- MEMZERO(cinfo, sizeof(struct jpeg_decompress_struct));
+ struct jpeg_error_mgr *err = cinfo->err;
+ void *client_data = cinfo->client_data; /* ignore Purify complaint here */
+ memset(cinfo, 0, sizeof(struct jpeg_decompress_struct));
cinfo->err = err;
cinfo->client_data = client_data;
}
cinfo->is_decompressor = TRUE;
/* Initialize a memory manager instance for this object */
- jinit_memory_mgr((j_common_ptr) cinfo);
+ jinit_memory_mgr((j_common_ptr)cinfo);
/* Zero out pointers to permanent structures. */
cinfo->progress = NULL;
@@ -89,9 +90,9 @@ jpeg_CreateDecompress (j_decompress_ptr cinfo, int version, size_t structsize)
* here.
*/
cinfo->master = (struct jpeg_decomp_master *)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
- sizeof(my_decomp_master));
- MEMZERO(cinfo->master, sizeof(my_decomp_master));
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_PERMANENT,
+ sizeof(my_decomp_master));
+ memset(cinfo->master, 0, sizeof(my_decomp_master));
}
@@ -100,9 +101,9 @@ jpeg_CreateDecompress (j_decompress_ptr cinfo, int version, size_t structsize)
*/
GLOBAL(void)
-jpeg_destroy_decompress (j_decompress_ptr cinfo)
+jpeg_destroy_decompress(j_decompress_ptr cinfo)
{
- jpeg_destroy((j_common_ptr) cinfo); /* use common routine */
+ jpeg_destroy((j_common_ptr)cinfo); /* use common routine */
}
@@ -112,9 +113,9 @@ jpeg_destroy_decompress (j_decompress_ptr cinfo)
*/
GLOBAL(void)
-jpeg_abort_decompress (j_decompress_ptr cinfo)
+jpeg_abort_decompress(j_decompress_ptr cinfo)
{
- jpeg_abort((j_common_ptr) cinfo); /* use common routine */
+ jpeg_abort((j_common_ptr)cinfo); /* use common routine */
}
@@ -123,7 +124,7 @@ jpeg_abort_decompress (j_decompress_ptr cinfo)
*/
LOCAL(void)
-default_decompress_parms (j_decompress_ptr cinfo)
+default_decompress_parms(j_decompress_ptr cinfo)
{
/* Guess the input colorspace, and set output colorspace accordingly. */
/* (Wish JPEG committee had provided a real way to specify this...) */
@@ -250,7 +251,7 @@ default_decompress_parms (j_decompress_ptr cinfo)
*/
GLOBAL(int)
-jpeg_read_header (j_decompress_ptr cinfo, boolean require_image)
+jpeg_read_header(j_decompress_ptr cinfo, boolean require_image)
{
int retcode;
@@ -271,7 +272,7 @@ jpeg_read_header (j_decompress_ptr cinfo, boolean require_image)
* call jpeg_abort, but we can't change it now for compatibility reasons.
* A side effect is to free any temporary memory (there shouldn't be any).
*/
- jpeg_abort((j_common_ptr) cinfo); /* sets state = DSTATE_START */
+ jpeg_abort((j_common_ptr)cinfo); /* sets state = DSTATE_START */
retcode = JPEG_HEADER_TABLES_ONLY;
break;
case JPEG_SUSPENDED:
@@ -296,7 +297,7 @@ jpeg_read_header (j_decompress_ptr cinfo, boolean require_image)
*/
GLOBAL(int)
-jpeg_consume_input (j_decompress_ptr cinfo)
+jpeg_consume_input(j_decompress_ptr cinfo)
{
int retcode = JPEG_SUSPENDED;
@@ -308,7 +309,7 @@ jpeg_consume_input (j_decompress_ptr cinfo)
/* Initialize application's data source module */
(*cinfo->src->init_source) (cinfo);
cinfo->global_state = DSTATE_INHEADER;
- /*FALLTHROUGH*/
+ FALLTHROUGH /*FALLTHROUGH*/
case DSTATE_INHEADER:
retcode = (*cinfo->inputctl->consume_input) (cinfo);
if (retcode == JPEG_REACHED_SOS) { /* Found SOS, prepare to decompress */
@@ -343,7 +344,7 @@ jpeg_consume_input (j_decompress_ptr cinfo)
*/
GLOBAL(boolean)
-jpeg_input_complete (j_decompress_ptr cinfo)
+jpeg_input_complete(j_decompress_ptr cinfo)
{
/* Check for valid jpeg object */
if (cinfo->global_state < DSTATE_START ||
@@ -358,7 +359,7 @@ jpeg_input_complete (j_decompress_ptr cinfo)
*/
GLOBAL(boolean)
-jpeg_has_multiple_scans (j_decompress_ptr cinfo)
+jpeg_has_multiple_scans(j_decompress_ptr cinfo)
{
/* Only valid after jpeg_read_header completes */
if (cinfo->global_state < DSTATE_READY ||
@@ -378,10 +379,10 @@ jpeg_has_multiple_scans (j_decompress_ptr cinfo)
*/
GLOBAL(boolean)
-jpeg_finish_decompress (j_decompress_ptr cinfo)
+jpeg_finish_decompress(j_decompress_ptr cinfo)
{
if ((cinfo->global_state == DSTATE_SCANNING ||
- cinfo->global_state == DSTATE_RAW_OK) && ! cinfo->buffered_image) {
+ cinfo->global_state == DSTATE_RAW_OK) && !cinfo->buffered_image) {
/* Terminate final pass of non-buffered mode */
if (cinfo->output_scanline < cinfo->output_height)
ERREXIT(cinfo, JERR_TOO_LITTLE_DATA);
@@ -395,13 +396,13 @@ jpeg_finish_decompress (j_decompress_ptr cinfo)
ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
}
/* Read until EOI */
- while (! cinfo->inputctl->eoi_reached) {
+ while (!cinfo->inputctl->eoi_reached) {
if ((*cinfo->inputctl->consume_input) (cinfo) == JPEG_SUSPENDED)
return FALSE; /* Suspend, come back later */
}
/* Do final cleanup */
(*cinfo->src->term_source) (cinfo);
/* We can use jpeg_abort to release memory and reset global_state */
- jpeg_abort((j_common_ptr) cinfo);
+ jpeg_abort((j_common_ptr)cinfo);
return TRUE;
}
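
jdapimin.c holds the minimum decompression API patched above. As a usage sketch (assuming a build with the memory source manager, as this import has), probing a JPEG's dimensions without decoding any pixels looks roughly like:

#include <stdio.h>
#include "jpeglib.h"

int probe_jpeg(const unsigned char *buf, unsigned long len)
{
  struct jpeg_decompress_struct cinfo;
  struct jpeg_error_mgr jerr;

  cinfo.err = jpeg_std_error(&jerr);  /* default mgr calls exit() on error */
  jpeg_create_decompress(&cinfo);     /* wraps jpeg_CreateDecompress() */
  jpeg_mem_src(&cinfo, buf, len);
  if (jpeg_read_header(&cinfo, TRUE) != JPEG_HEADER_OK) {
    jpeg_destroy_decompress(&cinfo);
    return -1;
  }
  printf("%u x %u, %d components\n", cinfo.image_width, cinfo.image_height,
         cinfo.num_components);
  jpeg_destroy_decompress(&cinfo);    /* wraps jpeg_destroy() */
  return 0;
}
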
diff --git a/media/libjpeg/jdapistd.c b/media/libjpeg/jdapistd.c
index 37afc8448b..8827d8abf5 100644
--- a/media/libjpeg/jdapistd.c
+++ b/media/libjpeg/jdapistd.c
@@ -4,7 +4,7 @@
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1994-1996, Thomas G. Lane.
* libjpeg-turbo Modifications:
- * Copyright (C) 2010, 2015-2016, D. R. Commander.
+ * Copyright (C) 2010, 2015-2020, 2022, D. R. Commander.
* Copyright (C) 2015, Google, Inc.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
@@ -21,11 +21,13 @@
#include "jinclude.h"
#include "jdmainct.h"
#include "jdcoefct.h"
+#include "jdmaster.h"
+#include "jdmerge.h"
#include "jdsample.h"
#include "jmemsys.h"
/* Forward declarations */
-LOCAL(boolean) output_pass_setup (j_decompress_ptr cinfo);
+LOCAL(boolean) output_pass_setup(j_decompress_ptr cinfo);
/*
@@ -40,7 +42,7 @@ LOCAL(boolean) output_pass_setup (j_decompress_ptr cinfo);
*/
GLOBAL(boolean)
-jpeg_start_decompress (j_decompress_ptr cinfo)
+jpeg_start_decompress(j_decompress_ptr cinfo)
{
if (cinfo->global_state == DSTATE_READY) {
/* First call: initialize master control, select active modules */
@@ -60,7 +62,7 @@ jpeg_start_decompress (j_decompress_ptr cinfo)
int retcode;
/* Call progress monitor hook if present */
if (cinfo->progress != NULL)
- (*cinfo->progress->progress_monitor) ((j_common_ptr) cinfo);
+ (*cinfo->progress->progress_monitor) ((j_common_ptr)cinfo);
/* Absorb some more input */
retcode = (*cinfo->inputctl->consume_input) (cinfo);
if (retcode == JPEG_SUSPENDED)
@@ -72,7 +74,7 @@ jpeg_start_decompress (j_decompress_ptr cinfo)
(retcode == JPEG_ROW_COMPLETED || retcode == JPEG_REACHED_SOS)) {
if (++cinfo->progress->pass_counter >= cinfo->progress->pass_limit) {
/* jdmaster underestimated number of scans; ratchet up one scan */
- cinfo->progress->pass_limit += (long) cinfo->total_iMCU_rows;
+ cinfo->progress->pass_limit += (long)cinfo->total_iMCU_rows;
}
}
}
@@ -97,7 +99,7 @@ jpeg_start_decompress (j_decompress_ptr cinfo)
*/
LOCAL(boolean)
-output_pass_setup (j_decompress_ptr cinfo)
+output_pass_setup(j_decompress_ptr cinfo)
{
if (cinfo->global_state != DSTATE_PRESCAN) {
/* First call: do pass setup */
@@ -113,14 +115,14 @@ output_pass_setup (j_decompress_ptr cinfo)
JDIMENSION last_scanline;
/* Call progress monitor hook if present */
if (cinfo->progress != NULL) {
- cinfo->progress->pass_counter = (long) cinfo->output_scanline;
- cinfo->progress->pass_limit = (long) cinfo->output_height;
- (*cinfo->progress->progress_monitor) ((j_common_ptr) cinfo);
+ cinfo->progress->pass_counter = (long)cinfo->output_scanline;
+ cinfo->progress->pass_limit = (long)cinfo->output_height;
+ (*cinfo->progress->progress_monitor) ((j_common_ptr)cinfo);
}
/* Process some data */
last_scanline = cinfo->output_scanline;
- (*cinfo->main->process_data) (cinfo, (JSAMPARRAY) NULL,
- &cinfo->output_scanline, (JDIMENSION) 0);
+ (*cinfo->main->process_data) (cinfo, (JSAMPARRAY)NULL,
+ &cinfo->output_scanline, (JDIMENSION)0);
if (cinfo->output_scanline == last_scanline)
return FALSE; /* No progress made, must suspend */
}
@@ -150,13 +152,14 @@ output_pass_setup (j_decompress_ptr cinfo)
*/
GLOBAL(void)
-jpeg_crop_scanline (j_decompress_ptr cinfo, JDIMENSION *xoffset,
- JDIMENSION *width)
+jpeg_crop_scanline(j_decompress_ptr cinfo, JDIMENSION *xoffset,
+ JDIMENSION *width)
{
int ci, align, orig_downsampled_width;
JDIMENSION input_xoffset;
boolean reinit_upsampler = FALSE;
jpeg_component_info *compptr;
+ my_master_ptr master = (my_master_ptr)cinfo->master;
if (cinfo->global_state != DSTATE_SCANNING || cinfo->output_scanline != 0)
ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
@@ -190,7 +193,10 @@ jpeg_crop_scanline (j_decompress_ptr cinfo, JDIMENSION *xoffset,
* single-pass decompression case, allowing us to use the same MCU column
* width for all of the components.
*/
- align = cinfo->_min_DCT_scaled_size * cinfo->max_h_samp_factor;
+ if (cinfo->comps_in_scan == 1 && cinfo->num_components == 1)
+ align = cinfo->_min_DCT_scaled_size;
+ else
+ align = cinfo->_min_DCT_scaled_size * cinfo->max_h_samp_factor;
/* Adjust xoffset to the nearest iMCU boundary <= the requested value */
input_xoffset = *xoffset;
@@ -203,24 +209,31 @@ jpeg_crop_scanline (j_decompress_ptr cinfo, JDIMENSION *xoffset,
*/
*width = *width + input_xoffset - *xoffset;
cinfo->output_width = *width;
+ if (master->using_merged_upsample && cinfo->max_v_samp_factor == 2) {
+ my_merged_upsample_ptr upsample = (my_merged_upsample_ptr)cinfo->upsample;
+ upsample->out_row_width =
+ cinfo->output_width * cinfo->out_color_components;
+ }
/* Set the first and last iMCU columns that we must decompress. These values
* will be used in single-scan decompressions.
*/
- cinfo->master->first_iMCU_col =
- (JDIMENSION) (long) (*xoffset) / (long) align;
+ cinfo->master->first_iMCU_col = (JDIMENSION)(long)(*xoffset) / (long)align;
cinfo->master->last_iMCU_col =
- (JDIMENSION) jdiv_round_up((long) (*xoffset + cinfo->output_width),
- (long) align) - 1;
+ (JDIMENSION)jdiv_round_up((long)(*xoffset + cinfo->output_width),
+ (long)align) - 1;
for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
ci++, compptr++) {
+ int hsf = (cinfo->comps_in_scan == 1 && cinfo->num_components == 1) ?
+ 1 : compptr->h_samp_factor;
+
/* Set downsampled_width to the new output width. */
orig_downsampled_width = compptr->downsampled_width;
compptr->downsampled_width =
- (JDIMENSION) jdiv_round_up((long) (cinfo->output_width *
- compptr->h_samp_factor),
- (long) cinfo->max_h_samp_factor);
+ (JDIMENSION)jdiv_round_up((long)(cinfo->output_width *
+ compptr->h_samp_factor),
+ (long)cinfo->max_h_samp_factor);
if (compptr->downsampled_width < 2 && orig_downsampled_width >= 2)
reinit_upsampler = TRUE;
@@ -228,12 +241,10 @@ jpeg_crop_scanline (j_decompress_ptr cinfo, JDIMENSION *xoffset,
* values will be used in multi-scan decompressions.
*/
cinfo->master->first_MCU_col[ci] =
- (JDIMENSION) (long) (*xoffset * compptr->h_samp_factor) /
- (long) align;
+ (JDIMENSION)(long)(*xoffset * hsf) / (long)align;
cinfo->master->last_MCU_col[ci] =
- (JDIMENSION) jdiv_round_up((long) ((*xoffset + cinfo->output_width) *
- compptr->h_samp_factor),
- (long) align) - 1;
+ (JDIMENSION)jdiv_round_up((long)((*xoffset + cinfo->output_width) * hsf),
+ (long)align) - 1;
}
if (reinit_upsampler) {
@@ -258,8 +269,8 @@ jpeg_crop_scanline (j_decompress_ptr cinfo, JDIMENSION *xoffset,
*/
GLOBAL(JDIMENSION)
-jpeg_read_scanlines (j_decompress_ptr cinfo, JSAMPARRAY scanlines,
- JDIMENSION max_lines)
+jpeg_read_scanlines(j_decompress_ptr cinfo, JSAMPARRAY scanlines,
+ JDIMENSION max_lines)
{
JDIMENSION row_ctr;
@@ -272,9 +283,9 @@ jpeg_read_scanlines (j_decompress_ptr cinfo, JSAMPARRAY scanlines,
/* Call progress monitor hook if present */
if (cinfo->progress != NULL) {
- cinfo->progress->pass_counter = (long) cinfo->output_scanline;
- cinfo->progress->pass_limit = (long) cinfo->output_height;
- (*cinfo->progress->progress_monitor) ((j_common_ptr) cinfo);
+ cinfo->progress->pass_counter = (long)cinfo->output_scanline;
+ cinfo->progress->pass_limit = (long)cinfo->output_height;
+ (*cinfo->progress->progress_monitor) ((j_common_ptr)cinfo);
}
/* Process some data */
@@ -287,8 +298,16 @@ jpeg_read_scanlines (j_decompress_ptr cinfo, JSAMPARRAY scanlines,
/* Dummy color convert function used by jpeg_skip_scanlines() */
LOCAL(void)
-noop_convert (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
- JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
+noop_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
+{
+}
+
+
+/* Dummy quantize function used by jpeg_skip_scanlines() */
+LOCAL(void)
+noop_quantize(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPARRAY output_buf, int num_rows)
{
}
@@ -302,20 +321,46 @@ noop_convert (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
*/
LOCAL(void)
-read_and_discard_scanlines (j_decompress_ptr cinfo, JDIMENSION num_lines)
+read_and_discard_scanlines(j_decompress_ptr cinfo, JDIMENSION num_lines)
{
JDIMENSION n;
+ my_master_ptr master = (my_master_ptr)cinfo->master;
+ JSAMPLE dummy_sample[1] = { 0 };
+ JSAMPROW dummy_row = dummy_sample;
+ JSAMPARRAY scanlines = NULL;
void (*color_convert) (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
JDIMENSION input_row, JSAMPARRAY output_buf,
- int num_rows);
+ int num_rows) = NULL;
+ void (*color_quantize) (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPARRAY output_buf, int num_rows) = NULL;
+
+ if (cinfo->cconvert && cinfo->cconvert->color_convert) {
+ color_convert = cinfo->cconvert->color_convert;
+ cinfo->cconvert->color_convert = noop_convert;
+ /* This just prevents UBSan from complaining about adding 0 to a NULL
+ * pointer. The pointer isn't actually used.
+ */
+ scanlines = &dummy_row;
+ }
+
+ if (cinfo->cquantize && cinfo->cquantize->color_quantize) {
+ color_quantize = cinfo->cquantize->color_quantize;
+ cinfo->cquantize->color_quantize = noop_quantize;
+ }
- color_convert = cinfo->cconvert->color_convert;
- cinfo->cconvert->color_convert = noop_convert;
+ if (master->using_merged_upsample && cinfo->max_v_samp_factor == 2) {
+ my_merged_upsample_ptr upsample = (my_merged_upsample_ptr)cinfo->upsample;
+ scanlines = &upsample->spare_row;
+ }
for (n = 0; n < num_lines; n++)
- jpeg_read_scanlines(cinfo, NULL, 1);
+ jpeg_read_scanlines(cinfo, scanlines, 1);
- cinfo->cconvert->color_convert = color_convert;
+ if (color_convert)
+ cinfo->cconvert->color_convert = color_convert;
+
+ if (color_quantize)
+ cinfo->cquantize->color_quantize = color_quantize;
}
@@ -325,10 +370,16 @@ read_and_discard_scanlines (j_decompress_ptr cinfo, JDIMENSION num_lines)
*/
LOCAL(void)
-increment_simple_rowgroup_ctr (j_decompress_ptr cinfo, JDIMENSION rows)
+increment_simple_rowgroup_ctr(j_decompress_ptr cinfo, JDIMENSION rows)
{
JDIMENSION rows_left;
- my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
+ my_main_ptr main_ptr = (my_main_ptr)cinfo->main;
+ my_master_ptr master = (my_master_ptr)cinfo->master;
+
+ if (master->using_merged_upsample && cinfo->max_v_samp_factor == 2) {
+ read_and_discard_scanlines(cinfo, rows);
+ return;
+ }
/* Increment the counter to the next row group after the skipped rows. */
main_ptr->rowgroup_ctr += rows / cinfo->max_v_samp_factor;
@@ -354,23 +405,31 @@ increment_simple_rowgroup_ctr (j_decompress_ptr cinfo, JDIMENSION rows)
*/
GLOBAL(JDIMENSION)
-jpeg_skip_scanlines (j_decompress_ptr cinfo, JDIMENSION num_lines)
+jpeg_skip_scanlines(j_decompress_ptr cinfo, JDIMENSION num_lines)
{
- my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
- my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
- my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
+ my_main_ptr main_ptr = (my_main_ptr)cinfo->main;
+ my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
+ my_master_ptr master = (my_master_ptr)cinfo->master;
+ my_upsample_ptr upsample = (my_upsample_ptr)cinfo->upsample;
JDIMENSION i, x;
int y;
JDIMENSION lines_per_iMCU_row, lines_left_in_iMCU_row, lines_after_iMCU_row;
JDIMENSION lines_to_skip, lines_to_read;
+ /* Two-pass color quantization is not supported. */
+ if (cinfo->quantize_colors && cinfo->two_pass_quantize)
+ ERREXIT(cinfo, JERR_NOTIMPL);
+
if (cinfo->global_state != DSTATE_SCANNING)
ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
/* Do not skip past the bottom of the image. */
if (cinfo->output_scanline + num_lines >= cinfo->output_height) {
+ num_lines = cinfo->output_height - cinfo->output_scanline;
cinfo->output_scanline = cinfo->output_height;
- return cinfo->output_height - cinfo->output_scanline;
+ (*cinfo->inputctl->finish_input_pass) (cinfo);
+ cinfo->inputctl->eoi_reached = TRUE;
+ return num_lines;
}
if (num_lines == 0)
@@ -419,8 +478,10 @@ jpeg_skip_scanlines (j_decompress_ptr cinfo, JDIMENSION num_lines)
main_ptr->buffer_full = FALSE;
main_ptr->rowgroup_ctr = 0;
main_ptr->context_state = CTX_PREPARE_FOR_IMCU;
- upsample->next_row_out = cinfo->max_v_samp_factor;
- upsample->rows_to_go = cinfo->output_height - cinfo->output_scanline;
+ if (!master->using_merged_upsample) {
+ upsample->next_row_out = cinfo->max_v_samp_factor;
+ upsample->rows_to_go = cinfo->output_height - cinfo->output_scanline;
+ }
}
/* Skipping is much simpler when context rows are not required. */
@@ -432,8 +493,10 @@ jpeg_skip_scanlines (j_decompress_ptr cinfo, JDIMENSION num_lines)
cinfo->output_scanline += lines_left_in_iMCU_row;
main_ptr->buffer_full = FALSE;
main_ptr->rowgroup_ctr = 0;
- upsample->next_row_out = cinfo->max_v_samp_factor;
- upsample->rows_to_go = cinfo->output_height - cinfo->output_scanline;
+ if (!master->using_merged_upsample) {
+ upsample->next_row_out = cinfo->max_v_samp_factor;
+ upsample->rows_to_go = cinfo->output_height - cinfo->output_scanline;
+ }
}
}
@@ -458,7 +521,7 @@ jpeg_skip_scanlines (j_decompress_ptr cinfo, JDIMENSION num_lines)
if (cinfo->upsample->need_context_rows) {
cinfo->output_scanline += lines_to_skip;
cinfo->output_iMCU_row += lines_to_skip / lines_per_iMCU_row;
- main_ptr->iMCU_row_ctr += lines_after_iMCU_row / lines_per_iMCU_row;
+ main_ptr->iMCU_row_ctr += lines_to_skip / lines_per_iMCU_row;
/* It is complex to properly move to the middle of a context block, so
* read the remaining lines instead of skipping them.
*/
@@ -468,7 +531,8 @@ jpeg_skip_scanlines (j_decompress_ptr cinfo, JDIMENSION num_lines)
cinfo->output_iMCU_row += lines_to_skip / lines_per_iMCU_row;
increment_simple_rowgroup_ctr(cinfo, lines_to_read);
}
- upsample->rows_to_go = cinfo->output_height - cinfo->output_scanline;
+ if (!master->using_merged_upsample)
+ upsample->rows_to_go = cinfo->output_height - cinfo->output_scanline;
return num_lines;
}
@@ -480,6 +544,8 @@ jpeg_skip_scanlines (j_decompress_ptr cinfo, JDIMENSION num_lines)
* decoded coefficients. This is ~5% faster for large subsets, but
* it's tough to tell a difference for smaller images.
*/
+ if (!cinfo->entropy->insufficient_data)
+ cinfo->master->last_good_iMCU_row = cinfo->input_iMCU_row;
(*cinfo->entropy->decode_mcu) (cinfo, NULL);
}
}
@@ -509,7 +575,8 @@ jpeg_skip_scanlines (j_decompress_ptr cinfo, JDIMENSION num_lines)
* bit odd, since "rows_to_go" seems to be redundantly keeping track of
* output_scanline.
*/
- upsample->rows_to_go = cinfo->output_height - cinfo->output_scanline;
+ if (!master->using_merged_upsample)
+ upsample->rows_to_go = cinfo->output_height - cinfo->output_scanline;
/* Always skip the requested number of lines. */
return num_lines;
@@ -521,8 +588,8 @@ jpeg_skip_scanlines (j_decompress_ptr cinfo, JDIMENSION num_lines)
*/
GLOBAL(JDIMENSION)
-jpeg_read_raw_data (j_decompress_ptr cinfo, JSAMPIMAGE data,
- JDIMENSION max_lines)
+jpeg_read_raw_data(j_decompress_ptr cinfo, JSAMPIMAGE data,
+ JDIMENSION max_lines)
{
JDIMENSION lines_per_iMCU_row;
@@ -535,9 +602,9 @@ jpeg_read_raw_data (j_decompress_ptr cinfo, JSAMPIMAGE data,
/* Call progress monitor hook if present */
if (cinfo->progress != NULL) {
- cinfo->progress->pass_counter = (long) cinfo->output_scanline;
- cinfo->progress->pass_limit = (long) cinfo->output_height;
- (*cinfo->progress->progress_monitor) ((j_common_ptr) cinfo);
+ cinfo->progress->pass_counter = (long)cinfo->output_scanline;
+ cinfo->progress->pass_limit = (long)cinfo->output_height;
+ (*cinfo->progress->progress_monitor) ((j_common_ptr)cinfo);
}
/* Verify that at least one iMCU row can be returned. */
@@ -546,7 +613,7 @@ jpeg_read_raw_data (j_decompress_ptr cinfo, JSAMPIMAGE data,
ERREXIT(cinfo, JERR_BUFFER_SIZE);
/* Decompress directly into user's buffer. */
- if (! (*cinfo->coef->decompress_data) (cinfo, data))
+ if (!(*cinfo->coef->decompress_data) (cinfo, data))
return 0; /* suspension forced, can do nothing more */
/* OK, we processed one iMCU row. */
@@ -564,7 +631,7 @@ jpeg_read_raw_data (j_decompress_ptr cinfo, JSAMPIMAGE data,
*/
GLOBAL(boolean)
-jpeg_start_output (j_decompress_ptr cinfo, int scan_number)
+jpeg_start_output(j_decompress_ptr cinfo, int scan_number)
{
if (cinfo->global_state != DSTATE_BUFIMAGE &&
cinfo->global_state != DSTATE_PRESCAN)
@@ -572,8 +639,7 @@ jpeg_start_output (j_decompress_ptr cinfo, int scan_number)
/* Limit scan number to valid range */
if (scan_number <= 0)
scan_number = 1;
- if (cinfo->inputctl->eoi_reached &&
- scan_number > cinfo->input_scan_number)
+ if (cinfo->inputctl->eoi_reached && scan_number > cinfo->input_scan_number)
scan_number = cinfo->input_scan_number;
cinfo->output_scan_number = scan_number;
/* Perform any dummy output passes, and set up for the real pass */
@@ -589,7 +655,7 @@ jpeg_start_output (j_decompress_ptr cinfo, int scan_number)
*/
GLOBAL(boolean)
-jpeg_finish_output (j_decompress_ptr cinfo)
+jpeg_finish_output(j_decompress_ptr cinfo)
{
if ((cinfo->global_state == DSTATE_SCANNING ||
cinfo->global_state == DSTATE_RAW_OK) && cinfo->buffered_image) {
@@ -603,7 +669,7 @@ jpeg_finish_output (j_decompress_ptr cinfo)
}
/* Read markers looking for SOS or EOI */
while (cinfo->input_scan_number <= cinfo->output_scan_number &&
- ! cinfo->inputctl->eoi_reached) {
+ !cinfo->inputctl->eoi_reached) {
if ((*cinfo->inputctl->consume_input) (cinfo) == JPEG_SUSPENDED)
return FALSE; /* Suspend, come back later */
}
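
Most of the jdapistd.c churn above is in the partial-decompression API, jpeg_crop_scanline() and jpeg_skip_scanlines(). A hedged sketch of decoding just one band of the image; the helper name is made up, and buffer sizing is the caller's responsibility:

#include <stdio.h>
#include "jpeglib.h"

/* band_rows must hold at least (y1 - y0) rows of the post-crop width;
 * x and w are widened outward to iMCU boundaries, so re-check them. */
void read_band(j_decompress_ptr cinfo, JDIMENSION x, JDIMENSION w,
               JDIMENSION y0, JDIMENSION y1, JSAMPARRAY band_rows)
{
  JDIMENSION n = 0;

  jpeg_crop_scanline(cinfo, &x, &w);        /* after jpeg_start_decompress() */
  if (y0 > 0)
    (void)jpeg_skip_scanlines(cinfo, y0);   /* discard rows above the band */
  while (cinfo->output_scanline < y1)
    n += jpeg_read_scanlines(cinfo, band_rows + n,
                             y1 - cinfo->output_scanline);
  if (cinfo->output_scanline < cinfo->output_height)  /* discard the rest */
    (void)jpeg_skip_scanlines(cinfo,
                              cinfo->output_height - cinfo->output_scanline);
}
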
diff --git a/media/libjpeg/jdarith.c b/media/libjpeg/jdarith.c
index df3540eef5..21575e80c7 100644
--- a/media/libjpeg/jdarith.c
+++ b/media/libjpeg/jdarith.c
@@ -4,16 +4,19 @@
* This file was part of the Independent JPEG Group's software:
* Developed 1997-2015 by Guido Vollbeding.
* libjpeg-turbo Modifications:
- * Copyright (C) 2015-2016, D. R. Commander.
+ * Copyright (C) 2015-2020, 2022, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
- * This file contains portable arithmetic entropy decoding routines for JPEG
- * (implementing the ISO/IEC IS 10918-1 and CCITT Recommendation ITU-T T.81).
+ * This file contains portable arithmetic entropy decoding routines for JPEG
+ * (implementing Recommendation ITU-T T.81 | ISO/IEC 10918-1).
*
* Both sequential and progressive modes are supported in this single module.
*
* Suspension is not currently supported in this module.
+ *
+ * NOTE: All referenced figures are from
+ * Recommendation ITU-T T.81 (1992) | ISO/IEC 10918-1:1994.
*/
#define JPEG_INTERNALS
@@ -21,6 +24,9 @@
#include "jpeglib.h"
+#define NEG_1 ((unsigned int)-1)
+
+
/* Expanded entropy decoder object for arithmetic decoding. */
typedef struct {
@@ -60,21 +66,21 @@ typedef arith_entropy_decoder *arith_entropy_ptr;
* in the lower bits (mask 0x7F).
*/
-#define DC_STAT_BINS 64
-#define AC_STAT_BINS 256
+#define DC_STAT_BINS 64
+#define AC_STAT_BINS 256
LOCAL(int)
-get_byte (j_decompress_ptr cinfo)
+get_byte(j_decompress_ptr cinfo)
/* Read next input byte; we do not support suspension in this module. */
{
struct jpeg_source_mgr *src = cinfo->src;
if (src->bytes_in_buffer == 0)
- if (! (*src->fill_input_buffer) (cinfo))
+ if (!(*src->fill_input_buffer) (cinfo))
ERREXIT(cinfo, JERR_CANT_SUSPEND);
src->bytes_in_buffer--;
- return GETJOCTET(*src->next_input_byte++);
+ return *src->next_input_byte++;
}
@@ -106,9 +112,9 @@ get_byte (j_decompress_ptr cinfo)
*/
LOCAL(int)
-arith_decode (j_decompress_ptr cinfo, unsigned char *st)
+arith_decode(j_decompress_ptr cinfo, unsigned char *st)
{
- register arith_entropy_ptr e = (arith_entropy_ptr) cinfo->entropy;
+ register arith_entropy_ptr e = (arith_entropy_ptr)cinfo->entropy;
register unsigned char nl, nm;
register JLONG qe, temp;
register int sv, data;
@@ -153,8 +159,8 @@ arith_decode (j_decompress_ptr cinfo, unsigned char *st)
*/
sv = *st;
qe = jpeg_aritab[sv & 0x7F]; /* => Qe_Value */
- nl = qe & 0xFF; qe >>= 8; /* Next_Index_LPS + Switch_MPS */
- nm = qe & 0xFF; qe >>= 8; /* Next_Index_MPS */
+ nl = qe & 0xFF; qe >>= 8; /* Next_Index_LPS + Switch_MPS */
+ nm = qe & 0xFF; qe >>= 8; /* Next_Index_MPS */
/* Decode & estimation procedures per sections D.2.4 & D.2.5 */
temp = e->a - qe;
@@ -190,27 +196,27 @@ arith_decode (j_decompress_ptr cinfo, unsigned char *st)
*/
LOCAL(void)
-process_restart (j_decompress_ptr cinfo)
+process_restart(j_decompress_ptr cinfo)
{
- arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+ arith_entropy_ptr entropy = (arith_entropy_ptr)cinfo->entropy;
int ci;
jpeg_component_info *compptr;
/* Advance past the RSTn marker */
- if (! (*cinfo->marker->read_restart_marker) (cinfo))
+ if (!(*cinfo->marker->read_restart_marker) (cinfo))
ERREXIT(cinfo, JERR_CANT_SUSPEND);
/* Re-initialize statistics areas */
for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
compptr = cinfo->cur_comp_info[ci];
if (!cinfo->progressive_mode || (cinfo->Ss == 0 && cinfo->Ah == 0)) {
- MEMZERO(entropy->dc_stats[compptr->dc_tbl_no], DC_STAT_BINS);
+ memset(entropy->dc_stats[compptr->dc_tbl_no], 0, DC_STAT_BINS);
/* Reset DC predictions to 0 */
entropy->last_dc_val[ci] = 0;
entropy->dc_context[ci] = 0;
}
if (!cinfo->progressive_mode || cinfo->Ss) {
- MEMZERO(entropy->ac_stats[compptr->ac_tbl_no], AC_STAT_BINS);
+ memset(entropy->ac_stats[compptr->ac_tbl_no], 0, AC_STAT_BINS);
}
}
@@ -241,9 +247,9 @@ process_restart (j_decompress_ptr cinfo)
*/
METHODDEF(boolean)
-decode_mcu_DC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+decode_mcu_DC_first(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
{
- arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+ arith_entropy_ptr entropy = (arith_entropy_ptr)cinfo->entropy;
JBLOCKROW block;
unsigned char *st;
int blkn, ci, tbl, sign;
@@ -277,7 +283,7 @@ decode_mcu_DC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
/* Figure F.21: Decoding nonzero value v */
/* Figure F.22: Decoding the sign of v */
sign = arith_decode(cinfo, st + 1);
- st += 2; st += sign;
+ st += 2; st += sign;
/* Figure F.23: Decoding the magnitude category of v */
if ((m = arith_decode(cinfo, st)) != 0) {
st = entropy->dc_stats[tbl] + 20; /* Table F.4: X1 = 20 */
@@ -291,9 +297,9 @@ decode_mcu_DC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
}
}
/* Section F.1.4.4.1.2: Establish dc_context conditioning category */
- if (m < (int) ((1L << cinfo->arith_dc_L[tbl]) >> 1))
+ if (m < (int)((1L << cinfo->arith_dc_L[tbl]) >> 1))
entropy->dc_context[ci] = 0; /* zero diff category */
- else if (m > (int) ((1L << cinfo->arith_dc_U[tbl]) >> 1))
+ else if (m > (int)((1L << cinfo->arith_dc_U[tbl]) >> 1))
entropy->dc_context[ci] = 12 + (sign * 4); /* large diff category */
else
entropy->dc_context[ci] = 4 + (sign * 4); /* small diff category */
@@ -302,12 +308,12 @@ decode_mcu_DC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
st += 14;
while (m >>= 1)
if (arith_decode(cinfo, st)) v |= m;
- v += 1; if (sign) v = -v;
- entropy->last_dc_val[ci] += v;
+ v += 1; if (sign) v = -v;
+ entropy->last_dc_val[ci] = (entropy->last_dc_val[ci] + v) & 0xffff;
}
/* Scale and output the DC coefficient (assumes jpeg_natural_order[0]=0) */
- (*block)[0] = (JCOEF) LEFT_SHIFT(entropy->last_dc_val[ci], cinfo->Al);
+ (*block)[0] = (JCOEF)LEFT_SHIFT(entropy->last_dc_val[ci], cinfo->Al);
}
return TRUE;
@@ -320,9 +326,9 @@ decode_mcu_DC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
*/
METHODDEF(boolean)
-decode_mcu_AC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+decode_mcu_AC_first(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
{
- arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+ arith_entropy_ptr entropy = (arith_entropy_ptr)cinfo->entropy;
JBLOCKROW block;
unsigned char *st;
int tbl, sign, k;
@@ -348,7 +354,7 @@ decode_mcu_AC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
st = entropy->ac_stats[tbl] + 3 * (k - 1);
if (arith_decode(cinfo, st)) break; /* EOB flag */
while (arith_decode(cinfo, st + 1) == 0) {
- st += 3; k++;
+ st += 3; k++;
if (k > cinfo->Se) {
WARNMS(cinfo, JWRN_ARITH_BAD_CODE);
entropy->ct = -1; /* spectral overflow */
@@ -380,9 +386,9 @@ decode_mcu_AC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
st += 14;
while (m >>= 1)
if (arith_decode(cinfo, st)) v |= m;
- v += 1; if (sign) v = -v;
+ v += 1; if (sign) v = -v;
/* Scale and output coefficient in natural (dezigzagged) order */
- (*block)[jpeg_natural_order[k]] = (JCOEF) ((unsigned)v << cinfo->Al);
+ (*block)[jpeg_natural_order[k]] = (JCOEF)((unsigned)v << cinfo->Al);
}
return TRUE;
@@ -394,9 +400,9 @@ decode_mcu_AC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
*/
METHODDEF(boolean)
-decode_mcu_DC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+decode_mcu_DC_refine(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
{
- arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+ arith_entropy_ptr entropy = (arith_entropy_ptr)cinfo->entropy;
unsigned char *st;
int p1, blkn;
@@ -427,9 +433,9 @@ decode_mcu_DC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
*/
METHODDEF(boolean)
-decode_mcu_AC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+decode_mcu_AC_refine(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
{
- arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+ arith_entropy_ptr entropy = (arith_entropy_ptr)cinfo->entropy;
JBLOCKROW block;
JCOEFPTR thiscoef;
unsigned char *st;
@@ -450,7 +456,7 @@ decode_mcu_AC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
tbl = cinfo->cur_comp_info[0]->ac_tbl_no;
p1 = 1 << cinfo->Al; /* 1 in the bit position being coded */
- m1 = (-1) << cinfo->Al; /* -1 in the bit position being coded */
+ m1 = (NEG_1) << cinfo->Al; /* -1 in the bit position being coded */
/* Establish EOBx (previous stage end-of-block) index */
for (kex = cinfo->Se; kex > 0; kex--)
@@ -465,20 +471,20 @@ decode_mcu_AC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
if (*thiscoef) { /* previously nonzero coef */
if (arith_decode(cinfo, st + 2)) {
if (*thiscoef < 0)
- *thiscoef += m1;
+ *thiscoef += (JCOEF)m1;
else
- *thiscoef += p1;
+ *thiscoef += (JCOEF)p1;
}
break;
}
if (arith_decode(cinfo, st + 1)) { /* newly nonzero coef */
if (arith_decode(cinfo, entropy->fixed_bin))
- *thiscoef = m1;
+ *thiscoef = (JCOEF)m1;
else
- *thiscoef = p1;
+ *thiscoef = (JCOEF)p1;
break;
}
- st += 3; k++;
+ st += 3; k++;
if (k > cinfo->Se) {
WARNMS(cinfo, JWRN_ARITH_BAD_CODE);
entropy->ct = -1; /* spectral overflow */
@@ -496,9 +502,9 @@ decode_mcu_AC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
*/
METHODDEF(boolean)
-decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+decode_mcu(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
{
- arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+ arith_entropy_ptr entropy = (arith_entropy_ptr)cinfo->entropy;
jpeg_component_info *compptr;
JBLOCKROW block;
unsigned char *st;
@@ -535,7 +541,7 @@ decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
/* Figure F.21: Decoding nonzero value v */
/* Figure F.22: Decoding the sign of v */
sign = arith_decode(cinfo, st + 1);
- st += 2; st += sign;
+ st += 2; st += sign;
/* Figure F.23: Decoding the magnitude category of v */
if ((m = arith_decode(cinfo, st)) != 0) {
st = entropy->dc_stats[tbl] + 20; /* Table F.4: X1 = 20 */
@@ -549,9 +555,9 @@ decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
}
}
/* Section F.1.4.4.1.2: Establish dc_context conditioning category */
- if (m < (int) ((1L << cinfo->arith_dc_L[tbl]) >> 1))
+ if (m < (int)((1L << cinfo->arith_dc_L[tbl]) >> 1))
entropy->dc_context[ci] = 0; /* zero diff category */
- else if (m > (int) ((1L << cinfo->arith_dc_U[tbl]) >> 1))
+ else if (m > (int)((1L << cinfo->arith_dc_U[tbl]) >> 1))
entropy->dc_context[ci] = 12 + (sign * 4); /* large diff category */
else
entropy->dc_context[ci] = 4 + (sign * 4); /* small diff category */
@@ -560,12 +566,12 @@ decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
st += 14;
while (m >>= 1)
if (arith_decode(cinfo, st)) v |= m;
- v += 1; if (sign) v = -v;
- entropy->last_dc_val[ci] += v;
+ v += 1; if (sign) v = -v;
+ entropy->last_dc_val[ci] = (entropy->last_dc_val[ci] + v) & 0xffff;
}
if (block)
- (*block)[0] = (JCOEF) entropy->last_dc_val[ci];
+ (*block)[0] = (JCOEF)entropy->last_dc_val[ci];
/* Sections F.2.4.2 & F.1.4.4.2: Decoding of AC coefficients */
@@ -576,7 +582,7 @@ decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
st = entropy->ac_stats[tbl] + 3 * (k - 1);
if (arith_decode(cinfo, st)) break; /* EOB flag */
while (arith_decode(cinfo, st + 1) == 0) {
- st += 3; k++;
+ st += 3; k++;
if (k > DCTSIZE2 - 1) {
WARNMS(cinfo, JWRN_ARITH_BAD_CODE);
entropy->ct = -1; /* spectral overflow */
@@ -608,9 +614,9 @@ decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
st += 14;
while (m >>= 1)
if (arith_decode(cinfo, st)) v |= m;
- v += 1; if (sign) v = -v;
+ v += 1; if (sign) v = -v;
if (block)
- (*block)[jpeg_natural_order[k]] = (JCOEF) v;
+ (*block)[jpeg_natural_order[k]] = (JCOEF)v;
}
}
@@ -623,9 +629,9 @@ decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
*/
METHODDEF(void)
-start_pass (j_decompress_ptr cinfo)
+start_pass(j_decompress_ptr cinfo)
{
- arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+ arith_entropy_ptr entropy = (arith_entropy_ptr)cinfo->entropy;
int ci, tbl;
jpeg_component_info *compptr;
@@ -644,11 +650,11 @@ start_pass (j_decompress_ptr cinfo)
}
if (cinfo->Ah != 0) {
/* Successive approximation refinement scan: must have Al = Ah-1. */
- if (cinfo->Ah-1 != cinfo->Al)
+ if (cinfo->Ah - 1 != cinfo->Al)
goto bad;
}
if (cinfo->Al > 13) { /* need not check for < 0 */
- bad:
+bad:
ERREXIT4(cinfo, JERR_BAD_PROGRESSION,
cinfo->Ss, cinfo->Se, cinfo->Ah, cinfo->Al);
}
@@ -658,9 +664,17 @@ start_pass (j_decompress_ptr cinfo)
*/
for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
int coefi, cindex = cinfo->cur_comp_info[ci]->component_index;
- int *coef_bit_ptr = & cinfo->coef_bits[cindex][0];
+ int *coef_bit_ptr = &cinfo->coef_bits[cindex][0];
+ int *prev_coef_bit_ptr =
+ &cinfo->coef_bits[cindex + cinfo->num_components][0];
if (cinfo->Ss && coef_bit_ptr[0] < 0) /* AC without prior DC scan */
WARNMS2(cinfo, JWRN_BOGUS_PROGRESSION, cindex, 0);
+ for (coefi = MIN(cinfo->Ss, 1); coefi <= MAX(cinfo->Se, 9); coefi++) {
+ if (cinfo->input_scan_number > 1)
+ prev_coef_bit_ptr[coefi] = coef_bit_ptr[coefi];
+ else
+ prev_coef_bit_ptr[coefi] = 0;
+ }
for (coefi = cinfo->Ss; coefi <= cinfo->Se; coefi++) {
int expected = (coef_bit_ptr[coefi] < 0) ? 0 : coef_bit_ptr[coefi];
if (cinfo->Ah != expected)
@@ -684,8 +698,8 @@ start_pass (j_decompress_ptr cinfo)
/* Check that the scan parameters Ss, Se, Ah/Al are OK for sequential JPEG.
* This ought to be an error condition, but we make it a warning.
*/
- if (cinfo->Ss != 0 || cinfo->Ah != 0 || cinfo->Al != 0 ||
- (cinfo->Se < DCTSIZE2 && cinfo->Se != DCTSIZE2 - 1))
+ if (cinfo->Ss != 0 || cinfo->Se != DCTSIZE2 - 1 ||
+ cinfo->Ah != 0 || cinfo->Al != 0)
WARNMS(cinfo, JWRN_NOT_SEQUENTIAL);
/* Select MCU decoding routine */
entropy->pub.decode_mcu = decode_mcu;
@@ -699,9 +713,9 @@ start_pass (j_decompress_ptr cinfo)
if (tbl < 0 || tbl >= NUM_ARITH_TBLS)
ERREXIT1(cinfo, JERR_NO_ARITH_TABLE, tbl);
if (entropy->dc_stats[tbl] == NULL)
- entropy->dc_stats[tbl] = (unsigned char *) (*cinfo->mem->alloc_small)
- ((j_common_ptr) cinfo, JPOOL_IMAGE, DC_STAT_BINS);
- MEMZERO(entropy->dc_stats[tbl], DC_STAT_BINS);
+ entropy->dc_stats[tbl] = (unsigned char *)(*cinfo->mem->alloc_small)
+ ((j_common_ptr)cinfo, JPOOL_IMAGE, DC_STAT_BINS);
+ memset(entropy->dc_stats[tbl], 0, DC_STAT_BINS);
/* Initialize DC predictions to 0 */
entropy->last_dc_val[ci] = 0;
entropy->dc_context[ci] = 0;
@@ -711,9 +725,9 @@ start_pass (j_decompress_ptr cinfo)
if (tbl < 0 || tbl >= NUM_ARITH_TBLS)
ERREXIT1(cinfo, JERR_NO_ARITH_TABLE, tbl);
if (entropy->ac_stats[tbl] == NULL)
- entropy->ac_stats[tbl] = (unsigned char *) (*cinfo->mem->alloc_small)
- ((j_common_ptr) cinfo, JPOOL_IMAGE, AC_STAT_BINS);
- MEMZERO(entropy->ac_stats[tbl], AC_STAT_BINS);
+ entropy->ac_stats[tbl] = (unsigned char *)(*cinfo->mem->alloc_small)
+ ((j_common_ptr)cinfo, JPOOL_IMAGE, AC_STAT_BINS);
+ memset(entropy->ac_stats[tbl], 0, AC_STAT_BINS);
}
}
@@ -721,6 +735,7 @@ start_pass (j_decompress_ptr cinfo)
entropy->c = 0;
entropy->a = 0;
entropy->ct = -16; /* force reading 2 initial bytes to fill C */
+ entropy->pub.insufficient_data = FALSE;
/* Initialize restart counter */
entropy->restarts_to_go = cinfo->restart_interval;
@@ -732,15 +747,15 @@ start_pass (j_decompress_ptr cinfo)
*/
GLOBAL(void)
-jinit_arith_decoder (j_decompress_ptr cinfo)
+jinit_arith_decoder(j_decompress_ptr cinfo)
{
arith_entropy_ptr entropy;
int i;
entropy = (arith_entropy_ptr)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
sizeof(arith_entropy_decoder));
- cinfo->entropy = (struct jpeg_entropy_decoder *) entropy;
+ cinfo->entropy = (struct jpeg_entropy_decoder *)entropy;
entropy->pub.start_pass = start_pass;
/* Mark tables unallocated */
@@ -756,9 +771,10 @@ jinit_arith_decoder (j_decompress_ptr cinfo)
/* Create progression status table */
int *coef_bit_ptr, ci;
cinfo->coef_bits = (int (*)[DCTSIZE2])
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
- cinfo->num_components*DCTSIZE2*sizeof(int));
- coef_bit_ptr = & cinfo->coef_bits[0][0];
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ cinfo->num_components * 2 * DCTSIZE2 *
+ sizeof(int));
+ coef_bit_ptr = &cinfo->coef_bits[0][0];
for (ci = 0; ci < cinfo->num_components; ci++)
for (i = 0; i < DCTSIZE2; i++)
*coef_bit_ptr++ = -1;
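
The NEG_1 macro introduced above exists because left-shifting a negative signed integer is undefined behavior in C (C99 6.5.7), which UBSan flags; shifting the all-ones unsigned pattern is well defined and yields the same bits. A tiny illustration (the narrowing back to a signed type is implementation-defined before C23, but two's complement everywhere this library runs):

#include <stdio.h>

int main(void)
{
  int Al = 3;
  /* "(-1) << Al" is UB; the unsigned shift below is fully defined. */
  unsigned int m1 = ((unsigned int)-1) << Al;  /* 0x...FFF8 */
  short coef = (short)m1;   /* -8 as a JCOEF-sized value */
  printf("%d\n", coef);     /* prints -8: "-1 in the bit position being coded" */
  return 0;
}
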
diff --git a/media/libjpeg/jdatadst.c b/media/libjpeg/jdatadst.c
index dcaf6f0f96..6b4fed2339 100644
--- a/media/libjpeg/jdatadst.c
+++ b/media/libjpeg/jdatadst.c
@@ -5,7 +5,7 @@
* Copyright (C) 1994-1996, Thomas G. Lane.
* Modified 2009-2012 by Guido Vollbeding.
* libjpeg-turbo Modifications:
- * Copyright (C) 2013, 2016, D. R. Commander.
+ * Copyright (C) 2013, 2016, 2022, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@@ -23,11 +23,6 @@
#include "jpeglib.h"
#include "jerror.h"
-#ifndef HAVE_STDLIB_H /* <stdlib.h> should declare malloc(),free() */
-extern void *malloc (size_t size);
-extern void free (void *ptr);
-#endif
-
/* Expanded data destination object for stdio output */
@@ -66,14 +61,14 @@ typedef my_mem_destination_mgr *my_mem_dest_ptr;
*/
METHODDEF(void)
-init_destination (j_compress_ptr cinfo)
+init_destination(j_compress_ptr cinfo)
{
- my_dest_ptr dest = (my_dest_ptr) cinfo->dest;
+ my_dest_ptr dest = (my_dest_ptr)cinfo->dest;
/* Allocate the output buffer --- it will be released when done with image */
dest->buffer = (JOCTET *)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
- OUTPUT_BUF_SIZE * sizeof(JOCTET));
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ OUTPUT_BUF_SIZE * sizeof(JOCTET));
dest->pub.next_output_byte = dest->buffer;
dest->pub.free_in_buffer = OUTPUT_BUF_SIZE;
@@ -81,7 +76,7 @@ init_destination (j_compress_ptr cinfo)
#if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED)
METHODDEF(void)
-init_mem_destination (j_compress_ptr cinfo)
+init_mem_destination(j_compress_ptr cinfo)
{
/* no work necessary here */
}
@@ -112,12 +107,12 @@ init_mem_destination (j_compress_ptr cinfo)
*/
METHODDEF(boolean)
-empty_output_buffer (j_compress_ptr cinfo)
+empty_output_buffer(j_compress_ptr cinfo)
{
- my_dest_ptr dest = (my_dest_ptr) cinfo->dest;
+ my_dest_ptr dest = (my_dest_ptr)cinfo->dest;
- if (JFWRITE(dest->outfile, dest->buffer, OUTPUT_BUF_SIZE) !=
- (size_t) OUTPUT_BUF_SIZE)
+ if (fwrite(dest->buffer, 1, OUTPUT_BUF_SIZE, dest->outfile) !=
+ (size_t)OUTPUT_BUF_SIZE)
ERREXIT(cinfo, JERR_FILE_WRITE);
dest->pub.next_output_byte = dest->buffer;
@@ -128,23 +123,22 @@ empty_output_buffer (j_compress_ptr cinfo)
#if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED)
METHODDEF(boolean)
-empty_mem_output_buffer (j_compress_ptr cinfo)
+empty_mem_output_buffer(j_compress_ptr cinfo)
{
size_t nextsize;
JOCTET *nextbuffer;
- my_mem_dest_ptr dest = (my_mem_dest_ptr) cinfo->dest;
+ my_mem_dest_ptr dest = (my_mem_dest_ptr)cinfo->dest;
/* Try to allocate new buffer with double size */
nextsize = dest->bufsize * 2;
- nextbuffer = (JOCTET *) malloc(nextsize);
+ nextbuffer = (JOCTET *)malloc(nextsize);
if (nextbuffer == NULL)
ERREXIT1(cinfo, JERR_OUT_OF_MEMORY, 10);
- MEMCOPY(nextbuffer, dest->buffer, dest->bufsize);
+ memcpy(nextbuffer, dest->buffer, dest->bufsize);
- if (dest->newbuffer != NULL)
- free(dest->newbuffer);
+ free(dest->newbuffer);
dest->newbuffer = nextbuffer;
@@ -169,14 +163,14 @@ empty_mem_output_buffer (j_compress_ptr cinfo)
*/
METHODDEF(void)
-term_destination (j_compress_ptr cinfo)
+term_destination(j_compress_ptr cinfo)
{
- my_dest_ptr dest = (my_dest_ptr) cinfo->dest;
+ my_dest_ptr dest = (my_dest_ptr)cinfo->dest;
size_t datacount = OUTPUT_BUF_SIZE - dest->pub.free_in_buffer;
/* Write any data remaining in the buffer */
if (datacount > 0) {
- if (JFWRITE(dest->outfile, dest->buffer, datacount) != datacount)
+ if (fwrite(dest->buffer, 1, datacount, dest->outfile) != datacount)
ERREXIT(cinfo, JERR_FILE_WRITE);
}
fflush(dest->outfile);
@@ -187,9 +181,9 @@ term_destination (j_compress_ptr cinfo)
#if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED)
METHODDEF(void)
-term_mem_destination (j_compress_ptr cinfo)
+term_mem_destination(j_compress_ptr cinfo)
{
- my_mem_dest_ptr dest = (my_mem_dest_ptr) cinfo->dest;
+ my_mem_dest_ptr dest = (my_mem_dest_ptr)cinfo->dest;
*dest->outbuffer = dest->buffer;
*dest->outsize = (unsigned long)(dest->bufsize - dest->pub.free_in_buffer);
@@ -204,7 +198,7 @@ term_mem_destination (j_compress_ptr cinfo)
*/
GLOBAL(void)
-jpeg_stdio_dest (j_compress_ptr cinfo, FILE *outfile)
+jpeg_stdio_dest(j_compress_ptr cinfo, FILE *outfile)
{
my_dest_ptr dest;
@@ -213,7 +207,7 @@ jpeg_stdio_dest (j_compress_ptr cinfo, FILE *outfile)
*/
if (cinfo->dest == NULL) { /* first time for this JPEG object? */
cinfo->dest = (struct jpeg_destination_mgr *)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_PERMANENT,
sizeof(my_destination_mgr));
} else if (cinfo->dest->init_destination != init_destination) {
/* It is unsafe to reuse the existing destination manager unless it was
@@ -225,7 +219,7 @@ jpeg_stdio_dest (j_compress_ptr cinfo, FILE *outfile)
ERREXIT(cinfo, JERR_BUFFER_SIZE);
}
- dest = (my_dest_ptr) cinfo->dest;
+ dest = (my_dest_ptr)cinfo->dest;
dest->pub.init_destination = init_destination;
dest->pub.empty_output_buffer = empty_output_buffer;
dest->pub.term_destination = term_destination;
@@ -249,8 +243,8 @@ jpeg_stdio_dest (j_compress_ptr cinfo, FILE *outfile)
*/
GLOBAL(void)
-jpeg_mem_dest (j_compress_ptr cinfo,
- unsigned char **outbuffer, unsigned long *outsize)
+jpeg_mem_dest(j_compress_ptr cinfo, unsigned char **outbuffer,
+ unsigned long *outsize)
{
my_mem_dest_ptr dest;
@@ -262,7 +256,7 @@ jpeg_mem_dest (j_compress_ptr cinfo,
*/
if (cinfo->dest == NULL) { /* first time for this JPEG object? */
cinfo->dest = (struct jpeg_destination_mgr *)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_PERMANENT,
sizeof(my_mem_destination_mgr));
} else if (cinfo->dest->init_destination != init_mem_destination) {
/* It is unsafe to reuse the existing destination manager unless it was
@@ -271,7 +265,7 @@ jpeg_mem_dest (j_compress_ptr cinfo,
ERREXIT(cinfo, JERR_BUFFER_SIZE);
}
- dest = (my_mem_dest_ptr) cinfo->dest;
+ dest = (my_mem_dest_ptr)cinfo->dest;
dest->pub.init_destination = init_mem_destination;
dest->pub.empty_output_buffer = empty_mem_output_buffer;
dest->pub.term_destination = term_mem_destination;
@@ -281,7 +275,7 @@ jpeg_mem_dest (j_compress_ptr cinfo,
if (*outbuffer == NULL || *outsize == 0) {
/* Allocate initial buffer */
- dest->newbuffer = *outbuffer = (unsigned char *) malloc(OUTPUT_BUF_SIZE);
+ dest->newbuffer = *outbuffer = (unsigned char *)malloc(OUTPUT_BUF_SIZE);
if (dest->newbuffer == NULL)
ERREXIT1(cinfo, JERR_OUT_OF_MEMORY, 10);
*outsize = OUTPUT_BUF_SIZE;
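
For reference, the memory destination manager patched above is normally driven through jpeg_mem_dest(). A hedged sketch; it assumes cinfo already carries image parameters and jpeg_set_defaults(), and the scanline loop is elided:

#include <stdio.h>
#include <stdlib.h>
#include "jpeglib.h"

unsigned long compress_to_mem(j_compress_ptr cinfo, unsigned char **outbuf)
{
  unsigned long outsize = 0;

  *outbuf = NULL;                    /* NULL => the library malloc()s it */
  jpeg_mem_dest(cinfo, outbuf, &outsize);
  jpeg_start_compress(cinfo, TRUE);
  /* ... jpeg_write_scanlines() loop elided ... */
  jpeg_finish_compress(cinfo);
  return outsize;                    /* bytes written; caller free()s *outbuf */
}
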
diff --git a/media/libjpeg/jdatasrc.c b/media/libjpeg/jdatasrc.c
index c83183fe19..e36a30d894 100644
--- a/media/libjpeg/jdatasrc.c
+++ b/media/libjpeg/jdatasrc.c
@@ -5,7 +5,7 @@
* Copyright (C) 1994-1996, Thomas G. Lane.
* Modified 2009-2011 by Guido Vollbeding.
* libjpeg-turbo Modifications:
- * Copyright (C) 2013, 2016, D. R. Commander.
+ * Copyright (C) 2013, 2016, 2022, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@@ -45,9 +45,9 @@ typedef my_source_mgr *my_src_ptr;
*/
METHODDEF(void)
-init_source (j_decompress_ptr cinfo)
+init_source(j_decompress_ptr cinfo)
{
- my_src_ptr src = (my_src_ptr) cinfo->src;
+ my_src_ptr src = (my_src_ptr)cinfo->src;
/* We reset the empty-input-file flag for each image,
* but we don't clear the input buffer.
@@ -58,7 +58,7 @@ init_source (j_decompress_ptr cinfo)
#if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED)
METHODDEF(void)
-init_mem_source (j_decompress_ptr cinfo)
+init_mem_source(j_decompress_ptr cinfo)
{
/* no work necessary here */
}
@@ -99,20 +99,20 @@ init_mem_source (j_decompress_ptr cinfo)
*/
METHODDEF(boolean)
-fill_input_buffer (j_decompress_ptr cinfo)
+fill_input_buffer(j_decompress_ptr cinfo)
{
- my_src_ptr src = (my_src_ptr) cinfo->src;
+ my_src_ptr src = (my_src_ptr)cinfo->src;
size_t nbytes;
- nbytes = JFREAD(src->infile, src->buffer, INPUT_BUF_SIZE);
+ nbytes = fread(src->buffer, 1, INPUT_BUF_SIZE, src->infile);
if (nbytes <= 0) {
if (src->start_of_file) /* Treat empty input file as fatal error */
ERREXIT(cinfo, JERR_INPUT_EMPTY);
WARNMS(cinfo, JWRN_JPEG_EOF);
/* Insert a fake EOI marker */
- src->buffer[0] = (JOCTET) 0xFF;
- src->buffer[1] = (JOCTET) JPEG_EOI;
+ src->buffer[0] = (JOCTET)0xFF;
+ src->buffer[1] = (JOCTET)JPEG_EOI;
nbytes = 2;
}
@@ -125,10 +125,10 @@ fill_input_buffer (j_decompress_ptr cinfo)
#if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED)
METHODDEF(boolean)
-fill_mem_input_buffer (j_decompress_ptr cinfo)
+fill_mem_input_buffer(j_decompress_ptr cinfo)
{
static const JOCTET mybuffer[4] = {
- (JOCTET) 0xFF, (JOCTET) JPEG_EOI, 0, 0
+ (JOCTET)0xFF, (JOCTET)JPEG_EOI, 0, 0
};
/* The whole JPEG data is expected to reside in the supplied memory
@@ -160,7 +160,7 @@ fill_mem_input_buffer (j_decompress_ptr cinfo)
*/
METHODDEF(void)
-skip_input_data (j_decompress_ptr cinfo, long num_bytes)
+skip_input_data(j_decompress_ptr cinfo, long num_bytes)
{
struct jpeg_source_mgr *src = cinfo->src;
@@ -169,15 +169,15 @@ skip_input_data (j_decompress_ptr cinfo, long num_bytes)
* any trouble anyway --- large skips are infrequent.
*/
if (num_bytes > 0) {
- while (num_bytes > (long) src->bytes_in_buffer) {
- num_bytes -= (long) src->bytes_in_buffer;
- (void) (*src->fill_input_buffer) (cinfo);
+ while (num_bytes > (long)src->bytes_in_buffer) {
+ num_bytes -= (long)src->bytes_in_buffer;
+ (void)(*src->fill_input_buffer) (cinfo);
/* note we assume that fill_input_buffer will never return FALSE,
* so suspension need not be handled.
*/
}
- src->next_input_byte += (size_t) num_bytes;
- src->bytes_in_buffer -= (size_t) num_bytes;
+ src->next_input_byte += (size_t)num_bytes;
+ src->bytes_in_buffer -= (size_t)num_bytes;
}
}
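
The skip loop above never seeks the underlying stream; it just drains whole buffer loads until the remainder fits in one, then advances the pointers. A worked trace, assuming the buffer is full on entry:

    /* worked example: num_bytes = 10000, INPUT_BUF_SIZE = 4096,
     * bytes_in_buffer == 4096 on entry:
     *   pass 1: 10000 > 4096  ->  refill, num_bytes = 5904
     *   pass 2:  5904 > 4096  ->  refill, num_bytes = 1808
     *   exit loop: next_input_byte += 1808; bytes_in_buffer -= 1808
     */
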
@@ -201,7 +201,7 @@ skip_input_data (j_decompress_ptr cinfo, long num_bytes)
*/
METHODDEF(void)
-term_source (j_decompress_ptr cinfo)
+term_source(j_decompress_ptr cinfo)
{
/* no work necessary here */
}
@@ -214,7 +214,7 @@ term_source (j_decompress_ptr cinfo)
*/
GLOBAL(void)
-jpeg_stdio_src (j_decompress_ptr cinfo, FILE *infile)
+jpeg_stdio_src(j_decompress_ptr cinfo, FILE *infile)
{
my_src_ptr src;
@@ -225,11 +225,11 @@ jpeg_stdio_src (j_decompress_ptr cinfo, FILE *infile)
*/
if (cinfo->src == NULL) { /* first time for this JPEG object? */
cinfo->src = (struct jpeg_source_mgr *)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_PERMANENT,
sizeof(my_source_mgr));
- src = (my_src_ptr) cinfo->src;
+ src = (my_src_ptr)cinfo->src;
src->buffer = (JOCTET *)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_PERMANENT,
INPUT_BUF_SIZE * sizeof(JOCTET));
} else if (cinfo->src->init_source != init_source) {
/* It is unsafe to reuse the existing source manager unless it was created
@@ -241,7 +241,7 @@ jpeg_stdio_src (j_decompress_ptr cinfo, FILE *infile)
ERREXIT(cinfo, JERR_BUFFER_SIZE);
}
- src = (my_src_ptr) cinfo->src;
+ src = (my_src_ptr)cinfo->src;
src->pub.init_source = init_source;
src->pub.fill_input_buffer = fill_input_buffer;
src->pub.skip_input_data = skip_input_data;
@@ -260,8 +260,8 @@ jpeg_stdio_src (j_decompress_ptr cinfo, FILE *infile)
*/
GLOBAL(void)
-jpeg_mem_src (j_decompress_ptr cinfo,
- const unsigned char *inbuffer, unsigned long insize)
+jpeg_mem_src(j_decompress_ptr cinfo, const unsigned char *inbuffer,
+ unsigned long insize)
{
struct jpeg_source_mgr *src;
@@ -274,7 +274,7 @@ jpeg_mem_src (j_decompress_ptr cinfo,
*/
if (cinfo->src == NULL) { /* first time for this JPEG object? */
cinfo->src = (struct jpeg_source_mgr *)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_PERMANENT,
sizeof(struct jpeg_source_mgr));
} else if (cinfo->src->init_source != init_mem_source) {
/* It is unsafe to reuse the existing source manager unless it was created
@@ -289,7 +289,7 @@ jpeg_mem_src (j_decompress_ptr cinfo,
src->skip_input_data = skip_input_data;
src->resync_to_restart = jpeg_resync_to_restart; /* use default method */
src->term_source = term_source;
- src->bytes_in_buffer = (size_t) insize;
- src->next_input_byte = (const JOCTET *) inbuffer;
+ src->bytes_in_buffer = (size_t)insize;
+ src->next_input_byte = (const JOCTET *)inbuffer;
}
#endif
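
jpeg_mem_src() makes no copy: next_input_byte points straight into the caller's buffer (note the const-correct signature in the hunk above), so the buffer must outlive the entire decompression. A minimal sketch that reads header fields from memory; error handling is elided, and jpeg_std_error's default error_exit terminates the process:

    #include <stdio.h>
    #include <jpeglib.h>

    /* Read JPEG dimensions straight from a memory buffer. */
    int peek_dimensions(const unsigned char *jpeg_buf, unsigned long jpeg_size,
                        int *w, int *h)
    {
      struct jpeg_decompress_struct cinfo;
      struct jpeg_error_mgr jerr;

      cinfo.err = jpeg_std_error(&jerr);
      jpeg_create_decompress(&cinfo);
      jpeg_mem_src(&cinfo, jpeg_buf, jpeg_size);   /* no copy is made */
      if (jpeg_read_header(&cinfo, TRUE) != JPEG_HEADER_OK) {
        jpeg_destroy_decompress(&cinfo);
        return -1;
      }
      *w = cinfo.image_width;
      *h = cinfo.image_height;
      jpeg_destroy_decompress(&cinfo);
      return 0;
    }
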
diff --git a/media/libjpeg/jdcoefct.c b/media/libjpeg/jdcoefct.c
index 1a48969b83..15e6cded62 100644
--- a/media/libjpeg/jdcoefct.c
+++ b/media/libjpeg/jdcoefct.c
@@ -5,8 +5,8 @@
* Copyright (C) 1994-1997, Thomas G. Lane.
* libjpeg-turbo Modifications:
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2010, 2015-2016, D. R. Commander.
- * Copyright (C) 2015, Google, Inc.
+ * Copyright (C) 2010, 2015-2016, 2019-2020, D. R. Commander.
+ * Copyright (C) 2015, 2020, Google, Inc.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@@ -25,16 +25,15 @@
/* Forward declarations */
-METHODDEF(int) decompress_onepass
- (j_decompress_ptr cinfo, JSAMPIMAGE output_buf);
+METHODDEF(int) decompress_onepass(j_decompress_ptr cinfo,
+ JSAMPIMAGE output_buf);
#ifdef D_MULTISCAN_FILES_SUPPORTED
-METHODDEF(int) decompress_data
- (j_decompress_ptr cinfo, JSAMPIMAGE output_buf);
+METHODDEF(int) decompress_data(j_decompress_ptr cinfo, JSAMPIMAGE output_buf);
#endif
#ifdef BLOCK_SMOOTHING_SUPPORTED
-LOCAL(boolean) smoothing_ok (j_decompress_ptr cinfo);
-METHODDEF(int) decompress_smooth_data
- (j_decompress_ptr cinfo, JSAMPIMAGE output_buf);
+LOCAL(boolean) smoothing_ok(j_decompress_ptr cinfo);
+METHODDEF(int) decompress_smooth_data(j_decompress_ptr cinfo,
+ JSAMPIMAGE output_buf);
#endif
@@ -43,7 +42,7 @@ METHODDEF(int) decompress_smooth_data
*/
METHODDEF(void)
-start_input_pass (j_decompress_ptr cinfo)
+start_input_pass(j_decompress_ptr cinfo)
{
cinfo->input_iMCU_row = 0;
start_iMCU_row(cinfo);
@@ -55,10 +54,10 @@ start_input_pass (j_decompress_ptr cinfo)
*/
METHODDEF(void)
-start_output_pass (j_decompress_ptr cinfo)
+start_output_pass(j_decompress_ptr cinfo)
{
#ifdef BLOCK_SMOOTHING_SUPPORTED
- my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
+ my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
/* If multipass, check to see whether to use block smoothing on this pass */
if (coef->pub.coef_arrays != NULL) {
@@ -83,9 +82,9 @@ start_output_pass (j_decompress_ptr cinfo)
*/
METHODDEF(int)
-decompress_onepass (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
+decompress_onepass(j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
{
- my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
+ my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
JDIMENSION MCU_col_num; /* index of current MCU within row */
JDIMENSION last_MCU_col = cinfo->MCUs_per_row - 1;
JDIMENSION last_iMCU_row = cinfo->total_iMCU_rows - 1;
@@ -101,9 +100,11 @@ decompress_onepass (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
for (MCU_col_num = coef->MCU_ctr; MCU_col_num <= last_MCU_col;
MCU_col_num++) {
/* Try to fetch an MCU. Entropy decoder expects buffer to be zeroed. */
- jzero_far((void *) coef->MCU_buffer[0],
- (size_t) (cinfo->blocks_in_MCU * sizeof(JBLOCK)));
- if (! (*cinfo->entropy->decode_mcu) (cinfo, coef->MCU_buffer)) {
+ jzero_far((void *)coef->MCU_buffer[0],
+ (size_t)(cinfo->blocks_in_MCU * sizeof(JBLOCK)));
+ if (!cinfo->entropy->insufficient_data)
+ cinfo->master->last_good_iMCU_row = cinfo->input_iMCU_row;
+ if (!(*cinfo->entropy->decode_mcu) (cinfo, coef->MCU_buffer)) {
/* Suspension forced; update state counters and exit */
coef->MCU_vert_offset = yoffset;
coef->MCU_ctr = MCU_col_num;
@@ -120,28 +121,28 @@ decompress_onepass (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
* incremented past them!). Note the inner loop relies on having
* allocated the MCU_buffer[] blocks sequentially.
*/
- blkn = 0; /* index of current DCT block within MCU */
+ blkn = 0; /* index of current DCT block within MCU */
for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
compptr = cinfo->cur_comp_info[ci];
/* Don't bother to IDCT an uninteresting component. */
- if (! compptr->component_needed) {
+ if (!compptr->component_needed) {
blkn += compptr->MCU_blocks;
continue;
}
inverse_DCT = cinfo->idct->inverse_DCT[compptr->component_index];
- useful_width = (MCU_col_num < last_MCU_col) ? compptr->MCU_width
- : compptr->last_col_width;
+ useful_width = (MCU_col_num < last_MCU_col) ?
+ compptr->MCU_width : compptr->last_col_width;
output_ptr = output_buf[compptr->component_index] +
- yoffset * compptr->_DCT_scaled_size;
+ yoffset * compptr->_DCT_scaled_size;
start_col = (MCU_col_num - cinfo->master->first_iMCU_col) *
- compptr->MCU_sample_width;
+ compptr->MCU_sample_width;
for (yindex = 0; yindex < compptr->MCU_height; yindex++) {
if (cinfo->input_iMCU_row < last_iMCU_row ||
- yoffset+yindex < compptr->last_row_height) {
+ yoffset + yindex < compptr->last_row_height) {
output_col = start_col;
for (xindex = 0; xindex < useful_width; xindex++) {
(*inverse_DCT) (cinfo, compptr,
- (JCOEFPTR) coef->MCU_buffer[blkn+xindex],
+ (JCOEFPTR)coef->MCU_buffer[blkn + xindex],
output_ptr, output_col);
output_col += compptr->_DCT_scaled_size;
}
@@ -172,7 +173,7 @@ decompress_onepass (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
*/
METHODDEF(int)
-dummy_consume_data (j_decompress_ptr cinfo)
+dummy_consume_data(j_decompress_ptr cinfo)
{
return JPEG_SUSPENDED; /* Always indicate nothing was done */
}
@@ -188,9 +189,9 @@ dummy_consume_data (j_decompress_ptr cinfo)
*/
METHODDEF(int)
-consume_data (j_decompress_ptr cinfo)
+consume_data(j_decompress_ptr cinfo)
{
- my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
+ my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
JDIMENSION MCU_col_num; /* index of current MCU within row */
int blkn, ci, xindex, yindex, yoffset;
JDIMENSION start_col;
@@ -202,9 +203,9 @@ consume_data (j_decompress_ptr cinfo)
for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
compptr = cinfo->cur_comp_info[ci];
buffer[ci] = (*cinfo->mem->access_virt_barray)
- ((j_common_ptr) cinfo, coef->whole_image[compptr->component_index],
+ ((j_common_ptr)cinfo, coef->whole_image[compptr->component_index],
cinfo->input_iMCU_row * compptr->v_samp_factor,
- (JDIMENSION) compptr->v_samp_factor, TRUE);
+ (JDIMENSION)compptr->v_samp_factor, TRUE);
/* Note: entropy decoder expects buffer to be zeroed,
* but this is handled automatically by the memory manager
* because we requested a pre-zeroed array.
@@ -222,14 +223,16 @@ consume_data (j_decompress_ptr cinfo)
compptr = cinfo->cur_comp_info[ci];
start_col = MCU_col_num * compptr->MCU_width;
for (yindex = 0; yindex < compptr->MCU_height; yindex++) {
- buffer_ptr = buffer[ci][yindex+yoffset] + start_col;
+ buffer_ptr = buffer[ci][yindex + yoffset] + start_col;
for (xindex = 0; xindex < compptr->MCU_width; xindex++) {
coef->MCU_buffer[blkn++] = buffer_ptr++;
}
}
}
+ if (!cinfo->entropy->insufficient_data)
+ cinfo->master->last_good_iMCU_row = cinfo->input_iMCU_row;
/* Try to fetch the MCU. */
- if (! (*cinfo->entropy->decode_mcu) (cinfo, coef->MCU_buffer)) {
+ if (!(*cinfo->entropy->decode_mcu) (cinfo, coef->MCU_buffer)) {
/* Suspension forced; update state counters and exit */
coef->MCU_vert_offset = yoffset;
coef->MCU_ctr = MCU_col_num;
@@ -259,9 +262,9 @@ consume_data (j_decompress_ptr cinfo)
*/
METHODDEF(int)
-decompress_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
+decompress_data(j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
{
- my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
+ my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
JDIMENSION last_iMCU_row = cinfo->total_iMCU_rows - 1;
JDIMENSION block_num;
int ci, block_row, block_rows;
@@ -276,7 +279,7 @@ decompress_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
while (cinfo->input_scan_number < cinfo->output_scan_number ||
(cinfo->input_scan_number == cinfo->output_scan_number &&
cinfo->input_iMCU_row <= cinfo->output_iMCU_row)) {
- if ((*cinfo->inputctl->consume_input)(cinfo) == JPEG_SUSPENDED)
+ if ((*cinfo->inputctl->consume_input) (cinfo) == JPEG_SUSPENDED)
return JPEG_SUSPENDED;
}
@@ -284,19 +287,19 @@ decompress_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
ci++, compptr++) {
/* Don't bother to IDCT an uninteresting component. */
- if (! compptr->component_needed)
+ if (!compptr->component_needed)
continue;
/* Align the virtual buffer for this component. */
buffer = (*cinfo->mem->access_virt_barray)
- ((j_common_ptr) cinfo, coef->whole_image[ci],
+ ((j_common_ptr)cinfo, coef->whole_image[ci],
cinfo->output_iMCU_row * compptr->v_samp_factor,
- (JDIMENSION) compptr->v_samp_factor, FALSE);
+ (JDIMENSION)compptr->v_samp_factor, FALSE);
/* Count non-dummy DCT block rows in this iMCU row. */
if (cinfo->output_iMCU_row < last_iMCU_row)
block_rows = compptr->v_samp_factor;
else {
/* NB: can't use last_row_height here; it is input-side-dependent! */
- block_rows = (int) (compptr->height_in_blocks % compptr->v_samp_factor);
+ block_rows = (int)(compptr->height_in_blocks % compptr->v_samp_factor);
if (block_rows == 0) block_rows = compptr->v_samp_factor;
}
inverse_DCT = cinfo->idct->inverse_DCT[ci];
@@ -307,8 +310,8 @@ decompress_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
output_col = 0;
for (block_num = cinfo->master->first_MCU_col[ci];
block_num <= cinfo->master->last_MCU_col[ci]; block_num++) {
- (*inverse_DCT) (cinfo, compptr, (JCOEFPTR) buffer_ptr,
- output_ptr, output_col);
+ (*inverse_DCT) (cinfo, compptr, (JCOEFPTR)buffer_ptr, output_ptr,
+ output_col);
buffer_ptr++;
output_col += compptr->_DCT_scaled_size;
}
@@ -327,19 +330,22 @@ decompress_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
#ifdef BLOCK_SMOOTHING_SUPPORTED
/*
- * This code applies interblock smoothing as described by section K.8
- * of the JPEG standard: the first 5 AC coefficients are estimated from
- * the DC values of a DCT block and its 8 neighboring blocks.
+ * This code applies interblock smoothing; the first 9 AC coefficients are
+ * estimated from the DC values of a DCT block and its 24 neighboring blocks.
* We apply smoothing only for progressive JPEG decoding, and only if
* the coefficients it can estimate are not yet known to full precision.
*/
-/* Natural-order array positions of the first 5 zigzag-order coefficients */
+/* Natural-order array positions of the first 9 zigzag-order coefficients */
#define Q01_POS 1
#define Q10_POS 8
#define Q20_POS 16
#define Q11_POS 9
#define Q02_POS 2
+#define Q03_POS 3
+#define Q12_POS 10
+#define Q21_POS 17
+#define Q30_POS 24
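
The Q*_POS values map zigzag positions 1-9 onto natural (row-major 8x8) array indices: coefficient ACvu, with v the vertical and u the horizontal frequency, lives at natural index 8*v + u, which is also how workspace[] is addressed in the estimator hunks below. As a sketch:

    /* natural-order index of coefficient (v, u) in an 8x8 block */
    #define NAT(v, u)  ((v) * 8 + (u))
    /* zigzag 1..9 => AC01 AC10 AC20 AC11 AC02 AC03 AC12 AC21 AC30 */
    /* NAT(0,1)=1   NAT(1,0)=8   NAT(2,0)=16  NAT(1,1)=9   NAT(0,2)=2 */
    /* NAT(0,3)=3   NAT(1,2)=10  NAT(2,1)=17  NAT(3,0)=24 */
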
/*
* Determine whether block smoothing is applicable and safe.
@@ -350,51 +356,64 @@ decompress_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
*/
LOCAL(boolean)
-smoothing_ok (j_decompress_ptr cinfo)
+smoothing_ok(j_decompress_ptr cinfo)
{
- my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
+ my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
boolean smoothing_useful = FALSE;
int ci, coefi;
jpeg_component_info *compptr;
JQUANT_TBL *qtable;
- int *coef_bits;
- int *coef_bits_latch;
+ int *coef_bits, *prev_coef_bits;
+ int *coef_bits_latch, *prev_coef_bits_latch;
- if (! cinfo->progressive_mode || cinfo->coef_bits == NULL)
+ if (!cinfo->progressive_mode || cinfo->coef_bits == NULL)
return FALSE;
/* Allocate latch area if not already done */
if (coef->coef_bits_latch == NULL)
coef->coef_bits_latch = (int *)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
- cinfo->num_components *
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ cinfo->num_components * 2 *
(SAVED_COEFS * sizeof(int)));
coef_bits_latch = coef->coef_bits_latch;
+ prev_coef_bits_latch =
+ &coef->coef_bits_latch[cinfo->num_components * SAVED_COEFS];
for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
ci++, compptr++) {
/* All components' quantization values must already be latched. */
if ((qtable = compptr->quant_table) == NULL)
return FALSE;
- /* Verify DC & first 5 AC quantizers are nonzero to avoid zero-divide. */
+ /* Verify DC & first 9 AC quantizers are nonzero to avoid zero-divide. */
if (qtable->quantval[0] == 0 ||
qtable->quantval[Q01_POS] == 0 ||
qtable->quantval[Q10_POS] == 0 ||
qtable->quantval[Q20_POS] == 0 ||
qtable->quantval[Q11_POS] == 0 ||
- qtable->quantval[Q02_POS] == 0)
+ qtable->quantval[Q02_POS] == 0 ||
+ qtable->quantval[Q03_POS] == 0 ||
+ qtable->quantval[Q12_POS] == 0 ||
+ qtable->quantval[Q21_POS] == 0 ||
+ qtable->quantval[Q30_POS] == 0)
return FALSE;
/* DC values must be at least partly known for all components. */
coef_bits = cinfo->coef_bits[ci];
+ prev_coef_bits = cinfo->coef_bits[ci + cinfo->num_components];
if (coef_bits[0] < 0)
return FALSE;
+ coef_bits_latch[0] = coef_bits[0];
/* Block smoothing is helpful if some AC coefficients remain inaccurate. */
- for (coefi = 1; coefi <= 5; coefi++) {
+ for (coefi = 1; coefi < SAVED_COEFS; coefi++) {
+ if (cinfo->input_scan_number > 1)
+ prev_coef_bits_latch[coefi] = prev_coef_bits[coefi];
+ else
+ prev_coef_bits_latch[coefi] = -1;
coef_bits_latch[coefi] = coef_bits[coefi];
if (coef_bits[coefi] != 0)
smoothing_useful = TRUE;
}
coef_bits_latch += SAVED_COEFS;
+ prev_coef_bits_latch += SAVED_COEFS;
}
return smoothing_useful;
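
With SAVED_COEFS raised to 10, the latch area is now allocated at twice num_components * SAVED_COEFS ints: the first half latches each component's current-scan coef_bits (the Al values), the second half the previous scan's, so that decompress_smooth_data can fall back to the previous scan's precision info for iMCU rows the entropy decoder only reached after input data ran out (rows past last_good_iMCU_row). A sketch of the indexing, assuming the layout above:

    /* latch layout: 2 * num_components blocks of SAVED_COEFS ints */
    int *cur_bits  = coef->coef_bits_latch + ci * SAVED_COEFS;
    int *prev_bits = coef->coef_bits_latch +
                     (ci + cinfo->num_components) * SAVED_COEFS;
    /* decompress_smooth_data then picks one per component: */
    coef_bits = (cinfo->output_iMCU_row > cinfo->master->last_good_iMCU_row) ?
                prev_bits : cur_bits;
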
@@ -406,24 +425,27 @@ smoothing_ok (j_decompress_ptr cinfo)
*/
METHODDEF(int)
-decompress_smooth_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
+decompress_smooth_data(j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
{
- my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
+ my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
JDIMENSION last_iMCU_row = cinfo->total_iMCU_rows - 1;
JDIMENSION block_num, last_block_column;
int ci, block_row, block_rows, access_rows;
JBLOCKARRAY buffer;
- JBLOCKROW buffer_ptr, prev_block_row, next_block_row;
+ JBLOCKROW buffer_ptr, prev_prev_block_row, prev_block_row;
+ JBLOCKROW next_block_row, next_next_block_row;
JSAMPARRAY output_ptr;
JDIMENSION output_col;
jpeg_component_info *compptr;
inverse_DCT_method_ptr inverse_DCT;
- boolean first_row, last_row;
+ boolean change_dc;
JCOEF *workspace;
int *coef_bits;
JQUANT_TBL *quanttbl;
- JLONG Q00,Q01,Q02,Q10,Q11,Q20, num;
- int DC1,DC2,DC3,DC4,DC5,DC6,DC7,DC8,DC9;
+ JLONG Q00, Q01, Q02, Q03 = 0, Q10, Q11, Q12 = 0, Q20, Q21 = 0, Q30 = 0, num;
+ int DC01, DC02, DC03, DC04, DC05, DC06, DC07, DC08, DC09, DC10, DC11, DC12,
+ DC13, DC14, DC15, DC16, DC17, DC18, DC19, DC20, DC21, DC22, DC23, DC24,
+ DC25;
int Al, pred;
/* Keep a local variable to avoid looking it up more than once */
@@ -431,18 +453,18 @@ decompress_smooth_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
/* Force some input to be done if we are getting ahead of the input. */
while (cinfo->input_scan_number <= cinfo->output_scan_number &&
- ! cinfo->inputctl->eoi_reached) {
+ !cinfo->inputctl->eoi_reached) {
if (cinfo->input_scan_number == cinfo->output_scan_number) {
/* If input is working on current scan, we ordinarily want it to
* have completed the current row. But if input scan is DC,
- * we want it to keep one row ahead so that next block row's DC
+ * we want it to keep two rows ahead so that next two block rows' DC
* values are up to date.
*/
- JDIMENSION delta = (cinfo->Ss == 0) ? 1 : 0;
- if (cinfo->input_iMCU_row > cinfo->output_iMCU_row+delta)
+ JDIMENSION delta = (cinfo->Ss == 0) ? 2 : 0;
+ if (cinfo->input_iMCU_row > cinfo->output_iMCU_row + delta)
break;
}
- if ((*cinfo->inputctl->consume_input)(cinfo) == JPEG_SUSPENDED)
+ if ((*cinfo->inputctl->consume_input) (cinfo) == JPEG_SUSPENDED)
return JPEG_SUSPENDED;
}
@@ -450,37 +472,56 @@ decompress_smooth_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
ci++, compptr++) {
/* Don't bother to IDCT an uninteresting component. */
- if (! compptr->component_needed)
+ if (!compptr->component_needed)
continue;
/* Count non-dummy DCT block rows in this iMCU row. */
- if (cinfo->output_iMCU_row < last_iMCU_row) {
+ if (cinfo->output_iMCU_row < last_iMCU_row - 1) {
+ block_rows = compptr->v_samp_factor;
+ access_rows = block_rows * 3; /* this and next two iMCU rows */
+ } else if (cinfo->output_iMCU_row < last_iMCU_row) {
block_rows = compptr->v_samp_factor;
access_rows = block_rows * 2; /* this and next iMCU row */
- last_row = FALSE;
} else {
/* NB: can't use last_row_height here; it is input-side-dependent! */
- block_rows = (int) (compptr->height_in_blocks % compptr->v_samp_factor);
+ block_rows = (int)(compptr->height_in_blocks % compptr->v_samp_factor);
if (block_rows == 0) block_rows = compptr->v_samp_factor;
access_rows = block_rows; /* this iMCU row only */
- last_row = TRUE;
}
/* Align the virtual buffer for this component. */
- if (cinfo->output_iMCU_row > 0) {
- access_rows += compptr->v_samp_factor; /* prior iMCU row too */
+ if (cinfo->output_iMCU_row > 1) {
+ access_rows += 2 * compptr->v_samp_factor; /* prior two iMCU rows too */
+ buffer = (*cinfo->mem->access_virt_barray)
+ ((j_common_ptr)cinfo, coef->whole_image[ci],
+ (cinfo->output_iMCU_row - 2) * compptr->v_samp_factor,
+ (JDIMENSION)access_rows, FALSE);
+ buffer += 2 * compptr->v_samp_factor; /* point to current iMCU row */
+ } else if (cinfo->output_iMCU_row > 0) {
buffer = (*cinfo->mem->access_virt_barray)
- ((j_common_ptr) cinfo, coef->whole_image[ci],
+ ((j_common_ptr)cinfo, coef->whole_image[ci],
(cinfo->output_iMCU_row - 1) * compptr->v_samp_factor,
- (JDIMENSION) access_rows, FALSE);
+ (JDIMENSION)access_rows, FALSE);
buffer += compptr->v_samp_factor; /* point to current iMCU row */
- first_row = FALSE;
} else {
buffer = (*cinfo->mem->access_virt_barray)
- ((j_common_ptr) cinfo, coef->whole_image[ci],
- (JDIMENSION) 0, (JDIMENSION) access_rows, FALSE);
- first_row = TRUE;
+ ((j_common_ptr)cinfo, coef->whole_image[ci],
+ (JDIMENSION)0, (JDIMENSION)access_rows, FALSE);
}
- /* Fetch component-dependent info */
- coef_bits = coef->coef_bits_latch + (ci * SAVED_COEFS);
+ /* Fetch component-dependent info.
+ * If the current scan is incomplete, then we use the component-dependent
+ * info from the previous scan.
+ */
+ if (cinfo->output_iMCU_row > cinfo->master->last_good_iMCU_row)
+ coef_bits =
+ coef->coef_bits_latch + ((ci + cinfo->num_components) * SAVED_COEFS);
+ else
+ coef_bits = coef->coef_bits_latch + (ci * SAVED_COEFS);
+
+ /* We only do DC interpolation if no AC coefficient data is available. */
+ change_dc =
+ coef_bits[1] == -1 && coef_bits[2] == -1 && coef_bits[3] == -1 &&
+ coef_bits[4] == -1 && coef_bits[5] == -1 && coef_bits[6] == -1 &&
+ coef_bits[7] == -1 && coef_bits[8] == -1 && coef_bits[9] == -1;
+
quanttbl = compptr->quant_table;
Q00 = quanttbl->quantval[0];
Q01 = quanttbl->quantval[Q01_POS];
@@ -488,124 +529,268 @@ decompress_smooth_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
Q20 = quanttbl->quantval[Q20_POS];
Q11 = quanttbl->quantval[Q11_POS];
Q02 = quanttbl->quantval[Q02_POS];
+ if (change_dc) {
+ Q03 = quanttbl->quantval[Q03_POS];
+ Q12 = quanttbl->quantval[Q12_POS];
+ Q21 = quanttbl->quantval[Q21_POS];
+ Q30 = quanttbl->quantval[Q30_POS];
+ }
inverse_DCT = cinfo->idct->inverse_DCT[ci];
output_ptr = output_buf[ci];
/* Loop over all DCT blocks to be processed. */
for (block_row = 0; block_row < block_rows; block_row++) {
buffer_ptr = buffer[block_row] + cinfo->master->first_MCU_col[ci];
- if (first_row && block_row == 0)
+
+ if (block_row > 0 || cinfo->output_iMCU_row > 0)
+ prev_block_row =
+ buffer[block_row - 1] + cinfo->master->first_MCU_col[ci];
+ else
prev_block_row = buffer_ptr;
+
+ if (block_row > 1 || cinfo->output_iMCU_row > 1)
+ prev_prev_block_row =
+ buffer[block_row - 2] + cinfo->master->first_MCU_col[ci];
+ else
+ prev_prev_block_row = prev_block_row;
+
+ if (block_row < block_rows - 1 || cinfo->output_iMCU_row < last_iMCU_row)
+ next_block_row =
+ buffer[block_row + 1] + cinfo->master->first_MCU_col[ci];
else
- prev_block_row = buffer[block_row-1];
- if (last_row && block_row == block_rows-1)
next_block_row = buffer_ptr;
+
+ if (block_row < block_rows - 2 ||
+ cinfo->output_iMCU_row < last_iMCU_row - 1)
+ next_next_block_row =
+ buffer[block_row + 2] + cinfo->master->first_MCU_col[ci];
else
- next_block_row = buffer[block_row+1];
+ next_next_block_row = next_block_row;
+
/* We fetch the surrounding DC values using a sliding-register approach.
- * Initialize all nine here so as to do the right thing on narrow pics.
+ * Initialize all 25 here so as to do the right thing on narrow pics.
*/
- DC1 = DC2 = DC3 = (int) prev_block_row[0][0];
- DC4 = DC5 = DC6 = (int) buffer_ptr[0][0];
- DC7 = DC8 = DC9 = (int) next_block_row[0][0];
+ DC01 = DC02 = DC03 = DC04 = DC05 = (int)prev_prev_block_row[0][0];
+ DC06 = DC07 = DC08 = DC09 = DC10 = (int)prev_block_row[0][0];
+ DC11 = DC12 = DC13 = DC14 = DC15 = (int)buffer_ptr[0][0];
+ DC16 = DC17 = DC18 = DC19 = DC20 = (int)next_block_row[0][0];
+ DC21 = DC22 = DC23 = DC24 = DC25 = (int)next_next_block_row[0][0];
output_col = 0;
last_block_column = compptr->width_in_blocks - 1;
for (block_num = cinfo->master->first_MCU_col[ci];
block_num <= cinfo->master->last_MCU_col[ci]; block_num++) {
/* Fetch current DCT block into workspace so we can modify it. */
- jcopy_block_row(buffer_ptr, (JBLOCKROW) workspace, (JDIMENSION) 1);
+ jcopy_block_row(buffer_ptr, (JBLOCKROW)workspace, (JDIMENSION)1);
/* Update DC values */
- if (block_num < last_block_column) {
- DC3 = (int) prev_block_row[1][0];
- DC6 = (int) buffer_ptr[1][0];
- DC9 = (int) next_block_row[1][0];
+ if (block_num == cinfo->master->first_MCU_col[ci] &&
+ block_num < last_block_column) {
+ DC04 = (int)prev_prev_block_row[1][0];
+ DC09 = (int)prev_block_row[1][0];
+ DC14 = (int)buffer_ptr[1][0];
+ DC19 = (int)next_block_row[1][0];
+ DC24 = (int)next_next_block_row[1][0];
+ }
+ if (block_num + 1 < last_block_column) {
+ DC05 = (int)prev_prev_block_row[2][0];
+ DC10 = (int)prev_block_row[2][0];
+ DC15 = (int)buffer_ptr[2][0];
+ DC20 = (int)next_block_row[2][0];
+ DC25 = (int)next_next_block_row[2][0];
}
- /* Compute coefficient estimates per K.8.
- * An estimate is applied only if coefficient is still zero,
- * and is not known to be fully accurate.
+ /* If DC interpolation is enabled, compute coefficient estimates using
+ * a Gaussian-like kernel, keeping the averages of the DC values.
+ *
+ * If DC interpolation is disabled, compute coefficient estimates using
+ * an algorithm similar to the one described in Section K.8 of the JPEG
+ * standard, except applied to a 5x5 window rather than a 3x3 window.
+ *
+ * An estimate is applied only if the coefficient is still zero and is
+ * not known to be fully accurate.
*/
/* AC01 */
- if ((Al=coef_bits[1]) != 0 && workspace[1] == 0) {
- num = 36 * Q00 * (DC4 - DC6);
+ if ((Al = coef_bits[1]) != 0 && workspace[1] == 0) {
+ num = Q00 * (change_dc ?
+ (-DC01 - DC02 + DC04 + DC05 - 3 * DC06 + 13 * DC07 -
+ 13 * DC09 + 3 * DC10 - 3 * DC11 + 38 * DC12 - 38 * DC14 +
+ 3 * DC15 - 3 * DC16 + 13 * DC17 - 13 * DC19 + 3 * DC20 -
+ DC21 - DC22 + DC24 + DC25) :
+ (-7 * DC11 + 50 * DC12 - 50 * DC14 + 7 * DC15));
if (num >= 0) {
- pred = (int) (((Q01<<7) + num) / (Q01<<8));
- if (Al > 0 && pred >= (1<<Al))
- pred = (1<<Al)-1;
+ pred = (int)(((Q01 << 7) + num) / (Q01 << 8));
+ if (Al > 0 && pred >= (1 << Al))
+ pred = (1 << Al) - 1;
} else {
- pred = (int) (((Q01<<7) - num) / (Q01<<8));
- if (Al > 0 && pred >= (1<<Al))
- pred = (1<<Al)-1;
+ pred = (int)(((Q01 << 7) - num) / (Q01 << 8));
+ if (Al > 0 && pred >= (1 << Al))
+ pred = (1 << Al) - 1;
pred = -pred;
}
- workspace[1] = (JCOEF) pred;
+ workspace[1] = (JCOEF)pred;
}
/* AC10 */
- if ((Al=coef_bits[2]) != 0 && workspace[8] == 0) {
- num = 36 * Q00 * (DC2 - DC8);
+ if ((Al = coef_bits[2]) != 0 && workspace[8] == 0) {
+ num = Q00 * (change_dc ?
+ (-DC01 - 3 * DC02 - 3 * DC03 - 3 * DC04 - DC05 - DC06 +
+ 13 * DC07 + 38 * DC08 + 13 * DC09 - DC10 + DC16 -
+ 13 * DC17 - 38 * DC18 - 13 * DC19 + DC20 + DC21 +
+ 3 * DC22 + 3 * DC23 + 3 * DC24 + DC25) :
+ (-7 * DC03 + 50 * DC08 - 50 * DC18 + 7 * DC23));
if (num >= 0) {
- pred = (int) (((Q10<<7) + num) / (Q10<<8));
- if (Al > 0 && pred >= (1<<Al))
- pred = (1<<Al)-1;
+ pred = (int)(((Q10 << 7) + num) / (Q10 << 8));
+ if (Al > 0 && pred >= (1 << Al))
+ pred = (1 << Al) - 1;
} else {
- pred = (int) (((Q10<<7) - num) / (Q10<<8));
- if (Al > 0 && pred >= (1<<Al))
- pred = (1<<Al)-1;
+ pred = (int)(((Q10 << 7) - num) / (Q10 << 8));
+ if (Al > 0 && pred >= (1 << Al))
+ pred = (1 << Al) - 1;
pred = -pred;
}
- workspace[8] = (JCOEF) pred;
+ workspace[8] = (JCOEF)pred;
}
/* AC20 */
- if ((Al=coef_bits[3]) != 0 && workspace[16] == 0) {
- num = 9 * Q00 * (DC2 + DC8 - 2*DC5);
+ if ((Al = coef_bits[3]) != 0 && workspace[16] == 0) {
+ num = Q00 * (change_dc ?
+ (DC03 + 2 * DC07 + 7 * DC08 + 2 * DC09 - 5 * DC12 - 14 * DC13 -
+ 5 * DC14 + 2 * DC17 + 7 * DC18 + 2 * DC19 + DC23) :
+ (-DC03 + 13 * DC08 - 24 * DC13 + 13 * DC18 - DC23));
if (num >= 0) {
- pred = (int) (((Q20<<7) + num) / (Q20<<8));
- if (Al > 0 && pred >= (1<<Al))
- pred = (1<<Al)-1;
+ pred = (int)(((Q20 << 7) + num) / (Q20 << 8));
+ if (Al > 0 && pred >= (1 << Al))
+ pred = (1 << Al) - 1;
} else {
- pred = (int) (((Q20<<7) - num) / (Q20<<8));
- if (Al > 0 && pred >= (1<<Al))
- pred = (1<<Al)-1;
+ pred = (int)(((Q20 << 7) - num) / (Q20 << 8));
+ if (Al > 0 && pred >= (1 << Al))
+ pred = (1 << Al) - 1;
pred = -pred;
}
- workspace[16] = (JCOEF) pred;
+ workspace[16] = (JCOEF)pred;
}
/* AC11 */
- if ((Al=coef_bits[4]) != 0 && workspace[9] == 0) {
- num = 5 * Q00 * (DC1 - DC3 - DC7 + DC9);
+ if ((Al = coef_bits[4]) != 0 && workspace[9] == 0) {
+ num = Q00 * (change_dc ?
+ (-DC01 + DC05 + 9 * DC07 - 9 * DC09 - 9 * DC17 +
+ 9 * DC19 + DC21 - DC25) :
+ (DC10 + DC16 - 10 * DC17 + 10 * DC19 - DC02 - DC20 + DC22 -
+ DC24 + DC04 - DC06 + 10 * DC07 - 10 * DC09));
if (num >= 0) {
- pred = (int) (((Q11<<7) + num) / (Q11<<8));
- if (Al > 0 && pred >= (1<<Al))
- pred = (1<<Al)-1;
+ pred = (int)(((Q11 << 7) + num) / (Q11 << 8));
+ if (Al > 0 && pred >= (1 << Al))
+ pred = (1 << Al) - 1;
} else {
- pred = (int) (((Q11<<7) - num) / (Q11<<8));
- if (Al > 0 && pred >= (1<<Al))
- pred = (1<<Al)-1;
+ pred = (int)(((Q11 << 7) - num) / (Q11 << 8));
+ if (Al > 0 && pred >= (1 << Al))
+ pred = (1 << Al) - 1;
pred = -pred;
}
- workspace[9] = (JCOEF) pred;
+ workspace[9] = (JCOEF)pred;
}
/* AC02 */
- if ((Al=coef_bits[5]) != 0 && workspace[2] == 0) {
- num = 9 * Q00 * (DC4 + DC6 - 2*DC5);
+ if ((Al = coef_bits[5]) != 0 && workspace[2] == 0) {
+ num = Q00 * (change_dc ?
+ (2 * DC07 - 5 * DC08 + 2 * DC09 + DC11 + 7 * DC12 - 14 * DC13 +
+ 7 * DC14 + DC15 + 2 * DC17 - 5 * DC18 + 2 * DC19) :
+ (-DC11 + 13 * DC12 - 24 * DC13 + 13 * DC14 - DC15));
if (num >= 0) {
- pred = (int) (((Q02<<7) + num) / (Q02<<8));
- if (Al > 0 && pred >= (1<<Al))
- pred = (1<<Al)-1;
+ pred = (int)(((Q02 << 7) + num) / (Q02 << 8));
+ if (Al > 0 && pred >= (1 << Al))
+ pred = (1 << Al) - 1;
} else {
- pred = (int) (((Q02<<7) - num) / (Q02<<8));
- if (Al > 0 && pred >= (1<<Al))
- pred = (1<<Al)-1;
+ pred = (int)(((Q02 << 7) - num) / (Q02 << 8));
+ if (Al > 0 && pred >= (1 << Al))
+ pred = (1 << Al) - 1;
pred = -pred;
}
- workspace[2] = (JCOEF) pred;
+ workspace[2] = (JCOEF)pred;
}
+ if (change_dc) {
+ /* AC03 */
+ if ((Al = coef_bits[6]) != 0 && workspace[3] == 0) {
+ num = Q00 * (DC07 - DC09 + 2 * DC12 - 2 * DC14 + DC17 - DC19);
+ if (num >= 0) {
+ pred = (int)(((Q03 << 7) + num) / (Q03 << 8));
+ if (Al > 0 && pred >= (1 << Al))
+ pred = (1 << Al) - 1;
+ } else {
+ pred = (int)(((Q03 << 7) - num) / (Q03 << 8));
+ if (Al > 0 && pred >= (1 << Al))
+ pred = (1 << Al) - 1;
+ pred = -pred;
+ }
+ workspace[3] = (JCOEF)pred;
+ }
+ /* AC12 */
+ if ((Al = coef_bits[7]) != 0 && workspace[10] == 0) {
+ num = Q00 * (DC07 - 3 * DC08 + DC09 - DC17 + 3 * DC18 - DC19);
+ if (num >= 0) {
+ pred = (int)(((Q12 << 7) + num) / (Q12 << 8));
+ if (Al > 0 && pred >= (1 << Al))
+ pred = (1 << Al) - 1;
+ } else {
+ pred = (int)(((Q12 << 7) - num) / (Q12 << 8));
+ if (Al > 0 && pred >= (1 << Al))
+ pred = (1 << Al) - 1;
+ pred = -pred;
+ }
+ workspace[10] = (JCOEF)pred;
+ }
+ /* AC21 */
+ if ((Al = coef_bits[8]) != 0 && workspace[17] == 0) {
+ num = Q00 * (DC07 - DC09 - 3 * DC12 + 3 * DC14 + DC17 - DC19);
+ if (num >= 0) {
+ pred = (int)(((Q21 << 7) + num) / (Q21 << 8));
+ if (Al > 0 && pred >= (1 << Al))
+ pred = (1 << Al) - 1;
+ } else {
+ pred = (int)(((Q21 << 7) - num) / (Q21 << 8));
+ if (Al > 0 && pred >= (1 << Al))
+ pred = (1 << Al) - 1;
+ pred = -pred;
+ }
+ workspace[17] = (JCOEF)pred;
+ }
+ /* AC30 */
+ if ((Al = coef_bits[9]) != 0 && workspace[24] == 0) {
+ num = Q00 * (DC07 + 2 * DC08 + DC09 - DC17 - 2 * DC18 - DC19);
+ if (num >= 0) {
+ pred = (int)(((Q30 << 7) + num) / (Q30 << 8));
+ if (Al > 0 && pred >= (1 << Al))
+ pred = (1 << Al) - 1;
+ } else {
+ pred = (int)(((Q30 << 7) - num) / (Q30 << 8));
+ if (Al > 0 && pred >= (1 << Al))
+ pred = (1 << Al) - 1;
+ pred = -pred;
+ }
+ workspace[24] = (JCOEF)pred;
+ }
+ /* coef_bits[0] is non-negative. Otherwise this function would not
+ * be called.
+ */
+ num = Q00 *
+ (-2 * DC01 - 6 * DC02 - 8 * DC03 - 6 * DC04 - 2 * DC05 -
+ 6 * DC06 + 6 * DC07 + 42 * DC08 + 6 * DC09 - 6 * DC10 -
+ 8 * DC11 + 42 * DC12 + 152 * DC13 + 42 * DC14 - 8 * DC15 -
+ 6 * DC16 + 6 * DC17 + 42 * DC18 + 6 * DC19 - 6 * DC20 -
+ 2 * DC21 - 6 * DC22 - 8 * DC23 - 6 * DC24 - 2 * DC25);
+ if (num >= 0) {
+ pred = (int)(((Q00 << 7) + num) / (Q00 << 8));
+ } else {
+ pred = (int)(((Q00 << 7) - num) / (Q00 << 8));
+ pred = -pred;
+ }
+ workspace[0] = (JCOEF)pred;
+ } /* change_dc */
+
/* OK, do the IDCT */
- (*inverse_DCT) (cinfo, compptr, (JCOEFPTR) workspace,
- output_ptr, output_col);
+ (*inverse_DCT) (cinfo, compptr, (JCOEFPTR)workspace, output_ptr,
+ output_col);
/* Advance for next column */
- DC1 = DC2; DC2 = DC3;
- DC4 = DC5; DC5 = DC6;
- DC7 = DC8; DC8 = DC9;
- buffer_ptr++, prev_block_row++, next_block_row++;
+ DC01 = DC02; DC02 = DC03; DC03 = DC04; DC04 = DC05;
+ DC06 = DC07; DC07 = DC08; DC08 = DC09; DC09 = DC10;
+ DC11 = DC12; DC12 = DC13; DC13 = DC14; DC14 = DC15;
+ DC16 = DC17; DC17 = DC18; DC18 = DC19; DC19 = DC20;
+ DC21 = DC22; DC22 = DC23; DC23 = DC24; DC24 = DC25;
+ buffer_ptr++, prev_block_row++, next_block_row++,
+ prev_prev_block_row++, next_next_block_row++;
output_col += compptr->_DCT_scaled_size;
}
output_ptr += compptr->_DCT_scaled_size;
@@ -625,14 +810,14 @@ decompress_smooth_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
*/
GLOBAL(void)
-jinit_d_coef_controller (j_decompress_ptr cinfo, boolean need_full_buffer)
+jinit_d_coef_controller(j_decompress_ptr cinfo, boolean need_full_buffer)
{
my_coef_ptr coef;
coef = (my_coef_ptr)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
sizeof(my_coef_controller));
- cinfo->coef = (struct jpeg_d_coef_controller *) coef;
+ cinfo->coef = (struct jpeg_d_coef_controller *)coef;
coef->pub.start_input_pass = start_input_pass;
coef->pub.start_output_pass = start_output_pass;
#ifdef BLOCK_SMOOTHING_SUPPORTED
@@ -654,15 +839,15 @@ jinit_d_coef_controller (j_decompress_ptr cinfo, boolean need_full_buffer)
#ifdef BLOCK_SMOOTHING_SUPPORTED
/* If block smoothing could be used, need a bigger window */
if (cinfo->progressive_mode)
- access_rows *= 3;
+ access_rows *= 5;
#endif
coef->whole_image[ci] = (*cinfo->mem->request_virt_barray)
- ((j_common_ptr) cinfo, JPOOL_IMAGE, TRUE,
- (JDIMENSION) jround_up((long) compptr->width_in_blocks,
- (long) compptr->h_samp_factor),
- (JDIMENSION) jround_up((long) compptr->height_in_blocks,
- (long) compptr->v_samp_factor),
- (JDIMENSION) access_rows);
+ ((j_common_ptr)cinfo, JPOOL_IMAGE, TRUE,
+ (JDIMENSION)jround_up((long)compptr->width_in_blocks,
+ (long)compptr->h_samp_factor),
+ (JDIMENSION)jround_up((long)compptr->height_in_blocks,
+ (long)compptr->v_samp_factor),
+ (JDIMENSION)access_rows);
}
coef->pub.consume_data = consume_data;
coef->pub.decompress_data = decompress_data;
@@ -676,7 +861,7 @@ jinit_d_coef_controller (j_decompress_ptr cinfo, boolean need_full_buffer)
int i;
buffer = (JBLOCKROW)
- (*cinfo->mem->alloc_large) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_large) ((j_common_ptr)cinfo, JPOOL_IMAGE,
D_MAX_BLOCKS_IN_MCU * sizeof(JBLOCK));
for (i = 0; i < D_MAX_BLOCKS_IN_MCU; i++) {
coef->MCU_buffer[i] = buffer + i;
@@ -688,6 +873,6 @@ jinit_d_coef_controller (j_decompress_ptr cinfo, boolean need_full_buffer)
/* Allocate the workspace buffer */
coef->workspace = (JCOEF *)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
sizeof(JCOEF) * DCTSIZE2);
}
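
Every estimator block in the smoothing hunks above repeats the same rounded, saturating division: pred = round(|num| / (256 * Q)), clamped to (1 << Al) - 1 when Al > 0 (so the prediction never exceeds the magnitude the remaining refinement scans could still supply), with the sign restored afterward. Factored out as a sketch, it is just:

    /* Rounded, saturating estimate used by decompress_smooth_data:
     * pred = round(|num| / (256 * Q)), capped at (1 << Al) - 1, signed.
     */
    static int smooth_predict(JLONG num, JLONG Q, int Al)
    {
      int pred;

      if (num >= 0) {
        pred = (int)(((Q << 7) + num) / (Q << 8)); /* +Q*128 rounds the divide */
        if (Al > 0 && pred >= (1 << Al))
          pred = (1 << Al) - 1;
      } else {
        pred = (int)(((Q << 7) - num) / (Q << 8));
        if (Al > 0 && pred >= (1 << Al))
          pred = (1 << Al) - 1;
        pred = -pred;
      }
      return pred;
    }
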
diff --git a/media/libjpeg/jdcoefct.h b/media/libjpeg/jdcoefct.h
index bf6beb274b..9a0e780663 100644
--- a/media/libjpeg/jdcoefct.h
+++ b/media/libjpeg/jdcoefct.h
@@ -5,6 +5,7 @@
* Copyright (C) 1994-1997, Thomas G. Lane.
* libjpeg-turbo Modifications:
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2020, Google, Inc.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*/
@@ -51,7 +52,7 @@ typedef struct {
#ifdef BLOCK_SMOOTHING_SUPPORTED
/* When doing block smoothing, we latch coefficient Al values here */
int *coef_bits_latch;
-#define SAVED_COEFS 6 /* we save coef_bits[0..5] */
+#define SAVED_COEFS 10 /* we save coef_bits[0..9] */
#endif
} my_coef_controller;
@@ -59,10 +60,10 @@ typedef my_coef_controller *my_coef_ptr;
LOCAL(void)
-start_iMCU_row (j_decompress_ptr cinfo)
+start_iMCU_row(j_decompress_ptr cinfo)
/* Reset within-iMCU-row counters for a new row (input side) */
{
- my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
+ my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
/* In an interleaved scan, an MCU row is the same as an iMCU row.
* In a noninterleaved scan, an iMCU row has v_samp_factor MCU rows.
@@ -71,7 +72,7 @@ start_iMCU_row (j_decompress_ptr cinfo)
if (cinfo->comps_in_scan > 1) {
coef->MCU_rows_per_iMCU_row = 1;
} else {
- if (cinfo->input_iMCU_row < (cinfo->total_iMCU_rows-1))
+ if (cinfo->input_iMCU_row < (cinfo->total_iMCU_rows - 1))
coef->MCU_rows_per_iMCU_row = cinfo->cur_comp_info[0]->v_samp_factor;
else
coef->MCU_rows_per_iMCU_row = cinfo->cur_comp_info[0]->last_row_height;
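
start_iMCU_row distinguishes interleaved scans (one MCU row per iMCU row) from noninterleaved ones (v_samp_factor MCU rows, fewer on the final row). last_row_height is the count of real block rows in the final iMCU row, computed as height_in_blocks mod v_samp_factor with 0 meaning a full v_samp_factor, the same fix-up decompress_data applies above. A worked example, assuming a noninterleaved scan:

    /* example: noninterleaved Y scan, v_samp_factor == 2,
     * height_in_blocks == 35  ->  total_iMCU_rows == 18:
     *   iMCU rows 0..16: MCU_rows_per_iMCU_row = 2
     *   iMCU row  17   : last_row_height = 35 % 2 = 1
     */
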
diff --git a/media/libjpeg/jdcol565.c b/media/libjpeg/jdcol565.c
index 349fce4a66..53c7bd9187 100644
--- a/media/libjpeg/jdcol565.c
+++ b/media/libjpeg/jdcol565.c
@@ -17,22 +17,22 @@
INLINE
LOCAL(void)
-ycc_rgb565_convert_internal (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
+ycc_rgb565_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
{
- my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
+ my_cconvert_ptr cconvert = (my_cconvert_ptr)cinfo->cconvert;
register int y, cb, cr;
register JSAMPROW outptr;
register JSAMPROW inptr0, inptr1, inptr2;
register JDIMENSION col;
JDIMENSION num_cols = cinfo->output_width;
/* copy these pointers into registers if possible */
- register JSAMPLE * range_limit = cinfo->sample_range_limit;
- register int * Crrtab = cconvert->Cr_r_tab;
- register int * Cbbtab = cconvert->Cb_b_tab;
- register JLONG * Crgtab = cconvert->Cr_g_tab;
- register JLONG * Cbgtab = cconvert->Cb_g_tab;
+ register JSAMPLE *range_limit = cinfo->sample_range_limit;
+ register int *Crrtab = cconvert->Cr_r_tab;
+ register int *Cbbtab = cconvert->Cb_b_tab;
+ register JLONG *Crgtab = cconvert->Cr_g_tab;
+ register JLONG *Cbgtab = cconvert->Cb_g_tab;
SHIFT_TEMPS
while (--num_rows >= 0) {
@@ -45,31 +45,31 @@ ycc_rgb565_convert_internal (j_decompress_ptr cinfo,
outptr = *output_buf++;
if (PACK_NEED_ALIGNMENT(outptr)) {
- y = GETJSAMPLE(*inptr0++);
- cb = GETJSAMPLE(*inptr1++);
- cr = GETJSAMPLE(*inptr2++);
+ y = *inptr0++;
+ cb = *inptr1++;
+ cr = *inptr2++;
r = range_limit[y + Crrtab[cr]];
g = range_limit[y + ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
SCALEBITS))];
b = range_limit[y + Cbbtab[cb]];
rgb = PACK_SHORT_565(r, g, b);
- *(INT16*)outptr = (INT16)rgb;
+ *(INT16 *)outptr = (INT16)rgb;
outptr += 2;
num_cols--;
}
for (col = 0; col < (num_cols >> 1); col++) {
- y = GETJSAMPLE(*inptr0++);
- cb = GETJSAMPLE(*inptr1++);
- cr = GETJSAMPLE(*inptr2++);
+ y = *inptr0++;
+ cb = *inptr1++;
+ cr = *inptr2++;
r = range_limit[y + Crrtab[cr]];
g = range_limit[y + ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
SCALEBITS))];
b = range_limit[y + Cbbtab[cb]];
rgb = PACK_SHORT_565(r, g, b);
- y = GETJSAMPLE(*inptr0++);
- cb = GETJSAMPLE(*inptr1++);
- cr = GETJSAMPLE(*inptr2++);
+ y = *inptr0++;
+ cb = *inptr1++;
+ cr = *inptr2++;
r = range_limit[y + Crrtab[cr]];
g = range_limit[y + ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
SCALEBITS))];
@@ -80,15 +80,15 @@ ycc_rgb565_convert_internal (j_decompress_ptr cinfo,
outptr += 4;
}
if (num_cols & 1) {
- y = GETJSAMPLE(*inptr0);
- cb = GETJSAMPLE(*inptr1);
- cr = GETJSAMPLE(*inptr2);
+ y = *inptr0;
+ cb = *inptr1;
+ cr = *inptr2;
r = range_limit[y + Crrtab[cr]];
g = range_limit[y + ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
SCALEBITS))];
b = range_limit[y + Cbbtab[cb]];
rgb = PACK_SHORT_565(r, g, b);
- *(INT16*)outptr = (INT16)rgb;
+ *(INT16 *)outptr = (INT16)rgb;
}
}
}
@@ -96,22 +96,22 @@ ycc_rgb565_convert_internal (j_decompress_ptr cinfo,
INLINE
LOCAL(void)
-ycc_rgb565D_convert_internal (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
+ycc_rgb565D_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
{
- my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
+ my_cconvert_ptr cconvert = (my_cconvert_ptr)cinfo->cconvert;
register int y, cb, cr;
register JSAMPROW outptr;
register JSAMPROW inptr0, inptr1, inptr2;
register JDIMENSION col;
JDIMENSION num_cols = cinfo->output_width;
/* copy these pointers into registers if possible */
- register JSAMPLE * range_limit = cinfo->sample_range_limit;
- register int * Crrtab = cconvert->Cr_r_tab;
- register int * Cbbtab = cconvert->Cb_b_tab;
- register JLONG * Crgtab = cconvert->Cr_g_tab;
- register JLONG * Cbgtab = cconvert->Cb_g_tab;
+ register JSAMPLE *range_limit = cinfo->sample_range_limit;
+ register int *Crrtab = cconvert->Cr_r_tab;
+ register int *Cbbtab = cconvert->Cb_b_tab;
+ register JLONG *Crgtab = cconvert->Cr_g_tab;
+ register JLONG *Cbgtab = cconvert->Cb_g_tab;
JLONG d0 = dither_matrix[cinfo->output_scanline & DITHER_MASK];
SHIFT_TEMPS
@@ -125,23 +125,23 @@ ycc_rgb565D_convert_internal (j_decompress_ptr cinfo,
input_row++;
outptr = *output_buf++;
if (PACK_NEED_ALIGNMENT(outptr)) {
- y = GETJSAMPLE(*inptr0++);
- cb = GETJSAMPLE(*inptr1++);
- cr = GETJSAMPLE(*inptr2++);
+ y = *inptr0++;
+ cb = *inptr1++;
+ cr = *inptr2++;
r = range_limit[DITHER_565_R(y + Crrtab[cr], d0)];
g = range_limit[DITHER_565_G(y +
((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
SCALEBITS)), d0)];
b = range_limit[DITHER_565_B(y + Cbbtab[cb], d0)];
rgb = PACK_SHORT_565(r, g, b);
- *(INT16*)outptr = (INT16)rgb;
+ *(INT16 *)outptr = (INT16)rgb;
outptr += 2;
num_cols--;
}
for (col = 0; col < (num_cols >> 1); col++) {
- y = GETJSAMPLE(*inptr0++);
- cb = GETJSAMPLE(*inptr1++);
- cr = GETJSAMPLE(*inptr2++);
+ y = *inptr0++;
+ cb = *inptr1++;
+ cr = *inptr2++;
r = range_limit[DITHER_565_R(y + Crrtab[cr], d0)];
g = range_limit[DITHER_565_G(y +
((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
@@ -150,9 +150,9 @@ ycc_rgb565D_convert_internal (j_decompress_ptr cinfo,
d0 = DITHER_ROTATE(d0);
rgb = PACK_SHORT_565(r, g, b);
- y = GETJSAMPLE(*inptr0++);
- cb = GETJSAMPLE(*inptr1++);
- cr = GETJSAMPLE(*inptr2++);
+ y = *inptr0++;
+ cb = *inptr1++;
+ cr = *inptr2++;
r = range_limit[DITHER_565_R(y + Crrtab[cr], d0)];
g = range_limit[DITHER_565_G(y +
((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
@@ -165,16 +165,16 @@ ycc_rgb565D_convert_internal (j_decompress_ptr cinfo,
outptr += 4;
}
if (num_cols & 1) {
- y = GETJSAMPLE(*inptr0);
- cb = GETJSAMPLE(*inptr1);
- cr = GETJSAMPLE(*inptr2);
+ y = *inptr0;
+ cb = *inptr1;
+ cr = *inptr2;
r = range_limit[DITHER_565_R(y + Crrtab[cr], d0)];
g = range_limit[DITHER_565_G(y +
((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
SCALEBITS)), d0)];
b = range_limit[DITHER_565_B(y + Cbbtab[cb], d0)];
rgb = PACK_SHORT_565(r, g, b);
- *(INT16*)outptr = (INT16)rgb;
+ *(INT16 *)outptr = (INT16)rgb;
}
}
}
@@ -182,9 +182,9 @@ ycc_rgb565D_convert_internal (j_decompress_ptr cinfo,
INLINE
LOCAL(void)
-rgb_rgb565_convert_internal (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
+rgb_rgb565_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
{
register JSAMPROW outptr;
register JSAMPROW inptr0, inptr1, inptr2;
@@ -202,34 +202,34 @@ rgb_rgb565_convert_internal (j_decompress_ptr cinfo,
input_row++;
outptr = *output_buf++;
if (PACK_NEED_ALIGNMENT(outptr)) {
- r = GETJSAMPLE(*inptr0++);
- g = GETJSAMPLE(*inptr1++);
- b = GETJSAMPLE(*inptr2++);
+ r = *inptr0++;
+ g = *inptr1++;
+ b = *inptr2++;
rgb = PACK_SHORT_565(r, g, b);
- *(INT16*)outptr = (INT16)rgb;
+ *(INT16 *)outptr = (INT16)rgb;
outptr += 2;
num_cols--;
}
for (col = 0; col < (num_cols >> 1); col++) {
- r = GETJSAMPLE(*inptr0++);
- g = GETJSAMPLE(*inptr1++);
- b = GETJSAMPLE(*inptr2++);
+ r = *inptr0++;
+ g = *inptr1++;
+ b = *inptr2++;
rgb = PACK_SHORT_565(r, g, b);
- r = GETJSAMPLE(*inptr0++);
- g = GETJSAMPLE(*inptr1++);
- b = GETJSAMPLE(*inptr2++);
+ r = *inptr0++;
+ g = *inptr1++;
+ b = *inptr2++;
rgb = PACK_TWO_PIXELS(rgb, PACK_SHORT_565(r, g, b));
WRITE_TWO_ALIGNED_PIXELS(outptr, rgb);
outptr += 4;
}
if (num_cols & 1) {
- r = GETJSAMPLE(*inptr0);
- g = GETJSAMPLE(*inptr1);
- b = GETJSAMPLE(*inptr2);
+ r = *inptr0;
+ g = *inptr1;
+ b = *inptr2;
rgb = PACK_SHORT_565(r, g, b);
- *(INT16*)outptr = (INT16)rgb;
+ *(INT16 *)outptr = (INT16)rgb;
}
}
}
@@ -237,14 +237,14 @@ rgb_rgb565_convert_internal (j_decompress_ptr cinfo,
INLINE
LOCAL(void)
-rgb_rgb565D_convert_internal (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
+rgb_rgb565D_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
{
register JSAMPROW outptr;
register JSAMPROW inptr0, inptr1, inptr2;
register JDIMENSION col;
- register JSAMPLE * range_limit = cinfo->sample_range_limit;
+ register JSAMPLE *range_limit = cinfo->sample_range_limit;
JDIMENSION num_cols = cinfo->output_width;
JLONG d0 = dither_matrix[cinfo->output_scanline & DITHER_MASK];
SHIFT_TEMPS
@@ -259,24 +259,24 @@ rgb_rgb565D_convert_internal (j_decompress_ptr cinfo,
input_row++;
outptr = *output_buf++;
if (PACK_NEED_ALIGNMENT(outptr)) {
- r = range_limit[DITHER_565_R(GETJSAMPLE(*inptr0++), d0)];
- g = range_limit[DITHER_565_G(GETJSAMPLE(*inptr1++), d0)];
- b = range_limit[DITHER_565_B(GETJSAMPLE(*inptr2++), d0)];
+ r = range_limit[DITHER_565_R(*inptr0++, d0)];
+ g = range_limit[DITHER_565_G(*inptr1++, d0)];
+ b = range_limit[DITHER_565_B(*inptr2++, d0)];
rgb = PACK_SHORT_565(r, g, b);
- *(INT16*)outptr = (INT16)rgb;
+ *(INT16 *)outptr = (INT16)rgb;
outptr += 2;
num_cols--;
}
for (col = 0; col < (num_cols >> 1); col++) {
- r = range_limit[DITHER_565_R(GETJSAMPLE(*inptr0++), d0)];
- g = range_limit[DITHER_565_G(GETJSAMPLE(*inptr1++), d0)];
- b = range_limit[DITHER_565_B(GETJSAMPLE(*inptr2++), d0)];
+ r = range_limit[DITHER_565_R(*inptr0++, d0)];
+ g = range_limit[DITHER_565_G(*inptr1++, d0)];
+ b = range_limit[DITHER_565_B(*inptr2++, d0)];
d0 = DITHER_ROTATE(d0);
rgb = PACK_SHORT_565(r, g, b);
- r = range_limit[DITHER_565_R(GETJSAMPLE(*inptr0++), d0)];
- g = range_limit[DITHER_565_G(GETJSAMPLE(*inptr1++), d0)];
- b = range_limit[DITHER_565_B(GETJSAMPLE(*inptr2++), d0)];
+ r = range_limit[DITHER_565_R(*inptr0++, d0)];
+ g = range_limit[DITHER_565_G(*inptr1++, d0)];
+ b = range_limit[DITHER_565_B(*inptr2++, d0)];
d0 = DITHER_ROTATE(d0);
rgb = PACK_TWO_PIXELS(rgb, PACK_SHORT_565(r, g, b));
@@ -284,11 +284,11 @@ rgb_rgb565D_convert_internal (j_decompress_ptr cinfo,
outptr += 4;
}
if (num_cols & 1) {
- r = range_limit[DITHER_565_R(GETJSAMPLE(*inptr0), d0)];
- g = range_limit[DITHER_565_G(GETJSAMPLE(*inptr1), d0)];
- b = range_limit[DITHER_565_B(GETJSAMPLE(*inptr2), d0)];
+ r = range_limit[DITHER_565_R(*inptr0, d0)];
+ g = range_limit[DITHER_565_G(*inptr1, d0)];
+ b = range_limit[DITHER_565_B(*inptr2, d0)];
rgb = PACK_SHORT_565(r, g, b);
- *(INT16*)outptr = (INT16)rgb;
+ *(INT16 *)outptr = (INT16)rgb;
}
}
}
@@ -296,9 +296,9 @@ rgb_rgb565D_convert_internal (j_decompress_ptr cinfo,
INLINE
LOCAL(void)
-gray_rgb565_convert_internal (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
+gray_rgb565_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
{
register JSAMPROW inptr, outptr;
register JDIMENSION col;
@@ -313,7 +313,7 @@ gray_rgb565_convert_internal (j_decompress_ptr cinfo,
if (PACK_NEED_ALIGNMENT(outptr)) {
g = *inptr++;
rgb = PACK_SHORT_565(g, g, g);
- *(INT16*)outptr = (INT16)rgb;
+ *(INT16 *)outptr = (INT16)rgb;
outptr += 2;
num_cols--;
}
@@ -328,7 +328,7 @@ gray_rgb565_convert_internal (j_decompress_ptr cinfo,
if (num_cols & 1) {
g = *inptr;
rgb = PACK_SHORT_565(g, g, g);
- *(INT16*)outptr = (INT16)rgb;
+ *(INT16 *)outptr = (INT16)rgb;
}
}
}
@@ -336,13 +336,13 @@ gray_rgb565_convert_internal (j_decompress_ptr cinfo,
INLINE
LOCAL(void)
-gray_rgb565D_convert_internal (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
+gray_rgb565D_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
{
register JSAMPROW inptr, outptr;
register JDIMENSION col;
- register JSAMPLE * range_limit = cinfo->sample_range_limit;
+ register JSAMPLE *range_limit = cinfo->sample_range_limit;
JDIMENSION num_cols = cinfo->output_width;
JLONG d0 = dither_matrix[cinfo->output_scanline & DITHER_MASK];
@@ -356,7 +356,7 @@ gray_rgb565D_convert_internal (j_decompress_ptr cinfo,
g = *inptr++;
g = range_limit[DITHER_565_R(g, d0)];
rgb = PACK_SHORT_565(g, g, g);
- *(INT16*)outptr = (INT16)rgb;
+ *(INT16 *)outptr = (INT16)rgb;
outptr += 2;
num_cols--;
}
@@ -378,7 +378,7 @@ gray_rgb565D_convert_internal (j_decompress_ptr cinfo,
g = *inptr;
g = range_limit[DITHER_565_R(g, d0)];
rgb = PACK_SHORT_565(g, g, g);
- *(INT16*)outptr = (INT16)rgb;
+ *(INT16 *)outptr = (INT16)rgb;
}
}
}
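
All of these converters funnel through PACK_SHORT_565, which squeezes 8-bit R/G/B into the 5-6-5 layout; the PACK_NEED_ALIGNMENT branch handles a misaligned first pixel so the two-pixels-per-iteration loop can use aligned 32-bit stores. The little-endian form of the packing is equivalent to this sketch:

    /* pack 8-bit r, g, b into a 16-bit 5-6-5 pixel (little-endian layout) */
    static INT16 pack565(int r, int g, int b)
    {
      return (INT16)(((r << 8) & 0xF800) |   /* top 5 bits of r -> 15..11 */
                     ((g << 3) & 0x07E0) |   /* top 6 bits of g -> 10..5  */
                     ((b >> 3) & 0x001F));   /* top 5 bits of b ->  4..0  */
    }
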
diff --git a/media/libjpeg/jdcolext.c b/media/libjpeg/jdcolext.c
index 59b676cc4d..863c7a2fbc 100644
--- a/media/libjpeg/jdcolext.c
+++ b/media/libjpeg/jdcolext.c
@@ -28,22 +28,22 @@
INLINE
LOCAL(void)
-ycc_rgb_convert_internal (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
+ycc_rgb_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
{
- my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
+ my_cconvert_ptr cconvert = (my_cconvert_ptr)cinfo->cconvert;
register int y, cb, cr;
register JSAMPROW outptr;
register JSAMPROW inptr0, inptr1, inptr2;
register JDIMENSION col;
JDIMENSION num_cols = cinfo->output_width;
/* copy these pointers into registers if possible */
- register JSAMPLE * range_limit = cinfo->sample_range_limit;
- register int * Crrtab = cconvert->Cr_r_tab;
- register int * Cbbtab = cconvert->Cb_b_tab;
- register JLONG * Crgtab = cconvert->Cr_g_tab;
- register JLONG * Cbgtab = cconvert->Cb_g_tab;
+ register JSAMPLE *range_limit = cinfo->sample_range_limit;
+ register int *Crrtab = cconvert->Cr_r_tab;
+ register int *Cbbtab = cconvert->Cb_b_tab;
+ register JLONG *Crgtab = cconvert->Cr_g_tab;
+ register JLONG *Cbgtab = cconvert->Cb_g_tab;
SHIFT_TEMPS
while (--num_rows >= 0) {
@@ -53,14 +53,14 @@ ycc_rgb_convert_internal (j_decompress_ptr cinfo,
input_row++;
outptr = *output_buf++;
for (col = 0; col < num_cols; col++) {
- y = GETJSAMPLE(inptr0[col]);
- cb = GETJSAMPLE(inptr1[col]);
- cr = GETJSAMPLE(inptr2[col]);
+ y = inptr0[col];
+ cb = inptr1[col];
+ cr = inptr2[col];
/* Range-limiting is essential due to noise introduced by DCT losses. */
outptr[RGB_RED] = range_limit[y + Crrtab[cr]];
outptr[RGB_GREEN] = range_limit[y +
- ((int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
- SCALEBITS))];
+ ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
+ SCALEBITS))];
outptr[RGB_BLUE] = range_limit[y + Cbbtab[cb]];
/* Set unused byte to 0xFF so it can be interpreted as an opaque */
/* alpha channel value */
@@ -81,9 +81,9 @@ ycc_rgb_convert_internal (j_decompress_ptr cinfo,
INLINE
LOCAL(void)
-gray_rgb_convert_internal (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
+gray_rgb_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
{
register JSAMPROW inptr, outptr;
register JDIMENSION col;
@@ -93,7 +93,6 @@ gray_rgb_convert_internal (j_decompress_ptr cinfo,
inptr = input_buf[0][input_row++];
outptr = *output_buf++;
for (col = 0; col < num_cols; col++) {
- /* We can dispense with GETJSAMPLE() here */
outptr[RGB_RED] = outptr[RGB_GREEN] = outptr[RGB_BLUE] = inptr[col];
/* Set unused byte to 0xFF so it can be interpreted as an opaque */
/* alpha channel value */
@@ -112,9 +111,9 @@ gray_rgb_convert_internal (j_decompress_ptr cinfo,
INLINE
LOCAL(void)
-rgb_rgb_convert_internal (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
+rgb_rgb_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
{
register JSAMPROW inptr0, inptr1, inptr2;
register JSAMPROW outptr;
@@ -128,7 +127,6 @@ rgb_rgb_convert_internal (j_decompress_ptr cinfo,
input_row++;
outptr = *output_buf++;
for (col = 0; col < num_cols; col++) {
- /* We can dispense with GETJSAMPLE() here */
outptr[RGB_RED] = inptr0[col];
outptr[RGB_GREEN] = inptr1[col];
outptr[RGB_BLUE] = inptr2[col];
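
These convert loops are pure table lookups. The tables, built by build_ycc_rgb_table() in jdcolor.c (its allocations close out this section), precompute the BT.601 inverse transform R = Y + 1.40200(Cr-128), G = Y - 0.34414(Cb-128) - 0.71414(Cr-128), B = Y + 1.77200(Cb-128), leaving only adds, one shift, and the range_limit clamp per pixel. A sketch of the table contents for the 8-bit path (CENTERJSAMPLE == 128), reconstructed from the function body using FIX, ONE_HALF, and SCALEBITS as defined in jdcolor.c below:

    /* what build_ycc_rgb_table computes, per entry */
    for (i = 0, x = -128; i <= MAXJSAMPLE; i++, x++) {
      Crrtab[i] = (int)RIGHT_SHIFT(FIX(1.40200) * x + ONE_HALF, SCALEBITS);
      Cbbtab[i] = (int)RIGHT_SHIFT(FIX(1.77200) * x + ONE_HALF, SCALEBITS);
      Crgtab[i] = (-FIX(0.71414)) * x;            /* kept scaled; the two  */
      Cbgtab[i] = (-FIX(0.34414)) * x + ONE_HALF; /* green terms are summed
                                                     and shifted once      */
    }
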
diff --git a/media/libjpeg/jdcolor.c b/media/libjpeg/jdcolor.c
index ab8fa24925..8da2b4eaf2 100644
--- a/media/libjpeg/jdcolor.c
+++ b/media/libjpeg/jdcolor.c
@@ -74,8 +74,8 @@ typedef my_color_deconverter *my_cconvert_ptr;
*/
#define SCALEBITS 16 /* speediest right-shift on some machines */
-#define ONE_HALF ((JLONG) 1 << (SCALEBITS-1))
-#define FIX(x) ((JLONG) ((x) * (1L<<SCALEBITS) + 0.5))
+#define ONE_HALF ((JLONG)1 << (SCALEBITS - 1))
+#define FIX(x) ((JLONG)((x) * (1L << SCALEBITS) + 0.5))
/* We allocate one big table for RGB->Y conversion and divide it up into
* three parts, instead of doing three alloc_small requests. This lets us
@@ -85,9 +85,9 @@ typedef my_color_deconverter *my_cconvert_ptr;
*/
#define R_Y_OFF 0 /* offset to R => Y section */
-#define G_Y_OFF (1*(MAXJSAMPLE+1)) /* offset to G => Y section */
-#define B_Y_OFF (2*(MAXJSAMPLE+1)) /* etc. */
-#define TABLE_SIZE (3*(MAXJSAMPLE+1))
+#define G_Y_OFF (1 * (MAXJSAMPLE + 1)) /* offset to G => Y section */
+#define B_Y_OFF (2 * (MAXJSAMPLE + 1)) /* etc. */
+#define TABLE_SIZE (3 * (MAXJSAMPLE + 1))
/* Include inline routines for colorspace extensions */
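
FIX() converts a real coefficient to 16-bit fixed point, e.g. FIX(0.29900) = round(0.29900 * 65536) = 19595, and ONE_HALF is the rounding term for the final shift. The single-allocation trick the comment above describes puts three (MAXJSAMPLE + 1)-entry sections at R_Y_OFF, G_Y_OFF, and B_Y_OFF in one table, so an RGB-to-grayscale conversion is three lookups, two adds, and one shift. A sketch, assuming a table named rgb_y_tab as in this file's RGB->grayscale path:

    /* one shared table, three sections;
     * Y = 0.29900 R + 0.58700 G + 0.11400 B
     */
    for (i = 0; i <= MAXJSAMPLE; i++) {
      rgb_y_tab[i + R_Y_OFF] = FIX(0.29900) * i;
      rgb_y_tab[i + G_Y_OFF] = FIX(0.58700) * i;
      rgb_y_tab[i + B_Y_OFF] = FIX(0.11400) * i + ONE_HALF; /* fold rounding */
    }
    /* per pixel: */
    y = (int)((rgb_y_tab[r + R_Y_OFF] + rgb_y_tab[g + G_Y_OFF] +
               rgb_y_tab[b + B_Y_OFF]) >> SCALEBITS);
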
@@ -98,13 +98,13 @@ typedef my_color_deconverter *my_cconvert_ptr;
#undef RGB_BLUE
#undef RGB_PIXELSIZE
-#define RGB_RED EXT_RGB_RED
-#define RGB_GREEN EXT_RGB_GREEN
-#define RGB_BLUE EXT_RGB_BLUE
-#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-#define ycc_rgb_convert_internal ycc_extrgb_convert_internal
-#define gray_rgb_convert_internal gray_extrgb_convert_internal
-#define rgb_rgb_convert_internal rgb_extrgb_convert_internal
+#define RGB_RED EXT_RGB_RED
+#define RGB_GREEN EXT_RGB_GREEN
+#define RGB_BLUE EXT_RGB_BLUE
+#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+#define ycc_rgb_convert_internal ycc_extrgb_convert_internal
+#define gray_rgb_convert_internal gray_extrgb_convert_internal
+#define rgb_rgb_convert_internal rgb_extrgb_convert_internal
#include "jdcolext.c"
#undef RGB_RED
#undef RGB_GREEN
@@ -114,14 +114,14 @@ typedef my_color_deconverter *my_cconvert_ptr;
#undef gray_rgb_convert_internal
#undef rgb_rgb_convert_internal
-#define RGB_RED EXT_RGBX_RED
-#define RGB_GREEN EXT_RGBX_GREEN
-#define RGB_BLUE EXT_RGBX_BLUE
-#define RGB_ALPHA 3
-#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-#define ycc_rgb_convert_internal ycc_extrgbx_convert_internal
-#define gray_rgb_convert_internal gray_extrgbx_convert_internal
-#define rgb_rgb_convert_internal rgb_extrgbx_convert_internal
+#define RGB_RED EXT_RGBX_RED
+#define RGB_GREEN EXT_RGBX_GREEN
+#define RGB_BLUE EXT_RGBX_BLUE
+#define RGB_ALPHA 3
+#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+#define ycc_rgb_convert_internal ycc_extrgbx_convert_internal
+#define gray_rgb_convert_internal gray_extrgbx_convert_internal
+#define rgb_rgb_convert_internal rgb_extrgbx_convert_internal
#include "jdcolext.c"
#undef RGB_RED
#undef RGB_GREEN
@@ -132,13 +132,13 @@ typedef my_color_deconverter *my_cconvert_ptr;
#undef gray_rgb_convert_internal
#undef rgb_rgb_convert_internal
-#define RGB_RED EXT_BGR_RED
-#define RGB_GREEN EXT_BGR_GREEN
-#define RGB_BLUE EXT_BGR_BLUE
-#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-#define ycc_rgb_convert_internal ycc_extbgr_convert_internal
-#define gray_rgb_convert_internal gray_extbgr_convert_internal
-#define rgb_rgb_convert_internal rgb_extbgr_convert_internal
+#define RGB_RED EXT_BGR_RED
+#define RGB_GREEN EXT_BGR_GREEN
+#define RGB_BLUE EXT_BGR_BLUE
+#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+#define ycc_rgb_convert_internal ycc_extbgr_convert_internal
+#define gray_rgb_convert_internal gray_extbgr_convert_internal
+#define rgb_rgb_convert_internal rgb_extbgr_convert_internal
#include "jdcolext.c"
#undef RGB_RED
#undef RGB_GREEN
@@ -148,14 +148,14 @@ typedef my_color_deconverter *my_cconvert_ptr;
#undef gray_rgb_convert_internal
#undef rgb_rgb_convert_internal
-#define RGB_RED EXT_BGRX_RED
-#define RGB_GREEN EXT_BGRX_GREEN
-#define RGB_BLUE EXT_BGRX_BLUE
-#define RGB_ALPHA 3
-#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-#define ycc_rgb_convert_internal ycc_extbgrx_convert_internal
-#define gray_rgb_convert_internal gray_extbgrx_convert_internal
-#define rgb_rgb_convert_internal rgb_extbgrx_convert_internal
+#define RGB_RED EXT_BGRX_RED
+#define RGB_GREEN EXT_BGRX_GREEN
+#define RGB_BLUE EXT_BGRX_BLUE
+#define RGB_ALPHA 3
+#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+#define ycc_rgb_convert_internal ycc_extbgrx_convert_internal
+#define gray_rgb_convert_internal gray_extbgrx_convert_internal
+#define rgb_rgb_convert_internal rgb_extbgrx_convert_internal
#include "jdcolext.c"
#undef RGB_RED
#undef RGB_GREEN
@@ -166,14 +166,14 @@ typedef my_color_deconverter *my_cconvert_ptr;
#undef gray_rgb_convert_internal
#undef rgb_rgb_convert_internal
-#define RGB_RED EXT_XBGR_RED
-#define RGB_GREEN EXT_XBGR_GREEN
-#define RGB_BLUE EXT_XBGR_BLUE
-#define RGB_ALPHA 0
-#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-#define ycc_rgb_convert_internal ycc_extxbgr_convert_internal
-#define gray_rgb_convert_internal gray_extxbgr_convert_internal
-#define rgb_rgb_convert_internal rgb_extxbgr_convert_internal
+#define RGB_RED EXT_XBGR_RED
+#define RGB_GREEN EXT_XBGR_GREEN
+#define RGB_BLUE EXT_XBGR_BLUE
+#define RGB_ALPHA 0
+#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+#define ycc_rgb_convert_internal ycc_extxbgr_convert_internal
+#define gray_rgb_convert_internal gray_extxbgr_convert_internal
+#define rgb_rgb_convert_internal rgb_extxbgr_convert_internal
#include "jdcolext.c"
#undef RGB_RED
#undef RGB_GREEN
@@ -184,14 +184,14 @@ typedef my_color_deconverter *my_cconvert_ptr;
#undef gray_rgb_convert_internal
#undef rgb_rgb_convert_internal
-#define RGB_RED EXT_XRGB_RED
-#define RGB_GREEN EXT_XRGB_GREEN
-#define RGB_BLUE EXT_XRGB_BLUE
-#define RGB_ALPHA 0
-#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-#define ycc_rgb_convert_internal ycc_extxrgb_convert_internal
-#define gray_rgb_convert_internal gray_extxrgb_convert_internal
-#define rgb_rgb_convert_internal rgb_extxrgb_convert_internal
+#define RGB_RED EXT_XRGB_RED
+#define RGB_GREEN EXT_XRGB_GREEN
+#define RGB_BLUE EXT_XRGB_BLUE
+#define RGB_ALPHA 0
+#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+#define ycc_rgb_convert_internal ycc_extxrgb_convert_internal
+#define gray_rgb_convert_internal gray_extxrgb_convert_internal
+#define rgb_rgb_convert_internal rgb_extxrgb_convert_internal
#include "jdcolext.c"
#undef RGB_RED
#undef RGB_GREEN
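These hunks only re-align the macro bindings; the underlying technique is unchanged: jdcolext.c is a template compiled once per pixel layout, with RGB_RED/RGB_GREEN/RGB_BLUE/RGB_PIXELSIZE and the *_internal names rebound before each #include and undefined afterwards. A single-file sketch of the same idea; since a one-file example cannot re-include a separate template, the template is collapsed into a function-like macro here, and all names are invented:

#include <stdio.h>

/* Bind the layout, instantiate the template, repeat per layout --
 * the same pattern as the #define / #include "jdcolext.c" / #undef
 * sequences above. */
#define DEFINE_STORE(name, OFF_R, OFF_G, OFF_B)                \
  static void name(unsigned char *out, int r, int g, int b) {  \
    out[OFF_R] = (unsigned char)r;                             \
    out[OFF_G] = (unsigned char)g;                             \
    out[OFF_B] = (unsigned char)b;                             \
  }

DEFINE_STORE(store_rgb, 0, 1, 2)   /* like the EXT_RGB binding */
DEFINE_STORE(store_bgr, 2, 1, 0)   /* like the EXT_BGR binding */

int main(void)
{
  unsigned char px[3];
  store_bgr(px, 255, 0, 0);                  /* red, BGR byte order */
  printf("%u %u %u\n", px[0], px[1], px[2]); /* 0 0 255 */
  return 0;
}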
@@ -208,25 +208,25 @@ typedef my_color_deconverter *my_cconvert_ptr;
*/
LOCAL(void)
-build_ycc_rgb_table (j_decompress_ptr cinfo)
+build_ycc_rgb_table(j_decompress_ptr cinfo)
{
- my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
+ my_cconvert_ptr cconvert = (my_cconvert_ptr)cinfo->cconvert;
int i;
JLONG x;
SHIFT_TEMPS
cconvert->Cr_r_tab = (int *)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
- (MAXJSAMPLE+1) * sizeof(int));
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ (MAXJSAMPLE + 1) * sizeof(int));
cconvert->Cb_b_tab = (int *)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
- (MAXJSAMPLE+1) * sizeof(int));
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ (MAXJSAMPLE + 1) * sizeof(int));
cconvert->Cr_g_tab = (JLONG *)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
- (MAXJSAMPLE+1) * sizeof(JLONG));
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ (MAXJSAMPLE + 1) * sizeof(JLONG));
cconvert->Cb_g_tab = (JLONG *)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
- (MAXJSAMPLE+1) * sizeof(JLONG));
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ (MAXJSAMPLE + 1) * sizeof(JLONG));
for (i = 0, x = -CENTERJSAMPLE; i <= MAXJSAMPLE; i++, x++) {
/* i is the actual input pixel value, in the range 0..MAXJSAMPLE */
@@ -238,10 +238,10 @@ build_ycc_rgb_table (j_decompress_ptr cinfo)
cconvert->Cb_b_tab[i] = (int)
RIGHT_SHIFT(FIX(1.77200) * x + ONE_HALF, SCALEBITS);
/* Cr=>G value is scaled-up -0.71414 * x */
- cconvert->Cr_g_tab[i] = (- FIX(0.71414)) * x;
+ cconvert->Cr_g_tab[i] = (-FIX(0.71414)) * x;
/* Cb=>G value is scaled-up -0.34414 * x */
/* We also add in ONE_HALF so that need not do it in inner loop */
- cconvert->Cb_g_tab[i] = (- FIX(0.34414)) * x + ONE_HALF;
+ cconvert->Cb_g_tab[i] = (-FIX(0.34414)) * x + ONE_HALF;
}
}
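The loop fills the tables from the JFIF YCbCr-to-RGB equations R = Y + 1.40200 (Cr - 128), G = Y - 0.34414 (Cb - 128) - 0.71414 (Cr - 128), B = Y + 1.77200 (Cb - 128); the Cr=>G and Cb=>G entries stay scaled up so both can be summed before a single right shift. A floating-point reference for what the tables compute (a sketch only; the real path is integer and range-limits afterwards; link with -lm):

#include <math.h>
#include <stdio.h>

static void ycc_to_rgb(int y, int cb, int cr, int *r, int *g, int *b)
{
  double cbx = cb - 128.0, crx = cr - 128.0;
  *r = (int)lround(y + 1.40200 * crx);
  *g = (int)lround(y - 0.34414 * cbx - 0.71414 * crx);
  *b = (int)lround(y + 1.77200 * cbx);
}

int main(void)
{
  int r, g, b;
  ycc_to_rgb(128, 128, 255, &r, &g, &b);  /* strong red chroma */
  printf("%d %d %d\n", r, g, b);          /* 306 37 128: R needs clamping */
  return 0;
}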
@@ -251,43 +251,42 @@ build_ycc_rgb_table (j_decompress_ptr cinfo)
*/
METHODDEF(void)
-ycc_rgb_convert (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
+ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
{
switch (cinfo->out_color_space) {
- case JCS_EXT_RGB:
- ycc_extrgb_convert_internal(cinfo, input_buf, input_row, output_buf,
- num_rows);
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- ycc_extrgbx_convert_internal(cinfo, input_buf, input_row, output_buf,
- num_rows);
- break;
- case JCS_EXT_BGR:
- ycc_extbgr_convert_internal(cinfo, input_buf, input_row, output_buf,
- num_rows);
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- ycc_extbgrx_convert_internal(cinfo, input_buf, input_row, output_buf,
- num_rows);
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- ycc_extxbgr_convert_internal(cinfo, input_buf, input_row, output_buf,
- num_rows);
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- ycc_extxrgb_convert_internal(cinfo, input_buf, input_row, output_buf,
- num_rows);
- break;
- default:
- ycc_rgb_convert_internal(cinfo, input_buf, input_row, output_buf,
- num_rows);
- break;
+ case JCS_EXT_RGB:
+ ycc_extrgb_convert_internal(cinfo, input_buf, input_row, output_buf,
+ num_rows);
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ ycc_extrgbx_convert_internal(cinfo, input_buf, input_row, output_buf,
+ num_rows);
+ break;
+ case JCS_EXT_BGR:
+ ycc_extbgr_convert_internal(cinfo, input_buf, input_row, output_buf,
+ num_rows);
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ ycc_extbgrx_convert_internal(cinfo, input_buf, input_row, output_buf,
+ num_rows);
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ ycc_extxbgr_convert_internal(cinfo, input_buf, input_row, output_buf,
+ num_rows);
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ ycc_extxrgb_convert_internal(cinfo, input_buf, input_row, output_buf,
+ num_rows);
+ break;
+ default:
+ ycc_rgb_convert_internal(cinfo, input_buf, input_row, output_buf,
+ num_rows);
+ break;
}
}
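From the application's side, all of this dispatch hangs off one field set between jpeg_read_header() and jpeg_start_decompress(). A hedged sketch against the public libjpeg-turbo API (error handling and allocation checks elided; assumes a build that provides jpeg_mem_src() and the JCS_EXT_* colorspaces):

#include <stdio.h>
#include <stdlib.h>
#include <jpeglib.h>

/* Requesting JCS_EXT_BGRA makes ycc_rgb_convert() dispatch to the
 * ycc_extbgrx_convert_internal() specialization; alpha bytes come
 * back as 0xFF. */
static unsigned char *decode_bgra(const unsigned char *jpg, unsigned long len,
                                  int *w, int *h)
{
  struct jpeg_decompress_struct cinfo;
  struct jpeg_error_mgr jerr;
  unsigned char *pixels;

  cinfo.err = jpeg_std_error(&jerr);
  jpeg_create_decompress(&cinfo);
  jpeg_mem_src(&cinfo, jpg, len);
  jpeg_read_header(&cinfo, TRUE);
  cinfo.out_color_space = JCS_EXT_BGRA;      /* the one-field change */
  jpeg_start_decompress(&cinfo);

  *w = (int)cinfo.output_width;
  *h = (int)cinfo.output_height;
  pixels = malloc((size_t)*w * (size_t)*h * 4);
  while (cinfo.output_scanline < cinfo.output_height) {
    JSAMPROW row = pixels + (size_t)cinfo.output_scanline * (size_t)*w * 4;
    jpeg_read_scanlines(&cinfo, &row, 1);
  }
  jpeg_finish_decompress(&cinfo);
  jpeg_destroy_decompress(&cinfo);
  return pixels;   /* caller frees; B,G,R,A byte order */
}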
@@ -300,21 +299,21 @@ ycc_rgb_convert (j_decompress_ptr cinfo,
*/
LOCAL(void)
-build_rgb_y_table (j_decompress_ptr cinfo)
+build_rgb_y_table(j_decompress_ptr cinfo)
{
- my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
+ my_cconvert_ptr cconvert = (my_cconvert_ptr)cinfo->cconvert;
JLONG *rgb_y_tab;
JLONG i;
/* Allocate and fill in the conversion tables. */
cconvert->rgb_y_tab = rgb_y_tab = (JLONG *)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
(TABLE_SIZE * sizeof(JLONG)));
for (i = 0; i <= MAXJSAMPLE; i++) {
- rgb_y_tab[i+R_Y_OFF] = FIX(0.29900) * i;
- rgb_y_tab[i+G_Y_OFF] = FIX(0.58700) * i;
- rgb_y_tab[i+B_Y_OFF] = FIX(0.11400) * i + ONE_HALF;
+ rgb_y_tab[i + R_Y_OFF] = FIX(0.29900) * i;
+ rgb_y_tab[i + G_Y_OFF] = FIX(0.58700) * i;
+ rgb_y_tab[i + B_Y_OFF] = FIX(0.11400) * i + ONE_HALF;
}
}
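The grayscale weights are the usual Rec. 601 luminance coefficients, with ONE_HALF folded into the blue table so the per-pixel loop is three loads, two adds, and one shift. The same computation without the tables, as a checkable sketch:

#include <stdio.h>

#define SCALEBITS 16
#define ONE_HALF  ((long)1 << (SCALEBITS - 1))
#define FIX(x)    ((long)((x) * (1L << SCALEBITS) + 0.5))

/* What rgb_gray_convert() computes per pixel, with the rounding term
 * carried in the blue contribution exactly as in build_rgb_y_table(). */
static unsigned char rgb_to_y(int r, int g, int b)
{
  long y = FIX(0.29900) * r + FIX(0.58700) * g + FIX(0.11400) * b + ONE_HALF;
  return (unsigned char)(y >> SCALEBITS);
}

int main(void)
{
  printf("%u\n", rgb_to_y(255, 255, 255));  /* 255: the weights sum to 1 */
  printf("%u\n", rgb_to_y(255, 0, 0));      /* 76 */
  return 0;
}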
@@ -324,11 +323,10 @@ build_rgb_y_table (j_decompress_ptr cinfo)
*/
METHODDEF(void)
-rgb_gray_convert (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
+rgb_gray_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
{
- my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
+ my_cconvert_ptr cconvert = (my_cconvert_ptr)cinfo->cconvert;
register int r, g, b;
register JLONG *ctab = cconvert->rgb_y_tab;
register JSAMPROW outptr;
@@ -343,13 +341,12 @@ rgb_gray_convert (j_decompress_ptr cinfo,
input_row++;
outptr = *output_buf++;
for (col = 0; col < num_cols; col++) {
- r = GETJSAMPLE(inptr0[col]);
- g = GETJSAMPLE(inptr1[col]);
- b = GETJSAMPLE(inptr2[col]);
+ r = inptr0[col];
+ g = inptr1[col];
+ b = inptr2[col];
/* Y */
- outptr[col] = (JSAMPLE)
- ((ctab[r+R_Y_OFF] + ctab[g+G_Y_OFF] + ctab[b+B_Y_OFF])
- >> SCALEBITS);
+ outptr[col] = (JSAMPLE)((ctab[r + R_Y_OFF] + ctab[g + G_Y_OFF] +
+ ctab[b + B_Y_OFF]) >> SCALEBITS);
}
}
}
@@ -361,9 +358,8 @@ rgb_gray_convert (j_decompress_ptr cinfo,
*/
METHODDEF(void)
-null_convert (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
+null_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
{
register JSAMPROW inptr, inptr0, inptr1, inptr2, inptr3, outptr;
register JDIMENSION col;
@@ -423,12 +419,11 @@ null_convert (j_decompress_ptr cinfo,
*/
METHODDEF(void)
-grayscale_convert (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
+grayscale_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
{
- jcopy_sample_rows(input_buf[0], (int) input_row, output_buf, 0,
- num_rows, cinfo->output_width);
+ jcopy_sample_rows(input_buf[0], (int)input_row, output_buf, 0, num_rows,
+ cinfo->output_width);
}
@@ -437,43 +432,42 @@ grayscale_convert (j_decompress_ptr cinfo,
*/
METHODDEF(void)
-gray_rgb_convert (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
+gray_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
{
switch (cinfo->out_color_space) {
- case JCS_EXT_RGB:
- gray_extrgb_convert_internal(cinfo, input_buf, input_row, output_buf,
- num_rows);
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- gray_extrgbx_convert_internal(cinfo, input_buf, input_row, output_buf,
- num_rows);
- break;
- case JCS_EXT_BGR:
- gray_extbgr_convert_internal(cinfo, input_buf, input_row, output_buf,
- num_rows);
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- gray_extbgrx_convert_internal(cinfo, input_buf, input_row, output_buf,
- num_rows);
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- gray_extxbgr_convert_internal(cinfo, input_buf, input_row, output_buf,
- num_rows);
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- gray_extxrgb_convert_internal(cinfo, input_buf, input_row, output_buf,
- num_rows);
- break;
- default:
- gray_rgb_convert_internal(cinfo, input_buf, input_row, output_buf,
- num_rows);
- break;
+ case JCS_EXT_RGB:
+ gray_extrgb_convert_internal(cinfo, input_buf, input_row, output_buf,
+ num_rows);
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ gray_extrgbx_convert_internal(cinfo, input_buf, input_row, output_buf,
+ num_rows);
+ break;
+ case JCS_EXT_BGR:
+ gray_extbgr_convert_internal(cinfo, input_buf, input_row, output_buf,
+ num_rows);
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ gray_extbgrx_convert_internal(cinfo, input_buf, input_row, output_buf,
+ num_rows);
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ gray_extxbgr_convert_internal(cinfo, input_buf, input_row, output_buf,
+ num_rows);
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ gray_extxrgb_convert_internal(cinfo, input_buf, input_row, output_buf,
+ num_rows);
+ break;
+ default:
+ gray_rgb_convert_internal(cinfo, input_buf, input_row, output_buf,
+ num_rows);
+ break;
}
}
@@ -483,43 +477,42 @@ gray_rgb_convert (j_decompress_ptr cinfo,
*/
METHODDEF(void)
-rgb_rgb_convert (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
+rgb_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
{
switch (cinfo->out_color_space) {
- case JCS_EXT_RGB:
- rgb_extrgb_convert_internal(cinfo, input_buf, input_row, output_buf,
- num_rows);
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- rgb_extrgbx_convert_internal(cinfo, input_buf, input_row, output_buf,
- num_rows);
- break;
- case JCS_EXT_BGR:
- rgb_extbgr_convert_internal(cinfo, input_buf, input_row, output_buf,
- num_rows);
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- rgb_extbgrx_convert_internal(cinfo, input_buf, input_row, output_buf,
- num_rows);
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- rgb_extxbgr_convert_internal(cinfo, input_buf, input_row, output_buf,
- num_rows);
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- rgb_extxrgb_convert_internal(cinfo, input_buf, input_row, output_buf,
- num_rows);
- break;
- default:
- rgb_rgb_convert_internal(cinfo, input_buf, input_row, output_buf,
- num_rows);
- break;
+ case JCS_EXT_RGB:
+ rgb_extrgb_convert_internal(cinfo, input_buf, input_row, output_buf,
+ num_rows);
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ rgb_extrgbx_convert_internal(cinfo, input_buf, input_row, output_buf,
+ num_rows);
+ break;
+ case JCS_EXT_BGR:
+ rgb_extbgr_convert_internal(cinfo, input_buf, input_row, output_buf,
+ num_rows);
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ rgb_extbgrx_convert_internal(cinfo, input_buf, input_row, output_buf,
+ num_rows);
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ rgb_extxbgr_convert_internal(cinfo, input_buf, input_row, output_buf,
+ num_rows);
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ rgb_extxrgb_convert_internal(cinfo, input_buf, input_row, output_buf,
+ num_rows);
+ break;
+ default:
+ rgb_rgb_convert_internal(cinfo, input_buf, input_row, output_buf,
+ num_rows);
+ break;
}
}
@@ -532,11 +525,10 @@ rgb_rgb_convert (j_decompress_ptr cinfo,
*/
METHODDEF(void)
-ycck_cmyk_convert (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
+ycck_cmyk_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
{
- my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
+ my_cconvert_ptr cconvert = (my_cconvert_ptr)cinfo->cconvert;
register int y, cb, cr;
register JSAMPROW outptr;
register JSAMPROW inptr0, inptr1, inptr2, inptr3;
@@ -558,17 +550,17 @@ ycck_cmyk_convert (j_decompress_ptr cinfo,
input_row++;
outptr = *output_buf++;
for (col = 0; col < num_cols; col++) {
- y = GETJSAMPLE(inptr0[col]);
- cb = GETJSAMPLE(inptr1[col]);
- cr = GETJSAMPLE(inptr2[col]);
+ y = inptr0[col];
+ cb = inptr1[col];
+ cr = inptr2[col];
/* Range-limiting is essential due to noise introduced by DCT losses. */
outptr[0] = range_limit[MAXJSAMPLE - (y + Crrtab[cr])]; /* red */
outptr[1] = range_limit[MAXJSAMPLE - (y + /* green */
- ((int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
+ ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
SCALEBITS)))];
outptr[2] = range_limit[MAXJSAMPLE - (y + Cbbtab[cb])]; /* blue */
/* K passes through unchanged */
- outptr[3] = inptr3[col]; /* don't need GETJSAMPLE here */
+ outptr[3] = inptr3[col];
outptr += 4;
}
}
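The YCCK path reuses the YCbCr tables to recover R'G'B', range-limits, and complements each channel, leaving K alone (Adobe-style inverted CMYK). A floating-point reference sketch of that per-pixel step, with a local clamp in place of the range_limit table:

#include <stdio.h>

static int clamp255(double v)
{
  return v < 0.0 ? 0 : v > 255.0 ? 255 : (int)(v + 0.5);
}

/* Sketch only: the real code uses the integer tables built above. */
static void ycck_to_cmyk(int y, int cb, int cr, int k, unsigned char out[4])
{
  double r = y + 1.40200 * (cr - 128);
  double g = y - 0.34414 * (cb - 128) - 0.71414 * (cr - 128);
  double b = y + 1.77200 * (cb - 128);
  out[0] = (unsigned char)(255 - clamp255(r));   /* C */
  out[1] = (unsigned char)(255 - clamp255(g));   /* M */
  out[2] = (unsigned char)(255 - clamp255(b));   /* Y */
  out[3] = (unsigned char)k;                     /* K passes through */
}

int main(void)
{
  unsigned char cmyk[4];
  ycck_to_cmyk(200, 128, 128, 30, cmyk);   /* neutral chroma */
  printf("%u %u %u %u\n", cmyk[0], cmyk[1], cmyk[2], cmyk[3]); /* 55 55 55 30 */
  return 0;
}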
@@ -579,16 +571,15 @@ ycck_cmyk_convert (j_decompress_ptr cinfo,
* RGB565 conversion
*/
-#define PACK_SHORT_565_LE(r, g, b) ((((r) << 8) & 0xF800) | \
- (((g) << 3) & 0x7E0) | ((b) >> 3))
-#define PACK_SHORT_565_BE(r, g, b) (((r) & 0xF8) | ((g) >> 5) | \
- (((g) << 11) & 0xE000) | \
- (((b) << 5) & 0x1F00))
+#define PACK_SHORT_565_LE(r, g, b) \
+ ((((r) << 8) & 0xF800) | (((g) << 3) & 0x7E0) | ((b) >> 3))
+#define PACK_SHORT_565_BE(r, g, b) \
+ (((r) & 0xF8) | ((g) >> 5) | (((g) << 11) & 0xE000) | (((b) << 5) & 0x1F00))
-#define PACK_TWO_PIXELS_LE(l, r) ((r << 16) | l)
-#define PACK_TWO_PIXELS_BE(l, r) ((l << 16) | r)
+#define PACK_TWO_PIXELS_LE(l, r) ((r << 16) | l)
+#define PACK_TWO_PIXELS_BE(l, r) ((l << 16) | r)
-#define PACK_NEED_ALIGNMENT(ptr) (((size_t)(ptr)) & 3)
+#define PACK_NEED_ALIGNMENT(ptr) (((size_t)(ptr)) & 3)
#define WRITE_TWO_ALIGNED_PIXELS(addr, pixels) ((*(int *)(addr)) = pixels)
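The packing macros squeeze 8-bit channels into a 5-6-5 layout by keeping each channel's top bits; the reflowed versions are arithmetically identical. A quick check of the little-endian variant:

#include <stdio.h>

#define PACK_SHORT_565_LE(r, g, b) \
  ((((r) << 8) & 0xF800) | (((g) << 3) & 0x7E0) | ((b) >> 3))

int main(void)
{
  /* Pure red (255,0,0): the top 5 bits of R land in bits 15..11. */
  printf("0x%04X\n", PACK_SHORT_565_LE(255, 0, 0));     /* 0xF800 */
  /* White keeps 5/6/5 significant bits per channel. */
  printf("0x%04X\n", PACK_SHORT_565_LE(255, 255, 255)); /* 0xFFFF */
  return 0;
}

The _BE variant produces the byte-swapped form of the same 16-bit value, so the bytes land in memory in the same order on a big-endian store.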
@@ -600,7 +591,7 @@ ycck_cmyk_convert (j_decompress_ptr cinfo,
/* Declarations for ordered dithering
*
* We use a 4x4 ordered dither array packed into 32 bits. This array is
- * sufficent for dithering RGB888 to RGB565.
+ * sufficient for dithering RGB888 to RGB565.
*/
#define DITHER_MASK 0x3
@@ -616,7 +607,7 @@ static const JLONG dither_matrix[4] = {
static INLINE boolean is_big_endian(void)
{
int test_value = 1;
- if(*(char *)&test_value != 1)
+ if (*(char *)&test_value != 1)
return TRUE;
return FALSE;
}
@@ -624,14 +615,14 @@ static INLINE boolean is_big_endian(void)
/* Include inline routines for RGB565 conversion */
-#define PACK_SHORT_565 PACK_SHORT_565_LE
-#define PACK_TWO_PIXELS PACK_TWO_PIXELS_LE
-#define ycc_rgb565_convert_internal ycc_rgb565_convert_le
-#define ycc_rgb565D_convert_internal ycc_rgb565D_convert_le
-#define rgb_rgb565_convert_internal rgb_rgb565_convert_le
-#define rgb_rgb565D_convert_internal rgb_rgb565D_convert_le
-#define gray_rgb565_convert_internal gray_rgb565_convert_le
-#define gray_rgb565D_convert_internal gray_rgb565D_convert_le
+#define PACK_SHORT_565 PACK_SHORT_565_LE
+#define PACK_TWO_PIXELS PACK_TWO_PIXELS_LE
+#define ycc_rgb565_convert_internal ycc_rgb565_convert_le
+#define ycc_rgb565D_convert_internal ycc_rgb565D_convert_le
+#define rgb_rgb565_convert_internal rgb_rgb565_convert_le
+#define rgb_rgb565D_convert_internal rgb_rgb565D_convert_le
+#define gray_rgb565_convert_internal gray_rgb565_convert_le
+#define gray_rgb565D_convert_internal gray_rgb565D_convert_le
#include "jdcol565.c"
#undef PACK_SHORT_565
#undef PACK_TWO_PIXELS
@@ -642,14 +633,14 @@ static INLINE boolean is_big_endian(void)
#undef gray_rgb565_convert_internal
#undef gray_rgb565D_convert_internal
-#define PACK_SHORT_565 PACK_SHORT_565_BE
-#define PACK_TWO_PIXELS PACK_TWO_PIXELS_BE
-#define ycc_rgb565_convert_internal ycc_rgb565_convert_be
-#define ycc_rgb565D_convert_internal ycc_rgb565D_convert_be
-#define rgb_rgb565_convert_internal rgb_rgb565_convert_be
-#define rgb_rgb565D_convert_internal rgb_rgb565D_convert_be
-#define gray_rgb565_convert_internal gray_rgb565_convert_be
-#define gray_rgb565D_convert_internal gray_rgb565D_convert_be
+#define PACK_SHORT_565 PACK_SHORT_565_BE
+#define PACK_TWO_PIXELS PACK_TWO_PIXELS_BE
+#define ycc_rgb565_convert_internal ycc_rgb565_convert_be
+#define ycc_rgb565D_convert_internal ycc_rgb565D_convert_be
+#define rgb_rgb565_convert_internal rgb_rgb565_convert_be
+#define rgb_rgb565D_convert_internal rgb_rgb565D_convert_be
+#define gray_rgb565_convert_internal gray_rgb565_convert_be
+#define gray_rgb565D_convert_internal gray_rgb565D_convert_be
#include "jdcol565.c"
#undef PACK_SHORT_565
#undef PACK_TWO_PIXELS
@@ -662,9 +653,8 @@ static INLINE boolean is_big_endian(void)
METHODDEF(void)
-ycc_rgb565_convert (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
+ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
{
if (is_big_endian())
ycc_rgb565_convert_be(cinfo, input_buf, input_row, output_buf, num_rows);
@@ -674,9 +664,8 @@ ycc_rgb565_convert (j_decompress_ptr cinfo,
METHODDEF(void)
-ycc_rgb565D_convert (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
+ycc_rgb565D_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
{
if (is_big_endian())
ycc_rgb565D_convert_be(cinfo, input_buf, input_row, output_buf, num_rows);
@@ -686,9 +675,8 @@ ycc_rgb565D_convert (j_decompress_ptr cinfo,
METHODDEF(void)
-rgb_rgb565_convert (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
+rgb_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
{
if (is_big_endian())
rgb_rgb565_convert_be(cinfo, input_buf, input_row, output_buf, num_rows);
@@ -698,9 +686,8 @@ rgb_rgb565_convert (j_decompress_ptr cinfo,
METHODDEF(void)
-rgb_rgb565D_convert (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
+rgb_rgb565D_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
{
if (is_big_endian())
rgb_rgb565D_convert_be(cinfo, input_buf, input_row, output_buf, num_rows);
@@ -710,9 +697,8 @@ rgb_rgb565D_convert (j_decompress_ptr cinfo,
METHODDEF(void)
-gray_rgb565_convert (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
+gray_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
{
if (is_big_endian())
gray_rgb565_convert_be(cinfo, input_buf, input_row, output_buf, num_rows);
@@ -722,9 +708,8 @@ gray_rgb565_convert (j_decompress_ptr cinfo,
METHODDEF(void)
-gray_rgb565D_convert (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
+gray_rgb565D_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
{
if (is_big_endian())
gray_rgb565D_convert_be(cinfo, input_buf, input_row, output_buf, num_rows);
@@ -738,7 +723,7 @@ gray_rgb565D_convert (j_decompress_ptr cinfo,
*/
METHODDEF(void)
-start_pass_dcolor (j_decompress_ptr cinfo)
+start_pass_dcolor(j_decompress_ptr cinfo)
{
/* no work needed */
}
@@ -749,15 +734,15 @@ start_pass_dcolor (j_decompress_ptr cinfo)
*/
GLOBAL(void)
-jinit_color_deconverter (j_decompress_ptr cinfo)
+jinit_color_deconverter(j_decompress_ptr cinfo)
{
my_cconvert_ptr cconvert;
int ci;
cconvert = (my_cconvert_ptr)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
sizeof(my_color_deconverter));
- cinfo->cconvert = (struct jpeg_color_deconverter *) cconvert;
+ cinfo->cconvert = (struct jpeg_color_deconverter *)cconvert;
cconvert->pub.start_pass = start_pass_dcolor;
/* Make sure num_components agrees with jpeg_color_space */
@@ -843,11 +828,11 @@ jinit_color_deconverter (j_decompress_ptr cinfo)
cinfo->out_color_components = 3;
if (cinfo->dither_mode == JDITHER_NONE) {
if (cinfo->jpeg_color_space == JCS_YCbCr) {
- if (jsimd_can_ycc_rgb565())
- cconvert->pub.color_convert = jsimd_ycc_rgb565_convert;
- else {
- cconvert->pub.color_convert = ycc_rgb565_convert;
- build_ycc_rgb_table(cinfo);
+ if (jsimd_can_ycc_rgb565())
+ cconvert->pub.color_convert = jsimd_ycc_rgb565_convert;
+ else {
+ cconvert->pub.color_convert = ycc_rgb565_convert;
+ build_ycc_rgb_table(cinfo);
}
} else if (cinfo->jpeg_color_space == JCS_GRAYSCALE) {
cconvert->pub.color_convert = gray_rgb565_convert;
diff --git a/media/libjpeg/jdct.h b/media/libjpeg/jdct.h
index faf8e1cf03..66d1718b77 100644
--- a/media/libjpeg/jdct.h
+++ b/media/libjpeg/jdct.h
@@ -36,7 +36,7 @@ typedef int DCTELEM; /* 16 or 32 bits is fine */
typedef unsigned int UDCTELEM;
typedef unsigned long long UDCTELEM2;
#else
-typedef short DCTELEM; /* prefer 16 bit with SIMD for parellelism */
+typedef short DCTELEM;                 /* prefer 16 bit with SIMD for parallelism */
typedef unsigned short UDCTELEM;
typedef unsigned int UDCTELEM2;
#endif
@@ -63,15 +63,15 @@ typedef unsigned long long UDCTELEM2;
* Each IDCT routine has its own ideas about the best dct_table element type.
*/
-typedef MULTIPLIER ISLOW_MULT_TYPE; /* short or int, whichever is faster */
+typedef MULTIPLIER ISLOW_MULT_TYPE; /* short or int, whichever is faster */
#if BITS_IN_JSAMPLE == 8
-typedef MULTIPLIER IFAST_MULT_TYPE; /* 16 bits is OK, use short if faster */
-#define IFAST_SCALE_BITS 2 /* fractional bits in scale factors */
+typedef MULTIPLIER IFAST_MULT_TYPE; /* 16 bits is OK, use short if faster */
+#define IFAST_SCALE_BITS 2 /* fractional bits in scale factors */
#else
-typedef JLONG IFAST_MULT_TYPE; /* need 32 bits for scaled quantizers */
-#define IFAST_SCALE_BITS 13 /* fractional bits in scale factors */
+typedef JLONG IFAST_MULT_TYPE; /* need 32 bits for scaled quantizers */
+#define IFAST_SCALE_BITS 13 /* fractional bits in scale factors */
#endif
-typedef FAST_FLOAT FLOAT_MULT_TYPE; /* preferred floating type */
+typedef FAST_FLOAT FLOAT_MULT_TYPE; /* preferred floating type */
/*
@@ -90,64 +90,64 @@ typedef FAST_FLOAT FLOAT_MULT_TYPE; /* preferred floating type */
/* Extern declarations for the forward and inverse DCT routines. */
-EXTERN(void) jpeg_fdct_islow (DCTELEM *data);
-EXTERN(void) jpeg_fdct_ifast (DCTELEM *data);
-EXTERN(void) jpeg_fdct_float (FAST_FLOAT *data);
-
-EXTERN(void) jpeg_idct_islow
- (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_ifast
- (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_float
- (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_7x7
- (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_6x6
- (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_5x5
- (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_4x4
- (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_3x3
- (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_2x2
- (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_1x1
- (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_9x9
- (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_10x10
- (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_11x11
- (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_12x12
- (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_13x13
- (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_14x14
- (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_15x15
- (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_16x16
- (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_fdct_islow(DCTELEM *data);
+EXTERN(void) jpeg_fdct_ifast(DCTELEM *data);
+EXTERN(void) jpeg_fdct_float(FAST_FLOAT *data);
+
+EXTERN(void) jpeg_idct_islow(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_ifast(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_float(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_7x7(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_6x6(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_5x5(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_4x4(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_3x3(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_2x2(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_1x1(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_9x9(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_10x10(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_11x11(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_12x12(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_13x13(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_14x14(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_15x15(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_16x16(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col);
/*
@@ -160,22 +160,22 @@ EXTERN(void) jpeg_idct_16x16
* and may differ from one module to the next.
*/
-#define ONE ((JLONG) 1)
-#define CONST_SCALE (ONE << CONST_BITS)
+#define ONE ((JLONG)1)
+#define CONST_SCALE (ONE << CONST_BITS)
/* Convert a positive real constant to an integer scaled by CONST_SCALE.
* Caution: some C compilers fail to reduce "FIX(constant)" at compile time,
* thus causing a lot of useless floating-point operations at run time.
*/
-#define FIX(x) ((JLONG) ((x) * CONST_SCALE + 0.5))
+#define FIX(x) ((JLONG)((x) * CONST_SCALE + 0.5))
/* Descale and correctly round a JLONG value that's scaled by N bits.
* We assume RIGHT_SHIFT rounds towards minus infinity, so adding
* the fudge factor is correct for either sign of X.
*/
-#define DESCALE(x,n) RIGHT_SHIFT((x) + (ONE << ((n)-1)), n)
+#define DESCALE(x, n) RIGHT_SHIFT((x) + (ONE << ((n) - 1)), n)
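DESCALE() adds half of the final unit before shifting, which rounds to nearest under the assumption stated above that RIGHT_SHIFT behaves as an arithmetic shift. A checkable sketch with RIGHT_SHIFT reduced to the plain operator:

#include <stdio.h>

#define ONE            ((long)1)
#define RIGHT_SHIFT(x, shft)  ((x) >> (shft))   /* assuming arithmetic >> */
#define DESCALE(x, n)  RIGHT_SHIFT((x) + (ONE << ((n) - 1)), n)

int main(void)
{
  /* Round 13-bit fixed point to integer: 20480/8192 = 2.5 -> 3. */
  printf("%ld\n", DESCALE(20480L, 13));    /* 3 */
  /* -20480/8192 = -2.5 -> -2: the same fudge term works for both signs. */
  printf("%ld\n", DESCALE(-20480L, 13));   /* -2 */
  return 0;
}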
/* Multiply a JLONG variable by a JLONG constant to yield a JLONG result.
* This macro is used only when the two inputs will actually be no more than
@@ -187,22 +187,22 @@ EXTERN(void) jpeg_idct_16x16
*/
#ifdef SHORTxSHORT_32 /* may work if 'int' is 32 bits */
-#define MULTIPLY16C16(var,const) (((INT16) (var)) * ((INT16) (const)))
+#define MULTIPLY16C16(var, const) (((INT16)(var)) * ((INT16)(const)))
#endif
#ifdef SHORTxLCONST_32 /* known to work with Microsoft C 6.0 */
-#define MULTIPLY16C16(var,const) (((INT16) (var)) * ((JLONG) (const)))
+#define MULTIPLY16C16(var, const) (((INT16)(var)) * ((JLONG)(const)))
#endif
#ifndef MULTIPLY16C16 /* default definition */
-#define MULTIPLY16C16(var,const) ((var) * (const))
+#define MULTIPLY16C16(var, const) ((var) * (const))
#endif
/* Same except both inputs are variables. */
#ifdef SHORTxSHORT_32 /* may work if 'int' is 32 bits */
-#define MULTIPLY16V16(var1,var2) (((INT16) (var1)) * ((INT16) (var2)))
+#define MULTIPLY16V16(var1, var2) (((INT16)(var1)) * ((INT16)(var2)))
#endif
#ifndef MULTIPLY16V16 /* default definition */
-#define MULTIPLY16V16(var1,var2) ((var1) * (var2))
+#define MULTIPLY16V16(var1, var2) ((var1) * (var2))
#endif
diff --git a/media/libjpeg/jddctmgr.c b/media/libjpeg/jddctmgr.c
index 3a5ba7e893..e78d7bebe2 100644
--- a/media/libjpeg/jddctmgr.c
+++ b/media/libjpeg/jddctmgr.c
@@ -6,7 +6,7 @@
* Modified 2002-2010 by Guido Vollbeding.
* libjpeg-turbo Modifications:
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2010, 2015, D. R. Commander.
+ * Copyright (C) 2010, 2015, 2022, D. R. Commander.
* Copyright (C) 2013, MIPS Technologies, Inc., California.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
@@ -94,9 +94,9 @@ typedef union {
*/
METHODDEF(void)
-start_pass (j_decompress_ptr cinfo)
+start_pass(j_decompress_ptr cinfo)
{
- my_idct_ptr idct = (my_idct_ptr) cinfo->idct;
+ my_idct_ptr idct = (my_idct_ptr)cinfo->idct;
int ci, i;
jpeg_component_info *compptr;
int method = 0;
@@ -233,7 +233,7 @@ start_pass (j_decompress_ptr cinfo)
* multiplier table all-zero; we'll be reading zeroes from the
* coefficient controller's buffer anyway.
*/
- if (! compptr->component_needed || idct->cur_method[ci] == method)
+ if (!compptr->component_needed || idct->cur_method[ci] == method)
continue;
qtbl = compptr->quant_table;
if (qtbl == NULL) /* happens if no data yet for component */
@@ -246,9 +246,9 @@ start_pass (j_decompress_ptr cinfo)
/* For LL&M IDCT method, multipliers are equal to raw quantization
* coefficients, but are stored as ints to ensure access efficiency.
*/
- ISLOW_MULT_TYPE *ismtbl = (ISLOW_MULT_TYPE *) compptr->dct_table;
+ ISLOW_MULT_TYPE *ismtbl = (ISLOW_MULT_TYPE *)compptr->dct_table;
for (i = 0; i < DCTSIZE2; i++) {
- ismtbl[i] = (ISLOW_MULT_TYPE) qtbl->quantval[i];
+ ismtbl[i] = (ISLOW_MULT_TYPE)qtbl->quantval[i];
}
}
break;
@@ -263,8 +263,8 @@ start_pass (j_decompress_ptr cinfo)
* For integer operation, the multiplier table is to be scaled by
* IFAST_SCALE_BITS.
*/
- IFAST_MULT_TYPE *ifmtbl = (IFAST_MULT_TYPE *) compptr->dct_table;
-#define CONST_BITS 14
+ IFAST_MULT_TYPE *ifmtbl = (IFAST_MULT_TYPE *)compptr->dct_table;
+#define CONST_BITS 14
static const INT16 aanscales[DCTSIZE2] = {
/* precomputed values scaled up by 14 bits */
16384, 22725, 21407, 19266, 16384, 12873, 8867, 4520,
@@ -280,9 +280,9 @@ start_pass (j_decompress_ptr cinfo)
for (i = 0; i < DCTSIZE2; i++) {
ifmtbl[i] = (IFAST_MULT_TYPE)
- DESCALE(MULTIPLY16V16((JLONG) qtbl->quantval[i],
- (JLONG) aanscales[i]),
- CONST_BITS-IFAST_SCALE_BITS);
+ DESCALE(MULTIPLY16V16((JLONG)qtbl->quantval[i],
+ (JLONG)aanscales[i]),
+ CONST_BITS - IFAST_SCALE_BITS);
}
}
break;
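For the AAN integer IDCT, each multiplier is quantval[i] * aanscales[i], descaled from CONST_BITS (14) down to IFAST_SCALE_BITS (2), i.e. a rounding shift by 12. One entry worked through as a sketch:

#include <stdio.h>

#define CONST_BITS        14
#define IFAST_SCALE_BITS  2    /* 8-bit samples */
#define DESCALE(x, n)     (((x) + (1L << ((n) - 1))) >> (n))

int main(void)
{
  /* aanscales[0] is 16384, i.e. 1.0 in 14-bit fixed point, so a
   * quantizer of 16 becomes 16 << IFAST_SCALE_BITS = 64. */
  long q = 16, aan = 16384;
  printf("%ld\n", DESCALE(q * aan, CONST_BITS - IFAST_SCALE_BITS));  /* 64 */
  return 0;
}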
@@ -295,7 +295,7 @@ start_pass (j_decompress_ptr cinfo)
* scalefactor[0] = 1
* scalefactor[k] = cos(k*PI/16) * sqrt(2) for k=1..7
*/
- FLOAT_MULT_TYPE *fmtbl = (FLOAT_MULT_TYPE *) compptr->dct_table;
+ FLOAT_MULT_TYPE *fmtbl = (FLOAT_MULT_TYPE *)compptr->dct_table;
int row, col;
static const double aanscalefactor[DCTSIZE] = {
1.0, 1.387039845, 1.306562965, 1.175875602,
@@ -306,7 +306,7 @@ start_pass (j_decompress_ptr cinfo)
for (row = 0; row < DCTSIZE; row++) {
for (col = 0; col < DCTSIZE; col++) {
fmtbl[i] = (FLOAT_MULT_TYPE)
- ((double) qtbl->quantval[i] *
+ ((double)qtbl->quantval[i] *
aanscalefactor[row] * aanscalefactor[col]);
i++;
}
@@ -327,25 +327,25 @@ start_pass (j_decompress_ptr cinfo)
*/
GLOBAL(void)
-jinit_inverse_dct (j_decompress_ptr cinfo)
+jinit_inverse_dct(j_decompress_ptr cinfo)
{
my_idct_ptr idct;
int ci;
jpeg_component_info *compptr;
idct = (my_idct_ptr)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
sizeof(my_idct_controller));
- cinfo->idct = (struct jpeg_inverse_dct *) idct;
+ cinfo->idct = (struct jpeg_inverse_dct *)idct;
idct->pub.start_pass = start_pass;
for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
ci++, compptr++) {
/* Allocate and pre-zero a multiplier table for each component */
compptr->dct_table =
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
sizeof(multiplier_table));
- MEMZERO(compptr->dct_table, sizeof(multiplier_table));
+ memset(compptr->dct_table, 0, sizeof(multiplier_table));
/* Mark multiplier table not yet set up for any method */
idct->cur_method[ci] = -1;
}
diff --git a/media/libjpeg/jdhuff.c b/media/libjpeg/jdhuff.c
index fa78e2962a..679d221685 100644
--- a/media/libjpeg/jdhuff.c
+++ b/media/libjpeg/jdhuff.c
@@ -4,7 +4,8 @@
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1991-1997, Thomas G. Lane.
* libjpeg-turbo Modifications:
- * Copyright (C) 2009-2011, 2016, D. R. Commander.
+ * Copyright (C) 2009-2011, 2016, 2018-2019, D. R. Commander.
+ * Copyright (C) 2018, Matthias Räncker.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@@ -15,6 +16,9 @@
* up to the start of the current MCU. To do this, we copy state variables
* into local working storage, and update them back to the permanent
* storage only upon successful completion of an MCU.
+ *
+ * NOTE: All referenced figures are from
+ * Recommendation ITU-T T.81 (1992) | ISO/IEC 10918-1:1994.
*/
#define JPEG_INTERNALS
@@ -36,24 +40,6 @@ typedef struct {
int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */
} savable_state;
-/* This macro is to work around compilers with missing or broken
- * structure assignment. You'll need to fix this code if you have
- * such a compiler and you change MAX_COMPS_IN_SCAN.
- */
-
-#ifndef NO_STRUCT_ASSIGN
-#define ASSIGN_STATE(dest,src) ((dest) = (src))
-#else
-#if MAX_COMPS_IN_SCAN == 4
-#define ASSIGN_STATE(dest,src) \
- ((dest).last_dc_val[0] = (src).last_dc_val[0], \
- (dest).last_dc_val[1] = (src).last_dc_val[1], \
- (dest).last_dc_val[2] = (src).last_dc_val[2], \
- (dest).last_dc_val[3] = (src).last_dc_val[3])
-#endif
-#endif
-
-
typedef struct {
struct jpeg_entropy_decoder pub; /* public fields */
@@ -88,9 +74,9 @@ typedef huff_entropy_decoder *huff_entropy_ptr;
*/
METHODDEF(void)
-start_pass_huff_decoder (j_decompress_ptr cinfo)
+start_pass_huff_decoder(j_decompress_ptr cinfo)
{
- huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+ huff_entropy_ptr entropy = (huff_entropy_ptr)cinfo->entropy;
int ci, blkn, dctbl, actbl;
d_derived_tbl **pdtbl;
jpeg_component_info *compptr;
@@ -99,7 +85,7 @@ start_pass_huff_decoder (j_decompress_ptr cinfo)
* This ought to be an error condition, but we make it a warning because
* there are some baseline files out there with all zeroes in these bytes.
*/
- if (cinfo->Ss != 0 || cinfo->Se != DCTSIZE2-1 ||
+ if (cinfo->Ss != 0 || cinfo->Se != DCTSIZE2 - 1 ||
cinfo->Ah != 0 || cinfo->Al != 0)
WARNMS(cinfo, JWRN_NOT_SEQUENTIAL);
@@ -152,8 +138,8 @@ start_pass_huff_decoder (j_decompress_ptr cinfo)
*/
GLOBAL(void)
-jpeg_make_d_derived_tbl (j_decompress_ptr cinfo, boolean isDC, int tblno,
- d_derived_tbl **pdtbl)
+jpeg_make_d_derived_tbl(j_decompress_ptr cinfo, boolean isDC, int tblno,
+ d_derived_tbl **pdtbl)
{
JHUFF_TBL *htbl;
d_derived_tbl *dtbl;
@@ -178,7 +164,7 @@ jpeg_make_d_derived_tbl (j_decompress_ptr cinfo, boolean isDC, int tblno,
/* Allocate a workspace if we haven't already done so. */
if (*pdtbl == NULL)
*pdtbl = (d_derived_tbl *)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
sizeof(d_derived_tbl));
dtbl = *pdtbl;
dtbl->pub = htbl; /* fill in back link */
@@ -187,11 +173,11 @@ jpeg_make_d_derived_tbl (j_decompress_ptr cinfo, boolean isDC, int tblno,
p = 0;
for (l = 1; l <= 16; l++) {
- i = (int) htbl->bits[l];
+ i = (int)htbl->bits[l];
if (i < 0 || p + i > 256) /* protect against table overrun */
ERREXIT(cinfo, JERR_BAD_HUFF_TABLE);
while (i--)
- huffsize[p++] = (char) l;
+ huffsize[p++] = (char)l;
}
huffsize[p] = 0;
numsymbols = p;
@@ -203,14 +189,14 @@ jpeg_make_d_derived_tbl (j_decompress_ptr cinfo, boolean isDC, int tblno,
si = huffsize[0];
p = 0;
while (huffsize[p]) {
- while (((int) huffsize[p]) == si) {
+ while (((int)huffsize[p]) == si) {
huffcode[p++] = code;
code++;
}
/* code is now 1 more than the last code used for codelength si; but
* it must still fit in si bits, since no code is allowed to be all ones.
*/
- if (((JLONG) code) >= (((JLONG) 1) << si))
+ if (((JLONG)code) >= (((JLONG)1) << si))
ERREXIT(cinfo, JERR_BAD_HUFF_TABLE);
code <<= 1;
si++;
@@ -224,9 +210,9 @@ jpeg_make_d_derived_tbl (j_decompress_ptr cinfo, boolean isDC, int tblno,
/* valoffset[l] = huffval[] index of 1st symbol of code length l,
* minus the minimum code of length l
*/
- dtbl->valoffset[l] = (JLONG) p - (JLONG) huffcode[p];
+ dtbl->valoffset[l] = (JLONG)p - (JLONG)huffcode[p];
p += htbl->bits[l];
- dtbl->maxcode[l] = huffcode[p-1]; /* maximum code of length l */
+ dtbl->maxcode[l] = huffcode[p - 1]; /* maximum code of length l */
} else {
dtbl->maxcode[l] = -1; /* -1 if no codes of this length */
}
@@ -241,16 +227,16 @@ jpeg_make_d_derived_tbl (j_decompress_ptr cinfo, boolean isDC, int tblno,
* with that code.
*/
- for (i = 0; i < (1 << HUFF_LOOKAHEAD); i++)
- dtbl->lookup[i] = (HUFF_LOOKAHEAD + 1) << HUFF_LOOKAHEAD;
+ for (i = 0; i < (1 << HUFF_LOOKAHEAD); i++)
+ dtbl->lookup[i] = (HUFF_LOOKAHEAD + 1) << HUFF_LOOKAHEAD;
p = 0;
for (l = 1; l <= HUFF_LOOKAHEAD; l++) {
- for (i = 1; i <= (int) htbl->bits[l]; i++, p++) {
+ for (i = 1; i <= (int)htbl->bits[l]; i++, p++) {
/* l = current code's length, p = its index in huffcode[] & huffval[]. */
/* Generate left-justified code followed by all possible bit sequences */
- lookbits = huffcode[p] << (HUFF_LOOKAHEAD-l);
- for (ctr = 1 << (HUFF_LOOKAHEAD-l); ctr > 0; ctr--) {
+ lookbits = huffcode[p] << (HUFF_LOOKAHEAD - l);
+ for (ctr = 1 << (HUFF_LOOKAHEAD - l); ctr > 0; ctr--) {
dtbl->lookup[lookbits] = (l << HUFF_LOOKAHEAD) | htbl->huffval[p];
lookbits++;
}
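The lookahead table maps every possible HUFF_LOOKAHEAD-bit prefix to the pair (code length, symbol), so most symbols decode with one PEEK_BITS and one array index. A toy with the lookahead shrunk to 3 bits but the same packing of the length in the bits above the symbol byte (in d_derived_tbl the shift is HUFF_LOOKAHEAD, which is 8; the two-code table here is hypothetical):

#include <stdio.h>

#define LOOKAHEAD 3   /* toy value; jdhuff.h uses HUFF_LOOKAHEAD = 8 */

int main(void)
{
  /* Codes: "0" -> 'A' (length 1), "10" -> 'B' (length 2). */
  int lookup[1 << LOOKAHEAD], i;

  for (i = 0; i < (1 << LOOKAHEAD); i++)
    lookup[i] = (LOOKAHEAD + 1) << 8;        /* "longer than lookahead" */
  for (i = 0; i < 4; i++)                    /* prefixes 000..011 -> 'A' */
    lookup[i] = (1 << 8) | 'A';
  for (i = 4; i < 6; i++)                    /* prefixes 100..101 -> 'B' */
    lookup[i] = (2 << 8) | 'B';

  /* Peeking 3 bits = 0b101 decodes 'B' and says to drop 2 bits. */
  printf("len=%d sym=%c\n", lookup[5] >> 8, lookup[5] & 0xFF);
  return 0;
}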
@@ -291,14 +277,14 @@ jpeg_make_d_derived_tbl (j_decompress_ptr cinfo, boolean isDC, int tblno,
#ifdef SLOW_SHIFT_32
#define MIN_GET_BITS 15 /* minimum allowable value */
#else
-#define MIN_GET_BITS (BIT_BUF_SIZE-7)
+#define MIN_GET_BITS (BIT_BUF_SIZE - 7)
#endif
GLOBAL(boolean)
-jpeg_fill_bit_buffer (bitread_working_state *state,
- register bit_buf_type get_buffer, register int bits_left,
- int nbits)
+jpeg_fill_bit_buffer(bitread_working_state *state,
+ register bit_buf_type get_buffer, register int bits_left,
+ int nbits)
/* Load up the bit buffer to a depth of at least nbits */
{
/* Copy heavily used state fields into locals (hopefully registers) */
@@ -316,13 +302,13 @@ jpeg_fill_bit_buffer (bitread_working_state *state,
/* Attempt to read a byte */
if (bytes_in_buffer == 0) {
- if (! (*cinfo->src->fill_input_buffer) (cinfo))
+ if (!(*cinfo->src->fill_input_buffer) (cinfo))
return FALSE;
next_input_byte = cinfo->src->next_input_byte;
bytes_in_buffer = cinfo->src->bytes_in_buffer;
}
bytes_in_buffer--;
- c = GETJOCTET(*next_input_byte++);
+ c = *next_input_byte++;
/* If it's 0xFF, check and discard stuffed zero byte */
if (c == 0xFF) {
@@ -333,13 +319,13 @@ jpeg_fill_bit_buffer (bitread_working_state *state,
*/
do {
if (bytes_in_buffer == 0) {
- if (! (*cinfo->src->fill_input_buffer) (cinfo))
+ if (!(*cinfo->src->fill_input_buffer) (cinfo))
return FALSE;
next_input_byte = cinfo->src->next_input_byte;
bytes_in_buffer = cinfo->src->bytes_in_buffer;
}
bytes_in_buffer--;
- c = GETJOCTET(*next_input_byte++);
+ c = *next_input_byte++;
} while (c == 0xFF);
if (c == 0) {
@@ -365,7 +351,7 @@ jpeg_fill_bit_buffer (bitread_working_state *state,
bits_left += 8;
} /* end while */
} else {
- no_more_bytes:
+no_more_bytes:
/* We get here if we've read the marker that terminates the compressed
* data segment. There should be enough bits in the buffer register
* to satisfy the request; if so, no problem.
@@ -376,7 +362,7 @@ jpeg_fill_bit_buffer (bitread_working_state *state,
* We use a nonvolatile flag to ensure that only one warning message
* appears per data segment.
*/
- if (! cinfo->entropy->insufficient_data) {
+ if (!cinfo->entropy->insufficient_data) {
WARNMS(cinfo, JWRN_HIT_MARKER);
cinfo->entropy->insufficient_data = TRUE;
}
@@ -400,11 +386,10 @@ jpeg_fill_bit_buffer (bitread_working_state *state,
handle markers. We have to hand off any blocks with markers to the
slower routines. */
-#define GET_BYTE \
-{ \
+#define GET_BYTE { \
register int c0, c1; \
- c0 = GETJOCTET(*buffer++); \
- c1 = GETJOCTET(*buffer); \
+ c0 = *buffer++; \
+ c1 = *buffer; \
/* Pre-execute most common case */ \
get_buffer = (get_buffer << 8) | c0; \
bits_left += 8; \
@@ -421,7 +406,7 @@ jpeg_fill_bit_buffer (bitread_working_state *state,
} \
}
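GET_BYTE (like the slow path above) enforces JPEG byte stuffing: inside entropy-coded data, a literal 0xFF is always followed by a stuffed 0x00, and 0xFF followed by anything else is a marker. A toy unstuffer showing just that rule; the real macro instead backs the pointer up and records cinfo->unread_marker so decoding can resume:

#include <stdio.h>

static int next_data_byte(const unsigned char **p)
{
  int c = *(*p)++;
  if (c == 0xFF && *(*p)++ != 0)
    return -1;                       /* hit a marker, e.g. 0xFFD9 (EOI) */
  return c;                          /* stuffed zero already skipped */
}

int main(void)
{
  const unsigned char buf[] = { 0x12, 0xFF, 0x00, 0xFF, 0xD9 };
  const unsigned char *p = buf;
  int c;
  while ((c = next_data_byte(&p)) >= 0)
    printf("0x%02X ", c);            /* prints 0x12 0xFF, then stops */
  printf("\n");
  return 0;
}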
-#if SIZEOF_SIZE_T==8 || defined(_WIN64)
+#if SIZEOF_SIZE_T == 8 || defined(_WIN64) || (defined(__x86_64__) && defined(__ILP32__))
/* Pre-fetch 48 bytes, because the holding register is 64-bit */
#define FILL_BIT_BUFFER_FAST \
@@ -446,9 +431,9 @@ jpeg_fill_bit_buffer (bitread_working_state *state,
*/
GLOBAL(int)
-jpeg_huff_decode (bitread_working_state *state,
- register bit_buf_type get_buffer, register int bits_left,
- d_derived_tbl *htbl, int min_bits)
+jpeg_huff_decode(bitread_working_state *state,
+ register bit_buf_type get_buffer, register int bits_left,
+ d_derived_tbl *htbl, int min_bits)
{
register int l = min_bits;
register JLONG code;
@@ -460,7 +445,7 @@ jpeg_huff_decode (bitread_working_state *state,
code = GET_BITS(l);
/* Collect the rest of the Huffman code one bit at a time. */
- /* This is per Figure F.16 in the JPEG spec. */
+ /* This is per Figure F.16. */
while (code > htbl->maxcode[l]) {
code <<= 1;
@@ -480,7 +465,7 @@ jpeg_huff_decode (bitread_working_state *state,
return 0; /* fake a zero as the safest result */
}
- return htbl->pub->huffval[ (int) (code + htbl->valoffset[l]) ];
+ return htbl->pub->huffval[(int)(code + htbl->valoffset[l])];
}
@@ -492,22 +477,26 @@ jpeg_huff_decode (bitread_working_state *state,
#define AVOID_TABLES
#ifdef AVOID_TABLES
-#define NEG_1 ((unsigned int)-1)
-#define HUFF_EXTEND(x,s) ((x) + ((((x) - (1<<((s)-1))) >> 31) & (((NEG_1)<<(s)) + 1)))
+#define NEG_1 ((unsigned int)-1)
+#define HUFF_EXTEND(x, s) \
+ ((x) + ((((x) - (1 << ((s) - 1))) >> 31) & (((NEG_1) << (s)) + 1)))
#else
-#define HUFF_EXTEND(x,s) ((x) < extend_test[s] ? (x) + extend_offset[s] : (x))
+#define HUFF_EXTEND(x, s) \
+ ((x) < extend_test[s] ? (x) + extend_offset[s] : (x))
-static const int extend_test[16] = /* entry n is 2**(n-1) */
- { 0, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080,
- 0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000 };
+static const int extend_test[16] = { /* entry n is 2**(n-1) */
+ 0, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080,
+ 0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000
+};
-static const int extend_offset[16] = /* entry n is (-1 << n) + 1 */
- { 0, ((-1)<<1) + 1, ((-1)<<2) + 1, ((-1)<<3) + 1, ((-1)<<4) + 1,
- ((-1)<<5) + 1, ((-1)<<6) + 1, ((-1)<<7) + 1, ((-1)<<8) + 1,
- ((-1)<<9) + 1, ((-1)<<10) + 1, ((-1)<<11) + 1, ((-1)<<12) + 1,
- ((-1)<<13) + 1, ((-1)<<14) + 1, ((-1)<<15) + 1 };
+static const int extend_offset[16] = { /* entry n is (-1 << n) + 1 */
+ 0, ((-1) << 1) + 1, ((-1) << 2) + 1, ((-1) << 3) + 1, ((-1) << 4) + 1,
+ ((-1) << 5) + 1, ((-1) << 6) + 1, ((-1) << 7) + 1, ((-1) << 8) + 1,
+ ((-1) << 9) + 1, ((-1) << 10) + 1, ((-1) << 11) + 1, ((-1) << 12) + 1,
+ ((-1) << 13) + 1, ((-1) << 14) + 1, ((-1) << 15) + 1
+};
#endif /* AVOID_TABLES */
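HUFF_EXTEND() implements the JPEG EXTEND procedure: an s-bit magnitude with its top bit clear encodes a negative value, x - (2^s - 1); otherwise x stands as-is. The branchless form builds an all-ones mask from a 32-bit arithmetic right shift, which the two worked cases below assume:

#include <stdio.h>

#define NEG_1 ((unsigned int)-1)
#define HUFF_EXTEND(x, s) \
  ((x) + ((((x) - (1 << ((s) - 1))) >> 31) & (((NEG_1) << (s)) + 1)))

int main(void)
{
  /* s = 3 encodes magnitudes -7..-4 and 4..7. Received bits 0b010 have
   * the top bit clear, so the value is 2 - (2^3 - 1) = -5; 0b110 has
   * the top bit set and stands as-is. */
  printf("%d\n", (int)HUFF_EXTEND(2, 3));   /* -5 */
  printf("%d\n", (int)HUFF_EXTEND(6, 3));   /*  6 */
  return 0;
}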
@@ -518,9 +507,9 @@ static const int extend_offset[16] = /* entry n is (-1 << n) + 1 */
*/
LOCAL(boolean)
-process_restart (j_decompress_ptr cinfo)
+process_restart(j_decompress_ptr cinfo)
{
- huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+ huff_entropy_ptr entropy = (huff_entropy_ptr)cinfo->entropy;
int ci;
/* Throw away any unused bits remaining in bit buffer; */
@@ -529,7 +518,7 @@ process_restart (j_decompress_ptr cinfo)
entropy->bitstate.bits_left = 0;
/* Advance past the RSTn marker */
- if (! (*cinfo->marker->read_restart_marker) (cinfo))
+ if (!(*cinfo->marker->read_restart_marker) (cinfo))
return FALSE;
/* Re-initialize DC predictions to 0 */
@@ -551,18 +540,24 @@ process_restart (j_decompress_ptr cinfo)
}
+#if defined(__has_feature)
+#if __has_feature(undefined_behavior_sanitizer)
+__attribute__((no_sanitize("signed-integer-overflow"),
+ no_sanitize("unsigned-integer-overflow")))
+#endif
+#endif
LOCAL(boolean)
-decode_mcu_slow (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+decode_mcu_slow(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
{
- huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+ huff_entropy_ptr entropy = (huff_entropy_ptr)cinfo->entropy;
BITREAD_STATE_VARS;
int blkn;
savable_state state;
/* Outer loop handles each block in the MCU */
/* Load up working state */
- BITREAD_LOAD_STATE(cinfo,entropy->bitstate);
- ASSIGN_STATE(state, entropy->saved);
+ BITREAD_LOAD_STATE(cinfo, entropy->bitstate);
+ state = entropy->saved;
for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
JBLOCKROW block = MCU_data ? MCU_data[blkn] : NULL;
@@ -583,11 +578,19 @@ decode_mcu_slow (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
if (entropy->dc_needed[blkn]) {
/* Convert DC difference to actual value, update last_dc_val */
int ci = cinfo->MCU_membership[blkn];
+ /* Certain malformed JPEG images produce repeated DC coefficient
+ * differences of 2047 or -2047, which causes state.last_dc_val[ci] to
+ * grow until it overflows or underflows a 32-bit signed integer. This
+ * behavior is, to the best of our understanding, innocuous, and it is
+ * unclear how to work around it without potentially affecting
+ * performance. Thus, we (hopefully temporarily) suppress UBSan integer
+ * overflow errors for this function and decode_mcu_fast().
+ */
s += state.last_dc_val[ci];
state.last_dc_val[ci] = s;
if (block) {
/* Output the DC coefficient (assumes jpeg_natural_order[0] = 0) */
- (*block)[0] = (JCOEF) s;
+ (*block)[0] = (JCOEF)s;
}
}
@@ -610,7 +613,7 @@ decode_mcu_slow (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
* Note: the extra entries in jpeg_natural_order[] will save us
* if k >= DCTSIZE2, which could happen if the data is corrupted.
*/
- (*block)[jpeg_natural_order[k]] = (JCOEF) s;
+ (*block)[jpeg_natural_order[k]] = (JCOEF)s;
} else {
if (r != 15)
break;
@@ -642,16 +645,22 @@ decode_mcu_slow (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
}
/* Completed MCU, so update state */
- BITREAD_SAVE_STATE(cinfo,entropy->bitstate);
- ASSIGN_STATE(entropy->saved, state);
+ BITREAD_SAVE_STATE(cinfo, entropy->bitstate);
+ entropy->saved = state;
return TRUE;
}
+#if defined(__has_feature)
+#if __has_feature(undefined_behavior_sanitizer)
+__attribute__((no_sanitize("signed-integer-overflow"),
+ no_sanitize("unsigned-integer-overflow")))
+#endif
+#endif
LOCAL(boolean)
-decode_mcu_fast (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+decode_mcu_fast(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
{
- huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+ huff_entropy_ptr entropy = (huff_entropy_ptr)cinfo->entropy;
BITREAD_STATE_VARS;
JOCTET *buffer;
int blkn;
@@ -659,9 +668,9 @@ decode_mcu_fast (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
/* Outer loop handles each block in the MCU */
/* Load up working state */
- BITREAD_LOAD_STATE(cinfo,entropy->bitstate);
- buffer = (JOCTET *) br_state.next_input_byte;
- ASSIGN_STATE(state, entropy->saved);
+ BITREAD_LOAD_STATE(cinfo, entropy->bitstate);
+ buffer = (JOCTET *)br_state.next_input_byte;
+ state = entropy->saved;
for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
JBLOCKROW block = MCU_data ? MCU_data[blkn] : NULL;
@@ -669,7 +678,7 @@ decode_mcu_fast (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
d_derived_tbl *actbl = entropy->ac_cur_tbls[blkn];
register int s, k, r, l;
- HUFF_DECODE_FAST(s, l, dctbl, slow_decode_mcu);
+ HUFF_DECODE_FAST(s, l, dctbl);
if (s) {
FILL_BIT_BUFFER_FAST
r = GET_BITS(s);
@@ -678,16 +687,19 @@ decode_mcu_fast (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
if (entropy->dc_needed[blkn]) {
int ci = cinfo->MCU_membership[blkn];
+      /* Refer to the comment in decode_mcu_slow() regarding the suppression of
+       * a UBSan integer overflow error in this line of code.
+ */
s += state.last_dc_val[ci];
state.last_dc_val[ci] = s;
if (block)
- (*block)[0] = (JCOEF) s;
+ (*block)[0] = (JCOEF)s;
}
if (entropy->ac_needed[blkn] && block) {
for (k = 1; k < DCTSIZE2; k++) {
- HUFF_DECODE_FAST(s, l, actbl, slow_decode_mcu);
+ HUFF_DECODE_FAST(s, l, actbl);
r = s >> 4;
s &= 15;
@@ -696,7 +708,7 @@ decode_mcu_fast (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
FILL_BIT_BUFFER_FAST
r = GET_BITS(s);
s = HUFF_EXTEND(r, s);
- (*block)[jpeg_natural_order[k]] = (JCOEF) s;
+ (*block)[jpeg_natural_order[k]] = (JCOEF)s;
} else {
if (r != 15) break;
k += 15;
@@ -706,7 +718,7 @@ decode_mcu_fast (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
} else {
for (k = 1; k < DCTSIZE2; k++) {
- HUFF_DECODE_FAST(s, l, actbl, slow_decode_mcu);
+ HUFF_DECODE_FAST(s, l, actbl);
r = s >> 4;
s &= 15;
@@ -723,15 +735,14 @@ decode_mcu_fast (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
}
if (cinfo->unread_marker != 0) {
-slow_decode_mcu:
cinfo->unread_marker = 0;
return FALSE;
}
br_state.bytes_in_buffer -= (buffer - br_state.next_input_byte);
br_state.next_input_byte = buffer;
- BITREAD_SAVE_STATE(cinfo,entropy->bitstate);
- ASSIGN_STATE(entropy->saved, state);
+ BITREAD_SAVE_STATE(cinfo, entropy->bitstate);
+ entropy->saved = state;
return TRUE;
}
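The new guard attaches the attribute only when Clang's UBSan is active; __has_feature is a Clang extension, so the macro is probed before it is invoked. A standalone sketch of the same pattern (the function and values are invented; the attribute merely silences the sanitizer, it does not make the overflow defined):

#include <stdio.h>

#if defined(__has_feature)
#if __has_feature(undefined_behavior_sanitizer)
__attribute__((no_sanitize("signed-integer-overflow")))
#endif
#endif
static int accumulate_dc(int last_dc, int diff)
{
  return last_dc + diff;   /* may wrap on malformed input, like last_dc_val */
}

int main(void)
{
  printf("%d\n", accumulate_dc(1000, 2047));
  return 0;
}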
@@ -751,43 +762,43 @@ slow_decode_mcu:
* this module, since we'll just re-assign them on the next call.)
*/
-#define BUFSIZE (DCTSIZE2 * 8)
+#define BUFSIZE (DCTSIZE2 * 8)
METHODDEF(boolean)
-decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+decode_mcu(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
{
- huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+ huff_entropy_ptr entropy = (huff_entropy_ptr)cinfo->entropy;
int usefast = 1;
/* Process restart marker if needed; may have to suspend */
if (cinfo->restart_interval) {
if (entropy->restarts_to_go == 0)
- if (! process_restart(cinfo))
+ if (!process_restart(cinfo))
return FALSE;
usefast = 0;
}
- if (cinfo->src->bytes_in_buffer < BUFSIZE * (size_t)cinfo->blocks_in_MCU
- || cinfo->unread_marker != 0)
+ if (cinfo->src->bytes_in_buffer < BUFSIZE * (size_t)cinfo->blocks_in_MCU ||
+ cinfo->unread_marker != 0)
usefast = 0;
/* If we've run out of data, just leave the MCU set to zeroes.
* This way, we return uniform gray for the remainder of the segment.
*/
- if (! entropy->pub.insufficient_data) {
+ if (!entropy->pub.insufficient_data) {
if (usefast) {
if (!decode_mcu_fast(cinfo, MCU_data)) goto use_slow;
- }
- else {
- use_slow:
+ } else {
+use_slow:
if (!decode_mcu_slow(cinfo, MCU_data)) return FALSE;
}
}
/* Account for restart interval (no-op if not using restarts) */
- entropy->restarts_to_go--;
+ if (cinfo->restart_interval)
+ entropy->restarts_to_go--;
return TRUE;
}
@@ -798,7 +809,7 @@ decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
*/
GLOBAL(void)
-jinit_huff_decoder (j_decompress_ptr cinfo)
+jinit_huff_decoder(j_decompress_ptr cinfo)
{
huff_entropy_ptr entropy;
int i;
@@ -807,12 +818,12 @@ jinit_huff_decoder (j_decompress_ptr cinfo)
are the default tables. Thus, if the tables are not set by the time
the Huffman decoder is initialized (usually within the body of
jpeg_start_decompress()), we set them to default values. */
- std_huff_tables((j_common_ptr) cinfo);
+ std_huff_tables((j_common_ptr)cinfo);
entropy = (huff_entropy_ptr)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
sizeof(huff_entropy_decoder));
- cinfo->entropy = (struct jpeg_entropy_decoder *) entropy;
+ cinfo->entropy = (struct jpeg_entropy_decoder *)entropy;
entropy->pub.start_pass = start_pass_huff_decoder;
entropy->pub.decode_mcu = decode_mcu;
diff --git a/media/libjpeg/jdhuff.h b/media/libjpeg/jdhuff.h
index 3f15d71a81..cfa0b7f558 100644
--- a/media/libjpeg/jdhuff.h
+++ b/media/libjpeg/jdhuff.h
@@ -4,7 +4,8 @@
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1991-1997, Thomas G. Lane.
* libjpeg-turbo Modifications:
- * Copyright (C) 2010-2011, 2015-2016, D. R. Commander.
+ * Copyright (C) 2010-2011, 2015-2016, 2021, D. R. Commander.
+ * Copyright (C) 2018, Matthias Räncker.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@@ -43,13 +44,12 @@ typedef struct {
* if too long. The next 8 bits of each entry contain the
* symbol.
*/
- int lookup[1<<HUFF_LOOKAHEAD];
+ int lookup[1 << HUFF_LOOKAHEAD];
} d_derived_tbl;
/* Expand a Huffman table definition into the derived format */
-EXTERN(void) jpeg_make_d_derived_tbl
- (j_decompress_ptr cinfo, boolean isDC, int tblno,
- d_derived_tbl ** pdtbl);
+EXTERN(void) jpeg_make_d_derived_tbl(j_decompress_ptr cinfo, boolean isDC,
+ int tblno, d_derived_tbl **pdtbl);
/*
@@ -74,11 +74,16 @@ EXTERN(void) jpeg_make_d_derived_tbl
#error Cannot determine word size
#endif
-#if SIZEOF_SIZE_T==8 || defined(_WIN64)
+#if SIZEOF_SIZE_T == 8 || defined(_WIN64)
typedef size_t bit_buf_type; /* type of bit-extraction buffer */
#define BIT_BUF_SIZE 64 /* size of buffer in bits */
+#elif defined(__x86_64__) && defined(__ILP32__)
+
+typedef unsigned long long bit_buf_type; /* type of bit-extraction buffer */
+#define BIT_BUF_SIZE 64 /* size of buffer in bits */
+
#else
typedef unsigned long bit_buf_type; /* type of bit-extraction buffer */
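
The new __x86_64__ && __ILP32__ branch covers the Linux x32 ABI: size_t is 32 bits there, so the first test fails, yet the CPU still has 64-bit registers and a 64-bit bit buffer remains the fast choice. A compilable sketch of the selection; the UINTPTR_MAX probe below stands in for the configure-generated SIZEOF_SIZE_T and is an assumption, not the patch's mechanism:

    #include <limits.h>
    #include <stdint.h>
    #include <stdio.h>

    #if UINTPTR_MAX > 0xFFFFFFFFu || defined(_WIN64)
    typedef size_t bit_buf_type;              /* native 64-bit word */
    #elif defined(__x86_64__) && defined(__ILP32__)
    typedef unsigned long long bit_buf_type;  /* x32: 32-bit size_t, 64-bit regs */
    #else
    typedef unsigned long bit_buf_type;       /* conservative fallback */
    #endif

    int main(void)
    {
      printf("bit buffer: %zu bits\n", sizeof(bit_buf_type) * CHAR_BIT);
      return 0;
    }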
@@ -113,23 +118,23 @@ typedef struct { /* Bitreading working state within an MCU */
} bitread_working_state;
/* Macros to declare and load/save bitread local variables. */
-#define BITREAD_STATE_VARS \
- register bit_buf_type get_buffer; \
- register int bits_left; \
- bitread_working_state br_state
-
-#define BITREAD_LOAD_STATE(cinfop,permstate) \
- br_state.cinfo = cinfop; \
- br_state.next_input_byte = cinfop->src->next_input_byte; \
- br_state.bytes_in_buffer = cinfop->src->bytes_in_buffer; \
- get_buffer = permstate.get_buffer; \
- bits_left = permstate.bits_left;
-
-#define BITREAD_SAVE_STATE(cinfop,permstate) \
- cinfop->src->next_input_byte = br_state.next_input_byte; \
- cinfop->src->bytes_in_buffer = br_state.bytes_in_buffer; \
- permstate.get_buffer = get_buffer; \
- permstate.bits_left = bits_left
+#define BITREAD_STATE_VARS \
+ register bit_buf_type get_buffer; \
+ register int bits_left; \
+ bitread_working_state br_state
+
+#define BITREAD_LOAD_STATE(cinfop, permstate) \
+ br_state.cinfo = cinfop; \
+ br_state.next_input_byte = cinfop->src->next_input_byte; \
+ br_state.bytes_in_buffer = cinfop->src->bytes_in_buffer; \
+ get_buffer = permstate.get_buffer; \
+ bits_left = permstate.bits_left;
+
+#define BITREAD_SAVE_STATE(cinfop, permstate) \
+ cinfop->src->next_input_byte = br_state.next_input_byte; \
+ cinfop->src->bytes_in_buffer = br_state.bytes_in_buffer; \
+ permstate.get_buffer = get_buffer; \
+ permstate.bits_left = bits_left
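
The point of the LOAD/SAVE pair: the hot decode loop keeps get_buffer and bits_left in registers and writes them back to the permanent per-decoder state only at suspension points. A standalone miniature of the same pattern (types, names, and values are illustrative, not libjpeg's):

    #include <stdio.h>

    typedef struct {                  /* stand-in for the permanent state */
      unsigned long long get_buffer;
      int bits_left;
    } perm_state;

    static int get_bits(unsigned long long *buf, int *left, int nbits)
    {
      *left -= nbits;
      return (int)(*buf >> *left) & ((1 << nbits) - 1);
    }

    int main(void)
    {
      perm_state perm = { 0xCAFEull << 16, 32 };        /* 32 buffered bits */
      unsigned long long get_buffer = perm.get_buffer;  /* "LOAD_STATE" */
      int bits_left = perm.bits_left;

      printf("0x%02X ", get_bits(&get_buffer, &bits_left, 8));   /* 0xCA */
      printf("0x%02X\n", get_bits(&get_buffer, &bits_left, 8));  /* 0xFE */

      perm.get_buffer = get_buffer;                     /* "SAVE_STATE" */
      perm.bits_left = bits_left;
      return 0;
    }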
/*
* These macros provide the in-line portion of bit fetching.
@@ -137,7 +142,7 @@ typedef struct { /* Bitreading working state within an MCU */
* before using GET_BITS, PEEK_BITS, or DROP_BITS.
* The variables get_buffer and bits_left are assumed to be locals,
* but the state struct might not be (jpeg_huff_decode needs this).
- * CHECK_BIT_BUFFER(state,n,action);
+ * CHECK_BIT_BUFFER(state, n, action);
* Ensure there are N bits in get_buffer; if suspend, take action.
* val = GET_BITS(n);
* Fetch next N bits.
@@ -149,25 +154,27 @@ typedef struct { /* Bitreading working state within an MCU */
* is evaluated multiple times.
*/
-#define CHECK_BIT_BUFFER(state,nbits,action) \
- { if (bits_left < (nbits)) { \
- if (! jpeg_fill_bit_buffer(&(state),get_buffer,bits_left,nbits)) \
- { action; } \
- get_buffer = (state).get_buffer; bits_left = (state).bits_left; } }
+#define CHECK_BIT_BUFFER(state, nbits, action) { \
+ if (bits_left < (nbits)) { \
+ if (!jpeg_fill_bit_buffer(&(state), get_buffer, bits_left, nbits)) \
+ { action; } \
+ get_buffer = (state).get_buffer; bits_left = (state).bits_left; \
+ } \
+}
#define GET_BITS(nbits) \
- (((int) (get_buffer >> (bits_left -= (nbits)))) & ((1<<(nbits))-1))
+ (((int)(get_buffer >> (bits_left -= (nbits)))) & ((1 << (nbits)) - 1))
#define PEEK_BITS(nbits) \
- (((int) (get_buffer >> (bits_left - (nbits)))) & ((1<<(nbits))-1))
+ (((int)(get_buffer >> (bits_left - (nbits)))) & ((1 << (nbits)) - 1))
#define DROP_BITS(nbits) \
- (bits_left -= (nbits))
+ (bits_left -= (nbits))
/* Load up the bit buffer to a depth of at least nbits */
-EXTERN(boolean) jpeg_fill_bit_buffer
- (bitread_working_state *state, register bit_buf_type get_buffer,
- register int bits_left, int nbits);
+EXTERN(boolean) jpeg_fill_bit_buffer(bitread_working_state *state,
+ register bit_buf_type get_buffer,
+ register int bits_left, int nbits);
/*
@@ -187,13 +194,14 @@ EXTERN(boolean) jpeg_fill_bit_buffer
* 3. jpeg_huff_decode returns -1 if forced to suspend.
*/
-#define HUFF_DECODE(result,state,htbl,failaction,slowlabel) \
-{ register int nb, look; \
+#define HUFF_DECODE(result, state, htbl, failaction, slowlabel) { \
+ register int nb, look; \
if (bits_left < HUFF_LOOKAHEAD) { \
- if (! jpeg_fill_bit_buffer(&state,get_buffer,bits_left, 0)) {failaction;} \
- get_buffer = state.get_buffer; bits_left = state.bits_left; \
+ if (!jpeg_fill_bit_buffer(&state, get_buffer, bits_left, 0)) \
+ { failaction; } \
+ get_buffer = state.get_buffer; bits_left = state.bits_left; \
if (bits_left < HUFF_LOOKAHEAD) { \
- nb = 1; goto slowlabel; \
+ nb = 1; goto slowlabel; \
} \
} \
look = PEEK_BITS(HUFF_LOOKAHEAD); \
@@ -202,13 +210,14 @@ EXTERN(boolean) jpeg_fill_bit_buffer
result = htbl->lookup[look] & ((1 << HUFF_LOOKAHEAD) - 1); \
} else { \
slowlabel: \
- if ((result=jpeg_huff_decode(&state,get_buffer,bits_left,htbl,nb)) < 0) \
- { failaction; } \
- get_buffer = state.get_buffer; bits_left = state.bits_left; \
+ if ((result = \
+ jpeg_huff_decode(&state, get_buffer, bits_left, htbl, nb)) < 0) \
+ { failaction; } \
+ get_buffer = state.get_buffer; bits_left = state.bits_left; \
} \
}
-#define HUFF_DECODE_FAST(s,nb,htbl,slowlabel) \
+#define HUFF_DECODE_FAST(s, nb, htbl) \
FILL_BIT_BUFFER_FAST; \
s = PEEK_BITS(HUFF_LOOKAHEAD); \
s = htbl->lookup[s]; \
@@ -226,11 +235,13 @@ slowlabel: \
nb++; \
} \
if (nb > 16) \
- goto slowlabel; \
- s = htbl->pub->huffval[ (int) (s + htbl->valoffset[nb]) ]; \
+ s = 0; \
+ else \
+ s = htbl->pub->huffval[(int)(s + htbl->valoffset[nb]) & 0xFF]; \
}
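
The HUFF_DECODE_FAST change is defensive: the old macro jumped to a caller-supplied slow label when a code ran past 16 bits, while the new one clamps the symbol to 0 and masks the huffval[] index, so a corrupted bitstream can no longer index beyond the 256-entry table. A toy standalone version of the clamping (made-up table and values; the valoffset bias is omitted):

    #include <stdio.h>

    int main(void)
    {
      unsigned char huffval[256] = { 42 };   /* made-up symbol table */
      int nb = 17;        /* pretend the code length ran past 16 bits */
      int s = 0x7FFF;     /* pretend accumulated code value */

      if (nb > 16)
        s = 0;                      /* corrupt stream: clamp, don't goto */
      else
        s = huffval[s & 0xFF];      /* mask keeps the index in bounds */

      printf("decoded symbol: %d\n", s);     /* 0 */
      return 0;
    }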
/* Out-of-line case for Huffman code fetching */
-EXTERN(int) jpeg_huff_decode
- (bitread_working_state *state, register bit_buf_type get_buffer,
- register int bits_left, d_derived_tbl *htbl, int min_bits);
+EXTERN(int) jpeg_huff_decode(bitread_working_state *state,
+ register bit_buf_type get_buffer,
+ register int bits_left, d_derived_tbl *htbl,
+ int min_bits);
diff --git a/media/libjpeg/jdicc.c b/media/libjpeg/jdicc.c
new file mode 100644
index 0000000000..50aa9a9676
--- /dev/null
+++ b/media/libjpeg/jdicc.c
@@ -0,0 +1,167 @@
+/*
+ * jdicc.c
+ *
+ * Copyright (C) 1997-1998, Thomas G. Lane, Todd Newman.
+ * Copyright (C) 2017, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file provides code to read International Color Consortium (ICC) device
+ * profiles embedded in JFIF JPEG image files. The ICC has defined a standard
+ * for including such data in JPEG "APP2" markers. The code given here does
+ * not know anything about the internal structure of the ICC profile data; it
+ * just knows how to get the profile data from a JPEG file while reading it.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+#include "jerror.h"
+
+
+#define ICC_MARKER (JPEG_APP0 + 2) /* JPEG marker code for ICC */
+#define ICC_OVERHEAD_LEN 14 /* size of non-profile data in APP2 */
+
+
+/*
+ * Handy subroutine to test whether a saved marker is an ICC profile marker.
+ */
+
+LOCAL(boolean)
+marker_is_icc(jpeg_saved_marker_ptr marker)
+{
+ return
+ marker->marker == ICC_MARKER &&
+ marker->data_length >= ICC_OVERHEAD_LEN &&
+ /* verify the identifying string */
+ marker->data[0] == 0x49 &&
+ marker->data[1] == 0x43 &&
+ marker->data[2] == 0x43 &&
+ marker->data[3] == 0x5F &&
+ marker->data[4] == 0x50 &&
+ marker->data[5] == 0x52 &&
+ marker->data[6] == 0x4F &&
+ marker->data[7] == 0x46 &&
+ marker->data[8] == 0x49 &&
+ marker->data[9] == 0x4C &&
+ marker->data[10] == 0x45 &&
+ marker->data[11] == 0x0;
+}
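+
+/* The byte-by-byte test above spells out "ICC_PROFILE" plus its NUL
+ * terminator.  An equivalent memcmp() formulation, as a standalone sketch
+ * with a simplified stand-in for the marker type (the real code keeps the
+ * explicit bytes):
+ *
+ *   #include <stdio.h>
+ *   #include <string.h>
+ *
+ *   struct sketch_marker {
+ *     int marker;
+ *     unsigned int data_length;
+ *     unsigned char data[64];
+ *   };
+ *
+ *   static int marker_is_icc_sketch(const struct sketch_marker *m)
+ *   {
+ *     return m->marker == 0xE2 &&           /+ APP2 = JPEG_APP0 + 2 +/
+ *            m->data_length >= 14 &&        /+ ICC_OVERHEAD_LEN +/
+ *            memcmp(m->data, "ICC_PROFILE", 12) == 0;  /+ 11 chars + NUL +/
+ *   }
+ *
+ *   int main(void)
+ *   {
+ *     struct sketch_marker m = { 0xE2, 14, "ICC_PROFILE" };
+ *     printf("is ICC: %d\n", marker_is_icc_sketch(&m));   /+ 1 +/
+ *     return 0;
+ *   }
+ */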
+
+
+/*
+ * See if there was an ICC profile in the JPEG file being read; if so,
+ * reassemble and return the profile data.
+ *
+ * TRUE is returned if an ICC profile was found, FALSE if not. If TRUE is
+ * returned, *icc_data_ptr is set to point to the returned data, and
+ * *icc_data_len is set to its length.
+ *
+ * IMPORTANT: the data at *icc_data_ptr is allocated with malloc() and must be
+ * freed by the caller with free() when the caller no longer needs it.
+ * (Alternatively, we could write this routine to use the IJG library's memory
+ * allocator, so that the data would be freed implicitly when
+ * jpeg_finish_decompress() is called. But it seems likely that many
+ * applications will prefer to have the data stick around after decompression
+ * finishes.)
+ */
+
+GLOBAL(boolean)
+jpeg_read_icc_profile(j_decompress_ptr cinfo, JOCTET **icc_data_ptr,
+ unsigned int *icc_data_len)
+{
+ jpeg_saved_marker_ptr marker;
+ int num_markers = 0;
+ int seq_no;
+ JOCTET *icc_data;
+ unsigned int total_length;
+#define MAX_SEQ_NO 255 /* sufficient since marker numbers are bytes */
+ char marker_present[MAX_SEQ_NO + 1]; /* 1 if marker found */
+ unsigned int data_length[MAX_SEQ_NO + 1]; /* size of profile data in marker */
+ unsigned int data_offset[MAX_SEQ_NO + 1]; /* offset for data in marker */
+
+ if (icc_data_ptr == NULL || icc_data_len == NULL)
+ ERREXIT(cinfo, JERR_BUFFER_SIZE);
+ if (cinfo->global_state < DSTATE_READY)
+ ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
+
+ *icc_data_ptr = NULL; /* avoid confusion if FALSE return */
+ *icc_data_len = 0;
+
+ /* This first pass over the saved markers discovers whether there are
+ * any ICC markers and verifies the consistency of the marker numbering.
+ */
+
+ for (seq_no = 1; seq_no <= MAX_SEQ_NO; seq_no++)
+ marker_present[seq_no] = 0;
+
+ for (marker = cinfo->marker_list; marker != NULL; marker = marker->next) {
+ if (marker_is_icc(marker)) {
+ if (num_markers == 0)
+ num_markers = marker->data[13];
+ else if (num_markers != marker->data[13]) {
+ WARNMS(cinfo, JWRN_BOGUS_ICC); /* inconsistent num_markers fields */
+ return FALSE;
+ }
+ seq_no = marker->data[12];
+ if (seq_no <= 0 || seq_no > num_markers) {
+ WARNMS(cinfo, JWRN_BOGUS_ICC); /* bogus sequence number */
+ return FALSE;
+ }
+ if (marker_present[seq_no]) {
+ WARNMS(cinfo, JWRN_BOGUS_ICC); /* duplicate sequence numbers */
+ return FALSE;
+ }
+ marker_present[seq_no] = 1;
+ data_length[seq_no] = marker->data_length - ICC_OVERHEAD_LEN;
+ }
+ }
+
+ if (num_markers == 0)
+ return FALSE;
+
+ /* Check for missing markers, count total space needed,
+ * compute offset of each marker's part of the data.
+ */
+
+ total_length = 0;
+ for (seq_no = 1; seq_no <= num_markers; seq_no++) {
+ if (marker_present[seq_no] == 0) {
+ WARNMS(cinfo, JWRN_BOGUS_ICC); /* missing sequence number */
+ return FALSE;
+ }
+ data_offset[seq_no] = total_length;
+ total_length += data_length[seq_no];
+ }
+
+ if (total_length == 0) {
+ WARNMS(cinfo, JWRN_BOGUS_ICC); /* found only empty markers? */
+ return FALSE;
+ }
+
+ /* Allocate space for assembled data */
+ icc_data = (JOCTET *)malloc(total_length * sizeof(JOCTET));
+ if (icc_data == NULL)
+ ERREXIT1(cinfo, JERR_OUT_OF_MEMORY, 11); /* oops, out of memory */
+
+ /* and fill it in */
+ for (marker = cinfo->marker_list; marker != NULL; marker = marker->next) {
+ if (marker_is_icc(marker)) {
+ JOCTET FAR *src_ptr;
+ JOCTET *dst_ptr;
+ unsigned int length;
+ seq_no = marker->data[12];
+ dst_ptr = icc_data + data_offset[seq_no];
+ src_ptr = marker->data + ICC_OVERHEAD_LEN;
+ length = data_length[seq_no];
+ while (length--) {
+ *dst_ptr++ = *src_ptr++;
+ }
+ }
+ }
+
+ *icc_data_ptr = icc_data;
+ *icc_data_len = total_length;
+
+ return TRUE;
+}
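
A minimal usage sketch for the new entry point (hypothetical helper name; error handling elided). The one non-obvious requirement is that APP2 markers must be saved during header parsing, otherwise the marker list this routine walks is empty:

    #include <stdio.h>
    #include <stdlib.h>
    #include <jpeglib.h>

    void dump_icc_profile(FILE *infile)
    {
      struct jpeg_decompress_struct cinfo;
      struct jpeg_error_mgr jerr;
      JOCTET *icc_data;
      unsigned int icc_len;

      cinfo.err = jpeg_std_error(&jerr);
      jpeg_create_decompress(&cinfo);
      jpeg_stdio_src(&cinfo, infile);

      /* retain APP2 markers; without this, jpeg_read_icc_profile() fails */
      jpeg_save_markers(&cinfo, JPEG_APP0 + 2, 0xFFFF);

      (void)jpeg_read_header(&cinfo, TRUE);

      if (jpeg_read_icc_profile(&cinfo, &icc_data, &icc_len)) {
        printf("ICC profile: %u bytes\n", icc_len);
        free(icc_data);          /* profile is malloc()ed; caller frees */
      }
      jpeg_destroy_decompress(&cinfo);
    }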
diff --git a/media/libjpeg/jdinput.c b/media/libjpeg/jdinput.c
index 32a6b424e2..1bc5aff1a7 100644
--- a/media/libjpeg/jdinput.c
+++ b/media/libjpeg/jdinput.c
@@ -4,7 +4,7 @@
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1991-1997, Thomas G. Lane.
* libjpeg-turbo Modifications:
- * Copyright (C) 2010, 2016, D. R. Commander.
+ * Copyright (C) 2010, 2016, 2018, 2022, D. R. Commander.
* Copyright (C) 2015, Google, Inc.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
@@ -33,7 +33,7 @@ typedef my_input_controller *my_inputctl_ptr;
/* Forward declarations */
-METHODDEF(int) consume_markers (j_decompress_ptr cinfo);
+METHODDEF(int) consume_markers(j_decompress_ptr cinfo);
/*
@@ -41,16 +41,16 @@ METHODDEF(int) consume_markers (j_decompress_ptr cinfo);
*/
LOCAL(void)
-initial_setup (j_decompress_ptr cinfo)
+initial_setup(j_decompress_ptr cinfo)
/* Called once, when first SOS marker is reached */
{
int ci;
jpeg_component_info *compptr;
/* Make sure image isn't bigger than I can handle */
- if ((long) cinfo->image_height > (long) JPEG_MAX_DIMENSION ||
- (long) cinfo->image_width > (long) JPEG_MAX_DIMENSION)
- ERREXIT1(cinfo, JERR_IMAGE_TOO_BIG, (unsigned int) JPEG_MAX_DIMENSION);
+ if ((long)cinfo->image_height > (long)JPEG_MAX_DIMENSION ||
+ (long)cinfo->image_width > (long)JPEG_MAX_DIMENSION)
+ ERREXIT1(cinfo, JERR_IMAGE_TOO_BIG, (unsigned int)JPEG_MAX_DIMENSION);
/* For now, precision must match compiled-in value... */
if (cinfo->data_precision != BITS_IN_JSAMPLE)
@@ -66,8 +66,10 @@ initial_setup (j_decompress_ptr cinfo)
cinfo->max_v_samp_factor = 1;
for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
ci++, compptr++) {
- if (compptr->h_samp_factor<=0 || compptr->h_samp_factor>MAX_SAMP_FACTOR ||
- compptr->v_samp_factor<=0 || compptr->v_samp_factor>MAX_SAMP_FACTOR)
+ if (compptr->h_samp_factor <= 0 ||
+ compptr->h_samp_factor > MAX_SAMP_FACTOR ||
+ compptr->v_samp_factor <= 0 ||
+ compptr->v_samp_factor > MAX_SAMP_FACTOR)
ERREXIT(cinfo, JERR_BAD_SAMPLING);
cinfo->max_h_samp_factor = MAX(cinfo->max_h_samp_factor,
compptr->h_samp_factor);
@@ -75,10 +77,10 @@ initial_setup (j_decompress_ptr cinfo)
compptr->v_samp_factor);
}
-#if JPEG_LIB_VERSION >=80
- cinfo->block_size = DCTSIZE;
- cinfo->natural_order = jpeg_natural_order;
- cinfo->lim_Se = DCTSIZE2-1;
+#if JPEG_LIB_VERSION >= 80
+ cinfo->block_size = DCTSIZE;
+ cinfo->natural_order = jpeg_natural_order;
+ cinfo->lim_Se = DCTSIZE2 - 1;
#endif
/* We initialize DCT_scaled_size and min_DCT_scaled_size to DCTSIZE.
@@ -101,11 +103,11 @@ initial_setup (j_decompress_ptr cinfo)
#endif
/* Size in DCT blocks */
compptr->width_in_blocks = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_width * (long) compptr->h_samp_factor,
- (long) (cinfo->max_h_samp_factor * DCTSIZE));
+ jdiv_round_up((long)cinfo->image_width * (long)compptr->h_samp_factor,
+ (long)(cinfo->max_h_samp_factor * DCTSIZE));
compptr->height_in_blocks = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_height * (long) compptr->v_samp_factor,
- (long) (cinfo->max_v_samp_factor * DCTSIZE));
+ jdiv_round_up((long)cinfo->image_height * (long)compptr->v_samp_factor,
+ (long)(cinfo->max_v_samp_factor * DCTSIZE));
/* Set the first and last MCU columns to decompress from multi-scan images.
* By default, decompress all of the MCU columns.
*/
@@ -117,11 +119,11 @@ initial_setup (j_decompress_ptr cinfo)
*/
/* Size in samples */
compptr->downsampled_width = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_width * (long) compptr->h_samp_factor,
- (long) cinfo->max_h_samp_factor);
+ jdiv_round_up((long)cinfo->image_width * (long)compptr->h_samp_factor,
+ (long)cinfo->max_h_samp_factor);
compptr->downsampled_height = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_height * (long) compptr->v_samp_factor,
- (long) cinfo->max_v_samp_factor);
+ jdiv_round_up((long)cinfo->image_height * (long)compptr->v_samp_factor,
+ (long)cinfo->max_v_samp_factor);
/* Mark component needed, until color conversion says otherwise */
compptr->component_needed = TRUE;
/* Mark no quantization table yet saved for component */
@@ -130,8 +132,8 @@ initial_setup (j_decompress_ptr cinfo)
/* Compute number of fully interleaved MCU rows. */
cinfo->total_iMCU_rows = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_height,
- (long) (cinfo->max_v_samp_factor*DCTSIZE));
+ jdiv_round_up((long)cinfo->image_height,
+ (long)(cinfo->max_v_samp_factor * DCTSIZE));
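
For reference, jdiv_round_up() in jutils.c is ceiling division, so the computation above rounds partial iMCU rows up. A worked example with common 4:2:0 parameters:

    #include <stdio.h>

    static long jdiv_round_up(long a, long b) { return (a + b - 1) / b; }

    int main(void)
    {
      long image_height = 600;
      /* max_v_samp_factor = 2, DCTSIZE = 8: each iMCU row spans 16 pixel
       * rows, so a 600-row image needs ceil(600 / 16) = 38 iMCU rows; the
       * 38th is only partially full and gets padded. */
      printf("total_iMCU_rows = %ld\n", jdiv_round_up(image_height, 2L * 8L));
      return 0;
    }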
/* Decide whether file contains multiple scans */
if (cinfo->comps_in_scan < cinfo->num_components || cinfo->progressive_mode)
@@ -142,7 +144,7 @@ initial_setup (j_decompress_ptr cinfo)
LOCAL(void)
-per_scan_setup (j_decompress_ptr cinfo)
+per_scan_setup(j_decompress_ptr cinfo)
/* Do computations that are needed before processing a JPEG scan */
/* cinfo->comps_in_scan and cinfo->cur_comp_info[] were set from SOS marker */
{
@@ -167,7 +169,7 @@ per_scan_setup (j_decompress_ptr cinfo)
/* For noninterleaved scans, it is convenient to define last_row_height
* as the number of block rows present in the last iMCU row.
*/
- tmp = (int) (compptr->height_in_blocks % compptr->v_samp_factor);
+ tmp = (int)(compptr->height_in_blocks % compptr->v_samp_factor);
if (tmp == 0) tmp = compptr->v_samp_factor;
compptr->last_row_height = tmp;
@@ -184,11 +186,11 @@ per_scan_setup (j_decompress_ptr cinfo)
/* Overall image size in MCUs */
cinfo->MCUs_per_row = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_width,
- (long) (cinfo->max_h_samp_factor*DCTSIZE));
+ jdiv_round_up((long)cinfo->image_width,
+ (long)(cinfo->max_h_samp_factor * DCTSIZE));
cinfo->MCU_rows_in_scan = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_height,
- (long) (cinfo->max_v_samp_factor*DCTSIZE));
+ jdiv_round_up((long)cinfo->image_height,
+ (long)(cinfo->max_v_samp_factor * DCTSIZE));
cinfo->blocks_in_MCU = 0;
@@ -198,12 +200,13 @@ per_scan_setup (j_decompress_ptr cinfo)
compptr->MCU_width = compptr->h_samp_factor;
compptr->MCU_height = compptr->v_samp_factor;
compptr->MCU_blocks = compptr->MCU_width * compptr->MCU_height;
- compptr->MCU_sample_width = compptr->MCU_width * compptr->_DCT_scaled_size;
+ compptr->MCU_sample_width = compptr->MCU_width *
+ compptr->_DCT_scaled_size;
/* Figure number of non-dummy blocks in last MCU column & row */
- tmp = (int) (compptr->width_in_blocks % compptr->MCU_width);
+ tmp = (int)(compptr->width_in_blocks % compptr->MCU_width);
if (tmp == 0) tmp = compptr->MCU_width;
compptr->last_col_width = tmp;
- tmp = (int) (compptr->height_in_blocks % compptr->MCU_height);
+ tmp = (int)(compptr->height_in_blocks % compptr->MCU_height);
if (tmp == 0) tmp = compptr->MCU_height;
compptr->last_row_height = tmp;
/* Prepare array describing MCU composition */
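
The modulo-then-substitute idiom above computes how many real (non-dummy) blocks land in the last MCU column and row. A standalone worked example:

    #include <stdio.h>

    int main(void)
    {
      int MCU_width = 2;                 /* blocks per MCU, horizontally */
      int widths[] = { 38, 39 };         /* component widths in blocks */

      for (int i = 0; i < 2; i++) {
        int tmp = widths[i] % MCU_width;
        if (tmp == 0) tmp = MCU_width;   /* exact fit: last column is full */
        printf("width %d blocks -> last_col_width %d (dummy blocks: %d)\n",
               widths[i], tmp, MCU_width - tmp);
      }
      return 0;
    }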
@@ -231,17 +234,17 @@ per_scan_setup (j_decompress_ptr cinfo)
* means that we have to save away the table actually used for each component.
* We do this by copying the table at the start of the first scan containing
* the component.
- * The JPEG spec prohibits the encoder from changing the contents of a Q-table
- * slot between scans of a component using that slot. If the encoder does so
- * anyway, this decoder will simply use the Q-table values that were current
- * at the start of the first scan for the component.
+ * Rec. ITU-T T.81 | ISO/IEC 10918-1 prohibits the encoder from changing the
+ * contents of a Q-table slot between scans of a component using that slot. If
+ * the encoder does so anyway, this decoder will simply use the Q-table values
+ * that were current at the start of the first scan for the component.
*
* The decompressor output side looks only at the saved quant tables,
* not at the current Q-table slots.
*/
LOCAL(void)
-latch_quant_tables (j_decompress_ptr cinfo)
+latch_quant_tables(j_decompress_ptr cinfo)
{
int ci, qtblno;
jpeg_component_info *compptr;
@@ -259,9 +262,9 @@ latch_quant_tables (j_decompress_ptr cinfo)
ERREXIT1(cinfo, JERR_NO_QUANT_TABLE, qtblno);
/* OK, save away the quantization table */
qtbl = (JQUANT_TBL *)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
sizeof(JQUANT_TBL));
- MEMCOPY(qtbl, cinfo->quant_tbl_ptrs[qtblno], sizeof(JQUANT_TBL));
+ memcpy(qtbl, cinfo->quant_tbl_ptrs[qtblno], sizeof(JQUANT_TBL));
compptr->quant_table = qtbl;
}
}
@@ -275,7 +278,7 @@ latch_quant_tables (j_decompress_ptr cinfo)
*/
METHODDEF(void)
-start_input_pass (j_decompress_ptr cinfo)
+start_input_pass(j_decompress_ptr cinfo)
{
per_scan_setup(cinfo);
latch_quant_tables(cinfo);
@@ -292,7 +295,7 @@ start_input_pass (j_decompress_ptr cinfo)
*/
METHODDEF(void)
-finish_input_pass (j_decompress_ptr cinfo)
+finish_input_pass(j_decompress_ptr cinfo)
{
cinfo->inputctl->consume_input = consume_markers;
}
@@ -309,9 +312,9 @@ finish_input_pass (j_decompress_ptr cinfo)
*/
METHODDEF(int)
-consume_markers (j_decompress_ptr cinfo)
+consume_markers(j_decompress_ptr cinfo)
{
- my_inputctl_ptr inputctl = (my_inputctl_ptr) cinfo->inputctl;
+ my_inputctl_ptr inputctl = (my_inputctl_ptr)cinfo->inputctl;
int val;
if (inputctl->pub.eoi_reached) /* After hitting EOI, read no further */
@@ -329,7 +332,7 @@ consume_markers (j_decompress_ptr cinfo)
* responsible for enforcing this sequencing.
*/
} else { /* 2nd or later SOS marker */
- if (! inputctl->pub.has_multiple_scans)
+ if (!inputctl->pub.has_multiple_scans)
ERREXIT(cinfo, JERR_EOI_EXPECTED); /* Oops, I wasn't expecting this! */
start_input_pass(cinfo);
}
@@ -360,16 +363,16 @@ consume_markers (j_decompress_ptr cinfo)
*/
METHODDEF(void)
-reset_input_controller (j_decompress_ptr cinfo)
+reset_input_controller(j_decompress_ptr cinfo)
{
- my_inputctl_ptr inputctl = (my_inputctl_ptr) cinfo->inputctl;
+ my_inputctl_ptr inputctl = (my_inputctl_ptr)cinfo->inputctl;
inputctl->pub.consume_input = consume_markers;
inputctl->pub.has_multiple_scans = FALSE; /* "unknown" would be better */
inputctl->pub.eoi_reached = FALSE;
inputctl->inheaders = TRUE;
/* Reset other modules */
- (*cinfo->err->reset_error_mgr) ((j_common_ptr) cinfo);
+ (*cinfo->err->reset_error_mgr) ((j_common_ptr)cinfo);
(*cinfo->marker->reset_marker_reader) (cinfo);
/* Reset progression state -- would be cleaner if entropy decoder did this */
cinfo->coef_bits = NULL;
@@ -382,15 +385,15 @@ reset_input_controller (j_decompress_ptr cinfo)
*/
GLOBAL(void)
-jinit_input_controller (j_decompress_ptr cinfo)
+jinit_input_controller(j_decompress_ptr cinfo)
{
my_inputctl_ptr inputctl;
/* Create subobject in permanent pool */
inputctl = (my_inputctl_ptr)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_PERMANENT,
sizeof(my_input_controller));
- cinfo->inputctl = (struct jpeg_input_controller *) inputctl;
+ cinfo->inputctl = (struct jpeg_input_controller *)inputctl;
/* Initialize method pointers */
inputctl->pub.consume_input = consume_markers;
inputctl->pub.reset_input_controller = reset_input_controller;
diff --git a/media/libjpeg/jdmainct.c b/media/libjpeg/jdmainct.c
index ebb069b0f4..f466b259f0 100644
--- a/media/libjpeg/jdmainct.c
+++ b/media/libjpeg/jdmainct.c
@@ -18,6 +18,7 @@
#include "jinclude.h"
#include "jdmainct.h"
+#include "jconfigint.h"
/*
@@ -112,26 +113,29 @@
/* Forward declarations */
-METHODDEF(void) process_data_simple_main
- (j_decompress_ptr cinfo, JSAMPARRAY output_buf,
- JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail);
-METHODDEF(void) process_data_context_main
- (j_decompress_ptr cinfo, JSAMPARRAY output_buf,
- JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail);
+METHODDEF(void) process_data_simple_main(j_decompress_ptr cinfo,
+ JSAMPARRAY output_buf,
+ JDIMENSION *out_row_ctr,
+ JDIMENSION out_rows_avail);
+METHODDEF(void) process_data_context_main(j_decompress_ptr cinfo,
+ JSAMPARRAY output_buf,
+ JDIMENSION *out_row_ctr,
+ JDIMENSION out_rows_avail);
#ifdef QUANT_2PASS_SUPPORTED
-METHODDEF(void) process_data_crank_post
- (j_decompress_ptr cinfo, JSAMPARRAY output_buf,
- JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail);
+METHODDEF(void) process_data_crank_post(j_decompress_ptr cinfo,
+ JSAMPARRAY output_buf,
+ JDIMENSION *out_row_ctr,
+ JDIMENSION out_rows_avail);
#endif
LOCAL(void)
-alloc_funny_pointers (j_decompress_ptr cinfo)
+alloc_funny_pointers(j_decompress_ptr cinfo)
/* Allocate space for the funny pointer lists.
* This is done only once, not once per pass.
*/
{
- my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
+ my_main_ptr main_ptr = (my_main_ptr)cinfo->main;
int ci, rgroup;
int M = cinfo->_min_DCT_scaled_size;
jpeg_component_info *compptr;
@@ -141,7 +145,7 @@ alloc_funny_pointers (j_decompress_ptr cinfo)
* We alloc both arrays with one call to save a few cycles.
*/
main_ptr->xbuffer[0] = (JSAMPIMAGE)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
cinfo->num_components * 2 * sizeof(JSAMPARRAY));
main_ptr->xbuffer[1] = main_ptr->xbuffer[0] + cinfo->num_components;
@@ -153,7 +157,7 @@ alloc_funny_pointers (j_decompress_ptr cinfo)
* We alloc both pointer lists with one call to save a few cycles.
*/
xbuf = (JSAMPARRAY)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
2 * (rgroup * (M + 4)) * sizeof(JSAMPROW));
xbuf += rgroup; /* want one row group at negative offsets */
main_ptr->xbuffer[0][ci] = xbuf;
@@ -164,7 +168,7 @@ alloc_funny_pointers (j_decompress_ptr cinfo)
LOCAL(void)
-make_funny_pointers (j_decompress_ptr cinfo)
+make_funny_pointers(j_decompress_ptr cinfo)
/* Create the funny pointer lists discussed in the comments above.
* The actual workspace is already allocated (in main_ptr->buffer),
* and the space for the pointer lists is allocated too.
@@ -172,7 +176,7 @@ make_funny_pointers (j_decompress_ptr cinfo)
* This will be repeated at the beginning of each pass.
*/
{
- my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
+ my_main_ptr main_ptr = (my_main_ptr)cinfo->main;
int ci, i, rgroup;
int M = cinfo->_min_DCT_scaled_size;
jpeg_component_info *compptr;
@@ -191,8 +195,8 @@ make_funny_pointers (j_decompress_ptr cinfo)
}
/* In the second list, put the last four row groups in swapped order */
for (i = 0; i < rgroup * 2; i++) {
- xbuf1[rgroup*(M-2) + i] = buf[rgroup*M + i];
- xbuf1[rgroup*M + i] = buf[rgroup*(M-2) + i];
+ xbuf1[rgroup * (M - 2) + i] = buf[rgroup * M + i];
+ xbuf1[rgroup * M + i] = buf[rgroup * (M - 2) + i];
}
/* The wraparound pointers at top and bottom will be filled later
* (see set_wraparound_pointers, below). Initially we want the "above"
@@ -207,13 +211,13 @@ make_funny_pointers (j_decompress_ptr cinfo)
LOCAL(void)
-set_bottom_pointers (j_decompress_ptr cinfo)
+set_bottom_pointers(j_decompress_ptr cinfo)
/* Change the pointer lists to duplicate the last sample row at the bottom
* of the image. whichptr indicates which xbuffer holds the final iMCU row.
* Also sets rowgroups_avail to indicate number of nondummy row groups in row.
*/
{
- my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
+ my_main_ptr main_ptr = (my_main_ptr)cinfo->main;
int ci, i, rgroup, iMCUheight, rows_left;
jpeg_component_info *compptr;
JSAMPARRAY xbuf;
@@ -224,20 +228,20 @@ set_bottom_pointers (j_decompress_ptr cinfo)
iMCUheight = compptr->v_samp_factor * compptr->_DCT_scaled_size;
rgroup = iMCUheight / cinfo->_min_DCT_scaled_size;
/* Count nondummy sample rows remaining for this component */
- rows_left = (int) (compptr->downsampled_height % (JDIMENSION) iMCUheight);
+ rows_left = (int)(compptr->downsampled_height % (JDIMENSION)iMCUheight);
if (rows_left == 0) rows_left = iMCUheight;
/* Count nondummy row groups. Should get same answer for each component,
* so we need only do it once.
*/
if (ci == 0) {
- main_ptr->rowgroups_avail = (JDIMENSION) ((rows_left-1) / rgroup + 1);
+ main_ptr->rowgroups_avail = (JDIMENSION)((rows_left - 1) / rgroup + 1);
}
/* Duplicate the last real sample row rgroup*2 times; this pads out the
* last partial rowgroup and ensures at least one full rowgroup of context.
*/
xbuf = main_ptr->xbuffer[main_ptr->whichptr][ci];
for (i = 0; i < rgroup * 2; i++) {
- xbuf[rows_left + i] = xbuf[rows_left-1];
+ xbuf[rows_left + i] = xbuf[rows_left - 1];
}
}
}
@@ -248,9 +252,9 @@ set_bottom_pointers (j_decompress_ptr cinfo)
*/
METHODDEF(void)
-start_pass_main (j_decompress_ptr cinfo, J_BUF_MODE pass_mode)
+start_pass_main(j_decompress_ptr cinfo, J_BUF_MODE pass_mode)
{
- my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
+ my_main_ptr main_ptr = (my_main_ptr)cinfo->main;
switch (pass_mode) {
case JBUF_PASS_THRU:
@@ -286,22 +290,21 @@ start_pass_main (j_decompress_ptr cinfo, J_BUF_MODE pass_mode)
*/
METHODDEF(void)
-process_data_simple_main (j_decompress_ptr cinfo,
- JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
- JDIMENSION out_rows_avail)
+process_data_simple_main(j_decompress_ptr cinfo, JSAMPARRAY output_buf,
+ JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail)
{
- my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
+ my_main_ptr main_ptr = (my_main_ptr)cinfo->main;
JDIMENSION rowgroups_avail;
/* Read input data if we haven't filled the main buffer yet */
- if (! main_ptr->buffer_full) {
- if (! (*cinfo->coef->decompress_data) (cinfo, main_ptr->buffer))
+ if (!main_ptr->buffer_full) {
+ if (!(*cinfo->coef->decompress_data) (cinfo, main_ptr->buffer))
return; /* suspension forced, can do nothing more */
main_ptr->buffer_full = TRUE; /* OK, we have an iMCU row to work with */
}
/* There are always min_DCT_scaled_size row groups in an iMCU row. */
- rowgroups_avail = (JDIMENSION) cinfo->_min_DCT_scaled_size;
+ rowgroups_avail = (JDIMENSION)cinfo->_min_DCT_scaled_size;
/* Note: at the bottom of the image, we may pass extra garbage row groups
* to the postprocessor. The postprocessor has to check for bottom
* of image anyway (at row resolution), so no point in us doing it too.
@@ -326,16 +329,15 @@ process_data_simple_main (j_decompress_ptr cinfo,
*/
METHODDEF(void)
-process_data_context_main (j_decompress_ptr cinfo,
- JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
- JDIMENSION out_rows_avail)
+process_data_context_main(j_decompress_ptr cinfo, JSAMPARRAY output_buf,
+ JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail)
{
- my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
+ my_main_ptr main_ptr = (my_main_ptr)cinfo->main;
/* Read input data if we haven't filled the main buffer yet */
- if (! main_ptr->buffer_full) {
- if (! (*cinfo->coef->decompress_data) (cinfo,
- main_ptr->xbuffer[main_ptr->whichptr]))
+ if (!main_ptr->buffer_full) {
+ if (!(*cinfo->coef->decompress_data) (cinfo,
+ main_ptr->xbuffer[main_ptr->whichptr]))
return; /* suspension forced, can do nothing more */
main_ptr->buffer_full = TRUE; /* OK, we have an iMCU row to work with */
main_ptr->iMCU_row_ctr++; /* count rows received */
@@ -349,31 +351,35 @@ process_data_context_main (j_decompress_ptr cinfo,
switch (main_ptr->context_state) {
case CTX_POSTPONED_ROW:
/* Call postprocessor using previously set pointers for postponed row */
- (*cinfo->post->post_process_data) (cinfo, main_ptr->xbuffer[main_ptr->whichptr],
- &main_ptr->rowgroup_ctr, main_ptr->rowgroups_avail,
- output_buf, out_row_ctr, out_rows_avail);
+ (*cinfo->post->post_process_data) (cinfo,
+ main_ptr->xbuffer[main_ptr->whichptr],
+ &main_ptr->rowgroup_ctr,
+ main_ptr->rowgroups_avail, output_buf,
+ out_row_ctr, out_rows_avail);
if (main_ptr->rowgroup_ctr < main_ptr->rowgroups_avail)
return; /* Need to suspend */
main_ptr->context_state = CTX_PREPARE_FOR_IMCU;
if (*out_row_ctr >= out_rows_avail)
return; /* Postprocessor exactly filled output buf */
- /*FALLTHROUGH*/
+ FALLTHROUGH /*FALLTHROUGH*/
case CTX_PREPARE_FOR_IMCU:
/* Prepare to process first M-1 row groups of this iMCU row */
main_ptr->rowgroup_ctr = 0;
- main_ptr->rowgroups_avail = (JDIMENSION) (cinfo->_min_DCT_scaled_size - 1);
+ main_ptr->rowgroups_avail = (JDIMENSION)(cinfo->_min_DCT_scaled_size - 1);
/* Check for bottom of image: if so, tweak pointers to "duplicate"
* the last sample row, and adjust rowgroups_avail to ignore padding rows.
*/
if (main_ptr->iMCU_row_ctr == cinfo->total_iMCU_rows)
set_bottom_pointers(cinfo);
main_ptr->context_state = CTX_PROCESS_IMCU;
- /*FALLTHROUGH*/
+ FALLTHROUGH /*FALLTHROUGH*/
case CTX_PROCESS_IMCU:
/* Call postprocessor using previously set pointers */
- (*cinfo->post->post_process_data) (cinfo, main_ptr->xbuffer[main_ptr->whichptr],
- &main_ptr->rowgroup_ctr, main_ptr->rowgroups_avail,
- output_buf, out_row_ctr, out_rows_avail);
+ (*cinfo->post->post_process_data) (cinfo,
+ main_ptr->xbuffer[main_ptr->whichptr],
+ &main_ptr->rowgroup_ctr,
+ main_ptr->rowgroups_avail, output_buf,
+ out_row_ctr, out_rows_avail);
if (main_ptr->rowgroup_ctr < main_ptr->rowgroups_avail)
return; /* Need to suspend */
/* After the first iMCU, change wraparound pointers to normal state */
@@ -384,8 +390,8 @@ process_data_context_main (j_decompress_ptr cinfo,
main_ptr->buffer_full = FALSE;
/* Still need to process last row group of this iMCU row, */
/* which is saved at index M+1 of the other xbuffer */
- main_ptr->rowgroup_ctr = (JDIMENSION) (cinfo->_min_DCT_scaled_size + 1);
- main_ptr->rowgroups_avail = (JDIMENSION) (cinfo->_min_DCT_scaled_size + 2);
+ main_ptr->rowgroup_ctr = (JDIMENSION)(cinfo->_min_DCT_scaled_size + 1);
+ main_ptr->rowgroups_avail = (JDIMENSION)(cinfo->_min_DCT_scaled_size + 2);
main_ptr->context_state = CTX_POSTPONED_ROW;
}
}
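
FALLTHROUGH is the annotation newly pulled in via the jconfigint.h include at the top of this file; it marks the two deliberate case fallthroughs in the state machine so -Wimplicit-fallthrough stays quiet, while the old lint-style comment is retained for tools that key on it. The real definition is configure-generated; the following is an assumed approximation, with a tiny usage demo:

    #if defined(__has_attribute)
    #if __has_attribute(fallthrough)
    #define FALLTHROUGH __attribute__((fallthrough));
    #endif
    #endif
    #ifndef FALLTHROUGH
    #define FALLTHROUGH
    #endif

    #include <stdio.h>

    int main(void)
    {
      int state = 0, steps = 0;
      switch (state) {
      case 0:
        steps++;
        FALLTHROUGH                  /* deliberate, and now warning-clean */
      case 1:
        steps++;
      }
      printf("steps: %d\n", steps);  /* 2 */
      return 0;
    }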
@@ -400,12 +406,11 @@ process_data_context_main (j_decompress_ptr cinfo,
#ifdef QUANT_2PASS_SUPPORTED
METHODDEF(void)
-process_data_crank_post (j_decompress_ptr cinfo,
- JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
- JDIMENSION out_rows_avail)
+process_data_crank_post(j_decompress_ptr cinfo, JSAMPARRAY output_buf,
+ JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail)
{
- (*cinfo->post->post_process_data) (cinfo, (JSAMPIMAGE) NULL,
- (JDIMENSION *) NULL, (JDIMENSION) 0,
+ (*cinfo->post->post_process_data) (cinfo, (JSAMPIMAGE)NULL,
+ (JDIMENSION *)NULL, (JDIMENSION)0,
output_buf, out_row_ctr, out_rows_avail);
}
@@ -417,16 +422,16 @@ process_data_crank_post (j_decompress_ptr cinfo,
*/
GLOBAL(void)
-jinit_d_main_controller (j_decompress_ptr cinfo, boolean need_full_buffer)
+jinit_d_main_controller(j_decompress_ptr cinfo, boolean need_full_buffer)
{
my_main_ptr main_ptr;
int ci, rgroup, ngroups;
jpeg_component_info *compptr;
main_ptr = (my_main_ptr)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
sizeof(my_main_controller));
- cinfo->main = (struct jpeg_d_main_controller *) main_ptr;
+ cinfo->main = (struct jpeg_d_main_controller *)main_ptr;
main_ptr->pub.start_pass = start_pass_main;
if (need_full_buffer) /* shouldn't happen */
@@ -449,8 +454,8 @@ jinit_d_main_controller (j_decompress_ptr cinfo, boolean need_full_buffer)
rgroup = (compptr->v_samp_factor * compptr->_DCT_scaled_size) /
cinfo->_min_DCT_scaled_size; /* height of a row group of component */
main_ptr->buffer[ci] = (*cinfo->mem->alloc_sarray)
- ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ ((j_common_ptr)cinfo, JPOOL_IMAGE,
compptr->width_in_blocks * compptr->_DCT_scaled_size,
- (JDIMENSION) (rgroup * ngroups));
+ (JDIMENSION)(rgroup * ngroups));
}
}
diff --git a/media/libjpeg/jdmainct.h b/media/libjpeg/jdmainct.h
index 30903019ca..37b201ca88 100644
--- a/media/libjpeg/jdmainct.h
+++ b/media/libjpeg/jdmainct.h
@@ -44,12 +44,12 @@ typedef my_main_controller *my_main_ptr;
LOCAL(void)
-set_wraparound_pointers (j_decompress_ptr cinfo)
+set_wraparound_pointers(j_decompress_ptr cinfo)
/* Set up the "wraparound" pointers at top and bottom of the pointer lists.
* This changes the pointer list state from top-of-image to the normal state.
*/
{
- my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
+ my_main_ptr main_ptr = (my_main_ptr)cinfo->main;
int ci, i, rgroup;
int M = cinfo->_min_DCT_scaled_size;
jpeg_component_info *compptr;
@@ -62,10 +62,10 @@ set_wraparound_pointers (j_decompress_ptr cinfo)
xbuf0 = main_ptr->xbuffer[0][ci];
xbuf1 = main_ptr->xbuffer[1][ci];
for (i = 0; i < rgroup; i++) {
- xbuf0[i - rgroup] = xbuf0[rgroup*(M+1) + i];
- xbuf1[i - rgroup] = xbuf1[rgroup*(M+1) + i];
- xbuf0[rgroup*(M+2) + i] = xbuf0[i];
- xbuf1[rgroup*(M+2) + i] = xbuf1[i];
+ xbuf0[i - rgroup] = xbuf0[rgroup * (M + 1) + i];
+ xbuf1[i - rgroup] = xbuf1[rgroup * (M + 1) + i];
+ xbuf0[rgroup * (M + 2) + i] = xbuf0[i];
+ xbuf1[rgroup * (M + 2) + i] = xbuf1[i];
}
}
}
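
What the aliasing above buys: the slots one row group before the start and past the end of each pointer list point back at rows inside the buffer, so the upsampler always sees context rows at the top and bottom of an iMCU row without copying any samples. A standalone miniature with tiny illustrative sizes (ints stand in for row pointers):

    #include <stdio.h>

    int main(void)
    {
      enum { M = 2, rgroup = 1 };        /* illustrative sizes */
      int slots[2 + rgroup * (M + 3)];   /* backing store for the list */
      int *xbuf0 = slots + rgroup;       /* one row group at negative offsets */
      int i;

      for (i = 0; i < rgroup * (M + 2); i++)
        xbuf0[i] = i;                    /* stand-ins for real row pointers */

      for (i = 0; i < rgroup; i++) {     /* the wraparound aliasing above */
        xbuf0[i - rgroup] = xbuf0[rgroup * (M + 1) + i];
        xbuf0[rgroup * (M + 2) + i] = xbuf0[i];
      }
      printf("xbuf0[-1] -> row %d, xbuf0[%d] -> row %d\n",
             xbuf0[-1], rgroup * (M + 2), xbuf0[rgroup * (M + 2)]);
      return 0;
    }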
diff --git a/media/libjpeg/jdmarker.c b/media/libjpeg/jdmarker.c
index e3b612c9b9..f7eba615fd 100644
--- a/media/libjpeg/jdmarker.c
+++ b/media/libjpeg/jdmarker.c
@@ -4,7 +4,7 @@
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1991-1998, Thomas G. Lane.
* libjpeg-turbo Modifications:
- * Copyright (C) 2012, 2015, D. R. Commander.
+ * Copyright (C) 2012, 2015, 2022, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@@ -119,50 +119,50 @@ typedef my_marker_reader *my_marker_ptr;
*/
/* Declare and initialize local copies of input pointer/count */
-#define INPUT_VARS(cinfo) \
- struct jpeg_source_mgr *datasrc = (cinfo)->src; \
- const JOCTET *next_input_byte = datasrc->next_input_byte; \
- size_t bytes_in_buffer = datasrc->bytes_in_buffer
+#define INPUT_VARS(cinfo) \
+ struct jpeg_source_mgr *datasrc = (cinfo)->src; \
+ const JOCTET *next_input_byte = datasrc->next_input_byte; \
+ size_t bytes_in_buffer = datasrc->bytes_in_buffer
/* Unload the local copies --- do this only at a restart boundary */
-#define INPUT_SYNC(cinfo) \
- ( datasrc->next_input_byte = next_input_byte, \
- datasrc->bytes_in_buffer = bytes_in_buffer )
+#define INPUT_SYNC(cinfo) \
+ ( datasrc->next_input_byte = next_input_byte, \
+ datasrc->bytes_in_buffer = bytes_in_buffer )
/* Reload the local copies --- used only in MAKE_BYTE_AVAIL */
-#define INPUT_RELOAD(cinfo) \
- ( next_input_byte = datasrc->next_input_byte, \
- bytes_in_buffer = datasrc->bytes_in_buffer )
+#define INPUT_RELOAD(cinfo) \
+ ( next_input_byte = datasrc->next_input_byte, \
+ bytes_in_buffer = datasrc->bytes_in_buffer )
/* Internal macro for INPUT_BYTE and INPUT_2BYTES: make a byte available.
* Note we do *not* do INPUT_SYNC before calling fill_input_buffer,
* but we must reload the local copies after a successful fill.
*/
-#define MAKE_BYTE_AVAIL(cinfo,action) \
- if (bytes_in_buffer == 0) { \
- if (! (*datasrc->fill_input_buffer) (cinfo)) \
- { action; } \
- INPUT_RELOAD(cinfo); \
- }
+#define MAKE_BYTE_AVAIL(cinfo, action) \
+ if (bytes_in_buffer == 0) { \
+ if (!(*datasrc->fill_input_buffer) (cinfo)) \
+ { action; } \
+ INPUT_RELOAD(cinfo); \
+ }
/* Read a byte into variable V.
* If must suspend, take the specified action (typically "return FALSE").
*/
-#define INPUT_BYTE(cinfo,V,action) \
- MAKESTMT( MAKE_BYTE_AVAIL(cinfo,action); \
- bytes_in_buffer--; \
- V = GETJOCTET(*next_input_byte++); )
+#define INPUT_BYTE(cinfo, V, action) \
+ MAKESTMT( MAKE_BYTE_AVAIL(cinfo, action); \
+ bytes_in_buffer--; \
+ V = *next_input_byte++; )
/* As above, but read two bytes interpreted as an unsigned 16-bit integer.
* V should be declared unsigned int or perhaps JLONG.
*/
-#define INPUT_2BYTES(cinfo,V,action) \
- MAKESTMT( MAKE_BYTE_AVAIL(cinfo,action); \
- bytes_in_buffer--; \
- V = ((unsigned int) GETJOCTET(*next_input_byte++)) << 8; \
- MAKE_BYTE_AVAIL(cinfo,action); \
- bytes_in_buffer--; \
- V += GETJOCTET(*next_input_byte++); )
+#define INPUT_2BYTES(cinfo, V, action) \
+ MAKESTMT( MAKE_BYTE_AVAIL(cinfo, action); \
+ bytes_in_buffer--; \
+ V = ((unsigned int)(*next_input_byte++)) << 8; \
+ MAKE_BYTE_AVAIL(cinfo, action); \
+ bytes_in_buffer--; \
+ V += *next_input_byte++; )
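
Dropping GETJOCTET() here is safe because JOCTET is an 8-bit unsigned char on every supported platform, which made the macro an identity; that also leaves the big-endian assembly in INPUT_2BYTES easy to read. A worked example:

    #include <stdio.h>

    int main(void)
    {
      unsigned char buf[] = { 0x00, 0x54 };   /* e.g. a marker length field */
      unsigned int V = ((unsigned int)buf[0]) << 8;
      V += buf[1];
      printf("length = %u\n", V);             /* 84 */
      return 0;
    }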
/*
@@ -197,7 +197,7 @@ typedef my_marker_reader *my_marker_ptr;
LOCAL(boolean)
-get_soi (j_decompress_ptr cinfo)
+get_soi(j_decompress_ptr cinfo)
/* Process an SOI marker */
{
int i;
@@ -237,7 +237,7 @@ get_soi (j_decompress_ptr cinfo)
LOCAL(boolean)
-get_sof (j_decompress_ptr cinfo, boolean is_prog, boolean is_arith)
+get_sof(j_decompress_ptr cinfo, boolean is_prog, boolean is_arith)
/* Process a SOFn marker */
{
JLONG length;
@@ -258,7 +258,7 @@ get_sof (j_decompress_ptr cinfo, boolean is_prog, boolean is_arith)
length -= 8;
TRACEMS4(cinfo, 1, JTRC_SOF, cinfo->unread_marker,
- (int) cinfo->image_width, (int) cinfo->image_height,
+ (int)cinfo->image_width, (int)cinfo->image_height,
cinfo->num_components);
if (cinfo->marker->saw_SOF)
@@ -267,16 +267,16 @@ get_sof (j_decompress_ptr cinfo, boolean is_prog, boolean is_arith)
/* We don't support files in which the image height is initially specified */
/* as 0 and is later redefined by DNL. As long as we have to check that, */
/* might as well have a general sanity check. */
- if (cinfo->image_height <= 0 || cinfo->image_width <= 0
- || cinfo->num_components <= 0)
+ if (cinfo->image_height <= 0 || cinfo->image_width <= 0 ||
+ cinfo->num_components <= 0)
ERREXIT(cinfo, JERR_EMPTY_IMAGE);
if (length != (cinfo->num_components * 3))
ERREXIT(cinfo, JERR_BAD_LENGTH);
if (cinfo->comp_info == NULL) /* do only once, even if suspend */
- cinfo->comp_info = (jpeg_component_info *) (*cinfo->mem->alloc_small)
- ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ cinfo->comp_info = (jpeg_component_info *)(*cinfo->mem->alloc_small)
+ ((j_common_ptr)cinfo, JPOOL_IMAGE,
cinfo->num_components * sizeof(jpeg_component_info));
for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
@@ -301,7 +301,7 @@ get_sof (j_decompress_ptr cinfo, boolean is_prog, boolean is_arith)
LOCAL(boolean)
-get_sos (j_decompress_ptr cinfo)
+get_sos(j_decompress_ptr cinfo)
/* Process a SOS marker */
{
JLONG length;
@@ -309,7 +309,7 @@ get_sos (j_decompress_ptr cinfo)
jpeg_component_info *compptr;
INPUT_VARS(cinfo);
- if (! cinfo->marker->saw_SOF)
+ if (!cinfo->marker->saw_SOF)
ERREXIT(cinfo, JERR_SOS_NO_SOF);
INPUT_2BYTES(cinfo, length, return FALSE);
@@ -341,7 +341,7 @@ get_sos (j_decompress_ptr cinfo)
ERREXIT1(cinfo, JERR_BAD_COMPONENT_ID, cc);
- id_found:
+id_found:
cinfo->cur_comp_info[i] = compptr;
compptr->dc_tbl_no = (c >> 4) & 15;
@@ -384,7 +384,7 @@ get_sos (j_decompress_ptr cinfo)
#ifdef D_ARITH_CODING_SUPPORTED
LOCAL(boolean)
-get_dac (j_decompress_ptr cinfo)
+get_dac(j_decompress_ptr cinfo)
/* Process a DAC marker */
{
JLONG length;
@@ -402,14 +402,14 @@ get_dac (j_decompress_ptr cinfo)
TRACEMS2(cinfo, 1, JTRC_DAC, index, val);
- if (index < 0 || index >= (2*NUM_ARITH_TBLS))
+ if (index < 0 || index >= (2 * NUM_ARITH_TBLS))
ERREXIT1(cinfo, JERR_DAC_INDEX, index);
if (index >= NUM_ARITH_TBLS) { /* define AC table */
- cinfo->arith_ac_K[index-NUM_ARITH_TBLS] = (UINT8) val;
+ cinfo->arith_ac_K[index - NUM_ARITH_TBLS] = (UINT8)val;
} else { /* define DC table */
- cinfo->arith_dc_L[index] = (UINT8) (val & 0x0F);
- cinfo->arith_dc_U[index] = (UINT8) (val >> 4);
+ cinfo->arith_dc_L[index] = (UINT8)(val & 0x0F);
+ cinfo->arith_dc_U[index] = (UINT8)(val >> 4);
if (cinfo->arith_dc_L[index] > cinfo->arith_dc_U[index])
ERREXIT1(cinfo, JERR_DAC_VALUE, val);
}
@@ -422,7 +422,7 @@ get_dac (j_decompress_ptr cinfo)
return TRUE;
}
-#else /* ! D_ARITH_CODING_SUPPORTED */
+#else /* !D_ARITH_CODING_SUPPORTED */
#define get_dac(cinfo) skip_variable(cinfo)
@@ -430,7 +430,7 @@ get_dac (j_decompress_ptr cinfo)
LOCAL(boolean)
-get_dht (j_decompress_ptr cinfo)
+get_dht(j_decompress_ptr cinfo)
/* Process a DHT marker */
{
JLONG length;
@@ -467,13 +467,13 @@ get_dht (j_decompress_ptr cinfo)
/* Here we just do minimal validation of the counts to avoid walking
* off the end of our table space. jdhuff.c will check more carefully.
*/
- if (count > 256 || ((JLONG) count) > length)
+ if (count > 256 || ((JLONG)count) > length)
ERREXIT(cinfo, JERR_BAD_HUFF_TABLE);
for (i = 0; i < count; i++)
INPUT_BYTE(cinfo, huffval[i], return FALSE);
- MEMZERO(&huffval[count], (256 - count) * sizeof(UINT8));
+ memset(&huffval[count], 0, (256 - count) * sizeof(UINT8));
length -= count;
@@ -489,10 +489,10 @@ get_dht (j_decompress_ptr cinfo)
}
if (*htblptr == NULL)
- *htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo);
+ *htblptr = jpeg_alloc_huff_table((j_common_ptr)cinfo);
- MEMCOPY((*htblptr)->bits, bits, sizeof((*htblptr)->bits));
- MEMCOPY((*htblptr)->huffval, huffval, sizeof((*htblptr)->huffval));
+ memcpy((*htblptr)->bits, bits, sizeof((*htblptr)->bits));
+ memcpy((*htblptr)->huffval, huffval, sizeof((*htblptr)->huffval));
}
if (length != 0)
@@ -504,7 +504,7 @@ get_dht (j_decompress_ptr cinfo)
LOCAL(boolean)
-get_dqt (j_decompress_ptr cinfo)
+get_dqt(j_decompress_ptr cinfo)
/* Process a DQT marker */
{
JLONG length;
@@ -527,7 +527,7 @@ get_dqt (j_decompress_ptr cinfo)
ERREXIT1(cinfo, JERR_DQT_INDEX, n);
if (cinfo->quant_tbl_ptrs[n] == NULL)
- cinfo->quant_tbl_ptrs[n] = jpeg_alloc_quant_table((j_common_ptr) cinfo);
+ cinfo->quant_tbl_ptrs[n] = jpeg_alloc_quant_table((j_common_ptr)cinfo);
quant_ptr = cinfo->quant_tbl_ptrs[n];
for (i = 0; i < DCTSIZE2; i++) {
@@ -536,20 +536,20 @@ get_dqt (j_decompress_ptr cinfo)
else
INPUT_BYTE(cinfo, tmp, return FALSE);
/* We convert the zigzag-order table to natural array order. */
- quant_ptr->quantval[jpeg_natural_order[i]] = (UINT16) tmp;
+ quant_ptr->quantval[jpeg_natural_order[i]] = (UINT16)tmp;
}
if (cinfo->err->trace_level >= 2) {
for (i = 0; i < DCTSIZE2; i += 8) {
TRACEMS8(cinfo, 2, JTRC_QUANTVALS,
- quant_ptr->quantval[i], quant_ptr->quantval[i+1],
- quant_ptr->quantval[i+2], quant_ptr->quantval[i+3],
- quant_ptr->quantval[i+4], quant_ptr->quantval[i+5],
- quant_ptr->quantval[i+6], quant_ptr->quantval[i+7]);
+ quant_ptr->quantval[i], quant_ptr->quantval[i + 1],
+ quant_ptr->quantval[i + 2], quant_ptr->quantval[i + 3],
+ quant_ptr->quantval[i + 4], quant_ptr->quantval[i + 5],
+ quant_ptr->quantval[i + 6], quant_ptr->quantval[i + 7]);
}
}
- length -= DCTSIZE2+1;
+ length -= DCTSIZE2 + 1;
if (prec) length -= DCTSIZE2;
}
@@ -562,7 +562,7 @@ get_dqt (j_decompress_ptr cinfo)
LOCAL(boolean)
-get_dri (j_decompress_ptr cinfo)
+get_dri(j_decompress_ptr cinfo)
/* Process a DRI marker */
{
JLONG length;
@@ -598,28 +598,28 @@ get_dri (j_decompress_ptr cinfo)
LOCAL(void)
-examine_app0 (j_decompress_ptr cinfo, JOCTET *data,
- unsigned int datalen, JLONG remaining)
+examine_app0(j_decompress_ptr cinfo, JOCTET *data, unsigned int datalen,
+ JLONG remaining)
/* Examine first few bytes from an APP0.
* Take appropriate action if it is a JFIF marker.
* datalen is # of bytes at data[], remaining is length of rest of marker data.
*/
{
- JLONG totallen = (JLONG) datalen + remaining;
+ JLONG totallen = (JLONG)datalen + remaining;
if (datalen >= APP0_DATA_LEN &&
- GETJOCTET(data[0]) == 0x4A &&
- GETJOCTET(data[1]) == 0x46 &&
- GETJOCTET(data[2]) == 0x49 &&
- GETJOCTET(data[3]) == 0x46 &&
- GETJOCTET(data[4]) == 0) {
+ data[0] == 0x4A &&
+ data[1] == 0x46 &&
+ data[2] == 0x49 &&
+ data[3] == 0x46 &&
+ data[4] == 0) {
/* Found JFIF APP0 marker: save info */
cinfo->saw_JFIF_marker = TRUE;
- cinfo->JFIF_major_version = GETJOCTET(data[5]);
- cinfo->JFIF_minor_version = GETJOCTET(data[6]);
- cinfo->density_unit = GETJOCTET(data[7]);
- cinfo->X_density = (GETJOCTET(data[8]) << 8) + GETJOCTET(data[9]);
- cinfo->Y_density = (GETJOCTET(data[10]) << 8) + GETJOCTET(data[11]);
+ cinfo->JFIF_major_version = data[5];
+ cinfo->JFIF_minor_version = data[6];
+ cinfo->density_unit = data[7];
+ cinfo->X_density = (data[8] << 8) + data[9];
+ cinfo->Y_density = (data[10] << 8) + data[11];
/* Check version.
* Major version must be 1, anything else signals an incompatible change.
* (We used to treat this as an error, but now it's a nonfatal warning,
@@ -634,48 +634,45 @@ examine_app0 (j_decompress_ptr cinfo, JOCTET *data,
cinfo->JFIF_major_version, cinfo->JFIF_minor_version,
cinfo->X_density, cinfo->Y_density, cinfo->density_unit);
/* Validate thumbnail dimensions and issue appropriate messages */
- if (GETJOCTET(data[12]) | GETJOCTET(data[13]))
- TRACEMS2(cinfo, 1, JTRC_JFIF_THUMBNAIL,
- GETJOCTET(data[12]), GETJOCTET(data[13]));
+ if (data[12] | data[13])
+ TRACEMS2(cinfo, 1, JTRC_JFIF_THUMBNAIL, data[12], data[13]);
totallen -= APP0_DATA_LEN;
- if (totallen !=
- ((JLONG)GETJOCTET(data[12]) * (JLONG)GETJOCTET(data[13]) * (JLONG) 3))
- TRACEMS1(cinfo, 1, JTRC_JFIF_BADTHUMBNAILSIZE, (int) totallen);
+ if (totallen != ((JLONG)data[12] * (JLONG)data[13] * (JLONG)3))
+ TRACEMS1(cinfo, 1, JTRC_JFIF_BADTHUMBNAILSIZE, (int)totallen);
} else if (datalen >= 6 &&
- GETJOCTET(data[0]) == 0x4A &&
- GETJOCTET(data[1]) == 0x46 &&
- GETJOCTET(data[2]) == 0x58 &&
- GETJOCTET(data[3]) == 0x58 &&
- GETJOCTET(data[4]) == 0) {
+ data[0] == 0x4A &&
+ data[1] == 0x46 &&
+ data[2] == 0x58 &&
+ data[3] == 0x58 &&
+ data[4] == 0) {
/* Found JFIF "JFXX" extension APP0 marker */
/* The library doesn't actually do anything with these,
* but we try to produce a helpful trace message.
*/
- switch (GETJOCTET(data[5])) {
+ switch (data[5]) {
case 0x10:
- TRACEMS1(cinfo, 1, JTRC_THUMB_JPEG, (int) totallen);
+ TRACEMS1(cinfo, 1, JTRC_THUMB_JPEG, (int)totallen);
break;
case 0x11:
- TRACEMS1(cinfo, 1, JTRC_THUMB_PALETTE, (int) totallen);
+ TRACEMS1(cinfo, 1, JTRC_THUMB_PALETTE, (int)totallen);
break;
case 0x13:
- TRACEMS1(cinfo, 1, JTRC_THUMB_RGB, (int) totallen);
+ TRACEMS1(cinfo, 1, JTRC_THUMB_RGB, (int)totallen);
break;
default:
- TRACEMS2(cinfo, 1, JTRC_JFIF_EXTENSION,
- GETJOCTET(data[5]), (int) totallen);
+ TRACEMS2(cinfo, 1, JTRC_JFIF_EXTENSION, data[5], (int)totallen);
break;
}
} else {
/* Start of APP0 does not match "JFIF" or "JFXX", or too short */
- TRACEMS1(cinfo, 1, JTRC_APP0, (int) totallen);
+ TRACEMS1(cinfo, 1, JTRC_APP0, (int)totallen);
}
}
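
The thumbnail sanity check above compares the bytes remaining after the 14-byte JFIF payload against W*H uncompressed RGB triples. A worked example:

    #include <stdio.h>

    int main(void)
    {
      unsigned thumb_w = 16, thumb_h = 8;     /* data[12] and data[13] */
      long expected = (long)thumb_w * (long)thumb_h * 3L;
      printf("a %ux%u thumbnail needs %ld trailing bytes\n",
             thumb_w, thumb_h, expected);     /* 384 */
      return 0;
    }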
LOCAL(void)
-examine_app14 (j_decompress_ptr cinfo, JOCTET *data,
- unsigned int datalen, JLONG remaining)
+examine_app14(j_decompress_ptr cinfo, JOCTET *data, unsigned int datalen,
+ JLONG remaining)
/* Examine first few bytes from an APP14.
* Take appropriate action if it is an Adobe marker.
* datalen is # of bytes at data[], remaining is length of rest of marker data.
@@ -684,28 +681,28 @@ examine_app14 (j_decompress_ptr cinfo, JOCTET *data,
unsigned int version, flags0, flags1, transform;
if (datalen >= APP14_DATA_LEN &&
- GETJOCTET(data[0]) == 0x41 &&
- GETJOCTET(data[1]) == 0x64 &&
- GETJOCTET(data[2]) == 0x6F &&
- GETJOCTET(data[3]) == 0x62 &&
- GETJOCTET(data[4]) == 0x65) {
+ data[0] == 0x41 &&
+ data[1] == 0x64 &&
+ data[2] == 0x6F &&
+ data[3] == 0x62 &&
+ data[4] == 0x65) {
/* Found Adobe APP14 marker */
- version = (GETJOCTET(data[5]) << 8) + GETJOCTET(data[6]);
- flags0 = (GETJOCTET(data[7]) << 8) + GETJOCTET(data[8]);
- flags1 = (GETJOCTET(data[9]) << 8) + GETJOCTET(data[10]);
- transform = GETJOCTET(data[11]);
+ version = (data[5] << 8) + data[6];
+ flags0 = (data[7] << 8) + data[8];
+ flags1 = (data[9] << 8) + data[10];
+ transform = data[11];
TRACEMS4(cinfo, 1, JTRC_ADOBE, version, flags0, flags1, transform);
cinfo->saw_Adobe_marker = TRUE;
- cinfo->Adobe_transform = (UINT8) transform;
+ cinfo->Adobe_transform = (UINT8)transform;
} else {
/* Start of APP14 does not match "Adobe", or too short */
- TRACEMS1(cinfo, 1, JTRC_APP14, (int) (datalen + remaining));
+ TRACEMS1(cinfo, 1, JTRC_APP14, (int)(datalen + remaining));
}
}
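
For reference, the saved Adobe_transform byte is consulted later when guessing the color space; the values below follow the Adobe APP14 convention as libjpeg interprets it (a sketch, not a header from the patch):

    enum adobe_transform_sketch {
      ADOBE_TRANSFORM_UNKNOWN = 0,   /* RGB or CMYK, stored directly */
      ADOBE_TRANSFORM_YCBCR = 1,     /* 3-channel YCbCr */
      ADOBE_TRANSFORM_YCCK = 2       /* 4-channel YCCK (CMYK via YCC) */
    };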
METHODDEF(boolean)
-get_interesting_appn (j_decompress_ptr cinfo)
+get_interesting_appn(j_decompress_ptr cinfo)
/* Process an APP0 or APP14 marker without saving it */
{
JLONG length;
@@ -720,7 +717,7 @@ get_interesting_appn (j_decompress_ptr cinfo)
if (length >= APPN_DATA_LEN)
numtoread = APPN_DATA_LEN;
else if (length > 0)
- numtoread = (unsigned int) length;
+ numtoread = (unsigned int)length;
else
numtoread = 0;
for (i = 0; i < numtoread; i++)
@@ -730,10 +727,10 @@ get_interesting_appn (j_decompress_ptr cinfo)
/* process it */
switch (cinfo->unread_marker) {
case M_APP0:
- examine_app0(cinfo, (JOCTET *) b, numtoread, length);
+ examine_app0(cinfo, (JOCTET *)b, numtoread, length);
break;
case M_APP14:
- examine_app14(cinfo, (JOCTET *) b, numtoread, length);
+ examine_app14(cinfo, (JOCTET *)b, numtoread, length);
break;
default:
/* can't get here unless jpeg_save_markers chooses wrong processor */
@@ -744,7 +741,7 @@ get_interesting_appn (j_decompress_ptr cinfo)
/* skip any remaining data -- could be lots */
INPUT_SYNC(cinfo);
if (length > 0)
- (*cinfo->src->skip_input_data) (cinfo, (long) length);
+ (*cinfo->src->skip_input_data) (cinfo, (long)length);
return TRUE;
}
@@ -753,10 +750,10 @@ get_interesting_appn (j_decompress_ptr cinfo)
#ifdef SAVE_MARKERS_SUPPORTED
METHODDEF(boolean)
-save_marker (j_decompress_ptr cinfo)
+save_marker(j_decompress_ptr cinfo)
/* Save an APPn or COM marker into the marker list */
{
- my_marker_ptr marker = (my_marker_ptr) cinfo->marker;
+ my_marker_ptr marker = (my_marker_ptr)cinfo->marker;
jpeg_saved_marker_ptr cur_marker = marker->cur_marker;
unsigned int bytes_read, data_length;
JOCTET *data;
@@ -770,22 +767,22 @@ save_marker (j_decompress_ptr cinfo)
if (length >= 0) { /* watch out for bogus length word */
/* figure out how much we want to save */
unsigned int limit;
- if (cinfo->unread_marker == (int) M_COM)
+ if (cinfo->unread_marker == (int)M_COM)
limit = marker->length_limit_COM;
else
- limit = marker->length_limit_APPn[cinfo->unread_marker - (int) M_APP0];
- if ((unsigned int) length < limit)
- limit = (unsigned int) length;
+ limit = marker->length_limit_APPn[cinfo->unread_marker - (int)M_APP0];
+ if ((unsigned int)length < limit)
+ limit = (unsigned int)length;
/* allocate and initialize the marker item */
cur_marker = (jpeg_saved_marker_ptr)
- (*cinfo->mem->alloc_large) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_large) ((j_common_ptr)cinfo, JPOOL_IMAGE,
sizeof(struct jpeg_marker_struct) + limit);
cur_marker->next = NULL;
- cur_marker->marker = (UINT8) cinfo->unread_marker;
- cur_marker->original_length = (unsigned int) length;
+ cur_marker->marker = (UINT8)cinfo->unread_marker;
+ cur_marker->original_length = (unsigned int)length;
cur_marker->data_length = limit;
/* data area is just beyond the jpeg_marker_struct */
- data = cur_marker->data = (JOCTET *) (cur_marker + 1);
+ data = cur_marker->data = (JOCTET *)(cur_marker + 1);
marker->cur_marker = cur_marker;
marker->bytes_read = 0;
bytes_read = 0;
@@ -843,14 +840,14 @@ save_marker (j_decompress_ptr cinfo)
break;
default:
TRACEMS2(cinfo, 1, JTRC_MISC_MARKER, cinfo->unread_marker,
- (int) (data_length + length));
+ (int)(data_length + length));
break;
}
/* skip any remaining data -- could be lots */
INPUT_SYNC(cinfo); /* do before skip_input_data */
if (length > 0)
- (*cinfo->src->skip_input_data) (cinfo, (long) length);
+ (*cinfo->src->skip_input_data) (cinfo, (long)length);
return TRUE;
}
@@ -859,7 +856,7 @@ save_marker (j_decompress_ptr cinfo)
METHODDEF(boolean)
-skip_variable (j_decompress_ptr cinfo)
+skip_variable(j_decompress_ptr cinfo)
/* Skip over an unknown or uninteresting variable-length marker */
{
JLONG length;
@@ -868,11 +865,11 @@ skip_variable (j_decompress_ptr cinfo)
INPUT_2BYTES(cinfo, length, return FALSE);
length -= 2;
- TRACEMS2(cinfo, 1, JTRC_MISC_MARKER, cinfo->unread_marker, (int) length);
+ TRACEMS2(cinfo, 1, JTRC_MISC_MARKER, cinfo->unread_marker, (int)length);
INPUT_SYNC(cinfo); /* do before skip_input_data */
if (length > 0)
- (*cinfo->src->skip_input_data) (cinfo, (long) length);
+ (*cinfo->src->skip_input_data) (cinfo, (long)length);
return TRUE;
}
@@ -888,7 +885,7 @@ skip_variable (j_decompress_ptr cinfo)
*/
LOCAL(boolean)
-next_marker (j_decompress_ptr cinfo)
+next_marker(j_decompress_ptr cinfo)
{
int c;
INPUT_VARS(cinfo);
@@ -935,7 +932,7 @@ next_marker (j_decompress_ptr cinfo)
LOCAL(boolean)
-first_marker (j_decompress_ptr cinfo)
+first_marker(j_decompress_ptr cinfo)
/* Like next_marker, but used to obtain the initial SOI marker. */
/* For this marker, we do not allow preceding garbage or fill; otherwise,
* we might well scan an entire input file before realizing it ain't JPEG.
@@ -948,7 +945,7 @@ first_marker (j_decompress_ptr cinfo)
INPUT_BYTE(cinfo, c, return FALSE);
INPUT_BYTE(cinfo, c2, return FALSE);
- if (c != 0xFF || c2 != (int) M_SOI)
+ if (c != 0xFF || c2 != (int)M_SOI)
ERREXIT2(cinfo, JERR_NO_SOI, c, c2);
cinfo->unread_marker = c2;
@@ -966,18 +963,18 @@ first_marker (j_decompress_ptr cinfo)
*/
METHODDEF(int)
-read_markers (j_decompress_ptr cinfo)
+read_markers(j_decompress_ptr cinfo)
{
/* Outer loop repeats once for each marker. */
for (;;) {
/* Collect the marker proper, unless we already did. */
/* NB: first_marker() enforces the requirement that SOI appear first. */
if (cinfo->unread_marker == 0) {
- if (! cinfo->marker->saw_SOI) {
- if (! first_marker(cinfo))
+ if (!cinfo->marker->saw_SOI) {
+ if (!first_marker(cinfo))
return JPEG_SUSPENDED;
} else {
- if (! next_marker(cinfo))
+ if (!next_marker(cinfo))
return JPEG_SUSPENDED;
}
}
@@ -987,28 +984,28 @@ read_markers (j_decompress_ptr cinfo)
*/
switch (cinfo->unread_marker) {
case M_SOI:
- if (! get_soi(cinfo))
+ if (!get_soi(cinfo))
return JPEG_SUSPENDED;
break;
case M_SOF0: /* Baseline */
case M_SOF1: /* Extended sequential, Huffman */
- if (! get_sof(cinfo, FALSE, FALSE))
+ if (!get_sof(cinfo, FALSE, FALSE))
return JPEG_SUSPENDED;
break;
case M_SOF2: /* Progressive, Huffman */
- if (! get_sof(cinfo, TRUE, FALSE))
+ if (!get_sof(cinfo, TRUE, FALSE))
return JPEG_SUSPENDED;
break;
case M_SOF9: /* Extended sequential, arithmetic */
- if (! get_sof(cinfo, FALSE, TRUE))
+ if (!get_sof(cinfo, FALSE, TRUE))
return JPEG_SUSPENDED;
break;
case M_SOF10: /* Progressive, arithmetic */
- if (! get_sof(cinfo, TRUE, TRUE))
+ if (!get_sof(cinfo, TRUE, TRUE))
return JPEG_SUSPENDED;
break;
@@ -1026,7 +1023,7 @@ read_markers (j_decompress_ptr cinfo)
break;
case M_SOS:
- if (! get_sos(cinfo))
+ if (!get_sos(cinfo))
return JPEG_SUSPENDED;
cinfo->unread_marker = 0; /* processed the marker */
return JPEG_REACHED_SOS;
@@ -1037,22 +1034,22 @@ read_markers (j_decompress_ptr cinfo)
return JPEG_REACHED_EOI;
case M_DAC:
- if (! get_dac(cinfo))
+ if (!get_dac(cinfo))
return JPEG_SUSPENDED;
break;
case M_DHT:
- if (! get_dht(cinfo))
+ if (!get_dht(cinfo))
return JPEG_SUSPENDED;
break;
case M_DQT:
- if (! get_dqt(cinfo))
+ if (!get_dqt(cinfo))
return JPEG_SUSPENDED;
break;
case M_DRI:
- if (! get_dri(cinfo))
+ if (!get_dri(cinfo))
return JPEG_SUSPENDED;
break;
@@ -1072,13 +1069,13 @@ read_markers (j_decompress_ptr cinfo)
case M_APP13:
case M_APP14:
case M_APP15:
- if (! (*((my_marker_ptr) cinfo->marker)->process_APPn[
- cinfo->unread_marker - (int) M_APP0]) (cinfo))
+ if (!(*((my_marker_ptr)cinfo->marker)->process_APPn[
+ cinfo->unread_marker - (int)M_APP0]) (cinfo))
return JPEG_SUSPENDED;
break;
case M_COM:
- if (! (*((my_marker_ptr) cinfo->marker)->process_COM) (cinfo))
+ if (!(*((my_marker_ptr)cinfo->marker)->process_COM) (cinfo))
return JPEG_SUSPENDED;
break;
@@ -1095,7 +1092,7 @@ read_markers (j_decompress_ptr cinfo)
break;
case M_DNL: /* Ignore DNL ... perhaps the wrong thing */
- if (! skip_variable(cinfo))
+ if (!skip_variable(cinfo))
return JPEG_SUSPENDED;
break;
@@ -1127,25 +1124,25 @@ read_markers (j_decompress_ptr cinfo)
*/
METHODDEF(boolean)
-read_restart_marker (j_decompress_ptr cinfo)
+read_restart_marker(j_decompress_ptr cinfo)
{
/* Obtain a marker unless we already did. */
/* Note that next_marker will complain if it skips any data. */
if (cinfo->unread_marker == 0) {
- if (! next_marker(cinfo))
+ if (!next_marker(cinfo))
return FALSE;
}
if (cinfo->unread_marker ==
- ((int) M_RST0 + cinfo->marker->next_restart_num)) {
+ ((int)M_RST0 + cinfo->marker->next_restart_num)) {
/* Normal case --- swallow the marker and let entropy decoder continue */
TRACEMS1(cinfo, 3, JTRC_RST, cinfo->marker->next_restart_num);
cinfo->unread_marker = 0;
} else {
/* Uh-oh, the restart markers have been messed up. */
/* Let the data source manager determine how to resync. */
- if (! (*cinfo->src->resync_to_restart) (cinfo,
- cinfo->marker->next_restart_num))
+ if (!(*cinfo->src->resync_to_restart) (cinfo,
+ cinfo->marker->next_restart_num))
return FALSE;
}
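
The comparison above works because the eight restart markers 0xFFD0..0xFFD7 simply repeat: the expected code is always a modulo-8 offset from M_RST0, tracked by next_restart_num. A one-function sketch of that numbering:

/* Sketch: restart markers are 0xFFD0..0xFFD7 and repeat modulo 8,
 * so e.g. restart 5 -> 0xD5 and restart 8 wraps back to 0xD0. */
#define M_RST0 0xD0

static int
expected_restart_code(int next_restart_num)
{
  return M_RST0 + (next_restart_num & 7);
}
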
@@ -1206,7 +1203,7 @@ read_restart_marker (j_decompress_ptr cinfo)
*/
GLOBAL(boolean)
-jpeg_resync_to_restart (j_decompress_ptr cinfo, int desired)
+jpeg_resync_to_restart(j_decompress_ptr cinfo, int desired)
{
int marker = cinfo->unread_marker;
int action = 1;
@@ -1216,16 +1213,16 @@ jpeg_resync_to_restart (j_decompress_ptr cinfo, int desired)
/* Outer loop handles repeated decision after scanning forward. */
for (;;) {
- if (marker < (int) M_SOF0)
+ if (marker < (int)M_SOF0)
action = 2; /* invalid marker */
- else if (marker < (int) M_RST0 || marker > (int) M_RST7)
+ else if (marker < (int)M_RST0 || marker > (int)M_RST7)
action = 3; /* valid non-restart marker */
else {
- if (marker == ((int) M_RST0 + ((desired+1) & 7)) ||
- marker == ((int) M_RST0 + ((desired+2) & 7)))
+ if (marker == ((int)M_RST0 + ((desired + 1) & 7)) ||
+ marker == ((int)M_RST0 + ((desired + 2) & 7)))
action = 3; /* one of the next two expected restarts */
- else if (marker == ((int) M_RST0 + ((desired-1) & 7)) ||
- marker == ((int) M_RST0 + ((desired-2) & 7)))
+ else if (marker == ((int)M_RST0 + ((desired - 1) & 7)) ||
+ marker == ((int)M_RST0 + ((desired - 2) & 7)))
action = 2; /* a prior restart, so advance */
else
action = 1; /* desired restart or too far away */
@@ -1238,7 +1235,7 @@ jpeg_resync_to_restart (j_decompress_ptr cinfo, int desired)
return TRUE;
case 2:
/* Scan to the next marker, and repeat the decision loop. */
- if (! next_marker(cinfo))
+ if (!next_marker(cinfo))
return FALSE;
marker = cinfo->unread_marker;
break;
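
The (desired +/- k) & 7 tests above measure distance around that modulo-8 cycle: with desired == 3, an RST5 matches (desired + 2) & 7 and is accepted as one of the next expected restarts (action 3), while an RST1 matches (desired - 2) & 7 and is treated as stale, so the scan advances (action 2). The same decision, restated as a pure function with the marker constants spelled out (0xC0, 0xD0, 0xD7 are M_SOF0, M_RST0, M_RST7):

/* Sketch: the modulo-8 distance heuristic as a pure function.
 * Returns the action code used above: 1 = resume at the desired
 * restart, 2 = skip forward, 3 = accept the marker as-is. */
static int
classify_marker(int marker, int desired)
{
  if (marker < 0xC0)                           /* below M_SOF0: invalid */
    return 2;
  if (marker < 0xD0 || marker > 0xD7)          /* valid non-restart marker */
    return 3;
  if (marker == 0xD0 + ((desired + 1) & 7) ||
      marker == 0xD0 + ((desired + 2) & 7))
    return 3;                                  /* one of the next two */
  if (marker == 0xD0 + ((desired - 1) & 7) ||
      marker == 0xD0 + ((desired - 2) & 7))
    return 2;                                  /* a prior restart */
  return 1;                                    /* desired or too far away */
}
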
@@ -1256,9 +1253,9 @@ jpeg_resync_to_restart (j_decompress_ptr cinfo, int desired)
*/
METHODDEF(void)
-reset_marker_reader (j_decompress_ptr cinfo)
+reset_marker_reader(j_decompress_ptr cinfo)
{
- my_marker_ptr marker = (my_marker_ptr) cinfo->marker;
+ my_marker_ptr marker = (my_marker_ptr)cinfo->marker;
cinfo->comp_info = NULL; /* until allocated by get_sof */
cinfo->input_scan_number = 0; /* no SOS seen yet */
@@ -1276,16 +1273,16 @@ reset_marker_reader (j_decompress_ptr cinfo)
*/
GLOBAL(void)
-jinit_marker_reader (j_decompress_ptr cinfo)
+jinit_marker_reader(j_decompress_ptr cinfo)
{
my_marker_ptr marker;
int i;
/* Create subobject in permanent pool */
marker = (my_marker_ptr)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_PERMANENT,
sizeof(my_marker_reader));
- cinfo->marker = (struct jpeg_marker_reader *) marker;
+ cinfo->marker = (struct jpeg_marker_reader *)marker;
/* Initialize public method pointers */
marker->pub.reset_marker_reader = reset_marker_reader;
marker->pub.read_markers = read_markers;
@@ -1314,10 +1311,10 @@ jinit_marker_reader (j_decompress_ptr cinfo)
#ifdef SAVE_MARKERS_SUPPORTED
GLOBAL(void)
-jpeg_save_markers (j_decompress_ptr cinfo, int marker_code,
- unsigned int length_limit)
+jpeg_save_markers(j_decompress_ptr cinfo, int marker_code,
+ unsigned int length_limit)
{
- my_marker_ptr marker = (my_marker_ptr) cinfo->marker;
+ my_marker_ptr marker = (my_marker_ptr)cinfo->marker;
long maxlength;
jpeg_marker_parser_method processor;
@@ -1325,8 +1322,8 @@ jpeg_save_markers (j_decompress_ptr cinfo, int marker_code,
* (should only be a concern in a 16-bit environment).
*/
maxlength = cinfo->mem->max_alloc_chunk - sizeof(struct jpeg_marker_struct);
- if (((long) length_limit) > maxlength)
- length_limit = (unsigned int) maxlength;
+ if (((long)length_limit) > maxlength)
+ length_limit = (unsigned int)maxlength;
/* Choose processor routine to use.
* APP0/APP14 have special requirements.
@@ -1334,23 +1331,23 @@ jpeg_save_markers (j_decompress_ptr cinfo, int marker_code,
if (length_limit) {
processor = save_marker;
/* If saving APP0/APP14, save at least enough for our internal use. */
- if (marker_code == (int) M_APP0 && length_limit < APP0_DATA_LEN)
+ if (marker_code == (int)M_APP0 && length_limit < APP0_DATA_LEN)
length_limit = APP0_DATA_LEN;
- else if (marker_code == (int) M_APP14 && length_limit < APP14_DATA_LEN)
+ else if (marker_code == (int)M_APP14 && length_limit < APP14_DATA_LEN)
length_limit = APP14_DATA_LEN;
} else {
processor = skip_variable;
/* If discarding APP0/APP14, use our regular on-the-fly processor. */
- if (marker_code == (int) M_APP0 || marker_code == (int) M_APP14)
+ if (marker_code == (int)M_APP0 || marker_code == (int)M_APP14)
processor = get_interesting_appn;
}
- if (marker_code == (int) M_COM) {
+ if (marker_code == (int)M_COM) {
marker->process_COM = processor;
marker->length_limit_COM = length_limit;
- } else if (marker_code >= (int) M_APP0 && marker_code <= (int) M_APP15) {
- marker->process_APPn[marker_code - (int) M_APP0] = processor;
- marker->length_limit_APPn[marker_code - (int) M_APP0] = length_limit;
+ } else if (marker_code >= (int)M_APP0 && marker_code <= (int)M_APP15) {
+ marker->process_APPn[marker_code - (int)M_APP0] = processor;
+ marker->length_limit_APPn[marker_code - (int)M_APP0] = length_limit;
} else
ERREXIT1(cinfo, JERR_UNKNOWN_MARKER, marker_code);
}
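
Applications call jpeg_save_markers() after jpeg_create_decompress() but before jpeg_read_header(); segments retained by save_marker then appear on cinfo->marker_list. A usage sketch, assuming cinfo already has an error manager and a data source:

#include <stdio.h>
#include <jpeglib.h>

/* Sketch: keep COM and APP1 (e.g. Exif) payloads while parsing headers. */
static void
dump_saved_markers(j_decompress_ptr cinfo)
{
  jpeg_saved_marker_ptr m;

  jpeg_save_markers(cinfo, JPEG_COM, 0xFFFF);
  jpeg_save_markers(cinfo, JPEG_APP0 + 1, 0xFFFF);
  if (jpeg_read_header(cinfo, TRUE) == JPEG_HEADER_OK) {
    for (m = cinfo->marker_list; m != NULL; m = m->next)
      printf("marker 0x%02X: %u of %u bytes saved\n",
             m->marker, m->data_length, m->original_length);
  }
}
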
@@ -1363,15 +1360,15 @@ jpeg_save_markers (j_decompress_ptr cinfo, int marker_code,
*/
GLOBAL(void)
-jpeg_set_marker_processor (j_decompress_ptr cinfo, int marker_code,
- jpeg_marker_parser_method routine)
+jpeg_set_marker_processor(j_decompress_ptr cinfo, int marker_code,
+ jpeg_marker_parser_method routine)
{
- my_marker_ptr marker = (my_marker_ptr) cinfo->marker;
+ my_marker_ptr marker = (my_marker_ptr)cinfo->marker;
- if (marker_code == (int) M_COM)
+ if (marker_code == (int)M_COM)
marker->process_COM = routine;
- else if (marker_code >= (int) M_APP0 && marker_code <= (int) M_APP15)
- marker->process_APPn[marker_code - (int) M_APP0] = routine;
+ else if (marker_code >= (int)M_APP0 && marker_code <= (int)M_APP15)
+ marker->process_APPn[marker_code - (int)M_APP0] = routine;
else
ERREXIT1(cinfo, JERR_UNKNOWN_MARKER, marker_code);
}
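
A routine installed through jpeg_set_marker_processor() must read the marker's 2-byte length word itself before consuming or skipping the payload. A sketch of a do-nothing COM handler that honors that wire format; it assumes a non-suspending source (fill_input_buffer never returns FALSE), since a suspending source would need to back up the bytes already consumed before a retry:

#include <stdio.h>
#include <jpeglib.h>

/* Sketch: a COM processor that reads the length word, then skips. */
static boolean
my_com_processor(j_decompress_ptr cinfo)
{
  struct jpeg_source_mgr *src = cinfo->src;
  long length;

  if (src->bytes_in_buffer == 0 && !(*src->fill_input_buffer) (cinfo))
    return FALSE;
  src->bytes_in_buffer--;
  length = (long)(*src->next_input_byte++) << 8;
  if (src->bytes_in_buffer == 0 && !(*src->fill_input_buffer) (cinfo))
    return FALSE;
  src->bytes_in_buffer--;
  length += (long)(*src->next_input_byte++);
  length -= 2;                          /* the length word counts itself */
  if (length > 0)
    (*src->skip_input_data) (cinfo, length);
  return TRUE;
}

/* registered with:
 *   jpeg_set_marker_processor(&cinfo, JPEG_COM, my_com_processor);
 */
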
diff --git a/media/libjpeg/jdmaster.c b/media/libjpeg/jdmaster.c
index 9079dda65c..a3690bf560 100644
--- a/media/libjpeg/jdmaster.c
+++ b/media/libjpeg/jdmaster.c
@@ -5,7 +5,7 @@
* Copyright (C) 1991-1997, Thomas G. Lane.
* Modified 2002-2009 by Guido Vollbeding.
* libjpeg-turbo Modifications:
- * Copyright (C) 2009-2011, 2016, D. R. Commander.
+ * Copyright (C) 2009-2011, 2016, 2019, 2022, D. R. Commander.
* Copyright (C) 2013, Linaro Limited.
* Copyright (C) 2015, Google, Inc.
* For conditions of distribution and use, see the accompanying README.ijg
@@ -22,7 +22,6 @@
#include "jpeglib.h"
#include "jpegcomp.h"
#include "jdmaster.h"
-#include "jsimd.h"
/*
@@ -31,7 +30,7 @@
*/
LOCAL(boolean)
-use_merged_upsample (j_decompress_ptr cinfo)
+use_merged_upsample(j_decompress_ptr cinfo)
{
#ifdef UPSAMPLE_MERGING_SUPPORTED
/* Merging is the equivalent of plain box-filter upsampling */
@@ -40,22 +39,22 @@ use_merged_upsample (j_decompress_ptr cinfo)
/* jdmerge.c only supports YCC=>RGB and YCC=>RGB565 color conversion */
if (cinfo->jpeg_color_space != JCS_YCbCr || cinfo->num_components != 3 ||
(cinfo->out_color_space != JCS_RGB &&
- cinfo->out_color_space != JCS_RGB565 &&
- cinfo->out_color_space != JCS_EXT_RGB &&
- cinfo->out_color_space != JCS_EXT_RGBX &&
- cinfo->out_color_space != JCS_EXT_BGR &&
- cinfo->out_color_space != JCS_EXT_BGRX &&
- cinfo->out_color_space != JCS_EXT_XBGR &&
- cinfo->out_color_space != JCS_EXT_XRGB &&
- cinfo->out_color_space != JCS_EXT_RGBA &&
- cinfo->out_color_space != JCS_EXT_BGRA &&
- cinfo->out_color_space != JCS_EXT_ABGR &&
- cinfo->out_color_space != JCS_EXT_ARGB))
+ cinfo->out_color_space != JCS_RGB565 &&
+ cinfo->out_color_space != JCS_EXT_RGB &&
+ cinfo->out_color_space != JCS_EXT_RGBX &&
+ cinfo->out_color_space != JCS_EXT_BGR &&
+ cinfo->out_color_space != JCS_EXT_BGRX &&
+ cinfo->out_color_space != JCS_EXT_XBGR &&
+ cinfo->out_color_space != JCS_EXT_XRGB &&
+ cinfo->out_color_space != JCS_EXT_RGBA &&
+ cinfo->out_color_space != JCS_EXT_BGRA &&
+ cinfo->out_color_space != JCS_EXT_ABGR &&
+ cinfo->out_color_space != JCS_EXT_ARGB))
return FALSE;
if ((cinfo->out_color_space == JCS_RGB565 &&
- cinfo->out_color_components != 3) ||
+ cinfo->out_color_components != 3) ||
(cinfo->out_color_space != JCS_RGB565 &&
- cinfo->out_color_components != rgb_pixelsize[cinfo->out_color_space]))
+ cinfo->out_color_components != rgb_pixelsize[cinfo->out_color_space]))
return FALSE;
/* and it only handles 2h1v or 2h2v sampling ratios */
if (cinfo->comp_info[0].h_samp_factor != 2 ||
@@ -70,17 +69,6 @@ use_merged_upsample (j_decompress_ptr cinfo)
cinfo->comp_info[1]._DCT_scaled_size != cinfo->_min_DCT_scaled_size ||
cinfo->comp_info[2]._DCT_scaled_size != cinfo->_min_DCT_scaled_size)
return FALSE;
-#ifdef WITH_SIMD
- /* If YCbCr-to-RGB color conversion is SIMD-accelerated but merged upsampling
- isn't, then disabling merged upsampling is likely to be faster when
- decompressing YCbCr JPEG images. */
- if (!jsimd_can_h2v2_merged_upsample() && !jsimd_can_h2v1_merged_upsample() &&
- jsimd_can_ycc_rgb() && cinfo->jpeg_color_space == JCS_YCbCr &&
- (cinfo->out_color_space == JCS_RGB ||
- (cinfo->out_color_space >= JCS_EXT_RGB &&
- cinfo->out_color_space <= JCS_EXT_ARGB)))
- return FALSE;
-#endif
/* ??? also need to test for upsample-time rescaling, when & if supported */
return TRUE; /* by golly, it'll work... */
#else
@@ -100,7 +88,7 @@ GLOBAL(void)
#else
LOCAL(void)
#endif
-jpeg_core_output_dimensions (j_decompress_ptr cinfo)
+jpeg_core_output_dimensions(j_decompress_ptr cinfo)
/* Do computations that are needed before master selection phase.
* This function is used for transcoding and full decompression.
*/
@@ -113,129 +101,129 @@ jpeg_core_output_dimensions (j_decompress_ptr cinfo)
if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom) {
/* Provide 1/block_size scaling */
cinfo->output_width = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_width, (long) DCTSIZE);
+ jdiv_round_up((long)cinfo->image_width, (long)DCTSIZE);
cinfo->output_height = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_height, (long) DCTSIZE);
+ jdiv_round_up((long)cinfo->image_height, (long)DCTSIZE);
cinfo->_min_DCT_h_scaled_size = 1;
cinfo->_min_DCT_v_scaled_size = 1;
} else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 2) {
/* Provide 2/block_size scaling */
cinfo->output_width = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_width * 2L, (long) DCTSIZE);
+ jdiv_round_up((long)cinfo->image_width * 2L, (long)DCTSIZE);
cinfo->output_height = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_height * 2L, (long) DCTSIZE);
+ jdiv_round_up((long)cinfo->image_height * 2L, (long)DCTSIZE);
cinfo->_min_DCT_h_scaled_size = 2;
cinfo->_min_DCT_v_scaled_size = 2;
} else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 3) {
/* Provide 3/block_size scaling */
cinfo->output_width = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_width * 3L, (long) DCTSIZE);
+ jdiv_round_up((long)cinfo->image_width * 3L, (long)DCTSIZE);
cinfo->output_height = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_height * 3L, (long) DCTSIZE);
+ jdiv_round_up((long)cinfo->image_height * 3L, (long)DCTSIZE);
cinfo->_min_DCT_h_scaled_size = 3;
cinfo->_min_DCT_v_scaled_size = 3;
} else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 4) {
/* Provide 4/block_size scaling */
cinfo->output_width = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_width * 4L, (long) DCTSIZE);
+ jdiv_round_up((long)cinfo->image_width * 4L, (long)DCTSIZE);
cinfo->output_height = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_height * 4L, (long) DCTSIZE);
+ jdiv_round_up((long)cinfo->image_height * 4L, (long)DCTSIZE);
cinfo->_min_DCT_h_scaled_size = 4;
cinfo->_min_DCT_v_scaled_size = 4;
} else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 5) {
/* Provide 5/block_size scaling */
cinfo->output_width = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_width * 5L, (long) DCTSIZE);
+ jdiv_round_up((long)cinfo->image_width * 5L, (long)DCTSIZE);
cinfo->output_height = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_height * 5L, (long) DCTSIZE);
+ jdiv_round_up((long)cinfo->image_height * 5L, (long)DCTSIZE);
cinfo->_min_DCT_h_scaled_size = 5;
cinfo->_min_DCT_v_scaled_size = 5;
} else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 6) {
/* Provide 6/block_size scaling */
cinfo->output_width = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_width * 6L, (long) DCTSIZE);
+ jdiv_round_up((long)cinfo->image_width * 6L, (long)DCTSIZE);
cinfo->output_height = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_height * 6L, (long) DCTSIZE);
+ jdiv_round_up((long)cinfo->image_height * 6L, (long)DCTSIZE);
cinfo->_min_DCT_h_scaled_size = 6;
cinfo->_min_DCT_v_scaled_size = 6;
} else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 7) {
/* Provide 7/block_size scaling */
cinfo->output_width = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_width * 7L, (long) DCTSIZE);
+ jdiv_round_up((long)cinfo->image_width * 7L, (long)DCTSIZE);
cinfo->output_height = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_height * 7L, (long) DCTSIZE);
+ jdiv_round_up((long)cinfo->image_height * 7L, (long)DCTSIZE);
cinfo->_min_DCT_h_scaled_size = 7;
cinfo->_min_DCT_v_scaled_size = 7;
} else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 8) {
/* Provide 8/block_size scaling */
cinfo->output_width = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_width * 8L, (long) DCTSIZE);
+ jdiv_round_up((long)cinfo->image_width * 8L, (long)DCTSIZE);
cinfo->output_height = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_height * 8L, (long) DCTSIZE);
+ jdiv_round_up((long)cinfo->image_height * 8L, (long)DCTSIZE);
cinfo->_min_DCT_h_scaled_size = 8;
cinfo->_min_DCT_v_scaled_size = 8;
} else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 9) {
/* Provide 9/block_size scaling */
cinfo->output_width = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_width * 9L, (long) DCTSIZE);
+ jdiv_round_up((long)cinfo->image_width * 9L, (long)DCTSIZE);
cinfo->output_height = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_height * 9L, (long) DCTSIZE);
+ jdiv_round_up((long)cinfo->image_height * 9L, (long)DCTSIZE);
cinfo->_min_DCT_h_scaled_size = 9;
cinfo->_min_DCT_v_scaled_size = 9;
} else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 10) {
/* Provide 10/block_size scaling */
cinfo->output_width = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_width * 10L, (long) DCTSIZE);
+ jdiv_round_up((long)cinfo->image_width * 10L, (long)DCTSIZE);
cinfo->output_height = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_height * 10L, (long) DCTSIZE);
+ jdiv_round_up((long)cinfo->image_height * 10L, (long)DCTSIZE);
cinfo->_min_DCT_h_scaled_size = 10;
cinfo->_min_DCT_v_scaled_size = 10;
} else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 11) {
/* Provide 11/block_size scaling */
cinfo->output_width = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_width * 11L, (long) DCTSIZE);
+ jdiv_round_up((long)cinfo->image_width * 11L, (long)DCTSIZE);
cinfo->output_height = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_height * 11L, (long) DCTSIZE);
+ jdiv_round_up((long)cinfo->image_height * 11L, (long)DCTSIZE);
cinfo->_min_DCT_h_scaled_size = 11;
cinfo->_min_DCT_v_scaled_size = 11;
} else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 12) {
/* Provide 12/block_size scaling */
cinfo->output_width = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_width * 12L, (long) DCTSIZE);
+ jdiv_round_up((long)cinfo->image_width * 12L, (long)DCTSIZE);
cinfo->output_height = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_height * 12L, (long) DCTSIZE);
+ jdiv_round_up((long)cinfo->image_height * 12L, (long)DCTSIZE);
cinfo->_min_DCT_h_scaled_size = 12;
cinfo->_min_DCT_v_scaled_size = 12;
} else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 13) {
/* Provide 13/block_size scaling */
cinfo->output_width = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_width * 13L, (long) DCTSIZE);
+ jdiv_round_up((long)cinfo->image_width * 13L, (long)DCTSIZE);
cinfo->output_height = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_height * 13L, (long) DCTSIZE);
+ jdiv_round_up((long)cinfo->image_height * 13L, (long)DCTSIZE);
cinfo->_min_DCT_h_scaled_size = 13;
cinfo->_min_DCT_v_scaled_size = 13;
} else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 14) {
/* Provide 14/block_size scaling */
cinfo->output_width = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_width * 14L, (long) DCTSIZE);
+ jdiv_round_up((long)cinfo->image_width * 14L, (long)DCTSIZE);
cinfo->output_height = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_height * 14L, (long) DCTSIZE);
+ jdiv_round_up((long)cinfo->image_height * 14L, (long)DCTSIZE);
cinfo->_min_DCT_h_scaled_size = 14;
cinfo->_min_DCT_v_scaled_size = 14;
} else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 15) {
/* Provide 15/block_size scaling */
cinfo->output_width = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_width * 15L, (long) DCTSIZE);
+ jdiv_round_up((long)cinfo->image_width * 15L, (long)DCTSIZE);
cinfo->output_height = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_height * 15L, (long) DCTSIZE);
+ jdiv_round_up((long)cinfo->image_height * 15L, (long)DCTSIZE);
cinfo->_min_DCT_h_scaled_size = 15;
cinfo->_min_DCT_v_scaled_size = 15;
} else {
/* Provide 16/block_size scaling */
cinfo->output_width = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_width * 16L, (long) DCTSIZE);
+ jdiv_round_up((long)cinfo->image_width * 16L, (long)DCTSIZE);
cinfo->output_height = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_height * 16L, (long) DCTSIZE);
+ jdiv_round_up((long)cinfo->image_height * 16L, (long)DCTSIZE);
cinfo->_min_DCT_h_scaled_size = 16;
cinfo->_min_DCT_v_scaled_size = 16;
}
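
Every branch of the ladder above is the same formula with a different numerator: pick the smallest N in 1..16 with scale_num/scale_denom <= N/DCTSIZE, then compute output = ceil(image_dimension * N / DCTSIZE). A standalone sketch of that rule (jdiv_round_up in jutils.c is the ceiling division (a + b - 1) / b, reimplemented inline here):

#include <stdio.h>

#define DCTSIZE 8

int main(void)
{
  long image_width = 1000, scale_num = 1, scale_denom = 2;
  int N;

  /* smallest N in 1..16 with scale_num/scale_denom <= N/DCTSIZE */
  for (N = 1; N < 2 * DCTSIZE; N++)
    if (scale_num * DCTSIZE <= scale_denom * N)
      break;
  printf("N = %d, output_width = %ld\n",      /* prints N = 4, width = 500 */
         N, (image_width * N + DCTSIZE - 1) / DCTSIZE);
  return 0;
}
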
@@ -268,7 +256,7 @@ jpeg_core_output_dimensions (j_decompress_ptr cinfo)
*/
GLOBAL(void)
-jpeg_calc_output_dimensions (j_decompress_ptr cinfo)
+jpeg_calc_output_dimensions(j_decompress_ptr cinfo)
/* Do computations that are needed before master selection phase */
{
#ifdef IDCT_SCALING_SUPPORTED
@@ -314,13 +302,13 @@ jpeg_calc_output_dimensions (j_decompress_ptr cinfo)
ci++, compptr++) {
/* Size in samples, after IDCT scaling */
compptr->downsampled_width = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_width *
- (long) (compptr->h_samp_factor * compptr->_DCT_scaled_size),
- (long) (cinfo->max_h_samp_factor * DCTSIZE));
+ jdiv_round_up((long)cinfo->image_width *
+ (long)(compptr->h_samp_factor * compptr->_DCT_scaled_size),
+ (long)(cinfo->max_h_samp_factor * DCTSIZE));
compptr->downsampled_height = (JDIMENSION)
- jdiv_round_up((long) cinfo->image_height *
- (long) (compptr->v_samp_factor * compptr->_DCT_scaled_size),
- (long) (cinfo->max_v_samp_factor * DCTSIZE));
+ jdiv_round_up((long)cinfo->image_height *
+ (long)(compptr->v_samp_factor * compptr->_DCT_scaled_size),
+ (long)(cinfo->max_v_samp_factor * DCTSIZE));
}
#else /* !IDCT_SCALING_SUPPORTED */
@@ -417,31 +405,31 @@ jpeg_calc_output_dimensions (j_decompress_ptr cinfo)
*/
LOCAL(void)
-prepare_range_limit_table (j_decompress_ptr cinfo)
+prepare_range_limit_table(j_decompress_ptr cinfo)
/* Allocate and fill in the sample_range_limit table */
{
JSAMPLE *table;
int i;
table = (JSAMPLE *)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
- (5 * (MAXJSAMPLE+1) + CENTERJSAMPLE) * sizeof(JSAMPLE));
- table += (MAXJSAMPLE+1); /* allow negative subscripts of simple table */
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ (5 * (MAXJSAMPLE + 1) + CENTERJSAMPLE) * sizeof(JSAMPLE));
+ table += (MAXJSAMPLE + 1); /* allow negative subscripts of simple table */
cinfo->sample_range_limit = table;
/* First segment of "simple" table: limit[x] = 0 for x < 0 */
- MEMZERO(table - (MAXJSAMPLE+1), (MAXJSAMPLE+1) * sizeof(JSAMPLE));
+ memset(table - (MAXJSAMPLE + 1), 0, (MAXJSAMPLE + 1) * sizeof(JSAMPLE));
/* Main part of "simple" table: limit[x] = x */
for (i = 0; i <= MAXJSAMPLE; i++)
- table[i] = (JSAMPLE) i;
+ table[i] = (JSAMPLE)i;
table += CENTERJSAMPLE; /* Point to where post-IDCT table starts */
/* End of simple table, rest of first half of post-IDCT table */
- for (i = CENTERJSAMPLE; i < 2*(MAXJSAMPLE+1); i++)
+ for (i = CENTERJSAMPLE; i < 2 * (MAXJSAMPLE + 1); i++)
table[i] = MAXJSAMPLE;
/* Second half of post-IDCT table */
- MEMZERO(table + (2 * (MAXJSAMPLE+1)),
- (2 * (MAXJSAMPLE+1) - CENTERJSAMPLE) * sizeof(JSAMPLE));
- MEMCOPY(table + (4 * (MAXJSAMPLE+1) - CENTERJSAMPLE),
- cinfo->sample_range_limit, CENTERJSAMPLE * sizeof(JSAMPLE));
+ memset(table + (2 * (MAXJSAMPLE + 1)), 0,
+ (2 * (MAXJSAMPLE + 1) - CENTERJSAMPLE) * sizeof(JSAMPLE));
+ memcpy(table + (4 * (MAXJSAMPLE + 1) - CENTERJSAMPLE),
+ cinfo->sample_range_limit, CENTERJSAMPLE * sizeof(JSAMPLE));
}
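
The table built above is really two tables glued together: a "simple" clamp (negative inputs map to 0, 0..MAXJSAMPLE to themselves) followed by a wraparound region that the IDCT indexes through a mask. A sketch of a plain clamp table in the same style, assuming 8-bit samples; the real table's extra post-IDCT segments are omitted here:

#include <string.h>

#define MAXJSAMPLE 255

static unsigned char storage[3 * (MAXJSAMPLE + 1)];

/* Sketch: table[x] for x in -(MAXJSAMPLE+1)..2*(MAXJSAMPLE+1)-1 clamps
 * x into 0..255 with a single lookup instead of two branches. */
static unsigned char *
build_simple_range_limit(void)
{
  unsigned char *table = storage + (MAXJSAMPLE + 1); /* negative idx OK */
  int i;

  memset(table - (MAXJSAMPLE + 1), 0, MAXJSAMPLE + 1);   /* x < 0   -> 0   */
  for (i = 0; i <= MAXJSAMPLE; i++)
    table[i] = (unsigned char)i;                         /* 0..255  -> x   */
  memset(table + (MAXJSAMPLE + 1), MAXJSAMPLE,
         MAXJSAMPLE + 1);                                /* x > 255 -> 255 */
  return table;
}
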
@@ -457,9 +445,9 @@ prepare_range_limit_table (j_decompress_ptr cinfo)
*/
LOCAL(void)
-master_selection (j_decompress_ptr cinfo)
+master_selection(j_decompress_ptr cinfo)
{
- my_master_ptr master = (my_master_ptr) cinfo->master;
+ my_master_ptr master = (my_master_ptr)cinfo->master;
boolean use_c_buffer;
long samplesperrow;
JDIMENSION jd_samplesperrow;
@@ -469,9 +457,10 @@ master_selection (j_decompress_ptr cinfo)
prepare_range_limit_table(cinfo);
/* Width of an output scanline must be representable as JDIMENSION. */
- samplesperrow = (long) cinfo->output_width * (long) cinfo->out_color_components;
- jd_samplesperrow = (JDIMENSION) samplesperrow;
- if ((long) jd_samplesperrow != samplesperrow)
+ samplesperrow = (long)cinfo->output_width *
+ (long)cinfo->out_color_components;
+ jd_samplesperrow = (JDIMENSION)samplesperrow;
+ if ((long)jd_samplesperrow != samplesperrow)
ERREXIT(cinfo, JERR_WIDTH_OVERFLOW);
/* Initialize my private state */
@@ -482,7 +471,7 @@ master_selection (j_decompress_ptr cinfo)
master->quantizer_1pass = NULL;
master->quantizer_2pass = NULL;
/* No mode changes if not using buffered-image mode. */
- if (! cinfo->quantize_colors || ! cinfo->buffered_image) {
+ if (!cinfo->quantize_colors || !cinfo->buffered_image) {
cinfo->enable_1pass_quant = FALSE;
cinfo->enable_external_quant = FALSE;
cinfo->enable_2pass_quant = FALSE;
@@ -528,7 +517,7 @@ master_selection (j_decompress_ptr cinfo)
}
/* Post-processing: in particular, color conversion first */
- if (! cinfo->raw_data_out) {
+ if (!cinfo->raw_data_out) {
if (master->using_merged_upsample) {
#ifdef UPSAMPLE_MERGING_SUPPORTED
jinit_merged_upsampler(cinfo); /* does color conversion too */
@@ -565,11 +554,11 @@ master_selection (j_decompress_ptr cinfo)
use_c_buffer = cinfo->inputctl->has_multiple_scans || cinfo->buffered_image;
jinit_d_coef_controller(cinfo, use_c_buffer);
- if (! cinfo->raw_data_out)
+ if (!cinfo->raw_data_out)
jinit_d_main_controller(cinfo, FALSE /* never need full buffer here */);
/* We can now tell the memory manager to allocate virtual arrays. */
- (*cinfo->mem->realize_virt_arrays) ((j_common_ptr) cinfo);
+ (*cinfo->mem->realize_virt_arrays) ((j_common_ptr)cinfo);
/* Initialize input side of decompressor to consume first scan. */
(*cinfo->inputctl->start_input_pass) (cinfo);
@@ -579,13 +568,14 @@ master_selection (j_decompress_ptr cinfo)
*/
cinfo->master->first_iMCU_col = 0;
cinfo->master->last_iMCU_col = cinfo->MCUs_per_row - 1;
+ cinfo->master->last_good_iMCU_row = 0;
#ifdef D_MULTISCAN_FILES_SUPPORTED
/* If jpeg_start_decompress will read the whole file, initialize
* progress monitoring appropriately. The input step is counted
* as one pass.
*/
- if (cinfo->progress != NULL && ! cinfo->buffered_image &&
+ if (cinfo->progress != NULL && !cinfo->buffered_image &&
cinfo->inputctl->has_multiple_scans) {
int nscans;
/* Estimate number of scans to set pass_limit. */
@@ -597,7 +587,7 @@ master_selection (j_decompress_ptr cinfo)
nscans = cinfo->num_components;
}
cinfo->progress->pass_counter = 0L;
- cinfo->progress->pass_limit = (long) cinfo->total_iMCU_rows * nscans;
+ cinfo->progress->pass_limit = (long)cinfo->total_iMCU_rows * nscans;
cinfo->progress->completed_passes = 0;
cinfo->progress->total_passes = (cinfo->enable_2pass_quant ? 3 : 2);
/* Count the input pass as done */
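
The pass_counter/pass_limit values set here feed the optional progress monitor. A minimal callback sketch consuming those counters (my_progress is a hypothetical name; the fields are from struct jpeg_progress_mgr in jpeglib.h):

#include <stdio.h>
#include <jpeglib.h>

/* Sketch: report percent-complete from the counters maintained above. */
static void
my_progress(j_common_ptr cinfo)
{
  struct jpeg_progress_mgr *p = cinfo->progress;
  long percent =
    p->pass_limit > 0 ? 100L * p->pass_counter / p->pass_limit : 0;
  fprintf(stderr, "pass %d of %d: %ld%%  \r",
          p->completed_passes + 1, p->total_passes, percent);
}

/* hooked up before decompression starts:
 *   struct jpeg_progress_mgr prog;
 *   prog.progress_monitor = my_progress;
 *   cinfo.progress = &prog;
 */
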
@@ -617,9 +607,9 @@ master_selection (j_decompress_ptr cinfo)
*/
METHODDEF(void)
-prepare_for_output_pass (j_decompress_ptr cinfo)
+prepare_for_output_pass(j_decompress_ptr cinfo)
{
- my_master_ptr master = (my_master_ptr) cinfo->master;
+ my_master_ptr master = (my_master_ptr)cinfo->master;
if (master->pub.is_dummy_pass) {
#ifdef QUANT_2PASS_SUPPORTED
@@ -645,8 +635,8 @@ prepare_for_output_pass (j_decompress_ptr cinfo)
}
(*cinfo->idct->start_pass) (cinfo);
(*cinfo->coef->start_output_pass) (cinfo);
- if (! cinfo->raw_data_out) {
- if (! master->using_merged_upsample)
+ if (!cinfo->raw_data_out) {
+ if (!master->using_merged_upsample)
(*cinfo->cconvert->start_pass) (cinfo);
(*cinfo->upsample->start_pass) (cinfo);
if (cinfo->quantize_colors)
@@ -665,7 +655,7 @@ prepare_for_output_pass (j_decompress_ptr cinfo)
/* In buffered-image mode, we assume one more output pass if EOI not
* yet reached, but no more passes if EOI has been reached.
*/
- if (cinfo->buffered_image && ! cinfo->inputctl->eoi_reached) {
+ if (cinfo->buffered_image && !cinfo->inputctl->eoi_reached) {
cinfo->progress->total_passes += (cinfo->enable_2pass_quant ? 2 : 1);
}
}
@@ -677,9 +667,9 @@ prepare_for_output_pass (j_decompress_ptr cinfo)
*/
METHODDEF(void)
-finish_output_pass (j_decompress_ptr cinfo)
+finish_output_pass(j_decompress_ptr cinfo)
{
- my_master_ptr master = (my_master_ptr) cinfo->master;
+ my_master_ptr master = (my_master_ptr)cinfo->master;
if (cinfo->quantize_colors)
(*cinfo->cquantize->finish_pass) (cinfo);
@@ -694,9 +684,9 @@ finish_output_pass (j_decompress_ptr cinfo)
*/
GLOBAL(void)
-jpeg_new_colormap (j_decompress_ptr cinfo)
+jpeg_new_colormap(j_decompress_ptr cinfo)
{
- my_master_ptr master = (my_master_ptr) cinfo->master;
+ my_master_ptr master = (my_master_ptr)cinfo->master;
/* Prevent application from calling me at wrong times */
if (cinfo->global_state != DSTATE_BUFIMAGE)
@@ -722,9 +712,9 @@ jpeg_new_colormap (j_decompress_ptr cinfo)
*/
GLOBAL(void)
-jinit_master_decompress (j_decompress_ptr cinfo)
+jinit_master_decompress(j_decompress_ptr cinfo)
{
- my_master_ptr master = (my_master_ptr) cinfo->master;
+ my_master_ptr master = (my_master_ptr)cinfo->master;
master->pub.prepare_for_output_pass = prepare_for_output_pass;
master->pub.finish_output_pass = finish_output_pass;
diff --git a/media/libjpeg/jdmerge.c b/media/libjpeg/jdmerge.c
index 6276dd0950..3a456d6581 100644
--- a/media/libjpeg/jdmerge.c
+++ b/media/libjpeg/jdmerge.c
@@ -5,7 +5,7 @@
* Copyright (C) 1994-1996, Thomas G. Lane.
* libjpeg-turbo Modifications:
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2009, 2011, 2014-2015, D. R. Commander.
+ * Copyright (C) 2009, 2011, 2014-2015, 2020, D. R. Commander.
* Copyright (C) 2013, Linaro Limited.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
@@ -40,44 +40,16 @@
#define JPEG_INTERNALS
#include "jinclude.h"
#include "jpeglib.h"
+#include "jdmerge.h"
#include "jsimd.h"
#include "jconfigint.h"
#ifdef UPSAMPLE_MERGING_SUPPORTED
-/* Private subobject */
-
-typedef struct {
- struct jpeg_upsampler pub; /* public fields */
-
- /* Pointer to routine to do actual upsampling/conversion of one row group */
- void (*upmethod) (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
-
- /* Private state for YCC->RGB conversion */
- int *Cr_r_tab; /* => table for Cr to R conversion */
- int *Cb_b_tab; /* => table for Cb to B conversion */
- JLONG *Cr_g_tab; /* => table for Cr to G conversion */
- JLONG *Cb_g_tab; /* => table for Cb to G conversion */
-
- /* For 2:1 vertical sampling, we produce two output rows at a time.
- * We need a "spare" row buffer to hold the second output row if the
- * application provides just a one-row buffer; we also use the spare
- * to discard the dummy last row if the image height is odd.
- */
- JSAMPROW spare_row;
- boolean spare_full; /* T if spare buffer is occupied */
-
- JDIMENSION out_row_width; /* samples per output row */
- JDIMENSION rows_to_go; /* counts rows remaining in image */
-} my_upsampler;
-
-typedef my_upsampler *my_upsample_ptr;
-
#define SCALEBITS 16 /* speediest right-shift on some machines */
-#define ONE_HALF ((JLONG) 1 << (SCALEBITS-1))
-#define FIX(x) ((JLONG) ((x) * (1L<<SCALEBITS) + 0.5))
+#define ONE_HALF ((JLONG)1 << (SCALEBITS - 1))
+#define FIX(x) ((JLONG)((x) * (1L << SCALEBITS) + 0.5))
/* Include inline routines for colorspace extensions */
@@ -88,12 +60,12 @@ typedef my_upsampler *my_upsample_ptr;
#undef RGB_BLUE
#undef RGB_PIXELSIZE
-#define RGB_RED EXT_RGB_RED
-#define RGB_GREEN EXT_RGB_GREEN
-#define RGB_BLUE EXT_RGB_BLUE
-#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-#define h2v1_merged_upsample_internal extrgb_h2v1_merged_upsample_internal
-#define h2v2_merged_upsample_internal extrgb_h2v2_merged_upsample_internal
+#define RGB_RED EXT_RGB_RED
+#define RGB_GREEN EXT_RGB_GREEN
+#define RGB_BLUE EXT_RGB_BLUE
+#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+#define h2v1_merged_upsample_internal extrgb_h2v1_merged_upsample_internal
+#define h2v2_merged_upsample_internal extrgb_h2v2_merged_upsample_internal
#include "jdmrgext.c"
#undef RGB_RED
#undef RGB_GREEN
@@ -102,13 +74,13 @@ typedef my_upsampler *my_upsample_ptr;
#undef h2v1_merged_upsample_internal
#undef h2v2_merged_upsample_internal
-#define RGB_RED EXT_RGBX_RED
-#define RGB_GREEN EXT_RGBX_GREEN
-#define RGB_BLUE EXT_RGBX_BLUE
-#define RGB_ALPHA 3
-#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-#define h2v1_merged_upsample_internal extrgbx_h2v1_merged_upsample_internal
-#define h2v2_merged_upsample_internal extrgbx_h2v2_merged_upsample_internal
+#define RGB_RED EXT_RGBX_RED
+#define RGB_GREEN EXT_RGBX_GREEN
+#define RGB_BLUE EXT_RGBX_BLUE
+#define RGB_ALPHA 3
+#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+#define h2v1_merged_upsample_internal extrgbx_h2v1_merged_upsample_internal
+#define h2v2_merged_upsample_internal extrgbx_h2v2_merged_upsample_internal
#include "jdmrgext.c"
#undef RGB_RED
#undef RGB_GREEN
@@ -118,12 +90,12 @@ typedef my_upsampler *my_upsample_ptr;
#undef h2v1_merged_upsample_internal
#undef h2v2_merged_upsample_internal
-#define RGB_RED EXT_BGR_RED
-#define RGB_GREEN EXT_BGR_GREEN
-#define RGB_BLUE EXT_BGR_BLUE
-#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-#define h2v1_merged_upsample_internal extbgr_h2v1_merged_upsample_internal
-#define h2v2_merged_upsample_internal extbgr_h2v2_merged_upsample_internal
+#define RGB_RED EXT_BGR_RED
+#define RGB_GREEN EXT_BGR_GREEN
+#define RGB_BLUE EXT_BGR_BLUE
+#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+#define h2v1_merged_upsample_internal extbgr_h2v1_merged_upsample_internal
+#define h2v2_merged_upsample_internal extbgr_h2v2_merged_upsample_internal
#include "jdmrgext.c"
#undef RGB_RED
#undef RGB_GREEN
@@ -132,13 +104,13 @@ typedef my_upsampler *my_upsample_ptr;
#undef h2v1_merged_upsample_internal
#undef h2v2_merged_upsample_internal
-#define RGB_RED EXT_BGRX_RED
-#define RGB_GREEN EXT_BGRX_GREEN
-#define RGB_BLUE EXT_BGRX_BLUE
-#define RGB_ALPHA 3
-#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-#define h2v1_merged_upsample_internal extbgrx_h2v1_merged_upsample_internal
-#define h2v2_merged_upsample_internal extbgrx_h2v2_merged_upsample_internal
+#define RGB_RED EXT_BGRX_RED
+#define RGB_GREEN EXT_BGRX_GREEN
+#define RGB_BLUE EXT_BGRX_BLUE
+#define RGB_ALPHA 3
+#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+#define h2v1_merged_upsample_internal extbgrx_h2v1_merged_upsample_internal
+#define h2v2_merged_upsample_internal extbgrx_h2v2_merged_upsample_internal
#include "jdmrgext.c"
#undef RGB_RED
#undef RGB_GREEN
@@ -148,13 +120,13 @@ typedef my_upsampler *my_upsample_ptr;
#undef h2v1_merged_upsample_internal
#undef h2v2_merged_upsample_internal
-#define RGB_RED EXT_XBGR_RED
-#define RGB_GREEN EXT_XBGR_GREEN
-#define RGB_BLUE EXT_XBGR_BLUE
-#define RGB_ALPHA 0
-#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-#define h2v1_merged_upsample_internal extxbgr_h2v1_merged_upsample_internal
-#define h2v2_merged_upsample_internal extxbgr_h2v2_merged_upsample_internal
+#define RGB_RED EXT_XBGR_RED
+#define RGB_GREEN EXT_XBGR_GREEN
+#define RGB_BLUE EXT_XBGR_BLUE
+#define RGB_ALPHA 0
+#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+#define h2v1_merged_upsample_internal extxbgr_h2v1_merged_upsample_internal
+#define h2v2_merged_upsample_internal extxbgr_h2v2_merged_upsample_internal
#include "jdmrgext.c"
#undef RGB_RED
#undef RGB_GREEN
@@ -164,13 +136,13 @@ typedef my_upsampler *my_upsample_ptr;
#undef h2v1_merged_upsample_internal
#undef h2v2_merged_upsample_internal
-#define RGB_RED EXT_XRGB_RED
-#define RGB_GREEN EXT_XRGB_GREEN
-#define RGB_BLUE EXT_XRGB_BLUE
-#define RGB_ALPHA 0
-#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-#define h2v1_merged_upsample_internal extxrgb_h2v1_merged_upsample_internal
-#define h2v2_merged_upsample_internal extxrgb_h2v2_merged_upsample_internal
+#define RGB_RED EXT_XRGB_RED
+#define RGB_GREEN EXT_XRGB_GREEN
+#define RGB_BLUE EXT_XRGB_BLUE
+#define RGB_ALPHA 0
+#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+#define h2v1_merged_upsample_internal extxrgb_h2v1_merged_upsample_internal
+#define h2v2_merged_upsample_internal extxrgb_h2v2_merged_upsample_internal
#include "jdmrgext.c"
#undef RGB_RED
#undef RGB_GREEN
@@ -187,25 +159,25 @@ typedef my_upsampler *my_upsample_ptr;
*/
LOCAL(void)
-build_ycc_rgb_table (j_decompress_ptr cinfo)
+build_ycc_rgb_table(j_decompress_ptr cinfo)
{
- my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
+ my_merged_upsample_ptr upsample = (my_merged_upsample_ptr)cinfo->upsample;
int i;
JLONG x;
SHIFT_TEMPS
upsample->Cr_r_tab = (int *)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
- (MAXJSAMPLE+1) * sizeof(int));
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ (MAXJSAMPLE + 1) * sizeof(int));
upsample->Cb_b_tab = (int *)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
- (MAXJSAMPLE+1) * sizeof(int));
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ (MAXJSAMPLE + 1) * sizeof(int));
upsample->Cr_g_tab = (JLONG *)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
- (MAXJSAMPLE+1) * sizeof(JLONG));
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ (MAXJSAMPLE + 1) * sizeof(JLONG));
upsample->Cb_g_tab = (JLONG *)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
- (MAXJSAMPLE+1) * sizeof(JLONG));
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ (MAXJSAMPLE + 1) * sizeof(JLONG));
for (i = 0, x = -CENTERJSAMPLE; i <= MAXJSAMPLE; i++, x++) {
/* i is the actual input pixel value, in the range 0..MAXJSAMPLE */
@@ -217,10 +189,10 @@ build_ycc_rgb_table (j_decompress_ptr cinfo)
upsample->Cb_b_tab[i] = (int)
RIGHT_SHIFT(FIX(1.77200) * x + ONE_HALF, SCALEBITS);
/* Cr=>G value is scaled-up -0.71414 * x */
- upsample->Cr_g_tab[i] = (- FIX(0.71414)) * x;
+ upsample->Cr_g_tab[i] = (-FIX(0.71414)) * x;
/* Cb=>G value is scaled-up -0.34414 * x */
/* We also add in ONE_HALF so that need not do it in inner loop */
- upsample->Cb_g_tab[i] = (- FIX(0.34414)) * x + ONE_HALF;
+ upsample->Cb_g_tab[i] = (-FIX(0.34414)) * x + ONE_HALF;
}
}
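
The tables above precompute the fixed-point YCbCr-to-RGB terms: FIX(x) scales a constant by 2^SCALEBITS, and adding ONE_HALF before the right shift rounds instead of truncating. A standalone worked example of the red channel for one pixel, using arbitrary sample values:

#include <stdio.h>

#define SCALEBITS 16
#define ONE_HALF  ((long)1 << (SCALEBITS - 1))
#define FIX(x)    ((long)((x) * (1L << SCALEBITS) + 0.5))

int main(void)
{
  int y = 120, cr = 200;               /* example sample values */
  long x = cr - 128;                   /* center chroma on zero */
  int cred = (int)((FIX(1.40200) * x + ONE_HALF) >> SCALEBITS);
  int r = y + cred;                    /* would then be range-limited */
  printf("Cr_r_tab[%d] = %d, R = %d\n", cr, cred, r);  /* 101 and 221 */
  return 0;
}
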
@@ -230,9 +202,9 @@ build_ycc_rgb_table (j_decompress_ptr cinfo)
*/
METHODDEF(void)
-start_pass_merged_upsample (j_decompress_ptr cinfo)
+start_pass_merged_upsample(j_decompress_ptr cinfo)
{
- my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
+ my_merged_upsample_ptr upsample = (my_merged_upsample_ptr)cinfo->upsample;
/* Mark the spare buffer empty */
upsample->spare_full = FALSE;
@@ -248,14 +220,13 @@ start_pass_merged_upsample (j_decompress_ptr cinfo)
*/
METHODDEF(void)
-merged_2v_upsample (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION *in_row_group_ctr,
- JDIMENSION in_row_groups_avail,
- JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
- JDIMENSION out_rows_avail)
+merged_2v_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION *in_row_group_ctr,
+ JDIMENSION in_row_groups_avail, JSAMPARRAY output_buf,
+ JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail)
/* 2:1 vertical sampling case: may need a spare row. */
{
- my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
+ my_merged_upsample_ptr upsample = (my_merged_upsample_ptr)cinfo->upsample;
JSAMPROW work_ptrs[2];
JDIMENSION num_rows; /* number of rows returned to caller */
@@ -264,8 +235,8 @@ merged_2v_upsample (j_decompress_ptr cinfo,
JDIMENSION size = upsample->out_row_width;
if (cinfo->out_color_space == JCS_RGB565)
size = cinfo->output_width * 2;
- jcopy_sample_rows(& upsample->spare_row, 0, output_buf + *out_row_ctr, 0,
- 1, size);
+ jcopy_sample_rows(&upsample->spare_row, 0, output_buf + *out_row_ctr, 0, 1,
+ size);
num_rows = 1;
upsample->spare_full = FALSE;
} else {
@@ -294,20 +265,19 @@ merged_2v_upsample (j_decompress_ptr cinfo,
*out_row_ctr += num_rows;
upsample->rows_to_go -= num_rows;
/* When the buffer is emptied, declare this input row group consumed */
- if (! upsample->spare_full)
+ if (!upsample->spare_full)
(*in_row_group_ctr)++;
}
METHODDEF(void)
-merged_1v_upsample (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION *in_row_group_ctr,
- JDIMENSION in_row_groups_avail,
- JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
- JDIMENSION out_rows_avail)
+merged_1v_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION *in_row_group_ctr,
+ JDIMENSION in_row_groups_avail, JSAMPARRAY output_buf,
+ JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail)
/* 1:1 vertical sampling case: much easier, never need a spare row. */
{
- my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
+ my_merged_upsample_ptr upsample = (my_merged_upsample_ptr)cinfo->upsample;
/* Just do the upsampling. */
(*upsample->upmethod) (cinfo, input_buf, *in_row_group_ctr,
@@ -333,43 +303,42 @@ merged_1v_upsample (j_decompress_ptr cinfo,
*/
METHODDEF(void)
-h2v1_merged_upsample (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
- JSAMPARRAY output_buf)
+h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
{
switch (cinfo->out_color_space) {
- case JCS_EXT_RGB:
- extrgb_h2v1_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
- output_buf);
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- extrgbx_h2v1_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
- output_buf);
- break;
- case JCS_EXT_BGR:
- extbgr_h2v1_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
- output_buf);
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- extbgrx_h2v1_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
- output_buf);
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- extxbgr_h2v1_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
- output_buf);
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- extxrgb_h2v1_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
- output_buf);
- break;
- default:
- h2v1_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
- output_buf);
- break;
+ case JCS_EXT_RGB:
+ extrgb_h2v1_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
+ output_buf);
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ extrgbx_h2v1_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
+ output_buf);
+ break;
+ case JCS_EXT_BGR:
+ extbgr_h2v1_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
+ output_buf);
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ extbgrx_h2v1_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
+ output_buf);
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ extxbgr_h2v1_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
+ output_buf);
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ extxrgb_h2v1_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
+ output_buf);
+ break;
+ default:
+ h2v1_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
+ output_buf);
+ break;
}
}
@@ -379,43 +348,42 @@ h2v1_merged_upsample (j_decompress_ptr cinfo,
*/
METHODDEF(void)
-h2v2_merged_upsample (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
- JSAMPARRAY output_buf)
+h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
{
switch (cinfo->out_color_space) {
- case JCS_EXT_RGB:
- extrgb_h2v2_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
- output_buf);
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- extrgbx_h2v2_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
- output_buf);
- break;
- case JCS_EXT_BGR:
- extbgr_h2v2_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
- output_buf);
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- extbgrx_h2v2_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
- output_buf);
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- extxbgr_h2v2_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
- output_buf);
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- extxrgb_h2v2_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
- output_buf);
- break;
- default:
- h2v2_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
- output_buf);
- break;
+ case JCS_EXT_RGB:
+ extrgb_h2v2_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
+ output_buf);
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ extrgbx_h2v2_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
+ output_buf);
+ break;
+ case JCS_EXT_BGR:
+ extbgr_h2v2_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
+ output_buf);
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ extbgrx_h2v2_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
+ output_buf);
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ extxbgr_h2v2_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
+ output_buf);
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ extxrgb_h2v2_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
+ output_buf);
+ break;
+ default:
+ h2v2_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
+ output_buf);
+ break;
}
}
@@ -424,24 +392,21 @@ h2v2_merged_upsample (j_decompress_ptr cinfo,
* RGB565 conversion
*/
-#define PACK_SHORT_565_LE(r, g, b) ((((r) << 8) & 0xF800) | \
- (((g) << 3) & 0x7E0) | ((b) >> 3))
-#define PACK_SHORT_565_BE(r, g, b) (((r) & 0xF8) | ((g) >> 5) | \
- (((g) << 11) & 0xE000) | \
- (((b) << 5) & 0x1F00))
+#define PACK_SHORT_565_LE(r, g, b) \
+ ((((r) << 8) & 0xF800) | (((g) << 3) & 0x7E0) | ((b) >> 3))
+#define PACK_SHORT_565_BE(r, g, b) \
+ (((r) & 0xF8) | ((g) >> 5) | (((g) << 11) & 0xE000) | (((b) << 5) & 0x1F00))
-#define PACK_TWO_PIXELS_LE(l, r) ((r << 16) | l)
-#define PACK_TWO_PIXELS_BE(l, r) ((l << 16) | r)
+#define PACK_TWO_PIXELS_LE(l, r) ((r << 16) | l)
+#define PACK_TWO_PIXELS_BE(l, r) ((l << 16) | r)
-#define PACK_NEED_ALIGNMENT(ptr) (((size_t)(ptr)) & 3)
-
-#define WRITE_TWO_PIXELS_LE(addr, pixels) { \
- ((INT16*)(addr))[0] = (INT16)(pixels); \
- ((INT16*)(addr))[1] = (INT16)((pixels) >> 16); \
+#define WRITE_TWO_PIXELS_LE(addr, pixels) { \
+ ((INT16 *)(addr))[0] = (INT16)(pixels); \
+ ((INT16 *)(addr))[1] = (INT16)((pixels) >> 16); \
}
-#define WRITE_TWO_PIXELS_BE(addr, pixels) { \
- ((INT16*)(addr))[1] = (INT16)(pixels); \
- ((INT16*)(addr))[0] = (INT16)((pixels) >> 16); \
+#define WRITE_TWO_PIXELS_BE(addr, pixels) { \
+ ((INT16 *)(addr))[1] = (INT16)(pixels); \
+ ((INT16 *)(addr))[0] = (INT16)((pixels) >> 16); \
}
#define DITHER_565_R(r, dither) ((r) + ((dither) & 0xFF))
@@ -452,7 +417,7 @@ h2v2_merged_upsample (j_decompress_ptr cinfo,
/* Declarations for ordered dithering
*
* We use a 4x4 ordered dither array packed into 32 bits. This array is
- * sufficent for dithering RGB888 to RGB565.
+ * sufficient for dithering RGB888 to RGB565.
*/
#define DITHER_MASK 0x3
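
PACK_SHORT_565 squeezes 8-8-8 RGB into 5-6-5 bits by masking each channel's high bits into place; the DITHER_565_* macros add a per-pixel offset from the packed 4x4 matrix before range-limiting, trading banding for noise. A quick standalone check of the little-endian packer on an arbitrary pixel:

#include <stdio.h>

#define PACK_SHORT_565_LE(r, g, b) \
  ((((r) << 8) & 0xF800) | (((g) << 3) & 0x7E0) | ((b) >> 3))

int main(void)
{
  /* r=255 -> top 5 bits 0xF800, g=128 -> middle 6 bits 0x0400,
   * b=64 -> low 5 bits 0x0008; together 0xFC08 */
  printf("0x%04X\n", PACK_SHORT_565_LE(255, 128, 64));
  return 0;
}
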
@@ -467,13 +432,13 @@ static const JLONG dither_matrix[4] = {
/* Include inline routines for RGB565 conversion */
-#define PACK_SHORT_565 PACK_SHORT_565_LE
-#define PACK_TWO_PIXELS PACK_TWO_PIXELS_LE
-#define WRITE_TWO_PIXELS WRITE_TWO_PIXELS_LE
-#define h2v1_merged_upsample_565_internal h2v1_merged_upsample_565_le
-#define h2v1_merged_upsample_565D_internal h2v1_merged_upsample_565D_le
-#define h2v2_merged_upsample_565_internal h2v2_merged_upsample_565_le
-#define h2v2_merged_upsample_565D_internal h2v2_merged_upsample_565D_le
+#define PACK_SHORT_565 PACK_SHORT_565_LE
+#define PACK_TWO_PIXELS PACK_TWO_PIXELS_LE
+#define WRITE_TWO_PIXELS WRITE_TWO_PIXELS_LE
+#define h2v1_merged_upsample_565_internal h2v1_merged_upsample_565_le
+#define h2v1_merged_upsample_565D_internal h2v1_merged_upsample_565D_le
+#define h2v2_merged_upsample_565_internal h2v2_merged_upsample_565_le
+#define h2v2_merged_upsample_565D_internal h2v2_merged_upsample_565D_le
#include "jdmrg565.c"
#undef PACK_SHORT_565
#undef PACK_TWO_PIXELS
@@ -483,13 +448,13 @@ static const JLONG dither_matrix[4] = {
#undef h2v2_merged_upsample_565_internal
#undef h2v2_merged_upsample_565D_internal
-#define PACK_SHORT_565 PACK_SHORT_565_BE
-#define PACK_TWO_PIXELS PACK_TWO_PIXELS_BE
-#define WRITE_TWO_PIXELS WRITE_TWO_PIXELS_BE
-#define h2v1_merged_upsample_565_internal h2v1_merged_upsample_565_be
-#define h2v1_merged_upsample_565D_internal h2v1_merged_upsample_565D_be
-#define h2v2_merged_upsample_565_internal h2v2_merged_upsample_565_be
-#define h2v2_merged_upsample_565D_internal h2v2_merged_upsample_565D_be
+#define PACK_SHORT_565 PACK_SHORT_565_BE
+#define PACK_TWO_PIXELS PACK_TWO_PIXELS_BE
+#define WRITE_TWO_PIXELS WRITE_TWO_PIXELS_BE
+#define h2v1_merged_upsample_565_internal h2v1_merged_upsample_565_be
+#define h2v1_merged_upsample_565D_internal h2v1_merged_upsample_565D_be
+#define h2v2_merged_upsample_565_internal h2v2_merged_upsample_565_be
+#define h2v2_merged_upsample_565D_internal h2v2_merged_upsample_565D_be
#include "jdmrg565.c"
#undef PACK_SHORT_565
#undef PACK_TWO_PIXELS
@@ -503,16 +468,15 @@ static const JLONG dither_matrix[4] = {
static INLINE boolean is_big_endian(void)
{
int test_value = 1;
- if(*(char *)&test_value != 1)
+ if (*(char *)&test_value != 1)
return TRUE;
return FALSE;
}
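
is_big_endian() probes the first byte of an int at run time to choose between the _le and _be pixel writers; with the function marked INLINE, compilers generally fold the test to a constant. The same probe, runnable standalone:

#include <stdio.h>

int main(void)
{
  int test_value = 1;
  int big = (*(char *)&test_value != 1);
  printf("%s-endian\n", big ? "big" : "little");
  return 0;
}
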
METHODDEF(void)
-h2v1_merged_upsample_565 (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
- JSAMPARRAY output_buf)
+h2v1_merged_upsample_565(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
{
if (is_big_endian())
h2v1_merged_upsample_565_be(cinfo, input_buf, in_row_group_ctr,
@@ -520,13 +484,12 @@ h2v1_merged_upsample_565 (j_decompress_ptr cinfo,
else
h2v1_merged_upsample_565_le(cinfo, input_buf, in_row_group_ctr,
output_buf);
- }
+}
METHODDEF(void)
-h2v1_merged_upsample_565D (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
- JSAMPARRAY output_buf)
+h2v1_merged_upsample_565D(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
{
if (is_big_endian())
h2v1_merged_upsample_565D_be(cinfo, input_buf, in_row_group_ctr,
@@ -538,9 +501,8 @@ h2v1_merged_upsample_565D (j_decompress_ptr cinfo,
METHODDEF(void)
-h2v2_merged_upsample_565 (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
- JSAMPARRAY output_buf)
+h2v2_merged_upsample_565(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
{
if (is_big_endian())
h2v2_merged_upsample_565_be(cinfo, input_buf, in_row_group_ctr,
@@ -552,9 +514,8 @@ h2v2_merged_upsample_565 (j_decompress_ptr cinfo,
METHODDEF(void)
-h2v2_merged_upsample_565D (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
- JSAMPARRAY output_buf)
+h2v2_merged_upsample_565D(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
{
if (is_big_endian())
h2v2_merged_upsample_565D_be(cinfo, input_buf, in_row_group_ctr,
@@ -574,14 +535,14 @@ h2v2_merged_upsample_565D (j_decompress_ptr cinfo,
*/
GLOBAL(void)
-jinit_merged_upsampler (j_decompress_ptr cinfo)
+jinit_merged_upsampler(j_decompress_ptr cinfo)
{
- my_upsample_ptr upsample;
+ my_merged_upsample_ptr upsample;
- upsample = (my_upsample_ptr)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
- sizeof(my_upsampler));
- cinfo->upsample = (struct jpeg_upsampler *) upsample;
+ upsample = (my_merged_upsample_ptr)
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ sizeof(my_merged_upsampler));
+ cinfo->upsample = (struct jpeg_upsampler *)upsample;
upsample->pub.start_pass = start_pass_merged_upsample;
upsample->pub.need_context_rows = FALSE;
@@ -602,8 +563,8 @@ jinit_merged_upsampler (j_decompress_ptr cinfo)
}
/* Allocate a spare row buffer */
upsample->spare_row = (JSAMPROW)
- (*cinfo->mem->alloc_large) ((j_common_ptr) cinfo, JPOOL_IMAGE,
- (size_t) (upsample->out_row_width * sizeof(JSAMPLE)));
+ (*cinfo->mem->alloc_large) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ (size_t)(upsample->out_row_width * sizeof(JSAMPLE)));
} else {
upsample->pub.upsample = merged_1v_upsample;
if (jsimd_can_h2v1_merged_upsample())
diff --git a/media/libjpeg/jdmerge.h b/media/libjpeg/jdmerge.h
new file mode 100644
index 0000000000..b583396b10
--- /dev/null
+++ b/media/libjpeg/jdmerge.h
@@ -0,0 +1,47 @@
+/*
+ * jdmerge.h
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1994-1996, Thomas G. Lane.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2020, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ */
+
+#define JPEG_INTERNALS
+#include "jpeglib.h"
+
+#ifdef UPSAMPLE_MERGING_SUPPORTED
+
+
+/* Private subobject */
+
+typedef struct {
+ struct jpeg_upsampler pub; /* public fields */
+
+ /* Pointer to routine to do actual upsampling/conversion of one row group */
+ void (*upmethod) (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+
+ /* Private state for YCC->RGB conversion */
+ int *Cr_r_tab; /* => table for Cr to R conversion */
+ int *Cb_b_tab; /* => table for Cb to B conversion */
+ JLONG *Cr_g_tab; /* => table for Cr to G conversion */
+ JLONG *Cb_g_tab; /* => table for Cb to G conversion */
+
+ /* For 2:1 vertical sampling, we produce two output rows at a time.
+ * We need a "spare" row buffer to hold the second output row if the
+ * application provides just a one-row buffer; we also use the spare
+ * to discard the dummy last row if the image height is odd.
+ */
+ JSAMPROW spare_row;
+ boolean spare_full; /* T if spare buffer is occupied */
+
+ JDIMENSION out_row_width; /* samples per output row */
+ JDIMENSION rows_to_go; /* counts rows remaining in image */
+} my_merged_upsampler;
+
+typedef my_merged_upsampler *my_merged_upsample_ptr;
+
+#endif /* UPSAMPLE_MERGING_SUPPORTED */
diff --git a/media/libjpeg/jdmrg565.c b/media/libjpeg/jdmrg565.c
index 18287b3735..980a4e216e 100644
--- a/media/libjpeg/jdmrg565.c
+++ b/media/libjpeg/jdmrg565.c
@@ -5,7 +5,7 @@
* Copyright (C) 1994-1996, Thomas G. Lane.
* libjpeg-turbo Modifications:
* Copyright (C) 2013, Linaro Limited.
- * Copyright (C) 2014-2015, D. R. Commander.
+ * Copyright (C) 2014-2015, 2018, 2020, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@@ -15,23 +15,22 @@
INLINE
LOCAL(void)
-h2v1_merged_upsample_565_internal (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr,
- JSAMPARRAY output_buf)
+h2v1_merged_upsample_565_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf)
{
- my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
+ my_merged_upsample_ptr upsample = (my_merged_upsample_ptr)cinfo->upsample;
register int y, cred, cgreen, cblue;
int cb, cr;
register JSAMPROW outptr;
JSAMPROW inptr0, inptr1, inptr2;
JDIMENSION col;
/* copy these pointers into registers if possible */
- register JSAMPLE * range_limit = cinfo->sample_range_limit;
- int * Crrtab = upsample->Cr_r_tab;
- int * Cbbtab = upsample->Cb_b_tab;
- JLONG * Crgtab = upsample->Cr_g_tab;
- JLONG * Cbgtab = upsample->Cb_g_tab;
+ register JSAMPLE *range_limit = cinfo->sample_range_limit;
+ int *Crrtab = upsample->Cr_r_tab;
+ int *Cbbtab = upsample->Cb_b_tab;
+ JLONG *Crgtab = upsample->Cr_g_tab;
+ JLONG *Cbgtab = upsample->Cb_g_tab;
unsigned int r, g, b;
JLONG rgb;
SHIFT_TEMPS
@@ -44,20 +43,20 @@ h2v1_merged_upsample_565_internal (j_decompress_ptr cinfo,
/* Loop for each pair of output pixels */
for (col = cinfo->output_width >> 1; col > 0; col--) {
/* Do the chroma part of the calculation */
- cb = GETJSAMPLE(*inptr1++);
- cr = GETJSAMPLE(*inptr2++);
+ cb = *inptr1++;
+ cr = *inptr2++;
cred = Crrtab[cr];
- cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
+ cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
cblue = Cbbtab[cb];
/* Fetch 2 Y values and emit 2 pixels */
- y = GETJSAMPLE(*inptr0++);
+ y = *inptr0++;
r = range_limit[y + cred];
g = range_limit[y + cgreen];
b = range_limit[y + cblue];
rgb = PACK_SHORT_565(r, g, b);
- y = GETJSAMPLE(*inptr0++);
+ y = *inptr0++;
r = range_limit[y + cred];
g = range_limit[y + cgreen];
b = range_limit[y + cblue];
@@ -69,40 +68,40 @@ h2v1_merged_upsample_565_internal (j_decompress_ptr cinfo,
/* If image width is odd, do the last output column separately */
if (cinfo->output_width & 1) {
- cb = GETJSAMPLE(*inptr1);
- cr = GETJSAMPLE(*inptr2);
+ cb = *inptr1;
+ cr = *inptr2;
cred = Crrtab[cr];
- cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
+ cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
cblue = Cbbtab[cb];
- y = GETJSAMPLE(*inptr0);
+ y = *inptr0;
r = range_limit[y + cred];
g = range_limit[y + cgreen];
b = range_limit[y + cblue];
rgb = PACK_SHORT_565(r, g, b);
- *(INT16*)outptr = (INT16)rgb;
- }
- }
+ *(INT16 *)outptr = (INT16)rgb;
+ }
+}
INLINE
LOCAL(void)
-h2v1_merged_upsample_565D_internal (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr,
- JSAMPARRAY output_buf)
+h2v1_merged_upsample_565D_internal(j_decompress_ptr cinfo,
+ JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf)
{
- my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
+ my_merged_upsample_ptr upsample = (my_merged_upsample_ptr)cinfo->upsample;
register int y, cred, cgreen, cblue;
int cb, cr;
register JSAMPROW outptr;
JSAMPROW inptr0, inptr1, inptr2;
JDIMENSION col;
/* copy these pointers into registers if possible */
- register JSAMPLE * range_limit = cinfo->sample_range_limit;
- int * Crrtab = upsample->Cr_r_tab;
- int * Cbbtab = upsample->Cb_b_tab;
- JLONG * Crgtab = upsample->Cr_g_tab;
- JLONG * Cbgtab = upsample->Cb_g_tab;
+ register JSAMPLE *range_limit = cinfo->sample_range_limit;
+ int *Crrtab = upsample->Cr_r_tab;
+ int *Cbbtab = upsample->Cb_b_tab;
+ JLONG *Crgtab = upsample->Cr_g_tab;
+ JLONG *Cbgtab = upsample->Cb_g_tab;
JLONG d0 = dither_matrix[cinfo->output_scanline & DITHER_MASK];
unsigned int r, g, b;
JLONG rgb;
@@ -116,21 +115,21 @@ h2v1_merged_upsample_565D_internal (j_decompress_ptr cinfo,
/* Loop for each pair of output pixels */
for (col = cinfo->output_width >> 1; col > 0; col--) {
/* Do the chroma part of the calculation */
- cb = GETJSAMPLE(*inptr1++);
- cr = GETJSAMPLE(*inptr2++);
+ cb = *inptr1++;
+ cr = *inptr2++;
cred = Crrtab[cr];
- cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
+ cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
cblue = Cbbtab[cb];
/* Fetch 2 Y values and emit 2 pixels */
- y = GETJSAMPLE(*inptr0++);
+ y = *inptr0++;
r = range_limit[DITHER_565_R(y + cred, d0)];
g = range_limit[DITHER_565_G(y + cgreen, d0)];
b = range_limit[DITHER_565_B(y + cblue, d0)];
d0 = DITHER_ROTATE(d0);
rgb = PACK_SHORT_565(r, g, b);
- y = GETJSAMPLE(*inptr0++);
+ y = *inptr0++;
r = range_limit[DITHER_565_R(y + cred, d0)];
g = range_limit[DITHER_565_G(y + cgreen, d0)];
b = range_limit[DITHER_565_B(y + cblue, d0)];
@@ -143,40 +142,39 @@ h2v1_merged_upsample_565D_internal (j_decompress_ptr cinfo,
/* If image width is odd, do the last output column separately */
if (cinfo->output_width & 1) {
- cb = GETJSAMPLE(*inptr1);
- cr = GETJSAMPLE(*inptr2);
+ cb = *inptr1;
+ cr = *inptr2;
cred = Crrtab[cr];
- cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
+ cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
cblue = Cbbtab[cb];
- y = GETJSAMPLE(*inptr0);
+ y = *inptr0;
r = range_limit[DITHER_565_R(y + cred, d0)];
g = range_limit[DITHER_565_G(y + cgreen, d0)];
b = range_limit[DITHER_565_B(y + cblue, d0)];
rgb = PACK_SHORT_565(r, g, b);
- *(INT16*)outptr = (INT16)rgb;
+ *(INT16 *)outptr = (INT16)rgb;
}
}
INLINE
LOCAL(void)
-h2v2_merged_upsample_565_internal (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr,
- JSAMPARRAY output_buf)
+h2v2_merged_upsample_565_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf)
{
- my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
+ my_merged_upsample_ptr upsample = (my_merged_upsample_ptr)cinfo->upsample;
register int y, cred, cgreen, cblue;
int cb, cr;
register JSAMPROW outptr0, outptr1;
JSAMPROW inptr00, inptr01, inptr1, inptr2;
JDIMENSION col;
/* copy these pointers into registers if possible */
- register JSAMPLE * range_limit = cinfo->sample_range_limit;
- int * Crrtab = upsample->Cr_r_tab;
- int * Cbbtab = upsample->Cb_b_tab;
- JLONG * Crgtab = upsample->Cr_g_tab;
- JLONG * Cbgtab = upsample->Cb_g_tab;
+ register JSAMPLE *range_limit = cinfo->sample_range_limit;
+ int *Crrtab = upsample->Cr_r_tab;
+ int *Cbbtab = upsample->Cb_b_tab;
+ JLONG *Crgtab = upsample->Cr_g_tab;
+ JLONG *Cbgtab = upsample->Cb_g_tab;
unsigned int r, g, b;
JLONG rgb;
SHIFT_TEMPS
@@ -191,20 +189,20 @@ h2v2_merged_upsample_565_internal (j_decompress_ptr cinfo,
/* Loop for each group of output pixels */
for (col = cinfo->output_width >> 1; col > 0; col--) {
/* Do the chroma part of the calculation */
- cb = GETJSAMPLE(*inptr1++);
- cr = GETJSAMPLE(*inptr2++);
+ cb = *inptr1++;
+ cr = *inptr2++;
cred = Crrtab[cr];
- cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
+ cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
cblue = Cbbtab[cb];
/* Fetch 4 Y values and emit 4 pixels */
- y = GETJSAMPLE(*inptr00++);
+ y = *inptr00++;
r = range_limit[y + cred];
g = range_limit[y + cgreen];
b = range_limit[y + cblue];
rgb = PACK_SHORT_565(r, g, b);
- y = GETJSAMPLE(*inptr00++);
+ y = *inptr00++;
r = range_limit[y + cred];
g = range_limit[y + cgreen];
b = range_limit[y + cblue];
@@ -213,13 +211,13 @@ h2v2_merged_upsample_565_internal (j_decompress_ptr cinfo,
WRITE_TWO_PIXELS(outptr0, rgb);
outptr0 += 4;
- y = GETJSAMPLE(*inptr01++);
+ y = *inptr01++;
r = range_limit[y + cred];
g = range_limit[y + cgreen];
b = range_limit[y + cblue];
rgb = PACK_SHORT_565(r, g, b);
- y = GETJSAMPLE(*inptr01++);
+ y = *inptr01++;
r = range_limit[y + cred];
g = range_limit[y + cgreen];
b = range_limit[y + cblue];
@@ -231,56 +229,56 @@ h2v2_merged_upsample_565_internal (j_decompress_ptr cinfo,
/* If image width is odd, do the last output column separately */
if (cinfo->output_width & 1) {
- cb = GETJSAMPLE(*inptr1);
- cr = GETJSAMPLE(*inptr2);
+ cb = *inptr1;
+ cr = *inptr2;
cred = Crrtab[cr];
- cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
+ cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
cblue = Cbbtab[cb];
- y = GETJSAMPLE(*inptr00);
+ y = *inptr00;
r = range_limit[y + cred];
g = range_limit[y + cgreen];
b = range_limit[y + cblue];
rgb = PACK_SHORT_565(r, g, b);
- *(INT16*)outptr0 = (INT16)rgb;
+ *(INT16 *)outptr0 = (INT16)rgb;
- y = GETJSAMPLE(*inptr01);
+ y = *inptr01;
r = range_limit[y + cred];
g = range_limit[y + cgreen];
b = range_limit[y + cblue];
rgb = PACK_SHORT_565(r, g, b);
- *(INT16*)outptr1 = (INT16)rgb;
+ *(INT16 *)outptr1 = (INT16)rgb;
}
}
INLINE
LOCAL(void)
-h2v2_merged_upsample_565D_internal (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr,
- JSAMPARRAY output_buf)
+h2v2_merged_upsample_565D_internal(j_decompress_ptr cinfo,
+ JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf)
{
- my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
+ my_merged_upsample_ptr upsample = (my_merged_upsample_ptr)cinfo->upsample;
register int y, cred, cgreen, cblue;
int cb, cr;
register JSAMPROW outptr0, outptr1;
JSAMPROW inptr00, inptr01, inptr1, inptr2;
JDIMENSION col;
/* copy these pointers into registers if possible */
- register JSAMPLE * range_limit = cinfo->sample_range_limit;
- int * Crrtab = upsample->Cr_r_tab;
- int * Cbbtab = upsample->Cb_b_tab;
- JLONG * Crgtab = upsample->Cr_g_tab;
- JLONG * Cbgtab = upsample->Cb_g_tab;
+ register JSAMPLE *range_limit = cinfo->sample_range_limit;
+ int *Crrtab = upsample->Cr_r_tab;
+ int *Cbbtab = upsample->Cb_b_tab;
+ JLONG *Crgtab = upsample->Cr_g_tab;
+ JLONG *Cbgtab = upsample->Cb_g_tab;
JLONG d0 = dither_matrix[cinfo->output_scanline & DITHER_MASK];
- JLONG d1 = dither_matrix[(cinfo->output_scanline+1) & DITHER_MASK];
+ JLONG d1 = dither_matrix[(cinfo->output_scanline + 1) & DITHER_MASK];
unsigned int r, g, b;
JLONG rgb;
SHIFT_TEMPS
- inptr00 = input_buf[0][in_row_group_ctr*2];
- inptr01 = input_buf[0][in_row_group_ctr*2 + 1];
+ inptr00 = input_buf[0][in_row_group_ctr * 2];
+ inptr01 = input_buf[0][in_row_group_ctr * 2 + 1];
inptr1 = input_buf[1][in_row_group_ctr];
inptr2 = input_buf[2][in_row_group_ctr];
outptr0 = output_buf[0];
@@ -289,38 +287,38 @@ h2v2_merged_upsample_565D_internal (j_decompress_ptr cinfo,
/* Loop for each group of output pixels */
for (col = cinfo->output_width >> 1; col > 0; col--) {
/* Do the chroma part of the calculation */
- cb = GETJSAMPLE(*inptr1++);
- cr = GETJSAMPLE(*inptr2++);
+ cb = *inptr1++;
+ cr = *inptr2++;
cred = Crrtab[cr];
- cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
+ cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
cblue = Cbbtab[cb];
/* Fetch 4 Y values and emit 4 pixels */
- y = GETJSAMPLE(*inptr00++);
+ y = *inptr00++;
r = range_limit[DITHER_565_R(y + cred, d0)];
g = range_limit[DITHER_565_G(y + cgreen, d0)];
b = range_limit[DITHER_565_B(y + cblue, d0)];
d0 = DITHER_ROTATE(d0);
rgb = PACK_SHORT_565(r, g, b);
- y = GETJSAMPLE(*inptr00++);
- r = range_limit[DITHER_565_R(y + cred, d1)];
- g = range_limit[DITHER_565_G(y + cgreen, d1)];
- b = range_limit[DITHER_565_B(y + cblue, d1)];
- d1 = DITHER_ROTATE(d1);
+ y = *inptr00++;
+ r = range_limit[DITHER_565_R(y + cred, d0)];
+ g = range_limit[DITHER_565_G(y + cgreen, d0)];
+ b = range_limit[DITHER_565_B(y + cblue, d0)];
+ d0 = DITHER_ROTATE(d0);
rgb = PACK_TWO_PIXELS(rgb, PACK_SHORT_565(r, g, b));
WRITE_TWO_PIXELS(outptr0, rgb);
outptr0 += 4;
- y = GETJSAMPLE(*inptr01++);
- r = range_limit[DITHER_565_R(y + cred, d0)];
- g = range_limit[DITHER_565_G(y + cgreen, d0)];
- b = range_limit[DITHER_565_B(y + cblue, d0)];
- d0 = DITHER_ROTATE(d0);
+ y = *inptr01++;
+ r = range_limit[DITHER_565_R(y + cred, d1)];
+ g = range_limit[DITHER_565_G(y + cgreen, d1)];
+ b = range_limit[DITHER_565_B(y + cblue, d1)];
+ d1 = DITHER_ROTATE(d1);
rgb = PACK_SHORT_565(r, g, b);
- y = GETJSAMPLE(*inptr01++);
+ y = *inptr01++;
r = range_limit[DITHER_565_R(y + cred, d1)];
g = range_limit[DITHER_565_G(y + cgreen, d1)];
b = range_limit[DITHER_565_B(y + cblue, d1)];
@@ -333,24 +331,24 @@ h2v2_merged_upsample_565D_internal (j_decompress_ptr cinfo,
/* If image width is odd, do the last output column separately */
if (cinfo->output_width & 1) {
- cb = GETJSAMPLE(*inptr1);
- cr = GETJSAMPLE(*inptr2);
+ cb = *inptr1;
+ cr = *inptr2;
cred = Crrtab[cr];
- cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
+ cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
cblue = Cbbtab[cb];
- y = GETJSAMPLE(*inptr00);
+ y = *inptr00;
r = range_limit[DITHER_565_R(y + cred, d0)];
g = range_limit[DITHER_565_G(y + cgreen, d0)];
b = range_limit[DITHER_565_B(y + cblue, d0)];
rgb = PACK_SHORT_565(r, g, b);
- *(INT16*)outptr0 = (INT16)rgb;
+ *(INT16 *)outptr0 = (INT16)rgb;
- y = GETJSAMPLE(*inptr01);
+ y = *inptr01;
r = range_limit[DITHER_565_R(y + cred, d1)];
g = range_limit[DITHER_565_G(y + cgreen, d1)];
b = range_limit[DITHER_565_B(y + cblue, d1)];
rgb = PACK_SHORT_565(r, g, b);
- *(INT16*)outptr1 = (INT16)rgb;
+ *(INT16 *)outptr1 = (INT16)rgb;
}
}
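
Beyond the pointer-cast and whitespace restyling, the substantive changes in jdmrg565.c are the switch to the dedicated my_merged_upsample_ptr type and, in h2v2_merged_upsample_565D_internal, applying dither row d0 to both pixels of the top output row and d1 throughout the bottom row, rather than alternating the two dither rows within a single scanline. A minimal sketch of the 5:6:5 packing these routines perform, with illustrative names (the real PACK_SHORT_565/PACK_TWO_PIXELS macros in jdmrg565.c are endian-aware):

#include <stdint.h>

/* Pack 8-bit R/G/B into one 16-bit 5:6:5 pixel (keep the top 5/6/5 bits). */
static inline uint16_t pack_565(unsigned r, unsigned g, unsigned b)
{
  return (uint16_t)(((r & 0xF8) << 8) | ((g & 0xFC) << 3) | (b >> 3));
}

/* Pair two 565 pixels into a 32-bit word for one aligned store
 * (little-endian order assumed in this sketch). */
static inline uint32_t pack_two(uint16_t px0, uint16_t px1)
{
  return (uint32_t)px0 | ((uint32_t)px1 << 16);
}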
diff --git a/media/libjpeg/jdmrgext.c b/media/libjpeg/jdmrgext.c
index 9d7d2af2e9..9bf4f1a307 100644
--- a/media/libjpeg/jdmrgext.c
+++ b/media/libjpeg/jdmrgext.c
@@ -4,7 +4,7 @@
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1994-1996, Thomas G. Lane.
* libjpeg-turbo Modifications:
- * Copyright (C) 2011, 2015, D. R. Commander.
+ * Copyright (C) 2011, 2015, 2020, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@@ -21,23 +21,22 @@
INLINE
LOCAL(void)
-h2v1_merged_upsample_internal (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr,
- JSAMPARRAY output_buf)
+h2v1_merged_upsample_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf)
{
- my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
+ my_merged_upsample_ptr upsample = (my_merged_upsample_ptr)cinfo->upsample;
register int y, cred, cgreen, cblue;
int cb, cr;
register JSAMPROW outptr;
JSAMPROW inptr0, inptr1, inptr2;
JDIMENSION col;
/* copy these pointers into registers if possible */
- register JSAMPLE * range_limit = cinfo->sample_range_limit;
- int * Crrtab = upsample->Cr_r_tab;
- int * Cbbtab = upsample->Cb_b_tab;
- JLONG * Crgtab = upsample->Cr_g_tab;
- JLONG * Cbgtab = upsample->Cb_g_tab;
+ register JSAMPLE *range_limit = cinfo->sample_range_limit;
+ int *Crrtab = upsample->Cr_r_tab;
+ int *Cbbtab = upsample->Cb_b_tab;
+ JLONG *Crgtab = upsample->Cr_g_tab;
+ JLONG *Cbgtab = upsample->Cb_g_tab;
SHIFT_TEMPS
inptr0 = input_buf[0][in_row_group_ctr];
@@ -47,13 +46,13 @@ h2v1_merged_upsample_internal (j_decompress_ptr cinfo,
/* Loop for each pair of output pixels */
for (col = cinfo->output_width >> 1; col > 0; col--) {
/* Do the chroma part of the calculation */
- cb = GETJSAMPLE(*inptr1++);
- cr = GETJSAMPLE(*inptr2++);
+ cb = *inptr1++;
+ cr = *inptr2++;
cred = Crrtab[cr];
- cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
+ cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
cblue = Cbbtab[cb];
/* Fetch 2 Y values and emit 2 pixels */
- y = GETJSAMPLE(*inptr0++);
+ y = *inptr0++;
outptr[RGB_RED] = range_limit[y + cred];
outptr[RGB_GREEN] = range_limit[y + cgreen];
outptr[RGB_BLUE] = range_limit[y + cblue];
@@ -61,7 +60,7 @@ h2v1_merged_upsample_internal (j_decompress_ptr cinfo,
outptr[RGB_ALPHA] = 0xFF;
#endif
outptr += RGB_PIXELSIZE;
- y = GETJSAMPLE(*inptr0++);
+ y = *inptr0++;
outptr[RGB_RED] = range_limit[y + cred];
outptr[RGB_GREEN] = range_limit[y + cgreen];
outptr[RGB_BLUE] = range_limit[y + cblue];
@@ -72,12 +71,12 @@ h2v1_merged_upsample_internal (j_decompress_ptr cinfo,
}
/* If image width is odd, do the last output column separately */
if (cinfo->output_width & 1) {
- cb = GETJSAMPLE(*inptr1);
- cr = GETJSAMPLE(*inptr2);
+ cb = *inptr1;
+ cr = *inptr2;
cred = Crrtab[cr];
- cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
+ cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
cblue = Cbbtab[cb];
- y = GETJSAMPLE(*inptr0);
+ y = *inptr0;
outptr[RGB_RED] = range_limit[y + cred];
outptr[RGB_GREEN] = range_limit[y + cgreen];
outptr[RGB_BLUE] = range_limit[y + cblue];
@@ -94,27 +93,26 @@ h2v1_merged_upsample_internal (j_decompress_ptr cinfo,
INLINE
LOCAL(void)
-h2v2_merged_upsample_internal (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr,
- JSAMPARRAY output_buf)
+h2v2_merged_upsample_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf)
{
- my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
+ my_merged_upsample_ptr upsample = (my_merged_upsample_ptr)cinfo->upsample;
register int y, cred, cgreen, cblue;
int cb, cr;
register JSAMPROW outptr0, outptr1;
JSAMPROW inptr00, inptr01, inptr1, inptr2;
JDIMENSION col;
/* copy these pointers into registers if possible */
- register JSAMPLE * range_limit = cinfo->sample_range_limit;
- int * Crrtab = upsample->Cr_r_tab;
- int * Cbbtab = upsample->Cb_b_tab;
- JLONG * Crgtab = upsample->Cr_g_tab;
- JLONG * Cbgtab = upsample->Cb_g_tab;
+ register JSAMPLE *range_limit = cinfo->sample_range_limit;
+ int *Crrtab = upsample->Cr_r_tab;
+ int *Cbbtab = upsample->Cb_b_tab;
+ JLONG *Crgtab = upsample->Cr_g_tab;
+ JLONG *Cbgtab = upsample->Cb_g_tab;
SHIFT_TEMPS
- inptr00 = input_buf[0][in_row_group_ctr*2];
- inptr01 = input_buf[0][in_row_group_ctr*2 + 1];
+ inptr00 = input_buf[0][in_row_group_ctr * 2];
+ inptr01 = input_buf[0][in_row_group_ctr * 2 + 1];
inptr1 = input_buf[1][in_row_group_ctr];
inptr2 = input_buf[2][in_row_group_ctr];
outptr0 = output_buf[0];
@@ -122,13 +120,13 @@ h2v2_merged_upsample_internal (j_decompress_ptr cinfo,
/* Loop for each group of output pixels */
for (col = cinfo->output_width >> 1; col > 0; col--) {
/* Do the chroma part of the calculation */
- cb = GETJSAMPLE(*inptr1++);
- cr = GETJSAMPLE(*inptr2++);
+ cb = *inptr1++;
+ cr = *inptr2++;
cred = Crrtab[cr];
- cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
+ cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
cblue = Cbbtab[cb];
/* Fetch 4 Y values and emit 4 pixels */
- y = GETJSAMPLE(*inptr00++);
+ y = *inptr00++;
outptr0[RGB_RED] = range_limit[y + cred];
outptr0[RGB_GREEN] = range_limit[y + cgreen];
outptr0[RGB_BLUE] = range_limit[y + cblue];
@@ -136,7 +134,7 @@ h2v2_merged_upsample_internal (j_decompress_ptr cinfo,
outptr0[RGB_ALPHA] = 0xFF;
#endif
outptr0 += RGB_PIXELSIZE;
- y = GETJSAMPLE(*inptr00++);
+ y = *inptr00++;
outptr0[RGB_RED] = range_limit[y + cred];
outptr0[RGB_GREEN] = range_limit[y + cgreen];
outptr0[RGB_BLUE] = range_limit[y + cblue];
@@ -144,7 +142,7 @@ h2v2_merged_upsample_internal (j_decompress_ptr cinfo,
outptr0[RGB_ALPHA] = 0xFF;
#endif
outptr0 += RGB_PIXELSIZE;
- y = GETJSAMPLE(*inptr01++);
+ y = *inptr01++;
outptr1[RGB_RED] = range_limit[y + cred];
outptr1[RGB_GREEN] = range_limit[y + cgreen];
outptr1[RGB_BLUE] = range_limit[y + cblue];
@@ -152,7 +150,7 @@ h2v2_merged_upsample_internal (j_decompress_ptr cinfo,
outptr1[RGB_ALPHA] = 0xFF;
#endif
outptr1 += RGB_PIXELSIZE;
- y = GETJSAMPLE(*inptr01++);
+ y = *inptr01++;
outptr1[RGB_RED] = range_limit[y + cred];
outptr1[RGB_GREEN] = range_limit[y + cgreen];
outptr1[RGB_BLUE] = range_limit[y + cblue];
@@ -163,19 +161,19 @@ h2v2_merged_upsample_internal (j_decompress_ptr cinfo,
}
/* If image width is odd, do the last output column separately */
if (cinfo->output_width & 1) {
- cb = GETJSAMPLE(*inptr1);
- cr = GETJSAMPLE(*inptr2);
+ cb = *inptr1;
+ cr = *inptr2;
cred = Crrtab[cr];
- cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
+ cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
cblue = Cbbtab[cb];
- y = GETJSAMPLE(*inptr00);
+ y = *inptr00;
outptr0[RGB_RED] = range_limit[y + cred];
outptr0[RGB_GREEN] = range_limit[y + cgreen];
outptr0[RGB_BLUE] = range_limit[y + cblue];
#ifdef RGB_ALPHA
outptr0[RGB_ALPHA] = 0xFF;
#endif
- y = GETJSAMPLE(*inptr01);
+ y = *inptr01;
outptr1[RGB_RED] = range_limit[y + cred];
outptr1[RGB_GREEN] = range_limit[y + cgreen];
outptr1[RGB_BLUE] = range_limit[y + cblue];
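
The GETJSAMPLE() wrappers dropped throughout this file were identity operations: libjpeg-turbo defines JSAMPLE as an unsigned 8-bit type, so the masking macro did nothing and was removed upstream. The Crrtab/Cbbtab/Crgtab/Cbgtab lookups implement the JFIF YCbCr-to-RGB conversion in fixed point; a sketch of how such tables are built, modeled on build_ycc_rgb_table() in jdmerge.c (constants are the standard conversion factors, names abbreviated):

#define SCALEBITS 16
#define ONE_HALF  ((long)1 << (SCALEBITS - 1))
#define FIX(x)    ((long)((x) * (1L << SCALEBITS) + 0.5))

/* Precompute per-chroma-value terms so the inner loop is pure lookups:
 * R = Y + Cr_r[cr], B = Y + Cb_b[cb],
 * G = Y + ((Cb_g[cb] + Cr_g[cr]) >> SCALEBITS). */
void build_ycc_tables(int Cr_r[256], int Cb_b[256], long Cb_g[256],
                      long Cr_g[256])
{
  for (int i = 0; i < 256; i++) {
    int x = i - 128;                          /* center the chroma value */
    Cr_r[i] = (int)((FIX(1.40200) * x + ONE_HALF) >> SCALEBITS);
    Cb_b[i] = (int)((FIX(1.77200) * x + ONE_HALF) >> SCALEBITS);
    Cb_g[i] = -FIX(0.34414) * x + ONE_HALF;   /* rounding folded in once */
    Cr_g[i] = -FIX(0.71414) * x;
  }
}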
diff --git a/media/libjpeg/jdphuff.c b/media/libjpeg/jdphuff.c
index c927ffa071..9680ebcbd0 100644
--- a/media/libjpeg/jdphuff.c
+++ b/media/libjpeg/jdphuff.c
@@ -4,7 +4,7 @@
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1995-1997, Thomas G. Lane.
* libjpeg-turbo Modifications:
- * Copyright (C) 2015-2016, D. R. Commander.
+ * Copyright (C) 2015-2016, 2018-2022, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@@ -15,12 +15,16 @@
* up to the start of the current MCU. To do this, we copy state variables
* into local working storage, and update them back to the permanent
* storage only upon successful completion of an MCU.
+ *
+ * NOTE: All referenced figures are from
+ * Recommendation ITU-T T.81 (1992) | ISO/IEC 10918-1:1994.
*/
#define JPEG_INTERNALS
#include "jinclude.h"
#include "jpeglib.h"
#include "jdhuff.h" /* Declarations shared with jdhuff.c */
+#include <limits.h>
#ifdef D_PROGRESSIVE_SUPPORTED
@@ -37,25 +41,6 @@ typedef struct {
int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */
} savable_state;
-/* This macro is to work around compilers with missing or broken
- * structure assignment. You'll need to fix this code if you have
- * such a compiler and you change MAX_COMPS_IN_SCAN.
- */
-
-#ifndef NO_STRUCT_ASSIGN
-#define ASSIGN_STATE(dest,src) ((dest) = (src))
-#else
-#if MAX_COMPS_IN_SCAN == 4
-#define ASSIGN_STATE(dest,src) \
- ((dest).EOBRUN = (src).EOBRUN, \
- (dest).last_dc_val[0] = (src).last_dc_val[0], \
- (dest).last_dc_val[1] = (src).last_dc_val[1], \
- (dest).last_dc_val[2] = (src).last_dc_val[2], \
- (dest).last_dc_val[3] = (src).last_dc_val[3])
-#endif
-#endif
-
-
typedef struct {
struct jpeg_entropy_decoder pub; /* public fields */
@@ -77,14 +62,14 @@ typedef struct {
typedef phuff_entropy_decoder *phuff_entropy_ptr;
/* Forward declarations */
-METHODDEF(boolean) decode_mcu_DC_first (j_decompress_ptr cinfo,
+METHODDEF(boolean) decode_mcu_DC_first(j_decompress_ptr cinfo,
+ JBLOCKROW *MCU_data);
+METHODDEF(boolean) decode_mcu_AC_first(j_decompress_ptr cinfo,
+ JBLOCKROW *MCU_data);
+METHODDEF(boolean) decode_mcu_DC_refine(j_decompress_ptr cinfo,
JBLOCKROW *MCU_data);
-METHODDEF(boolean) decode_mcu_AC_first (j_decompress_ptr cinfo,
+METHODDEF(boolean) decode_mcu_AC_refine(j_decompress_ptr cinfo,
JBLOCKROW *MCU_data);
-METHODDEF(boolean) decode_mcu_DC_refine (j_decompress_ptr cinfo,
- JBLOCKROW *MCU_data);
-METHODDEF(boolean) decode_mcu_AC_refine (j_decompress_ptr cinfo,
- JBLOCKROW *MCU_data);
/*
@@ -92,13 +77,13 @@ METHODDEF(boolean) decode_mcu_AC_refine (j_decompress_ptr cinfo,
*/
METHODDEF(void)
-start_pass_phuff_decoder (j_decompress_ptr cinfo)
+start_pass_phuff_decoder(j_decompress_ptr cinfo)
{
- phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
+ phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
boolean is_DC_band, bad;
int ci, coefi, tbl;
d_derived_tbl **pdtbl;
- int *coef_bit_ptr;
+ int *coef_bit_ptr, *prev_coef_bit_ptr;
jpeg_component_info *compptr;
is_DC_band = (cinfo->Ss == 0);
@@ -118,7 +103,7 @@ start_pass_phuff_decoder (j_decompress_ptr cinfo)
}
if (cinfo->Ah != 0) {
/* Successive approximation refinement scan: must have Al = Ah-1. */
- if (cinfo->Al != cinfo->Ah-1)
+ if (cinfo->Al != cinfo->Ah - 1)
bad = TRUE;
}
if (cinfo->Al > 13) /* need not check for < 0 */
@@ -138,9 +123,16 @@ start_pass_phuff_decoder (j_decompress_ptr cinfo)
*/
for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
int cindex = cinfo->cur_comp_info[ci]->component_index;
- coef_bit_ptr = & cinfo->coef_bits[cindex][0];
+ coef_bit_ptr = &cinfo->coef_bits[cindex][0];
+ prev_coef_bit_ptr = &cinfo->coef_bits[cindex + cinfo->num_components][0];
if (!is_DC_band && coef_bit_ptr[0] < 0) /* AC without prior DC scan */
WARNMS2(cinfo, JWRN_BOGUS_PROGRESSION, cindex, 0);
+ for (coefi = MIN(cinfo->Ss, 1); coefi <= MAX(cinfo->Se, 9); coefi++) {
+ if (cinfo->input_scan_number > 1)
+ prev_coef_bit_ptr[coefi] = coef_bit_ptr[coefi];
+ else
+ prev_coef_bit_ptr[coefi] = 0;
+ }
for (coefi = cinfo->Ss; coefi <= cinfo->Se; coefi++) {
int expected = (coef_bit_ptr[coefi] < 0) ? 0 : coef_bit_ptr[coefi];
if (cinfo->Ah != expected)
@@ -205,22 +197,26 @@ start_pass_phuff_decoder (j_decompress_ptr cinfo)
#define AVOID_TABLES
#ifdef AVOID_TABLES
-#define NEG_1 ((unsigned)-1)
-#define HUFF_EXTEND(x,s) ((x) < (1<<((s)-1)) ? (x) + (((NEG_1)<<(s)) + 1) : (x))
+#define NEG_1 ((unsigned)-1)
+#define HUFF_EXTEND(x, s) \
+ ((x) < (1 << ((s) - 1)) ? (x) + (((NEG_1) << (s)) + 1) : (x))
#else
-#define HUFF_EXTEND(x,s) ((x) < extend_test[s] ? (x) + extend_offset[s] : (x))
+#define HUFF_EXTEND(x, s) \
+ ((x) < extend_test[s] ? (x) + extend_offset[s] : (x))
-static const int extend_test[16] = /* entry n is 2**(n-1) */
- { 0, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080,
- 0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000 };
+static const int extend_test[16] = { /* entry n is 2**(n-1) */
+ 0, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080,
+ 0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000
+};
-static const int extend_offset[16] = /* entry n is (-1 << n) + 1 */
- { 0, ((-1)<<1) + 1, ((-1)<<2) + 1, ((-1)<<3) + 1, ((-1)<<4) + 1,
- ((-1)<<5) + 1, ((-1)<<6) + 1, ((-1)<<7) + 1, ((-1)<<8) + 1,
- ((-1)<<9) + 1, ((-1)<<10) + 1, ((-1)<<11) + 1, ((-1)<<12) + 1,
- ((-1)<<13) + 1, ((-1)<<14) + 1, ((-1)<<15) + 1 };
+static const int extend_offset[16] = { /* entry n is (-1 << n) + 1 */
+ 0, ((-1) << 1) + 1, ((-1) << 2) + 1, ((-1) << 3) + 1, ((-1) << 4) + 1,
+ ((-1) << 5) + 1, ((-1) << 6) + 1, ((-1) << 7) + 1, ((-1) << 8) + 1,
+ ((-1) << 9) + 1, ((-1) << 10) + 1, ((-1) << 11) + 1, ((-1) << 12) + 1,
+ ((-1) << 13) + 1, ((-1) << 14) + 1, ((-1) << 15) + 1
+};
#endif /* AVOID_TABLES */
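
HUFF_EXTEND is the EXTEND procedure of spec Figure F.12: an s-bit magnitude code whose value falls below 2^(s-1) encodes a negative coefficient. A plain-function sketch equivalent to the reformatted macro above:

/* Sign-extend an s-bit Huffman magnitude code: received values
 * 0 .. 2^(s-1)-1 map to -(2^s - 1) .. -2^(s-1); the rest pass through. */
static int huff_extend(int x, int s)
{
  return (x < (1 << (s - 1))) ? x - ((1 << s) - 1) : x;
}
/* e.g. s = 3: codes 0..3 -> -7..-4, codes 4..7 -> 4..7 */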
@@ -231,9 +227,9 @@ static const int extend_offset[16] = /* entry n is (-1 << n) + 1 */
*/
LOCAL(boolean)
-process_restart (j_decompress_ptr cinfo)
+process_restart(j_decompress_ptr cinfo)
{
- phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
+ phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
int ci;
/* Throw away any unused bits remaining in bit buffer; */
@@ -242,7 +238,7 @@ process_restart (j_decompress_ptr cinfo)
entropy->bitstate.bits_left = 0;
/* Advance past the RSTn marker */
- if (! (*cinfo->marker->read_restart_marker) (cinfo))
+ if (!(*cinfo->marker->read_restart_marker) (cinfo))
return FALSE;
/* Re-initialize DC predictions to 0 */
@@ -289,9 +285,9 @@ process_restart (j_decompress_ptr cinfo)
*/
METHODDEF(boolean)
-decode_mcu_DC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+decode_mcu_DC_first(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
{
- phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
+ phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
int Al = cinfo->Al;
register int s, r;
int blkn, ci;
@@ -304,18 +300,18 @@ decode_mcu_DC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
/* Process restart marker if needed; may have to suspend */
if (cinfo->restart_interval) {
if (entropy->restarts_to_go == 0)
- if (! process_restart(cinfo))
+ if (!process_restart(cinfo))
return FALSE;
}
/* If we've run out of data, just leave the MCU set to zeroes.
* This way, we return uniform gray for the remainder of the segment.
*/
- if (! entropy->pub.insufficient_data) {
+ if (!entropy->pub.insufficient_data) {
/* Load up working state */
- BITREAD_LOAD_STATE(cinfo,entropy->bitstate);
- ASSIGN_STATE(state, entropy->saved);
+ BITREAD_LOAD_STATE(cinfo, entropy->bitstate);
+ state = entropy->saved;
/* Outer loop handles each block in the MCU */
@@ -336,19 +332,24 @@ decode_mcu_DC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
}
/* Convert DC difference to actual value, update last_dc_val */
+ if ((state.last_dc_val[ci] >= 0 &&
+ s > INT_MAX - state.last_dc_val[ci]) ||
+ (state.last_dc_val[ci] < 0 && s < INT_MIN - state.last_dc_val[ci]))
+ ERREXIT(cinfo, JERR_BAD_DCT_COEF);
s += state.last_dc_val[ci];
state.last_dc_val[ci] = s;
/* Scale and output the coefficient (assumes jpeg_natural_order[0]=0) */
- (*block)[0] = (JCOEF) LEFT_SHIFT(s, Al);
+ (*block)[0] = (JCOEF)LEFT_SHIFT(s, Al);
}
/* Completed MCU, so update state */
- BITREAD_SAVE_STATE(cinfo,entropy->bitstate);
- ASSIGN_STATE(entropy->saved, state);
+ BITREAD_SAVE_STATE(cinfo, entropy->bitstate);
+ entropy->saved = state;
}
/* Account for restart interval (no-op if not using restarts) */
- entropy->restarts_to_go--;
+ if (cinfo->restart_interval)
+ entropy->restarts_to_go--;
return TRUE;
}
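
Two hardening fixes land in this hunk. First, the DC difference is now range-checked before accumulation, since signed-integer overflow is undefined behavior in C and a crafted progressive stream could otherwise trigger it; the check mirrors the ERREXIT(JERR_BAD_DCT_COEF) path above. Second, restarts_to_go is only decremented when a restart interval is in effect, because the counter is unsigned and is never reloaded otherwise, so an unconditional decrement would wrap. A sketch of the overflow guard as a standalone helper (name illustrative):

#include <limits.h>

/* Checked accumulation of the DC difference; *bad signals the caller to
 * raise the bad-coefficient error instead of overflowing. */
static int dc_accumulate(int last_dc, int diff, int *bad)
{
  *bad = (last_dc >= 0) ? (diff > INT_MAX - last_dc)
                        : (diff < INT_MIN - last_dc);
  return *bad ? last_dc : last_dc + diff;
}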
@@ -360,9 +361,9 @@ decode_mcu_DC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
*/
METHODDEF(boolean)
-decode_mcu_AC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+decode_mcu_AC_first(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
{
- phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
+ phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
int Se = cinfo->Se;
int Al = cinfo->Al;
register int s, k, r;
@@ -374,14 +375,14 @@ decode_mcu_AC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
/* Process restart marker if needed; may have to suspend */
if (cinfo->restart_interval) {
if (entropy->restarts_to_go == 0)
- if (! process_restart(cinfo))
+ if (!process_restart(cinfo))
return FALSE;
}
/* If we've run out of data, just leave the MCU set to zeroes.
* This way, we return uniform gray for the remainder of the segment.
*/
- if (! entropy->pub.insufficient_data) {
+ if (!entropy->pub.insufficient_data) {
/* Load up working state.
* We can avoid loading/saving bitread state if in an EOB run.
@@ -393,7 +394,7 @@ decode_mcu_AC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
if (EOBRUN > 0) /* if it's a band of zeroes... */
EOBRUN--; /* ...process it now (we do nothing) */
else {
- BITREAD_LOAD_STATE(cinfo,entropy->bitstate);
+ BITREAD_LOAD_STATE(cinfo, entropy->bitstate);
block = MCU_data[0];
tbl = entropy->ac_derived_tbl;
@@ -407,7 +408,7 @@ decode_mcu_AC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
r = GET_BITS(s);
s = HUFF_EXTEND(r, s);
/* Scale and output coefficient in natural (dezigzagged) order */
- (*block)[jpeg_natural_order[k]] = (JCOEF) LEFT_SHIFT(s, Al);
+ (*block)[jpeg_natural_order[k]] = (JCOEF)LEFT_SHIFT(s, Al);
} else {
if (r == 15) { /* ZRL */
k += 15; /* skip 15 zeroes in band */
@@ -424,7 +425,7 @@ decode_mcu_AC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
}
}
- BITREAD_SAVE_STATE(cinfo,entropy->bitstate);
+ BITREAD_SAVE_STATE(cinfo, entropy->bitstate);
}
/* Completed MCU, so update state */
@@ -432,7 +433,8 @@ decode_mcu_AC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
}
/* Account for restart interval (no-op if not using restarts) */
- entropy->restarts_to_go--;
+ if (cinfo->restart_interval)
+ entropy->restarts_to_go--;
return TRUE;
}
@@ -445,9 +447,9 @@ decode_mcu_AC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
*/
METHODDEF(boolean)
-decode_mcu_DC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+decode_mcu_DC_refine(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
{
- phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
+ phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
int p1 = 1 << cinfo->Al; /* 1 in the bit position being coded */
int blkn;
JBLOCKROW block;
@@ -456,7 +458,7 @@ decode_mcu_DC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
/* Process restart marker if needed; may have to suspend */
if (cinfo->restart_interval) {
if (entropy->restarts_to_go == 0)
- if (! process_restart(cinfo))
+ if (!process_restart(cinfo))
return FALSE;
}
@@ -465,7 +467,7 @@ decode_mcu_DC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
*/
/* Load up working state */
- BITREAD_LOAD_STATE(cinfo,entropy->bitstate);
+ BITREAD_LOAD_STATE(cinfo, entropy->bitstate);
/* Outer loop handles each block in the MCU */
@@ -480,10 +482,11 @@ decode_mcu_DC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
}
/* Completed MCU, so update state */
- BITREAD_SAVE_STATE(cinfo,entropy->bitstate);
+ BITREAD_SAVE_STATE(cinfo, entropy->bitstate);
/* Account for restart interval (no-op if not using restarts) */
- entropy->restarts_to_go--;
+ if (cinfo->restart_interval)
+ entropy->restarts_to_go--;
return TRUE;
}
@@ -494,9 +497,9 @@ decode_mcu_DC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
*/
METHODDEF(boolean)
-decode_mcu_AC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+decode_mcu_AC_refine(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
{
- phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
+ phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
int Se = cinfo->Se;
int p1 = 1 << cinfo->Al; /* 1 in the bit position being coded */
int m1 = (NEG_1) << cinfo->Al; /* -1 in the bit position being coded */
@@ -512,16 +515,16 @@ decode_mcu_AC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
/* Process restart marker if needed; may have to suspend */
if (cinfo->restart_interval) {
if (entropy->restarts_to_go == 0)
- if (! process_restart(cinfo))
+ if (!process_restart(cinfo))
return FALSE;
}
/* If we've run out of data, don't modify the MCU.
*/
- if (! entropy->pub.insufficient_data) {
+ if (!entropy->pub.insufficient_data) {
/* Load up working state */
- BITREAD_LOAD_STATE(cinfo,entropy->bitstate);
+ BITREAD_LOAD_STATE(cinfo, entropy->bitstate);
EOBRUN = entropy->saved.EOBRUN; /* only part of saved state we need */
/* There is always only one block per MCU */
@@ -575,9 +578,9 @@ decode_mcu_AC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
if (GET_BITS(1)) {
if ((*thiscoef & p1) == 0) { /* do nothing if already set it */
if (*thiscoef >= 0)
- *thiscoef += p1;
+ *thiscoef += (JCOEF)p1;
else
- *thiscoef += m1;
+ *thiscoef += (JCOEF)m1;
}
}
} else {
@@ -589,7 +592,7 @@ decode_mcu_AC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
if (s) {
int pos = jpeg_natural_order[k];
/* Output newly nonzero coefficient */
- (*block)[pos] = (JCOEF) s;
+ (*block)[pos] = (JCOEF)s;
/* Remember its position in case we have to suspend */
newnz_pos[num_newnz++] = pos;
}
@@ -609,9 +612,9 @@ decode_mcu_AC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
if (GET_BITS(1)) {
if ((*thiscoef & p1) == 0) { /* do nothing if already changed it */
if (*thiscoef >= 0)
- *thiscoef += p1;
+ *thiscoef += (JCOEF)p1;
else
- *thiscoef += m1;
+ *thiscoef += (JCOEF)m1;
}
}
}
@@ -621,12 +624,13 @@ decode_mcu_AC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
}
/* Completed MCU, so update state */
- BITREAD_SAVE_STATE(cinfo,entropy->bitstate);
+ BITREAD_SAVE_STATE(cinfo, entropy->bitstate);
entropy->saved.EOBRUN = EOBRUN; /* only part of saved state we need */
}
/* Account for restart interval (no-op if not using restarts) */
- entropy->restarts_to_go--;
+ if (cinfo->restart_interval)
+ entropy->restarts_to_go--;
return TRUE;
@@ -644,16 +648,16 @@ undoit:
*/
GLOBAL(void)
-jinit_phuff_decoder (j_decompress_ptr cinfo)
+jinit_phuff_decoder(j_decompress_ptr cinfo)
{
phuff_entropy_ptr entropy;
int *coef_bit_ptr;
int ci, i;
entropy = (phuff_entropy_ptr)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
sizeof(phuff_entropy_decoder));
- cinfo->entropy = (struct jpeg_entropy_decoder *) entropy;
+ cinfo->entropy = (struct jpeg_entropy_decoder *)entropy;
entropy->pub.start_pass = start_pass_phuff_decoder;
/* Mark derived tables unallocated */
@@ -663,9 +667,10 @@ jinit_phuff_decoder (j_decompress_ptr cinfo)
/* Create progression status table */
cinfo->coef_bits = (int (*)[DCTSIZE2])
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
- cinfo->num_components*DCTSIZE2*sizeof(int));
- coef_bit_ptr = & cinfo->coef_bits[0][0];
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ cinfo->num_components * 2 * DCTSIZE2 *
+ sizeof(int));
+ coef_bit_ptr = &cinfo->coef_bits[0][0];
for (ci = 0; ci < cinfo->num_components; ci++)
for (i = 0; i < DCTSIZE2; i++)
*coef_bit_ptr++ = -1;
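
Two structural points in the jdphuff.c changes are worth noting. The ASSIGN_STATE macro is gone; plain struct assignment (state = entropy->saved) is now assumed to work, dropping the workaround for compilers with broken structure assignment. And the coef_bits allocation doubles: the first num_components rows track the current Al per coefficient, while the second num_components rows snapshot the previous scan's values, which is what prev_coef_bit_ptr in start_pass_phuff_decoder indexes to validate successive-approximation progressions. The layout, in a sketch:

/* coef_bits after the change: 2 * num_components rows of DCTSIZE2 ints.
 * Row ci = current Al per coefficient of component ci;
 * row ci + num_components = the value recorded in the previous scan. */
int *cur  = &cinfo->coef_bits[cindex][0];
int *prev = &cinfo->coef_bits[cindex + cinfo->num_components][0];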
diff --git a/media/libjpeg/jdpostct.c b/media/libjpeg/jdpostct.c
index 601fc2a792..6a2cf5c1b3 100644
--- a/media/libjpeg/jdpostct.c
+++ b/media/libjpeg/jdpostct.c
@@ -46,22 +46,28 @@ typedef my_post_controller *my_post_ptr;
/* Forward declarations */
-METHODDEF(void) post_process_1pass
- (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
- JDIMENSION *in_row_group_ctr, JDIMENSION in_row_groups_avail,
- JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
- JDIMENSION out_rows_avail);
+METHODDEF(void) post_process_1pass(j_decompress_ptr cinfo,
+ JSAMPIMAGE input_buf,
+ JDIMENSION *in_row_group_ctr,
+ JDIMENSION in_row_groups_avail,
+ JSAMPARRAY output_buf,
+ JDIMENSION *out_row_ctr,
+ JDIMENSION out_rows_avail);
#ifdef QUANT_2PASS_SUPPORTED
-METHODDEF(void) post_process_prepass
- (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
- JDIMENSION *in_row_group_ctr, JDIMENSION in_row_groups_avail,
- JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
- JDIMENSION out_rows_avail);
-METHODDEF(void) post_process_2pass
- (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
- JDIMENSION *in_row_group_ctr, JDIMENSION in_row_groups_avail,
- JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
- JDIMENSION out_rows_avail);
+METHODDEF(void) post_process_prepass(j_decompress_ptr cinfo,
+ JSAMPIMAGE input_buf,
+ JDIMENSION *in_row_group_ctr,
+ JDIMENSION in_row_groups_avail,
+ JSAMPARRAY output_buf,
+ JDIMENSION *out_row_ctr,
+ JDIMENSION out_rows_avail);
+METHODDEF(void) post_process_2pass(j_decompress_ptr cinfo,
+ JSAMPIMAGE input_buf,
+ JDIMENSION *in_row_group_ctr,
+ JDIMENSION in_row_groups_avail,
+ JSAMPARRAY output_buf,
+ JDIMENSION *out_row_ctr,
+ JDIMENSION out_rows_avail);
#endif
@@ -70,9 +76,9 @@ METHODDEF(void) post_process_2pass
*/
METHODDEF(void)
-start_pass_dpost (j_decompress_ptr cinfo, J_BUF_MODE pass_mode)
+start_pass_dpost(j_decompress_ptr cinfo, J_BUF_MODE pass_mode)
{
- my_post_ptr post = (my_post_ptr) cinfo->post;
+ my_post_ptr post = (my_post_ptr)cinfo->post;
switch (pass_mode) {
case JBUF_PASS_THRU:
@@ -85,8 +91,8 @@ start_pass_dpost (j_decompress_ptr cinfo, J_BUF_MODE pass_mode)
*/
if (post->buffer == NULL) {
post->buffer = (*cinfo->mem->access_virt_sarray)
- ((j_common_ptr) cinfo, post->whole_image,
- (JDIMENSION) 0, post->strip_height, TRUE);
+ ((j_common_ptr)cinfo, post->whole_image,
+ (JDIMENSION)0, post->strip_height, TRUE);
}
} else {
/* For single-pass processing without color quantization,
@@ -123,13 +129,12 @@ start_pass_dpost (j_decompress_ptr cinfo, J_BUF_MODE pass_mode)
*/
METHODDEF(void)
-post_process_1pass (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION *in_row_group_ctr,
- JDIMENSION in_row_groups_avail,
- JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
- JDIMENSION out_rows_avail)
+post_process_1pass(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION *in_row_group_ctr,
+ JDIMENSION in_row_groups_avail, JSAMPARRAY output_buf,
+ JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail)
{
- my_post_ptr post = (my_post_ptr) cinfo->post;
+ my_post_ptr post = (my_post_ptr)cinfo->post;
JDIMENSION num_rows, max_rows;
/* Fill the buffer, but not more than what we can dump out in one go. */
@@ -138,12 +143,13 @@ post_process_1pass (j_decompress_ptr cinfo,
if (max_rows > post->strip_height)
max_rows = post->strip_height;
num_rows = 0;
- (*cinfo->upsample->upsample) (cinfo,
- input_buf, in_row_group_ctr, in_row_groups_avail,
- post->buffer, &num_rows, max_rows);
+ (*cinfo->upsample->upsample) (cinfo, input_buf, in_row_group_ctr,
+ in_row_groups_avail, post->buffer, &num_rows,
+ max_rows);
/* Quantize and emit data. */
- (*cinfo->cquantize->color_quantize) (cinfo,
- post->buffer, output_buf + *out_row_ctr, (int) num_rows);
+ (*cinfo->cquantize->color_quantize) (cinfo, post->buffer,
+ output_buf + *out_row_ctr,
+ (int)num_rows);
*out_row_ctr += num_rows;
}
@@ -155,34 +161,33 @@ post_process_1pass (j_decompress_ptr cinfo,
*/
METHODDEF(void)
-post_process_prepass (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION *in_row_group_ctr,
- JDIMENSION in_row_groups_avail,
- JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
- JDIMENSION out_rows_avail)
+post_process_prepass(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION *in_row_group_ctr,
+ JDIMENSION in_row_groups_avail, JSAMPARRAY output_buf,
+ JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail)
{
- my_post_ptr post = (my_post_ptr) cinfo->post;
+ my_post_ptr post = (my_post_ptr)cinfo->post;
JDIMENSION old_next_row, num_rows;
/* Reposition virtual buffer if at start of strip. */
if (post->next_row == 0) {
post->buffer = (*cinfo->mem->access_virt_sarray)
- ((j_common_ptr) cinfo, post->whole_image,
+ ((j_common_ptr)cinfo, post->whole_image,
post->starting_row, post->strip_height, TRUE);
}
/* Upsample some data (up to a strip height's worth). */
old_next_row = post->next_row;
- (*cinfo->upsample->upsample) (cinfo,
- input_buf, in_row_group_ctr, in_row_groups_avail,
- post->buffer, &post->next_row, post->strip_height);
+ (*cinfo->upsample->upsample) (cinfo, input_buf, in_row_group_ctr,
+ in_row_groups_avail, post->buffer,
+ &post->next_row, post->strip_height);
/* Allow quantizer to scan new data. No data is emitted, */
/* but we advance out_row_ctr so outer loop can tell when we're done. */
if (post->next_row > old_next_row) {
num_rows = post->next_row - old_next_row;
(*cinfo->cquantize->color_quantize) (cinfo, post->buffer + old_next_row,
- (JSAMPARRAY) NULL, (int) num_rows);
+ (JSAMPARRAY)NULL, (int)num_rows);
*out_row_ctr += num_rows;
}
@@ -199,19 +204,18 @@ post_process_prepass (j_decompress_ptr cinfo,
*/
METHODDEF(void)
-post_process_2pass (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION *in_row_group_ctr,
- JDIMENSION in_row_groups_avail,
- JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
- JDIMENSION out_rows_avail)
+post_process_2pass(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION *in_row_group_ctr,
+ JDIMENSION in_row_groups_avail, JSAMPARRAY output_buf,
+ JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail)
{
- my_post_ptr post = (my_post_ptr) cinfo->post;
+ my_post_ptr post = (my_post_ptr)cinfo->post;
JDIMENSION num_rows, max_rows;
/* Reposition virtual buffer if at start of strip. */
if (post->next_row == 0) {
post->buffer = (*cinfo->mem->access_virt_sarray)
- ((j_common_ptr) cinfo, post->whole_image,
+ ((j_common_ptr)cinfo, post->whole_image,
post->starting_row, post->strip_height, FALSE);
}
@@ -226,9 +230,9 @@ post_process_2pass (j_decompress_ptr cinfo,
num_rows = max_rows;
/* Quantize and emit data. */
- (*cinfo->cquantize->color_quantize) (cinfo,
- post->buffer + post->next_row, output_buf + *out_row_ctr,
- (int) num_rows);
+ (*cinfo->cquantize->color_quantize) (cinfo, post->buffer + post->next_row,
+ output_buf + *out_row_ctr,
+ (int)num_rows);
*out_row_ctr += num_rows;
/* Advance if we filled the strip. */
@@ -247,14 +251,14 @@ post_process_2pass (j_decompress_ptr cinfo,
*/
GLOBAL(void)
-jinit_d_post_controller (j_decompress_ptr cinfo, boolean need_full_buffer)
+jinit_d_post_controller(j_decompress_ptr cinfo, boolean need_full_buffer)
{
my_post_ptr post;
post = (my_post_ptr)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
sizeof(my_post_controller));
- cinfo->post = (struct jpeg_d_post_controller *) post;
+ cinfo->post = (struct jpeg_d_post_controller *)post;
post->pub.start_pass = start_pass_dpost;
post->whole_image = NULL; /* flag for no virtual arrays */
post->buffer = NULL; /* flag for no strip buffer */
@@ -265,16 +269,16 @@ jinit_d_post_controller (j_decompress_ptr cinfo, boolean need_full_buffer)
* an efficient number of rows for upsampling to return.
* (In the presence of output rescaling, we might want to be smarter?)
*/
- post->strip_height = (JDIMENSION) cinfo->max_v_samp_factor;
+ post->strip_height = (JDIMENSION)cinfo->max_v_samp_factor;
if (need_full_buffer) {
/* Two-pass color quantization: need full-image storage. */
/* We round up the number of rows to a multiple of the strip height. */
#ifdef QUANT_2PASS_SUPPORTED
post->whole_image = (*cinfo->mem->request_virt_sarray)
- ((j_common_ptr) cinfo, JPOOL_IMAGE, FALSE,
+ ((j_common_ptr)cinfo, JPOOL_IMAGE, FALSE,
cinfo->output_width * cinfo->out_color_components,
- (JDIMENSION) jround_up((long) cinfo->output_height,
- (long) post->strip_height),
+ (JDIMENSION)jround_up((long)cinfo->output_height,
+ (long)post->strip_height),
post->strip_height);
#else
ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
@@ -282,7 +286,7 @@ jinit_d_post_controller (j_decompress_ptr cinfo, boolean need_full_buffer)
} else {
/* One-pass color quantization: just make a strip buffer. */
post->buffer = (*cinfo->mem->alloc_sarray)
- ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ ((j_common_ptr)cinfo, JPOOL_IMAGE,
cinfo->output_width * cinfo->out_color_components,
post->strip_height);
}
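
The jdpostct.c changes are purely stylistic (cast spacing and argument wrapping); behavior is unchanged. For two-pass quantization the virtual array is still sized by rounding the image height up to a whole number of strips. A sketch of jround_up() as used above (the real function lives in jutils.c):

/* Smallest multiple of b that is >= a. */
static long round_up(long a, long b)
{
  a += b - 1L;
  return a - (a % b);
}
/* e.g. output_height 601, strip_height 8 -> 608 buffered rows */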
diff --git a/media/libjpeg/jdsample.c b/media/libjpeg/jdsample.c
index b1378e1512..eaad72a030 100644
--- a/media/libjpeg/jdsample.c
+++ b/media/libjpeg/jdsample.c
@@ -8,6 +8,7 @@
* Copyright (C) 2010, 2015-2016, D. R. Commander.
* Copyright (C) 2014, MIPS Technologies, Inc., California.
* Copyright (C) 2015, Google, Inc.
+ * Copyright (C) 2019-2020, Arm Limited.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@@ -36,9 +37,9 @@
*/
METHODDEF(void)
-start_pass_upsample (j_decompress_ptr cinfo)
+start_pass_upsample(j_decompress_ptr cinfo)
{
- my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
+ my_upsample_ptr upsample = (my_upsample_ptr)cinfo->upsample;
/* Mark the conversion buffer empty */
upsample->next_row_out = cinfo->max_v_samp_factor;
@@ -56,13 +57,12 @@ start_pass_upsample (j_decompress_ptr cinfo)
*/
METHODDEF(void)
-sep_upsample (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION *in_row_group_ctr,
- JDIMENSION in_row_groups_avail,
- JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
- JDIMENSION out_rows_avail)
+sep_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION *in_row_group_ctr, JDIMENSION in_row_groups_avail,
+ JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
+ JDIMENSION out_rows_avail)
{
- my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
+ my_upsample_ptr upsample = (my_upsample_ptr)cinfo->upsample;
int ci;
jpeg_component_info *compptr;
JDIMENSION num_rows;
@@ -84,7 +84,7 @@ sep_upsample (j_decompress_ptr cinfo,
/* Color-convert and emit rows */
/* How many we have in the buffer: */
- num_rows = (JDIMENSION) (cinfo->max_v_samp_factor - upsample->next_row_out);
+ num_rows = (JDIMENSION)(cinfo->max_v_samp_factor - upsample->next_row_out);
/* Not more than the distance to the end of the image. Need this test
* in case the image height is not a multiple of max_v_samp_factor:
*/
@@ -96,9 +96,8 @@ sep_upsample (j_decompress_ptr cinfo,
num_rows = out_rows_avail;
(*cinfo->cconvert->color_convert) (cinfo, upsample->color_buf,
- (JDIMENSION) upsample->next_row_out,
- output_buf + *out_row_ctr,
- (int) num_rows);
+ (JDIMENSION)upsample->next_row_out,
+ output_buf + *out_row_ctr, (int)num_rows);
/* Adjust counts */
*out_row_ctr += num_rows;
@@ -124,8 +123,8 @@ sep_upsample (j_decompress_ptr cinfo,
*/
METHODDEF(void)
-fullsize_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+fullsize_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
{
*output_data_ptr = input_data;
}
@@ -137,8 +136,8 @@ fullsize_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
*/
METHODDEF(void)
-noop_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+noop_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
{
*output_data_ptr = NULL; /* safety check */
}
@@ -156,10 +155,10 @@ noop_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
*/
METHODDEF(void)
-int_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+int_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
{
- my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
+ my_upsample_ptr upsample = (my_upsample_ptr)cinfo->upsample;
JSAMPARRAY output_data = *output_data_ptr;
register JSAMPROW inptr, outptr;
register JSAMPLE invalue;
@@ -178,15 +177,15 @@ int_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
outptr = output_data[outrow];
outend = outptr + cinfo->output_width;
while (outptr < outend) {
- invalue = *inptr++; /* don't need GETJSAMPLE() here */
+ invalue = *inptr++;
for (h = h_expand; h > 0; h--) {
*outptr++ = invalue;
}
}
/* Generate any additional output rows by duplicating the first one */
if (v_expand > 1) {
- jcopy_sample_rows(output_data, outrow, output_data, outrow+1,
- v_expand-1, cinfo->output_width);
+ jcopy_sample_rows(output_data, outrow, output_data, outrow + 1,
+ v_expand - 1, cinfo->output_width);
}
inrow++;
outrow += v_expand;
@@ -200,8 +199,8 @@ int_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
*/
METHODDEF(void)
-h2v1_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
{
JSAMPARRAY output_data = *output_data_ptr;
register JSAMPROW inptr, outptr;
@@ -214,7 +213,7 @@ h2v1_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
outptr = output_data[inrow];
outend = outptr + cinfo->output_width;
while (outptr < outend) {
- invalue = *inptr++; /* don't need GETJSAMPLE() here */
+ invalue = *inptr++;
*outptr++ = invalue;
*outptr++ = invalue;
}
@@ -228,8 +227,8 @@ h2v1_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
*/
METHODDEF(void)
-h2v2_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
{
JSAMPARRAY output_data = *output_data_ptr;
register JSAMPROW inptr, outptr;
@@ -243,12 +242,12 @@ h2v2_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
outptr = output_data[outrow];
outend = outptr + cinfo->output_width;
while (outptr < outend) {
- invalue = *inptr++; /* don't need GETJSAMPLE() here */
+ invalue = *inptr++;
*outptr++ = invalue;
*outptr++ = invalue;
}
- jcopy_sample_rows(output_data, outrow, output_data, outrow+1,
- 1, cinfo->output_width);
+ jcopy_sample_rows(output_data, outrow, output_data, outrow + 1, 1,
+ cinfo->output_width);
inrow++;
outrow += 2;
}
@@ -271,8 +270,8 @@ h2v2_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
*/
METHODDEF(void)
-h2v1_fancy_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
{
JSAMPARRAY output_data = *output_data_ptr;
register JSAMPROW inptr, outptr;
@@ -284,21 +283,21 @@ h2v1_fancy_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
inptr = input_data[inrow];
outptr = output_data[inrow];
/* Special case for first column */
- invalue = GETJSAMPLE(*inptr++);
- *outptr++ = (JSAMPLE) invalue;
- *outptr++ = (JSAMPLE) ((invalue * 3 + GETJSAMPLE(*inptr) + 2) >> 2);
+ invalue = *inptr++;
+ *outptr++ = (JSAMPLE)invalue;
+ *outptr++ = (JSAMPLE)((invalue * 3 + inptr[0] + 2) >> 2);
for (colctr = compptr->downsampled_width - 2; colctr > 0; colctr--) {
/* General case: 3/4 * nearer pixel + 1/4 * further pixel */
- invalue = GETJSAMPLE(*inptr++) * 3;
- *outptr++ = (JSAMPLE) ((invalue + GETJSAMPLE(inptr[-2]) + 1) >> 2);
- *outptr++ = (JSAMPLE) ((invalue + GETJSAMPLE(*inptr) + 2) >> 2);
+ invalue = (*inptr++) * 3;
+ *outptr++ = (JSAMPLE)((invalue + inptr[-2] + 1) >> 2);
+ *outptr++ = (JSAMPLE)((invalue + inptr[0] + 2) >> 2);
}
/* Special case for last column */
- invalue = GETJSAMPLE(*inptr);
- *outptr++ = (JSAMPLE) ((invalue * 3 + GETJSAMPLE(inptr[-1]) + 1) >> 2);
- *outptr++ = (JSAMPLE) invalue;
+ invalue = *inptr;
+ *outptr++ = (JSAMPLE)((invalue * 3 + inptr[-1] + 1) >> 2);
+ *outptr++ = (JSAMPLE)invalue;
}
}
@@ -311,15 +310,15 @@ h2v1_fancy_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
*/
METHODDEF(void)
-h1v2_fancy_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
{
JSAMPARRAY output_data = *output_data_ptr;
JSAMPROW inptr0, inptr1, outptr;
#if BITS_IN_JSAMPLE == 8
- int thiscolsum;
+ int thiscolsum, bias;
#else
- JLONG thiscolsum;
+ JLONG thiscolsum, bias;
#endif
JDIMENSION colctr;
int inrow, outrow, v;
@@ -329,15 +328,18 @@ h1v2_fancy_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
for (v = 0; v < 2; v++) {
/* inptr0 points to nearest input row, inptr1 points to next nearest */
inptr0 = input_data[inrow];
- if (v == 0) /* next nearest is row above */
- inptr1 = input_data[inrow-1];
- else /* next nearest is row below */
- inptr1 = input_data[inrow+1];
+ if (v == 0) { /* next nearest is row above */
+ inptr1 = input_data[inrow - 1];
+ bias = 1;
+ } else { /* next nearest is row below */
+ inptr1 = input_data[inrow + 1];
+ bias = 2;
+ }
outptr = output_data[outrow++];
- for(colctr = 0; colctr < compptr->downsampled_width; colctr++) {
- thiscolsum = GETJSAMPLE(*inptr0++) * 3 + GETJSAMPLE(*inptr1++);
- *outptr++ = (JSAMPLE) ((thiscolsum + 1) >> 2);
+ for (colctr = 0; colctr < compptr->downsampled_width; colctr++) {
+ thiscolsum = (*inptr0++) * 3 + (*inptr1++);
+ *outptr++ = (JSAMPLE)((thiscolsum + bias) >> 2);
}
}
inrow++;
@@ -354,8 +356,8 @@ h1v2_fancy_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
*/
METHODDEF(void)
-h2v2_fancy_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
{
JSAMPARRAY output_data = *output_data_ptr;
register JSAMPROW inptr0, inptr1, outptr;
@@ -373,30 +375,30 @@ h2v2_fancy_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* inptr0 points to nearest input row, inptr1 points to next nearest */
inptr0 = input_data[inrow];
if (v == 0) /* next nearest is row above */
- inptr1 = input_data[inrow-1];
+ inptr1 = input_data[inrow - 1];
else /* next nearest is row below */
- inptr1 = input_data[inrow+1];
+ inptr1 = input_data[inrow + 1];
outptr = output_data[outrow++];
/* Special case for first column */
- thiscolsum = GETJSAMPLE(*inptr0++) * 3 + GETJSAMPLE(*inptr1++);
- nextcolsum = GETJSAMPLE(*inptr0++) * 3 + GETJSAMPLE(*inptr1++);
- *outptr++ = (JSAMPLE) ((thiscolsum * 4 + 8) >> 4);
- *outptr++ = (JSAMPLE) ((thiscolsum * 3 + nextcolsum + 7) >> 4);
- lastcolsum = thiscolsum; thiscolsum = nextcolsum;
+ thiscolsum = (*inptr0++) * 3 + (*inptr1++);
+ nextcolsum = (*inptr0++) * 3 + (*inptr1++);
+ *outptr++ = (JSAMPLE)((thiscolsum * 4 + 8) >> 4);
+ *outptr++ = (JSAMPLE)((thiscolsum * 3 + nextcolsum + 7) >> 4);
+ lastcolsum = thiscolsum; thiscolsum = nextcolsum;
for (colctr = compptr->downsampled_width - 2; colctr > 0; colctr--) {
/* General case: 3/4 * nearer pixel + 1/4 * further pixel in each */
/* dimension, thus 9/16, 3/16, 3/16, 1/16 overall */
- nextcolsum = GETJSAMPLE(*inptr0++) * 3 + GETJSAMPLE(*inptr1++);
- *outptr++ = (JSAMPLE) ((thiscolsum * 3 + lastcolsum + 8) >> 4);
- *outptr++ = (JSAMPLE) ((thiscolsum * 3 + nextcolsum + 7) >> 4);
- lastcolsum = thiscolsum; thiscolsum = nextcolsum;
+ nextcolsum = (*inptr0++) * 3 + (*inptr1++);
+ *outptr++ = (JSAMPLE)((thiscolsum * 3 + lastcolsum + 8) >> 4);
+ *outptr++ = (JSAMPLE)((thiscolsum * 3 + nextcolsum + 7) >> 4);
+ lastcolsum = thiscolsum; thiscolsum = nextcolsum;
}
/* Special case for last column */
- *outptr++ = (JSAMPLE) ((thiscolsum * 3 + lastcolsum + 8) >> 4);
- *outptr++ = (JSAMPLE) ((thiscolsum * 4 + 7) >> 4);
+ *outptr++ = (JSAMPLE)((thiscolsum * 3 + lastcolsum + 8) >> 4);
+ *outptr++ = (JSAMPLE)((thiscolsum * 4 + 7) >> 4);
}
inrow++;
}
@@ -408,7 +410,7 @@ h2v2_fancy_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
*/
GLOBAL(void)
-jinit_upsampler (j_decompress_ptr cinfo)
+jinit_upsampler(j_decompress_ptr cinfo)
{
my_upsample_ptr upsample;
int ci;
@@ -418,14 +420,14 @@ jinit_upsampler (j_decompress_ptr cinfo)
if (!cinfo->master->jinit_upsampler_no_alloc) {
upsample = (my_upsample_ptr)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
sizeof(my_upsampler));
- cinfo->upsample = (struct jpeg_upsampler *) upsample;
+ cinfo->upsample = (struct jpeg_upsampler *)upsample;
upsample->pub.start_pass = start_pass_upsample;
upsample->pub.upsample = sep_upsample;
upsample->pub.need_context_rows = FALSE; /* until we find out differently */
} else
- upsample = (my_upsample_ptr) cinfo->upsample;
+ upsample = (my_upsample_ptr)cinfo->upsample;
if (cinfo->CCIR601_sampling) /* this isn't supported */
ERREXIT(cinfo, JERR_CCIR601_NOTIMPL);
@@ -451,7 +453,7 @@ jinit_upsampler (j_decompress_ptr cinfo)
v_out_group = cinfo->max_v_samp_factor;
upsample->rowgroup_height[ci] = v_in_group; /* save for use later */
need_buffer = TRUE;
- if (! compptr->component_needed) {
+ if (!compptr->component_needed) {
/* Don't bother to upsample an uninteresting component. */
upsample->methods[ci] = noop_upsample;
need_buffer = FALSE;
@@ -459,8 +461,7 @@ jinit_upsampler (j_decompress_ptr cinfo)
/* Fullsize components can be processed without any work. */
upsample->methods[ci] = fullsize_upsample;
need_buffer = FALSE;
- } else if (h_in_group * 2 == h_out_group &&
- v_in_group == v_out_group) {
+ } else if (h_in_group * 2 == h_out_group && v_in_group == v_out_group) {
/* Special cases for 2h1v upsampling */
if (do_fancy && compptr->downsampled_width > 2) {
if (jsimd_can_h2v1_fancy_upsample())
@@ -476,7 +477,13 @@ jinit_upsampler (j_decompress_ptr cinfo)
} else if (h_in_group == h_out_group &&
v_in_group * 2 == v_out_group && do_fancy) {
/* Non-fancy upsampling is handled by the generic method */
- upsample->methods[ci] = h1v2_fancy_upsample;
+#if defined(__arm__) || defined(__aarch64__) || \
+ defined(_M_ARM) || defined(_M_ARM64)
+ if (jsimd_can_h1v2_fancy_upsample())
+ upsample->methods[ci] = jsimd_h1v2_fancy_upsample;
+ else
+#endif
+ upsample->methods[ci] = h1v2_fancy_upsample;
upsample->pub.need_context_rows = TRUE;
} else if (h_in_group * 2 == h_out_group &&
v_in_group * 2 == v_out_group) {
@@ -502,16 +509,16 @@ jinit_upsampler (j_decompress_ptr cinfo)
else
#endif
upsample->methods[ci] = int_upsample;
- upsample->h_expand[ci] = (UINT8) (h_out_group / h_in_group);
- upsample->v_expand[ci] = (UINT8) (v_out_group / v_in_group);
+ upsample->h_expand[ci] = (UINT8)(h_out_group / h_in_group);
+ upsample->v_expand[ci] = (UINT8)(v_out_group / v_in_group);
} else
ERREXIT(cinfo, JERR_FRACT_SAMPLE_NOTIMPL);
if (need_buffer && !cinfo->master->jinit_upsampler_no_alloc) {
upsample->color_buf[ci] = (*cinfo->mem->alloc_sarray)
- ((j_common_ptr) cinfo, JPOOL_IMAGE,
- (JDIMENSION) jround_up((long) cinfo->output_width,
- (long) cinfo->max_h_samp_factor),
- (JDIMENSION) cinfo->max_v_samp_factor);
+ ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ (JDIMENSION)jround_up((long)cinfo->output_width,
+ (long)cinfo->max_h_samp_factor),
+ (JDIMENSION)cinfo->max_v_samp_factor);
}
}
}
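
Besides the Arm Neon hook for h1v2 fancy upsampling, the behavioral fix in jdsample.c is the rounding bias in h1v2_fancy_upsample: the output row interpolated toward the row above now adds 1 and the row interpolated toward the row below adds 2 before the >> 2, so the two phases round in complementary directions instead of both rounding the same way. The underlying blend, sketched:

/* "Fancy" 1:2 vertical upsample: each output row is a 3:1 blend of the
 * two nearest input rows; complementary biases keep the pair centered. */
static unsigned char blend_v2(unsigned char near_px, unsigned char far_px,
                              int bias)          /* bias is 1 or 2 */
{
  return (unsigned char)((near_px * 3 + far_px + bias) >> 2);
}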
diff --git a/media/libjpeg/jdtrans.c b/media/libjpeg/jdtrans.c
index cfc85dd24c..d7ec4b83b3 100644
--- a/media/libjpeg/jdtrans.c
+++ b/media/libjpeg/jdtrans.c
@@ -3,8 +3,8 @@
*
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1995-1997, Thomas G. Lane.
- * It was modified by The libjpeg-turbo Project to include only code relevant
- * to libjpeg-turbo.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2020, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@@ -16,10 +16,11 @@
#define JPEG_INTERNALS
#include "jinclude.h"
#include "jpeglib.h"
+#include "jpegcomp.h"
/* Forward declarations */
-LOCAL(void) transdecode_master_selection (j_decompress_ptr cinfo);
+LOCAL(void) transdecode_master_selection(j_decompress_ptr cinfo);
/*
@@ -45,7 +46,7 @@ LOCAL(void) transdecode_master_selection (j_decompress_ptr cinfo);
*/
GLOBAL(jvirt_barray_ptr *)
-jpeg_read_coefficients (j_decompress_ptr cinfo)
+jpeg_read_coefficients(j_decompress_ptr cinfo)
{
if (cinfo->global_state == DSTATE_READY) {
/* First call: initialize active modules */
@@ -58,7 +59,7 @@ jpeg_read_coefficients (j_decompress_ptr cinfo)
int retcode;
/* Call progress monitor hook if present */
if (cinfo->progress != NULL)
- (*cinfo->progress->progress_monitor) ((j_common_ptr) cinfo);
+ (*cinfo->progress->progress_monitor) ((j_common_ptr)cinfo);
/* Absorb some more input */
retcode = (*cinfo->inputctl->consume_input) (cinfo);
if (retcode == JPEG_SUSPENDED)
@@ -70,7 +71,7 @@ jpeg_read_coefficients (j_decompress_ptr cinfo)
(retcode == JPEG_ROW_COMPLETED || retcode == JPEG_REACHED_SOS)) {
if (++cinfo->progress->pass_counter >= cinfo->progress->pass_limit) {
/* startup underestimated number of scans; ratchet up one scan */
- cinfo->progress->pass_limit += (long) cinfo->total_iMCU_rows;
+ cinfo->progress->pass_limit += (long)cinfo->total_iMCU_rows;
}
}
}
@@ -97,7 +98,7 @@ jpeg_read_coefficients (j_decompress_ptr cinfo)
*/
LOCAL(void)
-transdecode_master_selection (j_decompress_ptr cinfo)
+transdecode_master_selection(j_decompress_ptr cinfo)
{
/* This is effectively a buffered-image operation. */
cinfo->buffered_image = TRUE;
@@ -129,7 +130,7 @@ transdecode_master_selection (j_decompress_ptr cinfo)
jinit_d_coef_controller(cinfo, TRUE);
/* We can now tell the memory manager to allocate virtual arrays. */
- (*cinfo->mem->realize_virt_arrays) ((j_common_ptr) cinfo);
+ (*cinfo->mem->realize_virt_arrays) ((j_common_ptr)cinfo);
/* Initialize input side of decompressor to consume first scan. */
(*cinfo->inputctl->start_input_pass) (cinfo);
@@ -148,7 +149,7 @@ transdecode_master_selection (j_decompress_ptr cinfo)
nscans = 1;
}
cinfo->progress->pass_counter = 0L;
- cinfo->progress->pass_limit = (long) cinfo->total_iMCU_rows * nscans;
+ cinfo->progress->pass_limit = (long)cinfo->total_iMCU_rows * nscans;
cinfo->progress->completed_passes = 0;
cinfo->progress->total_passes = 1;
}
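
jpeg_read_coefficients(), reformatted above, is the entry point for DCT-domain transcoding: the caller decodes coefficient arrays and re-emits them without a lossy decompress/recompress round trip. A minimal sketch of that flow, with error handling and option setup elided, follows; all calls are the standard libjpeg API.

#include <stdio.h>
#include <stdlib.h>
#include <jpeglib.h>

/* Lossless DCT-domain transcode: read coefficients, write them back out. */
int transcode(FILE *in, FILE *out)
{
  struct jpeg_decompress_struct srcinfo;
  struct jpeg_compress_struct dstinfo;
  struct jpeg_error_mgr jsrcerr, jdsterr;
  jvirt_barray_ptr *coef_arrays;

  srcinfo.err = jpeg_std_error(&jsrcerr);
  jpeg_create_decompress(&srcinfo);
  dstinfo.err = jpeg_std_error(&jdsterr);
  jpeg_create_compress(&dstinfo);

  jpeg_stdio_src(&srcinfo, in);
  (void)jpeg_read_header(&srcinfo, TRUE);
  coef_arrays = jpeg_read_coefficients(&srcinfo);   /* the routine above */

  jpeg_copy_critical_parameters(&srcinfo, &dstinfo);
  jpeg_stdio_dest(&dstinfo, out);
  jpeg_write_coefficients(&dstinfo, coef_arrays);   /* same arrays, no IDCT */

  jpeg_finish_compress(&dstinfo);
  jpeg_destroy_compress(&dstinfo);
  (void)jpeg_finish_decompress(&srcinfo);
  jpeg_destroy_decompress(&srcinfo);
  return 0;
}

This is the same shape jpegtran uses; the coefficient arrays stay in the decompressor's memory pool until jpeg_finish_decompress().
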
diff --git a/media/libjpeg/jerror.c b/media/libjpeg/jerror.c
index c31acd9ef0..d544702937 100644
--- a/media/libjpeg/jerror.c
+++ b/media/libjpeg/jerror.c
@@ -3,8 +3,8 @@
*
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1991-1998, Thomas G. Lane.
- * It was modified by The libjpeg-turbo Project to include only code relevant
- * to libjpeg-turbo.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2022, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@@ -44,7 +44,7 @@
* want to refer to it directly.
*/
-#define JMESSAGE(code,string) string ,
+#define JMESSAGE(code, string) string,
const char * const jpeg_std_message_table[] = {
#include "jerror.h"
@@ -66,7 +66,7 @@ const char * const jpeg_std_message_table[] = {
*/
METHODDEF(void)
-error_exit (j_common_ptr cinfo)
+error_exit(j_common_ptr cinfo)
{
/* Always display the message */
(*cinfo->err->output_message) (cinfo);
@@ -94,7 +94,7 @@ error_exit (j_common_ptr cinfo)
*/
METHODDEF(void)
-output_message (j_common_ptr cinfo)
+output_message(j_common_ptr cinfo)
{
char buffer[JMSG_LENGTH_MAX];
@@ -124,7 +124,7 @@ output_message (j_common_ptr cinfo)
*/
METHODDEF(void)
-emit_message (j_common_ptr cinfo, int msg_level)
+emit_message(j_common_ptr cinfo, int msg_level)
{
struct jpeg_error_mgr *err = cinfo->err;
@@ -153,7 +153,7 @@ emit_message (j_common_ptr cinfo, int msg_level)
*/
METHODDEF(void)
-format_message (j_common_ptr cinfo, char *buffer)
+format_message(j_common_ptr cinfo, char *buffer)
{
struct jpeg_error_mgr *err = cinfo->err;
int msg_code = err->msg_code;
@@ -189,13 +189,13 @@ format_message (j_common_ptr cinfo, char *buffer)
/* Format the message into the passed buffer */
if (isstring)
- sprintf(buffer, msgtext, err->msg_parm.s);
+ snprintf(buffer, JMSG_LENGTH_MAX, msgtext, err->msg_parm.s);
else
- sprintf(buffer, msgtext,
- err->msg_parm.i[0], err->msg_parm.i[1],
- err->msg_parm.i[2], err->msg_parm.i[3],
- err->msg_parm.i[4], err->msg_parm.i[5],
- err->msg_parm.i[6], err->msg_parm.i[7]);
+ snprintf(buffer, JMSG_LENGTH_MAX, msgtext,
+ err->msg_parm.i[0], err->msg_parm.i[1],
+ err->msg_parm.i[2], err->msg_parm.i[3],
+ err->msg_parm.i[4], err->msg_parm.i[5],
+ err->msg_parm.i[6], err->msg_parm.i[7]);
}
@@ -208,7 +208,7 @@ format_message (j_common_ptr cinfo, char *buffer)
*/
METHODDEF(void)
-reset_error_mgr (j_common_ptr cinfo)
+reset_error_mgr(j_common_ptr cinfo)
{
cinfo->err->num_warnings = 0;
/* trace_level is not reset since it is an application-supplied parameter */
@@ -227,7 +227,7 @@ reset_error_mgr (j_common_ptr cinfo)
*/
GLOBAL(struct jpeg_error_mgr *)
-jpeg_std_error (struct jpeg_error_mgr *err)
+jpeg_std_error(struct jpeg_error_mgr *err)
{
err->error_exit = error_exit;
err->emit_message = emit_message;
@@ -241,7 +241,7 @@ jpeg_std_error (struct jpeg_error_mgr *err)
/* Initialize message table pointers */
err->jpeg_message_table = jpeg_std_message_table;
- err->last_jpeg_message = (int) JMSG_LASTMSGCODE - 1;
+ err->last_jpeg_message = (int)JMSG_LASTMSGCODE - 1;
err->addon_message_table = NULL;
err->first_addon_message = 0; /* for safety */
diff --git a/media/libjpeg/jerror.h b/media/libjpeg/jerror.h
index 11a07cb5d0..eb44a1140a 100644
--- a/media/libjpeg/jerror.h
+++ b/media/libjpeg/jerror.h
@@ -5,7 +5,7 @@
* Copyright (C) 1994-1997, Thomas G. Lane.
* Modified 1997-2009 by Guido Vollbeding.
* libjpeg-turbo Modifications:
- * Copyright (C) 2014, D. R. Commander.
+ * Copyright (C) 2014, 2017, 2021-2022, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@@ -28,7 +28,7 @@
#define JMAKE_ENUM_LIST
#else
/* Repeated inclusions of this file are no-ops unless JMESSAGE is defined */
-#define JMESSAGE(code,string)
+#define JMESSAGE(code, string)
#endif /* JERROR_H */
#endif /* JMESSAGE */
@@ -36,7 +36,7 @@
typedef enum {
-#define JMESSAGE(code,string) code ,
+#define JMESSAGE(code, string) code,
#endif /* JMAKE_ENUM_LIST */
@@ -44,8 +44,7 @@ JMESSAGE(JMSG_NOMESSAGE, "Bogus message code %d") /* Must be first entry! */
/* For maintenance convenience, list is alphabetical by message code name */
#if JPEG_LIB_VERSION < 70
-JMESSAGE(JERR_ARITH_NOTIMPL,
- "Sorry, arithmetic coding is not implemented")
+JMESSAGE(JERR_ARITH_NOTIMPL, "Sorry, arithmetic coding is not implemented")
#endif
JMESSAGE(JERR_BAD_ALIGN_TYPE, "ALIGN_TYPE is wrong, please fix")
JMESSAGE(JERR_BAD_ALLOC_CHUNK, "MAX_ALLOC_CHUNK is wrong, please fix")
@@ -104,7 +103,7 @@ JMESSAGE(JERR_MISMATCHED_QUANT_TABLE,
"Cannot transcode due to multiple use of quantization table %d")
JMESSAGE(JERR_MISSING_DATA, "Scan script does not transmit all data")
JMESSAGE(JERR_MODE_CHANGE, "Invalid color quantization mode change")
-JMESSAGE(JERR_NOTIMPL, "Not implemented yet")
+JMESSAGE(JERR_NOTIMPL, "Requested features are incompatible")
JMESSAGE(JERR_NOT_COMPILED, "Requested feature was omitted at compile time")
#if JPEG_LIB_VERSION >= 70
JMESSAGE(JERR_NO_ARITH_TABLE, "Arithmetic table 0x%02x was not defined")
@@ -154,8 +153,7 @@ JMESSAGE(JTRC_HUFFBITS, " %3d %3d %3d %3d %3d %3d %3d %3d")
JMESSAGE(JTRC_JFIF, "JFIF APP0 marker: version %d.%02d, density %dx%d %d")
JMESSAGE(JTRC_JFIF_BADTHUMBNAILSIZE,
"Warning: thumbnail image size does not match data length %u")
-JMESSAGE(JTRC_JFIF_EXTENSION,
- "JFIF extension marker: type 0x%02x, length %u")
+JMESSAGE(JTRC_JFIF_EXTENSION, "JFIF extension marker: type 0x%02x, length %u")
JMESSAGE(JTRC_JFIF_THUMBNAIL, " with %d x %d thumbnail image")
JMESSAGE(JTRC_MISC_MARKER, "Miscellaneous marker 0x%02x, length %u")
JMESSAGE(JTRC_PARMLESS_MARKER, "Unexpected marker 0x%02x")
@@ -208,6 +206,11 @@ JMESSAGE(JERR_NO_ARITH_TABLE, "Arithmetic table 0x%02x was not defined")
JMESSAGE(JWRN_ARITH_BAD_CODE, "Corrupt JPEG data: bad arithmetic code")
#endif
#endif
+JMESSAGE(JWRN_BOGUS_ICC, "Corrupt JPEG data: bad ICC marker")
+#if JPEG_LIB_VERSION < 70
+JMESSAGE(JERR_BAD_DROP_SAMPLING,
+ "Component index %d: mismatching sampling ratio %d:%d, %d:%d, %c")
+#endif
#ifdef JMAKE_ENUM_LIST
@@ -228,90 +231,101 @@ JMESSAGE(JWRN_ARITH_BAD_CODE, "Corrupt JPEG data: bad arithmetic code")
/* The first parameter is either type of cinfo pointer */
/* Fatal errors (print message and exit) */
-#define ERREXIT(cinfo,code) \
+#define ERREXIT(cinfo, code) \
((cinfo)->err->msg_code = (code), \
- (*(cinfo)->err->error_exit) ((j_common_ptr) (cinfo)))
-#define ERREXIT1(cinfo,code,p1) \
+ (*(cinfo)->err->error_exit) ((j_common_ptr)(cinfo)))
+#define ERREXIT1(cinfo, code, p1) \
((cinfo)->err->msg_code = (code), \
(cinfo)->err->msg_parm.i[0] = (p1), \
- (*(cinfo)->err->error_exit) ((j_common_ptr) (cinfo)))
-#define ERREXIT2(cinfo,code,p1,p2) \
+ (*(cinfo)->err->error_exit) ((j_common_ptr)(cinfo)))
+#define ERREXIT2(cinfo, code, p1, p2) \
((cinfo)->err->msg_code = (code), \
(cinfo)->err->msg_parm.i[0] = (p1), \
(cinfo)->err->msg_parm.i[1] = (p2), \
- (*(cinfo)->err->error_exit) ((j_common_ptr) (cinfo)))
-#define ERREXIT3(cinfo,code,p1,p2,p3) \
+ (*(cinfo)->err->error_exit) ((j_common_ptr)(cinfo)))
+#define ERREXIT3(cinfo, code, p1, p2, p3) \
((cinfo)->err->msg_code = (code), \
(cinfo)->err->msg_parm.i[0] = (p1), \
(cinfo)->err->msg_parm.i[1] = (p2), \
(cinfo)->err->msg_parm.i[2] = (p3), \
- (*(cinfo)->err->error_exit) ((j_common_ptr) (cinfo)))
-#define ERREXIT4(cinfo,code,p1,p2,p3,p4) \
+ (*(cinfo)->err->error_exit) ((j_common_ptr)(cinfo)))
+#define ERREXIT4(cinfo, code, p1, p2, p3, p4) \
+ ((cinfo)->err->msg_code = (code), \
+ (cinfo)->err->msg_parm.i[0] = (p1), \
+ (cinfo)->err->msg_parm.i[1] = (p2), \
+ (cinfo)->err->msg_parm.i[2] = (p3), \
+ (cinfo)->err->msg_parm.i[3] = (p4), \
+ (*(cinfo)->err->error_exit) ((j_common_ptr)(cinfo)))
+#define ERREXIT6(cinfo, code, p1, p2, p3, p4, p5, p6) \
((cinfo)->err->msg_code = (code), \
(cinfo)->err->msg_parm.i[0] = (p1), \
(cinfo)->err->msg_parm.i[1] = (p2), \
(cinfo)->err->msg_parm.i[2] = (p3), \
(cinfo)->err->msg_parm.i[3] = (p4), \
- (*(cinfo)->err->error_exit) ((j_common_ptr) (cinfo)))
-#define ERREXITS(cinfo,code,str) \
+ (cinfo)->err->msg_parm.i[4] = (p5), \
+ (cinfo)->err->msg_parm.i[5] = (p6), \
+ (*(cinfo)->err->error_exit) ((j_common_ptr)(cinfo)))
+#define ERREXITS(cinfo, code, str) \
((cinfo)->err->msg_code = (code), \
strncpy((cinfo)->err->msg_parm.s, (str), JMSG_STR_PARM_MAX), \
- (*(cinfo)->err->error_exit) ((j_common_ptr) (cinfo)))
+ (cinfo)->err->msg_parm.s[JMSG_STR_PARM_MAX - 1] = '\0', \
+ (*(cinfo)->err->error_exit) ((j_common_ptr)(cinfo)))
#define MAKESTMT(stuff) do { stuff } while (0)
/* Nonfatal errors (we can keep going, but the data is probably corrupt) */
-#define WARNMS(cinfo,code) \
+#define WARNMS(cinfo, code) \
((cinfo)->err->msg_code = (code), \
- (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), -1))
-#define WARNMS1(cinfo,code,p1) \
+ (*(cinfo)->err->emit_message) ((j_common_ptr)(cinfo), -1))
+#define WARNMS1(cinfo, code, p1) \
((cinfo)->err->msg_code = (code), \
(cinfo)->err->msg_parm.i[0] = (p1), \
- (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), -1))
-#define WARNMS2(cinfo,code,p1,p2) \
+ (*(cinfo)->err->emit_message) ((j_common_ptr)(cinfo), -1))
+#define WARNMS2(cinfo, code, p1, p2) \
((cinfo)->err->msg_code = (code), \
(cinfo)->err->msg_parm.i[0] = (p1), \
(cinfo)->err->msg_parm.i[1] = (p2), \
- (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), -1))
+ (*(cinfo)->err->emit_message) ((j_common_ptr)(cinfo), -1))
/* Informational/debugging messages */
-#define TRACEMS(cinfo,lvl,code) \
+#define TRACEMS(cinfo, lvl, code) \
((cinfo)->err->msg_code = (code), \
- (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), (lvl)))
-#define TRACEMS1(cinfo,lvl,code,p1) \
+ (*(cinfo)->err->emit_message) ((j_common_ptr)(cinfo), (lvl)))
+#define TRACEMS1(cinfo, lvl, code, p1) \
((cinfo)->err->msg_code = (code), \
(cinfo)->err->msg_parm.i[0] = (p1), \
- (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), (lvl)))
-#define TRACEMS2(cinfo,lvl,code,p1,p2) \
+ (*(cinfo)->err->emit_message) ((j_common_ptr)(cinfo), (lvl)))
+#define TRACEMS2(cinfo, lvl, code, p1, p2) \
((cinfo)->err->msg_code = (code), \
(cinfo)->err->msg_parm.i[0] = (p1), \
(cinfo)->err->msg_parm.i[1] = (p2), \
- (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), (lvl)))
-#define TRACEMS3(cinfo,lvl,code,p1,p2,p3) \
- MAKESTMT(int * _mp = (cinfo)->err->msg_parm.i; \
- _mp[0] = (p1); _mp[1] = (p2); _mp[2] = (p3); \
+ (*(cinfo)->err->emit_message) ((j_common_ptr)(cinfo), (lvl)))
+#define TRACEMS3(cinfo, lvl, code, p1, p2, p3) \
+ MAKESTMT(int *_mp = (cinfo)->err->msg_parm.i; \
+ _mp[0] = (p1); _mp[1] = (p2); _mp[2] = (p3); \
(cinfo)->err->msg_code = (code); \
- (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), (lvl)); )
-#define TRACEMS4(cinfo,lvl,code,p1,p2,p3,p4) \
- MAKESTMT(int * _mp = (cinfo)->err->msg_parm.i; \
- _mp[0] = (p1); _mp[1] = (p2); _mp[2] = (p3); _mp[3] = (p4); \
+ (*(cinfo)->err->emit_message) ((j_common_ptr)(cinfo), (lvl)); )
+#define TRACEMS4(cinfo, lvl, code, p1, p2, p3, p4) \
+ MAKESTMT(int *_mp = (cinfo)->err->msg_parm.i; \
+ _mp[0] = (p1); _mp[1] = (p2); _mp[2] = (p3); _mp[3] = (p4); \
(cinfo)->err->msg_code = (code); \
- (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), (lvl)); )
-#define TRACEMS5(cinfo,lvl,code,p1,p2,p3,p4,p5) \
- MAKESTMT(int * _mp = (cinfo)->err->msg_parm.i; \
- _mp[0] = (p1); _mp[1] = (p2); _mp[2] = (p3); _mp[3] = (p4); \
+ (*(cinfo)->err->emit_message) ((j_common_ptr)(cinfo), (lvl)); )
+#define TRACEMS5(cinfo, lvl, code, p1, p2, p3, p4, p5) \
+ MAKESTMT(int *_mp = (cinfo)->err->msg_parm.i; \
+ _mp[0] = (p1); _mp[1] = (p2); _mp[2] = (p3); _mp[3] = (p4); \
_mp[4] = (p5); \
(cinfo)->err->msg_code = (code); \
- (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), (lvl)); )
-#define TRACEMS8(cinfo,lvl,code,p1,p2,p3,p4,p5,p6,p7,p8) \
- MAKESTMT(int * _mp = (cinfo)->err->msg_parm.i; \
- _mp[0] = (p1); _mp[1] = (p2); _mp[2] = (p3); _mp[3] = (p4); \
- _mp[4] = (p5); _mp[5] = (p6); _mp[6] = (p7); _mp[7] = (p8); \
+ (*(cinfo)->err->emit_message) ((j_common_ptr)(cinfo), (lvl)); )
+#define TRACEMS8(cinfo, lvl, code, p1, p2, p3, p4, p5, p6, p7, p8) \
+ MAKESTMT(int *_mp = (cinfo)->err->msg_parm.i; \
+ _mp[0] = (p1); _mp[1] = (p2); _mp[2] = (p3); _mp[3] = (p4); \
+ _mp[4] = (p5); _mp[5] = (p6); _mp[6] = (p7); _mp[7] = (p8); \
(cinfo)->err->msg_code = (code); \
- (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), (lvl)); )
-#define TRACEMSS(cinfo,lvl,code,str) \
+ (*(cinfo)->err->emit_message) ((j_common_ptr)(cinfo), (lvl)); )
+#define TRACEMSS(cinfo, lvl, code, str) \
((cinfo)->err->msg_code = (code), \
strncpy((cinfo)->err->msg_parm.s, (str), JMSG_STR_PARM_MAX), \
- (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), (lvl)))
+ (cinfo)->err->msg_parm.s[JMSG_STR_PARM_MAX - 1] = '\0', \
+ (*(cinfo)->err->emit_message) ((j_common_ptr)(cinfo), (lvl)))
#endif /* JERROR_H */
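
Besides the whitespace churn, ERREXITS and TRACEMSS above gain one real fix: an explicit NUL terminator after strncpy(). strncpy() leaves the destination unterminated whenever the source fills it, so a long string parameter could previously make later reads of msg_parm.s run off the end. A short sketch, using the library's 80-byte string-parameter size:

#include <stdio.h>
#include <string.h>

#define JMSG_STR_PARM_MAX 80  /* size of the msg_parm.s field */

int main(void)
{
  char s[JMSG_STR_PARM_MAX];
  const char *longstr =
    "a source string comfortably longer than the eighty-byte destination, "
    "which strncpy will fill completely without appending a terminator";

  strncpy(s, longstr, JMSG_STR_PARM_MAX);
  s[JMSG_STR_PARM_MAX - 1] = '\0';   /* the termination the macros now force */
  printf("%s\n", s);                 /* safe: always NUL-terminated */
  return 0;
}
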
diff --git a/media/libjpeg/jfdctflt.c b/media/libjpeg/jfdctflt.c
index b3da3ebda8..ab6f6d0825 100644
--- a/media/libjpeg/jfdctflt.c
+++ b/media/libjpeg/jfdctflt.c
@@ -57,7 +57,7 @@
*/
GLOBAL(void)
-jpeg_fdct_float (FAST_FLOAT *data)
+jpeg_fdct_float(FAST_FLOAT *data)
{
FAST_FLOAT tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
FAST_FLOAT tmp10, tmp11, tmp12, tmp13;
@@ -68,7 +68,7 @@ jpeg_fdct_float (FAST_FLOAT *data)
/* Pass 1: process rows. */
dataptr = data;
- for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
+ for (ctr = DCTSIZE - 1; ctr >= 0; ctr--) {
tmp0 = dataptr[0] + dataptr[7];
tmp7 = dataptr[0] - dataptr[7];
tmp1 = dataptr[1] + dataptr[6];
@@ -88,7 +88,7 @@ jpeg_fdct_float (FAST_FLOAT *data)
dataptr[0] = tmp10 + tmp11; /* phase 3 */
dataptr[4] = tmp10 - tmp11;
- z1 = (tmp12 + tmp13) * ((FAST_FLOAT) 0.707106781); /* c4 */
+ z1 = (tmp12 + tmp13) * ((FAST_FLOAT)0.707106781); /* c4 */
dataptr[2] = tmp13 + z1; /* phase 5 */
dataptr[6] = tmp13 - z1;
@@ -99,10 +99,10 @@ jpeg_fdct_float (FAST_FLOAT *data)
tmp12 = tmp6 + tmp7;
/* The rotator is modified from fig 4-8 to avoid extra negations. */
- z5 = (tmp10 - tmp12) * ((FAST_FLOAT) 0.382683433); /* c6 */
- z2 = ((FAST_FLOAT) 0.541196100) * tmp10 + z5; /* c2-c6 */
- z4 = ((FAST_FLOAT) 1.306562965) * tmp12 + z5; /* c2+c6 */
- z3 = tmp11 * ((FAST_FLOAT) 0.707106781); /* c4 */
+ z5 = (tmp10 - tmp12) * ((FAST_FLOAT)0.382683433); /* c6 */
+ z2 = ((FAST_FLOAT)0.541196100) * tmp10 + z5; /* c2-c6 */
+ z4 = ((FAST_FLOAT)1.306562965) * tmp12 + z5; /* c2+c6 */
+ z3 = tmp11 * ((FAST_FLOAT)0.707106781); /* c4 */
z11 = tmp7 + z3; /* phase 5 */
z13 = tmp7 - z3;
@@ -118,15 +118,15 @@ jpeg_fdct_float (FAST_FLOAT *data)
/* Pass 2: process columns. */
dataptr = data;
- for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
- tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
- tmp7 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
- tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
- tmp6 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
- tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];
- tmp5 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
- tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
- tmp4 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
+ for (ctr = DCTSIZE - 1; ctr >= 0; ctr--) {
+ tmp0 = dataptr[DCTSIZE * 0] + dataptr[DCTSIZE * 7];
+ tmp7 = dataptr[DCTSIZE * 0] - dataptr[DCTSIZE * 7];
+ tmp1 = dataptr[DCTSIZE * 1] + dataptr[DCTSIZE * 6];
+ tmp6 = dataptr[DCTSIZE * 1] - dataptr[DCTSIZE * 6];
+ tmp2 = dataptr[DCTSIZE * 2] + dataptr[DCTSIZE * 5];
+ tmp5 = dataptr[DCTSIZE * 2] - dataptr[DCTSIZE * 5];
+ tmp3 = dataptr[DCTSIZE * 3] + dataptr[DCTSIZE * 4];
+ tmp4 = dataptr[DCTSIZE * 3] - dataptr[DCTSIZE * 4];
/* Even part */
@@ -135,12 +135,12 @@ jpeg_fdct_float (FAST_FLOAT *data)
tmp11 = tmp1 + tmp2;
tmp12 = tmp1 - tmp2;
- dataptr[DCTSIZE*0] = tmp10 + tmp11; /* phase 3 */
- dataptr[DCTSIZE*4] = tmp10 - tmp11;
+ dataptr[DCTSIZE * 0] = tmp10 + tmp11; /* phase 3 */
+ dataptr[DCTSIZE * 4] = tmp10 - tmp11;
- z1 = (tmp12 + tmp13) * ((FAST_FLOAT) 0.707106781); /* c4 */
- dataptr[DCTSIZE*2] = tmp13 + z1; /* phase 5 */
- dataptr[DCTSIZE*6] = tmp13 - z1;
+ z1 = (tmp12 + tmp13) * ((FAST_FLOAT)0.707106781); /* c4 */
+ dataptr[DCTSIZE * 2] = tmp13 + z1; /* phase 5 */
+ dataptr[DCTSIZE * 6] = tmp13 - z1;
/* Odd part */
@@ -149,18 +149,18 @@ jpeg_fdct_float (FAST_FLOAT *data)
tmp12 = tmp6 + tmp7;
/* The rotator is modified from fig 4-8 to avoid extra negations. */
- z5 = (tmp10 - tmp12) * ((FAST_FLOAT) 0.382683433); /* c6 */
- z2 = ((FAST_FLOAT) 0.541196100) * tmp10 + z5; /* c2-c6 */
- z4 = ((FAST_FLOAT) 1.306562965) * tmp12 + z5; /* c2+c6 */
- z3 = tmp11 * ((FAST_FLOAT) 0.707106781); /* c4 */
+ z5 = (tmp10 - tmp12) * ((FAST_FLOAT)0.382683433); /* c6 */
+ z2 = ((FAST_FLOAT)0.541196100) * tmp10 + z5; /* c2-c6 */
+ z4 = ((FAST_FLOAT)1.306562965) * tmp12 + z5; /* c2+c6 */
+ z3 = tmp11 * ((FAST_FLOAT)0.707106781); /* c4 */
z11 = tmp7 + z3; /* phase 5 */
z13 = tmp7 - z3;
- dataptr[DCTSIZE*5] = z13 + z2; /* phase 6 */
- dataptr[DCTSIZE*3] = z13 - z2;
- dataptr[DCTSIZE*1] = z11 + z4;
- dataptr[DCTSIZE*7] = z11 - z4;
+ dataptr[DCTSIZE * 5] = z13 + z2; /* phase 6 */
+ dataptr[DCTSIZE * 3] = z13 - z2;
+ dataptr[DCTSIZE * 1] = z11 + z4;
+ dataptr[DCTSIZE * 7] = z11 - z4;
dataptr++; /* advance pointer to next column */
}
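
The magic constants in the float FDCT above (0.707106781, 0.382683433, 0.541196100, 1.306562965) are cosines of k*pi/16, per the AAN factorization the comments cite. A quick sketch recomputing them, just to make the c2/c4/c6 annotations concrete:

#include <math.h>
#include <stdio.h>

#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif

int main(void)
{
  double c2 = cos(2 * M_PI / 16);
  double c4 = cos(4 * M_PI / 16);
  double c6 = cos(6 * M_PI / 16);

  printf("c4      = %.9f  (0.707106781)\n", c4);
  printf("c6      = %.9f  (0.382683433)\n", c6);
  printf("c2 - c6 = %.9f  (0.541196100)\n", c2 - c6);
  printf("c2 + c6 = %.9f  (1.306562965)\n", c2 + c6);
  return 0;
}
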
diff --git a/media/libjpeg/jfdctfst.c b/media/libjpeg/jfdctfst.c
index 5cd83a7b8e..4c9ce0de8f 100644
--- a/media/libjpeg/jfdctfst.c
+++ b/media/libjpeg/jfdctfst.c
@@ -79,10 +79,10 @@
*/
#if CONST_BITS == 8
-#define FIX_0_382683433 ((JLONG) 98) /* FIX(0.382683433) */
-#define FIX_0_541196100 ((JLONG) 139) /* FIX(0.541196100) */
-#define FIX_0_707106781 ((JLONG) 181) /* FIX(0.707106781) */
-#define FIX_1_306562965 ((JLONG) 334) /* FIX(1.306562965) */
+#define FIX_0_382683433 ((JLONG)98) /* FIX(0.382683433) */
+#define FIX_0_541196100 ((JLONG)139) /* FIX(0.541196100) */
+#define FIX_0_707106781 ((JLONG)181) /* FIX(0.707106781) */
+#define FIX_1_306562965 ((JLONG)334) /* FIX(1.306562965) */
#else
#define FIX_0_382683433 FIX(0.382683433)
#define FIX_0_541196100 FIX(0.541196100)
@@ -98,7 +98,7 @@
#ifndef USE_ACCURATE_ROUNDING
#undef DESCALE
-#define DESCALE(x,n) RIGHT_SHIFT(x, n)
+#define DESCALE(x, n) RIGHT_SHIFT(x, n)
#endif
@@ -106,7 +106,7 @@
* descale to yield a DCTELEM result.
*/
-#define MULTIPLY(var,const) ((DCTELEM) DESCALE((var) * (const), CONST_BITS))
+#define MULTIPLY(var, const) ((DCTELEM)DESCALE((var) * (const), CONST_BITS))
/*
@@ -114,7 +114,7 @@
*/
GLOBAL(void)
-jpeg_fdct_ifast (DCTELEM *data)
+jpeg_fdct_ifast(DCTELEM *data)
{
DCTELEM tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
DCTELEM tmp10, tmp11, tmp12, tmp13;
@@ -126,7 +126,7 @@ jpeg_fdct_ifast (DCTELEM *data)
/* Pass 1: process rows. */
dataptr = data;
- for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
+ for (ctr = DCTSIZE - 1; ctr >= 0; ctr--) {
tmp0 = dataptr[0] + dataptr[7];
tmp7 = dataptr[0] - dataptr[7];
tmp1 = dataptr[1] + dataptr[6];
@@ -176,15 +176,15 @@ jpeg_fdct_ifast (DCTELEM *data)
/* Pass 2: process columns. */
dataptr = data;
- for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
- tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
- tmp7 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
- tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
- tmp6 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
- tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];
- tmp5 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
- tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
- tmp4 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
+ for (ctr = DCTSIZE - 1; ctr >= 0; ctr--) {
+ tmp0 = dataptr[DCTSIZE * 0] + dataptr[DCTSIZE * 7];
+ tmp7 = dataptr[DCTSIZE * 0] - dataptr[DCTSIZE * 7];
+ tmp1 = dataptr[DCTSIZE * 1] + dataptr[DCTSIZE * 6];
+ tmp6 = dataptr[DCTSIZE * 1] - dataptr[DCTSIZE * 6];
+ tmp2 = dataptr[DCTSIZE * 2] + dataptr[DCTSIZE * 5];
+ tmp5 = dataptr[DCTSIZE * 2] - dataptr[DCTSIZE * 5];
+ tmp3 = dataptr[DCTSIZE * 3] + dataptr[DCTSIZE * 4];
+ tmp4 = dataptr[DCTSIZE * 3] - dataptr[DCTSIZE * 4];
/* Even part */
@@ -193,12 +193,12 @@ jpeg_fdct_ifast (DCTELEM *data)
tmp11 = tmp1 + tmp2;
tmp12 = tmp1 - tmp2;
- dataptr[DCTSIZE*0] = tmp10 + tmp11; /* phase 3 */
- dataptr[DCTSIZE*4] = tmp10 - tmp11;
+ dataptr[DCTSIZE * 0] = tmp10 + tmp11; /* phase 3 */
+ dataptr[DCTSIZE * 4] = tmp10 - tmp11;
z1 = MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */
- dataptr[DCTSIZE*2] = tmp13 + z1; /* phase 5 */
- dataptr[DCTSIZE*6] = tmp13 - z1;
+ dataptr[DCTSIZE * 2] = tmp13 + z1; /* phase 5 */
+ dataptr[DCTSIZE * 6] = tmp13 - z1;
/* Odd part */
@@ -215,10 +215,10 @@ jpeg_fdct_ifast (DCTELEM *data)
z11 = tmp7 + z3; /* phase 5 */
z13 = tmp7 - z3;
- dataptr[DCTSIZE*5] = z13 + z2; /* phase 6 */
- dataptr[DCTSIZE*3] = z13 - z2;
- dataptr[DCTSIZE*1] = z11 + z4;
- dataptr[DCTSIZE*7] = z11 - z4;
+ dataptr[DCTSIZE * 5] = z13 + z2; /* phase 6 */
+ dataptr[DCTSIZE * 3] = z13 - z2;
+ dataptr[DCTSIZE * 1] = z11 + z4;
+ dataptr[DCTSIZE * 7] = z11 - z4;
dataptr++; /* advance pointer to next column */
}
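
The ifast FDCT's MULTIPLY above is fixed-point arithmetic at CONST_BITS == 8: constants are pre-scaled by 2^8 (so FIX(0.707106781) becomes 181), the product carries eight extra fraction bits, and DESCALE shifts them back out. A minimal sketch of that round trip, using the rounding form of DESCALE as an assumption:

#include <stdio.h>

#define CONST_BITS 8
#define DESCALE(x, n) (((x) + (1L << ((n) - 1))) >> (n))  /* round-to-nearest */
#define MULTIPLY(var, c) ((long)DESCALE((long)(var) * (c), CONST_BITS))

int main(void)
{
  /* FIX(0.707106781) at 8 fraction bits: round(0.707106781 * 256) == 181 */
  long FIX_0_707106781 = 181;

  /* 100 * 0.7071... is about 70.7; the fixed-point product descales to 71. */
  printf("%ld\n", MULTIPLY(100, FIX_0_707106781));
  return 0;
}

Only eight fraction bits fit because the operands must stay within a 16x16-bit multiply on the targets this path serves.
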
diff --git a/media/libjpeg/jfdctint.c b/media/libjpeg/jfdctint.c
index 169bb942ce..c95a3a7fb8 100644
--- a/media/libjpeg/jfdctint.c
+++ b/media/libjpeg/jfdctint.c
@@ -1,14 +1,14 @@
/*
* jfdctint.c
*
- * This file was part of the Independent JPEG Group's software.
+ * This file was part of the Independent JPEG Group's software:
* Copyright (C) 1991-1996, Thomas G. Lane.
* libjpeg-turbo Modifications:
- * Copyright (C) 2015, D. R. Commander.
+ * Copyright (C) 2015, 2020, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
- * This file contains a slow-but-accurate integer implementation of the
+ * This file contains a slower but more accurate integer implementation of the
* forward DCT (Discrete Cosine Transform).
*
* A 2-D DCT can be done by 1-D DCT on each row followed by 1-D DCT
@@ -93,18 +93,18 @@
*/
#if CONST_BITS == 13
-#define FIX_0_298631336 ((JLONG) 2446) /* FIX(0.298631336) */
-#define FIX_0_390180644 ((JLONG) 3196) /* FIX(0.390180644) */
-#define FIX_0_541196100 ((JLONG) 4433) /* FIX(0.541196100) */
-#define FIX_0_765366865 ((JLONG) 6270) /* FIX(0.765366865) */
-#define FIX_0_899976223 ((JLONG) 7373) /* FIX(0.899976223) */
-#define FIX_1_175875602 ((JLONG) 9633) /* FIX(1.175875602) */
-#define FIX_1_501321110 ((JLONG) 12299) /* FIX(1.501321110) */
-#define FIX_1_847759065 ((JLONG) 15137) /* FIX(1.847759065) */
-#define FIX_1_961570560 ((JLONG) 16069) /* FIX(1.961570560) */
-#define FIX_2_053119869 ((JLONG) 16819) /* FIX(2.053119869) */
-#define FIX_2_562915447 ((JLONG) 20995) /* FIX(2.562915447) */
-#define FIX_3_072711026 ((JLONG) 25172) /* FIX(3.072711026) */
+#define FIX_0_298631336 ((JLONG)2446) /* FIX(0.298631336) */
+#define FIX_0_390180644 ((JLONG)3196) /* FIX(0.390180644) */
+#define FIX_0_541196100 ((JLONG)4433) /* FIX(0.541196100) */
+#define FIX_0_765366865 ((JLONG)6270) /* FIX(0.765366865) */
+#define FIX_0_899976223 ((JLONG)7373) /* FIX(0.899976223) */
+#define FIX_1_175875602 ((JLONG)9633) /* FIX(1.175875602) */
+#define FIX_1_501321110 ((JLONG)12299) /* FIX(1.501321110) */
+#define FIX_1_847759065 ((JLONG)15137) /* FIX(1.847759065) */
+#define FIX_1_961570560 ((JLONG)16069) /* FIX(1.961570560) */
+#define FIX_2_053119869 ((JLONG)16819) /* FIX(2.053119869) */
+#define FIX_2_562915447 ((JLONG)20995) /* FIX(2.562915447) */
+#define FIX_3_072711026 ((JLONG)25172) /* FIX(3.072711026) */
#else
#define FIX_0_298631336 FIX(0.298631336)
#define FIX_0_390180644 FIX(0.390180644)
@@ -129,9 +129,9 @@
*/
#if BITS_IN_JSAMPLE == 8
-#define MULTIPLY(var,const) MULTIPLY16C16(var,const)
+#define MULTIPLY(var, const) MULTIPLY16C16(var, const)
#else
-#define MULTIPLY(var,const) ((var) * (const))
+#define MULTIPLY(var, const) ((var) * (const))
#endif
@@ -140,7 +140,7 @@
*/
GLOBAL(void)
-jpeg_fdct_islow (DCTELEM *data)
+jpeg_fdct_islow(DCTELEM *data)
{
JLONG tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
JLONG tmp10, tmp11, tmp12, tmp13;
@@ -154,7 +154,7 @@ jpeg_fdct_islow (DCTELEM *data)
/* furthermore, we scale the results by 2**PASS1_BITS. */
dataptr = data;
- for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
+ for (ctr = DCTSIZE - 1; ctr >= 0; ctr--) {
tmp0 = dataptr[0] + dataptr[7];
tmp7 = dataptr[0] - dataptr[7];
tmp1 = dataptr[1] + dataptr[6];
@@ -173,14 +173,14 @@ jpeg_fdct_islow (DCTELEM *data)
tmp11 = tmp1 + tmp2;
tmp12 = tmp1 - tmp2;
- dataptr[0] = (DCTELEM) LEFT_SHIFT(tmp10 + tmp11, PASS1_BITS);
- dataptr[4] = (DCTELEM) LEFT_SHIFT(tmp10 - tmp11, PASS1_BITS);
+ dataptr[0] = (DCTELEM)LEFT_SHIFT(tmp10 + tmp11, PASS1_BITS);
+ dataptr[4] = (DCTELEM)LEFT_SHIFT(tmp10 - tmp11, PASS1_BITS);
z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
- dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865),
- CONST_BITS-PASS1_BITS);
- dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065),
- CONST_BITS-PASS1_BITS);
+ dataptr[2] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865),
+ CONST_BITS - PASS1_BITS);
+ dataptr[6] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp12, -FIX_1_847759065),
+ CONST_BITS - PASS1_BITS);
/* Odd part per figure 8 --- note paper omits factor of sqrt(2).
* cK represents cos(K*pi/16).
@@ -197,18 +197,18 @@ jpeg_fdct_islow (DCTELEM *data)
tmp5 = MULTIPLY(tmp5, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
tmp6 = MULTIPLY(tmp6, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
tmp7 = MULTIPLY(tmp7, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
- z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
- z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
- z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
- z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
+ z1 = MULTIPLY(z1, -FIX_0_899976223); /* sqrt(2) * ( c7-c3) */
+ z2 = MULTIPLY(z2, -FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
+ z3 = MULTIPLY(z3, -FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
+ z4 = MULTIPLY(z4, -FIX_0_390180644); /* sqrt(2) * ( c5-c3) */
z3 += z5;
z4 += z5;
- dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS);
- dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS);
- dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS);
- dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS);
+ dataptr[7] = (DCTELEM)DESCALE(tmp4 + z1 + z3, CONST_BITS - PASS1_BITS);
+ dataptr[5] = (DCTELEM)DESCALE(tmp5 + z2 + z4, CONST_BITS - PASS1_BITS);
+ dataptr[3] = (DCTELEM)DESCALE(tmp6 + z2 + z3, CONST_BITS - PASS1_BITS);
+ dataptr[1] = (DCTELEM)DESCALE(tmp7 + z1 + z4, CONST_BITS - PASS1_BITS);
dataptr += DCTSIZE; /* advance pointer to next row */
}
@@ -219,15 +219,15 @@ jpeg_fdct_islow (DCTELEM *data)
*/
dataptr = data;
- for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
- tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
- tmp7 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
- tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
- tmp6 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
- tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];
- tmp5 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
- tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
- tmp4 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
+ for (ctr = DCTSIZE - 1; ctr >= 0; ctr--) {
+ tmp0 = dataptr[DCTSIZE * 0] + dataptr[DCTSIZE * 7];
+ tmp7 = dataptr[DCTSIZE * 0] - dataptr[DCTSIZE * 7];
+ tmp1 = dataptr[DCTSIZE * 1] + dataptr[DCTSIZE * 6];
+ tmp6 = dataptr[DCTSIZE * 1] - dataptr[DCTSIZE * 6];
+ tmp2 = dataptr[DCTSIZE * 2] + dataptr[DCTSIZE * 5];
+ tmp5 = dataptr[DCTSIZE * 2] - dataptr[DCTSIZE * 5];
+ tmp3 = dataptr[DCTSIZE * 3] + dataptr[DCTSIZE * 4];
+ tmp4 = dataptr[DCTSIZE * 3] - dataptr[DCTSIZE * 4];
/* Even part per LL&M figure 1 --- note that published figure is faulty;
* rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
@@ -238,14 +238,16 @@ jpeg_fdct_islow (DCTELEM *data)
tmp11 = tmp1 + tmp2;
tmp12 = tmp1 - tmp2;
- dataptr[DCTSIZE*0] = (DCTELEM) DESCALE(tmp10 + tmp11, PASS1_BITS);
- dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp10 - tmp11, PASS1_BITS);
+ dataptr[DCTSIZE * 0] = (DCTELEM)DESCALE(tmp10 + tmp11, PASS1_BITS);
+ dataptr[DCTSIZE * 4] = (DCTELEM)DESCALE(tmp10 - tmp11, PASS1_BITS);
z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
- dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865),
- CONST_BITS+PASS1_BITS);
- dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065),
- CONST_BITS+PASS1_BITS);
+ dataptr[DCTSIZE * 2] =
+ (DCTELEM)DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865),
+ CONST_BITS + PASS1_BITS);
+ dataptr[DCTSIZE * 6] =
+ (DCTELEM)DESCALE(z1 + MULTIPLY(tmp12, -FIX_1_847759065),
+ CONST_BITS + PASS1_BITS);
/* Odd part per figure 8 --- note paper omits factor of sqrt(2).
* cK represents cos(K*pi/16).
@@ -262,22 +264,22 @@ jpeg_fdct_islow (DCTELEM *data)
tmp5 = MULTIPLY(tmp5, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
tmp6 = MULTIPLY(tmp6, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
tmp7 = MULTIPLY(tmp7, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
- z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
- z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
- z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
- z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
+ z1 = MULTIPLY(z1, -FIX_0_899976223); /* sqrt(2) * ( c7-c3) */
+ z2 = MULTIPLY(z2, -FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
+ z3 = MULTIPLY(z3, -FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
+ z4 = MULTIPLY(z4, -FIX_0_390180644); /* sqrt(2) * ( c5-c3) */
z3 += z5;
z4 += z5;
- dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp4 + z1 + z3,
- CONST_BITS+PASS1_BITS);
- dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp5 + z2 + z4,
- CONST_BITS+PASS1_BITS);
- dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp6 + z2 + z3,
- CONST_BITS+PASS1_BITS);
- dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp7 + z1 + z4,
- CONST_BITS+PASS1_BITS);
+ dataptr[DCTSIZE * 7] = (DCTELEM)DESCALE(tmp4 + z1 + z3,
+ CONST_BITS + PASS1_BITS);
+ dataptr[DCTSIZE * 5] = (DCTELEM)DESCALE(tmp5 + z2 + z4,
+ CONST_BITS + PASS1_BITS);
+ dataptr[DCTSIZE * 3] = (DCTELEM)DESCALE(tmp6 + z2 + z3,
+ CONST_BITS + PASS1_BITS);
+ dataptr[DCTSIZE * 1] = (DCTELEM)DESCALE(tmp7 + z1 + z4,
+ CONST_BITS + PASS1_BITS);
dataptr++; /* advance pointer to next column */
}
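
The islow FDCT's constant table above uses CONST_BITS == 13, and each JLONG value is FIX(x), i.e. x scaled to 13 fraction bits and rounded. A small sketch reproducing the table entries from the same formula the library's FIX macro uses:

#include <stdio.h>

#define CONST_BITS 13

/* FIX(x) == (JLONG)((x) * (1L << CONST_BITS) + 0.5) in the library. */
static long FIX(double x)
{
  return (long)(x * (1L << CONST_BITS) + 0.5);
}

int main(void)
{
  printf("FIX(0.541196100) = %ld\n", FIX(0.541196100));  /* 4433  */
  printf("FIX(1.847759065) = %ld\n", FIX(1.847759065));  /* 15137 */
  printf("FIX(3.072711026) = %ld\n", FIX(3.072711026));  /* 25172 */
  return 0;
}
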
diff --git a/media/libjpeg/jidctflt.c b/media/libjpeg/jidctflt.c
index 68c521ed7e..5aee74e232 100644
--- a/media/libjpeg/jidctflt.c
+++ b/media/libjpeg/jidctflt.c
@@ -61,7 +61,7 @@
* entry; produce a float result.
*/
-#define DEQUANTIZE(coef,quantval) (((FAST_FLOAT) (coef)) * (quantval))
+#define DEQUANTIZE(coef, quantval) (((FAST_FLOAT)(coef)) * (quantval))
/*
@@ -69,9 +69,9 @@
*/
GLOBAL(void)
-jpeg_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block,
- JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
{
FAST_FLOAT tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
FAST_FLOAT tmp10, tmp11, tmp12, tmp13;
@@ -83,12 +83,12 @@ jpeg_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
JSAMPLE *range_limit = cinfo->sample_range_limit;
int ctr;
FAST_FLOAT workspace[DCTSIZE2]; /* buffers data between passes */
- #define _0_125 ((FLOAT_MULT_TYPE)0.125)
+#define _0_125 ((FLOAT_MULT_TYPE)0.125)
/* Pass 1: process columns from input, store into work array. */
inptr = coef_block;
- quantptr = (FLOAT_MULT_TYPE *) compptr->dct_table;
+ quantptr = (FLOAT_MULT_TYPE *)compptr->dct_table;
wsptr = workspace;
for (ctr = DCTSIZE; ctr > 0; ctr--) {
/* Due to quantization, we will usually find that many of the input
@@ -100,22 +100,22 @@ jpeg_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
* column DCT calculations can be simplified this way.
*/
- if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&
- inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 &&
- inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 &&
- inptr[DCTSIZE*7] == 0) {
+ if (inptr[DCTSIZE * 1] == 0 && inptr[DCTSIZE * 2] == 0 &&
+ inptr[DCTSIZE * 3] == 0 && inptr[DCTSIZE * 4] == 0 &&
+ inptr[DCTSIZE * 5] == 0 && inptr[DCTSIZE * 6] == 0 &&
+ inptr[DCTSIZE * 7] == 0) {
/* AC terms all zero */
- FAST_FLOAT dcval = DEQUANTIZE(inptr[DCTSIZE*0],
- quantptr[DCTSIZE*0] * _0_125);
-
- wsptr[DCTSIZE*0] = dcval;
- wsptr[DCTSIZE*1] = dcval;
- wsptr[DCTSIZE*2] = dcval;
- wsptr[DCTSIZE*3] = dcval;
- wsptr[DCTSIZE*4] = dcval;
- wsptr[DCTSIZE*5] = dcval;
- wsptr[DCTSIZE*6] = dcval;
- wsptr[DCTSIZE*7] = dcval;
+ FAST_FLOAT dcval = DEQUANTIZE(inptr[DCTSIZE * 0],
+ quantptr[DCTSIZE * 0] * _0_125);
+
+ wsptr[DCTSIZE * 0] = dcval;
+ wsptr[DCTSIZE * 1] = dcval;
+ wsptr[DCTSIZE * 2] = dcval;
+ wsptr[DCTSIZE * 3] = dcval;
+ wsptr[DCTSIZE * 4] = dcval;
+ wsptr[DCTSIZE * 5] = dcval;
+ wsptr[DCTSIZE * 6] = dcval;
+ wsptr[DCTSIZE * 7] = dcval;
inptr++; /* advance pointers to next column */
quantptr++;
@@ -125,16 +125,16 @@ jpeg_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Even part */
- tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0] * _0_125);
- tmp1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2] * _0_125);
- tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4] * _0_125);
- tmp3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6] * _0_125);
+ tmp0 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0] * _0_125);
+ tmp1 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2] * _0_125);
+ tmp2 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4] * _0_125);
+ tmp3 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6] * _0_125);
tmp10 = tmp0 + tmp2; /* phase 3 */
tmp11 = tmp0 - tmp2;
tmp13 = tmp1 + tmp3; /* phases 5-3 */
- tmp12 = (tmp1 - tmp3) * ((FAST_FLOAT) 1.414213562) - tmp13; /* 2*c4 */
+ tmp12 = (tmp1 - tmp3) * ((FAST_FLOAT)1.414213562) - tmp13; /* 2*c4 */
tmp0 = tmp10 + tmp13; /* phase 2 */
tmp3 = tmp10 - tmp13;
@@ -143,10 +143,10 @@ jpeg_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Odd part */
- tmp4 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1] * _0_125);
- tmp5 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3] * _0_125);
- tmp6 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5] * _0_125);
- tmp7 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7] * _0_125);
+ tmp4 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1] * _0_125);
+ tmp5 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3] * _0_125);
+ tmp6 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5] * _0_125);
+ tmp7 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7] * _0_125);
z13 = tmp6 + tmp5; /* phase 6 */
z10 = tmp6 - tmp5;
@@ -154,24 +154,24 @@ jpeg_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
z12 = tmp4 - tmp7;
tmp7 = z11 + z13; /* phase 5 */
- tmp11 = (z11 - z13) * ((FAST_FLOAT) 1.414213562); /* 2*c4 */
+ tmp11 = (z11 - z13) * ((FAST_FLOAT)1.414213562); /* 2*c4 */
- z5 = (z10 + z12) * ((FAST_FLOAT) 1.847759065); /* 2*c2 */
- tmp10 = z5 - z12 * ((FAST_FLOAT) 1.082392200); /* 2*(c2-c6) */
- tmp12 = z5 - z10 * ((FAST_FLOAT) 2.613125930); /* 2*(c2+c6) */
+ z5 = (z10 + z12) * ((FAST_FLOAT)1.847759065); /* 2*c2 */
+ tmp10 = z5 - z12 * ((FAST_FLOAT)1.082392200); /* 2*(c2-c6) */
+ tmp12 = z5 - z10 * ((FAST_FLOAT)2.613125930); /* 2*(c2+c6) */
tmp6 = tmp12 - tmp7; /* phase 2 */
tmp5 = tmp11 - tmp6;
tmp4 = tmp10 - tmp5;
- wsptr[DCTSIZE*0] = tmp0 + tmp7;
- wsptr[DCTSIZE*7] = tmp0 - tmp7;
- wsptr[DCTSIZE*1] = tmp1 + tmp6;
- wsptr[DCTSIZE*6] = tmp1 - tmp6;
- wsptr[DCTSIZE*2] = tmp2 + tmp5;
- wsptr[DCTSIZE*5] = tmp2 - tmp5;
- wsptr[DCTSIZE*3] = tmp3 + tmp4;
- wsptr[DCTSIZE*4] = tmp3 - tmp4;
+ wsptr[DCTSIZE * 0] = tmp0 + tmp7;
+ wsptr[DCTSIZE * 7] = tmp0 - tmp7;
+ wsptr[DCTSIZE * 1] = tmp1 + tmp6;
+ wsptr[DCTSIZE * 6] = tmp1 - tmp6;
+ wsptr[DCTSIZE * 2] = tmp2 + tmp5;
+ wsptr[DCTSIZE * 5] = tmp2 - tmp5;
+ wsptr[DCTSIZE * 3] = tmp3 + tmp4;
+ wsptr[DCTSIZE * 4] = tmp3 - tmp4;
inptr++; /* advance pointers to next column */
quantptr++;
@@ -192,12 +192,12 @@ jpeg_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Even part */
/* Apply signed->unsigned and prepare float->int conversion */
- z5 = wsptr[0] + ((FAST_FLOAT) CENTERJSAMPLE + (FAST_FLOAT) 0.5);
+ z5 = wsptr[0] + ((FAST_FLOAT)CENTERJSAMPLE + (FAST_FLOAT)0.5);
tmp10 = z5 + wsptr[4];
tmp11 = z5 - wsptr[4];
tmp13 = wsptr[2] + wsptr[6];
- tmp12 = (wsptr[2] - wsptr[6]) * ((FAST_FLOAT) 1.414213562) - tmp13;
+ tmp12 = (wsptr[2] - wsptr[6]) * ((FAST_FLOAT)1.414213562) - tmp13;
tmp0 = tmp10 + tmp13;
tmp3 = tmp10 - tmp13;
@@ -212,11 +212,11 @@ jpeg_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
z12 = wsptr[1] - wsptr[7];
tmp7 = z11 + z13;
- tmp11 = (z11 - z13) * ((FAST_FLOAT) 1.414213562);
+ tmp11 = (z11 - z13) * ((FAST_FLOAT)1.414213562);
- z5 = (z10 + z12) * ((FAST_FLOAT) 1.847759065); /* 2*c2 */
- tmp10 = z5 - z12 * ((FAST_FLOAT) 1.082392200); /* 2*(c2-c6) */
- tmp12 = z5 - z10 * ((FAST_FLOAT) 2.613125930); /* 2*(c2+c6) */
+ z5 = (z10 + z12) * ((FAST_FLOAT)1.847759065); /* 2*c2 */
+ tmp10 = z5 - z12 * ((FAST_FLOAT)1.082392200); /* 2*(c2-c6) */
+ tmp12 = z5 - z10 * ((FAST_FLOAT)2.613125930); /* 2*(c2+c6) */
tmp6 = tmp12 - tmp7;
tmp5 = tmp11 - tmp6;
@@ -224,14 +224,14 @@ jpeg_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Final output stage: float->int conversion and range-limit */
- outptr[0] = range_limit[((int) (tmp0 + tmp7)) & RANGE_MASK];
- outptr[7] = range_limit[((int) (tmp0 - tmp7)) & RANGE_MASK];
- outptr[1] = range_limit[((int) (tmp1 + tmp6)) & RANGE_MASK];
- outptr[6] = range_limit[((int) (tmp1 - tmp6)) & RANGE_MASK];
- outptr[2] = range_limit[((int) (tmp2 + tmp5)) & RANGE_MASK];
- outptr[5] = range_limit[((int) (tmp2 - tmp5)) & RANGE_MASK];
- outptr[3] = range_limit[((int) (tmp3 + tmp4)) & RANGE_MASK];
- outptr[4] = range_limit[((int) (tmp3 - tmp4)) & RANGE_MASK];
+ outptr[0] = range_limit[((int)(tmp0 + tmp7)) & RANGE_MASK];
+ outptr[7] = range_limit[((int)(tmp0 - tmp7)) & RANGE_MASK];
+ outptr[1] = range_limit[((int)(tmp1 + tmp6)) & RANGE_MASK];
+ outptr[6] = range_limit[((int)(tmp1 - tmp6)) & RANGE_MASK];
+ outptr[2] = range_limit[((int)(tmp2 + tmp5)) & RANGE_MASK];
+ outptr[5] = range_limit[((int)(tmp2 - tmp5)) & RANGE_MASK];
+ outptr[3] = range_limit[((int)(tmp3 + tmp4)) & RANGE_MASK];
+ outptr[4] = range_limit[((int)(tmp3 - tmp4)) & RANGE_MASK];
wsptr += DCTSIZE; /* advance pointer to next row */
}
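
Pass 1 of the float IDCT above opens with a sparsity shortcut: when every AC coefficient in a column is zero, the 1-D IDCT of that column is a constant, so all eight outputs are just the dequantized DC value. A self-contained sketch of that special case (simplified types; the real code interleaves this with the full butterfly):

#include <stdio.h>

#define DCTSIZE 8

static void idct_column_dc_only(const short *inptr, const float *quantptr,
                                float *wsptr)
{
  float dcval = (float)inptr[0] * quantptr[0];  /* dequantized DC term */
  int row;

  for (row = 0; row < DCTSIZE; row++)
    wsptr[DCTSIZE * row] = dcval;               /* constant output column */
}

int main(void)
{
  short coef[DCTSIZE * DCTSIZE]  = { 64 };      /* DC = 64, all AC zero */
  float quant[DCTSIZE * DCTSIZE] = { 0.125f };  /* quant step folded with 1/8 */
  float ws[DCTSIZE * DCTSIZE]    = { 0 };

  idct_column_dc_only(coef, quant, ws);
  printf("%g %g\n", ws[0], ws[DCTSIZE * 7]);    /* prints: 8 8 */
  return 0;
}

Since quantized AC coefficients are usually mostly zero, this test pays for itself on typical images.
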
diff --git a/media/libjpeg/jidctfst.c b/media/libjpeg/jidctfst.c
index 10db739b86..89a20c937b 100644
--- a/media/libjpeg/jidctfst.c
+++ b/media/libjpeg/jidctfst.c
@@ -92,10 +92,10 @@
*/
#if CONST_BITS == 8
-#define FIX_1_082392200 ((JLONG) 277) /* FIX(1.082392200) */
-#define FIX_1_414213562 ((JLONG) 362) /* FIX(1.414213562) */
-#define FIX_1_847759065 ((JLONG) 473) /* FIX(1.847759065) */
-#define FIX_2_613125930 ((JLONG) 669) /* FIX(2.613125930) */
+#define FIX_1_082392200 ((JLONG)277) /* FIX(1.082392200) */
+#define FIX_1_414213562 ((JLONG)362) /* FIX(1.414213562) */
+#define FIX_1_847759065 ((JLONG)473) /* FIX(1.847759065) */
+#define FIX_2_613125930 ((JLONG)669) /* FIX(2.613125930) */
#else
#define FIX_1_082392200 FIX(1.082392200)
#define FIX_1_414213562 FIX(1.414213562)
@@ -111,7 +111,7 @@
#ifndef USE_ACCURATE_ROUNDING
#undef DESCALE
-#define DESCALE(x,n) RIGHT_SHIFT(x, n)
+#define DESCALE(x, n) RIGHT_SHIFT(x, n)
#endif
@@ -119,7 +119,7 @@
* descale to yield a DCTELEM result.
*/
-#define MULTIPLY(var,const) ((DCTELEM) DESCALE((var) * (const), CONST_BITS))
+#define MULTIPLY(var, const) ((DCTELEM)DESCALE((var) * (const), CONST_BITS))
/* Dequantize a coefficient by multiplying it by the multiplier-table
@@ -129,10 +129,10 @@
*/
#if BITS_IN_JSAMPLE == 8
-#define DEQUANTIZE(coef,quantval) (((IFAST_MULT_TYPE) (coef)) * (quantval))
+#define DEQUANTIZE(coef, quantval) (((IFAST_MULT_TYPE)(coef)) * (quantval))
#else
-#define DEQUANTIZE(coef,quantval) \
- DESCALE((coef)*(quantval), IFAST_SCALE_BITS-PASS1_BITS)
+#define DEQUANTIZE(coef, quantval) \
+ DESCALE((coef) * (quantval), IFAST_SCALE_BITS - PASS1_BITS)
#endif
@@ -147,19 +147,19 @@
#else
#define DCTELEMBITS 32 /* DCTELEM must be 32 bits */
#endif
-#define IRIGHT_SHIFT(x,shft) \
- ((ishift_temp = (x)) < 0 ? \
- (ishift_temp >> (shft)) | ((~((DCTELEM) 0)) << (DCTELEMBITS-(shft))) : \
- (ishift_temp >> (shft)))
+#define IRIGHT_SHIFT(x, shft) \
+ ((ishift_temp = (x)) < 0 ? \
+ (ishift_temp >> (shft)) | ((~((DCTELEM)0)) << (DCTELEMBITS - (shft))) : \
+ (ishift_temp >> (shft)))
#else
#define ISHIFT_TEMPS
-#define IRIGHT_SHIFT(x,shft) ((x) >> (shft))
+#define IRIGHT_SHIFT(x, shft) ((x) >> (shft))
#endif
#ifdef USE_ACCURATE_ROUNDING
-#define IDESCALE(x,n) ((int) IRIGHT_SHIFT((x) + (1 << ((n)-1)), n))
+#define IDESCALE(x, n) ((int)IRIGHT_SHIFT((x) + (1 << ((n) - 1)), n))
#else
-#define IDESCALE(x,n) ((int) IRIGHT_SHIFT(x, n))
+#define IDESCALE(x, n) ((int)IRIGHT_SHIFT(x, n))
#endif
@@ -168,9 +168,9 @@
*/
GLOBAL(void)
-jpeg_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block,
- JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
{
DCTELEM tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
DCTELEM tmp10, tmp11, tmp12, tmp13;
@@ -188,7 +188,7 @@ jpeg_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Pass 1: process columns from input, store into work array. */
inptr = coef_block;
- quantptr = (IFAST_MULT_TYPE *) compptr->dct_table;
+ quantptr = (IFAST_MULT_TYPE *)compptr->dct_table;
wsptr = workspace;
for (ctr = DCTSIZE; ctr > 0; ctr--) {
/* Due to quantization, we will usually find that many of the input
@@ -200,21 +200,21 @@ jpeg_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
* column DCT calculations can be simplified this way.
*/
- if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&
- inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 &&
- inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 &&
- inptr[DCTSIZE*7] == 0) {
+ if (inptr[DCTSIZE * 1] == 0 && inptr[DCTSIZE * 2] == 0 &&
+ inptr[DCTSIZE * 3] == 0 && inptr[DCTSIZE * 4] == 0 &&
+ inptr[DCTSIZE * 5] == 0 && inptr[DCTSIZE * 6] == 0 &&
+ inptr[DCTSIZE * 7] == 0) {
/* AC terms all zero */
- int dcval = (int) DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+ int dcval = (int)DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
- wsptr[DCTSIZE*0] = dcval;
- wsptr[DCTSIZE*1] = dcval;
- wsptr[DCTSIZE*2] = dcval;
- wsptr[DCTSIZE*3] = dcval;
- wsptr[DCTSIZE*4] = dcval;
- wsptr[DCTSIZE*5] = dcval;
- wsptr[DCTSIZE*6] = dcval;
- wsptr[DCTSIZE*7] = dcval;
+ wsptr[DCTSIZE * 0] = dcval;
+ wsptr[DCTSIZE * 1] = dcval;
+ wsptr[DCTSIZE * 2] = dcval;
+ wsptr[DCTSIZE * 3] = dcval;
+ wsptr[DCTSIZE * 4] = dcval;
+ wsptr[DCTSIZE * 5] = dcval;
+ wsptr[DCTSIZE * 6] = dcval;
+ wsptr[DCTSIZE * 7] = dcval;
inptr++; /* advance pointers to next column */
quantptr++;
@@ -224,10 +224,10 @@ jpeg_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Even part */
- tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
- tmp1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
- tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
- tmp3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+ tmp0 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
+ tmp1 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
+ tmp2 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
+ tmp3 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
tmp10 = tmp0 + tmp2; /* phase 3 */
tmp11 = tmp0 - tmp2;
@@ -242,10 +242,10 @@ jpeg_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Odd part */
- tmp4 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
- tmp5 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
- tmp6 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
- tmp7 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+ tmp4 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
+ tmp5 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
+ tmp6 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
+ tmp7 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
z13 = tmp6 + tmp5; /* phase 6 */
z10 = tmp6 - tmp5;
@@ -257,20 +257,20 @@ jpeg_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
z5 = MULTIPLY(z10 + z12, FIX_1_847759065); /* 2*c2 */
tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5; /* 2*(c2-c6) */
- tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5; /* -2*(c2+c6) */
+ tmp12 = MULTIPLY(z10, -FIX_2_613125930) + z5; /* -2*(c2+c6) */
tmp6 = tmp12 - tmp7; /* phase 2 */
tmp5 = tmp11 - tmp6;
tmp4 = tmp10 + tmp5;
- wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7);
- wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7);
- wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6);
- wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6);
- wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5);
- wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5);
- wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4);
- wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4);
+ wsptr[DCTSIZE * 0] = (int)(tmp0 + tmp7);
+ wsptr[DCTSIZE * 7] = (int)(tmp0 - tmp7);
+ wsptr[DCTSIZE * 1] = (int)(tmp1 + tmp6);
+ wsptr[DCTSIZE * 6] = (int)(tmp1 - tmp6);
+ wsptr[DCTSIZE * 2] = (int)(tmp2 + tmp5);
+ wsptr[DCTSIZE * 5] = (int)(tmp2 - tmp5);
+ wsptr[DCTSIZE * 4] = (int)(tmp3 + tmp4);
+ wsptr[DCTSIZE * 3] = (int)(tmp3 - tmp4);
inptr++; /* advance pointers to next column */
quantptr++;
@@ -296,8 +296,8 @@ jpeg_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
if (wsptr[1] == 0 && wsptr[2] == 0 && wsptr[3] == 0 && wsptr[4] == 0 &&
wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) {
/* AC terms all zero */
- JSAMPLE dcval = range_limit[IDESCALE(wsptr[0], PASS1_BITS+3)
- & RANGE_MASK];
+ JSAMPLE dcval =
+ range_limit[IDESCALE(wsptr[0], PASS1_BITS + 3) & RANGE_MASK];
outptr[0] = dcval;
outptr[1] = dcval;
@@ -315,12 +315,12 @@ jpeg_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Even part */
- tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]);
- tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]);
+ tmp10 = ((DCTELEM)wsptr[0] + (DCTELEM)wsptr[4]);
+ tmp11 = ((DCTELEM)wsptr[0] - (DCTELEM)wsptr[4]);
- tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]);
- tmp12 = MULTIPLY((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6], FIX_1_414213562)
- - tmp13;
+ tmp13 = ((DCTELEM)wsptr[2] + (DCTELEM)wsptr[6]);
+ tmp12 =
+ MULTIPLY((DCTELEM)wsptr[2] - (DCTELEM)wsptr[6], FIX_1_414213562) - tmp13;
tmp0 = tmp10 + tmp13;
tmp3 = tmp10 - tmp13;
@@ -329,17 +329,17 @@ jpeg_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Odd part */
- z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3];
- z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3];
- z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7];
- z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7];
+ z13 = (DCTELEM)wsptr[5] + (DCTELEM)wsptr[3];
+ z10 = (DCTELEM)wsptr[5] - (DCTELEM)wsptr[3];
+ z11 = (DCTELEM)wsptr[1] + (DCTELEM)wsptr[7];
+ z12 = (DCTELEM)wsptr[1] - (DCTELEM)wsptr[7];
tmp7 = z11 + z13; /* phase 5 */
tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */
z5 = MULTIPLY(z10 + z12, FIX_1_847759065); /* 2*c2 */
tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5; /* 2*(c2-c6) */
- tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5; /* -2*(c2+c6) */
+ tmp12 = MULTIPLY(z10, -FIX_2_613125930) + z5; /* -2*(c2+c6) */
tmp6 = tmp12 - tmp7; /* phase 2 */
tmp5 = tmp11 - tmp6;
@@ -347,22 +347,22 @@ jpeg_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Final output stage: scale down by a factor of 8 and range-limit */
- outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3)
- & RANGE_MASK];
- outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3)
- & RANGE_MASK];
- outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3)
- & RANGE_MASK];
- outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3)
- & RANGE_MASK];
- outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3)
- & RANGE_MASK];
- outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3)
- & RANGE_MASK];
- outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3)
- & RANGE_MASK];
- outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3)
- & RANGE_MASK];
+ outptr[0] =
+ range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS + 3) & RANGE_MASK];
+ outptr[7] =
+ range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS + 3) & RANGE_MASK];
+ outptr[1] =
+ range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS + 3) & RANGE_MASK];
+ outptr[6] =
+ range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS + 3) & RANGE_MASK];
+ outptr[2] =
+ range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS + 3) & RANGE_MASK];
+ outptr[5] =
+ range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS + 3) & RANGE_MASK];
+ outptr[4] =
+ range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS + 3) & RANGE_MASK];
+ outptr[3] =
+ range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS + 3) & RANGE_MASK];
wsptr += DCTSIZE; /* advance pointer to next row */
}
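
The IRIGHT_SHIFT fallback reformatted above synthesizes an arithmetic right shift for compilers whose signed >> is logical: after shifting, it ORs copies of the sign bit back into the vacated high positions. A sketch of the same idea, written with unsigned arithmetic for well-defined behavior and assuming a 32-bit two's-complement DCTELEM:

#include <stdio.h>

typedef int DCTELEM;            /* assume 32 bits, as the #else branch does */
#define DCTELEMBITS 32

static DCTELEM iright_shift(DCTELEM x, int shft)
{
  if (x < 0)
    return (DCTELEM)(((unsigned)x >> shft) |
                     (~(unsigned)0 << (DCTELEMBITS - shft)));
  return x >> shft;
}

int main(void)
{
  printf("%d\n", iright_shift(-256, 3));   /* prints -32: sign preserved */
  return 0;
}

On compilers where signed >> already propagates the sign, the simple one-line IRIGHT_SHIFT definition is used instead.
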
diff --git a/media/libjpeg/jidctint.c b/media/libjpeg/jidctint.c
index 3ac6caf692..bb08748019 100644
--- a/media/libjpeg/jidctint.c
+++ b/media/libjpeg/jidctint.c
@@ -1,15 +1,15 @@
/*
* jidctint.c
*
- * This file was part of the Independent JPEG Group's software.
+ * This file was part of the Independent JPEG Group's software:
* Copyright (C) 1991-1998, Thomas G. Lane.
- * Modification developed 2002-2009 by Guido Vollbeding.
+ * Modification developed 2002-2018 by Guido Vollbeding.
* libjpeg-turbo Modifications:
- * Copyright (C) 2015, D. R. Commander.
+ * Copyright (C) 2015, 2020, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
- * This file contains a slow-but-accurate integer implementation of the
+ * This file contains a slower but more accurate integer implementation of the
* inverse DCT (Discrete Cosine Transform). In the IJG code, this routine
* must also perform dequantization of the input coefficients.
*
@@ -115,18 +115,18 @@
*/
#if CONST_BITS == 13
-#define FIX_0_298631336 ((JLONG) 2446) /* FIX(0.298631336) */
-#define FIX_0_390180644 ((JLONG) 3196) /* FIX(0.390180644) */
-#define FIX_0_541196100 ((JLONG) 4433) /* FIX(0.541196100) */
-#define FIX_0_765366865 ((JLONG) 6270) /* FIX(0.765366865) */
-#define FIX_0_899976223 ((JLONG) 7373) /* FIX(0.899976223) */
-#define FIX_1_175875602 ((JLONG) 9633) /* FIX(1.175875602) */
-#define FIX_1_501321110 ((JLONG) 12299) /* FIX(1.501321110) */
-#define FIX_1_847759065 ((JLONG) 15137) /* FIX(1.847759065) */
-#define FIX_1_961570560 ((JLONG) 16069) /* FIX(1.961570560) */
-#define FIX_2_053119869 ((JLONG) 16819) /* FIX(2.053119869) */
-#define FIX_2_562915447 ((JLONG) 20995) /* FIX(2.562915447) */
-#define FIX_3_072711026 ((JLONG) 25172) /* FIX(3.072711026) */
+#define FIX_0_298631336 ((JLONG)2446) /* FIX(0.298631336) */
+#define FIX_0_390180644 ((JLONG)3196) /* FIX(0.390180644) */
+#define FIX_0_541196100 ((JLONG)4433) /* FIX(0.541196100) */
+#define FIX_0_765366865 ((JLONG)6270) /* FIX(0.765366865) */
+#define FIX_0_899976223 ((JLONG)7373) /* FIX(0.899976223) */
+#define FIX_1_175875602 ((JLONG)9633) /* FIX(1.175875602) */
+#define FIX_1_501321110 ((JLONG)12299) /* FIX(1.501321110) */
+#define FIX_1_847759065 ((JLONG)15137) /* FIX(1.847759065) */
+#define FIX_1_961570560 ((JLONG)16069) /* FIX(1.961570560) */
+#define FIX_2_053119869 ((JLONG)16819) /* FIX(2.053119869) */
+#define FIX_2_562915447 ((JLONG)20995) /* FIX(2.562915447) */
+#define FIX_3_072711026 ((JLONG)25172) /* FIX(3.072711026) */
#else
#define FIX_0_298631336 FIX(0.298631336)
#define FIX_0_390180644 FIX(0.390180644)
@@ -151,9 +151,9 @@
*/
#if BITS_IN_JSAMPLE == 8
-#define MULTIPLY(var,const) MULTIPLY16C16(var,const)
+#define MULTIPLY(var, const) MULTIPLY16C16(var, const)
#else
-#define MULTIPLY(var,const) ((var) * (const))
+#define MULTIPLY(var, const) ((var) * (const))
#endif
@@ -162,7 +162,7 @@
* are 16 bits or less, so either int or short multiply will work.
*/
-#define DEQUANTIZE(coef,quantval) (((ISLOW_MULT_TYPE) (coef)) * (quantval))
+#define DEQUANTIZE(coef, quantval) (((ISLOW_MULT_TYPE)(coef)) * (quantval))
/*
@@ -170,9 +170,9 @@
*/
GLOBAL(void)
-jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block,
- JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
{
JLONG tmp0, tmp1, tmp2, tmp3;
JLONG tmp10, tmp11, tmp12, tmp13;
@@ -191,7 +191,7 @@ jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* furthermore, we scale the results by 2**PASS1_BITS. */
inptr = coef_block;
- quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+ quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
wsptr = workspace;
for (ctr = DCTSIZE; ctr > 0; ctr--) {
/* Due to quantization, we will usually find that many of the input
@@ -203,22 +203,22 @@ jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
* column DCT calculations can be simplified this way.
*/
- if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&
- inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 &&
- inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 &&
- inptr[DCTSIZE*7] == 0) {
+ if (inptr[DCTSIZE * 1] == 0 && inptr[DCTSIZE * 2] == 0 &&
+ inptr[DCTSIZE * 3] == 0 && inptr[DCTSIZE * 4] == 0 &&
+ inptr[DCTSIZE * 5] == 0 && inptr[DCTSIZE * 6] == 0 &&
+ inptr[DCTSIZE * 7] == 0) {
/* AC terms all zero */
- int dcval = LEFT_SHIFT(DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]),
- PASS1_BITS);
-
- wsptr[DCTSIZE*0] = dcval;
- wsptr[DCTSIZE*1] = dcval;
- wsptr[DCTSIZE*2] = dcval;
- wsptr[DCTSIZE*3] = dcval;
- wsptr[DCTSIZE*4] = dcval;
- wsptr[DCTSIZE*5] = dcval;
- wsptr[DCTSIZE*6] = dcval;
- wsptr[DCTSIZE*7] = dcval;
+ int dcval = LEFT_SHIFT(DEQUANTIZE(inptr[DCTSIZE * 0],
+ quantptr[DCTSIZE * 0]), PASS1_BITS);
+
+ wsptr[DCTSIZE * 0] = dcval;
+ wsptr[DCTSIZE * 1] = dcval;
+ wsptr[DCTSIZE * 2] = dcval;
+ wsptr[DCTSIZE * 3] = dcval;
+ wsptr[DCTSIZE * 4] = dcval;
+ wsptr[DCTSIZE * 5] = dcval;
+ wsptr[DCTSIZE * 6] = dcval;
+ wsptr[DCTSIZE * 7] = dcval;
inptr++; /* advance pointers to next column */
quantptr++;
@@ -229,15 +229,15 @@ jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Even part: reverse the even part of the forward DCT. */
/* The rotator is sqrt(2)*c(-6). */
- z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
- z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
- tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065);
+ tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065);
tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865);
- z2 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
- z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS);
tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS);
@@ -251,10 +251,10 @@ jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
* transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
*/
- tmp0 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
- tmp1 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
- tmp2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
- tmp3 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+ tmp0 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
+ tmp1 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
+ tmp2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
+ tmp3 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
z1 = tmp0 + tmp3;
z2 = tmp1 + tmp2;
@@ -266,10 +266,10 @@ jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
- z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
- z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
- z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
- z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
+ z1 = MULTIPLY(z1, -FIX_0_899976223); /* sqrt(2) * ( c7-c3) */
+ z2 = MULTIPLY(z2, -FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
+ z3 = MULTIPLY(z3, -FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
+ z4 = MULTIPLY(z4, -FIX_0_390180644); /* sqrt(2) * ( c5-c3) */
z3 += z5;
z4 += z5;
@@ -281,14 +281,14 @@ jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
- wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
- wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
- wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
- wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
- wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
- wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
- wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
- wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
+ wsptr[DCTSIZE * 0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS - PASS1_BITS);
+ wsptr[DCTSIZE * 7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS - PASS1_BITS);
+ wsptr[DCTSIZE * 1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS - PASS1_BITS);
+ wsptr[DCTSIZE * 6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS - PASS1_BITS);
+ wsptr[DCTSIZE * 2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS - PASS1_BITS);
+ wsptr[DCTSIZE * 5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS - PASS1_BITS);
+ wsptr[DCTSIZE * 3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS - PASS1_BITS);
+ wsptr[DCTSIZE * 4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS - PASS1_BITS);
inptr++; /* advance pointers to next column */
quantptr++;
@@ -314,8 +314,8 @@ jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
if (wsptr[1] == 0 && wsptr[2] == 0 && wsptr[3] == 0 && wsptr[4] == 0 &&
wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) {
/* AC terms all zero */
- JSAMPLE dcval = range_limit[(int) DESCALE((JLONG) wsptr[0], PASS1_BITS+3)
- & RANGE_MASK];
+ JSAMPLE dcval = range_limit[(int)DESCALE((JLONG)wsptr[0],
+ PASS1_BITS + 3) & RANGE_MASK];
outptr[0] = dcval;
outptr[1] = dcval;
@@ -334,15 +334,15 @@ jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Even part: reverse the even part of the forward DCT. */
/* The rotator is sqrt(2)*c(-6). */
- z2 = (JLONG) wsptr[2];
- z3 = (JLONG) wsptr[6];
+ z2 = (JLONG)wsptr[2];
+ z3 = (JLONG)wsptr[6];
z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
- tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065);
+ tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065);
tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865);
- tmp0 = LEFT_SHIFT((JLONG) wsptr[0] + (JLONG) wsptr[4], CONST_BITS);
- tmp1 = LEFT_SHIFT((JLONG) wsptr[0] - (JLONG) wsptr[4], CONST_BITS);
+ tmp0 = LEFT_SHIFT((JLONG)wsptr[0] + (JLONG)wsptr[4], CONST_BITS);
+ tmp1 = LEFT_SHIFT((JLONG)wsptr[0] - (JLONG)wsptr[4], CONST_BITS);
tmp10 = tmp0 + tmp3;
tmp13 = tmp0 - tmp3;
@@ -353,10 +353,10 @@ jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
* transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
*/
- tmp0 = (JLONG) wsptr[7];
- tmp1 = (JLONG) wsptr[5];
- tmp2 = (JLONG) wsptr[3];
- tmp3 = (JLONG) wsptr[1];
+ tmp0 = (JLONG)wsptr[7];
+ tmp1 = (JLONG)wsptr[5];
+ tmp2 = (JLONG)wsptr[3];
+ tmp3 = (JLONG)wsptr[1];
z1 = tmp0 + tmp3;
z2 = tmp1 + tmp2;
@@ -368,10 +368,10 @@ jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
- z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
- z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
- z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
- z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
+ z1 = MULTIPLY(z1, -FIX_0_899976223); /* sqrt(2) * ( c7-c3) */
+ z2 = MULTIPLY(z2, -FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
+ z3 = MULTIPLY(z3, -FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
+ z4 = MULTIPLY(z4, -FIX_0_390180644); /* sqrt(2) * ( c5-c3) */
z3 += z5;
z4 += z5;
@@ -383,30 +383,30 @@ jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
- outptr[0] = range_limit[(int) DESCALE(tmp10 + tmp3,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[7] = range_limit[(int) DESCALE(tmp10 - tmp3,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[1] = range_limit[(int) DESCALE(tmp11 + tmp2,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[6] = range_limit[(int) DESCALE(tmp11 - tmp2,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[2] = range_limit[(int) DESCALE(tmp12 + tmp1,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[5] = range_limit[(int) DESCALE(tmp12 - tmp1,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[3] = range_limit[(int) DESCALE(tmp13 + tmp0,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[4] = range_limit[(int) DESCALE(tmp13 - tmp0,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
+ outptr[0] = range_limit[(int)DESCALE(tmp10 + tmp3,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[7] = range_limit[(int)DESCALE(tmp10 - tmp3,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[1] = range_limit[(int)DESCALE(tmp11 + tmp2,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[6] = range_limit[(int)DESCALE(tmp11 - tmp2,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[2] = range_limit[(int)DESCALE(tmp12 + tmp1,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[5] = range_limit[(int)DESCALE(tmp12 - tmp1,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[3] = range_limit[(int)DESCALE(tmp13 + tmp0,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[4] = range_limit[(int)DESCALE(tmp13 - tmp0,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
wsptr += DCTSIZE; /* advance pointer to next row */
}
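
The least obvious step in the even part above is the rotation flagged by "The rotator is sqrt(2)*c(-6)". Writing a = sqrt(2)*cos(2*pi/16) and b = sqrt(2)*cos(6*pi/16), the constants FIX_0_541196100, FIX_1_847759065 and FIX_0_765366865 are b, a+b and a-b, and sharing z1 = (z2+z3)*b yields the pair z2*b - z3*a and z2*a + z3*b with three multiplications instead of four. A floating-point sketch, separate from the diff, that checks the identity numerically:

#include <math.h>
#include <stdio.h>

int main(void)
{
  const double PI = 3.14159265358979323846;
  double a = sqrt(2.0) * cos(2.0 * PI / 16.0);  /* ~1.306562965 */
  double b = sqrt(2.0) * cos(6.0 * PI / 16.0);  /* ~0.541196100 */
  double z2 = 3.0, z3 = 7.0;                    /* arbitrary inputs */
  double z1, tmp2, tmp3;

  z1 = (z2 + z3) * b;        /* the shared product */
  tmp2 = z1 - z3 * (a + b);  /* as in MULTIPLY(z3, -FIX_1_847759065) */
  tmp3 = z1 + z2 * (a - b);  /* as in MULTIPLY(z2, FIX_0_765366865) */

  printf("%.9f == %.9f\n", tmp2, z2 * b - z3 * a);  /* identical */
  printf("%.9f == %.9f\n", tmp3, z2 * a + z3 * b);  /* identical */
  return 0;
}
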
@@ -417,16 +417,16 @@ jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/*
* Perform dequantization and inverse DCT on one block of coefficients,
- * producing a 7x7 output block.
+ * producing a reduced-size 7x7 output block.
*
* Optimized algorithm with 12 multiplications in the 1-D kernel.
* cK represents sqrt(2) * cos(K*pi/14).
*/
GLOBAL(void)
-jpeg_idct_7x7 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block,
- JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_7x7(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
{
JLONG tmp0, tmp1, tmp2, tmp10, tmp11, tmp12, tmp13;
JLONG z1, z2, z3;
@@ -436,25 +436,25 @@ jpeg_idct_7x7 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
JSAMPROW outptr;
JSAMPLE *range_limit = IDCT_range_limit(cinfo);
int ctr;
- int workspace[7*7]; /* buffers data between passes */
+ int workspace[7 * 7]; /* buffers data between passes */
SHIFT_TEMPS
/* Pass 1: process columns from input, store into work array. */
inptr = coef_block;
- quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+ quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
wsptr = workspace;
for (ctr = 0; ctr < 7; ctr++, inptr++, quantptr++, wsptr++) {
/* Even part */
- tmp13 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+ tmp13 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
tmp13 = LEFT_SHIFT(tmp13, CONST_BITS);
/* Add fudge factor here for final descale. */
- tmp13 += ONE << (CONST_BITS-PASS1_BITS-1);
+ tmp13 += ONE << (CONST_BITS - PASS1_BITS - 1);
- z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
- z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
- z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
tmp10 = MULTIPLY(z2 - z3, FIX(0.881747734)); /* c4 */
tmp12 = MULTIPLY(z1 - z2, FIX(0.314692123)); /* c6 */
@@ -468,15 +468,15 @@ jpeg_idct_7x7 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Odd part */
- z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
- z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
- z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
tmp1 = MULTIPLY(z1 + z2, FIX(0.935414347)); /* (c3+c1-c5)/2 */
tmp2 = MULTIPLY(z1 - z2, FIX(0.170262339)); /* (c3+c5-c1)/2 */
tmp0 = tmp1 - tmp2;
tmp1 += tmp2;
- tmp2 = MULTIPLY(z2 + z3, - FIX(1.378756276)); /* -c1 */
+ tmp2 = MULTIPLY(z2 + z3, -FIX(1.378756276)); /* -c1 */
tmp1 += tmp2;
z2 = MULTIPLY(z1 + z3, FIX(0.613604268)); /* c5 */
tmp0 += z2;
@@ -484,13 +484,13 @@ jpeg_idct_7x7 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Final output stage */
- wsptr[7*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
- wsptr[7*6] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
- wsptr[7*1] = (int) RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS-PASS1_BITS);
- wsptr[7*5] = (int) RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS-PASS1_BITS);
- wsptr[7*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS);
- wsptr[7*4] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS);
- wsptr[7*3] = (int) RIGHT_SHIFT(tmp13, CONST_BITS-PASS1_BITS);
+ wsptr[7 * 0] = (int)RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS - PASS1_BITS);
+ wsptr[7 * 6] = (int)RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS - PASS1_BITS);
+ wsptr[7 * 1] = (int)RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS - PASS1_BITS);
+ wsptr[7 * 5] = (int)RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS - PASS1_BITS);
+ wsptr[7 * 2] = (int)RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS - PASS1_BITS);
+ wsptr[7 * 4] = (int)RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS - PASS1_BITS);
+ wsptr[7 * 3] = (int)RIGHT_SHIFT(tmp13, CONST_BITS - PASS1_BITS);
}
/* Pass 2: process 7 rows from work array, store into output array. */
@@ -502,12 +502,12 @@ jpeg_idct_7x7 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Even part */
/* Add fudge factor here for final descale. */
- tmp13 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
+ tmp13 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
tmp13 = LEFT_SHIFT(tmp13, CONST_BITS);
- z1 = (JLONG) wsptr[2];
- z2 = (JLONG) wsptr[4];
- z3 = (JLONG) wsptr[6];
+ z1 = (JLONG)wsptr[2];
+ z2 = (JLONG)wsptr[4];
+ z3 = (JLONG)wsptr[6];
tmp10 = MULTIPLY(z2 - z3, FIX(0.881747734)); /* c4 */
tmp12 = MULTIPLY(z1 - z2, FIX(0.314692123)); /* c6 */
@@ -521,15 +521,15 @@ jpeg_idct_7x7 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Odd part */
- z1 = (JLONG) wsptr[1];
- z2 = (JLONG) wsptr[3];
- z3 = (JLONG) wsptr[5];
+ z1 = (JLONG)wsptr[1];
+ z2 = (JLONG)wsptr[3];
+ z3 = (JLONG)wsptr[5];
tmp1 = MULTIPLY(z1 + z2, FIX(0.935414347)); /* (c3+c1-c5)/2 */
tmp2 = MULTIPLY(z1 - z2, FIX(0.170262339)); /* (c3+c5-c1)/2 */
tmp0 = tmp1 - tmp2;
tmp1 += tmp2;
- tmp2 = MULTIPLY(z2 + z3, - FIX(1.378756276)); /* -c1 */
+ tmp2 = MULTIPLY(z2 + z3, -FIX(1.378756276)); /* -c1 */
tmp1 += tmp2;
z2 = MULTIPLY(z1 + z3, FIX(0.613604268)); /* c5 */
tmp0 += z2;
@@ -537,27 +537,27 @@ jpeg_idct_7x7 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Final output stage */
- outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
+ outptr[0] = range_limit[(int)RIGHT_SHIFT(tmp10 + tmp0,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[6] = range_limit[(int)RIGHT_SHIFT(tmp10 - tmp0,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[1] = range_limit[(int)RIGHT_SHIFT(tmp11 + tmp1,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[5] = range_limit[(int)RIGHT_SHIFT(tmp11 - tmp1,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[2] = range_limit[(int)RIGHT_SHIFT(tmp12 + tmp2,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[4] = range_limit[(int)RIGHT_SHIFT(tmp12 - tmp2,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[3] = range_limit[(int)RIGHT_SHIFT(tmp13,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
wsptr += 7; /* advance pointer to next row */
}
@@ -573,9 +573,9 @@ jpeg_idct_7x7 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
*/
GLOBAL(void)
-jpeg_idct_6x6 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block,
- JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_6x6(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
{
JLONG tmp0, tmp1, tmp2, tmp10, tmp11, tmp12;
JLONG z1, z2, z3;
@@ -585,35 +585,35 @@ jpeg_idct_6x6 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
JSAMPROW outptr;
JSAMPLE *range_limit = IDCT_range_limit(cinfo);
int ctr;
- int workspace[6*6]; /* buffers data between passes */
+ int workspace[6 * 6]; /* buffers data between passes */
SHIFT_TEMPS
/* Pass 1: process columns from input, store into work array. */
inptr = coef_block;
- quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+ quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
wsptr = workspace;
for (ctr = 0; ctr < 6; ctr++, inptr++, quantptr++, wsptr++) {
/* Even part */
- tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+ tmp0 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
/* Add fudge factor here for final descale. */
- tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
- tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+ tmp0 += ONE << (CONST_BITS - PASS1_BITS - 1);
+ tmp2 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
tmp10 = MULTIPLY(tmp2, FIX(0.707106781)); /* c4 */
tmp1 = tmp0 + tmp10;
- tmp11 = RIGHT_SHIFT(tmp0 - tmp10 - tmp10, CONST_BITS-PASS1_BITS);
- tmp10 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+ tmp11 = RIGHT_SHIFT(tmp0 - tmp10 - tmp10, CONST_BITS - PASS1_BITS);
+ tmp10 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
tmp0 = MULTIPLY(tmp10, FIX(1.224744871)); /* c2 */
tmp10 = tmp1 + tmp0;
tmp12 = tmp1 - tmp0;
/* Odd part */
- z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
- z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
- z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
tmp0 = tmp1 + LEFT_SHIFT(z1 + z2, CONST_BITS);
tmp2 = tmp1 + LEFT_SHIFT(z3 - z2, CONST_BITS);
@@ -621,12 +621,12 @@ jpeg_idct_6x6 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Final output stage */
- wsptr[6*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
- wsptr[6*5] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
- wsptr[6*1] = (int) (tmp11 + tmp1);
- wsptr[6*4] = (int) (tmp11 - tmp1);
- wsptr[6*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS);
- wsptr[6*3] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS);
+ wsptr[6 * 0] = (int)RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS - PASS1_BITS);
+ wsptr[6 * 5] = (int)RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS - PASS1_BITS);
+ wsptr[6 * 1] = (int)(tmp11 + tmp1);
+ wsptr[6 * 4] = (int)(tmp11 - tmp1);
+ wsptr[6 * 2] = (int)RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS - PASS1_BITS);
+ wsptr[6 * 3] = (int)RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS - PASS1_BITS);
}
/* Pass 2: process 6 rows from work array, store into output array. */
@@ -638,22 +638,22 @@ jpeg_idct_6x6 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Even part */
/* Add fudge factor here for final descale. */
- tmp0 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
+ tmp0 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
- tmp2 = (JLONG) wsptr[4];
+ tmp2 = (JLONG)wsptr[4];
tmp10 = MULTIPLY(tmp2, FIX(0.707106781)); /* c4 */
tmp1 = tmp0 + tmp10;
tmp11 = tmp0 - tmp10 - tmp10;
- tmp10 = (JLONG) wsptr[2];
+ tmp10 = (JLONG)wsptr[2];
tmp0 = MULTIPLY(tmp10, FIX(1.224744871)); /* c2 */
tmp10 = tmp1 + tmp0;
tmp12 = tmp1 - tmp0;
/* Odd part */
- z1 = (JLONG) wsptr[1];
- z2 = (JLONG) wsptr[3];
- z3 = (JLONG) wsptr[5];
+ z1 = (JLONG)wsptr[1];
+ z2 = (JLONG)wsptr[3];
+ z3 = (JLONG)wsptr[5];
tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
tmp0 = tmp1 + LEFT_SHIFT(z1 + z2, CONST_BITS);
tmp2 = tmp1 + LEFT_SHIFT(z3 - z2, CONST_BITS);
@@ -661,24 +661,24 @@ jpeg_idct_6x6 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Final output stage */
- outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
+ outptr[0] = range_limit[(int)RIGHT_SHIFT(tmp10 + tmp0,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[5] = range_limit[(int)RIGHT_SHIFT(tmp10 - tmp0,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[1] = range_limit[(int)RIGHT_SHIFT(tmp11 + tmp1,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[4] = range_limit[(int)RIGHT_SHIFT(tmp11 - tmp1,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[2] = range_limit[(int)RIGHT_SHIFT(tmp12 + tmp2,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[3] = range_limit[(int)RIGHT_SHIFT(tmp12 - tmp2,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
wsptr += 6; /* advance pointer to next row */
}
@@ -694,9 +694,9 @@ jpeg_idct_6x6 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
*/
GLOBAL(void)
-jpeg_idct_5x5 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block,
- JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_5x5(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
{
JLONG tmp0, tmp1, tmp10, tmp11, tmp12;
JLONG z1, z2, z3;
@@ -706,23 +706,23 @@ jpeg_idct_5x5 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
JSAMPROW outptr;
JSAMPLE *range_limit = IDCT_range_limit(cinfo);
int ctr;
- int workspace[5*5]; /* buffers data between passes */
+ int workspace[5 * 5]; /* buffers data between passes */
SHIFT_TEMPS
/* Pass 1: process columns from input, store into work array. */
inptr = coef_block;
- quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+ quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
wsptr = workspace;
for (ctr = 0; ctr < 5; ctr++, inptr++, quantptr++, wsptr++) {
/* Even part */
- tmp12 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+ tmp12 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
tmp12 = LEFT_SHIFT(tmp12, CONST_BITS);
/* Add fudge factor here for final descale. */
- tmp12 += ONE << (CONST_BITS-PASS1_BITS-1);
- tmp0 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
- tmp1 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+ tmp12 += ONE << (CONST_BITS - PASS1_BITS - 1);
+ tmp0 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
+ tmp1 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
z1 = MULTIPLY(tmp0 + tmp1, FIX(0.790569415)); /* (c2+c4)/2 */
z2 = MULTIPLY(tmp0 - tmp1, FIX(0.353553391)); /* (c2-c4)/2 */
z3 = tmp12 + z2;
@@ -732,8 +732,8 @@ jpeg_idct_5x5 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Odd part */
- z2 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
- z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
z1 = MULTIPLY(z2 + z3, FIX(0.831253876)); /* c3 */
tmp0 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c1-c3 */
@@ -741,11 +741,11 @@ jpeg_idct_5x5 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Final output stage */
- wsptr[5*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
- wsptr[5*4] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
- wsptr[5*1] = (int) RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS-PASS1_BITS);
- wsptr[5*3] = (int) RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS-PASS1_BITS);
- wsptr[5*2] = (int) RIGHT_SHIFT(tmp12, CONST_BITS-PASS1_BITS);
+ wsptr[5 * 0] = (int)RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS - PASS1_BITS);
+ wsptr[5 * 4] = (int)RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS - PASS1_BITS);
+ wsptr[5 * 1] = (int)RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS - PASS1_BITS);
+ wsptr[5 * 3] = (int)RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS - PASS1_BITS);
+ wsptr[5 * 2] = (int)RIGHT_SHIFT(tmp12, CONST_BITS - PASS1_BITS);
}
/* Pass 2: process 5 rows from work array, store into output array. */
@@ -757,10 +757,10 @@ jpeg_idct_5x5 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Even part */
/* Add fudge factor here for final descale. */
- tmp12 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
+ tmp12 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
tmp12 = LEFT_SHIFT(tmp12, CONST_BITS);
- tmp0 = (JLONG) wsptr[2];
- tmp1 = (JLONG) wsptr[4];
+ tmp0 = (JLONG)wsptr[2];
+ tmp1 = (JLONG)wsptr[4];
z1 = MULTIPLY(tmp0 + tmp1, FIX(0.790569415)); /* (c2+c4)/2 */
z2 = MULTIPLY(tmp0 - tmp1, FIX(0.353553391)); /* (c2-c4)/2 */
z3 = tmp12 + z2;
@@ -770,8 +770,8 @@ jpeg_idct_5x5 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Odd part */
- z2 = (JLONG) wsptr[1];
- z3 = (JLONG) wsptr[3];
+ z2 = (JLONG)wsptr[1];
+ z3 = (JLONG)wsptr[3];
z1 = MULTIPLY(z2 + z3, FIX(0.831253876)); /* c3 */
tmp0 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c1-c3 */
@@ -779,21 +779,21 @@ jpeg_idct_5x5 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Final output stage */
- outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
+ outptr[0] = range_limit[(int)RIGHT_SHIFT(tmp10 + tmp0,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[4] = range_limit[(int)RIGHT_SHIFT(tmp10 - tmp0,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[1] = range_limit[(int)RIGHT_SHIFT(tmp11 + tmp1,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[3] = range_limit[(int)RIGHT_SHIFT(tmp11 - tmp1,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[2] = range_limit[(int)RIGHT_SHIFT(tmp12,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
wsptr += 5; /* advance pointer to next row */
}
@@ -809,9 +809,9 @@ jpeg_idct_5x5 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
*/
GLOBAL(void)
-jpeg_idct_3x3 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block,
- JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_3x3(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
{
JLONG tmp0, tmp2, tmp10, tmp12;
JCOEFPTR inptr;
@@ -820,36 +820,36 @@ jpeg_idct_3x3 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
JSAMPROW outptr;
JSAMPLE *range_limit = IDCT_range_limit(cinfo);
int ctr;
- int workspace[3*3]; /* buffers data between passes */
+ int workspace[3 * 3]; /* buffers data between passes */
SHIFT_TEMPS
/* Pass 1: process columns from input, store into work array. */
inptr = coef_block;
- quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+ quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
wsptr = workspace;
for (ctr = 0; ctr < 3; ctr++, inptr++, quantptr++, wsptr++) {
/* Even part */
- tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+ tmp0 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
/* Add fudge factor here for final descale. */
- tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
- tmp2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+ tmp0 += ONE << (CONST_BITS - PASS1_BITS - 1);
+ tmp2 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */
tmp10 = tmp0 + tmp12;
tmp2 = tmp0 - tmp12 - tmp12;
/* Odd part */
- tmp12 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+ tmp12 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */
/* Final output stage */
- wsptr[3*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
- wsptr[3*2] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
- wsptr[3*1] = (int) RIGHT_SHIFT(tmp2, CONST_BITS-PASS1_BITS);
+ wsptr[3 * 0] = (int)RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS - PASS1_BITS);
+ wsptr[3 * 2] = (int)RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS - PASS1_BITS);
+ wsptr[3 * 1] = (int)RIGHT_SHIFT(tmp2, CONST_BITS - PASS1_BITS);
}
/* Pass 2: process 3 rows from work array, store into output array. */
@@ -861,29 +861,29 @@ jpeg_idct_3x3 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Even part */
/* Add fudge factor here for final descale. */
- tmp0 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
+ tmp0 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
- tmp2 = (JLONG) wsptr[2];
+ tmp2 = (JLONG)wsptr[2];
tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */
tmp10 = tmp0 + tmp12;
tmp2 = tmp0 - tmp12 - tmp12;
/* Odd part */
- tmp12 = (JLONG) wsptr[1];
+ tmp12 = (JLONG)wsptr[1];
tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */
/* Final output stage */
- outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp2,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
+ outptr[0] = range_limit[(int)RIGHT_SHIFT(tmp10 + tmp0,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[2] = range_limit[(int)RIGHT_SHIFT(tmp10 - tmp0,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[1] = range_limit[(int)RIGHT_SHIFT(tmp2,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
wsptr += 3; /* advance pointer to next row */
}
@@ -899,9 +899,9 @@ jpeg_idct_3x3 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
*/
GLOBAL(void)
-jpeg_idct_9x9 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block,
- JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_9x9(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
{
JLONG tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13, tmp14;
JLONG z1, z2, z3, z4;
@@ -911,25 +911,25 @@ jpeg_idct_9x9 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
JSAMPROW outptr;
JSAMPLE *range_limit = IDCT_range_limit(cinfo);
int ctr;
- int workspace[8*9]; /* buffers data between passes */
+ int workspace[8 * 9]; /* buffers data between passes */
SHIFT_TEMPS
/* Pass 1: process columns from input, store into work array. */
inptr = coef_block;
- quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+ quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
wsptr = workspace;
for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
/* Even part */
- tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+ tmp0 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
/* Add fudge factor here for final descale. */
- tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
+ tmp0 += ONE << (CONST_BITS - PASS1_BITS - 1);
- z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
- z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
- z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
tmp3 = MULTIPLY(z3, FIX(0.707106781)); /* c6 */
tmp1 = tmp0 + tmp3;
@@ -949,12 +949,12 @@ jpeg_idct_9x9 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Odd part */
- z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
- z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
- z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
- z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
+ z4 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
- z2 = MULTIPLY(z2, - FIX(1.224744871)); /* -c3 */
+ z2 = MULTIPLY(z2, -FIX(1.224744871)); /* -c3 */
tmp2 = MULTIPLY(z1 + z3, FIX(0.909038955)); /* c5 */
tmp3 = MULTIPLY(z1 + z4, FIX(0.483689525)); /* c7 */
@@ -966,15 +966,15 @@ jpeg_idct_9x9 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Final output stage */
- wsptr[8*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
- wsptr[8*8] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
- wsptr[8*1] = (int) RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS-PASS1_BITS);
- wsptr[8*7] = (int) RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS-PASS1_BITS);
- wsptr[8*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS);
- wsptr[8*6] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS);
- wsptr[8*3] = (int) RIGHT_SHIFT(tmp13 + tmp3, CONST_BITS-PASS1_BITS);
- wsptr[8*5] = (int) RIGHT_SHIFT(tmp13 - tmp3, CONST_BITS-PASS1_BITS);
- wsptr[8*4] = (int) RIGHT_SHIFT(tmp14, CONST_BITS-PASS1_BITS);
+ wsptr[8 * 0] = (int)RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 8] = (int)RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 1] = (int)RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 7] = (int)RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 2] = (int)RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 6] = (int)RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 3] = (int)RIGHT_SHIFT(tmp13 + tmp3, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 5] = (int)RIGHT_SHIFT(tmp13 - tmp3, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 4] = (int)RIGHT_SHIFT(tmp14, CONST_BITS - PASS1_BITS);
}
/* Pass 2: process 9 rows from work array, store into output array. */
@@ -986,12 +986,12 @@ jpeg_idct_9x9 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Even part */
/* Add fudge factor here for final descale. */
- tmp0 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
+ tmp0 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
- z1 = (JLONG) wsptr[2];
- z2 = (JLONG) wsptr[4];
- z3 = (JLONG) wsptr[6];
+ z1 = (JLONG)wsptr[2];
+ z2 = (JLONG)wsptr[4];
+ z3 = (JLONG)wsptr[6];
tmp3 = MULTIPLY(z3, FIX(0.707106781)); /* c6 */
tmp1 = tmp0 + tmp3;
@@ -1011,12 +1011,12 @@ jpeg_idct_9x9 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Odd part */
- z1 = (JLONG) wsptr[1];
- z2 = (JLONG) wsptr[3];
- z3 = (JLONG) wsptr[5];
- z4 = (JLONG) wsptr[7];
+ z1 = (JLONG)wsptr[1];
+ z2 = (JLONG)wsptr[3];
+ z3 = (JLONG)wsptr[5];
+ z4 = (JLONG)wsptr[7];
- z2 = MULTIPLY(z2, - FIX(1.224744871)); /* -c3 */
+ z2 = MULTIPLY(z2, -FIX(1.224744871)); /* -c3 */
tmp2 = MULTIPLY(z1 + z3, FIX(0.909038955)); /* c5 */
tmp3 = MULTIPLY(z1 + z4, FIX(0.483689525)); /* c7 */
@@ -1028,33 +1028,33 @@ jpeg_idct_9x9 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Final output stage */
- outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13 + tmp3,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp13 - tmp3,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp14,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
+ outptr[0] = range_limit[(int)RIGHT_SHIFT(tmp10 + tmp0,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[8] = range_limit[(int)RIGHT_SHIFT(tmp10 - tmp0,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[1] = range_limit[(int)RIGHT_SHIFT(tmp11 + tmp1,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[7] = range_limit[(int)RIGHT_SHIFT(tmp11 - tmp1,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[2] = range_limit[(int)RIGHT_SHIFT(tmp12 + tmp2,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[6] = range_limit[(int)RIGHT_SHIFT(tmp12 - tmp2,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[3] = range_limit[(int)RIGHT_SHIFT(tmp13 + tmp3,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[5] = range_limit[(int)RIGHT_SHIFT(tmp13 - tmp3,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[4] = range_limit[(int)RIGHT_SHIFT(tmp14,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
wsptr += 8; /* advance pointer to next row */
}
@@ -1070,9 +1070,9 @@ jpeg_idct_9x9 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
*/
GLOBAL(void)
-jpeg_idct_10x10 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block,
- JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_10x10(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
{
JLONG tmp10, tmp11, tmp12, tmp13, tmp14;
JLONG tmp20, tmp21, tmp22, tmp23, tmp24;
@@ -1083,32 +1083,32 @@ jpeg_idct_10x10 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
JSAMPROW outptr;
JSAMPLE *range_limit = IDCT_range_limit(cinfo);
int ctr;
- int workspace[8*10]; /* buffers data between passes */
+ int workspace[8 * 10]; /* buffers data between passes */
SHIFT_TEMPS
/* Pass 1: process columns from input, store into work array. */
inptr = coef_block;
- quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+ quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
wsptr = workspace;
for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
/* Even part */
- z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
z3 = LEFT_SHIFT(z3, CONST_BITS);
/* Add fudge factor here for final descale. */
- z3 += ONE << (CONST_BITS-PASS1_BITS-1);
- z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+ z3 += ONE << (CONST_BITS - PASS1_BITS - 1);
+ z4 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
z1 = MULTIPLY(z4, FIX(1.144122806)); /* c4 */
z2 = MULTIPLY(z4, FIX(0.437016024)); /* c8 */
tmp10 = z3 + z1;
tmp11 = z3 - z2;
tmp22 = RIGHT_SHIFT(z3 - LEFT_SHIFT(z1 - z2, 1),
- CONST_BITS-PASS1_BITS); /* c0 = (c4-c8)*2 */
+ CONST_BITS - PASS1_BITS); /* c0 = (c4-c8)*2 */
- z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
- z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
z1 = MULTIPLY(z2 + z3, FIX(0.831253876)); /* c6 */
tmp12 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */
@@ -1121,10 +1121,10 @@ jpeg_idct_10x10 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Odd part */
- z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
- z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
- z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
- z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
+ z4 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
tmp11 = z2 + z4;
tmp13 = z2 - z4;
@@ -1148,16 +1148,16 @@ jpeg_idct_10x10 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Final output stage */
- wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
- wsptr[8*9] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
- wsptr[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
- wsptr[8*8] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
- wsptr[8*2] = (int) (tmp22 + tmp12);
- wsptr[8*7] = (int) (tmp22 - tmp12);
- wsptr[8*3] = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
- wsptr[8*6] = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
- wsptr[8*4] = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
- wsptr[8*5] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
+ wsptr[8 * 0] = (int)RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 9] = (int)RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 1] = (int)RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 8] = (int)RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 2] = (int)(tmp22 + tmp12);
+ wsptr[8 * 7] = (int)(tmp22 - tmp12);
+ wsptr[8 * 3] = (int)RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 6] = (int)RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 4] = (int)RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 5] = (int)RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS - PASS1_BITS);
}
/* Pass 2: process 10 rows from work array, store into output array. */
@@ -1169,9 +1169,9 @@ jpeg_idct_10x10 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Even part */
/* Add fudge factor here for final descale. */
- z3 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
+ z3 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
z3 = LEFT_SHIFT(z3, CONST_BITS);
- z4 = (JLONG) wsptr[4];
+ z4 = (JLONG)wsptr[4];
z1 = MULTIPLY(z4, FIX(1.144122806)); /* c4 */
z2 = MULTIPLY(z4, FIX(0.437016024)); /* c8 */
tmp10 = z3 + z1;
@@ -1179,8 +1179,8 @@ jpeg_idct_10x10 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
tmp22 = z3 - LEFT_SHIFT(z1 - z2, 1); /* c0 = (c4-c8)*2 */
- z2 = (JLONG) wsptr[2];
- z3 = (JLONG) wsptr[6];
+ z2 = (JLONG)wsptr[2];
+ z3 = (JLONG)wsptr[6];
z1 = MULTIPLY(z2 + z3, FIX(0.831253876)); /* c6 */
tmp12 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */
@@ -1193,11 +1193,11 @@ jpeg_idct_10x10 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Odd part */
- z1 = (JLONG) wsptr[1];
- z2 = (JLONG) wsptr[3];
- z3 = (JLONG) wsptr[5];
+ z1 = (JLONG)wsptr[1];
+ z2 = (JLONG)wsptr[3];
+ z3 = (JLONG)wsptr[5];
z3 = LEFT_SHIFT(z3, CONST_BITS);
- z4 = (JLONG) wsptr[7];
+ z4 = (JLONG)wsptr[7];
tmp11 = z2 + z4;
tmp13 = z2 - z4;
@@ -1220,36 +1220,36 @@ jpeg_idct_10x10 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Final output stage */
- outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
+ outptr[0] = range_limit[(int)RIGHT_SHIFT(tmp20 + tmp10,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[9] = range_limit[(int)RIGHT_SHIFT(tmp20 - tmp10,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[1] = range_limit[(int)RIGHT_SHIFT(tmp21 + tmp11,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[8] = range_limit[(int)RIGHT_SHIFT(tmp21 - tmp11,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[2] = range_limit[(int)RIGHT_SHIFT(tmp22 + tmp12,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[7] = range_limit[(int)RIGHT_SHIFT(tmp22 - tmp12,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[3] = range_limit[(int)RIGHT_SHIFT(tmp23 + tmp13,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[6] = range_limit[(int)RIGHT_SHIFT(tmp23 - tmp13,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[4] = range_limit[(int)RIGHT_SHIFT(tmp24 + tmp14,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[5] = range_limit[(int)RIGHT_SHIFT(tmp24 - tmp14,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
wsptr += 8; /* advance pointer to next row */
}
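
Every kernel in the file finishes the same way as the 10x10 rows above: the second pass descales by CONST_BITS + PASS1_BITS + 3, where the extra 3 bits remove the factor of 8 the two 1-D passes accumulate, and the result is pushed through range_limit[], whose & RANGE_MASK indexing keeps even corrupt coefficients inside the table. Functionally the lookup is a level shift plus saturation; a sketch of the equivalent arithmetic, assuming 8-bit samples (MAXJSAMPLE == 255, CENTERJSAMPLE == 128) and with clamp_sample() as a hypothetical stand-in for the table:

#define MAXJSAMPLE 255     /* assumption: 8-bit sample build */
#define CENTERJSAMPLE 128

/* What range_limit[v & RANGE_MASK] computes for in-range v: undo the
 * JPEG level shift, then saturate to a legal sample value.  The real
 * table form additionally tolerates wildly out-of-range inputs. */
static int clamp_sample(long v)
{
  v += CENTERJSAMPLE;
  if (v < 0) return 0;
  if (v > MAXJSAMPLE) return MAXJSAMPLE;
  return (int)v;
}
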
@@ -1258,16 +1258,16 @@ jpeg_idct_10x10 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/*
* Perform dequantization and inverse DCT on one block of coefficients,
- * producing a 11x11 output block.
+ * producing an 11x11 output block.
*
* Optimized algorithm with 24 multiplications in the 1-D kernel.
* cK represents sqrt(2) * cos(K*pi/22).
*/
GLOBAL(void)
-jpeg_idct_11x11 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block,
- JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_11x11(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
{
JLONG tmp10, tmp11, tmp12, tmp13, tmp14;
JLONG tmp20, tmp21, tmp22, tmp23, tmp24, tmp25;
@@ -1278,30 +1278,30 @@ jpeg_idct_11x11 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
JSAMPROW outptr;
JSAMPLE *range_limit = IDCT_range_limit(cinfo);
int ctr;
- int workspace[8*11]; /* buffers data between passes */
+ int workspace[8 * 11]; /* buffers data between passes */
SHIFT_TEMPS
/* Pass 1: process columns from input, store into work array. */
inptr = coef_block;
- quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+ quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
wsptr = workspace;
for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
/* Even part */
- tmp10 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+ tmp10 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
tmp10 = LEFT_SHIFT(tmp10, CONST_BITS);
/* Add fudge factor here for final descale. */
- tmp10 += ONE << (CONST_BITS-PASS1_BITS-1);
+ tmp10 += ONE << (CONST_BITS - PASS1_BITS - 1);
- z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
- z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
- z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
tmp20 = MULTIPLY(z2 - z3, FIX(2.546640132)); /* c2+c4 */
tmp23 = MULTIPLY(z2 - z1, FIX(0.430815045)); /* c2-c6 */
z4 = z1 + z3;
- tmp24 = MULTIPLY(z4, - FIX(1.155664402)); /* -(c2-c10) */
+ tmp24 = MULTIPLY(z4, -FIX(1.155664402)); /* -(c2-c10) */
z4 -= z2;
tmp25 = tmp10 + MULTIPLY(z4, FIX(1.356927976)); /* c2 */
tmp21 = tmp20 + tmp23 + tmp25 -
@@ -1316,10 +1316,10 @@ jpeg_idct_11x11 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Odd part */
- z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
- z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
- z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
- z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
+ z4 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
tmp11 = z1 + z2;
tmp14 = MULTIPLY(tmp11 + z3 + z4, FIX(0.398430003)); /* c9 */
@@ -1331,26 +1331,26 @@ jpeg_idct_11x11 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
z1 = tmp14 - MULTIPLY(z2 + z3, FIX(1.163011579)); /* c7+c9 */
tmp11 += z1 + MULTIPLY(z2, FIX(2.073276588)); /* c1+c7+3*c9-c3 */
tmp12 += z1 - MULTIPLY(z3, FIX(1.192193623)); /* c3+c5-c7-c9 */
- z1 = MULTIPLY(z2 + z4, - FIX(1.798248910)); /* -(c1+c9) */
+ z1 = MULTIPLY(z2 + z4, -FIX(1.798248910)); /* -(c1+c9) */
tmp11 += z1;
tmp13 += z1 + MULTIPLY(z4, FIX(2.102458632)); /* c1+c5+c9-c7 */
- tmp14 += MULTIPLY(z2, - FIX(1.467221301)) + /* -(c5+c9) */
+ tmp14 += MULTIPLY(z2, -FIX(1.467221301)) + /* -(c5+c9) */
MULTIPLY(z3, FIX(1.001388905)) - /* c1-c9 */
MULTIPLY(z4, FIX(1.684843907)); /* c3+c9 */
/* Final output stage */
- wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
- wsptr[8*10] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
- wsptr[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
- wsptr[8*9] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
- wsptr[8*2] = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
- wsptr[8*8] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
- wsptr[8*3] = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
- wsptr[8*7] = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
- wsptr[8*4] = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
- wsptr[8*6] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
- wsptr[8*5] = (int) RIGHT_SHIFT(tmp25, CONST_BITS-PASS1_BITS);
+ wsptr[8 * 0] = (int)RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 10] = (int)RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 1] = (int)RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 9] = (int)RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 2] = (int)RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 8] = (int)RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 3] = (int)RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 7] = (int)RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 4] = (int)RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 6] = (int)RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 5] = (int)RIGHT_SHIFT(tmp25, CONST_BITS - PASS1_BITS);
}
/* Pass 2: process 11 rows from work array, store into output array. */
@@ -1362,17 +1362,17 @@ jpeg_idct_11x11 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Even part */
/* Add fudge factor here for final descale. */
- tmp10 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
+ tmp10 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
tmp10 = LEFT_SHIFT(tmp10, CONST_BITS);
- z1 = (JLONG) wsptr[2];
- z2 = (JLONG) wsptr[4];
- z3 = (JLONG) wsptr[6];
+ z1 = (JLONG)wsptr[2];
+ z2 = (JLONG)wsptr[4];
+ z3 = (JLONG)wsptr[6];
tmp20 = MULTIPLY(z2 - z3, FIX(2.546640132)); /* c2+c4 */
tmp23 = MULTIPLY(z2 - z1, FIX(0.430815045)); /* c2-c6 */
z4 = z1 + z3;
- tmp24 = MULTIPLY(z4, - FIX(1.155664402)); /* -(c2-c10) */
+ tmp24 = MULTIPLY(z4, -FIX(1.155664402)); /* -(c2-c10) */
z4 -= z2;
tmp25 = tmp10 + MULTIPLY(z4, FIX(1.356927976)); /* c2 */
tmp21 = tmp20 + tmp23 + tmp25 -
@@ -1387,10 +1387,10 @@ jpeg_idct_11x11 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Odd part */
- z1 = (JLONG) wsptr[1];
- z2 = (JLONG) wsptr[3];
- z3 = (JLONG) wsptr[5];
- z4 = (JLONG) wsptr[7];
+ z1 = (JLONG)wsptr[1];
+ z2 = (JLONG)wsptr[3];
+ z3 = (JLONG)wsptr[5];
+ z4 = (JLONG)wsptr[7];
tmp11 = z1 + z2;
tmp14 = MULTIPLY(tmp11 + z3 + z4, FIX(0.398430003)); /* c9 */
@@ -1402,48 +1402,48 @@ jpeg_idct_11x11 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
z1 = tmp14 - MULTIPLY(z2 + z3, FIX(1.163011579)); /* c7+c9 */
tmp11 += z1 + MULTIPLY(z2, FIX(2.073276588)); /* c1+c7+3*c9-c3 */
tmp12 += z1 - MULTIPLY(z3, FIX(1.192193623)); /* c3+c5-c7-c9 */
- z1 = MULTIPLY(z2 + z4, - FIX(1.798248910)); /* -(c1+c9) */
+ z1 = MULTIPLY(z2 + z4, -FIX(1.798248910)); /* -(c1+c9) */
tmp11 += z1;
tmp13 += z1 + MULTIPLY(z4, FIX(2.102458632)); /* c1+c5+c9-c7 */
- tmp14 += MULTIPLY(z2, - FIX(1.467221301)) + /* -(c5+c9) */
+ tmp14 += MULTIPLY(z2, -FIX(1.467221301)) + /* -(c5+c9) */
MULTIPLY(z3, FIX(1.001388905)) - /* c1-c9 */
MULTIPLY(z4, FIX(1.684843907)); /* c3+c9 */
/* Final output stage */
- outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp25,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
+ outptr[0] = range_limit[(int)RIGHT_SHIFT(tmp20 + tmp10,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[10] = range_limit[(int)RIGHT_SHIFT(tmp20 - tmp10,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[1] = range_limit[(int)RIGHT_SHIFT(tmp21 + tmp11,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[9] = range_limit[(int)RIGHT_SHIFT(tmp21 - tmp11,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[2] = range_limit[(int)RIGHT_SHIFT(tmp22 + tmp12,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[8] = range_limit[(int)RIGHT_SHIFT(tmp22 - tmp12,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[3] = range_limit[(int)RIGHT_SHIFT(tmp23 + tmp13,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[7] = range_limit[(int)RIGHT_SHIFT(tmp23 - tmp13,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[4] = range_limit[(int)RIGHT_SHIFT(tmp24 + tmp14,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[6] = range_limit[(int)RIGHT_SHIFT(tmp24 - tmp14,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[5] = range_limit[(int)RIGHT_SHIFT(tmp25,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
wsptr += 8; /* advance pointer to next row */
}
@@ -1459,9 +1459,9 @@ jpeg_idct_11x11 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
*/
GLOBAL(void)
-jpeg_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block,
- JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_12x12(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
{
JLONG tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
JLONG tmp20, tmp21, tmp22, tmp23, tmp24, tmp25;
@@ -1472,32 +1472,32 @@ jpeg_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
JSAMPROW outptr;
JSAMPLE *range_limit = IDCT_range_limit(cinfo);
int ctr;
- int workspace[8*12]; /* buffers data between passes */
+ int workspace[8 * 12]; /* buffers data between passes */
SHIFT_TEMPS
/* Pass 1: process columns from input, store into work array. */
inptr = coef_block;
- quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+ quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
wsptr = workspace;
for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
/* Even part */
- z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
z3 = LEFT_SHIFT(z3, CONST_BITS);
/* Add fudge factor here for final descale. */
- z3 += ONE << (CONST_BITS-PASS1_BITS-1);
+ z3 += ONE << (CONST_BITS - PASS1_BITS - 1);
- z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+ z4 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
z4 = MULTIPLY(z4, FIX(1.224744871)); /* c4 */
tmp10 = z3 + z4;
tmp11 = z3 - z4;
- z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
z4 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */
z1 = LEFT_SHIFT(z1, CONST_BITS);
- z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
z2 = LEFT_SHIFT(z2, CONST_BITS);
tmp12 = z1 - z2;
@@ -1517,19 +1517,19 @@ jpeg_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Odd part */
- z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
- z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
- z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
- z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
+ z4 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
tmp11 = MULTIPLY(z2, FIX(1.306562965)); /* c3 */
- tmp14 = MULTIPLY(z2, - FIX_0_541196100); /* -c9 */
+ tmp14 = MULTIPLY(z2, -FIX_0_541196100); /* -c9 */
tmp10 = z1 + z3;
tmp15 = MULTIPLY(tmp10 + z4, FIX(0.860918669)); /* c7 */
tmp12 = tmp15 + MULTIPLY(tmp10, FIX(0.261052384)); /* c5-c7 */
tmp10 = tmp12 + tmp11 + MULTIPLY(z1, FIX(0.280143716)); /* c1-c5 */
- tmp13 = MULTIPLY(z3 + z4, - FIX(1.045510580)); /* -(c7+c11) */
+ tmp13 = MULTIPLY(z3 + z4, -FIX(1.045510580)); /* -(c7+c11) */
tmp12 += tmp13 + tmp14 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */
tmp13 += tmp15 - tmp11 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */
tmp15 += tmp14 - MULTIPLY(z1, FIX(0.676326758)) - /* c7-c11 */
@@ -1543,18 +1543,18 @@ jpeg_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Final output stage */
- wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
- wsptr[8*11] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
- wsptr[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
- wsptr[8*10] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
- wsptr[8*2] = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
- wsptr[8*9] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
- wsptr[8*3] = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
- wsptr[8*8] = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
- wsptr[8*4] = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
- wsptr[8*7] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
- wsptr[8*5] = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
- wsptr[8*6] = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
+ wsptr[8 * 0] = (int)RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 11] = (int)RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 1] = (int)RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 10] = (int)RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 2] = (int)RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 9] = (int)RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 3] = (int)RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 8] = (int)RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 4] = (int)RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 7] = (int)RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 5] = (int)RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 6] = (int)RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS - PASS1_BITS);
}
/* Pass 2: process 12 rows from work array, store into output array. */
@@ -1566,19 +1566,19 @@ jpeg_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Even part */
/* Add fudge factor here for final descale. */
- z3 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
+ z3 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
z3 = LEFT_SHIFT(z3, CONST_BITS);
- z4 = (JLONG) wsptr[4];
+ z4 = (JLONG)wsptr[4];
z4 = MULTIPLY(z4, FIX(1.224744871)); /* c4 */
tmp10 = z3 + z4;
tmp11 = z3 - z4;
- z1 = (JLONG) wsptr[2];
+ z1 = (JLONG)wsptr[2];
z4 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */
z1 = LEFT_SHIFT(z1, CONST_BITS);
- z2 = (JLONG) wsptr[6];
+ z2 = (JLONG)wsptr[6];
z2 = LEFT_SHIFT(z2, CONST_BITS);
tmp12 = z1 - z2;
@@ -1598,19 +1598,19 @@ jpeg_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Odd part */
- z1 = (JLONG) wsptr[1];
- z2 = (JLONG) wsptr[3];
- z3 = (JLONG) wsptr[5];
- z4 = (JLONG) wsptr[7];
+ z1 = (JLONG)wsptr[1];
+ z2 = (JLONG)wsptr[3];
+ z3 = (JLONG)wsptr[5];
+ z4 = (JLONG)wsptr[7];
tmp11 = MULTIPLY(z2, FIX(1.306562965)); /* c3 */
- tmp14 = MULTIPLY(z2, - FIX_0_541196100); /* -c9 */
+ tmp14 = MULTIPLY(z2, -FIX_0_541196100); /* -c9 */
tmp10 = z1 + z3;
tmp15 = MULTIPLY(tmp10 + z4, FIX(0.860918669)); /* c7 */
tmp12 = tmp15 + MULTIPLY(tmp10, FIX(0.261052384)); /* c5-c7 */
tmp10 = tmp12 + tmp11 + MULTIPLY(z1, FIX(0.280143716)); /* c1-c5 */
- tmp13 = MULTIPLY(z3 + z4, - FIX(1.045510580)); /* -(c7+c11) */
+ tmp13 = MULTIPLY(z3 + z4, -FIX(1.045510580)); /* -(c7+c11) */
tmp12 += tmp13 + tmp14 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */
tmp13 += tmp15 - tmp11 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */
tmp15 += tmp14 - MULTIPLY(z1, FIX(0.676326758)) - /* c7-c11 */
@@ -1624,42 +1624,42 @@ jpeg_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Final output stage */
- outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
+ outptr[0] = range_limit[(int)RIGHT_SHIFT(tmp20 + tmp10,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[11] = range_limit[(int)RIGHT_SHIFT(tmp20 - tmp10,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[1] = range_limit[(int)RIGHT_SHIFT(tmp21 + tmp11,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[10] = range_limit[(int)RIGHT_SHIFT(tmp21 - tmp11,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[2] = range_limit[(int)RIGHT_SHIFT(tmp22 + tmp12,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[9] = range_limit[(int)RIGHT_SHIFT(tmp22 - tmp12,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[3] = range_limit[(int)RIGHT_SHIFT(tmp23 + tmp13,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[8] = range_limit[(int)RIGHT_SHIFT(tmp23 - tmp13,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[4] = range_limit[(int)RIGHT_SHIFT(tmp24 + tmp14,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[7] = range_limit[(int)RIGHT_SHIFT(tmp24 - tmp14,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[5] = range_limit[(int)RIGHT_SHIFT(tmp25 + tmp15,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[6] = range_limit[(int)RIGHT_SHIFT(tmp25 - tmp15,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
wsptr += 8; /* advance pointer to next row */
}
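
The 12x12 routine above, like every scaled IDCT in this file, runs in two separable passes over an on-stack workspace: pass 1 transforms the 8 input columns and stores 12 results per column at a stride of 8 (the wsptr[8 * row] stores), and pass 2 then reads the workspace back one 8-entry row at a time (wsptr += 8). A minimal standalone sketch of that layout, for orientation only; it is not part of the patch and the names are hypothetical:

    /* Column-major stores in pass 1, row reads in pass 2. */
    #define WS_COLS  8
    #define WS_ROWS  12

    static int ws_sketch[WS_COLS * WS_ROWS];

    static void store_column(int col, const int *vals)   /* pass 1 */
    {
      int row;
      for (row = 0; row < WS_ROWS; row++)
        ws_sketch[WS_COLS * row + col] = vals[row];      /* wsptr[8 * row] */
    }

    static const int *load_row(int row)                  /* pass 2 */
    {
      return &ws_sketch[WS_COLS * row];                  /* wsptr += 8 */
    }
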
@@ -1675,9 +1675,9 @@ jpeg_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
*/
GLOBAL(void)
-jpeg_idct_13x13 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block,
- JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_13x13(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
{
JLONG tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
JLONG tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26;
@@ -1688,25 +1688,25 @@ jpeg_idct_13x13 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
JSAMPROW outptr;
JSAMPLE *range_limit = IDCT_range_limit(cinfo);
int ctr;
- int workspace[8*13]; /* buffers data between passes */
+ int workspace[8 * 13]; /* buffers data between passes */
SHIFT_TEMPS
/* Pass 1: process columns from input, store into work array. */
inptr = coef_block;
- quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+ quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
wsptr = workspace;
for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
/* Even part */
- z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
z1 = LEFT_SHIFT(z1, CONST_BITS);
/* Add fudge factor here for final descale. */
- z1 += ONE << (CONST_BITS-PASS1_BITS-1);
+ z1 += ONE << (CONST_BITS - PASS1_BITS - 1);
- z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
- z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
- z4 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
+ z4 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
tmp10 = z3 + z4;
tmp11 = z3 - z4;
@@ -1721,22 +1721,22 @@ jpeg_idct_13x13 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
tmp13 = MULTIPLY(tmp11, FIX(0.486914739)) + z1; /* (c8+c12)/2 */
tmp21 = MULTIPLY(z2, FIX(1.058554052)) - tmp12 + tmp13; /* c6 */
- tmp25 = MULTIPLY(z2, - FIX(1.252223920)) + tmp12 + tmp13; /* c4 */
+ tmp25 = MULTIPLY(z2, -FIX(1.252223920)) + tmp12 + tmp13; /* c4 */
tmp12 = MULTIPLY(tmp10, FIX(0.435816023)); /* (c2-c10)/2 */
tmp13 = MULTIPLY(tmp11, FIX(0.937303064)) - z1; /* (c2+c10)/2 */
- tmp23 = MULTIPLY(z2, - FIX(0.170464608)) - tmp12 - tmp13; /* c12 */
- tmp24 = MULTIPLY(z2, - FIX(0.803364869)) + tmp12 - tmp13; /* c8 */
+ tmp23 = MULTIPLY(z2, -FIX(0.170464608)) - tmp12 - tmp13; /* c12 */
+ tmp24 = MULTIPLY(z2, -FIX(0.803364869)) + tmp12 - tmp13; /* c8 */
tmp26 = MULTIPLY(tmp11 - z2, FIX(1.414213562)) + z1; /* c0 */
/* Odd part */
- z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
- z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
- z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
- z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
+ z4 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
tmp11 = MULTIPLY(z1 + z2, FIX(1.322312651)); /* c3 */
tmp12 = MULTIPLY(z1 + z3, FIX(1.163874945)); /* c5 */
@@ -1744,13 +1744,13 @@ jpeg_idct_13x13 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
tmp13 = MULTIPLY(tmp15, FIX(0.937797057)); /* c7 */
tmp10 = tmp11 + tmp12 + tmp13 -
MULTIPLY(z1, FIX(2.020082300)); /* c7+c5+c3-c1 */
- tmp14 = MULTIPLY(z2 + z3, - FIX(0.338443458)); /* -c11 */
+ tmp14 = MULTIPLY(z2 + z3, -FIX(0.338443458)); /* -c11 */
tmp11 += tmp14 + MULTIPLY(z2, FIX(0.837223564)); /* c5+c9+c11-c3 */
tmp12 += tmp14 - MULTIPLY(z3, FIX(1.572116027)); /* c1+c5-c9-c11 */
- tmp14 = MULTIPLY(z2 + z4, - FIX(1.163874945)); /* -c5 */
+ tmp14 = MULTIPLY(z2 + z4, -FIX(1.163874945)); /* -c5 */
tmp11 += tmp14;
tmp13 += tmp14 + MULTIPLY(z4, FIX(2.205608352)); /* c3+c5+c9-c7 */
- tmp14 = MULTIPLY(z3 + z4, - FIX(0.657217813)); /* -c9 */
+ tmp14 = MULTIPLY(z3 + z4, -FIX(0.657217813)); /* -c9 */
tmp12 += tmp14;
tmp13 += tmp14;
tmp15 = MULTIPLY(tmp15, FIX(0.338443458)); /* c11 */
@@ -1763,19 +1763,19 @@ jpeg_idct_13x13 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Final output stage */
- wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
- wsptr[8*12] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
- wsptr[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
- wsptr[8*11] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
- wsptr[8*2] = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
- wsptr[8*10] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
- wsptr[8*3] = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
- wsptr[8*9] = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
- wsptr[8*4] = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
- wsptr[8*8] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
- wsptr[8*5] = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
- wsptr[8*7] = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
- wsptr[8*6] = (int) RIGHT_SHIFT(tmp26, CONST_BITS-PASS1_BITS);
+ wsptr[8 * 0] = (int)RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 12] = (int)RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 1] = (int)RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 11] = (int)RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 2] = (int)RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 10] = (int)RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 3] = (int)RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 9] = (int)RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 4] = (int)RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 8] = (int)RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 5] = (int)RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 7] = (int)RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 6] = (int)RIGHT_SHIFT(tmp26, CONST_BITS - PASS1_BITS);
}
/* Pass 2: process 13 rows from work array, store into output array. */
@@ -1787,12 +1787,12 @@ jpeg_idct_13x13 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Even part */
/* Add fudge factor here for final descale. */
- z1 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
+ z1 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
z1 = LEFT_SHIFT(z1, CONST_BITS);
- z2 = (JLONG) wsptr[2];
- z3 = (JLONG) wsptr[4];
- z4 = (JLONG) wsptr[6];
+ z2 = (JLONG)wsptr[2];
+ z3 = (JLONG)wsptr[4];
+ z4 = (JLONG)wsptr[6];
tmp10 = z3 + z4;
tmp11 = z3 - z4;
@@ -1807,22 +1807,22 @@ jpeg_idct_13x13 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
tmp13 = MULTIPLY(tmp11, FIX(0.486914739)) + z1; /* (c8+c12)/2 */
tmp21 = MULTIPLY(z2, FIX(1.058554052)) - tmp12 + tmp13; /* c6 */
- tmp25 = MULTIPLY(z2, - FIX(1.252223920)) + tmp12 + tmp13; /* c4 */
+ tmp25 = MULTIPLY(z2, -FIX(1.252223920)) + tmp12 + tmp13; /* c4 */
tmp12 = MULTIPLY(tmp10, FIX(0.435816023)); /* (c2-c10)/2 */
tmp13 = MULTIPLY(tmp11, FIX(0.937303064)) - z1; /* (c2+c10)/2 */
- tmp23 = MULTIPLY(z2, - FIX(0.170464608)) - tmp12 - tmp13; /* c12 */
- tmp24 = MULTIPLY(z2, - FIX(0.803364869)) + tmp12 - tmp13; /* c8 */
+ tmp23 = MULTIPLY(z2, -FIX(0.170464608)) - tmp12 - tmp13; /* c12 */
+ tmp24 = MULTIPLY(z2, -FIX(0.803364869)) + tmp12 - tmp13; /* c8 */
tmp26 = MULTIPLY(tmp11 - z2, FIX(1.414213562)) + z1; /* c0 */
/* Odd part */
- z1 = (JLONG) wsptr[1];
- z2 = (JLONG) wsptr[3];
- z3 = (JLONG) wsptr[5];
- z4 = (JLONG) wsptr[7];
+ z1 = (JLONG)wsptr[1];
+ z2 = (JLONG)wsptr[3];
+ z3 = (JLONG)wsptr[5];
+ z4 = (JLONG)wsptr[7];
tmp11 = MULTIPLY(z1 + z2, FIX(1.322312651)); /* c3 */
tmp12 = MULTIPLY(z1 + z3, FIX(1.163874945)); /* c5 */
@@ -1830,13 +1830,13 @@ jpeg_idct_13x13 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
tmp13 = MULTIPLY(tmp15, FIX(0.937797057)); /* c7 */
tmp10 = tmp11 + tmp12 + tmp13 -
MULTIPLY(z1, FIX(2.020082300)); /* c7+c5+c3-c1 */
- tmp14 = MULTIPLY(z2 + z3, - FIX(0.338443458)); /* -c11 */
+ tmp14 = MULTIPLY(z2 + z3, -FIX(0.338443458)); /* -c11 */
tmp11 += tmp14 + MULTIPLY(z2, FIX(0.837223564)); /* c5+c9+c11-c3 */
tmp12 += tmp14 - MULTIPLY(z3, FIX(1.572116027)); /* c1+c5-c9-c11 */
- tmp14 = MULTIPLY(z2 + z4, - FIX(1.163874945)); /* -c5 */
+ tmp14 = MULTIPLY(z2 + z4, -FIX(1.163874945)); /* -c5 */
tmp11 += tmp14;
tmp13 += tmp14 + MULTIPLY(z4, FIX(2.205608352)); /* c3+c5+c9-c7 */
- tmp14 = MULTIPLY(z3 + z4, - FIX(0.657217813)); /* -c9 */
+ tmp14 = MULTIPLY(z3 + z4, -FIX(0.657217813)); /* -c9 */
tmp12 += tmp14;
tmp13 += tmp14;
tmp15 = MULTIPLY(tmp15, FIX(0.338443458)); /* c11 */
@@ -1849,45 +1849,45 @@ jpeg_idct_13x13 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Final output stage */
- outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp26,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
+ outptr[0] = range_limit[(int)RIGHT_SHIFT(tmp20 + tmp10,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[12] = range_limit[(int)RIGHT_SHIFT(tmp20 - tmp10,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[1] = range_limit[(int)RIGHT_SHIFT(tmp21 + tmp11,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[11] = range_limit[(int)RIGHT_SHIFT(tmp21 - tmp11,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[2] = range_limit[(int)RIGHT_SHIFT(tmp22 + tmp12,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[10] = range_limit[(int)RIGHT_SHIFT(tmp22 - tmp12,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[3] = range_limit[(int)RIGHT_SHIFT(tmp23 + tmp13,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[9] = range_limit[(int)RIGHT_SHIFT(tmp23 - tmp13,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[4] = range_limit[(int)RIGHT_SHIFT(tmp24 + tmp14,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[8] = range_limit[(int)RIGHT_SHIFT(tmp24 - tmp14,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[5] = range_limit[(int)RIGHT_SHIFT(tmp25 + tmp15,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[7] = range_limit[(int)RIGHT_SHIFT(tmp25 - tmp15,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[6] = range_limit[(int)RIGHT_SHIFT(tmp26,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
wsptr += 8; /* advance pointer to next row */
}
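
Every final output stage in these hunks funnels the descaled result through range_limit[... & RANGE_MASK]. Assuming the usual 8-bit-sample build, RANGE_MASK is MAXJSAMPLE * 4 + 3 = 1023 and range_limit points into a clamp table the decoder prepares elsewhere, so even a wildly out-of-range IDCT result indexes safely and clamps to 0..255. A hedged sketch of the idiom; the table contents and names are assumptions, not from this patch:

    #define SK_MAXJSAMPLE  255
    #define SK_RANGE_MASK  (SK_MAXJSAMPLE * 4 + 3)   /* 1023 */

    /* range_limit is assumed to map any masked index back into 0..255 */
    static unsigned char clamp_sample(long descaled,
                                      const unsigned char *range_limit)
    {
      return range_limit[(int)descaled & SK_RANGE_MASK];
    }
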
@@ -1903,9 +1903,9 @@ jpeg_idct_13x13 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
*/
GLOBAL(void)
-jpeg_idct_14x14 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block,
- JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_14x14(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
{
JLONG tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
JLONG tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26;
@@ -1916,22 +1916,22 @@ jpeg_idct_14x14 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
JSAMPROW outptr;
JSAMPLE *range_limit = IDCT_range_limit(cinfo);
int ctr;
- int workspace[8*14]; /* buffers data between passes */
+ int workspace[8 * 14]; /* buffers data between passes */
SHIFT_TEMPS
/* Pass 1: process columns from input, store into work array. */
inptr = coef_block;
- quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+ quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
wsptr = workspace;
for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
/* Even part */
- z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
z1 = LEFT_SHIFT(z1, CONST_BITS);
/* Add fudge factor here for final descale. */
- z1 += ONE << (CONST_BITS-PASS1_BITS-1);
- z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+ z1 += ONE << (CONST_BITS - PASS1_BITS - 1);
+ z4 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
z2 = MULTIPLY(z4, FIX(1.274162392)); /* c4 */
z3 = MULTIPLY(z4, FIX(0.314692123)); /* c12 */
z4 = MULTIPLY(z4, FIX(0.881747734)); /* c8 */
@@ -1941,10 +1941,10 @@ jpeg_idct_14x14 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
tmp12 = z1 - z4;
tmp23 = RIGHT_SHIFT(z1 - LEFT_SHIFT(z2 + z3 - z4, 1),
- CONST_BITS-PASS1_BITS); /* c0 = (c4+c12-c8)*2 */
+ CONST_BITS - PASS1_BITS); /* c0 = (c4+c12-c8)*2 */
- z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
- z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
z3 = MULTIPLY(z1 + z2, FIX(1.105676686)); /* c6 */
@@ -1962,10 +1962,10 @@ jpeg_idct_14x14 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Odd part */
- z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
- z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
- z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
- z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
+ z4 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
tmp13 = LEFT_SHIFT(z4, CONST_BITS);
tmp14 = z1 + z3;
@@ -1978,7 +1978,7 @@ jpeg_idct_14x14 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
tmp15 = MULTIPLY(z1, FIX(0.467085129)) - tmp13; /* c11 */
tmp16 += tmp15;
z1 += z4;
- z4 = MULTIPLY(z2 + z3, - FIX(0.158341681)) - tmp13; /* -c13 */
+ z4 = MULTIPLY(z2 + z3, -FIX(0.158341681)) - tmp13; /* -c13 */
tmp11 += z4 - MULTIPLY(z2, FIX(0.424103948)); /* c3-c9-c13 */
tmp12 += z4 - MULTIPLY(z3, FIX(2.373959773)); /* c3+c5-c13 */
z4 = MULTIPLY(z3 - z2, FIX(1.405321284)); /* c1 */
@@ -1989,20 +1989,20 @@ jpeg_idct_14x14 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Final output stage */
- wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
- wsptr[8*13] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
- wsptr[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
- wsptr[8*12] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
- wsptr[8*2] = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
- wsptr[8*11] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
- wsptr[8*3] = (int) (tmp23 + tmp13);
- wsptr[8*10] = (int) (tmp23 - tmp13);
- wsptr[8*4] = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
- wsptr[8*9] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
- wsptr[8*5] = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
- wsptr[8*8] = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
- wsptr[8*6] = (int) RIGHT_SHIFT(tmp26 + tmp16, CONST_BITS-PASS1_BITS);
- wsptr[8*7] = (int) RIGHT_SHIFT(tmp26 - tmp16, CONST_BITS-PASS1_BITS);
+ wsptr[8 * 0] = (int)RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 13] = (int)RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 1] = (int)RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 12] = (int)RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 2] = (int)RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 11] = (int)RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 3] = (int)(tmp23 + tmp13);
+ wsptr[8 * 10] = (int)(tmp23 - tmp13);
+ wsptr[8 * 4] = (int)RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 9] = (int)RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 5] = (int)RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 8] = (int)RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 6] = (int)RIGHT_SHIFT(tmp26 + tmp16, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 7] = (int)RIGHT_SHIFT(tmp26 - tmp16, CONST_BITS - PASS1_BITS);
}
/* Pass 2: process 14 rows from work array, store into output array. */
@@ -2014,9 +2014,9 @@ jpeg_idct_14x14 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Even part */
/* Add fudge factor here for final descale. */
- z1 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
+ z1 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
z1 = LEFT_SHIFT(z1, CONST_BITS);
- z4 = (JLONG) wsptr[4];
+ z4 = (JLONG)wsptr[4];
z2 = MULTIPLY(z4, FIX(1.274162392)); /* c4 */
z3 = MULTIPLY(z4, FIX(0.314692123)); /* c12 */
z4 = MULTIPLY(z4, FIX(0.881747734)); /* c8 */
@@ -2027,8 +2027,8 @@ jpeg_idct_14x14 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
tmp23 = z1 - LEFT_SHIFT(z2 + z3 - z4, 1); /* c0 = (c4+c12-c8)*2 */
- z1 = (JLONG) wsptr[2];
- z2 = (JLONG) wsptr[6];
+ z1 = (JLONG)wsptr[2];
+ z2 = (JLONG)wsptr[6];
z3 = MULTIPLY(z1 + z2, FIX(1.105676686)); /* c6 */
@@ -2046,10 +2046,10 @@ jpeg_idct_14x14 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Odd part */
- z1 = (JLONG) wsptr[1];
- z2 = (JLONG) wsptr[3];
- z3 = (JLONG) wsptr[5];
- z4 = (JLONG) wsptr[7];
+ z1 = (JLONG)wsptr[1];
+ z2 = (JLONG)wsptr[3];
+ z3 = (JLONG)wsptr[5];
+ z4 = (JLONG)wsptr[7];
z4 = LEFT_SHIFT(z4, CONST_BITS);
tmp14 = z1 + z3;
@@ -2061,7 +2061,7 @@ jpeg_idct_14x14 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
z1 -= z2;
tmp15 = MULTIPLY(z1, FIX(0.467085129)) - z4; /* c11 */
tmp16 += tmp15;
- tmp13 = MULTIPLY(z2 + z3, - FIX(0.158341681)) - z4; /* -c13 */
+ tmp13 = MULTIPLY(z2 + z3, -FIX(0.158341681)) - z4; /* -c13 */
tmp11 += tmp13 - MULTIPLY(z2, FIX(0.424103948)); /* c3-c9-c13 */
tmp12 += tmp13 - MULTIPLY(z3, FIX(2.373959773)); /* c3+c5-c13 */
tmp13 = MULTIPLY(z3 - z2, FIX(1.405321284)); /* c1 */
@@ -2072,48 +2072,48 @@ jpeg_idct_14x14 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Final output stage */
- outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp16,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp16,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
+ outptr[0] = range_limit[(int)RIGHT_SHIFT(tmp20 + tmp10,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[13] = range_limit[(int)RIGHT_SHIFT(tmp20 - tmp10,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[1] = range_limit[(int)RIGHT_SHIFT(tmp21 + tmp11,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[12] = range_limit[(int)RIGHT_SHIFT(tmp21 - tmp11,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[2] = range_limit[(int)RIGHT_SHIFT(tmp22 + tmp12,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[11] = range_limit[(int)RIGHT_SHIFT(tmp22 - tmp12,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[3] = range_limit[(int)RIGHT_SHIFT(tmp23 + tmp13,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[10] = range_limit[(int)RIGHT_SHIFT(tmp23 - tmp13,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[4] = range_limit[(int)RIGHT_SHIFT(tmp24 + tmp14,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[9] = range_limit[(int)RIGHT_SHIFT(tmp24 - tmp14,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[5] = range_limit[(int)RIGHT_SHIFT(tmp25 + tmp15,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[8] = range_limit[(int)RIGHT_SHIFT(tmp25 - tmp15,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[6] = range_limit[(int)RIGHT_SHIFT(tmp26 + tmp16,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[7] = range_limit[(int)RIGHT_SHIFT(tmp26 - tmp16,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
wsptr += 8; /* advance pointer to next row */
}
@@ -2129,9 +2129,9 @@ jpeg_idct_14x14 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
*/
GLOBAL(void)
-jpeg_idct_15x15 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block,
- JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_15x15(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
{
JLONG tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
JLONG tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
@@ -2142,25 +2142,25 @@ jpeg_idct_15x15 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
JSAMPROW outptr;
JSAMPLE *range_limit = IDCT_range_limit(cinfo);
int ctr;
- int workspace[8*15]; /* buffers data between passes */
+ int workspace[8 * 15]; /* buffers data between passes */
SHIFT_TEMPS
/* Pass 1: process columns from input, store into work array. */
inptr = coef_block;
- quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+ quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
wsptr = workspace;
for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
/* Even part */
- z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
z1 = LEFT_SHIFT(z1, CONST_BITS);
/* Add fudge factor here for final descale. */
- z1 += ONE << (CONST_BITS-PASS1_BITS-1);
+ z1 += ONE << (CONST_BITS - PASS1_BITS - 1);
- z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
- z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
- z4 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
+ z4 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
tmp10 = MULTIPLY(z4, FIX(0.437016024)); /* c12 */
tmp11 = MULTIPLY(z4, FIX(1.144122806)); /* c6 */
@@ -2195,19 +2195,19 @@ jpeg_idct_15x15 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Odd part */
- z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
- z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
- z4 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
+ z4 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
z3 = MULTIPLY(z4, FIX(1.224744871)); /* c5 */
- z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+ z4 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
tmp13 = z2 - z4;
tmp15 = MULTIPLY(z1 + tmp13, FIX(0.831253876)); /* c9 */
tmp11 = tmp15 + MULTIPLY(z1, FIX(0.513743148)); /* c3-c9 */
tmp14 = tmp15 - MULTIPLY(tmp13, FIX(2.176250899)); /* c3+c9 */
- tmp13 = MULTIPLY(z2, - FIX(0.831253876)); /* -c9 */
- tmp15 = MULTIPLY(z2, - FIX(1.344997024)); /* -c3 */
+ tmp13 = MULTIPLY(z2, -FIX(0.831253876)); /* -c9 */
+ tmp15 = MULTIPLY(z2, -FIX(1.344997024)); /* -c3 */
z2 = z1 - z4;
tmp12 = z3 + MULTIPLY(z2, FIX(1.406466353)); /* c1 */
@@ -2220,21 +2220,21 @@ jpeg_idct_15x15 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Final output stage */
- wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
- wsptr[8*14] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
- wsptr[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
- wsptr[8*13] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
- wsptr[8*2] = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
- wsptr[8*12] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
- wsptr[8*3] = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
- wsptr[8*11] = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
- wsptr[8*4] = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
- wsptr[8*10] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
- wsptr[8*5] = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
- wsptr[8*9] = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
- wsptr[8*6] = (int) RIGHT_SHIFT(tmp26 + tmp16, CONST_BITS-PASS1_BITS);
- wsptr[8*8] = (int) RIGHT_SHIFT(tmp26 - tmp16, CONST_BITS-PASS1_BITS);
- wsptr[8*7] = (int) RIGHT_SHIFT(tmp27, CONST_BITS-PASS1_BITS);
+ wsptr[8 * 0] = (int)RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 14] = (int)RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 1] = (int)RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 13] = (int)RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 2] = (int)RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 12] = (int)RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 3] = (int)RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 11] = (int)RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 4] = (int)RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 10] = (int)RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 5] = (int)RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 9] = (int)RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 6] = (int)RIGHT_SHIFT(tmp26 + tmp16, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 8] = (int)RIGHT_SHIFT(tmp26 - tmp16, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 7] = (int)RIGHT_SHIFT(tmp27, CONST_BITS - PASS1_BITS);
}
/* Pass 2: process 15 rows from work array, store into output array. */
@@ -2246,12 +2246,12 @@ jpeg_idct_15x15 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Even part */
/* Add fudge factor here for final descale. */
- z1 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
+ z1 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
z1 = LEFT_SHIFT(z1, CONST_BITS);
- z2 = (JLONG) wsptr[2];
- z3 = (JLONG) wsptr[4];
- z4 = (JLONG) wsptr[6];
+ z2 = (JLONG)wsptr[2];
+ z3 = (JLONG)wsptr[4];
+ z4 = (JLONG)wsptr[6];
tmp10 = MULTIPLY(z4, FIX(0.437016024)); /* c12 */
tmp11 = MULTIPLY(z4, FIX(1.144122806)); /* c6 */
@@ -2286,19 +2286,19 @@ jpeg_idct_15x15 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Odd part */
- z1 = (JLONG) wsptr[1];
- z2 = (JLONG) wsptr[3];
- z4 = (JLONG) wsptr[5];
+ z1 = (JLONG)wsptr[1];
+ z2 = (JLONG)wsptr[3];
+ z4 = (JLONG)wsptr[5];
z3 = MULTIPLY(z4, FIX(1.224744871)); /* c5 */
- z4 = (JLONG) wsptr[7];
+ z4 = (JLONG)wsptr[7];
tmp13 = z2 - z4;
tmp15 = MULTIPLY(z1 + tmp13, FIX(0.831253876)); /* c9 */
tmp11 = tmp15 + MULTIPLY(z1, FIX(0.513743148)); /* c3-c9 */
tmp14 = tmp15 - MULTIPLY(tmp13, FIX(2.176250899)); /* c3+c9 */
- tmp13 = MULTIPLY(z2, - FIX(0.831253876)); /* -c9 */
- tmp15 = MULTIPLY(z2, - FIX(1.344997024)); /* -c3 */
+ tmp13 = MULTIPLY(z2, -FIX(0.831253876)); /* -c9 */
+ tmp15 = MULTIPLY(z2, -FIX(1.344997024)); /* -c3 */
z2 = z1 - z4;
tmp12 = z3 + MULTIPLY(z2, FIX(1.406466353)); /* c1 */
@@ -2311,51 +2311,51 @@ jpeg_idct_15x15 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Final output stage */
- outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[14] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp16,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp16,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp27,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
+ outptr[0] = range_limit[(int)RIGHT_SHIFT(tmp20 + tmp10,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[14] = range_limit[(int)RIGHT_SHIFT(tmp20 - tmp10,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[1] = range_limit[(int)RIGHT_SHIFT(tmp21 + tmp11,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[13] = range_limit[(int)RIGHT_SHIFT(tmp21 - tmp11,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[2] = range_limit[(int)RIGHT_SHIFT(tmp22 + tmp12,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[12] = range_limit[(int)RIGHT_SHIFT(tmp22 - tmp12,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[3] = range_limit[(int)RIGHT_SHIFT(tmp23 + tmp13,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[11] = range_limit[(int)RIGHT_SHIFT(tmp23 - tmp13,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[4] = range_limit[(int)RIGHT_SHIFT(tmp24 + tmp14,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[10] = range_limit[(int)RIGHT_SHIFT(tmp24 - tmp14,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[5] = range_limit[(int)RIGHT_SHIFT(tmp25 + tmp15,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[9] = range_limit[(int)RIGHT_SHIFT(tmp25 - tmp15,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[6] = range_limit[(int)RIGHT_SHIFT(tmp26 + tmp16,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[8] = range_limit[(int)RIGHT_SHIFT(tmp26 - tmp16,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[7] = range_limit[(int)RIGHT_SHIFT(tmp27,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
wsptr += 8; /* advance pointer to next row */
}
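
A pattern worth noting across these routines: outputs are produced as sum/difference pairs (k, N-1-k) of an even-part and an odd-part accumulator, and when N is odd, as in the 13x13 and 15x15 cases above, the center sample has no partner. That is why each of those output stages ends with a single unpaired store (tmp26 and tmp27 respectively). A standalone sketch of the pairing, with illustrative names not taken from the patch:

    static void butterfly_outputs_sketch(const long *even, const long *odd,
                                         int *out, int n)
    {
      int k;
      for (k = 0; k < n / 2; k++) {
        out[k]         = (int)(even[k] + odd[k]);
        out[n - 1 - k] = (int)(even[k] - odd[k]);
      }
      if (n & 1)                        /* odd N: lone center output */
        out[n / 2] = (int)even[n / 2];
    }
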
@@ -2371,9 +2371,9 @@ jpeg_idct_15x15 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
*/
GLOBAL(void)
-jpeg_idct_16x16 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block,
- JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_16x16(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
{
JLONG tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13;
JLONG tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
@@ -2384,23 +2384,23 @@ jpeg_idct_16x16 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
JSAMPROW outptr;
JSAMPLE *range_limit = IDCT_range_limit(cinfo);
int ctr;
- int workspace[8*16]; /* buffers data between passes */
+ int workspace[8 * 16]; /* buffers data between passes */
SHIFT_TEMPS
/* Pass 1: process columns from input, store into work array. */
inptr = coef_block;
- quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+ quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
wsptr = workspace;
for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
/* Even part */
- tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+ tmp0 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
/* Add fudge factor here for final descale. */
- tmp0 += 1 << (CONST_BITS-PASS1_BITS-1);
+ tmp0 += ONE << (CONST_BITS - PASS1_BITS - 1);
- z1 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
tmp1 = MULTIPLY(z1, FIX(1.306562965)); /* c4[16] = c2[8] */
tmp2 = MULTIPLY(z1, FIX_0_541196100); /* c12[16] = c6[8] */
@@ -2409,8 +2409,8 @@ jpeg_idct_16x16 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
tmp12 = tmp0 + tmp2;
tmp13 = tmp0 - tmp2;
- z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
- z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
z3 = z1 - z2;
z4 = MULTIPLY(z3, FIX(0.275899379)); /* c14[16] = c7[8] */
z3 = MULTIPLY(z3, FIX(1.387039845)); /* c2[16] = c1[8] */
@@ -2431,10 +2431,10 @@ jpeg_idct_16x16 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Odd part */
- z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
- z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
- z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
- z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
+ z4 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
tmp11 = z1 + z3;
@@ -2455,13 +2455,13 @@ jpeg_idct_16x16 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
tmp11 += z1 - MULTIPLY(z3, FIX(0.766367282)); /* c1+c11-c9-c13 */
tmp12 += z1 + MULTIPLY(z2, FIX(1.971951411)); /* c1+c5+c13-c7 */
z2 += z4;
- z1 = MULTIPLY(z2, - FIX(0.666655658)); /* -c11 */
+ z1 = MULTIPLY(z2, -FIX(0.666655658)); /* -c11 */
tmp1 += z1;
tmp3 += z1 + MULTIPLY(z4, FIX(1.065388962)); /* c3+c11+c15-c7 */
- z2 = MULTIPLY(z2, - FIX(1.247225013)); /* -c5 */
+ z2 = MULTIPLY(z2, -FIX(1.247225013)); /* -c5 */
tmp10 += z2 + MULTIPLY(z4, FIX(3.141271809)); /* c1+c5+c9-c13 */
tmp12 += z2;
- z2 = MULTIPLY(z3 + z4, - FIX(1.353318001)); /* -c3 */
+ z2 = MULTIPLY(z3 + z4, -FIX(1.353318001)); /* -c3 */
tmp2 += z2;
tmp3 += z2;
z2 = MULTIPLY(z4 - z3, FIX(0.410524528)); /* c13 */
@@ -2470,22 +2470,22 @@ jpeg_idct_16x16 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Final output stage */
- wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp0, CONST_BITS-PASS1_BITS);
- wsptr[8*15] = (int) RIGHT_SHIFT(tmp20 - tmp0, CONST_BITS-PASS1_BITS);
- wsptr[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp1, CONST_BITS-PASS1_BITS);
- wsptr[8*14] = (int) RIGHT_SHIFT(tmp21 - tmp1, CONST_BITS-PASS1_BITS);
- wsptr[8*2] = (int) RIGHT_SHIFT(tmp22 + tmp2, CONST_BITS-PASS1_BITS);
- wsptr[8*13] = (int) RIGHT_SHIFT(tmp22 - tmp2, CONST_BITS-PASS1_BITS);
- wsptr[8*3] = (int) RIGHT_SHIFT(tmp23 + tmp3, CONST_BITS-PASS1_BITS);
- wsptr[8*12] = (int) RIGHT_SHIFT(tmp23 - tmp3, CONST_BITS-PASS1_BITS);
- wsptr[8*4] = (int) RIGHT_SHIFT(tmp24 + tmp10, CONST_BITS-PASS1_BITS);
- wsptr[8*11] = (int) RIGHT_SHIFT(tmp24 - tmp10, CONST_BITS-PASS1_BITS);
- wsptr[8*5] = (int) RIGHT_SHIFT(tmp25 + tmp11, CONST_BITS-PASS1_BITS);
- wsptr[8*10] = (int) RIGHT_SHIFT(tmp25 - tmp11, CONST_BITS-PASS1_BITS);
- wsptr[8*6] = (int) RIGHT_SHIFT(tmp26 + tmp12, CONST_BITS-PASS1_BITS);
- wsptr[8*9] = (int) RIGHT_SHIFT(tmp26 - tmp12, CONST_BITS-PASS1_BITS);
- wsptr[8*7] = (int) RIGHT_SHIFT(tmp27 + tmp13, CONST_BITS-PASS1_BITS);
- wsptr[8*8] = (int) RIGHT_SHIFT(tmp27 - tmp13, CONST_BITS-PASS1_BITS);
+ wsptr[8 * 0] = (int)RIGHT_SHIFT(tmp20 + tmp0, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 15] = (int)RIGHT_SHIFT(tmp20 - tmp0, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 1] = (int)RIGHT_SHIFT(tmp21 + tmp1, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 14] = (int)RIGHT_SHIFT(tmp21 - tmp1, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 2] = (int)RIGHT_SHIFT(tmp22 + tmp2, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 13] = (int)RIGHT_SHIFT(tmp22 - tmp2, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 3] = (int)RIGHT_SHIFT(tmp23 + tmp3, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 12] = (int)RIGHT_SHIFT(tmp23 - tmp3, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 4] = (int)RIGHT_SHIFT(tmp24 + tmp10, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 11] = (int)RIGHT_SHIFT(tmp24 - tmp10, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 5] = (int)RIGHT_SHIFT(tmp25 + tmp11, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 10] = (int)RIGHT_SHIFT(tmp25 - tmp11, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 6] = (int)RIGHT_SHIFT(tmp26 + tmp12, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 9] = (int)RIGHT_SHIFT(tmp26 - tmp12, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 7] = (int)RIGHT_SHIFT(tmp27 + tmp13, CONST_BITS - PASS1_BITS);
+ wsptr[8 * 8] = (int)RIGHT_SHIFT(tmp27 - tmp13, CONST_BITS - PASS1_BITS);
}
/* Pass 2: process 16 rows from work array, store into output array. */
@@ -2497,10 +2497,10 @@ jpeg_idct_16x16 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Even part */
/* Add fudge factor here for final descale. */
- tmp0 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
+ tmp0 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
- z1 = (JLONG) wsptr[4];
+ z1 = (JLONG)wsptr[4];
tmp1 = MULTIPLY(z1, FIX(1.306562965)); /* c4[16] = c2[8] */
tmp2 = MULTIPLY(z1, FIX_0_541196100); /* c12[16] = c6[8] */
@@ -2509,8 +2509,8 @@ jpeg_idct_16x16 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
tmp12 = tmp0 + tmp2;
tmp13 = tmp0 - tmp2;
- z1 = (JLONG) wsptr[2];
- z2 = (JLONG) wsptr[6];
+ z1 = (JLONG)wsptr[2];
+ z2 = (JLONG)wsptr[6];
z3 = z1 - z2;
z4 = MULTIPLY(z3, FIX(0.275899379)); /* c14[16] = c7[8] */
z3 = MULTIPLY(z3, FIX(1.387039845)); /* c2[16] = c1[8] */
@@ -2531,10 +2531,10 @@ jpeg_idct_16x16 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Odd part */
- z1 = (JLONG) wsptr[1];
- z2 = (JLONG) wsptr[3];
- z3 = (JLONG) wsptr[5];
- z4 = (JLONG) wsptr[7];
+ z1 = (JLONG)wsptr[1];
+ z2 = (JLONG)wsptr[3];
+ z3 = (JLONG)wsptr[5];
+ z4 = (JLONG)wsptr[7];
tmp11 = z1 + z3;
@@ -2555,13 +2555,13 @@ jpeg_idct_16x16 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
tmp11 += z1 - MULTIPLY(z3, FIX(0.766367282)); /* c1+c11-c9-c13 */
tmp12 += z1 + MULTIPLY(z2, FIX(1.971951411)); /* c1+c5+c13-c7 */
z2 += z4;
- z1 = MULTIPLY(z2, - FIX(0.666655658)); /* -c11 */
+ z1 = MULTIPLY(z2, -FIX(0.666655658)); /* -c11 */
tmp1 += z1;
tmp3 += z1 + MULTIPLY(z4, FIX(1.065388962)); /* c3+c11+c15-c7 */
- z2 = MULTIPLY(z2, - FIX(1.247225013)); /* -c5 */
+ z2 = MULTIPLY(z2, -FIX(1.247225013)); /* -c5 */
tmp10 += z2 + MULTIPLY(z4, FIX(3.141271809)); /* c1+c5+c9-c13 */
tmp12 += z2;
- z2 = MULTIPLY(z3 + z4, - FIX(1.353318001)); /* -c3 */
+ z2 = MULTIPLY(z3 + z4, -FIX(1.353318001)); /* -c3 */
tmp2 += z2;
tmp3 += z2;
z2 = MULTIPLY(z4 - z3, FIX(0.410524528)); /* c13 */
@@ -2570,54 +2570,54 @@ jpeg_idct_16x16 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Final output stage */
- outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp0,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[15] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp0,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp1,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[14] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp1,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp2,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp2,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp3,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp3,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp10,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp10,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp11,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp11,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp12,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp12,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp27 + tmp13,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
- outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp27 - tmp13,
- CONST_BITS+PASS1_BITS+3)
- & RANGE_MASK];
+ outptr[0] = range_limit[(int)RIGHT_SHIFT(tmp20 + tmp0,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[15] = range_limit[(int)RIGHT_SHIFT(tmp20 - tmp0,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[1] = range_limit[(int)RIGHT_SHIFT(tmp21 + tmp1,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[14] = range_limit[(int)RIGHT_SHIFT(tmp21 - tmp1,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[2] = range_limit[(int)RIGHT_SHIFT(tmp22 + tmp2,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[13] = range_limit[(int)RIGHT_SHIFT(tmp22 - tmp2,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[3] = range_limit[(int)RIGHT_SHIFT(tmp23 + tmp3,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[12] = range_limit[(int)RIGHT_SHIFT(tmp23 - tmp3,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[4] = range_limit[(int)RIGHT_SHIFT(tmp24 + tmp10,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[11] = range_limit[(int)RIGHT_SHIFT(tmp24 - tmp10,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[5] = range_limit[(int)RIGHT_SHIFT(tmp25 + tmp11,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[10] = range_limit[(int)RIGHT_SHIFT(tmp25 - tmp11,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[6] = range_limit[(int)RIGHT_SHIFT(tmp26 + tmp12,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[9] = range_limit[(int)RIGHT_SHIFT(tmp26 - tmp12,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[7] = range_limit[(int)RIGHT_SHIFT(tmp27 + tmp13,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
+ outptr[8] = range_limit[(int)RIGHT_SHIFT(tmp27 - tmp13,
+ CONST_BITS + PASS1_BITS + 3) &
+ RANGE_MASK];
wsptr += 8; /* advance pointer to next row */
}
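
Apart from whitespace and line wrapping, the only token-level change visible in the jidctint.c hunks above is in jpeg_idct_16x16, where the fudge-factor shift swaps the plain literal 1 for the JLONG-typed ONE macro. The fudge factor itself is the standard round-to-nearest trick for fixed-point descaling: add half of the final divisor before shifting down. A self-contained sketch, assuming the Q13 scale (CONST_BITS == 13) used throughout this file:

    #include <stdio.h>

    #define SK_ONE         1L
    #define SK_CONST_BITS  13
    #define SK_DESCALE(x, n)  (((x) + (SK_ONE << ((n) - 1))) >> (n))

    int main(void)
    {
      /* 1.5 in Q13 is 12288; the rounding descale yields 2, not 1 */
      long q13 = (long)(1.5 * (SK_ONE << SK_CONST_BITS));
      printf("%ld\n", SK_DESCALE(q13, SK_CONST_BITS));
      return 0;
    }
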
diff --git a/media/libjpeg/jidctred.c b/media/libjpeg/jidctred.c
index 7a81803b8d..1dd65a94d9 100644
--- a/media/libjpeg/jidctred.c
+++ b/media/libjpeg/jidctred.c
@@ -1,7 +1,7 @@
/*
* jidctred.c
*
- * This file was part of the Independent JPEG Group's software.
+ * This file was part of the Independent JPEG Group's software:
* Copyright (C) 1994-1998, Thomas G. Lane.
* libjpeg-turbo Modifications:
* Copyright (C) 2015, D. R. Commander.
@@ -58,20 +58,20 @@
*/
#if CONST_BITS == 13
-#define FIX_0_211164243 ((JLONG) 1730) /* FIX(0.211164243) */
-#define FIX_0_509795579 ((JLONG) 4176) /* FIX(0.509795579) */
-#define FIX_0_601344887 ((JLONG) 4926) /* FIX(0.601344887) */
-#define FIX_0_720959822 ((JLONG) 5906) /* FIX(0.720959822) */
-#define FIX_0_765366865 ((JLONG) 6270) /* FIX(0.765366865) */
-#define FIX_0_850430095 ((JLONG) 6967) /* FIX(0.850430095) */
-#define FIX_0_899976223 ((JLONG) 7373) /* FIX(0.899976223) */
-#define FIX_1_061594337 ((JLONG) 8697) /* FIX(1.061594337) */
-#define FIX_1_272758580 ((JLONG) 10426) /* FIX(1.272758580) */
-#define FIX_1_451774981 ((JLONG) 11893) /* FIX(1.451774981) */
-#define FIX_1_847759065 ((JLONG) 15137) /* FIX(1.847759065) */
-#define FIX_2_172734803 ((JLONG) 17799) /* FIX(2.172734803) */
-#define FIX_2_562915447 ((JLONG) 20995) /* FIX(2.562915447) */
-#define FIX_3_624509785 ((JLONG) 29692) /* FIX(3.624509785) */
+#define FIX_0_211164243 ((JLONG)1730) /* FIX(0.211164243) */
+#define FIX_0_509795579 ((JLONG)4176) /* FIX(0.509795579) */
+#define FIX_0_601344887 ((JLONG)4926) /* FIX(0.601344887) */
+#define FIX_0_720959822 ((JLONG)5906) /* FIX(0.720959822) */
+#define FIX_0_765366865 ((JLONG)6270) /* FIX(0.765366865) */
+#define FIX_0_850430095 ((JLONG)6967) /* FIX(0.850430095) */
+#define FIX_0_899976223 ((JLONG)7373) /* FIX(0.899976223) */
+#define FIX_1_061594337 ((JLONG)8697) /* FIX(1.061594337) */
+#define FIX_1_272758580 ((JLONG)10426) /* FIX(1.272758580) */
+#define FIX_1_451774981 ((JLONG)11893) /* FIX(1.451774981) */
+#define FIX_1_847759065 ((JLONG)15137) /* FIX(1.847759065) */
+#define FIX_2_172734803 ((JLONG)17799) /* FIX(2.172734803) */
+#define FIX_2_562915447 ((JLONG)20995) /* FIX(2.562915447) */
+#define FIX_3_624509785 ((JLONG)29692) /* FIX(3.624509785) */
#else
#define FIX_0_211164243 FIX(0.211164243)
#define FIX_0_509795579 FIX(0.509795579)
@@ -98,9 +98,9 @@
*/
#if BITS_IN_JSAMPLE == 8
-#define MULTIPLY(var,const) MULTIPLY16C16(var,const)
+#define MULTIPLY(var, const) MULTIPLY16C16(var, const)
#else
-#define MULTIPLY(var,const) ((var) * (const))
+#define MULTIPLY(var, const) ((var) * (const))
#endif
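
The reformatted constants above are precomputed FIX(x) values for the CONST_BITS == 13 build, i.e. each x scaled by 2^13 = 8192 and rounded to the nearest integer. A quick standalone check of that reading, under the assumption that it matches the FIX() definition in jdct.h; this is illustration, not part of the patch:

    #include <assert.h>

    #define Q13(x)  ((long)((x) * 8192.0 + 0.5))

    int main(void)
    {
      assert(Q13(0.211164243) ==  1730);
      assert(Q13(1.847759065) == 15137);
      assert(Q13(3.624509785) == 29692);
      return 0;
    }
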
@@ -109,7 +109,7 @@
* are 16 bits or less, so either int or short multiply will work.
*/
-#define DEQUANTIZE(coef,quantval) (((ISLOW_MULT_TYPE) (coef)) * (quantval))
+#define DEQUANTIZE(coef, quantval) (((ISLOW_MULT_TYPE)(coef)) * (quantval))
/*
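
DEQUANTIZE, reformatted just above, undoes the encoder's quantization before the IDCT runs: each stored coefficient is multiplied by its per-frequency quantization-table entry, and the surrounding comment notes both operands fit in 16 bits, so a plain multiply suffices. A minimal sketch with SK_MULT_TYPE standing in for ISLOW_MULT_TYPE; the names are hypothetical:

    typedef short SK_MULT_TYPE;    /* stands in for ISLOW_MULT_TYPE */

    static long dequantize_sketch(short coef, SK_MULT_TYPE quantval)
    {
      /* 16-bit x 16-bit product fits comfortably in a long */
      return (long)(((SK_MULT_TYPE)coef) * quantval);
    }
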
@@ -118,9 +118,9 @@
*/
GLOBAL(void)
-jpeg_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block,
- JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
{
JLONG tmp0, tmp2, tmp10, tmp12;
JLONG z1, z2, z3, z4;
@@ -130,69 +130,73 @@ jpeg_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
JSAMPROW outptr;
JSAMPLE *range_limit = IDCT_range_limit(cinfo);
int ctr;
- int workspace[DCTSIZE*4]; /* buffers data between passes */
+ int workspace[DCTSIZE * 4]; /* buffers data between passes */
SHIFT_TEMPS
/* Pass 1: process columns from input, store into work array. */
inptr = coef_block;
- quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+ quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
wsptr = workspace;
for (ctr = DCTSIZE; ctr > 0; inptr++, quantptr++, wsptr++, ctr--) {
/* Don't bother to process column 4, because second pass won't use it */
- if (ctr == DCTSIZE-4)
+ if (ctr == DCTSIZE - 4)
continue;
- if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&
- inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*5] == 0 &&
- inptr[DCTSIZE*6] == 0 && inptr[DCTSIZE*7] == 0) {
+ if (inptr[DCTSIZE * 1] == 0 && inptr[DCTSIZE * 2] == 0 &&
+ inptr[DCTSIZE * 3] == 0 && inptr[DCTSIZE * 5] == 0 &&
+ inptr[DCTSIZE * 6] == 0 && inptr[DCTSIZE * 7] == 0) {
/* AC terms all zero; we need not examine term 4 for 4x4 output */
- int dcval = LEFT_SHIFT(DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]),
- PASS1_BITS);
+ int dcval = LEFT_SHIFT(DEQUANTIZE(inptr[DCTSIZE * 0],
+ quantptr[DCTSIZE * 0]), PASS1_BITS);
- wsptr[DCTSIZE*0] = dcval;
- wsptr[DCTSIZE*1] = dcval;
- wsptr[DCTSIZE*2] = dcval;
- wsptr[DCTSIZE*3] = dcval;
+ wsptr[DCTSIZE * 0] = dcval;
+ wsptr[DCTSIZE * 1] = dcval;
+ wsptr[DCTSIZE * 2] = dcval;
+ wsptr[DCTSIZE * 3] = dcval;
continue;
}
/* Even part */
- tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
- tmp0 = LEFT_SHIFT(tmp0, CONST_BITS+1);
+ tmp0 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
+ tmp0 = LEFT_SHIFT(tmp0, CONST_BITS + 1);
- z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
- z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
- tmp2 = MULTIPLY(z2, FIX_1_847759065) + MULTIPLY(z3, - FIX_0_765366865);
+ tmp2 = MULTIPLY(z2, FIX_1_847759065) + MULTIPLY(z3, -FIX_0_765366865);
tmp10 = tmp0 + tmp2;
tmp12 = tmp0 - tmp2;
/* Odd part */
- z1 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
- z2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
- z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
- z4 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
+ z2 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
+ z3 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
+ z4 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
- tmp0 = MULTIPLY(z1, - FIX_0_211164243) /* sqrt(2) * (c3-c1) */
- + MULTIPLY(z2, FIX_1_451774981) /* sqrt(2) * (c3+c7) */
- + MULTIPLY(z3, - FIX_2_172734803) /* sqrt(2) * (-c1-c5) */
- + MULTIPLY(z4, FIX_1_061594337); /* sqrt(2) * (c5+c7) */
+ tmp0 = MULTIPLY(z1, -FIX_0_211164243) + /* sqrt(2) * ( c3-c1) */
+ MULTIPLY(z2, FIX_1_451774981) + /* sqrt(2) * ( c3+c7) */
+ MULTIPLY(z3, -FIX_2_172734803) + /* sqrt(2) * (-c1-c5) */
+ MULTIPLY(z4, FIX_1_061594337); /* sqrt(2) * ( c5+c7) */
- tmp2 = MULTIPLY(z1, - FIX_0_509795579) /* sqrt(2) * (c7-c5) */
- + MULTIPLY(z2, - FIX_0_601344887) /* sqrt(2) * (c5-c1) */
- + MULTIPLY(z3, FIX_0_899976223) /* sqrt(2) * (c3-c7) */
- + MULTIPLY(z4, FIX_2_562915447); /* sqrt(2) * (c1+c3) */
+ tmp2 = MULTIPLY(z1, -FIX_0_509795579) + /* sqrt(2) * (c7-c5) */
+ MULTIPLY(z2, -FIX_0_601344887) + /* sqrt(2) * (c5-c1) */
+ MULTIPLY(z3, FIX_0_899976223) + /* sqrt(2) * (c3-c7) */
+ MULTIPLY(z4, FIX_2_562915447); /* sqrt(2) * (c1+c3) */
/* Final output stage */
- wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp2, CONST_BITS-PASS1_BITS+1);
- wsptr[DCTSIZE*3] = (int) DESCALE(tmp10 - tmp2, CONST_BITS-PASS1_BITS+1);
- wsptr[DCTSIZE*1] = (int) DESCALE(tmp12 + tmp0, CONST_BITS-PASS1_BITS+1);
- wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 - tmp0, CONST_BITS-PASS1_BITS+1);
+ wsptr[DCTSIZE * 0] =
+ (int)DESCALE(tmp10 + tmp2, CONST_BITS - PASS1_BITS + 1);
+ wsptr[DCTSIZE * 3] =
+ (int)DESCALE(tmp10 - tmp2, CONST_BITS - PASS1_BITS + 1);
+ wsptr[DCTSIZE * 1] =
+ (int)DESCALE(tmp12 + tmp0, CONST_BITS - PASS1_BITS + 1);
+ wsptr[DCTSIZE * 2] =
+ (int)DESCALE(tmp12 - tmp0, CONST_BITS - PASS1_BITS + 1);
}
/* Pass 2: process 4 rows from work array, store into output array. */
@@ -206,8 +210,8 @@ jpeg_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
if (wsptr[1] == 0 && wsptr[2] == 0 && wsptr[3] == 0 &&
wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) {
/* AC terms all zero */
- JSAMPLE dcval = range_limit[(int) DESCALE((JLONG) wsptr[0], PASS1_BITS+3)
- & RANGE_MASK];
+ JSAMPLE dcval = range_limit[(int)DESCALE((JLONG)wsptr[0],
+ PASS1_BITS + 3) & RANGE_MASK];
outptr[0] = dcval;
outptr[1] = dcval;
@@ -221,45 +225,45 @@ jpeg_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Even part */
- tmp0 = LEFT_SHIFT((JLONG) wsptr[0], CONST_BITS+1);
+ tmp0 = LEFT_SHIFT((JLONG)wsptr[0], CONST_BITS + 1);
- tmp2 = MULTIPLY((JLONG) wsptr[2], FIX_1_847759065)
- + MULTIPLY((JLONG) wsptr[6], - FIX_0_765366865);
+ tmp2 = MULTIPLY((JLONG)wsptr[2], FIX_1_847759065) +
+ MULTIPLY((JLONG)wsptr[6], -FIX_0_765366865);
tmp10 = tmp0 + tmp2;
tmp12 = tmp0 - tmp2;
/* Odd part */
- z1 = (JLONG) wsptr[7];
- z2 = (JLONG) wsptr[5];
- z3 = (JLONG) wsptr[3];
- z4 = (JLONG) wsptr[1];
+ z1 = (JLONG)wsptr[7];
+ z2 = (JLONG)wsptr[5];
+ z3 = (JLONG)wsptr[3];
+ z4 = (JLONG)wsptr[1];
- tmp0 = MULTIPLY(z1, - FIX_0_211164243) /* sqrt(2) * (c3-c1) */
- + MULTIPLY(z2, FIX_1_451774981) /* sqrt(2) * (c3+c7) */
- + MULTIPLY(z3, - FIX_2_172734803) /* sqrt(2) * (-c1-c5) */
- + MULTIPLY(z4, FIX_1_061594337); /* sqrt(2) * (c5+c7) */
+ tmp0 = MULTIPLY(z1, -FIX_0_211164243) + /* sqrt(2) * ( c3-c1) */
+ MULTIPLY(z2, FIX_1_451774981) + /* sqrt(2) * ( c3+c7) */
+ MULTIPLY(z3, -FIX_2_172734803) + /* sqrt(2) * (-c1-c5) */
+ MULTIPLY(z4, FIX_1_061594337); /* sqrt(2) * ( c5+c7) */
- tmp2 = MULTIPLY(z1, - FIX_0_509795579) /* sqrt(2) * (c7-c5) */
- + MULTIPLY(z2, - FIX_0_601344887) /* sqrt(2) * (c5-c1) */
- + MULTIPLY(z3, FIX_0_899976223) /* sqrt(2) * (c3-c7) */
- + MULTIPLY(z4, FIX_2_562915447); /* sqrt(2) * (c1+c3) */
+ tmp2 = MULTIPLY(z1, -FIX_0_509795579) + /* sqrt(2) * (c7-c5) */
+ MULTIPLY(z2, -FIX_0_601344887) + /* sqrt(2) * (c5-c1) */
+ MULTIPLY(z3, FIX_0_899976223) + /* sqrt(2) * (c3-c7) */
+ MULTIPLY(z4, FIX_2_562915447); /* sqrt(2) * (c1+c3) */
/* Final output stage */
- outptr[0] = range_limit[(int) DESCALE(tmp10 + tmp2,
- CONST_BITS+PASS1_BITS+3+1)
- & RANGE_MASK];
- outptr[3] = range_limit[(int) DESCALE(tmp10 - tmp2,
- CONST_BITS+PASS1_BITS+3+1)
- & RANGE_MASK];
- outptr[1] = range_limit[(int) DESCALE(tmp12 + tmp0,
- CONST_BITS+PASS1_BITS+3+1)
- & RANGE_MASK];
- outptr[2] = range_limit[(int) DESCALE(tmp12 - tmp0,
- CONST_BITS+PASS1_BITS+3+1)
- & RANGE_MASK];
+ outptr[0] = range_limit[(int)DESCALE(tmp10 + tmp2,
+ CONST_BITS + PASS1_BITS + 3 + 1) &
+ RANGE_MASK];
+ outptr[3] = range_limit[(int)DESCALE(tmp10 - tmp2,
+ CONST_BITS + PASS1_BITS + 3 + 1) &
+ RANGE_MASK];
+ outptr[1] = range_limit[(int)DESCALE(tmp12 + tmp0,
+ CONST_BITS + PASS1_BITS + 3 + 1) &
+ RANGE_MASK];
+ outptr[2] = range_limit[(int)DESCALE(tmp12 - tmp0,
+ CONST_BITS + PASS1_BITS + 3 + 1) &
+ RANGE_MASK];
wsptr += DCTSIZE; /* advance pointer to next row */
}
@@ -272,9 +276,9 @@ jpeg_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
*/
GLOBAL(void)
-jpeg_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block,
- JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
{
JLONG tmp0, tmp10, z1;
JCOEFPTR inptr;
@@ -283,50 +287,52 @@ jpeg_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
JSAMPROW outptr;
JSAMPLE *range_limit = IDCT_range_limit(cinfo);
int ctr;
- int workspace[DCTSIZE*2]; /* buffers data between passes */
+ int workspace[DCTSIZE * 2]; /* buffers data between passes */
SHIFT_TEMPS
/* Pass 1: process columns from input, store into work array. */
inptr = coef_block;
- quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+ quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
wsptr = workspace;
for (ctr = DCTSIZE; ctr > 0; inptr++, quantptr++, wsptr++, ctr--) {
/* Don't bother to process columns 2,4,6 */
- if (ctr == DCTSIZE-2 || ctr == DCTSIZE-4 || ctr == DCTSIZE-6)
+ if (ctr == DCTSIZE - 2 || ctr == DCTSIZE - 4 || ctr == DCTSIZE - 6)
continue;
- if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*3] == 0 &&
- inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*7] == 0) {
+ if (inptr[DCTSIZE * 1] == 0 && inptr[DCTSIZE * 3] == 0 &&
+ inptr[DCTSIZE * 5] == 0 && inptr[DCTSIZE * 7] == 0) {
/* AC terms all zero; we need not examine terms 2,4,6 for 2x2 output */
- int dcval = LEFT_SHIFT(DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]),
- PASS1_BITS);
+ int dcval = LEFT_SHIFT(DEQUANTIZE(inptr[DCTSIZE * 0],
+ quantptr[DCTSIZE * 0]), PASS1_BITS);
- wsptr[DCTSIZE*0] = dcval;
- wsptr[DCTSIZE*1] = dcval;
+ wsptr[DCTSIZE * 0] = dcval;
+ wsptr[DCTSIZE * 1] = dcval;
continue;
}
/* Even part */
- z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
- tmp10 = LEFT_SHIFT(z1, CONST_BITS+2);
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
+ tmp10 = LEFT_SHIFT(z1, CONST_BITS + 2);
/* Odd part */
- z1 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
- tmp0 = MULTIPLY(z1, - FIX_0_720959822); /* sqrt(2) * (c7-c5+c3-c1) */
- z1 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
- tmp0 += MULTIPLY(z1, FIX_0_850430095); /* sqrt(2) * (-c1+c3+c5+c7) */
- z1 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
- tmp0 += MULTIPLY(z1, - FIX_1_272758580); /* sqrt(2) * (-c1+c3-c5-c7) */
- z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
- tmp0 += MULTIPLY(z1, FIX_3_624509785); /* sqrt(2) * (c1+c3+c5+c7) */
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
+ tmp0 = MULTIPLY(z1, -FIX_0_720959822); /* sqrt(2) * ( c7-c5+c3-c1) */
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
+ tmp0 += MULTIPLY(z1, FIX_0_850430095); /* sqrt(2) * (-c1+c3+c5+c7) */
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
+ tmp0 += MULTIPLY(z1, -FIX_1_272758580); /* sqrt(2) * (-c1+c3-c5-c7) */
+ z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
+ tmp0 += MULTIPLY(z1, FIX_3_624509785); /* sqrt(2) * ( c1+c3+c5+c7) */
/* Final output stage */
- wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp0, CONST_BITS-PASS1_BITS+2);
- wsptr[DCTSIZE*1] = (int) DESCALE(tmp10 - tmp0, CONST_BITS-PASS1_BITS+2);
+ wsptr[DCTSIZE * 0] =
+ (int)DESCALE(tmp10 + tmp0, CONST_BITS - PASS1_BITS + 2);
+ wsptr[DCTSIZE * 1] =
+ (int)DESCALE(tmp10 - tmp0, CONST_BITS - PASS1_BITS + 2);
}
/* Pass 2: process 2 rows from work array, store into output array. */
@@ -339,8 +345,8 @@ jpeg_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
#ifndef NO_ZERO_ROW_TEST
if (wsptr[1] == 0 && wsptr[3] == 0 && wsptr[5] == 0 && wsptr[7] == 0) {
/* AC terms all zero */
- JSAMPLE dcval = range_limit[(int) DESCALE((JLONG) wsptr[0], PASS1_BITS+3)
- & RANGE_MASK];
+ JSAMPLE dcval = range_limit[(int)DESCALE((JLONG)wsptr[0],
+ PASS1_BITS + 3) & RANGE_MASK];
outptr[0] = dcval;
outptr[1] = dcval;
@@ -352,23 +358,23 @@ jpeg_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* Even part */
- tmp10 = LEFT_SHIFT((JLONG) wsptr[0], CONST_BITS+2);
+ tmp10 = LEFT_SHIFT((JLONG)wsptr[0], CONST_BITS + 2);
/* Odd part */
- tmp0 = MULTIPLY((JLONG) wsptr[7], - FIX_0_720959822) /* sqrt(2) * (c7-c5+c3-c1) */
- + MULTIPLY((JLONG) wsptr[5], FIX_0_850430095) /* sqrt(2) * (-c1+c3+c5+c7) */
- + MULTIPLY((JLONG) wsptr[3], - FIX_1_272758580) /* sqrt(2) * (-c1+c3-c5-c7) */
- + MULTIPLY((JLONG) wsptr[1], FIX_3_624509785); /* sqrt(2) * (c1+c3+c5+c7) */
+ tmp0 = MULTIPLY((JLONG)wsptr[7], -FIX_0_720959822) + /* sqrt(2) * ( c7-c5+c3-c1) */
+ MULTIPLY((JLONG)wsptr[5], FIX_0_850430095) + /* sqrt(2) * (-c1+c3+c5+c7) */
+ MULTIPLY((JLONG)wsptr[3], -FIX_1_272758580) + /* sqrt(2) * (-c1+c3-c5-c7) */
+ MULTIPLY((JLONG)wsptr[1], FIX_3_624509785); /* sqrt(2) * ( c1+c3+c5+c7) */
/* Final output stage */
- outptr[0] = range_limit[(int) DESCALE(tmp10 + tmp0,
- CONST_BITS+PASS1_BITS+3+2)
- & RANGE_MASK];
- outptr[1] = range_limit[(int) DESCALE(tmp10 - tmp0,
- CONST_BITS+PASS1_BITS+3+2)
- & RANGE_MASK];
+ outptr[0] = range_limit[(int)DESCALE(tmp10 + tmp0,
+ CONST_BITS + PASS1_BITS + 3 + 2) &
+ RANGE_MASK];
+ outptr[1] = range_limit[(int)DESCALE(tmp10 - tmp0,
+ CONST_BITS + PASS1_BITS + 3 + 2) &
+ RANGE_MASK];
wsptr += DCTSIZE; /* advance pointer to next row */
}
@@ -381,9 +387,9 @@ jpeg_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
*/
GLOBAL(void)
-jpeg_idct_1x1 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block,
- JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_1x1(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
{
int dcval;
ISLOW_MULT_TYPE *quantptr;
@@ -393,9 +399,9 @@ jpeg_idct_1x1 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
/* We hardly need an inverse DCT routine for this: just take the
* average pixel value, which is one-eighth of the DC coefficient.
*/
- quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+ quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
dcval = DEQUANTIZE(coef_block[0], quantptr[0]);
- dcval = (int) DESCALE((JLONG) dcval, 3);
+ dcval = (int)DESCALE((JLONG)dcval, 3);
output_buf[0][output_col] = range_limit[dcval & RANGE_MASK];
}
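
Note: the 1x1 case above needs no real transform, because the DC coefficient of an 8x8 DCT is eight times the mean sample value, so the output pixel is just the dequantized DC descaled by 3 bits. A minimal standalone sketch of that arithmetic, with DEQUANTIZE/DESCALE written out to mirror the jdct.h macros (the sample inputs are illustrative):

  /* Illustrative sketch of jpeg_idct_1x1()'s arithmetic. */
  #include <stdio.h>

  #define DEQUANTIZE(coef, quantval)  ((long)(coef) * (quantval))
  /* Rounding right shift, same idea as libjpeg's DESCALE. */
  #define DESCALE(x, n)  (((x) + (1L << ((n) - 1))) >> (n))

  int main(void)
  {
    long dc_coef = 92, quantval = 16;           /* made-up inputs */
    long dcval = DEQUANTIZE(dc_coef, quantval);
    /* One-eighth of the DC coefficient, rounded: the average pixel. */
    printf("%ld\n", DESCALE(dcval, 3));         /* (92*16 + 4) >> 3 = 184 */
    return 0;
  }
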
diff --git a/media/libjpeg/jinclude.h b/media/libjpeg/jinclude.h
index d461a1aa16..120614b25c 100644
--- a/media/libjpeg/jinclude.h
+++ b/media/libjpeg/jinclude.h
@@ -3,8 +3,8 @@
*
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1991-1994, Thomas G. Lane.
- * It was modified by The libjpeg-turbo Project to include only code relevant
- * to libjpeg-turbo.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2022, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@@ -17,68 +17,117 @@
* JPEG library. Most applications need only include jpeglib.h.
*/
+#ifndef __JINCLUDE_H__
+#define __JINCLUDE_H__
/* Include auto-config file to find out which system include files we need. */
#include "jconfig.h" /* auto configuration options */
+#include "jconfigint.h"
#define JCONFIG_INCLUDED /* so that jpeglib.h doesn't do it again */
/*
- * We need the NULL macro and size_t typedef.
- * On an ANSI-conforming system it is sufficient to include <stddef.h>.
- * Otherwise, we get them from <stdlib.h> or <stdio.h>; we may have to
- * pull in <sys/types.h> as well.
* Note that the core JPEG library does not require <stdio.h>;
* only the default error handler and data source/destination modules do.
* But we must pull it in because of the references to FILE in jpeglib.h.
* You can remove those references if you want to compile without <stdio.h>.
*/
-#ifdef HAVE_STDDEF_H
#include <stddef.h>
-#endif
-
-#ifdef HAVE_STDLIB_H
#include <stdlib.h>
-#endif
-
-#ifdef NEED_SYS_TYPES_H
-#include <sys/types.h>
-#endif
-
#include <stdio.h>
+#include <string.h>
/*
- * We need memory copying and zeroing functions, plus strncpy().
- * ANSI and System V implementations declare these in <string.h>.
- * BSD doesn't have the mem() functions, but it does have bcopy()/bzero().
- * Some systems may declare memset and memcpy in <memory.h>.
- *
- * NOTE: we assume the size parameters to these functions are of type size_t.
- * Change the casts in these macros if not!
+ * These macros/inline functions facilitate using Microsoft's "safe string"
+ * functions with Visual Studio builds without the need to scatter #ifdefs
+ * throughout the code base.
*/
-#ifdef NEED_BSD_STRINGS
-#include <strings.h>
-#define MEMZERO(target,size) bzero((void *)(target), (size_t)(size))
-#define MEMCOPY(dest,src,size) bcopy((const void *)(src), (void *)(dest), (size_t)(size))
+#ifndef NO_GETENV
-#else /* not BSD, assume ANSI/SysV string lib */
+#ifdef _MSC_VER
-#include <string.h>
-#define MEMZERO(target,size) memset((void *)(target), 0, (size_t)(size))
-#define MEMCOPY(dest,src,size) memcpy((void *)(dest), (const void *)(src), (size_t)(size))
+static INLINE int GETENV_S(char *buffer, size_t buffer_size, const char *name)
+{
+ size_t required_size;
-#endif
+ return (int)getenv_s(&required_size, buffer, buffer_size, name);
+}
-/*
- * The modules that use fread() and fwrite() always invoke them through
- * these macros. On some systems you may need to twiddle the argument casts.
- * CAUTION: argument order is different from underlying functions!
+#else /* _MSC_VER */
+
+#include <errno.h>
+
+/* This provides a similar interface to the Microsoft/C11 getenv_s() function,
+ * but other than parameter validation, it has no advantages over getenv().
+ */
+
+static INLINE int GETENV_S(char *buffer, size_t buffer_size, const char *name)
+{
+ char *env;
+
+ if (!buffer) {
+ if (buffer_size == 0)
+ return 0;
+ else
+ return (errno = EINVAL);
+ }
+ if (buffer_size == 0)
+ return (errno = EINVAL);
+ if (!name) {
+ *buffer = 0;
+ return 0;
+ }
+
+ env = getenv(name);
+  if (!env) {
+ *buffer = 0;
+ return 0;
+ }
+
+ if (strlen(env) + 1 > buffer_size) {
+ *buffer = 0;
+ return ERANGE;
+ }
+
+ strncpy(buffer, env, buffer_size);
+
+ return 0;
+}
+
+#endif /* _MSC_VER */
+
+#endif /* NO_GETENV */
+
+
+#ifndef NO_PUTENV
+
+#ifdef _WIN32
+
+#define PUTENV_S(name, value) _putenv_s(name, value)
+
+#else
+
+/* This provides a similar interface to the Microsoft _putenv_s() function, but
+ * other than parameter validation, it has no advantages over setenv().
*/
-#define JFREAD(file,buf,sizeofbuf) \
- ((size_t) fread((void *) (buf), (size_t) 1, (size_t) (sizeofbuf), (file)))
-#define JFWRITE(file,buf,sizeofbuf) \
- ((size_t) fwrite((const void *) (buf), (size_t) 1, (size_t) (sizeofbuf), (file)))
+static INLINE int PUTENV_S(const char *name, const char *value)
+{
+ if (!name || !value)
+ return (errno = EINVAL);
+
+ setenv(name, value, 1);
+
+ return errno;
+}
+
+#endif /* _WIN32 */
+
+#endif /* NO_PUTENV */
+
+
+#endif /* __JINCLUDE_H__ */
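
A short usage sketch for the GETENV_S wrapper introduced above (the buffer size and variable name are illustrative; jmemmgr.c below consumes it the same way):

  /* Illustrative: portably read an environment variable via GETENV_S.
   * Returns 0 on success (including "not set"); EINVAL or ERANGE
   * signals bad arguments or an undersized buffer. */
  char value[30] = { 0 };

  if (!GETENV_S(value, sizeof(value), "JPEGMEM") && value[0] != '\0') {
    /* value now holds a NUL-terminated copy of the variable's contents */
  }
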
diff --git a/media/libjpeg/jmemmgr.c b/media/libjpeg/jmemmgr.c
index 2a8d8401f4..8f5a4ab1c7 100644
--- a/media/libjpeg/jmemmgr.c
+++ b/media/libjpeg/jmemmgr.c
@@ -4,7 +4,7 @@
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1991-1997, Thomas G. Lane.
* libjpeg-turbo Modifications:
- * Copyright (C) 2016, D. R. Commander.
+ * Copyright (C) 2016, 2021-2022, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@@ -32,18 +32,14 @@
#include "jinclude.h"
#include "jpeglib.h"
#include "jmemsys.h" /* import the system-dependent declarations */
+#if !defined(_MSC_VER) || _MSC_VER > 1600
#include <stdint.h>
-#include <limits.h> /* some NDKs define SIZE_MAX in limits.h */
-
-#ifndef NO_GETENV
-#ifndef HAVE_STDLIB_H /* <stdlib.h> should declare getenv() */
-extern char *getenv (const char *name);
-#endif
#endif
+#include <limits.h>
LOCAL(size_t)
-round_up_pow2 (size_t a, size_t b)
+round_up_pow2(size_t a, size_t b)
/* a rounded up to the next multiple of b, i.e. ceil(a/b)*b */
/* Assumes a >= 0, b > 0, and b is a power of 2 */
{
@@ -87,7 +83,9 @@ round_up_pow2 (size_t a, size_t b)
#ifndef WITH_SIMD
#define ALIGN_SIZE sizeof(double)
#else
-#define ALIGN_SIZE 16 /* Most SIMD implementations require this */
+#define ALIGN_SIZE 32 /* Most of the SIMD instructions we support require
+ 16-byte (128-bit) alignment, but AVX2 requires
+ 32-byte alignment. */
#endif
#endif
@@ -102,7 +100,7 @@ round_up_pow2 (size_t a, size_t b)
typedef struct small_pool_struct *small_pool_ptr;
typedef struct small_pool_struct {
- small_pool_ptr next; /* next in list of pools */
+ small_pool_ptr next; /* next in list of pools */
size_t bytes_used; /* how many bytes already used within pool */
size_t bytes_left; /* bytes still available in this pool */
} small_pool_hdr;
@@ -110,7 +108,7 @@ typedef struct small_pool_struct {
typedef struct large_pool_struct *large_pool_ptr;
typedef struct large_pool_struct {
- large_pool_ptr next; /* next in list of pools */
+ large_pool_ptr next; /* next in list of pools */
size_t bytes_used; /* how many bytes already used within pool */
size_t bytes_left; /* bytes still available in this pool */
} large_pool_hdr;
@@ -189,9 +187,9 @@ struct jvirt_barray_control {
#ifdef MEM_STATS /* optional extra stuff for statistics */
LOCAL(void)
-print_mem_stats (j_common_ptr cinfo, int pool_id)
+print_mem_stats(j_common_ptr cinfo, int pool_id)
{
- my_mem_ptr mem = (my_mem_ptr) cinfo->mem;
+ my_mem_ptr mem = (my_mem_ptr)cinfo->mem;
small_pool_ptr shdr_ptr;
large_pool_ptr lhdr_ptr;
@@ -204,15 +202,13 @@ print_mem_stats (j_common_ptr cinfo, int pool_id)
for (lhdr_ptr = mem->large_list[pool_id]; lhdr_ptr != NULL;
lhdr_ptr = lhdr_ptr->next) {
- fprintf(stderr, " Large chunk used %ld\n",
- (long) lhdr_ptr->bytes_used);
+ fprintf(stderr, " Large chunk used %ld\n", (long)lhdr_ptr->bytes_used);
}
for (shdr_ptr = mem->small_list[pool_id]; shdr_ptr != NULL;
shdr_ptr = shdr_ptr->next) {
fprintf(stderr, " Small chunk used %ld free %ld\n",
- (long) shdr_ptr->bytes_used,
- (long) shdr_ptr->bytes_left);
+ (long)shdr_ptr->bytes_used, (long)shdr_ptr->bytes_left);
}
}
@@ -220,7 +216,7 @@ print_mem_stats (j_common_ptr cinfo, int pool_id)
LOCAL(void)
-out_of_memory (j_common_ptr cinfo, int which)
+out_of_memory(j_common_ptr cinfo, int which)
/* Report an out-of-memory error and stop execution */
/* If we compiled MEM_STATS support, report alloc requests before dying */
{
@@ -248,26 +244,24 @@ out_of_memory (j_common_ptr cinfo, int which)
* adjustment.
*/
-static const size_t first_pool_slop[JPOOL_NUMPOOLS] =
-{
- 1600, /* first PERMANENT pool */
- 16000 /* first IMAGE pool */
+static const size_t first_pool_slop[JPOOL_NUMPOOLS] = {
+ 1600, /* first PERMANENT pool */
+ 16000 /* first IMAGE pool */
};
-static const size_t extra_pool_slop[JPOOL_NUMPOOLS] =
-{
- 0, /* additional PERMANENT pools */
- 5000 /* additional IMAGE pools */
+static const size_t extra_pool_slop[JPOOL_NUMPOOLS] = {
+ 0, /* additional PERMANENT pools */
+ 5000 /* additional IMAGE pools */
};
#define MIN_SLOP 50 /* greater than 0 to avoid futile looping */
METHODDEF(void *)
-alloc_small (j_common_ptr cinfo, int pool_id, size_t sizeofobject)
+alloc_small(j_common_ptr cinfo, int pool_id, size_t sizeofobject)
/* Allocate a "small" object */
{
- my_mem_ptr mem = (my_mem_ptr) cinfo->mem;
+ my_mem_ptr mem = (my_mem_ptr)cinfo->mem;
small_pool_ptr hdr_ptr, prev_hdr_ptr;
char *data_ptr;
size_t min_request, slop;
@@ -311,11 +305,11 @@ alloc_small (j_common_ptr cinfo, int pool_id, size_t sizeofobject)
else
slop = extra_pool_slop[pool_id];
/* Don't ask for more than MAX_ALLOC_CHUNK */
- if (slop > (size_t) (MAX_ALLOC_CHUNK-min_request))
- slop = (size_t) (MAX_ALLOC_CHUNK-min_request);
+ if (slop > (size_t)(MAX_ALLOC_CHUNK - min_request))
+ slop = (size_t)(MAX_ALLOC_CHUNK - min_request);
/* Try to get space, if fail reduce slop and try again */
for (;;) {
- hdr_ptr = (small_pool_ptr) jpeg_get_small(cinfo, min_request + slop);
+ hdr_ptr = (small_pool_ptr)jpeg_get_small(cinfo, min_request + slop);
if (hdr_ptr != NULL)
break;
slop /= 2;
@@ -334,7 +328,7 @@ alloc_small (j_common_ptr cinfo, int pool_id, size_t sizeofobject)
}
/* OK, allocate the object from the current pool */
- data_ptr = (char *) hdr_ptr; /* point to first data byte in pool... */
+ data_ptr = (char *)hdr_ptr; /* point to first data byte in pool... */
data_ptr += sizeof(small_pool_hdr); /* ...by skipping the header... */
if ((size_t)data_ptr % ALIGN_SIZE) /* ...and adjust for alignment */
data_ptr += ALIGN_SIZE - (size_t)data_ptr % ALIGN_SIZE;
@@ -342,7 +336,7 @@ alloc_small (j_common_ptr cinfo, int pool_id, size_t sizeofobject)
hdr_ptr->bytes_used += sizeofobject;
hdr_ptr->bytes_left -= sizeofobject;
- return (void *) data_ptr;
+ return (void *)data_ptr;
}
@@ -360,10 +354,10 @@ alloc_small (j_common_ptr cinfo, int pool_id, size_t sizeofobject)
*/
METHODDEF(void *)
-alloc_large (j_common_ptr cinfo, int pool_id, size_t sizeofobject)
+alloc_large(j_common_ptr cinfo, int pool_id, size_t sizeofobject)
/* Allocate a "large" object */
{
- my_mem_ptr mem = (my_mem_ptr) cinfo->mem;
+ my_mem_ptr mem = (my_mem_ptr)cinfo->mem;
large_pool_ptr hdr_ptr;
char *data_ptr;
@@ -388,9 +382,9 @@ alloc_large (j_common_ptr cinfo, int pool_id, size_t sizeofobject)
if (pool_id < 0 || pool_id >= JPOOL_NUMPOOLS)
ERREXIT1(cinfo, JERR_BAD_POOL_ID, pool_id); /* safety check */
- hdr_ptr = (large_pool_ptr) jpeg_get_large(cinfo, sizeofobject +
- sizeof(large_pool_hdr) +
- ALIGN_SIZE - 1);
+ hdr_ptr = (large_pool_ptr)jpeg_get_large(cinfo, sizeofobject +
+ sizeof(large_pool_hdr) +
+ ALIGN_SIZE - 1);
if (hdr_ptr == NULL)
out_of_memory(cinfo, 4); /* jpeg_get_large failed */
mem->total_space_allocated += sizeofobject + sizeof(large_pool_hdr) +
@@ -405,12 +399,12 @@ alloc_large (j_common_ptr cinfo, int pool_id, size_t sizeofobject)
hdr_ptr->bytes_left = 0;
mem->large_list[pool_id] = hdr_ptr;
- data_ptr = (char *) hdr_ptr; /* point to first data byte in pool... */
+ data_ptr = (char *)hdr_ptr; /* point to first data byte in pool... */
data_ptr += sizeof(small_pool_hdr); /* ...by skipping the header... */
if ((size_t)data_ptr % ALIGN_SIZE) /* ...and adjust for alignment */
data_ptr += ALIGN_SIZE - (size_t)data_ptr % ALIGN_SIZE;
- return (void *) data_ptr;
+ return (void *)data_ptr;
}
@@ -431,11 +425,11 @@ alloc_large (j_common_ptr cinfo, int pool_id, size_t sizeofobject)
*/
METHODDEF(JSAMPARRAY)
-alloc_sarray (j_common_ptr cinfo, int pool_id,
- JDIMENSION samplesperrow, JDIMENSION numrows)
+alloc_sarray(j_common_ptr cinfo, int pool_id, JDIMENSION samplesperrow,
+ JDIMENSION numrows)
/* Allocate a 2-D sample array */
{
- my_mem_ptr mem = (my_mem_ptr) cinfo->mem;
+ my_mem_ptr mem = (my_mem_ptr)cinfo->mem;
JSAMPARRAY result;
JSAMPROW workspace;
JDIMENSION rowsperchunk, currow, i;
@@ -454,27 +448,27 @@ alloc_sarray (j_common_ptr cinfo, int pool_id,
sizeof(JSAMPLE));
/* Calculate max # of rows allowed in one allocation chunk */
- ltemp = (MAX_ALLOC_CHUNK-sizeof(large_pool_hdr)) /
- ((long) samplesperrow * sizeof(JSAMPLE));
+ ltemp = (MAX_ALLOC_CHUNK - sizeof(large_pool_hdr)) /
+ ((long)samplesperrow * sizeof(JSAMPLE));
if (ltemp <= 0)
ERREXIT(cinfo, JERR_WIDTH_OVERFLOW);
- if (ltemp < (long) numrows)
- rowsperchunk = (JDIMENSION) ltemp;
+ if (ltemp < (long)numrows)
+ rowsperchunk = (JDIMENSION)ltemp;
else
rowsperchunk = numrows;
mem->last_rowsperchunk = rowsperchunk;
/* Get space for row pointers (small object) */
- result = (JSAMPARRAY) alloc_small(cinfo, pool_id,
- (size_t) (numrows * sizeof(JSAMPROW)));
+ result = (JSAMPARRAY)alloc_small(cinfo, pool_id,
+ (size_t)(numrows * sizeof(JSAMPROW)));
/* Get the rows themselves (large objects) */
currow = 0;
while (currow < numrows) {
rowsperchunk = MIN(rowsperchunk, numrows - currow);
- workspace = (JSAMPROW) alloc_large(cinfo, pool_id,
- (size_t) ((size_t) rowsperchunk * (size_t) samplesperrow
- * sizeof(JSAMPLE)));
+ workspace = (JSAMPROW)alloc_large(cinfo, pool_id,
+ (size_t)((size_t)rowsperchunk * (size_t)samplesperrow *
+ sizeof(JSAMPLE)));
for (i = rowsperchunk; i > 0; i--) {
result[currow++] = workspace;
workspace += samplesperrow;
@@ -491,11 +485,11 @@ alloc_sarray (j_common_ptr cinfo, int pool_id,
*/
METHODDEF(JBLOCKARRAY)
-alloc_barray (j_common_ptr cinfo, int pool_id,
- JDIMENSION blocksperrow, JDIMENSION numrows)
+alloc_barray(j_common_ptr cinfo, int pool_id, JDIMENSION blocksperrow,
+ JDIMENSION numrows)
/* Allocate a 2-D coefficient-block array */
{
- my_mem_ptr mem = (my_mem_ptr) cinfo->mem;
+ my_mem_ptr mem = (my_mem_ptr)cinfo->mem;
JBLOCKARRAY result;
JBLOCKROW workspace;
JDIMENSION rowsperchunk, currow, i;
@@ -506,27 +500,27 @@ alloc_barray (j_common_ptr cinfo, int pool_id,
out_of_memory(cinfo, 6); /* safety check */
/* Calculate max # of rows allowed in one allocation chunk */
- ltemp = (MAX_ALLOC_CHUNK-sizeof(large_pool_hdr)) /
- ((long) blocksperrow * sizeof(JBLOCK));
+ ltemp = (MAX_ALLOC_CHUNK - sizeof(large_pool_hdr)) /
+ ((long)blocksperrow * sizeof(JBLOCK));
if (ltemp <= 0)
ERREXIT(cinfo, JERR_WIDTH_OVERFLOW);
- if (ltemp < (long) numrows)
- rowsperchunk = (JDIMENSION) ltemp;
+ if (ltemp < (long)numrows)
+ rowsperchunk = (JDIMENSION)ltemp;
else
rowsperchunk = numrows;
mem->last_rowsperchunk = rowsperchunk;
/* Get space for row pointers (small object) */
- result = (JBLOCKARRAY) alloc_small(cinfo, pool_id,
- (size_t) (numrows * sizeof(JBLOCKROW)));
+ result = (JBLOCKARRAY)alloc_small(cinfo, pool_id,
+ (size_t)(numrows * sizeof(JBLOCKROW)));
/* Get the rows themselves (large objects) */
currow = 0;
while (currow < numrows) {
rowsperchunk = MIN(rowsperchunk, numrows - currow);
- workspace = (JBLOCKROW) alloc_large(cinfo, pool_id,
- (size_t) ((size_t) rowsperchunk * (size_t) blocksperrow
- * sizeof(JBLOCK)));
+ workspace = (JBLOCKROW)alloc_large(cinfo, pool_id,
+ (size_t)((size_t)rowsperchunk * (size_t)blocksperrow *
+ sizeof(JBLOCK)));
for (i = rowsperchunk; i > 0; i--) {
result[currow++] = workspace;
workspace += blocksperrow;
@@ -575,12 +569,12 @@ alloc_barray (j_common_ptr cinfo, int pool_id,
METHODDEF(jvirt_sarray_ptr)
-request_virt_sarray (j_common_ptr cinfo, int pool_id, boolean pre_zero,
- JDIMENSION samplesperrow, JDIMENSION numrows,
- JDIMENSION maxaccess)
+request_virt_sarray(j_common_ptr cinfo, int pool_id, boolean pre_zero,
+ JDIMENSION samplesperrow, JDIMENSION numrows,
+ JDIMENSION maxaccess)
/* Request a virtual 2-D sample array */
{
- my_mem_ptr mem = (my_mem_ptr) cinfo->mem;
+ my_mem_ptr mem = (my_mem_ptr)cinfo->mem;
jvirt_sarray_ptr result;
/* Only IMAGE-lifetime virtual arrays are currently supported */
@@ -588,8 +582,8 @@ request_virt_sarray (j_common_ptr cinfo, int pool_id, boolean pre_zero,
ERREXIT1(cinfo, JERR_BAD_POOL_ID, pool_id); /* safety check */
/* get control block */
- result = (jvirt_sarray_ptr) alloc_small(cinfo, pool_id,
- sizeof(struct jvirt_sarray_control));
+ result = (jvirt_sarray_ptr)alloc_small(cinfo, pool_id,
+ sizeof(struct jvirt_sarray_control));
result->mem_buffer = NULL; /* marks array not yet realized */
result->rows_in_array = numrows;
@@ -605,12 +599,12 @@ request_virt_sarray (j_common_ptr cinfo, int pool_id, boolean pre_zero,
METHODDEF(jvirt_barray_ptr)
-request_virt_barray (j_common_ptr cinfo, int pool_id, boolean pre_zero,
- JDIMENSION blocksperrow, JDIMENSION numrows,
- JDIMENSION maxaccess)
+request_virt_barray(j_common_ptr cinfo, int pool_id, boolean pre_zero,
+ JDIMENSION blocksperrow, JDIMENSION numrows,
+ JDIMENSION maxaccess)
/* Request a virtual 2-D coefficient-block array */
{
- my_mem_ptr mem = (my_mem_ptr) cinfo->mem;
+ my_mem_ptr mem = (my_mem_ptr)cinfo->mem;
jvirt_barray_ptr result;
/* Only IMAGE-lifetime virtual arrays are currently supported */
@@ -618,8 +612,8 @@ request_virt_barray (j_common_ptr cinfo, int pool_id, boolean pre_zero,
ERREXIT1(cinfo, JERR_BAD_POOL_ID, pool_id); /* safety check */
/* get control block */
- result = (jvirt_barray_ptr) alloc_small(cinfo, pool_id,
- sizeof(struct jvirt_barray_control));
+ result = (jvirt_barray_ptr)alloc_small(cinfo, pool_id,
+ sizeof(struct jvirt_barray_control));
result->mem_buffer = NULL; /* marks array not yet realized */
result->rows_in_array = numrows;
@@ -635,10 +629,10 @@ request_virt_barray (j_common_ptr cinfo, int pool_id, boolean pre_zero,
METHODDEF(void)
-realize_virt_arrays (j_common_ptr cinfo)
+realize_virt_arrays(j_common_ptr cinfo)
/* Allocate the in-memory buffers for any unrealized virtual arrays */
{
- my_mem_ptr mem = (my_mem_ptr) cinfo->mem;
+ my_mem_ptr mem = (my_mem_ptr)cinfo->mem;
size_t space_per_minheight, maximum_space, avail_mem;
size_t minheights, max_minheights;
jvirt_sarray_ptr sptr;
@@ -652,11 +646,11 @@ realize_virt_arrays (j_common_ptr cinfo)
maximum_space = 0;
for (sptr = mem->virt_sarray_list; sptr != NULL; sptr = sptr->next) {
if (sptr->mem_buffer == NULL) { /* if not realized yet */
- size_t new_space = (long) sptr->rows_in_array *
- (long) sptr->samplesperrow * sizeof(JSAMPLE);
+ size_t new_space = (long)sptr->rows_in_array *
+ (long)sptr->samplesperrow * sizeof(JSAMPLE);
- space_per_minheight += (long) sptr->maxaccess *
- (long) sptr->samplesperrow * sizeof(JSAMPLE);
+ space_per_minheight += (long)sptr->maxaccess *
+ (long)sptr->samplesperrow * sizeof(JSAMPLE);
if (SIZE_MAX - maximum_space < new_space)
out_of_memory(cinfo, 10);
maximum_space += new_space;
@@ -664,11 +658,11 @@ realize_virt_arrays (j_common_ptr cinfo)
}
for (bptr = mem->virt_barray_list; bptr != NULL; bptr = bptr->next) {
if (bptr->mem_buffer == NULL) { /* if not realized yet */
- size_t new_space = (long) bptr->rows_in_array *
- (long) bptr->blocksperrow * sizeof(JBLOCK);
+ size_t new_space = (long)bptr->rows_in_array *
+ (long)bptr->blocksperrow * sizeof(JBLOCK);
- space_per_minheight += (long) bptr->maxaccess *
- (long) bptr->blocksperrow * sizeof(JBLOCK);
+ space_per_minheight += (long)bptr->maxaccess *
+ (long)bptr->blocksperrow * sizeof(JBLOCK);
if (SIZE_MAX - maximum_space < new_space)
out_of_memory(cinfo, 11);
maximum_space += new_space;
@@ -701,17 +695,17 @@ realize_virt_arrays (j_common_ptr cinfo)
for (sptr = mem->virt_sarray_list; sptr != NULL; sptr = sptr->next) {
if (sptr->mem_buffer == NULL) { /* if not realized yet */
- minheights = ((long) sptr->rows_in_array - 1L) / sptr->maxaccess + 1L;
+ minheights = ((long)sptr->rows_in_array - 1L) / sptr->maxaccess + 1L;
if (minheights <= max_minheights) {
/* This buffer fits in memory */
sptr->rows_in_mem = sptr->rows_in_array;
} else {
/* It doesn't fit in memory, create backing store. */
- sptr->rows_in_mem = (JDIMENSION) (max_minheights * sptr->maxaccess);
- jpeg_open_backing_store(cinfo, & sptr->b_s_info,
- (long) sptr->rows_in_array *
- (long) sptr->samplesperrow *
- (long) sizeof(JSAMPLE));
+ sptr->rows_in_mem = (JDIMENSION)(max_minheights * sptr->maxaccess);
+ jpeg_open_backing_store(cinfo, &sptr->b_s_info,
+ (long)sptr->rows_in_array *
+ (long)sptr->samplesperrow *
+ (long)sizeof(JSAMPLE));
sptr->b_s_open = TRUE;
}
sptr->mem_buffer = alloc_sarray(cinfo, JPOOL_IMAGE,
@@ -725,17 +719,17 @@ realize_virt_arrays (j_common_ptr cinfo)
for (bptr = mem->virt_barray_list; bptr != NULL; bptr = bptr->next) {
if (bptr->mem_buffer == NULL) { /* if not realized yet */
- minheights = ((long) bptr->rows_in_array - 1L) / bptr->maxaccess + 1L;
+ minheights = ((long)bptr->rows_in_array - 1L) / bptr->maxaccess + 1L;
if (minheights <= max_minheights) {
/* This buffer fits in memory */
bptr->rows_in_mem = bptr->rows_in_array;
} else {
/* It doesn't fit in memory, create backing store. */
- bptr->rows_in_mem = (JDIMENSION) (max_minheights * bptr->maxaccess);
- jpeg_open_backing_store(cinfo, & bptr->b_s_info,
- (long) bptr->rows_in_array *
- (long) bptr->blocksperrow *
- (long) sizeof(JBLOCK));
+ bptr->rows_in_mem = (JDIMENSION)(max_minheights * bptr->maxaccess);
+ jpeg_open_backing_store(cinfo, &bptr->b_s_info,
+ (long)bptr->rows_in_array *
+ (long)bptr->blocksperrow *
+ (long)sizeof(JBLOCK));
bptr->b_s_open = TRUE;
}
bptr->mem_buffer = alloc_barray(cinfo, JPOOL_IMAGE,
@@ -750,32 +744,32 @@ realize_virt_arrays (j_common_ptr cinfo)
LOCAL(void)
-do_sarray_io (j_common_ptr cinfo, jvirt_sarray_ptr ptr, boolean writing)
+do_sarray_io(j_common_ptr cinfo, jvirt_sarray_ptr ptr, boolean writing)
/* Do backing store read or write of a virtual sample array */
{
long bytesperrow, file_offset, byte_count, rows, thisrow, i;
- bytesperrow = (long) ptr->samplesperrow * sizeof(JSAMPLE);
+ bytesperrow = (long)ptr->samplesperrow * sizeof(JSAMPLE);
file_offset = ptr->cur_start_row * bytesperrow;
/* Loop to read or write each allocation chunk in mem_buffer */
- for (i = 0; i < (long) ptr->rows_in_mem; i += ptr->rowsperchunk) {
+ for (i = 0; i < (long)ptr->rows_in_mem; i += ptr->rowsperchunk) {
/* One chunk, but check for short chunk at end of buffer */
- rows = MIN((long) ptr->rowsperchunk, (long) ptr->rows_in_mem - i);
+ rows = MIN((long)ptr->rowsperchunk, (long)ptr->rows_in_mem - i);
/* Transfer no more than is currently defined */
- thisrow = (long) ptr->cur_start_row + i;
- rows = MIN(rows, (long) ptr->first_undef_row - thisrow);
+ thisrow = (long)ptr->cur_start_row + i;
+ rows = MIN(rows, (long)ptr->first_undef_row - thisrow);
/* Transfer no more than fits in file */
- rows = MIN(rows, (long) ptr->rows_in_array - thisrow);
+ rows = MIN(rows, (long)ptr->rows_in_array - thisrow);
if (rows <= 0) /* this chunk might be past end of file! */
break;
byte_count = rows * bytesperrow;
if (writing)
- (*ptr->b_s_info.write_backing_store) (cinfo, & ptr->b_s_info,
- (void *) ptr->mem_buffer[i],
+ (*ptr->b_s_info.write_backing_store) (cinfo, &ptr->b_s_info,
+ (void *)ptr->mem_buffer[i],
file_offset, byte_count);
else
- (*ptr->b_s_info.read_backing_store) (cinfo, & ptr->b_s_info,
- (void *) ptr->mem_buffer[i],
+ (*ptr->b_s_info.read_backing_store) (cinfo, &ptr->b_s_info,
+ (void *)ptr->mem_buffer[i],
file_offset, byte_count);
file_offset += byte_count;
}
@@ -783,32 +777,32 @@ do_sarray_io (j_common_ptr cinfo, jvirt_sarray_ptr ptr, boolean writing)
LOCAL(void)
-do_barray_io (j_common_ptr cinfo, jvirt_barray_ptr ptr, boolean writing)
+do_barray_io(j_common_ptr cinfo, jvirt_barray_ptr ptr, boolean writing)
/* Do backing store read or write of a virtual coefficient-block array */
{
long bytesperrow, file_offset, byte_count, rows, thisrow, i;
- bytesperrow = (long) ptr->blocksperrow * sizeof(JBLOCK);
+ bytesperrow = (long)ptr->blocksperrow * sizeof(JBLOCK);
file_offset = ptr->cur_start_row * bytesperrow;
/* Loop to read or write each allocation chunk in mem_buffer */
- for (i = 0; i < (long) ptr->rows_in_mem; i += ptr->rowsperchunk) {
+ for (i = 0; i < (long)ptr->rows_in_mem; i += ptr->rowsperchunk) {
/* One chunk, but check for short chunk at end of buffer */
- rows = MIN((long) ptr->rowsperchunk, (long) ptr->rows_in_mem - i);
+ rows = MIN((long)ptr->rowsperchunk, (long)ptr->rows_in_mem - i);
/* Transfer no more than is currently defined */
- thisrow = (long) ptr->cur_start_row + i;
- rows = MIN(rows, (long) ptr->first_undef_row - thisrow);
+ thisrow = (long)ptr->cur_start_row + i;
+ rows = MIN(rows, (long)ptr->first_undef_row - thisrow);
/* Transfer no more than fits in file */
- rows = MIN(rows, (long) ptr->rows_in_array - thisrow);
+ rows = MIN(rows, (long)ptr->rows_in_array - thisrow);
if (rows <= 0) /* this chunk might be past end of file! */
break;
byte_count = rows * bytesperrow;
if (writing)
- (*ptr->b_s_info.write_backing_store) (cinfo, & ptr->b_s_info,
- (void *) ptr->mem_buffer[i],
+ (*ptr->b_s_info.write_backing_store) (cinfo, &ptr->b_s_info,
+ (void *)ptr->mem_buffer[i],
file_offset, byte_count);
else
- (*ptr->b_s_info.read_backing_store) (cinfo, & ptr->b_s_info,
- (void *) ptr->mem_buffer[i],
+ (*ptr->b_s_info.read_backing_store) (cinfo, &ptr->b_s_info,
+ (void *)ptr->mem_buffer[i],
file_offset, byte_count);
file_offset += byte_count;
}
@@ -816,9 +810,8 @@ do_barray_io (j_common_ptr cinfo, jvirt_barray_ptr ptr, boolean writing)
METHODDEF(JSAMPARRAY)
-access_virt_sarray (j_common_ptr cinfo, jvirt_sarray_ptr ptr,
- JDIMENSION start_row, JDIMENSION num_rows,
- boolean writable)
+access_virt_sarray(j_common_ptr cinfo, jvirt_sarray_ptr ptr,
+ JDIMENSION start_row, JDIMENSION num_rows, boolean writable)
/* Access the part of a virtual sample array starting at start_row */
/* and extending for num_rows rows. writable is true if */
/* caller intends to modify the accessed area. */
@@ -833,8 +826,8 @@ access_virt_sarray (j_common_ptr cinfo, jvirt_sarray_ptr ptr,
/* Make the desired part of the virtual array accessible */
if (start_row < ptr->cur_start_row ||
- end_row > ptr->cur_start_row+ptr->rows_in_mem) {
- if (! ptr->b_s_open)
+ end_row > ptr->cur_start_row + ptr->rows_in_mem) {
+ if (!ptr->b_s_open)
ERREXIT(cinfo, JERR_VIRTUAL_BUG);
/* Flush old buffer contents if necessary */
if (ptr->dirty) {
@@ -854,10 +847,10 @@ access_virt_sarray (j_common_ptr cinfo, jvirt_sarray_ptr ptr,
/* use long arithmetic here to avoid overflow & unsigned problems */
long ltemp;
- ltemp = (long) end_row - (long) ptr->rows_in_mem;
+ ltemp = (long)end_row - (long)ptr->rows_in_mem;
if (ltemp < 0)
ltemp = 0; /* don't fall off front end of file */
- ptr->cur_start_row = (JDIMENSION) ltemp;
+ ptr->cur_start_row = (JDIMENSION)ltemp;
}
/* Read in the selected part of the array.
* During the initial write pass, we will do no actual read
@@ -880,15 +873,15 @@ access_virt_sarray (j_common_ptr cinfo, jvirt_sarray_ptr ptr,
if (writable)
ptr->first_undef_row = end_row;
if (ptr->pre_zero) {
- size_t bytesperrow = (size_t) ptr->samplesperrow * sizeof(JSAMPLE);
+ size_t bytesperrow = (size_t)ptr->samplesperrow * sizeof(JSAMPLE);
undef_row -= ptr->cur_start_row; /* make indexes relative to buffer */
end_row -= ptr->cur_start_row;
while (undef_row < end_row) {
- jzero_far((void *) ptr->mem_buffer[undef_row], bytesperrow);
+ jzero_far((void *)ptr->mem_buffer[undef_row], bytesperrow);
undef_row++;
}
} else {
- if (! writable) /* reader looking at undefined data */
+ if (!writable) /* reader looking at undefined data */
ERREXIT(cinfo, JERR_BAD_VIRTUAL_ACCESS);
}
}
@@ -901,9 +894,8 @@ access_virt_sarray (j_common_ptr cinfo, jvirt_sarray_ptr ptr,
METHODDEF(JBLOCKARRAY)
-access_virt_barray (j_common_ptr cinfo, jvirt_barray_ptr ptr,
- JDIMENSION start_row, JDIMENSION num_rows,
- boolean writable)
+access_virt_barray(j_common_ptr cinfo, jvirt_barray_ptr ptr,
+ JDIMENSION start_row, JDIMENSION num_rows, boolean writable)
/* Access the part of a virtual block array starting at start_row */
/* and extending for num_rows rows. writable is true if */
/* caller intends to modify the accessed area. */
@@ -918,8 +910,8 @@ access_virt_barray (j_common_ptr cinfo, jvirt_barray_ptr ptr,
/* Make the desired part of the virtual array accessible */
if (start_row < ptr->cur_start_row ||
- end_row > ptr->cur_start_row+ptr->rows_in_mem) {
- if (! ptr->b_s_open)
+ end_row > ptr->cur_start_row + ptr->rows_in_mem) {
+ if (!ptr->b_s_open)
ERREXIT(cinfo, JERR_VIRTUAL_BUG);
/* Flush old buffer contents if necessary */
if (ptr->dirty) {
@@ -939,10 +931,10 @@ access_virt_barray (j_common_ptr cinfo, jvirt_barray_ptr ptr,
/* use long arithmetic here to avoid overflow & unsigned problems */
long ltemp;
- ltemp = (long) end_row - (long) ptr->rows_in_mem;
+ ltemp = (long)end_row - (long)ptr->rows_in_mem;
if (ltemp < 0)
ltemp = 0; /* don't fall off front end of file */
- ptr->cur_start_row = (JDIMENSION) ltemp;
+ ptr->cur_start_row = (JDIMENSION)ltemp;
}
/* Read in the selected part of the array.
* During the initial write pass, we will do no actual read
@@ -965,15 +957,15 @@ access_virt_barray (j_common_ptr cinfo, jvirt_barray_ptr ptr,
if (writable)
ptr->first_undef_row = end_row;
if (ptr->pre_zero) {
- size_t bytesperrow = (size_t) ptr->blocksperrow * sizeof(JBLOCK);
+ size_t bytesperrow = (size_t)ptr->blocksperrow * sizeof(JBLOCK);
undef_row -= ptr->cur_start_row; /* make indexes relative to buffer */
end_row -= ptr->cur_start_row;
while (undef_row < end_row) {
- jzero_far((void *) ptr->mem_buffer[undef_row], bytesperrow);
+ jzero_far((void *)ptr->mem_buffer[undef_row], bytesperrow);
undef_row++;
}
} else {
- if (! writable) /* reader looking at undefined data */
+ if (!writable) /* reader looking at undefined data */
ERREXIT(cinfo, JERR_BAD_VIRTUAL_ACCESS);
}
}
@@ -990,9 +982,9 @@ access_virt_barray (j_common_ptr cinfo, jvirt_barray_ptr ptr,
*/
METHODDEF(void)
-free_pool (j_common_ptr cinfo, int pool_id)
+free_pool(j_common_ptr cinfo, int pool_id)
{
- my_mem_ptr mem = (my_mem_ptr) cinfo->mem;
+ my_mem_ptr mem = (my_mem_ptr)cinfo->mem;
small_pool_ptr shdr_ptr;
large_pool_ptr lhdr_ptr;
size_t space_freed;
@@ -1013,14 +1005,14 @@ free_pool (j_common_ptr cinfo, int pool_id)
for (sptr = mem->virt_sarray_list; sptr != NULL; sptr = sptr->next) {
if (sptr->b_s_open) { /* there may be no backing store */
sptr->b_s_open = FALSE; /* prevent recursive close if error */
- (*sptr->b_s_info.close_backing_store) (cinfo, & sptr->b_s_info);
+ (*sptr->b_s_info.close_backing_store) (cinfo, &sptr->b_s_info);
}
}
mem->virt_sarray_list = NULL;
for (bptr = mem->virt_barray_list; bptr != NULL; bptr = bptr->next) {
if (bptr->b_s_open) { /* there may be no backing store */
bptr->b_s_open = FALSE; /* prevent recursive close if error */
- (*bptr->b_s_info.close_backing_store) (cinfo, & bptr->b_s_info);
+ (*bptr->b_s_info.close_backing_store) (cinfo, &bptr->b_s_info);
}
}
mem->virt_barray_list = NULL;
@@ -1034,8 +1026,8 @@ free_pool (j_common_ptr cinfo, int pool_id)
large_pool_ptr next_lhdr_ptr = lhdr_ptr->next;
space_freed = lhdr_ptr->bytes_used +
lhdr_ptr->bytes_left +
- sizeof(large_pool_hdr);
- jpeg_free_large(cinfo, (void *) lhdr_ptr, space_freed);
+ sizeof(large_pool_hdr) + ALIGN_SIZE - 1;
+ jpeg_free_large(cinfo, (void *)lhdr_ptr, space_freed);
mem->total_space_allocated -= space_freed;
lhdr_ptr = next_lhdr_ptr;
}
@@ -1046,10 +1038,9 @@ free_pool (j_common_ptr cinfo, int pool_id)
while (shdr_ptr != NULL) {
small_pool_ptr next_shdr_ptr = shdr_ptr->next;
- space_freed = shdr_ptr->bytes_used +
- shdr_ptr->bytes_left +
- sizeof(small_pool_hdr);
- jpeg_free_small(cinfo, (void *) shdr_ptr, space_freed);
+ space_freed = shdr_ptr->bytes_used + shdr_ptr->bytes_left +
+ sizeof(small_pool_hdr) + ALIGN_SIZE - 1;
+ jpeg_free_small(cinfo, (void *)shdr_ptr, space_freed);
mem->total_space_allocated -= space_freed;
shdr_ptr = next_shdr_ptr;
}
@@ -1062,7 +1053,7 @@ free_pool (j_common_ptr cinfo, int pool_id)
*/
METHODDEF(void)
-self_destruct (j_common_ptr cinfo)
+self_destruct(j_common_ptr cinfo)
{
int pool;
@@ -1070,12 +1061,12 @@ self_destruct (j_common_ptr cinfo)
* Releasing pools in reverse order might help avoid fragmentation
* with some (brain-damaged) malloc libraries.
*/
- for (pool = JPOOL_NUMPOOLS-1; pool >= JPOOL_PERMANENT; pool--) {
+ for (pool = JPOOL_NUMPOOLS - 1; pool >= JPOOL_PERMANENT; pool--) {
free_pool(cinfo, pool);
}
/* Release the memory manager control block too. */
- jpeg_free_small(cinfo, (void *) cinfo->mem, sizeof(my_memory_mgr));
+ jpeg_free_small(cinfo, (void *)cinfo->mem, sizeof(my_memory_mgr));
cinfo->mem = NULL; /* ensures I will be called only once */
jpeg_mem_term(cinfo); /* system-dependent cleanup */
@@ -1088,7 +1079,7 @@ self_destruct (j_common_ptr cinfo)
*/
GLOBAL(void)
-jinit_memory_mgr (j_common_ptr cinfo)
+jinit_memory_mgr(j_common_ptr cinfo)
{
my_mem_ptr mem;
long max_to_use;
@@ -1104,22 +1095,22 @@ jinit_memory_mgr (j_common_ptr cinfo)
* in common if and only if X is a power of 2, ie has only one one-bit.
* Some compilers may give an "unreachable code" warning here; ignore it.
*/
- if ((ALIGN_SIZE & (ALIGN_SIZE-1)) != 0)
+ if ((ALIGN_SIZE & (ALIGN_SIZE - 1)) != 0)
ERREXIT(cinfo, JERR_BAD_ALIGN_TYPE);
/* MAX_ALLOC_CHUNK must be representable as type size_t, and must be
* a multiple of ALIGN_SIZE.
* Again, an "unreachable code" warning may be ignored here.
* But a "constant too large" warning means you need to fix MAX_ALLOC_CHUNK.
*/
- test_mac = (size_t) MAX_ALLOC_CHUNK;
- if ((long) test_mac != MAX_ALLOC_CHUNK ||
+ test_mac = (size_t)MAX_ALLOC_CHUNK;
+ if ((long)test_mac != MAX_ALLOC_CHUNK ||
(MAX_ALLOC_CHUNK % ALIGN_SIZE) != 0)
ERREXIT(cinfo, JERR_BAD_ALLOC_CHUNK);
max_to_use = jpeg_mem_init(cinfo); /* system-dependent initialization */
/* Attempt to allocate memory manager's control block */
- mem = (my_mem_ptr) jpeg_get_small(cinfo, sizeof(my_memory_mgr));
+ mem = (my_mem_ptr)jpeg_get_small(cinfo, sizeof(my_memory_mgr));
if (mem == NULL) {
jpeg_mem_term(cinfo); /* system-dependent cleanup */
@@ -1145,7 +1136,7 @@ jinit_memory_mgr (j_common_ptr cinfo)
/* Initialize working state */
mem->pub.max_memory_to_use = max_to_use;
- for (pool = JPOOL_NUMPOOLS-1; pool >= JPOOL_PERMANENT; pool--) {
+ for (pool = JPOOL_NUMPOOLS - 1; pool >= JPOOL_PERMANENT; pool--) {
mem->small_list[pool] = NULL;
mem->large_list[pool] = NULL;
}
@@ -1155,7 +1146,7 @@ jinit_memory_mgr (j_common_ptr cinfo)
mem->total_space_allocated = sizeof(my_memory_mgr);
/* Declare ourselves open for business */
- cinfo->mem = & mem->pub;
+ cinfo->mem = &mem->pub;
/* Check for an environment variable JPEGMEM; if found, override the
* default max_memory setting from jpeg_mem_init. Note that the
@@ -1164,12 +1155,17 @@ jinit_memory_mgr (j_common_ptr cinfo)
* this feature.
*/
#ifndef NO_GETENV
- { char *memenv;
+ {
+ char memenv[30] = { 0 };
- if ((memenv = getenv("JPEGMEM")) != NULL) {
+ if (!GETENV_S(memenv, 30, "JPEGMEM") && strlen(memenv) > 0) {
char ch = 'x';
+#ifdef _MSC_VER
+ if (sscanf_s(memenv, "%ld%c", &max_to_use, &ch, 1) > 0) {
+#else
if (sscanf(memenv, "%ld%c", &max_to_use, &ch) > 0) {
+#endif
if (ch == 'm' || ch == 'M')
max_to_use *= 1000L;
mem->pub.max_memory_to_use = max_to_use * 1000L;
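
The ALIGN_SIZE increase to 32 bytes relies on the over-allocate-then-align idiom used by alloc_small() and alloc_large() above, and on free_pool() now counting the extra ALIGN_SIZE - 1 bytes when it releases a pool. A standalone sketch of the idiom (not the jmemmgr code itself):

  #include <stdlib.h>

  #define ALIGN_SIZE  32   /* AVX2 needs 32-byte alignment */

  /* Sketch: request ALIGN_SIZE - 1 extra bytes, then round the data
   * pointer up to the next multiple of ALIGN_SIZE.  The pool keeps the
   * original block pointer; that is what must eventually be free()d,
   * which is why the freed size also includes ALIGN_SIZE - 1. */
  static void *aligned_in_pool(size_t header_size, size_t payload)
  {
    char *block = malloc(header_size + payload + ALIGN_SIZE - 1);
    char *data_ptr;

    if (block == NULL)
      return NULL;
    data_ptr = block + header_size;          /* skip the pool header */
    if ((size_t)data_ptr % ALIGN_SIZE)       /* adjust for alignment */
      data_ptr += ALIGN_SIZE - (size_t)data_ptr % ALIGN_SIZE;
    return data_ptr;
  }
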
diff --git a/media/libjpeg/jmemnobs.c b/media/libjpeg/jmemnobs.c
index 5797198de8..cd6571ba1c 100644
--- a/media/libjpeg/jmemnobs.c
+++ b/media/libjpeg/jmemnobs.c
@@ -3,8 +3,8 @@
*
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1992-1996, Thomas G. Lane.
- * It was modified by The libjpeg-turbo Project to include only code and
- * information relevant to libjpeg-turbo.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2017-2018, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@@ -15,7 +15,6 @@
* This is very portable in the sense that it'll compile on almost anything,
* but you'd better have lots of main memory (or virtual memory) if you want
* to process big images.
- * Note that the max_memory_to_use option is ignored by this implementation.
*/
#define JPEG_INTERNALS
@@ -23,11 +22,6 @@
#include "jpeglib.h"
#include "jmemsys.h" /* import the system-dependent declarations */
-#ifndef HAVE_STDLIB_H /* <stdlib.h> should declare malloc(),free() */
-extern void *malloc (size_t size);
-extern void free (void *ptr);
-#endif
-
/*
* Memory allocation and freeing are controlled by the regular library
@@ -35,13 +29,13 @@ extern void free (void *ptr);
*/
GLOBAL(void *)
-jpeg_get_small (j_common_ptr cinfo, size_t sizeofobject)
+jpeg_get_small(j_common_ptr cinfo, size_t sizeofobject)
{
- return (void *) malloc(sizeofobject);
+ return (void *)malloc(sizeofobject);
}
GLOBAL(void)
-jpeg_free_small (j_common_ptr cinfo, void *object, size_t sizeofobject)
+jpeg_free_small(j_common_ptr cinfo, void *object, size_t sizeofobject)
{
free(object);
}
@@ -52,13 +46,13 @@ jpeg_free_small (j_common_ptr cinfo, void *object, size_t sizeofobject)
*/
GLOBAL(void *)
-jpeg_get_large (j_common_ptr cinfo, size_t sizeofobject)
+jpeg_get_large(j_common_ptr cinfo, size_t sizeofobject)
{
- return (void *) malloc(sizeofobject);
+ return (void *)malloc(sizeofobject);
}
GLOBAL(void)
-jpeg_free_large (j_common_ptr cinfo, void *object, size_t sizeofobject)
+jpeg_free_large(j_common_ptr cinfo, void *object, size_t sizeofobject)
{
free(object);
}
@@ -66,14 +60,21 @@ jpeg_free_large (j_common_ptr cinfo, void *object, size_t sizeofobject)
/*
* This routine computes the total memory space available for allocation.
- * Here we always say, "we got all you want bud!"
*/
GLOBAL(size_t)
-jpeg_mem_available (j_common_ptr cinfo, size_t min_bytes_needed,
- size_t max_bytes_needed, size_t already_allocated)
+jpeg_mem_available(j_common_ptr cinfo, size_t min_bytes_needed,
+ size_t max_bytes_needed, size_t already_allocated)
{
- return max_bytes_needed;
+ if (cinfo->mem->max_memory_to_use) {
+ if ((size_t)cinfo->mem->max_memory_to_use > already_allocated)
+ return cinfo->mem->max_memory_to_use - already_allocated;
+ else
+ return 0;
+ } else {
+ /* Here we always say, "we got all you want bud!" */
+ return max_bytes_needed;
+ }
}
@@ -84,8 +85,8 @@ jpeg_mem_available (j_common_ptr cinfo, size_t min_bytes_needed,
*/
GLOBAL(void)
-jpeg_open_backing_store (j_common_ptr cinfo, backing_store_ptr info,
- long total_bytes_needed)
+jpeg_open_backing_store(j_common_ptr cinfo, backing_store_ptr info,
+ long total_bytes_needed)
{
ERREXIT(cinfo, JERR_NO_BACKING_STORE);
}
@@ -97,13 +98,13 @@ jpeg_open_backing_store (j_common_ptr cinfo, backing_store_ptr info,
*/
GLOBAL(long)
-jpeg_mem_init (j_common_ptr cinfo)
+jpeg_mem_init(j_common_ptr cinfo)
{
return 0; /* just set max_memory_to_use to 0 */
}
GLOBAL(void)
-jpeg_mem_term (j_common_ptr cinfo)
+jpeg_mem_term(j_common_ptr cinfo)
{
/* no work */
}
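
Because jpeg_mem_available() now honors max_memory_to_use, the JPEGMEM limit parsed in jinit_memory_mgr() above is effective even with this no-backing-store memory manager. An illustrative way to set the same cap from application code (assumes the usual <stdio.h> plus "jpeglib.h" setup):

  /* Illustrative: cap the library's pool memory at roughly 4 MB, the
   * same effect as running with JPEGMEM=4M in the environment. */
  struct jpeg_decompress_struct cinfo;
  struct jpeg_error_mgr jerr;

  cinfo.err = jpeg_std_error(&jerr);
  jpeg_create_decompress(&cinfo);
  cinfo.mem->max_memory_to_use = 4000000L;   /* bytes; 0 means no limit */
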
diff --git a/media/libjpeg/jmemsys.h b/media/libjpeg/jmemsys.h
index f7dfe87a83..9229550afd 100644
--- a/media/libjpeg/jmemsys.h
+++ b/media/libjpeg/jmemsys.h
@@ -31,9 +31,9 @@
* size of the object being freed, just in case it's needed.
*/
-EXTERN(void *) jpeg_get_small (j_common_ptr cinfo, size_t sizeofobject);
-EXTERN(void) jpeg_free_small (j_common_ptr cinfo, void *object,
- size_t sizeofobject);
+EXTERN(void *) jpeg_get_small(j_common_ptr cinfo, size_t sizeofobject);
+EXTERN(void) jpeg_free_small(j_common_ptr cinfo, void *object,
+ size_t sizeofobject);
/*
* These two functions are used to allocate and release large chunks of
@@ -43,9 +43,9 @@ EXTERN(void) jpeg_free_small (j_common_ptr cinfo, void *object,
* large chunks.
*/
-EXTERN(void *) jpeg_get_large (j_common_ptr cinfo, size_t sizeofobject);
-EXTERN(void) jpeg_free_large (j_common_ptr cinfo, void *object,
- size_t sizeofobject);
+EXTERN(void *) jpeg_get_large(j_common_ptr cinfo, size_t sizeofobject);
+EXTERN(void) jpeg_free_large(j_common_ptr cinfo, void *object,
+ size_t sizeofobject);
/*
* The macro MAX_ALLOC_CHUNK designates the maximum number of bytes that may
@@ -84,9 +84,9 @@ EXTERN(void) jpeg_free_large (j_common_ptr cinfo, void *object,
* Conversely, zero may be returned to always use the minimum amount of memory.
*/
-EXTERN(size_t) jpeg_mem_available (j_common_ptr cinfo, size_t min_bytes_needed,
- size_t max_bytes_needed,
- size_t already_allocated);
+EXTERN(size_t) jpeg_mem_available(j_common_ptr cinfo, size_t min_bytes_needed,
+ size_t max_bytes_needed,
+ size_t already_allocated);
/*
@@ -157,9 +157,9 @@ typedef struct backing_store_struct {
* just take an error exit.)
*/
-EXTERN(void) jpeg_open_backing_store (j_common_ptr cinfo,
- backing_store_ptr info,
- long total_bytes_needed);
+EXTERN(void) jpeg_open_backing_store(j_common_ptr cinfo,
+ backing_store_ptr info,
+ long total_bytes_needed);
/*
@@ -174,5 +174,5 @@ EXTERN(void) jpeg_open_backing_store (j_common_ptr cinfo,
* all opened backing-store objects have been closed.
*/
-EXTERN(long) jpeg_mem_init (j_common_ptr cinfo);
-EXTERN(void) jpeg_mem_term (j_common_ptr cinfo);
+EXTERN(long) jpeg_mem_init(j_common_ptr cinfo);
+EXTERN(void) jpeg_mem_term(j_common_ptr cinfo);
diff --git a/media/libjpeg/jmorecfg.h b/media/libjpeg/jmorecfg.h
index d73b1090d5..8cda8041b2 100644
--- a/media/libjpeg/jmorecfg.h
+++ b/media/libjpeg/jmorecfg.h
@@ -5,7 +5,7 @@
* Copyright (C) 1991-1997, Thomas G. Lane.
* Modified 1997-2009 by Guido Vollbeding.
* libjpeg-turbo Modifications:
- * Copyright (C) 2009, 2011, 2014-2015, D. R. Commander.
+ * Copyright (C) 2009, 2011, 2014-2015, 2018, 2020, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@@ -18,9 +18,9 @@
/*
* Maximum number of components (color channels) allowed in JPEG image.
- * To meet the letter of the JPEG spec, set this to 255. However, darn
- * few applications need more than 4 channels (maybe 5 for CMYK + alpha
- * mask). We recommend 10 as a reasonable compromise; use 4 if you are
+ * To meet the letter of Rec. ITU-T T.81 | ISO/IEC 10918-1, set this to 255.
+ * However, darn few applications need more than 4 channels (maybe 5 for CMYK +
+ * alpha mask). We recommend 10 as a reasonable compromise; use 4 if you are
* really short on memory. (Each allowed component costs a hundred or so
* bytes of storage, whether actually used in an image or not.)
*/
@@ -44,24 +44,10 @@
#if BITS_IN_JSAMPLE == 8
/* JSAMPLE should be the smallest type that will hold the values 0..255.
- * You can use a signed char by having GETJSAMPLE mask it with 0xFF.
*/
-#ifdef HAVE_UNSIGNED_CHAR
-
typedef unsigned char JSAMPLE;
-#define GETJSAMPLE(value) ((int) (value))
-
-#else /* not HAVE_UNSIGNED_CHAR */
-
-typedef char JSAMPLE;
-#ifdef __CHAR_UNSIGNED__
-#define GETJSAMPLE(value) ((int) (value))
-#else
-#define GETJSAMPLE(value) ((int) (value) & 0xFF)
-#endif /* __CHAR_UNSIGNED__ */
-
-#endif /* HAVE_UNSIGNED_CHAR */
+#define GETJSAMPLE(value) ((int)(value))
#define MAXJSAMPLE 255
#define CENTERJSAMPLE 128
@@ -75,7 +61,7 @@ typedef char JSAMPLE;
*/
typedef short JSAMPLE;
-#define GETJSAMPLE(value) ((int) (value))
+#define GETJSAMPLE(value) ((int)(value))
#define MAXJSAMPLE 4095
#define CENTERJSAMPLE 2048
@@ -98,22 +84,9 @@ typedef short JCOEF;
* managers, this is also the data type passed to fread/fwrite.
*/
-#ifdef HAVE_UNSIGNED_CHAR
-
typedef unsigned char JOCTET;
#define GETJOCTET(value) (value)
-#else /* not HAVE_UNSIGNED_CHAR */
-
-typedef char JOCTET;
-#ifdef __CHAR_UNSIGNED__
-#define GETJOCTET(value) (value)
-#else
-#define GETJOCTET(value) ((value) & 0xFF)
-#endif /* __CHAR_UNSIGNED__ */
-
-#endif /* HAVE_UNSIGNED_CHAR */
-
/* These typedefs are used for various table entries and so forth.
* They must be at least as wide as specified; but making them too big
@@ -199,7 +172,7 @@ typedef unsigned int JDIMENSION;
* software out there that uses it.
*/
-#define JMETHOD(type,methodname,arglist) type (*methodname) arglist
+#define JMETHOD(type, methodname, arglist) type (*methodname) arglist
/* libjpeg-turbo no longer supports platforms that have far symbols (MS-DOS),
@@ -252,9 +225,9 @@ typedef int boolean;
/* Capability options common to encoder and decoder: */
-#define DCT_ISLOW_SUPPORTED /* slow but accurate integer algorithm */
-#define DCT_IFAST_SUPPORTED /* faster, less accurate integer method */
-#define DCT_FLOAT_SUPPORTED /* floating-point: accurate, fast on fast HW */
+#define DCT_ISLOW_SUPPORTED /* accurate integer method */
+#define DCT_IFAST_SUPPORTED /* less accurate int method [legacy feature] */
+#define DCT_FLOAT_SUPPORTED /* floating-point method [legacy feature] */
/* Encoder capability options: */
@@ -294,10 +267,10 @@ typedef int boolean;
* with it. In reality, few people ever did this, because there were some
* severe restrictions involved (cjpeg and djpeg no longer worked properly,
* compressing/decompressing RGB JPEGs no longer worked properly, and the color
- * quantizer wouldn't work with pixel sizes other than 3.) Further, since all
- * of the O/S-supplied versions of libjpeg were built with the default values
- * of RGB_RED, RGB_GREEN, RGB_BLUE, and RGB_PIXELSIZE, many applications have
- * come to regard these values as immutable.
+ * quantizer wouldn't work with pixel sizes other than 3.) Furthermore, since
+ * all of the O/S-supplied versions of libjpeg were built with the default
+ * values of RGB_RED, RGB_GREEN, RGB_BLUE, and RGB_PIXELSIZE, many applications
+ * have come to regard these values as immutable.
*
* The libjpeg-turbo colorspace extensions provide a much cleaner way of
* compressing from/decompressing to buffers with arbitrary component orders
@@ -312,37 +285,37 @@ typedef int boolean;
#define RGB_BLUE 2 /* Offset of Blue */
#define RGB_PIXELSIZE 3 /* JSAMPLEs per RGB scanline element */
-#define JPEG_NUMCS 17
+#define JPEG_NUMCS 17
-#define EXT_RGB_RED 0
-#define EXT_RGB_GREEN 1
-#define EXT_RGB_BLUE 2
-#define EXT_RGB_PIXELSIZE 3
+#define EXT_RGB_RED 0
+#define EXT_RGB_GREEN 1
+#define EXT_RGB_BLUE 2
+#define EXT_RGB_PIXELSIZE 3
-#define EXT_RGBX_RED 0
-#define EXT_RGBX_GREEN 1
-#define EXT_RGBX_BLUE 2
-#define EXT_RGBX_PIXELSIZE 4
+#define EXT_RGBX_RED 0
+#define EXT_RGBX_GREEN 1
+#define EXT_RGBX_BLUE 2
+#define EXT_RGBX_PIXELSIZE 4
-#define EXT_BGR_RED 2
-#define EXT_BGR_GREEN 1
-#define EXT_BGR_BLUE 0
-#define EXT_BGR_PIXELSIZE 3
+#define EXT_BGR_RED 2
+#define EXT_BGR_GREEN 1
+#define EXT_BGR_BLUE 0
+#define EXT_BGR_PIXELSIZE 3
-#define EXT_BGRX_RED 2
-#define EXT_BGRX_GREEN 1
-#define EXT_BGRX_BLUE 0
-#define EXT_BGRX_PIXELSIZE 4
+#define EXT_BGRX_RED 2
+#define EXT_BGRX_GREEN 1
+#define EXT_BGRX_BLUE 0
+#define EXT_BGRX_PIXELSIZE 4
-#define EXT_XBGR_RED 3
-#define EXT_XBGR_GREEN 2
-#define EXT_XBGR_BLUE 1
-#define EXT_XBGR_PIXELSIZE 4
+#define EXT_XBGR_RED 3
+#define EXT_XBGR_GREEN 2
+#define EXT_XBGR_BLUE 1
+#define EXT_XBGR_PIXELSIZE 4
-#define EXT_XRGB_RED 1
-#define EXT_XRGB_GREEN 2
-#define EXT_XRGB_BLUE 3
-#define EXT_XRGB_PIXELSIZE 4
+#define EXT_XRGB_RED 1
+#define EXT_XRGB_GREEN 2
+#define EXT_XRGB_BLUE 3
+#define EXT_XRGB_PIXELSIZE 4
static const int rgb_red[JPEG_NUMCS] = {
-1, -1, RGB_RED, -1, -1, -1, EXT_RGB_RED, EXT_RGBX_RED,
@@ -383,7 +356,7 @@ static const int rgb_pixelsize[JPEG_NUMCS] = {
#ifndef WITH_SIMD
#define MULTIPLIER int /* type for fastest integer multiply */
#else
-#define MULTIPLIER short /* prefer 16-bit with SIMD for parellelism */
+#define MULTIPLIER short /* prefer 16-bit with SIMD for parallelism */
#endif
#endif
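
For reference, the EXT_* offsets above are what make the colorspace extensions order-agnostic: application code indexes each pixel through the offset macros instead of assuming RGB order. An illustrative fragment for an EXT_BGRX buffer (the byte values are made up; a real buffer would come from jpeg_read_scanlines() with out_color_space = JCS_EXT_BGRX):

  /* Illustrative: reading one EXT_BGRX pixel via the offset macros. */
  unsigned char row[] = { 0x10, 0x20, 0x30, 0xFF };   /* B, G, R, X */
  unsigned char *pixel = row;                         /* first pixel */

  int red   = pixel[EXT_BGRX_RED];    /* offset 2 -> 0x30 */
  int green = pixel[EXT_BGRX_GREEN];  /* offset 1 -> 0x20 */
  int blue  = pixel[EXT_BGRX_BLUE];   /* offset 0 -> 0x10 */
  pixel += EXT_BGRX_PIXELSIZE;        /* advance 4 bytes to next pixel */
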
diff --git a/media/libjpeg/jpegcomp.h b/media/libjpeg/jpegcomp.h
index ade0d1edcd..c4834ac0df 100644
--- a/media/libjpeg/jpegcomp.h
+++ b/media/libjpeg/jpegcomp.h
@@ -1,7 +1,7 @@
/*
* jpegcomp.h
*
- * Copyright (C) 2010, D. R. Commander.
+ * Copyright (C) 2010, 2020, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@@ -11,21 +11,22 @@
*/
#if JPEG_LIB_VERSION >= 70
-#define _DCT_scaled_size DCT_h_scaled_size
-#define _DCT_h_scaled_size DCT_h_scaled_size
-#define _DCT_v_scaled_size DCT_v_scaled_size
-#define _min_DCT_scaled_size min_DCT_h_scaled_size
-#define _min_DCT_h_scaled_size min_DCT_h_scaled_size
-#define _min_DCT_v_scaled_size min_DCT_v_scaled_size
-#define _jpeg_width jpeg_width
-#define _jpeg_height jpeg_height
+#define _DCT_scaled_size DCT_h_scaled_size
+#define _DCT_h_scaled_size DCT_h_scaled_size
+#define _DCT_v_scaled_size DCT_v_scaled_size
+#define _min_DCT_scaled_size min_DCT_h_scaled_size
+#define _min_DCT_h_scaled_size min_DCT_h_scaled_size
+#define _min_DCT_v_scaled_size min_DCT_v_scaled_size
+#define _jpeg_width jpeg_width
+#define _jpeg_height jpeg_height
+#define JERR_ARITH_NOTIMPL JERR_NOT_COMPILED
#else
-#define _DCT_scaled_size DCT_scaled_size
-#define _DCT_h_scaled_size DCT_scaled_size
-#define _DCT_v_scaled_size DCT_scaled_size
-#define _min_DCT_scaled_size min_DCT_scaled_size
-#define _min_DCT_h_scaled_size min_DCT_scaled_size
-#define _min_DCT_v_scaled_size min_DCT_scaled_size
-#define _jpeg_width image_width
-#define _jpeg_height image_height
+#define _DCT_scaled_size DCT_scaled_size
+#define _DCT_h_scaled_size DCT_scaled_size
+#define _DCT_v_scaled_size DCT_scaled_size
+#define _min_DCT_scaled_size min_DCT_scaled_size
+#define _min_DCT_h_scaled_size min_DCT_scaled_size
+#define _min_DCT_v_scaled_size min_DCT_scaled_size
+#define _jpeg_width image_width
+#define _jpeg_height image_height
#endif
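The underscore aliases above exist so one source tree can compile against both
the v6b and v7+ struct layouts. A hypothetical fragment (editor's sketch; the
fields are the standard jpeg_component_info members):

#include "jpeglib.h"
#include "jpegcomp.h"

static int scaled_dct_size(j_decompress_ptr cinfo, int comp)
{
  jpeg_component_info *compptr = cinfo->comp_info + comp;
  /* Resolves to DCT_scaled_size (v6b) or DCT_h_scaled_size (v7+). */
  return compptr->_DCT_scaled_size;
}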
diff --git a/media/libjpeg/jpegint.h b/media/libjpeg/jpegint.h
index 9979a912d9..6af9e2a179 100644
--- a/media/libjpeg/jpegint.h
+++ b/media/libjpeg/jpegint.h
@@ -5,8 +5,9 @@
* Copyright (C) 1991-1997, Thomas G. Lane.
* Modified 1997-2009 by Guido Vollbeding.
* libjpeg-turbo Modifications:
- * Copyright (C) 2015-2016, D. R. Commander.
+ * Copyright (C) 2015-2016, 2019, 2021, D. R. Commander.
* Copyright (C) 2015, Google, Inc.
+ * Copyright (C) 2021, Alex Richardson.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@@ -27,33 +28,45 @@ typedef enum { /* Operating modes for buffer controllers */
} J_BUF_MODE;
/* Values of global_state field (jdapi.c has some dependencies on ordering!) */
-#define CSTATE_START 100 /* after create_compress */
-#define CSTATE_SCANNING 101 /* start_compress done, write_scanlines OK */
-#define CSTATE_RAW_OK 102 /* start_compress done, write_raw_data OK */
-#define CSTATE_WRCOEFS 103 /* jpeg_write_coefficients done */
-#define DSTATE_START 200 /* after create_decompress */
-#define DSTATE_INHEADER 201 /* reading header markers, no SOS yet */
-#define DSTATE_READY 202 /* found SOS, ready for start_decompress */
-#define DSTATE_PRELOAD 203 /* reading multiscan file in start_decompress*/
-#define DSTATE_PRESCAN 204 /* performing dummy pass for 2-pass quant */
-#define DSTATE_SCANNING 205 /* start_decompress done, read_scanlines OK */
-#define DSTATE_RAW_OK 206 /* start_decompress done, read_raw_data OK */
-#define DSTATE_BUFIMAGE 207 /* expecting jpeg_start_output */
-#define DSTATE_BUFPOST 208 /* looking for SOS/EOI in jpeg_finish_output */
-#define DSTATE_RDCOEFS 209 /* reading file in jpeg_read_coefficients */
-#define DSTATE_STOPPING 210 /* looking for EOI in jpeg_finish_decompress */
+#define CSTATE_START 100 /* after create_compress */
+#define CSTATE_SCANNING 101 /* start_compress done, write_scanlines OK */
+#define CSTATE_RAW_OK 102 /* start_compress done, write_raw_data OK */
+#define CSTATE_WRCOEFS 103 /* jpeg_write_coefficients done */
+#define DSTATE_START 200 /* after create_decompress */
+#define DSTATE_INHEADER 201 /* reading header markers, no SOS yet */
+#define DSTATE_READY 202 /* found SOS, ready for start_decompress */
+#define DSTATE_PRELOAD 203 /* reading multiscan file in start_decompress*/
+#define DSTATE_PRESCAN 204 /* performing dummy pass for 2-pass quant */
+#define DSTATE_SCANNING 205 /* start_decompress done, read_scanlines OK */
+#define DSTATE_RAW_OK 206 /* start_decompress done, read_raw_data OK */
+#define DSTATE_BUFIMAGE 207 /* expecting jpeg_start_output */
+#define DSTATE_BUFPOST 208 /* looking for SOS/EOI in jpeg_finish_output */
+#define DSTATE_RDCOEFS 209 /* reading file in jpeg_read_coefficients */
+#define DSTATE_STOPPING 210 /* looking for EOI in jpeg_finish_decompress */
/* JLONG must hold at least signed 32-bit values. */
typedef long JLONG;
+/* JUINTPTR must hold pointer values. */
+#ifdef __UINTPTR_TYPE__
+/*
+ * __UINTPTR_TYPE__ is GNU-specific and available in GCC 4.6+ and Clang 3.0+.
+ * Fortunately, that is sufficient to support the few architectures for which
+ * sizeof(void *) != sizeof(size_t). The only other options would require C99
+ * or Clang-specific builtins.
+ */
+typedef __UINTPTR_TYPE__ JUINTPTR;
+#else
+typedef size_t JUINTPTR;
+#endif
/*
* Left shift macro that handles a negative operand without causing any
* sanitizer warnings
*/
-#define LEFT_SHIFT(a, b) ((JLONG)((unsigned long)(a) << (b)))
+#define LEFT_SHIFT(a, b) ((JLONG)((unsigned long)(a) << (b)))
/* Declarations for compression modules */
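/* Editor's sketch (assumed usage, not part of the patch): JUINTPTR makes
 * pointer-bit tests portable even where sizeof(void *) != sizeof(size_t),
 * and LEFT_SHIFT avoids the sanitizer trap of shifting a negative value. */
#define IS_ALIGNED_16(p)  ((((JUINTPTR)(p)) & 15) == 0)
/* LEFT_SHIFT(-1, 4) expands to (JLONG)((unsigned long)-1 << 4), which is
 * -16 on two's-complement targets, whereas a bare (-1 << 4) is flagged
 * as undefined behavior by UBSan. */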
@@ -158,6 +171,9 @@ struct jpeg_decomp_master {
JDIMENSION first_MCU_col[MAX_COMPONENTS];
JDIMENSION last_MCU_col[MAX_COMPONENTS];
boolean jinit_upsampler_no_alloc;
+
+ /* Last iMCU row that was successfully decoded */
+ JDIMENSION last_good_iMCU_row;
};
/* Input control module */
@@ -274,9 +290,9 @@ struct jpeg_color_quantizer {
/* Miscellaneous useful macros */
#undef MAX
-#define MAX(a,b) ((a) > (b) ? (a) : (b))
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
#undef MIN
-#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
/* We assume that right shift corresponds to signed division by 2 with
@@ -291,64 +307,64 @@ struct jpeg_color_quantizer {
#ifdef RIGHT_SHIFT_IS_UNSIGNED
#define SHIFT_TEMPS JLONG shift_temp;
-#define RIGHT_SHIFT(x,shft) \
- ((shift_temp = (x)) < 0 ? \
- (shift_temp >> (shft)) | ((~((JLONG) 0)) << (32-(shft))) : \
- (shift_temp >> (shft)))
+#define RIGHT_SHIFT(x, shft) \
+ ((shift_temp = (x)) < 0 ? \
+ (shift_temp >> (shft)) | ((~((JLONG)0)) << (32 - (shft))) : \
+ (shift_temp >> (shft)))
#else
#define SHIFT_TEMPS
-#define RIGHT_SHIFT(x,shft) ((x) >> (shft))
+#define RIGHT_SHIFT(x, shft) ((x) >> (shft))
#endif
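/* Worked example (editor's note, assuming a 32-bit JLONG): on a platform
 * where >> shifts zeros into a negative value, RIGHT_SHIFT(-8, 2) first
 * computes 0xFFFFFFF8 >> 2 = 0x3FFFFFFE, then ORs in ~0 << 30 = 0xC0000000
 * to restore the sign bits, giving 0xFFFFFFFE = -2 -- the same result as an
 * arithmetic shift, i.e. signed division by 4. */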
/* Compression module initialization routines */
-EXTERN(void) jinit_compress_master (j_compress_ptr cinfo);
-EXTERN(void) jinit_c_master_control (j_compress_ptr cinfo,
- boolean transcode_only);
-EXTERN(void) jinit_c_main_controller (j_compress_ptr cinfo,
- boolean need_full_buffer);
-EXTERN(void) jinit_c_prep_controller (j_compress_ptr cinfo,
- boolean need_full_buffer);
-EXTERN(void) jinit_c_coef_controller (j_compress_ptr cinfo,
- boolean need_full_buffer);
-EXTERN(void) jinit_color_converter (j_compress_ptr cinfo);
-EXTERN(void) jinit_downsampler (j_compress_ptr cinfo);
-EXTERN(void) jinit_forward_dct (j_compress_ptr cinfo);
-EXTERN(void) jinit_huff_encoder (j_compress_ptr cinfo);
-EXTERN(void) jinit_phuff_encoder (j_compress_ptr cinfo);
-EXTERN(void) jinit_arith_encoder (j_compress_ptr cinfo);
-EXTERN(void) jinit_marker_writer (j_compress_ptr cinfo);
+EXTERN(void) jinit_compress_master(j_compress_ptr cinfo);
+EXTERN(void) jinit_c_master_control(j_compress_ptr cinfo,
+ boolean transcode_only);
+EXTERN(void) jinit_c_main_controller(j_compress_ptr cinfo,
+ boolean need_full_buffer);
+EXTERN(void) jinit_c_prep_controller(j_compress_ptr cinfo,
+ boolean need_full_buffer);
+EXTERN(void) jinit_c_coef_controller(j_compress_ptr cinfo,
+ boolean need_full_buffer);
+EXTERN(void) jinit_color_converter(j_compress_ptr cinfo);
+EXTERN(void) jinit_downsampler(j_compress_ptr cinfo);
+EXTERN(void) jinit_forward_dct(j_compress_ptr cinfo);
+EXTERN(void) jinit_huff_encoder(j_compress_ptr cinfo);
+EXTERN(void) jinit_phuff_encoder(j_compress_ptr cinfo);
+EXTERN(void) jinit_arith_encoder(j_compress_ptr cinfo);
+EXTERN(void) jinit_marker_writer(j_compress_ptr cinfo);
/* Decompression module initialization routines */
-EXTERN(void) jinit_master_decompress (j_decompress_ptr cinfo);
-EXTERN(void) jinit_d_main_controller (j_decompress_ptr cinfo,
- boolean need_full_buffer);
-EXTERN(void) jinit_d_coef_controller (j_decompress_ptr cinfo,
- boolean need_full_buffer);
-EXTERN(void) jinit_d_post_controller (j_decompress_ptr cinfo,
- boolean need_full_buffer);
-EXTERN(void) jinit_input_controller (j_decompress_ptr cinfo);
-EXTERN(void) jinit_marker_reader (j_decompress_ptr cinfo);
-EXTERN(void) jinit_huff_decoder (j_decompress_ptr cinfo);
-EXTERN(void) jinit_phuff_decoder (j_decompress_ptr cinfo);
-EXTERN(void) jinit_arith_decoder (j_decompress_ptr cinfo);
-EXTERN(void) jinit_inverse_dct (j_decompress_ptr cinfo);
-EXTERN(void) jinit_upsampler (j_decompress_ptr cinfo);
-EXTERN(void) jinit_color_deconverter (j_decompress_ptr cinfo);
-EXTERN(void) jinit_1pass_quantizer (j_decompress_ptr cinfo);
-EXTERN(void) jinit_2pass_quantizer (j_decompress_ptr cinfo);
-EXTERN(void) jinit_merged_upsampler (j_decompress_ptr cinfo);
+EXTERN(void) jinit_master_decompress(j_decompress_ptr cinfo);
+EXTERN(void) jinit_d_main_controller(j_decompress_ptr cinfo,
+ boolean need_full_buffer);
+EXTERN(void) jinit_d_coef_controller(j_decompress_ptr cinfo,
+ boolean need_full_buffer);
+EXTERN(void) jinit_d_post_controller(j_decompress_ptr cinfo,
+ boolean need_full_buffer);
+EXTERN(void) jinit_input_controller(j_decompress_ptr cinfo);
+EXTERN(void) jinit_marker_reader(j_decompress_ptr cinfo);
+EXTERN(void) jinit_huff_decoder(j_decompress_ptr cinfo);
+EXTERN(void) jinit_phuff_decoder(j_decompress_ptr cinfo);
+EXTERN(void) jinit_arith_decoder(j_decompress_ptr cinfo);
+EXTERN(void) jinit_inverse_dct(j_decompress_ptr cinfo);
+EXTERN(void) jinit_upsampler(j_decompress_ptr cinfo);
+EXTERN(void) jinit_color_deconverter(j_decompress_ptr cinfo);
+EXTERN(void) jinit_1pass_quantizer(j_decompress_ptr cinfo);
+EXTERN(void) jinit_2pass_quantizer(j_decompress_ptr cinfo);
+EXTERN(void) jinit_merged_upsampler(j_decompress_ptr cinfo);
/* Memory manager initialization */
-EXTERN(void) jinit_memory_mgr (j_common_ptr cinfo);
+EXTERN(void) jinit_memory_mgr(j_common_ptr cinfo);
/* Utility routines in jutils.c */
-EXTERN(long) jdiv_round_up (long a, long b);
-EXTERN(long) jround_up (long a, long b);
-EXTERN(void) jcopy_sample_rows (JSAMPARRAY input_array, int source_row,
- JSAMPARRAY output_array, int dest_row,
- int num_rows, JDIMENSION num_cols);
-EXTERN(void) jcopy_block_row (JBLOCKROW input_row, JBLOCKROW output_row,
- JDIMENSION num_blocks);
-EXTERN(void) jzero_far (void *target, size_t bytestozero);
+EXTERN(long) jdiv_round_up(long a, long b);
+EXTERN(long) jround_up(long a, long b);
+EXTERN(void) jcopy_sample_rows(JSAMPARRAY input_array, int source_row,
+ JSAMPARRAY output_array, int dest_row,
+ int num_rows, JDIMENSION num_cols);
+EXTERN(void) jcopy_block_row(JBLOCKROW input_row, JBLOCKROW output_row,
+ JDIMENSION num_blocks);
+EXTERN(void) jzero_far(void *target, size_t bytestozero);
/* Constant tables in jutils.c */
#if 0 /* This table is not actually needed in v6a */
extern const int jpeg_zigzag_order[]; /* natural coef order to zigzag order */
@@ -357,12 +373,3 @@ extern const int jpeg_natural_order[]; /* zigzag coef order to natural order */
/* Arithmetic coding probability estimation tables in jaricom.c */
extern const JLONG jpeg_aritab[];
-
-/* Suppress undefined-structure complaints if necessary. */
-
-#ifdef INCOMPLETE_TYPES_BROKEN
-#ifndef AM_MEMORY_MANAGER /* only jmemmgr.c defines these */
-struct jvirt_sarray_control { long dummy; };
-struct jvirt_barray_control { long dummy; };
-#endif
-#endif /* INCOMPLETE_TYPES_BROKEN */
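The jutils.c helpers declared above have the usual rounding semantics:
jdiv_round_up(a, b) is ceil(a / b) and jround_up(a, b) rounds a up to a
multiple of b. Illustrative values (editor's addition):

long imcu_rows = jdiv_round_up(100L, 8L);  /* 13 iMCU rows for 100 lines */
long padded    = jround_up(100L, 8L);      /* 104, next multiple of 8 */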
diff --git a/media/libjpeg/jpeglib.h b/media/libjpeg/jpeglib.h
index 6c63f58222..d7664f0630 100644
--- a/media/libjpeg/jpeglib.h
+++ b/media/libjpeg/jpeglib.h
@@ -5,7 +5,7 @@
* Copyright (C) 1991-1998, Thomas G. Lane.
* Modified 2002-2009 by Guido Vollbeding.
* libjpeg-turbo Modifications:
- * Copyright (C) 2009-2011, 2013-2014, 2016, D. R. Commander.
+ * Copyright (C) 2009-2011, 2013-2014, 2016-2017, 2020, D. R. Commander.
* Copyright (C) 2015, Google, Inc.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
@@ -211,8 +211,8 @@ struct jpeg_marker_struct {
/* Known color spaces. */
-#define JCS_EXTENSIONS 1
-#define JCS_ALPHA_EXTENSIONS 1
+#define JCS_EXTENSIONS 1
+#define JCS_ALPHA_EXTENSIONS 1
typedef enum {
JCS_UNKNOWN, /* error/unspecified */
@@ -244,9 +244,9 @@ typedef enum {
/* DCT/IDCT algorithm options. */
typedef enum {
- JDCT_ISLOW, /* slow but accurate integer algorithm */
- JDCT_IFAST, /* faster, less accurate integer method */
- JDCT_FLOAT /* floating-point: accurate, fast on fast HW */
+ JDCT_ISLOW, /* accurate integer method */
+ JDCT_IFAST, /* less accurate integer method [legacy feature] */
+ JDCT_FLOAT /* floating-point method [legacy feature] */
} J_DCT_METHOD;
#ifndef JDCT_DEFAULT /* may be overridden in jconfig.h */
@@ -268,11 +268,11 @@ typedef enum {
/* Common fields between JPEG compression and decompression master structs. */
#define jpeg_common_fields \
- struct jpeg_error_mgr *err; /* Error handler module */\
- struct jpeg_memory_mgr *mem; /* Memory manager module */\
- struct jpeg_progress_mgr *progress; /* Progress monitor, or NULL if none */\
- void *client_data; /* Available for use by application */\
- boolean is_decompressor; /* So common code can tell which is which */\
+ struct jpeg_error_mgr *err; /* Error handler module */ \
+ struct jpeg_memory_mgr *mem; /* Memory manager module */ \
+ struct jpeg_progress_mgr *progress; /* Progress monitor, or NULL if none */ \
+ void *client_data; /* Available for use by application */ \
+ boolean is_decompressor; /* So common code can tell which is which */ \
int global_state /* For checking call sequence validity */
/* Routines that are to be used by both halves of the library are declared
@@ -822,9 +822,9 @@ struct jpeg_source_mgr {
* successful.
*/
-#define JPOOL_PERMANENT 0 /* lasts until master record is destroyed */
-#define JPOOL_IMAGE 1 /* lasts until done with image/datastream */
-#define JPOOL_NUMPOOLS 2
+#define JPOOL_PERMANENT 0 /* lasts until master record is destroyed */
+#define JPOOL_IMAGE 1 /* lasts until done with image/datastream */
+#define JPOOL_NUMPOOLS 2
typedef struct jvirt_sarray_control *jvirt_sarray_ptr;
typedef struct jvirt_barray_control *jvirt_barray_ptr;
@@ -888,7 +888,7 @@ typedef boolean (*jpeg_marker_parser_method) (j_decompress_ptr cinfo);
/* Default error-management setup */
-EXTERN(struct jpeg_error_mgr *) jpeg_std_error (struct jpeg_error_mgr *err);
+EXTERN(struct jpeg_error_mgr *) jpeg_std_error(struct jpeg_error_mgr *err);
/* Initialization of JPEG compression objects.
* jpeg_create_compress() and jpeg_create_decompress() are the exported
@@ -898,90 +898,95 @@ EXTERN(struct jpeg_error_mgr *) jpeg_std_error (struct jpeg_error_mgr *err);
* NB: you must set up the error-manager BEFORE calling jpeg_create_xxx.
*/
#define jpeg_create_compress(cinfo) \
- jpeg_CreateCompress((cinfo), JPEG_LIB_VERSION, \
- (size_t) sizeof(struct jpeg_compress_struct))
+ jpeg_CreateCompress((cinfo), JPEG_LIB_VERSION, \
+ (size_t)sizeof(struct jpeg_compress_struct))
#define jpeg_create_decompress(cinfo) \
- jpeg_CreateDecompress((cinfo), JPEG_LIB_VERSION, \
- (size_t) sizeof(struct jpeg_decompress_struct))
-EXTERN(void) jpeg_CreateCompress (j_compress_ptr cinfo, int version,
- size_t structsize);
-EXTERN(void) jpeg_CreateDecompress (j_decompress_ptr cinfo, int version,
- size_t structsize);
+ jpeg_CreateDecompress((cinfo), JPEG_LIB_VERSION, \
+ (size_t)sizeof(struct jpeg_decompress_struct))
+EXTERN(void) jpeg_CreateCompress(j_compress_ptr cinfo, int version,
+ size_t structsize);
+EXTERN(void) jpeg_CreateDecompress(j_decompress_ptr cinfo, int version,
+ size_t structsize);
/* Destruction of JPEG compression objects */
-EXTERN(void) jpeg_destroy_compress (j_compress_ptr cinfo);
-EXTERN(void) jpeg_destroy_decompress (j_decompress_ptr cinfo);
+EXTERN(void) jpeg_destroy_compress(j_compress_ptr cinfo);
+EXTERN(void) jpeg_destroy_decompress(j_decompress_ptr cinfo);
/* Standard data source and destination managers: stdio streams. */
/* Caller is responsible for opening the file before and closing after. */
-EXTERN(void) jpeg_stdio_dest (j_compress_ptr cinfo, FILE *outfile);
-EXTERN(void) jpeg_stdio_src (j_decompress_ptr cinfo, FILE *infile);
+EXTERN(void) jpeg_stdio_dest(j_compress_ptr cinfo, FILE *outfile);
+EXTERN(void) jpeg_stdio_src(j_decompress_ptr cinfo, FILE *infile);
#if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED)
/* Data source and destination managers: memory buffers. */
-EXTERN(void) jpeg_mem_dest (j_compress_ptr cinfo, unsigned char **outbuffer,
- unsigned long *outsize);
-EXTERN(void) jpeg_mem_src (j_decompress_ptr cinfo,
- const unsigned char *inbuffer,
- unsigned long insize);
+EXTERN(void) jpeg_mem_dest(j_compress_ptr cinfo, unsigned char **outbuffer,
+ unsigned long *outsize);
+EXTERN(void) jpeg_mem_src(j_decompress_ptr cinfo,
+ const unsigned char *inbuffer, unsigned long insize);
#endif
/* Default parameter setup for compression */
-EXTERN(void) jpeg_set_defaults (j_compress_ptr cinfo);
+EXTERN(void) jpeg_set_defaults(j_compress_ptr cinfo);
/* Compression parameter setup aids */
-EXTERN(void) jpeg_set_colorspace (j_compress_ptr cinfo,
- J_COLOR_SPACE colorspace);
-EXTERN(void) jpeg_default_colorspace (j_compress_ptr cinfo);
-EXTERN(void) jpeg_set_quality (j_compress_ptr cinfo, int quality,
- boolean force_baseline);
-EXTERN(void) jpeg_set_linear_quality (j_compress_ptr cinfo, int scale_factor,
- boolean force_baseline);
+EXTERN(void) jpeg_set_colorspace(j_compress_ptr cinfo,
+ J_COLOR_SPACE colorspace);
+EXTERN(void) jpeg_default_colorspace(j_compress_ptr cinfo);
+EXTERN(void) jpeg_set_quality(j_compress_ptr cinfo, int quality,
+ boolean force_baseline);
+EXTERN(void) jpeg_set_linear_quality(j_compress_ptr cinfo, int scale_factor,
+ boolean force_baseline);
#if JPEG_LIB_VERSION >= 70
-EXTERN(void) jpeg_default_qtables (j_compress_ptr cinfo,
- boolean force_baseline);
+EXTERN(void) jpeg_default_qtables(j_compress_ptr cinfo,
+ boolean force_baseline);
#endif
-EXTERN(void) jpeg_add_quant_table (j_compress_ptr cinfo, int which_tbl,
- const unsigned int *basic_table,
- int scale_factor, boolean force_baseline);
-EXTERN(int) jpeg_quality_scaling (int quality);
-EXTERN(void) jpeg_simple_progression (j_compress_ptr cinfo);
-EXTERN(void) jpeg_suppress_tables (j_compress_ptr cinfo, boolean suppress);
-EXTERN(JQUANT_TBL *) jpeg_alloc_quant_table (j_common_ptr cinfo);
-EXTERN(JHUFF_TBL *) jpeg_alloc_huff_table (j_common_ptr cinfo);
+EXTERN(void) jpeg_add_quant_table(j_compress_ptr cinfo, int which_tbl,
+ const unsigned int *basic_table,
+ int scale_factor, boolean force_baseline);
+EXTERN(int) jpeg_quality_scaling(int quality);
+EXTERN(void) jpeg_simple_progression(j_compress_ptr cinfo);
+EXTERN(void) jpeg_suppress_tables(j_compress_ptr cinfo, boolean suppress);
+EXTERN(JQUANT_TBL *) jpeg_alloc_quant_table(j_common_ptr cinfo);
+EXTERN(JHUFF_TBL *) jpeg_alloc_huff_table(j_common_ptr cinfo);
/* Main entry points for compression */
-EXTERN(void) jpeg_start_compress (j_compress_ptr cinfo,
- boolean write_all_tables);
-EXTERN(JDIMENSION) jpeg_write_scanlines (j_compress_ptr cinfo,
- JSAMPARRAY scanlines,
- JDIMENSION num_lines);
-EXTERN(void) jpeg_finish_compress (j_compress_ptr cinfo);
+EXTERN(void) jpeg_start_compress(j_compress_ptr cinfo,
+ boolean write_all_tables);
+EXTERN(JDIMENSION) jpeg_write_scanlines(j_compress_ptr cinfo,
+ JSAMPARRAY scanlines,
+ JDIMENSION num_lines);
+EXTERN(void) jpeg_finish_compress(j_compress_ptr cinfo);
#if JPEG_LIB_VERSION >= 70
/* Precalculate JPEG dimensions for current compression parameters. */
-EXTERN(void) jpeg_calc_jpeg_dimensions (j_compress_ptr cinfo);
+EXTERN(void) jpeg_calc_jpeg_dimensions(j_compress_ptr cinfo);
#endif
/* Replaces jpeg_write_scanlines when writing raw downsampled data. */
-EXTERN(JDIMENSION) jpeg_write_raw_data (j_compress_ptr cinfo, JSAMPIMAGE data,
- JDIMENSION num_lines);
+EXTERN(JDIMENSION) jpeg_write_raw_data(j_compress_ptr cinfo, JSAMPIMAGE data,
+ JDIMENSION num_lines);
/* Write a special marker. See libjpeg.txt concerning safe usage. */
-EXTERN(void) jpeg_write_marker (j_compress_ptr cinfo, int marker,
- const JOCTET *dataptr, unsigned int datalen);
+EXTERN(void) jpeg_write_marker(j_compress_ptr cinfo, int marker,
+ const JOCTET *dataptr, unsigned int datalen);
/* Same, but piecemeal. */
-EXTERN(void) jpeg_write_m_header (j_compress_ptr cinfo, int marker,
- unsigned int datalen);
-EXTERN(void) jpeg_write_m_byte (j_compress_ptr cinfo, int val);
+EXTERN(void) jpeg_write_m_header(j_compress_ptr cinfo, int marker,
+ unsigned int datalen);
+EXTERN(void) jpeg_write_m_byte(j_compress_ptr cinfo, int val);
/* Alternate compression function: just write an abbreviated table file */
-EXTERN(void) jpeg_write_tables (j_compress_ptr cinfo);
+EXTERN(void) jpeg_write_tables(j_compress_ptr cinfo);
+
+/* Write ICC profile. See libjpeg.txt for usage information. */
+EXTERN(void) jpeg_write_icc_profile(j_compress_ptr cinfo,
+ const JOCTET *icc_data_ptr,
+ unsigned int icc_data_len);
+
/* Decompression startup: read start of JPEG datastream to see what's there */
-EXTERN(int) jpeg_read_header (j_decompress_ptr cinfo, boolean require_image);
+EXTERN(int) jpeg_read_header(j_decompress_ptr cinfo, boolean require_image);
/* Return value is one of: */
-#define JPEG_SUSPENDED 0 /* Suspended due to lack of input data */
-#define JPEG_HEADER_OK 1 /* Found valid image datastream */
-#define JPEG_HEADER_TABLES_ONLY 2 /* Found valid table-specs-only datastream */
+#define JPEG_SUSPENDED 0 /* Suspended due to lack of input data */
+#define JPEG_HEADER_OK 1 /* Found valid image datastream */
+#define JPEG_HEADER_TABLES_ONLY 2 /* Found valid table-specs-only datastream */
/* If you pass require_image = TRUE (normal case), you need not check for
* a TABLES_ONLY return code; an abbreviated file will cause an error exit.
* JPEG_SUSPENDED is only possible if you use a data source module that can
@@ -989,27 +994,27 @@ EXTERN(int) jpeg_read_header (j_decompress_ptr cinfo, boolean require_image);
*/
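/* Minimal calling sketch for the return codes above (editor's addition;
 * cinfo is an initialized jpeg_decompress_struct, and JPEG_SUSPENDED can
 * only occur with a suspending data source): */
for (;;) {
  int rc = jpeg_read_header(&cinfo, TRUE);
  if (rc == JPEG_SUSPENDED)
    continue;  /* supply more input to the source manager, then retry */
  break;       /* JPEG_HEADER_OK: image parameters are now filled in */
}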
/* Main entry points for decompression */
-EXTERN(boolean) jpeg_start_decompress (j_decompress_ptr cinfo);
-EXTERN(JDIMENSION) jpeg_read_scanlines (j_decompress_ptr cinfo,
- JSAMPARRAY scanlines,
- JDIMENSION max_lines);
-EXTERN(JDIMENSION) jpeg_skip_scanlines (j_decompress_ptr cinfo,
- JDIMENSION num_lines);
-EXTERN(void) jpeg_crop_scanline (j_decompress_ptr cinfo, JDIMENSION *xoffset,
- JDIMENSION *width);
-EXTERN(boolean) jpeg_finish_decompress (j_decompress_ptr cinfo);
+EXTERN(boolean) jpeg_start_decompress(j_decompress_ptr cinfo);
+EXTERN(JDIMENSION) jpeg_read_scanlines(j_decompress_ptr cinfo,
+ JSAMPARRAY scanlines,
+ JDIMENSION max_lines);
+EXTERN(JDIMENSION) jpeg_skip_scanlines(j_decompress_ptr cinfo,
+ JDIMENSION num_lines);
+EXTERN(void) jpeg_crop_scanline(j_decompress_ptr cinfo, JDIMENSION *xoffset,
+ JDIMENSION *width);
+EXTERN(boolean) jpeg_finish_decompress(j_decompress_ptr cinfo);
/* Replaces jpeg_read_scanlines when reading raw downsampled data. */
-EXTERN(JDIMENSION) jpeg_read_raw_data (j_decompress_ptr cinfo, JSAMPIMAGE data,
- JDIMENSION max_lines);
+EXTERN(JDIMENSION) jpeg_read_raw_data(j_decompress_ptr cinfo, JSAMPIMAGE data,
+ JDIMENSION max_lines);
/* Additional entry points for buffered-image mode. */
-EXTERN(boolean) jpeg_has_multiple_scans (j_decompress_ptr cinfo);
-EXTERN(boolean) jpeg_start_output (j_decompress_ptr cinfo, int scan_number);
-EXTERN(boolean) jpeg_finish_output (j_decompress_ptr cinfo);
-EXTERN(boolean) jpeg_input_complete (j_decompress_ptr cinfo);
-EXTERN(void) jpeg_new_colormap (j_decompress_ptr cinfo);
-EXTERN(int) jpeg_consume_input (j_decompress_ptr cinfo);
+EXTERN(boolean) jpeg_has_multiple_scans(j_decompress_ptr cinfo);
+EXTERN(boolean) jpeg_start_output(j_decompress_ptr cinfo, int scan_number);
+EXTERN(boolean) jpeg_finish_output(j_decompress_ptr cinfo);
+EXTERN(boolean) jpeg_input_complete(j_decompress_ptr cinfo);
+EXTERN(void) jpeg_new_colormap(j_decompress_ptr cinfo);
+EXTERN(int) jpeg_consume_input(j_decompress_ptr cinfo);
/* Return value is one of: */
/* #define JPEG_SUSPENDED 0 Suspended due to lack of input data */
#define JPEG_REACHED_SOS 1 /* Reached start of new scan */
@@ -1019,25 +1024,25 @@ EXTERN(int) jpeg_consume_input (j_decompress_ptr cinfo);
/* Precalculate output dimensions for current decompression parameters. */
#if JPEG_LIB_VERSION >= 80
-EXTERN(void) jpeg_core_output_dimensions (j_decompress_ptr cinfo);
+EXTERN(void) jpeg_core_output_dimensions(j_decompress_ptr cinfo);
#endif
-EXTERN(void) jpeg_calc_output_dimensions (j_decompress_ptr cinfo);
+EXTERN(void) jpeg_calc_output_dimensions(j_decompress_ptr cinfo);
/* Control saving of COM and APPn markers into marker_list. */
-EXTERN(void) jpeg_save_markers (j_decompress_ptr cinfo, int marker_code,
- unsigned int length_limit);
+EXTERN(void) jpeg_save_markers(j_decompress_ptr cinfo, int marker_code,
+ unsigned int length_limit);
/* Install a special processing method for COM or APPn markers. */
-EXTERN(void) jpeg_set_marker_processor (j_decompress_ptr cinfo,
- int marker_code,
- jpeg_marker_parser_method routine);
+EXTERN(void) jpeg_set_marker_processor(j_decompress_ptr cinfo,
+ int marker_code,
+ jpeg_marker_parser_method routine);
/* Read or write raw DCT coefficients --- useful for lossless transcoding. */
-EXTERN(jvirt_barray_ptr *) jpeg_read_coefficients (j_decompress_ptr cinfo);
-EXTERN(void) jpeg_write_coefficients (j_compress_ptr cinfo,
- jvirt_barray_ptr *coef_arrays);
-EXTERN(void) jpeg_copy_critical_parameters (j_decompress_ptr srcinfo,
- j_compress_ptr dstinfo);
+EXTERN(jvirt_barray_ptr *) jpeg_read_coefficients(j_decompress_ptr cinfo);
+EXTERN(void) jpeg_write_coefficients(j_compress_ptr cinfo,
+ jvirt_barray_ptr *coef_arrays);
+EXTERN(void) jpeg_copy_critical_parameters(j_decompress_ptr srcinfo,
+ j_compress_ptr dstinfo);
/* If you choose to abort compression or decompression before completing
* jpeg_finish_(de)compress, then you need to clean up to release memory,
@@ -1045,17 +1050,22 @@ EXTERN(void) jpeg_copy_critical_parameters (j_decompress_ptr srcinfo,
* if you're done with the JPEG object, but if you want to clean it up and
* reuse it, call this:
*/
-EXTERN(void) jpeg_abort_compress (j_compress_ptr cinfo);
-EXTERN(void) jpeg_abort_decompress (j_decompress_ptr cinfo);
+EXTERN(void) jpeg_abort_compress(j_compress_ptr cinfo);
+EXTERN(void) jpeg_abort_decompress(j_decompress_ptr cinfo);
/* Generic versions of jpeg_abort and jpeg_destroy that work on either
* flavor of JPEG object. These may be more convenient in some places.
*/
-EXTERN(void) jpeg_abort (j_common_ptr cinfo);
-EXTERN(void) jpeg_destroy (j_common_ptr cinfo);
+EXTERN(void) jpeg_abort(j_common_ptr cinfo);
+EXTERN(void) jpeg_destroy(j_common_ptr cinfo);
/* Default restart-marker-resync procedure for use by data source modules */
-EXTERN(boolean) jpeg_resync_to_restart (j_decompress_ptr cinfo, int desired);
+EXTERN(boolean) jpeg_resync_to_restart(j_decompress_ptr cinfo, int desired);
+
+/* Read ICC profile. See libjpeg.txt for usage information. */
+EXTERN(boolean) jpeg_read_icc_profile(j_decompress_ptr cinfo,
+ JOCTET **icc_data_ptr,
+ unsigned int *icc_data_len);
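/* Sketch of the intended call sequence (editor's addition, per libjpeg.txt;
 * needs <stdlib.h> for free()). APP2 markers must be saved before the
 * header is parsed, and the profile buffer is malloc()ed for the caller: */
JOCTET *icc_data = NULL;
unsigned int icc_len = 0;
jpeg_save_markers(&cinfo, JPEG_APP0 + 2, 0xFFFF);  /* keep ICC APP2 chunks */
jpeg_read_header(&cinfo, TRUE);
if (jpeg_read_icc_profile(&cinfo, &icc_data, &icc_len)) {
  /* ...hand icc_data/icc_len to the color-management code... */
  free(icc_data);
}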
/* These marker codes are exported since applications and data source modules
diff --git a/media/libjpeg/jquant1.c b/media/libjpeg/jquant1.c
index e7814815ef..73b83e16e5 100644
--- a/media/libjpeg/jquant1.c
+++ b/media/libjpeg/jquant1.c
@@ -73,8 +73,9 @@
#define ODITHER_SIZE 16 /* dimension of dither matrix */
/* NB: if ODITHER_SIZE is not a power of 2, ODITHER_MASK uses will break */
-#define ODITHER_CELLS (ODITHER_SIZE*ODITHER_SIZE) /* # cells in matrix */
-#define ODITHER_MASK (ODITHER_SIZE-1) /* mask for wrapping around counters */
+#define ODITHER_CELLS (ODITHER_SIZE * ODITHER_SIZE) /* # cells in matrix */
+#define ODITHER_MASK (ODITHER_SIZE - 1) /* mask for wrapping around
+ counters */
typedef int ODITHER_MATRIX[ODITHER_SIZE][ODITHER_SIZE];
typedef int (*ODITHER_MATRIX_PTR)[ODITHER_SIZE];
@@ -132,12 +133,12 @@ typedef JLONG FSERROR; /* may need more than 16 bits */
typedef JLONG LOCFSERROR; /* be sure calculation temps are big enough */
#endif
-typedef FSERROR *FSERRPTR; /* pointer to error array */
+typedef FSERROR *FSERRPTR; /* pointer to error array */
/* Private subobject */
-#define MAX_Q_COMPS 4 /* max components I can handle */
+#define MAX_Q_COMPS 4 /* max components I can handle */
typedef struct {
struct jpeg_color_quantizer pub; /* public fields */
@@ -153,7 +154,7 @@ typedef struct {
*/
boolean is_padded; /* is the colorindex padded for odither? */
- int Ncolors[MAX_Q_COMPS]; /* # of values alloced to each component */
+ int Ncolors[MAX_Q_COMPS]; /* # of values allocated to each component */
/* Variables for ordered dithering */
int row_index; /* cur row's vertical index in dither matrix */
@@ -183,7 +184,7 @@ typedef my_cquantizer *my_cquantize_ptr;
LOCAL(int)
-select_ncolors (j_decompress_ptr cinfo, int Ncolors[])
+select_ncolors(j_decompress_ptr cinfo, int Ncolors[])
/* Determine allocation of desired colors to components, */
/* and fill in Ncolors[] array to indicate choice. */
/* Return value is total number of colors (product of Ncolors[] values). */
@@ -206,12 +207,12 @@ select_ncolors (j_decompress_ptr cinfo, int Ncolors[])
temp = iroot; /* set temp = iroot ** nc */
for (i = 1; i < nc; i++)
temp *= iroot;
- } while (temp <= (long) max_colors); /* repeat till iroot exceeds root */
+ } while (temp <= (long)max_colors); /* repeat till iroot exceeds root */
iroot--; /* now iroot = floor(root) */
/* Must have at least 2 color values per component */
if (iroot < 2)
- ERREXIT1(cinfo, JERR_QUANT_FEW_COLORS, (int) temp);
+ ERREXIT1(cinfo, JERR_QUANT_FEW_COLORS, (int)temp);
/* Initialize to iroot color values for each component */
total_colors = 1;
@@ -231,11 +232,11 @@ select_ncolors (j_decompress_ptr cinfo, int Ncolors[])
j = (cinfo->out_color_space == JCS_RGB ? RGB_order[i] : i);
/* calculate new total_colors if Ncolors[j] is incremented */
temp = total_colors / Ncolors[j];
- temp *= Ncolors[j]+1; /* done in long arith to avoid oflo */
- if (temp > (long) max_colors)
+ temp *= Ncolors[j] + 1; /* done in long arith to avoid oflo */
+ if (temp > (long)max_colors)
break; /* won't fit, done with this pass */
Ncolors[j]++; /* OK, apply the increment */
- total_colors = (int) temp;
+ total_colors = (int)temp;
changed = TRUE;
}
} while (changed);
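/* Worked example (editor's note): with max_colors = 256 and three
 * components, iroot settles at 6 (6^3 = 216 <= 256, 7^3 = 343 > 256).
 * The increment loop visits green first (RGB_order), raising it to 7 for
 * 6*7*6 = 252 colors; bumping red next would give 294 > 256, so the final
 * allocation is R=6, G=7, B=6. */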
@@ -245,7 +246,7 @@ select_ncolors (j_decompress_ptr cinfo, int Ncolors[])
LOCAL(int)
-output_value (j_decompress_ptr cinfo, int ci, int j, int maxj)
+output_value(j_decompress_ptr cinfo, int ci, int j, int maxj)
/* Return j'th output value, where j will range from 0 to maxj */
/* The output values must fall in 0..MAXJSAMPLE in increasing order */
{
@@ -254,17 +255,17 @@ output_value (j_decompress_ptr cinfo, int ci, int j, int maxj)
* (Forcing the upper and lower values to the limits ensures that
* dithering can't produce a color outside the selected gamut.)
*/
- return (int) (((JLONG) j * MAXJSAMPLE + maxj/2) / maxj);
+ return (int)(((JLONG)j * MAXJSAMPLE + maxj / 2) / maxj);
}
LOCAL(int)
-largest_input_value (j_decompress_ptr cinfo, int ci, int j, int maxj)
+largest_input_value(j_decompress_ptr cinfo, int ci, int j, int maxj)
/* Return largest input value that should map to j'th output value */
/* Must have largest(j=0) >= 0, and largest(j=maxj) >= MAXJSAMPLE */
{
/* Breakpoints are halfway between values returned by output_value */
- return (int) (((JLONG) (2*j + 1) * MAXJSAMPLE + maxj) / (2*maxj));
+ return (int)(((JLONG)(2 * j + 1) * MAXJSAMPLE + maxj) / (2 * maxj));
}
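/* Worked example (editor's note): for nci = 6 levels (maxj = 5,
 * MAXJSAMPLE = 255), output_value yields 0, 51, 102, 153, 204, 255, and
 * largest_input_value(..., 0, 5) = (1*255 + 5) / 10 = 26, so input
 * samples 0..26 quantize to the first output level. */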
@@ -273,21 +274,21 @@ largest_input_value (j_decompress_ptr cinfo, int ci, int j, int maxj)
*/
LOCAL(void)
-create_colormap (j_decompress_ptr cinfo)
+create_colormap(j_decompress_ptr cinfo)
{
- my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+ my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
JSAMPARRAY colormap; /* Created colormap */
int total_colors; /* Number of distinct output colors */
- int i,j,k, nci, blksize, blkdist, ptr, val;
+ int i, j, k, nci, blksize, blkdist, ptr, val;
/* Select number of colors for each component */
total_colors = select_ncolors(cinfo, cquantize->Ncolors);
/* Report selected color counts */
if (cinfo->out_color_components == 3)
- TRACEMS4(cinfo, 1, JTRC_QUANT_3_NCOLORS,
- total_colors, cquantize->Ncolors[0],
- cquantize->Ncolors[1], cquantize->Ncolors[2]);
+ TRACEMS4(cinfo, 1, JTRC_QUANT_3_NCOLORS, total_colors,
+ cquantize->Ncolors[0], cquantize->Ncolors[1],
+ cquantize->Ncolors[2]);
else
TRACEMS1(cinfo, 1, JTRC_QUANT_NCOLORS, total_colors);
@@ -296,8 +297,8 @@ create_colormap (j_decompress_ptr cinfo)
/* i.e. rightmost (highest-indexed) color changes most rapidly. */
colormap = (*cinfo->mem->alloc_sarray)
- ((j_common_ptr) cinfo, JPOOL_IMAGE,
- (JDIMENSION) total_colors, (JDIMENSION) cinfo->out_color_components);
+ ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ (JDIMENSION)total_colors, (JDIMENSION)cinfo->out_color_components);
/* blksize is number of adjacent repeated entries for a component */
/* blkdist is distance between groups of identical entries for a component */
@@ -309,12 +310,12 @@ create_colormap (j_decompress_ptr cinfo)
blksize = blkdist / nci;
for (j = 0; j < nci; j++) {
/* Compute j'th output value (out of nci) for component */
- val = output_value(cinfo, i, j, nci-1);
+ val = output_value(cinfo, i, j, nci - 1);
/* Fill in all colormap entries that have this value of this component */
for (ptr = j * blksize; ptr < total_colors; ptr += blkdist) {
/* fill in blksize entries beginning at ptr */
for (k = 0; k < blksize; k++)
- colormap[i][ptr+k] = (JSAMPLE) val;
+ colormap[i][ptr + k] = (JSAMPLE)val;
}
}
blkdist = blksize; /* blksize of this color is blkdist of next */
@@ -333,11 +334,11 @@ create_colormap (j_decompress_ptr cinfo)
*/
LOCAL(void)
-create_colorindex (j_decompress_ptr cinfo)
+create_colorindex(j_decompress_ptr cinfo)
{
- my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+ my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
JSAMPROW indexptr;
- int i,j,k, nci, blksize, val, pad;
+ int i, j, k, nci, blksize, val, pad;
/* For ordered dither, we pad the color index tables by MAXJSAMPLE in
* each direction (input index values can be -MAXJSAMPLE .. 2*MAXJSAMPLE).
@@ -345,7 +346,7 @@ create_colorindex (j_decompress_ptr cinfo)
* flag whether it was done in case user changes dithering mode.
*/
if (cinfo->dither_mode == JDITHER_ORDERED) {
- pad = MAXJSAMPLE*2;
+ pad = MAXJSAMPLE * 2;
cquantize->is_padded = TRUE;
} else {
pad = 0;
@@ -353,9 +354,9 @@ create_colorindex (j_decompress_ptr cinfo)
}
cquantize->colorindex = (*cinfo->mem->alloc_sarray)
- ((j_common_ptr) cinfo, JPOOL_IMAGE,
- (JDIMENSION) (MAXJSAMPLE+1 + pad),
- (JDIMENSION) cinfo->out_color_components);
+ ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ (JDIMENSION)(MAXJSAMPLE + 1 + pad),
+ (JDIMENSION)cinfo->out_color_components);
/* blksize is number of adjacent repeated entries for a component */
blksize = cquantize->sv_actual;
@@ -373,18 +374,18 @@ create_colorindex (j_decompress_ptr cinfo)
/* and k = largest j that maps to current val */
indexptr = cquantize->colorindex[i];
val = 0;
- k = largest_input_value(cinfo, i, 0, nci-1);
+ k = largest_input_value(cinfo, i, 0, nci - 1);
for (j = 0; j <= MAXJSAMPLE; j++) {
while (j > k) /* advance val if past boundary */
- k = largest_input_value(cinfo, i, ++val, nci-1);
+ k = largest_input_value(cinfo, i, ++val, nci - 1);
/* premultiply so that no multiplication needed in main processing */
- indexptr[j] = (JSAMPLE) (val * blksize);
+ indexptr[j] = (JSAMPLE)(val * blksize);
}
/* Pad at both ends if necessary */
if (pad)
for (j = 1; j <= MAXJSAMPLE; j++) {
indexptr[-j] = indexptr[0];
- indexptr[MAXJSAMPLE+j] = indexptr[MAXJSAMPLE];
+ indexptr[MAXJSAMPLE + j] = indexptr[MAXJSAMPLE];
}
}
}
@@ -396,29 +397,29 @@ create_colorindex (j_decompress_ptr cinfo)
*/
LOCAL(ODITHER_MATRIX_PTR)
-make_odither_array (j_decompress_ptr cinfo, int ncolors)
+make_odither_array(j_decompress_ptr cinfo, int ncolors)
{
ODITHER_MATRIX_PTR odither;
- int j,k;
- JLONG num,den;
+ int j, k;
+ JLONG num, den;
odither = (ODITHER_MATRIX_PTR)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
sizeof(ODITHER_MATRIX));
/* The inter-value distance for this color is MAXJSAMPLE/(ncolors-1).
* Hence the dither value for the matrix cell with fill order f
* (f=0..N-1) should be (N-1-2*f)/(2*N) * MAXJSAMPLE/(ncolors-1).
* On 16-bit-int machine, be careful to avoid overflow.
*/
- den = 2 * ODITHER_CELLS * ((JLONG) (ncolors - 1));
+ den = 2 * ODITHER_CELLS * ((JLONG)(ncolors - 1));
for (j = 0; j < ODITHER_SIZE; j++) {
for (k = 0; k < ODITHER_SIZE; k++) {
- num = ((JLONG) (ODITHER_CELLS-1 - 2*((int)base_dither_matrix[j][k])))
- * MAXJSAMPLE;
+ num = ((JLONG)(ODITHER_CELLS - 1 -
+ 2 * ((int)base_dither_matrix[j][k]))) * MAXJSAMPLE;
/* Ensure round towards zero despite C's lack of consistency
* about rounding negative values in integer division...
*/
- odither[j][k] = (int) (num<0 ? -((-num)/den) : num/den);
+ odither[j][k] = (int)(num < 0 ? -((-num) / den) : num / den);
}
}
return odither;
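/* Worked example (editor's note): for ncolors = 2, den = 2 * 256 * 1 = 512;
 * the first-filled cell (f = 0) gets num = 255 * 255 = 65025 and thus
 * odither = 127, while the last-filled cell gets -127. The dither spans
 * about +/-MAXJSAMPLE/2, half the inter-level distance of 255. */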
@@ -432,9 +433,9 @@ make_odither_array (j_decompress_ptr cinfo, int ncolors)
*/
LOCAL(void)
-create_odither_tables (j_decompress_ptr cinfo)
+create_odither_tables(j_decompress_ptr cinfo)
{
- my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+ my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
ODITHER_MATRIX_PTR odither;
int i, j, nci;
@@ -459,11 +460,11 @@ create_odither_tables (j_decompress_ptr cinfo)
*/
METHODDEF(void)
-color_quantize (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
- JSAMPARRAY output_buf, int num_rows)
+color_quantize(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPARRAY output_buf, int num_rows)
/* General case, no dithering */
{
- my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+ my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
JSAMPARRAY colorindex = cquantize->colorindex;
register int pixcode, ci;
register JSAMPROW ptrin, ptrout;
@@ -478,20 +479,20 @@ color_quantize (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
for (col = width; col > 0; col--) {
pixcode = 0;
for (ci = 0; ci < nc; ci++) {
- pixcode += GETJSAMPLE(colorindex[ci][GETJSAMPLE(*ptrin++)]);
+ pixcode += colorindex[ci][*ptrin++];
}
- *ptrout++ = (JSAMPLE) pixcode;
+ *ptrout++ = (JSAMPLE)pixcode;
}
}
}
METHODDEF(void)
-color_quantize3 (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
- JSAMPARRAY output_buf, int num_rows)
+color_quantize3(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPARRAY output_buf, int num_rows)
/* Fast path for out_color_components==3, no dithering */
{
- my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+ my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
register int pixcode;
register JSAMPROW ptrin, ptrout;
JSAMPROW colorindex0 = cquantize->colorindex[0];
@@ -505,21 +506,21 @@ color_quantize3 (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
ptrin = input_buf[row];
ptrout = output_buf[row];
for (col = width; col > 0; col--) {
- pixcode = GETJSAMPLE(colorindex0[GETJSAMPLE(*ptrin++)]);
- pixcode += GETJSAMPLE(colorindex1[GETJSAMPLE(*ptrin++)]);
- pixcode += GETJSAMPLE(colorindex2[GETJSAMPLE(*ptrin++)]);
- *ptrout++ = (JSAMPLE) pixcode;
+ pixcode = colorindex0[*ptrin++];
+ pixcode += colorindex1[*ptrin++];
+ pixcode += colorindex2[*ptrin++];
+ *ptrout++ = (JSAMPLE)pixcode;
}
}
}
METHODDEF(void)
-quantize_ord_dither (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
- JSAMPARRAY output_buf, int num_rows)
+quantize_ord_dither(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPARRAY output_buf, int num_rows)
/* General case, with ordered dithering */
{
- my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+ my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
register JSAMPROW input_ptr;
register JSAMPROW output_ptr;
JSAMPROW colorindex_ci;
@@ -533,7 +534,7 @@ quantize_ord_dither (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
for (row = 0; row < num_rows; row++) {
/* Initialize output values to 0 so can process components separately */
- jzero_far((void *) output_buf[row], (size_t) (width * sizeof(JSAMPLE)));
+ jzero_far((void *)output_buf[row], (size_t)(width * sizeof(JSAMPLE)));
row_index = cquantize->row_index;
for (ci = 0; ci < nc; ci++) {
input_ptr = input_buf[row] + ci;
@@ -550,7 +551,8 @@ quantize_ord_dither (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
* inputs. The maximum dither is +- MAXJSAMPLE; this sets the
* required amount of padding.
*/
- *output_ptr += colorindex_ci[GETJSAMPLE(*input_ptr)+dither[col_index]];
+ *output_ptr +=
+ colorindex_ci[*input_ptr + dither[col_index]];
input_ptr += nc;
output_ptr++;
col_index = (col_index + 1) & ODITHER_MASK;
@@ -564,11 +566,11 @@ quantize_ord_dither (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
METHODDEF(void)
-quantize3_ord_dither (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
- JSAMPARRAY output_buf, int num_rows)
+quantize3_ord_dither(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPARRAY output_buf, int num_rows)
/* Fast path for out_color_components==3, with ordered dithering */
{
- my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+ my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
register int pixcode;
register JSAMPROW input_ptr;
register JSAMPROW output_ptr;
@@ -593,13 +595,10 @@ quantize3_ord_dither (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
col_index = 0;
for (col = width; col > 0; col--) {
- pixcode = GETJSAMPLE(colorindex0[GETJSAMPLE(*input_ptr++) +
- dither0[col_index]]);
- pixcode += GETJSAMPLE(colorindex1[GETJSAMPLE(*input_ptr++) +
- dither1[col_index]]);
- pixcode += GETJSAMPLE(colorindex2[GETJSAMPLE(*input_ptr++) +
- dither2[col_index]]);
- *output_ptr++ = (JSAMPLE) pixcode;
+ pixcode = colorindex0[(*input_ptr++) + dither0[col_index]];
+ pixcode += colorindex1[(*input_ptr++) + dither1[col_index]];
+ pixcode += colorindex2[(*input_ptr++) + dither2[col_index]];
+ *output_ptr++ = (JSAMPLE)pixcode;
col_index = (col_index + 1) & ODITHER_MASK;
}
row_index = (row_index + 1) & ODITHER_MASK;
@@ -609,11 +608,11 @@ quantize3_ord_dither (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
METHODDEF(void)
-quantize_fs_dither (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
- JSAMPARRAY output_buf, int num_rows)
+quantize_fs_dither(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPARRAY output_buf, int num_rows)
/* General case, with Floyd-Steinberg dithering */
{
- my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+ my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
register LOCFSERROR cur; /* current error or pixel value */
LOCFSERROR belowerr; /* error for pixel below cur */
LOCFSERROR bpreverr; /* error for below/prev col */
@@ -637,17 +636,17 @@ quantize_fs_dither (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
for (row = 0; row < num_rows; row++) {
/* Initialize output values to 0 so can process components separately */
- jzero_far((void *) output_buf[row], (size_t) (width * sizeof(JSAMPLE)));
+ jzero_far((void *)output_buf[row], (size_t)(width * sizeof(JSAMPLE)));
for (ci = 0; ci < nc; ci++) {
input_ptr = input_buf[row] + ci;
output_ptr = output_buf[row];
if (cquantize->on_odd_row) {
/* work right to left in this row */
- input_ptr += (width-1) * nc; /* so point to rightmost pixel */
- output_ptr += width-1;
+ input_ptr += (width - 1) * nc; /* so point to rightmost pixel */
+ output_ptr += width - 1;
dir = -1;
dirnc = -nc;
- errorptr = cquantize->fserrors[ci] + (width+1); /* => entry after last column */
+ errorptr = cquantize->fserrors[ci] + (width + 1); /* => entry after last column */
} else {
/* work left to right in this row */
dir = 1;
@@ -675,15 +674,15 @@ quantize_fs_dither (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
* The maximum error is +- MAXJSAMPLE; this sets the required size
* of the range_limit array.
*/
- cur += GETJSAMPLE(*input_ptr);
- cur = GETJSAMPLE(range_limit[cur]);
+ cur += *input_ptr;
+ cur = range_limit[cur];
/* Select output value, accumulate into output code for this pixel */
- pixcode = GETJSAMPLE(colorindex_ci[cur]);
- *output_ptr += (JSAMPLE) pixcode;
+ pixcode = colorindex_ci[cur];
+ *output_ptr += (JSAMPLE)pixcode;
/* Compute actual representation error at this pixel */
/* Note: we can do this even though we don't have the final */
/* pixel code, because the colormap is orthogonal. */
- cur -= GETJSAMPLE(colormap_ci[pixcode]);
+ cur -= colormap_ci[pixcode];
/* Compute error fractions to be propagated to adjacent pixels.
* Add these into the running sums, and simultaneously shift the
* next-line error sums left by 1 column.
@@ -691,7 +690,7 @@ quantize_fs_dither (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
bnexterr = cur;
delta = cur * 2;
cur += delta; /* form error * 3 */
- errorptr[0] = (FSERROR) (bpreverr + cur);
+ errorptr[0] = (FSERROR)(bpreverr + cur);
cur += delta; /* form error * 5 */
bpreverr = belowerr + cur;
belowerr = bnexterr;
@@ -708,7 +707,7 @@ quantize_fs_dither (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
* final fserrors[] entry. Note we need not unload belowerr because
* it is for the dummy column before or after the actual array.
*/
- errorptr[0] = (FSERROR) bpreverr; /* unload prev err into array */
+ errorptr[0] = (FSERROR)bpreverr; /* unload prev err into array */
}
cquantize->on_odd_row = (cquantize->on_odd_row ? FALSE : TRUE);
}
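/* Editor's note on the arithmetic above: these are the classic
 * Floyd-Steinberg weights -- 7/16 to the next pixel, and 3/16, 5/16, 1/16
 * to the three neighbors below. The code forms err*3, err*5 and err*7 by
 * repeatedly adding delta = err*2, and the division by 16 happens (with
 * rounding) when the accumulated sums are consumed via RIGHT_SHIFT(..., 4). */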
@@ -720,16 +719,16 @@ quantize_fs_dither (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
*/
LOCAL(void)
-alloc_fs_workspace (j_decompress_ptr cinfo)
+alloc_fs_workspace(j_decompress_ptr cinfo)
{
- my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+ my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
size_t arraysize;
int i;
- arraysize = (size_t) ((cinfo->output_width + 2) * sizeof(FSERROR));
+ arraysize = (size_t)((cinfo->output_width + 2) * sizeof(FSERROR));
for (i = 0; i < cinfo->out_color_components; i++) {
cquantize->fserrors[i] = (FSERRPTR)
- (*cinfo->mem->alloc_large)((j_common_ptr) cinfo, JPOOL_IMAGE, arraysize);
+ (*cinfo->mem->alloc_large) ((j_common_ptr)cinfo, JPOOL_IMAGE, arraysize);
}
}
@@ -739,9 +738,9 @@ alloc_fs_workspace (j_decompress_ptr cinfo)
*/
METHODDEF(void)
-start_pass_1_quant (j_decompress_ptr cinfo, boolean is_pre_scan)
+start_pass_1_quant(j_decompress_ptr cinfo, boolean is_pre_scan)
{
- my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+ my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
size_t arraysize;
int i;
@@ -767,7 +766,7 @@ start_pass_1_quant (j_decompress_ptr cinfo, boolean is_pre_scan)
* we must recreate the color index table with padding.
* This will cost extra space, but probably isn't very likely.
*/
- if (! cquantize->is_padded)
+ if (!cquantize->is_padded)
create_colorindex(cinfo);
/* Create ordered-dither tables if we didn't already. */
if (cquantize->odither[0] == NULL)
@@ -780,9 +779,9 @@ start_pass_1_quant (j_decompress_ptr cinfo, boolean is_pre_scan)
if (cquantize->fserrors[0] == NULL)
alloc_fs_workspace(cinfo);
/* Initialize the propagated errors to zero. */
- arraysize = (size_t) ((cinfo->output_width + 2) * sizeof(FSERROR));
+ arraysize = (size_t)((cinfo->output_width + 2) * sizeof(FSERROR));
for (i = 0; i < cinfo->out_color_components; i++)
- jzero_far((void *) cquantize->fserrors[i], arraysize);
+ jzero_far((void *)cquantize->fserrors[i], arraysize);
break;
default:
ERREXIT(cinfo, JERR_NOT_COMPILED);
@@ -796,7 +795,7 @@ start_pass_1_quant (j_decompress_ptr cinfo, boolean is_pre_scan)
*/
METHODDEF(void)
-finish_pass_1_quant (j_decompress_ptr cinfo)
+finish_pass_1_quant(j_decompress_ptr cinfo)
{
/* no work in 1-pass case */
}
@@ -808,7 +807,7 @@ finish_pass_1_quant (j_decompress_ptr cinfo)
*/
METHODDEF(void)
-new_color_map_1_quant (j_decompress_ptr cinfo)
+new_color_map_1_quant(j_decompress_ptr cinfo)
{
ERREXIT(cinfo, JERR_MODE_CHANGE);
}
@@ -819,14 +818,14 @@ new_color_map_1_quant (j_decompress_ptr cinfo)
*/
GLOBAL(void)
-jinit_1pass_quantizer (j_decompress_ptr cinfo)
+jinit_1pass_quantizer(j_decompress_ptr cinfo)
{
my_cquantize_ptr cquantize;
cquantize = (my_cquantize_ptr)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
sizeof(my_cquantizer));
- cinfo->cquantize = (struct jpeg_color_quantizer *) cquantize;
+ cinfo->cquantize = (struct jpeg_color_quantizer *)cquantize;
cquantize->pub.start_pass = start_pass_1_quant;
cquantize->pub.finish_pass = finish_pass_1_quant;
cquantize->pub.new_color_map = new_color_map_1_quant;
@@ -837,8 +836,8 @@ jinit_1pass_quantizer (j_decompress_ptr cinfo)
if (cinfo->out_color_components > MAX_Q_COMPS)
ERREXIT1(cinfo, JERR_QUANT_COMPONENTS, MAX_Q_COMPS);
/* Make sure colormap indexes can be represented by JSAMPLEs */
- if (cinfo->desired_number_of_colors > (MAXJSAMPLE+1))
- ERREXIT1(cinfo, JERR_QUANT_MANY_COLORS, MAXJSAMPLE+1);
+ if (cinfo->desired_number_of_colors > (MAXJSAMPLE + 1))
+ ERREXIT1(cinfo, JERR_QUANT_MANY_COLORS, MAXJSAMPLE + 1);
/* Create the colormap and color index table. */
create_colormap(cinfo);
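Typical decompression parameters that route output through this one-pass
quantizer (editor's sketch; these are the standard libjpeg parameter fields,
set between jpeg_read_header() and jpeg_start_decompress()):

cinfo.quantize_colors = TRUE;
cinfo.two_pass_quantize = FALSE;       /* jquant1.c rather than jquant2.c */
cinfo.dither_mode = JDITHER_ORDERED;
cinfo.desired_number_of_colors = 216;  /* e.g. a 6x6x6 color cube */
jpeg_start_decompress(&cinfo);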
diff --git a/media/libjpeg/jquant2.c b/media/libjpeg/jquant2.c
index cfbd0f1526..44efb18cad 100644
--- a/media/libjpeg/jquant2.c
+++ b/media/libjpeg/jquant2.c
@@ -4,7 +4,7 @@
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1991-1996, Thomas G. Lane.
* libjpeg-turbo Modifications:
- * Copyright (C) 2009, 2014-2015, D. R. Commander.
+ * Copyright (C) 2009, 2014-2015, 2020, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@@ -73,14 +73,14 @@
* probably need to change these scale factors.
*/
-#define R_SCALE 2 /* scale R distances by this much */
-#define G_SCALE 3 /* scale G distances by this much */
-#define B_SCALE 1 /* and B by this much */
+#define R_SCALE 2 /* scale R distances by this much */
+#define G_SCALE 3 /* scale G distances by this much */
+#define B_SCALE 1 /* and B by this much */
-static const int c_scales[3]={R_SCALE, G_SCALE, B_SCALE};
-#define C0_SCALE c_scales[rgb_red[cinfo->out_color_space]]
-#define C1_SCALE c_scales[rgb_green[cinfo->out_color_space]]
-#define C2_SCALE c_scales[rgb_blue[cinfo->out_color_space]]
+static const int c_scales[3] = { R_SCALE, G_SCALE, B_SCALE };
+#define C0_SCALE c_scales[rgb_red[cinfo->out_color_space]]
+#define C1_SCALE c_scales[rgb_green[cinfo->out_color_space]]
+#define C2_SCALE c_scales[rgb_blue[cinfo->out_color_space]]
/*
* First we have the histogram data structure and routines for creating it.
@@ -106,7 +106,7 @@ static const int c_scales[3]={R_SCALE, G_SCALE, B_SCALE};
* each 2-D array has 2^6*2^5 = 2048 or 2^6*2^6 = 4096 entries.
*/
-#define MAXNUMCOLORS (MAXJSAMPLE+1) /* maximum size of colormap */
+#define MAXNUMCOLORS (MAXJSAMPLE + 1) /* maximum size of colormap */
/* These will do the right thing for either R,G,B or B,G,R color order,
* but you may not like the results for other color orders.
@@ -116,19 +116,19 @@ static const int c_scales[3]={R_SCALE, G_SCALE, B_SCALE};
#define HIST_C2_BITS 5 /* bits of precision in B/R histogram */
/* Number of elements along histogram axes. */
-#define HIST_C0_ELEMS (1<<HIST_C0_BITS)
-#define HIST_C1_ELEMS (1<<HIST_C1_BITS)
-#define HIST_C2_ELEMS (1<<HIST_C2_BITS)
+#define HIST_C0_ELEMS (1 << HIST_C0_BITS)
+#define HIST_C1_ELEMS (1 << HIST_C1_BITS)
+#define HIST_C2_ELEMS (1 << HIST_C2_BITS)
/* These are the amounts to shift an input value to get a histogram index. */
-#define C0_SHIFT (BITS_IN_JSAMPLE-HIST_C0_BITS)
-#define C1_SHIFT (BITS_IN_JSAMPLE-HIST_C1_BITS)
-#define C2_SHIFT (BITS_IN_JSAMPLE-HIST_C2_BITS)
+#define C0_SHIFT (BITS_IN_JSAMPLE - HIST_C0_BITS)
+#define C1_SHIFT (BITS_IN_JSAMPLE - HIST_C1_BITS)
+#define C2_SHIFT (BITS_IN_JSAMPLE - HIST_C2_BITS)
typedef UINT16 histcell; /* histogram cell; prefer an unsigned type */
-typedef histcell *histptr; /* for pointers to histogram cells */
+typedef histcell *histptr; /* for pointers to histogram cells */
typedef histcell hist1d[HIST_C2_ELEMS]; /* typedefs for the array */
typedef hist1d *hist2d; /* type for the 2nd-level pointers */
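/* Concrete sizes implied by the definitions above (BITS_IN_JSAMPLE = 8):
 * C0_SHIFT = 3, C1_SHIFT = 2, C2_SHIFT = 3, so the histogram holds
 * 32 x 64 x 32 = 65536 histcells (128 KB of UINT16 counters), indexed as
 * histogram[r >> 3][g >> 2][b >> 3] for R,G,B component order. */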
@@ -200,10 +200,10 @@ typedef my_cquantizer *my_cquantize_ptr;
*/
METHODDEF(void)
-prescan_quantize (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
- JSAMPARRAY output_buf, int num_rows)
+prescan_quantize(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPARRAY output_buf, int num_rows)
{
- my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+ my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
register JSAMPROW ptr;
register histptr histp;
register hist3d histogram = cquantize->histogram;
@@ -215,9 +215,9 @@ prescan_quantize (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
ptr = input_buf[row];
for (col = width; col > 0; col--) {
/* get pixel value and index into the histogram */
- histp = & histogram[GETJSAMPLE(ptr[0]) >> C0_SHIFT]
- [GETJSAMPLE(ptr[1]) >> C1_SHIFT]
- [GETJSAMPLE(ptr[2]) >> C2_SHIFT];
+ histp = &histogram[ptr[0] >> C0_SHIFT]
+ [ptr[1] >> C1_SHIFT]
+ [ptr[2] >> C2_SHIFT];
/* increment, check for overflow and undo increment if so. */
if (++(*histp) <= 0)
(*histp)--;
@@ -249,7 +249,7 @@ typedef box *boxptr;
LOCAL(boxptr)
-find_biggest_color_pop (boxptr boxlist, int numboxes)
+find_biggest_color_pop(boxptr boxlist, int numboxes)
/* Find the splittable box with the largest color population */
/* Returns NULL if no splittable boxes remain */
{
@@ -269,7 +269,7 @@ find_biggest_color_pop (boxptr boxlist, int numboxes)
LOCAL(boxptr)
-find_biggest_volume (boxptr boxlist, int numboxes)
+find_biggest_volume(boxptr boxlist, int numboxes)
/* Find the splittable box with the largest (scaled) volume */
/* Returns NULL if no splittable boxes remain */
{
@@ -289,16 +289,16 @@ find_biggest_volume (boxptr boxlist, int numboxes)
LOCAL(void)
-update_box (j_decompress_ptr cinfo, boxptr boxp)
+update_box(j_decompress_ptr cinfo, boxptr boxp)
/* Shrink the min/max bounds of a box to enclose only nonzero elements, */
/* and recompute its volume and population */
{
- my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+ my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
hist3d histogram = cquantize->histogram;
histptr histp;
- int c0,c1,c2;
- int c0min,c0max,c1min,c1max,c2min,c2max;
- JLONG dist0,dist1,dist2;
+ int c0, c1, c2;
+ int c0min, c0max, c1min, c1max, c2min, c2max;
+ JLONG dist0, dist1, dist2;
long ccount;
c0min = boxp->c0min; c0max = boxp->c0max;
@@ -308,69 +308,69 @@ update_box (j_decompress_ptr cinfo, boxptr boxp)
if (c0max > c0min)
for (c0 = c0min; c0 <= c0max; c0++)
for (c1 = c1min; c1 <= c1max; c1++) {
- histp = & histogram[c0][c1][c2min];
+ histp = &histogram[c0][c1][c2min];
for (c2 = c2min; c2 <= c2max; c2++)
if (*histp++ != 0) {
boxp->c0min = c0min = c0;
goto have_c0min;
}
}
- have_c0min:
+have_c0min:
if (c0max > c0min)
for (c0 = c0max; c0 >= c0min; c0--)
for (c1 = c1min; c1 <= c1max; c1++) {
- histp = & histogram[c0][c1][c2min];
+ histp = &histogram[c0][c1][c2min];
for (c2 = c2min; c2 <= c2max; c2++)
if (*histp++ != 0) {
boxp->c0max = c0max = c0;
goto have_c0max;
}
}
- have_c0max:
+have_c0max:
if (c1max > c1min)
for (c1 = c1min; c1 <= c1max; c1++)
for (c0 = c0min; c0 <= c0max; c0++) {
- histp = & histogram[c0][c1][c2min];
+ histp = &histogram[c0][c1][c2min];
for (c2 = c2min; c2 <= c2max; c2++)
if (*histp++ != 0) {
boxp->c1min = c1min = c1;
goto have_c1min;
}
}
- have_c1min:
+have_c1min:
if (c1max > c1min)
for (c1 = c1max; c1 >= c1min; c1--)
for (c0 = c0min; c0 <= c0max; c0++) {
- histp = & histogram[c0][c1][c2min];
+ histp = &histogram[c0][c1][c2min];
for (c2 = c2min; c2 <= c2max; c2++)
if (*histp++ != 0) {
boxp->c1max = c1max = c1;
goto have_c1max;
}
}
- have_c1max:
+have_c1max:
if (c2max > c2min)
for (c2 = c2min; c2 <= c2max; c2++)
for (c0 = c0min; c0 <= c0max; c0++) {
- histp = & histogram[c0][c1min][c2];
+ histp = &histogram[c0][c1min][c2];
for (c1 = c1min; c1 <= c1max; c1++, histp += HIST_C2_ELEMS)
if (*histp != 0) {
boxp->c2min = c2min = c2;
goto have_c2min;
}
}
- have_c2min:
+have_c2min:
if (c2max > c2min)
for (c2 = c2max; c2 >= c2min; c2--)
for (c0 = c0min; c0 <= c0max; c0++) {
- histp = & histogram[c0][c1min][c2];
+ histp = &histogram[c0][c1min][c2];
for (c1 = c1min; c1 <= c1max; c1++, histp += HIST_C2_ELEMS)
if (*histp != 0) {
boxp->c2max = c2max = c2;
goto have_c2max;
}
}
- have_c2max:
+have_c2max:
/* Update box volume.
* We use 2-norm rather than real volume here; this biases the method
@@ -383,13 +383,13 @@ update_box (j_decompress_ptr cinfo, boxptr boxp)
dist0 = ((c0max - c0min) << C0_SHIFT) * C0_SCALE;
dist1 = ((c1max - c1min) << C1_SHIFT) * C1_SCALE;
dist2 = ((c2max - c2min) << C2_SHIFT) * C2_SCALE;
- boxp->volume = dist0*dist0 + dist1*dist1 + dist2*dist2;
+ boxp->volume = dist0 * dist0 + dist1 * dist1 + dist2 * dist2;
/* Now scan remaining volume of box and compute population */
ccount = 0;
for (c0 = c0min; c0 <= c0max; c0++)
for (c1 = c1min; c1 <= c1max; c1++) {
- histp = & histogram[c0][c1][c2min];
+ histp = &histogram[c0][c1][c2min];
for (c2 = c2min; c2 <= c2max; c2++, histp++)
if (*histp != 0) {
ccount++;
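The "volume" updated in this hunk is worth a second look: as the comment above notes, it is a scaled 2-norm of the axis lengths, not a true volume, which keeps a box splittable even when one axis has collapsed to a single cell. A minimal, self-contained sketch (illustrative scale factors, not the jquant2.c C*_SCALE constants):

#include <stdio.h>

int main(void)
{
  long d0 = 8 * 2, d1 = 0 * 3, d2 = 4 * 1;   /* one axis fully shrunk */
  long norm = d0 * d0 + d1 * d1 + d2 * d2;   /* scaled 2-norm */
  long real_volume = d0 * d1 * d2;           /* true volume collapses */
  printf("2-norm = %ld, real volume = %ld\n", norm, real_volume);
  /* norm = 272 > 0, so the box is still splittable; volume = 0 */
  return 0;
}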
@@ -400,19 +400,19 @@ update_box (j_decompress_ptr cinfo, boxptr boxp)
LOCAL(int)
-median_cut (j_decompress_ptr cinfo, boxptr boxlist, int numboxes,
- int desired_colors)
+median_cut(j_decompress_ptr cinfo, boxptr boxlist, int numboxes,
+ int desired_colors)
/* Repeatedly select and split the largest box until we have enough boxes */
{
- int n,lb;
- int c0,c1,c2,cmax;
- register boxptr b1,b2;
+ int n, lb;
+ int c0, c1, c2, cmax;
+ register boxptr b1, b2;
while (numboxes < desired_colors) {
/* Select box to split.
* Current algorithm: by population for first half, then by volume.
*/
- if (numboxes*2 <= desired_colors) {
+ if (numboxes * 2 <= desired_colors) {
b1 = find_biggest_color_pop(boxlist, numboxes);
} else {
b1 = find_biggest_volume(boxlist, numboxes);
@@ -421,8 +421,8 @@ median_cut (j_decompress_ptr cinfo, boxptr boxlist, int numboxes,
break;
b2 = &boxlist[numboxes]; /* where new box will go */
/* Copy the color bounds to the new box. */
- b2->c0max = b1->c0max; b2->c1max = b1->c1max; b2->c2max = b1->c2max;
- b2->c0min = b1->c0min; b2->c1min = b1->c1min; b2->c2min = b1->c2min;
+ b2->c0max = b1->c0max; b2->c1max = b1->c1max; b2->c2max = b1->c2max;
+ b2->c0min = b1->c0min; b2->c1min = b1->c1min; b2->c2min = b1->c2min;
/* Choose which axis to split the box on.
* Current algorithm: longest scaled axis.
* See notes in update_box about scaling distances.
@@ -434,13 +434,12 @@ median_cut (j_decompress_ptr cinfo, boxptr boxlist, int numboxes,
* This code does the right thing for R,G,B or B,G,R color orders only.
*/
if (rgb_red[cinfo->out_color_space] == 0) {
- cmax = c1; n = 1;
- if (c0 > cmax) { cmax = c0; n = 0; }
+ cmax = c1; n = 1;
+ if (c0 > cmax) { cmax = c0; n = 0; }
if (c2 > cmax) { n = 2; }
- }
- else {
- cmax = c1; n = 1;
- if (c2 > cmax) { cmax = c2; n = 2; }
+ } else {
+ cmax = c1; n = 1;
+ if (c2 > cmax) { cmax = c2; n = 2; }
if (c0 > cmax) { n = 0; }
}
/* Choose split point along selected axis, and update box bounds.
@@ -453,17 +452,17 @@ median_cut (j_decompress_ptr cinfo, boxptr boxlist, int numboxes,
case 0:
lb = (b1->c0max + b1->c0min) / 2;
b1->c0max = lb;
- b2->c0min = lb+1;
+ b2->c0min = lb + 1;
break;
case 1:
lb = (b1->c1max + b1->c1min) / 2;
b1->c1max = lb;
- b2->c1min = lb+1;
+ b2->c1min = lb + 1;
break;
case 2:
lb = (b1->c2max + b1->c2min) / 2;
b1->c2max = lb;
- b2->c2min = lb+1;
+ b2->c2min = lb + 1;
break;
}
/* Update stats for boxes */
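The switch above halves the chosen axis at its midpoint, with the lower half staying in b1 and the upper half going to b2. A tiny self-contained demo of that split arithmetic (values are illustrative):

#include <stdio.h>

int main(void)
{
  int c0min = 0, c0max = 31;          /* a box spanning 32 cells */
  int lb = (c0max + c0min) / 2;       /* midpoint, as in case 0 above */
  printf("b1 keeps [%d..%d], b2 gets [%d..%d]\n",
         c0min, lb, lb + 1, c0max);   /* -> [0..15] and [16..31] */
  return 0;
}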
@@ -476,16 +475,16 @@ median_cut (j_decompress_ptr cinfo, boxptr boxlist, int numboxes,
LOCAL(void)
-compute_color (j_decompress_ptr cinfo, boxptr boxp, int icolor)
+compute_color(j_decompress_ptr cinfo, boxptr boxp, int icolor)
/* Compute representative color for a box, put it in colormap[icolor] */
{
/* Current algorithm: mean weighted by pixels (not colors) */
/* Note it is important to get the rounding correct! */
- my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+ my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
hist3d histogram = cquantize->histogram;
histptr histp;
- int c0,c1,c2;
- int c0min,c0max,c1min,c1max,c2min,c2max;
+ int c0, c1, c2;
+ int c0min, c0max, c1min, c1max, c2min, c2max;
long count;
long total = 0;
long c0total = 0;
@@ -498,25 +497,25 @@ compute_color (j_decompress_ptr cinfo, boxptr boxp, int icolor)
for (c0 = c0min; c0 <= c0max; c0++)
for (c1 = c1min; c1 <= c1max; c1++) {
- histp = & histogram[c0][c1][c2min];
+ histp = &histogram[c0][c1][c2min];
for (c2 = c2min; c2 <= c2max; c2++) {
if ((count = *histp++) != 0) {
total += count;
- c0total += ((c0 << C0_SHIFT) + ((1<<C0_SHIFT)>>1)) * count;
- c1total += ((c1 << C1_SHIFT) + ((1<<C1_SHIFT)>>1)) * count;
- c2total += ((c2 << C2_SHIFT) + ((1<<C2_SHIFT)>>1)) * count;
+ c0total += ((c0 << C0_SHIFT) + ((1 << C0_SHIFT) >> 1)) * count;
+ c1total += ((c1 << C1_SHIFT) + ((1 << C1_SHIFT) >> 1)) * count;
+ c2total += ((c2 << C2_SHIFT) + ((1 << C2_SHIFT) >> 1)) * count;
}
}
}
- cinfo->colormap[0][icolor] = (JSAMPLE) ((c0total + (total>>1)) / total);
- cinfo->colormap[1][icolor] = (JSAMPLE) ((c1total + (total>>1)) / total);
- cinfo->colormap[2][icolor] = (JSAMPLE) ((c2total + (total>>1)) / total);
+ cinfo->colormap[0][icolor] = (JSAMPLE)((c0total + (total >> 1)) / total);
+ cinfo->colormap[1][icolor] = (JSAMPLE)((c1total + (total >> 1)) / total);
+ cinfo->colormap[2][icolor] = (JSAMPLE)((c2total + (total >> 1)) / total);
}
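compute_color's "important to get the rounding correct" comment refers to two half-step corrections visible above: each coarse histogram cell contributes the value at its center, and the final division rounds to nearest by adding total/2 first. A self-contained sketch with made-up values:

#include <stdio.h>

int main(void)
{
  enum { C0_SHIFT = 3 };                  /* coarse histogram axis */
  int c = 10, count = 7;
  long total = count;
  /* cell c covers [c<<SHIFT, (c+1)<<SHIFT); its center is the
   * shifted coordinate plus half a cell width */
  long c0total = ((long)(c << C0_SHIFT) + ((1 << C0_SHIFT) >> 1)) * count;
  printf("cell center = %d\n", (c << C0_SHIFT) + ((1 << C0_SHIFT) >> 1));
  printf("rounded mean = %ld\n", (c0total + (total >> 1)) / total);  /* 84 */
  return 0;
}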
LOCAL(void)
-select_colors (j_decompress_ptr cinfo, int desired_colors)
+select_colors(j_decompress_ptr cinfo, int desired_colors)
/* Master routine for color selection */
{
boxptr boxlist;
@@ -524,8 +523,8 @@ select_colors (j_decompress_ptr cinfo, int desired_colors)
int i;
/* Allocate workspace for box list */
- boxlist = (boxptr) (*cinfo->mem->alloc_small)
- ((j_common_ptr) cinfo, JPOOL_IMAGE, desired_colors * sizeof(box));
+ boxlist = (boxptr)(*cinfo->mem->alloc_small)
+ ((j_common_ptr)cinfo, JPOOL_IMAGE, desired_colors * sizeof(box));
/* Initialize one box containing whole space */
numboxes = 1;
boxlist[0].c0min = 0;
@@ -535,12 +534,12 @@ select_colors (j_decompress_ptr cinfo, int desired_colors)
boxlist[0].c2min = 0;
boxlist[0].c2max = MAXJSAMPLE >> C2_SHIFT;
/* Shrink it to actually-used volume and set its statistics */
- update_box(cinfo, & boxlist[0]);
+ update_box(cinfo, &boxlist[0]);
/* Perform median-cut to produce final box list */
numboxes = median_cut(cinfo, boxlist, numboxes, desired_colors);
/* Compute the representative color for each box, fill colormap */
for (i = 0; i < numboxes; i++)
- compute_color(cinfo, & boxlist[i], i);
+ compute_color(cinfo, &boxlist[i], i);
cinfo->actual_number_of_colors = numboxes;
TRACEMS1(cinfo, 1, JTRC_QUANT_SELECTED, numboxes);
}
@@ -601,13 +600,13 @@ select_colors (j_decompress_ptr cinfo, int desired_colors)
/* log2(histogram cells in update box) for each axis; this can be adjusted */
-#define BOX_C0_LOG (HIST_C0_BITS-3)
-#define BOX_C1_LOG (HIST_C1_BITS-3)
-#define BOX_C2_LOG (HIST_C2_BITS-3)
+#define BOX_C0_LOG (HIST_C0_BITS - 3)
+#define BOX_C1_LOG (HIST_C1_BITS - 3)
+#define BOX_C2_LOG (HIST_C2_BITS - 3)
-#define BOX_C0_ELEMS (1<<BOX_C0_LOG) /* # of hist cells in update box */
-#define BOX_C1_ELEMS (1<<BOX_C1_LOG)
-#define BOX_C2_ELEMS (1<<BOX_C2_LOG)
+#define BOX_C0_ELEMS (1 << BOX_C0_LOG) /* # of hist cells in update box */
+#define BOX_C1_ELEMS (1 << BOX_C1_LOG)
+#define BOX_C2_ELEMS (1 << BOX_C2_LOG)
#define BOX_C0_SHIFT (C0_SHIFT + BOX_C0_LOG)
#define BOX_C1_SHIFT (C1_SHIFT + BOX_C1_LOG)
@@ -623,8 +622,8 @@ select_colors (j_decompress_ptr cinfo, int desired_colors)
*/
LOCAL(int)
-find_nearby_colors (j_decompress_ptr cinfo, int minc0, int minc1, int minc2,
- JSAMPLE colorlist[])
+find_nearby_colors(j_decompress_ptr cinfo, int minc0, int minc1, int minc2,
+ JSAMPLE colorlist[])
/* Locate the colormap entries close enough to an update box to be candidates
* for the nearest entry to some cell(s) in the update box. The update box
* is specified by the center coordinates of its first cell. The number of
@@ -666,70 +665,70 @@ find_nearby_colors (j_decompress_ptr cinfo, int minc0, int minc1, int minc2,
for (i = 0; i < numcolors; i++) {
/* We compute the squared-c0-distance term, then add in the other two. */
- x = GETJSAMPLE(cinfo->colormap[0][i]);
+ x = cinfo->colormap[0][i];
if (x < minc0) {
tdist = (x - minc0) * C0_SCALE;
- min_dist = tdist*tdist;
+ min_dist = tdist * tdist;
tdist = (x - maxc0) * C0_SCALE;
- max_dist = tdist*tdist;
+ max_dist = tdist * tdist;
} else if (x > maxc0) {
tdist = (x - maxc0) * C0_SCALE;
- min_dist = tdist*tdist;
+ min_dist = tdist * tdist;
tdist = (x - minc0) * C0_SCALE;
- max_dist = tdist*tdist;
+ max_dist = tdist * tdist;
} else {
/* within cell range so no contribution to min_dist */
min_dist = 0;
if (x <= centerc0) {
tdist = (x - maxc0) * C0_SCALE;
- max_dist = tdist*tdist;
+ max_dist = tdist * tdist;
} else {
tdist = (x - minc0) * C0_SCALE;
- max_dist = tdist*tdist;
+ max_dist = tdist * tdist;
}
}
- x = GETJSAMPLE(cinfo->colormap[1][i]);
+ x = cinfo->colormap[1][i];
if (x < minc1) {
tdist = (x - minc1) * C1_SCALE;
- min_dist += tdist*tdist;
+ min_dist += tdist * tdist;
tdist = (x - maxc1) * C1_SCALE;
- max_dist += tdist*tdist;
+ max_dist += tdist * tdist;
} else if (x > maxc1) {
tdist = (x - maxc1) * C1_SCALE;
- min_dist += tdist*tdist;
+ min_dist += tdist * tdist;
tdist = (x - minc1) * C1_SCALE;
- max_dist += tdist*tdist;
+ max_dist += tdist * tdist;
} else {
/* within cell range so no contribution to min_dist */
if (x <= centerc1) {
tdist = (x - maxc1) * C1_SCALE;
- max_dist += tdist*tdist;
+ max_dist += tdist * tdist;
} else {
tdist = (x - minc1) * C1_SCALE;
- max_dist += tdist*tdist;
+ max_dist += tdist * tdist;
}
}
- x = GETJSAMPLE(cinfo->colormap[2][i]);
+ x = cinfo->colormap[2][i];
if (x < minc2) {
tdist = (x - minc2) * C2_SCALE;
- min_dist += tdist*tdist;
+ min_dist += tdist * tdist;
tdist = (x - maxc2) * C2_SCALE;
- max_dist += tdist*tdist;
+ max_dist += tdist * tdist;
} else if (x > maxc2) {
tdist = (x - maxc2) * C2_SCALE;
- min_dist += tdist*tdist;
+ min_dist += tdist * tdist;
tdist = (x - minc2) * C2_SCALE;
- max_dist += tdist*tdist;
+ max_dist += tdist * tdist;
} else {
/* within cell range so no contribution to min_dist */
if (x <= centerc2) {
tdist = (x - maxc2) * C2_SCALE;
- max_dist += tdist*tdist;
+ max_dist += tdist * tdist;
} else {
tdist = (x - minc2) * C2_SCALE;
- max_dist += tdist*tdist;
+ max_dist += tdist * tdist;
}
}
@@ -745,15 +744,15 @@ find_nearby_colors (j_decompress_ptr cinfo, int minc0, int minc1, int minc2,
ncolors = 0;
for (i = 0; i < numcolors; i++) {
if (mindist[i] <= minmaxdist)
- colorlist[ncolors++] = (JSAMPLE) i;
+ colorlist[ncolors++] = (JSAMPLE)i;
}
return ncolors;
}
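The long if/else ladder above implements a classic pruning rule: for each colormap entry, compute the minimum and maximum possible squared distance to any cell of the update box, then keep only entries whose minimum does not exceed the smallest maximum over all entries. A one-dimensional, self-contained sketch of the same rule (illustrative colors and box, not the three-axis jquant2.c version):

#include <stdio.h>

int main(void)
{
  int minc = 40, maxc = 47, center = 44;          /* the update box */
  int colors[4] = { 10, 42, 60, 90 };
  long mind[4], maxd[4], minmax = 0x7FFFFFFFL;
  for (int i = 0; i < 4; i++) {
    int x = colors[i], t;
    if (x < minc)      { t = x - minc; mind[i] = (long)t * t;
                         t = x - maxc; maxd[i] = (long)t * t; }
    else if (x > maxc) { t = x - maxc; mind[i] = (long)t * t;
                         t = x - minc; maxd[i] = (long)t * t; }
    else               { mind[i] = 0;  /* inside the box */
                         t = (x <= center) ? x - maxc : x - minc;
                         maxd[i] = (long)t * t; }
    if (maxd[i] < minmax) minmax = maxd[i];
  }
  for (int i = 0; i < 4; i++)
    printf("color %2d: %s\n", colors[i],
           mind[i] <= minmax ? "candidate" : "pruned");
  return 0;                /* only 42 survives in this example */
}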
LOCAL(void)
-find_best_colors (j_decompress_ptr cinfo, int minc0, int minc1, int minc2,
- int numcolors, JSAMPLE colorlist[], JSAMPLE bestcolor[])
+find_best_colors(j_decompress_ptr cinfo, int minc0, int minc1, int minc2,
+ int numcolors, JSAMPLE colorlist[], JSAMPLE bestcolor[])
/* Find the closest colormap entry for each cell in the update box,
* given the list of candidate colors prepared by find_nearby_colors.
* Return the indexes of the closest entries in the bestcolor[] array.
@@ -775,7 +774,7 @@ find_best_colors (j_decompress_ptr cinfo, int minc0, int minc1, int minc2,
/* Initialize best-distance for each cell of the update box */
bptr = bestdist;
- for (i = BOX_C0_ELEMS*BOX_C1_ELEMS*BOX_C2_ELEMS-1; i >= 0; i--)
+ for (i = BOX_C0_ELEMS * BOX_C1_ELEMS * BOX_C2_ELEMS - 1; i >= 0; i--)
*bptr++ = 0x7FFFFFFFL;
/* For each color selected by find_nearby_colors,
@@ -789,14 +788,14 @@ find_best_colors (j_decompress_ptr cinfo, int minc0, int minc1, int minc2,
#define STEP_C2 ((1 << C2_SHIFT) * C2_SCALE)
for (i = 0; i < numcolors; i++) {
- icolor = GETJSAMPLE(colorlist[i]);
+ icolor = colorlist[i];
/* Compute (square of) distance from minc0/c1/c2 to this color */
- inc0 = (minc0 - GETJSAMPLE(cinfo->colormap[0][icolor])) * C0_SCALE;
- dist0 = inc0*inc0;
- inc1 = (minc1 - GETJSAMPLE(cinfo->colormap[1][icolor])) * C1_SCALE;
- dist0 += inc1*inc1;
- inc2 = (minc2 - GETJSAMPLE(cinfo->colormap[2][icolor])) * C2_SCALE;
- dist0 += inc2*inc2;
+ inc0 = (minc0 - cinfo->colormap[0][icolor]) * C0_SCALE;
+ dist0 = inc0 * inc0;
+ inc1 = (minc1 - cinfo->colormap[1][icolor]) * C1_SCALE;
+ dist0 += inc1 * inc1;
+ inc2 = (minc2 - cinfo->colormap[2][icolor]) * C2_SCALE;
+ dist0 += inc2 * inc2;
/* Form the initial difference increments */
inc0 = inc0 * (2 * STEP_C0) + STEP_C0 * STEP_C0;
inc1 = inc1 * (2 * STEP_C1) + STEP_C1 * STEP_C1;
@@ -805,16 +804,16 @@ find_best_colors (j_decompress_ptr cinfo, int minc0, int minc1, int minc2,
bptr = bestdist;
cptr = bestcolor;
xx0 = inc0;
- for (ic0 = BOX_C0_ELEMS-1; ic0 >= 0; ic0--) {
+ for (ic0 = BOX_C0_ELEMS - 1; ic0 >= 0; ic0--) {
dist1 = dist0;
xx1 = inc1;
- for (ic1 = BOX_C1_ELEMS-1; ic1 >= 0; ic1--) {
+ for (ic1 = BOX_C1_ELEMS - 1; ic1 >= 0; ic1--) {
dist2 = dist1;
xx2 = inc2;
- for (ic2 = BOX_C2_ELEMS-1; ic2 >= 0; ic2--) {
+ for (ic2 = BOX_C2_ELEMS - 1; ic2 >= 0; ic2--) {
if (dist2 < *bptr) {
*bptr = dist2;
- *cptr = (JSAMPLE) icolor;
+ *cptr = (JSAMPLE)icolor;
}
dist2 += xx2;
xx2 += 2 * STEP_C2 * STEP_C2;
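The xx0/xx1/xx2 bookkeeping above relies on the identity (d + s)^2 = d^2 + 2ds + s^2, so each inner-loop step adds a running increment instead of recomputing a product. A self-contained check of the trick:

#include <stdio.h>

int main(void)
{
  int d = -13, s = 8;                 /* initial offset, step size */
  long dist = (long)d * d;            /* running squared distance */
  long inc = (long)d * (2 * s) + (long)s * s;
  for (int i = 0; i < 4; i++) {
    long direct = (long)(d + i * s) * (d + i * s);
    printf("i=%d incremental=%ld direct=%ld\n", i, dist, direct);
    dist += inc;                      /* advance one step of s */
    inc += 2L * s * s;                /* increment itself grows */
  }
  return 0;                           /* the two columns agree */
}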
@@ -832,12 +831,12 @@ find_best_colors (j_decompress_ptr cinfo, int minc0, int minc1, int minc2,
LOCAL(void)
-fill_inverse_cmap (j_decompress_ptr cinfo, int c0, int c1, int c2)
+fill_inverse_cmap(j_decompress_ptr cinfo, int c0, int c1, int c2)
/* Fill the inverse-colormap entries in the update box that contains */
/* histogram cell c0/c1/c2. (Only that one cell MUST be filled, but */
/* we can fill as many others as we wish.) */
{
- my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+ my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
hist3d histogram = cquantize->histogram;
int minc0, minc1, minc2; /* lower left corner of update box */
int ic0, ic1, ic2;
@@ -878,9 +877,9 @@ fill_inverse_cmap (j_decompress_ptr cinfo, int c0, int c1, int c2)
cptr = bestcolor;
for (ic0 = 0; ic0 < BOX_C0_ELEMS; ic0++) {
for (ic1 = 0; ic1 < BOX_C1_ELEMS; ic1++) {
- cachep = & histogram[c0+ic0][c1+ic1][c2];
+ cachep = &histogram[c0 + ic0][c1 + ic1][c2];
for (ic2 = 0; ic2 < BOX_C2_ELEMS; ic2++) {
- *cachep++ = (histcell) (GETJSAMPLE(*cptr++) + 1);
+ *cachep++ = (histcell)((*cptr++) + 1);
}
}
}
@@ -892,11 +891,11 @@ fill_inverse_cmap (j_decompress_ptr cinfo, int c0, int c1, int c2)
*/
METHODDEF(void)
-pass2_no_dither (j_decompress_ptr cinfo,
- JSAMPARRAY input_buf, JSAMPARRAY output_buf, int num_rows)
+pass2_no_dither(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPARRAY output_buf, int num_rows)
/* This version performs no dithering */
{
- my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+ my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
hist3d histogram = cquantize->histogram;
register JSAMPROW inptr, outptr;
register histptr cachep;
@@ -910,27 +909,27 @@ pass2_no_dither (j_decompress_ptr cinfo,
outptr = output_buf[row];
for (col = width; col > 0; col--) {
/* get pixel value and index into the cache */
- c0 = GETJSAMPLE(*inptr++) >> C0_SHIFT;
- c1 = GETJSAMPLE(*inptr++) >> C1_SHIFT;
- c2 = GETJSAMPLE(*inptr++) >> C2_SHIFT;
- cachep = & histogram[c0][c1][c2];
+ c0 = (*inptr++) >> C0_SHIFT;
+ c1 = (*inptr++) >> C1_SHIFT;
+ c2 = (*inptr++) >> C2_SHIFT;
+ cachep = &histogram[c0][c1][c2];
/* If we have not seen this color before, find nearest colormap entry */
/* and update the cache */
if (*cachep == 0)
- fill_inverse_cmap(cinfo, c0,c1,c2);
+ fill_inverse_cmap(cinfo, c0, c1, c2);
/* Now emit the colormap index for this cell */
- *outptr++ = (JSAMPLE) (*cachep - 1);
+ *outptr++ = (JSAMPLE)(*cachep - 1);
}
}
}
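Note the - 1 when emitting: the inverse-map cache stores the colormap index plus one, so a cell value of zero doubles as "not yet computed" and no separate validity bitmap is needed (fill_inverse_cmap's + 1 store above establishes the convention). A minimal sketch:

#include <stdio.h>

int main(void)
{
  unsigned short cache[4] = { 0 };      /* all cells start empty */
  int c = 2, best_index = 5;
  if (cache[c] == 0)                    /* fill on first use */
    cache[c] = (unsigned short)(best_index + 1);
  printf("emitted index = %d\n", cache[c] - 1);    /* -> 5 */
  return 0;
}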
METHODDEF(void)
-pass2_fs_dither (j_decompress_ptr cinfo,
- JSAMPARRAY input_buf, JSAMPARRAY output_buf, int num_rows)
+pass2_fs_dither(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPARRAY output_buf, int num_rows)
/* This version performs Floyd-Steinberg dithering */
{
- my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+ my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
hist3d histogram = cquantize->histogram;
register LOCFSERROR cur0, cur1, cur2; /* current error or pixel value */
LOCFSERROR belowerr0, belowerr1, belowerr2; /* error for pixel below cur */
@@ -956,11 +955,11 @@ pass2_fs_dither (j_decompress_ptr cinfo,
outptr = output_buf[row];
if (cquantize->on_odd_row) {
/* work right to left in this row */
- inptr += (width-1) * 3; /* so point to rightmost pixel */
- outptr += width-1;
+ inptr += (width - 1) * 3; /* so point to rightmost pixel */
+ outptr += width - 1;
dir = -1;
dir3 = -3;
- errorptr = cquantize->fserrors + (width+1)*3; /* => entry after last column */
+ errorptr = cquantize->fserrors + (width + 1) * 3; /* => entry after last column */
cquantize->on_odd_row = FALSE; /* flip for next time */
} else {
/* work left to right in this row */
@@ -984,9 +983,9 @@ pass2_fs_dither (j_decompress_ptr cinfo,
* for either sign of the error value.
* Note: errorptr points to *previous* column's array entry.
*/
- cur0 = RIGHT_SHIFT(cur0 + errorptr[dir3+0] + 8, 4);
- cur1 = RIGHT_SHIFT(cur1 + errorptr[dir3+1] + 8, 4);
- cur2 = RIGHT_SHIFT(cur2 + errorptr[dir3+2] + 8, 4);
+ cur0 = RIGHT_SHIFT(cur0 + errorptr[dir3 + 0] + 8, 4);
+ cur1 = RIGHT_SHIFT(cur1 + errorptr[dir3 + 1] + 8, 4);
+ cur2 = RIGHT_SHIFT(cur2 + errorptr[dir3 + 2] + 8, 4);
/* Limit the error using transfer function set by init_error_limit.
* See comments with init_error_limit for rationale.
*/
@@ -997,44 +996,48 @@ pass2_fs_dither (j_decompress_ptr cinfo,
* The maximum error is +- MAXJSAMPLE (or less with error limiting);
* this sets the required size of the range_limit array.
*/
- cur0 += GETJSAMPLE(inptr[0]);
- cur1 += GETJSAMPLE(inptr[1]);
- cur2 += GETJSAMPLE(inptr[2]);
- cur0 = GETJSAMPLE(range_limit[cur0]);
- cur1 = GETJSAMPLE(range_limit[cur1]);
- cur2 = GETJSAMPLE(range_limit[cur2]);
+ cur0 += inptr[0];
+ cur1 += inptr[1];
+ cur2 += inptr[2];
+ cur0 = range_limit[cur0];
+ cur1 = range_limit[cur1];
+ cur2 = range_limit[cur2];
/* Index into the cache with adjusted pixel value */
- cachep = & histogram[cur0>>C0_SHIFT][cur1>>C1_SHIFT][cur2>>C2_SHIFT];
+ cachep =
+ &histogram[cur0 >> C0_SHIFT][cur1 >> C1_SHIFT][cur2 >> C2_SHIFT];
/* If we have not seen this color before, find nearest colormap */
/* entry and update the cache */
if (*cachep == 0)
- fill_inverse_cmap(cinfo, cur0>>C0_SHIFT,cur1>>C1_SHIFT,cur2>>C2_SHIFT);
+ fill_inverse_cmap(cinfo, cur0 >> C0_SHIFT, cur1 >> C1_SHIFT,
+ cur2 >> C2_SHIFT);
/* Now emit the colormap index for this cell */
- { register int pixcode = *cachep - 1;
- *outptr = (JSAMPLE) pixcode;
+ {
+ register int pixcode = *cachep - 1;
+ *outptr = (JSAMPLE)pixcode;
/* Compute representation error for this pixel */
- cur0 -= GETJSAMPLE(colormap0[pixcode]);
- cur1 -= GETJSAMPLE(colormap1[pixcode]);
- cur2 -= GETJSAMPLE(colormap2[pixcode]);
+ cur0 -= colormap0[pixcode];
+ cur1 -= colormap1[pixcode];
+ cur2 -= colormap2[pixcode];
}
/* Compute error fractions to be propagated to adjacent pixels.
* Add these into the running sums, and simultaneously shift the
* next-line error sums left by 1 column.
*/
- { register LOCFSERROR bnexterr;
+ {
+ register LOCFSERROR bnexterr;
bnexterr = cur0; /* Process component 0 */
- errorptr[0] = (FSERROR) (bpreverr0 + cur0 * 3);
+ errorptr[0] = (FSERROR)(bpreverr0 + cur0 * 3);
bpreverr0 = belowerr0 + cur0 * 5;
belowerr0 = bnexterr;
cur0 *= 7;
bnexterr = cur1; /* Process component 1 */
- errorptr[1] = (FSERROR) (bpreverr1 + cur1 * 3);
+ errorptr[1] = (FSERROR)(bpreverr1 + cur1 * 3);
bpreverr1 = belowerr1 + cur1 * 5;
belowerr1 = bnexterr;
cur1 *= 7;
bnexterr = cur2; /* Process component 2 */
- errorptr[2] = (FSERROR) (bpreverr2 + cur2 * 3);
+ errorptr[2] = (FSERROR)(bpreverr2 + cur2 * 3);
bpreverr2 = belowerr2 + cur2 * 5;
belowerr2 = bnexterr;
cur2 *= 7;
@@ -1051,9 +1054,9 @@ pass2_fs_dither (j_decompress_ptr cinfo,
* final fserrors[] entry. Note we need not unload belowerrN because
* it is for the dummy column before or after the actual array.
*/
- errorptr[0] = (FSERROR) bpreverr0; /* unload prev errs into array */
- errorptr[1] = (FSERROR) bpreverr1;
- errorptr[2] = (FSERROR) bpreverr2;
+ errorptr[0] = (FSERROR)bpreverr0; /* unload prev errs into array */
+ errorptr[1] = (FSERROR)bpreverr1;
+ errorptr[2] = (FSERROR)bpreverr2;
}
}
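The * 3 / * 5 / * 7 pattern above is the Floyd-Steinberg kernel with all error sums kept scaled by 16; the RIGHT_SHIFT(x + 8, 4) earlier in the function is the matching rounded divide. A self-contained sketch of the split:

#include <stdio.h>

int main(void)
{
  int err = 100;                         /* representation error */
  int next = err * 7;                    /* 7/16 to the next pixel */
  int bl = err * 3, b = err * 5, br = err * 1;  /* row below */
  printf("split: %d + %d + %d + %d = %d (16 * err)\n",
         next, bl, b, br, next + bl + b + br);
  printf("rounded /16 of next-pixel share: %d\n", (next + 8) >> 4);
  return 0;
}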
@@ -1076,31 +1079,31 @@ pass2_fs_dither (j_decompress_ptr cinfo,
*/
LOCAL(void)
-init_error_limit (j_decompress_ptr cinfo)
+init_error_limit(j_decompress_ptr cinfo)
/* Allocate and fill in the error_limiter table */
{
- my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+ my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
int *table;
int in, out;
- table = (int *) (*cinfo->mem->alloc_small)
- ((j_common_ptr) cinfo, JPOOL_IMAGE, (MAXJSAMPLE*2+1) * sizeof(int));
+ table = (int *)(*cinfo->mem->alloc_small)
+ ((j_common_ptr)cinfo, JPOOL_IMAGE, (MAXJSAMPLE * 2 + 1) * sizeof(int));
table += MAXJSAMPLE; /* so can index -MAXJSAMPLE .. +MAXJSAMPLE */
cquantize->error_limiter = table;
-#define STEPSIZE ((MAXJSAMPLE+1)/16)
+#define STEPSIZE ((MAXJSAMPLE + 1) / 16)
/* Map errors 1:1 up to +- MAXJSAMPLE/16 */
out = 0;
for (in = 0; in < STEPSIZE; in++, out++) {
- table[in] = out; table[-in] = -out;
+ table[in] = out; table[-in] = -out;
}
/* Map errors 1:2 up to +- 3*MAXJSAMPLE/16 */
- for (; in < STEPSIZE*3; in++, out += (in&1) ? 0 : 1) {
- table[in] = out; table[-in] = -out;
+ for (; in < STEPSIZE * 3; in++, out += (in & 1) ? 0 : 1) {
+ table[in] = out; table[-in] = -out;
}
/* Clamp the rest to final out value (which is (MAXJSAMPLE+1)/8) */
for (; in <= MAXJSAMPLE; in++) {
- table[in] = out; table[-in] = -out;
+ table[in] = out; table[-in] = -out;
}
#undef STEPSIZE
}
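Traced out for MAXJSAMPLE = 255 (so STEPSIZE = 16), the table built above is a simple piecewise transfer function: identity up to 15, half slope up to 47, then clamped at (MAXJSAMPLE+1)/8 = 32. A self-contained equivalent, closed-form rather than table-driven but with the same outputs:

#include <stdio.h>

int limit(int in)
{
  int mag = in < 0 ? -in : in, out;
  if (mag < 16)       out = mag;               /* errors map 1:1 */
  else if (mag < 48)  out = 16 + (mag - 16) / 2;  /* 1:2 region */
  else                out = 32;                /* clamp */
  return in < 0 ? -out : out;
}

int main(void)
{
  int probes[] = { 5, -20, 47, 200 };
  for (int i = 0; i < 4; i++)
    printf("limit(%4d) = %d\n", probes[i], limit(probes[i]));
  return 0;                     /* -> 5, -18, 31, 32 */
}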
@@ -1111,9 +1114,9 @@ init_error_limit (j_decompress_ptr cinfo)
*/
METHODDEF(void)
-finish_pass1 (j_decompress_ptr cinfo)
+finish_pass1(j_decompress_ptr cinfo)
{
- my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+ my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
/* Select the representative colors and fill in cinfo->colormap */
cinfo->colormap = cquantize->sv_colormap;
@@ -1124,7 +1127,7 @@ finish_pass1 (j_decompress_ptr cinfo)
METHODDEF(void)
-finish_pass2 (j_decompress_ptr cinfo)
+finish_pass2(j_decompress_ptr cinfo)
{
/* no work */
}
@@ -1135,14 +1138,14 @@ finish_pass2 (j_decompress_ptr cinfo)
*/
METHODDEF(void)
-start_pass_2_quant (j_decompress_ptr cinfo, boolean is_pre_scan)
+start_pass_2_quant(j_decompress_ptr cinfo, boolean is_pre_scan)
{
- my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+ my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
hist3d histogram = cquantize->histogram;
int i;
/* Only F-S dithering or no dithering is supported. */
- /* If user asks for ordered dither, give him F-S. */
+ /* If user asks for ordered dither, give them F-S. */
if (cinfo->dither_mode != JDITHER_NONE)
cinfo->dither_mode = JDITHER_FS;
@@ -1167,14 +1170,14 @@ start_pass_2_quant (j_decompress_ptr cinfo, boolean is_pre_scan)
ERREXIT1(cinfo, JERR_QUANT_MANY_COLORS, MAXNUMCOLORS);
if (cinfo->dither_mode == JDITHER_FS) {
- size_t arraysize = (size_t) ((cinfo->output_width + 2) *
- (3 * sizeof(FSERROR)));
+ size_t arraysize =
+ (size_t)((cinfo->output_width + 2) * (3 * sizeof(FSERROR)));
/* Allocate Floyd-Steinberg workspace if we didn't already. */
if (cquantize->fserrors == NULL)
- cquantize->fserrors = (FSERRPTR) (*cinfo->mem->alloc_large)
- ((j_common_ptr) cinfo, JPOOL_IMAGE, arraysize);
+ cquantize->fserrors = (FSERRPTR)(*cinfo->mem->alloc_large)
+ ((j_common_ptr)cinfo, JPOOL_IMAGE, arraysize);
/* Initialize the propagated errors to zero. */
- jzero_far((void *) cquantize->fserrors, arraysize);
+ jzero_far((void *)cquantize->fserrors, arraysize);
/* Make the error-limit table if we didn't already. */
if (cquantize->error_limiter == NULL)
init_error_limit(cinfo);
@@ -1185,8 +1188,8 @@ start_pass_2_quant (j_decompress_ptr cinfo, boolean is_pre_scan)
/* Zero the histogram or inverse color map, if necessary */
if (cquantize->needs_zeroed) {
for (i = 0; i < HIST_C0_ELEMS; i++) {
- jzero_far((void *) histogram[i],
- HIST_C1_ELEMS*HIST_C2_ELEMS * sizeof(histcell));
+ jzero_far((void *)histogram[i],
+ HIST_C1_ELEMS * HIST_C2_ELEMS * sizeof(histcell));
}
cquantize->needs_zeroed = FALSE;
}
@@ -1198,9 +1201,9 @@ start_pass_2_quant (j_decompress_ptr cinfo, boolean is_pre_scan)
*/
METHODDEF(void)
-new_color_map_2_quant (j_decompress_ptr cinfo)
+new_color_map_2_quant(j_decompress_ptr cinfo)
{
- my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+ my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
/* Reset the inverse color map */
cquantize->needs_zeroed = TRUE;
@@ -1212,15 +1215,15 @@ new_color_map_2_quant (j_decompress_ptr cinfo)
*/
GLOBAL(void)
-jinit_2pass_quantizer (j_decompress_ptr cinfo)
+jinit_2pass_quantizer(j_decompress_ptr cinfo)
{
my_cquantize_ptr cquantize;
int i;
cquantize = (my_cquantize_ptr)
- (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+ (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
sizeof(my_cquantizer));
- cinfo->cquantize = (struct jpeg_color_quantizer *) cquantize;
+ cinfo->cquantize = (struct jpeg_color_quantizer *)cquantize;
cquantize->pub.start_pass = start_pass_2_quant;
cquantize->pub.new_color_map = new_color_map_2_quant;
cquantize->fserrors = NULL; /* flag optional arrays not allocated */
@@ -1231,12 +1234,12 @@ jinit_2pass_quantizer (j_decompress_ptr cinfo)
ERREXIT(cinfo, JERR_NOTIMPL);
/* Allocate the histogram/inverse colormap storage */
- cquantize->histogram = (hist3d) (*cinfo->mem->alloc_small)
- ((j_common_ptr) cinfo, JPOOL_IMAGE, HIST_C0_ELEMS * sizeof(hist2d));
+ cquantize->histogram = (hist3d)(*cinfo->mem->alloc_small)
+ ((j_common_ptr)cinfo, JPOOL_IMAGE, HIST_C0_ELEMS * sizeof(hist2d));
for (i = 0; i < HIST_C0_ELEMS; i++) {
- cquantize->histogram[i] = (hist2d) (*cinfo->mem->alloc_large)
- ((j_common_ptr) cinfo, JPOOL_IMAGE,
- HIST_C1_ELEMS*HIST_C2_ELEMS * sizeof(histcell));
+ cquantize->histogram[i] = (hist2d)(*cinfo->mem->alloc_large)
+ ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ HIST_C1_ELEMS * HIST_C2_ELEMS * sizeof(histcell));
}
cquantize->needs_zeroed = TRUE; /* histogram is garbage now */
@@ -1254,13 +1257,13 @@ jinit_2pass_quantizer (j_decompress_ptr cinfo)
if (desired > MAXNUMCOLORS)
ERREXIT1(cinfo, JERR_QUANT_MANY_COLORS, MAXNUMCOLORS);
cquantize->sv_colormap = (*cinfo->mem->alloc_sarray)
- ((j_common_ptr) cinfo,JPOOL_IMAGE, (JDIMENSION) desired, (JDIMENSION) 3);
+ ((j_common_ptr)cinfo, JPOOL_IMAGE, (JDIMENSION)desired, (JDIMENSION)3);
cquantize->desired = desired;
} else
cquantize->sv_colormap = NULL;
/* Only F-S dithering or no dithering is supported. */
- /* If user asks for ordered dither, give him F-S. */
+ /* If user asks for ordered dither, give them F-S. */
if (cinfo->dither_mode != JDITHER_NONE)
cinfo->dither_mode = JDITHER_FS;
@@ -1271,9 +1274,9 @@ jinit_2pass_quantizer (j_decompress_ptr cinfo)
* dither_mode changes.
*/
if (cinfo->dither_mode == JDITHER_FS) {
- cquantize->fserrors = (FSERRPTR) (*cinfo->mem->alloc_large)
- ((j_common_ptr) cinfo, JPOOL_IMAGE,
- (size_t) ((cinfo->output_width + 2) * (3 * sizeof(FSERROR))));
+ cquantize->fserrors = (FSERRPTR)(*cinfo->mem->alloc_large)
+ ((j_common_ptr)cinfo, JPOOL_IMAGE,
+ (size_t)((cinfo->output_width + 2) * (3 * sizeof(FSERROR))));
/* Might as well create the error-limiting table too. */
init_error_limit(cinfo);
}
diff --git a/media/libjpeg/jsimd.h b/media/libjpeg/jsimd.h
index 3aa0779b8a..6c203655ef 100644
--- a/media/libjpeg/jsimd.h
+++ b/media/libjpeg/jsimd.h
@@ -3,7 +3,8 @@
*
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
* Copyright (C) 2011, 2014, D. R. Commander.
- * Copyright (C) 2015, Matthieu Darbois.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
+ * Copyright (C) 2020, Arm Limited.
*
* Based on the x86 SIMD extension for IJG JPEG library,
* Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -13,81 +14,110 @@
#include "jchuff.h" /* Declarations shared with jcphuff.c */
-EXTERN(int) jsimd_can_rgb_ycc (void);
-EXTERN(int) jsimd_can_rgb_gray (void);
-EXTERN(int) jsimd_can_ycc_rgb (void);
-EXTERN(int) jsimd_can_ycc_rgb565 (void);
-EXTERN(int) jsimd_c_can_null_convert (void);
-
-EXTERN(void) jsimd_rgb_ycc_convert
- (j_compress_ptr cinfo, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
-EXTERN(void) jsimd_rgb_gray_convert
- (j_compress_ptr cinfo, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
-EXTERN(void) jsimd_ycc_rgb_convert
- (j_decompress_ptr cinfo, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
-EXTERN(void) jsimd_ycc_rgb565_convert
- (j_decompress_ptr cinfo, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
-EXTERN(void) jsimd_c_null_convert
- (j_compress_ptr cinfo, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
-
-EXTERN(int) jsimd_can_h2v2_downsample (void);
-EXTERN(int) jsimd_can_h2v1_downsample (void);
-
-EXTERN(void) jsimd_h2v2_downsample
- (j_compress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY output_data);
-
-EXTERN(int) jsimd_can_h2v2_smooth_downsample (void);
-
-EXTERN(void) jsimd_h2v2_smooth_downsample
- (j_compress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY output_data);
-
-EXTERN(void) jsimd_h2v1_downsample
- (j_compress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY output_data);
-
-EXTERN(int) jsimd_can_h2v2_upsample (void);
-EXTERN(int) jsimd_can_h2v1_upsample (void);
-EXTERN(int) jsimd_can_int_upsample (void);
-
-EXTERN(void) jsimd_h2v2_upsample
- (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
-EXTERN(void) jsimd_h2v1_upsample
- (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
-EXTERN(void) jsimd_int_upsample
- (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
-
-EXTERN(int) jsimd_can_h2v2_fancy_upsample (void);
-EXTERN(int) jsimd_can_h2v1_fancy_upsample (void);
-
-EXTERN(void) jsimd_h2v2_fancy_upsample
- (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
-EXTERN(void) jsimd_h2v1_fancy_upsample
- (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
-
-EXTERN(int) jsimd_can_h2v2_merged_upsample (void);
-EXTERN(int) jsimd_can_h2v1_merged_upsample (void);
-
-EXTERN(void) jsimd_h2v2_merged_upsample
- (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
-EXTERN(void) jsimd_h2v1_merged_upsample
- (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
-
-EXTERN(int) jsimd_can_huff_encode_one_block (void);
-
-EXTERN(JOCTET*) jsimd_huff_encode_one_block
- (void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val,
- c_derived_tbl *dctbl, c_derived_tbl *actbl);
+EXTERN(int) jsimd_can_rgb_ycc(void);
+EXTERN(int) jsimd_can_rgb_gray(void);
+EXTERN(int) jsimd_can_ycc_rgb(void);
+EXTERN(int) jsimd_can_ycc_rgb565(void);
+EXTERN(int) jsimd_c_can_null_convert(void);
+
+EXTERN(void) jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_ycc_rgb_convert(j_decompress_ptr cinfo,
+ JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo,
+ JSAMPIMAGE input_buf,
+ JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_c_null_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows);
+
+EXTERN(int) jsimd_can_h2v2_downsample(void);
+EXTERN(int) jsimd_can_h2v1_downsample(void);
+
+EXTERN(void) jsimd_h2v2_downsample(j_compress_ptr cinfo,
+ jpeg_component_info *compptr,
+ JSAMPARRAY input_data,
+ JSAMPARRAY output_data);
+
+EXTERN(int) jsimd_can_h2v2_smooth_downsample(void);
+
+EXTERN(void) jsimd_h2v2_smooth_downsample(j_compress_ptr cinfo,
+ jpeg_component_info *compptr,
+ JSAMPARRAY input_data,
+ JSAMPARRAY output_data);
+
+EXTERN(void) jsimd_h2v1_downsample(j_compress_ptr cinfo,
+ jpeg_component_info *compptr,
+ JSAMPARRAY input_data,
+ JSAMPARRAY output_data);
+
+EXTERN(int) jsimd_can_h2v2_upsample(void);
+EXTERN(int) jsimd_can_h2v1_upsample(void);
+EXTERN(int) jsimd_can_int_upsample(void);
+
+EXTERN(void) jsimd_h2v2_upsample(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr,
+ JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v1_upsample(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr,
+ JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_int_upsample(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr,
+ JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+
+EXTERN(int) jsimd_can_h2v2_fancy_upsample(void);
+EXTERN(int) jsimd_can_h2v1_fancy_upsample(void);
+EXTERN(int) jsimd_can_h1v2_fancy_upsample(void);
+
+EXTERN(void) jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr,
+ JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr,
+ JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr,
+ JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+
+EXTERN(int) jsimd_can_h2v2_merged_upsample(void);
+EXTERN(int) jsimd_can_h2v1_merged_upsample(void);
+
+EXTERN(void) jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo,
+ JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo,
+ JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+
+EXTERN(int) jsimd_can_huff_encode_one_block(void);
+
+EXTERN(JOCTET *) jsimd_huff_encode_one_block(void *state, JOCTET *buffer,
+ JCOEFPTR block, int last_dc_val,
+ c_derived_tbl *dctbl,
+ c_derived_tbl *actbl);
+
+EXTERN(int) jsimd_can_encode_mcu_AC_first_prepare(void);
+
+EXTERN(void) jsimd_encode_mcu_AC_first_prepare
+ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+ JCOEF *values, size_t *zerobits);
+
+EXTERN(int) jsimd_can_encode_mcu_AC_refine_prepare(void);
+
+EXTERN(int) jsimd_encode_mcu_AC_refine_prepare
+ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+ JCOEF *absvalues, size_t *bits);
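Every routine declared here comes paired with a jsimd_can_*() probe; callers test the probe once and install either the SIMD routine or the portable C one as a method pointer. A hypothetical, self-contained sketch of that pattern (names are illustrative, not the real jsimd API):

#include <stdio.h>

static int  can_fast_path(void) { return 0; }   /* like jsimd_can_* */
static void fast_path(void)  { puts("SIMD path"); }
static void plain_path(void) { puts("C path"); }

int main(void)
{
  /* probe once, then dispatch through the chosen pointer forever */
  void (*method)(void) = can_fast_path() ? fast_path : plain_path;
  method();                                     /* -> "C path" */
  return 0;
}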
diff --git a/media/libjpeg/jsimd_none.c b/media/libjpeg/jsimd_none.c
index f29030cfa7..5b38a9fb5c 100644
--- a/media/libjpeg/jsimd_none.c
+++ b/media/libjpeg/jsimd_none.c
@@ -3,7 +3,8 @@
*
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
* Copyright (C) 2009-2011, 2014, D. R. Commander.
- * Copyright (C) 2015, Matthieu Darbois.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
+ * Copyright (C) 2020, Arm Limited.
*
* Based on the x86 SIMD extension for IJG JPEG library,
* Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -20,385 +21,411 @@
#include "jsimddct.h"
GLOBAL(int)
-jsimd_can_rgb_ycc (void)
+jsimd_can_rgb_ycc(void)
{
return 0;
}
GLOBAL(int)
-jsimd_can_rgb_gray (void)
+jsimd_can_rgb_gray(void)
{
return 0;
}
GLOBAL(int)
-jsimd_can_ycc_rgb (void)
+jsimd_can_ycc_rgb(void)
{
return 0;
}
GLOBAL(int)
-jsimd_can_ycc_rgb565 (void)
+jsimd_can_ycc_rgb565(void)
{
return 0;
}
GLOBAL(int)
-jsimd_c_can_null_convert (void)
+jsimd_c_can_null_convert(void)
{
return 0;
}
GLOBAL(void)
-jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
- JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows)
+jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
{
}
GLOBAL(void)
-jsimd_rgb_gray_convert (j_compress_ptr cinfo,
- JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows)
+jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
{
}
GLOBAL(void)
-jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
+jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
{
}
GLOBAL(void)
-jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
+jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
{
}
GLOBAL(void)
-jsimd_c_null_convert (j_compress_ptr cinfo,
- JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows)
+jsimd_c_null_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
{
}
GLOBAL(int)
-jsimd_can_h2v2_downsample (void)
+jsimd_can_h2v2_downsample(void)
{
return 0;
}
GLOBAL(int)
-jsimd_can_h2v1_downsample (void)
+jsimd_can_h2v1_downsample(void)
{
return 0;
}
GLOBAL(int)
-jsimd_can_h2v2_smooth_downsample (void)
+jsimd_can_h2v2_smooth_downsample(void)
{
return 0;
}
GLOBAL(void)
-jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY output_data)
+jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
{
}
GLOBAL(void)
-jsimd_h2v2_smooth_downsample (j_compress_ptr cinfo,
- jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY output_data)
+jsimd_h2v2_smooth_downsample(j_compress_ptr cinfo,
+ jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
{
}
GLOBAL(void)
-jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY output_data)
+jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
{
}
GLOBAL(int)
-jsimd_can_h2v2_upsample (void)
+jsimd_can_h2v2_upsample(void)
{
return 0;
}
GLOBAL(int)
-jsimd_can_h2v1_upsample (void)
+jsimd_can_h2v1_upsample(void)
{
return 0;
}
GLOBAL(int)
-jsimd_can_int_upsample (void)
+jsimd_can_int_upsample(void)
{
return 0;
}
GLOBAL(void)
-jsimd_int_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+jsimd_int_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
{
}
GLOBAL(void)
-jsimd_h2v2_upsample (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
+jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
{
}
GLOBAL(void)
-jsimd_h2v1_upsample (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
+jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
{
}
GLOBAL(int)
-jsimd_can_h2v2_fancy_upsample (void)
+jsimd_can_h2v2_fancy_upsample(void)
{
return 0;
}
GLOBAL(int)
-jsimd_can_h2v1_fancy_upsample (void)
+jsimd_can_h2v1_fancy_upsample(void)
{
return 0;
}
+GLOBAL(int)
+jsimd_can_h1v2_fancy_upsample(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+}
+
GLOBAL(void)
-jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
+jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
{
}
GLOBAL(void)
-jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
+jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
{
}
GLOBAL(int)
-jsimd_can_h2v2_merged_upsample (void)
+jsimd_can_h2v2_merged_upsample(void)
{
return 0;
}
GLOBAL(int)
-jsimd_can_h2v1_merged_upsample (void)
+jsimd_can_h2v1_merged_upsample(void)
{
return 0;
}
GLOBAL(void)
-jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr,
- JSAMPARRAY output_buf)
+jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
{
}
GLOBAL(void)
-jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr,
- JSAMPARRAY output_buf)
+jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
{
}
GLOBAL(int)
-jsimd_can_convsamp (void)
+jsimd_can_convsamp(void)
{
return 0;
}
GLOBAL(int)
-jsimd_can_convsamp_float (void)
+jsimd_can_convsamp_float(void)
{
return 0;
}
GLOBAL(void)
-jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
- DCTELEM *workspace)
+jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
+ DCTELEM *workspace)
{
}
GLOBAL(void)
-jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
- FAST_FLOAT *workspace)
+jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
+ FAST_FLOAT *workspace)
{
}
GLOBAL(int)
-jsimd_can_fdct_islow (void)
+jsimd_can_fdct_islow(void)
{
return 0;
}
GLOBAL(int)
-jsimd_can_fdct_ifast (void)
+jsimd_can_fdct_ifast(void)
{
return 0;
}
GLOBAL(int)
-jsimd_can_fdct_float (void)
+jsimd_can_fdct_float(void)
{
return 0;
}
GLOBAL(void)
-jsimd_fdct_islow (DCTELEM *data)
+jsimd_fdct_islow(DCTELEM *data)
{
}
GLOBAL(void)
-jsimd_fdct_ifast (DCTELEM *data)
+jsimd_fdct_ifast(DCTELEM *data)
{
}
GLOBAL(void)
-jsimd_fdct_float (FAST_FLOAT *data)
+jsimd_fdct_float(FAST_FLOAT *data)
{
}
GLOBAL(int)
-jsimd_can_quantize (void)
+jsimd_can_quantize(void)
{
return 0;
}
GLOBAL(int)
-jsimd_can_quantize_float (void)
+jsimd_can_quantize_float(void)
{
return 0;
}
GLOBAL(void)
-jsimd_quantize (JCOEFPTR coef_block, DCTELEM *divisors,
- DCTELEM *workspace)
+jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
{
}
GLOBAL(void)
-jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors,
- FAST_FLOAT *workspace)
+jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+ FAST_FLOAT *workspace)
{
}
GLOBAL(int)
-jsimd_can_idct_2x2 (void)
+jsimd_can_idct_2x2(void)
{
return 0;
}
GLOBAL(int)
-jsimd_can_idct_4x4 (void)
+jsimd_can_idct_4x4(void)
{
return 0;
}
GLOBAL(int)
-jsimd_can_idct_6x6 (void)
+jsimd_can_idct_6x6(void)
{
return 0;
}
GLOBAL(int)
-jsimd_can_idct_12x12 (void)
+jsimd_can_idct_12x12(void)
{
return 0;
}
GLOBAL(void)
-jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
+jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
{
}
GLOBAL(void)
-jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
+jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
{
}
GLOBAL(void)
-jsimd_idct_6x6 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
+jsimd_idct_6x6(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
{
}
GLOBAL(void)
-jsimd_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
+jsimd_idct_12x12(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
{
}
GLOBAL(int)
-jsimd_can_idct_islow (void)
+jsimd_can_idct_islow(void)
{
return 0;
}
GLOBAL(int)
-jsimd_can_idct_ifast (void)
+jsimd_can_idct_ifast(void)
{
return 0;
}
GLOBAL(int)
-jsimd_can_idct_float (void)
+jsimd_can_idct_float(void)
{
return 0;
}
GLOBAL(void)
-jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
+jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
{
}
GLOBAL(void)
-jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
+jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
{
}
GLOBAL(void)
-jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
+jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
{
}
GLOBAL(int)
-jsimd_can_huff_encode_one_block (void)
+jsimd_can_huff_encode_one_block(void)
{
return 0;
}
-GLOBAL(JOCTET*)
-jsimd_huff_encode_one_block (void *state, JOCTET *buffer, JCOEFPTR block,
- int last_dc_val, c_derived_tbl *dctbl,
- c_derived_tbl *actbl)
+GLOBAL(JOCTET *)
+jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
+ int last_dc_val, c_derived_tbl *dctbl,
+ c_derived_tbl *actbl)
{
return NULL;
}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_first_prepare(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, JCOEF *values, size_t *zerobits)
+{
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_refine_prepare(void)
+{
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, JCOEF *absvalues, size_t *bits)
+{
+ return 0;
+}
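The new encode_mcu_AC_*_prepare stubs mirror entry points whose real implementations pack coefficient significance into a bitmap (the zerobits/bits outputs) so the progressive Huffman encoder can skip zero runs with bit scans instead of per-coefficient tests. A hypothetical sketch of the bitmap idea, not the jcphuff.c code:

#include <stdio.h>

int main(void)
{
  short block[8] = { 3, 0, 0, -1, 0, 0, 0, 2 };
  size_t zerobits = 0;
  for (int k = 0; k < 8; k++)
    if (block[k] != 0)
      zerobits |= (size_t)1 << k;    /* bit k set => nonzero coef */
  printf("nonzero bitmap = 0x%zx\n", zerobits);   /* -> 0x89 */
  return 0;
}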
diff --git a/media/libjpeg/jsimddct.h b/media/libjpeg/jsimddct.h
index b19ab48d40..55ee8cf67f 100644
--- a/media/libjpeg/jsimddct.h
+++ b/media/libjpeg/jsimddct.h
@@ -9,66 +9,62 @@
*
*/
-EXTERN(int) jsimd_can_convsamp (void);
-EXTERN(int) jsimd_can_convsamp_float (void);
+EXTERN(int) jsimd_can_convsamp(void);
+EXTERN(int) jsimd_can_convsamp_float(void);
-EXTERN(void) jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
- DCTELEM *workspace);
-EXTERN(void) jsimd_convsamp_float (JSAMPARRAY sample_data,
- JDIMENSION start_col,
- FAST_FLOAT *workspace);
+EXTERN(void) jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
+ DCTELEM *workspace);
+EXTERN(void) jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
+ FAST_FLOAT *workspace);
-EXTERN(int) jsimd_can_fdct_islow (void);
-EXTERN(int) jsimd_can_fdct_ifast (void);
-EXTERN(int) jsimd_can_fdct_float (void);
+EXTERN(int) jsimd_can_fdct_islow(void);
+EXTERN(int) jsimd_can_fdct_ifast(void);
+EXTERN(int) jsimd_can_fdct_float(void);
-EXTERN(void) jsimd_fdct_islow (DCTELEM *data);
-EXTERN(void) jsimd_fdct_ifast (DCTELEM *data);
-EXTERN(void) jsimd_fdct_float (FAST_FLOAT *data);
+EXTERN(void) jsimd_fdct_islow(DCTELEM *data);
+EXTERN(void) jsimd_fdct_ifast(DCTELEM *data);
+EXTERN(void) jsimd_fdct_float(FAST_FLOAT *data);
-EXTERN(int) jsimd_can_quantize (void);
-EXTERN(int) jsimd_can_quantize_float (void);
+EXTERN(int) jsimd_can_quantize(void);
+EXTERN(int) jsimd_can_quantize_float(void);
-EXTERN(void) jsimd_quantize (JCOEFPTR coef_block, DCTELEM *divisors,
- DCTELEM *workspace);
-EXTERN(void) jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors,
- FAST_FLOAT *workspace);
+EXTERN(void) jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors,
+ DCTELEM *workspace);
+EXTERN(void) jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+ FAST_FLOAT *workspace);
-EXTERN(int) jsimd_can_idct_2x2 (void);
-EXTERN(int) jsimd_can_idct_4x4 (void);
-EXTERN(int) jsimd_can_idct_6x6 (void);
-EXTERN(int) jsimd_can_idct_12x12 (void);
+EXTERN(int) jsimd_can_idct_2x2(void);
+EXTERN(int) jsimd_can_idct_4x4(void);
+EXTERN(int) jsimd_can_idct_6x6(void);
+EXTERN(int) jsimd_can_idct_12x12(void);
-EXTERN(void) jsimd_idct_2x2 (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col);
-EXTERN(void) jsimd_idct_4x4 (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col);
-EXTERN(void) jsimd_idct_6x6 (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col);
-EXTERN(void) jsimd_idct_12x12 (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col);
+EXTERN(void) jsimd_idct_2x2(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jsimd_idct_4x4(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jsimd_idct_6x6(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jsimd_idct_12x12(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
-EXTERN(int) jsimd_can_idct_islow (void);
-EXTERN(int) jsimd_can_idct_ifast (void);
-EXTERN(int) jsimd_can_idct_float (void);
+EXTERN(int) jsimd_can_idct_islow(void);
+EXTERN(int) jsimd_can_idct_ifast(void);
+EXTERN(int) jsimd_can_idct_float(void);
-EXTERN(void) jsimd_idct_islow (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col);
-EXTERN(void) jsimd_idct_ifast (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col);
-EXTERN(void) jsimd_idct_float (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col);
+EXTERN(void) jsimd_idct_islow(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
+EXTERN(void) jsimd_idct_ifast(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
+EXTERN(void) jsimd_idct_float(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
diff --git a/media/libjpeg/jstdhuff.c b/media/libjpeg/jstdhuff.c
index e202e8e7ec..345b513d4d 100644
--- a/media/libjpeg/jstdhuff.c
+++ b/media/libjpeg/jstdhuff.c
@@ -4,7 +4,7 @@
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1991-1998, Thomas G. Lane.
* libjpeg-turbo Modifications:
- * Copyright (C) 2013, D. R. Commander.
+ * Copyright (C) 2013, 2022, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@@ -17,8 +17,8 @@
*/
LOCAL(void)
-add_huff_table (j_common_ptr cinfo,
- JHUFF_TBL **htblptr, const UINT8 *bits, const UINT8 *val)
+add_huff_table(j_common_ptr cinfo, JHUFF_TBL **htblptr, const UINT8 *bits,
+ const UINT8 *val)
/* Define a Huffman table */
{
int nsymbols, len;
@@ -29,7 +29,7 @@ add_huff_table (j_common_ptr cinfo,
return;
/* Copy the number-of-symbols-of-each-code-length counts */
- MEMCOPY((*htblptr)->bits, bits, sizeof((*htblptr)->bits));
+ memcpy((*htblptr)->bits, bits, sizeof((*htblptr)->bits));
/* Validate the counts. We do this here mainly so we can copy the right
* number of symbols from the val[] array, without risking marching off
@@ -41,8 +41,9 @@ add_huff_table (j_common_ptr cinfo,
if (nsymbols < 1 || nsymbols > 256)
ERREXIT(cinfo, JERR_BAD_HUFF_TABLE);
- MEMCOPY((*htblptr)->huffval, val, nsymbols * sizeof(UINT8));
- MEMZERO(&((*htblptr)->huffval[nsymbols]), (256 - nsymbols) * sizeof(UINT8));
+ memcpy((*htblptr)->huffval, val, nsymbols * sizeof(UINT8));
+ memset(&((*htblptr)->huffval[nsymbols]), 0,
+ (256 - nsymbols) * sizeof(UINT8));
/* Initialize sent_table FALSE so table will be written to JPEG file. */
(*htblptr)->sent_table = FALSE;
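The validation this hunk's memcpy/memset feed into sums bits[1..16] (the number of codes of each code length) and requires the total to be 1..256. A self-contained check against the standard DC luminance table that appears in the next hunk:

#include <stdio.h>

int main(void)
{
  static const unsigned char bits_dc_luminance[17] = {
    /* 0-base */ 0, 0, 1, 5, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0
  };
  int nsymbols = 0;
  for (int len = 1; len <= 16; len++)
    nsymbols += bits_dc_luminance[len];
  printf("nsymbols = %d\n", nsymbols);  /* -> 12, matches val[] length */
  return 0;
}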
@@ -50,71 +51,79 @@ add_huff_table (j_common_ptr cinfo,
LOCAL(void)
-std_huff_tables (j_common_ptr cinfo)
+std_huff_tables(j_common_ptr cinfo)
/* Set up the standard Huffman tables (cf. JPEG standard section K.3) */
/* IMPORTANT: these are only valid for 8-bit data precision! */
{
JHUFF_TBL **dc_huff_tbl_ptrs, **ac_huff_tbl_ptrs;
- static const UINT8 bits_dc_luminance[17] =
- { /* 0-base */ 0, 0, 1, 5, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0 };
- static const UINT8 val_dc_luminance[] =
- { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 };
+ static const UINT8 bits_dc_luminance[17] = {
+ /* 0-base */ 0, 0, 1, 5, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0
+ };
+ static const UINT8 val_dc_luminance[] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
+ };
- static const UINT8 bits_dc_chrominance[17] =
- { /* 0-base */ 0, 0, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0 };
- static const UINT8 val_dc_chrominance[] =
- { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 };
+ static const UINT8 bits_dc_chrominance[17] = {
+ /* 0-base */ 0, 0, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0
+ };
+ static const UINT8 val_dc_chrominance[] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
+ };
- static const UINT8 bits_ac_luminance[17] =
- { /* 0-base */ 0, 0, 2, 1, 3, 3, 2, 4, 3, 5, 5, 4, 4, 0, 0, 1, 0x7d };
- static const UINT8 val_ac_luminance[] =
- { 0x01, 0x02, 0x03, 0x00, 0x04, 0x11, 0x05, 0x12,
- 0x21, 0x31, 0x41, 0x06, 0x13, 0x51, 0x61, 0x07,
- 0x22, 0x71, 0x14, 0x32, 0x81, 0x91, 0xa1, 0x08,
- 0x23, 0x42, 0xb1, 0xc1, 0x15, 0x52, 0xd1, 0xf0,
- 0x24, 0x33, 0x62, 0x72, 0x82, 0x09, 0x0a, 0x16,
- 0x17, 0x18, 0x19, 0x1a, 0x25, 0x26, 0x27, 0x28,
- 0x29, 0x2a, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39,
- 0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49,
- 0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59,
- 0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
- 0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
- 0x7a, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89,
- 0x8a, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98,
- 0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
- 0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6,
- 0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3, 0xc4, 0xc5,
- 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2, 0xd3, 0xd4,
- 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xe1, 0xe2,
- 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea,
- 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
- 0xf9, 0xfa };
+ static const UINT8 bits_ac_luminance[17] = {
+ /* 0-base */ 0, 0, 2, 1, 3, 3, 2, 4, 3, 5, 5, 4, 4, 0, 0, 1, 0x7d
+ };
+ static const UINT8 val_ac_luminance[] = {
+ 0x01, 0x02, 0x03, 0x00, 0x04, 0x11, 0x05, 0x12,
+ 0x21, 0x31, 0x41, 0x06, 0x13, 0x51, 0x61, 0x07,
+ 0x22, 0x71, 0x14, 0x32, 0x81, 0x91, 0xa1, 0x08,
+ 0x23, 0x42, 0xb1, 0xc1, 0x15, 0x52, 0xd1, 0xf0,
+ 0x24, 0x33, 0x62, 0x72, 0x82, 0x09, 0x0a, 0x16,
+ 0x17, 0x18, 0x19, 0x1a, 0x25, 0x26, 0x27, 0x28,
+ 0x29, 0x2a, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39,
+ 0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49,
+ 0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59,
+ 0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
+ 0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
+ 0x7a, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89,
+ 0x8a, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98,
+ 0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+ 0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6,
+ 0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3, 0xc4, 0xc5,
+ 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2, 0xd3, 0xd4,
+ 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xe1, 0xe2,
+ 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea,
+ 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
+ 0xf9, 0xfa
+ };
- static const UINT8 bits_ac_chrominance[17] =
- { /* 0-base */ 0, 0, 2, 1, 2, 4, 4, 3, 4, 7, 5, 4, 4, 0, 1, 2, 0x77 };
- static const UINT8 val_ac_chrominance[] =
- { 0x00, 0x01, 0x02, 0x03, 0x11, 0x04, 0x05, 0x21,
- 0x31, 0x06, 0x12, 0x41, 0x51, 0x07, 0x61, 0x71,
- 0x13, 0x22, 0x32, 0x81, 0x08, 0x14, 0x42, 0x91,
- 0xa1, 0xb1, 0xc1, 0x09, 0x23, 0x33, 0x52, 0xf0,
- 0x15, 0x62, 0x72, 0xd1, 0x0a, 0x16, 0x24, 0x34,
- 0xe1, 0x25, 0xf1, 0x17, 0x18, 0x19, 0x1a, 0x26,
- 0x27, 0x28, 0x29, 0x2a, 0x35, 0x36, 0x37, 0x38,
- 0x39, 0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48,
- 0x49, 0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58,
- 0x59, 0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68,
- 0x69, 0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,
- 0x79, 0x7a, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
- 0x88, 0x89, 0x8a, 0x92, 0x93, 0x94, 0x95, 0x96,
- 0x97, 0x98, 0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5,
- 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4,
- 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3,
- 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2,
- 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda,
- 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9,
- 0xea, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
- 0xf9, 0xfa };
+ static const UINT8 bits_ac_chrominance[17] = {
+ /* 0-base */ 0, 0, 2, 1, 2, 4, 4, 3, 4, 7, 5, 4, 4, 0, 1, 2, 0x77
+ };
+ static const UINT8 val_ac_chrominance[] = {
+ 0x00, 0x01, 0x02, 0x03, 0x11, 0x04, 0x05, 0x21,
+ 0x31, 0x06, 0x12, 0x41, 0x51, 0x07, 0x61, 0x71,
+ 0x13, 0x22, 0x32, 0x81, 0x08, 0x14, 0x42, 0x91,
+ 0xa1, 0xb1, 0xc1, 0x09, 0x23, 0x33, 0x52, 0xf0,
+ 0x15, 0x62, 0x72, 0xd1, 0x0a, 0x16, 0x24, 0x34,
+ 0xe1, 0x25, 0xf1, 0x17, 0x18, 0x19, 0x1a, 0x26,
+ 0x27, 0x28, 0x29, 0x2a, 0x35, 0x36, 0x37, 0x38,
+ 0x39, 0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48,
+ 0x49, 0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58,
+ 0x59, 0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68,
+ 0x69, 0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,
+ 0x79, 0x7a, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+ 0x88, 0x89, 0x8a, 0x92, 0x93, 0x94, 0x95, 0x96,
+ 0x97, 0x98, 0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5,
+ 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4,
+ 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3,
+ 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2,
+ 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda,
+ 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9,
+ 0xea, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
+ 0xf9, 0xfa
+ };
if (cinfo->is_decompressor) {
dc_huff_tbl_ptrs = ((j_decompress_ptr)cinfo)->dc_huff_tbl_ptrs;
diff --git a/media/libjpeg/jutils.c b/media/libjpeg/jutils.c
index f9d35023e5..d86271624a 100644
--- a/media/libjpeg/jutils.c
+++ b/media/libjpeg/jutils.c
@@ -3,8 +3,8 @@
*
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1991-1996, Thomas G. Lane.
- * It was modified by The libjpeg-turbo Project to include only code
- * relevant to libjpeg-turbo.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2022, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@@ -53,7 +53,7 @@ const int jpeg_zigzag_order[DCTSIZE2] = {
* fake entries.
*/
-const int jpeg_natural_order[DCTSIZE2+16] = {
+const int jpeg_natural_order[DCTSIZE2 + 16] = {
0, 1, 8, 16, 9, 2, 3, 10,
17, 24, 32, 25, 18, 11, 4, 5,
12, 19, 26, 33, 40, 48, 41, 34,
@@ -72,7 +72,7 @@ const int jpeg_natural_order[DCTSIZE2+16] = {
*/
GLOBAL(long)
-jdiv_round_up (long a, long b)
+jdiv_round_up(long a, long b)
/* Compute a/b rounded up to next integer, i.e., ceil(a/b) */
/* Assumes a >= 0, b > 0 */
{
@@ -81,7 +81,7 @@ jdiv_round_up (long a, long b)
GLOBAL(long)
-jround_up (long a, long b)
+jround_up(long a, long b)
/* Compute a rounded up to next multiple of b, i.e., ceil(a/b)*b */
/* Assumes a >= 0, b > 0 */
{
@@ -91,9 +91,9 @@ jround_up (long a, long b)
GLOBAL(void)
-jcopy_sample_rows (JSAMPARRAY input_array, int source_row,
- JSAMPARRAY output_array, int dest_row,
- int num_rows, JDIMENSION num_cols)
+jcopy_sample_rows(JSAMPARRAY input_array, int source_row,
+ JSAMPARRAY output_array, int dest_row, int num_rows,
+ JDIMENSION num_cols)
/* Copy some rows of samples from one place to another.
* num_rows rows are copied from input_array[source_row++]
* to output_array[dest_row++]; these areas may overlap for duplication.
@@ -101,7 +101,7 @@ jcopy_sample_rows (JSAMPARRAY input_array, int source_row,
*/
{
register JSAMPROW inptr, outptr;
- register size_t count = (size_t) (num_cols * sizeof(JSAMPLE));
+ register size_t count = (size_t)(num_cols * sizeof(JSAMPLE));
register int row;
input_array += source_row;
@@ -110,24 +110,24 @@ jcopy_sample_rows (JSAMPARRAY input_array, int source_row,
for (row = num_rows; row > 0; row--) {
inptr = *input_array++;
outptr = *output_array++;
- MEMCOPY(outptr, inptr, count);
+ memcpy(outptr, inptr, count);
}
}
GLOBAL(void)
-jcopy_block_row (JBLOCKROW input_row, JBLOCKROW output_row,
- JDIMENSION num_blocks)
+jcopy_block_row(JBLOCKROW input_row, JBLOCKROW output_row,
+ JDIMENSION num_blocks)
/* Copy a row of coefficient blocks from one place to another. */
{
- MEMCOPY(output_row, input_row, num_blocks * (DCTSIZE2 * sizeof(JCOEF)));
+ memcpy(output_row, input_row, num_blocks * (DCTSIZE2 * sizeof(JCOEF)));
}
GLOBAL(void)
-jzero_far (void *target, size_t bytestozero)
+jzero_far(void *target, size_t bytestozero)
/* Zero out a chunk of memory. */
/* This might be sample-array data, block-array data, or alloc_large data. */
{
- MEMZERO(target, bytestozero);
+ memset(target, 0, bytestozero);
}
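
The hunks above only rename jdiv_round_up() and jround_up(); their bodies are elided by the diff context. As a minimal standalone sketch of the arithmetic those helpers implement (hypothetical names, assuming the classic IJG idiom and the stated preconditions a >= 0, b > 0):

#include <assert.h>

/* Sketch, not part of the diff: ceiling division without floating point. */
static long div_round_up(long a, long b)
{
  return (a + b - 1L) / b;            /* ceil(a/b), assuming a >= 0, b > 0 */
}

/* Sketch: smallest multiple of b that is >= a. */
static long round_up(long a, long b)
{
  a += b - 1L;
  return a - (a % b);
}

int main(void)
{
  assert(div_round_up(17, 8) == 3);   /* 17/8 rounds up to 3 */
  assert(round_up(17, 8) == 24);      /* next multiple of 8 after 17 */
  return 0;
}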
diff --git a/media/libjpeg/jversion.h b/media/libjpeg/jversion.h
index 6ce663d82a..63db95b99b 100644
--- a/media/libjpeg/jversion.h
+++ b/media/libjpeg/jversion.h
@@ -2,9 +2,9 @@
* jversion.h
*
* This file was part of the Independent JPEG Group's software:
- * Copyright (C) 1991-2012, Thomas G. Lane, Guido Vollbeding.
+ * Copyright (C) 1991-2020, Thomas G. Lane, Guido Vollbeding.
* libjpeg-turbo Modifications:
- * Copyright (C) 2010, 2012-2016, D. R. Commander.
+ * Copyright (C) 2010, 2012-2022, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@@ -30,20 +30,25 @@
* NOTE: It is our convention to place the authors in the following order:
* - libjpeg-turbo authors (2009-) in descending order of the date of their
* most recent contribution to the project, then in ascending order of the
- * date of their first contribution to the project
+ * date of their first contribution to the project, then in alphabetical
+ * order
* - Upstream authors in descending order of the date of the first inclusion of
* their code
*/
-#define JCOPYRIGHT "Copyright (C) 2009-2016 D. R. Commander\n" \
- "Copyright (C) 2011-2016 Siarhei Siamashka\n" \
- "Copyright (C) 2015-2016 Matthieu Darbois\n" \
- "Copyright (C) 2015 Google, Inc.\n" \
- "Copyright (C) 2013-2014 MIPS Technologies, Inc.\n" \
- "Copyright (C) 2013 Linaro Limited\n" \
- "Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies)\n" \
- "Copyright (C) 2009 Pierre Ossman for Cendio AB\n" \
- "Copyright (C) 1999-2006 MIYASAKA Masaru\n" \
- "Copyright (C) 1991-2016 Thomas G. Lane, Guido Vollbeding" \
-
-#define JCOPYRIGHT_SHORT "Copyright (C) 1991-2016 The libjpeg-turbo Project and many others"
+#define JCOPYRIGHT \
+ "Copyright (C) 2009-2022 D. R. Commander\n" \
+ "Copyright (C) 2015, 2020 Google, Inc.\n" \
+ "Copyright (C) 2019-2020 Arm Limited\n" \
+ "Copyright (C) 2015-2016, 2018 Matthieu Darbois\n" \
+ "Copyright (C) 2011-2016 Siarhei Siamashka\n" \
+ "Copyright (C) 2015 Intel Corporation\n" \
+ "Copyright (C) 2013-2014 Linaro Limited\n" \
+ "Copyright (C) 2013-2014 MIPS Technologies, Inc.\n" \
+ "Copyright (C) 2009, 2012 Pierre Ossman for Cendio AB\n" \
+ "Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies)\n" \
+ "Copyright (C) 1999-2006 MIYASAKA Masaru\n" \
+ "Copyright (C) 1991-2020 Thomas G. Lane, Guido Vollbeding"
+
+#define JCOPYRIGHT_SHORT \
+ "Copyright (C) 1991-2022 The libjpeg-turbo Project and many others"
diff --git a/media/libjpeg/moz.build b/media/libjpeg/moz.build
index 6519c30fbd..0da04dabc3 100644
--- a/media/libjpeg/moz.build
+++ b/media/libjpeg/moz.build
@@ -1,4 +1,5 @@
# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*-
+# vim: set filetype=python:
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
@@ -22,6 +23,7 @@ SOURCES += [
'jdcolor.c',
'jddctmgr.c',
'jdhuff.c',
+ 'jdicc.c',
'jdinput.c',
'jdmainct.c',
'jdmarker.c',
@@ -54,6 +56,7 @@ SOURCES += [
'jccolor.c',
'jcdctmgr.c',
'jchuff.c',
+ 'jcicc.c',
'jcinit.c',
'jcmainct.c',
'jcmarker.c',
@@ -70,88 +73,188 @@ if CONFIG['LIBJPEG_TURBO_USE_YASM']:
if CONFIG['LIBJPEG_TURBO_ASFLAGS']:
if CONFIG['CPU_ARCH'] == 'arm':
+ LOCAL_INCLUDES += [
+ '/media/libjpeg/simd/arm',
+ '/media/libjpeg/simd/arm/aarch32',
+ ]
SOURCES += [
- 'simd/jsimd_arm.c',
- 'simd/jsimd_arm_neon.S',
+ 'simd/arm/aarch32/jchuff-neon.c',
+ 'simd/arm/aarch32/jsimd.c',
+ 'simd/arm/aarch32/jsimd_neon.S',
+ 'simd/arm/jcgray-neon.c',
+ 'simd/arm/jcphuff-neon.c',
+ 'simd/arm/jcsample-neon.c',
+ 'simd/arm/jdcolor-neon.c',
+ 'simd/arm/jdmerge-neon.c',
+ 'simd/arm/jdsample-neon.c',
+ 'simd/arm/jfdctfst-neon.c',
+ 'simd/arm/jfdctint-neon.c',
+ 'simd/arm/jidctred-neon.c',
+ 'simd/arm/jquanti-neon.c',
]
elif CONFIG['CPU_ARCH'] == 'aarch64':
+ LOCAL_INCLUDES += [
+ '/media/libjpeg/simd/arm',
+ '/media/libjpeg/simd/arm/aarch64',
+ ]
+ SOURCES += [
+ 'simd/arm/aarch64/jsimd.c',
+ 'simd/arm/aarch64/jsimd_neon.S',
+ 'simd/arm/jcgray-neon.c',
+ 'simd/arm/jcphuff-neon.c',
+ 'simd/arm/jcsample-neon.c',
+ 'simd/arm/jdmerge-neon.c',
+ 'simd/arm/jdsample-neon.c',
+ 'simd/arm/jfdctfst-neon.c',
+ 'simd/arm/jidctfst-neon.c',
+ 'simd/arm/jidctred-neon.c',
+ 'simd/arm/jquanti-neon.c',
+ ]
+ elif CONFIG['CPU_ARCH'] == 'mips32':
SOURCES += [
- 'simd/jsimd_arm64.c',
- 'simd/jsimd_arm64_neon.S',
+ 'simd/mips/jsimd.c',
+ 'simd/mips/jsimd_dspr2.S',
]
- elif CONFIG['CPU_ARCH'] == 'mips':
+ if CONFIG['CC_TYPE'] == 'clang':
+ SOURCES['simd/mips/jsimd_dspr2.S'].flags += [
+ '-fno-integrated-as',
+ ]
+ elif CONFIG['CPU_ARCH'] == 'mips64':
+ LOCAL_INCLUDES += ['/media/libjpeg/simd/mips64']
SOURCES += [
- 'simd/jsimd_mips.c',
- 'simd/jsimd_mips_dspr2.S',
+ 'simd/mips64/jccolor-mmi.c',
+ 'simd/mips64/jcgray-mmi.c',
+ 'simd/mips64/jcsample-mmi.c',
+ 'simd/mips64/jdcolor-mmi.c',
+ 'simd/mips64/jdmerge-mmi.c',
+ 'simd/mips64/jdsample-mmi.c',
+ 'simd/mips64/jfdctfst-mmi.c',
+ 'simd/mips64/jfdctint-mmi.c',
+ 'simd/mips64/jidctfst-mmi.c',
+ 'simd/mips64/jidctint-mmi.c',
+ 'simd/mips64/jquanti-mmi.c',
+ 'simd/mips64/jsimd.c',
]
elif CONFIG['CPU_ARCH'] == 'x86_64':
SOURCES += [
- 'simd/jccolor-sse2-64.asm',
- 'simd/jcgray-sse2-64.asm',
- 'simd/jchuff-sse2-64.asm',
- 'simd/jcsample-sse2-64.asm',
- 'simd/jdcolor-sse2-64.asm',
- 'simd/jdmerge-sse2-64.asm',
- 'simd/jdsample-sse2-64.asm',
- 'simd/jfdctflt-sse-64.asm',
- 'simd/jfdctfst-sse2-64.asm',
- 'simd/jfdctint-sse2-64.asm',
- 'simd/jidctflt-sse2-64.asm',
- 'simd/jidctfst-sse2-64.asm',
- 'simd/jidctint-sse2-64.asm',
- 'simd/jidctred-sse2-64.asm',
- 'simd/jquantf-sse2-64.asm',
- 'simd/jquanti-sse2-64.asm',
- 'simd/jsimd_x86_64.c',
+ 'simd/x86_64/jccolor-avx2.asm',
+ 'simd/x86_64/jccolor-sse2.asm',
+ 'simd/x86_64/jcgray-avx2.asm',
+ 'simd/x86_64/jcgray-sse2.asm',
+ 'simd/x86_64/jchuff-sse2.asm',
+ 'simd/x86_64/jcphuff-sse2.asm',
+ 'simd/x86_64/jcsample-avx2.asm',
+ 'simd/x86_64/jcsample-sse2.asm',
+ 'simd/x86_64/jdcolor-avx2.asm',
+ 'simd/x86_64/jdcolor-sse2.asm',
+ 'simd/x86_64/jdmerge-avx2.asm',
+ 'simd/x86_64/jdmerge-sse2.asm',
+ 'simd/x86_64/jdsample-avx2.asm',
+ 'simd/x86_64/jdsample-sse2.asm',
+ 'simd/x86_64/jfdctflt-sse.asm',
+ 'simd/x86_64/jfdctfst-sse2.asm',
+ 'simd/x86_64/jfdctint-avx2.asm',
+ 'simd/x86_64/jfdctint-sse2.asm',
+ 'simd/x86_64/jidctflt-sse2.asm',
+ 'simd/x86_64/jidctfst-sse2.asm',
+ 'simd/x86_64/jidctint-avx2.asm',
+ 'simd/x86_64/jidctint-sse2.asm',
+ 'simd/x86_64/jidctred-sse2.asm',
+ 'simd/x86_64/jquantf-sse2.asm',
+ 'simd/x86_64/jquanti-avx2.asm',
+ 'simd/x86_64/jquanti-sse2.asm',
+ 'simd/x86_64/jsimd.c',
+ 'simd/x86_64/jsimdcpu.asm',
]
elif CONFIG['CPU_ARCH'] == 'x86':
SOURCES += [
- 'simd/jccolor-mmx.asm',
- 'simd/jccolor-sse2.asm',
- 'simd/jcgray-mmx.asm',
- 'simd/jcgray-sse2.asm',
- 'simd/jchuff-sse2.asm',
- 'simd/jcsample-mmx.asm',
- 'simd/jcsample-sse2.asm',
- 'simd/jdcolor-mmx.asm',
- 'simd/jdcolor-sse2.asm',
- 'simd/jdmerge-mmx.asm',
- 'simd/jdmerge-sse2.asm',
- 'simd/jdsample-mmx.asm',
- 'simd/jdsample-sse2.asm',
- 'simd/jfdctflt-3dn.asm',
- 'simd/jfdctflt-sse.asm',
- 'simd/jfdctfst-mmx.asm',
- 'simd/jfdctfst-sse2.asm',
- 'simd/jfdctint-mmx.asm',
- 'simd/jfdctint-sse2.asm',
- 'simd/jidctflt-3dn.asm',
- 'simd/jidctflt-sse.asm',
- 'simd/jidctflt-sse2.asm',
- 'simd/jidctfst-mmx.asm',
- 'simd/jidctfst-sse2.asm',
- 'simd/jidctint-mmx.asm',
- 'simd/jidctint-sse2.asm',
- 'simd/jidctred-mmx.asm',
- 'simd/jidctred-sse2.asm',
- 'simd/jquant-3dn.asm',
- 'simd/jquant-mmx.asm',
- 'simd/jquant-sse.asm',
- 'simd/jquantf-sse2.asm',
- 'simd/jquanti-sse2.asm',
- 'simd/jsimd_i386.c',
- 'simd/jsimdcpu.asm',
+ 'simd/i386/jccolor-avx2.asm',
+ 'simd/i386/jccolor-mmx.asm',
+ 'simd/i386/jccolor-sse2.asm',
+ 'simd/i386/jcgray-avx2.asm',
+ 'simd/i386/jcgray-mmx.asm',
+ 'simd/i386/jcgray-sse2.asm',
+ 'simd/i386/jchuff-sse2.asm',
+ 'simd/i386/jcphuff-sse2.asm',
+ 'simd/i386/jcsample-avx2.asm',
+ 'simd/i386/jcsample-mmx.asm',
+ 'simd/i386/jcsample-sse2.asm',
+ 'simd/i386/jdcolor-avx2.asm',
+ 'simd/i386/jdcolor-mmx.asm',
+ 'simd/i386/jdcolor-sse2.asm',
+ 'simd/i386/jdmerge-avx2.asm',
+ 'simd/i386/jdmerge-mmx.asm',
+ 'simd/i386/jdmerge-sse2.asm',
+ 'simd/i386/jdsample-avx2.asm',
+ 'simd/i386/jdsample-mmx.asm',
+ 'simd/i386/jdsample-sse2.asm',
+ 'simd/i386/jfdctflt-3dn.asm',
+ 'simd/i386/jfdctflt-sse.asm',
+ 'simd/i386/jfdctfst-mmx.asm',
+ 'simd/i386/jfdctfst-sse2.asm',
+ 'simd/i386/jfdctint-avx2.asm',
+ 'simd/i386/jfdctint-mmx.asm',
+ 'simd/i386/jfdctint-sse2.asm',
+ 'simd/i386/jidctflt-3dn.asm',
+ 'simd/i386/jidctflt-sse.asm',
+ 'simd/i386/jidctflt-sse2.asm',
+ 'simd/i386/jidctfst-mmx.asm',
+ 'simd/i386/jidctfst-sse2.asm',
+ 'simd/i386/jidctint-avx2.asm',
+ 'simd/i386/jidctint-mmx.asm',
+ 'simd/i386/jidctint-sse2.asm',
+ 'simd/i386/jidctred-mmx.asm',
+ 'simd/i386/jidctred-sse2.asm',
+ 'simd/i386/jquant-3dn.asm',
+ 'simd/i386/jquant-mmx.asm',
+ 'simd/i386/jquant-sse.asm',
+ 'simd/i386/jquantf-sse2.asm',
+ 'simd/i386/jquanti-avx2.asm',
+ 'simd/i386/jquanti-sse2.asm',
+ 'simd/i386/jsimd.c',
+ 'simd/i386/jsimdcpu.asm',
]
+elif CONFIG['CPU_ARCH'].startswith('ppc'):
+ # PowerPC has no assembly files, but still needs its own headers.
+ LOCAL_INCLUDES += ['/media/libjpeg/simd/powerpc']
+
+    # For libjpeg's internal runtime detection to work, jsimd.c must NOT
+    # be compiled with -maltivec (otherwise AltiVec support is enabled
+    # statically at compile time), but everything else should be. If
+    # -maltivec was already specified in .mozconfig, this does no harm.
+ ppc_vmx_sources = [
+ 'simd/powerpc/jccolor-altivec.c',
+ 'simd/powerpc/jcgray-altivec.c',
+ 'simd/powerpc/jcsample-altivec.c',
+ 'simd/powerpc/jdcolor-altivec.c',
+ 'simd/powerpc/jdmerge-altivec.c',
+ 'simd/powerpc/jdsample-altivec.c',
+ 'simd/powerpc/jfdctfst-altivec.c',
+ 'simd/powerpc/jfdctint-altivec.c',
+ 'simd/powerpc/jidctfst-altivec.c',
+ 'simd/powerpc/jidctint-altivec.c',
+ 'simd/powerpc/jquanti-altivec.c',
+ ]
+ SOURCES += ppc_vmx_sources
+ SOURCES += [
+ 'simd/powerpc/jsimd.c',
+ ]
+ for srcfile in ppc_vmx_sources:
+ SOURCES[srcfile].flags += CONFIG['PPC_VMX_FLAGS']
else: # No SIMD support?
SOURCES += [
'jsimd_none.c',
]
ASFLAGS += CONFIG['LIBJPEG_TURBO_ASFLAGS']
-ASFLAGS += ['-I%s/media/libjpeg/simd/' % TOPSRCDIR]
-if CONFIG['GKMEDIAS_SHARED_LIBRARY']:
- NO_VISIBILITY_FLAGS = True
+# Make sure the x86 & x86-64 ASM files can see the necessary includes.
+if CONFIG['CPU_ARCH'] == 'x86':
+ ASFLAGS += ['-I%s/media/libjpeg/simd/nasm/' % TOPSRCDIR]
+ ASFLAGS += ['-I%s/media/libjpeg/simd/i386/' % TOPSRCDIR]
+if CONFIG['CPU_ARCH'] == 'x86_64':
+ ASFLAGS += ['-I%s/media/libjpeg/simd/nasm/' % TOPSRCDIR]
+ ASFLAGS += ['-I%s/media/libjpeg/simd/x86_64/' % TOPSRCDIR]
# We allow warnings for third-party code that can be updated from upstream.
ALLOW_COMPILER_WARNINGS = True
diff --git a/media/libjpeg/mozilla.diff b/media/libjpeg/mozilla.diff
index 24b235b401..bc1bcb3066 100644
--- a/media/libjpeg/mozilla.diff
+++ b/media/libjpeg/mozilla.diff
@@ -1,32 +1,7 @@
-diff --git jmemmgr.c jmemmgr.c
---- jmemmgr.c
-+++ jmemmgr.c
-@@ -28,16 +28,17 @@
- */
-
- #define JPEG_INTERNALS
- #define AM_MEMORY_MANAGER /* we define jvirt_Xarray_control structs */
- #include "jinclude.h"
- #include "jpeglib.h"
- #include "jmemsys.h" /* import the system-dependent declarations */
- #include <stdint.h>
-+#include <limits.h> /* some NDKs define SIZE_MAX in limits.h */
-
- #ifndef NO_GETENV
- #ifndef HAVE_STDLIB_H /* <stdlib.h> should declare getenv() */
- extern char *getenv (const char *name);
- #endif
- #endif
-
-
diff --git jmorecfg.h jmorecfg.h
--- jmorecfg.h
+++ jmorecfg.h
-@@ -9,16 +9,17 @@
- * For conditions of distribution and use, see the accompanying README.ijg
- * file.
- *
- * This file contains additional configuration options that customize the
+@@ -13,8 +13,9 @@
* JPEG software for special applications or support machine-dependent
* optimizations. Most users will not need to touch this file.
*/
@@ -35,29 +10,13 @@ diff --git jmorecfg.h jmorecfg.h
/*
* Maximum number of components (color channels) allowed in JPEG image.
- * To meet the letter of the JPEG spec, set this to 255. However, darn
- * few applications need more than 4 channels (maybe 5 for CMYK + alpha
- * mask). We recommend 10 as a reasonable compromise; use 4 if you are
- * really short on memory. (Each allowed component costs a hundred or so
- * bytes of storage, whether actually used in an image or not.)
-@@ -118,39 +119,25 @@ typedef char JOCTET;
- * They must be at least as wide as specified; but making them too big
- * won't cost a huge amount of memory, so we don't provide special
- * extraction code like we did for JSAMPLE. (In other words, these
- * typedefs live at a different point on the speed/space tradeoff curve.)
+ * To meet the letter of Rec. ITU-T T.81 | ISO/IEC 10918-1, set this to 255.
+@@ -95,23 +96,17 @@ typedef unsigned char JOCTET;
*/
/* UINT8 must hold at least the values 0..255. */
--#ifdef HAVE_UNSIGNED_CHAR
-typedef unsigned char UINT8;
--#else /* not HAVE_UNSIGNED_CHAR */
--#ifdef __CHAR_UNSIGNED__
--typedef char UINT8;
--#else /* not __CHAR_UNSIGNED__ */
--typedef short UINT8;
--#endif /* __CHAR_UNSIGNED__ */
--#endif /* HAVE_UNSIGNED_CHAR */
+typedef uint8_t UINT8;
/* UINT16 must hold at least the values 0..65535. */
@@ -79,23 +38,15 @@ diff --git jmorecfg.h jmorecfg.h
/* INT32 must hold at least signed 32-bit values.
*
* NOTE: The INT32 typedef dates back to libjpeg v5 (1994.) Integers were
- * sometimes 16-bit back then (MS-DOS), which is why INT32 is typedef'd to
- * long. It also wasn't common (or at least as common) in 1994 for INT32 to be
- * defined by platform headers. Since then, however, INT32 is defined in
- * several other common places:
-@@ -167,25 +154,17 @@ typedef short INT16;
- * This is a recipe for conflict, since "long" and "int" aren't always
- * compatible types. Since the definition of INT32 has technically been part
- * of the libjpeg API for more than 20 years, we can't remove it, but we do not
- * use it internally any longer. We instead define a separate type (JLONG)
+@@ -136,17 +131,9 @@ typedef short INT16;
* for internal use, which ensures that internal behavior will always be the
* same regardless of any external headers that may be included.
*/
-#ifndef XMD_H /* X11/xmd.h correctly defines INT32 */
--#ifndef _BASETSD_H_ /* Microsoft defines it in basetsd.h */
--#ifndef _BASETSD_H /* MinGW is slightly different */
--#ifndef QGLOBAL_H /* Qt defines it in qglobal.h */
+-#ifndef _BASETSD_H_ /* Microsoft defines it in basetsd.h */
+-#ifndef _BASETSD_H /* MinGW is slightly different */
+-#ifndef QGLOBAL_H /* Qt defines it in qglobal.h */
-typedef long INT32;
-#endif
-#endif
@@ -106,7 +57,3 @@ diff --git jmorecfg.h jmorecfg.h
/* Datatype used for image dimensions. The JPEG standard only supports
* images up to 64K*64K due to 16-bit fields in SOF markers. Therefore
* "unsigned int" is sufficient on all machines. However, if you need to
- * handle larger images and you don't mind deviating from the spec, you
- * can change this datatype. (Note that changing this datatype will
- * potentially require modifying the SIMD code. The x86-64 SIMD extensions,
- * in particular, assume a 32-bit JDIMENSION.)
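
The jmorecfg.h hunks above track upstream's move away from feature-test guard chains toward exact-width <stdint.h> types, while Mozilla's local patch drops the conflict-prone INT32 typedef guards entirely. A short sketch of the new typedef form (illustrative only, not a copy of the header):

#include <stdint.h>

/* Illustrative only: an exact-width type makes the old guard chains
 * (HAVE_UNSIGNED_CHAR, __CHAR_UNSIGNED__, XMD_H, _BASETSD_H_, QGLOBAL_H)
 * unnecessary, since the width no longer depends on the platform. */
typedef uint8_t UINT8;   /* must hold at least the values 0..255 */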
diff --git a/media/libjpeg/simd/arm/aarch32/jccolext-neon.c b/media/libjpeg/simd/arm/aarch32/jccolext-neon.c
new file mode 100644
index 0000000000..362102d2b2
--- /dev/null
+++ b/media/libjpeg/simd/arm/aarch32/jccolext-neon.c
@@ -0,0 +1,148 @@
+/*
+ * jccolext-neon.c - colorspace conversion (32-bit Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jccolor-neon.c */
+
+
+/* RGB -> YCbCr conversion is defined by the following equations:
+ * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+ * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128
+ * Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128
+ *
+ * Avoid floating point arithmetic by using shifted integer constants:
+ * 0.29899597 = 19595 * 2^-16
+ * 0.58700561 = 38470 * 2^-16
+ * 0.11399841 = 7471 * 2^-16
+ * 0.16874695 = 11059 * 2^-16
+ * 0.33125305 = 21709 * 2^-16
+ * 0.50000000 = 32768 * 2^-16
+ * 0.41868592 = 27439 * 2^-16
+ * 0.08131409 = 5329 * 2^-16
+ * These constants are defined in jccolor-neon.c
+ *
+ * We add the fixed-point equivalent of 0.5 to Cb and Cr, which effectively
+ * rounds up or down the result via integer truncation.
+ */
+
+void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
+{
+ /* Pointer to RGB(X/A) input data */
+ JSAMPROW inptr;
+ /* Pointers to Y, Cb, and Cr output data */
+ JSAMPROW outptr0, outptr1, outptr2;
+ /* Allocate temporary buffer for final (image_width % 8) pixels in row. */
+ ALIGN(16) uint8_t tmp_buf[8 * RGB_PIXELSIZE];
+
+ /* Set up conversion constants. */
+#ifdef HAVE_VLD1_U16_X2
+ const uint16x4x2_t consts = vld1_u16_x2(jsimd_rgb_ycc_neon_consts);
+#else
+ /* GCC does not currently support the intrinsic vld1_<type>_x2(). */
+ const uint16x4_t consts1 = vld1_u16(jsimd_rgb_ycc_neon_consts);
+ const uint16x4_t consts2 = vld1_u16(jsimd_rgb_ycc_neon_consts + 4);
+ const uint16x4x2_t consts = { { consts1, consts2 } };
+#endif
+ const uint32x4_t scaled_128_5 = vdupq_n_u32((128 << 16) + 32767);
+
+ while (--num_rows >= 0) {
+ inptr = *input_buf++;
+ outptr0 = output_buf[0][output_row];
+ outptr1 = output_buf[1][output_row];
+ outptr2 = output_buf[2][output_row];
+ output_row++;
+
+ int cols_remaining = image_width;
+ for (; cols_remaining > 0; cols_remaining -= 8) {
+
+      /* To prevent buffer overread by the vector load instructions, the last
+       * (image_width % 8) columns of data are first copied (via memcpy) to a
+       * temporary buffer large enough to accommodate the vector load.
+       */
+ if (cols_remaining < 8) {
+ memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
+ inptr = tmp_buf;
+ }
+
+#if RGB_PIXELSIZE == 4
+ uint8x8x4_t input_pixels = vld4_u8(inptr);
+#else
+ uint8x8x3_t input_pixels = vld3_u8(inptr);
+#endif
+ uint16x8_t r = vmovl_u8(input_pixels.val[RGB_RED]);
+ uint16x8_t g = vmovl_u8(input_pixels.val[RGB_GREEN]);
+ uint16x8_t b = vmovl_u8(input_pixels.val[RGB_BLUE]);
+
+ /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
+ uint32x4_t y_low = vmull_lane_u16(vget_low_u16(r), consts.val[0], 0);
+ y_low = vmlal_lane_u16(y_low, vget_low_u16(g), consts.val[0], 1);
+ y_low = vmlal_lane_u16(y_low, vget_low_u16(b), consts.val[0], 2);
+ uint32x4_t y_high = vmull_lane_u16(vget_high_u16(r), consts.val[0], 0);
+ y_high = vmlal_lane_u16(y_high, vget_high_u16(g), consts.val[0], 1);
+ y_high = vmlal_lane_u16(y_high, vget_high_u16(b), consts.val[0], 2);
+
+ /* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128 */
+ uint32x4_t cb_low = scaled_128_5;
+ cb_low = vmlsl_lane_u16(cb_low, vget_low_u16(r), consts.val[0], 3);
+ cb_low = vmlsl_lane_u16(cb_low, vget_low_u16(g), consts.val[1], 0);
+ cb_low = vmlal_lane_u16(cb_low, vget_low_u16(b), consts.val[1], 1);
+ uint32x4_t cb_high = scaled_128_5;
+ cb_high = vmlsl_lane_u16(cb_high, vget_high_u16(r), consts.val[0], 3);
+ cb_high = vmlsl_lane_u16(cb_high, vget_high_u16(g), consts.val[1], 0);
+ cb_high = vmlal_lane_u16(cb_high, vget_high_u16(b), consts.val[1], 1);
+
+ /* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128 */
+ uint32x4_t cr_low = scaled_128_5;
+ cr_low = vmlal_lane_u16(cr_low, vget_low_u16(r), consts.val[1], 1);
+ cr_low = vmlsl_lane_u16(cr_low, vget_low_u16(g), consts.val[1], 2);
+ cr_low = vmlsl_lane_u16(cr_low, vget_low_u16(b), consts.val[1], 3);
+ uint32x4_t cr_high = scaled_128_5;
+ cr_high = vmlal_lane_u16(cr_high, vget_high_u16(r), consts.val[1], 1);
+ cr_high = vmlsl_lane_u16(cr_high, vget_high_u16(g), consts.val[1], 2);
+ cr_high = vmlsl_lane_u16(cr_high, vget_high_u16(b), consts.val[1], 3);
+
+ /* Descale Y values (rounding right shift) and narrow to 16-bit. */
+ uint16x8_t y_u16 = vcombine_u16(vrshrn_n_u32(y_low, 16),
+ vrshrn_n_u32(y_high, 16));
+ /* Descale Cb values (right shift) and narrow to 16-bit. */
+ uint16x8_t cb_u16 = vcombine_u16(vshrn_n_u32(cb_low, 16),
+ vshrn_n_u32(cb_high, 16));
+ /* Descale Cr values (right shift) and narrow to 16-bit. */
+ uint16x8_t cr_u16 = vcombine_u16(vshrn_n_u32(cr_low, 16),
+ vshrn_n_u32(cr_high, 16));
+ /* Narrow Y, Cb, and Cr values to 8-bit and store to memory. Buffer
+ * overwrite is permitted up to the next multiple of ALIGN_SIZE bytes.
+ */
+ vst1_u8(outptr0, vmovn_u16(y_u16));
+ vst1_u8(outptr1, vmovn_u16(cb_u16));
+ vst1_u8(outptr2, vmovn_u16(cr_u16));
+
+ /* Increment pointers. */
+ inptr += (8 * RGB_PIXELSIZE);
+ outptr0 += 8;
+ outptr1 += 8;
+ outptr2 += 8;
+ }
+ }
+}
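
The Neon kernel above is a vector form of the Q16 fixed-point math described in the file's header comment, with the tail of each row copied into tmp_buf so the 8-wide loads never overread. A minimal scalar sketch of the same per-pixel arithmetic (hypothetical helper, not library code; adding 32768 before truncation rounds Y, while the 32767 folded into scaled_128_5 rounds Cb and Cr, per the header comment):

#include <stdint.h>
#include <stdio.h>

/* Scalar model of one pixel of the Q16 conversion above, using the same
 * constants listed in the header comment (19595 = 0.299 * 2^16, etc.). */
static void rgb_to_ycc(uint8_t r, uint8_t g, uint8_t b,
                       uint8_t *y, uint8_t *cb, uint8_t *cr)
{
  uint32_t y32  = 19595 * r + 38470 * g + 7471 * b + 32768;  /* +0.5: round */
  uint32_t cb32 = (128 << 16) + 32767 - 11059 * r - 21709 * g + 32768 * b;
  uint32_t cr32 = (128 << 16) + 32767 + 32768 * r - 27439 * g - 5329 * b;
  *y  = (uint8_t)(y32  >> 16);   /* descale: drop the 2^-16 scale factor */
  *cb = (uint8_t)(cb32 >> 16);
  *cr = (uint8_t)(cr32 >> 16);
}

int main(void)
{
  uint8_t y, cb, cr;
  rgb_to_ycc(255, 0, 0, &y, &cb, &cr);      /* pure red */
  printf("Y=%u Cb=%u Cr=%u\n", y, cb, cr);  /* 76, 85, 255 */
  return 0;
}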
diff --git a/media/libjpeg/simd/arm/aarch32/jchuff-neon.c b/media/libjpeg/simd/arm/aarch32/jchuff-neon.c
new file mode 100644
index 0000000000..19d94f720d
--- /dev/null
+++ b/media/libjpeg/simd/arm/aarch32/jchuff-neon.c
@@ -0,0 +1,334 @@
+/*
+ * jchuff-neon.c - Huffman entropy encoding (32-bit Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ *
+ * NOTE: All referenced figures are from
+ * Recommendation ITU-T T.81 (1992) | ISO/IEC 10918-1:1994.
+ */
+
+#define JPEG_INTERNALS
+#include "../../../jinclude.h"
+#include "../../../jpeglib.h"
+#include "../../../jsimd.h"
+#include "../../../jdct.h"
+#include "../../../jsimddct.h"
+#include "../../jsimd.h"
+#include "../jchuff.h"
+#include "neon-compat.h"
+
+#include <limits.h>
+
+#include <arm_neon.h>
+
+
+JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer,
+ JCOEFPTR block, int last_dc_val,
+ c_derived_tbl *dctbl,
+ c_derived_tbl *actbl)
+{
+ uint8_t block_nbits[DCTSIZE2];
+ uint16_t block_diff[DCTSIZE2];
+
+ /* Load rows of coefficients from DCT block in zig-zag order. */
+
+ /* Compute DC coefficient difference value. (F.1.1.5.1) */
+ int16x8_t row0 = vdupq_n_s16(block[0] - last_dc_val);
+ row0 = vld1q_lane_s16(block + 1, row0, 1);
+ row0 = vld1q_lane_s16(block + 8, row0, 2);
+ row0 = vld1q_lane_s16(block + 16, row0, 3);
+ row0 = vld1q_lane_s16(block + 9, row0, 4);
+ row0 = vld1q_lane_s16(block + 2, row0, 5);
+ row0 = vld1q_lane_s16(block + 3, row0, 6);
+ row0 = vld1q_lane_s16(block + 10, row0, 7);
+
+ int16x8_t row1 = vld1q_dup_s16(block + 17);
+ row1 = vld1q_lane_s16(block + 24, row1, 1);
+ row1 = vld1q_lane_s16(block + 32, row1, 2);
+ row1 = vld1q_lane_s16(block + 25, row1, 3);
+ row1 = vld1q_lane_s16(block + 18, row1, 4);
+ row1 = vld1q_lane_s16(block + 11, row1, 5);
+ row1 = vld1q_lane_s16(block + 4, row1, 6);
+ row1 = vld1q_lane_s16(block + 5, row1, 7);
+
+ int16x8_t row2 = vld1q_dup_s16(block + 12);
+ row2 = vld1q_lane_s16(block + 19, row2, 1);
+ row2 = vld1q_lane_s16(block + 26, row2, 2);
+ row2 = vld1q_lane_s16(block + 33, row2, 3);
+ row2 = vld1q_lane_s16(block + 40, row2, 4);
+ row2 = vld1q_lane_s16(block + 48, row2, 5);
+ row2 = vld1q_lane_s16(block + 41, row2, 6);
+ row2 = vld1q_lane_s16(block + 34, row2, 7);
+
+ int16x8_t row3 = vld1q_dup_s16(block + 27);
+ row3 = vld1q_lane_s16(block + 20, row3, 1);
+ row3 = vld1q_lane_s16(block + 13, row3, 2);
+ row3 = vld1q_lane_s16(block + 6, row3, 3);
+ row3 = vld1q_lane_s16(block + 7, row3, 4);
+ row3 = vld1q_lane_s16(block + 14, row3, 5);
+ row3 = vld1q_lane_s16(block + 21, row3, 6);
+ row3 = vld1q_lane_s16(block + 28, row3, 7);
+
+ int16x8_t abs_row0 = vabsq_s16(row0);
+ int16x8_t abs_row1 = vabsq_s16(row1);
+ int16x8_t abs_row2 = vabsq_s16(row2);
+ int16x8_t abs_row3 = vabsq_s16(row3);
+
+ int16x8_t row0_lz = vclzq_s16(abs_row0);
+ int16x8_t row1_lz = vclzq_s16(abs_row1);
+ int16x8_t row2_lz = vclzq_s16(abs_row2);
+ int16x8_t row3_lz = vclzq_s16(abs_row3);
+
+ /* Compute number of bits required to represent each coefficient. */
+ uint8x8_t row0_nbits = vsub_u8(vdup_n_u8(16),
+ vmovn_u16(vreinterpretq_u16_s16(row0_lz)));
+ uint8x8_t row1_nbits = vsub_u8(vdup_n_u8(16),
+ vmovn_u16(vreinterpretq_u16_s16(row1_lz)));
+ uint8x8_t row2_nbits = vsub_u8(vdup_n_u8(16),
+ vmovn_u16(vreinterpretq_u16_s16(row2_lz)));
+ uint8x8_t row3_nbits = vsub_u8(vdup_n_u8(16),
+ vmovn_u16(vreinterpretq_u16_s16(row3_lz)));
+
+ vst1_u8(block_nbits + 0 * DCTSIZE, row0_nbits);
+ vst1_u8(block_nbits + 1 * DCTSIZE, row1_nbits);
+ vst1_u8(block_nbits + 2 * DCTSIZE, row2_nbits);
+ vst1_u8(block_nbits + 3 * DCTSIZE, row3_nbits);
+
+ uint16x8_t row0_mask =
+ vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row0, 15)),
+ vnegq_s16(row0_lz));
+ uint16x8_t row1_mask =
+ vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row1, 15)),
+ vnegq_s16(row1_lz));
+ uint16x8_t row2_mask =
+ vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row2, 15)),
+ vnegq_s16(row2_lz));
+ uint16x8_t row3_mask =
+ vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row3, 15)),
+ vnegq_s16(row3_lz));
+
+ uint16x8_t row0_diff = veorq_u16(vreinterpretq_u16_s16(abs_row0), row0_mask);
+ uint16x8_t row1_diff = veorq_u16(vreinterpretq_u16_s16(abs_row1), row1_mask);
+ uint16x8_t row2_diff = veorq_u16(vreinterpretq_u16_s16(abs_row2), row2_mask);
+ uint16x8_t row3_diff = veorq_u16(vreinterpretq_u16_s16(abs_row3), row3_mask);
+
+ /* Store diff values for rows 0, 1, 2, and 3. */
+ vst1q_u16(block_diff + 0 * DCTSIZE, row0_diff);
+ vst1q_u16(block_diff + 1 * DCTSIZE, row1_diff);
+ vst1q_u16(block_diff + 2 * DCTSIZE, row2_diff);
+ vst1q_u16(block_diff + 3 * DCTSIZE, row3_diff);
+
+ /* Load last four rows of coefficients from DCT block in zig-zag order. */
+ int16x8_t row4 = vld1q_dup_s16(block + 35);
+ row4 = vld1q_lane_s16(block + 42, row4, 1);
+ row4 = vld1q_lane_s16(block + 49, row4, 2);
+ row4 = vld1q_lane_s16(block + 56, row4, 3);
+ row4 = vld1q_lane_s16(block + 57, row4, 4);
+ row4 = vld1q_lane_s16(block + 50, row4, 5);
+ row4 = vld1q_lane_s16(block + 43, row4, 6);
+ row4 = vld1q_lane_s16(block + 36, row4, 7);
+
+ int16x8_t row5 = vld1q_dup_s16(block + 29);
+ row5 = vld1q_lane_s16(block + 22, row5, 1);
+ row5 = vld1q_lane_s16(block + 15, row5, 2);
+ row5 = vld1q_lane_s16(block + 23, row5, 3);
+ row5 = vld1q_lane_s16(block + 30, row5, 4);
+ row5 = vld1q_lane_s16(block + 37, row5, 5);
+ row5 = vld1q_lane_s16(block + 44, row5, 6);
+ row5 = vld1q_lane_s16(block + 51, row5, 7);
+
+ int16x8_t row6 = vld1q_dup_s16(block + 58);
+ row6 = vld1q_lane_s16(block + 59, row6, 1);
+ row6 = vld1q_lane_s16(block + 52, row6, 2);
+ row6 = vld1q_lane_s16(block + 45, row6, 3);
+ row6 = vld1q_lane_s16(block + 38, row6, 4);
+ row6 = vld1q_lane_s16(block + 31, row6, 5);
+ row6 = vld1q_lane_s16(block + 39, row6, 6);
+ row6 = vld1q_lane_s16(block + 46, row6, 7);
+
+ int16x8_t row7 = vld1q_dup_s16(block + 53);
+ row7 = vld1q_lane_s16(block + 60, row7, 1);
+ row7 = vld1q_lane_s16(block + 61, row7, 2);
+ row7 = vld1q_lane_s16(block + 54, row7, 3);
+ row7 = vld1q_lane_s16(block + 47, row7, 4);
+ row7 = vld1q_lane_s16(block + 55, row7, 5);
+ row7 = vld1q_lane_s16(block + 62, row7, 6);
+ row7 = vld1q_lane_s16(block + 63, row7, 7);
+
+ int16x8_t abs_row4 = vabsq_s16(row4);
+ int16x8_t abs_row5 = vabsq_s16(row5);
+ int16x8_t abs_row6 = vabsq_s16(row6);
+ int16x8_t abs_row7 = vabsq_s16(row7);
+
+ int16x8_t row4_lz = vclzq_s16(abs_row4);
+ int16x8_t row5_lz = vclzq_s16(abs_row5);
+ int16x8_t row6_lz = vclzq_s16(abs_row6);
+ int16x8_t row7_lz = vclzq_s16(abs_row7);
+
+ /* Compute number of bits required to represent each coefficient. */
+ uint8x8_t row4_nbits = vsub_u8(vdup_n_u8(16),
+ vmovn_u16(vreinterpretq_u16_s16(row4_lz)));
+ uint8x8_t row5_nbits = vsub_u8(vdup_n_u8(16),
+ vmovn_u16(vreinterpretq_u16_s16(row5_lz)));
+ uint8x8_t row6_nbits = vsub_u8(vdup_n_u8(16),
+ vmovn_u16(vreinterpretq_u16_s16(row6_lz)));
+ uint8x8_t row7_nbits = vsub_u8(vdup_n_u8(16),
+ vmovn_u16(vreinterpretq_u16_s16(row7_lz)));
+
+ vst1_u8(block_nbits + 4 * DCTSIZE, row4_nbits);
+ vst1_u8(block_nbits + 5 * DCTSIZE, row5_nbits);
+ vst1_u8(block_nbits + 6 * DCTSIZE, row6_nbits);
+ vst1_u8(block_nbits + 7 * DCTSIZE, row7_nbits);
+
+ uint16x8_t row4_mask =
+ vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row4, 15)),
+ vnegq_s16(row4_lz));
+ uint16x8_t row5_mask =
+ vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row5, 15)),
+ vnegq_s16(row5_lz));
+ uint16x8_t row6_mask =
+ vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row6, 15)),
+ vnegq_s16(row6_lz));
+ uint16x8_t row7_mask =
+ vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row7, 15)),
+ vnegq_s16(row7_lz));
+
+ uint16x8_t row4_diff = veorq_u16(vreinterpretq_u16_s16(abs_row4), row4_mask);
+ uint16x8_t row5_diff = veorq_u16(vreinterpretq_u16_s16(abs_row5), row5_mask);
+ uint16x8_t row6_diff = veorq_u16(vreinterpretq_u16_s16(abs_row6), row6_mask);
+ uint16x8_t row7_diff = veorq_u16(vreinterpretq_u16_s16(abs_row7), row7_mask);
+
+ /* Store diff values for rows 4, 5, 6, and 7. */
+ vst1q_u16(block_diff + 4 * DCTSIZE, row4_diff);
+ vst1q_u16(block_diff + 5 * DCTSIZE, row5_diff);
+ vst1q_u16(block_diff + 6 * DCTSIZE, row6_diff);
+ vst1q_u16(block_diff + 7 * DCTSIZE, row7_diff);
+
+ /* Construct bitmap to accelerate encoding of AC coefficients. A set bit
+ * means that the corresponding coefficient != 0.
+ */
+ uint8x8_t row0_nbits_gt0 = vcgt_u8(row0_nbits, vdup_n_u8(0));
+ uint8x8_t row1_nbits_gt0 = vcgt_u8(row1_nbits, vdup_n_u8(0));
+ uint8x8_t row2_nbits_gt0 = vcgt_u8(row2_nbits, vdup_n_u8(0));
+ uint8x8_t row3_nbits_gt0 = vcgt_u8(row3_nbits, vdup_n_u8(0));
+ uint8x8_t row4_nbits_gt0 = vcgt_u8(row4_nbits, vdup_n_u8(0));
+ uint8x8_t row5_nbits_gt0 = vcgt_u8(row5_nbits, vdup_n_u8(0));
+ uint8x8_t row6_nbits_gt0 = vcgt_u8(row6_nbits, vdup_n_u8(0));
+ uint8x8_t row7_nbits_gt0 = vcgt_u8(row7_nbits, vdup_n_u8(0));
+
+ /* { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 } */
+ const uint8x8_t bitmap_mask =
+ vreinterpret_u8_u64(vmov_n_u64(0x0102040810204080));
+
+ row0_nbits_gt0 = vand_u8(row0_nbits_gt0, bitmap_mask);
+ row1_nbits_gt0 = vand_u8(row1_nbits_gt0, bitmap_mask);
+ row2_nbits_gt0 = vand_u8(row2_nbits_gt0, bitmap_mask);
+ row3_nbits_gt0 = vand_u8(row3_nbits_gt0, bitmap_mask);
+ row4_nbits_gt0 = vand_u8(row4_nbits_gt0, bitmap_mask);
+ row5_nbits_gt0 = vand_u8(row5_nbits_gt0, bitmap_mask);
+ row6_nbits_gt0 = vand_u8(row6_nbits_gt0, bitmap_mask);
+ row7_nbits_gt0 = vand_u8(row7_nbits_gt0, bitmap_mask);
+
+ uint8x8_t bitmap_rows_10 = vpadd_u8(row1_nbits_gt0, row0_nbits_gt0);
+ uint8x8_t bitmap_rows_32 = vpadd_u8(row3_nbits_gt0, row2_nbits_gt0);
+ uint8x8_t bitmap_rows_54 = vpadd_u8(row5_nbits_gt0, row4_nbits_gt0);
+ uint8x8_t bitmap_rows_76 = vpadd_u8(row7_nbits_gt0, row6_nbits_gt0);
+ uint8x8_t bitmap_rows_3210 = vpadd_u8(bitmap_rows_32, bitmap_rows_10);
+ uint8x8_t bitmap_rows_7654 = vpadd_u8(bitmap_rows_76, bitmap_rows_54);
+ uint8x8_t bitmap = vpadd_u8(bitmap_rows_7654, bitmap_rows_3210);
+
+ /* Shift left to remove DC bit. */
+ bitmap = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(bitmap), 1));
+ /* Move bitmap to 32-bit scalar registers. */
+ uint32_t bitmap_1_32 = vget_lane_u32(vreinterpret_u32_u8(bitmap), 1);
+ uint32_t bitmap_33_63 = vget_lane_u32(vreinterpret_u32_u8(bitmap), 0);
+
+ /* Set up state and bit buffer for output bitstream. */
+ working_state *state_ptr = (working_state *)state;
+ int free_bits = state_ptr->cur.free_bits;
+ size_t put_buffer = state_ptr->cur.put_buffer;
+
+ /* Encode DC coefficient. */
+
+ unsigned int nbits = block_nbits[0];
+ /* Emit Huffman-coded symbol and additional diff bits. */
+ unsigned int diff = block_diff[0];
+ PUT_CODE(dctbl->ehufco[nbits], dctbl->ehufsi[nbits], diff)
+
+ /* Encode AC coefficients. */
+
+ unsigned int r = 0; /* r = run length of zeros */
+ unsigned int i = 1; /* i = number of coefficients encoded */
+ /* Code and size information for a run length of 16 zero coefficients */
+ const unsigned int code_0xf0 = actbl->ehufco[0xf0];
+ const unsigned int size_0xf0 = actbl->ehufsi[0xf0];
+
+ while (bitmap_1_32 != 0) {
+ r = BUILTIN_CLZ(bitmap_1_32);
+ i += r;
+ bitmap_1_32 <<= r;
+ nbits = block_nbits[i];
+ diff = block_diff[i];
+ while (r > 15) {
+ /* If run length > 15, emit special run-length-16 codes. */
+ PUT_BITS(code_0xf0, size_0xf0)
+ r -= 16;
+ }
+ /* Emit Huffman symbol for run length / number of bits. (F.1.2.2.1) */
+ unsigned int rs = (r << 4) + nbits;
+ PUT_CODE(actbl->ehufco[rs], actbl->ehufsi[rs], diff)
+ i++;
+ bitmap_1_32 <<= 1;
+ }
+
+ r = 33 - i;
+ i = 33;
+
+ while (bitmap_33_63 != 0) {
+ unsigned int leading_zeros = BUILTIN_CLZ(bitmap_33_63);
+ r += leading_zeros;
+ i += leading_zeros;
+ bitmap_33_63 <<= leading_zeros;
+ nbits = block_nbits[i];
+ diff = block_diff[i];
+ while (r > 15) {
+ /* If run length > 15, emit special run-length-16 codes. */
+ PUT_BITS(code_0xf0, size_0xf0)
+ r -= 16;
+ }
+ /* Emit Huffman symbol for run length / number of bits. (F.1.2.2.1) */
+ unsigned int rs = (r << 4) + nbits;
+ PUT_CODE(actbl->ehufco[rs], actbl->ehufsi[rs], diff)
+ r = 0;
+ i++;
+ bitmap_33_63 <<= 1;
+ }
+
+ /* If the last coefficient(s) were zero, emit an end-of-block (EOB) code.
+ * The value of RS for the EOB code is 0.
+ */
+ if (i != 64) {
+ PUT_BITS(actbl->ehufco[0], actbl->ehufsi[0])
+ }
+
+ state_ptr->cur.put_buffer = put_buffer;
+ state_ptr->cur.free_bits = free_bits;
+
+ return buffer;
+}
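
The bitmap construction above lets the encoder locate each nonzero AC coefficient with one count-leading-zeros step instead of testing coefficients one at a time. A minimal standalone sketch of that run-skipping loop (hypothetical walk_ac(), using GCC/Clang's __builtin_clz in place of the BUILTIN_CLZ macro and a 32-bit map covering positions 1..32):

#include <stdint.h>
#include <stdio.h>

/* Sketch, outside the library: a set bit marks a nonzero AC coefficient;
 * counting leading zeros yields the run length of zeros in one step. */
static void walk_ac(uint32_t bitmap /* bit 31 = coefficient 1 */)
{
  unsigned int i = 1;                    /* first AC coefficient */
  while (bitmap != 0) {
    unsigned int run = (unsigned int)__builtin_clz(bitmap);
    i += run;                            /* skip the zero run */
    bitmap <<= run;                      /* nonzero bit now at position 31 */
    printf("coefficient %u follows a run of %u zeros\n", i, run);
    i++;
    bitmap <<= 1;                        /* consume the nonzero bit */
  }
}

int main(void)
{
  walk_ac(0x90400000u);                  /* nonzero at positions 1, 4, 10 */
  return 0;
}

In the real encoder each located coefficient then feeds the run length and bit count into the Huffman symbol RS = (r << 4) + nbits, as the F.1.2.2.1 comments above note.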
diff --git a/media/libjpeg/simd/arm/aarch32/jsimd.c b/media/libjpeg/simd/arm/aarch32/jsimd.c
new file mode 100644
index 0000000000..e3adf23d50
--- /dev/null
+++ b/media/libjpeg/simd/arm/aarch32/jsimd.c
@@ -0,0 +1,980 @@
+/*
+ * jsimd_arm.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2011, Nokia Corporation and/or its subsidiary(-ies).
+ * Copyright (C) 2009-2011, 2013-2014, 2016, 2018, 2022, D. R. Commander.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
+ * Copyright (C) 2019, Google LLC.
+ * Copyright (C) 2020, Arm Limited.
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on a
+ * 32-bit Arm architecture.
+ */
+
+#define JPEG_INTERNALS
+#include "../../../jinclude.h"
+#include "../../../jpeglib.h"
+#include "../../../jsimd.h"
+#include "../../../jdct.h"
+#include "../../../jsimddct.h"
+#include "../../jsimd.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+
+static unsigned int simd_support = ~0;
+static unsigned int simd_huffman = 1;
+
+#if !defined(__ARM_NEON__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__))
+
+#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT (1024 * 1024)
+
+LOCAL(int)
+check_feature(char *buffer, char *feature)
+{
+ char *p;
+
+ if (*feature == 0)
+ return 0;
+ if (strncmp(buffer, "Features", 8) != 0)
+ return 0;
+ buffer += 8;
+ while (isspace(*buffer))
+ buffer++;
+
+ /* Check if 'feature' is present in the buffer as a separate word */
+ while ((p = strstr(buffer, feature))) {
+ if (p > buffer && !isspace(*(p - 1))) {
+ buffer++;
+ continue;
+ }
+ p += strlen(feature);
+ if (*p != 0 && !isspace(*p)) {
+ buffer++;
+ continue;
+ }
+ return 1;
+ }
+ return 0;
+}
+
+LOCAL(int)
+parse_proc_cpuinfo(int bufsize)
+{
+ char *buffer = (char *)malloc(bufsize);
+ FILE *fd;
+
+ simd_support = 0;
+
+ if (!buffer)
+ return 0;
+
+ fd = fopen("/proc/cpuinfo", "r");
+ if (fd) {
+ while (fgets(buffer, bufsize, fd)) {
+ if (!strchr(buffer, '\n') && !feof(fd)) {
+ /* "impossible" happened - insufficient size of the buffer! */
+ fclose(fd);
+ free(buffer);
+ return 0;
+ }
+ if (check_feature(buffer, "neon"))
+ simd_support |= JSIMD_NEON;
+ }
+ fclose(fd);
+ }
+ free(buffer);
+ return 1;
+}
+
+#endif
+
+/*
+ * Check what SIMD accelerations are supported.
+ *
+ * FIXME: This code is racy in a multi-threaded environment.
+ */
+LOCAL(void)
+init_simd(void)
+{
+#ifndef NO_GETENV
+ char env[2] = { 0 };
+#endif
+#if !defined(__ARM_NEON__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__))
+ int bufsize = 1024; /* an initial guess for the line buffer size limit */
+#endif
+
+ if (simd_support != ~0U)
+ return;
+
+ simd_support = 0;
+
+#if defined(__ARM_NEON__)
+ simd_support |= JSIMD_NEON;
+#elif defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
+  /* Even if the -mcpu/-mfpu options passed to gcc do not enable Neon
+   * globally, we may still be able to use it, so perform runtime detection
+   * by parsing /proc/cpuinfo on Linux/Android. */
+ while (!parse_proc_cpuinfo(bufsize)) {
+ bufsize *= 2;
+ if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT)
+ break;
+ }
+#endif
+
+#ifndef NO_GETENV
+ /* Force different settings through environment variables */
+ if (!GETENV_S(env, 2, "JSIMD_FORCENEON") && !strcmp(env, "1"))
+ simd_support = JSIMD_NEON;
+ if (!GETENV_S(env, 2, "JSIMD_FORCENONE") && !strcmp(env, "1"))
+ simd_support = 0;
+ if (!GETENV_S(env, 2, "JSIMD_NOHUFFENC") && !strcmp(env, "1"))
+ simd_huffman = 0;
+#endif
+}
+
+GLOBAL(int)
+jsimd_can_rgb_ycc(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_rgb_gray(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb565(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
+{
+ void (*neonfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+ switch (cinfo->in_color_space) {
+ case JCS_EXT_RGB:
+ neonfct = jsimd_extrgb_ycc_convert_neon;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ neonfct = jsimd_extrgbx_ycc_convert_neon;
+ break;
+ case JCS_EXT_BGR:
+ neonfct = jsimd_extbgr_ycc_convert_neon;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ neonfct = jsimd_extbgrx_ycc_convert_neon;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ neonfct = jsimd_extxbgr_ycc_convert_neon;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ neonfct = jsimd_extxrgb_ycc_convert_neon;
+ break;
+ default:
+ neonfct = jsimd_extrgb_ycc_convert_neon;
+ break;
+ }
+
+ neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
+{
+ void (*neonfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+ switch (cinfo->in_color_space) {
+ case JCS_EXT_RGB:
+ neonfct = jsimd_extrgb_gray_convert_neon;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ neonfct = jsimd_extrgbx_gray_convert_neon;
+ break;
+ case JCS_EXT_BGR:
+ neonfct = jsimd_extbgr_gray_convert_neon;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ neonfct = jsimd_extbgrx_gray_convert_neon;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ neonfct = jsimd_extxbgr_gray_convert_neon;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ neonfct = jsimd_extxrgb_gray_convert_neon;
+ break;
+ default:
+ neonfct = jsimd_extrgb_gray_convert_neon;
+ break;
+ }
+
+ neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
+{
+ void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+
+ switch (cinfo->out_color_space) {
+ case JCS_EXT_RGB:
+ neonfct = jsimd_ycc_extrgb_convert_neon;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ neonfct = jsimd_ycc_extrgbx_convert_neon;
+ break;
+ case JCS_EXT_BGR:
+ neonfct = jsimd_ycc_extbgr_convert_neon;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ neonfct = jsimd_ycc_extbgrx_convert_neon;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ neonfct = jsimd_ycc_extxbgr_convert_neon;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ neonfct = jsimd_ycc_extxrgb_convert_neon;
+ break;
+ default:
+ neonfct = jsimd_ycc_extrgb_convert_neon;
+ break;
+ }
+
+ neonfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
+{
+ jsimd_ycc_rgb565_convert_neon(cinfo->output_width, input_buf, input_row,
+ output_buf, num_rows);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_downsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_downsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+ jsimd_h2v2_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor,
+ compptr->v_samp_factor, compptr->width_in_blocks,
+ input_data, output_data);
+}
+
+GLOBAL(void)
+jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+ jsimd_h2v1_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor,
+ compptr->v_samp_factor, compptr->width_in_blocks,
+ input_data, output_data);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h2v2_upsample_neon(cinfo->max_v_samp_factor, cinfo->output_width,
+ input_data, output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h2v1_upsample_neon(cinfo->max_v_samp_factor, cinfo->output_width,
+ input_data, output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h1v2_fancy_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h2v2_fancy_upsample_neon(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data,
+ output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h2v1_fancy_upsample_neon(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data,
+ output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h1v2_fancy_upsample_neon(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data,
+ output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+ void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+ switch (cinfo->out_color_space) {
+ case JCS_EXT_RGB:
+ neonfct = jsimd_h2v2_extrgb_merged_upsample_neon;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ neonfct = jsimd_h2v2_extrgbx_merged_upsample_neon;
+ break;
+ case JCS_EXT_BGR:
+ neonfct = jsimd_h2v2_extbgr_merged_upsample_neon;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ neonfct = jsimd_h2v2_extbgrx_merged_upsample_neon;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ neonfct = jsimd_h2v2_extxbgr_merged_upsample_neon;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ neonfct = jsimd_h2v2_extxrgb_merged_upsample_neon;
+ break;
+ default:
+ neonfct = jsimd_h2v2_extrgb_merged_upsample_neon;
+ break;
+ }
+
+ neonfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(void)
+jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+ void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+ switch (cinfo->out_color_space) {
+ case JCS_EXT_RGB:
+ neonfct = jsimd_h2v1_extrgb_merged_upsample_neon;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ neonfct = jsimd_h2v1_extrgbx_merged_upsample_neon;
+ break;
+ case JCS_EXT_BGR:
+ neonfct = jsimd_h2v1_extbgr_merged_upsample_neon;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ neonfct = jsimd_h2v1_extbgrx_merged_upsample_neon;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ neonfct = jsimd_h2v1_extxbgr_merged_upsample_neon;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ neonfct = jsimd_h2v1_extxrgb_merged_upsample_neon;
+ break;
+ default:
+ neonfct = jsimd_h2v1_extrgb_merged_upsample_neon;
+ break;
+ }
+
+ neonfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(int)
+jsimd_can_convsamp(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_convsamp_float(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
+ DCTELEM *workspace)
+{
+ jsimd_convsamp_neon(sample_data, start_col, workspace);
+}
+
+GLOBAL(void)
+jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
+ FAST_FLOAT *workspace)
+{
+}
+
+GLOBAL(int)
+jsimd_can_fdct_islow(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_ifast(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_float(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_fdct_islow(DCTELEM *data)
+{
+ jsimd_fdct_islow_neon(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_ifast(DCTELEM *data)
+{
+ jsimd_fdct_ifast_neon(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_float(FAST_FLOAT *data)
+{
+}
+
+GLOBAL(int)
+jsimd_can_quantize(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_quantize_float(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
+{
+ jsimd_quantize_neon(coef_block, divisors, workspace);
+}
+
+GLOBAL(void)
+jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+ FAST_FLOAT *workspace)
+{
+}
+
+GLOBAL(int)
+jsimd_can_idct_2x2(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(ISLOW_MULT_TYPE) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_4x4(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(ISLOW_MULT_TYPE) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ jsimd_idct_2x2_neon(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ jsimd_idct_4x4_neon(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(int)
+jsimd_can_idct_islow(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(ISLOW_MULT_TYPE) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_ifast(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(IFAST_MULT_TYPE) != 2)
+ return 0;
+ if (IFAST_SCALE_BITS != 2)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_float(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ jsimd_idct_islow_neon(compptr->dct_table, coef_block, output_buf,
+ output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ jsimd_idct_ifast_neon(compptr->dct_table, coef_block, output_buf,
+ output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+}
+
+GLOBAL(int)
+jsimd_can_huff_encode_one_block(void)
+{
+ init_simd();
+
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_NEON && simd_huffman)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(JOCTET *)
+jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
+ int last_dc_val, c_derived_tbl *dctbl,
+ c_derived_tbl *actbl)
+{
+ return jsimd_huff_encode_one_block_neon(state, buffer, block, last_dc_val,
+ dctbl, actbl);
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_first_prepare(void)
+{
+ init_simd();
+
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, JCOEF *values, size_t *zerobits)
+{
+ jsimd_encode_mcu_AC_first_prepare_neon(block, jpeg_natural_order_start,
+ Sl, Al, values, zerobits);
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_refine_prepare(void)
+{
+ init_simd();
+
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, JCOEF *absvalues, size_t *bits)
+{
+ return jsimd_encode_mcu_AC_refine_prepare_neon(block,
+ jpeg_natural_order_start, Sl,
+ Al, absvalues, bits);
+}
diff --git a/media/libjpeg/simd/arm/aarch32/jsimd_neon.S b/media/libjpeg/simd/arm/aarch32/jsimd_neon.S
new file mode 100644
index 0000000000..7e1e2b1451
--- /dev/null
+++ b/media/libjpeg/simd/arm/aarch32/jsimd_neon.S
@@ -0,0 +1,1200 @@
+/*
+ * Armv7 Neon optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies).
+ * All Rights Reserved.
+ * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
+ * Copyright (C) 2014, Siarhei Siamashka. All Rights Reserved.
+ * Copyright (C) 2014, Linaro Limited. All Rights Reserved.
+ * Copyright (C) 2015, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack, "", %progbits /* mark stack as non-executable */
+#endif
+
+.text
+.fpu neon
+.arch armv7a
+.object_arch armv4
+.arm
+.syntax unified
+
+
+/*****************************************************************************/
+
+/* Supplementary macro for setting function attributes */
+.macro asm_function fname
+#ifdef __APPLE__
+ .private_extern _\fname
+ .globl _\fname
+_\fname:
+#else
+ .global \fname
+#ifdef __ELF__
+ .hidden \fname
+ .type \fname, %function
+#endif
+\fname:
+#endif
+.endm
+
+
+#define CENTERJSAMPLE 128
+
+/*****************************************************************************/
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients.
+ *
+ * GLOBAL(void)
+ * jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block,
+ * JSAMPARRAY output_buf, JDIMENSION output_col)
+ */
+
+#define FIX_0_298631336 (2446)
+#define FIX_0_390180644 (3196)
+#define FIX_0_541196100 (4433)
+#define FIX_0_765366865 (6270)
+#define FIX_0_899976223 (7373)
+#define FIX_1_175875602 (9633)
+#define FIX_1_501321110 (12299)
+#define FIX_1_847759065 (15137)
+#define FIX_1_961570560 (16069)
+#define FIX_2_053119869 (16819)
+#define FIX_2_562915447 (20995)
+#define FIX_3_072711026 (25172)
+
+#define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560)
+#define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644)
+#define FIX_0_541196100_MINUS_1_847759065 (FIX_0_541196100 - FIX_1_847759065)
+#define FIX_3_072711026_MINUS_2_562915447 (FIX_3_072711026 - FIX_2_562915447)
+#define FIX_0_298631336_MINUS_0_899976223 (FIX_0_298631336 - FIX_0_899976223)
+#define FIX_1_501321110_MINUS_0_899976223 (FIX_1_501321110 - FIX_0_899976223)
+#define FIX_2_053119869_MINUS_2_562915447 (FIX_2_053119869 - FIX_2_562915447)
+#define FIX_0_541196100_PLUS_0_765366865 (FIX_0_541196100 + FIX_0_765366865)
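+
+/* The FIX_* values above are the ISLOW IDCT cosine multipliers in 13-bit
+ * fixed point, i.e. FIX(x) = round(x * 2^13). A minimal sketch of how such
+ * a table is derived (illustration only; this FIX() helper is hypothetical
+ * and not part of the upstream sources):
+ *
+ * #define CONST_BITS 13
+ * #define FIX(x) ((int)((x) * (1 << CONST_BITS) + 0.5))
+ *
+ * FIX(0.298631336) == 2446 and FIX(1.175875602) == 9633, matching the
+ * constants above. The MINUS/PLUS combinations pre-fold constant pairs so
+ * that shared products can be computed with one multiply-accumulate.
+ */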
+
+/*
+ * Reference SIMD-friendly 1-D ISLOW iDCT C implementation.
+ * Uses some ideas from the comments in 'simd/jiss2int-64.asm'
+ */
+#define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7) { \
+ DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \
+ JLONG q1, q2, q3, q4, q5, q6, q7; \
+ JLONG tmp11_plus_tmp2, tmp11_minus_tmp2; \
+ \
+ /* 1-D iDCT input data */ \
+ row0 = xrow0; \
+ row1 = xrow1; \
+ row2 = xrow2; \
+ row3 = xrow3; \
+ row4 = xrow4; \
+ row5 = xrow5; \
+ row6 = xrow6; \
+ row7 = xrow7; \
+ \
+ q5 = row7 + row3; \
+ q4 = row5 + row1; \
+ q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \
+ MULTIPLY(q4, FIX_1_175875602); \
+ q7 = MULTIPLY(q5, FIX_1_175875602) + \
+ MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \
+ q2 = MULTIPLY(row2, FIX_0_541196100) + \
+ MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \
+ q4 = q6; \
+ q3 = ((JLONG)row0 - (JLONG)row4) << 13; \
+ q6 += MULTIPLY(row5, -FIX_2_562915447) + \
+ MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \
+ /* now we can use q1 (reloadable constants have been used up) */ \
+ q1 = q3 + q2; \
+ q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \
+ MULTIPLY(row1, -FIX_0_899976223); \
+ q5 = q7; \
+ q1 = q1 + q6; \
+ q7 += MULTIPLY(row7, -FIX_0_899976223) + \
+ MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \
+ \
+ /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \
+ tmp11_plus_tmp2 = q1; \
+ row1 = 0; \
+ \
+ q1 = q1 - q6; \
+ q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \
+ MULTIPLY(row3, -FIX_2_562915447); \
+ q1 = q1 - q6; \
+ q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \
+ MULTIPLY(row6, FIX_0_541196100); \
+ q3 = q3 - q2; \
+ \
+ /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \
+ tmp11_minus_tmp2 = q1; \
+ \
+ q1 = ((JLONG)row0 + (JLONG)row4) << 13; \
+ q2 = q1 + q6; \
+ q1 = q1 - q6; \
+ \
+ /* pick up the results */ \
+ tmp0 = q4; \
+ tmp1 = q5; \
+ tmp2 = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \
+ tmp3 = q7; \
+ tmp10 = q2; \
+ tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \
+ tmp12 = q3; \
+ tmp13 = q1; \
+}
+
+#define XFIX_0_899976223 d0[0]
+#define XFIX_0_541196100 d0[1]
+#define XFIX_2_562915447 d0[2]
+#define XFIX_0_298631336_MINUS_0_899976223 d0[3]
+#define XFIX_1_501321110_MINUS_0_899976223 d1[0]
+#define XFIX_2_053119869_MINUS_2_562915447 d1[1]
+#define XFIX_0_541196100_PLUS_0_765366865 d1[2]
+#define XFIX_1_175875602 d1[3]
+#define XFIX_1_175875602_MINUS_0_390180644 d2[0]
+#define XFIX_0_541196100_MINUS_1_847759065 d2[1]
+#define XFIX_3_072711026_MINUS_2_562915447 d2[2]
+#define XFIX_1_175875602_MINUS_1_961570560 d2[3]
+
+.balign 16
+jsimd_idct_islow_neon_consts:
+ .short FIX_0_899976223 /* d0[0] */
+ .short FIX_0_541196100 /* d0[1] */
+ .short FIX_2_562915447 /* d0[2] */
+ .short FIX_0_298631336_MINUS_0_899976223 /* d0[3] */
+ .short FIX_1_501321110_MINUS_0_899976223 /* d1[0] */
+ .short FIX_2_053119869_MINUS_2_562915447 /* d1[1] */
+ .short FIX_0_541196100_PLUS_0_765366865 /* d1[2] */
+ .short FIX_1_175875602 /* d1[3] */
+ /* reloadable constants */
+ .short FIX_1_175875602_MINUS_0_390180644 /* d2[0] */
+ .short FIX_0_541196100_MINUS_1_847759065 /* d2[1] */
+ .short FIX_3_072711026_MINUS_2_562915447 /* d2[2] */
+ .short FIX_1_175875602_MINUS_1_961570560 /* d2[3] */
+
+asm_function jsimd_idct_islow_neon
+
+ DCT_TABLE .req r0
+ COEF_BLOCK .req r1
+ OUTPUT_BUF .req r2
+ OUTPUT_COL .req r3
+ TMP1 .req r0
+ TMP2 .req r1
+ TMP3 .req r2
+ TMP4 .req ip
+
+ ROW0L .req d16
+ ROW0R .req d17
+ ROW1L .req d18
+ ROW1R .req d19
+ ROW2L .req d20
+ ROW2R .req d21
+ ROW3L .req d22
+ ROW3R .req d23
+ ROW4L .req d24
+ ROW4R .req d25
+ ROW5L .req d26
+ ROW5R .req d27
+ ROW6L .req d28
+ ROW6R .req d29
+ ROW7L .req d30
+ ROW7R .req d31
+
+ /* Load and dequantize coefficients into Neon registers
+ * with the following allocation:
+ * 0 1 2 3 | 4 5 6 7
+ * ---------+--------
+ * 0 | d16 | d17 ( q8 )
+ * 1 | d18 | d19 ( q9 )
+ * 2 | d20 | d21 ( q10 )
+ * 3 | d22 | d23 ( q11 )
+ * 4 | d24 | d25 ( q12 )
+ * 5 | d26 | d27 ( q13 )
+ * 6 | d28 | d29 ( q14 )
+ * 7 | d30 | d31 ( q15 )
+ */
+ adr ip, jsimd_idct_islow_neon_consts
+ vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
+ vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
+ vld1.16 {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
+ vmul.s16 q8, q8, q0
+ vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
+ vmul.s16 q9, q9, q1
+ vld1.16 {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
+ vmul.s16 q10, q10, q2
+ vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
+ vmul.s16 q11, q11, q3
+ vld1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]
+ vmul.s16 q12, q12, q0
+ vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
+ vmul.s16 q14, q14, q2
+ vmul.s16 q13, q13, q1
+ vld1.16 {d0, d1, d2, d3}, [ip, :128] /* load constants */
+ add ip, ip, #16
+ vmul.s16 q15, q15, q3
+ vpush {d8 - d15} /* save Neon registers */
+ /* 1-D IDCT, pass 1, left 4x8 half */
+ vadd.s16 d4, ROW7L, ROW3L
+ vadd.s16 d5, ROW5L, ROW1L
+ vmull.s16 q6, d4, XFIX_1_175875602_MINUS_1_961570560
+ vmlal.s16 q6, d5, XFIX_1_175875602
+ vmull.s16 q7, d4, XFIX_1_175875602
+ /* Check for the zero coefficients in the right 4x8 half */
+ push {r4, r5}
+ vmlal.s16 q7, d5, XFIX_1_175875602_MINUS_0_390180644
+ vsubl.s16 q3, ROW0L, ROW4L
+ ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
+ vmull.s16 q2, ROW2L, XFIX_0_541196100
+ vmlal.s16 q2, ROW6L, XFIX_0_541196100_MINUS_1_847759065
+ orr r0, r4, r5
+ vmov q4, q6
+ vmlsl.s16 q6, ROW5L, XFIX_2_562915447
+ ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
+ vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
+ vshl.s32 q3, q3, #13
+ orr r0, r0, r4
+ vmlsl.s16 q4, ROW1L, XFIX_0_899976223
+ orr r0, r0, r5
+ vadd.s32 q1, q3, q2
+ ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
+ vmov q5, q7
+ vadd.s32 q1, q1, q6
+ orr r0, r0, r4
+ vmlsl.s16 q7, ROW7L, XFIX_0_899976223
+ orr r0, r0, r5
+ vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
+ vrshrn.s32 ROW1L, q1, #11
+ ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
+ vsub.s32 q1, q1, q6
+ vmlal.s16 q5, ROW5L, XFIX_2_053119869_MINUS_2_562915447
+ orr r0, r0, r4
+ vmlsl.s16 q5, ROW3L, XFIX_2_562915447
+ orr r0, r0, r5
+ vsub.s32 q1, q1, q6
+ vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
+ ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
+ vmlal.s16 q6, ROW6L, XFIX_0_541196100
+ vsub.s32 q3, q3, q2
+ orr r0, r0, r4
+ vrshrn.s32 ROW6L, q1, #11
+ orr r0, r0, r5
+ vadd.s32 q1, q3, q5
+ ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
+ vsub.s32 q3, q3, q5
+ vaddl.s16 q5, ROW0L, ROW4L
+ orr r0, r0, r4
+ vrshrn.s32 ROW2L, q1, #11
+ orr r0, r0, r5
+ vrshrn.s32 ROW5L, q3, #11
+ ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
+ vshl.s32 q5, q5, #13
+ vmlal.s16 q4, ROW7L, XFIX_0_298631336_MINUS_0_899976223
+ orr r0, r0, r4
+ vadd.s32 q2, q5, q6
+ orrs r0, r0, r5
+ vsub.s32 q1, q5, q6
+ vadd.s32 q6, q2, q7
+ ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
+ vsub.s32 q2, q2, q7
+ vadd.s32 q5, q1, q4
+ orr r0, r4, r5
+ vsub.s32 q3, q1, q4
+ pop {r4, r5}
+ vrshrn.s32 ROW7L, q2, #11
+ vrshrn.s32 ROW3L, q5, #11
+ vrshrn.s32 ROW0L, q6, #11
+ vrshrn.s32 ROW4L, q3, #11
+
+ beq 3f /* Branch to special handling for the sparse
+ right 4x8 half */
+
+ /* 1-D IDCT, pass 1, right 4x8 half */
+ vld1.s16 {d2}, [ip, :64] /* reload constants */
+ vadd.s16 d10, ROW7R, ROW3R
+ vadd.s16 d8, ROW5R, ROW1R
+ /* Transpose left 4x8 half */
+ vtrn.16 ROW6L, ROW7L
+ vmull.s16 q6, d10, XFIX_1_175875602_MINUS_1_961570560
+ vmlal.s16 q6, d8, XFIX_1_175875602
+ vtrn.16 ROW2L, ROW3L
+ vmull.s16 q7, d10, XFIX_1_175875602
+ vmlal.s16 q7, d8, XFIX_1_175875602_MINUS_0_390180644
+ vtrn.16 ROW0L, ROW1L
+ vsubl.s16 q3, ROW0R, ROW4R
+ vmull.s16 q2, ROW2R, XFIX_0_541196100
+ vmlal.s16 q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
+ vtrn.16 ROW4L, ROW5L
+ vmov q4, q6
+ vmlsl.s16 q6, ROW5R, XFIX_2_562915447
+ vmlal.s16 q6, ROW3R, XFIX_3_072711026_MINUS_2_562915447
+ vtrn.32 ROW1L, ROW3L
+ vshl.s32 q3, q3, #13
+ vmlsl.s16 q4, ROW1R, XFIX_0_899976223
+ vtrn.32 ROW4L, ROW6L
+ vadd.s32 q1, q3, q2
+ vmov q5, q7
+ vadd.s32 q1, q1, q6
+ vtrn.32 ROW0L, ROW2L
+ vmlsl.s16 q7, ROW7R, XFIX_0_899976223
+ vmlal.s16 q7, ROW1R, XFIX_1_501321110_MINUS_0_899976223
+ vrshrn.s32 ROW1R, q1, #11
+ vtrn.32 ROW5L, ROW7L
+ vsub.s32 q1, q1, q6
+ vmlal.s16 q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
+ vmlsl.s16 q5, ROW3R, XFIX_2_562915447
+ vsub.s32 q1, q1, q6
+ vmull.s16 q6, ROW2R, XFIX_0_541196100_PLUS_0_765366865
+ vmlal.s16 q6, ROW6R, XFIX_0_541196100
+ vsub.s32 q3, q3, q2
+ vrshrn.s32 ROW6R, q1, #11
+ vadd.s32 q1, q3, q5
+ vsub.s32 q3, q3, q5
+ vaddl.s16 q5, ROW0R, ROW4R
+ vrshrn.s32 ROW2R, q1, #11
+ vrshrn.s32 ROW5R, q3, #11
+ vshl.s32 q5, q5, #13
+ vmlal.s16 q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
+ vadd.s32 q2, q5, q6
+ vsub.s32 q1, q5, q6
+ vadd.s32 q6, q2, q7
+ vsub.s32 q2, q2, q7
+ vadd.s32 q5, q1, q4
+ vsub.s32 q3, q1, q4
+ vrshrn.s32 ROW7R, q2, #11
+ vrshrn.s32 ROW3R, q5, #11
+ vrshrn.s32 ROW0R, q6, #11
+ vrshrn.s32 ROW4R, q3, #11
+ /* Transpose right 4x8 half */
+ vtrn.16 ROW6R, ROW7R
+ vtrn.16 ROW2R, ROW3R
+ vtrn.16 ROW0R, ROW1R
+ vtrn.16 ROW4R, ROW5R
+ vtrn.32 ROW1R, ROW3R
+ vtrn.32 ROW4R, ROW6R
+ vtrn.32 ROW0R, ROW2R
+ vtrn.32 ROW5R, ROW7R
+
+1: /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */
+ vld1.s16 {d2}, [ip, :64] /* reload constants */
+ vmull.s16 q6, ROW1R, XFIX_1_175875602 /* ROW5L <-> ROW1R */
+ vmlal.s16 q6, ROW1L, XFIX_1_175875602
+ vmlal.s16 q6, ROW3R, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */
+ vmlal.s16 q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
+ vmull.s16 q7, ROW3R, XFIX_1_175875602 /* ROW7L <-> ROW3R */
+ vmlal.s16 q7, ROW3L, XFIX_1_175875602
+ vmlal.s16 q7, ROW1R, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */
+ vmlal.s16 q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
+ vsubl.s16 q3, ROW0L, ROW0R /* ROW4L <-> ROW0R */
+ vmull.s16 q2, ROW2L, XFIX_0_541196100
+ vmlal.s16 q2, ROW2R, XFIX_0_541196100_MINUS_1_847759065 /* ROW6L <-> ROW2R */
+ vmov q4, q6
+ vmlsl.s16 q6, ROW1R, XFIX_2_562915447 /* ROW5L <-> ROW1R */
+ vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
+ vshl.s32 q3, q3, #13
+ vmlsl.s16 q4, ROW1L, XFIX_0_899976223
+ vadd.s32 q1, q3, q2
+ vmov q5, q7
+ vadd.s32 q1, q1, q6
+ vmlsl.s16 q7, ROW3R, XFIX_0_899976223 /* ROW7L <-> ROW3R */
+ vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
+ vshrn.s32 ROW1L, q1, #16
+ vsub.s32 q1, q1, q6
+ vmlal.s16 q5, ROW1R, XFIX_2_053119869_MINUS_2_562915447 /* ROW5L <-> ROW1R */
+ vmlsl.s16 q5, ROW3L, XFIX_2_562915447
+ vsub.s32 q1, q1, q6
+ vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
+ vmlal.s16 q6, ROW2R, XFIX_0_541196100 /* ROW6L <-> ROW2R */
+ vsub.s32 q3, q3, q2
+ vshrn.s32 ROW2R, q1, #16 /* ROW6L <-> ROW2R */
+ vadd.s32 q1, q3, q5
+ vsub.s32 q3, q3, q5
+ vaddl.s16 q5, ROW0L, ROW0R /* ROW4L <-> ROW0R */
+ vshrn.s32 ROW2L, q1, #16
+ vshrn.s32 ROW1R, q3, #16 /* ROW5L <-> ROW1R */
+ vshl.s32 q5, q5, #13
+ vmlal.s16 q4, ROW3R, XFIX_0_298631336_MINUS_0_899976223 /* ROW7L <-> ROW3R */
+ vadd.s32 q2, q5, q6
+ vsub.s32 q1, q5, q6
+ vadd.s32 q6, q2, q7
+ vsub.s32 q2, q2, q7
+ vadd.s32 q5, q1, q4
+ vsub.s32 q3, q1, q4
+ vshrn.s32 ROW3R, q2, #16 /* ROW7L <-> ROW3R */
+ vshrn.s32 ROW3L, q5, #16
+ vshrn.s32 ROW0L, q6, #16
+ vshrn.s32 ROW0R, q3, #16 /* ROW4L <-> ROW0R */
+ /* 1-D IDCT, pass 2, right 4x8 half */
+ vld1.s16 {d2}, [ip, :64] /* reload constants */
+ vmull.s16 q6, ROW5R, XFIX_1_175875602
+ vmlal.s16 q6, ROW5L, XFIX_1_175875602 /* ROW5L <-> ROW1R */
+ vmlal.s16 q6, ROW7R, XFIX_1_175875602_MINUS_1_961570560
+ vmlal.s16 q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */
+ vmull.s16 q7, ROW7R, XFIX_1_175875602
+ vmlal.s16 q7, ROW7L, XFIX_1_175875602 /* ROW7L <-> ROW3R */
+ vmlal.s16 q7, ROW5R, XFIX_1_175875602_MINUS_0_390180644
+ vmlal.s16 q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */
+ vsubl.s16 q3, ROW4L, ROW4R /* ROW4L <-> ROW0R */
+ vmull.s16 q2, ROW6L, XFIX_0_541196100 /* ROW6L <-> ROW2R */
+ vmlal.s16 q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
+ vmov q4, q6
+ vmlsl.s16 q6, ROW5R, XFIX_2_562915447
+ vmlal.s16 q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447 /* ROW7L <-> ROW3R */
+ vshl.s32 q3, q3, #13
+ vmlsl.s16 q4, ROW5L, XFIX_0_899976223 /* ROW5L <-> ROW1R */
+ vadd.s32 q1, q3, q2
+ vmov q5, q7
+ vadd.s32 q1, q1, q6
+ vmlsl.s16 q7, ROW7R, XFIX_0_899976223
+ vmlal.s16 q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223 /* ROW5L <-> ROW1R */
+ vshrn.s32 ROW5L, q1, #16 /* ROW5L <-> ROW1R */
+ vsub.s32 q1, q1, q6
+ vmlal.s16 q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
+ vmlsl.s16 q5, ROW7L, XFIX_2_562915447 /* ROW7L <-> ROW3R */
+ vsub.s32 q1, q1, q6
+ vmull.s16 q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865 /* ROW6L <-> ROW2R */
+ vmlal.s16 q6, ROW6R, XFIX_0_541196100
+ vsub.s32 q3, q3, q2
+ vshrn.s32 ROW6R, q1, #16
+ vadd.s32 q1, q3, q5
+ vsub.s32 q3, q3, q5
+ vaddl.s16 q5, ROW4L, ROW4R /* ROW4L <-> ROW0R */
+ vshrn.s32 ROW6L, q1, #16 /* ROW6L <-> ROW2R */
+ vshrn.s32 ROW5R, q3, #16
+ vshl.s32 q5, q5, #13
+ vmlal.s16 q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
+ vadd.s32 q2, q5, q6
+ vsub.s32 q1, q5, q6
+ vadd.s32 q6, q2, q7
+ vsub.s32 q2, q2, q7
+ vadd.s32 q5, q1, q4
+ vsub.s32 q3, q1, q4
+ vshrn.s32 ROW7R, q2, #16
+ vshrn.s32 ROW7L, q5, #16 /* ROW7L <-> ROW3R */
+ vshrn.s32 ROW4L, q6, #16 /* ROW4L <-> ROW0R */
+ vshrn.s32 ROW4R, q3, #16
+
+2: /* Descale to 8-bit and range limit */
+ vqrshrn.s16 d16, q8, #2
+ vqrshrn.s16 d17, q9, #2
+ vqrshrn.s16 d18, q10, #2
+ vqrshrn.s16 d19, q11, #2
+ vpop {d8 - d15} /* restore Neon registers */
+ vqrshrn.s16 d20, q12, #2
+ /* Transpose the final 8-bit samples and do signed->unsigned conversion */
+ vtrn.16 q8, q9
+ vqrshrn.s16 d21, q13, #2
+ vqrshrn.s16 d22, q14, #2
+ vmov.u8 q0, #(CENTERJSAMPLE)
+ vqrshrn.s16 d23, q15, #2
+ vtrn.8 d16, d17
+ vtrn.8 d18, d19
+ vadd.u8 q8, q8, q0
+ vadd.u8 q9, q9, q0
+ vtrn.16 q10, q11
+ /* Store results to the output buffer */
+ ldmia OUTPUT_BUF!, {TMP1, TMP2}
+ add TMP1, TMP1, OUTPUT_COL
+ add TMP2, TMP2, OUTPUT_COL
+ vst1.8 {d16}, [TMP1]
+ vtrn.8 d20, d21
+ vst1.8 {d17}, [TMP2]
+ ldmia OUTPUT_BUF!, {TMP1, TMP2}
+ add TMP1, TMP1, OUTPUT_COL
+ add TMP2, TMP2, OUTPUT_COL
+ vst1.8 {d18}, [TMP1]
+ vadd.u8 q10, q10, q0
+ vst1.8 {d19}, [TMP2]
+ ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
+ add TMP1, TMP1, OUTPUT_COL
+ add TMP2, TMP2, OUTPUT_COL
+ add TMP3, TMP3, OUTPUT_COL
+ add TMP4, TMP4, OUTPUT_COL
+ vtrn.8 d22, d23
+ vst1.8 {d20}, [TMP1]
+ vadd.u8 q11, q11, q0
+ vst1.8 {d21}, [TMP2]
+ vst1.8 {d22}, [TMP3]
+ vst1.8 {d23}, [TMP4]
+ bx lr
+
+3: /* Left 4x8 half is done, right 4x8 half contains mostly zeros */
+
+ /* Transpose left 4x8 half */
+ vtrn.16 ROW6L, ROW7L
+ vtrn.16 ROW2L, ROW3L
+ vtrn.16 ROW0L, ROW1L
+ vtrn.16 ROW4L, ROW5L
+ vshl.s16 ROW0R, ROW0R, #2 /* PASS1_BITS */
+ vtrn.32 ROW1L, ROW3L
+ vtrn.32 ROW4L, ROW6L
+ vtrn.32 ROW0L, ROW2L
+ vtrn.32 ROW5L, ROW7L
+
+ cmp r0, #0
+ beq 4f /* Right 4x8 half has all zeros, go to 'sparse' second
+ pass */
+
+ /* Only row 0 is non-zero for the right 4x8 half */
+ vdup.s16 ROW1R, ROW0R[1]
+ vdup.s16 ROW2R, ROW0R[2]
+ vdup.s16 ROW3R, ROW0R[3]
+ vdup.s16 ROW4R, ROW0R[0]
+ vdup.s16 ROW5R, ROW0R[1]
+ vdup.s16 ROW6R, ROW0R[2]
+ vdup.s16 ROW7R, ROW0R[3]
+ vdup.s16 ROW0R, ROW0R[0]
+ b 1b /* Go to 'normal' second pass */
+
+4: /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
+ vld1.s16 {d2}, [ip, :64] /* reload constants */
+ vmull.s16 q6, ROW1L, XFIX_1_175875602
+ vmlal.s16 q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
+ vmull.s16 q7, ROW3L, XFIX_1_175875602
+ vmlal.s16 q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
+ vmull.s16 q2, ROW2L, XFIX_0_541196100
+ vshll.s16 q3, ROW0L, #13
+ vmov q4, q6
+ vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
+ vmlsl.s16 q4, ROW1L, XFIX_0_899976223
+ vadd.s32 q1, q3, q2
+ vmov q5, q7
+ vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
+ vadd.s32 q1, q1, q6
+ vadd.s32 q6, q6, q6
+ vmlsl.s16 q5, ROW3L, XFIX_2_562915447
+ vshrn.s32 ROW1L, q1, #16
+ vsub.s32 q1, q1, q6
+ vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
+ vsub.s32 q3, q3, q2
+ vshrn.s32 ROW2R, q1, #16 /* ROW6L <-> ROW2R */
+ vadd.s32 q1, q3, q5
+ vsub.s32 q3, q3, q5
+ vshll.s16 q5, ROW0L, #13
+ vshrn.s32 ROW2L, q1, #16
+ vshrn.s32 ROW1R, q3, #16 /* ROW5L <-> ROW1R */
+ vadd.s32 q2, q5, q6
+ vsub.s32 q1, q5, q6
+ vadd.s32 q6, q2, q7
+ vsub.s32 q2, q2, q7
+ vadd.s32 q5, q1, q4
+ vsub.s32 q3, q1, q4
+ vshrn.s32 ROW3R, q2, #16 /* ROW7L <-> ROW3R */
+ vshrn.s32 ROW3L, q5, #16
+ vshrn.s32 ROW0L, q6, #16
+ vshrn.s32 ROW0R, q3, #16 /* ROW4L <-> ROW0R */
+ /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */
+ vld1.s16 {d2}, [ip, :64] /* reload constants */
+ vmull.s16 q6, ROW5L, XFIX_1_175875602
+ vmlal.s16 q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560
+ vmull.s16 q7, ROW7L, XFIX_1_175875602
+ vmlal.s16 q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644
+ vmull.s16 q2, ROW6L, XFIX_0_541196100
+ vshll.s16 q3, ROW4L, #13
+ vmov q4, q6
+ vmlal.s16 q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447
+ vmlsl.s16 q4, ROW5L, XFIX_0_899976223
+ vadd.s32 q1, q3, q2
+ vmov q5, q7
+ vmlal.s16 q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223
+ vadd.s32 q1, q1, q6
+ vadd.s32 q6, q6, q6
+ vmlsl.s16 q5, ROW7L, XFIX_2_562915447
+ vshrn.s32 ROW5L, q1, #16 /* ROW5L <-> ROW1R */
+ vsub.s32 q1, q1, q6
+ vmull.s16 q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865
+ vsub.s32 q3, q3, q2
+ vshrn.s32 ROW6R, q1, #16
+ vadd.s32 q1, q3, q5
+ vsub.s32 q3, q3, q5
+ vshll.s16 q5, ROW4L, #13
+ vshrn.s32 ROW6L, q1, #16 /* ROW6L <-> ROW2R */
+ vshrn.s32 ROW5R, q3, #16
+ vadd.s32 q2, q5, q6
+ vsub.s32 q1, q5, q6
+ vadd.s32 q6, q2, q7
+ vsub.s32 q2, q2, q7
+ vadd.s32 q5, q1, q4
+ vsub.s32 q3, q1, q4
+ vshrn.s32 ROW7R, q2, #16
+ vshrn.s32 ROW7L, q5, #16 /* ROW7L <-> ROW3R */
+ vshrn.s32 ROW4L, q6, #16 /* ROW4L <-> ROW0R */
+ vshrn.s32 ROW4R, q3, #16
+ b 2b /* Go to epilogue */
+
+ .unreq DCT_TABLE
+ .unreq COEF_BLOCK
+ .unreq OUTPUT_BUF
+ .unreq OUTPUT_COL
+ .unreq TMP1
+ .unreq TMP2
+ .unreq TMP3
+ .unreq TMP4
+
+ .unreq ROW0L
+ .unreq ROW0R
+ .unreq ROW1L
+ .unreq ROW1R
+ .unreq ROW2L
+ .unreq ROW2R
+ .unreq ROW3L
+ .unreq ROW3R
+ .unreq ROW4L
+ .unreq ROW4R
+ .unreq ROW5L
+ .unreq ROW5R
+ .unreq ROW6L
+ .unreq ROW6R
+ .unreq ROW7L
+ .unreq ROW7R
+
+
+/*****************************************************************************/
+
+/*
+ * jsimd_idct_ifast_neon
+ *
+ * This function contains a fast, not-so-accurate integer implementation of
+ * the inverse DCT (Discrete Cosine Transform). It uses the same calculations
+ * and produces exactly the same output as IJG's original 'jpeg_idct_ifast'
+ * function from jidctfst.c.
+ *
+ * Normally a 1-D AAN DCT needs 5 multiplications and 29 additions, but in
+ * the Arm Neon case some extra additions are required because the VQDMULH
+ * instruction can't handle constants larger than 1. So expressions like
+ * "x * 1.082392200" have to be converted to "x * 0.082392200 + x", which
+ * introduces an extra addition. Overall, there are 6 extra additions per
+ * 1-D IDCT pass, for a total of 5 VQDMULH and 35 VADD/VSUB instructions.
+ */
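+
+/* A worked instance of the decomposition described above (illustration
+ * only): VQDMULH.S16 computes (a * b * 2) >> 16, i.e. it multiplies by a
+ * Q15 fraction, so a constant >= 1 is split into an integer part handled
+ * with plain additions plus a Q15 remainder. With the table below:
+ *
+ * x * 1.082392200 -> x + VQDMULH(x, 2688) (2688 / 32768 ~= 0.0820)
+ * x * 2.613125930 -> 2 * x + VQDMULH(x, 20096) (20096 / 32768 ~= 0.6133)
+ */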
+
+#define XFIX_1_082392200 d0[0]
+#define XFIX_1_414213562 d0[1]
+#define XFIX_1_847759065 d0[2]
+#define XFIX_2_613125930 d0[3]
+
+.balign 16
+jsimd_idct_ifast_neon_consts:
+ .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */
+ .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */
+ .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */
+ .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */
+
+asm_function jsimd_idct_ifast_neon
+
+ DCT_TABLE .req r0
+ COEF_BLOCK .req r1
+ OUTPUT_BUF .req r2
+ OUTPUT_COL .req r3
+ TMP1 .req r0
+ TMP2 .req r1
+ TMP3 .req r2
+ TMP4 .req ip
+
+ /* Load and dequantize coefficients into Neon registers
+ * with the following allocation:
+ * 0 1 2 3 | 4 5 6 7
+ * ---------+--------
+ * 0 | d16 | d17 ( q8 )
+ * 1 | d18 | d19 ( q9 )
+ * 2 | d20 | d21 ( q10 )
+ * 3 | d22 | d23 ( q11 )
+ * 4 | d24 | d25 ( q12 )
+ * 5 | d26 | d27 ( q13 )
+ * 6 | d28 | d29 ( q14 )
+ * 7 | d30 | d31 ( q15 )
+ */
+ adr ip, jsimd_idct_ifast_neon_consts
+ vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
+ vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
+ vld1.16 {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
+ vmul.s16 q8, q8, q0
+ vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
+ vmul.s16 q9, q9, q1
+ vld1.16 {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
+ vmul.s16 q10, q10, q2
+ vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
+ vmul.s16 q11, q11, q3
+ vld1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]
+ vmul.s16 q12, q12, q0
+ vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
+ vmul.s16 q14, q14, q2
+ vmul.s16 q13, q13, q1
+ vld1.16 {d0}, [ip, :64] /* load constants */
+ vmul.s16 q15, q15, q3
+ vpush {d8 - d13} /* save Neon registers */
+ /* 1-D IDCT, pass 1 */
+ vsub.s16 q2, q10, q14
+ vadd.s16 q14, q10, q14
+ vsub.s16 q1, q11, q13
+ vadd.s16 q13, q11, q13
+ vsub.s16 q5, q9, q15
+ vadd.s16 q15, q9, q15
+ vqdmulh.s16 q4, q2, XFIX_1_414213562
+ vqdmulh.s16 q6, q1, XFIX_2_613125930
+ vadd.s16 q3, q1, q1
+ vsub.s16 q1, q5, q1
+ vadd.s16 q10, q2, q4
+ vqdmulh.s16 q4, q1, XFIX_1_847759065
+ vsub.s16 q2, q15, q13
+ vadd.s16 q3, q3, q6
+ vqdmulh.s16 q6, q2, XFIX_1_414213562
+ vadd.s16 q1, q1, q4
+ vqdmulh.s16 q4, q5, XFIX_1_082392200
+ vsub.s16 q10, q10, q14
+ vadd.s16 q2, q2, q6
+ vsub.s16 q6, q8, q12
+ vadd.s16 q12, q8, q12
+ vadd.s16 q9, q5, q4
+ vadd.s16 q5, q6, q10
+ vsub.s16 q10, q6, q10
+ vadd.s16 q6, q15, q13
+ vadd.s16 q8, q12, q14
+ vsub.s16 q3, q6, q3
+ vsub.s16 q12, q12, q14
+ vsub.s16 q3, q3, q1
+ vsub.s16 q1, q9, q1
+ vadd.s16 q2, q3, q2
+ vsub.s16 q15, q8, q6
+ vadd.s16 q1, q1, q2
+ vadd.s16 q8, q8, q6
+ vadd.s16 q14, q5, q3
+ vsub.s16 q9, q5, q3
+ vsub.s16 q13, q10, q2
+ vadd.s16 q10, q10, q2
+ /* Transpose */
+ vtrn.16 q8, q9
+ vsub.s16 q11, q12, q1
+ vtrn.16 q14, q15
+ vadd.s16 q12, q12, q1
+ vtrn.16 q10, q11
+ vtrn.16 q12, q13
+ vtrn.32 q9, q11
+ vtrn.32 q12, q14
+ vtrn.32 q8, q10
+ vtrn.32 q13, q15
+ vswp d28, d21
+ vswp d26, d19
+ /* 1-D IDCT, pass 2 */
+ vsub.s16 q2, q10, q14
+ vswp d30, d23
+ vadd.s16 q14, q10, q14
+ vswp d24, d17
+ vsub.s16 q1, q11, q13
+ vadd.s16 q13, q11, q13
+ vsub.s16 q5, q9, q15
+ vadd.s16 q15, q9, q15
+ vqdmulh.s16 q4, q2, XFIX_1_414213562
+ vqdmulh.s16 q6, q1, XFIX_2_613125930
+ vadd.s16 q3, q1, q1
+ vsub.s16 q1, q5, q1
+ vadd.s16 q10, q2, q4
+ vqdmulh.s16 q4, q1, XFIX_1_847759065
+ vsub.s16 q2, q15, q13
+ vadd.s16 q3, q3, q6
+ vqdmulh.s16 q6, q2, XFIX_1_414213562
+ vadd.s16 q1, q1, q4
+ vqdmulh.s16 q4, q5, XFIX_1_082392200
+ vsub.s16 q10, q10, q14
+ vadd.s16 q2, q2, q6
+ vsub.s16 q6, q8, q12
+ vadd.s16 q12, q8, q12
+ vadd.s16 q9, q5, q4
+ vadd.s16 q5, q6, q10
+ vsub.s16 q10, q6, q10
+ vadd.s16 q6, q15, q13
+ vadd.s16 q8, q12, q14
+ vsub.s16 q3, q6, q3
+ vsub.s16 q12, q12, q14
+ vsub.s16 q3, q3, q1
+ vsub.s16 q1, q9, q1
+ vadd.s16 q2, q3, q2
+ vsub.s16 q15, q8, q6
+ vadd.s16 q1, q1, q2
+ vadd.s16 q8, q8, q6
+ vadd.s16 q14, q5, q3
+ vsub.s16 q9, q5, q3
+ vsub.s16 q13, q10, q2
+ vpop {d8 - d13} /* restore Neon registers */
+ vadd.s16 q10, q10, q2
+ vsub.s16 q11, q12, q1
+ vadd.s16 q12, q12, q1
+ /* Descale to 8-bit and range limit */
+ vmov.u8 q0, #0x80
+ vqshrn.s16 d16, q8, #5
+ vqshrn.s16 d17, q9, #5
+ vqshrn.s16 d18, q10, #5
+ vqshrn.s16 d19, q11, #5
+ vqshrn.s16 d20, q12, #5
+ vqshrn.s16 d21, q13, #5
+ vqshrn.s16 d22, q14, #5
+ vqshrn.s16 d23, q15, #5
+ vadd.u8 q8, q8, q0
+ vadd.u8 q9, q9, q0
+ vadd.u8 q10, q10, q0
+ vadd.u8 q11, q11, q0
+ /* Transpose the final 8-bit samples */
+ vtrn.16 q8, q9
+ vtrn.16 q10, q11
+ vtrn.32 q8, q10
+ vtrn.32 q9, q11
+ vtrn.8 d16, d17
+ vtrn.8 d18, d19
+ /* Store results to the output buffer */
+ ldmia OUTPUT_BUF!, {TMP1, TMP2}
+ add TMP1, TMP1, OUTPUT_COL
+ add TMP2, TMP2, OUTPUT_COL
+ vst1.8 {d16}, [TMP1]
+ vst1.8 {d17}, [TMP2]
+ ldmia OUTPUT_BUF!, {TMP1, TMP2}
+ add TMP1, TMP1, OUTPUT_COL
+ add TMP2, TMP2, OUTPUT_COL
+ vst1.8 {d18}, [TMP1]
+ vtrn.8 d20, d21
+ vst1.8 {d19}, [TMP2]
+ ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
+ add TMP1, TMP1, OUTPUT_COL
+ add TMP2, TMP2, OUTPUT_COL
+ add TMP3, TMP3, OUTPUT_COL
+ add TMP4, TMP4, OUTPUT_COL
+ vst1.8 {d20}, [TMP1]
+ vtrn.8 d22, d23
+ vst1.8 {d21}, [TMP2]
+ vst1.8 {d22}, [TMP3]
+ vst1.8 {d23}, [TMP4]
+ bx lr
+
+ .unreq DCT_TABLE
+ .unreq COEF_BLOCK
+ .unreq OUTPUT_BUF
+ .unreq OUTPUT_COL
+ .unreq TMP1
+ .unreq TMP2
+ .unreq TMP3
+ .unreq TMP4
+
+
+/*****************************************************************************/
+
+/*
+ * jsimd_extrgb_ycc_convert_neon
+ * jsimd_extbgr_ycc_convert_neon
+ * jsimd_extrgbx_ycc_convert_neon
+ * jsimd_extbgrx_ycc_convert_neon
+ * jsimd_extxbgr_ycc_convert_neon
+ * jsimd_extxrgb_ycc_convert_neon
+ *
+ * Colorspace conversion RGB -> YCbCr
+ */
+
+.macro do_store size
+ .if \size == 8
+ vst1.8 {d20}, [Y]!
+ vst1.8 {d21}, [U]!
+ vst1.8 {d22}, [V]!
+ .elseif \size == 4
+ vst1.8 {d20[0]}, [Y]!
+ vst1.8 {d20[1]}, [Y]!
+ vst1.8 {d20[2]}, [Y]!
+ vst1.8 {d20[3]}, [Y]!
+ vst1.8 {d21[0]}, [U]!
+ vst1.8 {d21[1]}, [U]!
+ vst1.8 {d21[2]}, [U]!
+ vst1.8 {d21[3]}, [U]!
+ vst1.8 {d22[0]}, [V]!
+ vst1.8 {d22[1]}, [V]!
+ vst1.8 {d22[2]}, [V]!
+ vst1.8 {d22[3]}, [V]!
+ .elseif \size == 2
+ vst1.8 {d20[4]}, [Y]!
+ vst1.8 {d20[5]}, [Y]!
+ vst1.8 {d21[4]}, [U]!
+ vst1.8 {d21[5]}, [U]!
+ vst1.8 {d22[4]}, [V]!
+ vst1.8 {d22[5]}, [V]!
+ .elseif \size == 1
+ vst1.8 {d20[6]}, [Y]!
+ vst1.8 {d21[6]}, [U]!
+ vst1.8 {d22[6]}, [V]!
+ .else
+ .error unsupported macroblock size
+ .endif
+.endm
+
+.macro do_load bpp, size
+ .if \bpp == 24
+ .if \size == 8
+ vld3.8 {d10, d11, d12}, [RGB]!
+ pld [RGB, #128]
+ .elseif \size == 4
+ vld3.8 {d10[0], d11[0], d12[0]}, [RGB]!
+ vld3.8 {d10[1], d11[1], d12[1]}, [RGB]!
+ vld3.8 {d10[2], d11[2], d12[2]}, [RGB]!
+ vld3.8 {d10[3], d11[3], d12[3]}, [RGB]!
+ .elseif \size == 2
+ vld3.8 {d10[4], d11[4], d12[4]}, [RGB]!
+ vld3.8 {d10[5], d11[5], d12[5]}, [RGB]!
+ .elseif \size == 1
+ vld3.8 {d10[6], d11[6], d12[6]}, [RGB]!
+ .else
+ .error unsupported macroblock size
+ .endif
+ .elseif \bpp == 32
+ .if \size == 8
+ vld4.8 {d10, d11, d12, d13}, [RGB]!
+ pld [RGB, #128]
+ .elseif \size == 4
+ vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
+ vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
+ vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
+ vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
+ .elseif \size == 2
+ vld4.8 {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
+ vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
+ .elseif \size == 1
+ vld4.8 {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
+ .else
+ .error unsupported macroblock size
+ .endif
+ .else
+ .error unsupported bpp
+ .endif
+.endm
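+
+/* Note how the tail of the pixel loop composes these macros: a remainder of
+ * N = width % 8 pixels is processed as its binary decomposition, e.g. N = 7
+ * loads 4, then 2, then 1 pixels (into successive lanes), converts once,
+ * then stores 4, 2 and 1 samples per plane.
+ */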
+
+.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, b_offs
+
+/*
+ * 2-stage pipelined RGB->YCbCr conversion
+ */
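+
+/* Concretely, the steady-state inner loop below runs
+ * do_rgb_to_yuv_stage2_store_load_stage1, which interleaves stage 2
+ * (narrow + store) of iteration N with the load and stage 1 (widen +
+ * multiply-accumulate) of iteration N+1, hiding memory latency behind
+ * arithmetic.
+ */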
+
+.macro do_rgb_to_yuv_stage1
+ vmovl.u8 q2, d1\r_offs /* r = { d4, d5 } */
+ vmovl.u8 q3, d1\g_offs /* g = { d6, d7 } */
+ vmovl.u8 q4, d1\b_offs /* b = { d8, d9 } */
+ vmull.u16 q7, d4, d0[0]
+ vmlal.u16 q7, d6, d0[1]
+ vmlal.u16 q7, d8, d0[2]
+ vmull.u16 q8, d5, d0[0]
+ vmlal.u16 q8, d7, d0[1]
+ vmlal.u16 q8, d9, d0[2]
+ vrev64.32 q9, q1
+ vrev64.32 q13, q1
+ vmlsl.u16 q9, d4, d0[3]
+ vmlsl.u16 q9, d6, d1[0]
+ vmlal.u16 q9, d8, d1[1]
+ vmlsl.u16 q13, d5, d0[3]
+ vmlsl.u16 q13, d7, d1[0]
+ vmlal.u16 q13, d9, d1[1]
+ vrev64.32 q14, q1
+ vrev64.32 q15, q1
+ vmlal.u16 q14, d4, d1[1]
+ vmlsl.u16 q14, d6, d1[2]
+ vmlsl.u16 q14, d8, d1[3]
+ vmlal.u16 q15, d5, d1[1]
+ vmlsl.u16 q15, d7, d1[2]
+ vmlsl.u16 q15, d9, d1[3]
+.endm
+
+.macro do_rgb_to_yuv_stage2
+ vrshrn.u32 d20, q7, #16
+ vrshrn.u32 d21, q8, #16
+ vshrn.u32 d22, q9, #16
+ vshrn.u32 d23, q13, #16
+ vshrn.u32 d24, q14, #16
+ vshrn.u32 d25, q15, #16
+ vmovn.u16 d20, q10 /* d20 = y */
+ vmovn.u16 d21, q11 /* d21 = u */
+ vmovn.u16 d22, q12 /* d22 = v */
+.endm
+
+.macro do_rgb_to_yuv
+ do_rgb_to_yuv_stage1
+ do_rgb_to_yuv_stage2
+.endm
+
+.macro do_rgb_to_yuv_stage2_store_load_stage1
+ vrshrn.u32 d20, q7, #16
+ vrshrn.u32 d21, q8, #16
+ vshrn.u32 d22, q9, #16
+ vrev64.32 q9, q1
+ vshrn.u32 d23, q13, #16
+ vrev64.32 q13, q1
+ vshrn.u32 d24, q14, #16
+ vshrn.u32 d25, q15, #16
+ do_load \bpp, 8
+ vmovn.u16 d20, q10 /* d20 = y */
+ vmovl.u8 q2, d1\r_offs /* r = { d4, d5 } */
+ vmovn.u16 d21, q11 /* d21 = u */
+ vmovl.u8 q3, d1\g_offs /* g = { d6, d7 } */
+ vmovn.u16 d22, q12 /* d22 = v */
+ vmovl.u8 q4, d1\b_offs /* b = { d8, d9 } */
+ vmull.u16 q7, d4, d0[0]
+ vmlal.u16 q7, d6, d0[1]
+ vmlal.u16 q7, d8, d0[2]
+ vst1.8 {d20}, [Y]!
+ vmull.u16 q8, d5, d0[0]
+ vmlal.u16 q8, d7, d0[1]
+ vmlal.u16 q8, d9, d0[2]
+ vmlsl.u16 q9, d4, d0[3]
+ vmlsl.u16 q9, d6, d1[0]
+ vmlal.u16 q9, d8, d1[1]
+ vst1.8 {d21}, [U]!
+ vmlsl.u16 q13, d5, d0[3]
+ vmlsl.u16 q13, d7, d1[0]
+ vmlal.u16 q13, d9, d1[1]
+ vrev64.32 q14, q1
+ vrev64.32 q15, q1
+ vmlal.u16 q14, d4, d1[1]
+ vmlsl.u16 q14, d6, d1[2]
+ vmlsl.u16 q14, d8, d1[3]
+ vst1.8 {d22}, [V]!
+ vmlal.u16 q15, d5, d1[1]
+ vmlsl.u16 q15, d7, d1[2]
+ vmlsl.u16 q15, d9, d1[3]
+.endm
+
+.balign 16
+jsimd_\colorid\()_ycc_neon_consts:
+ .short 19595, 38470, 7471, 11059
+ .short 21709, 32768, 27439, 5329
+ .short 32767, 128, 32767, 128
+ .short 32767, 128, 32767, 128
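+
+/* Illustration (not part of the upstream sources): the first two rows above
+ * are round(c * 2^16) for the RGB->YCbCr coefficients, e.g.
+ * round(0.29900 * 65536) = 19595. The last two rows, read as 32-bit lanes,
+ * equal (128 << 16) + 32767: the +128 chroma offset plus a just-under-0.5
+ * bias, so the truncating VSHRN used for Cb and Cr rounds correctly.
+ */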
+
+asm_function jsimd_\colorid\()_ycc_convert_neon
+ OUTPUT_WIDTH .req r0
+ INPUT_BUF .req r1
+ OUTPUT_BUF .req r2
+ OUTPUT_ROW .req r3
+ NUM_ROWS .req r4
+
+ OUTPUT_BUF0 .req r5
+ OUTPUT_BUF1 .req r6
+ OUTPUT_BUF2 .req OUTPUT_BUF
+
+ RGB .req r7
+ Y .req r8
+ U .req r9
+ V .req r10
+ N .req ip
+
+ /* Load constants to d0, d1, d2, d3 */
+ adr ip, jsimd_\colorid\()_ycc_neon_consts
+ vld1.16 {d0, d1, d2, d3}, [ip, :128]
+
+ /* Save Arm registers and handle input arguments */
+ push {r4, r5, r6, r7, r8, r9, r10, lr}
+ ldr NUM_ROWS, [sp, #(4 * 8)]
+ ldr OUTPUT_BUF0, [OUTPUT_BUF]
+ ldr OUTPUT_BUF1, [OUTPUT_BUF, #4]
+ ldr OUTPUT_BUF2, [OUTPUT_BUF, #8]
+ .unreq OUTPUT_BUF
+
+ /* Save Neon registers */
+ vpush {d8 - d15}
+
+ /* Outer loop over scanlines */
+ cmp NUM_ROWS, #1
+ blt 9f
+0:
+ ldr Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #2]
+ ldr U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #2]
+ mov N, OUTPUT_WIDTH
+ ldr V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #2]
+ add OUTPUT_ROW, OUTPUT_ROW, #1
+ ldr RGB, [INPUT_BUF], #4
+
+ /* Inner loop over pixels */
+ subs N, N, #8
+ blt 3f
+ do_load \bpp, 8
+ do_rgb_to_yuv_stage1
+ subs N, N, #8
+ blt 2f
+1:
+ do_rgb_to_yuv_stage2_store_load_stage1
+ subs N, N, #8
+ bge 1b
+2:
+ do_rgb_to_yuv_stage2
+ do_store 8
+ tst N, #7
+ beq 8f
+3:
+ tst N, #4
+ beq 3f
+ do_load \bpp, 4
+3:
+ tst N, #2
+ beq 4f
+ do_load \bpp, 2
+4:
+ tst N, #1
+ beq 5f
+ do_load \bpp, 1
+5:
+ do_rgb_to_yuv
+ tst N, #4
+ beq 6f
+ do_store 4
+6:
+ tst N, #2
+ beq 7f
+ do_store 2
+7:
+ tst N, #1
+ beq 8f
+ do_store 1
+8:
+ subs NUM_ROWS, NUM_ROWS, #1
+ bgt 0b
+9:
+ /* Restore all registers and return */
+ vpop {d8 - d15}
+ pop {r4, r5, r6, r7, r8, r9, r10, pc}
+
+ .unreq OUTPUT_WIDTH
+ .unreq OUTPUT_ROW
+ .unreq INPUT_BUF
+ .unreq NUM_ROWS
+ .unreq OUTPUT_BUF0
+ .unreq OUTPUT_BUF1
+ .unreq OUTPUT_BUF2
+ .unreq RGB
+ .unreq Y
+ .unreq U
+ .unreq V
+ .unreq N
+
+.purgem do_rgb_to_yuv
+.purgem do_rgb_to_yuv_stage1
+.purgem do_rgb_to_yuv_stage2
+.purgem do_rgb_to_yuv_stage2_store_load_stage1
+
+.endm
+
+/*--------------------------------- id ----- bpp R G B */
+generate_jsimd_rgb_ycc_convert_neon extrgb, 24, 0, 1, 2
+generate_jsimd_rgb_ycc_convert_neon extbgr, 24, 2, 1, 0
+generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2
+generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0
+generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1
+generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3
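+
+/* How the offsets map: do_load deinterleaves the pixel stream into
+ * d10/d11/d12(/d13), so \r_offs, \g_offs and \b_offs name which
+ * deinterleaved register holds each channel. For extbgr (B, G, R in
+ * memory), vld3.8 puts B in d10, G in d11 and R in d12, hence
+ * r_offs = 2, g_offs = 1, b_offs = 0.
+ */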
+
+.purgem do_load
+.purgem do_store
diff --git a/media/libjpeg/simd/arm/aarch64/jccolext-neon.c b/media/libjpeg/simd/arm/aarch64/jccolext-neon.c
new file mode 100644
index 0000000000..37130c225e
--- /dev/null
+++ b/media/libjpeg/simd/arm/aarch64/jccolext-neon.c
@@ -0,0 +1,316 @@
+/*
+ * jccolext-neon.c - colorspace conversion (64-bit Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jccolor-neon.c */
+
+
+/* RGB -> YCbCr conversion is defined by the following equations:
+ * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+ * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128
+ * Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128
+ *
+ * Avoid floating point arithmetic by using shifted integer constants:
+ * 0.29899597 = 19595 * 2^-16
+ * 0.58700561 = 38470 * 2^-16
+ * 0.11399841 = 7471 * 2^-16
+ * 0.16874695 = 11059 * 2^-16
+ * 0.33125305 = 21709 * 2^-16
+ * 0.50000000 = 32768 * 2^-16
+ * 0.41868592 = 27439 * 2^-16
+ * 0.08131409 = 5329 * 2^-16
+ * These constants are defined in jccolor-neon.c
+ *
+ * We add the fixed-point equivalent of 0.5 to Cb and Cr, which effectively
+ * rounds the result up or down via integer truncation.
+ */
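+
+/* A scalar reference for one pixel under the same fixed-point scheme (a
+ * minimal sketch for illustration; the helper below is hypothetical and not
+ * part of the upstream sources):
+ *
+ * static inline void rgb_to_ycc_scalar(uint8_t r, uint8_t g, uint8_t b,
+ * uint8_t *y, uint8_t *cb, uint8_t *cr)
+ * {
+ * *y = (uint8_t)((19595 * r + 38470 * g + 7471 * b + 32768) >> 16);
+ * *cb = (uint8_t)((((128 << 16) + 32767) - 11059 * r - 21709 * g +
+ * 32768 * b) >> 16);
+ * *cr = (uint8_t)((((128 << 16) + 32767) + 32768 * r - 27439 * g -
+ * 5329 * b) >> 16);
+ * }
+ */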
+
+void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
+{
+ /* Pointer to RGB(X/A) input data */
+ JSAMPROW inptr;
+ /* Pointers to Y, Cb, and Cr output data */
+ JSAMPROW outptr0, outptr1, outptr2;
+ /* Allocate temporary buffer for final (image_width % 16) pixels in row. */
+ ALIGN(16) uint8_t tmp_buf[16 * RGB_PIXELSIZE];
+
+ /* Set up conversion constants. */
+ const uint16x8_t consts = vld1q_u16(jsimd_rgb_ycc_neon_consts);
+ const uint32x4_t scaled_128_5 = vdupq_n_u32((128 << 16) + 32767);
+
+ while (--num_rows >= 0) {
+ inptr = *input_buf++;
+ outptr0 = output_buf[0][output_row];
+ outptr1 = output_buf[1][output_row];
+ outptr2 = output_buf[2][output_row];
+ output_row++;
+
+ int cols_remaining = image_width;
+ for (; cols_remaining >= 16; cols_remaining -= 16) {
+
+#if RGB_PIXELSIZE == 4
+ uint8x16x4_t input_pixels = vld4q_u8(inptr);
+#else
+ uint8x16x3_t input_pixels = vld3q_u8(inptr);
+#endif
+ uint16x8_t r_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_RED]));
+ uint16x8_t g_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_GREEN]));
+ uint16x8_t b_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_BLUE]));
+ uint16x8_t r_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_RED]));
+ uint16x8_t g_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_GREEN]));
+ uint16x8_t b_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_BLUE]));
+
+ /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
+ uint32x4_t y_ll = vmull_laneq_u16(vget_low_u16(r_l), consts, 0);
+ y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(g_l), consts, 1);
+ y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(b_l), consts, 2);
+ uint32x4_t y_lh = vmull_laneq_u16(vget_high_u16(r_l), consts, 0);
+ y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(g_l), consts, 1);
+ y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(b_l), consts, 2);
+ uint32x4_t y_hl = vmull_laneq_u16(vget_low_u16(r_h), consts, 0);
+ y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(g_h), consts, 1);
+ y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(b_h), consts, 2);
+ uint32x4_t y_hh = vmull_laneq_u16(vget_high_u16(r_h), consts, 0);
+ y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(g_h), consts, 1);
+ y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(b_h), consts, 2);
+
+ /* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128 */
+ uint32x4_t cb_ll = scaled_128_5;
+ cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(r_l), consts, 3);
+ cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(g_l), consts, 4);
+ cb_ll = vmlal_laneq_u16(cb_ll, vget_low_u16(b_l), consts, 5);
+ uint32x4_t cb_lh = scaled_128_5;
+ cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(r_l), consts, 3);
+ cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(g_l), consts, 4);
+ cb_lh = vmlal_laneq_u16(cb_lh, vget_high_u16(b_l), consts, 5);
+ uint32x4_t cb_hl = scaled_128_5;
+ cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(r_h), consts, 3);
+ cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(g_h), consts, 4);
+ cb_hl = vmlal_laneq_u16(cb_hl, vget_low_u16(b_h), consts, 5);
+ uint32x4_t cb_hh = scaled_128_5;
+ cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(r_h), consts, 3);
+ cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(g_h), consts, 4);
+ cb_hh = vmlal_laneq_u16(cb_hh, vget_high_u16(b_h), consts, 5);
+
+ /* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128 */
+ uint32x4_t cr_ll = scaled_128_5;
+ cr_ll = vmlal_laneq_u16(cr_ll, vget_low_u16(r_l), consts, 5);
+ cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(g_l), consts, 6);
+ cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(b_l), consts, 7);
+ uint32x4_t cr_lh = scaled_128_5;
+ cr_lh = vmlal_laneq_u16(cr_lh, vget_high_u16(r_l), consts, 5);
+ cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(g_l), consts, 6);
+ cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(b_l), consts, 7);
+ uint32x4_t cr_hl = scaled_128_5;
+ cr_hl = vmlal_laneq_u16(cr_hl, vget_low_u16(r_h), consts, 5);
+ cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(g_h), consts, 6);
+ cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(b_h), consts, 7);
+ uint32x4_t cr_hh = scaled_128_5;
+ cr_hh = vmlal_laneq_u16(cr_hh, vget_high_u16(r_h), consts, 5);
+ cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(g_h), consts, 6);
+ cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(b_h), consts, 7);
+
+ /* Descale Y values (rounding right shift) and narrow to 16-bit. */
+ uint16x8_t y_l = vcombine_u16(vrshrn_n_u32(y_ll, 16),
+ vrshrn_n_u32(y_lh, 16));
+ uint16x8_t y_h = vcombine_u16(vrshrn_n_u32(y_hl, 16),
+ vrshrn_n_u32(y_hh, 16));
+ /* Descale Cb values (right shift) and narrow to 16-bit. */
+ uint16x8_t cb_l = vcombine_u16(vshrn_n_u32(cb_ll, 16),
+ vshrn_n_u32(cb_lh, 16));
+ uint16x8_t cb_h = vcombine_u16(vshrn_n_u32(cb_hl, 16),
+ vshrn_n_u32(cb_hh, 16));
+ /* Descale Cr values (right shift) and narrow to 16-bit. */
+ uint16x8_t cr_l = vcombine_u16(vshrn_n_u32(cr_ll, 16),
+ vshrn_n_u32(cr_lh, 16));
+ uint16x8_t cr_h = vcombine_u16(vshrn_n_u32(cr_hl, 16),
+ vshrn_n_u32(cr_hh, 16));
+ /* Narrow Y, Cb, and Cr values to 8-bit and store to memory. Buffer
+ * overwrite is permitted up to the next multiple of ALIGN_SIZE bytes.
+ */
+ vst1q_u8(outptr0, vcombine_u8(vmovn_u16(y_l), vmovn_u16(y_h)));
+ vst1q_u8(outptr1, vcombine_u8(vmovn_u16(cb_l), vmovn_u16(cb_h)));
+ vst1q_u8(outptr2, vcombine_u8(vmovn_u16(cr_l), vmovn_u16(cr_h)));
+
+ /* Increment pointers. */
+ inptr += (16 * RGB_PIXELSIZE);
+ outptr0 += 16;
+ outptr1 += 16;
+ outptr2 += 16;
+ }
+
+ if (cols_remaining > 8) {
+ /* To prevent buffer overread by the vector load instructions, the last
+ * (image_width % 16) columns of data are first memcopied to a temporary
+ * buffer large enough to accommodate the vector load.
+ */
+ memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
+ inptr = tmp_buf;
+
+#if RGB_PIXELSIZE == 4
+ uint8x16x4_t input_pixels = vld4q_u8(inptr);
+#else
+ uint8x16x3_t input_pixels = vld3q_u8(inptr);
+#endif
+ uint16x8_t r_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_RED]));
+ uint16x8_t g_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_GREEN]));
+ uint16x8_t b_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_BLUE]));
+ uint16x8_t r_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_RED]));
+ uint16x8_t g_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_GREEN]));
+ uint16x8_t b_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_BLUE]));
+
+ /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
+ uint32x4_t y_ll = vmull_laneq_u16(vget_low_u16(r_l), consts, 0);
+ y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(g_l), consts, 1);
+ y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(b_l), consts, 2);
+ uint32x4_t y_lh = vmull_laneq_u16(vget_high_u16(r_l), consts, 0);
+ y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(g_l), consts, 1);
+ y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(b_l), consts, 2);
+ uint32x4_t y_hl = vmull_laneq_u16(vget_low_u16(r_h), consts, 0);
+ y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(g_h), consts, 1);
+ y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(b_h), consts, 2);
+ uint32x4_t y_hh = vmull_laneq_u16(vget_high_u16(r_h), consts, 0);
+ y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(g_h), consts, 1);
+ y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(b_h), consts, 2);
+
+ /* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128 */
+ uint32x4_t cb_ll = scaled_128_5;
+ cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(r_l), consts, 3);
+ cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(g_l), consts, 4);
+ cb_ll = vmlal_laneq_u16(cb_ll, vget_low_u16(b_l), consts, 5);
+ uint32x4_t cb_lh = scaled_128_5;
+ cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(r_l), consts, 3);
+ cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(g_l), consts, 4);
+ cb_lh = vmlal_laneq_u16(cb_lh, vget_high_u16(b_l), consts, 5);
+ uint32x4_t cb_hl = scaled_128_5;
+ cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(r_h), consts, 3);
+ cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(g_h), consts, 4);
+ cb_hl = vmlal_laneq_u16(cb_hl, vget_low_u16(b_h), consts, 5);
+ uint32x4_t cb_hh = scaled_128_5;
+ cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(r_h), consts, 3);
+ cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(g_h), consts, 4);
+ cb_hh = vmlal_laneq_u16(cb_hh, vget_high_u16(b_h), consts, 5);
+
+ /* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128 */
+ uint32x4_t cr_ll = scaled_128_5;
+ cr_ll = vmlal_laneq_u16(cr_ll, vget_low_u16(r_l), consts, 5);
+ cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(g_l), consts, 6);
+ cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(b_l), consts, 7);
+ uint32x4_t cr_lh = scaled_128_5;
+ cr_lh = vmlal_laneq_u16(cr_lh, vget_high_u16(r_l), consts, 5);
+ cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(g_l), consts, 6);
+ cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(b_l), consts, 7);
+ uint32x4_t cr_hl = scaled_128_5;
+ cr_hl = vmlal_laneq_u16(cr_hl, vget_low_u16(r_h), consts, 5);
+ cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(g_h), consts, 6);
+ cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(b_h), consts, 7);
+ uint32x4_t cr_hh = scaled_128_5;
+ cr_hh = vmlal_laneq_u16(cr_hh, vget_high_u16(r_h), consts, 5);
+ cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(g_h), consts, 6);
+ cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(b_h), consts, 7);
+
+ /* Descale Y values (rounding right shift) and narrow to 16-bit. */
+ uint16x8_t y_l = vcombine_u16(vrshrn_n_u32(y_ll, 16),
+ vrshrn_n_u32(y_lh, 16));
+ uint16x8_t y_h = vcombine_u16(vrshrn_n_u32(y_hl, 16),
+ vrshrn_n_u32(y_hh, 16));
+ /* Descale Cb values (right shift) and narrow to 16-bit. */
+ uint16x8_t cb_l = vcombine_u16(vshrn_n_u32(cb_ll, 16),
+ vshrn_n_u32(cb_lh, 16));
+ uint16x8_t cb_h = vcombine_u16(vshrn_n_u32(cb_hl, 16),
+ vshrn_n_u32(cb_hh, 16));
+ /* Descale Cr values (right shift) and narrow to 16-bit. */
+ uint16x8_t cr_l = vcombine_u16(vshrn_n_u32(cr_ll, 16),
+ vshrn_n_u32(cr_lh, 16));
+ uint16x8_t cr_h = vcombine_u16(vshrn_n_u32(cr_hl, 16),
+ vshrn_n_u32(cr_hh, 16));
+ /* Narrow Y, Cb, and Cr values to 8-bit and store to memory. Buffer
+ * overwrite is permitted up to the next multiple of ALIGN_SIZE bytes.
+ */
+ vst1q_u8(outptr0, vcombine_u8(vmovn_u16(y_l), vmovn_u16(y_h)));
+ vst1q_u8(outptr1, vcombine_u8(vmovn_u16(cb_l), vmovn_u16(cb_h)));
+ vst1q_u8(outptr2, vcombine_u8(vmovn_u16(cr_l), vmovn_u16(cr_h)));
+
+ } else if (cols_remaining > 0) {
+ /* To prevent buffer overread by the vector load instructions, the last
+ * (image_width % 8) columns of data are first memcopied to a temporary
+ * buffer large enough to accommodate the vector load.
+ */
+ memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
+ inptr = tmp_buf;
+
+#if RGB_PIXELSIZE == 4
+ uint8x8x4_t input_pixels = vld4_u8(inptr);
+#else
+ uint8x8x3_t input_pixels = vld3_u8(inptr);
+#endif
+ uint16x8_t r = vmovl_u8(input_pixels.val[RGB_RED]);
+ uint16x8_t g = vmovl_u8(input_pixels.val[RGB_GREEN]);
+ uint16x8_t b = vmovl_u8(input_pixels.val[RGB_BLUE]);
+
+ /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
+ uint32x4_t y_l = vmull_laneq_u16(vget_low_u16(r), consts, 0);
+ y_l = vmlal_laneq_u16(y_l, vget_low_u16(g), consts, 1);
+ y_l = vmlal_laneq_u16(y_l, vget_low_u16(b), consts, 2);
+ uint32x4_t y_h = vmull_laneq_u16(vget_high_u16(r), consts, 0);
+ y_h = vmlal_laneq_u16(y_h, vget_high_u16(g), consts, 1);
+ y_h = vmlal_laneq_u16(y_h, vget_high_u16(b), consts, 2);
+
+ /* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128 */
+ uint32x4_t cb_l = scaled_128_5;
+ cb_l = vmlsl_laneq_u16(cb_l, vget_low_u16(r), consts, 3);
+ cb_l = vmlsl_laneq_u16(cb_l, vget_low_u16(g), consts, 4);
+ cb_l = vmlal_laneq_u16(cb_l, vget_low_u16(b), consts, 5);
+ uint32x4_t cb_h = scaled_128_5;
+ cb_h = vmlsl_laneq_u16(cb_h, vget_high_u16(r), consts, 3);
+ cb_h = vmlsl_laneq_u16(cb_h, vget_high_u16(g), consts, 4);
+ cb_h = vmlal_laneq_u16(cb_h, vget_high_u16(b), consts, 5);
+
+ /* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128 */
+ uint32x4_t cr_l = scaled_128_5;
+ cr_l = vmlal_laneq_u16(cr_l, vget_low_u16(r), consts, 5);
+ cr_l = vmlsl_laneq_u16(cr_l, vget_low_u16(g), consts, 6);
+ cr_l = vmlsl_laneq_u16(cr_l, vget_low_u16(b), consts, 7);
+ uint32x4_t cr_h = scaled_128_5;
+ cr_h = vmlal_laneq_u16(cr_h, vget_high_u16(r), consts, 5);
+ cr_h = vmlsl_laneq_u16(cr_h, vget_high_u16(g), consts, 6);
+ cr_h = vmlsl_laneq_u16(cr_h, vget_high_u16(b), consts, 7);
+
+ /* Descale Y values (rounding right shift) and narrow to 16-bit. */
+ uint16x8_t y_u16 = vcombine_u16(vrshrn_n_u32(y_l, 16),
+ vrshrn_n_u32(y_h, 16));
+ /* Descale Cb values (right shift) and narrow to 16-bit. */
+ uint16x8_t cb_u16 = vcombine_u16(vshrn_n_u32(cb_l, 16),
+ vshrn_n_u32(cb_h, 16));
+ /* Descale Cr values (right shift) and narrow to 16-bit. */
+ uint16x8_t cr_u16 = vcombine_u16(vshrn_n_u32(cr_l, 16),
+ vshrn_n_u32(cr_h, 16));
+ /* Narrow Y, Cb, and Cr values to 8-bit and store to memory. Buffer
+ * overwrite is permitted up to the next multiple of ALIGN_SIZE bytes.
+ */
+ vst1_u8(outptr0, vmovn_u16(y_u16));
+ vst1_u8(outptr1, vmovn_u16(cb_u16));
+ vst1_u8(outptr2, vmovn_u16(cr_u16));
+ }
+ }
+}
diff --git a/media/libjpeg/simd/arm/aarch64/jchuff-neon.c b/media/libjpeg/simd/arm/aarch64/jchuff-neon.c
new file mode 100644
index 0000000000..607a116070
--- /dev/null
+++ b/media/libjpeg/simd/arm/aarch64/jchuff-neon.c
@@ -0,0 +1,411 @@
+/*
+ * jchuff-neon.c - Huffman entropy encoding (64-bit Arm Neon)
+ *
+ * Copyright (C) 2020-2021, Arm Limited. All Rights Reserved.
+ * Copyright (C) 2020, 2022, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ *
+ * NOTE: All referenced figures are from
+ * Recommendation ITU-T T.81 (1992) | ISO/IEC 10918-1:1994.
+ */
+
+#define JPEG_INTERNALS
+#include "../../../jinclude.h"
+#include "../../../jpeglib.h"
+#include "../../../jsimd.h"
+#include "../../../jdct.h"
+#include "../../../jsimddct.h"
+#include "../../jsimd.h"
+#include "../align.h"
+#include "../jchuff.h"
+#include "neon-compat.h"
+
+#include <limits.h>
+
+#include <arm_neon.h>
+
+
+ALIGN(16) static const uint8_t jsimd_huff_encode_one_block_consts[] = {
+ 0, 1, 2, 3, 16, 17, 32, 33,
+ 18, 19, 4, 5, 6, 7, 20, 21,
+ 34, 35, 48, 49, 255, 255, 50, 51,
+ 36, 37, 22, 23, 8, 9, 10, 11,
+ 255, 255, 6, 7, 20, 21, 34, 35,
+ 48, 49, 255, 255, 50, 51, 36, 37,
+ 54, 55, 40, 41, 26, 27, 12, 13,
+ 14, 15, 28, 29, 42, 43, 56, 57,
+ 6, 7, 20, 21, 34, 35, 48, 49,
+ 50, 51, 36, 37, 22, 23, 8, 9,
+ 26, 27, 12, 13, 255, 255, 14, 15,
+ 28, 29, 42, 43, 56, 57, 255, 255,
+ 52, 53, 54, 55, 40, 41, 26, 27,
+ 12, 13, 255, 255, 14, 15, 28, 29,
+ 26, 27, 40, 41, 42, 43, 28, 29,
+ 14, 15, 30, 31, 44, 45, 46, 47
+};
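+
+/* Illustration (not part of the upstream sources): each byte pair above is
+ * a TBL index into the 16-bit coefficient rows, selecting coefficients in
+ * zig-zag order. The first row, {0,1, 2,3, 16,17, 32,33, ...}, picks the
+ * bytes of block[0], block[1], block[8], block[16], ... -- zig-zag entries
+ * 0-7. Index 255 falls outside the table and yields 0; those lanes are
+ * patched afterwards with vsetq_lane_s16().
+ */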
+
+/* The AArch64 implementation of the FLUSH() macro triggers a UBSan misaligned
+ * address warning because the macro sometimes writes a 64-bit value to a
+ * non-64-bit-aligned address. That behavior is technically undefined per
+ * the C specification, but it is supported by the AArch64 architecture and
+ * compilers.
+ */
+#if defined(__has_feature)
+#if __has_feature(undefined_behavior_sanitizer)
+__attribute__((no_sanitize("alignment")))
+#endif
+#endif
+JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer,
+ JCOEFPTR block, int last_dc_val,
+ c_derived_tbl *dctbl,
+ c_derived_tbl *actbl)
+{
+ uint16_t block_diff[DCTSIZE2];
+
+ /* Load lookup table indices for rows of zig-zag ordering. */
+#ifdef HAVE_VLD1Q_U8_X4
+ const uint8x16x4_t idx_rows_0123 =
+ vld1q_u8_x4(jsimd_huff_encode_one_block_consts + 0 * DCTSIZE);
+ const uint8x16x4_t idx_rows_4567 =
+ vld1q_u8_x4(jsimd_huff_encode_one_block_consts + 8 * DCTSIZE);
+#else
+ /* GCC does not currently support the vld1q_<type>_x4() intrinsics. */
+ const uint8x16x4_t idx_rows_0123 = { {
+ vld1q_u8(jsimd_huff_encode_one_block_consts + 0 * DCTSIZE),
+ vld1q_u8(jsimd_huff_encode_one_block_consts + 2 * DCTSIZE),
+ vld1q_u8(jsimd_huff_encode_one_block_consts + 4 * DCTSIZE),
+ vld1q_u8(jsimd_huff_encode_one_block_consts + 6 * DCTSIZE)
+ } };
+ const uint8x16x4_t idx_rows_4567 = { {
+ vld1q_u8(jsimd_huff_encode_one_block_consts + 8 * DCTSIZE),
+ vld1q_u8(jsimd_huff_encode_one_block_consts + 10 * DCTSIZE),
+ vld1q_u8(jsimd_huff_encode_one_block_consts + 12 * DCTSIZE),
+ vld1q_u8(jsimd_huff_encode_one_block_consts + 14 * DCTSIZE)
+ } };
+#endif
+
+ /* Load 8x8 block of DCT coefficients. */
+#ifdef HAVE_VLD1Q_U8_X4
+ const int8x16x4_t tbl_rows_0123 =
+ vld1q_s8_x4((int8_t *)(block + 0 * DCTSIZE));
+ const int8x16x4_t tbl_rows_4567 =
+ vld1q_s8_x4((int8_t *)(block + 4 * DCTSIZE));
+#else
+ const int8x16x4_t tbl_rows_0123 = { {
+ vld1q_s8((int8_t *)(block + 0 * DCTSIZE)),
+ vld1q_s8((int8_t *)(block + 1 * DCTSIZE)),
+ vld1q_s8((int8_t *)(block + 2 * DCTSIZE)),
+ vld1q_s8((int8_t *)(block + 3 * DCTSIZE))
+ } };
+ const int8x16x4_t tbl_rows_4567 = { {
+ vld1q_s8((int8_t *)(block + 4 * DCTSIZE)),
+ vld1q_s8((int8_t *)(block + 5 * DCTSIZE)),
+ vld1q_s8((int8_t *)(block + 6 * DCTSIZE)),
+ vld1q_s8((int8_t *)(block + 7 * DCTSIZE))
+ } };
+#endif
+
+ /* Initialise extra lookup tables. */
+ const int8x16x4_t tbl_rows_2345 = { {
+ tbl_rows_0123.val[2], tbl_rows_0123.val[3],
+ tbl_rows_4567.val[0], tbl_rows_4567.val[1]
+ } };
+ const int8x16x3_t tbl_rows_567 =
+ { { tbl_rows_4567.val[1], tbl_rows_4567.val[2], tbl_rows_4567.val[3] } };
+
+ /* Shuffle coefficients into zig-zag order. */
+ int16x8_t row0 =
+ vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_0123, idx_rows_0123.val[0]));
+ int16x8_t row1 =
+ vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_0123, idx_rows_0123.val[1]));
+ int16x8_t row2 =
+ vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_2345, idx_rows_0123.val[2]));
+ int16x8_t row3 =
+ vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_0123, idx_rows_0123.val[3]));
+ int16x8_t row4 =
+ vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_4567, idx_rows_4567.val[0]));
+ int16x8_t row5 =
+ vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_2345, idx_rows_4567.val[1]));
+ int16x8_t row6 =
+ vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_4567, idx_rows_4567.val[2]));
+ int16x8_t row7 =
+ vreinterpretq_s16_s8(vqtbl3q_s8(tbl_rows_567, idx_rows_4567.val[3]));
+
+ /* Compute DC coefficient difference value (F.1.1.5.1). */
+ row0 = vsetq_lane_s16(block[0] - last_dc_val, row0, 0);
+ /* Initialize AC coefficient lanes not reachable by lookup tables. */
+ row1 =
+ vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_4567.val[0]),
+ 0), row1, 2);
+ row2 =
+ vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_0123.val[1]),
+ 4), row2, 0);
+ row2 =
+ vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_4567.val[2]),
+ 0), row2, 5);
+ row5 =
+ vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_0123.val[1]),
+ 7), row5, 2);
+ row5 =
+ vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_4567.val[2]),
+ 3), row5, 7);
+ row6 =
+ vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_0123.val[3]),
+ 7), row6, 5);
+
+ /* DCT block is now in zig-zag order; start Huffman encoding process. */
+
+ /* Construct bitmap to accelerate encoding of AC coefficients. A set bit
+ * means that the corresponding coefficient != 0.
+ */
+ uint16x8_t row0_ne_0 = vtstq_s16(row0, row0);
+ uint16x8_t row1_ne_0 = vtstq_s16(row1, row1);
+ uint16x8_t row2_ne_0 = vtstq_s16(row2, row2);
+ uint16x8_t row3_ne_0 = vtstq_s16(row3, row3);
+ uint16x8_t row4_ne_0 = vtstq_s16(row4, row4);
+ uint16x8_t row5_ne_0 = vtstq_s16(row5, row5);
+ uint16x8_t row6_ne_0 = vtstq_s16(row6, row6);
+ uint16x8_t row7_ne_0 = vtstq_s16(row7, row7);
+
+ uint8x16_t row10_ne_0 = vuzp1q_u8(vreinterpretq_u8_u16(row1_ne_0),
+ vreinterpretq_u8_u16(row0_ne_0));
+ uint8x16_t row32_ne_0 = vuzp1q_u8(vreinterpretq_u8_u16(row3_ne_0),
+ vreinterpretq_u8_u16(row2_ne_0));
+ uint8x16_t row54_ne_0 = vuzp1q_u8(vreinterpretq_u8_u16(row5_ne_0),
+ vreinterpretq_u8_u16(row4_ne_0));
+ uint8x16_t row76_ne_0 = vuzp1q_u8(vreinterpretq_u8_u16(row7_ne_0),
+ vreinterpretq_u8_u16(row6_ne_0));
+
+ /* { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 } */
+ const uint8x16_t bitmap_mask =
+ vreinterpretq_u8_u64(vdupq_n_u64(0x0102040810204080));
+
+ uint8x16_t bitmap_rows_10 = vandq_u8(row10_ne_0, bitmap_mask);
+ uint8x16_t bitmap_rows_32 = vandq_u8(row32_ne_0, bitmap_mask);
+ uint8x16_t bitmap_rows_54 = vandq_u8(row54_ne_0, bitmap_mask);
+ uint8x16_t bitmap_rows_76 = vandq_u8(row76_ne_0, bitmap_mask);
+
+ uint8x16_t bitmap_rows_3210 = vpaddq_u8(bitmap_rows_32, bitmap_rows_10);
+ uint8x16_t bitmap_rows_7654 = vpaddq_u8(bitmap_rows_76, bitmap_rows_54);
+ uint8x16_t bitmap_rows_76543210 = vpaddq_u8(bitmap_rows_7654,
+ bitmap_rows_3210);
+ uint8x8_t bitmap_all = vpadd_u8(vget_low_u8(bitmap_rows_76543210),
+ vget_high_u8(bitmap_rows_76543210));
+
+ /* Shift left to remove DC bit. */
+ bitmap_all =
+ vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(bitmap_all), 1));
+ /* Count bits set (number of non-zero coefficients) in bitmap. */
+ unsigned int non_zero_coefficients = vaddv_u8(vcnt_u8(bitmap_all));
+ /* Move bitmap to 64-bit scalar register. */
+ uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
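+  /* With the DC bit shifted out above, bit (63 - n) of the scalar bitmap is
+   * set iff AC coefficient n + 1 (in zig-zag order) is non-zero, so
+   * BUILTIN_CLZLL() below directly yields the run length of zero-valued
+   * coefficients preceding the next non-zero one.
+   */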
+
+ /* Set up state and bit buffer for output bitstream. */
+ working_state *state_ptr = (working_state *)state;
+ int free_bits = state_ptr->cur.free_bits;
+ size_t put_buffer = state_ptr->cur.put_buffer;
+
+ /* Encode DC coefficient. */
+
+  /* For negative coeffs: diff = coeff - 1 = ~abs(coeff) (two's complement) */
+ int16x8_t abs_row0 = vabsq_s16(row0);
+ int16x8_t row0_lz = vclzq_s16(abs_row0);
+ uint16x8_t row0_mask = vshlq_u16(vcltzq_s16(row0), vnegq_s16(row0_lz));
+ uint16x8_t row0_diff = veorq_u16(vreinterpretq_u16_s16(abs_row0), row0_mask);
+ /* Find nbits required to specify sign and amplitude of coefficient. */
+ unsigned int lz = vgetq_lane_u16(vreinterpretq_u16_s16(row0_lz), 0);
+ unsigned int nbits = 16 - lz;
+ /* Emit Huffman-coded symbol and additional diff bits. */
+ unsigned int diff = vgetq_lane_u16(row0_diff, 0);
+ PUT_CODE(dctbl->ehufco[nbits], dctbl->ehufsi[nbits], diff)
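+  /* For example, a DC diff of -3 gives abs_row0[0] = 3 and lz = 14, hence
+   * nbits = 2, row0_mask[0] = 0xFFFF >> 14 = 0x0003, and diff = 3 ^ 3 = 0;
+   * thus the two appended bits are 00, as specified for -3 (F.1.2.1).
+   */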
+
+ /* Encode AC coefficients. */
+
+ unsigned int r = 0; /* r = run length of zeros */
+ unsigned int i = 1; /* i = number of coefficients encoded */
+ /* Code and size information for a run length of 16 zero coefficients */
+ const unsigned int code_0xf0 = actbl->ehufco[0xf0];
+ const unsigned int size_0xf0 = actbl->ehufsi[0xf0];
+
+ /* The most efficient method of computing nbits and diff depends on the
+ * number of non-zero coefficients. If the bitmap is not too sparse (> 8
+ * non-zero AC coefficients), it is beneficial to do all of the work using
+   * Neon; otherwise, we do some of the work using Neon and compute the rest
+   * on demand using scalar code.
+ */
+ if (non_zero_coefficients > 8) {
+ uint8_t block_nbits[DCTSIZE2];
+
+ int16x8_t abs_row1 = vabsq_s16(row1);
+ int16x8_t abs_row2 = vabsq_s16(row2);
+ int16x8_t abs_row3 = vabsq_s16(row3);
+ int16x8_t abs_row4 = vabsq_s16(row4);
+ int16x8_t abs_row5 = vabsq_s16(row5);
+ int16x8_t abs_row6 = vabsq_s16(row6);
+ int16x8_t abs_row7 = vabsq_s16(row7);
+ int16x8_t row1_lz = vclzq_s16(abs_row1);
+ int16x8_t row2_lz = vclzq_s16(abs_row2);
+ int16x8_t row3_lz = vclzq_s16(abs_row3);
+ int16x8_t row4_lz = vclzq_s16(abs_row4);
+ int16x8_t row5_lz = vclzq_s16(abs_row5);
+ int16x8_t row6_lz = vclzq_s16(abs_row6);
+ int16x8_t row7_lz = vclzq_s16(abs_row7);
+ /* Narrow leading zero count to 8 bits. */
+ uint8x16_t row01_lz = vuzp1q_u8(vreinterpretq_u8_s16(row0_lz),
+ vreinterpretq_u8_s16(row1_lz));
+ uint8x16_t row23_lz = vuzp1q_u8(vreinterpretq_u8_s16(row2_lz),
+ vreinterpretq_u8_s16(row3_lz));
+ uint8x16_t row45_lz = vuzp1q_u8(vreinterpretq_u8_s16(row4_lz),
+ vreinterpretq_u8_s16(row5_lz));
+ uint8x16_t row67_lz = vuzp1q_u8(vreinterpretq_u8_s16(row6_lz),
+ vreinterpretq_u8_s16(row7_lz));
+ /* Compute nbits needed to specify magnitude of each coefficient. */
+ uint8x16_t row01_nbits = vsubq_u8(vdupq_n_u8(16), row01_lz);
+ uint8x16_t row23_nbits = vsubq_u8(vdupq_n_u8(16), row23_lz);
+ uint8x16_t row45_nbits = vsubq_u8(vdupq_n_u8(16), row45_lz);
+ uint8x16_t row67_nbits = vsubq_u8(vdupq_n_u8(16), row67_lz);
+ /* Store nbits. */
+ vst1q_u8(block_nbits + 0 * DCTSIZE, row01_nbits);
+ vst1q_u8(block_nbits + 2 * DCTSIZE, row23_nbits);
+ vst1q_u8(block_nbits + 4 * DCTSIZE, row45_nbits);
+ vst1q_u8(block_nbits + 6 * DCTSIZE, row67_nbits);
+ /* Mask bits not required to specify sign and amplitude of diff. */
+ uint16x8_t row1_mask = vshlq_u16(vcltzq_s16(row1), vnegq_s16(row1_lz));
+ uint16x8_t row2_mask = vshlq_u16(vcltzq_s16(row2), vnegq_s16(row2_lz));
+ uint16x8_t row3_mask = vshlq_u16(vcltzq_s16(row3), vnegq_s16(row3_lz));
+ uint16x8_t row4_mask = vshlq_u16(vcltzq_s16(row4), vnegq_s16(row4_lz));
+ uint16x8_t row5_mask = vshlq_u16(vcltzq_s16(row5), vnegq_s16(row5_lz));
+ uint16x8_t row6_mask = vshlq_u16(vcltzq_s16(row6), vnegq_s16(row6_lz));
+ uint16x8_t row7_mask = vshlq_u16(vcltzq_s16(row7), vnegq_s16(row7_lz));
+ /* diff = abs(coeff) ^ sign(coeff) [no-op for positive coefficients] */
+ uint16x8_t row1_diff = veorq_u16(vreinterpretq_u16_s16(abs_row1),
+ row1_mask);
+ uint16x8_t row2_diff = veorq_u16(vreinterpretq_u16_s16(abs_row2),
+ row2_mask);
+ uint16x8_t row3_diff = veorq_u16(vreinterpretq_u16_s16(abs_row3),
+ row3_mask);
+ uint16x8_t row4_diff = veorq_u16(vreinterpretq_u16_s16(abs_row4),
+ row4_mask);
+ uint16x8_t row5_diff = veorq_u16(vreinterpretq_u16_s16(abs_row5),
+ row5_mask);
+ uint16x8_t row6_diff = veorq_u16(vreinterpretq_u16_s16(abs_row6),
+ row6_mask);
+ uint16x8_t row7_diff = veorq_u16(vreinterpretq_u16_s16(abs_row7),
+ row7_mask);
+ /* Store diff bits. */
+ vst1q_u16(block_diff + 0 * DCTSIZE, row0_diff);
+ vst1q_u16(block_diff + 1 * DCTSIZE, row1_diff);
+ vst1q_u16(block_diff + 2 * DCTSIZE, row2_diff);
+ vst1q_u16(block_diff + 3 * DCTSIZE, row3_diff);
+ vst1q_u16(block_diff + 4 * DCTSIZE, row4_diff);
+ vst1q_u16(block_diff + 5 * DCTSIZE, row5_diff);
+ vst1q_u16(block_diff + 6 * DCTSIZE, row6_diff);
+ vst1q_u16(block_diff + 7 * DCTSIZE, row7_diff);
+
+ while (bitmap != 0) {
+ r = BUILTIN_CLZLL(bitmap);
+ i += r;
+ bitmap <<= r;
+ nbits = block_nbits[i];
+ diff = block_diff[i];
+ while (r > 15) {
+ /* If run length > 15, emit special run-length-16 codes. */
+ PUT_BITS(code_0xf0, size_0xf0)
+ r -= 16;
+ }
+ /* Emit Huffman symbol for run length / number of bits. (F.1.2.2.1) */
+ unsigned int rs = (r << 4) + nbits;
+ PUT_CODE(actbl->ehufco[rs], actbl->ehufsi[rs], diff)
+ i++;
+ bitmap <<= 1;
+ }
+ } else if (bitmap != 0) {
+ uint16_t block_abs[DCTSIZE2];
+ /* Compute and store absolute value of coefficients. */
+ int16x8_t abs_row1 = vabsq_s16(row1);
+ int16x8_t abs_row2 = vabsq_s16(row2);
+ int16x8_t abs_row3 = vabsq_s16(row3);
+ int16x8_t abs_row4 = vabsq_s16(row4);
+ int16x8_t abs_row5 = vabsq_s16(row5);
+ int16x8_t abs_row6 = vabsq_s16(row6);
+ int16x8_t abs_row7 = vabsq_s16(row7);
+ vst1q_u16(block_abs + 0 * DCTSIZE, vreinterpretq_u16_s16(abs_row0));
+ vst1q_u16(block_abs + 1 * DCTSIZE, vreinterpretq_u16_s16(abs_row1));
+ vst1q_u16(block_abs + 2 * DCTSIZE, vreinterpretq_u16_s16(abs_row2));
+ vst1q_u16(block_abs + 3 * DCTSIZE, vreinterpretq_u16_s16(abs_row3));
+ vst1q_u16(block_abs + 4 * DCTSIZE, vreinterpretq_u16_s16(abs_row4));
+ vst1q_u16(block_abs + 5 * DCTSIZE, vreinterpretq_u16_s16(abs_row5));
+ vst1q_u16(block_abs + 6 * DCTSIZE, vreinterpretq_u16_s16(abs_row6));
+ vst1q_u16(block_abs + 7 * DCTSIZE, vreinterpretq_u16_s16(abs_row7));
+ /* Compute diff bits (without nbits mask) and store. */
+ uint16x8_t row1_diff = veorq_u16(vreinterpretq_u16_s16(abs_row1),
+ vcltzq_s16(row1));
+ uint16x8_t row2_diff = veorq_u16(vreinterpretq_u16_s16(abs_row2),
+ vcltzq_s16(row2));
+ uint16x8_t row3_diff = veorq_u16(vreinterpretq_u16_s16(abs_row3),
+ vcltzq_s16(row3));
+ uint16x8_t row4_diff = veorq_u16(vreinterpretq_u16_s16(abs_row4),
+ vcltzq_s16(row4));
+ uint16x8_t row5_diff = veorq_u16(vreinterpretq_u16_s16(abs_row5),
+ vcltzq_s16(row5));
+ uint16x8_t row6_diff = veorq_u16(vreinterpretq_u16_s16(abs_row6),
+ vcltzq_s16(row6));
+ uint16x8_t row7_diff = veorq_u16(vreinterpretq_u16_s16(abs_row7),
+ vcltzq_s16(row7));
+ vst1q_u16(block_diff + 0 * DCTSIZE, row0_diff);
+ vst1q_u16(block_diff + 1 * DCTSIZE, row1_diff);
+ vst1q_u16(block_diff + 2 * DCTSIZE, row2_diff);
+ vst1q_u16(block_diff + 3 * DCTSIZE, row3_diff);
+ vst1q_u16(block_diff + 4 * DCTSIZE, row4_diff);
+ vst1q_u16(block_diff + 5 * DCTSIZE, row5_diff);
+ vst1q_u16(block_diff + 6 * DCTSIZE, row6_diff);
+ vst1q_u16(block_diff + 7 * DCTSIZE, row7_diff);
+
+ /* Same as above but must mask diff bits and compute nbits on demand. */
+ while (bitmap != 0) {
+ r = BUILTIN_CLZLL(bitmap);
+ i += r;
+ bitmap <<= r;
+ lz = BUILTIN_CLZ(block_abs[i]);
+ nbits = 32 - lz;
+ diff = ((unsigned int)block_diff[i] << lz) >> lz;
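+      /* The left shift followed by a logical right shift by lz clears the
+       * high-order bits, keeping only the nbits significant bits of diff. */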
+ while (r > 15) {
+ /* If run length > 15, emit special run-length-16 codes. */
+ PUT_BITS(code_0xf0, size_0xf0)
+ r -= 16;
+ }
+ /* Emit Huffman symbol for run length / number of bits. (F.1.2.2.1) */
+ unsigned int rs = (r << 4) + nbits;
+ PUT_CODE(actbl->ehufco[rs], actbl->ehufsi[rs], diff)
+ i++;
+ bitmap <<= 1;
+ }
+ }
+
+ /* If the last coefficient(s) were zero, emit an end-of-block (EOB) code.
+ * The value of RS for the EOB code is 0.
+ */
+ if (i != 64) {
+ PUT_BITS(actbl->ehufco[0], actbl->ehufsi[0])
+ }
+
+ state_ptr->cur.put_buffer = put_buffer;
+ state_ptr->cur.free_bits = free_bits;
+
+ return buffer;
+}
diff --git a/media/libjpeg/simd/arm/aarch64/jsimd.c b/media/libjpeg/simd/arm/aarch64/jsimd.c
new file mode 100644
index 0000000000..604d5472f6
--- /dev/null
+++ b/media/libjpeg/simd/arm/aarch64/jsimd.c
@@ -0,0 +1,1058 @@
+/*
+ * jsimd_arm64.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2011, Nokia Corporation and/or its subsidiary(-ies).
+ * Copyright (C) 2009-2011, 2013-2014, 2016, 2018, 2020, 2022, D. R. Commander.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
+ * Copyright (C) 2020, Arm Limited.
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on a
+ * 64-bit Arm architecture.
+ */
+
+#define JPEG_INTERNALS
+#include "../../../jinclude.h"
+#include "../../../jpeglib.h"
+#include "../../../jsimd.h"
+#include "../../../jdct.h"
+#include "../../../jsimddct.h"
+#include "../../jsimd.h"
+#include "jconfigint.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+
+#define JSIMD_FASTLD3 1
+#define JSIMD_FASTST3 2
+#define JSIMD_FASTTBL 4
+
+static unsigned int simd_support = ~0;
+static unsigned int simd_huffman = 1;
+static unsigned int simd_features = JSIMD_FASTLD3 | JSIMD_FASTST3 |
+ JSIMD_FASTTBL;
+
+#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
+
+#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT (1024 * 1024)
+
+LOCAL(int)
+check_cpuinfo(char *buffer, const char *field, char *value)
+{
+ char *p;
+
+ if (*value == 0)
+ return 0;
+ if (strncmp(buffer, field, strlen(field)) != 0)
+ return 0;
+ buffer += strlen(field);
+ while (isspace(*buffer))
+ buffer++;
+
+ /* Check if 'value' is present in the buffer as a separate word */
+ while ((p = strstr(buffer, value))) {
+ if (p > buffer && !isspace(*(p - 1))) {
+ buffer++;
+ continue;
+ }
+ p += strlen(value);
+ if (*p != 0 && !isspace(*p)) {
+ buffer++;
+ continue;
+ }
+ return 1;
+ }
+ return 0;
+}
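+
+/* For example, given the /proc/cpuinfo line "CPU part : 0xd03",
+ * check_cpuinfo(buffer, "CPU part", "0xd03") returns 1, whereas a value that
+ * occurs only as a substring of a longer token does not match.
+ */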
+
+LOCAL(int)
+parse_proc_cpuinfo(int bufsize)
+{
+ char *buffer = (char *)malloc(bufsize);
+ FILE *fd;
+
+ if (!buffer)
+ return 0;
+
+ fd = fopen("/proc/cpuinfo", "r");
+ if (fd) {
+ while (fgets(buffer, bufsize, fd)) {
+ if (!strchr(buffer, '\n') && !feof(fd)) {
+      /* "impossible" happened - the line did not fit in the buffer, so give
+         up and let the caller retry with a bigger one */
+ fclose(fd);
+ free(buffer);
+ return 0;
+ }
+ if (check_cpuinfo(buffer, "CPU part", "0xd03") ||
+ check_cpuinfo(buffer, "CPU part", "0xd07"))
+ /* The Cortex-A53 has a slow tbl implementation. We can gain a few
+ percent speedup by disabling the use of that instruction. The
+ speedup on Cortex-A57 is more subtle but still measurable. */
+ simd_features &= ~JSIMD_FASTTBL;
+ else if (check_cpuinfo(buffer, "CPU part", "0x0a1"))
+ /* The SIMD version of Huffman encoding is slower than the C version on
+ Cavium ThunderX. Also, ld3 and st3 are abyssmally slow on that
+           Cavium ThunderX.  Also, ld3 and st3 are abysmally slow on that
+ simd_huffman = simd_features = 0;
+ }
+ fclose(fd);
+ }
+ free(buffer);
+ return 1;
+}
+
+#endif
+
+/*
+ * Check what SIMD accelerations are supported.
+ *
+ * FIXME: This code is racy in a multithreaded environment.
+ */
+
+/*
+ * Armv8 architectures support the Neon extension by default;
+ * unlike on Armv7, it is not optional.
+ */
+
+
+LOCAL(void)
+init_simd(void)
+{
+#ifndef NO_GETENV
+ char env[2] = { 0 };
+#endif
+#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
+ int bufsize = 1024; /* an initial guess for the line buffer size limit */
+#endif
+
+ if (simd_support != ~0U)
+ return;
+
+ simd_support = 0;
+
+ simd_support |= JSIMD_NEON;
+#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
+ while (!parse_proc_cpuinfo(bufsize)) {
+ bufsize *= 2;
+ if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT)
+ break;
+ }
+#endif
+
+#ifndef NO_GETENV
+ /* Force different settings through environment variables */
+ if (!GETENV_S(env, 2, "JSIMD_FORCENEON") && !strcmp(env, "1"))
+ simd_support = JSIMD_NEON;
+ if (!GETENV_S(env, 2, "JSIMD_FORCENONE") && !strcmp(env, "1"))
+ simd_support = 0;
+ if (!GETENV_S(env, 2, "JSIMD_NOHUFFENC") && !strcmp(env, "1"))
+ simd_huffman = 0;
+ if (!GETENV_S(env, 2, "JSIMD_FASTLD3") && !strcmp(env, "1"))
+ simd_features |= JSIMD_FASTLD3;
+ if (!GETENV_S(env, 2, "JSIMD_FASTLD3") && !strcmp(env, "0"))
+ simd_features &= ~JSIMD_FASTLD3;
+ if (!GETENV_S(env, 2, "JSIMD_FASTST3") && !strcmp(env, "1"))
+ simd_features |= JSIMD_FASTST3;
+ if (!GETENV_S(env, 2, "JSIMD_FASTST3") && !strcmp(env, "0"))
+ simd_features &= ~JSIMD_FASTST3;
+#endif
+}
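+
+/* A usage note: setting JSIMD_FORCENONE=1 in the environment disables all
+ * SIMD acceleration (handy when bisecting suspected SIMD bugs), while
+ * JSIMD_NOHUFFENC=1 disables only the SIMD Huffman encoder.
+ */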
+
+GLOBAL(int)
+jsimd_can_rgb_ycc(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_rgb_gray(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb565(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
+{
+ void (*neonfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+ switch (cinfo->in_color_space) {
+ case JCS_EXT_RGB:
+#ifndef NEON_INTRINSICS
+ if (simd_features & JSIMD_FASTLD3)
+#endif
+ neonfct = jsimd_extrgb_ycc_convert_neon;
+#ifndef NEON_INTRINSICS
+ else
+ neonfct = jsimd_extrgb_ycc_convert_neon_slowld3;
+#endif
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ neonfct = jsimd_extrgbx_ycc_convert_neon;
+ break;
+ case JCS_EXT_BGR:
+#ifndef NEON_INTRINSICS
+ if (simd_features & JSIMD_FASTLD3)
+#endif
+ neonfct = jsimd_extbgr_ycc_convert_neon;
+#ifndef NEON_INTRINSICS
+ else
+ neonfct = jsimd_extbgr_ycc_convert_neon_slowld3;
+#endif
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ neonfct = jsimd_extbgrx_ycc_convert_neon;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ neonfct = jsimd_extxbgr_ycc_convert_neon;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ neonfct = jsimd_extxrgb_ycc_convert_neon;
+ break;
+ default:
+#ifndef NEON_INTRINSICS
+ if (simd_features & JSIMD_FASTLD3)
+#endif
+ neonfct = jsimd_extrgb_ycc_convert_neon;
+#ifndef NEON_INTRINSICS
+ else
+ neonfct = jsimd_extrgb_ycc_convert_neon_slowld3;
+#endif
+ break;
+ }
+
+ neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
+{
+ void (*neonfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+ switch (cinfo->in_color_space) {
+ case JCS_EXT_RGB:
+ neonfct = jsimd_extrgb_gray_convert_neon;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ neonfct = jsimd_extrgbx_gray_convert_neon;
+ break;
+ case JCS_EXT_BGR:
+ neonfct = jsimd_extbgr_gray_convert_neon;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ neonfct = jsimd_extbgrx_gray_convert_neon;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ neonfct = jsimd_extxbgr_gray_convert_neon;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ neonfct = jsimd_extxrgb_gray_convert_neon;
+ break;
+ default:
+ neonfct = jsimd_extrgb_gray_convert_neon;
+ break;
+ }
+
+ neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
+{
+ void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+
+ switch (cinfo->out_color_space) {
+ case JCS_EXT_RGB:
+#ifndef NEON_INTRINSICS
+ if (simd_features & JSIMD_FASTST3)
+#endif
+ neonfct = jsimd_ycc_extrgb_convert_neon;
+#ifndef NEON_INTRINSICS
+ else
+ neonfct = jsimd_ycc_extrgb_convert_neon_slowst3;
+#endif
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ neonfct = jsimd_ycc_extrgbx_convert_neon;
+ break;
+ case JCS_EXT_BGR:
+#ifndef NEON_INTRINSICS
+ if (simd_features & JSIMD_FASTST3)
+#endif
+ neonfct = jsimd_ycc_extbgr_convert_neon;
+#ifndef NEON_INTRINSICS
+ else
+ neonfct = jsimd_ycc_extbgr_convert_neon_slowst3;
+#endif
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ neonfct = jsimd_ycc_extbgrx_convert_neon;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ neonfct = jsimd_ycc_extxbgr_convert_neon;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ neonfct = jsimd_ycc_extxrgb_convert_neon;
+ break;
+ default:
+#ifndef NEON_INTRINSICS
+ if (simd_features & JSIMD_FASTST3)
+#endif
+ neonfct = jsimd_ycc_extrgb_convert_neon;
+#ifndef NEON_INTRINSICS
+ else
+ neonfct = jsimd_ycc_extrgb_convert_neon_slowst3;
+#endif
+ break;
+ }
+
+ neonfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
+{
+ jsimd_ycc_rgb565_convert_neon(cinfo->output_width, input_buf, input_row,
+ output_buf, num_rows);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_downsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_downsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+ jsimd_h2v2_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor,
+ compptr->v_samp_factor, compptr->width_in_blocks,
+ input_data, output_data);
+}
+
+GLOBAL(void)
+jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+ jsimd_h2v1_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor,
+ compptr->v_samp_factor, compptr->width_in_blocks,
+ input_data, output_data);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h2v2_upsample_neon(cinfo->max_v_samp_factor, cinfo->output_width,
+ input_data, output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h2v1_upsample_neon(cinfo->max_v_samp_factor, cinfo->output_width,
+ input_data, output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h1v2_fancy_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h2v2_fancy_upsample_neon(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data,
+ output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h2v1_fancy_upsample_neon(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data,
+ output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h1v2_fancy_upsample_neon(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data,
+ output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+ void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+ switch (cinfo->out_color_space) {
+ case JCS_EXT_RGB:
+ neonfct = jsimd_h2v2_extrgb_merged_upsample_neon;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ neonfct = jsimd_h2v2_extrgbx_merged_upsample_neon;
+ break;
+ case JCS_EXT_BGR:
+ neonfct = jsimd_h2v2_extbgr_merged_upsample_neon;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ neonfct = jsimd_h2v2_extbgrx_merged_upsample_neon;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ neonfct = jsimd_h2v2_extxbgr_merged_upsample_neon;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ neonfct = jsimd_h2v2_extxrgb_merged_upsample_neon;
+ break;
+ default:
+ neonfct = jsimd_h2v2_extrgb_merged_upsample_neon;
+ break;
+ }
+
+ neonfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(void)
+jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+ void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+ switch (cinfo->out_color_space) {
+ case JCS_EXT_RGB:
+ neonfct = jsimd_h2v1_extrgb_merged_upsample_neon;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ neonfct = jsimd_h2v1_extrgbx_merged_upsample_neon;
+ break;
+ case JCS_EXT_BGR:
+ neonfct = jsimd_h2v1_extbgr_merged_upsample_neon;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ neonfct = jsimd_h2v1_extbgrx_merged_upsample_neon;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ neonfct = jsimd_h2v1_extxbgr_merged_upsample_neon;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ neonfct = jsimd_h2v1_extxrgb_merged_upsample_neon;
+ break;
+ default:
+ neonfct = jsimd_h2v1_extrgb_merged_upsample_neon;
+ break;
+ }
+
+ neonfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(int)
+jsimd_can_convsamp(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_convsamp_float(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
+ DCTELEM *workspace)
+{
+ jsimd_convsamp_neon(sample_data, start_col, workspace);
+}
+
+GLOBAL(void)
+jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
+ FAST_FLOAT *workspace)
+{
+}
+
+GLOBAL(int)
+jsimd_can_fdct_islow(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_ifast(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_float(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_fdct_islow(DCTELEM *data)
+{
+ jsimd_fdct_islow_neon(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_ifast(DCTELEM *data)
+{
+ jsimd_fdct_ifast_neon(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_float(FAST_FLOAT *data)
+{
+}
+
+GLOBAL(int)
+jsimd_can_quantize(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_quantize_float(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
+{
+ jsimd_quantize_neon(coef_block, divisors, workspace);
+}
+
+GLOBAL(void)
+jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+ FAST_FLOAT *workspace)
+{
+}
+
+GLOBAL(int)
+jsimd_can_idct_2x2(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(ISLOW_MULT_TYPE) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_4x4(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(ISLOW_MULT_TYPE) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ jsimd_idct_2x2_neon(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ jsimd_idct_4x4_neon(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(int)
+jsimd_can_idct_islow(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(ISLOW_MULT_TYPE) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_ifast(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(IFAST_MULT_TYPE) != 2)
+ return 0;
+ if (IFAST_SCALE_BITS != 2)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_float(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ jsimd_idct_islow_neon(compptr->dct_table, coef_block, output_buf,
+ output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ jsimd_idct_ifast_neon(compptr->dct_table, coef_block, output_buf,
+ output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+}
+
+GLOBAL(int)
+jsimd_can_huff_encode_one_block(void)
+{
+ init_simd();
+
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_NEON && simd_huffman)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(JOCTET *)
+jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
+ int last_dc_val, c_derived_tbl *dctbl,
+ c_derived_tbl *actbl)
+{
+#ifndef NEON_INTRINSICS
+ if (simd_features & JSIMD_FASTTBL)
+#endif
+ return jsimd_huff_encode_one_block_neon(state, buffer, block, last_dc_val,
+ dctbl, actbl);
+#ifndef NEON_INTRINSICS
+ else
+ return jsimd_huff_encode_one_block_neon_slowtbl(state, buffer, block,
+ last_dc_val, dctbl, actbl);
+#endif
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_first_prepare(void)
+{
+ init_simd();
+
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (SIZEOF_SIZE_T != 8)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, JCOEF *values, size_t *zerobits)
+{
+ jsimd_encode_mcu_AC_first_prepare_neon(block, jpeg_natural_order_start,
+ Sl, Al, values, zerobits);
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_refine_prepare(void)
+{
+ init_simd();
+
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (SIZEOF_SIZE_T != 8)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, JCOEF *absvalues, size_t *bits)
+{
+ return jsimd_encode_mcu_AC_refine_prepare_neon(block,
+ jpeg_natural_order_start,
+ Sl, Al, absvalues, bits);
+}
diff --git a/media/libjpeg/simd/jsimd_arm64_neon.S b/media/libjpeg/simd/arm/aarch64/jsimd_neon.S
index 3309858241..738a4f0658 100644
--- a/media/libjpeg/simd/jsimd_arm64_neon.S
+++ b/media/libjpeg/simd/arm/aarch64/jsimd_neon.S
@@ -1,13 +1,13 @@
/*
- * ARMv8 NEON optimizations for libjpeg-turbo
+ * Armv8 Neon optimizations for libjpeg-turbo
*
* Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies).
- * All Rights Reserved.
- * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
+ * All Rights Reserved.
+ * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
* Copyright (C) 2013-2014, Linaro Limited. All Rights Reserved.
- * Author: Ragesh Radhakrishnan <ragesh.r@linaro.org>
- * Copyright (C) 2014-2016, D. R. Commander. All Rights Reserved.
- * Copyright (C) 2015-2016, Matthieu Darbois. All Rights Reserved.
+ * Author: Ragesh Radhakrishnan <ragesh.r@linaro.org>
+ * Copyright (C) 2014-2016, 2020, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois. All Rights Reserved.
* Copyright (C) 2016, Siarhei Siamashka. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
@@ -31,10 +31,158 @@
.section .note.GNU-stack, "", %progbits /* mark stack as non-executable */
#endif
-.text
+#if defined(__APPLE__)
+.section __DATA, __const
+#elif defined(_WIN32)
+.section .rdata
+#else
+.section .rodata, "a", %progbits
+#endif
+/* Constants for jsimd_idct_islow_neon() */
+
+#define F_0_298 2446 /* FIX(0.298631336) */
+#define F_0_390 3196 /* FIX(0.390180644) */
+#define F_0_541 4433 /* FIX(0.541196100) */
+#define F_0_765 6270 /* FIX(0.765366865) */
+#define F_0_899 7373 /* FIX(0.899976223) */
+#define F_1_175 9633 /* FIX(1.175875602) */
+#define F_1_501 12299 /* FIX(1.501321110) */
+#define F_1_847 15137 /* FIX(1.847759065) */
+#define F_1_961 16069 /* FIX(1.961570560) */
+#define F_2_053 16819 /* FIX(2.053119869) */
+#define F_2_562 20995 /* FIX(2.562915447) */
+#define F_3_072 25172 /* FIX(3.072711026) */
+
+.balign 16
+Ljsimd_idct_islow_neon_consts:
+  .short F_0_298
+  .short -F_0_390
+  .short F_0_541
+  .short F_0_765
+  .short -F_0_899
+  .short F_1_175
+  .short F_1_501
+  .short -F_1_847
+  .short -F_1_961
+  .short F_2_053
+  .short -F_2_562
+  .short F_3_072
+ .short 0 /* padding */
+ .short 0
+ .short 0
+ .short 0
-#define RESPECT_STRICT_ALIGNMENT 1
+#undef F_0_298
+#undef F_0_390
+#undef F_0_541
+#undef F_0_765
+#undef F_0_899
+#undef F_1_175
+#undef F_1_501
+#undef F_1_847
+#undef F_1_961
+#undef F_2_053
+#undef F_2_562
+#undef F_3_072
+
+/* Constants for jsimd_ycc_*_neon() */
+
+.balign 16
+Ljsimd_ycc_rgb_neon_consts:
+ .short 0, 0, 0, 0
+ .short 22971, -11277, -23401, 29033
+ .short -128, -128, -128, -128
+ .short -128, -128, -128, -128
+
+/* Constants for jsimd_*_ycc_neon() */
+
+.balign 16
+Ljsimd_rgb_ycc_neon_consts:
+ .short 19595, 38470, 7471, 11059
+ .short 21709, 32768, 27439, 5329
+ .short 32767, 128, 32767, 128
+ .short 32767, 128, 32767, 128
+
+/* Constants for jsimd_fdct_islow_neon() */
+
+#define F_0_298 2446 /* FIX(0.298631336) */
+#define F_0_390 3196 /* FIX(0.390180644) */
+#define F_0_541 4433 /* FIX(0.541196100) */
+#define F_0_765 6270 /* FIX(0.765366865) */
+#define F_0_899 7373 /* FIX(0.899976223) */
+#define F_1_175 9633 /* FIX(1.175875602) */
+#define F_1_501 12299 /* FIX(1.501321110) */
+#define F_1_847 15137 /* FIX(1.847759065) */
+#define F_1_961 16069 /* FIX(1.961570560) */
+#define F_2_053 16819 /* FIX(2.053119869) */
+#define F_2_562 20995 /* FIX(2.562915447) */
+#define F_3_072 25172 /* FIX(3.072711026) */
+
+.balign 16
+Ljsimd_fdct_islow_neon_consts:
+  .short F_0_298
+  .short -F_0_390
+  .short F_0_541
+  .short F_0_765
+  .short -F_0_899
+  .short F_1_175
+  .short F_1_501
+  .short -F_1_847
+  .short -F_1_961
+  .short F_2_053
+  .short -F_2_562
+  .short F_3_072
+ .short 0 /* padding */
+ .short 0
+ .short 0
+ .short 0
+
+#undef F_0_298
+#undef F_0_390
+#undef F_0_541
+#undef F_0_765
+#undef F_0_899
+#undef F_1_175
+#undef F_1_501
+#undef F_1_847
+#undef F_1_961
+#undef F_2_053
+#undef F_2_562
+#undef F_3_072
+
+/* Constants for jsimd_huff_encode_one_block_neon() */
+
+.balign 16
+Ljsimd_huff_encode_one_block_neon_consts:
+ .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \
+ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
+ .byte 0, 1, 2, 3, 16, 17, 32, 33, \
+ 18, 19, 4, 5, 6, 7, 20, 21 /* L0 => L3 : 4 lines OK */
+ .byte 34, 35, 48, 49, 255, 255, 50, 51, \
+ 36, 37, 22, 23, 8, 9, 10, 11 /* L0 => L3 : 4 lines OK */
+ .byte 8, 9, 22, 23, 36, 37, 50, 51, \
+ 255, 255, 255, 255, 255, 255, 52, 53 /* L1 => L4 : 4 lines OK */
+ .byte 54, 55, 40, 41, 26, 27, 12, 13, \
+ 14, 15, 28, 29, 42, 43, 56, 57 /* L0 => L3 : 4 lines OK */
+ .byte 6, 7, 20, 21, 34, 35, 48, 49, \
+ 50, 51, 36, 37, 22, 23, 8, 9 /* L4 => L7 : 4 lines OK */
+ .byte 42, 43, 28, 29, 14, 15, 30, 31, \
+ 44, 45, 58, 59, 255, 255, 255, 255 /* L1 => L4 : 4 lines OK */
+ .byte 255, 255, 255, 255, 56, 57, 42, 43, \
+ 28, 29, 14, 15, 30, 31, 44, 45 /* L3 => L6 : 4 lines OK */
+ .byte 26, 27, 40, 41, 42, 43, 28, 29, \
+ 14, 15, 30, 31, 44, 45, 46, 47 /* L5 => L7 : 3 lines OK */
+ .byte 255, 255, 255, 255, 0, 1, 255, 255, \
+        255, 255, 255, 255, 255, 255, 255, 255  /* L4 : 1 line OK */
+ .byte 255, 255, 255, 255, 255, 255, 255, 255, \
+ 0, 1, 16, 17, 2, 3, 255, 255 /* L5 => L6 : 2 lines OK */
+ .byte 255, 255, 255, 255, 255, 255, 255, 255, \
+ 255, 255, 255, 255, 8, 9, 22, 23 /* L5 => L6 : 2 lines OK */
+ .byte 4, 5, 6, 7, 255, 255, 255, 255, \
+ 255, 255, 255, 255, 255, 255, 255, 255 /* L7 : 1 line OK */
+
+.text
/*****************************************************************************/
@@ -42,6 +190,7 @@
/* Supplementary macro for setting function attributes */
.macro asm_function fname
#ifdef __APPLE__
+ .private_extern _\fname
.globl _\fname
_\fname:
#else
@@ -54,43 +203,15 @@ _\fname:
#endif
.endm
-/* Transpose elements of single 128 bit registers */
-.macro transpose_single x0, x1, xi, xilen, literal
- ins \xi\xilen[0], \x0\xilen[0]
- ins \x1\xilen[0], \x0\xilen[1]
- trn1 \x0\literal, \x0\literal, \x1\literal
- trn2 \x1\literal, \xi\literal, \x1\literal
-.endm
-
-/* Transpose elements of 2 differnet registers */
-.macro transpose x0, x1, xi, xilen, literal
- mov \xi\xilen, \x0\xilen
- trn1 \x0\literal, \x0\literal, \x1\literal
- trn2 \x1\literal, \xi\literal, \x1\literal
-.endm
-
-/* Transpose a block of 4x4 coefficients in four 64-bit registers */
-.macro transpose_4x4_32 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen
- mov \xi\xilen, \x0\xilen
- trn1 \x0\x0len, \x0\x0len, \x2\x2len
- trn2 \x2\x2len, \xi\x0len, \x2\x2len
- mov \xi\xilen, \x1\xilen
- trn1 \x1\x1len, \x1\x1len, \x3\x3len
- trn2 \x3\x3len, \xi\x1len, \x3\x3len
-.endm
-
-.macro transpose_4x4_16 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen
- mov \xi\xilen, \x0\xilen
- trn1 \x0\x0len, \x0\x0len, \x1\x1len
- trn2 \x1\x2len, \xi\x0len, \x1\x2len
- mov \xi\xilen, \x2\xilen
- trn1 \x2\x2len, \x2\x2len, \x3\x3len
- trn2 \x3\x2len, \xi\x1len, \x3\x3len
-.endm
-
-.macro transpose_4x4 x0, x1, x2, x3, x5
- transpose_4x4_16 \x0, .4h, \x1, .4h, \x2, .4h, \x3, .4h, \x5, .16b
- transpose_4x4_32 \x0, .2s, \x1, .2s, \x2, .2s, \x3, .2s, \x5, .16b
+/* Get symbol location */
+.macro get_symbol_loc reg, symbol
+#ifdef __APPLE__
+ adrp \reg, \symbol@PAGE
+ add \reg, \reg, \symbol@PAGEOFF
+#else
+ adrp \reg, \symbol
+ add \reg, \reg, :lo12:\symbol
+#endif
.endm
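+
+/* adrp/add (@PAGE/@PAGEOFF on Apple platforms) can materialize the address
+ * of a symbol in any section within a +/-4 GiB range, whereas the plain adr
+ * used previously reaches only +/-1 MiB and therefore required the constants
+ * to stay in .text next to the code.
+ */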
.macro transpose_8x8 l0, l1, l2, l3, l4, l5, l6, l7, t0, t1, t2, t3
@@ -123,7 +244,7 @@ _\fname:
.endm
-#define CENTERJSAMPLE 128
+#define CENTERJSAMPLE 128
/*****************************************************************************/
@@ -131,70 +252,25 @@ _\fname:
* Perform dequantization and inverse DCT on one block of coefficients.
*
* GLOBAL(void)
- * jsimd_idct_islow_neon (void *dct_table, JCOEFPTR coef_block,
- * JSAMPARRAY output_buf, JDIMENSION output_col)
+ * jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block,
+ * JSAMPARRAY output_buf, JDIMENSION output_col)
*/
-#define CONST_BITS 13
-#define PASS1_BITS 2
-
-#define F_0_298 2446 /* FIX(0.298631336) */
-#define F_0_390 3196 /* FIX(0.390180644) */
-#define F_0_541 4433 /* FIX(0.541196100) */
-#define F_0_765 6270 /* FIX(0.765366865) */
-#define F_0_899 7373 /* FIX(0.899976223) */
-#define F_1_175 9633 /* FIX(1.175875602) */
-#define F_1_501 12299 /* FIX(1.501321110) */
-#define F_1_847 15137 /* FIX(1.847759065) */
-#define F_1_961 16069 /* FIX(1.961570560) */
-#define F_2_053 16819 /* FIX(2.053119869) */
-#define F_2_562 20995 /* FIX(2.562915447) */
-#define F_3_072 25172 /* FIX(3.072711026) */
-
-.balign 16
-Ljsimd_idct_islow_neon_consts:
- .short F_0_298
- .short -F_0_390
- .short F_0_541
- .short F_0_765
- .short - F_0_899
- .short F_1_175
- .short F_1_501
- .short - F_1_847
- .short - F_1_961
- .short F_2_053
- .short - F_2_562
- .short F_3_072
- .short 0 /* padding */
- .short 0
- .short 0
- .short 0
-
-#undef F_0_298
-#undef F_0_390
-#undef F_0_541
-#undef F_0_765
-#undef F_0_899
-#undef F_1_175
-#undef F_1_501
-#undef F_1_847
-#undef F_1_961
-#undef F_2_053
-#undef F_2_562
-#undef F_3_072
-
-#define XFIX_P_0_298 v0.h[0]
-#define XFIX_N_0_390 v0.h[1]
-#define XFIX_P_0_541 v0.h[2]
-#define XFIX_P_0_765 v0.h[3]
-#define XFIX_N_0_899 v0.h[4]
-#define XFIX_P_1_175 v0.h[5]
-#define XFIX_P_1_501 v0.h[6]
-#define XFIX_N_1_847 v0.h[7]
-#define XFIX_N_1_961 v1.h[0]
-#define XFIX_P_2_053 v1.h[1]
-#define XFIX_N_2_562 v1.h[2]
-#define XFIX_P_3_072 v1.h[3]
+#define CONST_BITS 13
+#define PASS1_BITS 2
+
+#define XFIX_P_0_298 v0.h[0]
+#define XFIX_N_0_390 v0.h[1]
+#define XFIX_P_0_541 v0.h[2]
+#define XFIX_P_0_765 v0.h[3]
+#define XFIX_N_0_899 v0.h[4]
+#define XFIX_P_1_175 v0.h[5]
+#define XFIX_P_1_501 v0.h[6]
+#define XFIX_N_1_847 v0.h[7]
+#define XFIX_N_1_961 v1.h[0]
+#define XFIX_P_2_053 v1.h[1]
+#define XFIX_N_2_562 v1.h[2]
+#define XFIX_P_3_072 v1.h[3]
asm_function jsimd_idct_islow_neon
DCT_TABLE .req x0
@@ -216,7 +292,7 @@ asm_function jsimd_idct_islow_neon
uxtw x3, w3
sub sp, sp, #64
- adr x15, Ljsimd_idct_islow_neon_consts
+ get_symbol_loc x15, Ljsimd_idct_islow_neon_consts
mov x10, sp
st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], #32
st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], #32
@@ -292,8 +368,8 @@ asm_function jsimd_idct_islow_neon
sshll2 v23.4s, v22.8h, #(CONST_BITS) /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
mov v21.16b, v19.16b /* tmp3 = z1 */
mov v20.16b, v18.16b /* tmp3 = z1 */
- smlal2 v19.4s, v8.8h, XFIX_N_1_847 /* tmp2h tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
- smlal v18.4s, v8.4h, XFIX_N_1_847 /* tmp2l tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
+ smlal2 v19.4s, v8.8h, XFIX_N_1_847 /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
+ smlal v18.4s, v8.4h, XFIX_N_1_847 /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
sshll2 v27.4s, v26.8h, #(CONST_BITS) /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
smlal2 v21.4s, v4.8h, XFIX_P_0_765 /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
smlal v20.4s, v4.4h, XFIX_P_0_765 /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
@@ -323,20 +399,20 @@ asm_function jsimd_idct_islow_neon
smull2 v15.4s, v5.8h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
smull2 v17.4s, v3.8h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
smull2 v27.4s, v26.8h, XFIX_P_1_175 /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
- smull2 v23.4s, v22.8h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
- smull2 v25.4s, v24.8h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
- smull2 v19.4s, v18.8h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
- smull2 v21.4s, v20.8h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
+ smull2 v23.4s, v22.8h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
+ smull2 v25.4s, v24.8h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
+ smull2 v19.4s, v18.8h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
+ smull2 v21.4s, v20.8h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
smull v10.4s, v9.4h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
smull v12.4s, v7.4h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
smull v14.4s, v5.4h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
smull v16.4s, v3.4h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
smull v26.4s, v26.4h, XFIX_P_1_175 /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
- smull v22.4s, v22.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
- smull v24.4s, v24.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
- smull v18.4s, v18.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
- smull v20.4s, v20.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
+ smull v22.4s, v22.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
+ smull v24.4s, v24.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
+ smull v18.4s, v18.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
+ smull v20.4s, v20.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
add v23.4s, v23.4s, v27.4s /* z3 += z5 */
add v22.4s, v22.4s, v26.4s /* z3 += z5 */
@@ -380,40 +456,40 @@ asm_function jsimd_idct_islow_neon
sub v16.4s, v6.4s, v10.4s /* tmp13 - tmp0 */
sub v17.4s, v31.4s, v11.4s /* tmp13 - tmp0 */
- shrn v2.4h, v18.4s, #16 /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
- shrn v9.4h, v20.4s, #16 /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
- shrn v3.4h, v22.4s, #16 /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
- shrn v8.4h, v24.4s, #16 /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
- shrn v4.4h, v26.4s, #16 /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
- shrn v7.4h, v28.4s, #16 /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
- shrn v5.4h, v14.4s, #16 /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
- shrn v6.4h, v16.4s, #16 /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
- shrn2 v2.8h, v19.4s, #16 /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
- shrn2 v9.8h, v21.4s, #16 /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
- shrn2 v3.8h, v23.4s, #16 /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
- shrn2 v8.8h, v25.4s, #16 /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
- shrn2 v4.8h, v27.4s, #16 /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
- shrn2 v7.8h, v29.4s, #16 /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
- shrn2 v5.8h, v15.4s, #16 /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
- shrn2 v6.8h, v17.4s, #16 /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
+ shrn v2.4h, v18.4s, #16 /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
+ shrn v9.4h, v20.4s, #16 /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
+ shrn v3.4h, v22.4s, #16 /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
+ shrn v8.4h, v24.4s, #16 /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
+ shrn v4.4h, v26.4s, #16 /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
+ shrn v7.4h, v28.4s, #16 /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
+ shrn v5.4h, v14.4s, #16 /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
+ shrn v6.4h, v16.4s, #16 /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
+ shrn2 v2.8h, v19.4s, #16 /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
+ shrn2 v9.8h, v21.4s, #16 /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
+ shrn2 v3.8h, v23.4s, #16 /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
+ shrn2 v8.8h, v25.4s, #16 /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
+ shrn2 v4.8h, v27.4s, #16 /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
+ shrn2 v7.8h, v29.4s, #16 /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
+ shrn2 v5.8h, v15.4s, #16 /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
+ shrn2 v6.8h, v17.4s, #16 /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
movi v0.16b, #(CENTERJSAMPLE)
- /* Prepare pointers (dual-issue with NEON instructions) */
+ /* Prepare pointers (dual-issue with Neon instructions) */
ldp TMP1, TMP2, [OUTPUT_BUF], 16
- sqrshrn v28.8b, v2.8h, #(CONST_BITS+PASS1_BITS+3-16)
+ sqrshrn v28.8b, v2.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
ldp TMP3, TMP4, [OUTPUT_BUF], 16
- sqrshrn v29.8b, v3.8h, #(CONST_BITS+PASS1_BITS+3-16)
+ sqrshrn v29.8b, v3.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
add TMP1, TMP1, OUTPUT_COL
- sqrshrn v30.8b, v4.8h, #(CONST_BITS+PASS1_BITS+3-16)
+ sqrshrn v30.8b, v4.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
add TMP2, TMP2, OUTPUT_COL
- sqrshrn v31.8b, v5.8h, #(CONST_BITS+PASS1_BITS+3-16)
+ sqrshrn v31.8b, v5.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
add TMP3, TMP3, OUTPUT_COL
- sqrshrn2 v28.16b, v6.8h, #(CONST_BITS+PASS1_BITS+3-16)
+ sqrshrn2 v28.16b, v6.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
add TMP4, TMP4, OUTPUT_COL
- sqrshrn2 v29.16b, v7.8h, #(CONST_BITS+PASS1_BITS+3-16)
+ sqrshrn2 v29.16b, v7.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
ldp TMP5, TMP6, [OUTPUT_BUF], 16
- sqrshrn2 v30.16b, v8.8h, #(CONST_BITS+PASS1_BITS+3-16)
+ sqrshrn2 v30.16b, v8.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
ldp TMP7, TMP8, [OUTPUT_BUF], 16
- sqrshrn2 v31.16b, v9.8h, #(CONST_BITS+PASS1_BITS+3-16)
+ sqrshrn2 v31.16b, v9.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
add TMP5, TMP5, OUTPUT_COL
add v16.16b, v28.16b, v0.16b
add TMP6, TMP6, OUTPUT_COL
@@ -474,7 +550,7 @@ asm_function jsimd_idct_islow_neon
sshll v22.4s, v22.4h, #(CONST_BITS) /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
mov v20.16b, v18.16b /* tmp3 = z1 */
sshll v26.4s, v26.4h, #(CONST_BITS) /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
- smlal v18.4s, v8.4h, XFIX_N_1_847 /* tmp2l tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
+ smlal v18.4s, v8.4h, XFIX_N_1_847 /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
smlal v20.4s, v4.4h, XFIX_P_0_765 /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
add v2.4s, v22.4s, v20.4s /* tmp10l tmp10 = tmp0 + tmp3; */
sub v6.4s, v22.4s, v20.4s /* tmp13l tmp13 = tmp0 - tmp3; */
@@ -496,10 +572,10 @@ asm_function jsimd_idct_islow_neon
smull v14.4s, v5.4h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
smull v16.4s, v3.4h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
smull v26.4s, v26.4h, XFIX_P_1_175 /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
- smull v22.4s, v22.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
- smull v24.4s, v24.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
- smull v18.4s, v18.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
- smull v20.4s, v20.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
+ smull v22.4s, v22.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
+ smull v24.4s, v24.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
+ smull v18.4s, v18.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
+ smull v20.4s, v20.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
add v22.4s, v22.4s, v26.4s /* z3 += z5 */
add v24.4s, v24.4s, v26.4s /* z4 += z5 */
@@ -525,14 +601,14 @@ asm_function jsimd_idct_islow_neon
add v14.4s, v6.4s, v10.4s /* tmp13 + tmp0 */
sub v16.4s, v6.4s, v10.4s /* tmp13 - tmp0 */
- rshrn v2.4h, v18.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
- rshrn v3.4h, v22.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
- rshrn v4.4h, v26.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
- rshrn v5.4h, v14.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
- rshrn2 v2.8h, v16.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
- rshrn2 v3.8h, v28.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
- rshrn2 v4.8h, v24.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
- rshrn2 v5.8h, v20.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
+ rshrn v2.4h, v18.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
+ rshrn v3.4h, v22.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
+ rshrn v4.4h, v26.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
+ rshrn v5.4h, v14.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
+ rshrn2 v2.8h, v16.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
+ rshrn2 v3.8h, v28.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
+ rshrn2 v4.8h, v24.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
+ rshrn2 v5.8h, v20.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
mov v6.16b, v15.16b
mov v7.16b, v15.16b
mov v8.16b, v15.16b
@@ -551,7 +627,7 @@ asm_function jsimd_idct_islow_neon
sub v26.8h, v2.8h, v6.8h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
sshll2 v23.4s, v22.8h, #(CONST_BITS) /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
mov v21.16b, v19.16b /* tmp3 = z1 */
- smlal2 v19.4s, v8.8h, XFIX_N_1_847 /* tmp2h tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
+ smlal2 v19.4s, v8.8h, XFIX_N_1_847 /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
sshll2 v27.4s, v26.8h, #(CONST_BITS) /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
smlal2 v21.4s, v4.8h, XFIX_P_0_765 /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
add v28.4s, v23.4s, v21.4s /* tmp10h tmp10 = tmp0 + tmp3; */
@@ -574,10 +650,10 @@ asm_function jsimd_idct_islow_neon
smull2 v15.4s, v5.8h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
smull2 v17.4s, v3.8h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
smull2 v27.4s, v26.8h, XFIX_P_1_175 /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
- smull2 v23.4s, v22.8h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
- smull2 v25.4s, v24.8h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
- smull2 v19.4s, v18.8h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
- smull2 v21.4s, v20.8h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
+ smull2 v23.4s, v22.8h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
+ smull2 v25.4s, v24.8h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
+ smull2 v19.4s, v18.8h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
+ smull2 v21.4s, v20.8h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
add v23.4s, v23.4s, v27.4s /* z3 += z5 */
add v22.4s, v22.4s, v26.4s /* z3 += z5 */
@@ -609,14 +685,14 @@ asm_function jsimd_idct_islow_neon
mov v3.16b, v14.16b
mov v4.16b, v14.16b
mov v5.16b, v14.16b
- rshrn v6.4h, v19.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
- rshrn v7.4h, v23.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
- rshrn v8.4h, v27.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
- rshrn v9.4h, v15.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
- rshrn2 v6.8h, v17.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
- rshrn2 v7.8h, v29.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
- rshrn2 v8.8h, v25.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
- rshrn2 v9.8h, v21.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
+ rshrn v6.4h, v19.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
+ rshrn v7.4h, v23.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
+ rshrn v8.4h, v27.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
+ rshrn v9.4h, v15.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
+ rshrn2 v6.8h, v17.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
+ rshrn2 v7.8h, v29.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
+ rshrn2 v8.8h, v25.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
+ rshrn2 v9.8h, v21.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
b 1b
.balign 16
@@ -631,8 +707,8 @@ asm_function jsimd_idct_islow_neon
sshll2 v23.4s, v22.8h, #(CONST_BITS) /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
mov v21.16b, v19.16b /* tmp3 = z1 */
mov v20.16b, v18.16b /* tmp3 = z1 */
- smlal2 v19.4s, v8.8h, XFIX_N_1_847 /* tmp2h tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
- smlal v18.4s, v8.4h, XFIX_N_1_847 /* tmp2l tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
+ smlal2 v19.4s, v8.8h, XFIX_N_1_847 /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
+ smlal v18.4s, v8.4h, XFIX_N_1_847 /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
sshll2 v27.4s, v26.8h, #(CONST_BITS) /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
smlal2 v21.4s, v4.8h, XFIX_P_0_765 /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
smlal v20.4s, v4.4h, XFIX_P_0_765 /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
@@ -662,20 +738,20 @@ asm_function jsimd_idct_islow_neon
smull2 v15.4s, v5.8h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
smull2 v17.4s, v3.8h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
smull2 v27.4s, v26.8h, XFIX_P_1_175 /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
- smull2 v23.4s, v22.8h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
- smull2 v25.4s, v24.8h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
- smull2 v19.4s, v18.8h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
- smull2 v21.4s, v20.8h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
+ smull2 v23.4s, v22.8h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
+ smull2 v25.4s, v24.8h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
+ smull2 v19.4s, v18.8h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
+ smull2 v21.4s, v20.8h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
smull v10.4s, v9.4h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
smull v12.4s, v7.4h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
smull v14.4s, v5.4h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
smull v16.4s, v3.4h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
smull v26.4s, v26.4h, XFIX_P_1_175 /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
- smull v22.4s, v22.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
- smull v24.4s, v24.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
- smull v18.4s, v18.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
- smull v20.4s, v20.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
+ smull v22.4s, v22.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
+ smull v24.4s, v24.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
+ smull v18.4s, v18.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
+ smull v20.4s, v20.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
add v23.4s, v23.4s, v27.4s /* z3 += z5 */
add v22.4s, v22.4s, v26.4s /* z3 += z5 */
@@ -719,22 +795,22 @@ asm_function jsimd_idct_islow_neon
sub v16.4s, v6.4s, v10.4s /* tmp13 - tmp0 */
sub v17.4s, v31.4s, v11.4s /* tmp13 - tmp0 */
- rshrn v2.4h, v18.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
- rshrn v3.4h, v22.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
- rshrn v4.4h, v26.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
- rshrn v5.4h, v14.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
- rshrn v6.4h, v19.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
- rshrn v7.4h, v23.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
- rshrn v8.4h, v27.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
- rshrn v9.4h, v15.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
- rshrn2 v2.8h, v16.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
- rshrn2 v3.8h, v28.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
- rshrn2 v4.8h, v24.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
- rshrn2 v5.8h, v20.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
- rshrn2 v6.8h, v17.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
- rshrn2 v7.8h, v29.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
- rshrn2 v8.8h, v25.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
- rshrn2 v9.8h, v21.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
+ rshrn v2.4h, v18.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
+ rshrn v3.4h, v22.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
+ rshrn v4.4h, v26.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
+ rshrn v5.4h, v14.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
+ rshrn v6.4h, v19.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
+ rshrn v7.4h, v23.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
+ rshrn v8.4h, v27.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
+ rshrn v9.4h, v15.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
+ rshrn2 v2.8h, v16.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
+ rshrn2 v3.8h, v28.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
+ rshrn2 v4.8h, v24.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
+ rshrn2 v5.8h, v20.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
+ rshrn2 v6.8h, v17.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
+ rshrn2 v7.8h, v29.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
+ rshrn2 v8.8h, v25.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
+ rshrn2 v9.8h, v21.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
b 1b
.unreq DCT_TABLE
@@ -770,665 +846,6 @@ asm_function jsimd_idct_islow_neon
/*****************************************************************************/
/*
- * jsimd_idct_ifast_neon
- *
- * This function contains a fast but less accurate integer implementation of
- * the inverse DCT (Discrete Cosine Transform). It uses the same calculations
- * and produces exactly the same output as IJG's original 'jpeg_idct_ifast'
- * function from jidctfst.c.
- *
- * Normally a 1-D AAN DCT needs 5 multiplications and 29 additions.
- * But in the ARM NEON case, some extra additions are required because the
- * VQDMULH instruction can't handle constants larger than 1. So expressions
- * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x",
- * which introduces an extra addition. Overall, there are 6 extra additions
- * per 1-D IDCT pass, totaling 5 VQDMULH and 35 VADD/VSUB instructions.
- */
-
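To make the constant-splitting trick concrete, here is a rough scalar sketch in C. The vqdmulh model below ignores saturation, and the splitting matches the constant table that follows, where 1.082392200 is encoded as (277 * 128 - 256 * 128) = 2688, i.e. only the fractional part:

    #include <stdint.h>

    /* Scalar model of VQDMULH on 16-bit lanes: (2 * a * b) >> 16,
     * i.e. a Q15 x Q15 -> Q15 multiply (saturation omitted). */
    static int16_t vqdmulh_s16(int16_t a, int16_t b)
    {
      return (int16_t)(((int32_t)a * (int32_t)b * 2) >> 16);
    }

    /* "x * 1.082392200" rewritten as "x * 0.082392200 + x"; 2688 is
     * the table's encoding (277 * 128 - 256 * 128) of the fractional
     * part, and the extra addition restores the integer part. */
    static int16_t mul_1_082392200(int16_t x)
    {
      return (int16_t)(vqdmulh_s16(x, 2688) + x);
    }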
-#define XFIX_1_082392200 v0.h[0]
-#define XFIX_1_414213562 v0.h[1]
-#define XFIX_1_847759065 v0.h[2]
-#define XFIX_2_613125930 v0.h[3]
-
-.balign 16
-Ljsimd_idct_ifast_neon_consts:
- .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */
- .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */
- .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */
- .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */
-
-asm_function jsimd_idct_ifast_neon
-
- DCT_TABLE .req x0
- COEF_BLOCK .req x1
- OUTPUT_BUF .req x2
- OUTPUT_COL .req x3
- TMP1 .req x0
- TMP2 .req x1
- TMP3 .req x9
- TMP4 .req x10
- TMP5 .req x11
- TMP6 .req x12
- TMP7 .req x13
- TMP8 .req x14
-
- /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
- guarantee that the upper (unused) 32 bits of x3 are valid. This
- instruction ensures that those bits are set to zero. */
- uxtw x3, w3
-
- /* Load and dequantize coefficients into NEON registers
- * with the following allocation:
- * 0 1 2 3 | 4 5 6 7
- * ---------+--------
- * 0 | d16 | d17 ( v16.8h )
- * 1 | d18 | d19 ( v17.8h )
- * 2 | d20 | d21 ( v18.8h )
- * 3 | d22 | d23 ( v19.8h )
- * 4 | d24 | d25 ( v20.8h )
- * 5 | d26 | d27 ( v21.8h )
- * 6 | d28 | d29 ( v22.8h )
- * 7 | d30 | d31 ( v23.8h )
- */
- /* Load the IDCT constants */
- adr TMP5, Ljsimd_idct_ifast_neon_consts
- ld1 {v16.8h, v17.8h}, [COEF_BLOCK], 32
- ld1 {v0.8h, v1.8h}, [DCT_TABLE], 32
- ld1 {v18.8h, v19.8h}, [COEF_BLOCK], 32
- mul v16.8h, v16.8h, v0.8h
- ld1 {v2.8h, v3.8h}, [DCT_TABLE], 32
- mul v17.8h, v17.8h, v1.8h
- ld1 {v20.8h, v21.8h}, [COEF_BLOCK], 32
- mul v18.8h, v18.8h, v2.8h
- ld1 {v0.8h, v1.8h}, [DCT_TABLE], 32
- mul v19.8h, v19.8h, v3.8h
- ld1 {v22.8h, v23.8h}, [COEF_BLOCK], 32
- mul v20.8h, v20.8h, v0.8h
- ld1 {v2.8h, v3.8h}, [DCT_TABLE], 32
- mul v22.8h, v22.8h, v2.8h
- mul v21.8h, v21.8h, v1.8h
- ld1 {v0.4h}, [TMP5] /* load constants */
- mul v23.8h, v23.8h, v3.8h
-
- /* 1-D IDCT, pass 1 */
- sub v2.8h, v18.8h, v22.8h
- add v22.8h, v18.8h, v22.8h
- sub v1.8h, v19.8h, v21.8h
- add v21.8h, v19.8h, v21.8h
- sub v5.8h, v17.8h, v23.8h
- add v23.8h, v17.8h, v23.8h
- sqdmulh v4.8h, v2.8h, XFIX_1_414213562
- sqdmulh v6.8h, v1.8h, XFIX_2_613125930
- add v3.8h, v1.8h, v1.8h
- sub v1.8h, v5.8h, v1.8h
- add v18.8h, v2.8h, v4.8h
- sqdmulh v4.8h, v1.8h, XFIX_1_847759065
- sub v2.8h, v23.8h, v21.8h
- add v3.8h, v3.8h, v6.8h
- sqdmulh v6.8h, v2.8h, XFIX_1_414213562
- add v1.8h, v1.8h, v4.8h
- sqdmulh v4.8h, v5.8h, XFIX_1_082392200
- sub v18.8h, v18.8h, v22.8h
- add v2.8h, v2.8h, v6.8h
- sub v6.8h, v16.8h, v20.8h
- add v20.8h, v16.8h, v20.8h
- add v17.8h, v5.8h, v4.8h
- add v5.8h, v6.8h, v18.8h
- sub v18.8h, v6.8h, v18.8h
- add v6.8h, v23.8h, v21.8h
- add v16.8h, v20.8h, v22.8h
- sub v3.8h, v6.8h, v3.8h
- sub v20.8h, v20.8h, v22.8h
- sub v3.8h, v3.8h, v1.8h
- sub v1.8h, v17.8h, v1.8h
- add v2.8h, v3.8h, v2.8h
- sub v23.8h, v16.8h, v6.8h
- add v1.8h, v1.8h, v2.8h
- add v16.8h, v16.8h, v6.8h
- add v22.8h, v5.8h, v3.8h
- sub v17.8h, v5.8h, v3.8h
- sub v21.8h, v18.8h, v2.8h
- add v18.8h, v18.8h, v2.8h
- sub v19.8h, v20.8h, v1.8h
- add v20.8h, v20.8h, v1.8h
- transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v28, v29, v30, v31
- /* 1-D IDCT, pass 2 */
- sub v2.8h, v18.8h, v22.8h
- add v22.8h, v18.8h, v22.8h
- sub v1.8h, v19.8h, v21.8h
- add v21.8h, v19.8h, v21.8h
- sub v5.8h, v17.8h, v23.8h
- add v23.8h, v17.8h, v23.8h
- sqdmulh v4.8h, v2.8h, XFIX_1_414213562
- sqdmulh v6.8h, v1.8h, XFIX_2_613125930
- add v3.8h, v1.8h, v1.8h
- sub v1.8h, v5.8h, v1.8h
- add v18.8h, v2.8h, v4.8h
- sqdmulh v4.8h, v1.8h, XFIX_1_847759065
- sub v2.8h, v23.8h, v21.8h
- add v3.8h, v3.8h, v6.8h
- sqdmulh v6.8h, v2.8h, XFIX_1_414213562
- add v1.8h, v1.8h, v4.8h
- sqdmulh v4.8h, v5.8h, XFIX_1_082392200
- sub v18.8h, v18.8h, v22.8h
- add v2.8h, v2.8h, v6.8h
- sub v6.8h, v16.8h, v20.8h
- add v20.8h, v16.8h, v20.8h
- add v17.8h, v5.8h, v4.8h
- add v5.8h, v6.8h, v18.8h
- sub v18.8h, v6.8h, v18.8h
- add v6.8h, v23.8h, v21.8h
- add v16.8h, v20.8h, v22.8h
- sub v3.8h, v6.8h, v3.8h
- sub v20.8h, v20.8h, v22.8h
- sub v3.8h, v3.8h, v1.8h
- sub v1.8h, v17.8h, v1.8h
- add v2.8h, v3.8h, v2.8h
- sub v23.8h, v16.8h, v6.8h
- add v1.8h, v1.8h, v2.8h
- add v16.8h, v16.8h, v6.8h
- add v22.8h, v5.8h, v3.8h
- sub v17.8h, v5.8h, v3.8h
- sub v21.8h, v18.8h, v2.8h
- add v18.8h, v18.8h, v2.8h
- sub v19.8h, v20.8h, v1.8h
- add v20.8h, v20.8h, v1.8h
- /* Descale to 8-bit and range limit */
- movi v0.16b, #0x80
- /* Prepare pointers (dual-issue with NEON instructions) */
- ldp TMP1, TMP2, [OUTPUT_BUF], 16
- sqshrn v28.8b, v16.8h, #5
- ldp TMP3, TMP4, [OUTPUT_BUF], 16
- sqshrn v29.8b, v17.8h, #5
- add TMP1, TMP1, OUTPUT_COL
- sqshrn v30.8b, v18.8h, #5
- add TMP2, TMP2, OUTPUT_COL
- sqshrn v31.8b, v19.8h, #5
- add TMP3, TMP3, OUTPUT_COL
- sqshrn2 v28.16b, v20.8h, #5
- add TMP4, TMP4, OUTPUT_COL
- sqshrn2 v29.16b, v21.8h, #5
- ldp TMP5, TMP6, [OUTPUT_BUF], 16
- sqshrn2 v30.16b, v22.8h, #5
- ldp TMP7, TMP8, [OUTPUT_BUF], 16
- sqshrn2 v31.16b, v23.8h, #5
- add TMP5, TMP5, OUTPUT_COL
- add v16.16b, v28.16b, v0.16b
- add TMP6, TMP6, OUTPUT_COL
- add v18.16b, v29.16b, v0.16b
- add TMP7, TMP7, OUTPUT_COL
- add v20.16b, v30.16b, v0.16b
- add TMP8, TMP8, OUTPUT_COL
- add v22.16b, v31.16b, v0.16b
-
- /* Transpose the final 8-bit samples */
- trn1 v28.16b, v16.16b, v18.16b
- trn1 v30.16b, v20.16b, v22.16b
- trn2 v29.16b, v16.16b, v18.16b
- trn2 v31.16b, v20.16b, v22.16b
-
- trn1 v16.8h, v28.8h, v30.8h
- trn2 v18.8h, v28.8h, v30.8h
- trn1 v20.8h, v29.8h, v31.8h
- trn2 v22.8h, v29.8h, v31.8h
-
- uzp1 v28.4s, v16.4s, v18.4s
- uzp2 v30.4s, v16.4s, v18.4s
- uzp1 v29.4s, v20.4s, v22.4s
- uzp2 v31.4s, v20.4s, v22.4s
-
- /* Store results to the output buffer */
- st1 {v28.d}[0], [TMP1]
- st1 {v29.d}[0], [TMP2]
- st1 {v28.d}[1], [TMP3]
- st1 {v29.d}[1], [TMP4]
- st1 {v30.d}[0], [TMP5]
- st1 {v31.d}[0], [TMP6]
- st1 {v30.d}[1], [TMP7]
- st1 {v31.d}[1], [TMP8]
- blr x30
-
- .unreq DCT_TABLE
- .unreq COEF_BLOCK
- .unreq OUTPUT_BUF
- .unreq OUTPUT_COL
- .unreq TMP1
- .unreq TMP2
- .unreq TMP3
- .unreq TMP4
- .unreq TMP5
- .unreq TMP6
- .unreq TMP7
- .unreq TMP8
-
-
-/*****************************************************************************/
-
-/*
- * jsimd_idct_4x4_neon
- *
- * This function contains inverse-DCT code for getting a reduced-size
- * 4x4 pixel output from an 8x8 DCT block. It uses the same calculations
- * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
- * function from jpeg-6b (jidctred.c).
- *
- * NOTE: jpeg-8 has an improved implementation of the 4x4 inverse DCT, which
- * requires far fewer arithmetic operations and hence should be faster.
- * The primary purpose of this particular NEON-optimized function is
- * bit-exact compatibility with jpeg-6b.
- *
- * TODO: slightly better instruction scheduling can be achieved by expanding
- * the idct_helper/transpose_4x4 macros and reordering instructions,
- * but readability would suffer somewhat.
- */
-
-#define CONST_BITS 13
-
-#define FIX_0_211164243 (1730) /* FIX(0.211164243) */
-#define FIX_0_509795579 (4176) /* FIX(0.509795579) */
-#define FIX_0_601344887 (4926) /* FIX(0.601344887) */
-#define FIX_0_720959822 (5906) /* FIX(0.720959822) */
-#define FIX_0_765366865 (6270) /* FIX(0.765366865) */
-#define FIX_0_850430095 (6967) /* FIX(0.850430095) */
-#define FIX_0_899976223 (7373) /* FIX(0.899976223) */
-#define FIX_1_061594337 (8697) /* FIX(1.061594337) */
-#define FIX_1_272758580 (10426) /* FIX(1.272758580) */
-#define FIX_1_451774981 (11893) /* FIX(1.451774981) */
-#define FIX_1_847759065 (15137) /* FIX(1.847759065) */
-#define FIX_2_172734803 (17799) /* FIX(2.172734803) */
-#define FIX_2_562915447 (20995) /* FIX(2.562915447) */
-#define FIX_3_624509785 (29692) /* FIX(3.624509785) */
-
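These constants follow IJG's usual fixed-point convention, FIX(x) = round(x * 2^CONST_BITS); a hypothetical generator for the table, assuming CONST_BITS == 13 as defined above:

    /* Hypothetical regeneration of the table above under the IJG
     * convention; e.g. FIX(0.211164243) == 1730 and
     * FIX(1.847759065) == 15137, matching the listed values. */
    #define CONST_BITS 13
    #define FIX(x) ((int)((x) * (1 << CONST_BITS) + 0.5))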
-.balign 16
-Ljsimd_idct_4x4_neon_consts:
- .short FIX_1_847759065 /* v0.h[0] */
- .short -FIX_0_765366865 /* v0.h[1] */
- .short -FIX_0_211164243 /* v0.h[2] */
- .short FIX_1_451774981 /* v0.h[3] */
- .short -FIX_2_172734803 /* d1[0] */
- .short FIX_1_061594337 /* d1[1] */
- .short -FIX_0_509795579 /* d1[2] */
- .short -FIX_0_601344887 /* d1[3] */
- .short FIX_0_899976223 /* v2.h[0] */
- .short FIX_2_562915447 /* v2.h[1] */
- .short 1 << (CONST_BITS+1) /* v2.h[2] */
- .short 0 /* v2.h[3] */
-
-.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
- smull v28.4s, \x4, v2.h[2]
- smlal v28.4s, \x8, v0.h[0]
- smlal v28.4s, \x14, v0.h[1]
-
- smull v26.4s, \x16, v1.h[2]
- smlal v26.4s, \x12, v1.h[3]
- smlal v26.4s, \x10, v2.h[0]
- smlal v26.4s, \x6, v2.h[1]
-
- smull v30.4s, \x4, v2.h[2]
- smlsl v30.4s, \x8, v0.h[0]
- smlsl v30.4s, \x14, v0.h[1]
-
- smull v24.4s, \x16, v0.h[2]
- smlal v24.4s, \x12, v0.h[3]
- smlal v24.4s, \x10, v1.h[0]
- smlal v24.4s, \x6, v1.h[1]
-
- add v20.4s, v28.4s, v26.4s
- sub v28.4s, v28.4s, v26.4s
-
- .if \shift > 16
- srshr v20.4s, v20.4s, #\shift
- srshr v28.4s, v28.4s, #\shift
- xtn \y26, v20.4s
- xtn \y29, v28.4s
- .else
- rshrn \y26, v20.4s, #\shift
- rshrn \y29, v28.4s, #\shift
- .endif
-
- add v20.4s, v30.4s, v24.4s
- sub v30.4s, v30.4s, v24.4s
-
- .if \shift > 16
- srshr v20.4s, v20.4s, #\shift
- srshr v30.4s, v30.4s, #\shift
- xtn \y27, v20.4s
- xtn \y28, v30.4s
- .else
- rshrn \y27, v20.4s, #\shift
- rshrn \y28, v30.4s, #\shift
- .endif
-.endm
-
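The \shift > 16 branch in idct_helper exists because the rshrn immediate can be at most 16 when narrowing to 16-bit lanes, so larger descale shifts take the srshr-then-xtn path; both paths apply the same round-to-nearest shift, IJG's DESCALE. A minimal C sketch of that rounding:

    #include <stdint.h>

    /* DESCALE(x, n): divide by 2^n, rounding to nearest.  rshrn and
     * srshr implement exactly this rounding; the two idct_helper
     * branches differ only in how the result is narrowed to 16 bits. */
    static int16_t descale(int32_t x, int n)
    {
      return (int16_t)((x + (1 << (n - 1))) >> n);
    }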
-asm_function jsimd_idct_4x4_neon
-
- DCT_TABLE .req x0
- COEF_BLOCK .req x1
- OUTPUT_BUF .req x2
- OUTPUT_COL .req x3
- TMP1 .req x0
- TMP2 .req x1
- TMP3 .req x2
- TMP4 .req x15
-
- /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
- guarantee that the upper (unused) 32 bits of x3 are valid. This
- instruction ensures that those bits are set to zero. */
- uxtw x3, w3
-
- /* Save all used NEON registers */
- sub sp, sp, 64
- mov x9, sp
- /* Load constants (v3.4h is just used for padding) */
- adr TMP4, Ljsimd_idct_4x4_neon_consts
- st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
- st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
- ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4]
-
- /* Load all COEF_BLOCK into NEON registers with the following allocation:
- * 0 1 2 3 | 4 5 6 7
- * ---------+--------
- * 0 | v4.4h | v5.4h
- * 1 | v6.4h | v7.4h
- * 2 | v8.4h | v9.4h
- * 3 | v10.4h | v11.4h
- * 4 | - | -
- * 5 | v12.4h | v13.4h
- * 6 | v14.4h | v15.4h
- * 7 | v16.4h | v17.4h
- */
- ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32
- ld1 {v8.4h, v9.4h, v10.4h, v11.4h}, [COEF_BLOCK], 32
- add COEF_BLOCK, COEF_BLOCK, #16
- ld1 {v12.4h, v13.4h, v14.4h, v15.4h}, [COEF_BLOCK], 32
- ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16
- /* dequantize */
- ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
- mul v4.4h, v4.4h, v18.4h
- mul v5.4h, v5.4h, v19.4h
- ins v4.d[1], v5.d[0] /* 128 bit q4 */
- ld1 {v22.4h, v23.4h, v24.4h, v25.4h}, [DCT_TABLE], 32
- mul v6.4h, v6.4h, v20.4h
- mul v7.4h, v7.4h, v21.4h
- ins v6.d[1], v7.d[0] /* 128 bit q6 */
- mul v8.4h, v8.4h, v22.4h
- mul v9.4h, v9.4h, v23.4h
- ins v8.d[1], v9.d[0] /* 128 bit q8 */
- add DCT_TABLE, DCT_TABLE, #16
- ld1 {v26.4h, v27.4h, v28.4h, v29.4h}, [DCT_TABLE], 32
- mul v10.4h, v10.4h, v24.4h
- mul v11.4h, v11.4h, v25.4h
- ins v10.d[1], v11.d[0] /* 128 bit q10 */
- mul v12.4h, v12.4h, v26.4h
- mul v13.4h, v13.4h, v27.4h
- ins v12.d[1], v13.d[0] /* 128 bit q12 */
- ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16
- mul v14.4h, v14.4h, v28.4h
- mul v15.4h, v15.4h, v29.4h
- ins v14.d[1], v15.d[0] /* 128 bit q14 */
- mul v16.4h, v16.4h, v30.4h
- mul v17.4h, v17.4h, v31.4h
- ins v16.d[1], v17.d[0] /* 128 bit q16 */
-
- /* Pass 1 */
- idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v12.4h, v14.4h, v16.4h, 12, \
- v4.4h, v6.4h, v8.4h, v10.4h
- transpose_4x4 v4, v6, v8, v10, v3
- ins v10.d[1], v11.d[0]
- idct_helper v5.4h, v7.4h, v9.4h, v11.4h, v13.4h, v15.4h, v17.4h, 12, \
- v5.4h, v7.4h, v9.4h, v11.4h
- transpose_4x4 v5, v7, v9, v11, v3
- ins v10.d[1], v11.d[0]
-
- /* Pass 2 */
- idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v7.4h, v9.4h, v11.4h, 19, \
- v26.4h, v27.4h, v28.4h, v29.4h
- transpose_4x4 v26, v27, v28, v29, v3
-
- /* Range limit */
- movi v30.8h, #0x80
- ins v26.d[1], v27.d[0]
- ins v28.d[1], v29.d[0]
- add v26.8h, v26.8h, v30.8h
- add v28.8h, v28.8h, v30.8h
- sqxtun v26.8b, v26.8h
- sqxtun v27.8b, v28.8h
-
- /* Store results to the output buffer */
- ldp TMP1, TMP2, [OUTPUT_BUF], 16
- ldp TMP3, TMP4, [OUTPUT_BUF]
- add TMP1, TMP1, OUTPUT_COL
- add TMP2, TMP2, OUTPUT_COL
- add TMP3, TMP3, OUTPUT_COL
- add TMP4, TMP4, OUTPUT_COL
-
-#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
- /* We can use far fewer instructions on little-endian systems if the
- * OS kernel is not configured to trap unaligned memory accesses
- */
- st1 {v26.s}[0], [TMP1], 4
- st1 {v27.s}[0], [TMP3], 4
- st1 {v26.s}[1], [TMP2], 4
- st1 {v27.s}[1], [TMP4], 4
-#else
- st1 {v26.b}[0], [TMP1], 1
- st1 {v27.b}[0], [TMP3], 1
- st1 {v26.b}[1], [TMP1], 1
- st1 {v27.b}[1], [TMP3], 1
- st1 {v26.b}[2], [TMP1], 1
- st1 {v27.b}[2], [TMP3], 1
- st1 {v26.b}[3], [TMP1], 1
- st1 {v27.b}[3], [TMP3], 1
-
- st1 {v26.b}[4], [TMP2], 1
- st1 {v27.b}[4], [TMP4], 1
- st1 {v26.b}[5], [TMP2], 1
- st1 {v27.b}[5], [TMP4], 1
- st1 {v26.b}[6], [TMP2], 1
- st1 {v27.b}[6], [TMP4], 1
- st1 {v26.b}[7], [TMP2], 1
- st1 {v27.b}[7], [TMP4], 1
-#endif
-
- /* vpop {v8.4h - v15.4h} ; not available */
- ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
- ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
- blr x30
-
- .unreq DCT_TABLE
- .unreq COEF_BLOCK
- .unreq OUTPUT_BUF
- .unreq OUTPUT_COL
- .unreq TMP1
- .unreq TMP2
- .unreq TMP3
- .unreq TMP4
-
-.purgem idct_helper
-
-
-/*****************************************************************************/
-
-/*
- * jsimd_idct_2x2_neon
- *
- * This function contains inverse-DCT code for getting a reduced-size
- * 2x2 pixel output from an 8x8 DCT block. It uses the same calculations
- * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
- * function from jpeg-6b (jidctred.c).
- *
- * NOTE: jpeg-8 has an improved implementation of the 2x2 inverse DCT, which
- * requires far fewer arithmetic operations and hence should be faster.
- * The primary purpose of this particular NEON-optimized function is
- * bit-exact compatibility with jpeg-6b.
- */
-
-.balign 8
-Ljsimd_idct_2x2_neon_consts:
- .short -FIX_0_720959822 /* v14[0] */
- .short FIX_0_850430095 /* v14[1] */
- .short -FIX_1_272758580 /* v14[2] */
- .short FIX_3_624509785 /* v14[3] */
-
-.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
- sshll v15.4s, \x4, #15
- smull v26.4s, \x6, v14.h[3]
- smlal v26.4s, \x10, v14.h[2]
- smlal v26.4s, \x12, v14.h[1]
- smlal v26.4s, \x16, v14.h[0]
-
- add v20.4s, v15.4s, v26.4s
- sub v15.4s, v15.4s, v26.4s
-
- .if \shift > 16
- srshr v20.4s, v20.4s, #\shift
- srshr v15.4s, v15.4s, #\shift
- xtn \y26, v20.4s
- xtn \y27, v15.4s
- .else
- rshrn \y26, v20.4s, #\shift
- rshrn \y27, v15.4s, #\shift
- .endif
-.endm
-
-asm_function jsimd_idct_2x2_neon
-
- DCT_TABLE .req x0
- COEF_BLOCK .req x1
- OUTPUT_BUF .req x2
- OUTPUT_COL .req x3
- TMP1 .req x0
- TMP2 .req x15
-
- /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
- guarantee that the upper (unused) 32 bits of x3 are valid. This
- instruction ensures that those bits are set to zero. */
- uxtw x3, w3
-
- /* vpush {v8.4h - v15.4h} ; not available */
- sub sp, sp, 64
- mov x9, sp
-
- /* Load constants */
- adr TMP2, Ljsimd_idct_2x2_neon_consts
- st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
- st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
- ld1 {v14.4h}, [TMP2]
-
- /* Load all COEF_BLOCK into NEON registers with the following allocation:
- * 0 1 2 3 | 4 5 6 7
- * ---------+--------
- * 0 | v4.4h | v5.4h
- * 1 | v6.4h | v7.4h
- * 2 | - | -
- * 3 | v10.4h | v11.4h
- * 4 | - | -
- * 5 | v12.4h | v13.4h
- * 6 | - | -
- * 7 | v16.4h | v17.4h
- */
- ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32
- add COEF_BLOCK, COEF_BLOCK, #16
- ld1 {v10.4h, v11.4h}, [COEF_BLOCK], 16
- add COEF_BLOCK, COEF_BLOCK, #16
- ld1 {v12.4h, v13.4h}, [COEF_BLOCK], 16
- add COEF_BLOCK, COEF_BLOCK, #16
- ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16
- /* Dequantize */
- ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
- mul v4.4h, v4.4h, v18.4h
- mul v5.4h, v5.4h, v19.4h
- ins v4.d[1], v5.d[0]
- mul v6.4h, v6.4h, v20.4h
- mul v7.4h, v7.4h, v21.4h
- ins v6.d[1], v7.d[0]
- add DCT_TABLE, DCT_TABLE, #16
- ld1 {v24.4h, v25.4h}, [DCT_TABLE], 16
- mul v10.4h, v10.4h, v24.4h
- mul v11.4h, v11.4h, v25.4h
- ins v10.d[1], v11.d[0]
- add DCT_TABLE, DCT_TABLE, #16
- ld1 {v26.4h, v27.4h}, [DCT_TABLE], 16
- mul v12.4h, v12.4h, v26.4h
- mul v13.4h, v13.4h, v27.4h
- ins v12.d[1], v13.d[0]
- add DCT_TABLE, DCT_TABLE, #16
- ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16
- mul v16.4h, v16.4h, v30.4h
- mul v17.4h, v17.4h, v31.4h
- ins v16.d[1], v17.d[0]
-
- /* Pass 1 */
-#if 0
- idct_helper v4.4h, v6.4h, v10.4h, v12.4h, v16.4h, 13, v4.4h, v6.4h
- transpose_4x4 v4.4h, v6.4h, v8.4h, v10.4h
- idct_helper v5.4h, v7.4h, v11.4h, v13.4h, v17.4h, 13, v5.4h, v7.4h
- transpose_4x4 v5.4h, v7.4h, v9.4h, v11.4h
-#else
- smull v26.4s, v6.4h, v14.h[3]
- smlal v26.4s, v10.4h, v14.h[2]
- smlal v26.4s, v12.4h, v14.h[1]
- smlal v26.4s, v16.4h, v14.h[0]
- smull v24.4s, v7.4h, v14.h[3]
- smlal v24.4s, v11.4h, v14.h[2]
- smlal v24.4s, v13.4h, v14.h[1]
- smlal v24.4s, v17.4h, v14.h[0]
- sshll v15.4s, v4.4h, #15
- sshll v30.4s, v5.4h, #15
- add v20.4s, v15.4s, v26.4s
- sub v15.4s, v15.4s, v26.4s
- rshrn v4.4h, v20.4s, #13
- rshrn v6.4h, v15.4s, #13
- add v20.4s, v30.4s, v24.4s
- sub v15.4s, v30.4s, v24.4s
- rshrn v5.4h, v20.4s, #13
- rshrn v7.4h, v15.4s, #13
- ins v4.d[1], v5.d[0]
- ins v6.d[1], v7.d[0]
- transpose v4, v6, v3, .16b, .8h
- transpose v6, v10, v3, .16b, .4s
- ins v11.d[0], v10.d[1]
- ins v7.d[0], v6.d[1]
-#endif
-
- /* Pass 2 */
- idct_helper v4.4h, v6.4h, v10.4h, v7.4h, v11.4h, 20, v26.4h, v27.4h
-
- /* Range limit */
- movi v30.8h, #0x80
- ins v26.d[1], v27.d[0]
- add v26.8h, v26.8h, v30.8h
- sqxtun v30.8b, v26.8h
- ins v26.d[0], v30.d[0]
- sqxtun v27.8b, v26.8h
-
- /* Store results to the output buffer */
- ldp TMP1, TMP2, [OUTPUT_BUF]
- add TMP1, TMP1, OUTPUT_COL
- add TMP2, TMP2, OUTPUT_COL
-
- st1 {v26.b}[0], [TMP1], 1
- st1 {v27.b}[4], [TMP1], 1
- st1 {v26.b}[1], [TMP2], 1
- st1 {v27.b}[5], [TMP2], 1
-
- ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
- ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
- blr x30
-
- .unreq DCT_TABLE
- .unreq COEF_BLOCK
- .unreq OUTPUT_BUF
- .unreq OUTPUT_COL
- .unreq TMP1
- .unreq TMP2
-
-.purgem idct_helper
-
-
-/*****************************************************************************/
-
-/*
* jsimd_ycc_extrgb_convert_neon
* jsimd_ycc_extbgr_convert_neon
* jsimd_ycc_extrgbx_convert_neon
@@ -1543,7 +960,7 @@ asm_function jsimd_idct_2x2_neon
.else
.error unsupported macroblock size
.endif
- .elseif \bpp==16
+ .elseif \bpp == 16
.if \size == 8
st1 {v25.8h}, [RGB], 16
.elseif \size == 4
@@ -1662,21 +1079,6 @@ asm_function jsimd_idct_2x2_neon
do_yuv_to_rgb_stage2
.endm
-/* Apple gas crashes on adrl, work around that by using adr.
- * But this requires a copy of these constants for each function.
- */
-
-.balign 16
-.if \fast_st3 == 1
-Ljsimd_ycc_\colorid\()_neon_consts:
-.else
-Ljsimd_ycc_\colorid\()_neon_slowst3_consts:
-.endif
- .short 0, 0, 0, 0
- .short 22971, -11277, -23401, 29033
- .short -128, -128, -128, -128
- .short -128, -128, -128, -128
-
.if \fast_st3 == 1
asm_function jsimd_ycc_\colorid\()_convert_neon
.else
@@ -1702,13 +1104,9 @@ asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
mov x9, sp
/* Load constants to d1, d2, d3 (v0.4h is just used for padding) */
- .if \fast_st3 == 1
- adr x15, Ljsimd_ycc_\colorid\()_neon_consts
- .else
- adr x15, Ljsimd_ycc_\colorid\()_neon_slowst3_consts
- .endif
+ get_symbol_loc x15, Ljsimd_ycc_rgb_neon_consts
- /* Save NEON registers */
+ /* Save Neon registers */
st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
ld1 {v0.4h, v1.4h}, [x15], 16
@@ -1993,7 +1391,7 @@ generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, .4h, 1, .4h, 0, .4h, .8b,
.endm
/* TODO: expand macros and interleave instructions if some in-order
- * ARM64 processor actually can dual-issue LOAD/STORE with ALU */
+ * AArch64 processor actually can dual-issue LOAD/STORE with ALU */
.macro do_rgb_to_yuv_stage2_store_load_stage1 fast_ld3
do_rgb_to_yuv_stage2
do_load \bpp, 8, \fast_ld3
@@ -2003,17 +1401,6 @@ generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, .4h, 1, .4h, 0, .4h, .8b,
do_rgb_to_yuv_stage1
.endm
-.balign 16
-.if \fast_ld3 == 1
-Ljsimd_\colorid\()_ycc_neon_consts:
-.else
-Ljsimd_\colorid\()_ycc_neon_slowld3_consts:
-.endif
- .short 19595, 38470, 7471, 11059
- .short 21709, 32768, 27439, 5329
- .short 32767, 128, 32767, 128
- .short 32767, 128, 32767, 128
-
.if \fast_ld3 == 1
asm_function jsimd_\colorid\()_ycc_convert_neon
.else
@@ -2036,11 +1423,7 @@ asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3
N .req w12
/* Load constants to d0, d1, d2, d3 */
- .if \fast_ld3 == 1
- adr x13, Ljsimd_\colorid\()_ycc_neon_consts
- .else
- adr x13, Ljsimd_\colorid\()_ycc_neon_slowld3_consts
- .endif
+ get_symbol_loc x13, Ljsimd_rgb_ycc_neon_consts
ld1 {v0.8h, v1.8h}, [x13]
ldr OUTPUT_BUF0, [OUTPUT_BUF]
@@ -2048,7 +1431,7 @@ asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3
ldr OUTPUT_BUF2, [OUTPUT_BUF, #16]
.unreq OUTPUT_BUF
- /* Save NEON registers */
+ /* Save Neon registers */
sub sp, sp, #64
mov x9, sp
st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
@@ -2147,85 +1530,9 @@ generate_jsimd_rgb_ycc_convert_neon extbgr, 24, 2, 1, 0, 0
/*****************************************************************************/
/*
- * Load data into workspace, applying unsigned->signed conversion
- *
- * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get
- * rid of VST1.16 instructions
- */
-
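In scalar terms the conversion is just a subtraction of the sample midpoint (libjpeg's CENTERJSAMPLE, 128 for 8-bit data); a sketch of one row:

    #include <stdint.h>

    /* Scalar equivalent of one row of the routine below: unsigned
     * 8-bit samples become signed, zero-centered 16-bit DCT inputs
     * (each usubl subtracts the 128 that was dup'ed into v0). */
    static void convsamp_row(const uint8_t *sample, int16_t *workspace)
    {
      int i;
      for (i = 0; i < 8; i++)
        workspace[i] = (int16_t)(sample[i] - 128);
    }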
-asm_function jsimd_convsamp_neon
- SAMPLE_DATA .req x0
- START_COL .req x1
- WORKSPACE .req x2
- TMP1 .req x9
- TMP2 .req x10
- TMP3 .req x11
- TMP4 .req x12
- TMP5 .req x13
- TMP6 .req x14
- TMP7 .req x15
- TMP8 .req x4
- TMPDUP .req w3
-
- /* START_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
- guarantee that the upper (unused) 32 bits of x1 are valid. This
- instruction ensures that those bits are set to zero. */
- uxtw x1, w1
-
- mov TMPDUP, #128
- ldp TMP1, TMP2, [SAMPLE_DATA], 16
- ldp TMP3, TMP4, [SAMPLE_DATA], 16
- dup v0.8b, TMPDUP
- add TMP1, TMP1, START_COL
- add TMP2, TMP2, START_COL
- ldp TMP5, TMP6, [SAMPLE_DATA], 16
- add TMP3, TMP3, START_COL
- add TMP4, TMP4, START_COL
- ldp TMP7, TMP8, [SAMPLE_DATA], 16
- add TMP5, TMP5, START_COL
- add TMP6, TMP6, START_COL
- ld1 {v16.8b}, [TMP1]
- add TMP7, TMP7, START_COL
- add TMP8, TMP8, START_COL
- ld1 {v17.8b}, [TMP2]
- usubl v16.8h, v16.8b, v0.8b
- ld1 {v18.8b}, [TMP3]
- usubl v17.8h, v17.8b, v0.8b
- ld1 {v19.8b}, [TMP4]
- usubl v18.8h, v18.8b, v0.8b
- ld1 {v20.8b}, [TMP5]
- usubl v19.8h, v19.8b, v0.8b
- ld1 {v21.8b}, [TMP6]
- st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [WORKSPACE], 64
- usubl v20.8h, v20.8b, v0.8b
- ld1 {v22.8b}, [TMP7]
- usubl v21.8h, v21.8b, v0.8b
- ld1 {v23.8b}, [TMP8]
- usubl v22.8h, v22.8b, v0.8b
- usubl v23.8h, v23.8b, v0.8b
- st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [WORKSPACE], 64
-
- br x30
-
- .unreq SAMPLE_DATA
- .unreq START_COL
- .unreq WORKSPACE
- .unreq TMP1
- .unreq TMP2
- .unreq TMP3
- .unreq TMP4
- .unreq TMP5
- .unreq TMP6
- .unreq TMP7
- .unreq TMP8
- .unreq TMPDUP
-
-/*****************************************************************************/
-
-/*
* jsimd_fdct_islow_neon
*
- * This file contains a slow-but-accurate integer implementation of the
+ * This file contains a slower but more accurate integer implementation of the
* forward DCT (Discrete Cosine Transform). The following code is based
 * directly on the IJG's original jfdctint.c; see jfdctint.c for
* more details.
@@ -2234,68 +1541,24 @@ asm_function jsimd_convsamp_neon
* rid of a bunch of VLD1.16 instructions
*/
-#define CONST_BITS 13
-#define PASS1_BITS 2
-
-#define DESCALE_P1 (CONST_BITS-PASS1_BITS)
-#define DESCALE_P2 (CONST_BITS+PASS1_BITS)
-
-#define F_0_298 2446 /* FIX(0.298631336) */
-#define F_0_390 3196 /* FIX(0.390180644) */
-#define F_0_541 4433 /* FIX(0.541196100) */
-#define F_0_765 6270 /* FIX(0.765366865) */
-#define F_0_899 7373 /* FIX(0.899976223) */
-#define F_1_175 9633 /* FIX(1.175875602) */
-#define F_1_501 12299 /* FIX(1.501321110) */
-#define F_1_847 15137 /* FIX(1.847759065) */
-#define F_1_961 16069 /* FIX(1.961570560) */
-#define F_2_053 16819 /* FIX(2.053119869) */
-#define F_2_562 20995 /* FIX(2.562915447) */
-#define F_3_072 25172 /* FIX(3.072711026) */
-
-.balign 16
-Ljsimd_fdct_islow_neon_consts:
- .short F_0_298
- .short -F_0_390
- .short F_0_541
- .short F_0_765
- .short -F_0_899
- .short F_1_175
- .short F_1_501
- .short -F_1_847
- .short -F_1_961
- .short F_2_053
- .short -F_2_562
- .short F_3_072
- .short 0 /* padding */
- .short 0
- .short 0
- .short 0
-
-#undef F_0_298
-#undef F_0_390
-#undef F_0_541
-#undef F_0_765
-#undef F_0_899
-#undef F_1_175
-#undef F_1_501
-#undef F_1_847
-#undef F_1_961
-#undef F_2_053
-#undef F_2_562
-#undef F_3_072
-#define XFIX_P_0_298 v0.h[0]
-#define XFIX_N_0_390 v0.h[1]
-#define XFIX_P_0_541 v0.h[2]
-#define XFIX_P_0_765 v0.h[3]
-#define XFIX_N_0_899 v0.h[4]
-#define XFIX_P_1_175 v0.h[5]
-#define XFIX_P_1_501 v0.h[6]
-#define XFIX_N_1_847 v0.h[7]
-#define XFIX_N_1_961 v1.h[0]
-#define XFIX_P_2_053 v1.h[1]
-#define XFIX_N_2_562 v1.h[2]
-#define XFIX_P_3_072 v1.h[3]
+#define CONST_BITS 13
+#define PASS1_BITS 2
+
+#define DESCALE_P1 (CONST_BITS - PASS1_BITS)
+#define DESCALE_P2 (CONST_BITS + PASS1_BITS)
+
+#define XFIX_P_0_298 v0.h[0]
+#define XFIX_N_0_390 v0.h[1]
+#define XFIX_P_0_541 v0.h[2]
+#define XFIX_P_0_765 v0.h[3]
+#define XFIX_N_0_899 v0.h[4]
+#define XFIX_P_1_175 v0.h[5]
+#define XFIX_P_1_501 v0.h[6]
+#define XFIX_N_1_847 v0.h[7]
+#define XFIX_N_1_961 v1.h[0]
+#define XFIX_P_2_053 v1.h[1]
+#define XFIX_N_2_562 v1.h[2]
+#define XFIX_P_3_072 v1.h[3]
asm_function jsimd_fdct_islow_neon
@@ -2303,16 +1566,16 @@ asm_function jsimd_fdct_islow_neon
TMP .req x9
/* Load constants */
- adr TMP, Ljsimd_fdct_islow_neon_consts
+ get_symbol_loc TMP, Ljsimd_fdct_islow_neon_consts
ld1 {v0.8h, v1.8h}, [TMP]
- /* Save NEON registers */
+ /* Save Neon registers */
sub sp, sp, #64
mov x10, sp
st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], 32
st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], 32
- /* Load all DATA into NEON registers with the following allocation:
+ /* Load all DATA into Neon registers with the following allocation:
* 0 1 2 3 | 4 5 6 7
* ---------+--------
* 0 | d16 | d17 | v16.8h
@@ -2353,8 +1616,8 @@ asm_function jsimd_fdct_islow_neon
add v18.8h, v11.8h, v9.8h /* tmp12 + tmp13 */
- shl v16.8h, v16.8h, #PASS1_BITS /* dataptr[0] = (DCTELEM) LEFT_SHIFT(tmp10 + tmp11, PASS1_BITS); */
- shl v20.8h, v20.8h, #PASS1_BITS /* dataptr[4] = (DCTELEM) LEFT_SHIFT(tmp10 - tmp11, PASS1_BITS); */
+ shl v16.8h, v16.8h, #PASS1_BITS /* dataptr[0] = (DCTELEM)LEFT_SHIFT(tmp10 + tmp11, PASS1_BITS); */
+ shl v20.8h, v20.8h, #PASS1_BITS /* dataptr[4] = (DCTELEM)LEFT_SHIFT(tmp10 - tmp11, PASS1_BITS); */
smull2 v24.4s, v18.8h, XFIX_P_0_541 /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
smull v18.4s, v18.4h, XFIX_P_0_541 /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
@@ -2368,8 +1631,8 @@ asm_function jsimd_fdct_islow_neon
rshrn v18.4h, v18.4s, #DESCALE_P1
rshrn v22.4h, v22.4s, #DESCALE_P1
- rshrn2 v18.8h, v24.4s, #DESCALE_P1 /* dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */
- rshrn2 v22.8h, v25.4s, #DESCALE_P1 /* dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */
+ rshrn2 v18.8h, v24.4s, #DESCALE_P1 /* dataptr[2] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */
+ rshrn2 v22.8h, v25.4s, #DESCALE_P1 /* dataptr[6] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */
/* Odd part */
@@ -2395,10 +1658,10 @@ asm_function jsimd_fdct_islow_neon
smull2 v13.4s, v9.8h, XFIX_N_2_562
smull2 v14.4s, v10.8h, XFIX_N_1_961
smull2 v15.4s, v11.8h, XFIX_N_0_390
- smull v8.4s, v8.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223); */
- smull v9.4s, v9.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447); */
- smull v10.4s, v10.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560); */
- smull v11.4s, v11.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644); */
+ smull v8.4s, v8.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223); */
+ smull v9.4s, v9.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447); */
+ smull v10.4s, v10.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560); */
+ smull v11.4s, v11.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644); */
add v10.4s, v10.4s, v4.4s /* z3 += z5 */
add v14.4s, v14.4s, v5.4s
@@ -2427,10 +1690,10 @@ asm_function jsimd_fdct_islow_neon
rshrn v21.4h, v29.4s, #DESCALE_P1
rshrn v19.4h, v30.4s, #DESCALE_P1
rshrn v17.4h, v31.4s, #DESCALE_P1
- rshrn2 v23.8h, v24.4s, #DESCALE_P1 /* dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */
- rshrn2 v21.8h, v25.4s, #DESCALE_P1 /* dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */
- rshrn2 v19.8h, v26.4s, #DESCALE_P1 /* dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */
- rshrn2 v17.8h, v27.4s, #DESCALE_P1 /* dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */
+ rshrn2 v23.8h, v24.4s, #DESCALE_P1 /* dataptr[7] = (DCTELEM)DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */
+ rshrn2 v21.8h, v25.4s, #DESCALE_P1 /* dataptr[5] = (DCTELEM)DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */
+ rshrn2 v19.8h, v26.4s, #DESCALE_P1 /* dataptr[3] = (DCTELEM)DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */
+ rshrn2 v17.8h, v27.4s, #DESCALE_P1 /* dataptr[1] = (DCTELEM)DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */
/* Transpose */
transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4
@@ -2456,8 +1719,8 @@ asm_function jsimd_fdct_islow_neon
add v18.8h, v11.8h, v9.8h /* tmp12 + tmp13 */
- srshr v16.8h, v16.8h, #PASS1_BITS /* dataptr[0] = (DCTELEM) DESCALE(tmp10 + tmp11, PASS1_BITS); */
- srshr v20.8h, v20.8h, #PASS1_BITS /* dataptr[4] = (DCTELEM) DESCALE(tmp10 - tmp11, PASS1_BITS); */
+ srshr v16.8h, v16.8h, #PASS1_BITS /* dataptr[0] = (DCTELEM)DESCALE(tmp10 + tmp11, PASS1_BITS); */
+ srshr v20.8h, v20.8h, #PASS1_BITS /* dataptr[4] = (DCTELEM)DESCALE(tmp10 - tmp11, PASS1_BITS); */
smull2 v24.4s, v18.8h, XFIX_P_0_541 /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
smull v18.4s, v18.4h, XFIX_P_0_541 /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
@@ -2471,8 +1734,8 @@ asm_function jsimd_fdct_islow_neon
rshrn v18.4h, v18.4s, #DESCALE_P2
rshrn v22.4h, v22.4s, #DESCALE_P2
- rshrn2 v18.8h, v24.4s, #DESCALE_P2 /* dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */
- rshrn2 v22.8h, v25.4s, #DESCALE_P2 /* dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */
+ rshrn2 v18.8h, v24.4s, #DESCALE_P2 /* dataptr[2] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */
+ rshrn2 v22.8h, v25.4s, #DESCALE_P2 /* dataptr[6] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */
/* Odd part */
add v8.8h, v28.8h, v31.8h /* z1 = tmp4 + tmp7; */
@@ -2498,10 +1761,10 @@ asm_function jsimd_fdct_islow_neon
smull2 v13.4s, v9.8h, XFIX_N_2_562
smull2 v14.4s, v10.8h, XFIX_N_1_961
smull2 v15.4s, v11.8h, XFIX_N_0_390
- smull v8.4s, v8.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223); */
- smull v9.4s, v9.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447); */
- smull v10.4s, v10.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560); */
- smull v11.4s, v11.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644); */
+ smull v8.4s, v8.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223); */
+ smull v9.4s, v9.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447); */
+ smull v10.4s, v10.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560); */
+ smull v11.4s, v11.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644); */
add v10.4s, v10.4s, v4.4s
add v14.4s, v14.4s, v5.4s
@@ -2530,16 +1793,16 @@ asm_function jsimd_fdct_islow_neon
rshrn v21.4h, v29.4s, #DESCALE_P2
rshrn v19.4h, v30.4s, #DESCALE_P2
rshrn v17.4h, v31.4s, #DESCALE_P2
- rshrn2 v23.8h, v24.4s, #DESCALE_P2 /* dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */
- rshrn2 v21.8h, v25.4s, #DESCALE_P2 /* dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */
- rshrn2 v19.8h, v26.4s, #DESCALE_P2 /* dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */
- rshrn2 v17.8h, v27.4s, #DESCALE_P2 /* dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */
+ rshrn2 v23.8h, v24.4s, #DESCALE_P2 /* dataptr[7] = (DCTELEM)DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */
+ rshrn2 v21.8h, v25.4s, #DESCALE_P2 /* dataptr[5] = (DCTELEM)DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */
+ rshrn2 v19.8h, v26.4s, #DESCALE_P2 /* dataptr[3] = (DCTELEM)DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */
+ rshrn2 v17.8h, v27.4s, #DESCALE_P2 /* dataptr[1] = (DCTELEM)DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */
/* store results */
st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
- /* Restore NEON registers */
+ /* Restore Neon registers */
ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
@@ -2565,405 +1828,10 @@ asm_function jsimd_fdct_islow_neon
/*****************************************************************************/
/*
- * jsimd_fdct_ifast_neon
- *
- * This function contains a fast but less accurate integer implementation of
- * the forward DCT (Discrete Cosine Transform). It uses the same calculations
- * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast'
- * function from jfdctfst.c.
- *
- * TODO: can be combined with 'jsimd_convsamp_neon' to get
- * rid of a bunch of VLD1.16 instructions
- */
-
-#undef XFIX_0_541196100
-#define XFIX_0_382683433 v0.h[0]
-#define XFIX_0_541196100 v0.h[1]
-#define XFIX_0_707106781 v0.h[2]
-#define XFIX_1_306562965 v0.h[3]
-
-.balign 16
-Ljsimd_fdct_ifast_neon_consts:
- .short (98 * 128) /* XFIX_0_382683433 */
- .short (139 * 128) /* XFIX_0_541196100 */
- .short (181 * 128) /* XFIX_0_707106781 */
- .short (334 * 128 - 256 * 128) /* XFIX_1_306562965 */
-
-asm_function jsimd_fdct_ifast_neon
-
- DATA .req x0
- TMP .req x9
-
- /* Load constants */
- adr TMP, Ljsimd_fdct_ifast_neon_consts
- ld1 {v0.4h}, [TMP]
-
- /* Load all DATA into NEON registers with the following allocation:
- * 0 1 2 3 | 4 5 6 7
- * ---------+--------
- * 0 | d16 | d17 | v0.8h
- * 1 | d18 | d19 | q9
- * 2 | d20 | d21 | q10
- * 3 | d22 | d23 | q11
- * 4 | d24 | d25 | q12
- * 5 | d26 | d27 | q13
- * 6 | d28 | d29 | q14
- * 7 | d30 | d31 | q15
- */
-
- ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
- ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
- mov TMP, #2
- sub DATA, DATA, #64
-1:
- /* Transpose */
- transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v1, v2, v3, v4
- subs TMP, TMP, #1
- /* 1-D FDCT */
- add v4.8h, v19.8h, v20.8h
- sub v20.8h, v19.8h, v20.8h
- sub v28.8h, v18.8h, v21.8h
- add v18.8h, v18.8h, v21.8h
- sub v29.8h, v17.8h, v22.8h
- add v17.8h, v17.8h, v22.8h
- sub v21.8h, v16.8h, v23.8h
- add v16.8h, v16.8h, v23.8h
- sub v6.8h, v17.8h, v18.8h
- sub v7.8h, v16.8h, v4.8h
- add v5.8h, v17.8h, v18.8h
- add v6.8h, v6.8h, v7.8h
- add v4.8h, v16.8h, v4.8h
- sqdmulh v6.8h, v6.8h, XFIX_0_707106781
- add v19.8h, v20.8h, v28.8h
- add v16.8h, v4.8h, v5.8h
- sub v20.8h, v4.8h, v5.8h
- add v5.8h, v28.8h, v29.8h
- add v29.8h, v29.8h, v21.8h
- sqdmulh v5.8h, v5.8h, XFIX_0_707106781
- sub v28.8h, v19.8h, v29.8h
- add v18.8h, v7.8h, v6.8h
- sqdmulh v28.8h, v28.8h, XFIX_0_382683433
- sub v22.8h, v7.8h, v6.8h
- sqdmulh v19.8h, v19.8h, XFIX_0_541196100
- sqdmulh v7.8h, v29.8h, XFIX_1_306562965
- add v6.8h, v21.8h, v5.8h
- sub v5.8h, v21.8h, v5.8h
- add v29.8h, v29.8h, v28.8h
- add v19.8h, v19.8h, v28.8h
- add v29.8h, v29.8h, v7.8h
- add v21.8h, v5.8h, v19.8h
- sub v19.8h, v5.8h, v19.8h
- add v17.8h, v6.8h, v29.8h
- sub v23.8h, v6.8h, v29.8h
-
- b.ne 1b
-
- /* store results */
- st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
- st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
-
- br x30
-
- .unreq DATA
- .unreq TMP
-#undef XFIX_0_382683433
-#undef XFIX_0_541196100
-#undef XFIX_0_707106781
-#undef XFIX_1_306562965
-
-
-/*****************************************************************************/
-
-/*
- * GLOBAL(void)
- * jsimd_quantize_neon (JCOEFPTR coef_block, DCTELEM *divisors,
- * DCTELEM *workspace);
- *
- */
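The divisors table is consumed in three 64-entry slices (reciprocals at offset 0, corrections at 64*2 bytes, shifts at 64*6 bytes), giving a divide-free quantizer. A scalar sketch of one coefficient follows; the parameter names are descriptive, not the actual libjpeg types, and the conditional negate stands in for the NEON sign-mask/eor/sub sequence, which is equivalent:

    #include <stdint.h>

    /* q = ((|coef| + correction) * reciprocal >> 16) >> shift,
     * then the original sign is restored. */
    static int16_t quantize_one(int16_t coef, uint16_t reciprocal,
                                uint16_t correction, uint16_t shift)
    {
      uint16_t mag = (uint16_t)(coef < 0 ? -coef : coef);
      uint16_t biased = (uint16_t)(mag + correction);
      uint16_t q = (uint16_t)((((uint32_t)biased * reciprocal) >> 16) >> shift);
      return (int16_t)(coef < 0 ? -(int32_t)q : (int32_t)q);
    }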
-asm_function jsimd_quantize_neon
-
- COEF_BLOCK .req x0
- DIVISORS .req x1
- WORKSPACE .req x2
-
- RECIPROCAL .req DIVISORS
- CORRECTION .req x9
- SHIFT .req x10
- LOOP_COUNT .req x11
-
- mov LOOP_COUNT, #2
- add CORRECTION, DIVISORS, #(64 * 2)
- add SHIFT, DIVISORS, #(64 * 6)
-1:
- subs LOOP_COUNT, LOOP_COUNT, #1
- ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [WORKSPACE], 64
- ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [CORRECTION], 64
- abs v20.8h, v0.8h
- abs v21.8h, v1.8h
- abs v22.8h, v2.8h
- abs v23.8h, v3.8h
- ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [RECIPROCAL], 64
- add v20.8h, v20.8h, v4.8h /* add correction */
- add v21.8h, v21.8h, v5.8h
- add v22.8h, v22.8h, v6.8h
- add v23.8h, v23.8h, v7.8h
- umull v4.4s, v20.4h, v28.4h /* multiply by reciprocal */
- umull2 v16.4s, v20.8h, v28.8h
- umull v5.4s, v21.4h, v29.4h
- umull2 v17.4s, v21.8h, v29.8h
- umull v6.4s, v22.4h, v30.4h /* multiply by reciprocal */
- umull2 v18.4s, v22.8h, v30.8h
- umull v7.4s, v23.4h, v31.4h
- umull2 v19.4s, v23.8h, v31.8h
- ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [SHIFT], 64
- shrn v4.4h, v4.4s, #16
- shrn v5.4h, v5.4s, #16
- shrn v6.4h, v6.4s, #16
- shrn v7.4h, v7.4s, #16
- shrn2 v4.8h, v16.4s, #16
- shrn2 v5.8h, v17.4s, #16
- shrn2 v6.8h, v18.4s, #16
- shrn2 v7.8h, v19.4s, #16
- neg v24.8h, v24.8h
- neg v25.8h, v25.8h
- neg v26.8h, v26.8h
- neg v27.8h, v27.8h
- sshr v0.8h, v0.8h, #15 /* extract sign */
- sshr v1.8h, v1.8h, #15
- sshr v2.8h, v2.8h, #15
- sshr v3.8h, v3.8h, #15
- ushl v4.8h, v4.8h, v24.8h /* shift */
- ushl v5.8h, v5.8h, v25.8h
- ushl v6.8h, v6.8h, v26.8h
- ushl v7.8h, v7.8h, v27.8h
-
- eor v4.16b, v4.16b, v0.16b /* restore sign */
- eor v5.16b, v5.16b, v1.16b
- eor v6.16b, v6.16b, v2.16b
- eor v7.16b, v7.16b, v3.16b
- sub v4.8h, v4.8h, v0.8h
- sub v5.8h, v5.8h, v1.8h
- sub v6.8h, v6.8h, v2.8h
- sub v7.8h, v7.8h, v3.8h
- st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [COEF_BLOCK], 64
-
- b.ne 1b
-
- br x30 /* return */
-
- .unreq COEF_BLOCK
- .unreq DIVISORS
- .unreq WORKSPACE
- .unreq RECIPROCAL
- .unreq CORRECTION
- .unreq SHIFT
- .unreq LOOP_COUNT
-
-
-/*****************************************************************************/
-
-/*
- * Downsample pixel values of a single component.
- * This version handles the common case of 2:1 horizontal and 1:1 vertical,
- * without smoothing.
- *
- * GLOBAL(void)
- * jsimd_h2v1_downsample_neon (JDIMENSION image_width, int max_v_samp_factor,
- * JDIMENSION v_samp_factor,
- * JDIMENSION width_blocks, JSAMPARRAY input_data,
- * JSAMPARRAY output_data);
- */
-
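The 0x10000 seed dup'ed into v16 puts an alternating 0,1 rounding bias into the 16-bit accumulator lanes, matching IJG's scalar downsampler; in C terms:

    #include <stdint.h>

    /* Scalar equivalent of the column loops below: each output pixel
     * is the rounded average of two horizontal neighbors, with the
     * rounding bias alternating 0,1,0,1 across output columns. */
    static void h2v1_downsample_row(const uint8_t *in, uint8_t *out,
                                    int output_cols)
    {
      int i, bias = 0;
      for (i = 0; i < output_cols; i++) {
        out[i] = (uint8_t)((in[2 * i] + in[2 * i + 1] + bias) >> 1);
        bias ^= 1;
      }
    }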
-.balign 16
-Ljsimd_h2_downsample_neon_consts:
- .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
- 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F /* diff 0 */
- .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
- 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0E /* diff 1 */
- .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
- 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0D, 0x0D /* diff 2 */
- .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
- 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0C, 0x0C, 0x0C /* diff 3 */
- .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
- 0x08, 0x09, 0x0A, 0x0B, 0x0B, 0x0B, 0x0B, 0x0B /* diff 4 */
- .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
- 0x08, 0x09, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A /* diff 5 */
- .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
- 0x08, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09 /* diff 6 */
- .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
- 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08 /* diff 7 */
- .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
- 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07 /* diff 8 */
- .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06, \
- 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06 /* diff 9 */
- .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x05, \
- 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05 /* diff 10 */
- .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x04, 0x04, \
- 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04 /* diff 11 */
- .byte 0x00, 0x01, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, \
- 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 /* diff 12 */
- .byte 0x00, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, \
- 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02 /* diff 13 */
- .byte 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, \
- 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 /* diff 14 */
- .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 /* diff 15 */
-
-asm_function jsimd_h2v1_downsample_neon
- IMAGE_WIDTH .req x0
- MAX_V_SAMP .req x1
- V_SAMP .req x2
- BLOCK_WIDTH .req x3
- INPUT_DATA .req x4
- OUTPUT_DATA .req x5
- OUTPTR .req x9
- INPTR .req x10
- TMP1 .req x11
- TMP2 .req x12
- TMP3 .req x13
- TMPDUP .req w15
-
- mov TMPDUP, #0x10000
- lsl TMP2, BLOCK_WIDTH, #4
- sub TMP2, TMP2, IMAGE_WIDTH
- adr TMP3, Ljsimd_h2_downsample_neon_consts
- add TMP3, TMP3, TMP2, lsl #4
- dup v16.4s, TMPDUP
- ld1 {v18.16b}, [TMP3]
-
-1: /* row loop */
- ldr INPTR, [INPUT_DATA], #8
- ldr OUTPTR, [OUTPUT_DATA], #8
- subs TMP1, BLOCK_WIDTH, #1
- b.eq 3f
-2: /* columns */
- ld1 {v0.16b}, [INPTR], #16
- mov v4.16b, v16.16b
- subs TMP1, TMP1, #1
- uadalp v4.8h, v0.16b
- shrn v6.8b, v4.8h, #1
- st1 {v6.8b}, [OUTPTR], #8
- b.ne 2b
-3: /* last columns */
- ld1 {v0.16b}, [INPTR]
- mov v4.16b, v16.16b
- subs V_SAMP, V_SAMP, #1
- /* expand right */
- tbl v2.16b, {v0.16b}, v18.16b
- uadalp v4.8h, v2.16b
- shrn v6.8b, v4.8h, #1
- st1 {v6.8b}, [OUTPTR], #8
- b.ne 1b
-
- br x30
-
- .unreq IMAGE_WIDTH
- .unreq MAX_V_SAMP
- .unreq V_SAMP
- .unreq BLOCK_WIDTH
- .unreq INPUT_DATA
- .unreq OUTPUT_DATA
- .unreq OUTPTR
- .unreq INPTR
- .unreq TMP1
- .unreq TMP2
- .unreq TMP3
- .unreq TMPDUP
-
-
-/*****************************************************************************/
-
-/*
- * Downsample pixel values of a single component.
- * This version handles the common case of 2:1 horizontal and 2:1 vertical,
- * without smoothing.
- *
- * GLOBAL(void)
- * jsimd_h2v2_downsample_neon (JDIMENSION image_width, int max_v_samp_factor,
- * JDIMENSION v_samp_factor, JDIMENSION width_blocks,
- * JSAMPARRAY input_data, JSAMPARRAY output_data);
- */
-
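Likewise for the 2:1 x 2:1 case: four neighbors are averaged with an alternating 1,2 bias (the 0x00020001 pattern built in TMPDUP) and a shift by 2:

    #include <stdint.h>

    /* Scalar equivalent of the loops below; in0/in1 are two adjacent
     * input rows, and the rounding bias alternates 1,2,1,2. */
    static void h2v2_downsample_row(const uint8_t *in0, const uint8_t *in1,
                                    uint8_t *out, int output_cols)
    {
      int i, bias = 1;
      for (i = 0; i < output_cols; i++) {
        out[i] = (uint8_t)((in0[2 * i] + in0[2 * i + 1] +
                            in1[2 * i] + in1[2 * i + 1] + bias) >> 2);
        bias ^= 3;  /* toggle between 1 and 2 */
      }
    }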
-.balign 16
-asm_function jsimd_h2v2_downsample_neon
- IMAGE_WIDTH .req x0
- MAX_V_SAMP .req x1
- V_SAMP .req x2
- BLOCK_WIDTH .req x3
- INPUT_DATA .req x4
- OUTPUT_DATA .req x5
- OUTPTR .req x9
- INPTR0 .req x10
- INPTR1 .req x14
- TMP1 .req x11
- TMP2 .req x12
- TMP3 .req x13
- TMPDUP .req w15
-
- mov TMPDUP, #1
- lsl TMP2, BLOCK_WIDTH, #4
- lsl TMPDUP, TMPDUP, #17
- sub TMP2, TMP2, IMAGE_WIDTH
- adr TMP3, Ljsimd_h2_downsample_neon_consts
- orr TMPDUP, TMPDUP, #1
- add TMP3, TMP3, TMP2, lsl #4
- dup v16.4s, TMPDUP
- ld1 {v18.16b}, [TMP3]
-
-1: /* row loop */
- ldr INPTR0, [INPUT_DATA], #8
- ldr OUTPTR, [OUTPUT_DATA], #8
- ldr INPTR1, [INPUT_DATA], #8
- subs TMP1, BLOCK_WIDTH, #1
- b.eq 3f
-2: /* columns */
- ld1 {v0.16b}, [INPTR0], #16
- ld1 {v1.16b}, [INPTR1], #16
- mov v4.16b, v16.16b
- subs TMP1, TMP1, #1
- uadalp v4.8h, v0.16b
- uadalp v4.8h, v1.16b
- shrn v6.8b, v4.8h, #2
- st1 {v6.8b}, [OUTPTR], #8
- b.ne 2b
-3: /* last columns */
- ld1 {v0.16b}, [INPTR0], #16
- ld1 {v1.16b}, [INPTR1], #16
- mov v4.16b, v16.16b
- subs V_SAMP, V_SAMP, #1
- /* expand right */
- tbl v2.16b, {v0.16b}, v18.16b
- tbl v3.16b, {v1.16b}, v18.16b
- uadalp v4.8h, v2.16b
- uadalp v4.8h, v3.16b
- shrn v6.8b, v4.8h, #2
- st1 {v6.8b}, [OUTPTR], #8
- b.ne 1b
-
- br x30
-
- .unreq IMAGE_WIDTH
- .unreq MAX_V_SAMP
- .unreq V_SAMP
- .unreq BLOCK_WIDTH
- .unreq INPUT_DATA
- .unreq OUTPUT_DATA
- .unreq OUTPTR
- .unreq INPTR0
- .unreq INPTR1
- .unreq TMP1
- .unreq TMP2
- .unreq TMP3
- .unreq TMPDUP
-
-
-/*****************************************************************************/
-
-/*
- * GLOBAL(JOCTET*)
- * jsimd_huff_encode_one_block (working_state *state, JOCTET *buffer,
- * JCOEFPTR block, int last_dc_val,
- * c_derived_tbl *dctbl, c_derived_tbl *actbl)
+ * GLOBAL(JOCTET *)
+ * jsimd_huff_encode_one_block(working_state *state, JOCTET *buffer,
+ * JCOEFPTR block, int last_dc_val,
+ * c_derived_tbl *dctbl, c_derived_tbl *actbl)
*
*/
@@ -3010,41 +1878,6 @@ asm_function jsimd_h2v2_downsample_neon
.macro generate_jsimd_huff_encode_one_block fast_tbl
-.balign 16
-.if \fast_tbl == 1
-Ljsimd_huff_encode_one_block_neon_consts:
-.else
-Ljsimd_huff_encode_one_block_neon_slowtbl_consts:
-.endif
- .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \
- 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
-.if \fast_tbl == 1
- .byte 0, 1, 2, 3, 16, 17, 32, 33, \
- 18, 19, 4, 5, 6, 7, 20, 21 /* L0 => L3 : 4 lines OK */
- .byte 34, 35, 48, 49, 255, 255, 50, 51, \
- 36, 37, 22, 23, 8, 9, 10, 11 /* L0 => L3 : 4 lines OK */
- .byte 8, 9, 22, 23, 36, 37, 50, 51, \
- 255, 255, 255, 255, 255, 255, 52, 53 /* L1 => L4 : 4 lines OK */
- .byte 54, 55, 40, 41, 26, 27, 12, 13, \
- 14, 15, 28, 29, 42, 43, 56, 57 /* L0 => L3 : 4 lines OK */
- .byte 6, 7, 20, 21, 34, 35, 48, 49, \
- 50, 51, 36, 37, 22, 23, 8, 9 /* L4 => L7 : 4 lines OK */
- .byte 42, 43, 28, 29, 14, 15, 30, 31, \
- 44, 45, 58, 59, 255, 255, 255, 255 /* L1 => L4 : 4 lines OK */
- .byte 255, 255, 255, 255, 56, 57, 42, 43, \
- 28, 29, 14, 15, 30, 31, 44, 45 /* L3 => L6 : 4 lines OK */
- .byte 26, 27, 40, 41, 42, 43, 28, 29, \
- 14, 15, 30, 31, 44, 45, 46, 47 /* L5 => L7 : 3 lines OK */
- .byte 255, 255, 255, 255, 0, 1, 255, 255, \
- 255, 255, 255, 255, 255, 255, 255, 255 /* L4 : 1 line OK */
- .byte 255, 255, 255, 255, 255, 255, 255, 255, \
- 0, 1, 16, 17, 2, 3, 255, 255 /* L5 => L6 : 2 lines OK */
- .byte 255, 255, 255, 255, 255, 255, 255, 255, \
- 255, 255, 255, 255, 8, 9, 22, 23 /* L5 => L6 : 2 lines OK */
- .byte 4, 5, 6, 7, 255, 255, 255, 255, \
- 255, 255, 255, 255, 255, 255, 255, 255 /* L7 : 1 line OK */
-.endif
-
.if \fast_tbl == 1
asm_function jsimd_huff_encode_one_block_neon
.else
@@ -3052,13 +1885,9 @@ asm_function jsimd_huff_encode_one_block_neon_slowtbl
.endif
sub sp, sp, 272
sub BUFFER, BUFFER, #0x1 /* BUFFER=buffer-- */
- /* Save ARM registers */
+ /* Save Arm registers */
stp x19, x20, [sp]
-.if \fast_tbl == 1
- adr x15, Ljsimd_huff_encode_one_block_neon_consts
-.else
- adr x15, Ljsimd_huff_encode_one_block_neon_slowtbl_consts
-.endif
+ get_symbol_loc x15, Ljsimd_huff_encode_one_block_neon_consts
ldr PUT_BUFFER, [x0, #0x10]
ldr PUT_BITSw, [x0, #0x18]
ldrsh w12, [x2] /* load DC coeff in w12 */
@@ -3278,7 +2107,7 @@ asm_function jsimd_huff_encode_one_block_neon_slowtbl
put_bits x10, x11
addp v16.16b, v16.16b, v18.16b
checkbuf47
- umov x9,v16.D[0]
+ umov x9, v16.D[0]
put_bits x13, x12
cnt v17.8b, v16.8b
mvn x9, x9
diff --git a/media/libjpeg/simd/arm/align.h b/media/libjpeg/simd/arm/align.h
new file mode 100644
index 0000000000..cff4241e84
--- /dev/null
+++ b/media/libjpeg/simd/arm/align.h
@@ -0,0 +1,28 @@
+/*
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* How to obtain memory alignment for structures and variables */
+#if defined(_MSC_VER)
+#define ALIGN(alignment) __declspec(align(alignment))
+#elif defined(__clang__) || defined(__GNUC__)
+#define ALIGN(alignment) __attribute__((aligned(alignment)))
+#else
+#error "Unknown compiler"
+#endif
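
Editor's note: a brief usage sketch of the macro defined above (the table name is hypothetical). Placing ALIGN(16) on a static table guarantees that 16-byte-aligned Neon vector loads of the data are safe:

/* Hypothetical example: a 16-byte-aligned constant table */
ALIGN(16) static const unsigned char identity_map[16] = {
  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
};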
diff --git a/media/libjpeg/simd/arm/jccolor-neon.c b/media/libjpeg/simd/arm/jccolor-neon.c
new file mode 100644
index 0000000000..9fcc62dd25
--- /dev/null
+++ b/media/libjpeg/simd/arm/jccolor-neon.c
@@ -0,0 +1,160 @@
+/*
+ * jccolor-neon.c - colorspace conversion (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+#include "neon-compat.h"
+
+#include <arm_neon.h>
+
+
+/* RGB -> YCbCr conversion constants */
+
+#define F_0_298 19595
+#define F_0_587 38470
+#define F_0_113 7471
+#define F_0_168 11059
+#define F_0_331 21709
+#define F_0_500 32768
+#define F_0_418 27439
+#define F_0_081 5329
+
+ALIGN(16) static const uint16_t jsimd_rgb_ycc_neon_consts[] = {
+ F_0_298, F_0_587, F_0_113, F_0_168,
+ F_0_331, F_0_500, F_0_418, F_0_081
+};
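
Editor's note: the macro names above encode truncated coefficient values. A sketch of how they can be derived from the exact RGB -> YCbCr coefficients; the FIX() helper below mirrors the one used by the scalar code in jccolor.c:

/* Q16 fixed-point scaling, as in jccolor.c: x * 2^16, rounded */
#define FIX(x)  ((int)((x) * 65536.0 + 0.5))
/* FIX(0.29900) == 19595 (F_0_298), FIX(0.58700) == 38470 (F_0_587),
 * FIX(0.11400) ==  7471 (F_0_113), FIX(0.16874) == 11059 (F_0_168),
 * FIX(0.33126) == 21709 (F_0_331), FIX(0.50000) == 32768 (F_0_500),
 * FIX(0.41869) == 27439 (F_0_418), FIX(0.08131) ==  5329 (F_0_081)
 */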
+
+
+/* Include inline routines for colorspace extensions. */
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include "aarch64/jccolext-neon.c"
+#else
+#include "aarch32/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED EXT_RGB_RED
+#define RGB_GREEN EXT_RGB_GREEN
+#define RGB_BLUE EXT_RGB_BLUE
+#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+#define jsimd_rgb_ycc_convert_neon jsimd_extrgb_ycc_convert_neon
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include "aarch64/jccolext-neon.c"
+#else
+#include "aarch32/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_neon
+
+#define RGB_RED EXT_RGBX_RED
+#define RGB_GREEN EXT_RGBX_GREEN
+#define RGB_BLUE EXT_RGBX_BLUE
+#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+#define jsimd_rgb_ycc_convert_neon jsimd_extrgbx_ycc_convert_neon
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include "aarch64/jccolext-neon.c"
+#else
+#include "aarch32/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_neon
+
+#define RGB_RED EXT_BGR_RED
+#define RGB_GREEN EXT_BGR_GREEN
+#define RGB_BLUE EXT_BGR_BLUE
+#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+#define jsimd_rgb_ycc_convert_neon jsimd_extbgr_ycc_convert_neon
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include "aarch64/jccolext-neon.c"
+#else
+#include "aarch32/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_neon
+
+#define RGB_RED EXT_BGRX_RED
+#define RGB_GREEN EXT_BGRX_GREEN
+#define RGB_BLUE EXT_BGRX_BLUE
+#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+#define jsimd_rgb_ycc_convert_neon jsimd_extbgrx_ycc_convert_neon
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include "aarch64/jccolext-neon.c"
+#else
+#include "aarch32/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_neon
+
+#define RGB_RED EXT_XBGR_RED
+#define RGB_GREEN EXT_XBGR_GREEN
+#define RGB_BLUE EXT_XBGR_BLUE
+#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+#define jsimd_rgb_ycc_convert_neon jsimd_extxbgr_ycc_convert_neon
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include "aarch64/jccolext-neon.c"
+#else
+#include "aarch32/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_neon
+
+#define RGB_RED EXT_XRGB_RED
+#define RGB_GREEN EXT_XRGB_GREEN
+#define RGB_BLUE EXT_XRGB_BLUE
+#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+#define jsimd_rgb_ycc_convert_neon jsimd_extxrgb_ycc_convert_neon
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include "aarch64/jccolext-neon.c"
+#else
+#include "aarch32/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_neon
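
Editor's note: each #define/#include pair above stamps out one specialized conversion routine from the shared jccolext-neon.c template. As an illustration of the pattern, the EXT_BGR block behaves as if the following function had been written by hand (signature per jsimd.h; component offsets taken from the EXT_BGR_* macros in jmorecfg.h):

/* Effective expansion for the EXT_BGR case, shown for illustration only */
void jsimd_extbgr_ycc_convert_neon(JDIMENSION image_width,
                                   JSAMPARRAY input_buf,
                                   JSAMPIMAGE output_buf,
                                   JDIMENSION output_row, int num_rows);
/* ...compiled with RGB_RED == 2, RGB_GREEN == 1, RGB_BLUE == 0, and
 * RGB_PIXELSIZE == 3, so the vector loads pick B, G, R out of 3-byte pixels.
 */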
diff --git a/media/libjpeg/simd/arm/jcgray-neon.c b/media/libjpeg/simd/arm/jcgray-neon.c
new file mode 100644
index 0000000000..71c7b2de21
--- /dev/null
+++ b/media/libjpeg/simd/arm/jcgray-neon.c
@@ -0,0 +1,120 @@
+/*
+ * jcgray-neon.c - grayscale colorspace conversion (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+
+#include <arm_neon.h>
+
+
+/* RGB -> Grayscale conversion constants */
+
+#define F_0_298 19595
+#define F_0_587 38470
+#define F_0_113 7471
+
+
+/* Include inline routines for colorspace extensions. */
+
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED EXT_RGB_RED
+#define RGB_GREEN EXT_RGB_GREEN
+#define RGB_BLUE EXT_RGB_BLUE
+#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+#define jsimd_rgb_gray_convert_neon jsimd_extrgb_gray_convert_neon
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_neon
+
+#define RGB_RED EXT_RGBX_RED
+#define RGB_GREEN EXT_RGBX_GREEN
+#define RGB_BLUE EXT_RGBX_BLUE
+#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+#define jsimd_rgb_gray_convert_neon jsimd_extrgbx_gray_convert_neon
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_neon
+
+#define RGB_RED EXT_BGR_RED
+#define RGB_GREEN EXT_BGR_GREEN
+#define RGB_BLUE EXT_BGR_BLUE
+#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+#define jsimd_rgb_gray_convert_neon jsimd_extbgr_gray_convert_neon
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_neon
+
+#define RGB_RED EXT_BGRX_RED
+#define RGB_GREEN EXT_BGRX_GREEN
+#define RGB_BLUE EXT_BGRX_BLUE
+#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+#define jsimd_rgb_gray_convert_neon jsimd_extbgrx_gray_convert_neon
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_neon
+
+#define RGB_RED EXT_XBGR_RED
+#define RGB_GREEN EXT_XBGR_GREEN
+#define RGB_BLUE EXT_XBGR_BLUE
+#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+#define jsimd_rgb_gray_convert_neon jsimd_extxbgr_gray_convert_neon
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_neon
+
+#define RGB_RED EXT_XRGB_RED
+#define RGB_GREEN EXT_XRGB_GREEN
+#define RGB_BLUE EXT_XRGB_BLUE
+#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+#define jsimd_rgb_gray_convert_neon jsimd_extxrgb_gray_convert_neon
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_neon
diff --git a/media/libjpeg/simd/arm/jcgryext-neon.c b/media/libjpeg/simd/arm/jcgryext-neon.c
new file mode 100644
index 0000000000..416a7385df
--- /dev/null
+++ b/media/libjpeg/simd/arm/jcgryext-neon.c
@@ -0,0 +1,106 @@
+/*
+ * jcgryext-neon.c - grayscale colorspace conversion (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jcgray-neon.c */
+
+
+/* RGB -> Grayscale conversion is defined by the following equation:
+ * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+ *
+ * Avoid floating point arithmetic by using shifted integer constants:
+ * 0.29899597 = 19595 * 2^-16
+ * 0.58700561 = 38470 * 2^-16
+ * 0.11399841 = 7471 * 2^-16
+ * These constants are defined in jcgray-neon.c
+ *
+ * This is the same computation as the RGB -> Y portion of RGB -> YCbCr.
+ */
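
Editor's note: for reference, a scalar sketch of the same Q16 arithmetic that the vector code below applies to each pixel (a hypothetical helper, not part of the library):

static inline unsigned char rgb_to_gray_scalar(unsigned char r,
                                               unsigned char g,
                                               unsigned char b)
{
  /* 19595 + 38470 + 7471 == 65536, so y fits comfortably in 32 bits. */
  unsigned int y = 19595 * r + 38470 * g + 7471 * b;
  return (unsigned char)((y + 32768) >> 16);  /* rounding right shift by 16 */
}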
+
+void jsimd_rgb_gray_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
+{
+ JSAMPROW inptr;
+ JSAMPROW outptr;
+ /* Allocate temporary buffer for final (image_width % 16) pixels in row. */
+ ALIGN(16) uint8_t tmp_buf[16 * RGB_PIXELSIZE];
+
+ while (--num_rows >= 0) {
+ inptr = *input_buf++;
+ outptr = output_buf[0][output_row];
+ output_row++;
+
+ int cols_remaining = image_width;
+ for (; cols_remaining > 0; cols_remaining -= 16) {
+
+ /* To prevent buffer overread by the vector load instructions, the last
+ * (image_width % 16) columns of data are first copied via memcpy() to a
+ * temporary buffer large enough to accommodate the vector load.
+ */
+ if (cols_remaining < 16) {
+ memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
+ inptr = tmp_buf;
+ }
+
+#if RGB_PIXELSIZE == 4
+ uint8x16x4_t input_pixels = vld4q_u8(inptr);
+#else
+ uint8x16x3_t input_pixels = vld3q_u8(inptr);
+#endif
+ uint16x8_t r_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_RED]));
+ uint16x8_t r_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_RED]));
+ uint16x8_t g_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_GREEN]));
+ uint16x8_t g_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_GREEN]));
+ uint16x8_t b_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_BLUE]));
+ uint16x8_t b_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_BLUE]));
+
+ /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
+ uint32x4_t y_ll = vmull_n_u16(vget_low_u16(r_l), F_0_298);
+ uint32x4_t y_lh = vmull_n_u16(vget_high_u16(r_l), F_0_298);
+ uint32x4_t y_hl = vmull_n_u16(vget_low_u16(r_h), F_0_298);
+ uint32x4_t y_hh = vmull_n_u16(vget_high_u16(r_h), F_0_298);
+ y_ll = vmlal_n_u16(y_ll, vget_low_u16(g_l), F_0_587);
+ y_lh = vmlal_n_u16(y_lh, vget_high_u16(g_l), F_0_587);
+ y_hl = vmlal_n_u16(y_hl, vget_low_u16(g_h), F_0_587);
+ y_hh = vmlal_n_u16(y_hh, vget_high_u16(g_h), F_0_587);
+ y_ll = vmlal_n_u16(y_ll, vget_low_u16(b_l), F_0_113);
+ y_lh = vmlal_n_u16(y_lh, vget_high_u16(b_l), F_0_113);
+ y_hl = vmlal_n_u16(y_hl, vget_low_u16(b_h), F_0_113);
+ y_hh = vmlal_n_u16(y_hh, vget_high_u16(b_h), F_0_113);
+
+ /* Descale Y values (rounding right shift) and narrow to 16-bit. */
+ uint16x8_t y_l = vcombine_u16(vrshrn_n_u32(y_ll, 16),
+ vrshrn_n_u32(y_lh, 16));
+ uint16x8_t y_h = vcombine_u16(vrshrn_n_u32(y_hl, 16),
+ vrshrn_n_u32(y_hh, 16));
+
+ /* Narrow Y values to 8-bit and store to memory. Buffer overwrite is
+ * permitted up to the next multiple of ALIGN_SIZE bytes.
+ */
+ vst1q_u8(outptr, vcombine_u8(vmovn_u16(y_l), vmovn_u16(y_h)));
+
+ /* Increment pointers. */
+ inptr += (16 * RGB_PIXELSIZE);
+ outptr += 16;
+ }
+ }
+}
diff --git a/media/libjpeg/simd/arm/jchuff.h b/media/libjpeg/simd/arm/jchuff.h
new file mode 100644
index 0000000000..2fbd252b9b
--- /dev/null
+++ b/media/libjpeg/simd/arm/jchuff.h
@@ -0,0 +1,131 @@
+/*
+ * jchuff.h
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1997, Thomas G. Lane.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2009, 2018, 2021, D. R. Commander.
+ * Copyright (C) 2018, Matthias Räncker.
+ * Copyright (C) 2020-2021, Arm Limited.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ */
+
+/* Expanded entropy encoder object for Huffman encoding.
+ *
+ * The savable_state subrecord contains fields that change within an MCU,
+ * but must not be updated permanently until we complete the MCU.
+ */
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+#define BIT_BUF_SIZE 64
+#else
+#define BIT_BUF_SIZE 32
+#endif
+
+typedef struct {
+ size_t put_buffer; /* current bit accumulation buffer */
+ int free_bits; /* # of bits available in it */
+ int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */
+} savable_state;
+
+typedef struct {
+ JOCTET *next_output_byte; /* => next byte to write in buffer */
+ size_t free_in_buffer; /* # of byte spaces remaining in buffer */
+ savable_state cur; /* Current bit buffer & DC state */
+ j_compress_ptr cinfo; /* dump_buffer needs access to this */
+ int simd;
+} working_state;
+
+/* Outputting bits to the file */
+
+/* Output byte b and, speculatively, an additional 0 byte. 0xFF must be encoded
+ * as 0xFF 0x00, so the output buffer pointer is advanced by 2 if the byte is
+ * 0xFF. Otherwise, the output buffer pointer is advanced by 1, and the
+ * speculative 0 byte will be overwritten by the next byte.
+ */
+#define EMIT_BYTE(b) { \
+ buffer[0] = (JOCTET)(b); \
+ buffer[1] = 0; \
+ buffer -= -2 + ((JOCTET)(b) < 0xFF); \
+}
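
Editor's note: the pointer update in the last line of EMIT_BYTE() is deliberately branch-free; tracing both cases makes the intent clear:

/* b == 0xFF: (b < 0xFF) == 0  =>  buffer -= -2  =>  buffer += 2;
 *            the speculative 0x00 stuffing byte just written is kept.
 * b <  0xFF: (b < 0xFF) == 1  =>  buffer -= -1  =>  buffer += 1;
 *            the speculative 0x00 byte is overwritten by the next output.
 */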
+
+/* Output the entire bit buffer. If there are no 0xFF bytes in it, then write
+ * directly to the output buffer. Otherwise, use the EMIT_BYTE() macro to
+ * encode 0xFF as 0xFF 0x00.
+ */
+#if defined(__aarch64__) || defined(_M_ARM64)
+
+#define FLUSH() { \
+ if (put_buffer & 0x8080808080808080 & ~(put_buffer + 0x0101010101010101)) { \
+ EMIT_BYTE(put_buffer >> 56) \
+ EMIT_BYTE(put_buffer >> 48) \
+ EMIT_BYTE(put_buffer >> 40) \
+ EMIT_BYTE(put_buffer >> 32) \
+ EMIT_BYTE(put_buffer >> 24) \
+ EMIT_BYTE(put_buffer >> 16) \
+ EMIT_BYTE(put_buffer >> 8) \
+ EMIT_BYTE(put_buffer ) \
+ } else { \
+ *((uint64_t *)buffer) = BUILTIN_BSWAP64(put_buffer); \
+ buffer += 8; \
+ } \
+}
+
+#else
+
+#if defined(_MSC_VER) && !defined(__clang__)
+#define SPLAT() { \
+ buffer[0] = (JOCTET)(put_buffer >> 24); \
+ buffer[1] = (JOCTET)(put_buffer >> 16); \
+ buffer[2] = (JOCTET)(put_buffer >> 8); \
+ buffer[3] = (JOCTET)(put_buffer ); \
+ buffer += 4; \
+}
+#else
+#define SPLAT() { \
+ put_buffer = __builtin_bswap32(put_buffer); \
+ __asm__("str %1, [%0], #4" : "+r" (buffer) : "r" (put_buffer)); \
+}
+#endif
+
+#define FLUSH() { \
+ if (put_buffer & 0x80808080 & ~(put_buffer + 0x01010101)) { \
+ EMIT_BYTE(put_buffer >> 24) \
+ EMIT_BYTE(put_buffer >> 16) \
+ EMIT_BYTE(put_buffer >> 8) \
+ EMIT_BYTE(put_buffer ) \
+ } else { \
+ SPLAT(); \
+ } \
+}
+
+#endif
+
+/* Fill the bit buffer to capacity with the leading bits from code, then output
+ * the bit buffer and put the remaining bits from code into the bit buffer.
+ */
+#define PUT_AND_FLUSH(code, size) { \
+ put_buffer = (put_buffer << (size + free_bits)) | (code >> -free_bits); \
+ FLUSH() \
+ free_bits += BIT_BUF_SIZE; \
+ put_buffer = code; \
+}
+
+/* Insert code into the bit buffer and output the bit buffer if needed.
+ * NOTE: We can't flush with free_bits == 0, since the left shift in
+ * PUT_AND_FLUSH() would have undefined behavior.
+ */
+#define PUT_BITS(code, size) { \
+ free_bits -= size; \
+ if (free_bits < 0) \
+ PUT_AND_FLUSH(code, size) \
+ else \
+ put_buffer = (put_buffer << size) | code; \
+}
+
+#define PUT_CODE(code, size, diff) { \
+ diff |= code << nbits; \
+ nbits += size; \
+ PUT_BITS(diff, nbits) \
+}
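
Editor's note: a minimal usage sketch of the bit-buffer macros (hypothetical local names, mirroring how the Huffman encoder initializes them from a working_state). Appending codes only touches memory when the accumulator fills:

/* Sketch, assuming locals set up from a working_state *state:
 *   size_t put_buffer = state->cur.put_buffer;
 *   int free_bits = state->cur.free_bits;     (BIT_BUF_SIZE when empty)
 *   JOCTET *buffer = state->next_output_byte;
 */
PUT_BITS(0x16, 5)   /* append bits 10110; FLUSH() runs only if free_bits < 0 */
PUT_BITS(0x05, 3)   /* append bits 101; codes accumulate MSB-first */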
diff --git a/media/libjpeg/simd/arm/jcphuff-neon.c b/media/libjpeg/simd/arm/jcphuff-neon.c
new file mode 100644
index 0000000000..b91c5db478
--- /dev/null
+++ b/media/libjpeg/simd/arm/jcphuff-neon.c
@@ -0,0 +1,622 @@
+/*
+ * jcphuff-neon.c - prepare data for progressive Huffman encoding (Arm Neon)
+ *
+ * Copyright (C) 2020-2021, Arm Limited. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "jconfigint.h"
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "neon-compat.h"
+
+#include <arm_neon.h>
+
+
+/* Data preparation for encode_mcu_AC_first().
+ *
+ * The equivalent scalar C function (encode_mcu_AC_first_prepare()) can be
+ * found in jcphuff.c.
+ */
+
+void jsimd_encode_mcu_AC_first_prepare_neon
+ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+ JCOEF *values, size_t *zerobits)
+{
+ JCOEF *values_ptr = values;
+ JCOEF *diff_values_ptr = values + DCTSIZE2;
+
+ /* Rows of coefficients to zero (since they haven't been processed) */
+ int i, rows_to_zero = 8;
+
+ for (i = 0; i < Sl / 16; i++) {
+ int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
+ int16x8_t coefs2 = vld1q_dup_s16(block + jpeg_natural_order_start[8]);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[15], coefs2, 7);
+
+ /* Isolate sign of coefficients. */
+ int16x8_t sign_coefs1 = vshrq_n_s16(coefs1, 15);
+ int16x8_t sign_coefs2 = vshrq_n_s16(coefs2, 15);
+ /* Compute absolute value of coefficients and apply point transform Al. */
+ int16x8_t abs_coefs1 = vabsq_s16(coefs1);
+ int16x8_t abs_coefs2 = vabsq_s16(coefs2);
+ coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
+ coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
+
+ /* Compute diff values. */
+ int16x8_t diff1 = veorq_s16(coefs1, sign_coefs1);
+ int16x8_t diff2 = veorq_s16(coefs2, sign_coefs2);
+
+ /* Store transformed coefficients and diff values. */
+ vst1q_s16(values_ptr, coefs1);
+ vst1q_s16(values_ptr + DCTSIZE, coefs2);
+ vst1q_s16(diff_values_ptr, diff1);
+ vst1q_s16(diff_values_ptr + DCTSIZE, diff2);
+ values_ptr += 16;
+ diff_values_ptr += 16;
+ jpeg_natural_order_start += 16;
+ rows_to_zero -= 2;
+ }
+
+ /* Same operation but for remaining partial vector */
+ int remaining_coefs = Sl % 16;
+ if (remaining_coefs > 8) {
+ int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
+ int16x8_t coefs2 = vdupq_n_s16(0);
+ switch (remaining_coefs) {
+ case 15:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 14:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 13:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 12:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 11:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 10:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 9:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[8], coefs2, 0);
+ FALLTHROUGH /*FALLTHROUGH*/
+ default:
+ break;
+ }
+
+ /* Isolate sign of coefficients. */
+ int16x8_t sign_coefs1 = vshrq_n_s16(coefs1, 15);
+ int16x8_t sign_coefs2 = vshrq_n_s16(coefs2, 15);
+ /* Compute absolute value of coefficients and apply point transform Al. */
+ int16x8_t abs_coefs1 = vabsq_s16(coefs1);
+ int16x8_t abs_coefs2 = vabsq_s16(coefs2);
+ coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
+ coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
+
+ /* Compute diff values. */
+ int16x8_t diff1 = veorq_s16(coefs1, sign_coefs1);
+ int16x8_t diff2 = veorq_s16(coefs2, sign_coefs2);
+
+ /* Store transformed coefficients and diff values. */
+ vst1q_s16(values_ptr, coefs1);
+ vst1q_s16(values_ptr + DCTSIZE, coefs2);
+ vst1q_s16(diff_values_ptr, diff1);
+ vst1q_s16(diff_values_ptr + DCTSIZE, diff2);
+ values_ptr += 16;
+ diff_values_ptr += 16;
+ rows_to_zero -= 2;
+
+ } else if (remaining_coefs > 0) {
+ int16x8_t coefs = vdupq_n_s16(0);
+
+ switch (remaining_coefs) {
+ case 8:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs, 7);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 7:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs, 6);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 6:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs, 5);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 5:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs, 4);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 4:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs, 3);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 3:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs, 2);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 2:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs, 1);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 1:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[0], coefs, 0);
+ FALLTHROUGH /*FALLTHROUGH*/
+ default:
+ break;
+ }
+
+ /* Isolate sign of coefficients. */
+ int16x8_t sign_coefs = vshrq_n_s16(coefs, 15);
+ /* Compute absolute value of coefficients and apply point transform Al. */
+ int16x8_t abs_coefs = vabsq_s16(coefs);
+ coefs = vshlq_s16(abs_coefs, vdupq_n_s16(-Al));
+
+ /* Compute diff values. */
+ int16x8_t diff = veorq_s16(coefs, sign_coefs);
+
+ /* Store transformed coefficients and diff values. */
+ vst1q_s16(values_ptr, coefs);
+ vst1q_s16(diff_values_ptr, diff);
+ values_ptr += 8;
+ diff_values_ptr += 8;
+ rows_to_zero--;
+ }
+
+ /* Zero remaining memory in the values and diff_values blocks. */
+ for (i = 0; i < rows_to_zero; i++) {
+ vst1q_s16(values_ptr, vdupq_n_s16(0));
+ vst1q_s16(diff_values_ptr, vdupq_n_s16(0));
+ values_ptr += 8;
+ diff_values_ptr += 8;
+ }
+
+ /* Construct zerobits bitmap. A set bit means that the corresponding
+ * coefficient != 0.
+ */
+ int16x8_t row0 = vld1q_s16(values + 0 * DCTSIZE);
+ int16x8_t row1 = vld1q_s16(values + 1 * DCTSIZE);
+ int16x8_t row2 = vld1q_s16(values + 2 * DCTSIZE);
+ int16x8_t row3 = vld1q_s16(values + 3 * DCTSIZE);
+ int16x8_t row4 = vld1q_s16(values + 4 * DCTSIZE);
+ int16x8_t row5 = vld1q_s16(values + 5 * DCTSIZE);
+ int16x8_t row6 = vld1q_s16(values + 6 * DCTSIZE);
+ int16x8_t row7 = vld1q_s16(values + 7 * DCTSIZE);
+
+ uint8x8_t row0_eq0 = vmovn_u16(vceqq_s16(row0, vdupq_n_s16(0)));
+ uint8x8_t row1_eq0 = vmovn_u16(vceqq_s16(row1, vdupq_n_s16(0)));
+ uint8x8_t row2_eq0 = vmovn_u16(vceqq_s16(row2, vdupq_n_s16(0)));
+ uint8x8_t row3_eq0 = vmovn_u16(vceqq_s16(row3, vdupq_n_s16(0)));
+ uint8x8_t row4_eq0 = vmovn_u16(vceqq_s16(row4, vdupq_n_s16(0)));
+ uint8x8_t row5_eq0 = vmovn_u16(vceqq_s16(row5, vdupq_n_s16(0)));
+ uint8x8_t row6_eq0 = vmovn_u16(vceqq_s16(row6, vdupq_n_s16(0)));
+ uint8x8_t row7_eq0 = vmovn_u16(vceqq_s16(row7, vdupq_n_s16(0)));
+
+ /* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 } */
+ const uint8x8_t bitmap_mask =
+ vreinterpret_u8_u64(vmov_n_u64(0x8040201008040201));
+
+ row0_eq0 = vand_u8(row0_eq0, bitmap_mask);
+ row1_eq0 = vand_u8(row1_eq0, bitmap_mask);
+ row2_eq0 = vand_u8(row2_eq0, bitmap_mask);
+ row3_eq0 = vand_u8(row3_eq0, bitmap_mask);
+ row4_eq0 = vand_u8(row4_eq0, bitmap_mask);
+ row5_eq0 = vand_u8(row5_eq0, bitmap_mask);
+ row6_eq0 = vand_u8(row6_eq0, bitmap_mask);
+ row7_eq0 = vand_u8(row7_eq0, bitmap_mask);
+
+ uint8x8_t bitmap_rows_01 = vpadd_u8(row0_eq0, row1_eq0);
+ uint8x8_t bitmap_rows_23 = vpadd_u8(row2_eq0, row3_eq0);
+ uint8x8_t bitmap_rows_45 = vpadd_u8(row4_eq0, row5_eq0);
+ uint8x8_t bitmap_rows_67 = vpadd_u8(row6_eq0, row7_eq0);
+ uint8x8_t bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
+ uint8x8_t bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
+ uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+ /* Move bitmap to a 64-bit scalar register. */
+ uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
+ /* Store zerobits bitmap. */
+ *zerobits = ~bitmap;
+#else
+ /* Move bitmap to two 32-bit scalar registers. */
+ uint32_t bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
+ uint32_t bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
+ /* Store zerobits bitmap. */
+ zerobits[0] = ~bitmap0;
+ zerobits[1] = ~bitmap1;
+#endif
+}
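
Editor's note: a scalar sketch of the zerobits semantics established above, assuming the 64-bit (AArch64) layout in which the bitmap fits a single size_t (a hypothetical helper, for illustration):

static size_t zerobits_scalar(const JCOEF *values)
{
  size_t bits = 0;
  int k;

  for (k = 0; k < 64; k++)
    if (values[k] != 0)
      bits |= (size_t)1 << k;  /* bit k set <=> coefficient k is nonzero */
  return bits;
}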
+
+
+/* Data preparation for encode_mcu_AC_refine().
+ *
+ * The equivalent scalar C function (encode_mcu_AC_refine_prepare()) can be
+ * found in jcphuff.c.
+ */
+
+int jsimd_encode_mcu_AC_refine_prepare_neon
+ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+ JCOEF *absvalues, size_t *bits)
+{
+ /* Temporary storage buffers for data used to compute the signbits bitmap and
+ * the end-of-block (EOB) position
+ */
+ uint8_t coef_sign_bits[64];
+ uint8_t coef_eq1_bits[64];
+
+ JCOEF *absvalues_ptr = absvalues;
+ uint8_t *coef_sign_bits_ptr = coef_sign_bits;
+ uint8_t *eq1_bits_ptr = coef_eq1_bits;
+
+ /* Rows of coefficients to zero (since they haven't been processed) */
+ int i, rows_to_zero = 8;
+
+ for (i = 0; i < Sl / 16; i++) {
+ int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
+ int16x8_t coefs2 = vld1q_dup_s16(block + jpeg_natural_order_start[8]);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[15], coefs2, 7);
+
+ /* Compute and store data for signbits bitmap. */
+ uint8x8_t sign_coefs1 =
+ vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15)));
+ uint8x8_t sign_coefs2 =
+ vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15)));
+ vst1_u8(coef_sign_bits_ptr, sign_coefs1);
+ vst1_u8(coef_sign_bits_ptr + DCTSIZE, sign_coefs2);
+
+ /* Compute absolute value of coefficients and apply point transform Al. */
+ int16x8_t abs_coefs1 = vabsq_s16(coefs1);
+ int16x8_t abs_coefs2 = vabsq_s16(coefs2);
+ coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
+ coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
+ vst1q_s16(absvalues_ptr, coefs1);
+ vst1q_s16(absvalues_ptr + DCTSIZE, coefs2);
+
+ /* Test whether transformed coefficient values == 1 (used to find EOB
+ * position.)
+ */
+ uint8x8_t coefs_eq11 = vmovn_u16(vceqq_s16(coefs1, vdupq_n_s16(1)));
+ uint8x8_t coefs_eq12 = vmovn_u16(vceqq_s16(coefs2, vdupq_n_s16(1)));
+ vst1_u8(eq1_bits_ptr, coefs_eq11);
+ vst1_u8(eq1_bits_ptr + DCTSIZE, coefs_eq12);
+
+ absvalues_ptr += 16;
+ coef_sign_bits_ptr += 16;
+ eq1_bits_ptr += 16;
+ jpeg_natural_order_start += 16;
+ rows_to_zero -= 2;
+ }
+
+ /* Same operation but for remaining partial vector */
+ int remaining_coefs = Sl % 16;
+ if (remaining_coefs > 8) {
+ int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
+ int16x8_t coefs2 = vdupq_n_s16(0);
+ switch (remaining_coefs) {
+ case 15:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 14:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 13:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 12:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 11:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 10:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 9:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[8], coefs2, 0);
+ FALLTHROUGH /*FALLTHROUGH*/
+ default:
+ break;
+ }
+
+ /* Compute and store data for signbits bitmap. */
+ uint8x8_t sign_coefs1 =
+ vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15)));
+ uint8x8_t sign_coefs2 =
+ vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15)));
+ vst1_u8(coef_sign_bits_ptr, sign_coefs1);
+ vst1_u8(coef_sign_bits_ptr + DCTSIZE, sign_coefs2);
+
+ /* Compute absolute value of coefficients and apply point transform Al. */
+ int16x8_t abs_coefs1 = vabsq_s16(coefs1);
+ int16x8_t abs_coefs2 = vabsq_s16(coefs2);
+ coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
+ coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
+ vst1q_s16(absvalues_ptr, coefs1);
+ vst1q_s16(absvalues_ptr + DCTSIZE, coefs2);
+
+ /* Test whether transformed coefficient values == 1 (used to find EOB
+ * position.)
+ */
+ uint8x8_t coefs_eq11 = vmovn_u16(vceqq_s16(coefs1, vdupq_n_s16(1)));
+ uint8x8_t coefs_eq12 = vmovn_u16(vceqq_s16(coefs2, vdupq_n_s16(1)));
+ vst1_u8(eq1_bits_ptr, coefs_eq11);
+ vst1_u8(eq1_bits_ptr + DCTSIZE, coefs_eq12);
+
+ absvalues_ptr += 16;
+ coef_sign_bits_ptr += 16;
+ eq1_bits_ptr += 16;
+ jpeg_natural_order_start += 16;
+ rows_to_zero -= 2;
+
+ } else if (remaining_coefs > 0) {
+ int16x8_t coefs = vdupq_n_s16(0);
+
+ switch (remaining_coefs) {
+ case 8:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs, 7);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 7:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs, 6);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 6:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs, 5);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 5:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs, 4);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 4:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs, 3);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 3:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs, 2);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 2:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs, 1);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 1:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[0], coefs, 0);
+ FALLTHROUGH /*FALLTHROUGH*/
+ default:
+ break;
+ }
+
+ /* Compute and store data for signbits bitmap. */
+ uint8x8_t sign_coefs =
+ vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs, 15)));
+ vst1_u8(coef_sign_bits_ptr, sign_coefs);
+
+ /* Compute absolute value of coefficients and apply point transform Al. */
+ int16x8_t abs_coefs = vabsq_s16(coefs);
+ coefs = vshlq_s16(abs_coefs, vdupq_n_s16(-Al));
+ vst1q_s16(absvalues_ptr, coefs);
+
+ /* Test whether transformed coefficient values == 1 (used to find EOB
+ * position.)
+ */
+ uint8x8_t coefs_eq1 = vmovn_u16(vceqq_s16(coefs, vdupq_n_s16(1)));
+ vst1_u8(eq1_bits_ptr, coefs_eq1);
+
+ absvalues_ptr += 8;
+ coef_sign_bits_ptr += 8;
+ eq1_bits_ptr += 8;
+ rows_to_zero--;
+ }
+
+ /* Zero remaining memory in blocks. */
+ for (i = 0; i < rows_to_zero; i++) {
+ vst1q_s16(absvalues_ptr, vdupq_n_s16(0));
+ vst1_u8(coef_sign_bits_ptr, vdup_n_u8(0));
+ vst1_u8(eq1_bits_ptr, vdup_n_u8(0));
+ absvalues_ptr += 8;
+ coef_sign_bits_ptr += 8;
+ eq1_bits_ptr += 8;
+ }
+
+ /* Construct zerobits bitmap. */
+ int16x8_t abs_row0 = vld1q_s16(absvalues + 0 * DCTSIZE);
+ int16x8_t abs_row1 = vld1q_s16(absvalues + 1 * DCTSIZE);
+ int16x8_t abs_row2 = vld1q_s16(absvalues + 2 * DCTSIZE);
+ int16x8_t abs_row3 = vld1q_s16(absvalues + 3 * DCTSIZE);
+ int16x8_t abs_row4 = vld1q_s16(absvalues + 4 * DCTSIZE);
+ int16x8_t abs_row5 = vld1q_s16(absvalues + 5 * DCTSIZE);
+ int16x8_t abs_row6 = vld1q_s16(absvalues + 6 * DCTSIZE);
+ int16x8_t abs_row7 = vld1q_s16(absvalues + 7 * DCTSIZE);
+
+ uint8x8_t abs_row0_eq0 = vmovn_u16(vceqq_s16(abs_row0, vdupq_n_s16(0)));
+ uint8x8_t abs_row1_eq0 = vmovn_u16(vceqq_s16(abs_row1, vdupq_n_s16(0)));
+ uint8x8_t abs_row2_eq0 = vmovn_u16(vceqq_s16(abs_row2, vdupq_n_s16(0)));
+ uint8x8_t abs_row3_eq0 = vmovn_u16(vceqq_s16(abs_row3, vdupq_n_s16(0)));
+ uint8x8_t abs_row4_eq0 = vmovn_u16(vceqq_s16(abs_row4, vdupq_n_s16(0)));
+ uint8x8_t abs_row5_eq0 = vmovn_u16(vceqq_s16(abs_row5, vdupq_n_s16(0)));
+ uint8x8_t abs_row6_eq0 = vmovn_u16(vceqq_s16(abs_row6, vdupq_n_s16(0)));
+ uint8x8_t abs_row7_eq0 = vmovn_u16(vceqq_s16(abs_row7, vdupq_n_s16(0)));
+
+ /* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 } */
+ const uint8x8_t bitmap_mask =
+ vreinterpret_u8_u64(vmov_n_u64(0x8040201008040201));
+
+ abs_row0_eq0 = vand_u8(abs_row0_eq0, bitmap_mask);
+ abs_row1_eq0 = vand_u8(abs_row1_eq0, bitmap_mask);
+ abs_row2_eq0 = vand_u8(abs_row2_eq0, bitmap_mask);
+ abs_row3_eq0 = vand_u8(abs_row3_eq0, bitmap_mask);
+ abs_row4_eq0 = vand_u8(abs_row4_eq0, bitmap_mask);
+ abs_row5_eq0 = vand_u8(abs_row5_eq0, bitmap_mask);
+ abs_row6_eq0 = vand_u8(abs_row6_eq0, bitmap_mask);
+ abs_row7_eq0 = vand_u8(abs_row7_eq0, bitmap_mask);
+
+ uint8x8_t bitmap_rows_01 = vpadd_u8(abs_row0_eq0, abs_row1_eq0);
+ uint8x8_t bitmap_rows_23 = vpadd_u8(abs_row2_eq0, abs_row3_eq0);
+ uint8x8_t bitmap_rows_45 = vpadd_u8(abs_row4_eq0, abs_row5_eq0);
+ uint8x8_t bitmap_rows_67 = vpadd_u8(abs_row6_eq0, abs_row7_eq0);
+ uint8x8_t bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
+ uint8x8_t bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
+ uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+ /* Move bitmap to a 64-bit scalar register. */
+ uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
+ /* Store zerobits bitmap. */
+ bits[0] = ~bitmap;
+#else
+ /* Move bitmap to two 32-bit scalar registers. */
+ uint32_t bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
+ uint32_t bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
+ /* Store zerobits bitmap. */
+ bits[0] = ~bitmap0;
+ bits[1] = ~bitmap1;
+#endif
+
+ /* Construct signbits bitmap. */
+ uint8x8_t signbits_row0 = vld1_u8(coef_sign_bits + 0 * DCTSIZE);
+ uint8x8_t signbits_row1 = vld1_u8(coef_sign_bits + 1 * DCTSIZE);
+ uint8x8_t signbits_row2 = vld1_u8(coef_sign_bits + 2 * DCTSIZE);
+ uint8x8_t signbits_row3 = vld1_u8(coef_sign_bits + 3 * DCTSIZE);
+ uint8x8_t signbits_row4 = vld1_u8(coef_sign_bits + 4 * DCTSIZE);
+ uint8x8_t signbits_row5 = vld1_u8(coef_sign_bits + 5 * DCTSIZE);
+ uint8x8_t signbits_row6 = vld1_u8(coef_sign_bits + 6 * DCTSIZE);
+ uint8x8_t signbits_row7 = vld1_u8(coef_sign_bits + 7 * DCTSIZE);
+
+ signbits_row0 = vand_u8(signbits_row0, bitmap_mask);
+ signbits_row1 = vand_u8(signbits_row1, bitmap_mask);
+ signbits_row2 = vand_u8(signbits_row2, bitmap_mask);
+ signbits_row3 = vand_u8(signbits_row3, bitmap_mask);
+ signbits_row4 = vand_u8(signbits_row4, bitmap_mask);
+ signbits_row5 = vand_u8(signbits_row5, bitmap_mask);
+ signbits_row6 = vand_u8(signbits_row6, bitmap_mask);
+ signbits_row7 = vand_u8(signbits_row7, bitmap_mask);
+
+ bitmap_rows_01 = vpadd_u8(signbits_row0, signbits_row1);
+ bitmap_rows_23 = vpadd_u8(signbits_row2, signbits_row3);
+ bitmap_rows_45 = vpadd_u8(signbits_row4, signbits_row5);
+ bitmap_rows_67 = vpadd_u8(signbits_row6, signbits_row7);
+ bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
+ bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
+ bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+ /* Move bitmap to a 64-bit scalar register. */
+ bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
+ /* Store signbits bitmap. */
+ bits[1] = ~bitmap;
+#else
+ /* Move bitmap to two 32-bit scalar registers. */
+ bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
+ bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
+ /* Store signbits bitmap. */
+ bits[2] = ~bitmap0;
+ bits[3] = ~bitmap1;
+#endif
+
+ /* Construct bitmap to find EOB position (the index of the last coefficient
+ * equal to 1.)
+ */
+ uint8x8_t row0_eq1 = vld1_u8(coef_eq1_bits + 0 * DCTSIZE);
+ uint8x8_t row1_eq1 = vld1_u8(coef_eq1_bits + 1 * DCTSIZE);
+ uint8x8_t row2_eq1 = vld1_u8(coef_eq1_bits + 2 * DCTSIZE);
+ uint8x8_t row3_eq1 = vld1_u8(coef_eq1_bits + 3 * DCTSIZE);
+ uint8x8_t row4_eq1 = vld1_u8(coef_eq1_bits + 4 * DCTSIZE);
+ uint8x8_t row5_eq1 = vld1_u8(coef_eq1_bits + 5 * DCTSIZE);
+ uint8x8_t row6_eq1 = vld1_u8(coef_eq1_bits + 6 * DCTSIZE);
+ uint8x8_t row7_eq1 = vld1_u8(coef_eq1_bits + 7 * DCTSIZE);
+
+ row0_eq1 = vand_u8(row0_eq1, bitmap_mask);
+ row1_eq1 = vand_u8(row1_eq1, bitmap_mask);
+ row2_eq1 = vand_u8(row2_eq1, bitmap_mask);
+ row3_eq1 = vand_u8(row3_eq1, bitmap_mask);
+ row4_eq1 = vand_u8(row4_eq1, bitmap_mask);
+ row5_eq1 = vand_u8(row5_eq1, bitmap_mask);
+ row6_eq1 = vand_u8(row6_eq1, bitmap_mask);
+ row7_eq1 = vand_u8(row7_eq1, bitmap_mask);
+
+ bitmap_rows_01 = vpadd_u8(row0_eq1, row1_eq1);
+ bitmap_rows_23 = vpadd_u8(row2_eq1, row3_eq1);
+ bitmap_rows_45 = vpadd_u8(row4_eq1, row5_eq1);
+ bitmap_rows_67 = vpadd_u8(row6_eq1, row7_eq1);
+ bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
+ bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
+ bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+ /* Move bitmap to a 64-bit scalar register. */
+ bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
+
+ /* Return EOB position. */
+ if (bitmap == 0) {
+ /* EOB position is defined to be 0 if all coefficients != 1. */
+ return 0;
+ } else {
+ return 63 - BUILTIN_CLZLL(bitmap);
+ }
+#else
+ /* Move bitmap to two 32-bit scalar registers. */
+ bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
+ bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
+
+ /* Return EOB position. */
+ if (bitmap0 == 0 && bitmap1 == 0) {
+ return 0;
+ } else if (bitmap1 != 0) {
+ return 63 - BUILTIN_CLZ(bitmap1);
+ } else {
+ return 31 - BUILTIN_CLZ(bitmap0);
+ }
+#endif
+}
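
Editor's note: the EOB computation above reduces to a highest-set-bit search. A scalar sketch of the 64-bit path, where bit k of the bitmap is set iff absvalues[k] == 1 (BUILTIN_CLZLL() is expected to map to __builtin_clzll() on GCC/Clang via neon-compat.h):

static int eob_from_eq1_bitmap(uint64_t bitmap)
{
  if (bitmap == 0)
    return 0;  /* EOB position is defined to be 0 if no coefficient == 1 */
  return 63 - __builtin_clzll(bitmap);  /* index of the highest set bit */
}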
diff --git a/media/libjpeg/simd/arm/jcsample-neon.c b/media/libjpeg/simd/arm/jcsample-neon.c
new file mode 100644
index 0000000000..8a3e237838
--- /dev/null
+++ b/media/libjpeg/simd/arm/jcsample-neon.c
@@ -0,0 +1,192 @@
+/*
+ * jcsample-neon.c - downsampling (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+
+#include <arm_neon.h>
+
+
+ALIGN(16) static const uint8_t jsimd_h2_downsample_consts[] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 0 */
+ 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 1 */
+ 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0E,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 2 */
+ 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0D, 0x0D,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 3 */
+ 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0C, 0x0C, 0x0C,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 4 */
+ 0x08, 0x09, 0x0A, 0x0B, 0x0B, 0x0B, 0x0B, 0x0B,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 5 */
+ 0x08, 0x09, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 6 */
+ 0x08, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 7 */
+ 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 8 */
+ 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06, /* Pad 9 */
+ 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x05, /* Pad 10 */
+ 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x04, 0x04, /* Pad 11 */
+ 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04,
+ 0x00, 0x01, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, /* Pad 12 */
+ 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,
+ 0x00, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, /* Pad 13 */
+ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
+ 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, /* Pad 14 */
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Pad 15 */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+
+/* Downsample pixel values of a single component.
+ * This version handles the common case of 2:1 horizontal and 1:1 vertical,
+ * without smoothing.
+ */
+
+void jsimd_h2v1_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
+ JDIMENSION v_samp_factor,
+ JDIMENSION width_in_blocks,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+ JSAMPROW inptr, outptr;
+ /* Load expansion mask to pad remaining elements of last DCT block. */
+ const int mask_offset = 16 * ((width_in_blocks * 2 * DCTSIZE) - image_width);
+ const uint8x16_t expand_mask =
+ vld1q_u8(&jsimd_h2_downsample_consts[mask_offset]);
+ /* Load bias pattern (alternating every pixel.) */
+ /* { 0, 1, 0, 1, 0, 1, 0, 1 } */
+ const uint16x8_t bias = vreinterpretq_u16_u32(vdupq_n_u32(0x00010000));
+ unsigned i, outrow;
+
+ for (outrow = 0; outrow < v_samp_factor; outrow++) {
+ outptr = output_data[outrow];
+ inptr = input_data[outrow];
+
+ /* Downsample all but the last DCT block of pixels. */
+ for (i = 0; i < width_in_blocks - 1; i++) {
+ uint8x16_t pixels = vld1q_u8(inptr + i * 2 * DCTSIZE);
+ /* Add adjacent pixel values, widen to 16-bit, and add bias. */
+ uint16x8_t samples_u16 = vpadalq_u8(bias, pixels);
+ /* Divide total by 2 and narrow to 8-bit. */
+ uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 1);
+ /* Store samples to memory. */
+ vst1_u8(outptr + i * DCTSIZE, samples_u8);
+ }
+
+ /* Load pixels in last DCT block into a table. */
+ uint8x16_t pixels = vld1q_u8(inptr + (width_in_blocks - 1) * 2 * DCTSIZE);
+#if defined(__aarch64__) || defined(_M_ARM64)
+ /* Pad the empty elements with the value of the last pixel. */
+ pixels = vqtbl1q_u8(pixels, expand_mask);
+#else
+ uint8x8x2_t table = { { vget_low_u8(pixels), vget_high_u8(pixels) } };
+ pixels = vcombine_u8(vtbl2_u8(table, vget_low_u8(expand_mask)),
+ vtbl2_u8(table, vget_high_u8(expand_mask)));
+#endif
+ /* Add adjacent pixel values, widen to 16-bit, and add bias. */
+ uint16x8_t samples_u16 = vpadalq_u8(bias, pixels);
+ /* Divide total by 2, narrow to 8-bit, and store. */
+ uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 1);
+ vst1_u8(outptr + (width_in_blocks - 1) * DCTSIZE, samples_u8);
+ }
+}
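
Editor's note on the { 0, 1 } bias: a worked example of two adjacent output columns with the same input pair shows the rounding error alternating direction rather than biasing the image consistently brighter or darker:

/* Input pair (10, 11) in consecutive output columns:
 *   even column: (10 + 11 + 0) >> 1 == 10   (rounds down)
 *   odd column:  (10 + 11 + 1) >> 1 == 11   (rounds up)
 */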
+
+
+/* Downsample pixel values of a single component.
+ * This version handles the standard case of 2:1 horizontal and 2:1 vertical,
+ * without smoothing.
+ */
+
+void jsimd_h2v2_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
+ JDIMENSION v_samp_factor,
+ JDIMENSION width_in_blocks,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+ JSAMPROW inptr0, inptr1, outptr;
+ /* Load expansion mask to pad remaining elements of last DCT block. */
+ const int mask_offset = 16 * ((width_in_blocks * 2 * DCTSIZE) - image_width);
+ const uint8x16_t expand_mask =
+ vld1q_u8(&jsimd_h2_downsample_consts[mask_offset]);
+ /* Load bias pattern (alternating every pixel.) */
+ /* { 1, 2, 1, 2, 1, 2, 1, 2 } */
+ const uint16x8_t bias = vreinterpretq_u16_u32(vdupq_n_u32(0x00020001));
+ unsigned i, outrow;
+
+ for (outrow = 0; outrow < v_samp_factor; outrow++) {
+ outptr = output_data[outrow];
+ inptr0 = input_data[outrow];
+ inptr1 = input_data[outrow + 1];
+
+ /* Downsample all but the last DCT block of pixels. */
+ for (i = 0; i < width_in_blocks - 1; i++) {
+ uint8x16_t pixels_r0 = vld1q_u8(inptr0 + i * 2 * DCTSIZE);
+ uint8x16_t pixels_r1 = vld1q_u8(inptr1 + i * 2 * DCTSIZE);
+ /* Add adjacent pixel values in row 0, widen to 16-bit, and add bias. */
+ uint16x8_t samples_u16 = vpadalq_u8(bias, pixels_r0);
+ /* Add adjacent pixel values in row 1, widen to 16-bit, and accumulate.
+ */
+ samples_u16 = vpadalq_u8(samples_u16, pixels_r1);
+ /* Divide total by 4 and narrow to 8-bit. */
+ uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 2);
+ /* Store samples to memory and increment pointers. */
+ vst1_u8(outptr + i * DCTSIZE, samples_u8);
+ }
+
+ /* Load pixels in last DCT block into a table. */
+ uint8x16_t pixels_r0 =
+ vld1q_u8(inptr0 + (width_in_blocks - 1) * 2 * DCTSIZE);
+ uint8x16_t pixels_r1 =
+ vld1q_u8(inptr1 + (width_in_blocks - 1) * 2 * DCTSIZE);
+#if defined(__aarch64__) || defined(_M_ARM64)
+ /* Pad the empty elements with the value of the last pixel. */
+ pixels_r0 = vqtbl1q_u8(pixels_r0, expand_mask);
+ pixels_r1 = vqtbl1q_u8(pixels_r1, expand_mask);
+#else
+ uint8x8x2_t table_r0 =
+ { { vget_low_u8(pixels_r0), vget_high_u8(pixels_r0) } };
+ uint8x8x2_t table_r1 =
+ { { vget_low_u8(pixels_r1), vget_high_u8(pixels_r1) } };
+ pixels_r0 = vcombine_u8(vtbl2_u8(table_r0, vget_low_u8(expand_mask)),
+ vtbl2_u8(table_r0, vget_high_u8(expand_mask)));
+ pixels_r1 = vcombine_u8(vtbl2_u8(table_r1, vget_low_u8(expand_mask)),
+ vtbl2_u8(table_r1, vget_high_u8(expand_mask)));
+#endif
+ /* Add adjacent pixel values in row 0, widen to 16-bit, and add bias. */
+ uint16x8_t samples_u16 = vpadalq_u8(bias, pixels_r0);
+ /* Add adjacent pixel values in row 1, widen to 16-bit, and accumulate. */
+ samples_u16 = vpadalq_u8(samples_u16, pixels_r1);
+ /* Divide total by 4, narrow to 8-bit, and store. */
+ uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 2);
+ vst1_u8(outptr + (width_in_blocks - 1) * DCTSIZE, samples_u8);
+ }
+}
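
Editor's note: the { 1, 2 } bias plays the same role for the 2x2 average, where the divisor is 4. A worked example:

/* Four input pixels summing to 42:
 *   even column: (42 + 1) >> 2 == 10   (rounds down)
 *   odd column:  (42 + 2) >> 2 == 11   (rounds up)
 * so the rounding errors alternate between neighboring columns instead of
 * accumulating in one direction.
 */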
diff --git a/media/libjpeg/simd/arm/jdcolext-neon.c b/media/libjpeg/simd/arm/jdcolext-neon.c
new file mode 100644
index 0000000000..c3c07a1964
--- /dev/null
+++ b/media/libjpeg/simd/arm/jdcolext-neon.c
@@ -0,0 +1,374 @@
+/*
+ * jdcolext-neon.c - colorspace conversion (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jdcolor-neon.c. */
+
+
+/* YCbCr -> RGB conversion is defined by the following equations:
+ * R = Y + 1.40200 * (Cr - 128)
+ * G = Y - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128)
+ * B = Y + 1.77200 * (Cb - 128)
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ * 0.3441467 = 11277 * 2^-15
+ * 0.7141418 = 23401 * 2^-15
+ * 1.4020386 = 22971 * 2^-14
+ * 1.7720337 = 29033 * 2^-14
+ * These constants are defined in jdcolor-neon.c.
+ *
+ * To ensure correct results, rounding is used when descaling.
+ */
+
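+/* For illustration only (excluded from compilation): a scalar model of the
+ * fixed-point arithmetic used below.  The helper names and the use of 32-bit
+ * intermediates are assumptions made for clarity; clamping of the results to
+ * [0, 255] is omitted.
+ */
+#if 0
+static inline int descale(int x, int shift)
+{
+  /* Divide by 2^shift, rounding to nearest. */
+  return (x + (1 << (shift - 1))) >> shift;
+}
+
+static inline void ycc_to_rgb_scalar(int y, int cb, int cr,
+                                     int *r, int *g, int *b)
+{
+  *r = y + descale(22971 * (cr - 128), 14);        /* 1.40200 * (Cr - 128) */
+  *g = y - descale(11277 * (cb - 128) +
+                   23401 * (cr - 128), 15);
+  *b = y + descale(29033 * (cb - 128), 14);        /* 1.77200 * (Cb - 128) */
+}
+#endif
+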
+/* Notes on safe memory access for YCbCr -> RGB conversion routines:
+ *
+ * Input memory buffers can be safely overread up to the next multiple of
+ * ALIGN_SIZE bytes, since they are always allocated by alloc_sarray() in
+ * jmemmgr.c.
+ *
+ * The output buffer cannot safely be written beyond output_width, since
+ * output_buf points to a possibly unpadded row in the decompressed image
+ * buffer allocated by the calling program.
+ */
+
+void jsimd_ycc_rgb_convert_neon(JDIMENSION output_width, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
+{
+ JSAMPROW outptr;
+ /* Pointers to Y, Cb, and Cr data */
+ JSAMPROW inptr0, inptr1, inptr2;
+
+ const int16x4_t consts = vld1_s16(jsimd_ycc_rgb_convert_neon_consts);
+ const int16x8_t neg_128 = vdupq_n_s16(-128);
+
+ while (--num_rows >= 0) {
+ inptr0 = input_buf[0][input_row];
+ inptr1 = input_buf[1][input_row];
+ inptr2 = input_buf[2][input_row];
+ input_row++;
+ outptr = *output_buf++;
+ int cols_remaining = output_width;
+ for (; cols_remaining >= 16; cols_remaining -= 16) {
+ uint8x16_t y = vld1q_u8(inptr0);
+ uint8x16_t cb = vld1q_u8(inptr1);
+ uint8x16_t cr = vld1q_u8(inptr2);
+ /* Subtract 128 from Cb and Cr. */
+ int16x8_t cr_128_l =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
+ vget_low_u8(cr)));
+ int16x8_t cr_128_h =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
+ vget_high_u8(cr)));
+ int16x8_t cb_128_l =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
+ vget_low_u8(cb)));
+ int16x8_t cb_128_h =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
+ vget_high_u8(cb)));
+ /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+ int32x4_t g_sub_y_ll = vmull_lane_s16(vget_low_s16(cb_128_l), consts, 0);
+ int32x4_t g_sub_y_lh = vmull_lane_s16(vget_high_s16(cb_128_l),
+ consts, 0);
+ int32x4_t g_sub_y_hl = vmull_lane_s16(vget_low_s16(cb_128_h), consts, 0);
+ int32x4_t g_sub_y_hh = vmull_lane_s16(vget_high_s16(cb_128_h),
+ consts, 0);
+ g_sub_y_ll = vmlsl_lane_s16(g_sub_y_ll, vget_low_s16(cr_128_l),
+ consts, 1);
+ g_sub_y_lh = vmlsl_lane_s16(g_sub_y_lh, vget_high_s16(cr_128_l),
+ consts, 1);
+ g_sub_y_hl = vmlsl_lane_s16(g_sub_y_hl, vget_low_s16(cr_128_h),
+ consts, 1);
+ g_sub_y_hh = vmlsl_lane_s16(g_sub_y_hh, vget_high_s16(cr_128_h),
+ consts, 1);
+ /* Descale G components: shift right 15, round, and narrow to 16-bit. */
+ int16x8_t g_sub_y_l = vcombine_s16(vrshrn_n_s32(g_sub_y_ll, 15),
+ vrshrn_n_s32(g_sub_y_lh, 15));
+ int16x8_t g_sub_y_h = vcombine_s16(vrshrn_n_s32(g_sub_y_hl, 15),
+ vrshrn_n_s32(g_sub_y_hh, 15));
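+      /* vqrdmulhq_lane_s16() computes ((2 * a * b) + 2^15) >> 16 with
+       * saturation, so doubling the (Cr - 128) operand first yields the
+       * rounded product ((Cr - 128) * F_1_402) >> 14.
+       */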
+ /* Compute R-Y: 1.40200 * (Cr - 128) */
+ int16x8_t r_sub_y_l = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128_l, 1),
+ consts, 2);
+ int16x8_t r_sub_y_h = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128_h, 1),
+ consts, 2);
+ /* Compute B-Y: 1.77200 * (Cb - 128) */
+ int16x8_t b_sub_y_l = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128_l, 1),
+ consts, 3);
+ int16x8_t b_sub_y_h = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128_h, 1),
+ consts, 3);
+ /* Add Y. */
+ int16x8_t r_l =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y_l),
+ vget_low_u8(y)));
+ int16x8_t r_h =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y_h),
+ vget_high_u8(y)));
+ int16x8_t b_l =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y_l),
+ vget_low_u8(y)));
+ int16x8_t b_h =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y_h),
+ vget_high_u8(y)));
+ int16x8_t g_l =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y_l),
+ vget_low_u8(y)));
+ int16x8_t g_h =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y_h),
+ vget_high_u8(y)));
+
+#if RGB_PIXELSIZE == 4
+ uint8x16x4_t rgba;
+ /* Convert each component to unsigned and narrow, clamping to [0-255]. */
+ rgba.val[RGB_RED] = vcombine_u8(vqmovun_s16(r_l), vqmovun_s16(r_h));
+ rgba.val[RGB_GREEN] = vcombine_u8(vqmovun_s16(g_l), vqmovun_s16(g_h));
+ rgba.val[RGB_BLUE] = vcombine_u8(vqmovun_s16(b_l), vqmovun_s16(b_h));
+ /* Set alpha channel to opaque (0xFF). */
+ rgba.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
+ /* Store RGBA pixel data to memory. */
+ vst4q_u8(outptr, rgba);
+#elif RGB_PIXELSIZE == 3
+ uint8x16x3_t rgb;
+ /* Convert each component to unsigned and narrow, clamping to [0-255]. */
+ rgb.val[RGB_RED] = vcombine_u8(vqmovun_s16(r_l), vqmovun_s16(r_h));
+ rgb.val[RGB_GREEN] = vcombine_u8(vqmovun_s16(g_l), vqmovun_s16(g_h));
+ rgb.val[RGB_BLUE] = vcombine_u8(vqmovun_s16(b_l), vqmovun_s16(b_h));
+ /* Store RGB pixel data to memory. */
+ vst3q_u8(outptr, rgb);
+#else
+ /* Pack R, G, and B values in ratio 5:6:5. */
+ uint16x8_t rgb565_l = vqshluq_n_s16(r_l, 8);
+ rgb565_l = vsriq_n_u16(rgb565_l, vqshluq_n_s16(g_l, 8), 5);
+ rgb565_l = vsriq_n_u16(rgb565_l, vqshluq_n_s16(b_l, 8), 11);
+ uint16x8_t rgb565_h = vqshluq_n_s16(r_h, 8);
+ rgb565_h = vsriq_n_u16(rgb565_h, vqshluq_n_s16(g_h, 8), 5);
+ rgb565_h = vsriq_n_u16(rgb565_h, vqshluq_n_s16(b_h, 8), 11);
+ /* Store RGB pixel data to memory. */
+ vst1q_u16((uint16_t *)outptr, rgb565_l);
+ vst1q_u16(((uint16_t *)outptr) + 8, rgb565_h);
+#endif
+
+ /* Increment pointers. */
+ inptr0 += 16;
+ inptr1 += 16;
+ inptr2 += 16;
+ outptr += (RGB_PIXELSIZE * 16);
+ }
+
+ if (cols_remaining >= 8) {
+ uint8x8_t y = vld1_u8(inptr0);
+ uint8x8_t cb = vld1_u8(inptr1);
+ uint8x8_t cr = vld1_u8(inptr2);
+ /* Subtract 128 from Cb and Cr. */
+ int16x8_t cr_128 =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
+ int16x8_t cb_128 =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
+ /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+ int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
+ int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
+ g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
+ g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
+ /* Descale G components: shift right 15, round, and narrow to 16-bit. */
+ int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
+ vrshrn_n_s32(g_sub_y_h, 15));
+ /* Compute R-Y: 1.40200 * (Cr - 128) */
+ int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1),
+ consts, 2);
+ /* Compute B-Y: 1.77200 * (Cb - 128) */
+ int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1),
+ consts, 3);
+ /* Add Y. */
+ int16x8_t r =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y));
+ int16x8_t b =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y));
+ int16x8_t g =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y));
+
+#if RGB_PIXELSIZE == 4
+ uint8x8x4_t rgba;
+ /* Convert each component to unsigned and narrow, clamping to [0-255]. */
+ rgba.val[RGB_RED] = vqmovun_s16(r);
+ rgba.val[RGB_GREEN] = vqmovun_s16(g);
+ rgba.val[RGB_BLUE] = vqmovun_s16(b);
+ /* Set alpha channel to opaque (0xFF). */
+ rgba.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+ /* Store RGBA pixel data to memory. */
+ vst4_u8(outptr, rgba);
+#elif RGB_PIXELSIZE == 3
+ uint8x8x3_t rgb;
+ /* Convert each component to unsigned and narrow, clamping to [0-255]. */
+ rgb.val[RGB_RED] = vqmovun_s16(r);
+ rgb.val[RGB_GREEN] = vqmovun_s16(g);
+ rgb.val[RGB_BLUE] = vqmovun_s16(b);
+ /* Store RGB pixel data to memory. */
+ vst3_u8(outptr, rgb);
+#else
+ /* Pack R, G, and B values in ratio 5:6:5. */
+ uint16x8_t rgb565 = vqshluq_n_s16(r, 8);
+ rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(g, 8), 5);
+ rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(b, 8), 11);
+ /* Store RGB pixel data to memory. */
+ vst1q_u16((uint16_t *)outptr, rgb565);
+#endif
+
+ /* Increment pointers. */
+ inptr0 += 8;
+ inptr1 += 8;
+ inptr2 += 8;
+ outptr += (RGB_PIXELSIZE * 8);
+ cols_remaining -= 8;
+ }
+
+ /* Handle the tail elements. */
+ if (cols_remaining > 0) {
+ uint8x8_t y = vld1_u8(inptr0);
+ uint8x8_t cb = vld1_u8(inptr1);
+ uint8x8_t cr = vld1_u8(inptr2);
+ /* Subtract 128 from Cb and Cr. */
+ int16x8_t cr_128 =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
+ int16x8_t cb_128 =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
+ /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+ int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
+ int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
+ g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
+ g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
+ /* Descale G components: shift right 15, round, and narrow to 16-bit. */
+ int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
+ vrshrn_n_s32(g_sub_y_h, 15));
+ /* Compute R-Y: 1.40200 * (Cr - 128) */
+ int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1),
+ consts, 2);
+ /* Compute B-Y: 1.77200 * (Cb - 128) */
+ int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1),
+ consts, 3);
+ /* Add Y. */
+ int16x8_t r =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y));
+ int16x8_t b =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y));
+ int16x8_t g =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y));
+
+#if RGB_PIXELSIZE == 4
+ uint8x8x4_t rgba;
+ /* Convert each component to unsigned and narrow, clamping to [0-255]. */
+ rgba.val[RGB_RED] = vqmovun_s16(r);
+ rgba.val[RGB_GREEN] = vqmovun_s16(g);
+ rgba.val[RGB_BLUE] = vqmovun_s16(b);
+ /* Set alpha channel to opaque (0xFF). */
+ rgba.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+ /* Store RGBA pixel data to memory. */
+ switch (cols_remaining) {
+ case 7:
+ vst4_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgba, 6);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 6:
+ vst4_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgba, 5);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 5:
+ vst4_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgba, 4);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 4:
+ vst4_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgba, 3);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 3:
+ vst4_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgba, 2);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 2:
+ vst4_lane_u8(outptr + RGB_PIXELSIZE, rgba, 1);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 1:
+ vst4_lane_u8(outptr, rgba, 0);
+ FALLTHROUGH /*FALLTHROUGH*/
+ default:
+ break;
+ }
+#elif RGB_PIXELSIZE == 3
+ uint8x8x3_t rgb;
+ /* Convert each component to unsigned and narrow, clamping to [0-255]. */
+ rgb.val[RGB_RED] = vqmovun_s16(r);
+ rgb.val[RGB_GREEN] = vqmovun_s16(g);
+ rgb.val[RGB_BLUE] = vqmovun_s16(b);
+ /* Store RGB pixel data to memory. */
+ switch (cols_remaining) {
+ case 7:
+ vst3_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgb, 6);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 6:
+ vst3_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgb, 5);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 5:
+ vst3_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgb, 4);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 4:
+ vst3_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgb, 3);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 3:
+ vst3_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgb, 2);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 2:
+ vst3_lane_u8(outptr + RGB_PIXELSIZE, rgb, 1);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 1:
+ vst3_lane_u8(outptr, rgb, 0);
+ FALLTHROUGH /*FALLTHROUGH*/
+ default:
+ break;
+ }
+#else
+ /* Pack R, G, and B values in ratio 5:6:5. */
+ uint16x8_t rgb565 = vqshluq_n_s16(r, 8);
+ rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(g, 8), 5);
+ rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(b, 8), 11);
+ /* Store RGB565 pixel data to memory. */
+ switch (cols_remaining) {
+ case 7:
+ vst1q_lane_u16((uint16_t *)(outptr + 6 * RGB_PIXELSIZE), rgb565, 6);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 6:
+ vst1q_lane_u16((uint16_t *)(outptr + 5 * RGB_PIXELSIZE), rgb565, 5);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 5:
+ vst1q_lane_u16((uint16_t *)(outptr + 4 * RGB_PIXELSIZE), rgb565, 4);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 4:
+ vst1q_lane_u16((uint16_t *)(outptr + 3 * RGB_PIXELSIZE), rgb565, 3);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 3:
+ vst1q_lane_u16((uint16_t *)(outptr + 2 * RGB_PIXELSIZE), rgb565, 2);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 2:
+ vst1q_lane_u16((uint16_t *)(outptr + RGB_PIXELSIZE), rgb565, 1);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 1:
+ vst1q_lane_u16((uint16_t *)outptr, rgb565, 0);
+ FALLTHROUGH /*FALLTHROUGH*/
+ default:
+ break;
+ }
+#endif
+ }
+ }
+}
diff --git a/media/libjpeg/simd/arm/jdcolor-neon.c b/media/libjpeg/simd/arm/jdcolor-neon.c
new file mode 100644
index 0000000000..ea4668f1d3
--- /dev/null
+++ b/media/libjpeg/simd/arm/jdcolor-neon.c
@@ -0,0 +1,142 @@
+/*
+ * jdcolor-neon.c - colorspace conversion (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "jconfigint.h"
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+
+#include <arm_neon.h>
+
+
+/* YCbCr -> RGB conversion constants */
+
+#define F_0_344 11277 /* 0.3441467 = 11277 * 2^-15 */
+#define F_0_714 23401 /* 0.7141418 = 23401 * 2^-15 */
+#define F_1_402 22971 /* 1.4020386 = 22971 * 2^-14 */
+#define F_1_772 29033 /* 1.7720337 = 29033 * 2^-14 */
+
+ALIGN(16) static const int16_t jsimd_ycc_rgb_convert_neon_consts[] = {
+ -F_0_344, F_0_714, F_1_402, F_1_772
+};
+
+
+/* Include inline routines for colorspace extensions. */
+
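+/* The first inclusion below uses the default RGB_RED/RGB_GREEN/RGB_BLUE/
+ * RGB_PIXELSIZE definitions to build the base jsimd_ycc_rgb_convert_neon
+ * routine; each subsequent inclusion redefines those macros and the function
+ * name to generate one variant per supported output pixel format.
+ */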
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED EXT_RGB_RED
+#define RGB_GREEN EXT_RGB_GREEN
+#define RGB_BLUE EXT_RGB_BLUE
+#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extrgb_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
+
+#define RGB_RED EXT_RGBX_RED
+#define RGB_GREEN EXT_RGBX_GREEN
+#define RGB_BLUE EXT_RGBX_BLUE
+#define RGB_ALPHA 3
+#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extrgbx_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
+
+#define RGB_RED EXT_BGR_RED
+#define RGB_GREEN EXT_BGR_GREEN
+#define RGB_BLUE EXT_BGR_BLUE
+#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extbgr_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
+
+#define RGB_RED EXT_BGRX_RED
+#define RGB_GREEN EXT_BGRX_GREEN
+#define RGB_BLUE EXT_BGRX_BLUE
+#define RGB_ALPHA 3
+#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extbgrx_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
+
+#define RGB_RED EXT_XBGR_RED
+#define RGB_GREEN EXT_XBGR_GREEN
+#define RGB_BLUE EXT_XBGR_BLUE
+#define RGB_ALPHA 0
+#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extxbgr_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
+
+#define RGB_RED EXT_XRGB_RED
+#define RGB_GREEN EXT_XRGB_GREEN
+#define RGB_BLUE EXT_XRGB_BLUE
+#define RGB_ALPHA 0
+#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extxrgb_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
+
+/* YCbCr -> RGB565 Conversion */
+
+#define RGB_PIXELSIZE 2
+#define jsimd_ycc_rgb_convert_neon jsimd_ycc_rgb565_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
diff --git a/media/libjpeg/simd/arm/jdmerge-neon.c b/media/libjpeg/simd/arm/jdmerge-neon.c
new file mode 100644
index 0000000000..e4f91fdc0e
--- /dev/null
+++ b/media/libjpeg/simd/arm/jdmerge-neon.c
@@ -0,0 +1,145 @@
+/*
+ * jdmerge-neon.c - merged upsampling/color conversion (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "jconfigint.h"
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+
+#include <arm_neon.h>
+
+
+/* YCbCr -> RGB conversion constants */
+
+#define F_0_344 11277 /* 0.3441467 = 11277 * 2^-15 */
+#define F_0_714 23401 /* 0.7141418 = 23401 * 2^-15 */
+#define F_1_402 22971 /* 1.4020386 = 22971 * 2^-14 */
+#define F_1_772 29033 /* 1.7720337 = 29033 * 2^-14 */
+
+ALIGN(16) static const int16_t jsimd_ycc_rgb_convert_neon_consts[] = {
+ -F_0_344, F_0_714, F_1_402, F_1_772
+};
+
+
+/* Include inline routines for colorspace extensions. */
+
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED EXT_RGB_RED
+#define RGB_GREEN EXT_RGB_GREEN
+#define RGB_BLUE EXT_RGB_BLUE
+#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_neon jsimd_h2v1_extrgb_merged_upsample_neon
+#define jsimd_h2v2_merged_upsample_neon jsimd_h2v2_extrgb_merged_upsample_neon
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_neon
+#undef jsimd_h2v2_merged_upsample_neon
+
+#define RGB_RED EXT_RGBX_RED
+#define RGB_GREEN EXT_RGBX_GREEN
+#define RGB_BLUE EXT_RGBX_BLUE
+#define RGB_ALPHA 3
+#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_neon jsimd_h2v1_extrgbx_merged_upsample_neon
+#define jsimd_h2v2_merged_upsample_neon jsimd_h2v2_extrgbx_merged_upsample_neon
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_neon
+#undef jsimd_h2v2_merged_upsample_neon
+
+#define RGB_RED EXT_BGR_RED
+#define RGB_GREEN EXT_BGR_GREEN
+#define RGB_BLUE EXT_BGR_BLUE
+#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_neon jsimd_h2v1_extbgr_merged_upsample_neon
+#define jsimd_h2v2_merged_upsample_neon jsimd_h2v2_extbgr_merged_upsample_neon
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_neon
+#undef jsimd_h2v2_merged_upsample_neon
+
+#define RGB_RED EXT_BGRX_RED
+#define RGB_GREEN EXT_BGRX_GREEN
+#define RGB_BLUE EXT_BGRX_BLUE
+#define RGB_ALPHA 3
+#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_neon jsimd_h2v1_extbgrx_merged_upsample_neon
+#define jsimd_h2v2_merged_upsample_neon jsimd_h2v2_extbgrx_merged_upsample_neon
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_neon
+#undef jsimd_h2v2_merged_upsample_neon
+
+#define RGB_RED EXT_XBGR_RED
+#define RGB_GREEN EXT_XBGR_GREEN
+#define RGB_BLUE EXT_XBGR_BLUE
+#define RGB_ALPHA 0
+#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_neon jsimd_h2v1_extxbgr_merged_upsample_neon
+#define jsimd_h2v2_merged_upsample_neon jsimd_h2v2_extxbgr_merged_upsample_neon
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_neon
+#undef jsimd_h2v2_merged_upsample_neon
+
+#define RGB_RED EXT_XRGB_RED
+#define RGB_GREEN EXT_XRGB_GREEN
+#define RGB_BLUE EXT_XRGB_BLUE
+#define RGB_ALPHA 0
+#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_neon jsimd_h2v1_extxrgb_merged_upsample_neon
+#define jsimd_h2v2_merged_upsample_neon jsimd_h2v2_extxrgb_merged_upsample_neon
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_neon
+#undef jsimd_h2v2_merged_upsample_neon
diff --git a/media/libjpeg/simd/arm/jdmrgext-neon.c b/media/libjpeg/simd/arm/jdmrgext-neon.c
new file mode 100644
index 0000000000..5b89bdb339
--- /dev/null
+++ b/media/libjpeg/simd/arm/jdmrgext-neon.c
@@ -0,0 +1,723 @@
+/*
+ * jdmrgext-neon.c - merged upsampling/color conversion (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jdmerge-neon.c. */
+
+
+/* These routines combine simple (non-fancy, i.e. non-smooth) h2v1 or h2v2
+ * chroma upsampling and YCbCr -> RGB color conversion into a single function.
+ *
+ * As with the standalone functions, YCbCr -> RGB conversion is defined by the
+ * following equations:
+ * R = Y + 1.40200 * (Cr - 128)
+ * G = Y - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128)
+ * B = Y + 1.77200 * (Cb - 128)
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ * 0.3441467 = 11277 * 2^-15
+ * 0.7141418 = 23401 * 2^-15
+ * 1.4020386 = 22971 * 2^-14
+ * 1.7720337 = 29033 * 2^-14
+ * These constants are defined in jdmerge-neon.c.
+ *
+ * To ensure correct results, rounding is used when descaling.
+ */
+
+/* Notes on safe memory access for merged upsampling/YCbCr -> RGB conversion
+ * routines:
+ *
+ * Input memory buffers can be safely overread up to the next multiple of
+ * ALIGN_SIZE bytes, since they are always allocated by alloc_sarray() in
+ * jmemmgr.c.
+ *
+ * The output buffer cannot safely be written beyond output_width, since
+ * output_buf points to a possibly unpadded row in the decompressed image
+ * buffer allocated by the calling program.
+ */
+
+/* Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
+ */
+
+void jsimd_h2v1_merged_upsample_neon(JDIMENSION output_width,
+ JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf)
+{
+ JSAMPROW outptr;
+ /* Pointers to Y, Cb, and Cr data */
+ JSAMPROW inptr0, inptr1, inptr2;
+
+ const int16x4_t consts = vld1_s16(jsimd_ycc_rgb_convert_neon_consts);
+ const int16x8_t neg_128 = vdupq_n_s16(-128);
+
+ inptr0 = input_buf[0][in_row_group_ctr];
+ inptr1 = input_buf[1][in_row_group_ctr];
+ inptr2 = input_buf[2][in_row_group_ctr];
+ outptr = output_buf[0];
+
+ int cols_remaining = output_width;
+ for (; cols_remaining >= 16; cols_remaining -= 16) {
+ /* De-interleave Y component values into two separate vectors, one
+ * containing the component values with even-numbered indices and one
+ * containing the component values with odd-numbered indices.
+ */
+ uint8x8x2_t y = vld2_u8(inptr0);
+ uint8x8_t cb = vld1_u8(inptr1);
+ uint8x8_t cr = vld1_u8(inptr2);
+ /* Subtract 128 from Cb and Cr. */
+ int16x8_t cr_128 =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
+ int16x8_t cb_128 =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
+ /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+ int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
+ int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
+ g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
+ g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
+ /* Descale G components: shift right 15, round, and narrow to 16-bit. */
+ int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
+ vrshrn_n_s32(g_sub_y_h, 15));
+ /* Compute R-Y: 1.40200 * (Cr - 128) */
+ int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
+ /* Compute B-Y: 1.77200 * (Cb - 128) */
+ int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
+ /* Add the chroma-derived values (G-Y, R-Y, and B-Y) to both the "even" and
+ * "odd" Y component values. This effectively upsamples the chroma
+ * components horizontally.
+ */
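+    /* (In scalar terms: the same chroma-derived values are added to Y[2*i]
+     * and Y[2*i+1], i.e. each chroma sample is replicated across a pair of
+     * output pixels.)
+     */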
+ int16x8_t g_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+ y.val[0]));
+ int16x8_t r_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+ y.val[0]));
+ int16x8_t b_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+ y.val[0]));
+ int16x8_t g_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+ y.val[1]));
+ int16x8_t r_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+ y.val[1]));
+ int16x8_t b_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+ y.val[1]));
+ /* Convert each component to unsigned and narrow, clamping to [0-255].
+ * Re-interleave the "even" and "odd" component values.
+ */
+ uint8x8x2_t r = vzip_u8(vqmovun_s16(r_even), vqmovun_s16(r_odd));
+ uint8x8x2_t g = vzip_u8(vqmovun_s16(g_even), vqmovun_s16(g_odd));
+ uint8x8x2_t b = vzip_u8(vqmovun_s16(b_even), vqmovun_s16(b_odd));
+
+#ifdef RGB_ALPHA
+ uint8x16x4_t rgba;
+ rgba.val[RGB_RED] = vcombine_u8(r.val[0], r.val[1]);
+ rgba.val[RGB_GREEN] = vcombine_u8(g.val[0], g.val[1]);
+ rgba.val[RGB_BLUE] = vcombine_u8(b.val[0], b.val[1]);
+ /* Set alpha channel to opaque (0xFF). */
+ rgba.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
+ /* Store RGBA pixel data to memory. */
+ vst4q_u8(outptr, rgba);
+#else
+ uint8x16x3_t rgb;
+ rgb.val[RGB_RED] = vcombine_u8(r.val[0], r.val[1]);
+ rgb.val[RGB_GREEN] = vcombine_u8(g.val[0], g.val[1]);
+ rgb.val[RGB_BLUE] = vcombine_u8(b.val[0], b.val[1]);
+ /* Store RGB pixel data to memory. */
+ vst3q_u8(outptr, rgb);
+#endif
+
+ /* Increment pointers. */
+ inptr0 += 16;
+ inptr1 += 8;
+ inptr2 += 8;
+ outptr += (RGB_PIXELSIZE * 16);
+ }
+
+ if (cols_remaining > 0) {
+ /* De-interleave Y component values into two separate vectors, one
+ * containing the component values with even-numbered indices and one
+ * containing the component values with odd-numbered indices.
+ */
+ uint8x8x2_t y = vld2_u8(inptr0);
+ uint8x8_t cb = vld1_u8(inptr1);
+ uint8x8_t cr = vld1_u8(inptr2);
+ /* Subtract 128 from Cb and Cr. */
+ int16x8_t cr_128 =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
+ int16x8_t cb_128 =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
+ /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+ int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
+ int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
+ g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
+ g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
+ /* Descale G components: shift right 15, round, and narrow to 16-bit. */
+ int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
+ vrshrn_n_s32(g_sub_y_h, 15));
+ /* Compute R-Y: 1.40200 * (Cr - 128) */
+ int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
+ /* Compute B-Y: 1.77200 * (Cb - 128) */
+ int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
+ /* Add the chroma-derived values (G-Y, R-Y, and B-Y) to both the "even" and
+ * "odd" Y component values. This effectively upsamples the chroma
+ * components horizontally.
+ */
+ int16x8_t g_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+ y.val[0]));
+ int16x8_t r_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+ y.val[0]));
+ int16x8_t b_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+ y.val[0]));
+ int16x8_t g_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+ y.val[1]));
+ int16x8_t r_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+ y.val[1]));
+ int16x8_t b_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+ y.val[1]));
+ /* Convert each component to unsigned and narrow, clamping to [0-255].
+ * Re-interleave the "even" and "odd" component values.
+ */
+ uint8x8x2_t r = vzip_u8(vqmovun_s16(r_even), vqmovun_s16(r_odd));
+ uint8x8x2_t g = vzip_u8(vqmovun_s16(g_even), vqmovun_s16(g_odd));
+ uint8x8x2_t b = vzip_u8(vqmovun_s16(b_even), vqmovun_s16(b_odd));
+
+#ifdef RGB_ALPHA
+ uint8x8x4_t rgba_h;
+ rgba_h.val[RGB_RED] = r.val[1];
+ rgba_h.val[RGB_GREEN] = g.val[1];
+ rgba_h.val[RGB_BLUE] = b.val[1];
+ /* Set alpha channel to opaque (0xFF). */
+ rgba_h.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+ uint8x8x4_t rgba_l;
+ rgba_l.val[RGB_RED] = r.val[0];
+ rgba_l.val[RGB_GREEN] = g.val[0];
+ rgba_l.val[RGB_BLUE] = b.val[0];
+ /* Set alpha channel to opaque (0xFF). */
+ rgba_l.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+ /* Store RGBA pixel data to memory. */
+ switch (cols_remaining) {
+ case 15:
+ vst4_lane_u8(outptr + 14 * RGB_PIXELSIZE, rgba_h, 6);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 14:
+ vst4_lane_u8(outptr + 13 * RGB_PIXELSIZE, rgba_h, 5);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 13:
+ vst4_lane_u8(outptr + 12 * RGB_PIXELSIZE, rgba_h, 4);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 12:
+ vst4_lane_u8(outptr + 11 * RGB_PIXELSIZE, rgba_h, 3);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 11:
+ vst4_lane_u8(outptr + 10 * RGB_PIXELSIZE, rgba_h, 2);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 10:
+ vst4_lane_u8(outptr + 9 * RGB_PIXELSIZE, rgba_h, 1);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 9:
+ vst4_lane_u8(outptr + 8 * RGB_PIXELSIZE, rgba_h, 0);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 8:
+ vst4_u8(outptr, rgba_l);
+ break;
+ case 7:
+ vst4_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgba_l, 6);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 6:
+ vst4_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgba_l, 5);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 5:
+ vst4_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgba_l, 4);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 4:
+ vst4_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgba_l, 3);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 3:
+ vst4_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgba_l, 2);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 2:
+ vst4_lane_u8(outptr + RGB_PIXELSIZE, rgba_l, 1);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 1:
+ vst4_lane_u8(outptr, rgba_l, 0);
+ FALLTHROUGH /*FALLTHROUGH*/
+ default:
+ break;
+ }
+#else
+ uint8x8x3_t rgb_h;
+ rgb_h.val[RGB_RED] = r.val[1];
+ rgb_h.val[RGB_GREEN] = g.val[1];
+ rgb_h.val[RGB_BLUE] = b.val[1];
+ uint8x8x3_t rgb_l;
+ rgb_l.val[RGB_RED] = r.val[0];
+ rgb_l.val[RGB_GREEN] = g.val[0];
+ rgb_l.val[RGB_BLUE] = b.val[0];
+ /* Store RGB pixel data to memory. */
+ switch (cols_remaining) {
+ case 15:
+ vst3_lane_u8(outptr + 14 * RGB_PIXELSIZE, rgb_h, 6);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 14:
+ vst3_lane_u8(outptr + 13 * RGB_PIXELSIZE, rgb_h, 5);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 13:
+ vst3_lane_u8(outptr + 12 * RGB_PIXELSIZE, rgb_h, 4);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 12:
+ vst3_lane_u8(outptr + 11 * RGB_PIXELSIZE, rgb_h, 3);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 11:
+ vst3_lane_u8(outptr + 10 * RGB_PIXELSIZE, rgb_h, 2);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 10:
+ vst3_lane_u8(outptr + 9 * RGB_PIXELSIZE, rgb_h, 1);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 9:
+ vst3_lane_u8(outptr + 8 * RGB_PIXELSIZE, rgb_h, 0);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 8:
+ vst3_u8(outptr, rgb_l);
+ break;
+ case 7:
+ vst3_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgb_l, 6);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 6:
+ vst3_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgb_l, 5);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 5:
+ vst3_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgb_l, 4);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 4:
+ vst3_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgb_l, 3);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 3:
+ vst3_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgb_l, 2);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 2:
+ vst3_lane_u8(outptr + RGB_PIXELSIZE, rgb_l, 1);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 1:
+ vst3_lane_u8(outptr, rgb_l, 0);
+ FALLTHROUGH /*FALLTHROUGH*/
+ default:
+ break;
+ }
+#endif
+ }
+}
+
+
+/* Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
+ *
+ * See comments above for details regarding color conversion and safe memory
+ * access.
+ */
+
+void jsimd_h2v2_merged_upsample_neon(JDIMENSION output_width,
+ JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf)
+{
+ JSAMPROW outptr0, outptr1;
+ /* Pointers to Y (both rows), Cb, and Cr data */
+ JSAMPROW inptr0_0, inptr0_1, inptr1, inptr2;
+
+ const int16x4_t consts = vld1_s16(jsimd_ycc_rgb_convert_neon_consts);
+ const int16x8_t neg_128 = vdupq_n_s16(-128);
+
+ inptr0_0 = input_buf[0][in_row_group_ctr * 2];
+ inptr0_1 = input_buf[0][in_row_group_ctr * 2 + 1];
+ inptr1 = input_buf[1][in_row_group_ctr];
+ inptr2 = input_buf[2][in_row_group_ctr];
+ outptr0 = output_buf[0];
+ outptr1 = output_buf[1];
+
+ int cols_remaining = output_width;
+ for (; cols_remaining >= 16; cols_remaining -= 16) {
+ /* For each row, de-interleave Y component values into two separate
+ * vectors, one containing the component values with even-numbered indices
+ * and one containing the component values with odd-numbered indices.
+ */
+ uint8x8x2_t y0 = vld2_u8(inptr0_0);
+ uint8x8x2_t y1 = vld2_u8(inptr0_1);
+ uint8x8_t cb = vld1_u8(inptr1);
+ uint8x8_t cr = vld1_u8(inptr2);
+ /* Subtract 128 from Cb and Cr. */
+ int16x8_t cr_128 =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
+ int16x8_t cb_128 =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
+ /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+ int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
+ int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
+ g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
+ g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
+ /* Descale G components: shift right 15, round, and narrow to 16-bit. */
+ int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
+ vrshrn_n_s32(g_sub_y_h, 15));
+ /* Compute R-Y: 1.40200 * (Cr - 128) */
+ int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
+ /* Compute B-Y: 1.77200 * (Cb - 128) */
+ int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
+ /* For each row, add the chroma-derived values (G-Y, R-Y, and B-Y) to both
+ * the "even" and "odd" Y component values. This effectively upsamples the
+ * chroma components both horizontally and vertically.
+ */
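+    /* Each chroma sample is thus shared by a 2x2 block of output pixels:
+     * both rows reuse the same chroma-derived values, and within each row
+     * those values are applied to both the even and odd Y samples.
+     */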
+ int16x8_t g0_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+ y0.val[0]));
+ int16x8_t r0_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+ y0.val[0]));
+ int16x8_t b0_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+ y0.val[0]));
+ int16x8_t g0_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+ y0.val[1]));
+ int16x8_t r0_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+ y0.val[1]));
+ int16x8_t b0_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+ y0.val[1]));
+ int16x8_t g1_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+ y1.val[0]));
+ int16x8_t r1_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+ y1.val[0]));
+ int16x8_t b1_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+ y1.val[0]));
+ int16x8_t g1_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+ y1.val[1]));
+ int16x8_t r1_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+ y1.val[1]));
+ int16x8_t b1_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+ y1.val[1]));
+ /* Convert each component to unsigned and narrow, clamping to [0-255].
+ * Re-interleave the "even" and "odd" component values.
+ */
+ uint8x8x2_t r0 = vzip_u8(vqmovun_s16(r0_even), vqmovun_s16(r0_odd));
+ uint8x8x2_t r1 = vzip_u8(vqmovun_s16(r1_even), vqmovun_s16(r1_odd));
+ uint8x8x2_t g0 = vzip_u8(vqmovun_s16(g0_even), vqmovun_s16(g0_odd));
+ uint8x8x2_t g1 = vzip_u8(vqmovun_s16(g1_even), vqmovun_s16(g1_odd));
+ uint8x8x2_t b0 = vzip_u8(vqmovun_s16(b0_even), vqmovun_s16(b0_odd));
+ uint8x8x2_t b1 = vzip_u8(vqmovun_s16(b1_even), vqmovun_s16(b1_odd));
+
+#ifdef RGB_ALPHA
+ uint8x16x4_t rgba0, rgba1;
+ rgba0.val[RGB_RED] = vcombine_u8(r0.val[0], r0.val[1]);
+ rgba1.val[RGB_RED] = vcombine_u8(r1.val[0], r1.val[1]);
+ rgba0.val[RGB_GREEN] = vcombine_u8(g0.val[0], g0.val[1]);
+ rgba1.val[RGB_GREEN] = vcombine_u8(g1.val[0], g1.val[1]);
+ rgba0.val[RGB_BLUE] = vcombine_u8(b0.val[0], b0.val[1]);
+ rgba1.val[RGB_BLUE] = vcombine_u8(b1.val[0], b1.val[1]);
+ /* Set alpha channel to opaque (0xFF). */
+ rgba0.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
+ rgba1.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
+ /* Store RGBA pixel data to memory. */
+ vst4q_u8(outptr0, rgba0);
+ vst4q_u8(outptr1, rgba1);
+#else
+ uint8x16x3_t rgb0, rgb1;
+ rgb0.val[RGB_RED] = vcombine_u8(r0.val[0], r0.val[1]);
+ rgb1.val[RGB_RED] = vcombine_u8(r1.val[0], r1.val[1]);
+ rgb0.val[RGB_GREEN] = vcombine_u8(g0.val[0], g0.val[1]);
+ rgb1.val[RGB_GREEN] = vcombine_u8(g1.val[0], g1.val[1]);
+ rgb0.val[RGB_BLUE] = vcombine_u8(b0.val[0], b0.val[1]);
+ rgb1.val[RGB_BLUE] = vcombine_u8(b1.val[0], b1.val[1]);
+ /* Store RGB pixel data to memory. */
+ vst3q_u8(outptr0, rgb0);
+ vst3q_u8(outptr1, rgb1);
+#endif
+
+ /* Increment pointers. */
+ inptr0_0 += 16;
+ inptr0_1 += 16;
+ inptr1 += 8;
+ inptr2 += 8;
+ outptr0 += (RGB_PIXELSIZE * 16);
+ outptr1 += (RGB_PIXELSIZE * 16);
+ }
+
+ if (cols_remaining > 0) {
+ /* For each row, de-interleave Y component values into two separate
+ * vectors, one containing the component values with even-numbered indices
+ * and one containing the component values with odd-numbered indices.
+ */
+ uint8x8x2_t y0 = vld2_u8(inptr0_0);
+ uint8x8x2_t y1 = vld2_u8(inptr0_1);
+ uint8x8_t cb = vld1_u8(inptr1);
+ uint8x8_t cr = vld1_u8(inptr2);
+ /* Subtract 128 from Cb and Cr. */
+ int16x8_t cr_128 =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
+ int16x8_t cb_128 =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
+ /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+ int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
+ int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
+ g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
+ g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
+ /* Descale G components: shift right 15, round, and narrow to 16-bit. */
+ int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
+ vrshrn_n_s32(g_sub_y_h, 15));
+ /* Compute R-Y: 1.40200 * (Cr - 128) */
+ int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
+ /* Compute B-Y: 1.77200 * (Cb - 128) */
+ int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
+ /* For each row, add the chroma-derived values (G-Y, R-Y, and B-Y) to both
+ * the "even" and "odd" Y component values. This effectively upsamples the
+ * chroma components both horizontally and vertically.
+ */
+ int16x8_t g0_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+ y0.val[0]));
+ int16x8_t r0_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+ y0.val[0]));
+ int16x8_t b0_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+ y0.val[0]));
+ int16x8_t g0_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+ y0.val[1]));
+ int16x8_t r0_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+ y0.val[1]));
+ int16x8_t b0_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+ y0.val[1]));
+ int16x8_t g1_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+ y1.val[0]));
+ int16x8_t r1_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+ y1.val[0]));
+ int16x8_t b1_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+ y1.val[0]));
+ int16x8_t g1_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+ y1.val[1]));
+ int16x8_t r1_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+ y1.val[1]));
+ int16x8_t b1_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+ y1.val[1]));
+ /* Convert each component to unsigned and narrow, clamping to [0-255].
+ * Re-interleave the "even" and "odd" component values.
+ */
+ uint8x8x2_t r0 = vzip_u8(vqmovun_s16(r0_even), vqmovun_s16(r0_odd));
+ uint8x8x2_t r1 = vzip_u8(vqmovun_s16(r1_even), vqmovun_s16(r1_odd));
+ uint8x8x2_t g0 = vzip_u8(vqmovun_s16(g0_even), vqmovun_s16(g0_odd));
+ uint8x8x2_t g1 = vzip_u8(vqmovun_s16(g1_even), vqmovun_s16(g1_odd));
+ uint8x8x2_t b0 = vzip_u8(vqmovun_s16(b0_even), vqmovun_s16(b0_odd));
+ uint8x8x2_t b1 = vzip_u8(vqmovun_s16(b1_even), vqmovun_s16(b1_odd));
+
+#ifdef RGB_ALPHA
+ uint8x8x4_t rgba0_h, rgba1_h;
+ rgba0_h.val[RGB_RED] = r0.val[1];
+ rgba1_h.val[RGB_RED] = r1.val[1];
+ rgba0_h.val[RGB_GREEN] = g0.val[1];
+ rgba1_h.val[RGB_GREEN] = g1.val[1];
+ rgba0_h.val[RGB_BLUE] = b0.val[1];
+ rgba1_h.val[RGB_BLUE] = b1.val[1];
+ /* Set alpha channel to opaque (0xFF). */
+ rgba0_h.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+ rgba1_h.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+
+ uint8x8x4_t rgba0_l, rgba1_l;
+ rgba0_l.val[RGB_RED] = r0.val[0];
+ rgba1_l.val[RGB_RED] = r1.val[0];
+ rgba0_l.val[RGB_GREEN] = g0.val[0];
+ rgba1_l.val[RGB_GREEN] = g1.val[0];
+ rgba0_l.val[RGB_BLUE] = b0.val[0];
+ rgba1_l.val[RGB_BLUE] = b1.val[0];
+ /* Set alpha channel to opaque (0xFF). */
+ rgba0_l.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+ rgba1_l.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+ /* Store RGBA pixel data to memory. */
+ switch (cols_remaining) {
+ case 15:
+ vst4_lane_u8(outptr0 + 14 * RGB_PIXELSIZE, rgba0_h, 6);
+ vst4_lane_u8(outptr1 + 14 * RGB_PIXELSIZE, rgba1_h, 6);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 14:
+ vst4_lane_u8(outptr0 + 13 * RGB_PIXELSIZE, rgba0_h, 5);
+ vst4_lane_u8(outptr1 + 13 * RGB_PIXELSIZE, rgba1_h, 5);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 13:
+ vst4_lane_u8(outptr0 + 12 * RGB_PIXELSIZE, rgba0_h, 4);
+ vst4_lane_u8(outptr1 + 12 * RGB_PIXELSIZE, rgba1_h, 4);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 12:
+ vst4_lane_u8(outptr0 + 11 * RGB_PIXELSIZE, rgba0_h, 3);
+ vst4_lane_u8(outptr1 + 11 * RGB_PIXELSIZE, rgba1_h, 3);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 11:
+ vst4_lane_u8(outptr0 + 10 * RGB_PIXELSIZE, rgba0_h, 2);
+ vst4_lane_u8(outptr1 + 10 * RGB_PIXELSIZE, rgba1_h, 2);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 10:
+ vst4_lane_u8(outptr0 + 9 * RGB_PIXELSIZE, rgba0_h, 1);
+ vst4_lane_u8(outptr1 + 9 * RGB_PIXELSIZE, rgba1_h, 1);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 9:
+ vst4_lane_u8(outptr0 + 8 * RGB_PIXELSIZE, rgba0_h, 0);
+ vst4_lane_u8(outptr1 + 8 * RGB_PIXELSIZE, rgba1_h, 0);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 8:
+ vst4_u8(outptr0, rgba0_l);
+ vst4_u8(outptr1, rgba1_l);
+ break;
+ case 7:
+ vst4_lane_u8(outptr0 + 6 * RGB_PIXELSIZE, rgba0_l, 6);
+ vst4_lane_u8(outptr1 + 6 * RGB_PIXELSIZE, rgba1_l, 6);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 6:
+ vst4_lane_u8(outptr0 + 5 * RGB_PIXELSIZE, rgba0_l, 5);
+ vst4_lane_u8(outptr1 + 5 * RGB_PIXELSIZE, rgba1_l, 5);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 5:
+ vst4_lane_u8(outptr0 + 4 * RGB_PIXELSIZE, rgba0_l, 4);
+ vst4_lane_u8(outptr1 + 4 * RGB_PIXELSIZE, rgba1_l, 4);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 4:
+ vst4_lane_u8(outptr0 + 3 * RGB_PIXELSIZE, rgba0_l, 3);
+ vst4_lane_u8(outptr1 + 3 * RGB_PIXELSIZE, rgba1_l, 3);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 3:
+ vst4_lane_u8(outptr0 + 2 * RGB_PIXELSIZE, rgba0_l, 2);
+ vst4_lane_u8(outptr1 + 2 * RGB_PIXELSIZE, rgba1_l, 2);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 2:
+ vst4_lane_u8(outptr0 + 1 * RGB_PIXELSIZE, rgba0_l, 1);
+ vst4_lane_u8(outptr1 + 1 * RGB_PIXELSIZE, rgba1_l, 1);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 1:
+ vst4_lane_u8(outptr0, rgba0_l, 0);
+ vst4_lane_u8(outptr1, rgba1_l, 0);
+ FALLTHROUGH /*FALLTHROUGH*/
+ default:
+ break;
+ }
+#else
+ uint8x8x3_t rgb0_h, rgb1_h;
+ rgb0_h.val[RGB_RED] = r0.val[1];
+ rgb1_h.val[RGB_RED] = r1.val[1];
+ rgb0_h.val[RGB_GREEN] = g0.val[1];
+ rgb1_h.val[RGB_GREEN] = g1.val[1];
+ rgb0_h.val[RGB_BLUE] = b0.val[1];
+ rgb1_h.val[RGB_BLUE] = b1.val[1];
+
+ uint8x8x3_t rgb0_l, rgb1_l;
+ rgb0_l.val[RGB_RED] = r0.val[0];
+ rgb1_l.val[RGB_RED] = r1.val[0];
+ rgb0_l.val[RGB_GREEN] = g0.val[0];
+ rgb1_l.val[RGB_GREEN] = g1.val[0];
+ rgb0_l.val[RGB_BLUE] = b0.val[0];
+ rgb1_l.val[RGB_BLUE] = b1.val[0];
+ /* Store RGB pixel data to memory. */
+ switch (cols_remaining) {
+ case 15:
+ vst3_lane_u8(outptr0 + 14 * RGB_PIXELSIZE, rgb0_h, 6);
+ vst3_lane_u8(outptr1 + 14 * RGB_PIXELSIZE, rgb1_h, 6);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 14:
+ vst3_lane_u8(outptr0 + 13 * RGB_PIXELSIZE, rgb0_h, 5);
+ vst3_lane_u8(outptr1 + 13 * RGB_PIXELSIZE, rgb1_h, 5);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 13:
+ vst3_lane_u8(outptr0 + 12 * RGB_PIXELSIZE, rgb0_h, 4);
+ vst3_lane_u8(outptr1 + 12 * RGB_PIXELSIZE, rgb1_h, 4);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 12:
+ vst3_lane_u8(outptr0 + 11 * RGB_PIXELSIZE, rgb0_h, 3);
+ vst3_lane_u8(outptr1 + 11 * RGB_PIXELSIZE, rgb1_h, 3);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 11:
+ vst3_lane_u8(outptr0 + 10 * RGB_PIXELSIZE, rgb0_h, 2);
+ vst3_lane_u8(outptr1 + 10 * RGB_PIXELSIZE, rgb1_h, 2);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 10:
+ vst3_lane_u8(outptr0 + 9 * RGB_PIXELSIZE, rgb0_h, 1);
+ vst3_lane_u8(outptr1 + 9 * RGB_PIXELSIZE, rgb1_h, 1);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 9:
+ vst3_lane_u8(outptr0 + 8 * RGB_PIXELSIZE, rgb0_h, 0);
+ vst3_lane_u8(outptr1 + 8 * RGB_PIXELSIZE, rgb1_h, 0);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 8:
+ vst3_u8(outptr0, rgb0_l);
+ vst3_u8(outptr1, rgb1_l);
+ break;
+ case 7:
+ vst3_lane_u8(outptr0 + 6 * RGB_PIXELSIZE, rgb0_l, 6);
+ vst3_lane_u8(outptr1 + 6 * RGB_PIXELSIZE, rgb1_l, 6);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 6:
+ vst3_lane_u8(outptr0 + 5 * RGB_PIXELSIZE, rgb0_l, 5);
+ vst3_lane_u8(outptr1 + 5 * RGB_PIXELSIZE, rgb1_l, 5);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 5:
+ vst3_lane_u8(outptr0 + 4 * RGB_PIXELSIZE, rgb0_l, 4);
+ vst3_lane_u8(outptr1 + 4 * RGB_PIXELSIZE, rgb1_l, 4);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 4:
+ vst3_lane_u8(outptr0 + 3 * RGB_PIXELSIZE, rgb0_l, 3);
+ vst3_lane_u8(outptr1 + 3 * RGB_PIXELSIZE, rgb1_l, 3);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 3:
+ vst3_lane_u8(outptr0 + 2 * RGB_PIXELSIZE, rgb0_l, 2);
+ vst3_lane_u8(outptr1 + 2 * RGB_PIXELSIZE, rgb1_l, 2);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 2:
+ vst3_lane_u8(outptr0 + 1 * RGB_PIXELSIZE, rgb0_l, 1);
+ vst3_lane_u8(outptr1 + 1 * RGB_PIXELSIZE, rgb1_l, 1);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 1:
+ vst3_lane_u8(outptr0, rgb0_l, 0);
+ vst3_lane_u8(outptr1, rgb1_l, 0);
+ FALLTHROUGH /*FALLTHROUGH*/
+ default:
+ break;
+ }
+#endif
+ }
+}
diff --git a/media/libjpeg/simd/arm/jdsample-neon.c b/media/libjpeg/simd/arm/jdsample-neon.c
new file mode 100644
index 0000000000..90ec6782c4
--- /dev/null
+++ b/media/libjpeg/simd/arm/jdsample-neon.c
@@ -0,0 +1,569 @@
+/*
+ * jdsample-neon.c - upsampling (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+
+#include <arm_neon.h>
+
+
+/* The diagram below shows a row of samples produced by h2v1 downsampling.
+ *
+ * s0 s1 s2
+ * +---------+---------+---------+
+ * | | | |
+ * | p0 p1 | p2 p3 | p4 p5 |
+ * | | | |
+ * +---------+---------+---------+
+ *
+ * Samples s0-s2 were created by averaging the original pixel component values
+ * centered at positions p0-p5 above. To approximate those original pixel
+ * component values, we proportionally blend the adjacent samples in each row.
+ *
+ * An upsampled pixel component value is computed by blending the sample
+ * containing the pixel center with the nearest neighboring sample, in the
+ * ratio 3:1. For example:
+ * p1(upsampled) = 3/4 * s0 + 1/4 * s1
+ * p2(upsampled) = 3/4 * s1 + 1/4 * s0
+ * When computing the first and last pixel component values in the row, there
+ * is no adjacent sample to blend, so:
+ * p0(upsampled) = s0
+ * p5(upsampled) = s2
+ */
+
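+/* For illustration only (excluded from compilation): a scalar model of the
+ * general case, mirroring the equivalent loop in jdsample.c.  The variable
+ * names are hypothetical, and the special cases for the first and last
+ * columns are omitted.  The alternating +1/+2 terms provide the rounding
+ * bias referred to as "ordered dithering" below.
+ */
+#if 0
+for (col = 1; col < downsampled_width - 1; col++) {
+  out[2 * col]     = (JSAMPLE)((3 * in[col] + in[col - 1] + 1) >> 2);
+  out[2 * col + 1] = (JSAMPLE)((3 * in[col] + in[col + 1] + 2) >> 2);
+}
+#endif
+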
+void jsimd_h2v1_fancy_upsample_neon(int max_v_samp_factor,
+ JDIMENSION downsampled_width,
+ JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr)
+{
+ JSAMPARRAY output_data = *output_data_ptr;
+ JSAMPROW inptr, outptr;
+ int inrow;
+ unsigned colctr;
+ /* Set up constants. */
+ const uint16x8_t one_u16 = vdupq_n_u16(1);
+ const uint8x8_t three_u8 = vdup_n_u8(3);
+
+ for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
+ inptr = input_data[inrow];
+ outptr = output_data[inrow];
+ /* First pixel component value in this row of the original image */
+ *outptr = (JSAMPLE)GETJSAMPLE(*inptr);
+
+ /* 3/4 * containing sample + 1/4 * nearest neighboring sample
+ * For p1: containing sample = s0, nearest neighboring sample = s1
+ * For p2: containing sample = s1, nearest neighboring sample = s0
+ */
+ uint8x16_t s0 = vld1q_u8(inptr);
+ uint8x16_t s1 = vld1q_u8(inptr + 1);
+ /* Multiplication makes vectors twice as wide. '_l' and '_h' suffixes
+ * denote low half and high half respectively.
+ */
+ uint16x8_t s1_add_3s0_l =
+ vmlal_u8(vmovl_u8(vget_low_u8(s1)), vget_low_u8(s0), three_u8);
+ uint16x8_t s1_add_3s0_h =
+ vmlal_u8(vmovl_u8(vget_high_u8(s1)), vget_high_u8(s0), three_u8);
+ uint16x8_t s0_add_3s1_l =
+ vmlal_u8(vmovl_u8(vget_low_u8(s0)), vget_low_u8(s1), three_u8);
+ uint16x8_t s0_add_3s1_h =
+ vmlal_u8(vmovl_u8(vget_high_u8(s0)), vget_high_u8(s1), three_u8);
+ /* Add ordered dithering bias to odd pixel values. */
+ s0_add_3s1_l = vaddq_u16(s0_add_3s1_l, one_u16);
+ s0_add_3s1_h = vaddq_u16(s0_add_3s1_h, one_u16);
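+    /* These sums receive an explicit bias of 1 and are later narrowed with a
+     * truncating shift, while the s1_add_3s0 sums are narrowed with a
+     * rounding shift (an effective bias of 2); the alternating biases
+     * reproduce the ordered dithering of the scalar implementation.
+     */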
+
+ /* The offset is initially 1, because the first pixel component has already
+ * been stored. However, in subsequent iterations of the SIMD loop, this
+ * offset is (2 * colctr - 1) to stay within the bounds of the sample
+ * buffers without having to resort to a slow scalar tail case for the last
+ * (downsampled_width % 16) samples. See "Creation of 2-D sample arrays"
+ * in jmemmgr.c for more details.
+ */
+ unsigned outptr_offset = 1;
+ uint8x16x2_t output_pixels;
+
+ /* We use software pipelining to maximise performance. The code indented
+ * an extra two spaces begins the next iteration of the loop.
+ */
+ for (colctr = 16; colctr < downsampled_width; colctr += 16) {
+
+ s0 = vld1q_u8(inptr + colctr - 1);
+ s1 = vld1q_u8(inptr + colctr);
+
+ /* Right-shift by 2 (divide by 4), narrow to 8-bit, and combine. */
+ output_pixels.val[0] = vcombine_u8(vrshrn_n_u16(s1_add_3s0_l, 2),
+ vrshrn_n_u16(s1_add_3s0_h, 2));
+ output_pixels.val[1] = vcombine_u8(vshrn_n_u16(s0_add_3s1_l, 2),
+ vshrn_n_u16(s0_add_3s1_h, 2));
+
+ /* Multiplication makes vectors twice as wide. '_l' and '_h' suffixes
+ * denote low half and high half respectively.
+ */
+ s1_add_3s0_l =
+ vmlal_u8(vmovl_u8(vget_low_u8(s1)), vget_low_u8(s0), three_u8);
+ s1_add_3s0_h =
+ vmlal_u8(vmovl_u8(vget_high_u8(s1)), vget_high_u8(s0), three_u8);
+ s0_add_3s1_l =
+ vmlal_u8(vmovl_u8(vget_low_u8(s0)), vget_low_u8(s1), three_u8);
+ s0_add_3s1_h =
+ vmlal_u8(vmovl_u8(vget_high_u8(s0)), vget_high_u8(s1), three_u8);
+ /* Add ordered dithering bias to odd pixel values. */
+ s0_add_3s1_l = vaddq_u16(s0_add_3s1_l, one_u16);
+ s0_add_3s1_h = vaddq_u16(s0_add_3s1_h, one_u16);
+
+ /* Store pixel component values to memory. */
+ vst2q_u8(outptr + outptr_offset, output_pixels);
+ outptr_offset = 2 * colctr - 1;
+ }
+
+ /* Complete the last iteration of the loop. */
+
+ /* Right-shift by 2 (divide by 4), narrow to 8-bit, and combine. */
+ output_pixels.val[0] = vcombine_u8(vrshrn_n_u16(s1_add_3s0_l, 2),
+ vrshrn_n_u16(s1_add_3s0_h, 2));
+ output_pixels.val[1] = vcombine_u8(vshrn_n_u16(s0_add_3s1_l, 2),
+ vshrn_n_u16(s0_add_3s1_h, 2));
+ /* Store pixel component values to memory. */
+ vst2q_u8(outptr + outptr_offset, output_pixels);
+
+ /* Last pixel component value in this row of the original image */
+ outptr[2 * downsampled_width - 1] =
+ GETJSAMPLE(inptr[downsampled_width - 1]);
+ }
+}
+
+
+/* The diagram below shows an array of samples produced by h2v2 downsampling.
+ *
+ * s0 s1 s2
+ * +---------+---------+---------+
+ * | p0 p1 | p2 p3 | p4 p5 |
+ * sA | | | |
+ * | p6 p7 | p8 p9 | p10 p11|
+ * +---------+---------+---------+
+ * | p12 p13| p14 p15| p16 p17|
+ * sB | | | |
+ * | p18 p19| p20 p21| p22 p23|
+ * +---------+---------+---------+
+ * | p24 p25| p26 p27| p28 p29|
+ * sC | | | |
+ * | p30 p31| p32 p33| p34 p35|
+ * +---------+---------+---------+
+ *
+ * Samples s0A-s2C were created by averaging the original pixel component
+ * values centered at positions p0-p35 above. To approximate one of those
+ * original pixel component values, we proportionally blend the sample
+ * containing the pixel center with the nearest neighboring samples in each
+ * row, column, and diagonal.
+ *
+ * An upsampled pixel component value is computed by first blending the sample
+ * containing the pixel center with the nearest neighboring samples in the
+ * same column, in the ratio 3:1, and then blending each column sum with the
+ * nearest neighboring column sum, in the ratio 3:1. For example:
+ * p14(upsampled) = 3/4 * (3/4 * s1B + 1/4 * s1A) +
+ * 1/4 * (3/4 * s0B + 1/4 * s0A)
+ * = 9/16 * s1B + 3/16 * s1A + 3/16 * s0B + 1/16 * s0A
+ * When computing the first and last pixel component values in the row, there
+ * is no horizontally adjacent sample to blend, so:
+ * p12(upsampled) = 3/4 * s0B + 1/4 * s0A
+ * p23(upsampled) = 3/4 * s2B + 1/4 * s2C
+ * When computing the first and last pixel component values in the column,
+ * there is no vertically adjacent sample to blend, so:
+ * p2(upsampled) = 3/4 * s1A + 1/4 * s0A
+ * p33(upsampled) = 3/4 * s1C + 1/4 * s2C
+ * When computing the corner pixel component values, there is no adjacent
+ * sample to blend, so:
+ * p0(upsampled) = s0A
+ * p35(upsampled) = s2C
+ */
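+
+/* As a numeric check of the weights above, take hypothetical sample values
+ * s0A = 0, s0B = 16, s1A = 32, and s1B = 48:
+ *   p14 = 9/16 * 48 + 3/16 * 32 + 3/16 * 16 + 1/16 * 0 = 36
+ * The integer code below reaches the same value via column sums (the
+ * rounding bias is irrelevant here, since the sum divides exactly):
+ *   s0colsum = 3 * 16 + 0 = 48,  s1colsum = 3 * 48 + 32 = 176
+ *   p14 = (3 * s1colsum + s0colsum) >> 4 = 576 >> 4 = 36
+ */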
+
+void jsimd_h2v2_fancy_upsample_neon(int max_v_samp_factor,
+ JDIMENSION downsampled_width,
+ JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr)
+{
+ JSAMPARRAY output_data = *output_data_ptr;
+ JSAMPROW inptr0, inptr1, inptr2, outptr0, outptr1;
+ int inrow, outrow;
+ unsigned colctr;
+ /* Set up constants. */
+ const uint16x8_t seven_u16 = vdupq_n_u16(7);
+ const uint8x8_t three_u8 = vdup_n_u8(3);
+ const uint16x8_t three_u16 = vdupq_n_u16(3);
+
+ inrow = outrow = 0;
+ while (outrow < max_v_samp_factor) {
+ inptr0 = input_data[inrow - 1];
+ inptr1 = input_data[inrow];
+ inptr2 = input_data[inrow + 1];
+ /* Suffixes 0 and 1 denote the upper and lower rows of output pixels,
+ * respectively.
+ */
+ outptr0 = output_data[outrow++];
+ outptr1 = output_data[outrow++];
+
+ /* First pixel component value in this row of the original image */
+ int s0colsum0 = GETJSAMPLE(*inptr1) * 3 + GETJSAMPLE(*inptr0);
+ *outptr0 = (JSAMPLE)((s0colsum0 * 4 + 8) >> 4);
+ int s0colsum1 = GETJSAMPLE(*inptr1) * 3 + GETJSAMPLE(*inptr2);
+ *outptr1 = (JSAMPLE)((s0colsum1 * 4 + 8) >> 4);
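+
+ /* The first output pixel has no horizontal neighbor, so only the vertical
+ * blend applies, i.e. colsum / 4. Writing this as (colsum * 4 + bias) >> 4
+ * keeps the bias terms on the same 1/16 scale as the two-stage blends used
+ * for the interior pixels: 8 rounds to nearest for this even-indexed
+ * output, while the odd-indexed output at the end of the row uses the
+ * ordered dithering bias of 7.
+ */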
+
+ /* Step 1: Blend samples vertically in columns s0 and s1.
+ * Leave the divide by 4 until the end, when it can be done for both
+ * dimensions at once, right-shifting by 4.
+ */
+
+ /* Load and compute s0colsum0 and s0colsum1. */
+ uint8x16_t s0A = vld1q_u8(inptr0);
+ uint8x16_t s0B = vld1q_u8(inptr1);
+ uint8x16_t s0C = vld1q_u8(inptr2);
+ /* Multiplication makes vectors twice as wide. '_l' and '_h' suffixes
+ * denote low half and high half respectively.
+ */
+ uint16x8_t s0colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s0A)),
+ vget_low_u8(s0B), three_u8);
+ uint16x8_t s0colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s0A)),
+ vget_high_u8(s0B), three_u8);
+ uint16x8_t s0colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0C)),
+ vget_low_u8(s0B), three_u8);
+ uint16x8_t s0colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0C)),
+ vget_high_u8(s0B), three_u8);
+ /* Load and compute s1colsum0 and s1colsum1. */
+ uint8x16_t s1A = vld1q_u8(inptr0 + 1);
+ uint8x16_t s1B = vld1q_u8(inptr1 + 1);
+ uint8x16_t s1C = vld1q_u8(inptr2 + 1);
+ uint16x8_t s1colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1A)),
+ vget_low_u8(s1B), three_u8);
+ uint16x8_t s1colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1A)),
+ vget_high_u8(s1B), three_u8);
+ uint16x8_t s1colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s1C)),
+ vget_low_u8(s1B), three_u8);
+ uint16x8_t s1colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s1C)),
+ vget_high_u8(s1B), three_u8);
+
+ /* Step 2: Blend the already-blended columns. */
+
+ uint16x8_t output0_p1_l = vmlaq_u16(s1colsum0_l, s0colsum0_l, three_u16);
+ uint16x8_t output0_p1_h = vmlaq_u16(s1colsum0_h, s0colsum0_h, three_u16);
+ uint16x8_t output0_p2_l = vmlaq_u16(s0colsum0_l, s1colsum0_l, three_u16);
+ uint16x8_t output0_p2_h = vmlaq_u16(s0colsum0_h, s1colsum0_h, three_u16);
+ uint16x8_t output1_p1_l = vmlaq_u16(s1colsum1_l, s0colsum1_l, three_u16);
+ uint16x8_t output1_p1_h = vmlaq_u16(s1colsum1_h, s0colsum1_h, three_u16);
+ uint16x8_t output1_p2_l = vmlaq_u16(s0colsum1_l, s1colsum1_l, three_u16);
+ uint16x8_t output1_p2_h = vmlaq_u16(s0colsum1_h, s1colsum1_h, three_u16);
+ /* Add ordered dithering bias to odd pixel values. */
+ output0_p1_l = vaddq_u16(output0_p1_l, seven_u16);
+ output0_p1_h = vaddq_u16(output0_p1_h, seven_u16);
+ output1_p1_l = vaddq_u16(output1_p1_l, seven_u16);
+ output1_p1_h = vaddq_u16(output1_p1_h, seven_u16);
+ /* Right-shift by 4 (divide by 16), narrow to 8-bit, and combine. */
+ uint8x16x2_t output_pixels0 = { {
+ vcombine_u8(vshrn_n_u16(output0_p1_l, 4), vshrn_n_u16(output0_p1_h, 4)),
+ vcombine_u8(vrshrn_n_u16(output0_p2_l, 4), vrshrn_n_u16(output0_p2_h, 4))
+ } };
+ uint8x16x2_t output_pixels1 = { {
+ vcombine_u8(vshrn_n_u16(output1_p1_l, 4), vshrn_n_u16(output1_p1_h, 4)),
+ vcombine_u8(vrshrn_n_u16(output1_p2_l, 4), vrshrn_n_u16(output1_p2_h, 4))
+ } };
+
+ /* Store pixel component values to memory.
+ * The minimum size of the output buffer for each row is 64 bytes => no
+ * need to worry about buffer overflow here. See "Creation of 2-D sample
+ * arrays" in jmemmgr.c for more details.
+ */
+ vst2q_u8(outptr0 + 1, output_pixels0);
+ vst2q_u8(outptr1 + 1, output_pixels1);
+
+ /* The first pixel of the image shifted our loads and stores by one byte.
+ * We have to re-align on a 32-byte boundary at some point before the end
+ * of the row (we do it now on the 32/33 pixel boundary) to stay within the
+ * bounds of the sample buffers without having to resort to a slow scalar
+ * tail case for the last (downsampled_width % 16) samples. See "Creation
+ * of 2-D sample arrays" in jmemmgr.c for more details.
+ */
+ for (colctr = 16; colctr < downsampled_width; colctr += 16) {
+ /* Step 1: Blend samples vertically in columns s0 and s1. */
+
+ /* Load and compute s0colsum0 and s0colsum1. */
+ s0A = vld1q_u8(inptr0 + colctr - 1);
+ s0B = vld1q_u8(inptr1 + colctr - 1);
+ s0C = vld1q_u8(inptr2 + colctr - 1);
+ s0colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s0A)), vget_low_u8(s0B),
+ three_u8);
+ s0colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s0A)), vget_high_u8(s0B),
+ three_u8);
+ s0colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0C)), vget_low_u8(s0B),
+ three_u8);
+ s0colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0C)), vget_high_u8(s0B),
+ three_u8);
+ /* Load and compute s1colsum0 and s1colsum1. */
+ s1A = vld1q_u8(inptr0 + colctr);
+ s1B = vld1q_u8(inptr1 + colctr);
+ s1C = vld1q_u8(inptr2 + colctr);
+ s1colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1A)), vget_low_u8(s1B),
+ three_u8);
+ s1colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1A)), vget_high_u8(s1B),
+ three_u8);
+ s1colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s1C)), vget_low_u8(s1B),
+ three_u8);
+ s1colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s1C)), vget_high_u8(s1B),
+ three_u8);
+
+ /* Step 2: Blend the already-blended columns. */
+
+ output0_p1_l = vmlaq_u16(s1colsum0_l, s0colsum0_l, three_u16);
+ output0_p1_h = vmlaq_u16(s1colsum0_h, s0colsum0_h, three_u16);
+ output0_p2_l = vmlaq_u16(s0colsum0_l, s1colsum0_l, three_u16);
+ output0_p2_h = vmlaq_u16(s0colsum0_h, s1colsum0_h, three_u16);
+ output1_p1_l = vmlaq_u16(s1colsum1_l, s0colsum1_l, three_u16);
+ output1_p1_h = vmlaq_u16(s1colsum1_h, s0colsum1_h, three_u16);
+ output1_p2_l = vmlaq_u16(s0colsum1_l, s1colsum1_l, three_u16);
+ output1_p2_h = vmlaq_u16(s0colsum1_h, s1colsum1_h, three_u16);
+ /* Add ordered dithering bias to odd pixel values. */
+ output0_p1_l = vaddq_u16(output0_p1_l, seven_u16);
+ output0_p1_h = vaddq_u16(output0_p1_h, seven_u16);
+ output1_p1_l = vaddq_u16(output1_p1_l, seven_u16);
+ output1_p1_h = vaddq_u16(output1_p1_h, seven_u16);
+ /* Right-shift by 4 (divide by 16), narrow to 8-bit, and combine. */
+ output_pixels0.val[0] = vcombine_u8(vshrn_n_u16(output0_p1_l, 4),
+ vshrn_n_u16(output0_p1_h, 4));
+ output_pixels0.val[1] = vcombine_u8(vrshrn_n_u16(output0_p2_l, 4),
+ vrshrn_n_u16(output0_p2_h, 4));
+ output_pixels1.val[0] = vcombine_u8(vshrn_n_u16(output1_p1_l, 4),
+ vshrn_n_u16(output1_p1_h, 4));
+ output_pixels1.val[1] = vcombine_u8(vrshrn_n_u16(output1_p2_l, 4),
+ vrshrn_n_u16(output1_p2_h, 4));
+ /* Store pixel component values to memory. */
+ vst2q_u8(outptr0 + 2 * colctr - 1, output_pixels0);
+ vst2q_u8(outptr1 + 2 * colctr - 1, output_pixels1);
+ }
+
+ /* Last pixel component value in this row of the original image */
+ int s1colsum0 = GETJSAMPLE(inptr1[downsampled_width - 1]) * 3 +
+ GETJSAMPLE(inptr0[downsampled_width - 1]);
+ outptr0[2 * downsampled_width - 1] = (JSAMPLE)((s1colsum0 * 4 + 7) >> 4);
+ int s1colsum1 = GETJSAMPLE(inptr1[downsampled_width - 1]) * 3 +
+ GETJSAMPLE(inptr2[downsampled_width - 1]);
+ outptr1[2 * downsampled_width - 1] = (JSAMPLE)((s1colsum1 * 4 + 7) >> 4);
+ inrow++;
+ }
+}
+
+
+/* The diagram below shows a column of samples produced by h1v2 downsampling
+ * (or by losslessly rotating or transposing an h2v1-downsampled image.)
+ *
+ * +---------+
+ * | p0 |
+ * sA | |
+ * | p1 |
+ * +---------+
+ * | p2 |
+ * sB | |
+ * | p3 |
+ * +---------+
+ * | p4 |
+ * sC | |
+ * | p5 |
+ * +---------+
+ *
+ * Samples sA-sC were created by averaging the original pixel component values
+ * centered at positions p0-p5 above. To approximate those original pixel
+ * component values, we proportionally blend the adjacent samples in each
+ * column.
+ *
+ * An upsampled pixel component value is computed by blending the sample
+ * containing the pixel center with the nearest neighboring sample, in the
+ * ratio 3:1. For example:
+ * p1(upsampled) = 3/4 * sA + 1/4 * sB
+ * p2(upsampled) = 3/4 * sB + 1/4 * sA
+ * When computing the first and last pixel component values in the column,
+ * there is no adjacent sample to blend, so:
+ * p0(upsampled) = sA
+ * p5(upsampled) = sC
+ */
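+
+/* For example, with hypothetical sample values sA = 10, sB = 20, sC = 30,
+ * the function below produces, for the output row pair centered on sB:
+ *   upper row: (3 * 20 + 10 + 1) >> 2 = 17   (exact value 17.5, biased down)
+ *   lower row: (3 * 20 + 30 + 2) >> 2 = 23   (exact value 22.5, rounded up)
+ * The complementary +1/+2 rounding terms implement the ordered dithering
+ * bias between even and odd output rows.
+ */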
+
+void jsimd_h1v2_fancy_upsample_neon(int max_v_samp_factor,
+ JDIMENSION downsampled_width,
+ JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr)
+{
+ JSAMPARRAY output_data = *output_data_ptr;
+ JSAMPROW inptr0, inptr1, inptr2, outptr0, outptr1;
+ int inrow, outrow;
+ unsigned colctr;
+ /* Set up constants. */
+ const uint16x8_t one_u16 = vdupq_n_u16(1);
+ const uint8x8_t three_u8 = vdup_n_u8(3);
+
+ inrow = outrow = 0;
+ while (outrow < max_v_samp_factor) {
+ inptr0 = input_data[inrow - 1];
+ inptr1 = input_data[inrow];
+ inptr2 = input_data[inrow + 1];
+ /* Suffixes 0 and 1 denote the upper and lower rows of output pixels,
+ * respectively.
+ */
+ outptr0 = output_data[outrow++];
+ outptr1 = output_data[outrow++];
+ inrow++;
+
+ /* The size of the input and output buffers is always a multiple of 32
+ * bytes => no need to worry about buffer overflow when reading/writing
+ * memory. See "Creation of 2-D sample arrays" in jmemmgr.c for more
+ * details.
+ */
+ for (colctr = 0; colctr < downsampled_width; colctr += 16) {
+ /* Load samples. */
+ uint8x16_t sA = vld1q_u8(inptr0 + colctr);
+ uint8x16_t sB = vld1q_u8(inptr1 + colctr);
+ uint8x16_t sC = vld1q_u8(inptr2 + colctr);
+ /* Blend samples vertically. */
+ uint16x8_t colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(sA)),
+ vget_low_u8(sB), three_u8);
+ uint16x8_t colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(sA)),
+ vget_high_u8(sB), three_u8);
+ uint16x8_t colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(sC)),
+ vget_low_u8(sB), three_u8);
+ uint16x8_t colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(sC)),
+ vget_high_u8(sB), three_u8);
+ /* Add ordered dithering bias to pixel values in even output rows. */
+ colsum0_l = vaddq_u16(colsum0_l, one_u16);
+ colsum0_h = vaddq_u16(colsum0_h, one_u16);
+ /* Right-shift by 2 (divide by 4), narrow to 8-bit, and combine. */
+ uint8x16_t output_pixels0 = vcombine_u8(vshrn_n_u16(colsum0_l, 2),
+ vshrn_n_u16(colsum0_h, 2));
+ uint8x16_t output_pixels1 = vcombine_u8(vrshrn_n_u16(colsum1_l, 2),
+ vrshrn_n_u16(colsum1_h, 2));
+ /* Store pixel component values to memory. */
+ vst1q_u8(outptr0 + colctr, output_pixels0);
+ vst1q_u8(outptr1 + colctr, output_pixels1);
+ }
+ }
+}
+
+
+/* The diagram below shows a row of samples produced by h2v1 downsampling.
+ *
+ * s0 s1
+ * +---------+---------+
+ * | | |
+ * | p0 p1 | p2 p3 |
+ * | | |
+ * +---------+---------+
+ *
+ * Samples s0 and s1 were created by averaging the original pixel component
+ * values centered at positions p0-p3 above. To approximate those original
+ * pixel component values, we duplicate the samples horizontally:
+ * p0(upsampled) = p1(upsampled) = s0
+ * p2(upsampled) = p3(upsampled) = s1
+ */
+
+void jsimd_h2v1_upsample_neon(int max_v_samp_factor, JDIMENSION output_width,
+ JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr)
+{
+ JSAMPARRAY output_data = *output_data_ptr;
+ JSAMPROW inptr, outptr;
+ int inrow;
+ unsigned colctr;
+
+ for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
+ inptr = input_data[inrow];
+ outptr = output_data[inrow];
+ for (colctr = 0; 2 * colctr < output_width; colctr += 16) {
+ uint8x16_t samples = vld1q_u8(inptr + colctr);
+ /* Duplicate the samples. The store operation below interleaves them so
+ * that adjacent pixel component values take on the same sample value,
+ * per above.
+ */
+ uint8x16x2_t output_pixels = { { samples, samples } };
+ /* Store pixel component values to memory.
+ * Due to the way sample buffers are allocated, we don't need to worry
+ * about tail cases when output_width is not a multiple of 32. See
+ * "Creation of 2-D sample arrays" in jmemmgr.c for details.
+ */
+ vst2q_u8(outptr + 2 * colctr, output_pixels);
+ }
+ }
+}
+
+
+/* The diagram below shows an array of samples produced by h2v2 downsampling.
+ *
+ * s0 s1
+ * +---------+---------+
+ * | p0 p1 | p2 p3 |
+ * sA | | |
+ * | p4 p5 | p6 p7 |
+ * +---------+---------+
+ * | p8 p9 | p10 p11|
+ * sB | | |
+ * | p12 p13| p14 p15|
+ * +---------+---------+
+ *
+ * Samples s0A-s1B were created by averaging the original pixel component
+ * values centered at positions p0-p15 above. To approximate those original
+ * pixel component values, we duplicate the samples both horizontally and
+ * vertically:
+ * p0(upsampled) = p1(upsampled) = p4(upsampled) = p5(upsampled) = s0A
+ * p2(upsampled) = p3(upsampled) = p6(upsampled) = p7(upsampled) = s1A
+ * p8(upsampled) = p9(upsampled) = p12(upsampled) = p13(upsampled) = s0B
+ * p10(upsampled) = p11(upsampled) = p14(upsampled) = p15(upsampled) = s1B
+ */
+
+void jsimd_h2v2_upsample_neon(int max_v_samp_factor, JDIMENSION output_width,
+ JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr)
+{
+ JSAMPARRAY output_data = *output_data_ptr;
+ JSAMPROW inptr, outptr0, outptr1;
+ int inrow, outrow;
+ unsigned colctr;
+
+ for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {
+ inptr = input_data[inrow];
+ outptr0 = output_data[outrow++];
+ outptr1 = output_data[outrow++];
+
+ for (colctr = 0; 2 * colctr < output_width; colctr += 16) {
+ uint8x16_t samples = vld1q_u8(inptr + colctr);
+ /* Duplicate the samples. The store operation below interleaves them so
+ * that adjacent pixel component values take on the same sample value,
+ * per above.
+ */
+ uint8x16x2_t output_pixels = { { samples, samples } };
+ /* Store pixel component values for both output rows to memory.
+ * Due to the way sample buffers are allocated, we don't need to worry
+ * about tail cases when output_width is not a multiple of 32. See
+ * "Creation of 2-D sample arrays" in jmemmgr.c for details.
+ */
+ vst2q_u8(outptr0 + 2 * colctr, output_pixels);
+ vst2q_u8(outptr1 + 2 * colctr, output_pixels);
+ }
+ }
+}
diff --git a/media/libjpeg/simd/arm/jfdctfst-neon.c b/media/libjpeg/simd/arm/jfdctfst-neon.c
new file mode 100644
index 0000000000..bb371be399
--- /dev/null
+++ b/media/libjpeg/simd/arm/jfdctfst-neon.c
@@ -0,0 +1,214 @@
+/*
+ * jfdctfst-neon.c - fast integer FDCT (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+
+#include <arm_neon.h>
+
+
+/* jsimd_fdct_ifast_neon() performs a fast, not so accurate forward DCT
+ * (Discrete Cosine Transform) on one block of samples. It uses the same
+ * calculations and produces exactly the same output as IJG's original
+ * jpeg_fdct_ifast() function, which can be found in jfdctfst.c.
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ * 0.382683433 = 12544 * 2^-15
+ * 0.541196100 = 17795 * 2^-15
+ * 0.707106781 = 23168 * 2^-15
+ * 0.306562965 = 9984 * 2^-15
+ *
+ * See jfdctfst.c for further details of the DCT algorithm. Where possible,
+ * the variable names and comments here in jsimd_fdct_ifast_neon() match up
+ * with those in jpeg_fdct_ifast().
+ */
+
+#define F_0_382 12544
+#define F_0_541 17792
+#define F_0_707 23168
+#define F_0_306 9984
+
+
+ALIGN(16) static const int16_t jsimd_fdct_ifast_neon_consts[] = {
+ F_0_382, F_0_541, F_0_707, F_0_306
+};
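+
+/* vqdmulhq_lane_s16(a, consts, i) computes (2 * a * consts[i]) >> 16, so a
+ * constant k stored as roughly k * 2^15 turns the operation into a * k.
+ * For example, MULTIPLY(x, FIX_0_707106781) in jfdctfst.c maps onto
+ * vqdmulhq_lane_s16(x, consts, 2) here, since consts[2] = F_0_707 = 23168
+ * and 23168 * 2^-15 ~= 0.707106781.
+ */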
+
+void jsimd_fdct_ifast_neon(DCTELEM *data)
+{
+ /* Load an 8x8 block of samples into Neon registers. De-interleaving loads
+ * are used, followed by vuzp to transpose the block such that we have a
+ * column of samples per vector - allowing all rows to be processed at once.
+ */
+ int16x8x4_t data1 = vld4q_s16(data);
+ int16x8x4_t data2 = vld4q_s16(data + 4 * DCTSIZE);
+
+ int16x8x2_t cols_04 = vuzpq_s16(data1.val[0], data2.val[0]);
+ int16x8x2_t cols_15 = vuzpq_s16(data1.val[1], data2.val[1]);
+ int16x8x2_t cols_26 = vuzpq_s16(data1.val[2], data2.val[2]);
+ int16x8x2_t cols_37 = vuzpq_s16(data1.val[3], data2.val[3]);
+
+ int16x8_t col0 = cols_04.val[0];
+ int16x8_t col1 = cols_15.val[0];
+ int16x8_t col2 = cols_26.val[0];
+ int16x8_t col3 = cols_37.val[0];
+ int16x8_t col4 = cols_04.val[1];
+ int16x8_t col5 = cols_15.val[1];
+ int16x8_t col6 = cols_26.val[1];
+ int16x8_t col7 = cols_37.val[1];
+
+ /* Pass 1: process rows. */
+
+ /* Load DCT conversion constants. */
+ const int16x4_t consts = vld1_s16(jsimd_fdct_ifast_neon_consts);
+
+ int16x8_t tmp0 = vaddq_s16(col0, col7);
+ int16x8_t tmp7 = vsubq_s16(col0, col7);
+ int16x8_t tmp1 = vaddq_s16(col1, col6);
+ int16x8_t tmp6 = vsubq_s16(col1, col6);
+ int16x8_t tmp2 = vaddq_s16(col2, col5);
+ int16x8_t tmp5 = vsubq_s16(col2, col5);
+ int16x8_t tmp3 = vaddq_s16(col3, col4);
+ int16x8_t tmp4 = vsubq_s16(col3, col4);
+
+ /* Even part */
+ int16x8_t tmp10 = vaddq_s16(tmp0, tmp3); /* phase 2 */
+ int16x8_t tmp13 = vsubq_s16(tmp0, tmp3);
+ int16x8_t tmp11 = vaddq_s16(tmp1, tmp2);
+ int16x8_t tmp12 = vsubq_s16(tmp1, tmp2);
+
+ col0 = vaddq_s16(tmp10, tmp11); /* phase 3 */
+ col4 = vsubq_s16(tmp10, tmp11);
+
+ int16x8_t z1 = vqdmulhq_lane_s16(vaddq_s16(tmp12, tmp13), consts, 2);
+ col2 = vaddq_s16(tmp13, z1); /* phase 5 */
+ col6 = vsubq_s16(tmp13, z1);
+
+ /* Odd part */
+ tmp10 = vaddq_s16(tmp4, tmp5); /* phase 2 */
+ tmp11 = vaddq_s16(tmp5, tmp6);
+ tmp12 = vaddq_s16(tmp6, tmp7);
+
+ int16x8_t z5 = vqdmulhq_lane_s16(vsubq_s16(tmp10, tmp12), consts, 0);
+ int16x8_t z2 = vqdmulhq_lane_s16(tmp10, consts, 1);
+ z2 = vaddq_s16(z2, z5);
+ int16x8_t z4 = vqdmulhq_lane_s16(tmp12, consts, 3);
+ z5 = vaddq_s16(tmp12, z5);
+ z4 = vaddq_s16(z4, z5);
+ int16x8_t z3 = vqdmulhq_lane_s16(tmp11, consts, 2);
+
+ int16x8_t z11 = vaddq_s16(tmp7, z3); /* phase 5 */
+ int16x8_t z13 = vsubq_s16(tmp7, z3);
+
+ col5 = vaddq_s16(z13, z2); /* phase 6 */
+ col3 = vsubq_s16(z13, z2);
+ col1 = vaddq_s16(z11, z4);
+ col7 = vsubq_s16(z11, z4);
+
+ /* Transpose to work on columns in pass 2. */
+ int16x8x2_t cols_01 = vtrnq_s16(col0, col1);
+ int16x8x2_t cols_23 = vtrnq_s16(col2, col3);
+ int16x8x2_t cols_45 = vtrnq_s16(col4, col5);
+ int16x8x2_t cols_67 = vtrnq_s16(col6, col7);
+
+ int32x4x2_t cols_0145_l = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[0]),
+ vreinterpretq_s32_s16(cols_45.val[0]));
+ int32x4x2_t cols_0145_h = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[1]),
+ vreinterpretq_s32_s16(cols_45.val[1]));
+ int32x4x2_t cols_2367_l = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[0]),
+ vreinterpretq_s32_s16(cols_67.val[0]));
+ int32x4x2_t cols_2367_h = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[1]),
+ vreinterpretq_s32_s16(cols_67.val[1]));
+
+ int32x4x2_t rows_04 = vzipq_s32(cols_0145_l.val[0], cols_2367_l.val[0]);
+ int32x4x2_t rows_15 = vzipq_s32(cols_0145_h.val[0], cols_2367_h.val[0]);
+ int32x4x2_t rows_26 = vzipq_s32(cols_0145_l.val[1], cols_2367_l.val[1]);
+ int32x4x2_t rows_37 = vzipq_s32(cols_0145_h.val[1], cols_2367_h.val[1]);
+
+ int16x8_t row0 = vreinterpretq_s16_s32(rows_04.val[0]);
+ int16x8_t row1 = vreinterpretq_s16_s32(rows_15.val[0]);
+ int16x8_t row2 = vreinterpretq_s16_s32(rows_26.val[0]);
+ int16x8_t row3 = vreinterpretq_s16_s32(rows_37.val[0]);
+ int16x8_t row4 = vreinterpretq_s16_s32(rows_04.val[1]);
+ int16x8_t row5 = vreinterpretq_s16_s32(rows_15.val[1]);
+ int16x8_t row6 = vreinterpretq_s16_s32(rows_26.val[1]);
+ int16x8_t row7 = vreinterpretq_s16_s32(rows_37.val[1]);
+
+ /* Pass 2: process columns. */
+
+ tmp0 = vaddq_s16(row0, row7);
+ tmp7 = vsubq_s16(row0, row7);
+ tmp1 = vaddq_s16(row1, row6);
+ tmp6 = vsubq_s16(row1, row6);
+ tmp2 = vaddq_s16(row2, row5);
+ tmp5 = vsubq_s16(row2, row5);
+ tmp3 = vaddq_s16(row3, row4);
+ tmp4 = vsubq_s16(row3, row4);
+
+ /* Even part */
+ tmp10 = vaddq_s16(tmp0, tmp3); /* phase 2 */
+ tmp13 = vsubq_s16(tmp0, tmp3);
+ tmp11 = vaddq_s16(tmp1, tmp2);
+ tmp12 = vsubq_s16(tmp1, tmp2);
+
+ row0 = vaddq_s16(tmp10, tmp11); /* phase 3 */
+ row4 = vsubq_s16(tmp10, tmp11);
+
+ z1 = vqdmulhq_lane_s16(vaddq_s16(tmp12, tmp13), consts, 2);
+ row2 = vaddq_s16(tmp13, z1); /* phase 5 */
+ row6 = vsubq_s16(tmp13, z1);
+
+ /* Odd part */
+ tmp10 = vaddq_s16(tmp4, tmp5); /* phase 2 */
+ tmp11 = vaddq_s16(tmp5, tmp6);
+ tmp12 = vaddq_s16(tmp6, tmp7);
+
+ z5 = vqdmulhq_lane_s16(vsubq_s16(tmp10, tmp12), consts, 0);
+ z2 = vqdmulhq_lane_s16(tmp10, consts, 1);
+ z2 = vaddq_s16(z2, z5);
+ z4 = vqdmulhq_lane_s16(tmp12, consts, 3);
+ z5 = vaddq_s16(tmp12, z5);
+ z4 = vaddq_s16(z4, z5);
+ z3 = vqdmulhq_lane_s16(tmp11, consts, 2);
+
+ z11 = vaddq_s16(tmp7, z3); /* phase 5 */
+ z13 = vsubq_s16(tmp7, z3);
+
+ row5 = vaddq_s16(z13, z2); /* phase 6 */
+ row3 = vsubq_s16(z13, z2);
+ row1 = vaddq_s16(z11, z4);
+ row7 = vsubq_s16(z11, z4);
+
+ vst1q_s16(data + 0 * DCTSIZE, row0);
+ vst1q_s16(data + 1 * DCTSIZE, row1);
+ vst1q_s16(data + 2 * DCTSIZE, row2);
+ vst1q_s16(data + 3 * DCTSIZE, row3);
+ vst1q_s16(data + 4 * DCTSIZE, row4);
+ vst1q_s16(data + 5 * DCTSIZE, row5);
+ vst1q_s16(data + 6 * DCTSIZE, row6);
+ vst1q_s16(data + 7 * DCTSIZE, row7);
+}
diff --git a/media/libjpeg/simd/arm/jfdctint-neon.c b/media/libjpeg/simd/arm/jfdctint-neon.c
new file mode 100644
index 0000000000..ccfc07b15d
--- /dev/null
+++ b/media/libjpeg/simd/arm/jfdctint-neon.c
@@ -0,0 +1,376 @@
+/*
+ * jfdctint-neon.c - accurate integer FDCT (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+#include "neon-compat.h"
+
+#include <arm_neon.h>
+
+
+/* jsimd_fdct_islow_neon() performs a slower but more accurate forward DCT
+ * (Discrete Cosine Transform) on one block of samples. It uses the same
+ * calculations and produces exactly the same output as IJG's original
+ * jpeg_fdct_islow() function, which can be found in jfdctint.c.
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ * 0.298631336 = 2446 * 2^-13
+ * 0.390180644 = 3196 * 2^-13
+ * 0.541196100 = 4433 * 2^-13
+ * 0.765366865 = 6270 * 2^-13
+ * 0.899976223 = 7373 * 2^-13
+ * 1.175875602 = 9633 * 2^-13
+ * 1.501321110 = 12299 * 2^-13
+ * 1.847759065 = 15137 * 2^-13
+ * 1.961570560 = 16069 * 2^-13
+ * 2.053119869 = 16819 * 2^-13
+ * 2.562915447 = 20995 * 2^-13
+ * 3.072711026 = 25172 * 2^-13
+ *
+ * See jfdctint.c for further details of the DCT algorithm. Where possible,
+ * the variable names and comments here in jsimd_fdct_islow_neon() match up
+ * with those in jpeg_fdct_islow().
+ */
+
+#define CONST_BITS 13
+#define PASS1_BITS 2
+
+#define DESCALE_P1 (CONST_BITS - PASS1_BITS)
+#define DESCALE_P2 (CONST_BITS + PASS1_BITS)
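+
+/* The F_* multipliers below carry CONST_BITS = 13 fraction bits. Pass 1
+ * descales by CONST_BITS - PASS1_BITS, leaving PASS1_BITS = 2 extra bits of
+ * precision in the intermediate results; pass 2 descales by
+ * CONST_BITS + PASS1_BITS to remove them again, as in jpeg_fdct_islow().
+ */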
+
+#define F_0_298 2446
+#define F_0_390 3196
+#define F_0_541 4433
+#define F_0_765 6270
+#define F_0_899 7373
+#define F_1_175 9633
+#define F_1_501 12299
+#define F_1_847 15137
+#define F_1_961 16069
+#define F_2_053 16819
+#define F_2_562 20995
+#define F_3_072 25172
+
+
+ALIGN(16) static const int16_t jsimd_fdct_islow_neon_consts[] = {
+ F_0_298, -F_0_390, F_0_541, F_0_765,
+ -F_0_899, F_1_175, F_1_501, -F_1_847,
+ -F_1_961, F_2_053, -F_2_562, F_3_072
+};
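+
+/* Several constants are stored negated so that the subtractions in
+ * jpeg_fdct_islow() become multiply-accumulates with a negative multiplier.
+ * For example, the term MULTIPLY(tmp12, -FIX_1_847759065) used to compute
+ * dataptr[6] becomes a vmlal_lane_s16() with consts.val[1] lane 3, which
+ * holds -F_1_847.
+ */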
+
+void jsimd_fdct_islow_neon(DCTELEM *data)
+{
+ /* Load DCT constants. */
+#ifdef HAVE_VLD1_S16_X3
+ const int16x4x3_t consts = vld1_s16_x3(jsimd_fdct_islow_neon_consts);
+#else
+ /* GCC does not currently support the intrinsic vld1_<type>_x3(). */
+ const int16x4_t consts1 = vld1_s16(jsimd_fdct_islow_neon_consts);
+ const int16x4_t consts2 = vld1_s16(jsimd_fdct_islow_neon_consts + 4);
+ const int16x4_t consts3 = vld1_s16(jsimd_fdct_islow_neon_consts + 8);
+ const int16x4x3_t consts = { { consts1, consts2, consts3 } };
+#endif
+
+ /* Load an 8x8 block of samples into Neon registers. De-interleaving loads
+ * are used, followed by vuzp to transpose the block such that we have a
+ * column of samples per vector - allowing all rows to be processed at once.
+ */
+ int16x8x4_t s_rows_0123 = vld4q_s16(data);
+ int16x8x4_t s_rows_4567 = vld4q_s16(data + 4 * DCTSIZE);
+
+ int16x8x2_t cols_04 = vuzpq_s16(s_rows_0123.val[0], s_rows_4567.val[0]);
+ int16x8x2_t cols_15 = vuzpq_s16(s_rows_0123.val[1], s_rows_4567.val[1]);
+ int16x8x2_t cols_26 = vuzpq_s16(s_rows_0123.val[2], s_rows_4567.val[2]);
+ int16x8x2_t cols_37 = vuzpq_s16(s_rows_0123.val[3], s_rows_4567.val[3]);
+
+ int16x8_t col0 = cols_04.val[0];
+ int16x8_t col1 = cols_15.val[0];
+ int16x8_t col2 = cols_26.val[0];
+ int16x8_t col3 = cols_37.val[0];
+ int16x8_t col4 = cols_04.val[1];
+ int16x8_t col5 = cols_15.val[1];
+ int16x8_t col6 = cols_26.val[1];
+ int16x8_t col7 = cols_37.val[1];
+
+ /* Pass 1: process rows. */
+
+ int16x8_t tmp0 = vaddq_s16(col0, col7);
+ int16x8_t tmp7 = vsubq_s16(col0, col7);
+ int16x8_t tmp1 = vaddq_s16(col1, col6);
+ int16x8_t tmp6 = vsubq_s16(col1, col6);
+ int16x8_t tmp2 = vaddq_s16(col2, col5);
+ int16x8_t tmp5 = vsubq_s16(col2, col5);
+ int16x8_t tmp3 = vaddq_s16(col3, col4);
+ int16x8_t tmp4 = vsubq_s16(col3, col4);
+
+ /* Even part */
+ int16x8_t tmp10 = vaddq_s16(tmp0, tmp3);
+ int16x8_t tmp13 = vsubq_s16(tmp0, tmp3);
+ int16x8_t tmp11 = vaddq_s16(tmp1, tmp2);
+ int16x8_t tmp12 = vsubq_s16(tmp1, tmp2);
+
+ col0 = vshlq_n_s16(vaddq_s16(tmp10, tmp11), PASS1_BITS);
+ col4 = vshlq_n_s16(vsubq_s16(tmp10, tmp11), PASS1_BITS);
+
+ int16x8_t tmp12_add_tmp13 = vaddq_s16(tmp12, tmp13);
+ int32x4_t z1_l =
+ vmull_lane_s16(vget_low_s16(tmp12_add_tmp13), consts.val[0], 2);
+ int32x4_t z1_h =
+ vmull_lane_s16(vget_high_s16(tmp12_add_tmp13), consts.val[0], 2);
+
+ int32x4_t col2_scaled_l =
+ vmlal_lane_s16(z1_l, vget_low_s16(tmp13), consts.val[0], 3);
+ int32x4_t col2_scaled_h =
+ vmlal_lane_s16(z1_h, vget_high_s16(tmp13), consts.val[0], 3);
+ col2 = vcombine_s16(vrshrn_n_s32(col2_scaled_l, DESCALE_P1),
+ vrshrn_n_s32(col2_scaled_h, DESCALE_P1));
+
+ int32x4_t col6_scaled_l =
+ vmlal_lane_s16(z1_l, vget_low_s16(tmp12), consts.val[1], 3);
+ int32x4_t col6_scaled_h =
+ vmlal_lane_s16(z1_h, vget_high_s16(tmp12), consts.val[1], 3);
+ col6 = vcombine_s16(vrshrn_n_s32(col6_scaled_l, DESCALE_P1),
+ vrshrn_n_s32(col6_scaled_h, DESCALE_P1));
+
+ /* Odd part */
+ int16x8_t z1 = vaddq_s16(tmp4, tmp7);
+ int16x8_t z2 = vaddq_s16(tmp5, tmp6);
+ int16x8_t z3 = vaddq_s16(tmp4, tmp6);
+ int16x8_t z4 = vaddq_s16(tmp5, tmp7);
+ /* sqrt(2) * c3 */
+ int32x4_t z5_l = vmull_lane_s16(vget_low_s16(z3), consts.val[1], 1);
+ int32x4_t z5_h = vmull_lane_s16(vget_high_s16(z3), consts.val[1], 1);
+ z5_l = vmlal_lane_s16(z5_l, vget_low_s16(z4), consts.val[1], 1);
+ z5_h = vmlal_lane_s16(z5_h, vget_high_s16(z4), consts.val[1], 1);
+
+ /* sqrt(2) * (-c1+c3+c5-c7) */
+ int32x4_t tmp4_l = vmull_lane_s16(vget_low_s16(tmp4), consts.val[0], 0);
+ int32x4_t tmp4_h = vmull_lane_s16(vget_high_s16(tmp4), consts.val[0], 0);
+ /* sqrt(2) * ( c1+c3-c5+c7) */
+ int32x4_t tmp5_l = vmull_lane_s16(vget_low_s16(tmp5), consts.val[2], 1);
+ int32x4_t tmp5_h = vmull_lane_s16(vget_high_s16(tmp5), consts.val[2], 1);
+ /* sqrt(2) * ( c1+c3+c5-c7) */
+ int32x4_t tmp6_l = vmull_lane_s16(vget_low_s16(tmp6), consts.val[2], 3);
+ int32x4_t tmp6_h = vmull_lane_s16(vget_high_s16(tmp6), consts.val[2], 3);
+ /* sqrt(2) * ( c1+c3-c5-c7) */
+ int32x4_t tmp7_l = vmull_lane_s16(vget_low_s16(tmp7), consts.val[1], 2);
+ int32x4_t tmp7_h = vmull_lane_s16(vget_high_s16(tmp7), consts.val[1], 2);
+
+ /* sqrt(2) * (c7-c3) */
+ z1_l = vmull_lane_s16(vget_low_s16(z1), consts.val[1], 0);
+ z1_h = vmull_lane_s16(vget_high_s16(z1), consts.val[1], 0);
+ /* sqrt(2) * (-c1-c3) */
+ int32x4_t z2_l = vmull_lane_s16(vget_low_s16(z2), consts.val[2], 2);
+ int32x4_t z2_h = vmull_lane_s16(vget_high_s16(z2), consts.val[2], 2);
+ /* sqrt(2) * (-c3-c5) */
+ int32x4_t z3_l = vmull_lane_s16(vget_low_s16(z3), consts.val[2], 0);
+ int32x4_t z3_h = vmull_lane_s16(vget_high_s16(z3), consts.val[2], 0);
+ /* sqrt(2) * (c5-c3) */
+ int32x4_t z4_l = vmull_lane_s16(vget_low_s16(z4), consts.val[0], 1);
+ int32x4_t z4_h = vmull_lane_s16(vget_high_s16(z4), consts.val[0], 1);
+
+ z3_l = vaddq_s32(z3_l, z5_l);
+ z3_h = vaddq_s32(z3_h, z5_h);
+ z4_l = vaddq_s32(z4_l, z5_l);
+ z4_h = vaddq_s32(z4_h, z5_h);
+
+ tmp4_l = vaddq_s32(tmp4_l, z1_l);
+ tmp4_h = vaddq_s32(tmp4_h, z1_h);
+ tmp4_l = vaddq_s32(tmp4_l, z3_l);
+ tmp4_h = vaddq_s32(tmp4_h, z3_h);
+ col7 = vcombine_s16(vrshrn_n_s32(tmp4_l, DESCALE_P1),
+ vrshrn_n_s32(tmp4_h, DESCALE_P1));
+
+ tmp5_l = vaddq_s32(tmp5_l, z2_l);
+ tmp5_h = vaddq_s32(tmp5_h, z2_h);
+ tmp5_l = vaddq_s32(tmp5_l, z4_l);
+ tmp5_h = vaddq_s32(tmp5_h, z4_h);
+ col5 = vcombine_s16(vrshrn_n_s32(tmp5_l, DESCALE_P1),
+ vrshrn_n_s32(tmp5_h, DESCALE_P1));
+
+ tmp6_l = vaddq_s32(tmp6_l, z2_l);
+ tmp6_h = vaddq_s32(tmp6_h, z2_h);
+ tmp6_l = vaddq_s32(tmp6_l, z3_l);
+ tmp6_h = vaddq_s32(tmp6_h, z3_h);
+ col3 = vcombine_s16(vrshrn_n_s32(tmp6_l, DESCALE_P1),
+ vrshrn_n_s32(tmp6_h, DESCALE_P1));
+
+ tmp7_l = vaddq_s32(tmp7_l, z1_l);
+ tmp7_h = vaddq_s32(tmp7_h, z1_h);
+ tmp7_l = vaddq_s32(tmp7_l, z4_l);
+ tmp7_h = vaddq_s32(tmp7_h, z4_h);
+ col1 = vcombine_s16(vrshrn_n_s32(tmp7_l, DESCALE_P1),
+ vrshrn_n_s32(tmp7_h, DESCALE_P1));
+
+ /* Transpose to work on columns in pass 2. */
+ int16x8x2_t cols_01 = vtrnq_s16(col0, col1);
+ int16x8x2_t cols_23 = vtrnq_s16(col2, col3);
+ int16x8x2_t cols_45 = vtrnq_s16(col4, col5);
+ int16x8x2_t cols_67 = vtrnq_s16(col6, col7);
+
+ int32x4x2_t cols_0145_l = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[0]),
+ vreinterpretq_s32_s16(cols_45.val[0]));
+ int32x4x2_t cols_0145_h = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[1]),
+ vreinterpretq_s32_s16(cols_45.val[1]));
+ int32x4x2_t cols_2367_l = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[0]),
+ vreinterpretq_s32_s16(cols_67.val[0]));
+ int32x4x2_t cols_2367_h = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[1]),
+ vreinterpretq_s32_s16(cols_67.val[1]));
+
+ int32x4x2_t rows_04 = vzipq_s32(cols_0145_l.val[0], cols_2367_l.val[0]);
+ int32x4x2_t rows_15 = vzipq_s32(cols_0145_h.val[0], cols_2367_h.val[0]);
+ int32x4x2_t rows_26 = vzipq_s32(cols_0145_l.val[1], cols_2367_l.val[1]);
+ int32x4x2_t rows_37 = vzipq_s32(cols_0145_h.val[1], cols_2367_h.val[1]);
+
+ int16x8_t row0 = vreinterpretq_s16_s32(rows_04.val[0]);
+ int16x8_t row1 = vreinterpretq_s16_s32(rows_15.val[0]);
+ int16x8_t row2 = vreinterpretq_s16_s32(rows_26.val[0]);
+ int16x8_t row3 = vreinterpretq_s16_s32(rows_37.val[0]);
+ int16x8_t row4 = vreinterpretq_s16_s32(rows_04.val[1]);
+ int16x8_t row5 = vreinterpretq_s16_s32(rows_15.val[1]);
+ int16x8_t row6 = vreinterpretq_s16_s32(rows_26.val[1]);
+ int16x8_t row7 = vreinterpretq_s16_s32(rows_37.val[1]);
+
+ /* Pass 2: process columns. */
+
+ tmp0 = vaddq_s16(row0, row7);
+ tmp7 = vsubq_s16(row0, row7);
+ tmp1 = vaddq_s16(row1, row6);
+ tmp6 = vsubq_s16(row1, row6);
+ tmp2 = vaddq_s16(row2, row5);
+ tmp5 = vsubq_s16(row2, row5);
+ tmp3 = vaddq_s16(row3, row4);
+ tmp4 = vsubq_s16(row3, row4);
+
+ /* Even part */
+ tmp10 = vaddq_s16(tmp0, tmp3);
+ tmp13 = vsubq_s16(tmp0, tmp3);
+ tmp11 = vaddq_s16(tmp1, tmp2);
+ tmp12 = vsubq_s16(tmp1, tmp2);
+
+ row0 = vrshrq_n_s16(vaddq_s16(tmp10, tmp11), PASS1_BITS);
+ row4 = vrshrq_n_s16(vsubq_s16(tmp10, tmp11), PASS1_BITS);
+
+ tmp12_add_tmp13 = vaddq_s16(tmp12, tmp13);
+ z1_l = vmull_lane_s16(vget_low_s16(tmp12_add_tmp13), consts.val[0], 2);
+ z1_h = vmull_lane_s16(vget_high_s16(tmp12_add_tmp13), consts.val[0], 2);
+
+ int32x4_t row2_scaled_l =
+ vmlal_lane_s16(z1_l, vget_low_s16(tmp13), consts.val[0], 3);
+ int32x4_t row2_scaled_h =
+ vmlal_lane_s16(z1_h, vget_high_s16(tmp13), consts.val[0], 3);
+ row2 = vcombine_s16(vrshrn_n_s32(row2_scaled_l, DESCALE_P2),
+ vrshrn_n_s32(row2_scaled_h, DESCALE_P2));
+
+ int32x4_t row6_scaled_l =
+ vmlal_lane_s16(z1_l, vget_low_s16(tmp12), consts.val[1], 3);
+ int32x4_t row6_scaled_h =
+ vmlal_lane_s16(z1_h, vget_high_s16(tmp12), consts.val[1], 3);
+ row6 = vcombine_s16(vrshrn_n_s32(row6_scaled_l, DESCALE_P2),
+ vrshrn_n_s32(row6_scaled_h, DESCALE_P2));
+
+ /* Odd part */
+ z1 = vaddq_s16(tmp4, tmp7);
+ z2 = vaddq_s16(tmp5, tmp6);
+ z3 = vaddq_s16(tmp4, tmp6);
+ z4 = vaddq_s16(tmp5, tmp7);
+ /* sqrt(2) * c3 */
+ z5_l = vmull_lane_s16(vget_low_s16(z3), consts.val[1], 1);
+ z5_h = vmull_lane_s16(vget_high_s16(z3), consts.val[1], 1);
+ z5_l = vmlal_lane_s16(z5_l, vget_low_s16(z4), consts.val[1], 1);
+ z5_h = vmlal_lane_s16(z5_h, vget_high_s16(z4), consts.val[1], 1);
+
+ /* sqrt(2) * (-c1+c3+c5-c7) */
+ tmp4_l = vmull_lane_s16(vget_low_s16(tmp4), consts.val[0], 0);
+ tmp4_h = vmull_lane_s16(vget_high_s16(tmp4), consts.val[0], 0);
+ /* sqrt(2) * ( c1+c3-c5+c7) */
+ tmp5_l = vmull_lane_s16(vget_low_s16(tmp5), consts.val[2], 1);
+ tmp5_h = vmull_lane_s16(vget_high_s16(tmp5), consts.val[2], 1);
+ /* sqrt(2) * ( c1+c3+c5-c7) */
+ tmp6_l = vmull_lane_s16(vget_low_s16(tmp6), consts.val[2], 3);
+ tmp6_h = vmull_lane_s16(vget_high_s16(tmp6), consts.val[2], 3);
+ /* sqrt(2) * ( c1+c3-c5-c7) */
+ tmp7_l = vmull_lane_s16(vget_low_s16(tmp7), consts.val[1], 2);
+ tmp7_h = vmull_lane_s16(vget_high_s16(tmp7), consts.val[1], 2);
+
+ /* sqrt(2) * (c7-c3) */
+ z1_l = vmull_lane_s16(vget_low_s16(z1), consts.val[1], 0);
+ z1_h = vmull_lane_s16(vget_high_s16(z1), consts.val[1], 0);
+ /* sqrt(2) * (-c1-c3) */
+ z2_l = vmull_lane_s16(vget_low_s16(z2), consts.val[2], 2);
+ z2_h = vmull_lane_s16(vget_high_s16(z2), consts.val[2], 2);
+ /* sqrt(2) * (-c3-c5) */
+ z3_l = vmull_lane_s16(vget_low_s16(z3), consts.val[2], 0);
+ z3_h = vmull_lane_s16(vget_high_s16(z3), consts.val[2], 0);
+ /* sqrt(2) * (c5-c3) */
+ z4_l = vmull_lane_s16(vget_low_s16(z4), consts.val[0], 1);
+ z4_h = vmull_lane_s16(vget_high_s16(z4), consts.val[0], 1);
+
+ z3_l = vaddq_s32(z3_l, z5_l);
+ z3_h = vaddq_s32(z3_h, z5_h);
+ z4_l = vaddq_s32(z4_l, z5_l);
+ z4_h = vaddq_s32(z4_h, z5_h);
+
+ tmp4_l = vaddq_s32(tmp4_l, z1_l);
+ tmp4_h = vaddq_s32(tmp4_h, z1_h);
+ tmp4_l = vaddq_s32(tmp4_l, z3_l);
+ tmp4_h = vaddq_s32(tmp4_h, z3_h);
+ row7 = vcombine_s16(vrshrn_n_s32(tmp4_l, DESCALE_P2),
+ vrshrn_n_s32(tmp4_h, DESCALE_P2));
+
+ tmp5_l = vaddq_s32(tmp5_l, z2_l);
+ tmp5_h = vaddq_s32(tmp5_h, z2_h);
+ tmp5_l = vaddq_s32(tmp5_l, z4_l);
+ tmp5_h = vaddq_s32(tmp5_h, z4_h);
+ row5 = vcombine_s16(vrshrn_n_s32(tmp5_l, DESCALE_P2),
+ vrshrn_n_s32(tmp5_h, DESCALE_P2));
+
+ tmp6_l = vaddq_s32(tmp6_l, z2_l);
+ tmp6_h = vaddq_s32(tmp6_h, z2_h);
+ tmp6_l = vaddq_s32(tmp6_l, z3_l);
+ tmp6_h = vaddq_s32(tmp6_h, z3_h);
+ row3 = vcombine_s16(vrshrn_n_s32(tmp6_l, DESCALE_P2),
+ vrshrn_n_s32(tmp6_h, DESCALE_P2));
+
+ tmp7_l = vaddq_s32(tmp7_l, z1_l);
+ tmp7_h = vaddq_s32(tmp7_h, z1_h);
+ tmp7_l = vaddq_s32(tmp7_l, z4_l);
+ tmp7_h = vaddq_s32(tmp7_h, z4_h);
+ row1 = vcombine_s16(vrshrn_n_s32(tmp7_l, DESCALE_P2),
+ vrshrn_n_s32(tmp7_h, DESCALE_P2));
+
+ vst1q_s16(data + 0 * DCTSIZE, row0);
+ vst1q_s16(data + 1 * DCTSIZE, row1);
+ vst1q_s16(data + 2 * DCTSIZE, row2);
+ vst1q_s16(data + 3 * DCTSIZE, row3);
+ vst1q_s16(data + 4 * DCTSIZE, row4);
+ vst1q_s16(data + 5 * DCTSIZE, row5);
+ vst1q_s16(data + 6 * DCTSIZE, row6);
+ vst1q_s16(data + 7 * DCTSIZE, row7);
+}
diff --git a/media/libjpeg/simd/arm/jidctfst-neon.c b/media/libjpeg/simd/arm/jidctfst-neon.c
new file mode 100644
index 0000000000..a91be5362e
--- /dev/null
+++ b/media/libjpeg/simd/arm/jidctfst-neon.c
@@ -0,0 +1,472 @@
+/*
+ * jidctfst-neon.c - fast integer IDCT (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+
+#include <arm_neon.h>
+
+
+/* jsimd_idct_ifast_neon() performs dequantization and a fast, not so accurate
+ * inverse DCT (Discrete Cosine Transform) on one block of coefficients. It
+ * uses the same calculations and produces exactly the same output as IJG's
+ * original jpeg_idct_ifast() function, which can be found in jidctfst.c.
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ * 0.082392200 = 2688 * 2^-15
+ * 0.414213562 = 13568 * 2^-15
+ * 0.847759065 = 27776 * 2^-15
+ * 0.613125930 = 20096 * 2^-15
+ *
+ * See jidctfst.c for further details of the IDCT algorithm. Where possible,
+ * the variable names and comments here in jsimd_idct_ifast_neon() match up
+ * with those in jpeg_idct_ifast().
+ */
+
+#define PASS1_BITS 2
+
+#define F_0_082 2688
+#define F_0_414 13568
+#define F_0_847 27776
+#define F_0_613 20096
+
+
+ALIGN(16) static const int16_t jsimd_idct_ifast_neon_consts[] = {
+ F_0_082, F_0_414, F_0_847, F_0_613
+};
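+
+/* The jpeg_idct_ifast() multipliers are all >= 1, so each is split into an
+ * integer part, applied with vector adds, and one of the fractional
+ * constants above, applied with vqdmulh:
+ *   1.082392200 = 1 + 0.082392200
+ *   1.414213562 = 1 + 0.414213562
+ *   1.847759065 = 1 + 0.847759065
+ *   2.613125930 = 2 + 0.613125930
+ */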
+
+void jsimd_idct_ifast_neon(void *dct_table, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+ IFAST_MULT_TYPE *quantptr = dct_table;
+
+ /* Load DCT coefficients. */
+ int16x8_t row0 = vld1q_s16(coef_block + 0 * DCTSIZE);
+ int16x8_t row1 = vld1q_s16(coef_block + 1 * DCTSIZE);
+ int16x8_t row2 = vld1q_s16(coef_block + 2 * DCTSIZE);
+ int16x8_t row3 = vld1q_s16(coef_block + 3 * DCTSIZE);
+ int16x8_t row4 = vld1q_s16(coef_block + 4 * DCTSIZE);
+ int16x8_t row5 = vld1q_s16(coef_block + 5 * DCTSIZE);
+ int16x8_t row6 = vld1q_s16(coef_block + 6 * DCTSIZE);
+ int16x8_t row7 = vld1q_s16(coef_block + 7 * DCTSIZE);
+
+ /* Load quantization table values for DC coefficients. */
+ int16x8_t quant_row0 = vld1q_s16(quantptr + 0 * DCTSIZE);
+ /* Dequantize DC coefficients. */
+ row0 = vmulq_s16(row0, quant_row0);
+
+ /* Construct bitmap to test if all AC coefficients are 0. */
+ int16x8_t bitmap = vorrq_s16(row1, row2);
+ bitmap = vorrq_s16(bitmap, row3);
+ bitmap = vorrq_s16(bitmap, row4);
+ bitmap = vorrq_s16(bitmap, row5);
+ bitmap = vorrq_s16(bitmap, row6);
+ bitmap = vorrq_s16(bitmap, row7);
+
+ int64_t left_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 0);
+ int64_t right_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 1);
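+
+ /* Each 64-bit half of the bitmap covers four columns of coefficients. If
+ * a half is zero, every AC coefficient in those four columns is zero, and
+ * the pass 1 IDCT of each such column reduces to duplicating its DC value,
+ * which the branches below exploit.
+ */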
+
+ /* Load IDCT conversion constants. */
+ const int16x4_t consts = vld1_s16(jsimd_idct_ifast_neon_consts);
+
+ if (left_ac_bitmap == 0 && right_ac_bitmap == 0) {
+ /* All AC coefficients are zero.
+ * Compute DC values and duplicate into vectors.
+ */
+ int16x8_t dcval = row0;
+ row1 = dcval;
+ row2 = dcval;
+ row3 = dcval;
+ row4 = dcval;
+ row5 = dcval;
+ row6 = dcval;
+ row7 = dcval;
+ } else if (left_ac_bitmap == 0) {
+ /* AC coefficients are zero for columns 0, 1, 2, and 3.
+ * Use DC values for these columns.
+ */
+ int16x4_t dcval = vget_low_s16(row0);
+
+ /* Commence regular fast IDCT computation for columns 4, 5, 6, and 7. */
+
+ /* Load quantization table. */
+ int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE + 4);
+ int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE + 4);
+ int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE + 4);
+ int16x4_t quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE + 4);
+ int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE + 4);
+ int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE + 4);
+ int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE + 4);
+
+ /* Even part: dequantize DCT coefficients. */
+ int16x4_t tmp0 = vget_high_s16(row0);
+ int16x4_t tmp1 = vmul_s16(vget_high_s16(row2), quant_row2);
+ int16x4_t tmp2 = vmul_s16(vget_high_s16(row4), quant_row4);
+ int16x4_t tmp3 = vmul_s16(vget_high_s16(row6), quant_row6);
+
+ int16x4_t tmp10 = vadd_s16(tmp0, tmp2); /* phase 3 */
+ int16x4_t tmp11 = vsub_s16(tmp0, tmp2);
+
+ int16x4_t tmp13 = vadd_s16(tmp1, tmp3); /* phases 5-3 */
+ int16x4_t tmp1_sub_tmp3 = vsub_s16(tmp1, tmp3);
+ int16x4_t tmp12 = vqdmulh_lane_s16(tmp1_sub_tmp3, consts, 1);
+ tmp12 = vadd_s16(tmp12, tmp1_sub_tmp3);
+ tmp12 = vsub_s16(tmp12, tmp13);
+
+ tmp0 = vadd_s16(tmp10, tmp13); /* phase 2 */
+ tmp3 = vsub_s16(tmp10, tmp13);
+ tmp1 = vadd_s16(tmp11, tmp12);
+ tmp2 = vsub_s16(tmp11, tmp12);
+
+ /* Odd part: dequantize DCT coefficients. */
+ int16x4_t tmp4 = vmul_s16(vget_high_s16(row1), quant_row1);
+ int16x4_t tmp5 = vmul_s16(vget_high_s16(row3), quant_row3);
+ int16x4_t tmp6 = vmul_s16(vget_high_s16(row5), quant_row5);
+ int16x4_t tmp7 = vmul_s16(vget_high_s16(row7), quant_row7);
+
+ int16x4_t z13 = vadd_s16(tmp6, tmp5); /* phase 6 */
+ int16x4_t neg_z10 = vsub_s16(tmp5, tmp6);
+ int16x4_t z11 = vadd_s16(tmp4, tmp7);
+ int16x4_t z12 = vsub_s16(tmp4, tmp7);
+
+ tmp7 = vadd_s16(z11, z13); /* phase 5 */
+ int16x4_t z11_sub_z13 = vsub_s16(z11, z13);
+ tmp11 = vqdmulh_lane_s16(z11_sub_z13, consts, 1);
+ tmp11 = vadd_s16(tmp11, z11_sub_z13);
+
+ int16x4_t z10_add_z12 = vsub_s16(z12, neg_z10);
+ int16x4_t z5 = vqdmulh_lane_s16(z10_add_z12, consts, 2);
+ z5 = vadd_s16(z5, z10_add_z12);
+ tmp10 = vqdmulh_lane_s16(z12, consts, 0);
+ tmp10 = vadd_s16(tmp10, z12);
+ tmp10 = vsub_s16(tmp10, z5);
+ tmp12 = vqdmulh_lane_s16(neg_z10, consts, 3);
+ tmp12 = vadd_s16(tmp12, vadd_s16(neg_z10, neg_z10));
+ tmp12 = vadd_s16(tmp12, z5);
+
+ tmp6 = vsub_s16(tmp12, tmp7); /* phase 2 */
+ tmp5 = vsub_s16(tmp11, tmp6);
+ tmp4 = vadd_s16(tmp10, tmp5);
+
+ row0 = vcombine_s16(dcval, vadd_s16(tmp0, tmp7));
+ row7 = vcombine_s16(dcval, vsub_s16(tmp0, tmp7));
+ row1 = vcombine_s16(dcval, vadd_s16(tmp1, tmp6));
+ row6 = vcombine_s16(dcval, vsub_s16(tmp1, tmp6));
+ row2 = vcombine_s16(dcval, vadd_s16(tmp2, tmp5));
+ row5 = vcombine_s16(dcval, vsub_s16(tmp2, tmp5));
+ row4 = vcombine_s16(dcval, vadd_s16(tmp3, tmp4));
+ row3 = vcombine_s16(dcval, vsub_s16(tmp3, tmp4));
+ } else if (right_ac_bitmap == 0) {
+ /* AC coefficients are zero for columns 4, 5, 6, and 7.
+ * Use DC values for these columns.
+ */
+ int16x4_t dcval = vget_high_s16(row0);
+
+ /* Commence regular fast IDCT computation for columns 0, 1, 2, and 3. */
+
+ /* Load quantization table. */
+ int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE);
+ int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE);
+ int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE);
+ int16x4_t quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE);
+ int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE);
+ int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE);
+ int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE);
+
+ /* Even part: dequantize DCT coefficients. */
+ int16x4_t tmp0 = vget_low_s16(row0);
+ int16x4_t tmp1 = vmul_s16(vget_low_s16(row2), quant_row2);
+ int16x4_t tmp2 = vmul_s16(vget_low_s16(row4), quant_row4);
+ int16x4_t tmp3 = vmul_s16(vget_low_s16(row6), quant_row6);
+
+ int16x4_t tmp10 = vadd_s16(tmp0, tmp2); /* phase 3 */
+ int16x4_t tmp11 = vsub_s16(tmp0, tmp2);
+
+ int16x4_t tmp13 = vadd_s16(tmp1, tmp3); /* phases 5-3 */
+ int16x4_t tmp1_sub_tmp3 = vsub_s16(tmp1, tmp3);
+ int16x4_t tmp12 = vqdmulh_lane_s16(tmp1_sub_tmp3, consts, 1);
+ tmp12 = vadd_s16(tmp12, tmp1_sub_tmp3);
+ tmp12 = vsub_s16(tmp12, tmp13);
+
+ tmp0 = vadd_s16(tmp10, tmp13); /* phase 2 */
+ tmp3 = vsub_s16(tmp10, tmp13);
+ tmp1 = vadd_s16(tmp11, tmp12);
+ tmp2 = vsub_s16(tmp11, tmp12);
+
+ /* Odd part: dequantize DCT coefficients. */
+ int16x4_t tmp4 = vmul_s16(vget_low_s16(row1), quant_row1);
+ int16x4_t tmp5 = vmul_s16(vget_low_s16(row3), quant_row3);
+ int16x4_t tmp6 = vmul_s16(vget_low_s16(row5), quant_row5);
+ int16x4_t tmp7 = vmul_s16(vget_low_s16(row7), quant_row7);
+
+ int16x4_t z13 = vadd_s16(tmp6, tmp5); /* phase 6 */
+ int16x4_t neg_z10 = vsub_s16(tmp5, tmp6);
+ int16x4_t z11 = vadd_s16(tmp4, tmp7);
+ int16x4_t z12 = vsub_s16(tmp4, tmp7);
+
+ tmp7 = vadd_s16(z11, z13); /* phase 5 */
+ int16x4_t z11_sub_z13 = vsub_s16(z11, z13);
+ tmp11 = vqdmulh_lane_s16(z11_sub_z13, consts, 1);
+ tmp11 = vadd_s16(tmp11, z11_sub_z13);
+
+ int16x4_t z10_add_z12 = vsub_s16(z12, neg_z10);
+ int16x4_t z5 = vqdmulh_lane_s16(z10_add_z12, consts, 2);
+ z5 = vadd_s16(z5, z10_add_z12);
+ tmp10 = vqdmulh_lane_s16(z12, consts, 0);
+ tmp10 = vadd_s16(tmp10, z12);
+ tmp10 = vsub_s16(tmp10, z5);
+ tmp12 = vqdmulh_lane_s16(neg_z10, consts, 3);
+ tmp12 = vadd_s16(tmp12, vadd_s16(neg_z10, neg_z10));
+ tmp12 = vadd_s16(tmp12, z5);
+
+ tmp6 = vsub_s16(tmp12, tmp7); /* phase 2 */
+ tmp5 = vsub_s16(tmp11, tmp6);
+ tmp4 = vadd_s16(tmp10, tmp5);
+
+ row0 = vcombine_s16(vadd_s16(tmp0, tmp7), dcval);
+ row7 = vcombine_s16(vsub_s16(tmp0, tmp7), dcval);
+ row1 = vcombine_s16(vadd_s16(tmp1, tmp6), dcval);
+ row6 = vcombine_s16(vsub_s16(tmp1, tmp6), dcval);
+ row2 = vcombine_s16(vadd_s16(tmp2, tmp5), dcval);
+ row5 = vcombine_s16(vsub_s16(tmp2, tmp5), dcval);
+ row4 = vcombine_s16(vadd_s16(tmp3, tmp4), dcval);
+ row3 = vcombine_s16(vsub_s16(tmp3, tmp4), dcval);
+ } else {
+ /* Some AC coefficients are non-zero; full IDCT calculation required. */
+
+ /* Load quantization table. */
+ int16x8_t quant_row1 = vld1q_s16(quantptr + 1 * DCTSIZE);
+ int16x8_t quant_row2 = vld1q_s16(quantptr + 2 * DCTSIZE);
+ int16x8_t quant_row3 = vld1q_s16(quantptr + 3 * DCTSIZE);
+ int16x8_t quant_row4 = vld1q_s16(quantptr + 4 * DCTSIZE);
+ int16x8_t quant_row5 = vld1q_s16(quantptr + 5 * DCTSIZE);
+ int16x8_t quant_row6 = vld1q_s16(quantptr + 6 * DCTSIZE);
+ int16x8_t quant_row7 = vld1q_s16(quantptr + 7 * DCTSIZE);
+
+ /* Even part: dequantize DCT coefficients. */
+ int16x8_t tmp0 = row0;
+ int16x8_t tmp1 = vmulq_s16(row2, quant_row2);
+ int16x8_t tmp2 = vmulq_s16(row4, quant_row4);
+ int16x8_t tmp3 = vmulq_s16(row6, quant_row6);
+
+ int16x8_t tmp10 = vaddq_s16(tmp0, tmp2); /* phase 3 */
+ int16x8_t tmp11 = vsubq_s16(tmp0, tmp2);
+
+ int16x8_t tmp13 = vaddq_s16(tmp1, tmp3); /* phases 5-3 */
+ int16x8_t tmp1_sub_tmp3 = vsubq_s16(tmp1, tmp3);
+ int16x8_t tmp12 = vqdmulhq_lane_s16(tmp1_sub_tmp3, consts, 1);
+ tmp12 = vaddq_s16(tmp12, tmp1_sub_tmp3);
+ tmp12 = vsubq_s16(tmp12, tmp13);
+
+ tmp0 = vaddq_s16(tmp10, tmp13); /* phase 2 */
+ tmp3 = vsubq_s16(tmp10, tmp13);
+ tmp1 = vaddq_s16(tmp11, tmp12);
+ tmp2 = vsubq_s16(tmp11, tmp12);
+
+ /* Odd part: dequantize DCT coefficients. */
+ int16x8_t tmp4 = vmulq_s16(row1, quant_row1);
+ int16x8_t tmp5 = vmulq_s16(row3, quant_row3);
+ int16x8_t tmp6 = vmulq_s16(row5, quant_row5);
+ int16x8_t tmp7 = vmulq_s16(row7, quant_row7);
+
+ int16x8_t z13 = vaddq_s16(tmp6, tmp5); /* phase 6 */
+ int16x8_t neg_z10 = vsubq_s16(tmp5, tmp6);
+ int16x8_t z11 = vaddq_s16(tmp4, tmp7);
+ int16x8_t z12 = vsubq_s16(tmp4, tmp7);
+
+ tmp7 = vaddq_s16(z11, z13); /* phase 5 */
+ int16x8_t z11_sub_z13 = vsubq_s16(z11, z13);
+ tmp11 = vqdmulhq_lane_s16(z11_sub_z13, consts, 1);
+ tmp11 = vaddq_s16(tmp11, z11_sub_z13);
+
+ int16x8_t z10_add_z12 = vsubq_s16(z12, neg_z10);
+ int16x8_t z5 = vqdmulhq_lane_s16(z10_add_z12, consts, 2);
+ z5 = vaddq_s16(z5, z10_add_z12);
+ tmp10 = vqdmulhq_lane_s16(z12, consts, 0);
+ tmp10 = vaddq_s16(tmp10, z12);
+ tmp10 = vsubq_s16(tmp10, z5);
+ tmp12 = vqdmulhq_lane_s16(neg_z10, consts, 3);
+ tmp12 = vaddq_s16(tmp12, vaddq_s16(neg_z10, neg_z10));
+ tmp12 = vaddq_s16(tmp12, z5);
+
+ tmp6 = vsubq_s16(tmp12, tmp7); /* phase 2 */
+ tmp5 = vsubq_s16(tmp11, tmp6);
+ tmp4 = vaddq_s16(tmp10, tmp5);
+
+ row0 = vaddq_s16(tmp0, tmp7);
+ row7 = vsubq_s16(tmp0, tmp7);
+ row1 = vaddq_s16(tmp1, tmp6);
+ row6 = vsubq_s16(tmp1, tmp6);
+ row2 = vaddq_s16(tmp2, tmp5);
+ row5 = vsubq_s16(tmp2, tmp5);
+ row4 = vaddq_s16(tmp3, tmp4);
+ row3 = vsubq_s16(tmp3, tmp4);
+ }
+
+ /* Transpose rows to work on columns in pass 2. */
+ int16x8x2_t rows_01 = vtrnq_s16(row0, row1);
+ int16x8x2_t rows_23 = vtrnq_s16(row2, row3);
+ int16x8x2_t rows_45 = vtrnq_s16(row4, row5);
+ int16x8x2_t rows_67 = vtrnq_s16(row6, row7);
+
+ int32x4x2_t rows_0145_l = vtrnq_s32(vreinterpretq_s32_s16(rows_01.val[0]),
+ vreinterpretq_s32_s16(rows_45.val[0]));
+ int32x4x2_t rows_0145_h = vtrnq_s32(vreinterpretq_s32_s16(rows_01.val[1]),
+ vreinterpretq_s32_s16(rows_45.val[1]));
+ int32x4x2_t rows_2367_l = vtrnq_s32(vreinterpretq_s32_s16(rows_23.val[0]),
+ vreinterpretq_s32_s16(rows_67.val[0]));
+ int32x4x2_t rows_2367_h = vtrnq_s32(vreinterpretq_s32_s16(rows_23.val[1]),
+ vreinterpretq_s32_s16(rows_67.val[1]));
+
+ int32x4x2_t cols_04 = vzipq_s32(rows_0145_l.val[0], rows_2367_l.val[0]);
+ int32x4x2_t cols_15 = vzipq_s32(rows_0145_h.val[0], rows_2367_h.val[0]);
+ int32x4x2_t cols_26 = vzipq_s32(rows_0145_l.val[1], rows_2367_l.val[1]);
+ int32x4x2_t cols_37 = vzipq_s32(rows_0145_h.val[1], rows_2367_h.val[1]);
+
+ int16x8_t col0 = vreinterpretq_s16_s32(cols_04.val[0]);
+ int16x8_t col1 = vreinterpretq_s16_s32(cols_15.val[0]);
+ int16x8_t col2 = vreinterpretq_s16_s32(cols_26.val[0]);
+ int16x8_t col3 = vreinterpretq_s16_s32(cols_37.val[0]);
+ int16x8_t col4 = vreinterpretq_s16_s32(cols_04.val[1]);
+ int16x8_t col5 = vreinterpretq_s16_s32(cols_15.val[1]);
+ int16x8_t col6 = vreinterpretq_s16_s32(cols_26.val[1]);
+ int16x8_t col7 = vreinterpretq_s16_s32(cols_37.val[1]);
+
+ /* 1-D IDCT, pass 2 */
+
+ /* Even part */
+ int16x8_t tmp10 = vaddq_s16(col0, col4);
+ int16x8_t tmp11 = vsubq_s16(col0, col4);
+
+ int16x8_t tmp13 = vaddq_s16(col2, col6);
+ int16x8_t col2_sub_col6 = vsubq_s16(col2, col6);
+ int16x8_t tmp12 = vqdmulhq_lane_s16(col2_sub_col6, consts, 1);
+ tmp12 = vaddq_s16(tmp12, col2_sub_col6);
+ tmp12 = vsubq_s16(tmp12, tmp13);
+
+ int16x8_t tmp0 = vaddq_s16(tmp10, tmp13);
+ int16x8_t tmp3 = vsubq_s16(tmp10, tmp13);
+ int16x8_t tmp1 = vaddq_s16(tmp11, tmp12);
+ int16x8_t tmp2 = vsubq_s16(tmp11, tmp12);
+
+ /* Odd part */
+ int16x8_t z13 = vaddq_s16(col5, col3);
+ int16x8_t neg_z10 = vsubq_s16(col3, col5);
+ int16x8_t z11 = vaddq_s16(col1, col7);
+ int16x8_t z12 = vsubq_s16(col1, col7);
+
+ int16x8_t tmp7 = vaddq_s16(z11, z13); /* phase 5 */
+ int16x8_t z11_sub_z13 = vsubq_s16(z11, z13);
+ tmp11 = vqdmulhq_lane_s16(z11_sub_z13, consts, 1);
+ tmp11 = vaddq_s16(tmp11, z11_sub_z13);
+
+ int16x8_t z10_add_z12 = vsubq_s16(z12, neg_z10);
+ int16x8_t z5 = vqdmulhq_lane_s16(z10_add_z12, consts, 2);
+ z5 = vaddq_s16(z5, z10_add_z12);
+ tmp10 = vqdmulhq_lane_s16(z12, consts, 0);
+ tmp10 = vaddq_s16(tmp10, z12);
+ tmp10 = vsubq_s16(tmp10, z5);
+ tmp12 = vqdmulhq_lane_s16(neg_z10, consts, 3);
+ tmp12 = vaddq_s16(tmp12, vaddq_s16(neg_z10, neg_z10));
+ tmp12 = vaddq_s16(tmp12, z5);
+
+ int16x8_t tmp6 = vsubq_s16(tmp12, tmp7); /* phase 2 */
+ int16x8_t tmp5 = vsubq_s16(tmp11, tmp6);
+ int16x8_t tmp4 = vaddq_s16(tmp10, tmp5);
+
+ col0 = vaddq_s16(tmp0, tmp7);
+ col7 = vsubq_s16(tmp0, tmp7);
+ col1 = vaddq_s16(tmp1, tmp6);
+ col6 = vsubq_s16(tmp1, tmp6);
+ col2 = vaddq_s16(tmp2, tmp5);
+ col5 = vsubq_s16(tmp2, tmp5);
+ col4 = vaddq_s16(tmp3, tmp4);
+ col3 = vsubq_s16(tmp3, tmp4);
+
+ /* Scale down by a factor of 8, narrowing to 8-bit. */
+ int8x16_t cols_01_s8 = vcombine_s8(vqshrn_n_s16(col0, PASS1_BITS + 3),
+ vqshrn_n_s16(col1, PASS1_BITS + 3));
+ int8x16_t cols_45_s8 = vcombine_s8(vqshrn_n_s16(col4, PASS1_BITS + 3),
+ vqshrn_n_s16(col5, PASS1_BITS + 3));
+ int8x16_t cols_23_s8 = vcombine_s8(vqshrn_n_s16(col2, PASS1_BITS + 3),
+ vqshrn_n_s16(col3, PASS1_BITS + 3));
+ int8x16_t cols_67_s8 = vcombine_s8(vqshrn_n_s16(col6, PASS1_BITS + 3),
+ vqshrn_n_s16(col7, PASS1_BITS + 3));
+ /* Clamp to range [0-255]. */
+ uint8x16_t cols_01 =
+ vreinterpretq_u8_s8
+ (vaddq_s8(cols_01_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
+ uint8x16_t cols_45 =
+ vreinterpretq_u8_s8
+ (vaddq_s8(cols_45_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
+ uint8x16_t cols_23 =
+ vreinterpretq_u8_s8
+ (vaddq_s8(cols_23_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
+ uint8x16_t cols_67 =
+ vreinterpretq_u8_s8
+ (vaddq_s8(cols_67_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
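+
+ /* Adding CENTERJSAMPLE (128) via the reinterpret casts above relies on
+ * modular arithmetic: vqshrn_n_s16() saturates each value to [-128, 127],
+ * and adding -128 (the int8 reinterpretation of 128) then maps that range
+ * onto [0, 255] when the result is reinterpreted as uint8.
+ */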
+
+ /* Transpose block to prepare for store. */
+ uint32x4x2_t cols_0415 = vzipq_u32(vreinterpretq_u32_u8(cols_01),
+ vreinterpretq_u32_u8(cols_45));
+ uint32x4x2_t cols_2637 = vzipq_u32(vreinterpretq_u32_u8(cols_23),
+ vreinterpretq_u32_u8(cols_67));
+
+ uint8x16x2_t cols_0145 = vtrnq_u8(vreinterpretq_u8_u32(cols_0415.val[0]),
+ vreinterpretq_u8_u32(cols_0415.val[1]));
+ uint8x16x2_t cols_2367 = vtrnq_u8(vreinterpretq_u8_u32(cols_2637.val[0]),
+ vreinterpretq_u8_u32(cols_2637.val[1]));
+ uint16x8x2_t rows_0426 = vtrnq_u16(vreinterpretq_u16_u8(cols_0145.val[0]),
+ vreinterpretq_u16_u8(cols_2367.val[0]));
+ uint16x8x2_t rows_1537 = vtrnq_u16(vreinterpretq_u16_u8(cols_0145.val[1]),
+ vreinterpretq_u16_u8(cols_2367.val[1]));
+
+ uint8x16_t rows_04 = vreinterpretq_u8_u16(rows_0426.val[0]);
+ uint8x16_t rows_15 = vreinterpretq_u8_u16(rows_1537.val[0]);
+ uint8x16_t rows_26 = vreinterpretq_u8_u16(rows_0426.val[1]);
+ uint8x16_t rows_37 = vreinterpretq_u8_u16(rows_1537.val[1]);
+
+ JSAMPROW outptr0 = output_buf[0] + output_col;
+ JSAMPROW outptr1 = output_buf[1] + output_col;
+ JSAMPROW outptr2 = output_buf[2] + output_col;
+ JSAMPROW outptr3 = output_buf[3] + output_col;
+ JSAMPROW outptr4 = output_buf[4] + output_col;
+ JSAMPROW outptr5 = output_buf[5] + output_col;
+ JSAMPROW outptr6 = output_buf[6] + output_col;
+ JSAMPROW outptr7 = output_buf[7] + output_col;
+
+ /* Store the reconstructed sample block to memory. */
+ vst1q_lane_u64((uint64_t *)outptr0, vreinterpretq_u64_u8(rows_04), 0);
+ vst1q_lane_u64((uint64_t *)outptr1, vreinterpretq_u64_u8(rows_15), 0);
+ vst1q_lane_u64((uint64_t *)outptr2, vreinterpretq_u64_u8(rows_26), 0);
+ vst1q_lane_u64((uint64_t *)outptr3, vreinterpretq_u64_u8(rows_37), 0);
+ vst1q_lane_u64((uint64_t *)outptr4, vreinterpretq_u64_u8(rows_04), 1);
+ vst1q_lane_u64((uint64_t *)outptr5, vreinterpretq_u64_u8(rows_15), 1);
+ vst1q_lane_u64((uint64_t *)outptr6, vreinterpretq_u64_u8(rows_26), 1);
+ vst1q_lane_u64((uint64_t *)outptr7, vreinterpretq_u64_u8(rows_37), 1);
+}
diff --git a/media/libjpeg/simd/arm/jidctint-neon.c b/media/libjpeg/simd/arm/jidctint-neon.c
new file mode 100644
index 0000000000..043b652e6c
--- /dev/null
+++ b/media/libjpeg/simd/arm/jidctint-neon.c
@@ -0,0 +1,802 @@
+/*
+ * jidctint-neon.c - accurate integer IDCT (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "jconfigint.h"
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+#include "neon-compat.h"
+
+#include <arm_neon.h>
+
+
+#define CONST_BITS 13
+#define PASS1_BITS 2
+
+#define DESCALE_P1 (CONST_BITS - PASS1_BITS)
+#define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3)
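+
+/* (Pass 1 leaves the intermediate results scaled up by 2^PASS1_BITS; pass 2
+ * removes that factor along with the IDCT's overall scale factor of 8, hence
+ * the extra "+ 3" in DESCALE_P2.)
+ */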
+
+/* The computation of the inverse DCT requires the use of constants known at
+ * compile time. Scaled integer constants are used to avoid floating-point
+ * arithmetic:
+ * 0.298631336 = 2446 * 2^-13
+ * 0.390180644 = 3196 * 2^-13
+ * 0.541196100 = 4433 * 2^-13
+ * 0.765366865 = 6270 * 2^-13
+ * 0.899976223 = 7373 * 2^-13
+ * 1.175875602 = 9633 * 2^-13
+ * 1.501321110 = 12299 * 2^-13
+ * 1.847759065 = 15137 * 2^-13
+ * 1.961570560 = 16069 * 2^-13
+ * 2.053119869 = 16819 * 2^-13
+ * 2.562915447 = 20995 * 2^-13
+ * 3.072711026 = 25172 * 2^-13
+ */
+
+#define F_0_298 2446
+#define F_0_390 3196
+#define F_0_541 4433
+#define F_0_765 6270
+#define F_0_899 7373
+#define F_1_175 9633
+#define F_1_501 12299
+#define F_1_847 15137
+#define F_1_961 16069
+#define F_2_053 16819
+#define F_2_562 20995
+#define F_3_072 25172
+
+#define F_1_175_MINUS_1_961 (F_1_175 - F_1_961)
+#define F_1_175_MINUS_0_390 (F_1_175 - F_0_390)
+#define F_0_541_MINUS_1_847 (F_0_541 - F_1_847)
+#define F_3_072_MINUS_2_562 (F_3_072 - F_2_562)
+#define F_0_298_MINUS_0_899 (F_0_298 - F_0_899)
+#define F_1_501_MINUS_0_899 (F_1_501 - F_0_899)
+#define F_2_053_MINUS_2_562 (F_2_053 - F_2_562)
+#define F_0_541_PLUS_0_765 (F_0_541 + F_0_765)
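+
+/* For example, multiplying a value x by 0.541196100 becomes a 16x16-bit
+ * widening multiply by F_0_541 followed by a descale. A scalar sketch of the
+ * idiom (illustrative only):
+ *
+ *   int32_t mul_0_541(int16_t x)
+ *   {
+ *     return ((int32_t)x * F_0_541) >> CONST_BITS;   (x * 4433 / 8192)
+ *   }
+ *
+ * The vector code below keeps the full 32-bit products and defers the right
+ * shift to the final descale stages (DESCALE_P1 and DESCALE_P2).
+ */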
+
+
+ALIGN(16) static const int16_t jsimd_idct_islow_neon_consts[] = {
+ F_0_899, F_0_541,
+ F_2_562, F_0_298_MINUS_0_899,
+ F_1_501_MINUS_0_899, F_2_053_MINUS_2_562,
+ F_0_541_PLUS_0_765, F_1_175,
+ F_1_175_MINUS_0_390, F_0_541_MINUS_1_847,
+ F_3_072_MINUS_2_562, F_1_175_MINUS_1_961,
+ 0, 0, 0, 0
+};
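+
+/* (The code below indexes this table by vector and lane: for example,
+ * consts.val[0] lane 1 is F_0_541 and consts.val[2] lane 3 is
+ * F_1_175_MINUS_1_961.)
+ */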
+
+
+/* Forward declaration of regular and sparse IDCT helper functions */
+
+static INLINE void jsimd_idct_islow_pass1_regular(int16x4_t row0,
+ int16x4_t row1,
+ int16x4_t row2,
+ int16x4_t row3,
+ int16x4_t row4,
+ int16x4_t row5,
+ int16x4_t row6,
+ int16x4_t row7,
+ int16x4_t quant_row0,
+ int16x4_t quant_row1,
+ int16x4_t quant_row2,
+ int16x4_t quant_row3,
+ int16x4_t quant_row4,
+ int16x4_t quant_row5,
+ int16x4_t quant_row6,
+ int16x4_t quant_row7,
+ int16_t *workspace_1,
+ int16_t *workspace_2);
+
+static INLINE void jsimd_idct_islow_pass1_sparse(int16x4_t row0,
+ int16x4_t row1,
+ int16x4_t row2,
+ int16x4_t row3,
+ int16x4_t quant_row0,
+ int16x4_t quant_row1,
+ int16x4_t quant_row2,
+ int16x4_t quant_row3,
+ int16_t *workspace_1,
+ int16_t *workspace_2);
+
+static INLINE void jsimd_idct_islow_pass2_regular(int16_t *workspace,
+ JSAMPARRAY output_buf,
+ JDIMENSION output_col,
+ unsigned buf_offset);
+
+static INLINE void jsimd_idct_islow_pass2_sparse(int16_t *workspace,
+ JSAMPARRAY output_buf,
+ JDIMENSION output_col,
+ unsigned buf_offset);
+
+
+/* Perform dequantization and inverse DCT on one block of coefficients. For
+ * reference, the C implementation (jpeg_idct_slow()) can be found in
+ * jidctint.c.
+ *
+ * Optimization techniques used for fast data access:
+ *
+ * In each pass, the inverse DCT is computed for the left and right 4x8 halves
+ * of the DCT block. This avoids spilling due to register pressure, and the
+ * increased granularity allows for an optimized calculation depending on the
+ * values of the DCT coefficients. Between passes, intermediate data is stored
+ * in 4x8 workspace buffers.
+ *
+ * Transposing the 8x8 DCT block after each pass can be achieved by transposing
+ * each of the four 4x4 quadrants and swapping quadrants 1 and 2 (refer to the
+ * diagram below.) Swapping quadrants is cheap, since the second pass can just
+ * swap the workspace buffer pointers.
+ *
+ * +-------+-------+ +-------+-------+
+ * | | | | | |
+ * | 0 | 1 | | 0 | 2 |
+ * | | | transpose | | |
+ * +-------+-------+ ------> +-------+-------+
+ * | | | | | |
+ * | 2 | 3 | | 1 | 3 |
+ * | | | | | |
+ * +-------+-------+ +-------+-------+
+ *
+ * Optimization techniques used to accelerate the inverse DCT calculation:
+ *
+ * In a DCT coefficient block, the coefficients are increasingly likely to be 0
+ * as you move diagonally from top left to bottom right. If whole rows of
+ * coefficients are 0, then the inverse DCT calculation can be simplified. On
+ * the first pass of the inverse DCT, we test for three special cases before
+ * defaulting to a full "regular" inverse DCT:
+ *
+ * 1) Coefficients in rows 4-7 are all zero. In this case, we perform a
+ * "sparse" simplified inverse DCT on rows 0-3.
+ * 2) AC coefficients (rows 1-7) are all zero. In this case, the inverse DCT
+ * result is equal to the dequantized DC coefficients.
+ * 3) AC and DC coefficients are all zero. In this case, the inverse DCT
+ * result is all zero. For the left 4x8 half, this is handled identically
+ * to Case 2 above. For the right 4x8 half, we do no work and signal that
+ * the "sparse" algorithm is required for the second pass.
+ *
+ * In the second pass, only a single special case is tested: whether the AC and
+ * DC coefficients were all zero in the right 4x8 block during the first pass
+ * (refer to Case 3 above.) If this is the case, then a "sparse" variant of
+ * the second pass is performed for both the left and right halves of the DCT
+ * block. (The transposition after the first pass means that the right 4x8
+ * block during the first pass becomes rows 4-7 during the second pass.)
+ */
+
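+/* A scalar sketch of the zero-row test used below (illustrative only; the
+ * vector code ORs the rows together with vorr_s16() and inspects the result
+ * as a single 64-bit lane):
+ *
+ *   int64_t bits = 0;
+ *   for (int row = 4; row < 8; row++)
+ *     for (int col = 0; col < 4; col++)
+ *       bits |= coef_block[row * DCTSIZE + col];
+ *   if (bits == 0)
+ *     (rows 4-7 of the left 4x8 block are all zero; use the "sparse" pass)
+ */
+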
+void jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+ ISLOW_MULT_TYPE *quantptr = dct_table;
+
+ int16_t workspace_l[8 * DCTSIZE / 2];
+ int16_t workspace_r[8 * DCTSIZE / 2];
+
+ /* Compute IDCT first pass on left 4x8 coefficient block. */
+
+ /* Load DCT coefficients in left 4x8 block. */
+ int16x4_t row0 = vld1_s16(coef_block + 0 * DCTSIZE);
+ int16x4_t row1 = vld1_s16(coef_block + 1 * DCTSIZE);
+ int16x4_t row2 = vld1_s16(coef_block + 2 * DCTSIZE);
+ int16x4_t row3 = vld1_s16(coef_block + 3 * DCTSIZE);
+ int16x4_t row4 = vld1_s16(coef_block + 4 * DCTSIZE);
+ int16x4_t row5 = vld1_s16(coef_block + 5 * DCTSIZE);
+ int16x4_t row6 = vld1_s16(coef_block + 6 * DCTSIZE);
+ int16x4_t row7 = vld1_s16(coef_block + 7 * DCTSIZE);
+
+ /* Load quantization table for left 4x8 block. */
+ int16x4_t quant_row0 = vld1_s16(quantptr + 0 * DCTSIZE);
+ int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE);
+ int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE);
+ int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE);
+ int16x4_t quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE);
+ int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE);
+ int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE);
+ int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE);
+
+ /* Construct bitmap to test if DCT coefficients in left 4x8 block are 0. */
+ int16x4_t bitmap = vorr_s16(row7, row6);
+ bitmap = vorr_s16(bitmap, row5);
+ bitmap = vorr_s16(bitmap, row4);
+ int64_t bitmap_rows_4567 = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0);
+
+ if (bitmap_rows_4567 == 0) {
+ bitmap = vorr_s16(bitmap, row3);
+ bitmap = vorr_s16(bitmap, row2);
+ bitmap = vorr_s16(bitmap, row1);
+ int64_t left_ac_bitmap = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0);
+
+ if (left_ac_bitmap == 0) {
+ int16x4_t dcval = vshl_n_s16(vmul_s16(row0, quant_row0), PASS1_BITS);
+ int16x4x4_t quadrant = { { dcval, dcval, dcval, dcval } };
+ /* Store 4x4 blocks to workspace, transposing in the process. */
+ vst4_s16(workspace_l, quadrant);
+ vst4_s16(workspace_r, quadrant);
+ } else {
+ jsimd_idct_islow_pass1_sparse(row0, row1, row2, row3, quant_row0,
+ quant_row1, quant_row2, quant_row3,
+ workspace_l, workspace_r);
+ }
+ } else {
+ jsimd_idct_islow_pass1_regular(row0, row1, row2, row3, row4, row5,
+ row6, row7, quant_row0, quant_row1,
+ quant_row2, quant_row3, quant_row4,
+ quant_row5, quant_row6, quant_row7,
+ workspace_l, workspace_r);
+ }
+
+ /* Compute IDCT first pass on right 4x8 coefficient block. */
+
+ /* Load DCT coefficients in right 4x8 block. */
+ row0 = vld1_s16(coef_block + 0 * DCTSIZE + 4);
+ row1 = vld1_s16(coef_block + 1 * DCTSIZE + 4);
+ row2 = vld1_s16(coef_block + 2 * DCTSIZE + 4);
+ row3 = vld1_s16(coef_block + 3 * DCTSIZE + 4);
+ row4 = vld1_s16(coef_block + 4 * DCTSIZE + 4);
+ row5 = vld1_s16(coef_block + 5 * DCTSIZE + 4);
+ row6 = vld1_s16(coef_block + 6 * DCTSIZE + 4);
+ row7 = vld1_s16(coef_block + 7 * DCTSIZE + 4);
+
+ /* Load quantization table for right 4x8 block. */
+ quant_row0 = vld1_s16(quantptr + 0 * DCTSIZE + 4);
+ quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE + 4);
+ quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE + 4);
+ quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE + 4);
+ quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE + 4);
+ quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE + 4);
+ quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE + 4);
+ quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE + 4);
+
+ /* Construct bitmap to test if DCT coefficients in right 4x8 block are 0. */
+ bitmap = vorr_s16(row7, row6);
+ bitmap = vorr_s16(bitmap, row5);
+ bitmap = vorr_s16(bitmap, row4);
+ bitmap_rows_4567 = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0);
+ bitmap = vorr_s16(bitmap, row3);
+ bitmap = vorr_s16(bitmap, row2);
+ bitmap = vorr_s16(bitmap, row1);
+ int64_t right_ac_bitmap = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0);
+
+ /* If this remains non-zero, a "regular" second pass will be performed. */
+ int64_t right_ac_dc_bitmap = 1;
+
+ if (right_ac_bitmap == 0) {
+ bitmap = vorr_s16(bitmap, row0);
+ right_ac_dc_bitmap = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0);
+
+ if (right_ac_dc_bitmap != 0) {
+ int16x4_t dcval = vshl_n_s16(vmul_s16(row0, quant_row0), PASS1_BITS);
+ int16x4x4_t quadrant = { { dcval, dcval, dcval, dcval } };
+ /* Store 4x4 blocks to workspace, transposing in the process. */
+ vst4_s16(workspace_l + 4 * DCTSIZE / 2, quadrant);
+ vst4_s16(workspace_r + 4 * DCTSIZE / 2, quadrant);
+ }
+ } else {
+ if (bitmap_rows_4567 == 0) {
+ jsimd_idct_islow_pass1_sparse(row0, row1, row2, row3, quant_row0,
+ quant_row1, quant_row2, quant_row3,
+ workspace_l + 4 * DCTSIZE / 2,
+ workspace_r + 4 * DCTSIZE / 2);
+ } else {
+ jsimd_idct_islow_pass1_regular(row0, row1, row2, row3, row4, row5,
+ row6, row7, quant_row0, quant_row1,
+ quant_row2, quant_row3, quant_row4,
+ quant_row5, quant_row6, quant_row7,
+ workspace_l + 4 * DCTSIZE / 2,
+ workspace_r + 4 * DCTSIZE / 2);
+ }
+ }
+
+ /* Second pass: compute IDCT on rows in workspace. */
+
+ /* If all coefficients in right 4x8 block are 0, use "sparse" second pass. */
+ if (right_ac_dc_bitmap == 0) {
+ jsimd_idct_islow_pass2_sparse(workspace_l, output_buf, output_col, 0);
+ jsimd_idct_islow_pass2_sparse(workspace_r, output_buf, output_col, 4);
+ } else {
+ jsimd_idct_islow_pass2_regular(workspace_l, output_buf, output_col, 0);
+ jsimd_idct_islow_pass2_regular(workspace_r, output_buf, output_col, 4);
+ }
+}
+
+
+/* Perform dequantization and the first pass of the accurate inverse DCT on a
+ * 4x8 block of coefficients. (To process the full 8x8 DCT block, this
+ * function, or some other optimized variant, needs to be called for both the
+ * left and right 4x8 blocks.)
+ *
+ * This "regular" version assumes that no optimization can be made to the IDCT
+ * calculation, since no useful set of AC coefficients is all 0.
+ *
+ * The original C implementation of the accurate IDCT (jpeg_idct_slow()) can be
+ * found in jidctint.c. Algorithmic changes made here are documented inline.
+ */
+
+static INLINE void jsimd_idct_islow_pass1_regular(int16x4_t row0,
+ int16x4_t row1,
+ int16x4_t row2,
+ int16x4_t row3,
+ int16x4_t row4,
+ int16x4_t row5,
+ int16x4_t row6,
+ int16x4_t row7,
+ int16x4_t quant_row0,
+ int16x4_t quant_row1,
+ int16x4_t quant_row2,
+ int16x4_t quant_row3,
+ int16x4_t quant_row4,
+ int16x4_t quant_row5,
+ int16x4_t quant_row6,
+ int16x4_t quant_row7,
+ int16_t *workspace_1,
+ int16_t *workspace_2)
+{
+ /* Load constants for IDCT computation. */
+#ifdef HAVE_VLD1_S16_X3
+ const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
+#else
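+ /* GCC does not currently support the intrinsic vld1_<type>_x3(). */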
+ const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts);
+ const int16x4_t consts2 = vld1_s16(jsimd_idct_islow_neon_consts + 4);
+ const int16x4_t consts3 = vld1_s16(jsimd_idct_islow_neon_consts + 8);
+ const int16x4x3_t consts = { { consts1, consts2, consts3 } };
+#endif
+
+ /* Even part */
+ int16x4_t z2_s16 = vmul_s16(row2, quant_row2);
+ int16x4_t z3_s16 = vmul_s16(row6, quant_row6);
+
+ int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1);
+ int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2);
+ tmp2 = vmlal_lane_s16(tmp2, z3_s16, consts.val[2], 1);
+ tmp3 = vmlal_lane_s16(tmp3, z3_s16, consts.val[0], 1);
+
+ z2_s16 = vmul_s16(row0, quant_row0);
+ z3_s16 = vmul_s16(row4, quant_row4);
+
+ int32x4_t tmp0 = vshll_n_s16(vadd_s16(z2_s16, z3_s16), CONST_BITS);
+ int32x4_t tmp1 = vshll_n_s16(vsub_s16(z2_s16, z3_s16), CONST_BITS);
+
+ int32x4_t tmp10 = vaddq_s32(tmp0, tmp3);
+ int32x4_t tmp13 = vsubq_s32(tmp0, tmp3);
+ int32x4_t tmp11 = vaddq_s32(tmp1, tmp2);
+ int32x4_t tmp12 = vsubq_s32(tmp1, tmp2);
+
+ /* Odd part */
+ int16x4_t tmp0_s16 = vmul_s16(row7, quant_row7);
+ int16x4_t tmp1_s16 = vmul_s16(row5, quant_row5);
+ int16x4_t tmp2_s16 = vmul_s16(row3, quant_row3);
+ int16x4_t tmp3_s16 = vmul_s16(row1, quant_row1);
+
+ z3_s16 = vadd_s16(tmp0_s16, tmp2_s16);
+ int16x4_t z4_s16 = vadd_s16(tmp1_s16, tmp3_s16);
+
+ /* Implementation as per jpeg_idct_islow() in jidctint.c:
+ * z5 = (z3 + z4) * 1.175875602;
+ * z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ * z3 += z5; z4 += z5;
+ *
+ * This implementation (the right-hand sides use the original z3 and z4):
+ * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+ */
+
+ int32x4_t z3 = vmull_lane_s16(z3_s16, consts.val[2], 3);
+ int32x4_t z4 = vmull_lane_s16(z3_s16, consts.val[1], 3);
+ z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3);
+ z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0);
+
+ /* Implementation as per jpeg_idct_islow() in jidctint.c:
+ * z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
+ * tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
+ * tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
+ * z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ * tmp0 += z1 + z3; tmp1 += z2 + z4;
+ * tmp2 += z2 + z3; tmp3 += z1 + z4;
+ *
+ * This implementation (the right-hand sides use the original values):
+ * tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+ * tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+ * tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+ * tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+ * tmp0 += z3; tmp1 += z4;
+ * tmp2 += z3; tmp3 += z4;
+ */
+
+ tmp0 = vmull_lane_s16(tmp0_s16, consts.val[0], 3);
+ tmp1 = vmull_lane_s16(tmp1_s16, consts.val[1], 1);
+ tmp2 = vmull_lane_s16(tmp2_s16, consts.val[2], 2);
+ tmp3 = vmull_lane_s16(tmp3_s16, consts.val[1], 0);
+
+ tmp0 = vmlsl_lane_s16(tmp0, tmp3_s16, consts.val[0], 0);
+ tmp1 = vmlsl_lane_s16(tmp1, tmp2_s16, consts.val[0], 2);
+ tmp2 = vmlsl_lane_s16(tmp2, tmp1_s16, consts.val[0], 2);
+ tmp3 = vmlsl_lane_s16(tmp3, tmp0_s16, consts.val[0], 0);
+
+ tmp0 = vaddq_s32(tmp0, z3);
+ tmp1 = vaddq_s32(tmp1, z4);
+ tmp2 = vaddq_s32(tmp2, z3);
+ tmp3 = vaddq_s32(tmp3, z4);
+
+ /* Final output stage: descale and narrow to 16-bit. */
+ int16x4x4_t rows_0123 = { {
+ vrshrn_n_s32(vaddq_s32(tmp10, tmp3), DESCALE_P1),
+ vrshrn_n_s32(vaddq_s32(tmp11, tmp2), DESCALE_P1),
+ vrshrn_n_s32(vaddq_s32(tmp12, tmp1), DESCALE_P1),
+ vrshrn_n_s32(vaddq_s32(tmp13, tmp0), DESCALE_P1)
+ } };
+ int16x4x4_t rows_4567 = { {
+ vrshrn_n_s32(vsubq_s32(tmp13, tmp0), DESCALE_P1),
+ vrshrn_n_s32(vsubq_s32(tmp12, tmp1), DESCALE_P1),
+ vrshrn_n_s32(vsubq_s32(tmp11, tmp2), DESCALE_P1),
+ vrshrn_n_s32(vsubq_s32(tmp10, tmp3), DESCALE_P1)
+ } };
+
+ /* Store 4x4 blocks to the intermediate workspace, ready for the second pass.
+ * (VST4 transposes the blocks. We need to operate on rows in the next
+ * pass.)
+ */
+ vst4_s16(workspace_1, rows_0123);
+ vst4_s16(workspace_2, rows_4567);
+}
+
+
+/* Perform dequantization and the first pass of the accurate inverse DCT on a
+ * 4x8 block of coefficients.
+ *
+ * This "sparse" version assumes that the AC coefficients in rows 4-7 are all
+ * 0. This simplifies the IDCT calculation, accelerating overall performance.
+ */
+
+static INLINE void jsimd_idct_islow_pass1_sparse(int16x4_t row0,
+ int16x4_t row1,
+ int16x4_t row2,
+ int16x4_t row3,
+ int16x4_t quant_row0,
+ int16x4_t quant_row1,
+ int16x4_t quant_row2,
+ int16x4_t quant_row3,
+ int16_t *workspace_1,
+ int16_t *workspace_2)
+{
+ /* Load constants for IDCT computation. */
+#ifdef HAVE_VLD1_S16_X3
+ const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
+#else
+ const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts);
+ const int16x4_t consts2 = vld1_s16(jsimd_idct_islow_neon_consts + 4);
+ const int16x4_t consts3 = vld1_s16(jsimd_idct_islow_neon_consts + 8);
+ const int16x4x3_t consts = { { consts1, consts2, consts3 } };
+#endif
+
+ /* Even part (z3 is all 0) */
+ int16x4_t z2_s16 = vmul_s16(row2, quant_row2);
+
+ int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1);
+ int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2);
+
+ z2_s16 = vmul_s16(row0, quant_row0);
+ int32x4_t tmp0 = vshll_n_s16(z2_s16, CONST_BITS);
+ int32x4_t tmp1 = vshll_n_s16(z2_s16, CONST_BITS);
+
+ int32x4_t tmp10 = vaddq_s32(tmp0, tmp3);
+ int32x4_t tmp13 = vsubq_s32(tmp0, tmp3);
+ int32x4_t tmp11 = vaddq_s32(tmp1, tmp2);
+ int32x4_t tmp12 = vsubq_s32(tmp1, tmp2);
+
+ /* Odd part (tmp0 and tmp1 are both all 0) */
+ int16x4_t tmp2_s16 = vmul_s16(row3, quant_row3);
+ int16x4_t tmp3_s16 = vmul_s16(row1, quant_row1);
+
+ int16x4_t z3_s16 = tmp2_s16;
+ int16x4_t z4_s16 = tmp3_s16;
+
+ int32x4_t z3 = vmull_lane_s16(z3_s16, consts.val[2], 3);
+ int32x4_t z4 = vmull_lane_s16(z3_s16, consts.val[1], 3);
+ z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3);
+ z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0);
+
+ tmp0 = vmlsl_lane_s16(z3, tmp3_s16, consts.val[0], 0);
+ tmp1 = vmlsl_lane_s16(z4, tmp2_s16, consts.val[0], 2);
+ tmp2 = vmlal_lane_s16(z3, tmp2_s16, consts.val[2], 2);
+ tmp3 = vmlal_lane_s16(z4, tmp3_s16, consts.val[1], 0);
+
+ /* Final output stage: descale and narrow to 16-bit. */
+ int16x4x4_t rows_0123 = { {
+ vrshrn_n_s32(vaddq_s32(tmp10, tmp3), DESCALE_P1),
+ vrshrn_n_s32(vaddq_s32(tmp11, tmp2), DESCALE_P1),
+ vrshrn_n_s32(vaddq_s32(tmp12, tmp1), DESCALE_P1),
+ vrshrn_n_s32(vaddq_s32(tmp13, tmp0), DESCALE_P1)
+ } };
+ int16x4x4_t rows_4567 = { {
+ vrshrn_n_s32(vsubq_s32(tmp13, tmp0), DESCALE_P1),
+ vrshrn_n_s32(vsubq_s32(tmp12, tmp1), DESCALE_P1),
+ vrshrn_n_s32(vsubq_s32(tmp11, tmp2), DESCALE_P1),
+ vrshrn_n_s32(vsubq_s32(tmp10, tmp3), DESCALE_P1)
+ } };
+
+ /* Store 4x4 blocks to the intermediate workspace, ready for the second pass.
+ * (VST4 transposes the blocks. We need to operate on rows in the next
+ * pass.)
+ */
+ vst4_s16(workspace_1, rows_0123);
+ vst4_s16(workspace_2, rows_4567);
+}
+
+
+/* Perform the second pass of the accurate inverse DCT on a 4x8 block of
+ * coefficients. (To process the full 8x8 DCT block, this function, or some
+ * other optimized variant, needs to be called for both the right and left 4x8
+ * blocks.)
+ *
+ * This "regular" version assumes that no optimization can be made to the IDCT
+ * calculation, since no useful set of coefficient values is all 0 after the
+ * first pass.
+ *
+ * Again, the original C implementation of the accurate IDCT (jpeg_idct_slow())
+ * can be found in jidctint.c. Algorithmic changes made here are documented
+ * inline.
+ */
+
+static INLINE void jsimd_idct_islow_pass2_regular(int16_t *workspace,
+ JSAMPARRAY output_buf,
+ JDIMENSION output_col,
+ unsigned buf_offset)
+{
+ /* Load constants for IDCT computation. */
+#ifdef HAVE_VLD1_S16_X3
+ const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
+#else
+ const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts);
+ const int16x4_t consts2 = vld1_s16(jsimd_idct_islow_neon_consts + 4);
+ const int16x4_t consts3 = vld1_s16(jsimd_idct_islow_neon_consts + 8);
+ const int16x4x3_t consts = { { consts1, consts2, consts3 } };
+#endif
+
+ /* Even part */
+ int16x4_t z2_s16 = vld1_s16(workspace + 2 * DCTSIZE / 2);
+ int16x4_t z3_s16 = vld1_s16(workspace + 6 * DCTSIZE / 2);
+
+ int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1);
+ int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2);
+ tmp2 = vmlal_lane_s16(tmp2, z3_s16, consts.val[2], 1);
+ tmp3 = vmlal_lane_s16(tmp3, z3_s16, consts.val[0], 1);
+
+ z2_s16 = vld1_s16(workspace + 0 * DCTSIZE / 2);
+ z3_s16 = vld1_s16(workspace + 4 * DCTSIZE / 2);
+
+ int32x4_t tmp0 = vshll_n_s16(vadd_s16(z2_s16, z3_s16), CONST_BITS);
+ int32x4_t tmp1 = vshll_n_s16(vsub_s16(z2_s16, z3_s16), CONST_BITS);
+
+ int32x4_t tmp10 = vaddq_s32(tmp0, tmp3);
+ int32x4_t tmp13 = vsubq_s32(tmp0, tmp3);
+ int32x4_t tmp11 = vaddq_s32(tmp1, tmp2);
+ int32x4_t tmp12 = vsubq_s32(tmp1, tmp2);
+
+ /* Odd part */
+ int16x4_t tmp0_s16 = vld1_s16(workspace + 7 * DCTSIZE / 2);
+ int16x4_t tmp1_s16 = vld1_s16(workspace + 5 * DCTSIZE / 2);
+ int16x4_t tmp2_s16 = vld1_s16(workspace + 3 * DCTSIZE / 2);
+ int16x4_t tmp3_s16 = vld1_s16(workspace + 1 * DCTSIZE / 2);
+
+ z3_s16 = vadd_s16(tmp0_s16, tmp2_s16);
+ int16x4_t z4_s16 = vadd_s16(tmp1_s16, tmp3_s16);
+
+ /* Implementation as per jpeg_idct_islow() in jidctint.c:
+ * z5 = (z3 + z4) * 1.175875602;
+ * z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ * z3 += z5; z4 += z5;
+ *
+ * This implementation (the right-hand sides use the original z3 and z4):
+ * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+ */
+
+ int32x4_t z3 = vmull_lane_s16(z3_s16, consts.val[2], 3);
+ int32x4_t z4 = vmull_lane_s16(z3_s16, consts.val[1], 3);
+ z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3);
+ z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0);
+
+ /* Implementation as per jpeg_idct_islow() in jidctint.c:
+ * z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
+ * tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
+ * tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
+ * z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ * tmp0 += z1 + z3; tmp1 += z2 + z4;
+ * tmp2 += z2 + z3; tmp3 += z1 + z4;
+ *
+ * This implementation (the right-hand sides use the original values):
+ * tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+ * tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+ * tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+ * tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+ * tmp0 += z3; tmp1 += z4;
+ * tmp2 += z3; tmp3 += z4;
+ */
+
+ tmp0 = vmull_lane_s16(tmp0_s16, consts.val[0], 3);
+ tmp1 = vmull_lane_s16(tmp1_s16, consts.val[1], 1);
+ tmp2 = vmull_lane_s16(tmp2_s16, consts.val[2], 2);
+ tmp3 = vmull_lane_s16(tmp3_s16, consts.val[1], 0);
+
+ tmp0 = vmlsl_lane_s16(tmp0, tmp3_s16, consts.val[0], 0);
+ tmp1 = vmlsl_lane_s16(tmp1, tmp2_s16, consts.val[0], 2);
+ tmp2 = vmlsl_lane_s16(tmp2, tmp1_s16, consts.val[0], 2);
+ tmp3 = vmlsl_lane_s16(tmp3, tmp0_s16, consts.val[0], 0);
+
+ tmp0 = vaddq_s32(tmp0, z3);
+ tmp1 = vaddq_s32(tmp1, z4);
+ tmp2 = vaddq_s32(tmp2, z3);
+ tmp3 = vaddq_s32(tmp3, z4);
+
+ /* Final output stage: descale and narrow to 16-bit. */
+ int16x8_t cols_02_s16 = vcombine_s16(vaddhn_s32(tmp10, tmp3),
+ vaddhn_s32(tmp12, tmp1));
+ int16x8_t cols_13_s16 = vcombine_s16(vaddhn_s32(tmp11, tmp2),
+ vaddhn_s32(tmp13, tmp0));
+ int16x8_t cols_46_s16 = vcombine_s16(vsubhn_s32(tmp13, tmp0),
+ vsubhn_s32(tmp11, tmp2));
+ int16x8_t cols_57_s16 = vcombine_s16(vsubhn_s32(tmp12, tmp1),
+ vsubhn_s32(tmp10, tmp3));
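+ /* (vaddhn/vsubhn return the high halves of the 32-bit sums, which already
+ * applies 16 bits of the descale; the saturating rounding shift below
+ * applies the remaining DESCALE_P2 - 16 bits.)
+ */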
+ /* Descale and narrow to 8-bit. */
+ int8x8_t cols_02_s8 = vqrshrn_n_s16(cols_02_s16, DESCALE_P2 - 16);
+ int8x8_t cols_13_s8 = vqrshrn_n_s16(cols_13_s16, DESCALE_P2 - 16);
+ int8x8_t cols_46_s8 = vqrshrn_n_s16(cols_46_s16, DESCALE_P2 - 16);
+ int8x8_t cols_57_s8 = vqrshrn_n_s16(cols_57_s16, DESCALE_P2 - 16);
+ /* Clamp to range [0-255]. */
+ uint8x8_t cols_02_u8 = vadd_u8(vreinterpret_u8_s8(cols_02_s8),
+ vdup_n_u8(CENTERJSAMPLE));
+ uint8x8_t cols_13_u8 = vadd_u8(vreinterpret_u8_s8(cols_13_s8),
+ vdup_n_u8(CENTERJSAMPLE));
+ uint8x8_t cols_46_u8 = vadd_u8(vreinterpret_u8_s8(cols_46_s8),
+ vdup_n_u8(CENTERJSAMPLE));
+ uint8x8_t cols_57_u8 = vadd_u8(vreinterpret_u8_s8(cols_57_s8),
+ vdup_n_u8(CENTERJSAMPLE));
+
+ /* Transpose 4x8 block and store to memory. (Zipping adjacent columns
+ * together allows us to store 16-bit elements.)
+ */
+ uint8x8x2_t cols_01_23 = vzip_u8(cols_02_u8, cols_13_u8);
+ uint8x8x2_t cols_45_67 = vzip_u8(cols_46_u8, cols_57_u8);
+ uint16x4x4_t cols_01_23_45_67 = { {
+ vreinterpret_u16_u8(cols_01_23.val[0]),
+ vreinterpret_u16_u8(cols_01_23.val[1]),
+ vreinterpret_u16_u8(cols_45_67.val[0]),
+ vreinterpret_u16_u8(cols_45_67.val[1])
+ } };
+
+ JSAMPROW outptr0 = output_buf[buf_offset + 0] + output_col;
+ JSAMPROW outptr1 = output_buf[buf_offset + 1] + output_col;
+ JSAMPROW outptr2 = output_buf[buf_offset + 2] + output_col;
+ JSAMPROW outptr3 = output_buf[buf_offset + 3] + output_col;
+ /* VST4 of 16-bit elements completes the transpose. */
+ vst4_lane_u16((uint16_t *)outptr0, cols_01_23_45_67, 0);
+ vst4_lane_u16((uint16_t *)outptr1, cols_01_23_45_67, 1);
+ vst4_lane_u16((uint16_t *)outptr2, cols_01_23_45_67, 2);
+ vst4_lane_u16((uint16_t *)outptr3, cols_01_23_45_67, 3);
+}
+
+
+/* Perform the second pass of the accurate inverse DCT on a 4x8 block
+ * of coefficients.
+ *
+ * This "sparse" version assumes that the coefficient values (after the first
+ * pass) in rows 4-7 are all 0. This simplifies the IDCT calculation,
+ * accelerating overall performance.
+ */
+
+static INLINE void jsimd_idct_islow_pass2_sparse(int16_t *workspace,
+ JSAMPARRAY output_buf,
+ JDIMENSION output_col,
+ unsigned buf_offset)
+{
+ /* Load constants for IDCT computation. */
+#ifdef HAVE_VLD1_S16_X3
+ const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
+#else
+ const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts);
+ const int16x4_t consts2 = vld1_s16(jsimd_idct_islow_neon_consts + 4);
+ const int16x4_t consts3 = vld1_s16(jsimd_idct_islow_neon_consts + 8);
+ const int16x4x3_t consts = { { consts1, consts2, consts3 } };
+#endif
+
+ /* Even part (z3 is all 0) */
+ int16x4_t z2_s16 = vld1_s16(workspace + 2 * DCTSIZE / 2);
+
+ int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1);
+ int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2);
+
+ z2_s16 = vld1_s16(workspace + 0 * DCTSIZE / 2);
+ int32x4_t tmp0 = vshll_n_s16(z2_s16, CONST_BITS);
+ int32x4_t tmp1 = vshll_n_s16(z2_s16, CONST_BITS);
+
+ int32x4_t tmp10 = vaddq_s32(tmp0, tmp3);
+ int32x4_t tmp13 = vsubq_s32(tmp0, tmp3);
+ int32x4_t tmp11 = vaddq_s32(tmp1, tmp2);
+ int32x4_t tmp12 = vsubq_s32(tmp1, tmp2);
+
+ /* Odd part (tmp0 and tmp1 are both all 0) */
+ int16x4_t tmp2_s16 = vld1_s16(workspace + 3 * DCTSIZE / 2);
+ int16x4_t tmp3_s16 = vld1_s16(workspace + 1 * DCTSIZE / 2);
+
+ int16x4_t z3_s16 = tmp2_s16;
+ int16x4_t z4_s16 = tmp3_s16;
+
+ int32x4_t z3 = vmull_lane_s16(z3_s16, consts.val[2], 3);
+ z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3);
+ int32x4_t z4 = vmull_lane_s16(z3_s16, consts.val[1], 3);
+ z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0);
+
+ tmp0 = vmlsl_lane_s16(z3, tmp3_s16, consts.val[0], 0);
+ tmp1 = vmlsl_lane_s16(z4, tmp2_s16, consts.val[0], 2);
+ tmp2 = vmlal_lane_s16(z3, tmp2_s16, consts.val[2], 2);
+ tmp3 = vmlal_lane_s16(z4, tmp3_s16, consts.val[1], 0);
+
+ /* Final output stage: descale and narrow to 16-bit. */
+ int16x8_t cols_02_s16 = vcombine_s16(vaddhn_s32(tmp10, tmp3),
+ vaddhn_s32(tmp12, tmp1));
+ int16x8_t cols_13_s16 = vcombine_s16(vaddhn_s32(tmp11, tmp2),
+ vaddhn_s32(tmp13, tmp0));
+ int16x8_t cols_46_s16 = vcombine_s16(vsubhn_s32(tmp13, tmp0),
+ vsubhn_s32(tmp11, tmp2));
+ int16x8_t cols_57_s16 = vcombine_s16(vsubhn_s32(tmp12, tmp1),
+ vsubhn_s32(tmp10, tmp3));
+ /* Descale and narrow to 8-bit. */
+ int8x8_t cols_02_s8 = vqrshrn_n_s16(cols_02_s16, DESCALE_P2 - 16);
+ int8x8_t cols_13_s8 = vqrshrn_n_s16(cols_13_s16, DESCALE_P2 - 16);
+ int8x8_t cols_46_s8 = vqrshrn_n_s16(cols_46_s16, DESCALE_P2 - 16);
+ int8x8_t cols_57_s8 = vqrshrn_n_s16(cols_57_s16, DESCALE_P2 - 16);
+ /* Clamp to range [0-255]. */
+ uint8x8_t cols_02_u8 = vadd_u8(vreinterpret_u8_s8(cols_02_s8),
+ vdup_n_u8(CENTERJSAMPLE));
+ uint8x8_t cols_13_u8 = vadd_u8(vreinterpret_u8_s8(cols_13_s8),
+ vdup_n_u8(CENTERJSAMPLE));
+ uint8x8_t cols_46_u8 = vadd_u8(vreinterpret_u8_s8(cols_46_s8),
+ vdup_n_u8(CENTERJSAMPLE));
+ uint8x8_t cols_57_u8 = vadd_u8(vreinterpret_u8_s8(cols_57_s8),
+ vdup_n_u8(CENTERJSAMPLE));
+
+ /* Transpose 4x8 block and store to memory. (Zipping adjacent columns
+ * together allows us to store 16-bit elements.)
+ */
+ uint8x8x2_t cols_01_23 = vzip_u8(cols_02_u8, cols_13_u8);
+ uint8x8x2_t cols_45_67 = vzip_u8(cols_46_u8, cols_57_u8);
+ uint16x4x4_t cols_01_23_45_67 = { {
+ vreinterpret_u16_u8(cols_01_23.val[0]),
+ vreinterpret_u16_u8(cols_01_23.val[1]),
+ vreinterpret_u16_u8(cols_45_67.val[0]),
+ vreinterpret_u16_u8(cols_45_67.val[1])
+ } };
+
+ JSAMPROW outptr0 = output_buf[buf_offset + 0] + output_col;
+ JSAMPROW outptr1 = output_buf[buf_offset + 1] + output_col;
+ JSAMPROW outptr2 = output_buf[buf_offset + 2] + output_col;
+ JSAMPROW outptr3 = output_buf[buf_offset + 3] + output_col;
+ /* VST4 of 16-bit elements completes the transpose. */
+ vst4_lane_u16((uint16_t *)outptr0, cols_01_23_45_67, 0);
+ vst4_lane_u16((uint16_t *)outptr1, cols_01_23_45_67, 1);
+ vst4_lane_u16((uint16_t *)outptr2, cols_01_23_45_67, 2);
+ vst4_lane_u16((uint16_t *)outptr3, cols_01_23_45_67, 3);
+}
diff --git a/media/libjpeg/simd/arm/jidctred-neon.c b/media/libjpeg/simd/arm/jidctred-neon.c
new file mode 100644
index 0000000000..be9627e61d
--- /dev/null
+++ b/media/libjpeg/simd/arm/jidctred-neon.c
@@ -0,0 +1,486 @@
+/*
+ * jidctred-neon.c - reduced-size IDCT (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+#include "neon-compat.h"
+
+#include <arm_neon.h>
+
+
+#define CONST_BITS 13
+#define PASS1_BITS 2
+
+#define F_0_211 1730
+#define F_0_509 4176
+#define F_0_601 4926
+#define F_0_720 5906
+#define F_0_765 6270
+#define F_0_850 6967
+#define F_0_899 7373
+#define F_1_061 8697
+#define F_1_272 10426
+#define F_1_451 11893
+#define F_1_847 15137
+#define F_2_172 17799
+#define F_2_562 20995
+#define F_3_624 29692
+
+
+/* jsimd_idct_2x2_neon() is an inverse DCT function that produces reduced-size
+ * 2x2 output from an 8x8 DCT block. It uses the same calculations and
+ * produces exactly the same output as IJG's original jpeg_idct_2x2() function
+ * from jpeg-6b, which can be found in jidctred.c.
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ * 0.720959822 = 5906 * 2^-13
+ * 0.850430095 = 6967 * 2^-13
+ * 1.272758580 = 10426 * 2^-13
+ * 3.624509785 = 29692 * 2^-13
+ *
+ * See jidctred.c for further details of the 2x2 IDCT algorithm. Where
+ * possible, the variable names and comments here in jsimd_idct_2x2_neon()
+ * match up with those in jpeg_idct_2x2().
+ */
+
+ALIGN(16) static const int16_t jsimd_idct_2x2_neon_consts[] = {
+ -F_0_720, F_0_850, -F_1_272, F_3_624
+};
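+
+/* Per column, pass 1 of the 2x2 IDCT therefore reduces to (illustrative only;
+ * pass 2 applies the same arithmetic to the two rows that survive pass 1):
+ *
+ *   even = dc << (CONST_BITS + 2);
+ *   odd  = 3.624509785 * r1 - 1.272758580 * r3 +
+ *          0.850430095 * r5 - 0.720959822 * r7;   (scaled by 2^CONST_BITS)
+ *   out0 = descale(even + odd);
+ *   out1 = descale(even - odd);
+ */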
+
+void jsimd_idct_2x2_neon(void *dct_table, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+ ISLOW_MULT_TYPE *quantptr = dct_table;
+
+ /* Load DCT coefficients. */
+ int16x8_t row0 = vld1q_s16(coef_block + 0 * DCTSIZE);
+ int16x8_t row1 = vld1q_s16(coef_block + 1 * DCTSIZE);
+ int16x8_t row3 = vld1q_s16(coef_block + 3 * DCTSIZE);
+ int16x8_t row5 = vld1q_s16(coef_block + 5 * DCTSIZE);
+ int16x8_t row7 = vld1q_s16(coef_block + 7 * DCTSIZE);
+
+ /* Load quantization table values. */
+ int16x8_t quant_row0 = vld1q_s16(quantptr + 0 * DCTSIZE);
+ int16x8_t quant_row1 = vld1q_s16(quantptr + 1 * DCTSIZE);
+ int16x8_t quant_row3 = vld1q_s16(quantptr + 3 * DCTSIZE);
+ int16x8_t quant_row5 = vld1q_s16(quantptr + 5 * DCTSIZE);
+ int16x8_t quant_row7 = vld1q_s16(quantptr + 7 * DCTSIZE);
+
+ /* Dequantize DCT coefficients. */
+ row0 = vmulq_s16(row0, quant_row0);
+ row1 = vmulq_s16(row1, quant_row1);
+ row3 = vmulq_s16(row3, quant_row3);
+ row5 = vmulq_s16(row5, quant_row5);
+ row7 = vmulq_s16(row7, quant_row7);
+
+ /* Load IDCT conversion constants. */
+ const int16x4_t consts = vld1_s16(jsimd_idct_2x2_neon_consts);
+
+ /* Pass 1: process columns from input, put results in vectors row0 and
+ * row1.
+ */
+
+ /* Even part */
+ int32x4_t tmp10_l = vshll_n_s16(vget_low_s16(row0), CONST_BITS + 2);
+ int32x4_t tmp10_h = vshll_n_s16(vget_high_s16(row0), CONST_BITS + 2);
+
+ /* Odd part */
+ int32x4_t tmp0_l = vmull_lane_s16(vget_low_s16(row1), consts, 3);
+ tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(row3), consts, 2);
+ tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(row5), consts, 1);
+ tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(row7), consts, 0);
+ int32x4_t tmp0_h = vmull_lane_s16(vget_high_s16(row1), consts, 3);
+ tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(row3), consts, 2);
+ tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(row5), consts, 1);
+ tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(row7), consts, 0);
+
+ /* Final output stage: descale and narrow to 16-bit. */
+ row0 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp10_l, tmp0_l), CONST_BITS),
+ vrshrn_n_s32(vaddq_s32(tmp10_h, tmp0_h), CONST_BITS));
+ row1 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp10_l, tmp0_l), CONST_BITS),
+ vrshrn_n_s32(vsubq_s32(tmp10_h, tmp0_h), CONST_BITS));
+
+ /* Transpose two rows, ready for second pass. */
+ int16x8x2_t cols_0246_1357 = vtrnq_s16(row0, row1);
+ int16x8_t cols_0246 = cols_0246_1357.val[0];
+ int16x8_t cols_1357 = cols_0246_1357.val[1];
+ /* Duplicate columns such that each is accessible in its own vector. */
+ int32x4x2_t cols_1155_3377 = vtrnq_s32(vreinterpretq_s32_s16(cols_1357),
+ vreinterpretq_s32_s16(cols_1357));
+ int16x8_t cols_1155 = vreinterpretq_s16_s32(cols_1155_3377.val[0]);
+ int16x8_t cols_3377 = vreinterpretq_s16_s32(cols_1155_3377.val[1]);
+
+ /* Pass 2: process two rows, store to output array. */
+
+ /* Even part: we're only interested in col0; the top half of tmp10 is "don't
+ * care."
+ */
+ int32x4_t tmp10 = vshll_n_s16(vget_low_s16(cols_0246), CONST_BITS + 2);
+
+ /* Odd part: we're only interested in the bottom half of tmp0. */
+ int32x4_t tmp0 = vmull_lane_s16(vget_low_s16(cols_1155), consts, 3);
+ tmp0 = vmlal_lane_s16(tmp0, vget_low_s16(cols_3377), consts, 2);
+ tmp0 = vmlal_lane_s16(tmp0, vget_high_s16(cols_1155), consts, 1);
+ tmp0 = vmlal_lane_s16(tmp0, vget_high_s16(cols_3377), consts, 0);
+
+ /* Final output stage: descale and clamp to range [0-255]. */
+ int16x8_t output_s16 = vcombine_s16(vaddhn_s32(tmp10, tmp0),
+ vsubhn_s32(tmp10, tmp0));
+ output_s16 = vrsraq_n_s16(vdupq_n_s16(CENTERJSAMPLE), output_s16,
+ CONST_BITS + PASS1_BITS + 3 + 2 - 16);
+ /* Narrow to 8-bit and convert to unsigned. */
+ uint8x8_t output_u8 = vqmovun_s16(output_s16);
+
+ /* Store 2x2 block to memory. */
+ vst1_lane_u8(output_buf[0] + output_col, output_u8, 0);
+ vst1_lane_u8(output_buf[1] + output_col, output_u8, 1);
+ vst1_lane_u8(output_buf[0] + output_col + 1, output_u8, 4);
+ vst1_lane_u8(output_buf[1] + output_col + 1, output_u8, 5);
+}
+
+
+/* jsimd_idct_4x4_neon() is an inverse DCT function that produces reduced-size
+ * 4x4 output from an 8x8 DCT block. It uses the same calculations and
+ * produces exactly the same output as IJG's original jpeg_idct_4x4() function
+ * from jpeg-6b, which can be found in jidctred.c.
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ * 0.211164243 = 1730 * 2^-13
+ * 0.509795579 = 4176 * 2^-13
+ * 0.601344887 = 4926 * 2^-13
+ * 0.765366865 = 6270 * 2^-13
+ * 0.899976223 = 7373 * 2^-13
+ * 1.061594337 = 8697 * 2^-13
+ * 1.451774981 = 11893 * 2^-13
+ * 1.847759065 = 15137 * 2^-13
+ * 2.172734803 = 17799 * 2^-13
+ * 2.562915447 = 20995 * 2^-13
+ *
+ * See jidctred.c for further details of the 4x4 IDCT algorithm. Where
+ * possible, the variable names and comments here in jsimd_idct_4x4_neon()
+ * match up with those in jpeg_idct_4x4().
+ */
+
+ALIGN(16) static const int16_t jsimd_idct_4x4_neon_consts[] = {
+ F_1_847, -F_0_765, -F_0_211, F_1_451,
+ -F_2_172, F_1_061, -F_0_509, -F_0_601,
+ F_0_899, F_2_562, 0, 0
+};
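+
+/* (The code below indexes this table by vector and lane: for example,
+ * consts.val[0] lane 0 is F_1_847 and consts.val[2] lane 1 is F_2_562.)
+ */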
+
+void jsimd_idct_4x4_neon(void *dct_table, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+ ISLOW_MULT_TYPE *quantptr = dct_table;
+
+ /* Load DCT coefficients. */
+ int16x8_t row0 = vld1q_s16(coef_block + 0 * DCTSIZE);
+ int16x8_t row1 = vld1q_s16(coef_block + 1 * DCTSIZE);
+ int16x8_t row2 = vld1q_s16(coef_block + 2 * DCTSIZE);
+ int16x8_t row3 = vld1q_s16(coef_block + 3 * DCTSIZE);
+ int16x8_t row5 = vld1q_s16(coef_block + 5 * DCTSIZE);
+ int16x8_t row6 = vld1q_s16(coef_block + 6 * DCTSIZE);
+ int16x8_t row7 = vld1q_s16(coef_block + 7 * DCTSIZE);
+
+ /* Load quantization table values for DC coefficients. */
+ int16x8_t quant_row0 = vld1q_s16(quantptr + 0 * DCTSIZE);
+ /* Dequantize DC coefficients. */
+ row0 = vmulq_s16(row0, quant_row0);
+
+ /* Construct bitmap to test if all AC coefficients are 0. */
+ int16x8_t bitmap = vorrq_s16(row1, row2);
+ bitmap = vorrq_s16(bitmap, row3);
+ bitmap = vorrq_s16(bitmap, row5);
+ bitmap = vorrq_s16(bitmap, row6);
+ bitmap = vorrq_s16(bitmap, row7);
+
+ int64_t left_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 0);
+ int64_t right_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 1);
+
+ /* Load constants for IDCT computation. */
+#ifdef HAVE_VLD1_S16_X3
+ const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_4x4_neon_consts);
+#else
+ /* GCC does not currently support the intrinsic vld1_<type>_x3(). */
+ const int16x4_t consts1 = vld1_s16(jsimd_idct_4x4_neon_consts);
+ const int16x4_t consts2 = vld1_s16(jsimd_idct_4x4_neon_consts + 4);
+ const int16x4_t consts3 = vld1_s16(jsimd_idct_4x4_neon_consts + 8);
+ const int16x4x3_t consts = { { consts1, consts2, consts3 } };
+#endif
+
+ if (left_ac_bitmap == 0 && right_ac_bitmap == 0) {
+ /* All AC coefficients are zero.
+ * Compute DC values and duplicate into row vectors 0, 1, 2, and 3.
+ */
+ int16x8_t dcval = vshlq_n_s16(row0, PASS1_BITS);
+ row0 = dcval;
+ row1 = dcval;
+ row2 = dcval;
+ row3 = dcval;
+ } else if (left_ac_bitmap == 0) {
+ /* AC coefficients are zero for columns 0, 1, 2, and 3.
+ * Compute DC values for these columns.
+ */
+ int16x4_t dcval = vshl_n_s16(vget_low_s16(row0), PASS1_BITS);
+
+ /* Commence regular IDCT computation for columns 4, 5, 6, and 7. */
+
+ /* Load quantization table. */
+ int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE + 4);
+ int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE + 4);
+ int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE + 4);
+ int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE + 4);
+ int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE + 4);
+ int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE + 4);
+
+ /* Even part */
+ int32x4_t tmp0 = vshll_n_s16(vget_high_s16(row0), CONST_BITS + 1);
+
+ int16x4_t z2 = vmul_s16(vget_high_s16(row2), quant_row2);
+ int16x4_t z3 = vmul_s16(vget_high_s16(row6), quant_row6);
+
+ int32x4_t tmp2 = vmull_lane_s16(z2, consts.val[0], 0);
+ tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[0], 1);
+
+ int32x4_t tmp10 = vaddq_s32(tmp0, tmp2);
+ int32x4_t tmp12 = vsubq_s32(tmp0, tmp2);
+
+ /* Odd part */
+ int16x4_t z1 = vmul_s16(vget_high_s16(row7), quant_row7);
+ z2 = vmul_s16(vget_high_s16(row5), quant_row5);
+ z3 = vmul_s16(vget_high_s16(row3), quant_row3);
+ int16x4_t z4 = vmul_s16(vget_high_s16(row1), quant_row1);
+
+ tmp0 = vmull_lane_s16(z1, consts.val[0], 2);
+ tmp0 = vmlal_lane_s16(tmp0, z2, consts.val[0], 3);
+ tmp0 = vmlal_lane_s16(tmp0, z3, consts.val[1], 0);
+ tmp0 = vmlal_lane_s16(tmp0, z4, consts.val[1], 1);
+
+ tmp2 = vmull_lane_s16(z1, consts.val[1], 2);
+ tmp2 = vmlal_lane_s16(tmp2, z2, consts.val[1], 3);
+ tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[2], 0);
+ tmp2 = vmlal_lane_s16(tmp2, z4, consts.val[2], 1);
+
+ /* Final output stage: descale and narrow to 16-bit. */
+ row0 = vcombine_s16(dcval, vrshrn_n_s32(vaddq_s32(tmp10, tmp2),
+ CONST_BITS - PASS1_BITS + 1));
+ row3 = vcombine_s16(dcval, vrshrn_n_s32(vsubq_s32(tmp10, tmp2),
+ CONST_BITS - PASS1_BITS + 1));
+ row1 = vcombine_s16(dcval, vrshrn_n_s32(vaddq_s32(tmp12, tmp0),
+ CONST_BITS - PASS1_BITS + 1));
+ row2 = vcombine_s16(dcval, vrshrn_n_s32(vsubq_s32(tmp12, tmp0),
+ CONST_BITS - PASS1_BITS + 1));
+ } else if (right_ac_bitmap == 0) {
+ /* AC coefficients are zero for columns 4, 5, 6, and 7.
+ * Compute DC values for these columns.
+ */
+ int16x4_t dcval = vshl_n_s16(vget_high_s16(row0), PASS1_BITS);
+
+ /* Commence regular IDCT computation for columns 0, 1, 2, and 3. */
+
+ /* Load quantization table. */
+ int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE);
+ int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE);
+ int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE);
+ int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE);
+ int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE);
+ int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE);
+
+ /* Even part */
+ int32x4_t tmp0 = vshll_n_s16(vget_low_s16(row0), CONST_BITS + 1);
+
+ int16x4_t z2 = vmul_s16(vget_low_s16(row2), quant_row2);
+ int16x4_t z3 = vmul_s16(vget_low_s16(row6), quant_row6);
+
+ int32x4_t tmp2 = vmull_lane_s16(z2, consts.val[0], 0);
+ tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[0], 1);
+
+ int32x4_t tmp10 = vaddq_s32(tmp0, tmp2);
+ int32x4_t tmp12 = vsubq_s32(tmp0, tmp2);
+
+ /* Odd part */
+ int16x4_t z1 = vmul_s16(vget_low_s16(row7), quant_row7);
+ z2 = vmul_s16(vget_low_s16(row5), quant_row5);
+ z3 = vmul_s16(vget_low_s16(row3), quant_row3);
+ int16x4_t z4 = vmul_s16(vget_low_s16(row1), quant_row1);
+
+ tmp0 = vmull_lane_s16(z1, consts.val[0], 2);
+ tmp0 = vmlal_lane_s16(tmp0, z2, consts.val[0], 3);
+ tmp0 = vmlal_lane_s16(tmp0, z3, consts.val[1], 0);
+ tmp0 = vmlal_lane_s16(tmp0, z4, consts.val[1], 1);
+
+ tmp2 = vmull_lane_s16(z1, consts.val[1], 2);
+ tmp2 = vmlal_lane_s16(tmp2, z2, consts.val[1], 3);
+ tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[2], 0);
+ tmp2 = vmlal_lane_s16(tmp2, z4, consts.val[2], 1);
+
+ /* Final output stage: descale and narrow to 16-bit. */
+ row0 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp10, tmp2),
+ CONST_BITS - PASS1_BITS + 1), dcval);
+ row3 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp10, tmp2),
+ CONST_BITS - PASS1_BITS + 1), dcval);
+ row1 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp12, tmp0),
+ CONST_BITS - PASS1_BITS + 1), dcval);
+ row2 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp12, tmp0),
+ CONST_BITS - PASS1_BITS + 1), dcval);
+ } else {
+ /* All AC coefficients are non-zero; full IDCT calculation required. */
+ int16x8_t quant_row1 = vld1q_s16(quantptr + 1 * DCTSIZE);
+ int16x8_t quant_row2 = vld1q_s16(quantptr + 2 * DCTSIZE);
+ int16x8_t quant_row3 = vld1q_s16(quantptr + 3 * DCTSIZE);
+ int16x8_t quant_row5 = vld1q_s16(quantptr + 5 * DCTSIZE);
+ int16x8_t quant_row6 = vld1q_s16(quantptr + 6 * DCTSIZE);
+ int16x8_t quant_row7 = vld1q_s16(quantptr + 7 * DCTSIZE);
+
+ /* Even part */
+ int32x4_t tmp0_l = vshll_n_s16(vget_low_s16(row0), CONST_BITS + 1);
+ int32x4_t tmp0_h = vshll_n_s16(vget_high_s16(row0), CONST_BITS + 1);
+
+ int16x8_t z2 = vmulq_s16(row2, quant_row2);
+ int16x8_t z3 = vmulq_s16(row6, quant_row6);
+
+ int32x4_t tmp2_l = vmull_lane_s16(vget_low_s16(z2), consts.val[0], 0);
+ int32x4_t tmp2_h = vmull_lane_s16(vget_high_s16(z2), consts.val[0], 0);
+ tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z3), consts.val[0], 1);
+ tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z3), consts.val[0], 1);
+
+ int32x4_t tmp10_l = vaddq_s32(tmp0_l, tmp2_l);
+ int32x4_t tmp10_h = vaddq_s32(tmp0_h, tmp2_h);
+ int32x4_t tmp12_l = vsubq_s32(tmp0_l, tmp2_l);
+ int32x4_t tmp12_h = vsubq_s32(tmp0_h, tmp2_h);
+
+ /* Odd part */
+ int16x8_t z1 = vmulq_s16(row7, quant_row7);
+ z2 = vmulq_s16(row5, quant_row5);
+ z3 = vmulq_s16(row3, quant_row3);
+ int16x8_t z4 = vmulq_s16(row1, quant_row1);
+
+ tmp0_l = vmull_lane_s16(vget_low_s16(z1), consts.val[0], 2);
+ tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(z2), consts.val[0], 3);
+ tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(z3), consts.val[1], 0);
+ tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(z4), consts.val[1], 1);
+ tmp0_h = vmull_lane_s16(vget_high_s16(z1), consts.val[0], 2);
+ tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(z2), consts.val[0], 3);
+ tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(z3), consts.val[1], 0);
+ tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(z4), consts.val[1], 1);
+
+ tmp2_l = vmull_lane_s16(vget_low_s16(z1), consts.val[1], 2);
+ tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z2), consts.val[1], 3);
+ tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z3), consts.val[2], 0);
+ tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z4), consts.val[2], 1);
+ tmp2_h = vmull_lane_s16(vget_high_s16(z1), consts.val[1], 2);
+ tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z2), consts.val[1], 3);
+ tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z3), consts.val[2], 0);
+ tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z4), consts.val[2], 1);
+
+ /* Final output stage: descale and narrow to 16-bit. */
+ row0 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp10_l, tmp2_l),
+ CONST_BITS - PASS1_BITS + 1),
+ vrshrn_n_s32(vaddq_s32(tmp10_h, tmp2_h),
+ CONST_BITS - PASS1_BITS + 1));
+ row3 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp10_l, tmp2_l),
+ CONST_BITS - PASS1_BITS + 1),
+ vrshrn_n_s32(vsubq_s32(tmp10_h, tmp2_h),
+ CONST_BITS - PASS1_BITS + 1));
+ row1 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp12_l, tmp0_l),
+ CONST_BITS - PASS1_BITS + 1),
+ vrshrn_n_s32(vaddq_s32(tmp12_h, tmp0_h),
+ CONST_BITS - PASS1_BITS + 1));
+ row2 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp12_l, tmp0_l),
+ CONST_BITS - PASS1_BITS + 1),
+ vrshrn_n_s32(vsubq_s32(tmp12_h, tmp0_h),
+ CONST_BITS - PASS1_BITS + 1));
+ }
+
+ /* Transpose 8x4 block to perform IDCT on rows in second pass. */
+ int16x8x2_t row_01 = vtrnq_s16(row0, row1);
+ int16x8x2_t row_23 = vtrnq_s16(row2, row3);
+
+ int32x4x2_t cols_0426 = vtrnq_s32(vreinterpretq_s32_s16(row_01.val[0]),
+ vreinterpretq_s32_s16(row_23.val[0]));
+ int32x4x2_t cols_1537 = vtrnq_s32(vreinterpretq_s32_s16(row_01.val[1]),
+ vreinterpretq_s32_s16(row_23.val[1]));
+
+ int16x4_t col0 = vreinterpret_s16_s32(vget_low_s32(cols_0426.val[0]));
+ int16x4_t col1 = vreinterpret_s16_s32(vget_low_s32(cols_1537.val[0]));
+ int16x4_t col2 = vreinterpret_s16_s32(vget_low_s32(cols_0426.val[1]));
+ int16x4_t col3 = vreinterpret_s16_s32(vget_low_s32(cols_1537.val[1]));
+ int16x4_t col5 = vreinterpret_s16_s32(vget_high_s32(cols_1537.val[0]));
+ int16x4_t col6 = vreinterpret_s16_s32(vget_high_s32(cols_0426.val[1]));
+ int16x4_t col7 = vreinterpret_s16_s32(vget_high_s32(cols_1537.val[1]));
+
+ /* Commence second pass of IDCT. */
+
+ /* Even part */
+ int32x4_t tmp0 = vshll_n_s16(col0, CONST_BITS + 1);
+ int32x4_t tmp2 = vmull_lane_s16(col2, consts.val[0], 0);
+ tmp2 = vmlal_lane_s16(tmp2, col6, consts.val[0], 1);
+
+ int32x4_t tmp10 = vaddq_s32(tmp0, tmp2);
+ int32x4_t tmp12 = vsubq_s32(tmp0, tmp2);
+
+ /* Odd part */
+ tmp0 = vmull_lane_s16(col7, consts.val[0], 2);
+ tmp0 = vmlal_lane_s16(tmp0, col5, consts.val[0], 3);
+ tmp0 = vmlal_lane_s16(tmp0, col3, consts.val[1], 0);
+ tmp0 = vmlal_lane_s16(tmp0, col1, consts.val[1], 1);
+
+ tmp2 = vmull_lane_s16(col7, consts.val[1], 2);
+ tmp2 = vmlal_lane_s16(tmp2, col5, consts.val[1], 3);
+ tmp2 = vmlal_lane_s16(tmp2, col3, consts.val[2], 0);
+ tmp2 = vmlal_lane_s16(tmp2, col1, consts.val[2], 1);
+
+ /* Final output stage: descale and clamp to range [0-255]. */
+ int16x8_t output_cols_02 = vcombine_s16(vaddhn_s32(tmp10, tmp2),
+ vsubhn_s32(tmp12, tmp0));
+ int16x8_t output_cols_13 = vcombine_s16(vaddhn_s32(tmp12, tmp0),
+ vsubhn_s32(tmp10, tmp2));
+ output_cols_02 = vrsraq_n_s16(vdupq_n_s16(CENTERJSAMPLE), output_cols_02,
+ CONST_BITS + PASS1_BITS + 3 + 1 - 16);
+ output_cols_13 = vrsraq_n_s16(vdupq_n_s16(CENTERJSAMPLE), output_cols_13,
+ CONST_BITS + PASS1_BITS + 3 + 1 - 16);
+ /* Narrow to 8-bit and convert to unsigned while zipping 8-bit elements.
+ * An interleaving store completes the transpose.
+ */
+ uint8x8x2_t output_0123 = vzip_u8(vqmovun_s16(output_cols_02),
+ vqmovun_s16(output_cols_13));
+ uint16x4x2_t output_01_23 = { {
+ vreinterpret_u16_u8(output_0123.val[0]),
+ vreinterpret_u16_u8(output_0123.val[1])
+ } };
+
+ /* Store 4x4 block to memory. */
+ JSAMPROW outptr0 = output_buf[0] + output_col;
+ JSAMPROW outptr1 = output_buf[1] + output_col;
+ JSAMPROW outptr2 = output_buf[2] + output_col;
+ JSAMPROW outptr3 = output_buf[3] + output_col;
+ vst2_lane_u16((uint16_t *)outptr0, output_01_23, 0);
+ vst2_lane_u16((uint16_t *)outptr1, output_01_23, 1);
+ vst2_lane_u16((uint16_t *)outptr2, output_01_23, 2);
+ vst2_lane_u16((uint16_t *)outptr3, output_01_23, 3);
+}
diff --git a/media/libjpeg/simd/arm/jquanti-neon.c b/media/libjpeg/simd/arm/jquanti-neon.c
new file mode 100644
index 0000000000..d5d95d89f6
--- /dev/null
+++ b/media/libjpeg/simd/arm/jquanti-neon.c
@@ -0,0 +1,193 @@
+/*
+ * jquanti-neon.c - sample data conversion and quantization (Arm Neon)
+ *
+ * Copyright (C) 2020-2021, Arm Limited. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+
+#include <arm_neon.h>
+
+
+/* After downsampling, the resulting sample values are in the range [0, 255],
+ * but the Discrete Cosine Transform (DCT) operates on values centered around
+ * 0.
+ *
+ * To prepare sample values for the DCT, load samples into a DCT workspace,
+ * subtracting CENTERJSAMPLE (128). The samples, now in the range [-128, 127],
+ * are also widened from 8- to 16-bit.
+ *
+ * The equivalent scalar C function convsamp() can be found in jcdctmgr.c.
+ */
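+
+/* A scalar sketch of that conversion (illustrative only):
+ *
+ *   for (int row = 0; row < DCTSIZE; row++)
+ *     for (int col = 0; col < DCTSIZE; col++)
+ *       workspace[row * DCTSIZE + col] =
+ *         (DCTELEM)sample_data[row][start_col + col] - CENTERJSAMPLE;
+ */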
+
+void jsimd_convsamp_neon(JSAMPARRAY sample_data, JDIMENSION start_col,
+ DCTELEM *workspace)
+{
+ uint8x8_t samp_row0 = vld1_u8(sample_data[0] + start_col);
+ uint8x8_t samp_row1 = vld1_u8(sample_data[1] + start_col);
+ uint8x8_t samp_row2 = vld1_u8(sample_data[2] + start_col);
+ uint8x8_t samp_row3 = vld1_u8(sample_data[3] + start_col);
+ uint8x8_t samp_row4 = vld1_u8(sample_data[4] + start_col);
+ uint8x8_t samp_row5 = vld1_u8(sample_data[5] + start_col);
+ uint8x8_t samp_row6 = vld1_u8(sample_data[6] + start_col);
+ uint8x8_t samp_row7 = vld1_u8(sample_data[7] + start_col);
+
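+ /* (vsubl_u8() widens to 16-bit before subtracting; reinterpreting the
+ * unsigned result as signed yields the intended values in [-128, 127],
+ * since the subtraction wraps modulo 2^16.)
+ */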
+ int16x8_t row0 =
+ vreinterpretq_s16_u16(vsubl_u8(samp_row0, vdup_n_u8(CENTERJSAMPLE)));
+ int16x8_t row1 =
+ vreinterpretq_s16_u16(vsubl_u8(samp_row1, vdup_n_u8(CENTERJSAMPLE)));
+ int16x8_t row2 =
+ vreinterpretq_s16_u16(vsubl_u8(samp_row2, vdup_n_u8(CENTERJSAMPLE)));
+ int16x8_t row3 =
+ vreinterpretq_s16_u16(vsubl_u8(samp_row3, vdup_n_u8(CENTERJSAMPLE)));
+ int16x8_t row4 =
+ vreinterpretq_s16_u16(vsubl_u8(samp_row4, vdup_n_u8(CENTERJSAMPLE)));
+ int16x8_t row5 =
+ vreinterpretq_s16_u16(vsubl_u8(samp_row5, vdup_n_u8(CENTERJSAMPLE)));
+ int16x8_t row6 =
+ vreinterpretq_s16_u16(vsubl_u8(samp_row6, vdup_n_u8(CENTERJSAMPLE)));
+ int16x8_t row7 =
+ vreinterpretq_s16_u16(vsubl_u8(samp_row7, vdup_n_u8(CENTERJSAMPLE)));
+
+ vst1q_s16(workspace + 0 * DCTSIZE, row0);
+ vst1q_s16(workspace + 1 * DCTSIZE, row1);
+ vst1q_s16(workspace + 2 * DCTSIZE, row2);
+ vst1q_s16(workspace + 3 * DCTSIZE, row3);
+ vst1q_s16(workspace + 4 * DCTSIZE, row4);
+ vst1q_s16(workspace + 5 * DCTSIZE, row5);
+ vst1q_s16(workspace + 6 * DCTSIZE, row6);
+ vst1q_s16(workspace + 7 * DCTSIZE, row7);
+}
+
+
+/* After the DCT, the resulting array of coefficient values needs to be divided
+ * by an array of quantization values.
+ *
+ * To avoid a slow division operation, the DCT coefficients are multiplied by
+ * the (scaled) reciprocals of the quantization values and then right-shifted.
+ *
+ * The equivalent scalar C function quantize() can be found in jcdctmgr.c.
+ */
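+
+/* A scalar sketch of the method for one coefficient (illustrative only):
+ *
+ *   unsigned temp = abs(coef) + correction;       (pre-bias the magnitude)
+ *   temp = (temp * reciprocal) >> 16;             (multiply by scaled 1/q)
+ *   temp >>= shift;                               (finish the descale)
+ *   out = (coef < 0) ? -(int)temp : (int)temp;    (restore the sign)
+ */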
+
+void jsimd_quantize_neon(JCOEFPTR coef_block, DCTELEM *divisors,
+ DCTELEM *workspace)
+{
+ JCOEFPTR out_ptr = coef_block;
+ UDCTELEM *recip_ptr = (UDCTELEM *)divisors;
+ UDCTELEM *corr_ptr = (UDCTELEM *)divisors + DCTSIZE2;
+ DCTELEM *shift_ptr = divisors + 3 * DCTSIZE2;
+ int i;
+
+#if defined(__clang__) && (defined(__aarch64__) || defined(_M_ARM64))
+#pragma unroll
+#endif
+ for (i = 0; i < DCTSIZE; i += DCTSIZE / 2) {
+ /* Load DCT coefficients. */
+ int16x8_t row0 = vld1q_s16(workspace + (i + 0) * DCTSIZE);
+ int16x8_t row1 = vld1q_s16(workspace + (i + 1) * DCTSIZE);
+ int16x8_t row2 = vld1q_s16(workspace + (i + 2) * DCTSIZE);
+ int16x8_t row3 = vld1q_s16(workspace + (i + 3) * DCTSIZE);
+ /* Load reciprocals of quantization values. */
+ uint16x8_t recip0 = vld1q_u16(recip_ptr + (i + 0) * DCTSIZE);
+ uint16x8_t recip1 = vld1q_u16(recip_ptr + (i + 1) * DCTSIZE);
+ uint16x8_t recip2 = vld1q_u16(recip_ptr + (i + 2) * DCTSIZE);
+ uint16x8_t recip3 = vld1q_u16(recip_ptr + (i + 3) * DCTSIZE);
+ uint16x8_t corr0 = vld1q_u16(corr_ptr + (i + 0) * DCTSIZE);
+ uint16x8_t corr1 = vld1q_u16(corr_ptr + (i + 1) * DCTSIZE);
+ uint16x8_t corr2 = vld1q_u16(corr_ptr + (i + 2) * DCTSIZE);
+ uint16x8_t corr3 = vld1q_u16(corr_ptr + (i + 3) * DCTSIZE);
+ int16x8_t shift0 = vld1q_s16(shift_ptr + (i + 0) * DCTSIZE);
+ int16x8_t shift1 = vld1q_s16(shift_ptr + (i + 1) * DCTSIZE);
+ int16x8_t shift2 = vld1q_s16(shift_ptr + (i + 2) * DCTSIZE);
+ int16x8_t shift3 = vld1q_s16(shift_ptr + (i + 3) * DCTSIZE);
+
+ /* Extract sign from coefficients. */
+ int16x8_t sign_row0 = vshrq_n_s16(row0, 15);
+ int16x8_t sign_row1 = vshrq_n_s16(row1, 15);
+ int16x8_t sign_row2 = vshrq_n_s16(row2, 15);
+ int16x8_t sign_row3 = vshrq_n_s16(row3, 15);
+ /* Get absolute value of DCT coefficients. */
+ uint16x8_t abs_row0 = vreinterpretq_u16_s16(vabsq_s16(row0));
+ uint16x8_t abs_row1 = vreinterpretq_u16_s16(vabsq_s16(row1));
+ uint16x8_t abs_row2 = vreinterpretq_u16_s16(vabsq_s16(row2));
+ uint16x8_t abs_row3 = vreinterpretq_u16_s16(vabsq_s16(row3));
+ /* Add correction. */
+ abs_row0 = vaddq_u16(abs_row0, corr0);
+ abs_row1 = vaddq_u16(abs_row1, corr1);
+ abs_row2 = vaddq_u16(abs_row2, corr2);
+ abs_row3 = vaddq_u16(abs_row3, corr3);
+
+ /* Multiply DCT coefficients by quantization reciprocals. */
+ int32x4_t row0_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row0),
+ vget_low_u16(recip0)));
+ int32x4_t row0_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row0),
+ vget_high_u16(recip0)));
+ int32x4_t row1_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row1),
+ vget_low_u16(recip1)));
+ int32x4_t row1_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row1),
+ vget_high_u16(recip1)));
+ int32x4_t row2_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row2),
+ vget_low_u16(recip2)));
+ int32x4_t row2_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row2),
+ vget_high_u16(recip2)));
+ int32x4_t row3_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row3),
+ vget_low_u16(recip3)));
+ int32x4_t row3_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row3),
+ vget_high_u16(recip3)));
+ /* Narrow back to 16-bit. */
+ row0 = vcombine_s16(vshrn_n_s32(row0_l, 16), vshrn_n_s32(row0_h, 16));
+ row1 = vcombine_s16(vshrn_n_s32(row1_l, 16), vshrn_n_s32(row1_h, 16));
+ row2 = vcombine_s16(vshrn_n_s32(row2_l, 16), vshrn_n_s32(row2_h, 16));
+ row3 = vcombine_s16(vshrn_n_s32(row3_l, 16), vshrn_n_s32(row3_h, 16));
+
+ /* Since VSHR only supports an immediate as its second argument, negate the
+ * shift value and shift left.
+ */
+ row0 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row0),
+ vnegq_s16(shift0)));
+ row1 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row1),
+ vnegq_s16(shift1)));
+ row2 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row2),
+ vnegq_s16(shift2)));
+ row3 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row3),
+ vnegq_s16(shift3)));
+
+ /* Restore sign to original product. */
+ row0 = veorq_s16(row0, sign_row0);
+ row0 = vsubq_s16(row0, sign_row0);
+ row1 = veorq_s16(row1, sign_row1);
+ row1 = vsubq_s16(row1, sign_row1);
+ row2 = veorq_s16(row2, sign_row2);
+ row2 = vsubq_s16(row2, sign_row2);
+ row3 = veorq_s16(row3, sign_row3);
+ row3 = vsubq_s16(row3, sign_row3);
+
+ /* Store quantized coefficients to memory. */
+ vst1q_s16(out_ptr + (i + 0) * DCTSIZE, row0);
+ vst1q_s16(out_ptr + (i + 1) * DCTSIZE, row1);
+ vst1q_s16(out_ptr + (i + 2) * DCTSIZE, row2);
+ vst1q_s16(out_ptr + (i + 3) * DCTSIZE, row3);
+ }
+}
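
A scalar sketch of the reciprocal trick described in the comment above (illustrative only; it assumes the UDCTELEM/UDCTELEM2 typedefs -- 16- and 32-bit unsigned -- from jcdctmgr.c, and the divisors layout the NEON loop indexes: reciprocal, correction, and shift tables of DCTSIZE2 entries each):

/* Quantize one block without division: strip the sign, add the correction,
 * multiply by the pre-scaled reciprocal, keep the high 16 bits, apply the
 * per-entry shift, then restore the sign -- mirroring the NEON loop. */
void quantize_scalar(JCOEFPTR coef_block, DCTELEM *divisors,
                     DCTELEM *workspace)
{
  UDCTELEM *recip = (UDCTELEM *)divisors;
  UDCTELEM *corr = (UDCTELEM *)divisors + DCTSIZE2;
  DCTELEM *shift = divisors + 3 * DCTSIZE2;
  int i;

  for (i = 0; i < DCTSIZE2; i++) {
    DCTELEM temp = workspace[i];
    DCTELEM sign = temp >> 15;                       /* 0 or -1 */
    UDCTELEM mag = (UDCTELEM)((temp ^ sign) - sign + corr[i]);
    UDCTELEM2 product = (UDCTELEM2)mag * recip[i];   /* 16x16 -> 32 bits */
    DCTELEM q = (DCTELEM)((product >> 16) >> shift[i]);
    coef_block[i] = (JCOEF)((q ^ sign) - sign);      /* restore the sign */
  }
}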
diff --git a/media/libjpeg/simd/arm/neon-compat.h b/media/libjpeg/simd/arm/neon-compat.h
new file mode 100644
index 0000000000..2907634e26
--- /dev/null
+++ b/media/libjpeg/simd/arm/neon-compat.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (C) 2020, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2020-2021, Arm Limited. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* Define compiler-independent count-leading-zeros and byte-swap macros */
+#if defined(_MSC_VER) && !defined(__clang__)
+#define BUILTIN_CLZ(x) _CountLeadingZeros(x)
+#define BUILTIN_CLZLL(x) _CountLeadingZeros64(x)
+#define BUILTIN_BSWAP64(x) _byteswap_uint64(x)
+#elif defined(__clang__) || defined(__GNUC__)
+#define BUILTIN_CLZ(x) __builtin_clz(x)
+#define BUILTIN_CLZLL(x) __builtin_clzll(x)
+#define BUILTIN_BSWAP64(x) __builtin_bswap64(x)
+#else
+#error "Unknown compiler"
+#endif
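
A hypothetical use of the macros above (the real consumers include the NEON Huffman encoders, which need bit lengths and byte swaps that every supported compiler spells differently):

#include <stdint.h>

/* Number of bits needed to represent a nonzero magnitude, e.g. 255 -> 8,
 * 1024 -> 11.  Like the underlying builtins, undefined for x == 0. */
static int bit_width(uint32_t x)
{
  return 32 - BUILTIN_CLZ(x);
}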
diff --git a/media/libjpeg/simd/i386/jccolext-avx2.asm b/media/libjpeg/simd/i386/jccolext-avx2.asm
new file mode 100644
index 0000000000..c46d684436
--- /dev/null
+++ b/media/libjpeg/simd/i386/jccolext-avx2.asm
@@ -0,0 +1,578 @@
+;
+; jccolext.asm - colorspace conversion (AVX2)
+;
+; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_ycc_convert_avx2(JDIMENSION img_width, JSAMPARRAY input_buf,
+; JSAMPIMAGE output_buf, JDIMENSION output_row,
+; int num_rows);
+;
+
+%define img_width(b) (b) + 8 ; JDIMENSION img_width
+%define input_buf(b) (b) + 12 ; JSAMPARRAY input_buf
+%define output_buf(b) (b) + 16 ; JSAMPIMAGE output_buf
+%define output_row(b) (b) + 20 ; JDIMENSION output_row
+%define num_rows(b) (b) + 24 ; int num_rows
+
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD
+ ; ymmword wk[WK_NUM]
+%define WK_NUM 8
+%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_avx2)
+
+EXTN(jsimd_rgb_ycc_convert_avx2):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_YMMWORD) ; align to 256 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)]
+ pushpic eax ; make room for the GOT address
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+ movpic POINTER [gotptr], ebx ; save GOT address
+
+ mov ecx, JDIMENSION [img_width(eax)]
+ test ecx, ecx
+ jz near .return
+
+ push ecx
+
+ mov esi, JSAMPIMAGE [output_buf(eax)]
+ mov ecx, JDIMENSION [output_row(eax)]
+ mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
+ mov ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY]
+ mov edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY]
+ lea edi, [edi+ecx*SIZEOF_JSAMPROW]
+ lea ebx, [ebx+ecx*SIZEOF_JSAMPROW]
+ lea edx, [edx+ecx*SIZEOF_JSAMPROW]
+
+ pop ecx
+
+ mov esi, JSAMPARRAY [input_buf(eax)]
+ mov eax, INT [num_rows(eax)]
+ test eax, eax
+ jle near .return
+ alignx 16, 7
+.rowloop:
+ pushpic eax
+ push edx
+ push ebx
+ push edi
+ push esi
+ push ecx ; col
+
+ mov esi, JSAMPROW [esi] ; inptr
+ mov edi, JSAMPROW [edi] ; outptr0
+ mov ebx, JSAMPROW [ebx] ; outptr1
+ mov edx, JSAMPROW [edx] ; outptr2
+ movpic eax, POINTER [gotptr] ; load GOT address (eax)
+
+ cmp ecx, byte SIZEOF_YMMWORD
+ jae near .columnloop
+ alignx 16, 7
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+.column_ld1:
+ push eax
+ push edx
+ lea ecx, [ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE
+ test cl, SIZEOF_BYTE
+ jz short .column_ld2
+ sub ecx, byte SIZEOF_BYTE
+ movzx eax, byte [esi+ecx]
+.column_ld2:
+ test cl, SIZEOF_WORD
+ jz short .column_ld4
+ sub ecx, byte SIZEOF_WORD
+ movzx edx, word [esi+ecx]
+ shl eax, WORD_BIT
+ or eax, edx
+.column_ld4:
+ vmovd xmmA, eax
+ pop edx
+ pop eax
+ test cl, SIZEOF_DWORD
+ jz short .column_ld8
+ sub ecx, byte SIZEOF_DWORD
+ vmovd xmmF, XMM_DWORD [esi+ecx]
+ vpslldq xmmA, xmmA, SIZEOF_DWORD
+ vpor xmmA, xmmA, xmmF
+.column_ld8:
+ test cl, SIZEOF_MMWORD
+ jz short .column_ld16
+ sub ecx, byte SIZEOF_MMWORD
+ vmovq xmmB, XMM_MMWORD [esi+ecx]
+ vpslldq xmmA, xmmA, SIZEOF_MMWORD
+ vpor xmmA, xmmA, xmmB
+.column_ld16:
+ test cl, SIZEOF_XMMWORD
+ jz short .column_ld32
+ sub ecx, byte SIZEOF_XMMWORD
+ vmovdqu xmmB, XMM_MMWORD [esi+ecx]
+ vperm2i128 ymmA, ymmA, ymmA, 1
+ vpor ymmA, ymmB
+.column_ld32:
+ test cl, SIZEOF_YMMWORD
+ jz short .column_ld64
+ sub ecx, byte SIZEOF_YMMWORD
+ vmovdqa ymmF, ymmA
+ vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
+.column_ld64:
+ test cl, 2*SIZEOF_YMMWORD
+ mov ecx, SIZEOF_YMMWORD
+ jz short .rgb_ycc_cnv
+ vmovdqa ymmB, ymmA
+ vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
+ vmovdqu ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
+ jmp short .rgb_ycc_cnv
+ alignx 16, 7
+
+.columnloop:
+ vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
+ vmovdqu ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
+ vmovdqu ymmB, YMMWORD [esi+2*SIZEOF_YMMWORD]
+
+.rgb_ycc_cnv:
+ ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+ ; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+ ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+ ; ymmB=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+ ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+ vmovdqu ymmC, ymmA
+ vinserti128 ymmA, ymmF, xmmA, 0 ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+ ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+ vinserti128 ymmC, ymmC, xmmB, 0 ; ymmC=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+ ; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ vinserti128 ymmB, ymmB, xmmF, 0 ; ymmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+ ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+ vperm2i128 ymmF, ymmC, ymmC, 1 ; ymmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
+ ; 1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
+
+ vmovdqa ymmG, ymmA
+ vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12
+ ; 22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I)
+ vpsrldq ymmG, ymmG, 8 ; ymmG=(22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I
+ ; 2I 0J 1J 2J 0K 1K 2K 0L -- -- -- -- -- -- -- --)
+
+ vpunpckhbw ymmA, ymmA, ymmF ; ymmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A
+ ; 0G 0O 1G 1O 2G 2O 0H 0P 1H 1P 2H 2P 0I 0Q 1I 1Q)
+ vpslldq ymmF, ymmF, 8 ; ymmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27
+ ; 08 18 28 09 19 29 0A 1A 1L 2L 0M 1M 2M 0N 1N 2N)
+
+ vpunpcklbw ymmG, ymmG, ymmB ; ymmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D
+ ; 2I 2Q 0J 0R 1J 1R 2J 2R 0K 0S 1K 1S 2K 2S 0L 0T)
+ vpunpckhbw ymmF, ymmF, ymmB ; ymmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F
+ ; 1L 1T 2L 2T 0M 0U 1M 1U 2M 2U 0N 0V 1N 1V 2N 2V)
+
+ vmovdqa ymmD, ymmA
+ vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09
+ ; 11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P)
+ vpsrldq ymmD, ymmD, 8 ; ymmD=(11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P
+ ; 1H 1P 2H 2P 0I 0Q 1I 1Q -- -- -- -- -- -- -- --)
+
+ vpunpckhbw ymmA, ymmA, ymmG ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D
+ ; 0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 0H 0L 0P 0T)
+ vpslldq ymmG, ymmG, 8 ; ymmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B
+ ; 04 0C 14 1C 24 2C 05 0D 2I 2Q 0J 0R 1J 1R 2J 2R)
+
+ vpunpcklbw ymmD, ymmD, ymmF ; ymmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E
+ ; 1H 1L 1P 1T 2H 2L 2P 2T 0I 0M 0Q 0U 1I 1M 1Q 1U)
+ vpunpckhbw ymmG, ymmG, ymmF ; ymmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F
+ ; 2I 2M 2Q 2U 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V)
+
+ vmovdqa ymmE, ymmA
+ vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C
+ ; 20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S)
+ vpsrldq ymmE, ymmE, 8 ; ymmE=(20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S
+ ; 2G 2K 2O 2S 0H 0L 0P 0T -- -- -- -- -- -- -- --)
+
+ vpunpckhbw ymmA, ymmA, ymmD ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
+ ; 0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
+ vpslldq ymmD, ymmD, 8 ; ymmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D
+ ; 02 06 0A 0E 12 16 1A 1E 1H 1L 1P 1T 2H 2L 2P 2T)
+
+ vpunpcklbw ymmE, ymmE, ymmG ; ymmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F
+ ; 2G 2I 2K 2M 2O 2Q 2S 2U 0H 0J 0L 0N 0P 0R 0T 0V)
+ vpunpckhbw ymmD, ymmD, ymmG ; ymmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F
+ ; 1H 1J 1L 1N 1P 1R 1T 1V 2H 2J 2L 2N 2P 2R 2T 2V)
+
+ vpxor ymmH, ymmH, ymmH
+
+ vmovdqa ymmC, ymmA
+ vpunpcklbw ymmA, ymmA, ymmH ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
+ vpunpckhbw ymmC, ymmC, ymmH ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
+
+ vmovdqa ymmB, ymmE
+ vpunpcklbw ymmE, ymmE, ymmH ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
+ vpunpckhbw ymmB, ymmB, ymmH ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
+
+ vmovdqa ymmF, ymmD
+ vpunpcklbw ymmD, ymmD, ymmH ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
+ vpunpckhbw ymmF, ymmF, ymmH ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+ test cl, SIZEOF_XMMWORD/16
+ jz short .column_ld2
+ sub ecx, byte SIZEOF_XMMWORD/16
+ vmovd xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld2:
+ test cl, SIZEOF_XMMWORD/8
+ jz short .column_ld4
+ sub ecx, byte SIZEOF_XMMWORD/8
+ vmovq xmmF, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
+ vpslldq xmmA, xmmA, SIZEOF_MMWORD
+ vpor xmmA, xmmA, xmmF
+.column_ld4:
+ test cl, SIZEOF_XMMWORD/4
+ jz short .column_ld8
+ sub ecx, byte SIZEOF_XMMWORD/4
+ vmovdqa xmmF, xmmA
+ vperm2i128 ymmF, ymmF, ymmF, 1
+ vmovdqu xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
+ vpor ymmA, ymmA, ymmF
+.column_ld8:
+ test cl, SIZEOF_XMMWORD/2
+ jz short .column_ld16
+ sub ecx, byte SIZEOF_XMMWORD/2
+ vmovdqa ymmF, ymmA
+ vmovdqu ymmA, YMMWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld16:
+ test cl, SIZEOF_XMMWORD
+ mov ecx, SIZEOF_YMMWORD
+ jz short .rgb_ycc_cnv
+ vmovdqa ymmE, ymmA
+ vmovdqa ymmH, ymmF
+ vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
+ vmovdqu ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
+ jmp short .rgb_ycc_cnv
+ alignx 16, 7
+
+.columnloop:
+ vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
+ vmovdqu ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
+ vmovdqu ymmE, YMMWORD [esi+2*SIZEOF_YMMWORD]
+ vmovdqu ymmH, YMMWORD [esi+3*SIZEOF_YMMWORD]
+
+.rgb_ycc_cnv:
+ ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ ; 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+ ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+ ; 0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+ ; ymmE=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
+ ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+ ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
+ ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+ vmovdqa ymmB, ymmA
+ vinserti128 ymmA, ymmA, xmmE, 1 ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ ; 0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
+ vperm2i128 ymmE, ymmB, ymmE, 0x31 ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+ ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+
+ vmovdqa ymmB, ymmF
+ vinserti128 ymmF, ymmF, xmmH, 1 ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+ ; 0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
+ vperm2i128 ymmH, ymmB, ymmH, 0x31 ; ymmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
+ ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+ vmovdqa ymmD, ymmA
+ vpunpcklbw ymmA, ymmA, ymmE ; ymmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35
+ ; 0G 0K 1G 1K 2G 2K 3G 3K 0H 0L 1H 1L 2H 2L 3H 3L)
+ vpunpckhbw ymmD, ymmD, ymmE ; ymmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37
+ ; 0I 0M 1I 1M 2I 2M 3I 3M 0J 0N 1J 1N 2J 2N 3J 3N)
+
+ vmovdqa ymmC, ymmF
+ vpunpcklbw ymmF, ymmF, ymmH ; ymmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D
+ ; 0O 0S 1O 1S 2O 2S 3O 3S 0P 0T 1P 1T 2P 2T 3P 3T)
+ vpunpckhbw ymmC, ymmC, ymmH ; ymmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F
+ ; 0Q 0U 1Q 1U 2Q 2U 3Q 3U 0R 0V 1R 1V 2R 2V 3R 3V)
+
+ vmovdqa ymmB, ymmA
+ vpunpcklwd ymmA, ymmA, ymmF ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C
+ ; 0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 3G 3K 3O 3S)
+ vpunpckhwd ymmB, ymmB, ymmF ; ymmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D
+ ; 0H 0L 0P 0T 1H 1L 1P 1T 2H 2L 2P 2T 3H 3L 3P 3T)
+
+ vmovdqa ymmG, ymmD
+ vpunpcklwd ymmD, ymmD, ymmC ; ymmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E
+ ; 0I 0M 0Q 0U 1I 1M 1Q 1U 2I 2M 2Q 2U 3I 3M 3Q 3U)
+ vpunpckhwd ymmG, ymmG, ymmC ; ymmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F
+ ; 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V 3J 3N 3R 3V)
+
+ vmovdqa ymmE, ymmA
+ vpunpcklbw ymmA, ymmA, ymmD ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
+ ; 0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
+ vpunpckhbw ymmE, ymmE, ymmD ; ymmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E
+ ; 2G 2I 2K 2M 2O 2Q 2S 2U 3G 3I 3K 3M 3O 3Q 3S 3U)
+
+ vmovdqa ymmH, ymmB
+ vpunpcklbw ymmB, ymmB, ymmG ; ymmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F
+ ; 0H 0J 0L 0N 0P 0R 0T 0V 1H 1J 1L 1N 1P 1R 1T 1V)
+ vpunpckhbw ymmH, ymmH, ymmG ; ymmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F
+ ; 2H 2J 2L 2N 2P 2R 2T 2V 3H 3J 3L 3N 3P 3R 3T 3V)
+
+ vpxor ymmF, ymmF, ymmF
+
+ vmovdqa ymmC, ymmA
+ vpunpcklbw ymmA, ymmA, ymmF ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
+ vpunpckhbw ymmC, ymmC, ymmF ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
+
+ vmovdqa ymmD, ymmB
+ vpunpcklbw ymmB, ymmB, ymmF ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
+ vpunpckhbw ymmD, ymmD, ymmF ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
+
+ vmovdqa ymmG, ymmE
+ vpunpcklbw ymmE, ymmE, ymmF ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
+ vpunpckhbw ymmG, ymmG, ymmF ; ymmG=(30 32 34 36 38 3A 3C 3E 3G 3I 3K 3M 3O 3Q 3S 3U)
+
+ vpunpcklbw ymmF, ymmF, ymmH
+ vpunpckhbw ymmH, ymmH, ymmH
+ vpsrlw ymmF, ymmF, BYTE_BIT ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
+ vpsrlw ymmH, ymmH, BYTE_BIT ; ymmH=(31 33 35 37 39 3B 3D 3F 3H 3J 3L 3N 3P 3R 3T 3V)
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+ ; ymm0=R(02468ACEGIKMOQSU)=RE, ymm2=G(02468ACEGIKMOQSU)=GE, ymm4=B(02468ACEGIKMOQSU)=BE
+ ; ymm1=R(13579BDFHJLNPRTV)=RO, ymm3=G(13579BDFHJLNPRTV)=GO, ymm5=B(13579BDFHJLNPRTV)=BO
+
+ ; (Original)
+ ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+ ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+ ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+ ;
+ ; (This implementation)
+ ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+ ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+ ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+
+ vmovdqa YMMWORD [wk(0)], ymm0 ; wk(0)=RE
+ vmovdqa YMMWORD [wk(1)], ymm1 ; wk(1)=RO
+ vmovdqa YMMWORD [wk(2)], ymm4 ; wk(2)=BE
+ vmovdqa YMMWORD [wk(3)], ymm5 ; wk(3)=BO
+
+ vmovdqa ymm6, ymm1
+ vpunpcklwd ymm1, ymm1, ymm3
+ vpunpckhwd ymm6, ymm6, ymm3
+ vmovdqa ymm7, ymm1
+ vmovdqa ymm4, ymm6
+ vpmaddwd ymm1, ymm1, [GOTOFF(eax,PW_F0299_F0337)] ; ymm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+ vpmaddwd ymm6, ymm6, [GOTOFF(eax,PW_F0299_F0337)] ; ymm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+ vpmaddwd ymm7, ymm7, [GOTOFF(eax,PW_MF016_MF033)] ; ymm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
+ vpmaddwd ymm4, ymm4, [GOTOFF(eax,PW_MF016_MF033)] ; ymm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
+
+ vmovdqa YMMWORD [wk(4)], ymm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
+ vmovdqa YMMWORD [wk(5)], ymm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+ vpxor ymm1, ymm1, ymm1
+ vpxor ymm6, ymm6, ymm6
+ vpunpcklwd ymm1, ymm1, ymm5 ; ymm1=BOL
+ vpunpckhwd ymm6, ymm6, ymm5 ; ymm6=BOH
+ vpsrld ymm1, ymm1, 1 ; ymm1=BOL*FIX(0.500)
+ vpsrld ymm6, ymm6, 1 ; ymm6=BOH*FIX(0.500)
+
+ vmovdqa ymm5, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; ymm5=[PD_ONEHALFM1_CJ]
+
+ vpaddd ymm7, ymm7, ymm1
+ vpaddd ymm4, ymm4, ymm6
+ vpaddd ymm7, ymm7, ymm5
+ vpaddd ymm4, ymm4, ymm5
+ vpsrld ymm7, ymm7, SCALEBITS ; ymm7=CbOL
+ vpsrld ymm4, ymm4, SCALEBITS ; ymm4=CbOH
+ vpackssdw ymm7, ymm7, ymm4 ; ymm7=CbO
+
+ vmovdqa ymm1, YMMWORD [wk(2)] ; ymm1=BE
+
+ vmovdqa ymm6, ymm0
+ vpunpcklwd ymm0, ymm0, ymm2
+ vpunpckhwd ymm6, ymm6, ymm2
+ vmovdqa ymm5, ymm0
+ vmovdqa ymm4, ymm6
+ vpmaddwd ymm0, ymm0, [GOTOFF(eax,PW_F0299_F0337)] ; ymm0=REL*FIX(0.299)+GEL*FIX(0.337)
+ vpmaddwd ymm6, ymm6, [GOTOFF(eax,PW_F0299_F0337)] ; ymm6=REH*FIX(0.299)+GEH*FIX(0.337)
+ vpmaddwd ymm5, ymm5, [GOTOFF(eax,PW_MF016_MF033)] ; ymm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
+ vpmaddwd ymm4, ymm4, [GOTOFF(eax,PW_MF016_MF033)] ; ymm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
+
+ vmovdqa YMMWORD [wk(6)], ymm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
+ vmovdqa YMMWORD [wk(7)], ymm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+ vpxor ymm0, ymm0, ymm0
+ vpxor ymm6, ymm6, ymm6
+ vpunpcklwd ymm0, ymm0, ymm1 ; ymm0=BEL
+ vpunpckhwd ymm6, ymm6, ymm1 ; ymm6=BEH
+ vpsrld ymm0, ymm0, 1 ; ymm0=BEL*FIX(0.500)
+ vpsrld ymm6, ymm6, 1 ; ymm6=BEH*FIX(0.500)
+
+ vmovdqa ymm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; ymm1=[PD_ONEHALFM1_CJ]
+
+ vpaddd ymm5, ymm5, ymm0
+ vpaddd ymm4, ymm4, ymm6
+ vpaddd ymm5, ymm5, ymm1
+ vpaddd ymm4, ymm4, ymm1
+ vpsrld ymm5, ymm5, SCALEBITS ; ymm5=CbEL
+ vpsrld ymm4, ymm4, SCALEBITS ; ymm4=CbEH
+ vpackssdw ymm5, ymm5, ymm4 ; ymm5=CbE
+
+ vpsllw ymm7, ymm7, BYTE_BIT
+ vpor ymm5, ymm5, ymm7 ; ymm5=Cb
+ vmovdqu YMMWORD [ebx], ymm5 ; Save Cb
+
+ vmovdqa ymm0, YMMWORD [wk(3)] ; ymm0=BO
+ vmovdqa ymm6, YMMWORD [wk(2)] ; ymm6=BE
+ vmovdqa ymm1, YMMWORD [wk(1)] ; ymm1=RO
+
+ vmovdqa ymm4, ymm0
+ vpunpcklwd ymm0, ymm0, ymm3
+ vpunpckhwd ymm4, ymm4, ymm3
+ vmovdqa ymm7, ymm0
+ vmovdqa ymm5, ymm4
+ vpmaddwd ymm0, ymm0, [GOTOFF(eax,PW_F0114_F0250)] ; ymm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+ vpmaddwd ymm4, ymm4, [GOTOFF(eax,PW_F0114_F0250)] ; ymm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+ vpmaddwd ymm7, ymm7, [GOTOFF(eax,PW_MF008_MF041)] ; ymm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
+ vpmaddwd ymm5, ymm5, [GOTOFF(eax,PW_MF008_MF041)] ; ymm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
+
+ vmovdqa ymm3, [GOTOFF(eax,PD_ONEHALF)] ; ymm3=[PD_ONEHALF]
+
+ vpaddd ymm0, ymm0, YMMWORD [wk(4)]
+ vpaddd ymm4, ymm4, YMMWORD [wk(5)]
+ vpaddd ymm0, ymm0, ymm3
+ vpaddd ymm4, ymm4, ymm3
+ vpsrld ymm0, ymm0, SCALEBITS ; ymm0=YOL
+ vpsrld ymm4, ymm4, SCALEBITS ; ymm4=YOH
+ vpackssdw ymm0, ymm0, ymm4 ; ymm0=YO
+
+ vpxor ymm3, ymm3, ymm3
+ vpxor ymm4, ymm4, ymm4
+ vpunpcklwd ymm3, ymm3, ymm1 ; ymm3=ROL
+ vpunpckhwd ymm4, ymm4, ymm1 ; ymm4=ROH
+ vpsrld ymm3, ymm3, 1 ; ymm3=ROL*FIX(0.500)
+ vpsrld ymm4, ymm4, 1 ; ymm4=ROH*FIX(0.500)
+
+ vmovdqa ymm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; ymm1=[PD_ONEHALFM1_CJ]
+
+ vpaddd ymm7, ymm7, ymm3
+ vpaddd ymm5, ymm5, ymm4
+ vpaddd ymm7, ymm7, ymm1
+ vpaddd ymm5, ymm5, ymm1
+ vpsrld ymm7, ymm7, SCALEBITS ; ymm7=CrOL
+ vpsrld ymm5, ymm5, SCALEBITS ; ymm5=CrOH
+ vpackssdw ymm7, ymm7, ymm5 ; ymm7=CrO
+
+ vmovdqa ymm3, YMMWORD [wk(0)] ; ymm3=RE
+
+ vmovdqa ymm4, ymm6
+ vpunpcklwd ymm6, ymm6, ymm2
+ vpunpckhwd ymm4, ymm4, ymm2
+ vmovdqa ymm1, ymm6
+ vmovdqa ymm5, ymm4
+ vpmaddwd ymm6, ymm6, [GOTOFF(eax,PW_F0114_F0250)] ; ymm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+ vpmaddwd ymm4, ymm4, [GOTOFF(eax,PW_F0114_F0250)] ; ymm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+ vpmaddwd ymm1, ymm1, [GOTOFF(eax,PW_MF008_MF041)] ; ymm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
+ vpmaddwd ymm5, ymm5, [GOTOFF(eax,PW_MF008_MF041)] ; ymm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
+
+ vmovdqa ymm2, [GOTOFF(eax,PD_ONEHALF)] ; ymm2=[PD_ONEHALF]
+
+ vpaddd ymm6, ymm6, YMMWORD [wk(6)]
+ vpaddd ymm4, ymm4, YMMWORD [wk(7)]
+ vpaddd ymm6, ymm6, ymm2
+ vpaddd ymm4, ymm4, ymm2
+ vpsrld ymm6, ymm6, SCALEBITS ; ymm6=YEL
+ vpsrld ymm4, ymm4, SCALEBITS ; ymm4=YEH
+ vpackssdw ymm6, ymm6, ymm4 ; ymm6=YE
+
+ vpsllw ymm0, ymm0, BYTE_BIT
+ vpor ymm6, ymm6, ymm0 ; ymm6=Y
+ vmovdqu YMMWORD [edi], ymm6 ; Save Y
+
+ vpxor ymm2, ymm2, ymm2
+ vpxor ymm4, ymm4, ymm4
+ vpunpcklwd ymm2, ymm2, ymm3 ; ymm2=REL
+ vpunpckhwd ymm4, ymm4, ymm3 ; ymm4=REH
+ vpsrld ymm2, ymm2, 1 ; ymm2=REL*FIX(0.500)
+ vpsrld ymm4, ymm4, 1 ; ymm4=REH*FIX(0.500)
+
+ vmovdqa ymm0, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; ymm0=[PD_ONEHALFM1_CJ]
+
+ vpaddd ymm1, ymm1, ymm2
+ vpaddd ymm5, ymm5, ymm4
+ vpaddd ymm1, ymm1, ymm0
+ vpaddd ymm5, ymm5, ymm0
+ vpsrld ymm1, ymm1, SCALEBITS ; ymm1=CrEL
+ vpsrld ymm5, ymm5, SCALEBITS ; ymm5=CrEH
+ vpackssdw ymm1, ymm1, ymm5 ; ymm1=CrE
+
+ vpsllw ymm7, ymm7, BYTE_BIT
+ vpor ymm1, ymm1, ymm7 ; ymm1=Cr
+ vmovdqu YMMWORD [edx], ymm1 ; Save Cr
+
+ sub ecx, byte SIZEOF_YMMWORD
+ add esi, RGB_PIXELSIZE*SIZEOF_YMMWORD ; inptr
+ add edi, byte SIZEOF_YMMWORD ; outptr0
+ add ebx, byte SIZEOF_YMMWORD ; outptr1
+ add edx, byte SIZEOF_YMMWORD ; outptr2
+ cmp ecx, byte SIZEOF_YMMWORD
+ jae near .columnloop
+ test ecx, ecx
+ jnz near .column_ld1
+
+ pop ecx ; col
+ pop esi
+ pop edi
+ pop ebx
+ pop edx
+ poppic eax
+
+ add esi, byte SIZEOF_JSAMPROW ; input_buf
+ add edi, byte SIZEOF_JSAMPROW
+ add ebx, byte SIZEOF_JSAMPROW
+ add edx, byte SIZEOF_JSAMPROW
+ dec eax ; num_rows
+ jg near .rowloop
+
+.return:
+ vzeroupper
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
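
The fixed-point arithmetic in the conversion above (and in the MMX and SSE2 variants that follow) can be modeled in a few lines of C. This is an illustration of the equations in the header comment, not the SIMD code itself; it assumes SCALEBITS = 16 and FIX(x) = round(x * 2^16), matching the PW_*/PD_* tables, and splits 0.587 * G into 0.337 * G + 0.250 * G so each coefficient pair fits a signed 16-bit vpmaddwd operand:

#include <stdint.h>

#define SCALEBITS      16
#define ONE_HALF       (1 << (SCALEBITS - 1))
#define CENTERJSAMPLE  128
#define FIX(x)         ((int32_t)((x) * (1 << SCALEBITS) + 0.5))

static void rgb_ycc_pixel(int r, int g, int b,
                          uint8_t *y, uint8_t *cb, uint8_t *cr)
{
  /* Y: rounding constant is PD_ONEHALF; FIX(0.337) == F_0_587 - F_0_250. */
  int32_t yv = FIX(0.29900) * r + FIX(0.33700) * g + FIX(0.11400) * b +
               FIX(0.25000) * g + ONE_HALF;
  /* Cb/Cr: the 0.5 coefficient becomes a shift (the asm unpacks into the
   * high word, then psrld by 1); PD_ONEHALFM1_CJ folds the rounding
   * constant, a -1, and the CENTERJSAMPLE bias into a single addend. */
  int32_t cbv = -FIX(0.16874) * r - FIX(0.33126) * g +
                ((int32_t)b << (SCALEBITS - 1)) +
                ONE_HALF - 1 + (CENTERJSAMPLE << SCALEBITS);
  int32_t crv = ((int32_t)r << (SCALEBITS - 1)) -
                FIX(0.41869) * g - FIX(0.08131) * b +
                ONE_HALF - 1 + (CENTERJSAMPLE << SCALEBITS);

  *y  = (uint8_t)(yv >> SCALEBITS);
  *cb = (uint8_t)(cbv >> SCALEBITS);
  *cr = (uint8_t)(crv >> SCALEBITS);
}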
diff --git a/media/libjpeg/simd/i386/jccolext-mmx.asm b/media/libjpeg/simd/i386/jccolext-mmx.asm
new file mode 100644
index 0000000000..6357a42b2c
--- /dev/null
+++ b/media/libjpeg/simd/i386/jccolext-mmx.asm
@@ -0,0 +1,476 @@
+;
+; jccolext.asm - colorspace conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_ycc_convert_mmx(JDIMENSION img_width, JSAMPARRAY input_buf,
+; JSAMPIMAGE output_buf, JDIMENSION output_row,
+; int num_rows);
+;
+
+%define img_width(b) (b) + 8 ; JDIMENSION img_width
+%define input_buf(b) (b) + 12 ; JSAMPARRAY input_buf
+%define output_buf(b) (b) + 16 ; JSAMPIMAGE output_buf
+%define output_row(b) (b) + 20 ; JDIMENSION output_row
+%define num_rows(b) (b) + 24 ; int num_rows
+
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
+ ; mmword wk[WK_NUM]
+%define WK_NUM 8
+%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_mmx)
+
+EXTN(jsimd_rgb_ycc_convert_mmx):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)]
+ pushpic eax ; make room for the GOT address
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+ movpic POINTER [gotptr], ebx ; save GOT address
+
+ mov ecx, JDIMENSION [img_width(eax)] ; num_cols
+ test ecx, ecx
+ jz near .return
+
+ push ecx
+
+ mov esi, JSAMPIMAGE [output_buf(eax)]
+ mov ecx, JDIMENSION [output_row(eax)]
+ mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
+ mov ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY]
+ mov edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY]
+ lea edi, [edi+ecx*SIZEOF_JSAMPROW]
+ lea ebx, [ebx+ecx*SIZEOF_JSAMPROW]
+ lea edx, [edx+ecx*SIZEOF_JSAMPROW]
+
+ pop ecx
+
+ mov esi, JSAMPARRAY [input_buf(eax)]
+ mov eax, INT [num_rows(eax)]
+ test eax, eax
+ jle near .return
+ alignx 16, 7
+.rowloop:
+ pushpic eax
+ push edx
+ push ebx
+ push edi
+ push esi
+ push ecx ; col
+
+ mov esi, JSAMPROW [esi] ; inptr
+ mov edi, JSAMPROW [edi] ; outptr0
+ mov ebx, JSAMPROW [ebx] ; outptr1
+ mov edx, JSAMPROW [edx] ; outptr2
+ movpic eax, POINTER [gotptr] ; load GOT address (eax)
+
+ cmp ecx, byte SIZEOF_MMWORD
+ jae short .columnloop
+ alignx 16, 7
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+.column_ld1:
+ push eax
+ push edx
+ lea ecx, [ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE
+ test cl, SIZEOF_BYTE
+ jz short .column_ld2
+ sub ecx, byte SIZEOF_BYTE
+ xor eax, eax
+ mov al, byte [esi+ecx]
+.column_ld2:
+ test cl, SIZEOF_WORD
+ jz short .column_ld4
+ sub ecx, byte SIZEOF_WORD
+ xor edx, edx
+ mov dx, word [esi+ecx]
+ shl eax, WORD_BIT
+ or eax, edx
+.column_ld4:
+ movd mmA, eax
+ pop edx
+ pop eax
+ test cl, SIZEOF_DWORD
+ jz short .column_ld8
+ sub ecx, byte SIZEOF_DWORD
+ movd mmG, dword [esi+ecx]
+ psllq mmA, DWORD_BIT
+ por mmA, mmG
+.column_ld8:
+ test cl, SIZEOF_MMWORD
+ jz short .column_ld16
+ movq mmG, mmA
+ movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+ mov ecx, SIZEOF_MMWORD
+ jmp short .rgb_ycc_cnv
+.column_ld16:
+ test cl, 2*SIZEOF_MMWORD
+ mov ecx, SIZEOF_MMWORD
+ jz short .rgb_ycc_cnv
+ movq mmF, mmA
+ movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+ movq mmG, MMWORD [esi+1*SIZEOF_MMWORD]
+ jmp short .rgb_ycc_cnv
+ alignx 16, 7
+
+.columnloop:
+ movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+ movq mmG, MMWORD [esi+1*SIZEOF_MMWORD]
+ movq mmF, MMWORD [esi+2*SIZEOF_MMWORD]
+
+.rgb_ycc_cnv:
+ ; mmA=(00 10 20 01 11 21 02 12)
+ ; mmG=(22 03 13 23 04 14 24 05)
+ ; mmF=(15 25 06 16 26 07 17 27)
+
+ movq mmD, mmA
+ psllq mmA, 4*BYTE_BIT ; mmA=(-- -- -- -- 00 10 20 01)
+ psrlq mmD, 4*BYTE_BIT ; mmD=(11 21 02 12 -- -- -- --)
+
+ punpckhbw mmA, mmG ; mmA=(00 04 10 14 20 24 01 05)
+ psllq mmG, 4*BYTE_BIT ; mmG=(-- -- -- -- 22 03 13 23)
+
+ punpcklbw mmD, mmF ; mmD=(11 15 21 25 02 06 12 16)
+ punpckhbw mmG, mmF ; mmG=(22 26 03 07 13 17 23 27)
+
+ movq mmE, mmA
+ psllq mmA, 4*BYTE_BIT ; mmA=(-- -- -- -- 00 04 10 14)
+ psrlq mmE, 4*BYTE_BIT ; mmE=(20 24 01 05 -- -- -- --)
+
+ punpckhbw mmA, mmD ; mmA=(00 02 04 06 10 12 14 16)
+ psllq mmD, 4*BYTE_BIT ; mmD=(-- -- -- -- 11 15 21 25)
+
+ punpcklbw mmE, mmG ; mmE=(20 22 24 26 01 03 05 07)
+ punpckhbw mmD, mmG ; mmD=(11 13 15 17 21 23 25 27)
+
+ pxor mmH, mmH
+
+ movq mmC, mmA
+ punpcklbw mmA, mmH ; mmA=(00 02 04 06)
+ punpckhbw mmC, mmH ; mmC=(10 12 14 16)
+
+ movq mmB, mmE
+ punpcklbw mmE, mmH ; mmE=(20 22 24 26)
+ punpckhbw mmB, mmH ; mmB=(01 03 05 07)
+
+ movq mmF, mmD
+ punpcklbw mmD, mmH ; mmD=(11 13 15 17)
+ punpckhbw mmF, mmH ; mmF=(21 23 25 27)
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+ test cl, SIZEOF_MMWORD/8
+ jz short .column_ld2
+ sub ecx, byte SIZEOF_MMWORD/8
+ movd mmA, dword [esi+ecx*RGB_PIXELSIZE]
+.column_ld2:
+ test cl, SIZEOF_MMWORD/4
+ jz short .column_ld4
+ sub ecx, byte SIZEOF_MMWORD/4
+ movq mmF, mmA
+ movq mmA, MMWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld4:
+ test cl, SIZEOF_MMWORD/2
+ mov ecx, SIZEOF_MMWORD
+ jz short .rgb_ycc_cnv
+ movq mmD, mmA
+ movq mmC, mmF
+ movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+ movq mmF, MMWORD [esi+1*SIZEOF_MMWORD]
+ jmp short .rgb_ycc_cnv
+ alignx 16, 7
+
+.columnloop:
+ movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+ movq mmF, MMWORD [esi+1*SIZEOF_MMWORD]
+ movq mmD, MMWORD [esi+2*SIZEOF_MMWORD]
+ movq mmC, MMWORD [esi+3*SIZEOF_MMWORD]
+
+.rgb_ycc_cnv:
+ ; mmA=(00 10 20 30 01 11 21 31)
+ ; mmF=(02 12 22 32 03 13 23 33)
+ ; mmD=(04 14 24 34 05 15 25 35)
+ ; mmC=(06 16 26 36 07 17 27 37)
+
+ movq mmB, mmA
+ punpcklbw mmA, mmF ; mmA=(00 02 10 12 20 22 30 32)
+ punpckhbw mmB, mmF ; mmB=(01 03 11 13 21 23 31 33)
+
+ movq mmG, mmD
+ punpcklbw mmD, mmC ; mmD=(04 06 14 16 24 26 34 36)
+ punpckhbw mmG, mmC ; mmG=(05 07 15 17 25 27 35 37)
+
+ movq mmE, mmA
+ punpcklwd mmA, mmD ; mmA=(00 02 04 06 10 12 14 16)
+ punpckhwd mmE, mmD ; mmE=(20 22 24 26 30 32 34 36)
+
+ movq mmH, mmB
+ punpcklwd mmB, mmG ; mmB=(01 03 05 07 11 13 15 17)
+ punpckhwd mmH, mmG ; mmH=(21 23 25 27 31 33 35 37)
+
+ pxor mmF, mmF
+
+ movq mmC, mmA
+ punpcklbw mmA, mmF ; mmA=(00 02 04 06)
+ punpckhbw mmC, mmF ; mmC=(10 12 14 16)
+
+ movq mmD, mmB
+ punpcklbw mmB, mmF ; mmB=(01 03 05 07)
+ punpckhbw mmD, mmF ; mmD=(11 13 15 17)
+
+ movq mmG, mmE
+ punpcklbw mmE, mmF ; mmE=(20 22 24 26)
+ punpckhbw mmG, mmF ; mmG=(30 32 34 36)
+
+ punpcklbw mmF, mmH
+ punpckhbw mmH, mmH
+ psrlw mmF, BYTE_BIT ; mmF=(21 23 25 27)
+ psrlw mmH, BYTE_BIT ; mmH=(31 33 35 37)
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+ ; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE
+ ; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO
+
+ ; (Original)
+ ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+ ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+ ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+ ;
+ ; (This implementation)
+ ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+ ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+ ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+
+ movq MMWORD [wk(0)], mm0 ; wk(0)=RE
+ movq MMWORD [wk(1)], mm1 ; wk(1)=RO
+ movq MMWORD [wk(2)], mm4 ; wk(2)=BE
+ movq MMWORD [wk(3)], mm5 ; wk(3)=BO
+
+ movq mm6, mm1
+ punpcklwd mm1, mm3
+ punpckhwd mm6, mm3
+ movq mm7, mm1
+ movq mm4, mm6
+ pmaddwd mm1, [GOTOFF(eax,PW_F0299_F0337)] ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+ pmaddwd mm6, [GOTOFF(eax,PW_F0299_F0337)] ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+ pmaddwd mm7, [GOTOFF(eax,PW_MF016_MF033)] ; mm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
+ pmaddwd mm4, [GOTOFF(eax,PW_MF016_MF033)] ; mm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
+
+ movq MMWORD [wk(4)], mm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
+ movq MMWORD [wk(5)], mm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+ pxor mm1, mm1
+ pxor mm6, mm6
+ punpcklwd mm1, mm5 ; mm1=BOL
+ punpckhwd mm6, mm5 ; mm6=BOH
+ psrld mm1, 1 ; mm1=BOL*FIX(0.500)
+ psrld mm6, 1 ; mm6=BOH*FIX(0.500)
+
+ movq mm5, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm5=[PD_ONEHALFM1_CJ]
+
+ paddd mm7, mm1
+ paddd mm4, mm6
+ paddd mm7, mm5
+ paddd mm4, mm5
+ psrld mm7, SCALEBITS ; mm7=CbOL
+ psrld mm4, SCALEBITS ; mm4=CbOH
+ packssdw mm7, mm4 ; mm7=CbO
+
+ movq mm1, MMWORD [wk(2)] ; mm1=BE
+
+ movq mm6, mm0
+ punpcklwd mm0, mm2
+ punpckhwd mm6, mm2
+ movq mm5, mm0
+ movq mm4, mm6
+ pmaddwd mm0, [GOTOFF(eax,PW_F0299_F0337)] ; mm0=REL*FIX(0.299)+GEL*FIX(0.337)
+ pmaddwd mm6, [GOTOFF(eax,PW_F0299_F0337)] ; mm6=REH*FIX(0.299)+GEH*FIX(0.337)
+ pmaddwd mm5, [GOTOFF(eax,PW_MF016_MF033)] ; mm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
+ pmaddwd mm4, [GOTOFF(eax,PW_MF016_MF033)] ; mm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
+
+ movq MMWORD [wk(6)], mm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
+ movq MMWORD [wk(7)], mm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+ pxor mm0, mm0
+ pxor mm6, mm6
+ punpcklwd mm0, mm1 ; mm0=BEL
+ punpckhwd mm6, mm1 ; mm6=BEH
+ psrld mm0, 1 ; mm0=BEL*FIX(0.500)
+ psrld mm6, 1 ; mm6=BEH*FIX(0.500)
+
+ movq mm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm1=[PD_ONEHALFM1_CJ]
+
+ paddd mm5, mm0
+ paddd mm4, mm6
+ paddd mm5, mm1
+ paddd mm4, mm1
+ psrld mm5, SCALEBITS ; mm5=CbEL
+ psrld mm4, SCALEBITS ; mm4=CbEH
+ packssdw mm5, mm4 ; mm5=CbE
+
+ psllw mm7, BYTE_BIT
+ por mm5, mm7 ; mm5=Cb
+ movq MMWORD [ebx], mm5 ; Save Cb
+
+ movq mm0, MMWORD [wk(3)] ; mm0=BO
+ movq mm6, MMWORD [wk(2)] ; mm6=BE
+ movq mm1, MMWORD [wk(1)] ; mm1=RO
+
+ movq mm4, mm0
+ punpcklwd mm0, mm3
+ punpckhwd mm4, mm3
+ movq mm7, mm0
+ movq mm5, mm4
+ pmaddwd mm0, [GOTOFF(eax,PW_F0114_F0250)] ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+ pmaddwd mm4, [GOTOFF(eax,PW_F0114_F0250)] ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+ pmaddwd mm7, [GOTOFF(eax,PW_MF008_MF041)] ; mm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
+ pmaddwd mm5, [GOTOFF(eax,PW_MF008_MF041)] ; mm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
+
+ movq mm3, [GOTOFF(eax,PD_ONEHALF)] ; mm3=[PD_ONEHALF]
+
+ paddd mm0, MMWORD [wk(4)]
+ paddd mm4, MMWORD [wk(5)]
+ paddd mm0, mm3
+ paddd mm4, mm3
+ psrld mm0, SCALEBITS ; mm0=YOL
+ psrld mm4, SCALEBITS ; mm4=YOH
+ packssdw mm0, mm4 ; mm0=YO
+
+ pxor mm3, mm3
+ pxor mm4, mm4
+ punpcklwd mm3, mm1 ; mm3=ROL
+ punpckhwd mm4, mm1 ; mm4=ROH
+ psrld mm3, 1 ; mm3=ROL*FIX(0.500)
+ psrld mm4, 1 ; mm4=ROH*FIX(0.500)
+
+ movq mm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm1=[PD_ONEHALFM1_CJ]
+
+ paddd mm7, mm3
+ paddd mm5, mm4
+ paddd mm7, mm1
+ paddd mm5, mm1
+ psrld mm7, SCALEBITS ; mm7=CrOL
+ psrld mm5, SCALEBITS ; mm5=CrOH
+ packssdw mm7, mm5 ; mm7=CrO
+
+ movq mm3, MMWORD [wk(0)] ; mm3=RE
+
+ movq mm4, mm6
+ punpcklwd mm6, mm2
+ punpckhwd mm4, mm2
+ movq mm1, mm6
+ movq mm5, mm4
+ pmaddwd mm6, [GOTOFF(eax,PW_F0114_F0250)] ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+ pmaddwd mm4, [GOTOFF(eax,PW_F0114_F0250)] ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+ pmaddwd mm1, [GOTOFF(eax,PW_MF008_MF041)] ; mm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
+ pmaddwd mm5, [GOTOFF(eax,PW_MF008_MF041)] ; mm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
+
+ movq mm2, [GOTOFF(eax,PD_ONEHALF)] ; mm2=[PD_ONEHALF]
+
+ paddd mm6, MMWORD [wk(6)]
+ paddd mm4, MMWORD [wk(7)]
+ paddd mm6, mm2
+ paddd mm4, mm2
+ psrld mm6, SCALEBITS ; mm6=YEL
+ psrld mm4, SCALEBITS ; mm4=YEH
+ packssdw mm6, mm4 ; mm6=YE
+
+ psllw mm0, BYTE_BIT
+ por mm6, mm0 ; mm6=Y
+ movq MMWORD [edi], mm6 ; Save Y
+
+ pxor mm2, mm2
+ pxor mm4, mm4
+ punpcklwd mm2, mm3 ; mm2=REL
+ punpckhwd mm4, mm3 ; mm4=REH
+ psrld mm2, 1 ; mm2=REL*FIX(0.500)
+ psrld mm4, 1 ; mm4=REH*FIX(0.500)
+
+ movq mm0, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm0=[PD_ONEHALFM1_CJ]
+
+ paddd mm1, mm2
+ paddd mm5, mm4
+ paddd mm1, mm0
+ paddd mm5, mm0
+ psrld mm1, SCALEBITS ; mm1=CrEL
+ psrld mm5, SCALEBITS ; mm5=CrEH
+ packssdw mm1, mm5 ; mm1=CrE
+
+ psllw mm7, BYTE_BIT
+ por mm1, mm7 ; mm1=Cr
+ movq MMWORD [edx], mm1 ; Save Cr
+
+ sub ecx, byte SIZEOF_MMWORD
+ add esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; inptr
+ add edi, byte SIZEOF_MMWORD ; outptr0
+ add ebx, byte SIZEOF_MMWORD ; outptr1
+ add edx, byte SIZEOF_MMWORD ; outptr2
+ cmp ecx, byte SIZEOF_MMWORD
+ jae near .columnloop
+ test ecx, ecx
+ jnz near .column_ld1
+
+ pop ecx ; col
+ pop esi
+ pop edi
+ pop ebx
+ pop edx
+ poppic eax
+
+ add esi, byte SIZEOF_JSAMPROW ; input_buf
+ add edi, byte SIZEOF_JSAMPROW
+ add ebx, byte SIZEOF_JSAMPROW
+ add edx, byte SIZEOF_JSAMPROW
+ dec eax ; num_rows
+ jg near .rowloop
+
+ emms ; empty MMX state
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
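
Note that emms executes only on the normal exit path: the early jumps to .return fire before any MMX instruction has run, so there is no MMX state to clear in those cases. (emms is required after MMX code because the MMX registers alias the x87 floating-point register stack.)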
diff --git a/media/libjpeg/simd/i386/jccolext-sse2.asm b/media/libjpeg/simd/i386/jccolext-sse2.asm
new file mode 100644
index 0000000000..c6c80852ac
--- /dev/null
+++ b/media/libjpeg/simd/i386/jccolext-sse2.asm
@@ -0,0 +1,503 @@
+;
+; jccolext.asm - colorspace conversion (SSE2)
+;
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_ycc_convert_sse2(JDIMENSION img_width, JSAMPARRAY input_buf,
+; JSAMPIMAGE output_buf, JDIMENSION output_row,
+; int num_rows);
+;
+
+%define img_width(b) (b) + 8 ; JDIMENSION img_width
+%define input_buf(b) (b) + 12 ; JSAMPARRAY input_buf
+%define output_buf(b) (b) + 16 ; JSAMPIMAGE output_buf
+%define output_row(b) (b) + 20 ; JDIMENSION output_row
+%define num_rows(b) (b) + 24 ; int num_rows
+
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+ ; xmmword wk[WK_NUM]
+%define WK_NUM 8
+%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_sse2)
+
+EXTN(jsimd_rgb_ycc_convert_sse2):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)]
+ pushpic eax ; make room for the GOT address
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+ movpic POINTER [gotptr], ebx ; save GOT address
+
+ mov ecx, JDIMENSION [img_width(eax)]
+ test ecx, ecx
+ jz near .return
+
+ push ecx
+
+ mov esi, JSAMPIMAGE [output_buf(eax)]
+ mov ecx, JDIMENSION [output_row(eax)]
+ mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
+ mov ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY]
+ mov edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY]
+ lea edi, [edi+ecx*SIZEOF_JSAMPROW]
+ lea ebx, [ebx+ecx*SIZEOF_JSAMPROW]
+ lea edx, [edx+ecx*SIZEOF_JSAMPROW]
+
+ pop ecx
+
+ mov esi, JSAMPARRAY [input_buf(eax)]
+ mov eax, INT [num_rows(eax)]
+ test eax, eax
+ jle near .return
+ alignx 16, 7
+.rowloop:
+ pushpic eax
+ push edx
+ push ebx
+ push edi
+ push esi
+ push ecx ; col
+
+ mov esi, JSAMPROW [esi] ; inptr
+ mov edi, JSAMPROW [edi] ; outptr0
+ mov ebx, JSAMPROW [ebx] ; outptr1
+ mov edx, JSAMPROW [edx] ; outptr2
+ movpic eax, POINTER [gotptr] ; load GOT address (eax)
+
+ cmp ecx, byte SIZEOF_XMMWORD
+ jae near .columnloop
+ alignx 16, 7
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+.column_ld1:
+ push eax
+ push edx
+ lea ecx, [ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE
+ test cl, SIZEOF_BYTE
+ jz short .column_ld2
+ sub ecx, byte SIZEOF_BYTE
+ movzx eax, byte [esi+ecx]
+.column_ld2:
+ test cl, SIZEOF_WORD
+ jz short .column_ld4
+ sub ecx, byte SIZEOF_WORD
+ movzx edx, word [esi+ecx]
+ shl eax, WORD_BIT
+ or eax, edx
+.column_ld4:
+ movd xmmA, eax
+ pop edx
+ pop eax
+ test cl, SIZEOF_DWORD
+ jz short .column_ld8
+ sub ecx, byte SIZEOF_DWORD
+ movd xmmF, XMM_DWORD [esi+ecx]
+ pslldq xmmA, SIZEOF_DWORD
+ por xmmA, xmmF
+.column_ld8:
+ test cl, SIZEOF_MMWORD
+ jz short .column_ld16
+ sub ecx, byte SIZEOF_MMWORD
+ movq xmmB, XMM_MMWORD [esi+ecx]
+ pslldq xmmA, SIZEOF_MMWORD
+ por xmmA, xmmB
+.column_ld16:
+ test cl, SIZEOF_XMMWORD
+ jz short .column_ld32
+ movdqa xmmF, xmmA
+ movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ mov ecx, SIZEOF_XMMWORD
+ jmp short .rgb_ycc_cnv
+.column_ld32:
+ test cl, 2*SIZEOF_XMMWORD
+ mov ecx, SIZEOF_XMMWORD
+ jz short .rgb_ycc_cnv
+ movdqa xmmB, xmmA
+ movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
+ jmp short .rgb_ycc_cnv
+ alignx 16, 7
+
+.columnloop:
+ movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
+ movdqu xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD]
+
+.rgb_ycc_cnv:
+ ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+ ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+ movdqa xmmG, xmmA
+ pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
+ psrldq xmmG, 8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
+
+ punpckhbw xmmA, xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
+ pslldq xmmF, 8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
+
+ punpcklbw xmmG, xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
+ punpckhbw xmmF, xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
+
+ movdqa xmmD, xmmA
+ pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
+ psrldq xmmD, 8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
+
+ punpckhbw xmmA, xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
+ pslldq xmmG, 8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
+
+ punpcklbw xmmD, xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
+ punpckhbw xmmG, xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
+
+ movdqa xmmE, xmmA
+ pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
+ psrldq xmmE, 8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
+
+ punpckhbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+ pslldq xmmD, 8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
+
+ punpcklbw xmmE, xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
+ punpckhbw xmmD, xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
+
+ pxor xmmH, xmmH
+
+ movdqa xmmC, xmmA
+ punpcklbw xmmA, xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E)
+ punpckhbw xmmC, xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+ movdqa xmmB, xmmE
+ punpcklbw xmmE, xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E)
+ punpckhbw xmmB, xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F)
+
+ movdqa xmmF, xmmD
+ punpcklbw xmmD, xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F)
+ punpckhbw xmmF, xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F)
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+ test cl, SIZEOF_XMMWORD/16
+ jz short .column_ld2
+ sub ecx, byte SIZEOF_XMMWORD/16
+ movd xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld2:
+ test cl, SIZEOF_XMMWORD/8
+ jz short .column_ld4
+ sub ecx, byte SIZEOF_XMMWORD/8
+ movq xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
+ pslldq xmmA, SIZEOF_MMWORD
+ por xmmA, xmmE
+.column_ld4:
+ test cl, SIZEOF_XMMWORD/4
+ jz short .column_ld8
+ sub ecx, byte SIZEOF_XMMWORD/4
+ movdqa xmmE, xmmA
+ movdqu xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld8:
+ test cl, SIZEOF_XMMWORD/2
+ mov ecx, SIZEOF_XMMWORD
+ jz short .rgb_ycc_cnv
+ movdqa xmmF, xmmA
+ movdqa xmmH, xmmE
+ movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
+ jmp short .rgb_ycc_cnv
+ alignx 16, 7
+
+.columnloop:
+ movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
+ movdqu xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD]
+ movdqu xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD]
+
+.rgb_ycc_cnv:
+ ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+ ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+ ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+ ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+ movdqa xmmD, xmmA
+ punpcklbw xmmA, xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
+ punpckhbw xmmD, xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
+
+ movdqa xmmC, xmmF
+ punpcklbw xmmF, xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
+ punpckhbw xmmC, xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
+
+ movdqa xmmB, xmmA
+ punpcklwd xmmA, xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
+ punpckhwd xmmB, xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
+
+ movdqa xmmG, xmmD
+ punpcklwd xmmD, xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
+ punpckhwd xmmG, xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
+
+ movdqa xmmE, xmmA
+ punpcklbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+ punpckhbw xmmE, xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
+
+ movdqa xmmH, xmmB
+ punpcklbw xmmB, xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
+ punpckhbw xmmH, xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
+
+ pxor xmmF, xmmF
+
+ movdqa xmmC, xmmA
+ punpcklbw xmmA, xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E)
+ punpckhbw xmmC, xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+ movdqa xmmD, xmmB
+ punpcklbw xmmB, xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F)
+ punpckhbw xmmD, xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F)
+
+ movdqa xmmG, xmmE
+ punpcklbw xmmE, xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E)
+ punpckhbw xmmG, xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E)
+
+ punpcklbw xmmF, xmmH
+ punpckhbw xmmH, xmmH
+ psrlw xmmF, BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
+ psrlw xmmH, BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+ ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
+ ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
+
+ ; (Original)
+ ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+ ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+ ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+ ;
+ ; (This implementation)
+ ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+ ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+ ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+
+ movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=RE
+ movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=RO
+ movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=BE
+ movdqa XMMWORD [wk(3)], xmm5 ; wk(3)=BO
+
+ movdqa xmm6, xmm1
+ punpcklwd xmm1, xmm3
+ punpckhwd xmm6, xmm3
+ movdqa xmm7, xmm1
+ movdqa xmm4, xmm6
+ pmaddwd xmm1, [GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+ pmaddwd xmm6, [GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+ pmaddwd xmm7, [GOTOFF(eax,PW_MF016_MF033)] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
+ pmaddwd xmm4, [GOTOFF(eax,PW_MF016_MF033)] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
+
+ movdqa XMMWORD [wk(4)], xmm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
+ movdqa XMMWORD [wk(5)], xmm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+ pxor xmm1, xmm1
+ pxor xmm6, xmm6
+ punpcklwd xmm1, xmm5 ; xmm1=BOL
+ punpckhwd xmm6, xmm5 ; xmm6=BOH
+ psrld xmm1, 1 ; xmm1=BOL*FIX(0.500)
+ psrld xmm6, 1 ; xmm6=BOH*FIX(0.500)
+
+ movdqa xmm5, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm5=[PD_ONEHALFM1_CJ]
+
+ paddd xmm7, xmm1
+ paddd xmm4, xmm6
+ paddd xmm7, xmm5
+ paddd xmm4, xmm5
+ psrld xmm7, SCALEBITS ; xmm7=CbOL
+ psrld xmm4, SCALEBITS ; xmm4=CbOH
+ packssdw xmm7, xmm4 ; xmm7=CbO
+
+ movdqa xmm1, XMMWORD [wk(2)] ; xmm1=BE
+
+ movdqa xmm6, xmm0
+ punpcklwd xmm0, xmm2
+ punpckhwd xmm6, xmm2
+ movdqa xmm5, xmm0
+ movdqa xmm4, xmm6
+ pmaddwd xmm0, [GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
+ pmaddwd xmm6, [GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
+ pmaddwd xmm5, [GOTOFF(eax,PW_MF016_MF033)] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
+ pmaddwd xmm4, [GOTOFF(eax,PW_MF016_MF033)] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
+
+ movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
+ movdqa XMMWORD [wk(7)], xmm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+ pxor xmm0, xmm0
+ pxor xmm6, xmm6
+ punpcklwd xmm0, xmm1 ; xmm0=BEL
+ punpckhwd xmm6, xmm1 ; xmm6=BEH
+ psrld xmm0, 1 ; xmm0=BEL*FIX(0.500)
+ psrld xmm6, 1 ; xmm6=BEH*FIX(0.500)
+
+ movdqa xmm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ]
+
+ paddd xmm5, xmm0
+ paddd xmm4, xmm6
+ paddd xmm5, xmm1
+ paddd xmm4, xmm1
+ psrld xmm5, SCALEBITS ; xmm5=CbEL
+ psrld xmm4, SCALEBITS ; xmm4=CbEH
+ packssdw xmm5, xmm4 ; xmm5=CbE
+
+ psllw xmm7, BYTE_BIT
+ por xmm5, xmm7 ; xmm5=Cb
+ movdqa XMMWORD [ebx], xmm5 ; Save Cb
+
+ movdqa xmm0, XMMWORD [wk(3)] ; xmm0=BO
+ movdqa xmm6, XMMWORD [wk(2)] ; xmm6=BE
+ movdqa xmm1, XMMWORD [wk(1)] ; xmm1=RO
+
+ movdqa xmm4, xmm0
+ punpcklwd xmm0, xmm3
+ punpckhwd xmm4, xmm3
+ movdqa xmm7, xmm0
+ movdqa xmm5, xmm4
+ pmaddwd xmm0, [GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+ pmaddwd xmm4, [GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+ pmaddwd xmm7, [GOTOFF(eax,PW_MF008_MF041)] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
+ pmaddwd xmm5, [GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
+
+ movdqa xmm3, [GOTOFF(eax,PD_ONEHALF)] ; xmm3=[PD_ONEHALF]
+
+ paddd xmm0, XMMWORD [wk(4)]
+ paddd xmm4, XMMWORD [wk(5)]
+ paddd xmm0, xmm3
+ paddd xmm4, xmm3
+ psrld xmm0, SCALEBITS ; xmm0=YOL
+ psrld xmm4, SCALEBITS ; xmm4=YOH
+ packssdw xmm0, xmm4 ; xmm0=YO
+
+ pxor xmm3, xmm3
+ pxor xmm4, xmm4
+ punpcklwd xmm3, xmm1 ; xmm3=ROL
+ punpckhwd xmm4, xmm1 ; xmm4=ROH
+ psrld xmm3, 1 ; xmm3=ROL*FIX(0.500)
+ psrld xmm4, 1 ; xmm4=ROH*FIX(0.500)
+
+ movdqa xmm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ]
+
+ paddd xmm7, xmm3
+ paddd xmm5, xmm4
+ paddd xmm7, xmm1
+ paddd xmm5, xmm1
+ psrld xmm7, SCALEBITS ; xmm7=CrOL
+ psrld xmm5, SCALEBITS ; xmm5=CrOH
+ packssdw xmm7, xmm5 ; xmm7=CrO
+
+ movdqa xmm3, XMMWORD [wk(0)] ; xmm3=RE
+
+ movdqa xmm4, xmm6
+ punpcklwd xmm6, xmm2
+ punpckhwd xmm4, xmm2
+ movdqa xmm1, xmm6
+ movdqa xmm5, xmm4
+ pmaddwd xmm6, [GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+ pmaddwd xmm4, [GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+ pmaddwd xmm1, [GOTOFF(eax,PW_MF008_MF041)] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
+ pmaddwd xmm5, [GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
+
+ movdqa xmm2, [GOTOFF(eax,PD_ONEHALF)] ; xmm2=[PD_ONEHALF]
+
+ paddd xmm6, XMMWORD [wk(6)]
+ paddd xmm4, XMMWORD [wk(7)]
+ paddd xmm6, xmm2
+ paddd xmm4, xmm2
+ psrld xmm6, SCALEBITS ; xmm6=YEL
+ psrld xmm4, SCALEBITS ; xmm4=YEH
+ packssdw xmm6, xmm4 ; xmm6=YE
+
+ psllw xmm0, BYTE_BIT
+ por xmm6, xmm0 ; xmm6=Y
+ movdqa XMMWORD [edi], xmm6 ; Save Y
+
+ pxor xmm2, xmm2
+ pxor xmm4, xmm4
+ punpcklwd xmm2, xmm3 ; xmm2=REL
+ punpckhwd xmm4, xmm3 ; xmm4=REH
+ psrld xmm2, 1 ; xmm2=REL*FIX(0.500)
+ psrld xmm4, 1 ; xmm4=REH*FIX(0.500)
+
+ movdqa xmm0, [GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm0=[PD_ONEHALFM1_CJ]
+
+ paddd xmm1, xmm2
+ paddd xmm5, xmm4
+ paddd xmm1, xmm0
+ paddd xmm5, xmm0
+ psrld xmm1, SCALEBITS ; xmm1=CrEL
+ psrld xmm5, SCALEBITS ; xmm5=CrEH
+ packssdw xmm1, xmm5 ; xmm1=CrE
+
+ psllw xmm7, BYTE_BIT
+ por xmm1, xmm7 ; xmm1=Cr
+ movdqa XMMWORD [edx], xmm1 ; Save Cr
+
+ sub ecx, byte SIZEOF_XMMWORD
+ add esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr
+ add edi, byte SIZEOF_XMMWORD ; outptr0
+ add ebx, byte SIZEOF_XMMWORD ; outptr1
+ add edx, byte SIZEOF_XMMWORD ; outptr2
+ cmp ecx, byte SIZEOF_XMMWORD
+ jae near .columnloop
+ test ecx, ecx
+ jnz near .column_ld1
+
+ pop ecx ; col
+ pop esi
+ pop edi
+ pop ebx
+ pop edx
+ poppic eax
+
+ add esi, byte SIZEOF_JSAMPROW ; input_buf
+ add edi, byte SIZEOF_JSAMPROW
+ add ebx, byte SIZEOF_JSAMPROW
+ add edx, byte SIZEOF_JSAMPROW
+ dec eax ; num_rows
+ jg near .rowloop
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jccolor-avx2.asm b/media/libjpeg/simd/i386/jccolor-avx2.asm
new file mode 100644
index 0000000000..14944e952f
--- /dev/null
+++ b/media/libjpeg/simd/i386/jccolor-avx2.asm
@@ -0,0 +1,121 @@
+;
+; jccolor.asm - colorspace conversion (AVX2)
+;
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS 16
+
+F_0_081 equ 5329 ; FIX(0.08131)
+F_0_114 equ 7471 ; FIX(0.11400)
+F_0_168 equ 11059 ; FIX(0.16874)
+F_0_250 equ 16384 ; FIX(0.25000)
+F_0_299 equ 19595 ; FIX(0.29900)
+F_0_331 equ 21709 ; FIX(0.33126)
+F_0_418 equ 27439 ; FIX(0.41869)
+F_0_587 equ 38470 ; FIX(0.58700)
+F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
+
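The equ values above follow FIX(x) = round(x * 2^16): for example, 0.29900 * 65536 = 19595.264, which rounds to F_0_299 = 19595, and 0.11400 * 65536 = 7471.104 gives F_0_114 = 7471. F_0_337 is derived as F_0_587 - F_0_250 = 38470 - 16384 = 22086 rather than rounded directly, so that adding the separate 0.250 * G term reproduces FIX(0.587) * G exactly in the Y computation.
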
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_rgb_ycc_convert_avx2)
+
+EXTN(jconst_rgb_ycc_convert_avx2):
+
+PW_F0299_F0337 times 8 dw F_0_299, F_0_337
+PW_F0114_F0250 times 8 dw F_0_114, F_0_250
+PW_MF016_MF033 times 8 dw -F_0_168, -F_0_331
+PW_MF008_MF041 times 8 dw -F_0_081, -F_0_418
+PD_ONEHALFM1_CJ times 8 dd (1 << (SCALEBITS - 1)) - 1 + \
+ (CENTERJSAMPLE << SCALEBITS)
+PD_ONEHALF times 8 dd (1 << (SCALEBITS - 1))
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2 jsimd_extrgb_ycc_convert_avx2
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2 jsimd_extrgbx_ycc_convert_avx2
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2 jsimd_extbgr_ycc_convert_avx2
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2 jsimd_extbgrx_ycc_convert_avx2
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2 jsimd_extxbgr_ycc_convert_avx2
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2 jsimd_extxrgb_ycc_convert_avx2
+%include "jccolext-avx2.asm"
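+
+; jccolext-avx2.asm is used as a template: each pass redefines RGB_RED,
+; RGB_GREEN, RGB_BLUE, and RGB_PIXELSIZE to describe one pixel layout and
+; renames jsimd_rgb_ycc_convert_avx2, so the seven %includes above emit
+; seven layout-specific converters from a single source file.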
diff --git a/media/libjpeg/simd/i386/jccolor-mmx.asm b/media/libjpeg/simd/i386/jccolor-mmx.asm
new file mode 100644
index 0000000000..8cb399bdc4
--- /dev/null
+++ b/media/libjpeg/simd/i386/jccolor-mmx.asm
@@ -0,0 +1,121 @@
+;
+; jccolor.asm - colorspace conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and can
+; *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS 16
+
+F_0_081 equ 5329 ; FIX(0.08131)
+F_0_114 equ 7471 ; FIX(0.11400)
+F_0_168 equ 11059 ; FIX(0.16874)
+F_0_250 equ 16384 ; FIX(0.25000)
+F_0_299 equ 19595 ; FIX(0.29900)
+F_0_331 equ 21709 ; FIX(0.33126)
+F_0_418 equ 27439 ; FIX(0.41869)
+F_0_587 equ 38470 ; FIX(0.58700)
+F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_rgb_ycc_convert_mmx)
+
+EXTN(jconst_rgb_ycc_convert_mmx):
+
+PW_F0299_F0337 times 2 dw F_0_299, F_0_337
+PW_F0114_F0250 times 2 dw F_0_114, F_0_250
+PW_MF016_MF033 times 2 dw -F_0_168, -F_0_331
+PW_MF008_MF041 times 2 dw -F_0_081, -F_0_418
+PD_ONEHALFM1_CJ times 2 dd (1 << (SCALEBITS - 1)) - 1 + \
+ (CENTERJSAMPLE << SCALEBITS)
+PD_ONEHALF times 2 dd (1 << (SCALEBITS - 1))
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+
+%include "jccolext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_rgb_ycc_convert_mmx jsimd_extrgb_ycc_convert_mmx
+%include "jccolext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_rgb_ycc_convert_mmx jsimd_extrgbx_ycc_convert_mmx
+%include "jccolext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_rgb_ycc_convert_mmx jsimd_extbgr_ycc_convert_mmx
+%include "jccolext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_rgb_ycc_convert_mmx jsimd_extbgrx_ycc_convert_mmx
+%include "jccolext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_rgb_ycc_convert_mmx jsimd_extxbgr_ycc_convert_mmx
+%include "jccolext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_rgb_ycc_convert_mmx jsimd_extxrgb_ycc_convert_mmx
+%include "jccolext-mmx.asm"
diff --git a/media/libjpeg/simd/i386/jccolor-sse2.asm b/media/libjpeg/simd/i386/jccolor-sse2.asm
new file mode 100644
index 0000000000..686d222ff7
--- /dev/null
+++ b/media/libjpeg/simd/i386/jccolor-sse2.asm
@@ -0,0 +1,120 @@
+;
+; jccolor.asm - colorspace conversion (SSE2)
+;
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and can
+; *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS 16
+
+F_0_081 equ 5329 ; FIX(0.08131)
+F_0_114 equ 7471 ; FIX(0.11400)
+F_0_168 equ 11059 ; FIX(0.16874)
+F_0_250 equ 16384 ; FIX(0.25000)
+F_0_299 equ 19595 ; FIX(0.29900)
+F_0_331 equ 21709 ; FIX(0.33126)
+F_0_418 equ 27439 ; FIX(0.41869)
+F_0_587 equ 38470 ; FIX(0.58700)
+F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_rgb_ycc_convert_sse2)
+
+EXTN(jconst_rgb_ycc_convert_sse2):
+
+PW_F0299_F0337 times 4 dw F_0_299, F_0_337
+PW_F0114_F0250 times 4 dw F_0_114, F_0_250
+PW_MF016_MF033 times 4 dw -F_0_168, -F_0_331
+PW_MF008_MF041 times 4 dw -F_0_081, -F_0_418
+PD_ONEHALFM1_CJ times 4 dd (1 << (SCALEBITS - 1)) - 1 + \
+ (CENTERJSAMPLE << SCALEBITS)
+PD_ONEHALF times 4 dd (1 << (SCALEBITS - 1))
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+
+%include "jccolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extrgb_ycc_convert_sse2
+%include "jccolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extrgbx_ycc_convert_sse2
+%include "jccolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extbgr_ycc_convert_sse2
+%include "jccolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extbgrx_ycc_convert_sse2
+%include "jccolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extxbgr_ycc_convert_sse2
+%include "jccolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extxrgb_ycc_convert_sse2
+%include "jccolext-sse2.asm"
diff --git a/media/libjpeg/simd/i386/jcgray-avx2.asm b/media/libjpeg/simd/i386/jcgray-avx2.asm
new file mode 100644
index 0000000000..560ee0c71e
--- /dev/null
+++ b/media/libjpeg/simd/i386/jcgray-avx2.asm
@@ -0,0 +1,113 @@
+;
+; jcgray.asm - grayscale colorspace conversion (AVX2)
+;
+; Copyright (C) 2011, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and can
+; *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS 16
+
+F_0_114 equ 7471 ; FIX(0.11400)
+F_0_250 equ 16384 ; FIX(0.25000)
+F_0_299 equ 19595 ; FIX(0.29900)
+F_0_587 equ 38470 ; FIX(0.58700)
+F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_rgb_gray_convert_avx2)
+
+EXTN(jconst_rgb_gray_convert_avx2):
+
+PW_F0299_F0337 times 8 dw F_0_299, F_0_337
+PW_F0114_F0250 times 8 dw F_0_114, F_0_250
+PD_ONEHALF times 8 dd (1 << (SCALEBITS - 1))
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2 jsimd_extrgb_gray_convert_avx2
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2 jsimd_extrgbx_gray_convert_avx2
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2 jsimd_extbgr_gray_convert_avx2
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2 jsimd_extbgrx_gray_convert_avx2
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2 jsimd_extxbgr_gray_convert_avx2
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2 jsimd_extxrgb_gray_convert_avx2
+%include "jcgryext-avx2.asm"
diff --git a/media/libjpeg/simd/i386/jcgray-mmx.asm b/media/libjpeg/simd/i386/jcgray-mmx.asm
new file mode 100644
index 0000000000..79fdf082a8
--- /dev/null
+++ b/media/libjpeg/simd/i386/jcgray-mmx.asm
@@ -0,0 +1,113 @@
+;
+; jcgray.asm - grayscale colorspace conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2011, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and can
+; *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS 16
+
+F_0_114 equ 7471 ; FIX(0.11400)
+F_0_250 equ 16384 ; FIX(0.25000)
+F_0_299 equ 19595 ; FIX(0.29900)
+F_0_587 equ 38470 ; FIX(0.58700)
+F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_rgb_gray_convert_mmx)
+
+EXTN(jconst_rgb_gray_convert_mmx):
+
+PW_F0299_F0337 times 2 dw F_0_299, F_0_337
+PW_F0114_F0250 times 2 dw F_0_114, F_0_250
+PD_ONEHALF times 2 dd (1 << (SCALEBITS - 1))
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+
+%include "jcgryext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_rgb_gray_convert_mmx jsimd_extrgb_gray_convert_mmx
+%include "jcgryext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_rgb_gray_convert_mmx jsimd_extrgbx_gray_convert_mmx
+%include "jcgryext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_rgb_gray_convert_mmx jsimd_extbgr_gray_convert_mmx
+%include "jcgryext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_rgb_gray_convert_mmx jsimd_extbgrx_gray_convert_mmx
+%include "jcgryext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_rgb_gray_convert_mmx jsimd_extxbgr_gray_convert_mmx
+%include "jcgryext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_rgb_gray_convert_mmx jsimd_extxrgb_gray_convert_mmx
+%include "jcgryext-mmx.asm"
diff --git a/media/libjpeg/simd/i386/jcgray-sse2.asm b/media/libjpeg/simd/i386/jcgray-sse2.asm
new file mode 100644
index 0000000000..cb4b28e8f4
--- /dev/null
+++ b/media/libjpeg/simd/i386/jcgray-sse2.asm
@@ -0,0 +1,112 @@
+;
+; jcgray.asm - grayscale colorspace conversion (SSE2)
+;
+; Copyright (C) 2011, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and can
+; *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS 16
+
+F_0_114 equ 7471 ; FIX(0.11400)
+F_0_250 equ 16384 ; FIX(0.25000)
+F_0_299 equ 19595 ; FIX(0.29900)
+F_0_587 equ 38470 ; FIX(0.58700)
+F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_rgb_gray_convert_sse2)
+
+EXTN(jconst_rgb_gray_convert_sse2):
+
+PW_F0299_F0337 times 4 dw F_0_299, F_0_337
+PW_F0114_F0250 times 4 dw F_0_114, F_0_250
+PD_ONEHALF times 4 dd (1 << (SCALEBITS - 1))
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+
+%include "jcgryext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_rgb_gray_convert_sse2 jsimd_extrgb_gray_convert_sse2
+%include "jcgryext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_rgb_gray_convert_sse2 jsimd_extrgbx_gray_convert_sse2
+%include "jcgryext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_rgb_gray_convert_sse2 jsimd_extbgr_gray_convert_sse2
+%include "jcgryext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_rgb_gray_convert_sse2 jsimd_extbgrx_gray_convert_sse2
+%include "jcgryext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_rgb_gray_convert_sse2 jsimd_extxbgr_gray_convert_sse2
+%include "jcgryext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_rgb_gray_convert_sse2 jsimd_extxrgb_gray_convert_sse2
+%include "jcgryext-sse2.asm"
diff --git a/media/libjpeg/simd/i386/jcgryext-avx2.asm b/media/libjpeg/simd/i386/jcgryext-avx2.asm
new file mode 100644
index 0000000000..3fa7973d72
--- /dev/null
+++ b/media/libjpeg/simd/i386/jcgryext-avx2.asm
@@ -0,0 +1,457 @@
+;
+; jcgryext.asm - grayscale colorspace conversion (AVX2)
+;
+; Copyright (C) 2011, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and can
+; *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_gray_convert_avx2(JDIMENSION img_width, JSAMPARRAY input_buf,
+; JSAMPIMAGE output_buf, JDIMENSION output_row,
+; int num_rows);
+;
+
+%define img_width(b) (b) + 8 ; JDIMENSION img_width
+%define input_buf(b) (b) + 12 ; JSAMPARRAY input_buf
+%define output_buf(b) (b) + 16 ; JSAMPIMAGE output_buf
+%define output_row(b) (b) + 20 ; JDIMENSION output_row
+%define num_rows(b) (b) + 24 ; int num_rows
+
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD
+ ; ymmword wk[WK_NUM]
+%define WK_NUM 2
+%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr
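+
+; The argument accessors take b = eax, which the prologue sets to esp just
+; after ebp is pushed, so the return address sits at eax+4 and the first
+; argument at eax+8.  wk(i) addresses WK_NUM aligned scratch slots below
+; the realigned ebp, and gotptr saves the GOT pointer just beneath them.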
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_rgb_gray_convert_avx2)
+
+EXTN(jsimd_rgb_gray_convert_avx2):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_YMMWORD) ; align to 256 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)]
+        pushpic     eax                     ; make room for the GOT address
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+ movpic POINTER [gotptr], ebx ; save GOT address
+
+ mov ecx, JDIMENSION [img_width(eax)]
+ test ecx, ecx
+ jz near .return
+
+ push ecx
+
+ mov esi, JSAMPIMAGE [output_buf(eax)]
+ mov ecx, JDIMENSION [output_row(eax)]
+ mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
+ lea edi, [edi+ecx*SIZEOF_JSAMPROW]
+
+ pop ecx
+
+ mov esi, JSAMPARRAY [input_buf(eax)]
+ mov eax, INT [num_rows(eax)]
+ test eax, eax
+ jle near .return
+ alignx 16, 7
+.rowloop:
+ pushpic eax
+ push edi
+ push esi
+ push ecx ; col
+
+ mov esi, JSAMPROW [esi] ; inptr
+ mov edi, JSAMPROW [edi] ; outptr0
+ movpic eax, POINTER [gotptr] ; load GOT address (eax)
+
+ cmp ecx, byte SIZEOF_YMMWORD
+ jae near .columnloop
+ alignx 16, 7
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+.column_ld1:
+ push eax
+ push edx
+ lea ecx, [ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE
+ test cl, SIZEOF_BYTE
+ jz short .column_ld2
+ sub ecx, byte SIZEOF_BYTE
+ movzx eax, byte [esi+ecx]
+.column_ld2:
+ test cl, SIZEOF_WORD
+ jz short .column_ld4
+ sub ecx, byte SIZEOF_WORD
+ movzx edx, word [esi+ecx]
+ shl eax, WORD_BIT
+ or eax, edx
+.column_ld4:
+ vmovd xmmA, eax
+ pop edx
+ pop eax
+ test cl, SIZEOF_DWORD
+ jz short .column_ld8
+ sub ecx, byte SIZEOF_DWORD
+ vmovd xmmF, XMM_DWORD [esi+ecx]
+ vpslldq xmmA, xmmA, SIZEOF_DWORD
+ vpor xmmA, xmmA, xmmF
+.column_ld8:
+ test cl, SIZEOF_MMWORD
+ jz short .column_ld16
+ sub ecx, byte SIZEOF_MMWORD
+ vmovq xmmB, XMM_MMWORD [esi+ecx]
+ vpslldq xmmA, xmmA, SIZEOF_MMWORD
+ vpor xmmA, xmmA, xmmB
+.column_ld16:
+ test cl, SIZEOF_XMMWORD
+ jz short .column_ld32
+ sub ecx, byte SIZEOF_XMMWORD
+ vmovdqu xmmB, XMM_MMWORD [esi+ecx]
+ vperm2i128 ymmA, ymmA, ymmA, 1
+ vpor ymmA, ymmB
+.column_ld32:
+ test cl, SIZEOF_YMMWORD
+ jz short .column_ld64
+ sub ecx, byte SIZEOF_YMMWORD
+ vmovdqa ymmF, ymmA
+ vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
+.column_ld64:
+ test cl, 2*SIZEOF_YMMWORD
+ mov ecx, SIZEOF_YMMWORD
+ jz short .rgb_gray_cnv
+ vmovdqa ymmB, ymmA
+ vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
+ vmovdqu ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
+ jmp short .rgb_gray_cnv
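+        ; The tail loader above handles a partial final group: the leftover
+        ; byte count is decomposed bit by bit (1-, 2-, 4-, 8-, 16-, and
+        ; 32-byte chunks), each chunk is loaded from the end of the row and
+        ; shifted/ORed into the accumulating vector, and ecx is then reset
+        ; to a full SIZEOF_YMMWORD so the conversion code can treat the
+        ; fragment as one whole group.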
+ alignx 16, 7
+
+.columnloop:
+ vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
+ vmovdqu ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
+ vmovdqu ymmB, YMMWORD [esi+2*SIZEOF_YMMWORD]
+
+.rgb_gray_cnv:
+ ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+ ; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+ ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+ ; ymmB=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+ ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+ vmovdqu ymmC, ymmA
+ vinserti128 ymmA, ymmF, xmmA, 0 ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+ ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+ vinserti128 ymmC, ymmC, xmmB, 0 ; ymmC=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+ ; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ vinserti128 ymmB, ymmB, xmmF, 0 ; ymmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+ ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+ vperm2i128 ymmF, ymmC, ymmC, 1 ; ymmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
+ ; 1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
+
+ vmovdqa ymmG, ymmA
+ vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12
+ ; 22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I)
+ vpsrldq ymmG, ymmG, 8 ; ymmG=(22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I
+ ; 2I 0J 1J 2J 0K 1K 2K 0L -- -- -- -- -- -- -- --)
+
+ vpunpckhbw ymmA, ymmA, ymmF ; ymmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A
+ ; 0G 0O 1G 1O 2G 2O 0H 0P 1H 1P 2H 2P 0I 0Q 1I 1Q)
+ vpslldq ymmF, ymmF, 8 ; ymmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27
+ ; 08 18 28 09 19 29 0A 1A 1L 2L 0M 1M 2M 0N 1N 2N)
+
+ vpunpcklbw ymmG, ymmG, ymmB ; ymmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D
+ ; 2I 2Q 0J 0R 1J 1R 2J 2R 0K 0S 1K 1S 2K 2S 0L 0T)
+ vpunpckhbw ymmF, ymmF, ymmB ; ymmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F
+ ; 1L 1T 2L 2T 0M 0U 1M 1U 2M 2U 0N 0V 1N 1V 2N 2V)
+
+ vmovdqa ymmD, ymmA
+ vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09
+ ; 11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P)
+ vpsrldq ymmD, ymmD, 8 ; ymmD=(11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P
+ ; 1H 1P 2H 2P 0I 0Q 1I 1Q -- -- -- -- -- -- -- --)
+
+ vpunpckhbw ymmA, ymmA, ymmG ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D
+ ; 0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 0H 0L 0P 0T)
+ vpslldq ymmG, ymmG, 8 ; ymmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B
+ ; 04 0C 14 1C 24 2C 05 0D 2I 2Q 0J 0R 1J 1R 2J 2R)
+
+ vpunpcklbw ymmD, ymmD, ymmF ; ymmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E
+ ; 1H 1L 1P 1T 2H 2L 2P 2T 0I 0M 0Q 0U 1I 1M 1Q 1U)
+ vpunpckhbw ymmG, ymmG, ymmF ; ymmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F
+ ; 2I 2M 2Q 2U 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V)
+
+ vmovdqa ymmE, ymmA
+ vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C
+ ; 20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S)
+ vpsrldq ymmE, ymmE, 8 ; ymmE=(20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S
+ ; 2G 2K 2O 2S 0H 0L 0P 0T -- -- -- -- -- -- -- --)
+
+ vpunpckhbw ymmA, ymmA, ymmD ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
+ ; 0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
+ vpslldq ymmD, ymmD, 8 ; ymmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D
+ ; 02 06 0A 0E 12 16 1A 1E 1H 1L 1P 1T 2H 2L 2P 2T)
+
+ vpunpcklbw ymmE, ymmE, ymmG ; ymmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F
+ ; 2G 2I 2K 2M 2O 2Q 2S 2U 0H 0J 0L 0N 0P 0R 0T 0V)
+ vpunpckhbw ymmD, ymmD, ymmG ; ymmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F
+ ; 1H 1J 1L 1N 1P 1R 1T 1V 2H 2J 2L 2N 2P 2R 2T 2V)
+
+ vpxor ymmH, ymmH, ymmH
+
+ vmovdqa ymmC, ymmA
+ vpunpcklbw ymmA, ymmA, ymmH ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
+ vpunpckhbw ymmC, ymmC, ymmH ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
+
+ vmovdqa ymmB, ymmE
+ vpunpcklbw ymmE, ymmE, ymmH ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
+ vpunpckhbw ymmB, ymmB, ymmH ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
+
+ vmovdqa ymmF, ymmD
+ vpunpcklbw ymmD, ymmD, ymmH ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
+ vpunpckhbw ymmF, ymmF, ymmH ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+ test cl, SIZEOF_XMMWORD/16
+ jz short .column_ld2
+ sub ecx, byte SIZEOF_XMMWORD/16
+ vmovd xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld2:
+ test cl, SIZEOF_XMMWORD/8
+ jz short .column_ld4
+ sub ecx, byte SIZEOF_XMMWORD/8
+ vmovq xmmF, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
+ vpslldq xmmA, xmmA, SIZEOF_MMWORD
+ vpor xmmA, xmmA, xmmF
+.column_ld4:
+ test cl, SIZEOF_XMMWORD/4
+ jz short .column_ld8
+ sub ecx, byte SIZEOF_XMMWORD/4
+ vmovdqa xmmF, xmmA
+ vperm2i128 ymmF, ymmF, ymmF, 1
+ vmovdqu xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
+ vpor ymmA, ymmA, ymmF
+.column_ld8:
+ test cl, SIZEOF_XMMWORD/2
+ jz short .column_ld16
+ sub ecx, byte SIZEOF_XMMWORD/2
+ vmovdqa ymmF, ymmA
+ vmovdqu ymmA, YMMWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld16:
+ test cl, SIZEOF_XMMWORD
+ mov ecx, SIZEOF_YMMWORD
+ jz short .rgb_gray_cnv
+ vmovdqa ymmE, ymmA
+ vmovdqa ymmH, ymmF
+ vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
+ vmovdqu ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
+ jmp short .rgb_gray_cnv
+ alignx 16, 7
+
+.columnloop:
+ vmovdqu ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
+ vmovdqu ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
+ vmovdqu ymmE, YMMWORD [esi+2*SIZEOF_YMMWORD]
+ vmovdqu ymmH, YMMWORD [esi+3*SIZEOF_YMMWORD]
+
+.rgb_gray_cnv:
+ ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ ; 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+ ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+ ; 0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+ ; ymmE=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
+ ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+ ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
+ ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+ vmovdqa ymmB, ymmA
+ vinserti128 ymmA, ymmA, xmmE, 1 ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ ; 0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
+ vperm2i128 ymmE, ymmB, ymmE, 0x31 ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+ ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+
+ vmovdqa ymmB, ymmF
+ vinserti128 ymmF, ymmF, xmmH, 1 ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+ ; 0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
+ vperm2i128 ymmH, ymmB, ymmH, 0x31 ; ymmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
+ ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+ vmovdqa ymmD, ymmA
+ vpunpcklbw ymmA, ymmA, ymmE ; ymmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35
+ ; 0G 0K 1G 1K 2G 2K 3G 3K 0H 0L 1H 1L 2H 2L 3H 3L)
+ vpunpckhbw ymmD, ymmD, ymmE ; ymmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37
+ ; 0I 0M 1I 1M 2I 2M 3I 3M 0J 0N 1J 1N 2J 2N 3J 3N)
+
+ vmovdqa ymmC, ymmF
+ vpunpcklbw ymmF, ymmF, ymmH ; ymmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D
+ ; 0O 0S 1O 1S 2O 2S 3O 3S 0P 0T 1P 1T 2P 2T 3P 3T)
+ vpunpckhbw ymmC, ymmC, ymmH ; ymmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F
+ ; 0Q 0U 1Q 1U 2Q 2U 3Q 3U 0R 0V 1R 1V 2R 2V 3R 3V)
+
+ vmovdqa ymmB, ymmA
+ vpunpcklwd ymmA, ymmA, ymmF ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C
+ ; 0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 3G 3K 3O 3S)
+ vpunpckhwd ymmB, ymmB, ymmF ; ymmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D
+ ; 0H 0L 0P 0T 1H 1L 1P 1T 2H 2L 2P 2T 3H 3L 3P 3T)
+
+ vmovdqa ymmG, ymmD
+ vpunpcklwd ymmD, ymmD, ymmC ; ymmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E
+ ; 0I 0M 0Q 0U 1I 1M 1Q 1U 2I 2M 2Q 2U 3I 3M 3Q 3U)
+ vpunpckhwd ymmG, ymmG, ymmC ; ymmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F
+ ; 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V 3J 3N 3R 3V)
+
+ vmovdqa ymmE, ymmA
+ vpunpcklbw ymmA, ymmA, ymmD ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
+ ; 0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
+ vpunpckhbw ymmE, ymmE, ymmD ; ymmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E
+ ; 2G 2I 2K 2M 2O 2Q 2S 2U 3G 3I 3K 3M 3O 3Q 3S 3U)
+
+ vmovdqa ymmH, ymmB
+ vpunpcklbw ymmB, ymmB, ymmG ; ymmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F
+ ; 0H 0J 0L 0N 0P 0R 0T 0V 1H 1J 1L 1N 1P 1R 1T 1V)
+ vpunpckhbw ymmH, ymmH, ymmG ; ymmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F
+ ; 2H 2J 2L 2N 2P 2R 2T 2V 3H 3J 3L 3N 3P 3R 3T 3V)
+
+ vpxor ymmF, ymmF, ymmF
+
+ vmovdqa ymmC, ymmA
+ vpunpcklbw ymmA, ymmA, ymmF ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
+ vpunpckhbw ymmC, ymmC, ymmF ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
+
+ vmovdqa ymmD, ymmB
+ vpunpcklbw ymmB, ymmB, ymmF ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
+ vpunpckhbw ymmD, ymmD, ymmF ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
+
+ vmovdqa ymmG, ymmE
+ vpunpcklbw ymmE, ymmE, ymmF ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
+ vpunpckhbw ymmG, ymmG, ymmF ; ymmG=(30 32 34 36 38 3A 3C 3E 3G 3I 3K 3M 3O 3Q 3S 3U)
+
+ vpunpcklbw ymmF, ymmF, ymmH
+ vpunpckhbw ymmH, ymmH, ymmH
+ vpsrlw ymmF, ymmF, BYTE_BIT ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
+ vpsrlw ymmH, ymmH, BYTE_BIT ; ymmH=(31 33 35 37 39 3B 3D 3F 3H 3J 3L 3N 3P 3R 3T 3V)
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+ ; ymm0=R(02468ACEGIKMOQSU)=RE, ymm2=G(02468ACEGIKMOQSU)=GE, ymm4=B(02468ACEGIKMOQSU)=BE
+ ; ymm1=R(13579BDFHJLNPRTV)=RO, ymm3=G(13579BDFHJLNPRTV)=GO, ymm5=B(13579BDFHJLNPRTV)=BO
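+        ; (ymmA-ymmH above are aliases that jcolsamp.inc maps onto physical
+        ; registers according to the pixel layout, so after deinterleaving
+        ; the even/odd R, G, and B planes always land in ymm0-ymm5 as
+        ; annotated here.)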
+
+ ; (Original)
+ ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+ ;
+ ; (This implementation)
+ ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
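+        ;
+        ; 0.58700 is split as 0.33700 + 0.25000 because FIX(0.58700) = 38470
+        ; does not fit in the signed 16-bit words consumed by vpmaddwd, so Y
+        ; is formed from two word-pair dot products; per pixel the vector
+        ; code below is roughly equivalent to this scalar sketch:
+        ;
+        ;   y = (r * 19595 + g * 22086) +     /* PW_F0299_F0337 */
+        ;       (b * 7471  + g * 16384) +     /* PW_F0114_F0250 */
+        ;       32768;                        /* PD_ONEHALF     */
+        ;   y >>= 16;                         /* SCALEBITS      */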
+
+ vmovdqa ymm6, ymm1
+ vpunpcklwd ymm1, ymm1, ymm3
+ vpunpckhwd ymm6, ymm6, ymm3
+ vpmaddwd ymm1, ymm1, [GOTOFF(eax,PW_F0299_F0337)] ; ymm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+ vpmaddwd ymm6, ymm6, [GOTOFF(eax,PW_F0299_F0337)] ; ymm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+ vmovdqa ymm7, ymm6 ; ymm7=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+ vmovdqa ymm6, ymm0
+ vpunpcklwd ymm0, ymm0, ymm2
+ vpunpckhwd ymm6, ymm6, ymm2
+ vpmaddwd ymm0, ymm0, [GOTOFF(eax,PW_F0299_F0337)] ; ymm0=REL*FIX(0.299)+GEL*FIX(0.337)
+ vpmaddwd ymm6, ymm6, [GOTOFF(eax,PW_F0299_F0337)] ; ymm6=REH*FIX(0.299)+GEH*FIX(0.337)
+
+ vmovdqa YMMWORD [wk(0)], ymm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
+ vmovdqa YMMWORD [wk(1)], ymm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+ vmovdqa ymm0, ymm5 ; ymm0=BO
+ vmovdqa ymm6, ymm4 ; ymm6=BE
+
+ vmovdqa ymm4, ymm0
+ vpunpcklwd ymm0, ymm0, ymm3
+ vpunpckhwd ymm4, ymm4, ymm3
+ vpmaddwd ymm0, ymm0, [GOTOFF(eax,PW_F0114_F0250)] ; ymm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+ vpmaddwd ymm4, ymm4, [GOTOFF(eax,PW_F0114_F0250)] ; ymm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+
+ vmovdqa ymm3, [GOTOFF(eax,PD_ONEHALF)] ; ymm3=[PD_ONEHALF]
+
+ vpaddd ymm0, ymm0, ymm1
+ vpaddd ymm4, ymm4, ymm7
+ vpaddd ymm0, ymm0, ymm3
+ vpaddd ymm4, ymm4, ymm3
+ vpsrld ymm0, ymm0, SCALEBITS ; ymm0=YOL
+ vpsrld ymm4, ymm4, SCALEBITS ; ymm4=YOH
+ vpackssdw ymm0, ymm0, ymm4 ; ymm0=YO
+
+ vmovdqa ymm4, ymm6
+ vpunpcklwd ymm6, ymm6, ymm2
+ vpunpckhwd ymm4, ymm4, ymm2
+ vpmaddwd ymm6, ymm6, [GOTOFF(eax,PW_F0114_F0250)] ; ymm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+ vpmaddwd ymm4, ymm4, [GOTOFF(eax,PW_F0114_F0250)] ; ymm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+
+ vmovdqa ymm2, [GOTOFF(eax,PD_ONEHALF)] ; ymm2=[PD_ONEHALF]
+
+ vpaddd ymm6, ymm6, YMMWORD [wk(0)]
+ vpaddd ymm4, ymm4, YMMWORD [wk(1)]
+ vpaddd ymm6, ymm6, ymm2
+ vpaddd ymm4, ymm4, ymm2
+ vpsrld ymm6, ymm6, SCALEBITS ; ymm6=YEL
+ vpsrld ymm4, ymm4, SCALEBITS ; ymm4=YEH
+ vpackssdw ymm6, ymm6, ymm4 ; ymm6=YE
+
+ vpsllw ymm0, ymm0, BYTE_BIT
+ vpor ymm6, ymm6, ymm0 ; ymm6=Y
+ vmovdqu YMMWORD [edi], ymm6 ; Save Y
+
+ sub ecx, byte SIZEOF_YMMWORD
+ add esi, RGB_PIXELSIZE*SIZEOF_YMMWORD ; inptr
+ add edi, byte SIZEOF_YMMWORD ; outptr0
+ cmp ecx, byte SIZEOF_YMMWORD
+ jae near .columnloop
+ test ecx, ecx
+ jnz near .column_ld1
+
+ pop ecx ; col
+ pop esi
+ pop edi
+ poppic eax
+
+ add esi, byte SIZEOF_JSAMPROW ; input_buf
+ add edi, byte SIZEOF_JSAMPROW
+ dec eax ; num_rows
+ jg near .rowloop
+
+.return:
+ vzeroupper
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jcgryext-mmx.asm b/media/libjpeg/simd/i386/jcgryext-mmx.asm
new file mode 100644
index 0000000000..8af42e5a33
--- /dev/null
+++ b/media/libjpeg/simd/i386/jcgryext-mmx.asm
@@ -0,0 +1,355 @@
+;
+; jcgryext.asm - grayscale colorspace conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2011, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and can
+; *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_gray_convert_mmx(JDIMENSION img_width, JSAMPARRAY input_buf,
+; JSAMPIMAGE output_buf, JDIMENSION output_row,
+; int num_rows);
+;
+
+%define img_width(b) (b) + 8 ; JDIMENSION img_width
+%define input_buf(b) (b) + 12 ; JSAMPARRAY input_buf
+%define output_buf(b) (b) + 16 ; JSAMPIMAGE output_buf
+%define output_row(b) (b) + 20 ; JDIMENSION output_row
+%define num_rows(b) (b) + 24 ; int num_rows
+
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
+ ; mmword wk[WK_NUM]
+%define WK_NUM 2
+%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_rgb_gray_convert_mmx)
+
+EXTN(jsimd_rgb_gray_convert_mmx):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)]
+        pushpic     eax                     ; make room for the GOT address
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+ movpic POINTER [gotptr], ebx ; save GOT address
+
+ mov ecx, JDIMENSION [img_width(eax)] ; num_cols
+ test ecx, ecx
+ jz near .return
+
+ push ecx
+
+ mov esi, JSAMPIMAGE [output_buf(eax)]
+ mov ecx, JDIMENSION [output_row(eax)]
+ mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
+ lea edi, [edi+ecx*SIZEOF_JSAMPROW]
+
+ pop ecx
+
+ mov esi, JSAMPARRAY [input_buf(eax)]
+ mov eax, INT [num_rows(eax)]
+ test eax, eax
+ jle near .return
+ alignx 16, 7
+.rowloop:
+ pushpic eax
+ push edi
+ push esi
+ push ecx ; col
+
+ mov esi, JSAMPROW [esi] ; inptr
+ mov edi, JSAMPROW [edi] ; outptr0
+ movpic eax, POINTER [gotptr] ; load GOT address (eax)
+
+ cmp ecx, byte SIZEOF_MMWORD
+ jae short .columnloop
+ alignx 16, 7
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+.column_ld1:
+ push eax
+ push edx
+ lea ecx, [ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE
+ test cl, SIZEOF_BYTE
+ jz short .column_ld2
+ sub ecx, byte SIZEOF_BYTE
+ xor eax, eax
+ mov al, byte [esi+ecx]
+.column_ld2:
+ test cl, SIZEOF_WORD
+ jz short .column_ld4
+ sub ecx, byte SIZEOF_WORD
+ xor edx, edx
+ mov dx, word [esi+ecx]
+ shl eax, WORD_BIT
+ or eax, edx
+.column_ld4:
+ movd mmA, eax
+ pop edx
+ pop eax
+ test cl, SIZEOF_DWORD
+ jz short .column_ld8
+ sub ecx, byte SIZEOF_DWORD
+ movd mmG, dword [esi+ecx]
+ psllq mmA, DWORD_BIT
+ por mmA, mmG
+.column_ld8:
+ test cl, SIZEOF_MMWORD
+ jz short .column_ld16
+ movq mmG, mmA
+ movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+ mov ecx, SIZEOF_MMWORD
+ jmp short .rgb_gray_cnv
+.column_ld16:
+ test cl, 2*SIZEOF_MMWORD
+ mov ecx, SIZEOF_MMWORD
+ jz short .rgb_gray_cnv
+ movq mmF, mmA
+ movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+ movq mmG, MMWORD [esi+1*SIZEOF_MMWORD]
+ jmp short .rgb_gray_cnv
+ alignx 16, 7
+
+.columnloop:
+ movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+ movq mmG, MMWORD [esi+1*SIZEOF_MMWORD]
+ movq mmF, MMWORD [esi+2*SIZEOF_MMWORD]
+
+.rgb_gray_cnv:
+ ; mmA=(00 10 20 01 11 21 02 12)
+ ; mmG=(22 03 13 23 04 14 24 05)
+ ; mmF=(15 25 06 16 26 07 17 27)
+
+ movq mmD, mmA
+ psllq mmA, 4*BYTE_BIT ; mmA=(-- -- -- -- 00 10 20 01)
+ psrlq mmD, 4*BYTE_BIT ; mmD=(11 21 02 12 -- -- -- --)
+
+ punpckhbw mmA, mmG ; mmA=(00 04 10 14 20 24 01 05)
+ psllq mmG, 4*BYTE_BIT ; mmG=(-- -- -- -- 22 03 13 23)
+
+ punpcklbw mmD, mmF ; mmD=(11 15 21 25 02 06 12 16)
+ punpckhbw mmG, mmF ; mmG=(22 26 03 07 13 17 23 27)
+
+ movq mmE, mmA
+ psllq mmA, 4*BYTE_BIT ; mmA=(-- -- -- -- 00 04 10 14)
+ psrlq mmE, 4*BYTE_BIT ; mmE=(20 24 01 05 -- -- -- --)
+
+ punpckhbw mmA, mmD ; mmA=(00 02 04 06 10 12 14 16)
+ psllq mmD, 4*BYTE_BIT ; mmD=(-- -- -- -- 11 15 21 25)
+
+ punpcklbw mmE, mmG ; mmE=(20 22 24 26 01 03 05 07)
+ punpckhbw mmD, mmG ; mmD=(11 13 15 17 21 23 25 27)
+
+ pxor mmH, mmH
+
+ movq mmC, mmA
+ punpcklbw mmA, mmH ; mmA=(00 02 04 06)
+ punpckhbw mmC, mmH ; mmC=(10 12 14 16)
+
+ movq mmB, mmE
+ punpcklbw mmE, mmH ; mmE=(20 22 24 26)
+ punpckhbw mmB, mmH ; mmB=(01 03 05 07)
+
+ movq mmF, mmD
+ punpcklbw mmD, mmH ; mmD=(11 13 15 17)
+ punpckhbw mmF, mmH ; mmF=(21 23 25 27)
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+ test cl, SIZEOF_MMWORD/8
+ jz short .column_ld2
+ sub ecx, byte SIZEOF_MMWORD/8
+ movd mmA, dword [esi+ecx*RGB_PIXELSIZE]
+.column_ld2:
+ test cl, SIZEOF_MMWORD/4
+ jz short .column_ld4
+ sub ecx, byte SIZEOF_MMWORD/4
+ movq mmF, mmA
+ movq mmA, MMWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld4:
+ test cl, SIZEOF_MMWORD/2
+ mov ecx, SIZEOF_MMWORD
+ jz short .rgb_gray_cnv
+ movq mmD, mmA
+ movq mmC, mmF
+ movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+ movq mmF, MMWORD [esi+1*SIZEOF_MMWORD]
+ jmp short .rgb_gray_cnv
+ alignx 16, 7
+
+.columnloop:
+ movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+ movq mmF, MMWORD [esi+1*SIZEOF_MMWORD]
+ movq mmD, MMWORD [esi+2*SIZEOF_MMWORD]
+ movq mmC, MMWORD [esi+3*SIZEOF_MMWORD]
+
+.rgb_gray_cnv:
+ ; mmA=(00 10 20 30 01 11 21 31)
+ ; mmF=(02 12 22 32 03 13 23 33)
+ ; mmD=(04 14 24 34 05 15 25 35)
+ ; mmC=(06 16 26 36 07 17 27 37)
+
+ movq mmB, mmA
+ punpcklbw mmA, mmF ; mmA=(00 02 10 12 20 22 30 32)
+ punpckhbw mmB, mmF ; mmB=(01 03 11 13 21 23 31 33)
+
+ movq mmG, mmD
+ punpcklbw mmD, mmC ; mmD=(04 06 14 16 24 26 34 36)
+ punpckhbw mmG, mmC ; mmG=(05 07 15 17 25 27 35 37)
+
+ movq mmE, mmA
+ punpcklwd mmA, mmD ; mmA=(00 02 04 06 10 12 14 16)
+ punpckhwd mmE, mmD ; mmE=(20 22 24 26 30 32 34 36)
+
+ movq mmH, mmB
+ punpcklwd mmB, mmG ; mmB=(01 03 05 07 11 13 15 17)
+ punpckhwd mmH, mmG ; mmH=(21 23 25 27 31 33 35 37)
+
+ pxor mmF, mmF
+
+ movq mmC, mmA
+ punpcklbw mmA, mmF ; mmA=(00 02 04 06)
+ punpckhbw mmC, mmF ; mmC=(10 12 14 16)
+
+ movq mmD, mmB
+ punpcklbw mmB, mmF ; mmB=(01 03 05 07)
+ punpckhbw mmD, mmF ; mmD=(11 13 15 17)
+
+ movq mmG, mmE
+ punpcklbw mmE, mmF ; mmE=(20 22 24 26)
+ punpckhbw mmG, mmF ; mmG=(30 32 34 36)
+
+ punpcklbw mmF, mmH
+ punpckhbw mmH, mmH
+ psrlw mmF, BYTE_BIT ; mmF=(21 23 25 27)
+ psrlw mmH, BYTE_BIT ; mmH=(31 33 35 37)
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+ ; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE
+ ; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO
+
+ ; (Original)
+ ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+ ;
+ ; (This implementation)
+ ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+
+ movq mm6, mm1
+ punpcklwd mm1, mm3
+ punpckhwd mm6, mm3
+ pmaddwd mm1, [GOTOFF(eax,PW_F0299_F0337)] ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+ pmaddwd mm6, [GOTOFF(eax,PW_F0299_F0337)] ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+ movq mm7, mm6 ; mm7=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+ movq mm6, mm0
+ punpcklwd mm0, mm2
+ punpckhwd mm6, mm2
+ pmaddwd mm0, [GOTOFF(eax,PW_F0299_F0337)] ; mm0=REL*FIX(0.299)+GEL*FIX(0.337)
+ pmaddwd mm6, [GOTOFF(eax,PW_F0299_F0337)] ; mm6=REH*FIX(0.299)+GEH*FIX(0.337)
+
+ movq MMWORD [wk(0)], mm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
+ movq MMWORD [wk(1)], mm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+ movq mm0, mm5 ; mm0=BO
+ movq mm6, mm4 ; mm6=BE
+
+ movq mm4, mm0
+ punpcklwd mm0, mm3
+ punpckhwd mm4, mm3
+ pmaddwd mm0, [GOTOFF(eax,PW_F0114_F0250)] ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+ pmaddwd mm4, [GOTOFF(eax,PW_F0114_F0250)] ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+
+ movq mm3, [GOTOFF(eax,PD_ONEHALF)] ; mm3=[PD_ONEHALF]
+
+ paddd mm0, mm1
+ paddd mm4, mm7
+ paddd mm0, mm3
+ paddd mm4, mm3
+ psrld mm0, SCALEBITS ; mm0=YOL
+ psrld mm4, SCALEBITS ; mm4=YOH
+ packssdw mm0, mm4 ; mm0=YO
+
+ movq mm4, mm6
+ punpcklwd mm6, mm2
+ punpckhwd mm4, mm2
+ pmaddwd mm6, [GOTOFF(eax,PW_F0114_F0250)] ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+ pmaddwd mm4, [GOTOFF(eax,PW_F0114_F0250)] ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+
+ movq mm2, [GOTOFF(eax,PD_ONEHALF)] ; mm2=[PD_ONEHALF]
+
+ paddd mm6, MMWORD [wk(0)]
+ paddd mm4, MMWORD [wk(1)]
+ paddd mm6, mm2
+ paddd mm4, mm2
+ psrld mm6, SCALEBITS ; mm6=YEL
+ psrld mm4, SCALEBITS ; mm4=YEH
+ packssdw mm6, mm4 ; mm6=YE
+
+ psllw mm0, BYTE_BIT
+ por mm6, mm0 ; mm6=Y
+ movq MMWORD [edi], mm6 ; Save Y
+
+ sub ecx, byte SIZEOF_MMWORD
+ add esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; inptr
+ add edi, byte SIZEOF_MMWORD ; outptr0
+ cmp ecx, byte SIZEOF_MMWORD
+ jae near .columnloop
+ test ecx, ecx
+ jnz near .column_ld1
+
+ pop ecx ; col
+ pop esi
+ pop edi
+ poppic eax
+
+ add esi, byte SIZEOF_JSAMPROW ; input_buf
+ add edi, byte SIZEOF_JSAMPROW
+ dec eax ; num_rows
+ jg near .rowloop
+
+ emms ; empty MMX state
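+        ; (emms is required because the MMX registers alias the x87 FPU
+        ; register stack; clearing the state lets the caller use floating
+        ; point again.)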
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jcgryext-sse2.asm b/media/libjpeg/simd/i386/jcgryext-sse2.asm
new file mode 100644
index 0000000000..c9d6ff1e35
--- /dev/null
+++ b/media/libjpeg/simd/i386/jcgryext-sse2.asm
@@ -0,0 +1,382 @@
+;
+; jcgryext.asm - grayscale colorspace conversion (SSE2)
+;
+; Copyright (C) 2011, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and can
+; *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_gray_convert_sse2(JDIMENSION img_width, JSAMPARRAY input_buf,
+; JSAMPIMAGE output_buf, JDIMENSION output_row,
+; int num_rows);
+;
+
+%define img_width(b) (b) + 8 ; JDIMENSION img_width
+%define input_buf(b) (b) + 12 ; JSAMPARRAY input_buf
+%define output_buf(b) (b) + 16 ; JSAMPIMAGE output_buf
+%define output_row(b) (b) + 20 ; JDIMENSION output_row
+%define num_rows(b) (b) + 24 ; int num_rows
+
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+ ; xmmword wk[WK_NUM]
+%define WK_NUM 2
+%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_rgb_gray_convert_sse2)
+
+EXTN(jsimd_rgb_gray_convert_sse2):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)]
+        pushpic     eax                     ; make room for the GOT address
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+ movpic POINTER [gotptr], ebx ; save GOT address
+
+ mov ecx, JDIMENSION [img_width(eax)]
+ test ecx, ecx
+ jz near .return
+
+ push ecx
+
+ mov esi, JSAMPIMAGE [output_buf(eax)]
+ mov ecx, JDIMENSION [output_row(eax)]
+ mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
+ lea edi, [edi+ecx*SIZEOF_JSAMPROW]
+
+ pop ecx
+
+ mov esi, JSAMPARRAY [input_buf(eax)]
+ mov eax, INT [num_rows(eax)]
+ test eax, eax
+ jle near .return
+ alignx 16, 7
+.rowloop:
+ pushpic eax
+ push edi
+ push esi
+ push ecx ; col
+
+ mov esi, JSAMPROW [esi] ; inptr
+ mov edi, JSAMPROW [edi] ; outptr0
+ movpic eax, POINTER [gotptr] ; load GOT address (eax)
+
+ cmp ecx, byte SIZEOF_XMMWORD
+ jae near .columnloop
+ alignx 16, 7
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+.column_ld1:
+ push eax
+ push edx
+ lea ecx, [ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE
+ test cl, SIZEOF_BYTE
+ jz short .column_ld2
+ sub ecx, byte SIZEOF_BYTE
+ movzx eax, byte [esi+ecx]
+.column_ld2:
+ test cl, SIZEOF_WORD
+ jz short .column_ld4
+ sub ecx, byte SIZEOF_WORD
+ movzx edx, word [esi+ecx]
+ shl eax, WORD_BIT
+ or eax, edx
+.column_ld4:
+ movd xmmA, eax
+ pop edx
+ pop eax
+ test cl, SIZEOF_DWORD
+ jz short .column_ld8
+ sub ecx, byte SIZEOF_DWORD
+ movd xmmF, XMM_DWORD [esi+ecx]
+ pslldq xmmA, SIZEOF_DWORD
+ por xmmA, xmmF
+.column_ld8:
+ test cl, SIZEOF_MMWORD
+ jz short .column_ld16
+ sub ecx, byte SIZEOF_MMWORD
+ movq xmmB, XMM_MMWORD [esi+ecx]
+ pslldq xmmA, SIZEOF_MMWORD
+ por xmmA, xmmB
+.column_ld16:
+ test cl, SIZEOF_XMMWORD
+ jz short .column_ld32
+ movdqa xmmF, xmmA
+ movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ mov ecx, SIZEOF_XMMWORD
+ jmp short .rgb_gray_cnv
+.column_ld32:
+ test cl, 2*SIZEOF_XMMWORD
+ mov ecx, SIZEOF_XMMWORD
+ jz short .rgb_gray_cnv
+ movdqa xmmB, xmmA
+ movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
+ jmp short .rgb_gray_cnv
+ alignx 16, 7
+
+.columnloop:
+ movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
+ movdqu xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD]
+
+.rgb_gray_cnv:
+ ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+ ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+ movdqa xmmG, xmmA
+ pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
+ psrldq xmmG, 8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
+
+ punpckhbw xmmA, xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
+ pslldq xmmF, 8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
+
+ punpcklbw xmmG, xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
+ punpckhbw xmmF, xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
+
+ movdqa xmmD, xmmA
+ pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
+ psrldq xmmD, 8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
+
+ punpckhbw xmmA, xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
+ pslldq xmmG, 8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
+
+ punpcklbw xmmD, xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
+ punpckhbw xmmG, xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
+
+ movdqa xmmE, xmmA
+ pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
+ psrldq xmmE, 8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
+
+ punpckhbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+ pslldq xmmD, 8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
+
+ punpcklbw xmmE, xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
+ punpckhbw xmmD, xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
+
+ pxor xmmH, xmmH
+
+ movdqa xmmC, xmmA
+ punpcklbw xmmA, xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E)
+ punpckhbw xmmC, xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+ movdqa xmmB, xmmE
+ punpcklbw xmmE, xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E)
+ punpckhbw xmmB, xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F)
+
+ movdqa xmmF, xmmD
+ punpcklbw xmmD, xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F)
+ punpckhbw xmmF, xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F)
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+ test cl, SIZEOF_XMMWORD/16
+ jz short .column_ld2
+ sub ecx, byte SIZEOF_XMMWORD/16
+ movd xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld2:
+ test cl, SIZEOF_XMMWORD/8
+ jz short .column_ld4
+ sub ecx, byte SIZEOF_XMMWORD/8
+ movq xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
+ pslldq xmmA, SIZEOF_MMWORD
+ por xmmA, xmmE
+.column_ld4:
+ test cl, SIZEOF_XMMWORD/4
+ jz short .column_ld8
+ sub ecx, byte SIZEOF_XMMWORD/4
+ movdqa xmmE, xmmA
+ movdqu xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld8:
+ test cl, SIZEOF_XMMWORD/2
+ mov ecx, SIZEOF_XMMWORD
+ jz short .rgb_gray_cnv
+ movdqa xmmF, xmmA
+ movdqa xmmH, xmmE
+ movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
+ jmp short .rgb_gray_cnv
+ alignx 16, 7
+
+.columnloop:
+ movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
+ movdqu xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD]
+ movdqu xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD]
+
+.rgb_gray_cnv:
+ ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+ ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+ ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+ ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+ movdqa xmmD, xmmA
+ punpcklbw xmmA, xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
+ punpckhbw xmmD, xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
+
+ movdqa xmmC, xmmF
+ punpcklbw xmmF, xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
+ punpckhbw xmmC, xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
+
+ movdqa xmmB, xmmA
+ punpcklwd xmmA, xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
+ punpckhwd xmmB, xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
+
+ movdqa xmmG, xmmD
+ punpcklwd xmmD, xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
+ punpckhwd xmmG, xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
+
+ movdqa xmmE, xmmA
+ punpcklbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+ punpckhbw xmmE, xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
+
+ movdqa xmmH, xmmB
+ punpcklbw xmmB, xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
+ punpckhbw xmmH, xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
+
+ pxor xmmF, xmmF
+
+ movdqa xmmC, xmmA
+ punpcklbw xmmA, xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E)
+ punpckhbw xmmC, xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+ movdqa xmmD, xmmB
+ punpcklbw xmmB, xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F)
+ punpckhbw xmmD, xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F)
+
+ movdqa xmmG, xmmE
+ punpcklbw xmmE, xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E)
+ punpckhbw xmmG, xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E)
+
+ punpcklbw xmmF, xmmH
+ punpckhbw xmmH, xmmH
+ psrlw xmmF, BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
+ psrlw xmmH, BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+ ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
+ ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
+
+ ; (Original)
+ ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+ ;
+ ; (This implementation)
+ ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+
+ movdqa xmm6, xmm1
+ punpcklwd xmm1, xmm3
+ punpckhwd xmm6, xmm3
+ pmaddwd xmm1, [GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+ pmaddwd xmm6, [GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+ movdqa xmm7, xmm6 ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+ movdqa xmm6, xmm0
+ punpcklwd xmm0, xmm2
+ punpckhwd xmm6, xmm2
+ pmaddwd xmm0, [GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
+ pmaddwd xmm6, [GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
+
+ movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
+ movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+ movdqa xmm0, xmm5 ; xmm0=BO
+ movdqa xmm6, xmm4 ; xmm6=BE
+
+ movdqa xmm4, xmm0
+ punpcklwd xmm0, xmm3
+ punpckhwd xmm4, xmm3
+ pmaddwd xmm0, [GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+ pmaddwd xmm4, [GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+
+ movdqa xmm3, [GOTOFF(eax,PD_ONEHALF)] ; xmm3=[PD_ONEHALF]
+
+ paddd xmm0, xmm1
+ paddd xmm4, xmm7
+ paddd xmm0, xmm3
+ paddd xmm4, xmm3
+ psrld xmm0, SCALEBITS ; xmm0=YOL
+ psrld xmm4, SCALEBITS ; xmm4=YOH
+ packssdw xmm0, xmm4 ; xmm0=YO
+
+ movdqa xmm4, xmm6
+ punpcklwd xmm6, xmm2
+ punpckhwd xmm4, xmm2
+ pmaddwd xmm6, [GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+ pmaddwd xmm4, [GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+
+ movdqa xmm2, [GOTOFF(eax,PD_ONEHALF)] ; xmm2=[PD_ONEHALF]
+
+ paddd xmm6, XMMWORD [wk(0)]
+ paddd xmm4, XMMWORD [wk(1)]
+ paddd xmm6, xmm2
+ paddd xmm4, xmm2
+ psrld xmm6, SCALEBITS ; xmm6=YEL
+ psrld xmm4, SCALEBITS ; xmm4=YEH
+ packssdw xmm6, xmm4 ; xmm6=YE
+
+ psllw xmm0, BYTE_BIT
+ por xmm6, xmm0 ; xmm6=Y
+ movdqa XMMWORD [edi], xmm6 ; Save Y
+
+ sub ecx, byte SIZEOF_XMMWORD
+ add esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr
+ add edi, byte SIZEOF_XMMWORD ; outptr0
+ cmp ecx, byte SIZEOF_XMMWORD
+ jae near .columnloop
+ test ecx, ecx
+ jnz near .column_ld1
+
+ pop ecx ; col
+ pop esi
+ pop edi
+ poppic eax
+
+ add esi, byte SIZEOF_JSAMPROW ; input_buf
+ add edi, byte SIZEOF_JSAMPROW
+ dec eax ; num_rows
+ jg near .rowloop
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jchuff-sse2.asm b/media/libjpeg/simd/i386/jchuff-sse2.asm
new file mode 100644
index 0000000000..278cf5e83a
--- /dev/null
+++ b/media/libjpeg/simd/i386/jchuff-sse2.asm
@@ -0,0 +1,761 @@
+;
+; jchuff-sse2.asm - Huffman entropy encoding (SSE2)
+;
+; Copyright (C) 2009-2011, 2014-2017, 2019, D. R. Commander.
+; Copyright (C) 2015, Matthieu Darbois.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; and can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains an SSE2 implementation for Huffman coding of one block.
+; The following code is based on jchuff.c; see jchuff.c for more details.
+
+%include "jsimdext.inc"
+
+struc working_state
+.next_output_byte: resp 1 ; => next byte to write in buffer
+.free_in_buffer: resp 1 ; # of byte spaces remaining in buffer
+.cur.put_buffer.simd resq 1 ; current bit accumulation buffer
+.cur.free_bits resd 1 ; # of bits available in it
+.cur.last_dc_val resd 4 ; last DC coef for each component
+.cinfo: resp 1 ; dump_buffer needs access to this
+endstruc
+
+struc c_derived_tbl
+.ehufco: resd 256 ; code for each symbol
+.ehufsi: resb 256 ; length of code for each symbol
+; If no code has been allocated for a symbol S, ehufsi[S] contains 0
+endstruc
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ GLOBAL_DATA(jconst_huff_encode_one_block)
+
+EXTN(jconst_huff_encode_one_block):
+
+ alignz 32
+
+jpeg_mask_bits dq 0x0000, 0x0001, 0x0003, 0x0007
+ dq 0x000f, 0x001f, 0x003f, 0x007f
+ dq 0x00ff, 0x01ff, 0x03ff, 0x07ff
+ dq 0x0fff, 0x1fff, 0x3fff, 0x7fff
+
+times 1 << 14 db 15
+times 1 << 13 db 14
+times 1 << 12 db 13
+times 1 << 11 db 12
+times 1 << 10 db 11
+times 1 << 9 db 10
+times 1 << 8 db 9
+times 1 << 7 db 8
+times 1 << 6 db 7
+times 1 << 5 db 6
+times 1 << 4 db 5
+times 1 << 3 db 4
+times 1 << 2 db 3
+times 1 << 1 db 2
+times 1 << 0 db 1
+times 1 db 0
+jpeg_nbits_table:
+times 1 db 0
+times 1 << 0 db 1
+times 1 << 1 db 2
+times 1 << 2 db 3
+times 1 << 3 db 4
+times 1 << 4 db 5
+times 1 << 5 db 6
+times 1 << 6 db 7
+times 1 << 7 db 8
+times 1 << 8 db 9
+times 1 << 9 db 10
+times 1 << 10 db 11
+times 1 << 11 db 12
+times 1 << 12 db 13
+times 1 << 13 db 14
+times 1 << 14 db 15
+
+ alignz 32
+
+%ifdef PIC
+%define NBITS(x) nbits_base + x
+%else
+%define NBITS(x) jpeg_nbits_table + x
+%endif
+%define MASK_BITS(x) NBITS((x) * 8) + (jpeg_mask_bits - jpeg_nbits_table)
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+
+%define mm_put_buffer mm0
+%define mm_all_0xff mm1
+%define mm_temp mm2
+%define mm_nbits mm3
+%define mm_code_bits mm3
+%define mm_code mm4
+%define mm_overflow_bits mm5
+%define mm_save_nbits mm6
+
+; Shorthand used to describe SIMD operations:
+; wN: xmmN treated as eight signed 16-bit values
+; wN[i]: perform the same operation on all eight signed 16-bit values, i=0..7
+; bN: xmmN treated as 16 unsigned 8-bit values, or
+; mmN treated as eight unsigned 8-bit values
+; bN[i]: perform the same operation on all unsigned 8-bit values,
+; i=0..15 (SSE register) or i=0..7 (MMX register)
+; Contents of SIMD registers are shown in memory order.
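+;
+; For example, "w0[i] = w0[i] + w1[i]" in this notation would describe
+; paddw xmm0, xmm1.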
+
+; Fill the bit buffer to capacity with the leading bits from code, then output
+; the bit buffer and put the remaining bits from code into the bit buffer.
+;
+; Usage:
+; code - contains the bits to shift into the bit buffer (LSB-aligned)
+; %1 - temp register
+; %2 - low byte of temp register
+; %3 - second byte of temp register
+; %4-%8 (optional) - extra instructions to execute before the macro completes
+; %9 - the label to which to jump when the macro completes
+;
+; Upon completion, free_bits will be set to the number of remaining bits from
+; code, and put_buffer will contain those remaining bits. temp and code will
+; be clobbered.
+;
+; This macro encodes any 0xFF bytes as 0xFF 0x00, as does the EMIT_BYTE()
+; macro in jchuff.c.
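+;
+; Roughly, the slow path below behaves like this C sketch (illustrative
+; names, not the actual jchuff.c code):
+;
+;   for (i = 0; i < 8; i++) {        /* bytes of the qword, MSB first */
+;     *buffer++ = byte[i];
+;     if (byte[i] == 0xFF)
+;       *buffer++ = 0;               /* stuff 0x00 after a 0xFF byte */
+;   }
+;
+; The fast path stores all 8 bytes with two 32-bit writes when pmovmskb
+; shows that no byte equals 0xFF.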
+
+%macro EMIT_QWORD 9
+%define %%temp %1
+%define %%tempb %2
+%define %%temph %3
+ add nbits, free_bits ; nbits += free_bits;
+ neg free_bits ; free_bits = -free_bits;
+ movq mm_temp, mm_code ; temp = code;
+ movd mm_nbits, nbits ; nbits --> MMX register
+ movd mm_overflow_bits, free_bits ; overflow_bits (temp register) = free_bits;
+ neg free_bits ; free_bits = -free_bits;
+ psllq mm_put_buffer, mm_nbits ; put_buffer <<= nbits;
+ psrlq mm_temp, mm_overflow_bits ; temp >>= overflow_bits;
+ add free_bits, 64 ; free_bits += 64;
+ por mm_temp, mm_put_buffer ; temp |= put_buffer;
+%ifidn %%temp, nbits_base
+ movd mm_save_nbits, nbits_base ; save nbits_base
+%endif
+ movq mm_code_bits, mm_temp ; code_bits (temp register) = temp;
+ movq mm_put_buffer, mm_code ; put_buffer = code;
+ pcmpeqb mm_temp, mm_all_0xff ; b_temp[i] = (b_temp[i] == 0xFF ? 0xFF : 0);
+ movq mm_code, mm_code_bits ; code = code_bits;
+ psrlq mm_code_bits, 32 ; code_bits >>= 32;
+ pmovmskb nbits, mm_temp ; nbits = 0; nbits |= ((b_temp[i] >> 7) << i);
+ movd %%temp, mm_code_bits ; temp = code_bits;
+ bswap %%temp ; temp = htonl(temp);
+ test nbits, nbits ; if (nbits != 0) /* Some 0xFF bytes */
+ jnz %%.SLOW ; goto %%.SLOW
+ mov dword [buffer], %%temp ; *(uint32_t)buffer = temp;
+%ifidn %%temp, nbits_base
+ movd nbits_base, mm_save_nbits ; restore nbits_base
+%endif
+ %4
+ movd nbits, mm_code ; nbits = (uint32_t)(code);
+ %5
+ bswap nbits ; nbits = htonl(nbits);
+ mov dword [buffer + 4], nbits ; *(uint32_t)(buffer + 4) = nbits;
+ lea buffer, [buffer + 8] ; buffer += 8;
+ %6
+ %7
+ %8
+ jmp %9 ; return
+%%.SLOW:
+ ; Execute the equivalent of the EMIT_BYTE() macro in jchuff.c for all 8
+ ; bytes in the qword.
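+ ; (cmp leaves CF set when the byte is < 0xFF, so "sbb buffer, -2"
+ ; computes buffer + 2 - CF: the pointer advances by 1 past a normal
+ ; byte and by 2 past a stuffed 0xFF 0x00 pair.)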
+ mov byte [buffer], %%tempb ; buffer[0] = temp[0];
+ cmp %%tempb, 0xFF ; Set CF if temp[0] < 0xFF
+ mov byte [buffer+1], 0 ; buffer[1] = 0;
+ sbb buffer, -2 ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
+ mov byte [buffer], %%temph ; buffer[0] = temp[1];
+ cmp %%temph, 0xFF ; Set CF if temp[1] < 0xFF
+ mov byte [buffer+1], 0 ; buffer[1] = 0;
+ sbb buffer, -2 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
+ shr %%temp, 16 ; temp >>= 16;
+ mov byte [buffer], %%tempb ; buffer[0] = temp[0];
+ cmp %%tempb, 0xFF ; Set CF if temp[0] < 0xFF
+ mov byte [buffer+1], 0 ; buffer[1] = 0;
+ sbb buffer, -2 ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
+ mov byte [buffer], %%temph ; buffer[0] = temp[1];
+ cmp %%temph, 0xFF ; Set CF if temp[1] < 0xFF
+ mov byte [buffer+1], 0 ; buffer[1] = 0;
+ sbb buffer, -2 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
+ movd nbits, mm_code ; nbits (temp register) = (uint32_t)(code)
+%ifidn %%temp, nbits_base
+ movd nbits_base, mm_save_nbits ; restore nbits_base
+%endif
+ bswap nbits ; nbits = htonl(nbits)
+ mov byte [buffer], nbitsb ; buffer[0] = nbits[0];
+ cmp nbitsb, 0xFF ; Set CF if nbits[0] < 0xFF
+ mov byte [buffer+1], 0 ; buffer[1] = 0;
+ sbb buffer, -2 ; buffer -= (-2 + (nbits[0] < 0xFF ? 1 : 0));
+ mov byte [buffer], nbitsh ; buffer[0] = nbits[1];
+ cmp nbitsh, 0xFF ; Set CF if nbits[1] < 0xFF
+ mov byte [buffer+1], 0 ; buffer[1] = 0;
+ sbb buffer, -2 ; buffer -= (-2 + (nbits[1] < 0xFF ? 1 : 0));
+ shr nbits, 16 ; nbits >>= 16;
+ mov byte [buffer], nbitsb ; buffer[0] = nbits[0];
+ cmp nbitsb, 0xFF ; Set CF if nbits[0] < 0xFF
+ mov byte [buffer+1], 0 ; buffer[1] = 0;
+ sbb buffer, -2 ; buffer -= (-2 + (nbits[0] < 0xFF ? 1 : 0));
+ mov byte [buffer], nbitsh ; buffer[0] = nbits[1];
+ %4
+ cmp nbitsh, 0xFF ; Set CF if nbits[1] < 0xFF
+ mov byte [buffer+1], 0 ; buffer[1] = 0;
+ sbb buffer, -2 ; buffer -= (-2 + (nbits[1] < 0xFF ? 1 : 0));
+ %5
+ %6
+ %7
+ %8
+ jmp %9 ; return;
+%endmacro
+
+%macro PUSH 1
+ push %1
+%assign stack_offset stack_offset + 4
+%endmacro
+
+%macro POP 1
+ pop %1
+%assign stack_offset stack_offset - 4
+%endmacro
+
+; If PIC is defined, load the address of a symbol defined in this file into a
+; register. Equivalent to
+; get_GOT %1
+; lea %1, [GOTOFF(%1, %2)]
+; without using the GOT.
+;
+; Usage:
+; %1 - register into which to load the address of the symbol
+; %2 - symbol whose address should be loaded
+; %3 - optional multi-line macro to execute before the symbol address is loaded
+; %4 - optional multi-line macro to execute after the symbol address is loaded
+;
+; If PIC is not defined, then %3 and %4 are executed in order.
+
+%macro GET_SYM 2-4
+%ifdef PIC
+ call %%.geteip
+%%.ref:
+ %4
+ add %1, %2 - %%.ref
+ jmp short %%.done
+ align 32
+%%.geteip:
+ %3 4 ; must adjust stack pointer because of call
+ mov %1, POINTER [esp]
+ ret
+ align 32
+%%.done:
+%else
+ %3 0
+ %4
+%endif
+%endmacro
+
+;
+; Encode a single block's worth of coefficients.
+;
+; GLOBAL(JOCTET *)
+; jsimd_huff_encode_one_block_sse2(working_state *state, JOCTET *buffer,
+; JCOEFPTR block, int last_dc_val,
+; c_derived_tbl *dctbl, c_derived_tbl *actbl)
+;
+; Stack layout:
+; Function args
+; Return address
+; Saved ebx
+; Saved ebp
+; Saved esi
+; Saved edi <-- esp_save
+; ...
+; esp_save
+; t_ 64*2 bytes (aligned to 128 bytes)
+;
+; esp is used (as t) to point into t_ (data in lower indices is not used once
+; esp passes over them, so this is signal-safe.) Aligning to 128 bytes allows
+; us to find the rest of the data again.
+;
+; NOTES:
+; When shuffling data, we try to avoid pinsrw as much as possible, since it is
+; slow on many CPUs. Its reciprocal throughput (issue latency) is 1 even on
+; modern CPUs, so chains of pinsrw instructions (even with different outputs)
+; can limit performance. pinsrw is a VectorPath instruction on AMD K8 and
+; requires 2 µops (with memory operand) on Intel. In either case, only one
+; pinsrw instruction can be decoded per cycle (and nothing else if they are
+; back-to-back), so out-of-order execution cannot be used to work around long
+; pinsrw chains (though for Sandy Bridge and later, this may be less of a
+; problem if the code runs from the µop cache.)
+;
+; We use tzcnt instead of bsf without checking for support. The instruction is
+; executed as bsf on CPUs that don't support tzcnt (encoding is equivalent to
+; rep bsf.) The destination (first) operand of bsf (and tzcnt on some CPUs) is
+; an input dependency (although the behavior is not formally defined, Intel
+; CPUs usually leave the destination unmodified if the source is zero.) This
+; can prevent out-of-order execution, so we clear the destination before
+; invoking tzcnt.
+;
+; Initial register allocation
+; eax - frame --> buffer
+; ebx - nbits_base (PIC) / emit_temp
+; ecx - dctbl --> size --> state
+; edx - block --> nbits
+; esi - code_temp --> state --> actbl
+; edi - index_temp --> free_bits
+; esp - t
+; ebp - index
+
+%define frame eax
+%ifdef PIC
+%define nbits_base ebx
+%endif
+%define emit_temp ebx
+%define emit_tempb bl
+%define emit_temph bh
+%define dctbl ecx
+%define block edx
+%define code_temp esi
+%define index_temp edi
+%define t esp
+%define index ebp
+
+%assign save_frame DCTSIZE2 * SIZEOF_WORD
+
+; Step 1: Re-arrange input data according to jpeg_natural_order
+; xx 01 02 03 04 05 06 07 xx 01 08 16 09 02 03 10
+; 08 09 10 11 12 13 14 15 17 24 32 25 18 11 04 05
+; 16 17 18 19 20 21 22 23 12 19 26 33 40 48 41 34
+; 24 25 26 27 28 29 30 31 ==> 27 20 13 06 07 14 21 28
+; 32 33 34 35 36 37 38 39 35 42 49 56 57 50 43 36
+; 40 41 42 43 44 45 46 47 29 22 15 23 30 37 44 51
+; 48 49 50 51 52 53 54 55 58 59 52 45 38 31 39 46
+; 56 57 58 59 60 61 62 63 53 60 61 54 47 55 62 63
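+;
+; In scalar terms, this step computes per coefficient (illustrative
+; sketch; cf. encode_one_block() in jchuff.c):
+;
+;   int x = block[jpeg_natural_order[k]];
+;   t[k] = x - (x < 0);    /* for x < 0 this is ~(-x): the bits to emit */
+;
+; and collects a bitmap of the zero coefficients (pcmpeqw / packsswb /
+; pmovmskb) that drives the tzcnt-based skip loops below.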
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_huff_encode_one_block_sse2)
+
+EXTN(jsimd_huff_encode_one_block_sse2):
+
+%assign stack_offset 0
+%define arg_state 4 + stack_offset
+%define arg_buffer 8 + stack_offset
+%define arg_block 12 + stack_offset
+%define arg_last_dc_val 16 + stack_offset
+%define arg_dctbl 20 + stack_offset
+%define arg_actbl 24 + stack_offset
+
+ ;X: X = code stream
+ mov block, [esp + arg_block]
+ PUSH ebx
+ PUSH ebp
+ movups xmm3, XMMWORD [block + 0 * SIZEOF_WORD] ;D: w3 = xx 01 02 03 04 05 06 07
+ PUSH esi
+ PUSH edi
+ movdqa xmm0, xmm3 ;A: w0 = xx 01 02 03 04 05 06 07
+ mov frame, esp
+ lea t, [frame - (save_frame + 4)]
+ movups xmm1, XMMWORD [block + 8 * SIZEOF_WORD] ;B: w1 = 08 09 10 11 12 13 14 15
+ and t, -DCTSIZE2 * SIZEOF_WORD ; t = &t_[0]
+ mov [t + save_frame], frame
+ pxor xmm4, xmm4 ;A: w4[i] = 0;
+ punpckldq xmm0, xmm1 ;A: w0 = xx 01 08 09 02 03 10 11
+ pshuflw xmm0, xmm0, 11001001b ;A: w0 = 01 08 xx 09 02 03 10 11
+ pinsrw xmm0, word [block + 16 * SIZEOF_WORD], 2 ;A: w0 = 01 08 16 09 02 03 10 11
+ punpckhdq xmm3, xmm1 ;D: w3 = 04 05 12 13 06 07 14 15
+ punpcklqdq xmm1, xmm3 ;B: w1 = 08 09 10 11 04 05 12 13
+ pinsrw xmm0, word [block + 17 * SIZEOF_WORD], 7 ;A: w0 = 01 08 16 09 02 03 10 17
+ ;A: (Row 0, offset 1)
+ pcmpgtw xmm4, xmm0 ;A: w4[i] = (w0[i] < 0 ? -1 : 0);
+ paddw xmm0, xmm4 ;A: w0[i] += w4[i];
+ movaps XMMWORD [t + 0 * SIZEOF_WORD], xmm0 ;A: t[i] = w0[i];
+
+ movq xmm2, qword [block + 24 * SIZEOF_WORD] ;B: w2 = 24 25 26 27 -- -- -- --
+ pshuflw xmm2, xmm2, 11011000b ;B: w2 = 24 26 25 27 -- -- -- --
+ pslldq xmm1, 1 * SIZEOF_WORD ;B: w1 = -- 08 09 10 11 04 05 12
+ movups xmm5, XMMWORD [block + 48 * SIZEOF_WORD] ;H: w5 = 48 49 50 51 52 53 54 55
+ movsd xmm1, xmm2 ;B: w1 = 24 26 25 27 11 04 05 12
+ punpcklqdq xmm2, xmm5 ;C: w2 = 24 26 25 27 48 49 50 51
+ pinsrw xmm1, word [block + 32 * SIZEOF_WORD], 1 ;B: w1 = 24 32 25 27 11 04 05 12
+ pxor xmm4, xmm4 ;A: w4[i] = 0;
+ psrldq xmm3, 2 * SIZEOF_WORD ;D: w3 = 12 13 06 07 14 15 -- --
+ pcmpeqw xmm0, xmm4 ;A: w0[i] = (w0[i] == 0 ? -1 : 0);
+ pinsrw xmm1, word [block + 18 * SIZEOF_WORD], 3 ;B: w1 = 24 32 25 18 11 04 05 12
+ ; (Row 1, offset 1)
+ pcmpgtw xmm4, xmm1 ;B: w4[i] = (w1[i] < 0 ? -1 : 0);
+ paddw xmm1, xmm4 ;B: w1[i] += w4[i];
+ movaps XMMWORD [t + 8 * SIZEOF_WORD], xmm1 ;B: t[i+8] = w1[i];
+ pxor xmm4, xmm4 ;B: w4[i] = 0;
+ pcmpeqw xmm1, xmm4 ;B: w1[i] = (w1[i] == 0 ? -1 : 0);
+
+ packsswb xmm0, xmm1 ;AB: b0[i] = w0[i], b0[i+8] = w1[i]
+ ; w/ signed saturation
+
+ pinsrw xmm3, word [block + 20 * SIZEOF_WORD], 0 ;D: w3 = 20 13 06 07 14 15 -- --
+ pinsrw xmm3, word [block + 21 * SIZEOF_WORD], 5 ;D: w3 = 20 13 06 07 14 21 -- --
+ pinsrw xmm3, word [block + 28 * SIZEOF_WORD], 6 ;D: w3 = 20 13 06 07 14 21 28 --
+ pinsrw xmm3, word [block + 35 * SIZEOF_WORD], 7 ;D: w3 = 20 13 06 07 14 21 28 35
+ ; (Row 3, offset 1)
+ pcmpgtw xmm4, xmm3 ;D: w4[i] = (w3[i] < 0 ? -1 : 0);
+ paddw xmm3, xmm4 ;D: w3[i] += w4[i];
+ movaps XMMWORD [t + 24 * SIZEOF_WORD], xmm3 ;D: t[i+24] = w3[i];
+ pxor xmm4, xmm4 ;D: w4[i] = 0;
+ pcmpeqw xmm3, xmm4 ;D: w3[i] = (w3[i] == 0 ? -1 : 0);
+
+ pinsrw xmm2, word [block + 19 * SIZEOF_WORD], 0 ;C: w2 = 19 26 25 27 48 49 50 51
+ pinsrw xmm2, word [block + 33 * SIZEOF_WORD], 2 ;C: w2 = 19 26 33 27 48 49 50 51
+ pinsrw xmm2, word [block + 40 * SIZEOF_WORD], 3 ;C: w2 = 19 26 33 40 48 49 50 51
+ pinsrw xmm2, word [block + 41 * SIZEOF_WORD], 5 ;C: w2 = 19 26 33 40 48 41 50 51
+ pinsrw xmm2, word [block + 34 * SIZEOF_WORD], 6 ;C: w2 = 19 26 33 40 48 41 34 51
+ pinsrw xmm2, word [block + 27 * SIZEOF_WORD], 7 ;C: w2 = 19 26 33 40 48 41 34 27
+ ; (Row 2, offset 1)
+ pcmpgtw xmm4, xmm2 ;C: w4[i] = (w2[i] < 0 ? -1 : 0);
+ paddw xmm2, xmm4 ;C: w2[i] += w4[i];
+ movsx code_temp, word [block] ;Z: code_temp = block[0];
+
+; %1 - stack pointer adjustment
+%macro GET_SYM_BEFORE 1
+ movaps XMMWORD [t + 16 * SIZEOF_WORD + %1], xmm2
+ ;C: t[i+16] = w2[i];
+ pxor xmm4, xmm4 ;C: w4[i] = 0;
+ pcmpeqw xmm2, xmm4 ;C: w2[i] = (w2[i] == 0 ? -1 : 0);
+ sub code_temp, [frame + arg_last_dc_val] ;Z: code_temp -= last_dc_val;
+
+ packsswb xmm2, xmm3 ;CD: b2[i] = w2[i], b2[i+8] = w3[i]
+ ; w/ signed saturation
+
+ movdqa xmm3, xmm5 ;H: w3 = 48 49 50 51 52 53 54 55
+ pmovmskb index_temp, xmm2 ;Z: index_temp = 0; index_temp |= ((b2[i] >> 7) << i);
+ pmovmskb index, xmm0 ;Z: index = 0; index |= ((b0[i] >> 7) << i);
+ movups xmm0, XMMWORD [block + 56 * SIZEOF_WORD] ;H: w0 = 56 57 58 59 60 61 62 63
+ punpckhdq xmm3, xmm0 ;H: w3 = 52 53 60 61 54 55 62 63
+ shl index_temp, 16 ;Z: index_temp <<= 16;
+ psrldq xmm3, 1 * SIZEOF_WORD ;H: w3 = 53 60 61 54 55 62 63 --
+ pxor xmm2, xmm2 ;H: w2[i] = 0;
+ pshuflw xmm3, xmm3, 00111001b ;H: w3 = 60 61 54 53 55 62 63 --
+ or index, index_temp ;Z: index |= index_temp;
+%undef index_temp
+%define free_bits edi
+%endmacro
+
+%macro GET_SYM_AFTER 0
+ movq xmm1, qword [block + 44 * SIZEOF_WORD] ;G: w1 = 44 45 46 47 -- -- -- --
+ unpcklps xmm5, xmm0 ;E: w5 = 48 49 56 57 50 51 58 59
+ pxor xmm0, xmm0 ;H: w0[i] = 0;
+ not index ;Z: index = ~index;
+ pinsrw xmm3, word [block + 47 * SIZEOF_WORD], 3 ;H: w3 = 60 61 54 47 55 62 63 --
+ ; (Row 7, offset 1)
+ pcmpgtw xmm2, xmm3 ;H: w2[i] = (w3[i] < 0 ? -1 : 0);
+ mov dctbl, [frame + arg_dctbl]
+ paddw xmm3, xmm2 ;H: w3[i] += w2[i];
+ movaps XMMWORD [t + 56 * SIZEOF_WORD], xmm3 ;H: t[i+56] = w3[i];
+ movq xmm4, qword [block + 36 * SIZEOF_WORD] ;G: w4 = 36 37 38 39 -- -- -- --
+ pcmpeqw xmm3, xmm0 ;H: w3[i] = (w3[i] == 0 ? -1 : 0);
+ punpckldq xmm4, xmm1 ;G: w4 = 36 37 44 45 38 39 46 47
+ movdqa xmm1, xmm4 ;F: w1 = 36 37 44 45 38 39 46 47
+ pcmpeqw mm_all_0xff, mm_all_0xff ;Z: all_0xff[i] = 0xFF;
+%endmacro
+
+ GET_SYM nbits_base, jpeg_nbits_table, GET_SYM_BEFORE, GET_SYM_AFTER
+
+ psrldq xmm4, 1 * SIZEOF_WORD ;G: w4 = 37 44 45 38 39 46 47 --
+ shufpd xmm1, xmm5, 10b ;F: w1 = 36 37 44 45 50 51 58 59
+ pshufhw xmm4, xmm4, 11010011b ;G: w4 = 37 44 45 38 -- 39 46 --
+ pslldq xmm1, 1 * SIZEOF_WORD ;F: w1 = -- 36 37 44 45 50 51 58
+ pinsrw xmm4, word [block + 59 * SIZEOF_WORD], 0 ;G: w4 = 59 44 45 38 -- 39 46 --
+ pshufd xmm1, xmm1, 11011000b ;F: w1 = -- 36 45 50 37 44 51 58
+ cmp code_temp, 1 << 31 ;Z: Set CF if code_temp < 0x80000000,
+ ;Z: i.e. if code_temp is positive
+ pinsrw xmm4, word [block + 52 * SIZEOF_WORD], 1 ;G: w4 = 59 52 45 38 -- 39 46 --
+ movlps xmm1, qword [block + 20 * SIZEOF_WORD] ;F: w1 = 20 21 22 23 37 44 51 58
+ pinsrw xmm4, word [block + 31 * SIZEOF_WORD], 4 ;G: w4 = 59 52 45 38 31 39 46 --
+ pshuflw xmm1, xmm1, 01110010b ;F: w1 = 22 20 23 21 37 44 51 58
+ pinsrw xmm4, word [block + 53 * SIZEOF_WORD], 7 ;G: w4 = 59 52 45 38 31 39 46 53
+ ; (Row 6, offset 1)
+ adc code_temp, -1 ;Z: code_temp += -1 + (code_temp >= 0 ? 1 : 0);
+ pxor xmm2, xmm2 ;G: w2[i] = 0;
+ pcmpgtw xmm0, xmm4 ;G: w0[i] = (w4[i] < 0 ? -1 : 0);
+ pinsrw xmm1, word [block + 15 * SIZEOF_WORD], 1 ;F: w1 = 22 15 23 21 37 44 51 58
+ paddw xmm4, xmm0 ;G: w4[i] += w0[i];
+ movaps XMMWORD [t + 48 * SIZEOF_WORD], xmm4 ;G: t[48+i] = w4[i];
+ movd mm_temp, code_temp ;Z: temp = code_temp
+ pinsrw xmm1, word [block + 30 * SIZEOF_WORD], 3 ;F: w1 = 22 15 23 30 37 44 51 58
+ ; (Row 5, offset 1)
+ pcmpeqw xmm4, xmm2 ;G: w4[i] = (w4[i] == 0 ? -1 : 0);
+
+ packsswb xmm4, xmm3 ;GH: b4[i] = w4[i], b4[i+8] = w3[i]
+ ; w/ signed saturation
+
+ lea t, [t - SIZEOF_WORD] ;Z: t = &t[-1]
+ pxor xmm0, xmm0 ;F: w0[i] = 0;
+ pcmpgtw xmm2, xmm1 ;F: w2[i] = (w1[i] < 0 ? -1 : 0);
+ paddw xmm1, xmm2 ;F: w1[i] += w2[i];
+ movaps XMMWORD [t + (40+1) * SIZEOF_WORD], xmm1 ;F: t[40+i] = w1[i];
+ pcmpeqw xmm1, xmm0 ;F: w1[i] = (w1[i] == 0 ? -1 : 0);
+ pinsrw xmm5, word [block + 42 * SIZEOF_WORD], 0 ;E: w5 = 42 49 56 57 50 51 58 59
+ pinsrw xmm5, word [block + 43 * SIZEOF_WORD], 5 ;E: w5 = 42 49 56 57 50 43 58 59
+ pinsrw xmm5, word [block + 36 * SIZEOF_WORD], 6 ;E: w5 = 42 49 56 57 50 43 36 59
+ pinsrw xmm5, word [block + 29 * SIZEOF_WORD], 7 ;E: w5 = 42 49 56 57 50 43 36 29
+ ; (Row 4, offset 1)
+%undef block
+%define nbits edx
+%define nbitsb dl
+%define nbitsh dh
+ movzx nbits, byte [NBITS(code_temp)] ;Z: nbits = JPEG_NBITS(code_temp);
+%undef code_temp
+%define state esi
+ pxor xmm2, xmm2 ;E: w2[i] = 0;
+ mov state, [frame + arg_state]
+ movd mm_nbits, nbits ;Z: nbits --> MMX register
+ pcmpgtw xmm0, xmm5 ;E: w0[i] = (w5[i] < 0 ? -1 : 0);
+ movd mm_code, dword [dctbl + c_derived_tbl.ehufco + nbits * 4]
+ ;Z: code = dctbl->ehufco[nbits];
+%define size ecx
+%define sizeb cl
+%define sizeh ch
+ paddw xmm5, xmm0 ;E: w5[i] += w0[i];
+ movaps XMMWORD [t + (32+1) * SIZEOF_WORD], xmm5 ;E: t[32+i] = w5[i];
+ movzx size, byte [dctbl + c_derived_tbl.ehufsi + nbits]
+ ;Z: size = dctbl->ehufsi[nbits];
+%undef dctbl
+ pcmpeqw xmm5, xmm2 ;E: w5[i] = (w5[i] == 0 ? -1 : 0);
+
+ packsswb xmm5, xmm1 ;EF: b5[i] = w5[i], b5[i+8] = w1[i]
+ ; w/ signed saturation
+
+ movq mm_put_buffer, [state + working_state.cur.put_buffer.simd]
+ ;Z: put_buffer = state->cur.put_buffer.simd;
+ mov free_bits, [state + working_state.cur.free_bits]
+ ;Z: free_bits = state->cur.free_bits;
+%undef state
+%define actbl esi
+ mov actbl, [frame + arg_actbl]
+%define buffer eax
+ mov buffer, [frame + arg_buffer]
+%undef frame
+ jmp .BEGIN
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ align 16
+; size <= 32, so this is not really a loop
+.BRLOOP1: ; .BRLOOP1:
+ movzx nbits, byte [actbl + c_derived_tbl.ehufsi + 0xf0]
+ ; nbits = actbl->ehufsi[0xf0];
+ movd mm_code, dword [actbl + c_derived_tbl.ehufco + 0xf0 * 4]
+ ; code = actbl->ehufco[0xf0];
+ and index, 0x7ffffff ; clear index if size == 32
+ sub size, 16 ; size -= 16;
+ sub free_bits, nbits ; if ((free_bits -= nbits) <= 0)
+ jle .EMIT_BRLOOP1 ; goto .EMIT_BRLOOP1;
+ movd mm_nbits, nbits ; nbits --> MMX register
+ psllq mm_put_buffer, mm_nbits ; put_buffer <<= nbits;
+ por mm_put_buffer, mm_code ; put_buffer |= code;
+ jmp .ERLOOP1 ; goto .ERLOOP1;
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ align 16
+%ifdef PIC
+ times 6 nop
+%else
+ times 2 nop
+%endif
+.BLOOP1: ; do { /* size = # of zero bits/elements to skip */
+; if size == 32, index remains unchanged. This is corrected in .BRLOOP1.
+ shr index, sizeb ; index >>= size;
+ lea t, [t + size * SIZEOF_WORD] ; t += size;
+ cmp size, 16 ; if (size > 16)
+ jg .BRLOOP1 ; goto .BRLOOP1;
+.ERLOOP1: ; .ERLOOP1:
+ movsx nbits, word [t] ; nbits = *t;
+%ifdef PIC
+ add size, size ; size += size;
+%else
+ lea size, [size * 2] ; size += size;
+%endif
+ movd mm_temp, nbits ; temp = nbits;
+ movzx nbits, byte [NBITS(nbits)] ; nbits = JPEG_NBITS(nbits);
+ lea size, [size * 8 + nbits] ; size = size * 8 + nbits;
+ movd mm_nbits, nbits ; nbits --> MMX register
+ movd mm_code, dword [actbl + c_derived_tbl.ehufco + (size - 16) * 4]
+ ; code = actbl->ehufco[size-16];
+ movzx size, byte [actbl + c_derived_tbl.ehufsi + (size - 16)]
+ ; size = actbl->ehufsi[size-16];
+.BEGIN: ; .BEGIN:
+ pand mm_temp, [MASK_BITS(nbits)] ; temp &= (1 << nbits) - 1;
+ psllq mm_code, mm_nbits ; code <<= nbits;
+ add nbits, size ; nbits += size;
+ por mm_code, mm_temp ; code |= temp;
+ sub free_bits, nbits ; if ((free_bits -= nbits) <= 0)
+ jle .EMIT_ERLOOP1 ; insert code, flush buffer, init size, goto .BLOOP1
+ xor size, size ; size = 0; /* kill tzcnt input dependency */
+ tzcnt size, index ; size = # of trailing 0 bits in index
+ movd mm_nbits, nbits ; nbits --> MMX register
+ psllq mm_put_buffer, mm_nbits ; put_buffer <<= nbits;
+ inc size ; ++size;
+ por mm_put_buffer, mm_code ; put_buffer |= code;
+ test index, index
+ jnz .BLOOP1 ; } while (index != 0);
+; Round 2
+; t points to the last used word, possibly below t_ if the previous index had 32 zero bits.
+.ELOOP1: ; .ELOOP1:
+ pmovmskb size, xmm4 ; size = 0; size |= ((b4[i] >> 7) << i);
+ pmovmskb index, xmm5 ; index = 0; index |= ((b5[i] >> 7) << i);
+ shl size, 16 ; size <<= 16;
+ or index, size ; index |= size;
+ not index ; index = ~index;
+ lea nbits, [t + (1 + DCTSIZE2) * SIZEOF_WORD]
+ ; nbits = t + 1 + 64;
+ and nbits, -DCTSIZE2 * SIZEOF_WORD ; nbits &= -128; /* now points to &t_[64] */
+ sub nbits, t ; nbits -= t;
+ shr nbits, 1 ; nbits >>= 1; /* # of leading 0 bits in old index + 33 */
+ tzcnt size, index ; size = # of trailing 0 bits in index
+ inc size ; ++size;
+ test index, index ; if (index == 0)
+ jz .ELOOP2 ; goto .ELOOP2;
+; NOTE: size == 32 cannot happen, since the last element is always 0.
+ shr index, sizeb ; index >>= size;
+ lea size, [size + nbits - 33] ; size = size + nbits - 33;
+ lea t, [t + size * SIZEOF_WORD] ; t += size;
+ cmp size, 16 ; if (size <= 16)
+ jle .ERLOOP2 ; goto .ERLOOP2;
+.BRLOOP2: ; do {
+ movzx nbits, byte [actbl + c_derived_tbl.ehufsi + 0xf0]
+ ; nbits = actbl->ehufsi[0xf0];
+ sub size, 16 ; size -= 16;
+ movd mm_code, dword [actbl + c_derived_tbl.ehufco + 0xf0 * 4]
+ ; code = actbl->ehufco[0xf0];
+ sub free_bits, nbits ; if ((free_bits -= nbits) <= 0)
+ jle .EMIT_BRLOOP2 ; insert code and flush put_buffer
+ movd mm_nbits, nbits ; else { nbits --> MMX register
+ psllq mm_put_buffer, mm_nbits ; put_buffer <<= nbits;
+ por mm_put_buffer, mm_code ; put_buffer |= code;
+ cmp size, 16 ; if (size <= 16)
+ jle .ERLOOP2 ; goto .ERLOOP2;
+ jmp .BRLOOP2 ; } while (1);
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ align 16
+.BLOOP2: ; do { /* size = # of zero bits/elements to skip */
+ shr index, sizeb ; index >>= size;
+ lea t, [t + size * SIZEOF_WORD] ; t += size;
+ cmp size, 16 ; if (size > 16)
+ jg .BRLOOP2 ; goto .BRLOOP2;
+.ERLOOP2: ; .ERLOOP2:
+ movsx nbits, word [t] ; nbits = *t;
+ add size, size ; size += size;
+ movd mm_temp, nbits ; temp = nbits;
+ movzx nbits, byte [NBITS(nbits)] ; nbits = JPEG_NBITS(nbits);
+ movd mm_nbits, nbits ; nbits --> MMX register
+ lea size, [size * 8 + nbits] ; size = size * 8 + nbits;
+ movd mm_code, dword [actbl + c_derived_tbl.ehufco + (size - 16) * 4]
+ ; code = actbl->ehufco[size-16];
+ movzx size, byte [actbl + c_derived_tbl.ehufsi + (size - 16)]
+ ; size = actbl->ehufsi[size-16];
+ psllq mm_code, mm_nbits ; code <<= nbits;
+ pand mm_temp, [MASK_BITS(nbits)] ; temp &= (1 << nbits) - 1;
+ lea nbits, [nbits + size] ; nbits += size;
+ por mm_code, mm_temp ; code |= temp;
+ xor size, size ; size = 0; /* kill tzcnt input dependency */
+ sub free_bits, nbits ; if ((free_bits -= nbits) <= 0)
+ jle .EMIT_ERLOOP2 ; insert code, flush buffer, init size, goto .BLOOP2
+ tzcnt size, index ; size = # of trailing 0 bits in index
+ movd mm_nbits, nbits ; nbits --> MMX register
+ psllq mm_put_buffer, mm_nbits ; put_buffer <<= nbits;
+ inc size ; ++size;
+ por mm_put_buffer, mm_code ; put_buffer |= code;
+ test index, index
+ jnz .BLOOP2 ; } while (index != 0);
+.ELOOP2: ; .ELOOP2:
+ mov nbits, t ; nbits = t;
+ lea t, [t + SIZEOF_WORD] ; t = &t[1];
+ and nbits, DCTSIZE2 * SIZEOF_WORD - 1 ; nbits &= 127;
+ and t, -DCTSIZE2 * SIZEOF_WORD ; t &= -128; /* t = &t_[0]; */
+ cmp nbits, (DCTSIZE2 - 2) * SIZEOF_WORD ; if (nbits != 62 * 2)
+ je .EFN ; {
+ movd mm_code, dword [actbl + c_derived_tbl.ehufco + 0]
+ ; code = actbl->ehufco[0];
+ movzx nbits, byte [actbl + c_derived_tbl.ehufsi + 0]
+ ; nbits = actbl->ehufsi[0];
+ sub free_bits, nbits ; if ((free_bits -= nbits) <= 0)
+ jg .EFN_SKIP_EMIT_CODE ; {
+ EMIT_QWORD size, sizeb, sizeh, , , , , , .EFN ; insert code, flush put_buffer
+ align 16
+.EFN_SKIP_EMIT_CODE: ; } else {
+ movd mm_nbits, nbits ; nbits --> MMX register
+ psllq mm_put_buffer, mm_nbits ; put_buffer <<= nbits;
+ por mm_put_buffer, mm_code ; put_buffer |= code;
+.EFN: ; } }
+%define frame esp
+ mov frame, [t + save_frame]
+%define state ecx
+ mov state, [frame + arg_state]
+ movq [state + working_state.cur.put_buffer.simd], mm_put_buffer
+ ; state->cur.put_buffer.simd = put_buffer;
+ emms
+ mov [state + working_state.cur.free_bits], free_bits
+ ; state->cur.free_bits = free_bits;
+ POP edi
+ POP esi
+ POP ebp
+ POP ebx
+ ret
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ align 16
+.EMIT_BRLOOP1:
+ EMIT_QWORD emit_temp, emit_tempb, emit_temph, , , , , , \
+ .ERLOOP1
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ align 16
+.EMIT_ERLOOP1:
+ EMIT_QWORD size, sizeb, sizeh, \
+ { xor size, size }, \
+ { tzcnt size, index }, \
+ { inc size }, \
+ { test index, index }, \
+ { jnz .BLOOP1 }, \
+ .ELOOP1
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ align 16
+.EMIT_BRLOOP2:
+ EMIT_QWORD emit_temp, emit_tempb, emit_temph, , , , \
+ { cmp size, 16 }, \
+ { jle .ERLOOP2 }, \
+ .BRLOOP2
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ align 16
+.EMIT_ERLOOP2:
+ EMIT_QWORD size, sizeb, sizeh, \
+ { xor size, size }, \
+ { tzcnt size, index }, \
+ { inc size }, \
+ { test index, index }, \
+ { jnz .BLOOP2 }, \
+ .ELOOP2
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jcphuff-sse2.asm b/media/libjpeg/simd/i386/jcphuff-sse2.asm
new file mode 100644
index 0000000000..c26b48a47d
--- /dev/null
+++ b/media/libjpeg/simd/i386/jcphuff-sse2.asm
@@ -0,0 +1,662 @@
+;
+; jcphuff-sse2.asm - prepare data for progressive Huffman encoding (SSE2)
+;
+; Copyright (C) 2016, 2018, Matthieu Darbois
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; and can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains an SSE2 implementation of data preparation for progressive
+; Huffman encoding. See jcphuff.c for more details.
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+
+; --------------------------------------------------------------------------
+; Macros to load data for jsimd_encode_mcu_AC_first_prepare_sse2() and
+; jsimd_encode_mcu_AC_refine_prepare_sse2()
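+;
+; Each LOADn macro is a gather; in C terms, roughly:
+;
+;   for (k = 0; k < n; k++)
+;     x[k] = block[jpeg_natural_order_start[k]];
+;
+; LOAD15 and LOAD7 guard the tail lanes with LENEND so that only the
+; remaining coefficients are read.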
+
+%macro LOAD16 0
+ pxor N0, N0
+ pxor N1, N1
+
+ mov T0, INT [LUT + 0*SIZEOF_INT]
+ mov T1, INT [LUT + 8*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 0
+ pinsrw X1, word [BLOCK + T1 * 2], 0
+
+ mov T0, INT [LUT + 1*SIZEOF_INT]
+ mov T1, INT [LUT + 9*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 1
+ pinsrw X1, word [BLOCK + T1 * 2], 1
+
+ mov T0, INT [LUT + 2*SIZEOF_INT]
+ mov T1, INT [LUT + 10*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 2
+ pinsrw X1, word [BLOCK + T1 * 2], 2
+
+ mov T0, INT [LUT + 3*SIZEOF_INT]
+ mov T1, INT [LUT + 11*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 3
+ pinsrw X1, word [BLOCK + T1 * 2], 3
+
+ mov T0, INT [LUT + 4*SIZEOF_INT]
+ mov T1, INT [LUT + 12*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 4
+ pinsrw X1, word [BLOCK + T1 * 2], 4
+
+ mov T0, INT [LUT + 5*SIZEOF_INT]
+ mov T1, INT [LUT + 13*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 5
+ pinsrw X1, word [BLOCK + T1 * 2], 5
+
+ mov T0, INT [LUT + 6*SIZEOF_INT]
+ mov T1, INT [LUT + 14*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 6
+ pinsrw X1, word [BLOCK + T1 * 2], 6
+
+ mov T0, INT [LUT + 7*SIZEOF_INT]
+ mov T1, INT [LUT + 15*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 7
+ pinsrw X1, word [BLOCK + T1 * 2], 7
+%endmacro
+
+%macro LOAD15 0
+ pxor N0, N0
+ pxor N1, N1
+ pxor X1, X1
+
+ mov T0, INT [LUT + 0*SIZEOF_INT]
+ mov T1, INT [LUT + 8*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 0
+ pinsrw X1, word [BLOCK + T1 * 2], 0
+
+ mov T0, INT [LUT + 1*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 1
+
+ mov T0, INT [LUT + 2*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 2
+
+ mov T0, INT [LUT + 3*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 3
+
+ mov T0, INT [LUT + 4*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 4
+
+ mov T0, INT [LUT + 5*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 5
+
+ mov T0, INT [LUT + 6*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 6
+
+ mov T0, INT [LUT + 7*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 7
+
+ cmp LENEND, 2
+ jl %%.ELOAD15
+ mov T1, INT [LUT + 9*SIZEOF_INT]
+ pinsrw X1, word [BLOCK + T1 * 2], 1
+
+ cmp LENEND, 3
+ jl %%.ELOAD15
+ mov T1, INT [LUT + 10*SIZEOF_INT]
+ pinsrw X1, word [BLOCK + T1 * 2], 2
+
+ cmp LENEND, 4
+ jl %%.ELOAD15
+ mov T1, INT [LUT + 11*SIZEOF_INT]
+ pinsrw X1, word [BLOCK + T1 * 2], 3
+
+ cmp LENEND, 5
+ jl %%.ELOAD15
+ mov T1, INT [LUT + 12*SIZEOF_INT]
+ pinsrw X1, word [BLOCK + T1 * 2], 4
+
+ cmp LENEND, 6
+ jl %%.ELOAD15
+ mov T1, INT [LUT + 13*SIZEOF_INT]
+ pinsrw X1, word [BLOCK + T1 * 2], 5
+
+ cmp LENEND, 7
+ jl %%.ELOAD15
+ mov T1, INT [LUT + 14*SIZEOF_INT]
+ pinsrw X1, word [BLOCK + T1 * 2], 6
+%%.ELOAD15:
+%endmacro
+
+%macro LOAD8 0
+ pxor N0, N0
+
+ mov T0, INT [LUT + 0*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 0
+
+ mov T0, INT [LUT + 1*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 1
+
+ mov T0, INT [LUT + 2*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 2
+
+ mov T0, INT [LUT + 3*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 3
+
+ mov T0, INT [LUT + 4*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 4
+
+ mov T0, INT [LUT + 5*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 5
+
+ mov T0, INT [LUT + 6*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 6
+
+ mov T0, INT [LUT + 7*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 7
+%endmacro
+
+%macro LOAD7 0
+ pxor N0, N0
+ pxor X0, X0
+
+ mov T1, INT [LUT + 0*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T1 * 2], 0
+
+ cmp LENEND, 2
+ jl %%.ELOAD7
+ mov T1, INT [LUT + 1*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T1 * 2], 1
+
+ cmp LENEND, 3
+ jl %%.ELOAD7
+ mov T1, INT [LUT + 2*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T1 * 2], 2
+
+ cmp LENEND, 4
+ jl %%.ELOAD7
+ mov T1, INT [LUT + 3*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T1 * 2], 3
+
+ cmp LENEND, 5
+ jl %%.ELOAD7
+ mov T1, INT [LUT + 4*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T1 * 2], 4
+
+ cmp LENEND, 6
+ jl %%.ELOAD7
+ mov T1, INT [LUT + 5*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T1 * 2], 5
+
+ cmp LENEND, 7
+ jl %%.ELOAD7
+ mov T1, INT [LUT + 6*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T1 * 2], 6
+%%.ELOAD7:
+%endmacro
+
+%macro REDUCE0 0
+ movdqa xmm0, XMMWORD [VALUES + ( 0*2)]
+ movdqa xmm1, XMMWORD [VALUES + ( 8*2)]
+ movdqa xmm2, XMMWORD [VALUES + (16*2)]
+ movdqa xmm3, XMMWORD [VALUES + (24*2)]
+ movdqa xmm4, XMMWORD [VALUES + (32*2)]
+ movdqa xmm5, XMMWORD [VALUES + (40*2)]
+ movdqa xmm6, XMMWORD [VALUES + (48*2)]
+
+ pcmpeqw xmm0, ZERO
+ pcmpeqw xmm1, ZERO
+ pcmpeqw xmm2, ZERO
+ pcmpeqw xmm3, ZERO
+ pcmpeqw xmm4, ZERO
+ pcmpeqw xmm5, ZERO
+ pcmpeqw xmm6, ZERO
+ pcmpeqw xmm7, XMMWORD [VALUES + (56*2)]
+
+ packsswb xmm0, xmm1
+ packsswb xmm2, xmm3
+ packsswb xmm4, xmm5
+ packsswb xmm6, xmm7
+
+ pmovmskb eax, xmm0
+ pmovmskb ecx, xmm2
+ pmovmskb edx, xmm4
+ pmovmskb esi, xmm6
+
+ shl ecx, 16
+ shl esi, 16
+
+ or eax, ecx
+ or edx, esi
+
+ not eax
+ not edx
+
+ mov edi, ZEROBITS
+
+ mov INT [edi], eax
+ mov INT [edi+SIZEOF_INT], edx
+%endmacro
+
+;
+; Prepare data for jsimd_encode_mcu_AC_first().
+;
+; GLOBAL(void)
+; jsimd_encode_mcu_AC_first_prepare_sse2(const JCOEF *block,
+; const int *jpeg_natural_order_start,
+; int Sl, int Al, JCOEF *values,
+; size_t *zerobits)
+;
+; eax + 8 = const JCOEF *block
+; eax + 12 = const int *jpeg_natural_order_start
+; eax + 16 = int Sl
+; eax + 20 = int Al
+; eax + 24 = JCOEF *values
+; eax + 28 = size_t *zerobits
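+;
+; Per-coefficient sketch of the preparation (illustrative; cf.
+; encode_mcu_AC_first() in jcphuff.c):
+;
+;   int v = block[jpeg_natural_order_start[k]];
+;   int a = (v < 0 ? -v : v) >> Al;    /* successive-approximation shift */
+;   values[k]            = a;
+;   values[k + DCTSIZE2] = (v < 0) ? ~a : a;
+;
+; zerobits receives a bitmap with bit k set iff values[k] != 0 (REDUCE0).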
+
+%define ZERO xmm7
+%define X0 xmm0
+%define X1 xmm1
+%define N0 xmm2
+%define N1 xmm3
+%define AL xmm4
+%define K eax
+%define LENEND eax
+%define LUT ebx
+%define T0 ecx
+%define T1 edx
+%define BLOCK esi
+%define VALUES edi
+%define LEN ebp
+
+%define ZEROBITS INT [esp + 5 * 4]
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_encode_mcu_AC_first_prepare_sse2)
+
+EXTN(jsimd_encode_mcu_AC_first_prepare_sse2):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ sub esp, 4
+ push ebx
+ push ecx
+; push edx ; need not be preserved
+ push esi
+ push edi
+ push ebp
+
+ mov BLOCK, INT [eax + 8]
+ mov LUT, INT [eax + 12]
+ mov VALUES, INT [eax + 24]
+ movd AL, INT [eax + 20]
+ mov T0, INT [eax + 28]
+ mov ZEROBITS, T0
+ mov LEN, INT [eax + 16]
+ pxor ZERO, ZERO
+ mov K, LEN
+ and K, -16
+ shr K, 4
+ jz .ELOOP16
+.BLOOP16:
+ LOAD16
+ pcmpgtw N0, X0
+ pcmpgtw N1, X1
+ paddw X0, N0
+ paddw X1, N1
+ pxor X0, N0
+ pxor X1, N1
+ psrlw X0, AL
+ psrlw X1, AL
+ pxor N0, X0
+ pxor N1, X1
+ movdqa XMMWORD [VALUES + (0) * 2], X0
+ movdqa XMMWORD [VALUES + (8) * 2], X1
+ movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
+ movdqa XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
+ add VALUES, 16*2
+ add LUT, 16*SIZEOF_INT
+ dec K
+ jnz .BLOOP16
+ test LEN, 15
+ je .PADDING
+.ELOOP16:
+ mov LENEND, LEN
+ and LENEND, 7
+
+ test LEN, 8
+ jz .TRY7
+ test LEN, 7
+ jz .TRY8
+
+ LOAD15
+ pcmpgtw N0, X0
+ pcmpgtw N1, X1
+ paddw X0, N0
+ paddw X1, N1
+ pxor X0, N0
+ pxor X1, N1
+ psrlw X0, AL
+ psrlw X1, AL
+ pxor N0, X0
+ pxor N1, X1
+ movdqa XMMWORD [VALUES + (0) * 2], X0
+ movdqa XMMWORD [VALUES + (8) * 2], X1
+ movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
+ movdqa XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
+ add VALUES, 16*2
+ jmp .PADDING
+.TRY8:
+ LOAD8
+ pcmpgtw N0, X0
+ paddw X0, N0
+ pxor X0, N0
+ psrlw X0, AL
+ pxor N0, X0
+ movdqa XMMWORD [VALUES + (0) * 2], X0
+ movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
+ add VALUES, 8*2
+ jmp .PADDING
+.TRY7:
+ LOAD7
+ pcmpgtw N0, X0
+ paddw X0, N0
+ pxor X0, N0
+ psrlw X0, AL
+ pxor N0, X0
+ movdqa XMMWORD [VALUES + (0) * 2], X0
+ movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
+ add VALUES, 8*2
+.PADDING:
+ mov K, LEN
+ add K, 7
+ and K, -8
+ shr K, 3
+ sub K, DCTSIZE2/8
+ jz .EPADDING
+ align 16
+.ZEROLOOP:
+ movdqa XMMWORD [VALUES + 0], ZERO
+ add VALUES, 8*2
+ inc K
+ jnz .ZEROLOOP
+.EPADDING:
+ sub VALUES, DCTSIZE2*2
+
+ REDUCE0
+
+ pop ebp
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+ pop ecx
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+%undef ZERO
+%undef X0
+%undef X1
+%undef N0
+%undef N1
+%undef AL
+%undef K
+%undef LUT
+%undef T0
+%undef T1
+%undef BLOCK
+%undef VALUES
+%undef LEN
+
+;
+; Prepare data for jsimd_encode_mcu_AC_refine().
+;
+; GLOBAL(int)
+; jsimd_encode_mcu_AC_refine_prepare_sse2(const JCOEF *block,
+; const int *jpeg_natural_order_start,
+; int Sl, int Al, JCOEF *absvalues,
+; size_t *bits)
+;
+; eax + 8 = const JCOEF *block
+; eax + 12 = const int *jpeg_natural_order_start
+; eax + 16 = int Sl
+; eax + 20 = int Al
+; eax + 24 = JCOEF *values
+; eax + 28 = size_t *bits
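+;
+; Sketch of the outputs (illustrative; cf. encode_mcu_AC_refine() in
+; jcphuff.c):
+;
+;   absvalues[k] = (v < 0 ? -v : v) >> Al;
+;   bits[0..1]   = bitmap with bit k set iff absvalues[k] != 0 (REDUCE0);
+;   bits[2..3]   = per-coefficient sign information;
+;   return value = EOB, roughly the index of the last k with
+;                  absvalues[k] == 1.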
+
+%define ZERO xmm7
+%define ONE xmm5
+%define X0 xmm0
+%define X1 xmm1
+%define N0 xmm2
+%define N1 xmm3
+%define AL xmm4
+%define K eax
+%define LENEND eax
+%define LUT ebx
+%define T0 ecx
+%define T0w cx
+%define T1 edx
+%define BLOCK esi
+%define VALUES edi
+%define KK ebp
+
+%define ZEROBITS INT [esp + 5 * 4]
+%define EOB INT [esp + 5 * 4 + 4]
+%define LEN INT [esp + 5 * 4 + 8]
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_encode_mcu_AC_refine_prepare_sse2)
+
+EXTN(jsimd_encode_mcu_AC_refine_prepare_sse2):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ sub esp, 16
+ push ebx
+ push ecx
+; push edx ; need not be preserved
+ push esi
+ push edi
+ push ebp
+
+ pcmpeqw ONE, ONE
+ psrlw ONE, 15
+ mov BLOCK, INT [eax + 8]
+ mov LUT, INT [eax + 12]
+ mov VALUES, INT [eax + 24]
+ movd AL, INT [eax + 20]
+ mov T0, INT [eax + 28]
+ mov K, INT [eax + 16]
+ mov INT [T0 + 2 * SIZEOF_INT], -1
+ mov INT [T0 + 3 * SIZEOF_INT], -1
+ mov ZEROBITS, T0
+ mov LEN, K
+ pxor ZERO, ZERO
+ and K, -16
+ mov EOB, 0
+ xor KK, KK
+ shr K, 4
+ jz .ELOOPR16
+.BLOOPR16:
+ LOAD16
+ pcmpgtw N0, X0
+ pcmpgtw N1, X1
+ paddw X0, N0
+ paddw X1, N1
+ pxor X0, N0
+ pxor X1, N1
+ psrlw X0, AL
+ psrlw X1, AL
+ movdqa XMMWORD [VALUES + (0) * 2], X0
+ movdqa XMMWORD [VALUES + (8) * 2], X1
+ pcmpeqw X0, ONE
+ pcmpeqw X1, ONE
+ packsswb N0, N1
+ packsswb X0, X1
+ pmovmskb T0, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
+ mov T1, ZEROBITS
+ not T0
+ mov word [T1 + 2 * SIZEOF_INT + KK], T0w
+ pmovmskb T1, X0 ; idx = _mm_movemask_epi8(x1);
+ bsr T1, T1 ; idx = 16 - (__builtin_clz(idx)>>1);
+ jz .CONTINUER16 ; if (idx) {
+ lea T1, [T1+KK*8]
+ mov EOB, T1 ; EOB = k + idx;
+.CONTINUER16:
+ add VALUES, 16*2
+ add LUT, 16*SIZEOF_INT
+ add KK, 2
+ dec K
+ jnz .BLOOPR16
+ test LEN, 15
+ je .PADDINGR
+.ELOOPR16:
+ mov LENEND, LEN
+
+ test LENEND, 8
+ jz .TRYR7
+ test LENEND, 7
+ jz .TRYR8
+
+ and LENEND, 7
+ LOAD15
+ pcmpgtw N0, X0
+ pcmpgtw N1, X1
+ paddw X0, N0
+ paddw X1, N1
+ pxor X0, N0
+ pxor X1, N1
+ psrlw X0, AL
+ psrlw X1, AL
+ movdqa XMMWORD [VALUES + (0) * 2], X0
+ movdqa XMMWORD [VALUES + (8) * 2], X1
+ pcmpeqw X0, ONE
+ pcmpeqw X1, ONE
+ packsswb N0, N1
+ packsswb X0, X1
+ pmovmskb T0, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
+ mov T1, ZEROBITS
+ not T0
+ mov word [T1 + 2 * SIZEOF_INT + KK], T0w
+ pmovmskb T1, X0 ; idx = _mm_movemask_epi8(x1);
+ bsr T1, T1 ; idx = 16 - (__builtin_clz(idx)>>1);
+ jz .CONTINUER15 ; if (idx) {
+ lea T1, [T1+KK*8]
+ mov EOB, T1 ; EOB = k + idx;
+.CONTINUER15:
+ add VALUES, 16*2
+ jmp .PADDINGR
+.TRYR8:
+ LOAD8
+
+ pcmpgtw N0, X0
+ paddw X0, N0
+ pxor X0, N0
+ psrlw X0, AL
+ movdqa XMMWORD [VALUES + (0) * 2], X0
+ pcmpeqw X0, ONE
+ packsswb N0, ZERO
+ packsswb X0, ZERO
+ pmovmskb T0, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
+ mov T1, ZEROBITS
+ not T0
+ mov word [T1 + 2 * SIZEOF_INT + KK], T0w
+ pmovmskb T1, X0 ; idx = _mm_movemask_epi8(x1);
+ bsr T1, T1 ; idx = 16 - (__builtin_clz(idx)>>1);
+ jz .CONTINUER8 ; if (idx) {
+ lea T1, [T1+KK*8]
+ mov EOB, T1 ; EOB = k + idx;
+.CONTINUER8:
+ add VALUES, 8*2
+ jmp .PADDINGR
+.TRYR7:
+ and LENEND, 7
+ LOAD7
+
+ pcmpgtw N0, X0
+ paddw X0, N0
+ pxor X0, N0
+ psrlw X0, AL
+ movdqa XMMWORD [VALUES + (0) * 2], X0
+ pcmpeqw X0, ONE
+ packsswb N0, ZERO
+ packsswb X0, ZERO
+ pmovmskb T0, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
+ mov T1, ZEROBITS
+ not T0
+ mov word [T1 + 2 * SIZEOF_INT + KK], T0w
+ pmovmskb T1, X0 ; idx = _mm_movemask_epi8(x1);
+ bsr T1, T1 ; idx = 16 - (__builtin_clz(idx)>>1);
+ jz .CONTINUER7 ; if (idx) {
+ lea T1, [T1+KK*8]
+ mov EOB, T1 ; EOB = k + idx;
+.CONTINUER7:
+ add VALUES, 8*2
+.PADDINGR:
+ mov K, LEN
+ add K, 7
+ and K, -8
+ shr K, 3
+ sub K, DCTSIZE2/8
+ jz .EPADDINGR
+ align 16
+.ZEROLOOPR:
+ movdqa XMMWORD [VALUES + 0], ZERO
+ add VALUES, 8*2
+ inc K
+ jnz .ZEROLOOPR
+.EPADDINGR:
+ sub VALUES, DCTSIZE2*2
+
+ REDUCE0
+
+ mov eax, EOB
+
+ pop ebp
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+ pop ecx
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+%undef ZERO
+%undef ONE
+%undef X0
+%undef X1
+%undef N0
+%undef N1
+%undef AL
+%undef K
+%undef KK
+%undef EOB
+%undef SIGN
+%undef LUT
+%undef T0
+%undef T1
+%undef BLOCK
+%undef VALUES
+%undef LEN
+%undef LENEND
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jcsample-avx2.asm b/media/libjpeg/simd/i386/jcsample-avx2.asm
new file mode 100644
index 0000000000..0a20802dd8
--- /dev/null
+++ b/media/libjpeg/simd/i386/jcsample-avx2.asm
@@ -0,0 +1,388 @@
+;
+; jcsample.asm - downsampling (AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; and can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Downsample pixel values of a single component.
+; This version handles the common case of 2:1 horizontal and 1:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v1_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
+; JDIMENSION v_samp_factor,
+; JDIMENSION width_in_blocks, JSAMPARRAY input_data,
+; JSAMPARRAY output_data);
+;
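+; In scalar terms the kernel below computes (cf. h2v1_downsample() in
+; jcsample.c; the alternating bias keeps rounding from drifting):
+;
+;   outptr[j] = (inptr[2*j] + inptr[2*j+1] + (j & 1)) >> 1;
+;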
+
+%define img_width(b) (b) + 8 ; JDIMENSION image_width
+%define max_v_samp(b) (b) + 12 ; int max_v_samp_factor
+%define v_samp(b) (b) + 16 ; JDIMENSION v_samp_factor
+%define width_blks(b) (b) + 20 ; JDIMENSION width_in_blocks
+%define input_data(b) (b) + 24 ; JSAMPARRAY input_data
+%define output_data(b) (b) + 28 ; JSAMPARRAY output_data
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v1_downsample_avx2)
+
+EXTN(jsimd_h2v1_downsample_avx2):
+ push ebp
+ mov ebp, esp
+; push ebx ; unused
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov ecx, JDIMENSION [width_blks(ebp)]
+ shl ecx, 3 ; imul ecx,DCTSIZE (ecx = output_cols)
+ jz near .return
+
+ mov edx, JDIMENSION [img_width(ebp)]
+
+ ; -- expand_right_edge
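+ ; (Replicates each row's last pixel into columns image_width ..
+ ; output_cols*2 - 1 so the downsampler can always read full vectors;
+ ; rep stosb stores the byte in al, ecx times, starting at edi.)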
+
+ push ecx
+ shl ecx, 1 ; output_cols * 2
+ sub ecx, edx
+ jle short .expand_end
+
+ mov eax, INT [max_v_samp(ebp)]
+ test eax, eax
+ jle short .expand_end
+
+ cld
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ alignx 16, 7
+.expandloop:
+ push eax
+ push ecx
+
+ mov edi, JSAMPROW [esi]
+ add edi, edx
+ mov al, JSAMPLE [edi-1]
+
+ rep stosb
+
+ pop ecx
+ pop eax
+
+ add esi, byte SIZEOF_JSAMPROW
+ dec eax
+ jg short .expandloop
+
+.expand_end:
+ pop ecx ; output_cols
+
+ ; -- h2v1_downsample
+
+ mov eax, JDIMENSION [v_samp(ebp)] ; rowctr
+ test eax, eax
+ jle near .return
+
+ mov edx, 0x00010000 ; bias pattern
+ vmovd xmm7, edx
+ vpshufd xmm7, xmm7, 0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
+ vperm2i128 ymm7, ymm7, ymm7, 0 ; ymm7={xmm7, xmm7}
+ vpcmpeqw ymm6, ymm6, ymm6
+ vpsrlw ymm6, ymm6, BYTE_BIT ; ymm6={0xFF 0x00 0xFF 0x00 ..}
+
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
+ alignx 16, 7
+.rowloop:
+ push ecx
+ push edi
+ push esi
+
+ mov esi, JSAMPROW [esi] ; inptr
+ mov edi, JSAMPROW [edi] ; outptr
+
+ cmp ecx, byte SIZEOF_YMMWORD
+ jae short .columnloop
+ alignx 16, 7
+
+.columnloop_r24:
+ ; ecx can possibly be 8, 16, 24
+ cmp ecx, 24
+ jne .columnloop_r16
+ vmovdqu ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
+ vmovdqu xmm1, XMMWORD [esi+1*SIZEOF_YMMWORD]
+ mov ecx, SIZEOF_YMMWORD
+ jmp short .downsample
+
+.columnloop_r16:
+ cmp ecx, 16
+ jne .columnloop_r8
+ vmovdqu ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
+ vpxor ymm1, ymm1, ymm1
+ mov ecx, SIZEOF_YMMWORD
+ jmp short .downsample
+
+.columnloop_r8:
+ vmovdqu xmm0, XMMWORD [esi+0*SIZEOF_YMMWORD]
+ vpxor ymm1, ymm1, ymm1
+ mov ecx, SIZEOF_YMMWORD
+ jmp short .downsample
+ alignx 16, 7
+
+.columnloop:
+ vmovdqu ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
+ vmovdqu ymm1, YMMWORD [esi+1*SIZEOF_YMMWORD]
+
+.downsample:
+ vpsrlw ymm2, ymm0, BYTE_BIT
+ vpand ymm0, ymm0, ymm6
+ vpsrlw ymm3, ymm1, BYTE_BIT
+ vpand ymm1, ymm1, ymm6
+
+ vpaddw ymm0, ymm0, ymm2
+ vpaddw ymm1, ymm1, ymm3
+ vpaddw ymm0, ymm0, ymm7
+ vpaddw ymm1, ymm1, ymm7
+ vpsrlw ymm0, ymm0, 1
+ vpsrlw ymm1, ymm1, 1
+
+ vpackuswb ymm0, ymm0, ymm1
+ vpermq ymm0, ymm0, 0xd8
+
+ vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0
+
+ sub ecx, byte SIZEOF_YMMWORD ; outcol
+ add esi, byte 2*SIZEOF_YMMWORD ; inptr
+ add edi, byte 1*SIZEOF_YMMWORD ; outptr
+ cmp ecx, byte SIZEOF_YMMWORD
+ jae short .columnloop
+ test ecx, ecx
+ jnz near .columnloop_r24
+
+ pop esi
+ pop edi
+ pop ecx
+
+ add esi, byte SIZEOF_JSAMPROW ; input_data
+ add edi, byte SIZEOF_JSAMPROW ; output_data
+ dec eax ; rowctr
+ jg near .rowloop
+
+.return:
+ vzeroupper
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+; pop ebx ; unused
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Downsample pixel values of a single component.
+; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v2_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
+; JDIMENSION v_samp_factor,
+; JDIMENSION width_in_blocks, JSAMPARRAY input_data,
+; JSAMPARRAY output_data);
+;
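+; In scalar terms (cf. h2v2_downsample() in jcsample.c; the bias
+; alternates 1, 2 to avoid a rounding drift):
+;
+;   outptr[j] = (inptr0[2*j] + inptr0[2*j+1] +
+;                inptr1[2*j] + inptr1[2*j+1] + 1 + (j & 1)) >> 2;
+;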
+
+%define img_width(b) (b) + 8 ; JDIMENSION image_width
+%define max_v_samp(b) (b) + 12 ; int max_v_samp_factor
+%define v_samp(b) (b) + 16 ; JDIMENSION v_samp_factor
+%define width_blks(b) (b) + 20 ; JDIMENSION width_in_blocks
+%define input_data(b) (b) + 24 ; JSAMPARRAY input_data
+%define output_data(b) (b) + 28 ; JSAMPARRAY output_data
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v2_downsample_avx2)
+
+EXTN(jsimd_h2v2_downsample_avx2):
+ push ebp
+ mov ebp, esp
+; push ebx ; unused
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov ecx, JDIMENSION [width_blks(ebp)]
+ shl ecx, 3 ; imul ecx,DCTSIZE (ecx = output_cols)
+ jz near .return
+
+ mov edx, JDIMENSION [img_width(ebp)]
+
+ ; -- expand_right_edge
+
+ push ecx
+ shl ecx, 1 ; output_cols * 2
+ sub ecx, edx
+ jle short .expand_end
+
+ mov eax, INT [max_v_samp(ebp)]
+ test eax, eax
+ jle short .expand_end
+
+ cld
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ alignx 16, 7
+.expandloop:
+ push eax
+ push ecx
+
+ mov edi, JSAMPROW [esi]
+ add edi, edx
+ mov al, JSAMPLE [edi-1]
+
+ rep stosb
+
+ pop ecx
+ pop eax
+
+ add esi, byte SIZEOF_JSAMPROW
+ dec eax
+ jg short .expandloop
+
+.expand_end:
+ pop ecx ; output_cols
+
+ ; -- h2v2_downsample
+
+ mov eax, JDIMENSION [v_samp(ebp)] ; rowctr
+ test eax, eax
+ jle near .return
+
+ mov edx, 0x00020001 ; bias pattern
+ vmovd xmm7, edx
+ vpcmpeqw ymm6, ymm6, ymm6
+ vpshufd xmm7, xmm7, 0x00 ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
+ vperm2i128 ymm7, ymm7, ymm7, 0 ; ymm7={xmm7, xmm7}
+ vpsrlw ymm6, ymm6, BYTE_BIT ; ymm6={0xFF 0x00 0xFF 0x00 ..}
+
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
+ alignx 16, 7
+.rowloop:
+ push ecx
+ push edi
+ push esi
+
+ mov edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0
+ mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1
+ mov edi, JSAMPROW [edi] ; outptr
+
+ cmp ecx, byte SIZEOF_YMMWORD
+ jae short .columnloop
+ alignx 16, 7
+
+.columnloop_r24:
+ cmp ecx, 24
+ jne .columnloop_r16
+ vmovdqu ymm0, YMMWORD [edx+0*SIZEOF_YMMWORD]
+ vmovdqu ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD]
+ vmovdqu xmm2, XMMWORD [edx+1*SIZEOF_YMMWORD]
+ vmovdqu xmm3, XMMWORD [esi+1*SIZEOF_YMMWORD]
+ mov ecx, SIZEOF_YMMWORD
+ jmp short .downsample
+
+.columnloop_r16:
+ cmp ecx, 16
+ jne .columnloop_r8
+ vmovdqu ymm0, YMMWORD [edx+0*SIZEOF_YMMWORD]
+ vmovdqu ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD]
+ vpxor ymm2, ymm2, ymm2
+ vpxor ymm3, ymm3, ymm3
+ mov ecx, SIZEOF_YMMWORD
+ jmp short .downsample
+
+.columnloop_r8:
+ vmovdqu xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
+ vmovdqu xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ vpxor ymm2, ymm2, ymm2
+ vpxor ymm3, ymm3, ymm3
+ mov ecx, SIZEOF_YMMWORD
+ jmp short .downsample
+ alignx 16, 7
+
+.columnloop:
+ vmovdqu ymm0, YMMWORD [edx+0*SIZEOF_YMMWORD]
+ vmovdqu ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD]
+ vmovdqu ymm2, YMMWORD [edx+1*SIZEOF_YMMWORD]
+ vmovdqu ymm3, YMMWORD [esi+1*SIZEOF_YMMWORD]
+
+.downsample:
+ vpand ymm4, ymm0, ymm6
+ vpsrlw ymm0, ymm0, BYTE_BIT
+ vpand ymm5, ymm1, ymm6
+ vpsrlw ymm1, ymm1, BYTE_BIT
+ vpaddw ymm0, ymm0, ymm4
+ vpaddw ymm1, ymm1, ymm5
+
+ vpand ymm4, ymm2, ymm6
+ vpsrlw ymm2, ymm2, BYTE_BIT
+ vpand ymm5, ymm3, ymm6
+ vpsrlw ymm3, ymm3, BYTE_BIT
+ vpaddw ymm2, ymm2, ymm4
+ vpaddw ymm3, ymm3, ymm5
+
+ vpaddw ymm0, ymm0, ymm1
+ vpaddw ymm2, ymm2, ymm3
+ vpaddw ymm0, ymm0, ymm7
+ vpaddw ymm2, ymm2, ymm7
+ vpsrlw ymm0, ymm0, 2
+ vpsrlw ymm2, ymm2, 2
+
+ vpackuswb ymm0, ymm0, ymm2
+ vpermq ymm0, ymm0, 0xd8
+
+ vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0
+
+ sub ecx, byte SIZEOF_YMMWORD ; outcol
+ add edx, byte 2*SIZEOF_YMMWORD ; inptr0
+ add esi, byte 2*SIZEOF_YMMWORD ; inptr1
+ add edi, byte 1*SIZEOF_YMMWORD ; outptr
+ cmp ecx, byte SIZEOF_YMMWORD
+ jae near .columnloop
+ test ecx, ecx
+ jnz near .columnloop_r24
+
+ pop esi
+ pop edi
+ pop ecx
+
+ add esi, byte 2*SIZEOF_JSAMPROW ; input_data
+ add edi, byte 1*SIZEOF_JSAMPROW ; output_data
+ dec eax ; rowctr
+ jg near .rowloop
+
+.return:
+ vzeroupper
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+; pop ebx ; unused
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jcsample-mmx.asm b/media/libjpeg/simd/i386/jcsample-mmx.asm
new file mode 100644
index 0000000000..2c223eebe8
--- /dev/null
+++ b/media/libjpeg/simd/i386/jcsample-mmx.asm
@@ -0,0 +1,324 @@
+;
+; jcsample.asm - downsampling (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; and can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Downsample pixel values of a single component.
+; This version handles the common case of 2:1 horizontal and 1:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v1_downsample_mmx(JDIMENSION image_width, int max_v_samp_factor,
+; JDIMENSION v_samp_factor,
+; JDIMENSION width_in_blocks, JSAMPARRAY input_data,
+; JSAMPARRAY output_data);
+;
+
+%define img_width(b) (b) + 8 ; JDIMENSION image_width
+%define max_v_samp(b) (b) + 12 ; int max_v_samp_factor
+%define v_samp(b) (b) + 16 ; JDIMENSION v_samp_factor
+%define width_blks(b) (b) + 20 ; JDIMENSION width_in_blocks
+%define input_data(b) (b) + 24 ; JSAMPARRAY input_data
+%define output_data(b) (b) + 28 ; JSAMPARRAY output_data
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v1_downsample_mmx)
+
+EXTN(jsimd_h2v1_downsample_mmx):
+ push ebp
+ mov ebp, esp
+; push ebx ; unused
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov ecx, JDIMENSION [width_blks(ebp)]
+ shl ecx, 3 ; imul ecx,DCTSIZE (ecx = output_cols)
+ jz near .return
+
+ mov edx, JDIMENSION [img_width(ebp)]
+
+ ; -- expand_right_edge
+
+ push ecx
+ shl ecx, 1 ; output_cols * 2
+ sub ecx, edx
+ jle short .expand_end
+
+ mov eax, INT [max_v_samp(ebp)]
+ test eax, eax
+ jle short .expand_end
+
+ cld
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ alignx 16, 7
+.expandloop:
+ push eax
+ push ecx
+
+ mov edi, JSAMPROW [esi]
+ add edi, edx
+ mov al, JSAMPLE [edi-1]
+
+ rep stosb
+
+ pop ecx
+ pop eax
+
+ add esi, byte SIZEOF_JSAMPROW
+ dec eax
+ jg short .expandloop
+
+.expand_end:
+ pop ecx ; output_cols
+
+ ; -- h2v1_downsample
+
+ mov eax, JDIMENSION [v_samp(ebp)] ; rowctr
+ test eax, eax
+ jle near .return
+
+ mov edx, 0x00010000 ; bias pattern
+ movd mm7, edx
+ pcmpeqw mm6, mm6
+ punpckldq mm7, mm7 ; mm7={0, 1, 0, 1}
+ psrlw mm6, BYTE_BIT ; mm6={0xFF 0x00 0xFF 0x00 ..}
+
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
+ alignx 16, 7
+.rowloop:
+ push ecx
+ push edi
+ push esi
+
+ mov esi, JSAMPROW [esi] ; inptr
+ mov edi, JSAMPROW [edi] ; outptr
+ alignx 16, 7
+.columnloop:
+
+ movq mm0, MMWORD [esi+0*SIZEOF_MMWORD]
+ movq mm1, MMWORD [esi+1*SIZEOF_MMWORD]
+ movq mm2, mm0
+ movq mm3, mm1
+
+ pand mm0, mm6
+ psrlw mm2, BYTE_BIT
+ pand mm1, mm6
+ psrlw mm3, BYTE_BIT
+
+ paddw mm0, mm2
+ paddw mm1, mm3
+ paddw mm0, mm7
+ paddw mm1, mm7
+ psrlw mm0, 1
+ psrlw mm1, 1
+
+ packuswb mm0, mm1
+
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mm0
+
+ add esi, byte 2*SIZEOF_MMWORD ; inptr
+ add edi, byte 1*SIZEOF_MMWORD ; outptr
+ sub ecx, byte SIZEOF_MMWORD ; outcol
+ jnz short .columnloop
+
+ pop esi
+ pop edi
+ pop ecx
+
+ add esi, byte SIZEOF_JSAMPROW ; input_data
+ add edi, byte SIZEOF_JSAMPROW ; output_data
+ dec eax ; rowctr
+ jg short .rowloop
+
+ emms ; empty MMX state
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+; pop ebx ; unused
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Downsample pixel values of a single component.
+; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v2_downsample_mmx(JDIMENSION image_width, int max_v_samp_factor,
+; JDIMENSION v_samp_factor,
+; JDIMENSION width_in_blocks, JSAMPARRAY input_data,
+; JSAMPARRAY output_data);
+;
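+; A scalar sketch of the computation (cf. h2v2_downsample in
+; jcsample.c; the bias alternates 1,2 across columns):
+;
+;   for (col = 0; col < output_cols; col++)
+;     outptr[col] = (inptr0[2 * col] + inptr0[2 * col + 1] +
+;                    inptr1[2 * col] + inptr1[2 * col + 1] +
+;                    1 + (col & 1)) >> 2;
+;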
+
+%define img_width(b) (b) + 8 ; JDIMENSION image_width
+%define max_v_samp(b) (b) + 12 ; int max_v_samp_factor
+%define v_samp(b) (b) + 16 ; JDIMENSION v_samp_factor
+%define width_blks(b) (b) + 20 ; JDIMENSION width_in_blocks
+%define input_data(b) (b) + 24 ; JSAMPARRAY input_data
+%define output_data(b) (b) + 28 ; JSAMPARRAY output_data
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v2_downsample_mmx)
+
+EXTN(jsimd_h2v2_downsample_mmx):
+ push ebp
+ mov ebp, esp
+; push ebx ; unused
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov ecx, JDIMENSION [width_blks(ebp)]
+ shl ecx, 3 ; imul ecx,DCTSIZE (ecx = output_cols)
+ jz near .return
+
+ mov edx, JDIMENSION [img_width(ebp)]
+
+ ; -- expand_right_edge
+
+ push ecx
+ shl ecx, 1 ; output_cols * 2
+ sub ecx, edx
+ jle short .expand_end
+
+ mov eax, INT [max_v_samp(ebp)]
+ test eax, eax
+ jle short .expand_end
+
+ cld
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ alignx 16, 7
+.expandloop:
+ push eax
+ push ecx
+
+ mov edi, JSAMPROW [esi]
+ add edi, edx
+ mov al, JSAMPLE [edi-1]
+
+ rep stosb
+
+ pop ecx
+ pop eax
+
+ add esi, byte SIZEOF_JSAMPROW
+ dec eax
+ jg short .expandloop
+
+.expand_end:
+ pop ecx ; output_cols
+
+ ; -- h2v2_downsample
+
+ mov eax, JDIMENSION [v_samp(ebp)] ; rowctr
+ test eax, eax
+ jle near .return
+
+ mov edx, 0x00020001 ; bias pattern
+ movd mm7, edx
+ pcmpeqw mm6, mm6
+ punpckldq mm7, mm7 ; mm7={1, 2, 1, 2}
+ psrlw mm6, BYTE_BIT ; mm6={0xFF 0x00 0xFF 0x00 ..}
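+ ; Same masking scheme as h2v1, but pairs from both input rows are
+ ; summed before the shift: each output sample is a 2x2 average,
+ ; (a + b + c + d + bias) >> 2, with the bias alternating 1,2 so the
+ ; rounding error averages out across columns.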
+
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
+ alignx 16, 7
+.rowloop:
+ push ecx
+ push edi
+ push esi
+
+ mov edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0
+ mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1
+ mov edi, JSAMPROW [edi] ; outptr
+ alignx 16, 7
+.columnloop:
+
+ movq mm0, MMWORD [edx+0*SIZEOF_MMWORD]
+ movq mm1, MMWORD [esi+0*SIZEOF_MMWORD]
+ movq mm2, MMWORD [edx+1*SIZEOF_MMWORD]
+ movq mm3, MMWORD [esi+1*SIZEOF_MMWORD]
+
+ movq mm4, mm0
+ movq mm5, mm1
+ pand mm0, mm6
+ psrlw mm4, BYTE_BIT
+ pand mm1, mm6
+ psrlw mm5, BYTE_BIT
+ paddw mm0, mm4
+ paddw mm1, mm5
+
+ movq mm4, mm2
+ movq mm5, mm3
+ pand mm2, mm6
+ psrlw mm4, BYTE_BIT
+ pand mm3, mm6
+ psrlw mm5, BYTE_BIT
+ paddw mm2, mm4
+ paddw mm3, mm5
+
+ paddw mm0, mm1
+ paddw mm2, mm3
+ paddw mm0, mm7
+ paddw mm2, mm7
+ psrlw mm0, 2
+ psrlw mm2, 2
+
+ packuswb mm0, mm2
+
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mm0
+
+ add edx, byte 2*SIZEOF_MMWORD ; inptr0
+ add esi, byte 2*SIZEOF_MMWORD ; inptr1
+ add edi, byte 1*SIZEOF_MMWORD ; outptr
+ sub ecx, byte SIZEOF_MMWORD ; outcol
+ jnz near .columnloop
+
+ pop esi
+ pop edi
+ pop ecx
+
+ add esi, byte 2*SIZEOF_JSAMPROW ; input_data
+ add edi, byte 1*SIZEOF_JSAMPROW ; output_data
+ dec eax ; rowctr
+ jg near .rowloop
+
+ emms ; empty MMX state
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+; pop ebx ; unused
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jcsample-sse2.asm b/media/libjpeg/simd/i386/jcsample-sse2.asm
new file mode 100644
index 0000000000..4fea60d2e2
--- /dev/null
+++ b/media/libjpeg/simd/i386/jcsample-sse2.asm
@@ -0,0 +1,351 @@
+;
+; jcsample.asm - downsampling (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Downsample pixel values of a single component.
+; This version handles the common case of 2:1 horizontal and 1:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v1_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor,
+; JDIMENSION v_samp_factor,
+; JDIMENSION width_in_blocks, JSAMPARRAY input_data,
+; JSAMPARRAY output_data);
+;
+
+%define img_width(b) (b) + 8 ; JDIMENSION image_width
+%define max_v_samp(b) (b) + 12 ; int max_v_samp_factor
+%define v_samp(b) (b) + 16 ; JDIMENSION v_samp_factor
+%define width_blks(b) (b) + 20 ; JDIMENSION width_in_blocks
+%define input_data(b) (b) + 24 ; JSAMPARRAY input_data
+%define output_data(b) (b) + 28 ; JSAMPARRAY output_data
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v1_downsample_sse2)
+
+EXTN(jsimd_h2v1_downsample_sse2):
+ push ebp
+ mov ebp, esp
+; push ebx ; unused
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov ecx, JDIMENSION [width_blks(ebp)]
+ shl ecx, 3 ; imul ecx,DCTSIZE (ecx = output_cols)
+ jz near .return
+
+ mov edx, JDIMENSION [img_width(ebp)]
+
+ ; -- expand_right_edge
+
+ push ecx
+ shl ecx, 1 ; output_cols * 2
+ sub ecx, edx
+ jle short .expand_end
+
+ mov eax, INT [max_v_samp(ebp)]
+ test eax, eax
+ jle short .expand_end
+
+ cld
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ alignx 16, 7
+.expandloop:
+ push eax
+ push ecx
+
+ mov edi, JSAMPROW [esi]
+ add edi, edx
+ mov al, JSAMPLE [edi-1]
+
+ rep stosb
+
+ pop ecx
+ pop eax
+
+ add esi, byte SIZEOF_JSAMPROW
+ dec eax
+ jg short .expandloop
+
+.expand_end:
+ pop ecx ; output_cols
+
+ ; -- h2v1_downsample
+
+ mov eax, JDIMENSION [v_samp(ebp)] ; rowctr
+ test eax, eax
+ jle near .return
+
+ mov edx, 0x00010000 ; bias pattern
+ movd xmm7, edx
+ pcmpeqw xmm6, xmm6
+ pshufd xmm7, xmm7, 0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
+ psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
+
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
+ alignx 16, 7
+.rowloop:
+ push ecx
+ push edi
+ push esi
+
+ mov esi, JSAMPROW [esi] ; inptr
+ mov edi, JSAMPROW [edi] ; outptr
+
+ cmp ecx, byte SIZEOF_XMMWORD
+ jae short .columnloop
+ alignx 16, 7
+
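+ ; Tail case: fewer than SIZEOF_XMMWORD output columns remain, so only
+ ; one input vector is still valid.  Zero the second one, set ecx to a
+ ; full vector so the loop exits after this pass, and reuse the shared
+ ; .downsample code; any bytes written past output_cols presumably land
+ ; in the row padding that the library allocates for SIMD use.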
+.columnloop_r8:
+ movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ pxor xmm1, xmm1
+ mov ecx, SIZEOF_XMMWORD
+ jmp short .downsample
+ alignx 16, 7
+
+.columnloop:
+ movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ movdqa xmm1, XMMWORD [esi+1*SIZEOF_XMMWORD]
+
+.downsample:
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+
+ pand xmm0, xmm6
+ psrlw xmm2, BYTE_BIT
+ pand xmm1, xmm6
+ psrlw xmm3, BYTE_BIT
+
+ paddw xmm0, xmm2
+ paddw xmm1, xmm3
+ paddw xmm0, xmm7
+ paddw xmm1, xmm7
+ psrlw xmm0, 1
+ psrlw xmm1, 1
+
+ packuswb xmm0, xmm1
+
+ movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
+
+ sub ecx, byte SIZEOF_XMMWORD ; outcol
+ add esi, byte 2*SIZEOF_XMMWORD ; inptr
+ add edi, byte 1*SIZEOF_XMMWORD ; outptr
+ cmp ecx, byte SIZEOF_XMMWORD
+ jae short .columnloop
+ test ecx, ecx
+ jnz short .columnloop_r8
+
+ pop esi
+ pop edi
+ pop ecx
+
+ add esi, byte SIZEOF_JSAMPROW ; input_data
+ add edi, byte SIZEOF_JSAMPROW ; output_data
+ dec eax ; rowctr
+ jg near .rowloop
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+; pop ebx ; unused
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Downsample pixel values of a single component.
+; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v2_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor,
+; JDIMENSION v_samp_factor,
+; JDIMENSION width_in_blocks, JSAMPARRAY input_data,
+; JSAMPARRAY output_data);
+;
+
+%define img_width(b) (b) + 8 ; JDIMENSION image_width
+%define max_v_samp(b) (b) + 12 ; int max_v_samp_factor
+%define v_samp(b) (b) + 16 ; JDIMENSION v_samp_factor
+%define width_blks(b) (b) + 20 ; JDIMENSION width_in_blocks
+%define input_data(b) (b) + 24 ; JSAMPARRAY input_data
+%define output_data(b) (b) + 28 ; JSAMPARRAY output_data
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v2_downsample_sse2)
+
+EXTN(jsimd_h2v2_downsample_sse2):
+ push ebp
+ mov ebp, esp
+; push ebx ; unused
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov ecx, JDIMENSION [width_blks(ebp)]
+ shl ecx, 3 ; imul ecx,DCTSIZE (ecx = output_cols)
+ jz near .return
+
+ mov edx, JDIMENSION [img_width(ebp)]
+
+ ; -- expand_right_edge
+
+ push ecx
+ shl ecx, 1 ; output_cols * 2
+ sub ecx, edx
+ jle short .expand_end
+
+ mov eax, INT [max_v_samp(ebp)]
+ test eax, eax
+ jle short .expand_end
+
+ cld
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ alignx 16, 7
+.expandloop:
+ push eax
+ push ecx
+
+ mov edi, JSAMPROW [esi]
+ add edi, edx
+ mov al, JSAMPLE [edi-1]
+
+ rep stosb
+
+ pop ecx
+ pop eax
+
+ add esi, byte SIZEOF_JSAMPROW
+ dec eax
+ jg short .expandloop
+
+.expand_end:
+ pop ecx ; output_cols
+
+ ; -- h2v2_downsample
+
+ mov eax, JDIMENSION [v_samp(ebp)] ; rowctr
+ test eax, eax
+ jle near .return
+
+ mov edx, 0x00020001 ; bias pattern
+ movd xmm7, edx
+ pcmpeqw xmm6, xmm6
+ pshufd xmm7, xmm7, 0x00 ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
+ psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
+
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
+ alignx 16, 7
+.rowloop:
+ push ecx
+ push edi
+ push esi
+
+ mov edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0
+ mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1
+ mov edi, JSAMPROW [edi] ; outptr
+
+ cmp ecx, byte SIZEOF_XMMWORD
+ jae short .columnloop
+ alignx 16, 7
+
+.columnloop_r8:
+ movdqa xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
+ movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ pxor xmm2, xmm2
+ pxor xmm3, xmm3
+ mov ecx, SIZEOF_XMMWORD
+ jmp short .downsample
+ alignx 16, 7
+
+.columnloop:
+ movdqa xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
+ movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ movdqa xmm2, XMMWORD [edx+1*SIZEOF_XMMWORD]
+ movdqa xmm3, XMMWORD [esi+1*SIZEOF_XMMWORD]
+
+.downsample:
+ movdqa xmm4, xmm0
+ movdqa xmm5, xmm1
+ pand xmm0, xmm6
+ psrlw xmm4, BYTE_BIT
+ pand xmm1, xmm6
+ psrlw xmm5, BYTE_BIT
+ paddw xmm0, xmm4
+ paddw xmm1, xmm5
+
+ movdqa xmm4, xmm2
+ movdqa xmm5, xmm3
+ pand xmm2, xmm6
+ psrlw xmm4, BYTE_BIT
+ pand xmm3, xmm6
+ psrlw xmm5, BYTE_BIT
+ paddw xmm2, xmm4
+ paddw xmm3, xmm5
+
+ paddw xmm0, xmm1
+ paddw xmm2, xmm3
+ paddw xmm0, xmm7
+ paddw xmm2, xmm7
+ psrlw xmm0, 2
+ psrlw xmm2, 2
+
+ packuswb xmm0, xmm2
+
+ movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
+
+ sub ecx, byte SIZEOF_XMMWORD ; outcol
+ add edx, byte 2*SIZEOF_XMMWORD ; inptr0
+ add esi, byte 2*SIZEOF_XMMWORD ; inptr1
+ add edi, byte 1*SIZEOF_XMMWORD ; outptr
+ cmp ecx, byte SIZEOF_XMMWORD
+ jae near .columnloop
+ test ecx, ecx
+ jnz near .columnloop_r8
+
+ pop esi
+ pop edi
+ pop ecx
+
+ add esi, byte 2*SIZEOF_JSAMPROW ; input_data
+ add edi, byte 1*SIZEOF_JSAMPROW ; output_data
+ dec eax ; rowctr
+ jg near .rowloop
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+; pop ebx ; unused
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jdcolext-avx2.asm b/media/libjpeg/simd/i386/jdcolext-avx2.asm
new file mode 100644
index 0000000000..015be0416c
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdcolext-avx2.asm
@@ -0,0 +1,515 @@
+;
+; jdcolext.asm - colorspace conversion (AVX2)
+;
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2012, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_ycc_rgb_convert_avx2(JDIMENSION out_width, JSAMPIMAGE input_buf,
+; JDIMENSION input_row, JSAMPARRAY output_buf,
+; int num_rows)
+;
+
+%define out_width(b) (b) + 8 ; JDIMENSION out_width
+%define input_buf(b) (b) + 12 ; JSAMPIMAGE input_buf
+%define input_row(b) (b) + 16 ; JDIMENSION input_row
+%define output_buf(b) (b) + 20 ; JSAMPARRAY output_buf
+%define num_rows(b) (b) + 24 ; int num_rows
+
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD
+ ; ymmword wk[WK_NUM]
+%define WK_NUM 2
+%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_ycc_rgb_convert_avx2)
+
+EXTN(jsimd_ycc_rgb_convert_avx2):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_YMMWORD) ; align to 256 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)]
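+ ; The prologue above builds a frame whose base (ebp) is 32-byte
+ ; aligned so that the wk() scratch slots can hold whole ymmwords; the
+ ; caller's stack pointer is saved at [ebp] and restored by the
+ ; "mov esp, ebp / pop esp" sequence in the epilogue.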
+ pushpic eax ; make room for the GOT address
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+ movpic POINTER [gotptr], ebx ; save GOT address
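+ ; In PIC builds the constant pool (the PW_*/PD_* tables defined in
+ ; jdcolor-avx2.asm) must be addressed relative to the GOT, so the GOT
+ ; pointer is cached in the gotptr scratch slot and reloaded into eax
+ ; at the top of each row.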
+
+ mov ecx, JDIMENSION [out_width(eax)] ; num_cols
+ test ecx, ecx
+ jz near .return
+
+ push ecx
+
+ mov edi, JSAMPIMAGE [input_buf(eax)]
+ mov ecx, JDIMENSION [input_row(eax)]
+ mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+ mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+ mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+ lea esi, [esi+ecx*SIZEOF_JSAMPROW]
+ lea ebx, [ebx+ecx*SIZEOF_JSAMPROW]
+ lea edx, [edx+ecx*SIZEOF_JSAMPROW]
+
+ pop ecx
+
+ mov edi, JSAMPARRAY [output_buf(eax)]
+ mov eax, INT [num_rows(eax)]
+ test eax, eax
+ jle near .return
+ alignx 16, 7
+.rowloop:
+ push eax
+ push edi
+ push edx
+ push ebx
+ push esi
+ push ecx ; col
+
+ mov esi, JSAMPROW [esi] ; inptr0
+ mov ebx, JSAMPROW [ebx] ; inptr1
+ mov edx, JSAMPROW [edx] ; inptr2
+ mov edi, JSAMPROW [edi] ; outptr
+ movpic eax, POINTER [gotptr] ; load GOT address (eax)
+ alignx 16, 7
+.columnloop:
+
+ vmovdqu ymm5, YMMWORD [ebx] ; ymm5=Cb(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+ vmovdqu ymm1, YMMWORD [edx] ; ymm1=Cr(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+
+ vpcmpeqw ymm0, ymm0, ymm0
+ vpcmpeqw ymm7, ymm7, ymm7
+ vpsrlw ymm0, ymm0, BYTE_BIT ; ymm0={0xFF 0x00 0xFF 0x00 ..}
+ vpsllw ymm7, ymm7, 7 ; ymm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+ vpand ymm4, ymm0, ymm5 ; ymm4=Cb(02468ACEGIKMOQSU)=CbE
+ vpsrlw ymm5, ymm5, BYTE_BIT ; ymm5=Cb(13579BDFHJLNPRTV)=CbO
+ vpand ymm0, ymm0, ymm1 ; ymm0=Cr(02468ACEGIKMOQSU)=CrE
+ vpsrlw ymm1, ymm1, BYTE_BIT ; ymm1=Cr(13579BDFHJLNPRTV)=CrO
+
+ vpaddw ymm2, ymm4, ymm7
+ vpaddw ymm3, ymm5, ymm7
+ vpaddw ymm6, ymm0, ymm7
+ vpaddw ymm7, ymm1, ymm7
+
+ ; (Original)
+ ; R = Y + 1.40200 * Cr
+ ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+ ; B = Y + 1.77200 * Cb
+ ;
+ ; (This implementation)
+ ; R = Y + 0.40200 * Cr + Cr
+ ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+ ; B = Y - 0.22800 * Cb + Cb + Cb
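+ ;
+ ; The rewrite uses 1.40200 = 0.40200 + 1, -0.71414 = 0.28586 - 1, and
+ ; 1.77200 = -0.22800 + 2, trading coefficients of magnitude >= 1 for
+ ; fractional ones plus whole additions of Cb/Cr, so that every
+ ; multiplier fits in the signed 16-bit fixed-point range that pmulhw
+ ; requires.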
+
+ vpaddw ymm4, ymm2, ymm2 ; ymm4=2*CbE
+ vpaddw ymm5, ymm3, ymm3 ; ymm5=2*CbO
+ vpaddw ymm0, ymm6, ymm6 ; ymm0=2*CrE
+ vpaddw ymm1, ymm7, ymm7 ; ymm1=2*CrO
+
+ vpmulhw ymm4, ymm4, [GOTOFF(eax,PW_MF0228)] ; ymm4=(2*CbE * -FIX(0.22800))
+ vpmulhw ymm5, ymm5, [GOTOFF(eax,PW_MF0228)] ; ymm5=(2*CbO * -FIX(0.22800))
+ vpmulhw ymm0, ymm0, [GOTOFF(eax,PW_F0402)] ; ymm0=(2*CrE * FIX(0.40200))
+ vpmulhw ymm1, ymm1, [GOTOFF(eax,PW_F0402)] ; ymm1=(2*CrO * FIX(0.40200))
+
+ vpaddw ymm4, ymm4, [GOTOFF(eax,PW_ONE)]
+ vpaddw ymm5, ymm5, [GOTOFF(eax,PW_ONE)]
+ vpsraw ymm4, ymm4, 1 ; ymm4=(CbE * -FIX(0.22800))
+ vpsraw ymm5, ymm5, 1 ; ymm5=(CbO * -FIX(0.22800))
+ vpaddw ymm0, ymm0, [GOTOFF(eax,PW_ONE)]
+ vpaddw ymm1, ymm1, [GOTOFF(eax,PW_ONE)]
+ vpsraw ymm0, ymm0, 1 ; ymm0=(CrE * FIX(0.40200))
+ vpsraw ymm1, ymm1, 1 ; ymm1=(CrO * FIX(0.40200))
+
+ vpaddw ymm4, ymm4, ymm2
+ vpaddw ymm5, ymm5, ymm3
+ vpaddw ymm4, ymm4, ymm2 ; ymm4=(CbE * FIX(1.77200))=(B-Y)E
+ vpaddw ymm5, ymm5, ymm3 ; ymm5=(CbO * FIX(1.77200))=(B-Y)O
+ vpaddw ymm0, ymm0, ymm6 ; ymm0=(CrE * FIX(1.40200))=(R-Y)E
+ vpaddw ymm1, ymm1, ymm7 ; ymm1=(CrO * FIX(1.40200))=(R-Y)O
+
+ vmovdqa YMMWORD [wk(0)], ymm4 ; wk(0)=(B-Y)E
+ vmovdqa YMMWORD [wk(1)], ymm5 ; wk(1)=(B-Y)O
+
+ vpunpckhwd ymm4, ymm2, ymm6
+ vpunpcklwd ymm2, ymm2, ymm6
+ vpmaddwd ymm2, ymm2, [GOTOFF(eax,PW_MF0344_F0285)]
+ vpmaddwd ymm4, ymm4, [GOTOFF(eax,PW_MF0344_F0285)]
+ vpunpckhwd ymm5, ymm3, ymm7
+ vpunpcklwd ymm3, ymm3, ymm7
+ vpmaddwd ymm3, ymm3, [GOTOFF(eax,PW_MF0344_F0285)]
+ vpmaddwd ymm5, ymm5, [GOTOFF(eax,PW_MF0344_F0285)]
+
+ vpaddd ymm2, ymm2, [GOTOFF(eax,PD_ONEHALF)]
+ vpaddd ymm4, ymm4, [GOTOFF(eax,PD_ONEHALF)]
+ vpsrad ymm2, ymm2, SCALEBITS
+ vpsrad ymm4, ymm4, SCALEBITS
+ vpaddd ymm3, ymm3, [GOTOFF(eax,PD_ONEHALF)]
+ vpaddd ymm5, ymm5, [GOTOFF(eax,PD_ONEHALF)]
+ vpsrad ymm3, ymm3, SCALEBITS
+ vpsrad ymm5, ymm5, SCALEBITS
+
+ vpackssdw ymm2, ymm2, ymm4 ; ymm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
+ vpackssdw ymm3, ymm3, ymm5 ; ymm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
+ vpsubw ymm2, ymm2, ymm6 ; ymm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
+ vpsubw ymm3, ymm3, ymm7 ; ymm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
+
+ vmovdqu ymm5, YMMWORD [esi] ; ymm5=Y(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+
+ vpcmpeqw ymm4, ymm4, ymm4
+ vpsrlw ymm4, ymm4, BYTE_BIT ; ymm4={0xFF 0x00 0xFF 0x00 ..}
+ vpand ymm4, ymm4, ymm5 ; ymm4=Y(02468ACEGIKMOQSU)=YE
+ vpsrlw ymm5, ymm5, BYTE_BIT ; ymm5=Y(13579BDFHJLNPRTV)=YO
+
+ vpaddw ymm0, ymm0, ymm4 ; ymm0=((R-Y)E+YE)=RE=R(02468ACEGIKMOQSU)
+ vpaddw ymm1, ymm1, ymm5 ; ymm1=((R-Y)O+YO)=RO=R(13579BDFHJLNPRTV)
+ vpackuswb ymm0, ymm0, ymm0 ; ymm0=R(02468ACE********GIKMOQSU********)
+ vpackuswb ymm1, ymm1, ymm1 ; ymm1=R(13579BDF********HJLNPRTV********)
+
+ vpaddw ymm2, ymm2, ymm4 ; ymm2=((G-Y)E+YE)=GE=G(02468ACEGIKMOQSU)
+ vpaddw ymm3, ymm3, ymm5 ; ymm3=((G-Y)O+YO)=GO=G(13579BDFHJLNPRTV)
+ vpackuswb ymm2, ymm2, ymm2 ; ymm2=G(02468ACE********GIKMOQSU********)
+ vpackuswb ymm3, ymm3, ymm3 ; ymm3=G(13579BDF********HJLNPRTV********)
+
+ vpaddw ymm4, ymm4, YMMWORD [wk(0)] ; ymm4=(YE+(B-Y)E)=BE=B(02468ACEGIKMOQSU)
+ vpaddw ymm5, ymm5, YMMWORD [wk(1)] ; ymm5=(YO+(B-Y)O)=BO=B(13579BDFHJLNPRTV)
+ vpackuswb ymm4, ymm4, ymm4 ; ymm4=B(02468ACE********GIKMOQSU********)
+ vpackuswb ymm5, ymm5, ymm5 ; ymm5=B(13579BDF********HJLNPRTV********)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+ ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **)
+ ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **)
+ ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **)
+ ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **)
+ ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **)
+ ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **)
+ ; ymmG=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **)
+ ; ymmH=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **)
+
+ vpunpcklbw ymmA, ymmA, ymmC ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
+ ; 0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U)
+ vpunpcklbw ymmE, ymmE, ymmB ; ymmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F
+ ; 2G 0H 2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V)
+ vpunpcklbw ymmD, ymmD, ymmF ; ymmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F
+ ; 1H 2H 1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V)
+
+ vpsrldq ymmH, ymmA, 2 ; ymmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E 0G 1G
+ ; 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U -- --)
+ vpunpckhwd ymmG, ymmA, ymmE ; ymmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F
+ ; 0O 1O 2O 0P 0Q 1Q 2Q 0R 0S 1S 2S 0T 0U 1U 2U 0V)
+ vpunpcklwd ymmA, ymmA, ymmE ; ymmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07
+ ; 0G 1G 2G 0H 0I 1I 2I 0J 0K 1K 2K 0L 0M 1M 2M 0N)
+
+ vpsrldq ymmE, ymmE, 2 ; ymmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F 2G 0H
+ ; 2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V -- --)
+
+ vpsrldq ymmB, ymmD, 2 ; ymmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F 1H 2H
+ ; 1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V -- --)
+ vpunpckhwd ymmC, ymmD, ymmH ; ymmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F 0G 1G
+ ; 1P 2P 0Q 1Q 1R 2R 0S 1S 1T 2T 0U 1U 1V 2V -- --)
+ vpunpcklwd ymmD, ymmD, ymmH ; ymmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18
+ ; 1H 2H 0I 1I 1J 2J 0K 1K 1L 2L 0M 1M 1N 2N 0O 1O)
+
+ vpunpckhwd ymmF, ymmE, ymmB ; ymmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F 2G 0H 1H 2H
+ ; 2Q 0R 1R 2R 2S 0T 1T 2T 2U 0V 1V 2V -- -- -- --)
+ vpunpcklwd ymmE, ymmE, ymmB ; ymmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29
+ ; 2I 0J 1J 2J 2K 0L 1L 2L 2M 0N 1N 2N 2O 0P 1P 2P)
+
+ vpshufd ymmH, ymmA, 0x4E ; ymmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03
+ ; 0K 1K 2K 0L 0M 1M 2M 0N 0G 1G 2G 0H 0I 1I 2I 0J)
+ vpunpckldq ymmA, ymmA, ymmD ; ymmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14
+ ; 0G 1G 2G 0H 1H 2H 0I 1I 0I 1I 2I 0J 1J 2J 0K 1K)
+ vpunpckhdq ymmD, ymmD, ymmE ; ymmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29
+ ; 1L 2L 0M 1M 2M 0N 1N 2N 1N 2N 0O 1O 2O 0P 1P 2P)
+ vpunpckldq ymmE, ymmE, ymmH ; ymmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07
+ ; 2I 0J 1J 2J 0K 1K 2K 0L 2K 0L 1L 2L 0M 1M 2M 0N)
+
+ vpshufd ymmH, ymmG, 0x4E ; ymmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B
+ ; 0S 1S 2S 0T 0U 1U 2U 0V 0O 1O 2O 0P 0Q 1Q 2Q 0R)
+ vpunpckldq ymmG, ymmG, ymmC ; ymmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C
+ ; 0O 1O 2O 0P 1P 2P 0Q 1Q 0Q 1Q 2Q 0R 1R 2R 0S 1S)
+ vpunpckhdq ymmC, ymmC, ymmF ; ymmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F 0G 1G 2G 0H 1H 2H
+ ; 1T 2T 0U 1U 2U 0V 1V 2V 1V 2V -- -- -- -- -- --)
+ vpunpckldq ymmF, ymmF, ymmH ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F
+ ; 2Q 0R 1R 2R 0S 1S 2S 0T 2S 0T 1T 2T 0U 1U 2U 0V)
+
+ vpunpcklqdq ymmH, ymmA, ymmE ; ymmH=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+ ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+ vpunpcklqdq ymmG, ymmD, ymmG ; ymmG=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
+ ; 1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
+ vpunpcklqdq ymmC, ymmF, ymmC ; ymmC=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+ ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+ vperm2i128 ymmA, ymmH, ymmG, 0x20 ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+ ; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ vperm2i128 ymmD, ymmC, ymmH, 0x30 ; ymmD=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+ ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+ vperm2i128 ymmF, ymmG, ymmC, 0x31 ; ymmF=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+ ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
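+ ; vpunpck* and vpshufd operate independently within each 128-bit lane
+ ; of a ymm register, so the interleaving above leaves the RGB bytes
+ ; grouped per lane; the three vperm2i128 shuffles stitch the lanes
+ ; back into linear memory order for the 96-byte (32-pixel) store below.
+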
+ cmp ecx, byte SIZEOF_YMMWORD
+ jb short .column_st64
+
+ test edi, SIZEOF_YMMWORD-1
+ jnz short .out1
+ ; --(aligned)-------------------
+ vmovntdq YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+ vmovntdq YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+ vmovntdq YMMWORD [edi+2*SIZEOF_YMMWORD], ymmF
+ jmp short .out0
+.out1: ; --(unaligned)-----------------
+ vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+ vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+ vmovdqu YMMWORD [edi+2*SIZEOF_YMMWORD], ymmF
+.out0:
+ add edi, byte RGB_PIXELSIZE*SIZEOF_YMMWORD ; outptr
+ sub ecx, byte SIZEOF_YMMWORD
+ jz near .nextrow
+
+ add esi, byte SIZEOF_YMMWORD ; inptr0
+ add ebx, byte SIZEOF_YMMWORD ; inptr1
+ add edx, byte SIZEOF_YMMWORD ; inptr2
+ jmp near .columnloop
+ alignx 16, 7
+
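+ ; Partial-vector epilogue: the remaining pixel count is converted to a
+ ; byte count (x3 for RGB) and ymmA/ymmD/ymmF are drained in shrinking
+ ; chunks (32, 16, 8, 4, 2, then 1 byte), shifting the next bytes into
+ ; position after each store.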
+.column_st64:
+ lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
+ cmp ecx, byte 2*SIZEOF_YMMWORD
+ jb short .column_st32
+ vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+ vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+ add edi, byte 2*SIZEOF_YMMWORD ; outptr
+ vmovdqa ymmA, ymmF
+ sub ecx, byte 2*SIZEOF_YMMWORD
+ jmp short .column_st31
+.column_st32:
+ cmp ecx, byte SIZEOF_YMMWORD
+ jb short .column_st31
+ vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+ add edi, byte SIZEOF_YMMWORD ; outptr
+ vmovdqa ymmA, ymmD
+ sub ecx, byte SIZEOF_YMMWORD
+ jmp short .column_st31
+.column_st31:
+ cmp ecx, byte SIZEOF_XMMWORD
+ jb short .column_st15
+ vmovdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ add edi, byte SIZEOF_XMMWORD ; outptr
+ vperm2i128 ymmA, ymmA, ymmA, 1
+ sub ecx, byte SIZEOF_XMMWORD
+.column_st15:
+ ; Store the lower 8 bytes of xmmA to the output when it has enough
+ ; space.
+ cmp ecx, byte SIZEOF_MMWORD
+ jb short .column_st7
+ vmovq XMM_MMWORD [edi], xmmA
+ add edi, byte SIZEOF_MMWORD
+ sub ecx, byte SIZEOF_MMWORD
+ vpsrldq xmmA, xmmA, SIZEOF_MMWORD
+.column_st7:
+ ; Store the lower 4 bytes of xmmA to the output when it has enough
+ ; space.
+ cmp ecx, byte SIZEOF_DWORD
+ jb short .column_st3
+ vmovd XMM_DWORD [edi], xmmA
+ add edi, byte SIZEOF_DWORD
+ sub ecx, byte SIZEOF_DWORD
+ vpsrldq xmmA, xmmA, SIZEOF_DWORD
+.column_st3:
+ ; Store the lower 2 bytes of eax to the output when it has enough
+ ; space.
+ vmovd eax, xmmA
+ cmp ecx, byte SIZEOF_WORD
+ jb short .column_st1
+ mov word [edi], ax
+ add edi, byte SIZEOF_WORD
+ sub ecx, byte SIZEOF_WORD
+ shr eax, 16
+.column_st1:
+ ; Store the lower 1 byte of eax to the output when it has enough
+ ; space.
+ test ecx, ecx
+ jz short .nextrow
+ mov byte [edi], al
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+ vpcmpeqb ymm6, ymm6, ymm6 ; ymm6=XE=X(02468ACE********GIKMOQSU********)
+ vpcmpeqb ymm7, ymm7, ymm7 ; ymm7=XO=X(13579BDF********HJLNPRTV********)
+%else
+ vpxor ymm6, ymm6, ymm6 ; ymm6=XE=X(02468ACE********GIKMOQSU********)
+ vpxor ymm7, ymm7, ymm7 ; ymm7=XO=X(13579BDF********HJLNPRTV********)
+%endif
+ ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **)
+ ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **)
+ ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **)
+ ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **)
+ ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **)
+ ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **)
+ ; ymmG=(30 32 34 36 38 3A 3C 3E ** 3G 3I 3K 3M 3O 3Q 3S 3U **)
+ ; ymmH=(31 33 35 37 39 3B 3D 3F ** 3H 3J 3L 3N 3P 3R 3T 3V **)
+
+ vpunpcklbw ymmA, ymmA, ymmC ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
+ ; 0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U)
+ vpunpcklbw ymmE, ymmE, ymmG ; ymmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E
+ ; 2G 3G 2I 3I 2K 3K 2M 3M 2O 3O 2Q 3Q 2S 3S 2U 3U)
+ vpunpcklbw ymmB, ymmB, ymmD ; ymmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F
+ ; 0H 1H 0J 1J 0L 1L 0N 1N 0P 1P 0R 1R 0T 1T 0V 1V)
+ vpunpcklbw ymmF, ymmF, ymmH ; ymmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F
+ ; 2H 3H 2J 3J 2L 3L 2N 3N 2P 3P 2R 3R 2T 3T 2V 3V)
+
+ vpunpckhwd ymmC, ymmA, ymmE ; ymmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E
+ ; 0O 1O 2O 3O 0Q 1Q 2Q 3Q 0S 1S 2S 3S 0U 1U 2U 3U)
+ vpunpcklwd ymmA, ymmA, ymmE ; ymmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36
+ ; 0G 1G 2G 3G 0I 1I 2I 3I 0K 1K 2K 3K 0M 1M 2M 3M)
+ vpunpckhwd ymmG, ymmB, ymmF ; ymmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F
+ ; 0P 1P 2P 3P 0R 1R 2R 3R 0T 1T 2T 3T 0V 1V 2V 3V)
+ vpunpcklwd ymmB, ymmB, ymmF ; ymmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37
+ ; 0H 1H 2H 3H 0J 1J 2J 3J 0L 1L 2L 3L 0N 1N 2N 3N)
+
+ vpunpckhdq ymmE, ymmA, ymmB ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+ ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+ vpunpckldq ymmB, ymmA, ymmB ; ymmB=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ ; 0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
+ vpunpckhdq ymmF, ymmC, ymmG ; ymmF=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
+ ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+ vpunpckldq ymmG, ymmC, ymmG ; ymmG=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+ ; 0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
+
+ vperm2i128 ymmA, ymmB, ymmE, 0x20 ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ ; 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+ vperm2i128 ymmD, ymmG, ymmF, 0x20 ; ymmD=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+ ; 0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+ vperm2i128 ymmC, ymmB, ymmE, 0x31 ; ymmC=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
+ ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+ vperm2i128 ymmH, ymmG, ymmF, 0x31 ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
+ ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+ cmp ecx, byte SIZEOF_YMMWORD
+ jb short .column_st64
+
+ test edi, SIZEOF_YMMWORD-1
+ jnz short .out1
+ ; --(aligned)-------------------
+ vmovntdq YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+ vmovntdq YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+ vmovntdq YMMWORD [edi+2*SIZEOF_YMMWORD], ymmC
+ vmovntdq YMMWORD [edi+3*SIZEOF_YMMWORD], ymmH
+ jmp short .out0
+.out1: ; --(unaligned)-----------------
+ vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+ vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+ vmovdqu YMMWORD [edi+2*SIZEOF_YMMWORD], ymmC
+ vmovdqu YMMWORD [edi+3*SIZEOF_YMMWORD], ymmH
+.out0:
+ add edi, RGB_PIXELSIZE*SIZEOF_YMMWORD ; outptr
+ sub ecx, byte SIZEOF_YMMWORD
+ jz near .nextrow
+
+ add esi, byte SIZEOF_YMMWORD ; inptr0
+ add ebx, byte SIZEOF_YMMWORD ; inptr1
+ add edx, byte SIZEOF_YMMWORD ; inptr2
+ jmp near .columnloop
+ alignx 16, 7
+
+.column_st64:
+ cmp ecx, byte SIZEOF_YMMWORD/2
+ jb short .column_st32
+ vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+ vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+ add edi, byte 2*SIZEOF_YMMWORD ; outptr
+ vmovdqa ymmA, ymmC
+ vmovdqa ymmD, ymmH
+ sub ecx, byte SIZEOF_YMMWORD/2
+.column_st32:
+ cmp ecx, byte SIZEOF_YMMWORD/4
+ jb short .column_st16
+ vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+ add edi, byte SIZEOF_YMMWORD ; outptr
+ vmovdqa ymmA, ymmD
+ sub ecx, byte SIZEOF_YMMWORD/4
+.column_st16:
+ cmp ecx, byte SIZEOF_YMMWORD/8
+ jb short .column_st15
+ vmovdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ vperm2i128 ymmA, ymmA, ymmA, 1
+ add edi, byte SIZEOF_XMMWORD ; outptr
+ sub ecx, byte SIZEOF_YMMWORD/8
+.column_st15:
+ ; Store two pixels (8 bytes) of ymmA to the output when it has enough
+ ; space.
+ cmp ecx, byte SIZEOF_YMMWORD/16
+ jb short .column_st7
+ vmovq MMWORD [edi], xmmA
+ add edi, byte SIZEOF_YMMWORD/16*4
+ sub ecx, byte SIZEOF_YMMWORD/16
+ vpsrldq xmmA, SIZEOF_YMMWORD/16*4
+.column_st7:
+ ; Store one pixel (4 bytes) of ymmA to the output when it has enough
+ ; space.
+ test ecx, ecx
+ jz short .nextrow
+ vmovd XMM_DWORD [edi], xmmA
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+ alignx 16, 7
+
+.nextrow:
+ pop ecx
+ pop esi
+ pop ebx
+ pop edx
+ pop edi
+ pop eax
+
+ add esi, byte SIZEOF_JSAMPROW
+ add ebx, byte SIZEOF_JSAMPROW
+ add edx, byte SIZEOF_JSAMPROW
+ add edi, byte SIZEOF_JSAMPROW ; output_buf
+ dec eax ; num_rows
+ jg near .rowloop
+
+ sfence ; flush the write buffer
+
+.return:
+ vzeroupper
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jdcolext-mmx.asm b/media/libjpeg/simd/i386/jdcolext-mmx.asm
new file mode 100644
index 0000000000..5813cfcb66
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdcolext-mmx.asm
@@ -0,0 +1,404 @@
+;
+; jdcolext.asm - colorspace conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_ycc_rgb_convert_mmx(JDIMENSION out_width, JSAMPIMAGE input_buf,
+; JDIMENSION input_row, JSAMPARRAY output_buf,
+; int num_rows)
+;
+
+%define out_width(b) (b) + 8 ; JDIMENSION out_width
+%define input_buf(b) (b) + 12 ; JSAMPIMAGE input_buf
+%define input_row(b) (b) + 16 ; JDIMENSION input_row
+%define output_buf(b) (b) + 20 ; JSAMPARRAY output_buf
+%define num_rows(b) (b) + 24 ; int num_rows
+
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
+ ; mmword wk[WK_NUM]
+%define WK_NUM 2
+%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_ycc_rgb_convert_mmx)
+
+EXTN(jsimd_ycc_rgb_convert_mmx):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)]
+ pushpic eax ; make room for the GOT address
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+ movpic POINTER [gotptr], ebx ; save GOT address
+
+ mov ecx, JDIMENSION [out_width(eax)] ; num_cols
+ test ecx, ecx
+ jz near .return
+
+ push ecx
+
+ mov edi, JSAMPIMAGE [input_buf(eax)]
+ mov ecx, JDIMENSION [input_row(eax)]
+ mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+ mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+ mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+ lea esi, [esi+ecx*SIZEOF_JSAMPROW]
+ lea ebx, [ebx+ecx*SIZEOF_JSAMPROW]
+ lea edx, [edx+ecx*SIZEOF_JSAMPROW]
+
+ pop ecx
+
+ mov edi, JSAMPARRAY [output_buf(eax)]
+ mov eax, INT [num_rows(eax)]
+ test eax, eax
+ jle near .return
+ alignx 16, 7
+.rowloop:
+ push eax
+ push edi
+ push edx
+ push ebx
+ push esi
+ push ecx ; col
+
+ mov esi, JSAMPROW [esi] ; inptr0
+ mov ebx, JSAMPROW [ebx] ; inptr1
+ mov edx, JSAMPROW [edx] ; inptr2
+ mov edi, JSAMPROW [edi] ; outptr
+ movpic eax, POINTER [gotptr] ; load GOT address (eax)
+ alignx 16, 7
+.columnloop:
+
+ movq mm5, MMWORD [ebx] ; mm5=Cb(01234567)
+ movq mm1, MMWORD [edx] ; mm1=Cr(01234567)
+
+ pcmpeqw mm4, mm4
+ pcmpeqw mm7, mm7
+ psrlw mm4, BYTE_BIT
+ psllw mm7, 7 ; mm7={0xFF80 0xFF80 0xFF80 0xFF80}
+ movq mm0, mm4 ; mm0=mm4={0xFF 0x00 0xFF 0x00 ..}
+
+ pand mm4, mm5 ; mm4=Cb(0246)=CbE
+ psrlw mm5, BYTE_BIT ; mm5=Cb(1357)=CbO
+ pand mm0, mm1 ; mm0=Cr(0246)=CrE
+ psrlw mm1, BYTE_BIT ; mm1=Cr(1357)=CrO
+
+ paddw mm4, mm7
+ paddw mm5, mm7
+ paddw mm0, mm7
+ paddw mm1, mm7
+
+ ; (Original)
+ ; R = Y + 1.40200 * Cr
+ ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+ ; B = Y + 1.77200 * Cb
+ ;
+ ; (This implementation)
+ ; R = Y + 0.40200 * Cr + Cr
+ ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+ ; B = Y - 0.22800 * Cb + Cb + Cb
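+ ;
+ ; The fractional products use pmulhw, which keeps only the high 16
+ ; bits of the 32-bit product.  Doubling the operand first buys one
+ ; extra fraction bit, and the "paddw PW_ONE / psraw 1" that follows
+ ; uses it to round to nearest instead of truncating:
+ ;   x * c ~= (((2 * x * FIX(c)) >> 16) + 1) >> 1, FIX(c) = c * 65536.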
+
+ movq mm2, mm4 ; mm2=CbE
+ movq mm3, mm5 ; mm3=CbO
+ paddw mm4, mm4 ; mm4=2*CbE
+ paddw mm5, mm5 ; mm5=2*CbO
+ movq mm6, mm0 ; mm6=CrE
+ movq mm7, mm1 ; mm7=CrO
+ paddw mm0, mm0 ; mm0=2*CrE
+ paddw mm1, mm1 ; mm1=2*CrO
+
+ pmulhw mm4, [GOTOFF(eax,PW_MF0228)] ; mm4=(2*CbE * -FIX(0.22800))
+ pmulhw mm5, [GOTOFF(eax,PW_MF0228)] ; mm5=(2*CbO * -FIX(0.22800))
+ pmulhw mm0, [GOTOFF(eax,PW_F0402)] ; mm0=(2*CrE * FIX(0.40200))
+ pmulhw mm1, [GOTOFF(eax,PW_F0402)] ; mm1=(2*CrO * FIX(0.40200))
+
+ paddw mm4, [GOTOFF(eax,PW_ONE)]
+ paddw mm5, [GOTOFF(eax,PW_ONE)]
+ psraw mm4, 1 ; mm4=(CbE * -FIX(0.22800))
+ psraw mm5, 1 ; mm5=(CbO * -FIX(0.22800))
+ paddw mm0, [GOTOFF(eax,PW_ONE)]
+ paddw mm1, [GOTOFF(eax,PW_ONE)]
+ psraw mm0, 1 ; mm0=(CrE * FIX(0.40200))
+ psraw mm1, 1 ; mm1=(CrO * FIX(0.40200))
+
+ paddw mm4, mm2
+ paddw mm5, mm3
+ paddw mm4, mm2 ; mm4=(CbE * FIX(1.77200))=(B-Y)E
+ paddw mm5, mm3 ; mm5=(CbO * FIX(1.77200))=(B-Y)O
+ paddw mm0, mm6 ; mm0=(CrE * FIX(1.40200))=(R-Y)E
+ paddw mm1, mm7 ; mm1=(CrO * FIX(1.40200))=(R-Y)O
+
+ movq MMWORD [wk(0)], mm4 ; wk(0)=(B-Y)E
+ movq MMWORD [wk(1)], mm5 ; wk(1)=(B-Y)O
+
+ movq mm4, mm2
+ movq mm5, mm3
+ punpcklwd mm2, mm6
+ punpckhwd mm4, mm6
+ pmaddwd mm2, [GOTOFF(eax,PW_MF0344_F0285)]
+ pmaddwd mm4, [GOTOFF(eax,PW_MF0344_F0285)]
+ punpcklwd mm3, mm7
+ punpckhwd mm5, mm7
+ pmaddwd mm3, [GOTOFF(eax,PW_MF0344_F0285)]
+ pmaddwd mm5, [GOTOFF(eax,PW_MF0344_F0285)]
+
+ paddd mm2, [GOTOFF(eax,PD_ONEHALF)]
+ paddd mm4, [GOTOFF(eax,PD_ONEHALF)]
+ psrad mm2, SCALEBITS
+ psrad mm4, SCALEBITS
+ paddd mm3, [GOTOFF(eax,PD_ONEHALF)]
+ paddd mm5, [GOTOFF(eax,PD_ONEHALF)]
+ psrad mm3, SCALEBITS
+ psrad mm5, SCALEBITS
+
+ packssdw mm2, mm4 ; mm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
+ packssdw mm3, mm5 ; mm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
+ psubw mm2, mm6 ; mm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
+ psubw mm3, mm7 ; mm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
+
+ movq mm5, MMWORD [esi] ; mm5=Y(01234567)
+
+ pcmpeqw mm4, mm4
+ psrlw mm4, BYTE_BIT ; mm4={0xFF 0x00 0xFF 0x00 ..}
+ pand mm4, mm5 ; mm4=Y(0246)=YE
+ psrlw mm5, BYTE_BIT ; mm5=Y(1357)=YO
+
+ paddw mm0, mm4 ; mm0=((R-Y)E+YE)=RE=(R0 R2 R4 R6)
+ paddw mm1, mm5 ; mm1=((R-Y)O+YO)=RO=(R1 R3 R5 R7)
+ packuswb mm0, mm0 ; mm0=(R0 R2 R4 R6 ** ** ** **)
+ packuswb mm1, mm1 ; mm1=(R1 R3 R5 R7 ** ** ** **)
+
+ paddw mm2, mm4 ; mm2=((G-Y)E+YE)=GE=(G0 G2 G4 G6)
+ paddw mm3, mm5 ; mm3=((G-Y)O+YO)=GO=(G1 G3 G5 G7)
+ packuswb mm2, mm2 ; mm2=(G0 G2 G4 G6 ** ** ** **)
+ packuswb mm3, mm3 ; mm3=(G1 G3 G5 G7 ** ** ** **)
+
+ paddw mm4, MMWORD [wk(0)] ; mm4=(YE+(B-Y)E)=BE=(B0 B2 B4 B6)
+ paddw mm5, MMWORD [wk(1)] ; mm5=(YO+(B-Y)O)=BO=(B1 B3 B5 B7)
+ packuswb mm4, mm4 ; mm4=(B0 B2 B4 B6 ** ** ** **)
+ packuswb mm5, mm5 ; mm5=(B1 B3 B5 B7 ** ** ** **)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+ ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
+ ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
+ ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
+ ; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **)
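+ ;
+ ; The unpack sequence below interleaves the even/odd registers of the
+ ; three color components into memory-order triplets: bytes first
+ ; (punpcklbw), then words and dwords, yielding three mmwords that
+ ; cover eight pixels (24 bytes).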
+
+ punpcklbw mmA, mmC ; mmA=(00 10 02 12 04 14 06 16)
+ punpcklbw mmE, mmB ; mmE=(20 01 22 03 24 05 26 07)
+ punpcklbw mmD, mmF ; mmD=(11 21 13 23 15 25 17 27)
+
+ movq mmG, mmA
+ movq mmH, mmA
+ punpcklwd mmA, mmE ; mmA=(00 10 20 01 02 12 22 03)
+ punpckhwd mmG, mmE ; mmG=(04 14 24 05 06 16 26 07)
+
+ psrlq mmH, 2*BYTE_BIT ; mmH=(02 12 04 14 06 16 -- --)
+ psrlq mmE, 2*BYTE_BIT ; mmE=(22 03 24 05 26 07 -- --)
+
+ movq mmC, mmD
+ movq mmB, mmD
+ punpcklwd mmD, mmH ; mmD=(11 21 02 12 13 23 04 14)
+ punpckhwd mmC, mmH ; mmC=(15 25 06 16 17 27 -- --)
+
+ psrlq mmB, 2*BYTE_BIT ; mmB=(13 23 15 25 17 27 -- --)
+
+ movq mmF, mmE
+ punpcklwd mmE, mmB ; mmE=(22 03 13 23 24 05 15 25)
+ punpckhwd mmF, mmB ; mmF=(26 07 17 27 -- -- -- --)
+
+ punpckldq mmA, mmD ; mmA=(00 10 20 01 11 21 02 12)
+ punpckldq mmE, mmG ; mmE=(22 03 13 23 04 14 24 05)
+ punpckldq mmC, mmF ; mmC=(15 25 06 16 26 07 17 27)
+
+ cmp ecx, byte SIZEOF_MMWORD
+ jb short .column_st16
+
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
+ movq MMWORD [edi+1*SIZEOF_MMWORD], mmE
+ movq MMWORD [edi+2*SIZEOF_MMWORD], mmC
+
+ sub ecx, byte SIZEOF_MMWORD
+ jz short .nextrow
+
+ add esi, byte SIZEOF_MMWORD ; inptr0
+ add ebx, byte SIZEOF_MMWORD ; inptr1
+ add edx, byte SIZEOF_MMWORD ; inptr2
+ add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; outptr
+ jmp near .columnloop
+ alignx 16, 7
+
+.column_st16:
+ lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
+ cmp ecx, byte 2*SIZEOF_MMWORD
+ jb short .column_st8
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
+ movq MMWORD [edi+1*SIZEOF_MMWORD], mmE
+ movq mmA, mmC
+ sub ecx, byte 2*SIZEOF_MMWORD
+ add edi, byte 2*SIZEOF_MMWORD
+ jmp short .column_st4
+.column_st8:
+ cmp ecx, byte SIZEOF_MMWORD
+ jb short .column_st4
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
+ movq mmA, mmE
+ sub ecx, byte SIZEOF_MMWORD
+ add edi, byte SIZEOF_MMWORD
+.column_st4:
+ movd eax, mmA
+ cmp ecx, byte SIZEOF_DWORD
+ jb short .column_st2
+ mov dword [edi+0*SIZEOF_DWORD], eax
+ psrlq mmA, DWORD_BIT
+ movd eax, mmA
+ sub ecx, byte SIZEOF_DWORD
+ add edi, byte SIZEOF_DWORD
+.column_st2:
+ cmp ecx, byte SIZEOF_WORD
+ jb short .column_st1
+ mov word [edi+0*SIZEOF_WORD], ax
+ shr eax, WORD_BIT
+ sub ecx, byte SIZEOF_WORD
+ add edi, byte SIZEOF_WORD
+.column_st1:
+ cmp ecx, byte SIZEOF_BYTE
+ jb short .nextrow
+ mov byte [edi+0*SIZEOF_BYTE], al
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+ pcmpeqb mm6, mm6 ; mm6=(X0 X2 X4 X6 ** ** ** **)
+ pcmpeqb mm7, mm7 ; mm7=(X1 X3 X5 X7 ** ** ** **)
+%else
+ pxor mm6, mm6 ; mm6=(X0 X2 X4 X6 ** ** ** **)
+ pxor mm7, mm7 ; mm7=(X1 X3 X5 X7 ** ** ** **)
+%endif
+ ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
+ ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
+ ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
+ ; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **)
+
+ punpcklbw mmA, mmC ; mmA=(00 10 02 12 04 14 06 16)
+ punpcklbw mmE, mmG ; mmE=(20 30 22 32 24 34 26 36)
+ punpcklbw mmB, mmD ; mmB=(01 11 03 13 05 15 07 17)
+ punpcklbw mmF, mmH ; mmF=(21 31 23 33 25 35 27 37)
+
+ movq mmC, mmA
+ punpcklwd mmA, mmE ; mmA=(00 10 20 30 02 12 22 32)
+ punpckhwd mmC, mmE ; mmC=(04 14 24 34 06 16 26 36)
+ movq mmG, mmB
+ punpcklwd mmB, mmF ; mmB=(01 11 21 31 03 13 23 33)
+ punpckhwd mmG, mmF ; mmG=(05 15 25 35 07 17 27 37)
+
+ movq mmD, mmA
+ punpckldq mmA, mmB ; mmA=(00 10 20 30 01 11 21 31)
+ punpckhdq mmD, mmB ; mmD=(02 12 22 32 03 13 23 33)
+ movq mmH, mmC
+ punpckldq mmC, mmG ; mmC=(04 14 24 34 05 15 25 35)
+ punpckhdq mmH, mmG ; mmH=(06 16 26 36 07 17 27 37)
+
+ cmp ecx, byte SIZEOF_MMWORD
+ jb short .column_st16
+
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
+ movq MMWORD [edi+1*SIZEOF_MMWORD], mmD
+ movq MMWORD [edi+2*SIZEOF_MMWORD], mmC
+ movq MMWORD [edi+3*SIZEOF_MMWORD], mmH
+
+ sub ecx, byte SIZEOF_MMWORD
+ jz short .nextrow
+
+ add esi, byte SIZEOF_MMWORD ; inptr0
+ add ebx, byte SIZEOF_MMWORD ; inptr1
+ add edx, byte SIZEOF_MMWORD ; inptr2
+ add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; outptr
+ jmp near .columnloop
+ alignx 16, 7
+
+.column_st16:
+ cmp ecx, byte SIZEOF_MMWORD/2
+ jb short .column_st8
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
+ movq MMWORD [edi+1*SIZEOF_MMWORD], mmD
+ movq mmA, mmC
+ movq mmD, mmH
+ sub ecx, byte SIZEOF_MMWORD/2
+ add edi, byte 2*SIZEOF_MMWORD
+.column_st8:
+ cmp ecx, byte SIZEOF_MMWORD/4
+ jb short .column_st4
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
+ movq mmA, mmD
+ sub ecx, byte SIZEOF_MMWORD/4
+ add edi, byte 1*SIZEOF_MMWORD
+.column_st4:
+ cmp ecx, byte SIZEOF_MMWORD/8
+ jb short .nextrow
+ movd dword [edi+0*SIZEOF_DWORD], mmA
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+ alignx 16, 7
+
+.nextrow:
+ pop ecx
+ pop esi
+ pop ebx
+ pop edx
+ pop edi
+ pop eax
+
+ add esi, byte SIZEOF_JSAMPROW
+ add ebx, byte SIZEOF_JSAMPROW
+ add edx, byte SIZEOF_JSAMPROW
+ add edi, byte SIZEOF_JSAMPROW ; output_buf
+ dec eax ; num_rows
+ jg near .rowloop
+
+ emms ; empty MMX state
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jdcolext-sse2.asm b/media/libjpeg/simd/i386/jdcolext-sse2.asm
new file mode 100644
index 0000000000..d5572b3294
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdcolext-sse2.asm
@@ -0,0 +1,458 @@
+;
+; jdcolext.asm - colorspace conversion (SSE2)
+;
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2012, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_ycc_rgb_convert_sse2(JDIMENSION out_width, JSAMPIMAGE input_buf,
+; JDIMENSION input_row, JSAMPARRAY output_buf,
+; int num_rows)
+;
+
+%define out_width(b) (b) + 8 ; JDIMENSION out_width
+%define input_buf(b) (b) + 12 ; JSAMPIMAGE input_buf
+%define input_row(b) (b) + 16 ; JDIMENSION input_row
+%define output_buf(b) (b) + 20 ; JSAMPARRAY output_buf
+%define num_rows(b) (b) + 24 ; int num_rows
+
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+ ; xmmword wk[WK_NUM]
+%define WK_NUM 2
+%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_ycc_rgb_convert_sse2)
+
+EXTN(jsimd_ycc_rgb_convert_sse2):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)]
+ pushpic eax ; make room for the GOT address
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+ movpic POINTER [gotptr], ebx ; save GOT address
+
+ mov ecx, JDIMENSION [out_width(eax)] ; num_cols
+ test ecx, ecx
+ jz near .return
+
+ push ecx
+
+ mov edi, JSAMPIMAGE [input_buf(eax)]
+ mov ecx, JDIMENSION [input_row(eax)]
+ mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+ mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+ mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+ lea esi, [esi+ecx*SIZEOF_JSAMPROW]
+ lea ebx, [ebx+ecx*SIZEOF_JSAMPROW]
+ lea edx, [edx+ecx*SIZEOF_JSAMPROW]
+
+ pop ecx
+
+ mov edi, JSAMPARRAY [output_buf(eax)]
+ mov eax, INT [num_rows(eax)]
+ test eax, eax
+ jle near .return
+ alignx 16, 7
+.rowloop:
+ push eax
+ push edi
+ push edx
+ push ebx
+ push esi
+ push ecx ; col
+
+ mov esi, JSAMPROW [esi] ; inptr0
+ mov ebx, JSAMPROW [ebx] ; inptr1
+ mov edx, JSAMPROW [edx] ; inptr2
+ mov edi, JSAMPROW [edi] ; outptr
+ movpic eax, POINTER [gotptr] ; load GOT address (eax)
+ alignx 16, 7
+.columnloop:
+
+ movdqa xmm5, XMMWORD [ebx] ; xmm5=Cb(0123456789ABCDEF)
+ movdqa xmm1, XMMWORD [edx] ; xmm1=Cr(0123456789ABCDEF)
+
+ pcmpeqw xmm4, xmm4
+ pcmpeqw xmm7, xmm7
+ psrlw xmm4, BYTE_BIT
+ psllw xmm7, 7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+ movdqa xmm0, xmm4 ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..}
+
+ pand xmm4, xmm5 ; xmm4=Cb(02468ACE)=CbE
+ psrlw xmm5, BYTE_BIT ; xmm5=Cb(13579BDF)=CbO
+ pand xmm0, xmm1 ; xmm0=Cr(02468ACE)=CrE
+ psrlw xmm1, BYTE_BIT ; xmm1=Cr(13579BDF)=CrO
+
+ paddw xmm4, xmm7
+ paddw xmm5, xmm7
+ paddw xmm0, xmm7
+ paddw xmm1, xmm7
+
+ ; (Original)
+ ; R = Y + 1.40200 * Cr
+ ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+ ; B = Y + 1.77200 * Cb
+ ;
+ ; (This implementation)
+ ; R = Y + 0.40200 * Cr + Cr
+ ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+ ; B = Y - 0.22800 * Cb + Cb + Cb
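+ ;
+ ; The two G-channel products are fused: interleaving Cb with Cr and
+ ; multiplying by the packed constant PW_MF0344_F0285 lets one pmaddwd
+ ; compute Cb * -FIX(0.344) + Cr * FIX(0.285) per dword (rounded via
+ ; PD_ONEHALF and psrad), and the later psubw of Cr supplies the
+ ; remaining "- Cr" term from the rewrite above.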
+
+ movdqa xmm2, xmm4 ; xmm2=CbE
+ movdqa xmm3, xmm5 ; xmm3=CbO
+ paddw xmm4, xmm4 ; xmm4=2*CbE
+ paddw xmm5, xmm5 ; xmm5=2*CbO
+ movdqa xmm6, xmm0 ; xmm6=CrE
+ movdqa xmm7, xmm1 ; xmm7=CrO
+ paddw xmm0, xmm0 ; xmm0=2*CrE
+ paddw xmm1, xmm1 ; xmm1=2*CrO
+
+ pmulhw xmm4, [GOTOFF(eax,PW_MF0228)] ; xmm4=(2*CbE * -FIX(0.22800))
+ pmulhw xmm5, [GOTOFF(eax,PW_MF0228)] ; xmm5=(2*CbO * -FIX(0.22800))
+ pmulhw xmm0, [GOTOFF(eax,PW_F0402)] ; xmm0=(2*CrE * FIX(0.40200))
+ pmulhw xmm1, [GOTOFF(eax,PW_F0402)] ; xmm1=(2*CrO * FIX(0.40200))
+
+ paddw xmm4, [GOTOFF(eax,PW_ONE)]
+ paddw xmm5, [GOTOFF(eax,PW_ONE)]
+ psraw xmm4, 1 ; xmm4=(CbE * -FIX(0.22800))
+ psraw xmm5, 1 ; xmm5=(CbO * -FIX(0.22800))
+ paddw xmm0, [GOTOFF(eax,PW_ONE)]
+ paddw xmm1, [GOTOFF(eax,PW_ONE)]
+ psraw xmm0, 1 ; xmm0=(CrE * FIX(0.40200))
+ psraw xmm1, 1 ; xmm1=(CrO * FIX(0.40200))
+
+ paddw xmm4, xmm2
+ paddw xmm5, xmm3
+ paddw xmm4, xmm2 ; xmm4=(CbE * FIX(1.77200))=(B-Y)E
+ paddw xmm5, xmm3 ; xmm5=(CbO * FIX(1.77200))=(B-Y)O
+ paddw xmm0, xmm6 ; xmm0=(CrE * FIX(1.40200))=(R-Y)E
+ paddw xmm1, xmm7 ; xmm1=(CrO * FIX(1.40200))=(R-Y)O
+
+ movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=(B-Y)E
+ movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(B-Y)O
+
+ movdqa xmm4, xmm2
+ movdqa xmm5, xmm3
+ punpcklwd xmm2, xmm6
+ punpckhwd xmm4, xmm6
+ pmaddwd xmm2, [GOTOFF(eax,PW_MF0344_F0285)]
+ pmaddwd xmm4, [GOTOFF(eax,PW_MF0344_F0285)]
+ punpcklwd xmm3, xmm7
+ punpckhwd xmm5, xmm7
+ pmaddwd xmm3, [GOTOFF(eax,PW_MF0344_F0285)]
+ pmaddwd xmm5, [GOTOFF(eax,PW_MF0344_F0285)]
+
+ paddd xmm2, [GOTOFF(eax,PD_ONEHALF)]
+ paddd xmm4, [GOTOFF(eax,PD_ONEHALF)]
+ psrad xmm2, SCALEBITS
+ psrad xmm4, SCALEBITS
+ paddd xmm3, [GOTOFF(eax,PD_ONEHALF)]
+ paddd xmm5, [GOTOFF(eax,PD_ONEHALF)]
+ psrad xmm3, SCALEBITS
+ psrad xmm5, SCALEBITS
+
+ packssdw xmm2, xmm4 ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
+ packssdw xmm3, xmm5 ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
+ psubw xmm2, xmm6 ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
+ psubw xmm3, xmm7 ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
+
+ movdqa xmm5, XMMWORD [esi] ; xmm5=Y(0123456789ABCDEF)
+
+ pcmpeqw xmm4, xmm4
+ psrlw xmm4, BYTE_BIT ; xmm4={0xFF 0x00 0xFF 0x00 ..}
+ pand xmm4, xmm5 ; xmm4=Y(02468ACE)=YE
+ psrlw xmm5, BYTE_BIT ; xmm5=Y(13579BDF)=YO
+
+ paddw xmm0, xmm4 ; xmm0=((R-Y)E+YE)=RE=R(02468ACE)
+ paddw xmm1, xmm5 ; xmm1=((R-Y)O+YO)=RO=R(13579BDF)
+ packuswb xmm0, xmm0 ; xmm0=R(02468ACE********)
+ packuswb xmm1, xmm1 ; xmm1=R(13579BDF********)
+
+ paddw xmm2, xmm4 ; xmm2=((G-Y)E+YE)=GE=G(02468ACE)
+ paddw xmm3, xmm5 ; xmm3=((G-Y)O+YO)=GO=G(13579BDF)
+ packuswb xmm2, xmm2 ; xmm2=G(02468ACE********)
+ packuswb xmm3, xmm3 ; xmm3=G(13579BDF********)
+
+ paddw xmm4, XMMWORD [wk(0)] ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE)
+ paddw xmm5, XMMWORD [wk(1)] ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF)
+ packuswb xmm4, xmm4 ; xmm4=B(02468ACE********)
+ packuswb xmm5, xmm5 ; xmm5=B(13579BDF********)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+ ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+ ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+ ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+ ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
+
+ punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+ punpcklbw xmmE, xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
+ punpcklbw xmmD, xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
+
+ movdqa xmmG, xmmA
+ movdqa xmmH, xmmA
+ punpcklwd xmmA, xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
+ punpckhwd xmmG, xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
+
+ psrldq xmmH, 2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
+ psrldq xmmE, 2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
+
+ movdqa xmmC, xmmD
+ movdqa xmmB, xmmD
+ punpcklwd xmmD, xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
+ punpckhwd xmmC, xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
+
+ psrldq xmmB, 2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
+
+ movdqa xmmF, xmmE
+ punpcklwd xmmE, xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
+ punpckhwd xmmF, xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
+
+ pshufd xmmH, xmmA, 0x4E ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
+ movdqa xmmB, xmmE
+ punpckldq xmmA, xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
+ punpckldq xmmE, xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
+ punpckhdq xmmD, xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
+
+ pshufd xmmH, xmmG, 0x4E ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
+ movdqa xmmB, xmmF
+ punpckldq xmmG, xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
+ punpckldq xmmF, xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
+ punpckhdq xmmC, xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
+
+ punpcklqdq xmmA, xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+ punpcklqdq xmmD, xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ punpcklqdq xmmF, xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+ cmp ecx, byte SIZEOF_XMMWORD
+ jb short .column_st32
+
+ test edi, SIZEOF_XMMWORD-1
+ jnz short .out1
+ ; --(aligned)-------------------
+ movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
+ jmp short .out0
+.out1: ; --(unaligned)-----------------
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
+.out0:
+ add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
+ sub ecx, byte SIZEOF_XMMWORD
+ jz near .nextrow
+
+ add esi, byte SIZEOF_XMMWORD ; inptr0
+ add ebx, byte SIZEOF_XMMWORD ; inptr1
+ add edx, byte SIZEOF_XMMWORD ; inptr2
+ jmp near .columnloop
+ alignx 16, 7
+
+.column_st32:
+ lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
+ cmp ecx, byte 2*SIZEOF_XMMWORD
+ jb short .column_st16
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ add edi, byte 2*SIZEOF_XMMWORD ; outptr
+ movdqa xmmA, xmmF
+ sub ecx, byte 2*SIZEOF_XMMWORD
+ jmp short .column_st15
+.column_st16:
+ cmp ecx, byte SIZEOF_XMMWORD
+ jb short .column_st15
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ add edi, byte SIZEOF_XMMWORD ; outptr
+ movdqa xmmA, xmmD
+ sub ecx, byte SIZEOF_XMMWORD
+.column_st15:
+ ; Store the lower 8 bytes of xmmA to the output when it has enough
+ ; space.
+ cmp ecx, byte SIZEOF_MMWORD
+ jb short .column_st7
+ movq XMM_MMWORD [edi], xmmA
+ add edi, byte SIZEOF_MMWORD
+ sub ecx, byte SIZEOF_MMWORD
+ psrldq xmmA, SIZEOF_MMWORD
+.column_st7:
+ ; Store the lower 4 bytes of xmmA to the output when it has enough
+ ; space.
+ cmp ecx, byte SIZEOF_DWORD
+ jb short .column_st3
+ movd XMM_DWORD [edi], xmmA
+ add edi, byte SIZEOF_DWORD
+ sub ecx, byte SIZEOF_DWORD
+ psrldq xmmA, SIZEOF_DWORD
+.column_st3:
+ ; Store the lower 2 bytes of eax to the output when it has enough
+ ; space.
+ movd eax, xmmA
+ cmp ecx, byte SIZEOF_WORD
+ jb short .column_st1
+ mov word [edi], ax
+ add edi, byte SIZEOF_WORD
+ sub ecx, byte SIZEOF_WORD
+ shr eax, 16
+.column_st1:
+ ; Store the lower 1 byte of eax to the output when it has enough
+ ; space.
+ test ecx, ecx
+ jz short .nextrow
+ mov byte [edi], al
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+ pcmpeqb xmm6, xmm6 ; xmm6=XE=X(02468ACE********)
+ pcmpeqb xmm7, xmm7 ; xmm7=XO=X(13579BDF********)
+%else
+ pxor xmm6, xmm6 ; xmm6=XE=X(02468ACE********)
+ pxor xmm7, xmm7 ; xmm7=XO=X(13579BDF********)
+%endif
+ ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+ ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+ ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+ ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
+
+ punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+ punpcklbw xmmE, xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
+ punpcklbw xmmB, xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
+ punpcklbw xmmF, xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
+
+ movdqa xmmC, xmmA
+ punpcklwd xmmA, xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
+ punpckhwd xmmC, xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
+ movdqa xmmG, xmmB
+ punpcklwd xmmB, xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
+ punpckhwd xmmG, xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
+
+ movdqa xmmD, xmmA
+ punpckldq xmmA, xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+ punpckhdq xmmD, xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+ movdqa xmmH, xmmC
+ punpckldq xmmC, xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+ punpckhdq xmmH, xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+ cmp ecx, byte SIZEOF_XMMWORD
+ jb short .column_st32
+
+ test edi, SIZEOF_XMMWORD-1
+ jnz short .out1
+ ; --(aligned)-------------------
+ movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
+ movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
+ jmp short .out0
+.out1: ; --(unaligned)-----------------
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
+ movdqu XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
+.out0:
+ add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
+ sub ecx, byte SIZEOF_XMMWORD
+ jz near .nextrow
+
+ add esi, byte SIZEOF_XMMWORD ; inptr0
+ add ebx, byte SIZEOF_XMMWORD ; inptr1
+ add edx, byte SIZEOF_XMMWORD ; inptr2
+ jmp near .columnloop
+ alignx 16, 7
+
+.column_st32:
+ cmp ecx, byte SIZEOF_XMMWORD/2
+ jb short .column_st16
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ add edi, byte 2*SIZEOF_XMMWORD ; outptr
+ movdqa xmmA, xmmC
+ movdqa xmmD, xmmH
+ sub ecx, byte SIZEOF_XMMWORD/2
+.column_st16:
+ cmp ecx, byte SIZEOF_XMMWORD/4
+ jb short .column_st15
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ add edi, byte SIZEOF_XMMWORD ; outptr
+ movdqa xmmA, xmmD
+ sub ecx, byte SIZEOF_XMMWORD/4
+.column_st15:
+ ; Store two pixels (8 bytes) of xmmA to the output when it has enough
+ ; space.
+ cmp ecx, byte SIZEOF_XMMWORD/8
+ jb short .column_st7
+ movq XMM_MMWORD [edi], xmmA
+ add edi, byte SIZEOF_XMMWORD/8*4
+ sub ecx, byte SIZEOF_XMMWORD/8
+ psrldq xmmA, SIZEOF_XMMWORD/8*4
+.column_st7:
+ ; Store one pixel (4 bytes) of xmmA to the output when it has enough
+ ; space.
+ test ecx, ecx
+ jz short .nextrow
+ movd XMM_DWORD [edi], xmmA
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+ alignx 16, 7
+
+.nextrow:
+ pop ecx
+ pop esi
+ pop ebx
+ pop edx
+ pop edi
+ pop eax
+
+ add esi, byte SIZEOF_JSAMPROW
+ add ebx, byte SIZEOF_JSAMPROW
+ add edx, byte SIZEOF_JSAMPROW
+ add edi, byte SIZEOF_JSAMPROW ; output_buf
+ dec eax ; num_rows
+ jg near .rowloop
+
+ sfence ; flush the write buffer
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jdcolor-avx2.asm b/media/libjpeg/simd/i386/jdcolor-avx2.asm
new file mode 100644
index 0000000000..e05b60d001
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdcolor-avx2.asm
@@ -0,0 +1,118 @@
+;
+; jdcolor.asm - colorspace conversion (AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS 16
+
+F_0_344 equ 22554 ; FIX(0.34414)
+F_0_714 equ 46802 ; FIX(0.71414)
+F_1_402 equ 91881 ; FIX(1.40200)
+F_1_772 equ 116130 ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1)
+F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414)
+F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
+
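These are the usual libjpeg fixed-point constants: FIX(x) is x scaled by 2^SCALEBITS and rounded, and the derived F_0_402/F_0_285/F_0_228 values are the remainders used by the rearranged equations in the included conversion code. A standalone check of the values (a sketch, not library code):

    #include <stdio.h>

    #define SCALEBITS 16
    #define FIX(x)  ((long)((x) * (1L << SCALEBITS) + 0.5))

    int main(void)
    {
      printf("%ld\n", FIX(0.34414));           /* 22554  = F_0_344 */
      printf("%ld\n", FIX(0.71414));           /* 46802  = F_0_714 */
      printf("%ld\n", FIX(1.40200) - 65536);   /* 26345  = F_0_402 */
      printf("%ld\n", 65536 - FIX(0.71414));   /* 18734  = F_0_285 */
      printf("%ld\n", 131072 - FIX(1.77200));  /* 14942  = F_0_228 */
      return 0;
    }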
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_ycc_rgb_convert_avx2)
+
+EXTN(jconst_ycc_rgb_convert_avx2):
+
+PW_F0402 times 16 dw F_0_402
+PW_MF0228 times 16 dw -F_0_228
+PW_MF0344_F0285 times 8 dw -F_0_344, F_0_285
+PW_ONE times 16 dw 1
+PD_ONEHALF times 8 dd 1 << (SCALEBITS - 1)
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+
+%include "jdcolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extrgb_convert_avx2
+%include "jdcolext-avx2.asm"
+
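NASM has no template mechanism, so this file instantiates the generic jdcolext-avx2.asm body once per pixel format: each block below redefines the RGB_* layout macros and renames the entry point, then re-includes the implementation. A C analogue of the idiom, sketched with illustrative names:

    #include <stdio.h>

    /* One generic body, expanded once per pixel layout by re-supplying
       the parameter macros -- the same trick as %define + %include. */
    #define DEFINE_STORE(name, R, G, B)                                 \
      static void name(const unsigned char px[3], unsigned char out[3]) \
      {                                                                 \
        out[R] = px[0]; out[G] = px[1]; out[B] = px[2];                 \
      }

    DEFINE_STORE(store_rgb, 0, 1, 2)   /* RGB byte order */
    DEFINE_STORE(store_bgr, 2, 1, 0)   /* BGR byte order */

    int main(void)
    {
      unsigned char px[3] = { 1, 2, 3 }, out[3];
      store_bgr(px, out);
      printf("%u %u %u\n", out[0], out[1], out[2]);   /* 3 2 1 */
      return 0;
    }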
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extrgbx_convert_avx2
+%include "jdcolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extbgr_convert_avx2
+%include "jdcolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extbgrx_convert_avx2
+%include "jdcolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extxbgr_convert_avx2
+%include "jdcolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extxrgb_convert_avx2
+%include "jdcolext-avx2.asm"
diff --git a/media/libjpeg/simd/i386/jdcolor-mmx.asm b/media/libjpeg/simd/i386/jdcolor-mmx.asm
new file mode 100644
index 0000000000..fb7e7bcce4
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdcolor-mmx.asm
@@ -0,0 +1,117 @@
+;
+; jdcolor.asm - colorspace conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS 16
+
+F_0_344 equ 22554 ; FIX(0.34414)
+F_0_714 equ 46802 ; FIX(0.71414)
+F_1_402 equ 91881 ; FIX(1.40200)
+F_1_772 equ 116130 ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1)
+F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414)
+F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_ycc_rgb_convert_mmx)
+
+EXTN(jconst_ycc_rgb_convert_mmx):
+
+PW_F0402 times 4 dw F_0_402
+PW_MF0228 times 4 dw -F_0_228
+PW_MF0344_F0285 times 2 dw -F_0_344, F_0_285
+PW_ONE times 4 dw 1
+PD_ONEHALF times 2 dd 1 << (SCALEBITS - 1)
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+
+%include "jdcolext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extrgb_convert_mmx
+%include "jdcolext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extrgbx_convert_mmx
+%include "jdcolext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extbgr_convert_mmx
+%include "jdcolext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extbgrx_convert_mmx
+%include "jdcolext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extxbgr_convert_mmx
+%include "jdcolext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extxrgb_convert_mmx
+%include "jdcolext-mmx.asm"
diff --git a/media/libjpeg/simd/i386/jdcolor-sse2.asm b/media/libjpeg/simd/i386/jdcolor-sse2.asm
new file mode 100644
index 0000000000..b736255317
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdcolor-sse2.asm
@@ -0,0 +1,117 @@
+;
+; jdcolor.asm - colorspace conversion (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS 16
+
+F_0_344 equ 22554 ; FIX(0.34414)
+F_0_714 equ 46802 ; FIX(0.71414)
+F_1_402 equ 91881 ; FIX(1.40200)
+F_1_772 equ 116130 ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1)
+F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414)
+F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_ycc_rgb_convert_sse2)
+
+EXTN(jconst_ycc_rgb_convert_sse2):
+
+PW_F0402 times 8 dw F_0_402
+PW_MF0228 times 8 dw -F_0_228
+PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285
+PW_ONE times 8 dw 1
+PD_ONEHALF times 4 dd 1 << (SCALEBITS - 1)
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+
+%include "jdcolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgb_convert_sse2
+%include "jdcolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgbx_convert_sse2
+%include "jdcolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgr_convert_sse2
+%include "jdcolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgrx_convert_sse2
+%include "jdcolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxbgr_convert_sse2
+%include "jdcolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxrgb_convert_sse2
+%include "jdcolext-sse2.asm"
diff --git a/media/libjpeg/simd/i386/jdmerge-avx2.asm b/media/libjpeg/simd/i386/jdmerge-avx2.asm
new file mode 100644
index 0000000000..711e6792d0
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdmerge-avx2.asm
@@ -0,0 +1,136 @@
+;
+; jdmerge.asm - merged upsampling/color conversion (AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS 16
+
+F_0_344 equ 22554 ; FIX(0.34414)
+F_0_714 equ 46802 ; FIX(0.71414)
+F_1_402 equ 91881 ; FIX(1.40200)
+F_1_772 equ 116130 ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1)
+F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414)
+F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_merged_upsample_avx2)
+
+EXTN(jconst_merged_upsample_avx2):
+
+PW_F0402 times 16 dw F_0_402
+PW_MF0228 times 16 dw -F_0_228
+PW_MF0344_F0285 times 8 dw -F_0_344, F_0_285
+PW_ONE times 16 dw 1
+PD_ONEHALF times 8 dd 1 << (SCALEBITS - 1)
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+
+%include "jdmrgext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+ jsimd_h2v1_extrgb_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+ jsimd_h2v2_extrgb_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+ jsimd_h2v1_extrgbx_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+ jsimd_h2v2_extrgbx_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+ jsimd_h2v1_extbgr_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+ jsimd_h2v2_extbgr_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+ jsimd_h2v1_extbgrx_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+ jsimd_h2v2_extbgrx_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+ jsimd_h2v1_extxbgr_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+ jsimd_h2v2_extxbgr_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+ jsimd_h2v1_extxrgb_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+ jsimd_h2v2_extxrgb_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
diff --git a/media/libjpeg/simd/i386/jdmerge-mmx.asm b/media/libjpeg/simd/i386/jdmerge-mmx.asm
new file mode 100644
index 0000000000..6e8311d408
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdmerge-mmx.asm
@@ -0,0 +1,123 @@
+;
+; jdmerge.asm - merged upsampling/color conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS 16
+
+F_0_344 equ 22554 ; FIX(0.34414)
+F_0_714 equ 46802 ; FIX(0.71414)
+F_1_402 equ 91881 ; FIX(1.40200)
+F_1_772 equ 116130 ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1)
+F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414)
+F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_merged_upsample_mmx)
+
+EXTN(jconst_merged_upsample_mmx):
+
+PW_F0402 times 4 dw F_0_402
+PW_MF0228 times 4 dw -F_0_228
+PW_MF0344_F0285 times 2 dw -F_0_344, F_0_285
+PW_ONE times 4 dw 1
+PD_ONEHALF times 2 dd 1 << (SCALEBITS - 1)
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+
+%include "jdmrgext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extrgb_merged_upsample_mmx
+%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extrgb_merged_upsample_mmx
+%include "jdmrgext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extrgbx_merged_upsample_mmx
+%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extrgbx_merged_upsample_mmx
+%include "jdmrgext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extbgr_merged_upsample_mmx
+%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extbgr_merged_upsample_mmx
+%include "jdmrgext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extbgrx_merged_upsample_mmx
+%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extbgrx_merged_upsample_mmx
+%include "jdmrgext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extxbgr_merged_upsample_mmx
+%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extxbgr_merged_upsample_mmx
+%include "jdmrgext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extxrgb_merged_upsample_mmx
+%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extxrgb_merged_upsample_mmx
+%include "jdmrgext-mmx.asm"
diff --git a/media/libjpeg/simd/i386/jdmerge-sse2.asm b/media/libjpeg/simd/i386/jdmerge-sse2.asm
new file mode 100644
index 0000000000..e32f90aa17
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdmerge-sse2.asm
@@ -0,0 +1,135 @@
+;
+; jdmerge.asm - merged upsampling/color conversion (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS 16
+
+F_0_344 equ 22554 ; FIX(0.34414)
+F_0_714 equ 46802 ; FIX(0.71414)
+F_1_402 equ 91881 ; FIX(1.40200)
+F_1_772 equ 116130 ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1)
+F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414)
+F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_merged_upsample_sse2)
+
+EXTN(jconst_merged_upsample_sse2):
+
+PW_F0402 times 8 dw F_0_402
+PW_MF0228 times 8 dw -F_0_228
+PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285
+PW_ONE times 8 dw 1
+PD_ONEHALF times 4 dd 1 << (SCALEBITS - 1)
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+
+%include "jdmrgext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+ jsimd_h2v1_extrgb_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+ jsimd_h2v2_extrgb_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+ jsimd_h2v1_extrgbx_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+ jsimd_h2v2_extrgbx_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+ jsimd_h2v1_extbgr_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+ jsimd_h2v2_extbgr_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+ jsimd_h2v1_extbgrx_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+ jsimd_h2v2_extbgrx_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+ jsimd_h2v1_extxbgr_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+ jsimd_h2v2_extxbgr_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+ jsimd_h2v1_extxrgb_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+ jsimd_h2v2_extxrgb_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"
diff --git a/media/libjpeg/simd/i386/jdmrgext-avx2.asm b/media/libjpeg/simd/i386/jdmrgext-avx2.asm
new file mode 100644
index 0000000000..e35f7282bc
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdmrgext-avx2.asm
@@ -0,0 +1,575 @@
+;
+; jdmrgext.asm - merged upsampling/color conversion (AVX2)
+;
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2012, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v1_merged_upsample_avx2(JDIMENSION output_width,
+; JSAMPIMAGE input_buf,
+; JDIMENSION in_row_group_ctr,
+; JSAMPARRAY output_buf);
+;
+
+%define output_width(b) (b) + 8 ; JDIMENSION output_width
+%define input_buf(b) (b) + 12 ; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b) (b) + 16 ; JDIMENSION in_row_group_ctr
+%define output_buf(b) (b) + 20 ; JSAMPARRAY output_buf
+
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD
+ ; ymmword wk[WK_NUM]
+%define WK_NUM 3
+%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v1_merged_upsample_avx2)
+
+EXTN(jsimd_h2v1_merged_upsample_avx2):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_YMMWORD) ; align to 256 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)]
+ pushpic eax ; make room for the GOT address
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+ movpic POINTER [gotptr], ebx ; save GOT address
+
+ mov ecx, JDIMENSION [output_width(eax)] ; col
+ test ecx, ecx
+ jz near .return
+
+ push ecx
+
+ mov edi, JSAMPIMAGE [input_buf(eax)]
+ mov ecx, JDIMENSION [in_row_group_ctr(eax)]
+ mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+ mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+ mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+ mov edi, JSAMPARRAY [output_buf(eax)]
+ mov esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW] ; inptr0
+ mov ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW] ; inptr1
+ mov edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW] ; inptr2
+ mov edi, JSAMPROW [edi] ; outptr
+
+ pop ecx ; col
+
+ alignx 16, 7
+.columnloop:
+ movpic eax, POINTER [gotptr] ; load GOT address (eax)
+
+ vmovdqu ymm6, YMMWORD [ebx] ; ymm6=Cb(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+ vmovdqu ymm7, YMMWORD [edx] ; ymm7=Cr(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+
+ vpxor ymm1, ymm1, ymm1 ; ymm1=(all 0's)
+ vpcmpeqw ymm3, ymm3, ymm3
+ vpsllw ymm3, ymm3, 7 ; ymm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+ vpermq ymm6, ymm6, 0xd8 ; ymm6=Cb(01234567GHIJKLMN89ABCDEFOPQRSTUV)
+ vpermq ymm7, ymm7, 0xd8 ; ymm7=Cr(01234567GHIJKLMN89ABCDEFOPQRSTUV)
+ vpunpcklbw ymm4, ymm6, ymm1 ; ymm4=Cb(0123456789ABCDEF)=CbL
+ vpunpckhbw ymm6, ymm6, ymm1 ; ymm6=Cb(GHIJKLMNOPQRSTUV)=CbH
+ vpunpcklbw ymm0, ymm7, ymm1 ; ymm0=Cr(0123456789ABCDEF)=CrL
+ vpunpckhbw ymm7, ymm7, ymm1 ; ymm7=Cr(GHIJKLMNOPQRSTUV)=CrH
+
+ vpaddw ymm5, ymm6, ymm3
+ vpaddw ymm2, ymm4, ymm3
+ vpaddw ymm1, ymm7, ymm3
+ vpaddw ymm3, ymm0, ymm3
+
+ ; (Original)
+ ; R = Y + 1.40200 * Cr
+ ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+ ; B = Y + 1.77200 * Cb
+ ;
+ ; (This implementation)
+ ; R = Y + 0.40200 * Cr + Cr
+ ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+ ; B = Y - 0.22800 * Cb + Cb + Cb
+
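The rearranged form exists because FIX(1.40200) = 91881 and FIX(1.77200) = 116130 do not fit in the signed 16-bit multiplier that pmulhw takes, so the code multiplies by the sub-unity remainder (on a doubled operand, rounded back down via the PW_ONE/psraw pair) and adds the whole Cr or Cb back in separately. A scalar sketch of the R channel, assuming arithmetic right shifts on negative values as pmulhw/psraw provide:

    #include <math.h>
    #include <stdio.h>
    #include <stdlib.h>

    int main(void)
    {
      long f_0_402 = 26345;   /* FIX(1.40200) - FIX(1) */
      long maxdiff = 0;
      for (long cr = -128; cr < 128; cr++) {
        /* pmulhw(2*Cr, F_0_402) -> paddw PW_ONE -> psraw 1 -> add Cr */
        long approx = ((((2 * cr * f_0_402) >> 16) + 1) >> 1) + cr;
        long exact  = lround(1.402 * cr);
        if (labs(approx - exact) > maxdiff) maxdiff = labs(approx - exact);
      }
      printf("max |approx - exact| = %ld\n", maxdiff);   /* stays within 1 */
      return 0;
    }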
+ vpaddw ymm6, ymm5, ymm5 ; ymm6=2*CbH
+ vpaddw ymm4, ymm2, ymm2 ; ymm4=2*CbL
+ vpaddw ymm7, ymm1, ymm1 ; ymm7=2*CrH
+ vpaddw ymm0, ymm3, ymm3 ; ymm0=2*CrL
+
+ vpmulhw ymm6, ymm6, [GOTOFF(eax,PW_MF0228)] ; ymm6=(2*CbH * -FIX(0.22800))
+ vpmulhw ymm4, ymm4, [GOTOFF(eax,PW_MF0228)] ; ymm4=(2*CbL * -FIX(0.22800))
+ vpmulhw ymm7, ymm7, [GOTOFF(eax,PW_F0402)] ; ymm7=(2*CrH * FIX(0.40200))
+ vpmulhw ymm0, ymm0, [GOTOFF(eax,PW_F0402)] ; ymm0=(2*CrL * FIX(0.40200))
+
+ vpaddw ymm6, ymm6, [GOTOFF(eax,PW_ONE)]
+ vpaddw ymm4, ymm4, [GOTOFF(eax,PW_ONE)]
+ vpsraw ymm6, ymm6, 1 ; ymm6=(CbH * -FIX(0.22800))
+ vpsraw ymm4, ymm4, 1 ; ymm4=(CbL * -FIX(0.22800))
+ vpaddw ymm7, ymm7, [GOTOFF(eax,PW_ONE)]
+ vpaddw ymm0, ymm0, [GOTOFF(eax,PW_ONE)]
+ vpsraw ymm7, ymm7, 1 ; ymm7=(CrH * FIX(0.40200))
+ vpsraw ymm0, ymm0, 1 ; ymm0=(CrL * FIX(0.40200))
+
+ vpaddw ymm6, ymm6, ymm5
+ vpaddw ymm4, ymm4, ymm2
+ vpaddw ymm6, ymm6, ymm5 ; ymm6=(CbH * FIX(1.77200))=(B-Y)H
+ vpaddw ymm4, ymm4, ymm2 ; ymm4=(CbL * FIX(1.77200))=(B-Y)L
+ vpaddw ymm7, ymm7, ymm1 ; ymm7=(CrH * FIX(1.40200))=(R-Y)H
+ vpaddw ymm0, ymm0, ymm3 ; ymm0=(CrL * FIX(1.40200))=(R-Y)L
+
+ vmovdqa YMMWORD [wk(0)], ymm6 ; wk(0)=(B-Y)H
+ vmovdqa YMMWORD [wk(1)], ymm7 ; wk(1)=(R-Y)H
+
+ vpunpckhwd ymm6, ymm5, ymm1
+ vpunpcklwd ymm5, ymm5, ymm1
+ vpmaddwd ymm5, ymm5, [GOTOFF(eax,PW_MF0344_F0285)]
+ vpmaddwd ymm6, ymm6, [GOTOFF(eax,PW_MF0344_F0285)]
+ vpunpckhwd ymm7, ymm2, ymm3
+ vpunpcklwd ymm2, ymm2, ymm3
+ vpmaddwd ymm2, ymm2, [GOTOFF(eax,PW_MF0344_F0285)]
+ vpmaddwd ymm7, ymm7, [GOTOFF(eax,PW_MF0344_F0285)]
+
+ vpaddd ymm5, ymm5, [GOTOFF(eax,PD_ONEHALF)]
+ vpaddd ymm6, ymm6, [GOTOFF(eax,PD_ONEHALF)]
+ vpsrad ymm5, ymm5, SCALEBITS
+ vpsrad ymm6, ymm6, SCALEBITS
+ vpaddd ymm2, ymm2, [GOTOFF(eax,PD_ONEHALF)]
+ vpaddd ymm7, ymm7, [GOTOFF(eax,PD_ONEHALF)]
+ vpsrad ymm2, ymm2, SCALEBITS
+ vpsrad ymm7, ymm7, SCALEBITS
+
+ vpackssdw ymm5, ymm5, ymm6 ; ymm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
+ vpackssdw ymm2, ymm2, ymm7 ; ymm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
+ vpsubw ymm5, ymm5, ymm1 ; ymm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
+ vpsubw ymm2, ymm2, ymm3 ; ymm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
+
+ vmovdqa YMMWORD [wk(2)], ymm5 ; wk(2)=(G-Y)H
+
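The green term leans on pmaddwd: interleaving each (Cb, Cr) pair as 16-bit words and multiplying by the constant pair (-FIX(0.34414), FIX(0.28586)) yields -0.344*Cb + 0.285*Cr in one instruction, and the trailing vpsubw of Cr turns the +0.28586 into the required -0.71414. A scalar check of the same arithmetic (a standalone sketch, not library code):

    #include <math.h>
    #include <stdio.h>
    #include <stdlib.h>

    int main(void)
    {
      long mf0344 = -22554, f0285 = 18734;   /* -FIX(0.34414), FIX(0.28586) */
      long onehalf = 1L << 15;               /* PD_ONEHALF, SCALEBITS = 16  */
      long maxdiff = 0;
      for (long cb = -128; cb < 128; cb++)
        for (long cr = -128; cr < 128; cr++) {
          /* pmaddwd pair-product, rounded, then psubw of Cr */
          long approx = ((cb * mf0344 + cr * f0285 + onehalf) >> 16) - cr;
          long exact  = lround(-0.34414 * cb - 0.71414 * cr);
          if (labs(approx - exact) > maxdiff) maxdiff = labs(approx - exact);
        }
      printf("max |approx - exact| = %ld\n", maxdiff);   /* stays within 1 */
      return 0;
    }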
+ mov al, 2 ; Yctr
+ jmp short .Yloop_1st
+ alignx 16, 7
+
+.Yloop_2nd:
+ vmovdqa ymm0, YMMWORD [wk(1)] ; ymm0=(R-Y)H
+ vmovdqa ymm2, YMMWORD [wk(2)] ; ymm2=(G-Y)H
+ vmovdqa ymm4, YMMWORD [wk(0)] ; ymm4=(B-Y)H
+ alignx 16, 7
+
+.Yloop_1st:
+ vmovdqu ymm7, YMMWORD [esi] ; ymm7=Y(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+
+ vpcmpeqw ymm6, ymm6, ymm6
+ vpsrlw ymm6, ymm6, BYTE_BIT ; ymm6={0xFF 0x00 0xFF 0x00 ..}
+ vpand ymm6, ymm6, ymm7 ; ymm6=Y(02468ACEGIKMOQSU)=YE
+ vpsrlw ymm7, ymm7, BYTE_BIT ; ymm7=Y(13579BDFHJLNPRTV)=YO
+
+ vmovdqa ymm1, ymm0 ; ymm1=ymm0=(R-Y)(L/H)
+ vmovdqa ymm3, ymm2 ; ymm3=ymm2=(G-Y)(L/H)
+ vmovdqa ymm5, ymm4 ; ymm5=ymm4=(B-Y)(L/H)
+
+ vpaddw ymm0, ymm0, ymm6 ; ymm0=((R-Y)+YE)=RE=R(02468ACEGIKMOQSU)
+ vpaddw ymm1, ymm1, ymm7 ; ymm1=((R-Y)+YO)=RO=R(13579BDFHJLNPRTV)
+ vpackuswb ymm0, ymm0, ymm0 ; ymm0=R(02468ACE********GIKMOQSU********)
+ vpackuswb ymm1, ymm1, ymm1 ; ymm1=R(13579BDF********HJLNPRTV********)
+
+ vpaddw ymm2, ymm2, ymm6 ; ymm2=((G-Y)+YE)=GE=G(02468ACEGIKMOQSU)
+ vpaddw ymm3, ymm3, ymm7 ; ymm3=((G-Y)+YO)=GO=G(13579BDFHJLNPRTV)
+ vpackuswb ymm2, ymm2, ymm2 ; ymm2=G(02468ACE********GIKMOQSU********)
+ vpackuswb ymm3, ymm3, ymm3 ; ymm3=G(13579BDF********HJLNPRTV********)
+
+ vpaddw ymm4, ymm4, ymm6 ; ymm4=((B-Y)+YE)=BE=B(02468ACEGIKMOQSU)
+ vpaddw ymm5, ymm5, ymm7 ; ymm5=((B-Y)+YO)=BO=B(13579BDFHJLNPRTV)
+ vpackuswb ymm4, ymm4, ymm4 ; ymm4=B(02468ACE********GIKMOQSU********)
+ vpackuswb ymm5, ymm5, ymm5 ; ymm5=B(13579BDF********HJLNPRTV********)
+
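This is the merged-upsampling core: one set of (R-Y), (G-Y), (B-Y) differences is computed per chroma sample, and the mask/shift pair splits Y into even and odd samples so each difference is added to the two luma samples it covers, folding 2:1 horizontal upsampling into the color conversion. A scalar sketch for one channel, with clamp standing in for packuswb saturation:

    #include <stdio.h>

    static unsigned char clamp(int v) { return v < 0 ? 0 : v > 255 ? 255 : v; }

    int main(void)
    {
      unsigned char y[8] = { 16, 32, 64, 96, 128, 160, 200, 235 };
      int r_minus_y[4]   = { 10, -5, 0, 20 };   /* one value per Cr sample */
      unsigned char r[8];
      for (int i = 0; i < 4; i++) {
        r[2*i]   = clamp(y[2*i]   + r_minus_y[i]);   /* even sample, YE */
        r[2*i+1] = clamp(y[2*i+1] + r_minus_y[i]);   /* odd sample,  YO */
      }
      for (int i = 0; i < 8; i++) printf("%u ", r[i]);
      putchar('\n');
      return 0;
    }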
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+ ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **)
+ ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **)
+ ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **)
+ ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **)
+ ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **)
+ ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **)
+ ; ymmG=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **)
+ ; ymmH=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **)
+
+ vpunpcklbw ymmA, ymmA, ymmC ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
+ ; 0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U)
+ vpunpcklbw ymmE, ymmE, ymmB ; ymmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F
+ ; 2G 0H 2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V)
+ vpunpcklbw ymmD, ymmD, ymmF ; ymmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F
+ ; 1H 2H 1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V)
+
+ vpsrldq ymmH, ymmA, 2 ; ymmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E 0G 1G
+ ; 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U -- --)
+ vpunpckhwd ymmG, ymmA, ymmE ; ymmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F
+ ; 0O 1O 2O 0P 0Q 1Q 2Q 0R 0S 1S 2S 0T 0U 1U 2U 0V)
+ vpunpcklwd ymmA, ymmA, ymmE ; ymmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07
+ ; 0G 1G 2G 0H 0I 1I 2I 0J 0K 1K 2K 0L 0M 1M 2M 0N)
+
+ vpsrldq ymmE, ymmE, 2 ; ymmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F 2G 0H
+ ; 2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V -- --)
+
+ vpsrldq ymmB, ymmD, 2 ; ymmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F 1H 2H
+ ; 1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V -- --)
+ vpunpckhwd ymmC, ymmD, ymmH ; ymmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F 0G 1G
+ ; 1P 2P 0Q 1Q 1R 2R 0S 1S 1T 2T 0U 1U 1V 2V -- --)
+ vpunpcklwd ymmD, ymmD, ymmH ; ymmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18
+ ; 1H 2H 0I 1I 1J 2J 0K 1K 1L 2L 0M 1M 1N 2N 0O 1O)
+
+ vpunpckhwd ymmF, ymmE, ymmB ; ymmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F 2G 0H 1H 2H
+ ; 2Q 0R 1R 2R 2S 0T 1T 2T 2U 0V 1V 2V -- -- -- --)
+ vpunpcklwd ymmE, ymmE, ymmB ; ymmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29
+ ; 2I 0J 1J 2J 2K 0L 1L 2L 2M 0N 1N 2N 2O 0P 1P 2P)
+
+ vpshufd ymmH, ymmA, 0x4E ; ymmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03
+ ; 0K 1K 2K 0L 0M 1M 2M 0N 0G 1G 2G 0H 0I 1I 2I 0J)
+ vpunpckldq ymmA, ymmA, ymmD ; ymmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14
+ ; 0G 1G 2G 0H 1H 2H 0I 1I 0I 1I 2I 0J 1J 2J 0K 1K)
+ vpunpckhdq ymmD, ymmD, ymmE ; ymmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29
+ ; 1L 2L 0M 1M 2M 0N 1N 2N 1N 2N 0O 1O 2O 0P 1P 2P)
+ vpunpckldq ymmE, ymmE, ymmH ; ymmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07
+ ; 2I 0J 1J 2J 0K 1K 2K 0L 2K 0L 1L 2L 0M 1M 2M 0N)
+
+ vpshufd ymmH, ymmG, 0x4E ; ymmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B
+ ; 0S 1S 2S 0T 0U 1U 2U 0V 0O 1O 2O 0P 0Q 1Q 2Q 0R)
+ vpunpckldq ymmG, ymmG, ymmC ; ymmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C
+ ; 0O 1O 2O 0P 1P 2P 0Q 1Q 0Q 1Q 2Q 0R 1R 2R 0S 1S)
+ vpunpckhdq ymmC, ymmC, ymmF ; ymmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F 0G 1G 2G 0H 1H 2H
+ ; 1T 2T 0U 1U 2U 0V 1V 2V 1V 2V -- -- -- -- -- --)
+ vpunpckldq ymmF, ymmF, ymmH ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F
+ ; 2Q 0R 1R 2R 0S 1S 2S 0T 2S 0T 1T 2T 0U 1U 2U 0V)
+
+ vpunpcklqdq ymmH, ymmA, ymmE ; ymmH=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+ ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+ vpunpcklqdq ymmG, ymmD, ymmG ; ymmG=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
+ ; 1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
+ vpunpcklqdq ymmC, ymmF, ymmC ; ymmC=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+ ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+ vperm2i128 ymmA, ymmH, ymmG, 0x20 ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+ ; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ vperm2i128 ymmD, ymmC, ymmH, 0x30 ; ymmD=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+ ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+ vperm2i128 ymmF, ymmG, ymmC, 0x31 ; ymmF=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+ ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+ cmp ecx, byte SIZEOF_YMMWORD
+ jb short .column_st64
+
+ test edi, SIZEOF_YMMWORD-1
+ jnz short .out1
+ ; --(aligned)-------------------
+ vmovntdq YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+ vmovntdq YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+ vmovntdq YMMWORD [edi+2*SIZEOF_YMMWORD], ymmF
+ jmp short .out0
+.out1: ; --(unaligned)-----------------
+ vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+ vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+ vmovdqu YMMWORD [edi+2*SIZEOF_YMMWORD], ymmF
+.out0:
+ add edi, byte RGB_PIXELSIZE*SIZEOF_YMMWORD ; outptr
+ sub ecx, byte SIZEOF_YMMWORD
+ jz near .endcolumn
+
+ add esi, byte SIZEOF_YMMWORD ; inptr0
+ dec al ; Yctr
+ jnz near .Yloop_2nd
+
+ add ebx, byte SIZEOF_YMMWORD ; inptr1
+ add edx, byte SIZEOF_YMMWORD ; inptr2
+ jmp near .columnloop
+ alignx 16, 7
+
+.column_st64:
+ lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
+ cmp ecx, byte 2*SIZEOF_YMMWORD
+ jb short .column_st32
+ vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+ vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+ add edi, byte 2*SIZEOF_YMMWORD ; outptr
+ vmovdqa ymmA, ymmF
+ sub ecx, byte 2*SIZEOF_YMMWORD
+ jmp short .column_st31
+.column_st32:
+ cmp ecx, byte SIZEOF_YMMWORD
+ jb short .column_st31
+ vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+ add edi, byte SIZEOF_YMMWORD ; outptr
+ vmovdqa ymmA, ymmD
+ sub ecx, byte SIZEOF_YMMWORD
+ jmp short .column_st31
+.column_st31:
+ cmp ecx, byte SIZEOF_XMMWORD
+ jb short .column_st15
+ vmovdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ add edi, byte SIZEOF_XMMWORD ; outptr
+ vperm2i128 ymmA, ymmA, ymmA, 1
+ sub ecx, byte SIZEOF_XMMWORD
+.column_st15:
+ ; Store the lower 8 bytes of xmmA to the output when it has enough
+ ; space.
+ cmp ecx, byte SIZEOF_MMWORD
+ jb short .column_st7
+ vmovq XMM_MMWORD [edi], xmmA
+ add edi, byte SIZEOF_MMWORD
+ sub ecx, byte SIZEOF_MMWORD
+ vpsrldq xmmA, xmmA, SIZEOF_MMWORD
+.column_st7:
+ ; Store the lower 4 bytes of xmmA to the output when it has enough
+ ; space.
+ cmp ecx, byte SIZEOF_DWORD
+ jb short .column_st3
+ vmovd XMM_DWORD [edi], xmmA
+ add edi, byte SIZEOF_DWORD
+ sub ecx, byte SIZEOF_DWORD
+ vpsrldq xmmA, xmmA, SIZEOF_DWORD
+.column_st3:
+ ; Store the lower 2 bytes of eax to the output when it has enough
+ ; space.
+ vmovd eax, xmmA
+ cmp ecx, byte SIZEOF_WORD
+ jb short .column_st1
+ mov word [edi], ax
+ add edi, byte SIZEOF_WORD
+ sub ecx, byte SIZEOF_WORD
+ shr eax, 16
+.column_st1:
+ ; Store the lower 1 byte of eax to the output when it has enough
+ ; space.
+ test ecx, ecx
+ jz short .endcolumn
+ mov byte [edi], al
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+ vpcmpeqb ymm6, ymm6, ymm6 ; ymm6=XE=X(02468ACE********GIKMOQSU********)
+ vpcmpeqb ymm7, ymm7, ymm7 ; ymm7=XO=X(13579BDF********HJLNPRTV********)
+%else
+ vpxor ymm6, ymm6, ymm6 ; ymm6=XE=X(02468ACE********GIKMOQSU********)
+ vpxor ymm7, ymm7, ymm7 ; ymm7=XO=X(13579BDF********HJLNPRTV********)
+%endif
+ ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **)
+ ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **)
+ ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **)
+ ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **)
+ ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **)
+ ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **)
+ ; ymmG=(30 32 34 36 38 3A 3C 3E ** 3G 3I 3K 3M 3O 3Q 3S 3U **)
+ ; ymmH=(31 33 35 37 39 3B 3D 3F ** 3H 3J 3L 3N 3P 3R 3T 3V **)
+
+ vpunpcklbw ymmA, ymmA, ymmC ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
+ ; 0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U)
+ vpunpcklbw ymmE, ymmE, ymmG ; ymmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E
+ ; 2G 3G 2I 3I 2K 3K 2M 3M 2O 3O 2Q 3Q 2S 3S 2U 3U)
+ vpunpcklbw ymmB, ymmB, ymmD ; ymmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F
+ ; 0H 1H 0J 1J 0L 1L 0N 1N 0P 1P 0R 1R 0T 1T 0V 1V)
+ vpunpcklbw ymmF, ymmF, ymmH ; ymmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F
+ ; 2H 3H 2J 3J 2L 3L 2N 3N 2P 3P 2R 3R 2T 3T 2V 3V)
+
+ vpunpckhwd ymmC, ymmA, ymmE ; ymmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E
+ ; 0O 1O 2O 3O 0Q 1Q 2Q 3Q 0S 1S 2S 3S 0U 1U 2U 3U)
+ vpunpcklwd ymmA, ymmA, ymmE ; ymmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36
+ ; 0G 1G 2G 3G 0I 1I 2I 3I 0K 1K 2K 3K 0M 1M 2M 3M)
+ vpunpckhwd ymmG, ymmB, ymmF ; ymmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F
+ ; 0P 1P 2P 3P 0R 1R 2R 3R 0T 1T 2T 3T 0V 1V 2V 3V)
+ vpunpcklwd ymmB, ymmB, ymmF ; ymmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37
+ ; 0H 1H 2H 3H 0J 1J 2J 3J 0L 1L 2L 3L 0N 1N 2N 3N)
+
+ vpunpckhdq ymmE, ymmA, ymmB ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+ ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+ vpunpckldq ymmB, ymmA, ymmB ; ymmB=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ ; 0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
+ vpunpckhdq ymmF, ymmC, ymmG ; ymmF=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
+ ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+ vpunpckldq ymmG, ymmC, ymmG ; ymmG=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+ ; 0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
+
+ vperm2i128 ymmA, ymmB, ymmE, 0x20 ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ ; 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+ vperm2i128 ymmD, ymmG, ymmF, 0x20 ; ymmD=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+ ; 0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+ vperm2i128 ymmC, ymmB, ymmE, 0x31 ; ymmC=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
+ ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+ vperm2i128 ymmH, ymmG, ymmF, 0x31 ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
+ ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+ cmp ecx, byte SIZEOF_YMMWORD
+ jb short .column_st64
+
+ test edi, SIZEOF_YMMWORD-1
+ jnz short .out1
+ ; --(aligned)-------------------
+ vmovntdq YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+ vmovntdq YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+ vmovntdq YMMWORD [edi+2*SIZEOF_YMMWORD], ymmC
+ vmovntdq YMMWORD [edi+3*SIZEOF_YMMWORD], ymmH
+ jmp short .out0
+.out1: ; --(unaligned)-----------------
+ vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+ vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+ vmovdqu YMMWORD [edi+2*SIZEOF_YMMWORD], ymmC
+ vmovdqu YMMWORD [edi+3*SIZEOF_YMMWORD], ymmH
+.out0:
+ add edi, RGB_PIXELSIZE*SIZEOF_YMMWORD ; outptr
+ sub ecx, byte SIZEOF_YMMWORD
+ jz near .endcolumn
+
+ add esi, byte SIZEOF_YMMWORD ; inptr0
+ dec al
+ jnz near .Yloop_2nd
+
+ add ebx, byte SIZEOF_YMMWORD ; inptr1
+ add edx, byte SIZEOF_YMMWORD ; inptr2
+ jmp near .columnloop
+ alignx 16, 7
+
+.column_st64:
+ cmp ecx, byte SIZEOF_YMMWORD/2
+ jb short .column_st32
+ vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+ vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+ add edi, byte 2*SIZEOF_YMMWORD ; outptr
+ vmovdqa ymmA, ymmC
+ vmovdqa ymmD, ymmH
+ sub ecx, byte SIZEOF_YMMWORD/2
+.column_st32:
+ cmp ecx, byte SIZEOF_YMMWORD/4
+ jb short .column_st16
+ vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+ add edi, byte SIZEOF_YMMWORD ; outptr
+ vmovdqa ymmA, ymmD
+ sub ecx, byte SIZEOF_YMMWORD/4
+.column_st16:
+ cmp ecx, byte SIZEOF_YMMWORD/8
+ jb short .column_st15
+ vmovdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ add edi, byte SIZEOF_XMMWORD ; outptr
+ vperm2i128 ymmA, ymmA, ymmA, 1
+ sub ecx, byte SIZEOF_YMMWORD/8
+.column_st15:
+ ; Store two pixels (8 bytes) of ymmA to the output when it has enough
+ ; space.
+ cmp ecx, byte SIZEOF_YMMWORD/16
+ jb short .column_st7
+ vmovq MMWORD [edi], xmmA
+ add edi, byte SIZEOF_YMMWORD/16*4
+ sub ecx, byte SIZEOF_YMMWORD/16
+ vpsrldq xmmA, SIZEOF_YMMWORD/16*4
+.column_st7:
+ ; Store one pixel (4 bytes) of ymmA to the output when it has enough
+ ; space.
+ test ecx, ecx
+ jz short .endcolumn
+ vmovd XMM_DWORD [edi], xmmA
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+.endcolumn:
+ sfence ; flush the write buffer
+
+.return:
+ vzeroupper
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v2_merged_upsample_avx2(JDIMENSION output_width,
+; JSAMPIMAGE input_buf,
+; JDIMENSION in_row_group_ctr,
+; JSAMPARRAY output_buf);
+;
+
+%define output_width(b) (b) + 8 ; JDIMENSION output_width
+%define input_buf(b) (b) + 12 ; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b) (b) + 16 ; JDIMENSION in_row_group_ctr
+%define output_buf(b) (b) + 20 ; JSAMPARRAY output_buf
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v2_merged_upsample_avx2)
+
+EXTN(jsimd_h2v2_merged_upsample_avx2):
+ push ebp
+ mov ebp, esp
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov eax, POINTER [output_width(ebp)]
+
+ mov edi, JSAMPIMAGE [input_buf(ebp)]
+ mov ecx, JDIMENSION [in_row_group_ctr(ebp)]
+ mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+ mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+ mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+ mov edi, JSAMPARRAY [output_buf(ebp)]
+ lea esi, [esi+ecx*SIZEOF_JSAMPROW]
+
+ push edx ; inptr2
+ push ebx ; inptr1
+ push esi ; inptr00
+ mov ebx, esp
+
+ push edi ; output_buf (outptr0)
+ push ecx ; in_row_group_ctr
+ push ebx ; input_buf
+ push eax ; output_width
+
+ call near EXTN(jsimd_h2v1_merged_upsample_avx2)
+
+ add esi, byte SIZEOF_JSAMPROW ; inptr01
+ add edi, byte SIZEOF_JSAMPROW ; outptr1
+ mov POINTER [ebx+0*SIZEOF_POINTER], esi
+ mov POINTER [ebx-1*SIZEOF_POINTER], edi
+
+ call near EXTN(jsimd_h2v1_merged_upsample_avx2)
+
+ add esp, byte 7*SIZEOF_DWORD
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ pop ebp
+ ret
+
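The h2v2 entry point above is a thin wrapper: with 2:1 vertical sampling the same Cb/Cr rows serve two luma rows, so it builds a three-pointer row group on the stack, calls the h2v1 routine for the first luma/output row, patches in the second luma row and output row, and calls it again. A C sketch of that control flow, with stand-in types in place of the JSAMPROW machinery (the real argument layout differs):

    #include <stdio.h>

    typedef unsigned char *ROW;

    /* stand-in for jsimd_h2v1_merged_upsample_avx2 */
    static void h2v1(unsigned width, ROW rows[3], ROW out)
    {
      (void)width; (void)rows; (void)out;
      puts("converted one output row");
    }

    static void h2v2(unsigned width, ROW y01[2], ROW cb, ROW cr, ROW out[2])
    {
      ROW rows[3] = { y01[0], cb, cr };
      h2v1(width, rows, out[0]);   /* first luma row  -> output row 0 */
      rows[0] = y01[1];            /* Cb/Cr rows are reused unchanged */
      h2v1(width, rows, out[1]);   /* second luma row -> output row 1 */
    }

    int main(void)
    {
      unsigned char y0[16], y1[16], cb[8], cr[8], o0[64], o1[64];
      ROW y01[2] = { y0, y1 }, out[2] = { o0, o1 };
      h2v2(16, y01, cb, cr, out);
      return 0;
    }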
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jdmrgext-mmx.asm b/media/libjpeg/simd/i386/jdmrgext-mmx.asm
new file mode 100644
index 0000000000..eb3e36b475
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdmrgext-mmx.asm
@@ -0,0 +1,460 @@
+;
+; jdmrgext.asm - merged upsampling/color conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v1_merged_upsample_mmx(JDIMENSION output_width, JSAMPIMAGE input_buf,
+; JDIMENSION in_row_group_ctr,
+; JSAMPARRAY output_buf);
+;
+
+%define output_width(b) (b) + 8 ; JDIMENSION output_width
+%define input_buf(b) (b) + 12 ; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b) (b) + 16 ; JDIMENSION in_row_group_ctr
+%define output_buf(b) (b) + 20 ; JSAMPARRAY output_buf
+
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD ; mmword wk[WK_NUM]
+%define WK_NUM 3
+%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v1_merged_upsample_mmx)
+
+EXTN(jsimd_h2v1_merged_upsample_mmx):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)]
+ pushpic eax ; make room for the GOT address
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+ movpic POINTER [gotptr], ebx ; save GOT address
+
+ mov ecx, JDIMENSION [output_width(eax)] ; col
+ test ecx, ecx
+ jz near .return
+
+ push ecx
+
+ mov edi, JSAMPIMAGE [input_buf(eax)]
+ mov ecx, JDIMENSION [in_row_group_ctr(eax)]
+ mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+ mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+ mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+ mov edi, JSAMPARRAY [output_buf(eax)]
+ mov esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW] ; inptr0
+ mov ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW] ; inptr1
+ mov edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW] ; inptr2
+ mov edi, JSAMPROW [edi] ; outptr
+
+ pop ecx ; col
+
+ alignx 16, 7
+.columnloop:
+ movpic eax, POINTER [gotptr] ; load GOT address (eax)
+
+ movq mm6, MMWORD [ebx] ; mm6=Cb(01234567)
+ movq mm7, MMWORD [edx] ; mm7=Cr(01234567)
+
+ pxor mm1, mm1 ; mm1=(all 0's)
+ pcmpeqw mm3, mm3
+ psllw mm3, 7 ; mm3={0xFF80 0xFF80 0xFF80 0xFF80}
+
+ movq mm4, mm6
+ punpckhbw mm6, mm1 ; mm6=Cb(4567)=CbH
+ punpcklbw mm4, mm1 ; mm4=Cb(0123)=CbL
+ movq mm0, mm7
+ punpckhbw mm7, mm1 ; mm7=Cr(4567)=CrH
+ punpcklbw mm0, mm1 ; mm0=Cr(0123)=CrL
+
+ paddw mm6, mm3
+ paddw mm4, mm3
+ paddw mm7, mm3
+ paddw mm0, mm3
+
+ ; (Original)
+ ; R = Y + 1.40200 * Cr
+ ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+ ; B = Y + 1.77200 * Cb
+ ;
+ ; (This implementation)
+ ; R = Y + 0.40200 * Cr + Cr
+ ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+ ; B = Y - 0.22800 * Cb + Cb + Cb
+
+ movq mm5, mm6 ; mm5=CbH
+ movq mm2, mm4 ; mm2=CbL
+ paddw mm6, mm6 ; mm6=2*CbH
+ paddw mm4, mm4 ; mm4=2*CbL
+ movq mm1, mm7 ; mm1=CrH
+ movq mm3, mm0 ; mm3=CrL
+ paddw mm7, mm7 ; mm7=2*CrH
+ paddw mm0, mm0 ; mm0=2*CrL
+
+ pmulhw mm6, [GOTOFF(eax,PW_MF0228)] ; mm6=(2*CbH * -FIX(0.22800))
+ pmulhw mm4, [GOTOFF(eax,PW_MF0228)] ; mm4=(2*CbL * -FIX(0.22800))
+ pmulhw mm7, [GOTOFF(eax,PW_F0402)] ; mm7=(2*CrH * FIX(0.40200))
+ pmulhw mm0, [GOTOFF(eax,PW_F0402)] ; mm0=(2*CrL * FIX(0.40200))
+
+ paddw mm6, [GOTOFF(eax,PW_ONE)]
+ paddw mm4, [GOTOFF(eax,PW_ONE)]
+ psraw mm6, 1 ; mm6=(CbH * -FIX(0.22800))
+ psraw mm4, 1 ; mm4=(CbL * -FIX(0.22800))
+ paddw mm7, [GOTOFF(eax,PW_ONE)]
+ paddw mm0, [GOTOFF(eax,PW_ONE)]
+ psraw mm7, 1 ; mm7=(CrH * FIX(0.40200))
+ psraw mm0, 1 ; mm0=(CrL * FIX(0.40200))
+
+ paddw mm6, mm5
+ paddw mm4, mm2
+ paddw mm6, mm5 ; mm6=(CbH * FIX(1.77200))=(B-Y)H
+ paddw mm4, mm2 ; mm4=(CbL * FIX(1.77200))=(B-Y)L
+ paddw mm7, mm1 ; mm7=(CrH * FIX(1.40200))=(R-Y)H
+ paddw mm0, mm3 ; mm0=(CrL * FIX(1.40200))=(R-Y)L
+
+ movq MMWORD [wk(0)], mm6 ; wk(0)=(B-Y)H
+ movq MMWORD [wk(1)], mm7 ; wk(1)=(R-Y)H
+
+ movq mm6, mm5
+ movq mm7, mm2
+ punpcklwd mm5, mm1
+ punpckhwd mm6, mm1
+ pmaddwd mm5, [GOTOFF(eax,PW_MF0344_F0285)]
+ pmaddwd mm6, [GOTOFF(eax,PW_MF0344_F0285)]
+ punpcklwd mm2, mm3
+ punpckhwd mm7, mm3
+ pmaddwd mm2, [GOTOFF(eax,PW_MF0344_F0285)]
+ pmaddwd mm7, [GOTOFF(eax,PW_MF0344_F0285)]
+
+ paddd mm5, [GOTOFF(eax,PD_ONEHALF)]
+ paddd mm6, [GOTOFF(eax,PD_ONEHALF)]
+ psrad mm5, SCALEBITS
+ psrad mm6, SCALEBITS
+ paddd mm2, [GOTOFF(eax,PD_ONEHALF)]
+ paddd mm7, [GOTOFF(eax,PD_ONEHALF)]
+ psrad mm2, SCALEBITS
+ psrad mm7, SCALEBITS
+
+ packssdw mm5, mm6 ; mm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
+ packssdw mm2, mm7 ; mm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
+ psubw mm5, mm1 ; mm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
+ psubw mm2, mm3 ; mm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
+
+ movq MMWORD [wk(2)], mm5 ; wk(2)=(G-Y)H
+
+ mov al, 2 ; Yctr
+ jmp short .Yloop_1st
+ alignx 16, 7
+
+.Yloop_2nd:
+ movq mm0, MMWORD [wk(1)] ; mm0=(R-Y)H
+ movq mm2, MMWORD [wk(2)] ; mm2=(G-Y)H
+ movq mm4, MMWORD [wk(0)] ; mm4=(B-Y)H
+ alignx 16, 7
+
+.Yloop_1st:
+ movq mm7, MMWORD [esi] ; mm7=Y(01234567)
+
+ pcmpeqw mm6, mm6
+ psrlw mm6, BYTE_BIT ; mm6={0xFF 0x00 0xFF 0x00 ..}
+ pand mm6, mm7 ; mm6=Y(0246)=YE
+ psrlw mm7, BYTE_BIT ; mm7=Y(1357)=YO
+
+ movq mm1, mm0 ; mm1=mm0=(R-Y)(L/H)
+ movq mm3, mm2 ; mm3=mm2=(G-Y)(L/H)
+ movq mm5, mm4 ; mm5=mm4=(B-Y)(L/H)
+
+ paddw mm0, mm6 ; mm0=((R-Y)+YE)=RE=(R0 R2 R4 R6)
+ paddw mm1, mm7 ; mm1=((R-Y)+YO)=RO=(R1 R3 R5 R7)
+ packuswb mm0, mm0 ; mm0=(R0 R2 R4 R6 ** ** ** **)
+ packuswb mm1, mm1 ; mm1=(R1 R3 R5 R7 ** ** ** **)
+
+ paddw mm2, mm6 ; mm2=((G-Y)+YE)=GE=(G0 G2 G4 G6)
+ paddw mm3, mm7 ; mm3=((G-Y)+YO)=GO=(G1 G3 G5 G7)
+ packuswb mm2, mm2 ; mm2=(G0 G2 G4 G6 ** ** ** **)
+ packuswb mm3, mm3 ; mm3=(G1 G3 G5 G7 ** ** ** **)
+
+ paddw mm4, mm6 ; mm4=((B-Y)+YE)=BE=(B0 B2 B4 B6)
+ paddw mm5, mm7 ; mm5=((B-Y)+YO)=BO=(B1 B3 B5 B7)
+ packuswb mm4, mm4 ; mm4=(B0 B2 B4 B6 ** ** ** **)
+ packuswb mm5, mm5 ; mm5=(B1 B3 B5 B7 ** ** ** **)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+ ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
+ ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
+ ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
+ ; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **)
+
+ punpcklbw mmA, mmC ; mmA=(00 10 02 12 04 14 06 16)
+ punpcklbw mmE, mmB ; mmE=(20 01 22 03 24 05 26 07)
+ punpcklbw mmD, mmF ; mmD=(11 21 13 23 15 25 17 27)
+
+ movq mmG, mmA
+ movq mmH, mmA
+ punpcklwd mmA, mmE ; mmA=(00 10 20 01 02 12 22 03)
+ punpckhwd mmG, mmE ; mmG=(04 14 24 05 06 16 26 07)
+
+ psrlq mmH, 2*BYTE_BIT ; mmH=(02 12 04 14 06 16 -- --)
+ psrlq mmE, 2*BYTE_BIT ; mmE=(22 03 24 05 26 07 -- --)
+
+ movq mmC, mmD
+ movq mmB, mmD
+ punpcklwd mmD, mmH ; mmD=(11 21 02 12 13 23 04 14)
+ punpckhwd mmC, mmH ; mmC=(15 25 06 16 17 27 -- --)
+
+ psrlq mmB, 2*BYTE_BIT ; mmB=(13 23 15 25 17 27 -- --)
+
+ movq mmF, mmE
+ punpcklwd mmE, mmB ; mmE=(22 03 13 23 24 05 15 25)
+ punpckhwd mmF, mmB ; mmF=(26 07 17 27 -- -- -- --)
+
+ punpckldq mmA, mmD ; mmA=(00 10 20 01 11 21 02 12)
+ punpckldq mmE, mmG ; mmE=(22 03 13 23 04 14 24 05)
+ punpckldq mmC, mmF ; mmC=(15 25 06 16 26 07 17 27)
+
+ cmp ecx, byte SIZEOF_MMWORD
+ jb short .column_st16
+
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
+ movq MMWORD [edi+1*SIZEOF_MMWORD], mmE
+ movq MMWORD [edi+2*SIZEOF_MMWORD], mmC
+
+ sub ecx, byte SIZEOF_MMWORD
+ jz near .endcolumn
+
+ add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; outptr
+ add esi, byte SIZEOF_MMWORD ; inptr0
+ dec al ; Yctr
+ jnz near .Yloop_2nd
+
+ add ebx, byte SIZEOF_MMWORD ; inptr1
+ add edx, byte SIZEOF_MMWORD ; inptr2
+ jmp near .columnloop
+ alignx 16, 7
+
+.column_st16:
+ lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
+ cmp ecx, byte 2*SIZEOF_MMWORD
+ jb short .column_st8
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
+ movq MMWORD [edi+1*SIZEOF_MMWORD], mmE
+ movq mmA, mmC
+ sub ecx, byte 2*SIZEOF_MMWORD
+ add edi, byte 2*SIZEOF_MMWORD
+ jmp short .column_st4
+.column_st8:
+ cmp ecx, byte SIZEOF_MMWORD
+ jb short .column_st4
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
+ movq mmA, mmE
+ sub ecx, byte SIZEOF_MMWORD
+ add edi, byte SIZEOF_MMWORD
+.column_st4:
+ movd eax, mmA
+ cmp ecx, byte SIZEOF_DWORD
+ jb short .column_st2
+ mov dword [edi+0*SIZEOF_DWORD], eax
+ psrlq mmA, DWORD_BIT
+ movd eax, mmA
+ sub ecx, byte SIZEOF_DWORD
+ add edi, byte SIZEOF_DWORD
+.column_st2:
+ cmp ecx, byte SIZEOF_WORD
+ jb short .column_st1
+ mov word [edi+0*SIZEOF_WORD], ax
+ shr eax, WORD_BIT
+ sub ecx, byte SIZEOF_WORD
+ add edi, byte SIZEOF_WORD
+.column_st1:
+ cmp ecx, byte SIZEOF_BYTE
+ jb short .endcolumn
+ mov byte [edi+0*SIZEOF_BYTE], al
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+ pcmpeqb mm6, mm6 ; mm6=(X0 X2 X4 X6 ** ** ** **)
+ pcmpeqb mm7, mm7 ; mm7=(X1 X3 X5 X7 ** ** ** **)
+%else
+ pxor mm6, mm6 ; mm6=(X0 X2 X4 X6 ** ** ** **)
+ pxor mm7, mm7 ; mm7=(X1 X3 X5 X7 ** ** ** **)
+%endif
+ ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
+ ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
+ ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
+ ; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **)
+
+ punpcklbw mmA, mmC ; mmA=(00 10 02 12 04 14 06 16)
+ punpcklbw mmE, mmG ; mmE=(20 30 22 32 24 34 26 36)
+ punpcklbw mmB, mmD ; mmB=(01 11 03 13 05 15 07 17)
+ punpcklbw mmF, mmH ; mmF=(21 31 23 33 25 35 27 37)
+
+ movq mmC, mmA
+ punpcklwd mmA, mmE ; mmA=(00 10 20 30 02 12 22 32)
+ punpckhwd mmC, mmE ; mmC=(04 14 24 34 06 16 26 36)
+ movq mmG, mmB
+ punpcklwd mmB, mmF ; mmB=(01 11 21 31 03 13 23 33)
+ punpckhwd mmG, mmF ; mmG=(05 15 25 35 07 17 27 37)
+
+ movq mmD, mmA
+ punpckldq mmA, mmB ; mmA=(00 10 20 30 01 11 21 31)
+ punpckhdq mmD, mmB ; mmD=(02 12 22 32 03 13 23 33)
+ movq mmH, mmC
+ punpckldq mmC, mmG ; mmC=(04 14 24 34 05 15 25 35)
+ punpckhdq mmH, mmG ; mmH=(06 16 26 36 07 17 27 37)
+
+ cmp ecx, byte SIZEOF_MMWORD
+ jb short .column_st16
+
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
+ movq MMWORD [edi+1*SIZEOF_MMWORD], mmD
+ movq MMWORD [edi+2*SIZEOF_MMWORD], mmC
+ movq MMWORD [edi+3*SIZEOF_MMWORD], mmH
+
+ sub ecx, byte SIZEOF_MMWORD
+ jz short .endcolumn
+
+ add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; outptr
+ add esi, byte SIZEOF_MMWORD ; inptr0
+ dec al ; Yctr
+ jnz near .Yloop_2nd
+
+ add ebx, byte SIZEOF_MMWORD ; inptr1
+ add edx, byte SIZEOF_MMWORD ; inptr2
+ jmp near .columnloop
+ alignx 16, 7
+
+.column_st16:
+ cmp ecx, byte SIZEOF_MMWORD/2
+ jb short .column_st8
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
+ movq MMWORD [edi+1*SIZEOF_MMWORD], mmD
+ movq mmA, mmC
+ movq mmD, mmH
+ sub ecx, byte SIZEOF_MMWORD/2
+ add edi, byte 2*SIZEOF_MMWORD
+.column_st8:
+ cmp ecx, byte SIZEOF_MMWORD/4
+ jb short .column_st4
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
+ movq mmA, mmD
+ sub ecx, byte SIZEOF_MMWORD/4
+ add edi, byte 1*SIZEOF_MMWORD
+.column_st4:
+ cmp ecx, byte SIZEOF_MMWORD/8
+ jb short .endcolumn
+ movd dword [edi+0*SIZEOF_DWORD], mmA
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+.endcolumn:
+ emms ; empty MMX state
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v2_merged_upsample_mmx(JDIMENSION output_width, JSAMPIMAGE input_buf,
+; JDIMENSION in_row_group_ctr,
+; JSAMPARRAY output_buf);
+;
+
+%define output_width(b) (b) + 8 ; JDIMENSION output_width
+%define input_buf(b) (b) + 12 ; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b) (b) + 16 ; JDIMENSION in_row_group_ctr
+%define output_buf(b) (b) + 20 ; JSAMPARRAY output_buf
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v2_merged_upsample_mmx)
+
+EXTN(jsimd_h2v2_merged_upsample_mmx):
+ push ebp
+ mov ebp, esp
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov eax, JDIMENSION [output_width(ebp)]
+
+ mov edi, JSAMPIMAGE [input_buf(ebp)]
+ mov ecx, JDIMENSION [in_row_group_ctr(ebp)]
+ mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+ mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+ mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+ mov edi, JSAMPARRAY [output_buf(ebp)]
+ lea esi, [esi+ecx*SIZEOF_JSAMPROW]
+
+ push edx ; inptr2
+ push ebx ; inptr1
+ push esi ; inptr00
+ mov ebx, esp
+
+ push edi ; output_buf (outptr0)
+ push ecx ; in_row_group_ctr
+ push ebx ; input_buf
+ push eax ; output_width
+
+ call near EXTN(jsimd_h2v1_merged_upsample_mmx)
+
+ add esi, byte SIZEOF_JSAMPROW ; inptr01
+ add edi, byte SIZEOF_JSAMPROW ; outptr1
+ mov POINTER [ebx+0*SIZEOF_POINTER], esi
+ mov POINTER [ebx-1*SIZEOF_POINTER], edi
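+; (The two movs above patch the stack frame built for the first call: ebx
+; points at the saved inptr00 slot, and the output_buf argument sits one
+; pointer below it, so the second call reads the next luma row and writes
+; the next output row.  The earlier "lea esi, [esi+ecx*SIZEOF_JSAMPROW]"
+; makes the h2v1 routine's own indexing by in_row_group_ctr select luma
+; row 2*ctr, as required for 2:1 vertical subsampling.)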
+
+ call near EXTN(jsimd_h2v1_merged_upsample_mmx)
+
+ add esp, byte 7*SIZEOF_DWORD
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
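For reference, here is a scalar sketch (in C) of the per-pixel computation that the h2v1 merged-upsample routines above implement. This is a hedged reconstruction from the in-line comments, not the library's actual fallback code (which lives in jdmerge.c); the constant values assume libjpeg's usual SCALEBITS = 16 fixed-point convention.

    /* Hypothetical scalar model: one Cb/Cr sample is shared by two
       horizontally adjacent Y samples.  FIX(x) here means x * 65536. */
    #include <stdint.h>

    static uint8_t clamp255(long v) {
      return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
    }

    static void h2v1_merged_pair(const uint8_t y[2], uint8_t cb, uint8_t cr,
                                 uint8_t rgb[6]) {
      long cb_ = (long)cb - 128, cr_ = (long)cr - 128;  /* centre chroma */
      /* Assumes arithmetic right shift of negative values. */
      long r_y = (91881 * cr_ + 32768) >> 16;                 /* FIX(1.40200) */
      long g_y = (-22554 * cb_ - 46802 * cr_ + 32768) >> 16;  /* -FIX(0.34414),
                                                                 -FIX(0.71414) */
      long b_y = (116130 * cb_ + 32768) >> 16;                /* FIX(1.77200) */
      for (int i = 0; i < 2; i++) {
        rgb[3 * i + 0] = clamp255(y[i] + r_y);
        rgb[3 * i + 1] = clamp255(y[i] + g_y);
        rgb[3 * i + 2] = clamp255(y[i] + b_y);
      }
    }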
diff --git a/media/libjpeg/simd/i386/jdmrgext-sse2.asm b/media/libjpeg/simd/i386/jdmrgext-sse2.asm
new file mode 100644
index 0000000000..c113dc4d27
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdmrgext-sse2.asm
@@ -0,0 +1,517 @@
+;
+; jdmrgext.asm - merged upsampling/color conversion (SSE2)
+;
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2012, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and can
+; *not* be assembled with Microsoft's MASM or any compatible assembler
+; (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v1_merged_upsample_sse2(JDIMENSION output_width,
+; JSAMPIMAGE input_buf,
+; JDIMENSION in_row_group_ctr,
+; JSAMPARRAY output_buf);
+;
+
+%define output_width(b) (b) + 8 ; JDIMENSION output_width
+%define input_buf(b) (b) + 12 ; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b) (b) + 16 ; JDIMENSION in_row_group_ctr
+%define output_buf(b) (b) + 20 ; JSAMPARRAY output_buf
+
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+ ; xmmword wk[WK_NUM]
+%define WK_NUM 3
+%define gotptr wk(0) - SIZEOF_POINTER ; void * gotptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v1_merged_upsample_sse2)
+
+EXTN(jsimd_h2v1_merged_upsample_sse2):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)]
+ pushpic eax ; make a room for GOT address
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+ movpic POINTER [gotptr], ebx ; save GOT address
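+; (In PIC builds the constant pool is addressed GOT-relative via GOTOFF.
+; The GOT pointer is spilled to a stack slot here because ebx is needed as
+; inptr1 below; it is reloaded into eax at the top of each column loop.)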
+
+ mov ecx, JDIMENSION [output_width(eax)] ; col
+ test ecx, ecx
+ jz near .return
+
+ push ecx
+
+ mov edi, JSAMPIMAGE [input_buf(eax)]
+ mov ecx, JDIMENSION [in_row_group_ctr(eax)]
+ mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+ mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+ mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+ mov edi, JSAMPARRAY [output_buf(eax)]
+ mov esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW] ; inptr0
+ mov ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW] ; inptr1
+ mov edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW] ; inptr2
+ mov edi, JSAMPROW [edi] ; outptr
+
+ pop ecx ; col
+
+ alignx 16, 7
+.columnloop:
+ movpic eax, POINTER [gotptr] ; load GOT address (eax)
+
+ movdqa xmm6, XMMWORD [ebx] ; xmm6=Cb(0123456789ABCDEF)
+ movdqa xmm7, XMMWORD [edx] ; xmm7=Cr(0123456789ABCDEF)
+
+ pxor xmm1, xmm1 ; xmm1=(all 0's)
+ pcmpeqw xmm3, xmm3
+ psllw xmm3, 7 ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+ movdqa xmm4, xmm6
+ punpckhbw xmm6, xmm1 ; xmm6=Cb(89ABCDEF)=CbH
+ punpcklbw xmm4, xmm1 ; xmm4=Cb(01234567)=CbL
+ movdqa xmm0, xmm7
+ punpckhbw xmm7, xmm1 ; xmm7=Cr(89ABCDEF)=CrH
+ punpcklbw xmm0, xmm1 ; xmm0=Cr(01234567)=CrL
+
+ paddw xmm6, xmm3
+ paddw xmm4, xmm3
+ paddw xmm7, xmm3
+ paddw xmm0, xmm3
+
+ ; (Original)
+ ; R = Y + 1.40200 * Cr
+ ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+ ; B = Y + 1.77200 * Cb
+ ;
+ ; (This implementation)
+ ; R = Y + 0.40200 * Cr + Cr
+ ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+ ; B = Y - 0.22800 * Cb + Cb + Cb
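+;
+; (This rewrite is needed because FIX(1.40200) and FIX(1.77200) do not fit
+; in a signed 16-bit word, while the fractional parts do.  pmulhw keeps the
+; high 16 bits of each 32-bit product, i.e. an arithmetic shift right by
+; 16, so the chroma values are doubled beforehand and the PW_ONE/psraw-1
+; pair afterwards halves the result with rounding, preserving one extra
+; bit of precision.)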
+
+ movdqa xmm5, xmm6 ; xmm5=CbH
+ movdqa xmm2, xmm4 ; xmm2=CbL
+ paddw xmm6, xmm6 ; xmm6=2*CbH
+ paddw xmm4, xmm4 ; xmm4=2*CbL
+ movdqa xmm1, xmm7 ; xmm1=CrH
+ movdqa xmm3, xmm0 ; xmm3=CrL
+ paddw xmm7, xmm7 ; xmm7=2*CrH
+ paddw xmm0, xmm0 ; xmm0=2*CrL
+
+ pmulhw xmm6, [GOTOFF(eax,PW_MF0228)] ; xmm6=(2*CbH * -FIX(0.22800))
+ pmulhw xmm4, [GOTOFF(eax,PW_MF0228)] ; xmm4=(2*CbL * -FIX(0.22800))
+ pmulhw xmm7, [GOTOFF(eax,PW_F0402)] ; xmm7=(2*CrH * FIX(0.40200))
+ pmulhw xmm0, [GOTOFF(eax,PW_F0402)] ; xmm0=(2*CrL * FIX(0.40200))
+
+ paddw xmm6, [GOTOFF(eax,PW_ONE)]
+ paddw xmm4, [GOTOFF(eax,PW_ONE)]
+ psraw xmm6, 1 ; xmm6=(CbH * -FIX(0.22800))
+ psraw xmm4, 1 ; xmm4=(CbL * -FIX(0.22800))
+ paddw xmm7, [GOTOFF(eax,PW_ONE)]
+ paddw xmm0, [GOTOFF(eax,PW_ONE)]
+ psraw xmm7, 1 ; xmm7=(CrH * FIX(0.40200))
+ psraw xmm0, 1 ; xmm0=(CrL * FIX(0.40200))
+
+ paddw xmm6, xmm5
+ paddw xmm4, xmm2
+ paddw xmm6, xmm5 ; xmm6=(CbH * FIX(1.77200))=(B-Y)H
+ paddw xmm4, xmm2 ; xmm4=(CbL * FIX(1.77200))=(B-Y)L
+ paddw xmm7, xmm1 ; xmm7=(CrH * FIX(1.40200))=(R-Y)H
+ paddw xmm0, xmm3 ; xmm0=(CrL * FIX(1.40200))=(R-Y)L
+
+ movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=(B-Y)H
+ movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(R-Y)H
+
+ movdqa xmm6, xmm5
+ movdqa xmm7, xmm2
+ punpcklwd xmm5, xmm1
+ punpckhwd xmm6, xmm1
+ pmaddwd xmm5, [GOTOFF(eax,PW_MF0344_F0285)]
+ pmaddwd xmm6, [GOTOFF(eax,PW_MF0344_F0285)]
+ punpcklwd xmm2, xmm3
+ punpckhwd xmm7, xmm3
+ pmaddwd xmm2, [GOTOFF(eax,PW_MF0344_F0285)]
+ pmaddwd xmm7, [GOTOFF(eax,PW_MF0344_F0285)]
+
+ paddd xmm5, [GOTOFF(eax,PD_ONEHALF)]
+ paddd xmm6, [GOTOFF(eax,PD_ONEHALF)]
+ psrad xmm5, SCALEBITS
+ psrad xmm6, SCALEBITS
+ paddd xmm2, [GOTOFF(eax,PD_ONEHALF)]
+ paddd xmm7, [GOTOFF(eax,PD_ONEHALF)]
+ psrad xmm2, SCALEBITS
+ psrad xmm7, SCALEBITS
+
+ packssdw xmm5, xmm6 ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
+ packssdw xmm2, xmm7 ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
+ psubw xmm5, xmm1 ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
+ psubw xmm2, xmm3 ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
+
+ movdqa XMMWORD [wk(2)], xmm5 ; wk(2)=(G-Y)H
+
+ mov al, 2 ; Yctr
+ jmp short .Yloop_1st
+ alignx 16, 7
+
+.Yloop_2nd:
+ movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(R-Y)H
+ movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(G-Y)H
+ movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(B-Y)H
+ alignx 16, 7
+
+.Yloop_1st:
+ movdqa xmm7, XMMWORD [esi] ; xmm7=Y(0123456789ABCDEF)
+
+ pcmpeqw xmm6, xmm6
+ psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
+ pand xmm6, xmm7 ; xmm6=Y(02468ACE)=YE
+ psrlw xmm7, BYTE_BIT ; xmm7=Y(13579BDF)=YO
+
+ movdqa xmm1, xmm0 ; xmm1=xmm0=(R-Y)(L/H)
+ movdqa xmm3, xmm2 ; xmm3=xmm2=(G-Y)(L/H)
+ movdqa xmm5, xmm4 ; xmm5=xmm4=(B-Y)(L/H)
+
+ paddw xmm0, xmm6 ; xmm0=((R-Y)+YE)=RE=R(02468ACE)
+ paddw xmm1, xmm7 ; xmm1=((R-Y)+YO)=RO=R(13579BDF)
+ packuswb xmm0, xmm0 ; xmm0=R(02468ACE********)
+ packuswb xmm1, xmm1 ; xmm1=R(13579BDF********)
+
+ paddw xmm2, xmm6 ; xmm2=((G-Y)+YE)=GE=G(02468ACE)
+ paddw xmm3, xmm7 ; xmm3=((G-Y)+YO)=GO=G(13579BDF)
+ packuswb xmm2, xmm2 ; xmm2=G(02468ACE********)
+ packuswb xmm3, xmm3 ; xmm3=G(13579BDF********)
+
+ paddw xmm4, xmm6 ; xmm4=((B-Y)+YE)=BE=B(02468ACE)
+ paddw xmm5, xmm7 ; xmm5=((B-Y)+YO)=BO=B(13579BDF)
+ packuswb xmm4, xmm4 ; xmm4=B(02468ACE********)
+ packuswb xmm5, xmm5 ; xmm5=B(13579BDF********)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+ ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+ ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+ ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+ ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
+
+ punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+ punpcklbw xmmE, xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
+ punpcklbw xmmD, xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
+
+ movdqa xmmG, xmmA
+ movdqa xmmH, xmmA
+ punpcklwd xmmA, xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
+ punpckhwd xmmG, xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
+
+ psrldq xmmH, 2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
+ psrldq xmmE, 2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
+
+ movdqa xmmC, xmmD
+ movdqa xmmB, xmmD
+ punpcklwd xmmD, xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
+ punpckhwd xmmC, xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
+
+ psrldq xmmB, 2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
+
+ movdqa xmmF, xmmE
+ punpcklwd xmmE, xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
+ punpckhwd xmmF, xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
+
+ pshufd xmmH, xmmA, 0x4E ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
+ movdqa xmmB, xmmE
+ punpckldq xmmA, xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
+ punpckldq xmmE, xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
+ punpckhdq xmmD, xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
+
+ pshufd xmmH, xmmG, 0x4E ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
+ movdqa xmmB, xmmF
+ punpckldq xmmG, xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
+ punpckldq xmmF, xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
+ punpckhdq xmmC, xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
+
+ punpcklqdq xmmA, xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+ punpcklqdq xmmD, xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ punpcklqdq xmmF, xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+ cmp ecx, byte SIZEOF_XMMWORD
+ jb short .column_st32
+
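+ ; If the output pointer is 16-byte aligned, use non-temporal (streaming)
+ ; stores so that pixel data, which is unlikely to be re-read soon, does
+ ; not displace cached data; the sfence at .endcolumn then orders these
+ ; weakly-ordered stores before returning.  Unaligned outputs fall back
+ ; to movdqu.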
+ test edi, SIZEOF_XMMWORD-1
+ jnz short .out1
+ ; --(aligned)-------------------
+ movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
+ jmp short .out0
+.out1: ; --(unaligned)-----------------
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
+.out0:
+ add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
+ sub ecx, byte SIZEOF_XMMWORD
+ jz near .endcolumn
+
+ add esi, byte SIZEOF_XMMWORD ; inptr0
+ dec al ; Yctr
+ jnz near .Yloop_2nd
+
+ add ebx, byte SIZEOF_XMMWORD ; inptr1
+ add edx, byte SIZEOF_XMMWORD ; inptr2
+ jmp near .columnloop
+ alignx 16, 7
+
+.column_st32:
+ lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
+ cmp ecx, byte 2*SIZEOF_XMMWORD
+ jb short .column_st16
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ add edi, byte 2*SIZEOF_XMMWORD ; outptr
+ movdqa xmmA, xmmF
+ sub ecx, byte 2*SIZEOF_XMMWORD
+ jmp short .column_st15
+.column_st16:
+ cmp ecx, byte SIZEOF_XMMWORD
+ jb short .column_st15
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ add edi, byte SIZEOF_XMMWORD ; outptr
+ movdqa xmmA, xmmD
+ sub ecx, byte SIZEOF_XMMWORD
+.column_st15:
+ ; Store the lower 8 bytes of xmmA to the output when it has enough
+ ; space.
+ cmp ecx, byte SIZEOF_MMWORD
+ jb short .column_st7
+ movq XMM_MMWORD [edi], xmmA
+ add edi, byte SIZEOF_MMWORD
+ sub ecx, byte SIZEOF_MMWORD
+ psrldq xmmA, SIZEOF_MMWORD
+.column_st7:
+ ; Store the lower 4 bytes of xmmA to the output when it has enough
+ ; space.
+ cmp ecx, byte SIZEOF_DWORD
+ jb short .column_st3
+ movd XMM_DWORD [edi], xmmA
+ add edi, byte SIZEOF_DWORD
+ sub ecx, byte SIZEOF_DWORD
+ psrldq xmmA, SIZEOF_DWORD
+.column_st3:
+ ; Store the lower 2 bytes of eax to the output when it has enough
+ ; space.
+ movd eax, xmmA
+ cmp ecx, byte SIZEOF_WORD
+ jb short .column_st1
+ mov word [edi], ax
+ add edi, byte SIZEOF_WORD
+ sub ecx, byte SIZEOF_WORD
+ shr eax, 16
+.column_st1:
+ ; Store the lower 1 byte of eax to the output when it has enough
+ ; space.
+ test ecx, ecx
+ jz short .endcolumn
+ mov byte [edi], al
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+ pcmpeqb xmm6, xmm6 ; xmm6=XE=X(02468ACE********)
+ pcmpeqb xmm7, xmm7 ; xmm7=XO=X(13579BDF********)
+%else
+ pxor xmm6, xmm6 ; xmm6=XE=X(02468ACE********)
+ pxor xmm7, xmm7 ; xmm7=XO=X(13579BDF********)
+%endif
+ ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+ ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+ ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+ ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
+
+ punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+ punpcklbw xmmE, xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
+ punpcklbw xmmB, xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
+ punpcklbw xmmF, xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
+
+ movdqa xmmC, xmmA
+ punpcklwd xmmA, xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
+ punpckhwd xmmC, xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
+ movdqa xmmG, xmmB
+ punpcklwd xmmB, xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
+ punpckhwd xmmG, xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
+
+ movdqa xmmD, xmmA
+ punpckldq xmmA, xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+ punpckhdq xmmD, xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+ movdqa xmmH, xmmC
+ punpckldq xmmC, xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+ punpckhdq xmmH, xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+ cmp ecx, byte SIZEOF_XMMWORD
+ jb short .column_st32
+
+ test edi, SIZEOF_XMMWORD-1
+ jnz short .out1
+ ; --(aligned)-------------------
+ movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
+ movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
+ jmp short .out0
+.out1: ; --(unaligned)-----------------
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
+ movdqu XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
+.out0:
+ add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
+ sub ecx, byte SIZEOF_XMMWORD
+ jz near .endcolumn
+
+ add esi, byte SIZEOF_XMMWORD ; inptr0
+ dec al ; Yctr
+ jnz near .Yloop_2nd
+
+ add ebx, byte SIZEOF_XMMWORD ; inptr1
+ add edx, byte SIZEOF_XMMWORD ; inptr2
+ jmp near .columnloop
+ alignx 16, 7
+
+.column_st32:
+ cmp ecx, byte SIZEOF_XMMWORD/2
+ jb short .column_st16
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ add edi, byte 2*SIZEOF_XMMWORD ; outptr
+ movdqa xmmA, xmmC
+ movdqa xmmD, xmmH
+ sub ecx, byte SIZEOF_XMMWORD/2
+.column_st16:
+ cmp ecx, byte SIZEOF_XMMWORD/4
+ jb short .column_st15
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ add edi, byte SIZEOF_XMMWORD ; outptr
+ movdqa xmmA, xmmD
+ sub ecx, byte SIZEOF_XMMWORD/4
+.column_st15:
+ ; Store two pixels (8 bytes) of xmmA to the output when it has enough
+ ; space.
+ cmp ecx, byte SIZEOF_XMMWORD/8
+ jb short .column_st7
+ movq XMM_MMWORD [edi], xmmA
+ add edi, byte SIZEOF_XMMWORD/8*4
+ sub ecx, byte SIZEOF_XMMWORD/8
+ psrldq xmmA, SIZEOF_XMMWORD/8*4
+.column_st7:
+ ; Store one pixel (4 bytes) of xmmA to the output when it has enough
+ ; space.
+ test ecx, ecx
+ jz short .endcolumn
+ movd XMM_DWORD [edi], xmmA
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+.endcolumn:
+ sfence ; flush the write buffer
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v2_merged_upsample_sse2(JDIMENSION output_width,
+; JSAMPIMAGE input_buf,
+; JDIMENSION in_row_group_ctr,
+; JSAMPARRAY output_buf);
+;
+
+%define output_width(b) (b) + 8 ; JDIMENSION output_width
+%define input_buf(b) (b) + 12 ; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b) (b) + 16 ; JDIMENSION in_row_group_ctr
+%define output_buf(b) (b) + 20 ; JSAMPARRAY output_buf
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v2_merged_upsample_sse2)
+
+EXTN(jsimd_h2v2_merged_upsample_sse2):
+ push ebp
+ mov ebp, esp
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov eax, POINTER [output_width(ebp)]
+
+ mov edi, JSAMPIMAGE [input_buf(ebp)]
+ mov ecx, JDIMENSION [in_row_group_ctr(ebp)]
+ mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+ mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+ mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+ mov edi, JSAMPARRAY [output_buf(ebp)]
+ lea esi, [esi+ecx*SIZEOF_JSAMPROW]
+
+ push edx ; inptr2
+ push ebx ; inptr1
+ push esi ; inptr00
+ mov ebx, esp
+
+ push edi ; output_buf (outptr0)
+ push ecx ; in_row_group_ctr
+ push ebx ; input_buf
+ push eax ; output_width
+
+ call near EXTN(jsimd_h2v1_merged_upsample_sse2)
+
+ add esi, byte SIZEOF_JSAMPROW ; inptr01
+ add edi, byte SIZEOF_JSAMPROW ; outptr1
+ mov POINTER [ebx+0*SIZEOF_POINTER], esi
+ mov POINTER [ebx-1*SIZEOF_POINTER], edi
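+; (Same stack-patching trick as in the MMX h2v2 routine above: redirect
+; the saved inptr00 and output_buf so the second call handles luma row
+; 2*ctr+1.)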
+
+ call near EXTN(jsimd_h2v1_merged_upsample_sse2)
+
+ add esp, byte 7*SIZEOF_DWORD
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jdsample-avx2.asm b/media/libjpeg/simd/i386/jdsample-avx2.asm
new file mode 100644
index 0000000000..a800c35e08
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdsample-avx2.asm
@@ -0,0 +1,760 @@
+;
+; jdsample.asm - upsampling (AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and can
+; *not* be assembled with Microsoft's MASM or any compatible assembler
+; (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_fancy_upsample_avx2)
+
+EXTN(jconst_fancy_upsample_avx2):
+
+PW_ONE times 16 dw 1
+PW_TWO times 16 dw 2
+PW_THREE times 16 dw 3
+PW_SEVEN times 16 dw 7
+PW_EIGHT times 16 dw 8
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
+;
+; The upsampling algorithm is linear interpolation between pixel centers,
+; also known as a "triangle filter". This is a good compromise between
+; speed and visual quality. The centers of the output pixels are 1/4 and 3/4
+; of the way between input pixel centers.
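+;
+; Concretely, for input samples s[i] the code below computes
+;   out[2*i]   = (3*s[i] + s[i-1] + 1) / 4
+;   out[2*i+1] = (3*s[i] + s[i+1] + 2) / 4
+; with the bias alternating between 1 and 2 so that the truncation errors
+; tend to cancel.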
+;
+; GLOBAL(void)
+; jsimd_h2v1_fancy_upsample_avx2(int max_v_samp_factor,
+; JDIMENSION downsampled_width,
+; JSAMPARRAY input_data,
+; JSAMPARRAY *output_data_ptr);
+;
+
+%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor
+%define downsamp_width(b) (b) + 12 ; JDIMENSION downsampled_width
+%define input_data(b) (b) + 16 ; JSAMPARRAY input_data
+%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_avx2)
+
+EXTN(jsimd_h2v1_fancy_upsample_avx2):
+ push ebp
+ mov ebp, esp
+ pushpic ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+
+ mov eax, JDIMENSION [downsamp_width(ebp)] ; colctr
+ test eax, eax
+ jz near .return
+
+ mov ecx, INT [max_v_samp(ebp)] ; rowctr
+ test ecx, ecx
+ jz near .return
+
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ mov edi, POINTER [output_data_ptr(ebp)]
+ mov edi, JSAMPARRAY [edi] ; output_data
+ alignx 16, 7
+.rowloop:
+ push eax ; colctr
+ push edi
+ push esi
+
+ mov esi, JSAMPROW [esi] ; inptr
+ mov edi, JSAMPROW [edi] ; outptr
+
+ test eax, SIZEOF_YMMWORD-1
+ jz short .skip
+ mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
+ mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
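+; (If the width is not a multiple of the vector size, the last sample is
+; replicated one position past the end of the row, so the full-width load
+; of the final column block reads a valid value for the s[i+1] term at the
+; right edge; eax is then rounded up to whole YMMWORDs below.)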
+.skip:
+ vpxor ymm0, ymm0, ymm0 ; ymm0=(all 0's)
+ vpcmpeqb xmm7, xmm7, xmm7
+ vpsrldq xmm7, xmm7, (SIZEOF_XMMWORD-1) ; (ff -- -- -- ... -- --) LSB is ff
+ vpand ymm7, ymm7, YMMWORD [esi+0*SIZEOF_YMMWORD]
+
+ add eax, byte SIZEOF_YMMWORD-1
+ and eax, byte -SIZEOF_YMMWORD
+ cmp eax, byte SIZEOF_YMMWORD
+ ja short .columnloop
+ alignx 16, 7
+
+.columnloop_last:
+ vpcmpeqb xmm6, xmm6, xmm6
+ vpslldq xmm6, xmm6, (SIZEOF_XMMWORD-1)
+ vperm2i128 ymm6, ymm6, ymm6, 1 ; (---- ---- ... ---- ---- ff) MSB is ff
+ vpand ymm6, ymm6, YMMWORD [esi+0*SIZEOF_YMMWORD]
+ jmp short .upsample
+ alignx 16, 7
+
+.columnloop:
+ vmovdqu ymm6, YMMWORD [esi+1*SIZEOF_YMMWORD]
+ vperm2i128 ymm6, ymm0, ymm6, 0x20
+ vpslldq ymm6, ymm6, 15
+
+.upsample:
+ vmovdqu ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD] ; ymm1=( 0 1 2 ... 29 30 31)
+
+ vperm2i128 ymm2, ymm0, ymm1, 0x20
+ vpalignr ymm2, ymm1, ymm2, 15 ; ymm2=(-- 0 1 ... 28 29 30)
+ vperm2i128 ymm4, ymm0, ymm1, 0x03
+ vpalignr ymm3, ymm4, ymm1, 1 ; ymm3=( 1 2 3 ... 30 31 --)
+
+ vpor ymm2, ymm2, ymm7 ; ymm2=(-1 0 1 ... 28 29 30)
+ vpor ymm3, ymm3, ymm6 ; ymm3=( 1 2 3 ... 30 31 32)
+
+ vpsrldq ymm7, ymm4, (SIZEOF_XMMWORD-1) ; ymm7=(31 -- -- ... -- -- --)
+
+ vpunpckhbw ymm4, ymm1, ymm0 ; ymm4=( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+ vpunpcklbw ymm5, ymm1, ymm0 ; ymm5=( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
+ vperm2i128 ymm1, ymm5, ymm4, 0x20 ; ymm1=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vperm2i128 ymm4, ymm5, ymm4, 0x31 ; ymm4=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+ vpunpckhbw ymm5, ymm2, ymm0 ; ymm5=( 7 8 9 10 11 12 13 14 23 24 25 26 27 28 29 30)
+ vpunpcklbw ymm6, ymm2, ymm0 ; ymm6=(-1 0 1 2 3 4 5 6 15 16 17 18 19 20 21 22)
+ vperm2i128 ymm2, ymm6, ymm5, 0x20 ; ymm2=(-1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14)
+ vperm2i128 ymm5, ymm6, ymm5, 0x31 ; ymm5=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
+
+ vpunpckhbw ymm6, ymm3, ymm0 ; ymm6=( 1 2 3 4 5 6 7 8 17 18 19 20 21 22 23 24)
+ vpunpcklbw ymm0, ymm3, ymm0 ; ymm0=( 9 10 11 12 13 14 15 16 25 26 27 28 29 30 31 32)
+ vperm2i128 ymm3, ymm0, ymm6, 0x20 ; ymm3=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16)
+ vperm2i128 ymm6, ymm0, ymm6, 0x31 ; ymm6=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)
+
+ vpxor ymm0, ymm0, ymm0 ; ymm0=(all 0's)
+
+ vpmullw ymm1, ymm1, [GOTOFF(ebx,PW_THREE)]
+ vpmullw ymm4, ymm4, [GOTOFF(ebx,PW_THREE)]
+ vpaddw ymm2, ymm2, [GOTOFF(ebx,PW_ONE)]
+ vpaddw ymm5, ymm5, [GOTOFF(ebx,PW_ONE)]
+ vpaddw ymm3, ymm3, [GOTOFF(ebx,PW_TWO)]
+ vpaddw ymm6, ymm6, [GOTOFF(ebx,PW_TWO)]
+
+ vpaddw ymm2, ymm2, ymm1
+ vpaddw ymm5, ymm5, ymm4
+ vpsrlw ymm2, ymm2, 2 ; ymm2=OutLE=( 0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30)
+ vpsrlw ymm5, ymm5, 2 ; ymm5=OutHE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
+ vpaddw ymm3, ymm3, ymm1
+ vpaddw ymm6, ymm6, ymm4
+ vpsrlw ymm3, ymm3, 2 ; ymm3=OutLO=( 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31)
+ vpsrlw ymm6, ymm6, 2 ; ymm6=OutHO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)
+
+ vpsllw ymm3, ymm3, BYTE_BIT
+ vpsllw ymm6, ymm6, BYTE_BIT
+ vpor ymm2, ymm2, ymm3 ; ymm2=OutL=( 0 1 2 ... 29 30 31)
+ vpor ymm5, ymm5, ymm6 ; ymm5=OutH=(32 33 34 ... 61 62 63)
+
+ vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymm2
+ vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymm5
+
+ sub eax, byte SIZEOF_YMMWORD
+ add esi, byte 1*SIZEOF_YMMWORD ; inptr
+ add edi, byte 2*SIZEOF_YMMWORD ; outptr
+ cmp eax, byte SIZEOF_YMMWORD
+ ja near .columnloop
+ test eax, eax
+ jnz near .columnloop_last
+
+ pop esi
+ pop edi
+ pop eax
+
+ add esi, byte SIZEOF_JSAMPROW ; input_data
+ add edi, byte SIZEOF_JSAMPROW ; output_data
+ dec ecx ; rowctr
+ jg near .rowloop
+
+.return:
+ vzeroupper
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ poppic ebx
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
+; Again a triangle filter; see comments for h2v1 case, above.
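+;
+; Each output pixel here is a 9/16, 3/16, 3/16, 1/16 weighted average of
+; the four nearest input pixels: a vertical pass forms Int = 3*row[0] +
+; row[+/-1], and the horizontal pass then computes
+;   out_even = (3*Int[i] + Int[i-1] + 8) / 16
+;   out_odd  = (3*Int[i] + Int[i+1] + 7) / 16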
+;
+; GLOBAL(void)
+; jsimd_h2v2_fancy_upsample_avx2(int max_v_samp_factor,
+; JDIMENSION downsampled_width,
+; JSAMPARRAY input_data,
+; JSAMPARRAY *output_data_ptr);
+;
+
+%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor
+%define downsamp_width(b) (b) + 12 ; JDIMENSION downsampled_width
+%define input_data(b) (b) + 16 ; JSAMPARRAY input_data
+%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr
+
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD
+ ; ymmword wk[WK_NUM]
+%define WK_NUM 4
+%define gotptr wk(0) - SIZEOF_POINTER ; void *gotptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_avx2)
+
+EXTN(jsimd_h2v2_fancy_upsample_avx2):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_YMMWORD) ; align to 256 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)]
+ pushpic eax ; make a room for GOT address
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+ movpic POINTER [gotptr], ebx ; save GOT address
+
+ mov edx, eax ; edx = original ebp
+ mov eax, JDIMENSION [downsamp_width(edx)] ; colctr
+ test eax, eax
+ jz near .return
+
+ mov ecx, INT [max_v_samp(edx)] ; rowctr
+ test ecx, ecx
+ jz near .return
+
+ mov esi, JSAMPARRAY [input_data(edx)] ; input_data
+ mov edi, POINTER [output_data_ptr(edx)]
+ mov edi, JSAMPARRAY [edi] ; output_data
+ alignx 16, 7
+.rowloop:
+ push eax ; colctr
+ push ecx
+ push edi
+ push esi
+
+ mov ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW] ; inptr1(above)
+ mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0
+ mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1(below)
+ mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0
+ mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1
+
+ test eax, SIZEOF_YMMWORD-1
+ jz short .skip
+ push edx
+ mov dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
+ mov JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
+ mov dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
+ mov JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
+ mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
+ mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
+ pop edx
+.skip:
+ ; -- process the first column block
+
+ vmovdqu ymm0, YMMWORD [ebx+0*SIZEOF_YMMWORD] ; ymm0=row[ 0][0]
+ vmovdqu ymm1, YMMWORD [ecx+0*SIZEOF_YMMWORD] ; ymm1=row[-1][0]
+ vmovdqu ymm2, YMMWORD [esi+0*SIZEOF_YMMWORD] ; ymm2=row[+1][0]
+
+ pushpic ebx
+ movpic ebx, POINTER [gotptr] ; load GOT address
+
+ vpxor ymm3, ymm3, ymm3 ; ymm3=(all 0's)
+
+ vpunpckhbw ymm4, ymm0, ymm3 ; ymm4=row[ 0]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+ vpunpcklbw ymm5, ymm0, ymm3 ; ymm5=row[ 0]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
+ vperm2i128 ymm0, ymm5, ymm4, 0x20 ; ymm0=row[ 0]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vperm2i128 ymm4, ymm5, ymm4, 0x31 ; ymm4=row[ 0](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+ vpunpckhbw ymm5, ymm1, ymm3 ; ymm5=row[-1]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+ vpunpcklbw ymm6, ymm1, ymm3 ; ymm6=row[-1]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
+ vperm2i128 ymm1, ymm6, ymm5, 0x20 ; ymm1=row[-1]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vperm2i128 ymm5, ymm6, ymm5, 0x31 ; ymm5=row[-1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+ vpunpckhbw ymm6, ymm2, ymm3 ; ymm6=row[+1]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+ vpunpcklbw ymm3, ymm2, ymm3 ; ymm3=row[+1]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
+ vperm2i128 ymm2, ymm3, ymm6, 0x20 ; ymm2=row[+1]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vperm2i128 ymm6, ymm3, ymm6, 0x31 ; ymm6=row[+1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+ vpmullw ymm0, ymm0, [GOTOFF(ebx,PW_THREE)]
+ vpmullw ymm4, ymm4, [GOTOFF(ebx,PW_THREE)]
+
+ vpcmpeqb xmm7, xmm7, xmm7
+ vpsrldq xmm7, xmm7, (SIZEOF_XMMWORD-2) ; (ffff ---- ---- ... ---- ----) LSB is ffff
+
+ vpaddw ymm1, ymm1, ymm0 ; ymm1=Int0L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vpaddw ymm5, ymm5, ymm4 ; ymm5=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+ vpaddw ymm2, ymm2, ymm0 ; ymm2=Int1L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vpaddw ymm6, ymm6, ymm4 ; ymm6=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+ vmovdqu YMMWORD [edx+0*SIZEOF_YMMWORD], ymm1 ; temporarily save
+ vmovdqu YMMWORD [edx+1*SIZEOF_YMMWORD], ymm5 ; the intermediate data
+ vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymm2
+ vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymm6
+
+ vpand ymm1, ymm1, ymm7 ; ymm1=( 0 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+ vpand ymm2, ymm2, ymm7 ; ymm2=( 0 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+
+ vmovdqa YMMWORD [wk(0)], ymm1
+ vmovdqa YMMWORD [wk(1)], ymm2
+
+ poppic ebx
+
+ add eax, byte SIZEOF_YMMWORD-1
+ and eax, byte -SIZEOF_YMMWORD
+ cmp eax, byte SIZEOF_YMMWORD
+ ja short .columnloop
+ alignx 16, 7
+
+.columnloop_last:
+ ; -- process the last column block
+
+ pushpic ebx
+ movpic ebx, POINTER [gotptr] ; load GOT address
+
+ vpcmpeqb xmm1, xmm1, xmm1
+ vpslldq xmm1, xmm1, (SIZEOF_XMMWORD-2)
+ vperm2i128 ymm1, ymm1, ymm1, 1 ; (---- ---- ... ---- ---- ffff) MSB is ffff
+
+ vpand ymm2, ymm1, YMMWORD [edi+1*SIZEOF_YMMWORD]
+ vpand ymm1, ymm1, YMMWORD [edx+1*SIZEOF_YMMWORD]
+
+ vmovdqa YMMWORD [wk(2)], ymm1 ; ymm1=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 31)
+ vmovdqa YMMWORD [wk(3)], ymm2 ; ymm2=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 31)
+
+ jmp near .upsample
+ alignx 16, 7
+
+.columnloop:
+ ; -- process the next column block
+
+ vmovdqu ymm0, YMMWORD [ebx+1*SIZEOF_YMMWORD] ; ymm0=row[ 0][1]
+ vmovdqu ymm1, YMMWORD [ecx+1*SIZEOF_YMMWORD] ; ymm1=row[-1][1]
+ vmovdqu ymm2, YMMWORD [esi+1*SIZEOF_YMMWORD] ; ymm2=row[+1][1]
+
+ pushpic ebx
+ movpic ebx, POINTER [gotptr] ; load GOT address
+
+ vpxor ymm3, ymm3, ymm3 ; ymm3=(all 0's)
+
+ vpunpckhbw ymm4, ymm0, ymm3 ; ymm4=row[ 0]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+ vpunpcklbw ymm5, ymm0, ymm3 ; ymm5=row[ 0]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
+ vperm2i128 ymm0, ymm5, ymm4, 0x20 ; ymm0=row[ 0]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vperm2i128 ymm4, ymm5, ymm4, 0x31 ; ymm4=row[ 0](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+ vpunpckhbw ymm5, ymm1, ymm3 ; ymm5=row[-1]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+ vpunpcklbw ymm6, ymm1, ymm3 ; ymm6=row[-1]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
+ vperm2i128 ymm1, ymm6, ymm5, 0x20 ; ymm1=row[-1]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vperm2i128 ymm5, ymm6, ymm5, 0x31 ; ymm5=row[-1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+ vpunpckhbw ymm6, ymm2, ymm3 ; ymm6=row[+1]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+ vpunpcklbw ymm7, ymm2, ymm3 ; ymm7=row[+1]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
+ vperm2i128 ymm2, ymm7, ymm6, 0x20 ; ymm2=row[+1]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vperm2i128 ymm6, ymm7, ymm6, 0x31 ; ymm6=row[+1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+ vpmullw ymm0, ymm0, [GOTOFF(ebx,PW_THREE)]
+ vpmullw ymm4, ymm4, [GOTOFF(ebx,PW_THREE)]
+
+ vpaddw ymm1, ymm1, ymm0 ; ymm1=Int0L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vpaddw ymm5, ymm5, ymm4 ; ymm5=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+ vpaddw ymm2, ymm2, ymm0 ; ymm2=Int1L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vpaddw ymm6, ymm6, ymm4 ; ymm6=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+ vmovdqu YMMWORD [edx+2*SIZEOF_YMMWORD], ymm1 ; temporarily save
+ vmovdqu YMMWORD [edx+3*SIZEOF_YMMWORD], ymm5 ; the intermediate data
+ vmovdqu YMMWORD [edi+2*SIZEOF_YMMWORD], ymm2
+ vmovdqu YMMWORD [edi+3*SIZEOF_YMMWORD], ymm6
+
+ vperm2i128 ymm1, ymm3, ymm1, 0x20
+ vpslldq ymm1, ymm1, 14 ; ymm1=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 0)
+ vperm2i128 ymm2, ymm3, ymm2, 0x20
+ vpslldq ymm2, ymm2, 14 ; ymm2=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 0)
+
+ vmovdqa YMMWORD [wk(2)], ymm1
+ vmovdqa YMMWORD [wk(3)], ymm2
+
+.upsample:
+ ; -- process the upper row
+
+ vmovdqu ymm7, YMMWORD [edx+0*SIZEOF_YMMWORD] ; ymm7=Int0L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vmovdqu ymm3, YMMWORD [edx+1*SIZEOF_YMMWORD] ; ymm3=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+ vpxor ymm1, ymm1, ymm1 ; ymm1=(all 0's)
+
+ vperm2i128 ymm0, ymm1, ymm7, 0x03
+ vpalignr ymm0, ymm0, ymm7, 2 ; ymm0=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 --)
+ vperm2i128 ymm4, ymm1, ymm3, 0x20
+ vpslldq ymm4, ymm4, 14 ; ymm4=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 16)
+
+ vperm2i128 ymm5, ymm1, ymm7, 0x03
+ vpsrldq ymm5, ymm5, 14 ; ymm5=(15 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+ vperm2i128 ymm6, ymm1, ymm3, 0x20
+ vpalignr ymm6, ymm3, ymm6, 14 ; ymm6=(-- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
+
+ vpor ymm0, ymm0, ymm4 ; ymm0=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16)
+ vpor ymm5, ymm5, ymm6 ; ymm5=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
+
+ vperm2i128 ymm2, ymm1, ymm3, 0x03
+ vpalignr ymm2, ymm2, ymm3, 2 ; ymm2=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 --)
+ vperm2i128 ymm4, ymm1, ymm3, 0x03
+ vpsrldq ymm4, ymm4, 14 ; ymm4=(31 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+ vperm2i128 ymm1, ymm1, ymm7, 0x20
+ vpalignr ymm1, ymm7, ymm1, 14 ; ymm1=(-- 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14)
+
+ vpor ymm1, ymm1, YMMWORD [wk(0)] ; ymm1=(-1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14)
+ vpor ymm2, ymm2, YMMWORD [wk(2)] ; ymm2=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)
+
+ vmovdqa YMMWORD [wk(0)], ymm4
+
+ vpmullw ymm7, ymm7, [GOTOFF(ebx,PW_THREE)]
+ vpmullw ymm3, ymm3, [GOTOFF(ebx,PW_THREE)]
+ vpaddw ymm1, ymm1, [GOTOFF(ebx,PW_EIGHT)]
+ vpaddw ymm5, ymm5, [GOTOFF(ebx,PW_EIGHT)]
+ vpaddw ymm0, ymm0, [GOTOFF(ebx,PW_SEVEN)]
+ vpaddw ymm2, ymm2, [GOTOFF(ebx,PW_SEVEN)]
+
+ vpaddw ymm1, ymm1, ymm7
+ vpaddw ymm5, ymm5, ymm3
+ vpsrlw ymm1, ymm1, 4 ; ymm1=Out0LE=( 0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30)
+ vpsrlw ymm5, ymm5, 4 ; ymm5=Out0HE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
+ vpaddw ymm0, ymm0, ymm7
+ vpaddw ymm2, ymm2, ymm3
+ vpsrlw ymm0, ymm0, 4 ; ymm0=Out0LO=( 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31)
+ vpsrlw ymm2, ymm2, 4 ; ymm2=Out0HO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)
+
+ vpsllw ymm0, ymm0, BYTE_BIT
+ vpsllw ymm2, ymm2, BYTE_BIT
+ vpor ymm1, ymm1, ymm0 ; ymm1=Out0L=( 0 1 2 ... 29 30 31)
+ vpor ymm5, ymm5, ymm2 ; ymm5=Out0H=(32 33 34 ... 61 62 63)
+
+ vmovdqu YMMWORD [edx+0*SIZEOF_YMMWORD], ymm1
+ vmovdqu YMMWORD [edx+1*SIZEOF_YMMWORD], ymm5
+
+ ; -- process the lower row
+
+ vmovdqu ymm6, YMMWORD [edi+0*SIZEOF_YMMWORD] ; ymm6=Int1L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vmovdqu ymm4, YMMWORD [edi+1*SIZEOF_YMMWORD] ; ymm4=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+ vpxor ymm1, ymm1, ymm1 ; ymm1=(all 0's)
+
+ vperm2i128 ymm7, ymm1, ymm6, 0x03
+ vpalignr ymm7, ymm7, ymm6, 2 ; ymm7=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 --)
+ vperm2i128 ymm3, ymm1, ymm4, 0x20
+ vpslldq ymm3, ymm3, 14 ; ymm3=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 16)
+
+ vperm2i128 ymm0, ymm1, ymm6, 0x03
+ vpsrldq ymm0, ymm0, 14 ; ymm0=(15 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+ vperm2i128 ymm2, ymm1, ymm4, 0x20
+ vpalignr ymm2, ymm4, ymm2, 14 ; ymm2=(-- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
+
+ vpor ymm7, ymm7, ymm3 ; ymm7=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16)
+ vpor ymm0, ymm0, ymm2 ; ymm0=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
+
+ vperm2i128 ymm5, ymm1, ymm4, 0x03
+ vpalignr ymm5, ymm5, ymm4, 2 ; ymm5=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 --)
+ vperm2i128 ymm3, ymm1, ymm4, 0x03
+ vpsrldq ymm3, ymm3, 14 ; ymm3=(31 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+ vperm2i128 ymm1, ymm1, ymm6, 0x20
+ vpalignr ymm1, ymm6, ymm1, 14 ; ymm1=(-- 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14)
+
+ vpor ymm1, ymm1, YMMWORD [wk(1)] ; ymm1=(-1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14)
+ vpor ymm5, ymm5, YMMWORD [wk(3)] ; ymm5=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)
+
+ vmovdqa YMMWORD [wk(1)], ymm3
+
+ vpmullw ymm6, ymm6, [GOTOFF(ebx,PW_THREE)]
+ vpmullw ymm4, ymm4, [GOTOFF(ebx,PW_THREE)]
+ vpaddw ymm1, ymm1, [GOTOFF(ebx,PW_EIGHT)]
+ vpaddw ymm0, ymm0, [GOTOFF(ebx,PW_EIGHT)]
+ vpaddw ymm7, ymm7, [GOTOFF(ebx,PW_SEVEN)]
+ vpaddw ymm5, ymm5, [GOTOFF(ebx,PW_SEVEN)]
+
+ vpaddw ymm1, ymm1, ymm6
+ vpaddw ymm0, ymm0, ymm4
+ vpsrlw ymm1, ymm1, 4 ; ymm1=Out1LE=( 0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30)
+ vpsrlw ymm0, ymm0, 4 ; ymm0=Out1HE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
+ vpaddw ymm7, ymm7, ymm6
+ vpaddw ymm5, ymm5, ymm4
+ vpsrlw ymm7, ymm7, 4 ; ymm7=Out1LO=( 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31)
+ vpsrlw ymm5, ymm5, 4 ; ymm5=Out1HO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)
+
+ vpsllw ymm7, ymm7, BYTE_BIT
+ vpsllw ymm5, ymm5, BYTE_BIT
+ vpor ymm1, ymm1, ymm7 ; ymm1=Out1L=( 0 1 2 ... 29 30 31)
+ vpor ymm0, ymm0, ymm5 ; ymm0=Out1H=(32 33 34 ... 61 62 63)
+
+ vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymm1
+ vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymm0
+
+ poppic ebx
+
+ sub eax, byte SIZEOF_YMMWORD
+ add ecx, byte 1*SIZEOF_YMMWORD ; inptr1(above)
+ add ebx, byte 1*SIZEOF_YMMWORD ; inptr0
+ add esi, byte 1*SIZEOF_YMMWORD ; inptr1(below)
+ add edx, byte 2*SIZEOF_YMMWORD ; outptr0
+ add edi, byte 2*SIZEOF_YMMWORD ; outptr1
+ cmp eax, byte SIZEOF_YMMWORD
+ ja near .columnloop
+ test eax, eax
+ jnz near .columnloop_last
+
+ pop esi
+ pop edi
+ pop ecx
+ pop eax
+
+ add esi, byte 1*SIZEOF_JSAMPROW ; input_data
+ add edi, byte 2*SIZEOF_JSAMPROW ; output_data
+ sub ecx, byte 2 ; rowctr
+ jg near .rowloop
+
+.return:
+ vzeroupper
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
+; It's still a box filter.
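+; Each input sample is simply replicated (out[2*i] = out[2*i+1] = s[i]),
+; which the code below implements by unpacking a register with itself.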
+;
+; GLOBAL(void)
+; jsimd_h2v1_upsample_avx2(int max_v_samp_factor, JDIMENSION output_width,
+; JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+;
+
+%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor
+%define output_width(b) (b) + 12 ; JDIMENSION output_width
+%define input_data(b) (b) + 16 ; JSAMPARRAY input_data
+%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v1_upsample_avx2)
+
+EXTN(jsimd_h2v1_upsample_avx2):
+ push ebp
+ mov ebp, esp
+; push ebx ; unused
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov edx, JDIMENSION [output_width(ebp)]
+ add edx, byte (SIZEOF_YMMWORD-1)
+ and edx, -SIZEOF_YMMWORD
+ jz short .return
+
+ mov ecx, INT [max_v_samp(ebp)] ; rowctr
+ test ecx, ecx
+ jz short .return
+
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ mov edi, POINTER [output_data_ptr(ebp)]
+ mov edi, JSAMPARRAY [edi] ; output_data
+ alignx 16, 7
+.rowloop:
+ push edi
+ push esi
+
+ mov esi, JSAMPROW [esi] ; inptr
+ mov edi, JSAMPROW [edi] ; outptr
+ mov eax, edx ; colctr
+ alignx 16, 7
+.columnloop:
+
+ cmp eax, byte SIZEOF_YMMWORD
+ ja near .above_16
+
+ vmovdqu xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ vpunpckhbw xmm1, xmm0, xmm0
+ vpunpcklbw xmm0, xmm0, xmm0
+
+ vmovdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
+ vmovdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
+
+ jmp short .nextrow
+
+.above_16:
+ vmovdqu ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
+
+ vpermq ymm0, ymm0, 0xd8
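+ ; (0xd8 swaps the middle 64-bit quarters so that the in-lane
+ ; unpacks below leave the doubled samples in sequential order)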
+ vpunpckhbw ymm1, ymm0, ymm0
+ vpunpcklbw ymm0, ymm0, ymm0
+
+ vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0
+ vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymm1
+
+ sub eax, byte 2*SIZEOF_YMMWORD
+ jz short .nextrow
+
+ add esi, byte SIZEOF_YMMWORD ; inptr
+ add edi, byte 2*SIZEOF_YMMWORD ; outptr
+ jmp short .columnloop
+ alignx 16, 7
+
+.nextrow:
+ pop esi
+ pop edi
+
+ add esi, byte SIZEOF_JSAMPROW ; input_data
+ add edi, byte SIZEOF_JSAMPROW ; output_data
+ dec ecx ; rowctr
+ jg short .rowloop
+
+.return:
+ vzeroupper
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+; pop ebx ; unused
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v2_upsample_avx2(int max_v_samp_factor, JDIMENSION output_width,
+; JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+;
+
+%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor
+%define output_width(b) (b) + 12 ; JDIMENSION output_width
+%define input_data(b) (b) + 16 ; JSAMPARRAY input_data
+%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v2_upsample_avx2)
+
+EXTN(jsimd_h2v2_upsample_avx2):
+ push ebp
+ mov ebp, esp
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov edx, JDIMENSION [output_width(ebp)]
+ add edx, byte (SIZEOF_YMMWORD-1)
+ and edx, -SIZEOF_YMMWORD
+ jz near .return
+
+ mov ecx, INT [max_v_samp(ebp)] ; rowctr
+ test ecx, ecx
+ jz near .return
+
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ mov edi, POINTER [output_data_ptr(ebp)]
+ mov edi, JSAMPARRAY [edi] ; output_data
+ alignx 16, 7
+.rowloop:
+ push edi
+ push esi
+
+ mov esi, JSAMPROW [esi] ; inptr
+ mov ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0
+ mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1
+ mov eax, edx ; colctr
+ alignx 16, 7
+.columnloop:
+
+ cmp eax, byte SIZEOF_YMMWORD
+ ja short .above_16
+
+ vmovdqu xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ vpunpckhbw xmm1, xmm0, xmm0
+ vpunpcklbw xmm0, xmm0, xmm0
+
+ vmovdqu XMMWORD [ebx+0*SIZEOF_XMMWORD], xmm0
+ vmovdqu XMMWORD [ebx+1*SIZEOF_XMMWORD], xmm1
+ vmovdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
+ vmovdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
+
+ jmp near .nextrow
+
+.above_16:
+ vmovdqu ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
+
+ vpermq ymm0, ymm0, 0xd8
+ vpunpckhbw ymm1, ymm0, ymm0
+ vpunpcklbw ymm0, ymm0, ymm0
+
+ vmovdqu YMMWORD [ebx+0*SIZEOF_YMMWORD], ymm0
+ vmovdqu YMMWORD [ebx+1*SIZEOF_YMMWORD], ymm1
+ vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0
+ vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymm1
+
+ sub eax, byte 2*SIZEOF_YMMWORD
+ jz short .nextrow
+
+ add esi, byte SIZEOF_YMMWORD ; inptr
+ add ebx, 2*SIZEOF_YMMWORD ; outptr0
+ add edi, 2*SIZEOF_YMMWORD ; outptr1
+ jmp short .columnloop
+ alignx 16, 7
+
+.nextrow:
+ pop esi
+ pop edi
+
+ add esi, byte 1*SIZEOF_JSAMPROW ; input_data
+ add edi, byte 2*SIZEOF_JSAMPROW ; output_data
+ sub ecx, byte 2 ; rowctr
+ jg near .rowloop
+
+.return:
+ vzeroupper
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jdsample-mmx.asm b/media/libjpeg/simd/i386/jdsample-mmx.asm
new file mode 100644
index 0000000000..12c49f0eab
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdsample-mmx.asm
@@ -0,0 +1,731 @@
+;
+; jdsample.asm - upsampling (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and can
+; *not* be assembled with Microsoft's MASM or any compatible assembler
+; (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_fancy_upsample_mmx)
+
+EXTN(jconst_fancy_upsample_mmx):
+
+PW_ONE times 4 dw 1
+PW_TWO times 4 dw 2
+PW_THREE times 4 dw 3
+PW_SEVEN times 4 dw 7
+PW_EIGHT times 4 dw 8
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
+;
+; The upsampling algorithm is linear interpolation between pixel centers,
+; also known as a "triangle filter". This is a good compromise between
+; speed and visual quality. The centers of the output pixels are 1/4 and 3/4
+; of the way between input pixel centers.
+;
+; GLOBAL(void)
+; jsimd_h2v1_fancy_upsample_mmx(int max_v_samp_factor,
+; JDIMENSION downsampled_width,
+; JSAMPARRAY input_data,
+; JSAMPARRAY *output_data_ptr);
+;
+
+%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor
+%define downsamp_width(b) (b) + 12 ; JDIMENSION downsampled_width
+%define input_data(b) (b) + 16 ; JSAMPARRAY input_data
+%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_mmx)
+
+EXTN(jsimd_h2v1_fancy_upsample_mmx):
+ push ebp
+ mov ebp, esp
+ pushpic ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+
+ mov eax, JDIMENSION [downsamp_width(ebp)] ; colctr
+ test eax, eax
+ jz near .return
+
+ mov ecx, INT [max_v_samp(ebp)] ; rowctr
+ test ecx, ecx
+ jz near .return
+
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ mov edi, POINTER [output_data_ptr(ebp)]
+ mov edi, JSAMPARRAY [edi] ; output_data
+ alignx 16, 7
+.rowloop:
+ push eax ; colctr
+ push edi
+ push esi
+
+ mov esi, JSAMPROW [esi] ; inptr
+ mov edi, JSAMPROW [edi] ; outptr
+
+ test eax, SIZEOF_MMWORD-1
+ jz short .skip
+ mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
+ mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
+.skip:
+ pxor mm0, mm0 ; mm0=(all 0's)
+ pcmpeqb mm7, mm7
+ psrlq mm7, (SIZEOF_MMWORD-1)*BYTE_BIT
+ pand mm7, MMWORD [esi+0*SIZEOF_MMWORD]
+
+ add eax, byte SIZEOF_MMWORD-1
+ and eax, byte -SIZEOF_MMWORD
+ cmp eax, byte SIZEOF_MMWORD
+ ja short .columnloop
+ alignx 16, 7
+
+.columnloop_last:
+ pcmpeqb mm6, mm6
+ psllq mm6, (SIZEOF_MMWORD-1)*BYTE_BIT
+ pand mm6, MMWORD [esi+0*SIZEOF_MMWORD]
+ jmp short .upsample
+ alignx 16, 7
+
+.columnloop:
+ movq mm6, MMWORD [esi+1*SIZEOF_MMWORD]
+ psllq mm6, (SIZEOF_MMWORD-1)*BYTE_BIT
+
+.upsample:
+ movq mm1, MMWORD [esi+0*SIZEOF_MMWORD]
+ movq mm2, mm1
+ movq mm3, mm1 ; mm1=( 0 1 2 3 4 5 6 7)
+ psllq mm2, BYTE_BIT ; mm2=( - 0 1 2 3 4 5 6)
+ psrlq mm3, BYTE_BIT ; mm3=( 1 2 3 4 5 6 7 -)
+
+ por mm2, mm7 ; mm2=(-1 0 1 2 3 4 5 6)
+ por mm3, mm6 ; mm3=( 1 2 3 4 5 6 7 8)
+
+ movq mm7, mm1
+ psrlq mm7, (SIZEOF_MMWORD-1)*BYTE_BIT ; mm7=( 7 - - - - - - -)
+
+ movq mm4, mm1
+ punpcklbw mm1, mm0 ; mm1=( 0 1 2 3)
+ punpckhbw mm4, mm0 ; mm4=( 4 5 6 7)
+ movq mm5, mm2
+ punpcklbw mm2, mm0 ; mm2=(-1 0 1 2)
+ punpckhbw mm5, mm0 ; mm5=( 3 4 5 6)
+ movq mm6, mm3
+ punpcklbw mm3, mm0 ; mm3=( 1 2 3 4)
+ punpckhbw mm6, mm0 ; mm6=( 5 6 7 8)
+
+ pmullw mm1, [GOTOFF(ebx,PW_THREE)]
+ pmullw mm4, [GOTOFF(ebx,PW_THREE)]
+ paddw mm2, [GOTOFF(ebx,PW_ONE)]
+ paddw mm5, [GOTOFF(ebx,PW_ONE)]
+ paddw mm3, [GOTOFF(ebx,PW_TWO)]
+ paddw mm6, [GOTOFF(ebx,PW_TWO)]
+
+ paddw mm2, mm1
+ paddw mm5, mm4
+ psrlw mm2, 2 ; mm2=OutLE=( 0 2 4 6)
+ psrlw mm5, 2 ; mm5=OutHE=( 8 10 12 14)
+ paddw mm3, mm1
+ paddw mm6, mm4
+ psrlw mm3, 2 ; mm3=OutLO=( 1 3 5 7)
+ psrlw mm6, 2 ; mm6=OutHO=( 9 11 13 15)
+
+ psllw mm3, BYTE_BIT
+ psllw mm6, BYTE_BIT
+ por mm2, mm3 ; mm2=OutL=( 0 1 2 3 4 5 6 7)
+ por mm5, mm6 ; mm5=OutH=( 8 9 10 11 12 13 14 15)
+
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mm2
+ movq MMWORD [edi+1*SIZEOF_MMWORD], mm5
+
+ sub eax, byte SIZEOF_MMWORD
+ add esi, byte 1*SIZEOF_MMWORD ; inptr
+ add edi, byte 2*SIZEOF_MMWORD ; outptr
+ cmp eax, byte SIZEOF_MMWORD
+ ja near .columnloop
+ test eax, eax
+ jnz near .columnloop_last
+
+ pop esi
+ pop edi
+ pop eax
+
+ add esi, byte SIZEOF_JSAMPROW ; input_data
+ add edi, byte SIZEOF_JSAMPROW ; output_data
+ dec ecx ; rowctr
+ jg near .rowloop
+
+ emms ; empty MMX state
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ poppic ebx
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
+; Again a triangle filter; see comments for h2v1 case, above.
+;
+; GLOBAL(void)
+; jsimd_h2v2_fancy_upsample_mmx(int max_v_samp_factor,
+; JDIMENSION downsampled_width,
+; JSAMPARRAY input_data,
+; JSAMPARRAY *output_data_ptr);
+;
+
+%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor
+%define downsamp_width(b) (b) + 12 ; JDIMENSION downsampled_width
+%define input_data(b) (b) + 16 ; JSAMPARRAY input_data
+%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr
+
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD ; mmword wk[WK_NUM]
+%define WK_NUM 4
+%define gotptr wk(0) - SIZEOF_POINTER ; void *gotptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_mmx)
+
+EXTN(jsimd_h2v2_fancy_upsample_mmx):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)]
+ pushpic eax ; make a room for GOT address
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+ movpic POINTER [gotptr], ebx ; save GOT address
+
+ mov edx, eax ; edx = original ebp
+ mov eax, JDIMENSION [downsamp_width(edx)] ; colctr
+ test eax, eax
+ jz near .return
+
+ mov ecx, INT [max_v_samp(edx)] ; rowctr
+ test ecx, ecx
+ jz near .return
+
+ mov esi, JSAMPARRAY [input_data(edx)] ; input_data
+ mov edi, POINTER [output_data_ptr(edx)]
+ mov edi, JSAMPARRAY [edi] ; output_data
+ alignx 16, 7
+.rowloop:
+ push eax ; colctr
+ push ecx
+ push edi
+ push esi
+
+ mov ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW] ; inptr1(above)
+ mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0
+ mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1(below)
+ mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0
+ mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1
+
+ test eax, SIZEOF_MMWORD-1
+ jz short .skip
+ push edx
+ mov dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
+ mov JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
+ mov dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
+ mov JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
+ mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
+ mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
+ pop edx
+.skip:
+ ; -- process the first column block
+
+ movq mm0, MMWORD [ebx+0*SIZEOF_MMWORD] ; mm0=row[ 0][0]
+ movq mm1, MMWORD [ecx+0*SIZEOF_MMWORD] ; mm1=row[-1][0]
+ movq mm2, MMWORD [esi+0*SIZEOF_MMWORD] ; mm2=row[+1][0]
+
+ pushpic ebx
+ movpic ebx, POINTER [gotptr] ; load GOT address
+
+ pxor mm3, mm3 ; mm3=(all 0's)
+ movq mm4, mm0
+ punpcklbw mm0, mm3 ; mm0=row[ 0][0]( 0 1 2 3)
+ punpckhbw mm4, mm3 ; mm4=row[ 0][0]( 4 5 6 7)
+ movq mm5, mm1
+ punpcklbw mm1, mm3 ; mm1=row[-1][0]( 0 1 2 3)
+ punpckhbw mm5, mm3 ; mm5=row[-1][0]( 4 5 6 7)
+ movq mm6, mm2
+ punpcklbw mm2, mm3 ; mm2=row[+1][0]( 0 1 2 3)
+ punpckhbw mm6, mm3 ; mm6=row[+1][0]( 4 5 6 7)
+
+ pmullw mm0, [GOTOFF(ebx,PW_THREE)]
+ pmullw mm4, [GOTOFF(ebx,PW_THREE)]
+
+ pcmpeqb mm7, mm7
+ psrlq mm7, (SIZEOF_MMWORD-2)*BYTE_BIT
+
+ paddw mm1, mm0 ; mm1=Int0L=( 0 1 2 3)
+ paddw mm5, mm4 ; mm5=Int0H=( 4 5 6 7)
+ paddw mm2, mm0 ; mm2=Int1L=( 0 1 2 3)
+ paddw mm6, mm4 ; mm6=Int1H=( 4 5 6 7)
+
+ movq MMWORD [edx+0*SIZEOF_MMWORD], mm1 ; temporarily save
+ movq MMWORD [edx+1*SIZEOF_MMWORD], mm5 ; the intermediate data
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mm2
+ movq MMWORD [edi+1*SIZEOF_MMWORD], mm6
+
+ pand mm1, mm7 ; mm1=( 0 - - -)
+ pand mm2, mm7 ; mm2=( 0 - - -)
+
+ movq MMWORD [wk(0)], mm1
+ movq MMWORD [wk(1)], mm2
+
+ poppic ebx
+
+ add eax, byte SIZEOF_MMWORD-1
+ and eax, byte -SIZEOF_MMWORD
+ cmp eax, byte SIZEOF_MMWORD
+ ja short .columnloop
+ alignx 16, 7
+
+.columnloop_last:
+ ; -- process the last column block
+
+ pushpic ebx
+ movpic ebx, POINTER [gotptr] ; load GOT address
+
+ pcmpeqb mm1, mm1
+ psllq mm1, (SIZEOF_MMWORD-2)*BYTE_BIT
+ movq mm2, mm1
+
+ pand mm1, MMWORD [edx+1*SIZEOF_MMWORD] ; mm1=( - - - 7)
+ pand mm2, MMWORD [edi+1*SIZEOF_MMWORD] ; mm2=( - - - 7)
+
+ movq MMWORD [wk(2)], mm1
+ movq MMWORD [wk(3)], mm2
+
+ jmp short .upsample
+ alignx 16, 7
+
+.columnloop:
+ ; -- process the next column block
+
+ movq mm0, MMWORD [ebx+1*SIZEOF_MMWORD] ; mm0=row[ 0][1]
+ movq mm1, MMWORD [ecx+1*SIZEOF_MMWORD] ; mm1=row[-1][1]
+ movq mm2, MMWORD [esi+1*SIZEOF_MMWORD] ; mm2=row[+1][1]
+
+ pushpic ebx
+ movpic ebx, POINTER [gotptr] ; load GOT address
+
+ pxor mm3, mm3 ; mm3=(all 0's)
+ movq mm4, mm0
+ punpcklbw mm0, mm3 ; mm0=row[ 0][1]( 0 1 2 3)
+ punpckhbw mm4, mm3 ; mm4=row[ 0][1]( 4 5 6 7)
+ movq mm5, mm1
+ punpcklbw mm1, mm3 ; mm1=row[-1][1]( 0 1 2 3)
+ punpckhbw mm5, mm3 ; mm5=row[-1][1]( 4 5 6 7)
+ movq mm6, mm2
+ punpcklbw mm2, mm3 ; mm2=row[+1][1]( 0 1 2 3)
+ punpckhbw mm6, mm3 ; mm6=row[+1][1]( 4 5 6 7)
+
+ pmullw mm0, [GOTOFF(ebx,PW_THREE)]
+ pmullw mm4, [GOTOFF(ebx,PW_THREE)]
+
+ paddw mm1, mm0 ; mm1=Int0L=( 0 1 2 3)
+ paddw mm5, mm4 ; mm5=Int0H=( 4 5 6 7)
+ paddw mm2, mm0 ; mm2=Int1L=( 0 1 2 3)
+ paddw mm6, mm4 ; mm6=Int1H=( 4 5 6 7)
+
+ movq MMWORD [edx+2*SIZEOF_MMWORD], mm1 ; temporarily save
+ movq MMWORD [edx+3*SIZEOF_MMWORD], mm5 ; the intermediate data
+ movq MMWORD [edi+2*SIZEOF_MMWORD], mm2
+ movq MMWORD [edi+3*SIZEOF_MMWORD], mm6
+
+ psllq mm1, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm1=( - - - 0)
+ psllq mm2, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm2=( - - - 0)
+
+ movq MMWORD [wk(2)], mm1
+ movq MMWORD [wk(3)], mm2
+
+.upsample:
+ ; -- process the upper row
+
+ movq mm7, MMWORD [edx+0*SIZEOF_MMWORD] ; mm7=Int0L=( 0 1 2 3)
+ movq mm3, MMWORD [edx+1*SIZEOF_MMWORD] ; mm3=Int0H=( 4 5 6 7)
+
+ movq mm0, mm7
+ movq mm4, mm3
+ psrlq mm0, 2*BYTE_BIT ; mm0=( 1 2 3 -)
+ psllq mm4, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm4=( - - - 4)
+ movq mm5, mm7
+ movq mm6, mm3
+ psrlq mm5, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm5=( 3 - - -)
+ psllq mm6, 2*BYTE_BIT ; mm6=( - 4 5 6)
+
+ por mm0, mm4 ; mm0=( 1 2 3 4)
+ por mm5, mm6 ; mm5=( 3 4 5 6)
+
+ movq mm1, mm7
+ movq mm2, mm3
+ psllq mm1, 2*BYTE_BIT ; mm1=( - 0 1 2)
+ psrlq mm2, 2*BYTE_BIT ; mm2=( 5 6 7 -)
+ movq mm4, mm3
+ psrlq mm4, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm4=( 7 - - -)
+
+ por mm1, MMWORD [wk(0)] ; mm1=(-1 0 1 2)
+ por mm2, MMWORD [wk(2)] ; mm2=( 5 6 7 8)
+
+ movq MMWORD [wk(0)], mm4
+
+ pmullw mm7, [GOTOFF(ebx,PW_THREE)]
+ pmullw mm3, [GOTOFF(ebx,PW_THREE)]
+ paddw mm1, [GOTOFF(ebx,PW_EIGHT)]
+ paddw mm5, [GOTOFF(ebx,PW_EIGHT)]
+ paddw mm0, [GOTOFF(ebx,PW_SEVEN)]
+ paddw mm2, [GOTOFF(ebx,PW_SEVEN)]
+
+ paddw mm1, mm7
+ paddw mm5, mm3
+ psrlw mm1, 4 ; mm1=Out0LE=( 0 2 4 6)
+ psrlw mm5, 4 ; mm5=Out0HE=( 8 10 12 14)
+ paddw mm0, mm7
+ paddw mm2, mm3
+ psrlw mm0, 4 ; mm0=Out0LO=( 1 3 5 7)
+ psrlw mm2, 4 ; mm2=Out0HO=( 9 11 13 15)
+
+ psllw mm0, BYTE_BIT
+ psllw mm2, BYTE_BIT
+ por mm1, mm0 ; mm1=Out0L=( 0 1 2 3 4 5 6 7)
+ por mm5, mm2 ; mm5=Out0H=( 8 9 10 11 12 13 14 15)
+
+ movq MMWORD [edx+0*SIZEOF_MMWORD], mm1
+ movq MMWORD [edx+1*SIZEOF_MMWORD], mm5
+
+ ; -- process the lower row
+
+ movq mm6, MMWORD [edi+0*SIZEOF_MMWORD] ; mm6=Int1L=( 0 1 2 3)
+ movq mm4, MMWORD [edi+1*SIZEOF_MMWORD] ; mm4=Int1H=( 4 5 6 7)
+
+ movq mm7, mm6
+ movq mm3, mm4
+ psrlq mm7, 2*BYTE_BIT ; mm7=( 1 2 3 -)
+ psllq mm3, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm3=( - - - 4)
+ movq mm0, mm6
+ movq mm2, mm4
+ psrlq mm0, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm0=( 3 - - -)
+ psllq mm2, 2*BYTE_BIT ; mm2=( - 4 5 6)
+
+ por mm7, mm3 ; mm7=( 1 2 3 4)
+ por mm0, mm2 ; mm0=( 3 4 5 6)
+
+ movq mm1, mm6
+ movq mm5, mm4
+ psllq mm1, 2*BYTE_BIT ; mm1=( - 0 1 2)
+ psrlq mm5, 2*BYTE_BIT ; mm5=( 5 6 7 -)
+ movq mm3, mm4
+ psrlq mm3, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm3=( 7 - - -)
+
+ por mm1, MMWORD [wk(1)] ; mm1=(-1 0 1 2)
+ por mm5, MMWORD [wk(3)] ; mm5=( 5 6 7 8)
+
+ movq MMWORD [wk(1)], mm3
+
+ pmullw mm6, [GOTOFF(ebx,PW_THREE)]
+ pmullw mm4, [GOTOFF(ebx,PW_THREE)]
+ paddw mm1, [GOTOFF(ebx,PW_EIGHT)]
+ paddw mm0, [GOTOFF(ebx,PW_EIGHT)]
+ paddw mm7, [GOTOFF(ebx,PW_SEVEN)]
+ paddw mm5, [GOTOFF(ebx,PW_SEVEN)]
+
+ paddw mm1, mm6
+ paddw mm0, mm4
+ psrlw mm1, 4 ; mm1=Out1LE=( 0 2 4 6)
+ psrlw mm0, 4 ; mm0=Out1HE=( 8 10 12 14)
+ paddw mm7, mm6
+ paddw mm5, mm4
+ psrlw mm7, 4 ; mm7=Out1LO=( 1 3 5 7)
+ psrlw mm5, 4 ; mm5=Out1HO=( 9 11 13 15)
+
+ psllw mm7, BYTE_BIT
+ psllw mm5, BYTE_BIT
+ por mm1, mm7 ; mm1=Out1L=( 0 1 2 3 4 5 6 7)
+ por mm0, mm5 ; mm0=Out1H=( 8 9 10 11 12 13 14 15)
+
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mm1
+ movq MMWORD [edi+1*SIZEOF_MMWORD], mm0
+
+ poppic ebx
+
+ sub eax, byte SIZEOF_MMWORD
+ add ecx, byte 1*SIZEOF_MMWORD ; inptr1(above)
+ add ebx, byte 1*SIZEOF_MMWORD ; inptr0
+ add esi, byte 1*SIZEOF_MMWORD ; inptr1(below)
+ add edx, byte 2*SIZEOF_MMWORD ; outptr0
+ add edi, byte 2*SIZEOF_MMWORD ; outptr1
+ cmp eax, byte SIZEOF_MMWORD
+ ja near .columnloop
+ test eax, eax
+ jnz near .columnloop_last
+
+ pop esi
+ pop edi
+ pop ecx
+ pop eax
+
+ add esi, byte 1*SIZEOF_JSAMPROW ; input_data
+ add edi, byte 2*SIZEOF_JSAMPROW ; output_data
+ sub ecx, byte 2 ; rowctr
+ jg near .rowloop
+
+ emms ; empty MMX state
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v1_upsample_mmx(int max_v_samp_factor, JDIMENSION output_width,
+; JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+;
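+; In scalar terms this is plain sample doubling (a sketch mirroring the
+; reference implementation in jdsample.c):
+;
+;   out[2*i] = out[2*i + 1] = in[i];
+;
+; punpcklbw/punpckhbw with both operands set to the same register perform
+; this duplication for eight samples at a time.
+;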
+
+%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor
+%define output_width(b) (b) + 12 ; JDIMENSION output_width
+%define input_data(b) (b) + 16 ; JSAMPARRAY input_data
+%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v1_upsample_mmx)
+
+EXTN(jsimd_h2v1_upsample_mmx):
+ push ebp
+ mov ebp, esp
+; push ebx ; unused
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov edx, JDIMENSION [output_width(ebp)]
+ add edx, byte (2*SIZEOF_MMWORD)-1
+ and edx, byte -(2*SIZEOF_MMWORD)
+ jz short .return
+
+ mov ecx, INT [max_v_samp(ebp)] ; rowctr
+ test ecx, ecx
+ jz short .return
+
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ mov edi, POINTER [output_data_ptr(ebp)]
+ mov edi, JSAMPARRAY [edi] ; output_data
+ alignx 16, 7
+.rowloop:
+ push edi
+ push esi
+
+ mov esi, JSAMPROW [esi] ; inptr
+ mov edi, JSAMPROW [edi] ; outptr
+ mov eax, edx ; colctr
+ alignx 16, 7
+.columnloop:
+
+ movq mm0, MMWORD [esi+0*SIZEOF_MMWORD]
+
+ movq mm1, mm0
+ punpcklbw mm0, mm0
+ punpckhbw mm1, mm1
+
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mm0
+ movq MMWORD [edi+1*SIZEOF_MMWORD], mm1
+
+ sub eax, byte 2*SIZEOF_MMWORD
+ jz short .nextrow
+
+ movq mm2, MMWORD [esi+1*SIZEOF_MMWORD]
+
+ movq mm3, mm2
+ punpcklbw mm2, mm2
+ punpckhbw mm3, mm3
+
+ movq MMWORD [edi+2*SIZEOF_MMWORD], mm2
+ movq MMWORD [edi+3*SIZEOF_MMWORD], mm3
+
+ sub eax, byte 2*SIZEOF_MMWORD
+ jz short .nextrow
+
+ add esi, byte 2*SIZEOF_MMWORD ; inptr
+ add edi, byte 4*SIZEOF_MMWORD ; outptr
+ jmp short .columnloop
+ alignx 16, 7
+
+.nextrow:
+ pop esi
+ pop edi
+
+ add esi, byte SIZEOF_JSAMPROW ; input_data
+ add edi, byte SIZEOF_JSAMPROW ; output_data
+ dec ecx ; rowctr
+ jg short .rowloop
+
+ emms ; empty MMX state
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+; pop ebx ; unused
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v2_upsample_mmx(int max_v_samp_factor, JDIMENSION output_width,
+; JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+;
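+; Identical to the h2v1 doubling above, except that each doubled row is
+; stored to two consecutive output rows (a sketch):
+;
+;   out0[2*i] = out0[2*i + 1] = in[i];
+;   out1[2*i] = out1[2*i + 1] = in[i];
+;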
+
+%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor
+%define output_width(b) (b) + 12 ; JDIMENSION output_width
+%define input_data(b) (b) + 16 ; JSAMPARRAY input_data
+%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v2_upsample_mmx)
+
+EXTN(jsimd_h2v2_upsample_mmx):
+ push ebp
+ mov ebp, esp
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov edx, JDIMENSION [output_width(ebp)]
+ add edx, byte (2*SIZEOF_MMWORD)-1
+ and edx, byte -(2*SIZEOF_MMWORD)
+ jz near .return
+
+ mov ecx, INT [max_v_samp(ebp)] ; rowctr
+ test ecx, ecx
+ jz short .return
+
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ mov edi, POINTER [output_data_ptr(ebp)]
+ mov edi, JSAMPARRAY [edi] ; output_data
+ alignx 16, 7
+.rowloop:
+ push edi
+ push esi
+
+ mov esi, JSAMPROW [esi] ; inptr
+ mov ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0
+ mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1
+ mov eax, edx ; colctr
+ alignx 16, 7
+.columnloop:
+
+ movq mm0, MMWORD [esi+0*SIZEOF_MMWORD]
+
+ movq mm1, mm0
+ punpcklbw mm0, mm0
+ punpckhbw mm1, mm1
+
+ movq MMWORD [ebx+0*SIZEOF_MMWORD], mm0
+ movq MMWORD [ebx+1*SIZEOF_MMWORD], mm1
+ movq MMWORD [edi+0*SIZEOF_MMWORD], mm0
+ movq MMWORD [edi+1*SIZEOF_MMWORD], mm1
+
+ sub eax, byte 2*SIZEOF_MMWORD
+ jz short .nextrow
+
+ movq mm2, MMWORD [esi+1*SIZEOF_MMWORD]
+
+ movq mm3, mm2
+ punpcklbw mm2, mm2
+ punpckhbw mm3, mm3
+
+ movq MMWORD [ebx+2*SIZEOF_MMWORD], mm2
+ movq MMWORD [ebx+3*SIZEOF_MMWORD], mm3
+ movq MMWORD [edi+2*SIZEOF_MMWORD], mm2
+ movq MMWORD [edi+3*SIZEOF_MMWORD], mm3
+
+ sub eax, byte 2*SIZEOF_MMWORD
+ jz short .nextrow
+
+ add esi, byte 2*SIZEOF_MMWORD ; inptr
+ add ebx, byte 4*SIZEOF_MMWORD ; outptr0
+ add edi, byte 4*SIZEOF_MMWORD ; outptr1
+ jmp short .columnloop
+ alignx 16, 7
+
+.nextrow:
+ pop esi
+ pop edi
+
+ add esi, byte 1*SIZEOF_JSAMPROW ; input_data
+ add edi, byte 2*SIZEOF_JSAMPROW ; output_data
+ sub ecx, byte 2 ; rowctr
+ jg short .rowloop
+
+ emms ; empty MMX state
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jdsample-sse2.asm b/media/libjpeg/simd/i386/jdsample-sse2.asm
new file mode 100644
index 0000000000..4e28d2f4b8
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdsample-sse2.asm
@@ -0,0 +1,724 @@
+;
+; jdsample.asm - upsampling (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_fancy_upsample_sse2)
+
+EXTN(jconst_fancy_upsample_sse2):
+
+PW_ONE times 8 dw 1
+PW_TWO times 8 dw 2
+PW_THREE times 8 dw 3
+PW_SEVEN times 8 dw 7
+PW_EIGHT times 8 dw 8
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
+;
+; The upsampling algorithm is linear interpolation between pixel centers,
+; also known as a "triangle filter". This is a good compromise between
+; speed and visual quality. The centers of the output pixels are 1/4 and 3/4
+; of the way between input pixel centers.
+;
+; GLOBAL(void)
+; jsimd_h2v1_fancy_upsample_sse2(int max_v_samp_factor,
+; JDIMENSION downsampled_width,
+; JSAMPARRAY input_data,
+; JSAMPARRAY *output_data_ptr);
+;
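+; In scalar terms, each input sample yields two output samples (a sketch
+; mirroring the reference implementation in jdsample.c):
+;
+;   out[2*i]     = (3 * in[i] + in[i-1] + 1) >> 2;  /* 3/4 this + 1/4 left */
+;   out[2*i + 1] = (3 * in[i] + in[i+1] + 2) >> 2;  /* 3/4 this + 1/4 right */
+;
+; with the first and last input samples replicated at the row edges.
+;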
+
+%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor
+%define downsamp_width(b) (b) + 12 ; JDIMENSION downsampled_width
+%define input_data(b) (b) + 16 ; JSAMPARRAY input_data
+%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_sse2)
+
+EXTN(jsimd_h2v1_fancy_upsample_sse2):
+ push ebp
+ mov ebp, esp
+ pushpic ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+
+ mov eax, JDIMENSION [downsamp_width(ebp)] ; colctr
+ test eax, eax
+ jz near .return
+
+ mov ecx, INT [max_v_samp(ebp)] ; rowctr
+ test ecx, ecx
+ jz near .return
+
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ mov edi, POINTER [output_data_ptr(ebp)]
+ mov edi, JSAMPARRAY [edi] ; output_data
+ alignx 16, 7
+.rowloop:
+ push eax ; colctr
+ push edi
+ push esi
+
+ mov esi, JSAMPROW [esi] ; inptr
+ mov edi, JSAMPROW [edi] ; outptr
+
+ test eax, SIZEOF_XMMWORD-1
+ jz short .skip
+ mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
+ mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
+.skip:
+ pxor xmm0, xmm0 ; xmm0=(all 0's)
+ pcmpeqb xmm7, xmm7
+ psrldq xmm7, (SIZEOF_XMMWORD-1)
+ pand xmm7, XMMWORD [esi+0*SIZEOF_XMMWORD]
+
+ add eax, byte SIZEOF_XMMWORD-1
+ and eax, byte -SIZEOF_XMMWORD
+ cmp eax, byte SIZEOF_XMMWORD
+ ja short .columnloop
+ alignx 16, 7
+
+.columnloop_last:
+ pcmpeqb xmm6, xmm6
+ pslldq xmm6, (SIZEOF_XMMWORD-1)
+ pand xmm6, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ jmp short .upsample
+ alignx 16, 7
+
+.columnloop:
+ movdqa xmm6, XMMWORD [esi+1*SIZEOF_XMMWORD]
+ pslldq xmm6, (SIZEOF_XMMWORD-1)
+
+.upsample:
+ movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
+ movdqa xmm2, xmm1
+ movdqa xmm3, xmm1 ; xmm1=( 0 1 2 ... 13 14 15)
+ pslldq xmm2, 1 ; xmm2=(-- 0 1 ... 12 13 14)
+ psrldq xmm3, 1 ; xmm3=( 1 2 3 ... 14 15 --)
+
+ por xmm2, xmm7 ; xmm2=(-1 0 1 ... 12 13 14)
+ por xmm3, xmm6 ; xmm3=( 1 2 3 ... 14 15 16)
+
+ movdqa xmm7, xmm1
+ psrldq xmm7, (SIZEOF_XMMWORD-1) ; xmm7=(15 -- -- ... -- -- --)
+
+ movdqa xmm4, xmm1
+ punpcklbw xmm1, xmm0 ; xmm1=( 0 1 2 3 4 5 6 7)
+ punpckhbw xmm4, xmm0 ; xmm4=( 8 9 10 11 12 13 14 15)
+ movdqa xmm5, xmm2
+ punpcklbw xmm2, xmm0 ; xmm2=(-1 0 1 2 3 4 5 6)
+ punpckhbw xmm5, xmm0 ; xmm5=( 7 8 9 10 11 12 13 14)
+ movdqa xmm6, xmm3
+ punpcklbw xmm3, xmm0 ; xmm3=( 1 2 3 4 5 6 7 8)
+ punpckhbw xmm6, xmm0 ; xmm6=( 9 10 11 12 13 14 15 16)
+
+ pmullw xmm1, [GOTOFF(ebx,PW_THREE)]
+ pmullw xmm4, [GOTOFF(ebx,PW_THREE)]
+ paddw xmm2, [GOTOFF(ebx,PW_ONE)]
+ paddw xmm5, [GOTOFF(ebx,PW_ONE)]
+ paddw xmm3, [GOTOFF(ebx,PW_TWO)]
+ paddw xmm6, [GOTOFF(ebx,PW_TWO)]
+
+ paddw xmm2, xmm1
+ paddw xmm5, xmm4
+ psrlw xmm2, 2 ; xmm2=OutLE=( 0 2 4 6 8 10 12 14)
+ psrlw xmm5, 2 ; xmm5=OutHE=(16 18 20 22 24 26 28 30)
+ paddw xmm3, xmm1
+ paddw xmm6, xmm4
+ psrlw xmm3, 2 ; xmm3=OutLO=( 1 3 5 7 9 11 13 15)
+ psrlw xmm6, 2 ; xmm6=OutHO=(17 19 21 23 25 27 29 31)
+
+ psllw xmm3, BYTE_BIT
+ psllw xmm6, BYTE_BIT
+ por xmm2, xmm3 ; xmm2=OutL=( 0 1 2 ... 13 14 15)
+ por xmm5, xmm6 ; xmm5=OutH=(16 17 18 ... 29 30 31)
+
+ movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2
+ movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm5
+
+ sub eax, byte SIZEOF_XMMWORD
+ add esi, byte 1*SIZEOF_XMMWORD ; inptr
+ add edi, byte 2*SIZEOF_XMMWORD ; outptr
+ cmp eax, byte SIZEOF_XMMWORD
+ ja near .columnloop
+ test eax, eax
+ jnz near .columnloop_last
+
+ pop esi
+ pop edi
+ pop eax
+
+ add esi, byte SIZEOF_JSAMPROW ; input_data
+ add edi, byte SIZEOF_JSAMPROW ; output_data
+ dec ecx ; rowctr
+ jg near .rowloop
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ poppic ebx
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
+; Again a triangle filter; see comments for h2v1 case, above.
+;
+; GLOBAL(void)
+; jsimd_h2v2_fancy_upsample_sse2(int max_v_samp_factor,
+; JDIMENSION downsampled_width,
+; JSAMPARRAY input_data,
+; JSAMPARRAY *output_data_ptr);
+;
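+; The arithmetic is the same as in the MMX h2v2 routine earlier in this
+; patch (vertical 3:1 blend, then horizontal 3:1 blend with rounding
+; biases 8 and 7); here each XMM register carries 16 samples instead of 8.
+;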
+
+%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor
+%define downsamp_width(b) (b) + 12 ; JDIMENSION downsampled_width
+%define input_data(b) (b) + 16 ; JSAMPARRAY input_data
+%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr
+
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+ ; xmmword wk[WK_NUM]
+%define WK_NUM 4
+%define gotptr wk(0) - SIZEOF_POINTER ; void *gotptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_sse2)
+
+EXTN(jsimd_h2v2_fancy_upsample_sse2):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)]
+    pushpic     eax                     ; make room for the GOT address
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+ movpic POINTER [gotptr], ebx ; save GOT address
+
+ mov edx, eax ; edx = original ebp
+ mov eax, JDIMENSION [downsamp_width(edx)] ; colctr
+ test eax, eax
+ jz near .return
+
+ mov ecx, INT [max_v_samp(edx)] ; rowctr
+ test ecx, ecx
+ jz near .return
+
+ mov esi, JSAMPARRAY [input_data(edx)] ; input_data
+ mov edi, POINTER [output_data_ptr(edx)]
+ mov edi, JSAMPARRAY [edi] ; output_data
+ alignx 16, 7
+.rowloop:
+ push eax ; colctr
+ push ecx
+ push edi
+ push esi
+
+ mov ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW] ; inptr1(above)
+ mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0
+ mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1(below)
+ mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0
+ mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1
+
+ test eax, SIZEOF_XMMWORD-1
+ jz short .skip
+ push edx
+ mov dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
+ mov JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
+ mov dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
+ mov JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
+ mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
+ mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
+ pop edx
+.skip:
+ ; -- process the first column block
+
+ movdqa xmm0, XMMWORD [ebx+0*SIZEOF_XMMWORD] ; xmm0=row[ 0][0]
+ movdqa xmm1, XMMWORD [ecx+0*SIZEOF_XMMWORD] ; xmm1=row[-1][0]
+ movdqa xmm2, XMMWORD [esi+0*SIZEOF_XMMWORD] ; xmm2=row[+1][0]
+
+ pushpic ebx
+ movpic ebx, POINTER [gotptr] ; load GOT address
+
+ pxor xmm3, xmm3 ; xmm3=(all 0's)
+ movdqa xmm4, xmm0
+ punpcklbw xmm0, xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7)
+ punpckhbw xmm4, xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15)
+ movdqa xmm5, xmm1
+ punpcklbw xmm1, xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7)
+ punpckhbw xmm5, xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15)
+ movdqa xmm6, xmm2
+ punpcklbw xmm2, xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7)
+ punpckhbw xmm6, xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15)
+
+ pmullw xmm0, [GOTOFF(ebx,PW_THREE)]
+ pmullw xmm4, [GOTOFF(ebx,PW_THREE)]
+
+ pcmpeqb xmm7, xmm7
+ psrldq xmm7, (SIZEOF_XMMWORD-2)
+
+ paddw xmm1, xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7)
+ paddw xmm5, xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15)
+ paddw xmm2, xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7)
+ paddw xmm6, xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15)
+
+ movdqa XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1 ; temporarily save
+ movdqa XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5 ; the intermediate data
+ movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2
+ movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm6
+
+ pand xmm1, xmm7 ; xmm1=( 0 -- -- -- -- -- -- --)
+ pand xmm2, xmm7 ; xmm2=( 0 -- -- -- -- -- -- --)
+
+ movdqa XMMWORD [wk(0)], xmm1
+ movdqa XMMWORD [wk(1)], xmm2
+
+ poppic ebx
+
+ add eax, byte SIZEOF_XMMWORD-1
+ and eax, byte -SIZEOF_XMMWORD
+ cmp eax, byte SIZEOF_XMMWORD
+ ja short .columnloop
+ alignx 16, 7
+
+.columnloop_last:
+ ; -- process the last column block
+
+ pushpic ebx
+ movpic ebx, POINTER [gotptr] ; load GOT address
+
+ pcmpeqb xmm1, xmm1
+ pslldq xmm1, (SIZEOF_XMMWORD-2)
+ movdqa xmm2, xmm1
+
+ pand xmm1, XMMWORD [edx+1*SIZEOF_XMMWORD]
+ pand xmm2, XMMWORD [edi+1*SIZEOF_XMMWORD]
+
+ movdqa XMMWORD [wk(2)], xmm1 ; xmm1=(-- -- -- -- -- -- -- 15)
+ movdqa XMMWORD [wk(3)], xmm2 ; xmm2=(-- -- -- -- -- -- -- 15)
+
+ jmp near .upsample
+ alignx 16, 7
+
+.columnloop:
+ ; -- process the next column block
+
+ movdqa xmm0, XMMWORD [ebx+1*SIZEOF_XMMWORD] ; xmm0=row[ 0][1]
+ movdqa xmm1, XMMWORD [ecx+1*SIZEOF_XMMWORD] ; xmm1=row[-1][1]
+ movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD] ; xmm2=row[+1][1]
+
+ pushpic ebx
+ movpic ebx, POINTER [gotptr] ; load GOT address
+
+ pxor xmm3, xmm3 ; xmm3=(all 0's)
+ movdqa xmm4, xmm0
+ punpcklbw xmm0, xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7)
+ punpckhbw xmm4, xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15)
+ movdqa xmm5, xmm1
+ punpcklbw xmm1, xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7)
+ punpckhbw xmm5, xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15)
+ movdqa xmm6, xmm2
+ punpcklbw xmm2, xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7)
+ punpckhbw xmm6, xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15)
+
+ pmullw xmm0, [GOTOFF(ebx,PW_THREE)]
+ pmullw xmm4, [GOTOFF(ebx,PW_THREE)]
+
+ paddw xmm1, xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7)
+ paddw xmm5, xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15)
+ paddw xmm2, xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7)
+ paddw xmm6, xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15)
+
+ movdqa XMMWORD [edx+2*SIZEOF_XMMWORD], xmm1 ; temporarily save
+ movdqa XMMWORD [edx+3*SIZEOF_XMMWORD], xmm5 ; the intermediate data
+ movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
+ movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm6
+
+ pslldq xmm1, (SIZEOF_XMMWORD-2) ; xmm1=(-- -- -- -- -- -- -- 0)
+ pslldq xmm2, (SIZEOF_XMMWORD-2) ; xmm2=(-- -- -- -- -- -- -- 0)
+
+ movdqa XMMWORD [wk(2)], xmm1
+ movdqa XMMWORD [wk(3)], xmm2
+
+.upsample:
+ ; -- process the upper row
+
+ movdqa xmm7, XMMWORD [edx+0*SIZEOF_XMMWORD]
+ movdqa xmm3, XMMWORD [edx+1*SIZEOF_XMMWORD]
+
+ movdqa xmm0, xmm7 ; xmm7=Int0L=( 0 1 2 3 4 5 6 7)
+ movdqa xmm4, xmm3 ; xmm3=Int0H=( 8 9 10 11 12 13 14 15)
+ psrldq xmm0, 2 ; xmm0=( 1 2 3 4 5 6 7 --)
+ pslldq xmm4, (SIZEOF_XMMWORD-2) ; xmm4=(-- -- -- -- -- -- -- 8)
+ movdqa xmm5, xmm7
+ movdqa xmm6, xmm3
+ psrldq xmm5, (SIZEOF_XMMWORD-2) ; xmm5=( 7 -- -- -- -- -- -- --)
+ pslldq xmm6, 2 ; xmm6=(-- 8 9 10 11 12 13 14)
+
+ por xmm0, xmm4 ; xmm0=( 1 2 3 4 5 6 7 8)
+ por xmm5, xmm6 ; xmm5=( 7 8 9 10 11 12 13 14)
+
+ movdqa xmm1, xmm7
+ movdqa xmm2, xmm3
+ pslldq xmm1, 2 ; xmm1=(-- 0 1 2 3 4 5 6)
+ psrldq xmm2, 2 ; xmm2=( 9 10 11 12 13 14 15 --)
+ movdqa xmm4, xmm3
+ psrldq xmm4, (SIZEOF_XMMWORD-2) ; xmm4=(15 -- -- -- -- -- -- --)
+
+ por xmm1, XMMWORD [wk(0)] ; xmm1=(-1 0 1 2 3 4 5 6)
+ por xmm2, XMMWORD [wk(2)] ; xmm2=( 9 10 11 12 13 14 15 16)
+
+ movdqa XMMWORD [wk(0)], xmm4
+
+ pmullw xmm7, [GOTOFF(ebx,PW_THREE)]
+ pmullw xmm3, [GOTOFF(ebx,PW_THREE)]
+ paddw xmm1, [GOTOFF(ebx,PW_EIGHT)]
+ paddw xmm5, [GOTOFF(ebx,PW_EIGHT)]
+ paddw xmm0, [GOTOFF(ebx,PW_SEVEN)]
+ paddw xmm2, [GOTOFF(ebx,PW_SEVEN)]
+
+ paddw xmm1, xmm7
+ paddw xmm5, xmm3
+ psrlw xmm1, 4 ; xmm1=Out0LE=( 0 2 4 6 8 10 12 14)
+ psrlw xmm5, 4 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
+ paddw xmm0, xmm7
+ paddw xmm2, xmm3
+ psrlw xmm0, 4 ; xmm0=Out0LO=( 1 3 5 7 9 11 13 15)
+ psrlw xmm2, 4 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31)
+
+ psllw xmm0, BYTE_BIT
+ psllw xmm2, BYTE_BIT
+ por xmm1, xmm0 ; xmm1=Out0L=( 0 1 2 ... 13 14 15)
+ por xmm5, xmm2 ; xmm5=Out0H=(16 17 18 ... 29 30 31)
+
+ movdqa XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1
+ movdqa XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5
+
+ ; -- process the lower row
+
+ movdqa xmm6, XMMWORD [edi+0*SIZEOF_XMMWORD]
+ movdqa xmm4, XMMWORD [edi+1*SIZEOF_XMMWORD]
+
+ movdqa xmm7, xmm6 ; xmm6=Int1L=( 0 1 2 3 4 5 6 7)
+ movdqa xmm3, xmm4 ; xmm4=Int1H=( 8 9 10 11 12 13 14 15)
+ psrldq xmm7, 2 ; xmm7=( 1 2 3 4 5 6 7 --)
+ pslldq xmm3, (SIZEOF_XMMWORD-2) ; xmm3=(-- -- -- -- -- -- -- 8)
+ movdqa xmm0, xmm6
+ movdqa xmm2, xmm4
+ psrldq xmm0, (SIZEOF_XMMWORD-2) ; xmm0=( 7 -- -- -- -- -- -- --)
+ pslldq xmm2, 2 ; xmm2=(-- 8 9 10 11 12 13 14)
+
+ por xmm7, xmm3 ; xmm7=( 1 2 3 4 5 6 7 8)
+ por xmm0, xmm2 ; xmm0=( 7 8 9 10 11 12 13 14)
+
+ movdqa xmm1, xmm6
+ movdqa xmm5, xmm4
+ pslldq xmm1, 2 ; xmm1=(-- 0 1 2 3 4 5 6)
+ psrldq xmm5, 2 ; xmm5=( 9 10 11 12 13 14 15 --)
+ movdqa xmm3, xmm4
+ psrldq xmm3, (SIZEOF_XMMWORD-2) ; xmm3=(15 -- -- -- -- -- -- --)
+
+ por xmm1, XMMWORD [wk(1)] ; xmm1=(-1 0 1 2 3 4 5 6)
+ por xmm5, XMMWORD [wk(3)] ; xmm5=( 9 10 11 12 13 14 15 16)
+
+ movdqa XMMWORD [wk(1)], xmm3
+
+ pmullw xmm6, [GOTOFF(ebx,PW_THREE)]
+ pmullw xmm4, [GOTOFF(ebx,PW_THREE)]
+ paddw xmm1, [GOTOFF(ebx,PW_EIGHT)]
+ paddw xmm0, [GOTOFF(ebx,PW_EIGHT)]
+ paddw xmm7, [GOTOFF(ebx,PW_SEVEN)]
+ paddw xmm5, [GOTOFF(ebx,PW_SEVEN)]
+
+ paddw xmm1, xmm6
+ paddw xmm0, xmm4
+ psrlw xmm1, 4 ; xmm1=Out1LE=( 0 2 4 6 8 10 12 14)
+ psrlw xmm0, 4 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
+ paddw xmm7, xmm6
+ paddw xmm5, xmm4
+ psrlw xmm7, 4 ; xmm7=Out1LO=( 1 3 5 7 9 11 13 15)
+ psrlw xmm5, 4 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31)
+
+ psllw xmm7, BYTE_BIT
+ psllw xmm5, BYTE_BIT
+ por xmm1, xmm7 ; xmm1=Out1L=( 0 1 2 ... 13 14 15)
+ por xmm0, xmm5 ; xmm0=Out1H=(16 17 18 ... 29 30 31)
+
+ movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm1
+ movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm0
+
+ poppic ebx
+
+ sub eax, byte SIZEOF_XMMWORD
+ add ecx, byte 1*SIZEOF_XMMWORD ; inptr1(above)
+ add ebx, byte 1*SIZEOF_XMMWORD ; inptr0
+ add esi, byte 1*SIZEOF_XMMWORD ; inptr1(below)
+ add edx, byte 2*SIZEOF_XMMWORD ; outptr0
+ add edi, byte 2*SIZEOF_XMMWORD ; outptr1
+ cmp eax, byte SIZEOF_XMMWORD
+ ja near .columnloop
+ test eax, eax
+ jnz near .columnloop_last
+
+ pop esi
+ pop edi
+ pop ecx
+ pop eax
+
+ add esi, byte 1*SIZEOF_JSAMPROW ; input_data
+ add edi, byte 2*SIZEOF_JSAMPROW ; output_data
+ sub ecx, byte 2 ; rowctr
+ jg near .rowloop
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v1_upsample_sse2(int max_v_samp_factor, JDIMENSION output_width,
+; JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+;
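+; As in the MMX version, this is plain sample doubling
+; (out[2*i] = out[2*i + 1] = in[i]), with punpcklbw/punpckhbw of a
+; register with itself duplicating 16 samples per XMM register.
+;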
+
+%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor
+%define output_width(b) (b) + 12 ; JDIMENSION output_width
+%define input_data(b) (b) + 16 ; JSAMPARRAY input_data
+%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v1_upsample_sse2)
+
+EXTN(jsimd_h2v1_upsample_sse2):
+ push ebp
+ mov ebp, esp
+; push ebx ; unused
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov edx, JDIMENSION [output_width(ebp)]
+ add edx, byte (2*SIZEOF_XMMWORD)-1
+ and edx, byte -(2*SIZEOF_XMMWORD)
+ jz short .return
+
+ mov ecx, INT [max_v_samp(ebp)] ; rowctr
+ test ecx, ecx
+ jz short .return
+
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ mov edi, POINTER [output_data_ptr(ebp)]
+ mov edi, JSAMPARRAY [edi] ; output_data
+ alignx 16, 7
+.rowloop:
+ push edi
+ push esi
+
+ mov esi, JSAMPROW [esi] ; inptr
+ mov edi, JSAMPROW [edi] ; outptr
+ mov eax, edx ; colctr
+ alignx 16, 7
+.columnloop:
+
+ movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
+
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm0
+ punpckhbw xmm1, xmm1
+
+ movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
+ movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
+
+ sub eax, byte 2*SIZEOF_XMMWORD
+ jz short .nextrow
+
+ movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]
+
+ movdqa xmm3, xmm2
+ punpcklbw xmm2, xmm2
+ punpckhbw xmm3, xmm3
+
+ movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
+ movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3
+
+ sub eax, byte 2*SIZEOF_XMMWORD
+ jz short .nextrow
+
+ add esi, byte 2*SIZEOF_XMMWORD ; inptr
+ add edi, byte 4*SIZEOF_XMMWORD ; outptr
+ jmp short .columnloop
+ alignx 16, 7
+
+.nextrow:
+ pop esi
+ pop edi
+
+ add esi, byte SIZEOF_JSAMPROW ; input_data
+ add edi, byte SIZEOF_JSAMPROW ; output_data
+ dec ecx ; rowctr
+ jg short .rowloop
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+; pop ebx ; unused
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v2_upsample_sse2(int max_v_samp_factor, JDIMENSION output_width,
+; JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+;
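+; Sample doubling as above, with each doubled row written to two output
+; rows (outptr0 and outptr1).
+;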
+
+%define max_v_samp(b) (b) + 8 ; int max_v_samp_factor
+%define output_width(b) (b) + 12 ; JDIMENSION output_width
+%define input_data(b) (b) + 16 ; JSAMPARRAY input_data
+%define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v2_upsample_sse2)
+
+EXTN(jsimd_h2v2_upsample_sse2):
+ push ebp
+ mov ebp, esp
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov edx, JDIMENSION [output_width(ebp)]
+ add edx, byte (2*SIZEOF_XMMWORD)-1
+ and edx, byte -(2*SIZEOF_XMMWORD)
+ jz near .return
+
+ mov ecx, INT [max_v_samp(ebp)] ; rowctr
+ test ecx, ecx
+ jz near .return
+
+ mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
+ mov edi, POINTER [output_data_ptr(ebp)]
+ mov edi, JSAMPARRAY [edi] ; output_data
+ alignx 16, 7
+.rowloop:
+ push edi
+ push esi
+
+ mov esi, JSAMPROW [esi] ; inptr
+ mov ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0
+ mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1
+ mov eax, edx ; colctr
+ alignx 16, 7
+.columnloop:
+
+ movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
+
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm0
+ punpckhbw xmm1, xmm1
+
+ movdqa XMMWORD [ebx+0*SIZEOF_XMMWORD], xmm0
+ movdqa XMMWORD [ebx+1*SIZEOF_XMMWORD], xmm1
+ movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
+ movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
+
+ sub eax, byte 2*SIZEOF_XMMWORD
+ jz short .nextrow
+
+ movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]
+
+ movdqa xmm3, xmm2
+ punpcklbw xmm2, xmm2
+ punpckhbw xmm3, xmm3
+
+ movdqa XMMWORD [ebx+2*SIZEOF_XMMWORD], xmm2
+ movdqa XMMWORD [ebx+3*SIZEOF_XMMWORD], xmm3
+ movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
+ movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3
+
+ sub eax, byte 2*SIZEOF_XMMWORD
+ jz short .nextrow
+
+ add esi, byte 2*SIZEOF_XMMWORD ; inptr
+ add ebx, byte 4*SIZEOF_XMMWORD ; outptr0
+ add edi, byte 4*SIZEOF_XMMWORD ; outptr1
+ jmp short .columnloop
+ alignx 16, 7
+
+.nextrow:
+ pop esi
+ pop edi
+
+ add esi, byte 1*SIZEOF_JSAMPROW ; input_data
+ add edi, byte 2*SIZEOF_JSAMPROW ; output_data
+ sub ecx, byte 2 ; rowctr
+ jg short .rowloop
+
+.return:
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jfdctflt-3dn.asm b/media/libjpeg/simd/i386/jfdctflt-3dn.asm
new file mode 100644
index 0000000000..322ab16325
--- /dev/null
+++ b/media/libjpeg/simd/i386/jfdctflt-3dn.asm
@@ -0,0 +1,318 @@
+;
+; jfdctflt.asm - floating-point FDCT (3DNow!)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the forward DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jfdctflt.c; see that file for more details.
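+;
+; For reference, one 1-D pass of this transform in scalar C (a sketch
+; following the structure of the IJG's jfdctflt.c; the function name is
+; illustrative).  The code below runs two such passes at a time, one in
+; each float lane of the 64-bit 3DNow! registers:
+;
+;   void fdct_float_1d(FAST_FLOAT *d)
+;   {
+;     FAST_FLOAT tmp0 = d[0] + d[7], tmp7 = d[0] - d[7];
+;     FAST_FLOAT tmp1 = d[1] + d[6], tmp6 = d[1] - d[6];
+;     FAST_FLOAT tmp2 = d[2] + d[5], tmp5 = d[2] - d[5];
+;     FAST_FLOAT tmp3 = d[3] + d[4], tmp4 = d[3] - d[4];
+;
+;     /* Even part */
+;     FAST_FLOAT tmp10 = tmp0 + tmp3, tmp13 = tmp0 - tmp3;
+;     FAST_FLOAT tmp11 = tmp1 + tmp2, tmp12 = tmp1 - tmp2;
+;     d[0] = tmp10 + tmp11;
+;     d[4] = tmp10 - tmp11;
+;     FAST_FLOAT z1 = (tmp12 + tmp13) * 0.707106781F;  /* PD_0_707 */
+;     d[2] = tmp13 + z1;
+;     d[6] = tmp13 - z1;
+;
+;     /* Odd part */
+;     tmp10 = tmp4 + tmp5;  tmp11 = tmp5 + tmp6;  tmp12 = tmp6 + tmp7;
+;     FAST_FLOAT z5 = (tmp10 - tmp12) * 0.382683433F;  /* PD_0_382 */
+;     FAST_FLOAT z2 = 0.541196100F * tmp10 + z5;       /* PD_0_541 */
+;     FAST_FLOAT z4 = 1.306562965F * tmp12 + z5;       /* PD_1_306 */
+;     FAST_FLOAT z3 = tmp11 * 0.707106781F;
+;     FAST_FLOAT z11 = tmp7 + z3, z13 = tmp7 - z3;
+;     d[5] = z13 + z2;  d[3] = z13 - z2;
+;     d[1] = z11 + z4;  d[7] = z11 - z4;
+;   }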
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_fdct_float_3dnow)
+
+EXTN(jconst_fdct_float_3dnow):
+
+PD_0_382 times 2 dd 0.382683432365089771728460
+PD_0_707 times 2 dd 0.707106781186547524400844
+PD_0_541 times 2 dd 0.541196100146196984399723
+PD_1_306 times 2 dd 1.306562964876376527856643
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_float_3dnow(FAST_FLOAT *data)
+;
+
+%define data(b) (b) + 8 ; FAST_FLOAT *data
+
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD ; mmword wk[WK_NUM]
+%define WK_NUM 2
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_fdct_float_3dnow)
+
+EXTN(jsimd_fdct_float_3dnow):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)]
+ pushpic ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+; push esi ; unused
+; push edi ; unused
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process rows.
+
+ mov edx, POINTER [data(eax)] ; (FAST_FLOAT *)
+ mov ecx, DCTSIZE/2
+ alignx 16, 7
+.rowloop:
+
+ movq mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+ movq mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+ movq mm2, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)]
+ movq mm3, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)]
+
+ ; mm0=(00 01), mm1=(10 11), mm2=(06 07), mm3=(16 17)
+
+ movq mm4, mm0 ; transpose coefficients
+ punpckldq mm0, mm1 ; mm0=(00 10)=data0
+ punpckhdq mm4, mm1 ; mm4=(01 11)=data1
+ movq mm5, mm2 ; transpose coefficients
+ punpckldq mm2, mm3 ; mm2=(06 16)=data6
+ punpckhdq mm5, mm3 ; mm5=(07 17)=data7
+
+ movq mm6, mm4
+ movq mm7, mm0
+ pfsub mm4, mm2 ; mm4=data1-data6=tmp6
+ pfsub mm0, mm5 ; mm0=data0-data7=tmp7
+ pfadd mm6, mm2 ; mm6=data1+data6=tmp1
+ pfadd mm7, mm5 ; mm7=data0+data7=tmp0
+
+ movq mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+ movq mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+ movq mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)]
+ movq mm5, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)]
+
+ ; mm1=(02 03), mm3=(12 13), mm2=(04 05), mm5=(14 15)
+
+ movq MMWORD [wk(0)], mm4 ; wk(0)=tmp6
+ movq MMWORD [wk(1)], mm0 ; wk(1)=tmp7
+
+ movq mm4, mm1 ; transpose coefficients
+ punpckldq mm1, mm3 ; mm1=(02 12)=data2
+ punpckhdq mm4, mm3 ; mm4=(03 13)=data3
+ movq mm0, mm2 ; transpose coefficients
+ punpckldq mm2, mm5 ; mm2=(04 14)=data4
+ punpckhdq mm0, mm5 ; mm0=(05 15)=data5
+
+ movq mm3, mm4
+ movq mm5, mm1
+ pfadd mm4, mm2 ; mm4=data3+data4=tmp3
+ pfadd mm1, mm0 ; mm1=data2+data5=tmp2
+ pfsub mm3, mm2 ; mm3=data3-data4=tmp4
+ pfsub mm5, mm0 ; mm5=data2-data5=tmp5
+
+ ; -- Even part
+
+ movq mm2, mm7
+ movq mm0, mm6
+ pfsub mm7, mm4 ; mm7=tmp13
+ pfsub mm6, mm1 ; mm6=tmp12
+ pfadd mm2, mm4 ; mm2=tmp10
+ pfadd mm0, mm1 ; mm0=tmp11
+
+ pfadd mm6, mm7
+ pfmul mm6, [GOTOFF(ebx,PD_0_707)] ; mm6=z1
+
+ movq mm4, mm2
+ movq mm1, mm7
+ pfsub mm2, mm0 ; mm2=data4
+ pfsub mm7, mm6 ; mm7=data6
+ pfadd mm4, mm0 ; mm4=data0
+ pfadd mm1, mm6 ; mm1=data2
+
+ movq MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)], mm2
+ movq MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)], mm7
+ movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4
+ movq MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], mm1
+
+ ; -- Odd part
+
+ movq mm0, MMWORD [wk(0)] ; mm0=tmp6
+ movq mm6, MMWORD [wk(1)] ; mm6=tmp7
+
+ pfadd mm3, mm5 ; mm3=tmp10
+ pfadd mm5, mm0 ; mm5=tmp11
+ pfadd mm0, mm6 ; mm0=tmp12, mm6=tmp7
+
+ pfmul mm5, [GOTOFF(ebx,PD_0_707)] ; mm5=z3
+
+ movq mm2, mm3 ; mm2=tmp10
+ pfsub mm3, mm0
+ pfmul mm3, [GOTOFF(ebx,PD_0_382)] ; mm3=z5
+ pfmul mm2, [GOTOFF(ebx,PD_0_541)] ; mm2=MULTIPLY(tmp10,FIX_0_54119610)
+ pfmul mm0, [GOTOFF(ebx,PD_1_306)] ; mm0=MULTIPLY(tmp12,FIX_1_30656296)
+ pfadd mm2, mm3 ; mm2=z2
+ pfadd mm0, mm3 ; mm0=z4
+
+ movq mm7, mm6
+ pfsub mm6, mm5 ; mm6=z13
+ pfadd mm7, mm5 ; mm7=z11
+
+ movq mm4, mm6
+ movq mm1, mm7
+ pfsub mm6, mm2 ; mm6=data3
+ pfsub mm7, mm0 ; mm7=data7
+ pfadd mm4, mm2 ; mm4=data5
+ pfadd mm1, mm0 ; mm1=data1
+
+ movq MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], mm6
+ movq MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)], mm7
+ movq MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)], mm4
+ movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1
+
+ add edx, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
+ dec ecx
+ jnz near .rowloop
+
+ ; ---- Pass 2: process columns.
+
+ mov edx, POINTER [data(eax)] ; (FAST_FLOAT *)
+ mov ecx, DCTSIZE/2
+ alignx 16, 7
+.columnloop:
+
+ movq mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+ movq mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+ movq mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)]
+ movq mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)]
+
+ ; mm0=(00 10), mm1=(01 11), mm2=(60 70), mm3=(61 71)
+
+ movq mm4, mm0 ; transpose coefficients
+ punpckldq mm0, mm1 ; mm0=(00 01)=data0
+ punpckhdq mm4, mm1 ; mm4=(10 11)=data1
+ movq mm5, mm2 ; transpose coefficients
+ punpckldq mm2, mm3 ; mm2=(60 61)=data6
+ punpckhdq mm5, mm3 ; mm5=(70 71)=data7
+
+ movq mm6, mm4
+ movq mm7, mm0
+ pfsub mm4, mm2 ; mm4=data1-data6=tmp6
+ pfsub mm0, mm5 ; mm0=data0-data7=tmp7
+ pfadd mm6, mm2 ; mm6=data1+data6=tmp1
+ pfadd mm7, mm5 ; mm7=data0+data7=tmp0
+
+ movq mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
+ movq mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
+ movq mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)]
+ movq mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)]
+
+ ; mm1=(20 30), mm3=(21 31), mm2=(40 50), mm5=(41 51)
+
+ movq MMWORD [wk(0)], mm4 ; wk(0)=tmp6
+ movq MMWORD [wk(1)], mm0 ; wk(1)=tmp7
+
+ movq mm4, mm1 ; transpose coefficients
+ punpckldq mm1, mm3 ; mm1=(20 21)=data2
+ punpckhdq mm4, mm3 ; mm4=(30 31)=data3
+ movq mm0, mm2 ; transpose coefficients
+ punpckldq mm2, mm5 ; mm2=(40 41)=data4
+ punpckhdq mm0, mm5 ; mm0=(50 51)=data5
+
+ movq mm3, mm4
+ movq mm5, mm1
+ pfadd mm4, mm2 ; mm4=data3+data4=tmp3
+ pfadd mm1, mm0 ; mm1=data2+data5=tmp2
+ pfsub mm3, mm2 ; mm3=data3-data4=tmp4
+ pfsub mm5, mm0 ; mm5=data2-data5=tmp5
+
+ ; -- Even part
+
+ movq mm2, mm7
+ movq mm0, mm6
+ pfsub mm7, mm4 ; mm7=tmp13
+ pfsub mm6, mm1 ; mm6=tmp12
+ pfadd mm2, mm4 ; mm2=tmp10
+ pfadd mm0, mm1 ; mm0=tmp11
+
+ pfadd mm6, mm7
+ pfmul mm6, [GOTOFF(ebx,PD_0_707)] ; mm6=z1
+
+ movq mm4, mm2
+ movq mm1, mm7
+ pfsub mm2, mm0 ; mm2=data4
+ pfsub mm7, mm6 ; mm7=data6
+ pfadd mm4, mm0 ; mm4=data0
+ pfadd mm1, mm6 ; mm1=data2
+
+ movq MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], mm2
+ movq MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], mm7
+ movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4
+ movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], mm1
+
+ ; -- Odd part
+
+ movq mm0, MMWORD [wk(0)] ; mm0=tmp6
+ movq mm6, MMWORD [wk(1)] ; mm6=tmp7
+
+ pfadd mm3, mm5 ; mm3=tmp10
+ pfadd mm5, mm0 ; mm5=tmp11
+ pfadd mm0, mm6 ; mm0=tmp12, mm6=tmp7
+
+ pfmul mm5, [GOTOFF(ebx,PD_0_707)] ; mm5=z3
+
+ movq mm2, mm3 ; mm2=tmp10
+ pfsub mm3, mm0
+ pfmul mm3, [GOTOFF(ebx,PD_0_382)] ; mm3=z5
+ pfmul mm2, [GOTOFF(ebx,PD_0_541)] ; mm2=MULTIPLY(tmp10,FIX_0_54119610)
+ pfmul mm0, [GOTOFF(ebx,PD_1_306)] ; mm0=MULTIPLY(tmp12,FIX_1_30656296)
+ pfadd mm2, mm3 ; mm2=z2
+ pfadd mm0, mm3 ; mm0=z4
+
+ movq mm7, mm6
+ pfsub mm6, mm5 ; mm6=z13
+ pfadd mm7, mm5 ; mm7=z11
+
+ movq mm4, mm6
+ movq mm1, mm7
+ pfsub mm6, mm2 ; mm6=data3
+ pfsub mm7, mm0 ; mm7=data7
+ pfadd mm4, mm2 ; mm4=data5
+ pfadd mm1, mm0 ; mm1=data1
+
+ movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], mm6
+ movq MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], mm7
+ movq MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], mm4
+ movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1
+
+ add edx, byte 2*SIZEOF_FAST_FLOAT
+ dec ecx
+ jnz near .columnloop
+
+ femms ; empty MMX/3DNow! state
+
+; pop edi ; unused
+; pop esi ; unused
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ poppic ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jfdctflt-sse.asm b/media/libjpeg/simd/i386/jfdctflt-sse.asm
new file mode 100644
index 0000000000..86952c6499
--- /dev/null
+++ b/media/libjpeg/simd/i386/jfdctflt-sse.asm
@@ -0,0 +1,369 @@
+;
+; jfdctflt.asm - floating-point FDCT (SSE)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the forward DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jfdctflt.c; see that file for more details.
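+;
+; The algorithm is identical to the 3DNow! version (see the scalar sketch
+; in jfdctflt-3dn.asm above); here each pass handles four rows or columns
+; at a time in the four float lanes of an XMM register.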
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
+ shufps %1, %2, 0x44
+%endmacro
+
+%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
+ shufps %1, %2, 0xEE
+%endmacro
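+
+; (shufps dst, src, imm selects result elements 0-1 from dst and 2-3 from
+; src via two-bit indices: 0x44 = 01_00_01_00b yields (dst0 dst1 src0 src1),
+; and 0xEE = 11_10_11_10b yields (dst2 dst3 src2 src3).)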
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_fdct_float_sse)
+
+EXTN(jconst_fdct_float_sse):
+
+PD_0_382 times 4 dd 0.382683432365089771728460
+PD_0_707 times 4 dd 0.707106781186547524400844
+PD_0_541 times 4 dd 0.541196100146196984399723
+PD_1_306 times 4 dd 1.306562964876376527856643
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_float_sse(FAST_FLOAT *data)
+;
+
+%define data(b) (b) + 8 ; FAST_FLOAT *data
+
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+ ; xmmword wk[WK_NUM]
+%define WK_NUM 2
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_fdct_float_sse)
+
+EXTN(jsimd_fdct_float_sse):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)]
+ pushpic ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+; push esi ; unused
+; push edi ; unused
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process rows.
+
+ mov edx, POINTER [data(eax)] ; (FAST_FLOAT *)
+ mov ecx, DCTSIZE/4
+ alignx 16, 7
+.rowloop:
+
+ movaps xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
+ movaps xmm2, XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)]
+
+ ; xmm0=(20 21 22 23), xmm2=(24 25 26 27)
+ ; xmm1=(30 31 32 33), xmm3=(34 35 36 37)
+
+ movaps xmm4, xmm0 ; transpose coefficients(phase 1)
+ unpcklps xmm0, xmm1 ; xmm0=(20 30 21 31)
+ unpckhps xmm4, xmm1 ; xmm4=(22 32 23 33)
+ movaps xmm5, xmm2 ; transpose coefficients(phase 1)
+ unpcklps xmm2, xmm3 ; xmm2=(24 34 25 35)
+ unpckhps xmm5, xmm3 ; xmm5=(26 36 27 37)
+
+ movaps xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+ movaps xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+
+ ; xmm6=(00 01 02 03), xmm1=(04 05 06 07)
+ ; xmm7=(10 11 12 13), xmm3=(14 15 16 17)
+
+ movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 32 23 33)
+ movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(24 34 25 35)
+
+ movaps xmm4, xmm6 ; transpose coefficients(phase 1)
+ unpcklps xmm6, xmm7 ; xmm6=(00 10 01 11)
+ unpckhps xmm4, xmm7 ; xmm4=(02 12 03 13)
+ movaps xmm2, xmm1 ; transpose coefficients(phase 1)
+ unpcklps xmm1, xmm3 ; xmm1=(04 14 05 15)
+ unpckhps xmm2, xmm3 ; xmm2=(06 16 07 17)
+
+ movaps xmm7, xmm6 ; transpose coefficients(phase 2)
+ unpcklps2 xmm6, xmm0 ; xmm6=(00 10 20 30)=data0
+ unpckhps2 xmm7, xmm0 ; xmm7=(01 11 21 31)=data1
+ movaps xmm3, xmm2 ; transpose coefficients(phase 2)
+ unpcklps2 xmm2, xmm5 ; xmm2=(06 16 26 36)=data6
+ unpckhps2 xmm3, xmm5 ; xmm3=(07 17 27 37)=data7
+
+ movaps xmm0, xmm7
+ movaps xmm5, xmm6
+ subps xmm7, xmm2 ; xmm7=data1-data6=tmp6
+ subps xmm6, xmm3 ; xmm6=data0-data7=tmp7
+ addps xmm0, xmm2 ; xmm0=data1+data6=tmp1
+ addps xmm5, xmm3 ; xmm5=data0+data7=tmp0
+
+ movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 32 23 33)
+ movaps xmm3, XMMWORD [wk(1)] ; xmm3=(24 34 25 35)
+ movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6
+ movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
+
+ movaps xmm7, xmm4 ; transpose coefficients(phase 2)
+ unpcklps2 xmm4, xmm2 ; xmm4=(02 12 22 32)=data2
+ unpckhps2 xmm7, xmm2 ; xmm7=(03 13 23 33)=data3
+ movaps xmm6, xmm1 ; transpose coefficients(phase 2)
+ unpcklps2 xmm1, xmm3 ; xmm1=(04 14 24 34)=data4
+ unpckhps2 xmm6, xmm3 ; xmm6=(05 15 25 35)=data5
+
+ movaps xmm2, xmm7
+ movaps xmm3, xmm4
+ addps xmm7, xmm1 ; xmm7=data3+data4=tmp3
+ addps xmm4, xmm6 ; xmm4=data2+data5=tmp2
+ subps xmm2, xmm1 ; xmm2=data3-data4=tmp4
+ subps xmm3, xmm6 ; xmm3=data2-data5=tmp5
+
+ ; -- Even part
+
+ movaps xmm1, xmm5
+ movaps xmm6, xmm0
+ subps xmm5, xmm7 ; xmm5=tmp13
+ subps xmm0, xmm4 ; xmm0=tmp12
+ addps xmm1, xmm7 ; xmm1=tmp10
+ addps xmm6, xmm4 ; xmm6=tmp11
+
+ addps xmm0, xmm5
+ mulps xmm0, [GOTOFF(ebx,PD_0_707)] ; xmm0=z1
+
+ movaps xmm7, xmm1
+ movaps xmm4, xmm5
+ subps xmm1, xmm6 ; xmm1=data4
+ subps xmm5, xmm0 ; xmm5=data6
+ addps xmm7, xmm6 ; xmm7=data0
+ addps xmm4, xmm0 ; xmm4=data2
+
+ movaps XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], xmm1
+ movaps XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)], xmm5
+ movaps XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7
+ movaps XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4
+
+ ; -- Odd part
+
+ movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6
+ movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7
+
+ addps xmm2, xmm3 ; xmm2=tmp10
+ addps xmm3, xmm6 ; xmm3=tmp11
+ addps xmm6, xmm0 ; xmm6=tmp12, xmm0=tmp7
+
+ mulps xmm3, [GOTOFF(ebx,PD_0_707)] ; xmm3=z3
+
+ movaps xmm1, xmm2 ; xmm1=tmp10
+ subps xmm2, xmm6
+ mulps xmm2, [GOTOFF(ebx,PD_0_382)] ; xmm2=z5
+ mulps xmm1, [GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
+ mulps xmm6, [GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
+ addps xmm1, xmm2 ; xmm1=z2
+ addps xmm6, xmm2 ; xmm6=z4
+
+ movaps xmm5, xmm0
+ subps xmm0, xmm3 ; xmm0=z13
+ addps xmm5, xmm3 ; xmm5=z11
+
+ movaps xmm7, xmm0
+ movaps xmm4, xmm5
+ subps xmm0, xmm1 ; xmm0=data3
+ subps xmm5, xmm6 ; xmm5=data7
+ addps xmm7, xmm1 ; xmm7=data5
+ addps xmm4, xmm6 ; xmm4=data1
+
+ movaps XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0
+ movaps XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)], xmm5
+ movaps XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], xmm7
+ movaps XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4
+
+ add edx, 4*DCTSIZE*SIZEOF_FAST_FLOAT
+ dec ecx
+ jnz near .rowloop
+
+ ; ---- Pass 2: process columns.
+
+ mov edx, POINTER [data(eax)] ; (FAST_FLOAT *)
+ mov ecx, DCTSIZE/4
+ alignx 16, 7
+.columnloop:
+
+ movaps xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
+ movaps xmm2, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)]
+
+ ; xmm0=(02 12 22 32), xmm2=(42 52 62 72)
+ ; xmm1=(03 13 23 33), xmm3=(43 53 63 73)
+
+ movaps xmm4, xmm0 ; transpose coefficients(phase 1)
+ unpcklps xmm0, xmm1 ; xmm0=(02 03 12 13)
+ unpckhps xmm4, xmm1 ; xmm4=(22 23 32 33)
+ movaps xmm5, xmm2 ; transpose coefficients(phase 1)
+ unpcklps xmm2, xmm3 ; xmm2=(42 43 52 53)
+ unpckhps xmm5, xmm3 ; xmm5=(62 63 72 73)
+
+ movaps xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+ movaps xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)]
+
+ ; xmm6=(00 10 20 30), xmm1=(40 50 60 70)
+ ; xmm7=(01 11 21 31), xmm3=(41 51 61 71)
+
+ movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 23 32 33)
+ movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(42 43 52 53)
+
+ movaps xmm4, xmm6 ; transpose coefficients(phase 1)
+ unpcklps xmm6, xmm7 ; xmm6=(00 01 10 11)
+ unpckhps xmm4, xmm7 ; xmm4=(20 21 30 31)
+ movaps xmm2, xmm1 ; transpose coefficients(phase 1)
+ unpcklps xmm1, xmm3 ; xmm1=(40 41 50 51)
+ unpckhps xmm2, xmm3 ; xmm2=(60 61 70 71)
+
+ movaps xmm7, xmm6 ; transpose coefficients(phase 2)
+ unpcklps2 xmm6, xmm0 ; xmm6=(00 01 02 03)=data0
+ unpckhps2 xmm7, xmm0 ; xmm7=(10 11 12 13)=data1
+ movaps xmm3, xmm2 ; transpose coefficients(phase 2)
+ unpcklps2 xmm2, xmm5 ; xmm2=(60 61 62 63)=data6
+ unpckhps2 xmm3, xmm5 ; xmm3=(70 71 72 73)=data7
+
+ movaps xmm0, xmm7
+ movaps xmm5, xmm6
+ subps xmm7, xmm2 ; xmm7=data1-data6=tmp6
+ subps xmm6, xmm3 ; xmm6=data0-data7=tmp7
+ addps xmm0, xmm2 ; xmm0=data1+data6=tmp1
+ addps xmm5, xmm3 ; xmm5=data0+data7=tmp0
+
+ movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 23 32 33)
+ movaps xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53)
+ movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6
+ movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
+
+ movaps xmm7, xmm4 ; transpose coefficients(phase 2)
+ unpcklps2 xmm4, xmm2 ; xmm4=(20 21 22 23)=data2
+ unpckhps2 xmm7, xmm2 ; xmm7=(30 31 32 33)=data3
+ movaps xmm6, xmm1 ; transpose coefficients(phase 2)
+ unpcklps2 xmm1, xmm3 ; xmm1=(40 41 42 43)=data4
+ unpckhps2 xmm6, xmm3 ; xmm6=(50 51 52 53)=data5
+
+ movaps xmm2, xmm7
+ movaps xmm3, xmm4
+ addps xmm7, xmm1 ; xmm7=data3+data4=tmp3
+ addps xmm4, xmm6 ; xmm4=data2+data5=tmp2
+ subps xmm2, xmm1 ; xmm2=data3-data4=tmp4
+ subps xmm3, xmm6 ; xmm3=data2-data5=tmp5
+
+ ; -- Even part
+
+ movaps xmm1, xmm5
+ movaps xmm6, xmm0
+ subps xmm5, xmm7 ; xmm5=tmp13
+ subps xmm0, xmm4 ; xmm0=tmp12
+ addps xmm1, xmm7 ; xmm1=tmp10
+ addps xmm6, xmm4 ; xmm6=tmp11
+
+ addps xmm0, xmm5
+ mulps xmm0, [GOTOFF(ebx,PD_0_707)] ; xmm0=z1
+
+ movaps xmm7, xmm1
+ movaps xmm4, xmm5
+ subps xmm1, xmm6 ; xmm1=data4
+ subps xmm5, xmm0 ; xmm5=data6
+ addps xmm7, xmm6 ; xmm7=data0
+ addps xmm4, xmm0 ; xmm4=data2
+
+ movaps XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], xmm1
+ movaps XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], xmm5
+ movaps XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7
+ movaps XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4
+
+ ; -- Odd part
+
+ movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6
+ movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7
+
+ addps xmm2, xmm3 ; xmm2=tmp10
+ addps xmm3, xmm6 ; xmm3=tmp11
+ addps xmm6, xmm0 ; xmm6=tmp12, xmm0=tmp7
+
+ mulps xmm3, [GOTOFF(ebx,PD_0_707)] ; xmm3=z3
+
+ movaps xmm1, xmm2 ; xmm1=tmp10
+ subps xmm2, xmm6
+ mulps xmm2, [GOTOFF(ebx,PD_0_382)] ; xmm2=z5
+ mulps xmm1, [GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
+ mulps xmm6, [GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
+ addps xmm1, xmm2 ; xmm1=z2
+ addps xmm6, xmm2 ; xmm6=z4
+
+ movaps xmm5, xmm0
+ subps xmm0, xmm3 ; xmm0=z13
+ addps xmm5, xmm3 ; xmm5=z11
+
+ movaps xmm7, xmm0
+ movaps xmm4, xmm5
+ subps xmm0, xmm1 ; xmm0=data3
+ subps xmm5, xmm6 ; xmm5=data7
+ addps xmm7, xmm1 ; xmm7=data5
+ addps xmm4, xmm6 ; xmm4=data1
+
+ movaps XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0
+ movaps XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], xmm5
+ movaps XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], xmm7
+ movaps XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4
+
+ add edx, byte 4*SIZEOF_FAST_FLOAT
+ dec ecx
+ jnz near .columnloop
+
+; pop edi ; unused
+; pop esi ; unused
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ poppic ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jfdctfst-mmx.asm b/media/libjpeg/simd/i386/jfdctfst-mmx.asm
new file mode 100644
index 0000000000..80645a50d7
--- /dev/null
+++ b/media/libjpeg/simd/i386/jfdctfst-mmx.asm
@@ -0,0 +1,395 @@
+;
+; jfdctfst.asm - fast integer FDCT (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the forward DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jfdctfst.c; see that file for
+; more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 8 ; 14 is also OK.
+
+%if CONST_BITS == 8
+F_0_382 equ 98 ; FIX(0.382683433)
+F_0_541 equ 139 ; FIX(0.541196100)
+F_0_707 equ 181 ; FIX(0.707106781)
+F_1_306 equ 334 ; FIX(1.306562965)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
+F_0_382 equ DESCALE( 410903207, 30 - CONST_BITS) ; FIX(0.382683433)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100)
+F_0_707 equ DESCALE( 759250124, 30 - CONST_BITS) ; FIX(0.707106781)
+F_1_306 equ DESCALE(1402911301, 30 - CONST_BITS) ; FIX(1.306562965)
+%endif
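+
+; Worked example (CONST_BITS == 8): 410903207 = round(0.382683433 * 2^30),
+; so F_0_382 = DESCALE(410903207, 22) = (410903207 + (1 << 21)) >> 22 = 98,
+; the same value as FIX(0.382683433) = round(0.382683433 * 2^8) above.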
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
+
+%define PRE_MULTIPLY_SCALE_BITS 2
+%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
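+
+; Worked example: pmulhw keeps the high 16 bits of a 16x16-bit signed
+; product, i.e. (a * b) >> 16.  With a temporary v pre-shifted left by
+; PRE_MULTIPLY_SCALE_BITS and a constant stored as F << CONST_SHIFT,
+; where F = round(f * 2^CONST_BITS), the result is
+;   ((v << 2) * (F << 6)) >> 16 = (v * F) >> 8 ~= v * f
+; e.g. v = 100, f = 0.707106781: F = 181, (400 * 11584) >> 16 = 70.
+; This is why the three shift amounts above must sum to 16.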
+
+ alignz 32
+ GLOBAL_DATA(jconst_fdct_ifast_mmx)
+
+EXTN(jconst_fdct_ifast_mmx):
+
+PW_F0707 times 4 dw F_0_707 << CONST_SHIFT
+PW_F0382 times 4 dw F_0_382 << CONST_SHIFT
+PW_F0541 times 4 dw F_0_541 << CONST_SHIFT
+PW_F1306 times 4 dw F_1_306 << CONST_SHIFT
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_ifast_mmx(DCTELEM *data)
+;
+
+%define data(b) (b) + 8 ; DCTELEM *data
+
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD ; mmword wk[WK_NUM]
+%define WK_NUM 2
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_fdct_ifast_mmx)
+
+EXTN(jsimd_fdct_ifast_mmx):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)]
+ pushpic ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+; push esi ; unused
+; push edi ; unused
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process rows.
+
+ mov edx, POINTER [data(eax)] ; (DCTELEM *)
+ mov ecx, DCTSIZE/4
+ alignx 16, 7
+.rowloop:
+
+ movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+ movq mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+ movq mm2, MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)]
+ movq mm3, MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)]
+
+ ; mm0=(20 21 22 23), mm2=(24 25 26 27)
+ ; mm1=(30 31 32 33), mm3=(34 35 36 37)
+
+ movq mm4, mm0 ; transpose coefficients(phase 1)
+ punpcklwd mm0, mm1 ; mm0=(20 30 21 31)
+ punpckhwd mm4, mm1 ; mm4=(22 32 23 33)
+ movq mm5, mm2 ; transpose coefficients(phase 1)
+ punpcklwd mm2, mm3 ; mm2=(24 34 25 35)
+ punpckhwd mm5, mm3 ; mm5=(26 36 27 37)
+
+ movq mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+ movq mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+ movq mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)]
+ movq mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)]
+
+ ; mm6=(00 01 02 03), mm1=(04 05 06 07)
+ ; mm7=(10 11 12 13), mm3=(14 15 16 17)
+
+ movq MMWORD [wk(0)], mm4 ; wk(0)=(22 32 23 33)
+ movq MMWORD [wk(1)], mm2 ; wk(1)=(24 34 25 35)
+
+ movq mm4, mm6 ; transpose coefficients(phase 1)
+ punpcklwd mm6, mm7 ; mm6=(00 10 01 11)
+ punpckhwd mm4, mm7 ; mm4=(02 12 03 13)
+ movq mm2, mm1 ; transpose coefficients(phase 1)
+ punpcklwd mm1, mm3 ; mm1=(04 14 05 15)
+ punpckhwd mm2, mm3 ; mm2=(06 16 07 17)
+
+ movq mm7, mm6 ; transpose coefficients(phase 2)
+ punpckldq mm6, mm0 ; mm6=(00 10 20 30)=data0
+ punpckhdq mm7, mm0 ; mm7=(01 11 21 31)=data1
+ movq mm3, mm2 ; transpose coefficients(phase 2)
+ punpckldq mm2, mm5 ; mm2=(06 16 26 36)=data6
+ punpckhdq mm3, mm5 ; mm3=(07 17 27 37)=data7
+
+ movq mm0, mm7
+ movq mm5, mm6
+ psubw mm7, mm2 ; mm7=data1-data6=tmp6
+ psubw mm6, mm3 ; mm6=data0-data7=tmp7
+ paddw mm0, mm2 ; mm0=data1+data6=tmp1
+ paddw mm5, mm3 ; mm5=data0+data7=tmp0
+
+ movq mm2, MMWORD [wk(0)] ; mm2=(22 32 23 33)
+ movq mm3, MMWORD [wk(1)] ; mm3=(24 34 25 35)
+ movq MMWORD [wk(0)], mm7 ; wk(0)=tmp6
+ movq MMWORD [wk(1)], mm6 ; wk(1)=tmp7
+
+ movq mm7, mm4 ; transpose coefficients(phase 2)
+ punpckldq mm4, mm2 ; mm4=(02 12 22 32)=data2
+ punpckhdq mm7, mm2 ; mm7=(03 13 23 33)=data3
+ movq mm6, mm1 ; transpose coefficients(phase 2)
+ punpckldq mm1, mm3 ; mm1=(04 14 24 34)=data4
+ punpckhdq mm6, mm3 ; mm6=(05 15 25 35)=data5
+
+ movq mm2, mm7
+ movq mm3, mm4
+ paddw mm7, mm1 ; mm7=data3+data4=tmp3
+ paddw mm4, mm6 ; mm4=data2+data5=tmp2
+ psubw mm2, mm1 ; mm2=data3-data4=tmp4
+ psubw mm3, mm6 ; mm3=data2-data5=tmp5
+
+ ; -- Even part
+
+ movq mm1, mm5
+ movq mm6, mm0
+ psubw mm5, mm7 ; mm5=tmp13
+ psubw mm0, mm4 ; mm0=tmp12
+ paddw mm1, mm7 ; mm1=tmp10
+ paddw mm6, mm4 ; mm6=tmp11
+
+ paddw mm0, mm5
+ psllw mm0, PRE_MULTIPLY_SCALE_BITS
+ pmulhw mm0, [GOTOFF(ebx,PW_F0707)] ; mm0=z1
+
+ movq mm7, mm1
+ movq mm4, mm5
+ psubw mm1, mm6 ; mm1=data4
+ psubw mm5, mm0 ; mm5=data6
+ paddw mm7, mm6 ; mm7=data0
+ paddw mm4, mm0 ; mm4=data2
+
+ movq MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)], mm1
+ movq MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)], mm5
+ movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm7
+ movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
+
+ ; -- Odd part
+
+ movq mm6, MMWORD [wk(0)] ; mm6=tmp6
+ movq mm0, MMWORD [wk(1)] ; mm0=tmp7
+
+ paddw mm2, mm3 ; mm2=tmp10
+ paddw mm3, mm6 ; mm3=tmp11
+ paddw mm6, mm0 ; mm6=tmp12, mm0=tmp7
+
+ psllw mm2, PRE_MULTIPLY_SCALE_BITS
+ psllw mm6, PRE_MULTIPLY_SCALE_BITS
+
+ psllw mm3, PRE_MULTIPLY_SCALE_BITS
+ pmulhw mm3, [GOTOFF(ebx,PW_F0707)] ; mm3=z3
+
+ movq mm1, mm2 ; mm1=tmp10
+ psubw mm2, mm6
+ pmulhw mm2, [GOTOFF(ebx,PW_F0382)] ; mm2=z5
+ pmulhw mm1, [GOTOFF(ebx,PW_F0541)] ; mm1=MULTIPLY(tmp10,FIX_0_54119610)
+ pmulhw mm6, [GOTOFF(ebx,PW_F1306)] ; mm6=MULTIPLY(tmp12,FIX_1_30656296)
+ paddw mm1, mm2 ; mm1=z2
+ paddw mm6, mm2 ; mm6=z4
+
+ movq mm5, mm0
+ psubw mm0, mm3 ; mm0=z13
+ paddw mm5, mm3 ; mm5=z11
+
+ movq mm7, mm0
+ movq mm4, mm5
+ psubw mm0, mm1 ; mm0=data3
+ psubw mm5, mm6 ; mm5=data7
+ paddw mm7, mm1 ; mm7=data5
+ paddw mm4, mm6 ; mm4=data1
+
+ movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm0
+ movq MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)], mm5
+ movq MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)], mm7
+ movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm4
+
+ add edx, byte 4*DCTSIZE*SIZEOF_DCTELEM
+ dec ecx
+ jnz near .rowloop
+
+ ; ---- Pass 2: process columns.
+
+ mov edx, POINTER [data(eax)] ; (DCTELEM *)
+ mov ecx, DCTSIZE/4
+ alignx 16, 7
+.columnloop:
+
+ movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+ movq mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+ movq mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
+ movq mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
+
+ ; mm0=(02 12 22 32), mm2=(42 52 62 72)
+ ; mm1=(03 13 23 33), mm3=(43 53 63 73)
+
+ movq mm4, mm0 ; transpose coefficients(phase 1)
+ punpcklwd mm0, mm1 ; mm0=(02 03 12 13)
+ punpckhwd mm4, mm1 ; mm4=(22 23 32 33)
+ movq mm5, mm2 ; transpose coefficients(phase 1)
+ punpcklwd mm2, mm3 ; mm2=(42 43 52 53)
+ punpckhwd mm5, mm3 ; mm5=(62 63 72 73)
+
+ movq mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+ movq mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+ movq mm1, MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
+ movq mm3, MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
+
+ ; mm6=(00 10 20 30), mm1=(40 50 60 70)
+ ; mm7=(01 11 21 31), mm3=(41 51 61 71)
+
+ movq MMWORD [wk(0)], mm4 ; wk(0)=(22 23 32 33)
+ movq MMWORD [wk(1)], mm2 ; wk(1)=(42 43 52 53)
+
+ movq mm4, mm6 ; transpose coefficients(phase 1)
+ punpcklwd mm6, mm7 ; mm6=(00 01 10 11)
+ punpckhwd mm4, mm7 ; mm4=(20 21 30 31)
+ movq mm2, mm1 ; transpose coefficients(phase 1)
+ punpcklwd mm1, mm3 ; mm1=(40 41 50 51)
+ punpckhwd mm2, mm3 ; mm2=(60 61 70 71)
+
+ movq mm7, mm6 ; transpose coefficients(phase 2)
+ punpckldq mm6, mm0 ; mm6=(00 01 02 03)=data0
+ punpckhdq mm7, mm0 ; mm7=(10 11 12 13)=data1
+ movq mm3, mm2 ; transpose coefficients(phase 2)
+ punpckldq mm2, mm5 ; mm2=(60 61 62 63)=data6
+ punpckhdq mm3, mm5 ; mm3=(70 71 72 73)=data7
+
+ movq mm0, mm7
+ movq mm5, mm6
+ psubw mm7, mm2 ; mm7=data1-data6=tmp6
+ psubw mm6, mm3 ; mm6=data0-data7=tmp7
+ paddw mm0, mm2 ; mm0=data1+data6=tmp1
+ paddw mm5, mm3 ; mm5=data0+data7=tmp0
+
+ movq mm2, MMWORD [wk(0)] ; mm2=(22 23 32 33)
+ movq mm3, MMWORD [wk(1)] ; mm3=(42 43 52 53)
+ movq MMWORD [wk(0)], mm7 ; wk(0)=tmp6
+ movq MMWORD [wk(1)], mm6 ; wk(1)=tmp7
+
+ movq mm7, mm4 ; transpose coefficients(phase 2)
+ punpckldq mm4, mm2 ; mm4=(20 21 22 23)=data2
+ punpckhdq mm7, mm2 ; mm7=(30 31 32 33)=data3
+ movq mm6, mm1 ; transpose coefficients(phase 2)
+ punpckldq mm1, mm3 ; mm1=(40 41 42 43)=data4
+ punpckhdq mm6, mm3 ; mm6=(50 51 52 53)=data5
+
+ movq mm2, mm7
+ movq mm3, mm4
+ paddw mm7, mm1 ; mm7=data3+data4=tmp3
+ paddw mm4, mm6 ; mm4=data2+data5=tmp2
+ psubw mm2, mm1 ; mm2=data3-data4=tmp4
+ psubw mm3, mm6 ; mm3=data2-data5=tmp5
+
+ ; -- Even part
+
+ movq mm1, mm5
+ movq mm6, mm0
+ psubw mm5, mm7 ; mm5=tmp13
+ psubw mm0, mm4 ; mm0=tmp12
+ paddw mm1, mm7 ; mm1=tmp10
+ paddw mm6, mm4 ; mm6=tmp11
+
+ paddw mm0, mm5
+ psllw mm0, PRE_MULTIPLY_SCALE_BITS
+ pmulhw mm0, [GOTOFF(ebx,PW_F0707)] ; mm0=z1
+
+ movq mm7, mm1
+ movq mm4, mm5
+ psubw mm1, mm6 ; mm1=data4
+ psubw mm5, mm0 ; mm5=data6
+ paddw mm7, mm6 ; mm7=data0
+ paddw mm4, mm0 ; mm4=data2
+
+ movq MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)], mm1
+ movq MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)], mm5
+ movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm7
+ movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
+
+ ; -- Odd part
+
+ movq mm6, MMWORD [wk(0)] ; mm6=tmp6
+ movq mm0, MMWORD [wk(1)] ; mm0=tmp7
+
+ paddw mm2, mm3 ; mm2=tmp10
+ paddw mm3, mm6 ; mm3=tmp11
+ paddw mm6, mm0 ; mm6=tmp12, mm0=tmp7
+
+ psllw mm2, PRE_MULTIPLY_SCALE_BITS
+ psllw mm6, PRE_MULTIPLY_SCALE_BITS
+
+ psllw mm3, PRE_MULTIPLY_SCALE_BITS
+ pmulhw mm3, [GOTOFF(ebx,PW_F0707)] ; mm3=z3
+
+ movq mm1, mm2 ; mm1=tmp10
+ psubw mm2, mm6
+ pmulhw mm2, [GOTOFF(ebx,PW_F0382)] ; mm2=z5
+ pmulhw mm1, [GOTOFF(ebx,PW_F0541)] ; mm1=MULTIPLY(tmp10,FIX_0_54119610)
+ pmulhw mm6, [GOTOFF(ebx,PW_F1306)] ; mm6=MULTIPLY(tmp12,FIX_1_30656296)
+ paddw mm1, mm2 ; mm1=z2
+ paddw mm6, mm2 ; mm6=z4
+
+ movq mm5, mm0
+ psubw mm0, mm3 ; mm0=z13
+ paddw mm5, mm3 ; mm5=z11
+
+ movq mm7, mm0
+ movq mm4, mm5
+ psubw mm0, mm1 ; mm0=data3
+ psubw mm5, mm6 ; mm5=data7
+ paddw mm7, mm1 ; mm7=data5
+ paddw mm4, mm6 ; mm4=data1
+
+ movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm0
+ movq MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)], mm5
+ movq MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)], mm7
+ movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm4
+
+ add edx, byte 4*SIZEOF_DCTELEM
+ dec ecx
+ jnz near .columnloop
+
+ emms ; empty MMX state
+
+; pop edi ; unused
+; pop esi ; unused
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ poppic ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
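
For orientation, each loop iteration above is a 4-wide SIMD rendition of the scalar even/odd butterflies of the AA&N algorithm in IJG's jfdctfst.c. A condensed C sketch of one 8-point transform follows; the floating-point constants stand in for the pmulhw fixed-point multiplies, and the variable names match the register comments above:

#include <stdio.h>

typedef short DCTELEM;

/* MUL(v, c) stands in for the pmulhw fixed-point multiply by FIX(c) */
#define MUL(v, c)  ((DCTELEM)((v) * (c)))

static void fdct_ifast_1d(DCTELEM d[8])
{
  DCTELEM tmp0 = d[0] + d[7], tmp7 = d[0] - d[7];
  DCTELEM tmp1 = d[1] + d[6], tmp6 = d[1] - d[6];
  DCTELEM tmp2 = d[2] + d[5], tmp5 = d[2] - d[5];
  DCTELEM tmp3 = d[3] + d[4], tmp4 = d[3] - d[4];

  /* Even part */
  DCTELEM tmp10 = tmp0 + tmp3, tmp13 = tmp0 - tmp3;
  DCTELEM tmp11 = tmp1 + tmp2, tmp12 = tmp1 - tmp2;
  d[0] = tmp10 + tmp11;
  d[4] = tmp10 - tmp11;
  DCTELEM z1 = MUL(tmp12 + tmp13, 0.707106781);
  d[2] = tmp13 + z1;
  d[6] = tmp13 - z1;

  /* Odd part */
  tmp10 = tmp4 + tmp5; tmp11 = tmp5 + tmp6; tmp12 = tmp6 + tmp7;
  DCTELEM z5 = MUL(tmp10 - tmp12, 0.382683433);
  DCTELEM z2 = MUL(tmp10, 0.541196100) + z5;
  DCTELEM z4 = MUL(tmp12, 1.306562965) + z5;
  DCTELEM z3 = MUL(tmp11, 0.707106781);
  DCTELEM z11 = tmp7 + z3, z13 = tmp7 - z3;
  d[5] = z13 + z2;
  d[3] = z13 - z2;
  d[1] = z11 + z4;
  d[7] = z11 - z4;
}

int main(void)
{
  DCTELEM d[8] = { 52, 55, 61, 66, 70, 61, 64, 73 };
  fdct_ifast_1d(d);
  for (int i = 0; i < 8; i++)
    printf("%d ", d[i]);
  printf("\n");
  return 0;
}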
diff --git a/media/libjpeg/simd/i386/jfdctfst-sse2.asm b/media/libjpeg/simd/i386/jfdctfst-sse2.asm
new file mode 100644
index 0000000000..446fa7a68f
--- /dev/null
+++ b/media/libjpeg/simd/i386/jfdctfst-sse2.asm
@@ -0,0 +1,403 @@
+;
+; jfdctfst.asm - fast integer FDCT (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast but less accurate integer implementation of
+; the forward DCT (Discrete Cosine Transform). The following code is
+; based directly on IJG's original jfdctfst.c; see jfdctfst.c
+; for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 8 ; 14 is also OK.
+
+%if CONST_BITS == 8
+F_0_382 equ 98 ; FIX(0.382683433)
+F_0_541 equ 139 ; FIX(0.541196100)
+F_0_707 equ 181 ; FIX(0.707106781)
+F_1_306 equ 334 ; FIX(1.306562965)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
+F_0_382 equ DESCALE( 410903207, 30 - CONST_BITS) ; FIX(0.382683433)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100)
+F_0_707 equ DESCALE( 759250124, 30 - CONST_BITS) ; FIX(0.707106781)
+F_1_306 equ DESCALE(1402911301, 30 - CONST_BITS) ; FIX(1.306562965)
+%endif
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
+
+%define PRE_MULTIPLY_SCALE_BITS 2
+%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+
+ alignz 32
+ GLOBAL_DATA(jconst_fdct_ifast_sse2)
+
+EXTN(jconst_fdct_ifast_sse2):
+
+PW_F0707 times 8 dw F_0_707 << CONST_SHIFT
+PW_F0382 times 8 dw F_0_382 << CONST_SHIFT
+PW_F0541 times 8 dw F_0_541 << CONST_SHIFT
+PW_F1306 times 8 dw F_1_306 << CONST_SHIFT
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_ifast_sse2(DCTELEM *data)
+;
+
+%define data(b) (b) + 8 ; DCTELEM *data
+
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+ ; xmmword wk[WK_NUM]
+%define WK_NUM 2
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_fdct_ifast_sse2)
+
+EXTN(jsimd_fdct_ifast_sse2):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)]
+ pushpic ebx
+; push ecx ; unused
+; push edx ; need not be preserved
+; push esi ; unused
+; push edi ; unused
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process rows.
+
+ mov edx, POINTER [data(eax)] ; (DCTELEM *)
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+ movdqa xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+
+ ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
+ ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
+
+ movdqa xmm4, xmm0 ; transpose coefficients(phase 1)
+ punpcklwd xmm0, xmm1 ; xmm0=(00 10 01 11 02 12 03 13)
+ punpckhwd xmm4, xmm1 ; xmm4=(04 14 05 15 06 16 07 17)
+ movdqa xmm5, xmm2 ; transpose coefficients(phase 1)
+ punpcklwd xmm2, xmm3 ; xmm2=(20 30 21 31 22 32 23 33)
+ punpckhwd xmm5, xmm3 ; xmm5=(24 34 25 35 26 36 27 37)
+
+ movdqa xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
+ movdqa xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
+
+ ; xmm6=(40 41 42 43 44 45 46 47), xmm1=(60 61 62 63 64 65 66 67)
+ ; xmm7=(50 51 52 53 54 55 56 57), xmm3=(70 71 72 73 74 75 76 77)
+
+ movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33)
+ movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37)
+
+ movdqa xmm2, xmm6 ; transpose coefficients(phase 1)
+ punpcklwd xmm6, xmm7 ; xmm6=(40 50 41 51 42 52 43 53)
+ punpckhwd xmm2, xmm7 ; xmm2=(44 54 45 55 46 56 47 57)
+ movdqa xmm5, xmm1 ; transpose coefficients(phase 1)
+ punpcklwd xmm1, xmm3 ; xmm1=(60 70 61 71 62 72 63 73)
+ punpckhwd xmm5, xmm3 ; xmm5=(64 74 65 75 66 76 67 77)
+
+ movdqa xmm7, xmm6 ; transpose coefficients(phase 2)
+ punpckldq xmm6, xmm1 ; xmm6=(40 50 60 70 41 51 61 71)
+ punpckhdq xmm7, xmm1 ; xmm7=(42 52 62 72 43 53 63 73)
+ movdqa xmm3, xmm2 ; transpose coefficients(phase 2)
+ punpckldq xmm2, xmm5 ; xmm2=(44 54 64 74 45 55 65 75)
+ punpckhdq xmm3, xmm5 ; xmm3=(46 56 66 76 47 57 67 77)
+
+ movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33)
+ movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37)
+ movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(42 52 62 72 43 53 63 73)
+ movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=(44 54 64 74 45 55 65 75)
+
+ movdqa xmm7, xmm0 ; transpose coefficients(phase 2)
+ punpckldq xmm0, xmm1 ; xmm0=(00 10 20 30 01 11 21 31)
+ punpckhdq xmm7, xmm1 ; xmm7=(02 12 22 32 03 13 23 33)
+ movdqa xmm2, xmm4 ; transpose coefficients(phase 2)
+ punpckldq xmm4, xmm5 ; xmm4=(04 14 24 34 05 15 25 35)
+ punpckhdq xmm2, xmm5 ; xmm2=(06 16 26 36 07 17 27 37)
+
+ movdqa xmm1, xmm0 ; transpose coefficients(phase 3)
+ punpcklqdq xmm0, xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0
+ punpckhqdq xmm1, xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1
+ movdqa xmm5, xmm2 ; transpose coefficients(phase 3)
+ punpcklqdq xmm2, xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6
+ punpckhqdq xmm5, xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7
+
+ movdqa xmm6, xmm1
+ movdqa xmm3, xmm0
+ psubw xmm1, xmm2 ; xmm1=data1-data6=tmp6
+ psubw xmm0, xmm5 ; xmm0=data0-data7=tmp7
+ paddw xmm6, xmm2 ; xmm6=data1+data6=tmp1
+ paddw xmm3, xmm5 ; xmm3=data0+data7=tmp0
+
+ movdqa xmm2, XMMWORD [wk(0)] ; xmm2=(42 52 62 72 43 53 63 73)
+ movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(44 54 64 74 45 55 65 75)
+ movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6
+ movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7
+
+ movdqa xmm1, xmm7 ; transpose coefficients(phase 3)
+ punpcklqdq xmm7, xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2
+ punpckhqdq xmm1, xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3
+ movdqa xmm0, xmm4 ; transpose coefficients(phase 3)
+ punpcklqdq xmm4, xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4
+ punpckhqdq xmm0, xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5
+
+ movdqa xmm2, xmm1
+ movdqa xmm5, xmm7
+ paddw xmm1, xmm4 ; xmm1=data3+data4=tmp3
+ paddw xmm7, xmm0 ; xmm7=data2+data5=tmp2
+ psubw xmm2, xmm4 ; xmm2=data3-data4=tmp4
+ psubw xmm5, xmm0 ; xmm5=data2-data5=tmp5
+
+ ; -- Even part
+
+ movdqa xmm4, xmm3
+ movdqa xmm0, xmm6
+ psubw xmm3, xmm1 ; xmm3=tmp13
+ psubw xmm6, xmm7 ; xmm6=tmp12
+ paddw xmm4, xmm1 ; xmm4=tmp10
+ paddw xmm0, xmm7 ; xmm0=tmp11
+
+ paddw xmm6, xmm3
+ psllw xmm6, PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm6, [GOTOFF(ebx,PW_F0707)] ; xmm6=z1
+
+ movdqa xmm1, xmm4
+ movdqa xmm7, xmm3
+ psubw xmm4, xmm0 ; xmm4=data4
+ psubw xmm3, xmm6 ; xmm3=data6
+ paddw xmm1, xmm0 ; xmm1=data0
+ paddw xmm7, xmm6 ; xmm7=data2
+
+ movdqa xmm0, XMMWORD [wk(0)] ; xmm0=tmp6
+ movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp7
+ movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=data4
+ movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=data6
+
+ ; -- Odd part
+
+ paddw xmm2, xmm5 ; xmm2=tmp10
+ paddw xmm5, xmm0 ; xmm5=tmp11
+ paddw xmm0, xmm6 ; xmm0=tmp12, xmm6=tmp7
+
+ psllw xmm2, PRE_MULTIPLY_SCALE_BITS
+ psllw xmm0, PRE_MULTIPLY_SCALE_BITS
+
+ psllw xmm5, PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm5, [GOTOFF(ebx,PW_F0707)] ; xmm5=z3
+
+ movdqa xmm4, xmm2 ; xmm4=tmp10
+ psubw xmm2, xmm0
+ pmulhw xmm2, [GOTOFF(ebx,PW_F0382)] ; xmm2=z5
+ pmulhw xmm4, [GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
+ pmulhw xmm0, [GOTOFF(ebx,PW_F1306)] ; xmm0=MULTIPLY(tmp12,FIX_1_306562)
+ paddw xmm4, xmm2 ; xmm4=z2
+ paddw xmm0, xmm2 ; xmm0=z4
+
+ movdqa xmm3, xmm6
+ psubw xmm6, xmm5 ; xmm6=z13
+ paddw xmm3, xmm5 ; xmm3=z11
+
+ movdqa xmm2, xmm6
+ movdqa xmm5, xmm3
+ psubw xmm6, xmm4 ; xmm6=data3
+ psubw xmm3, xmm0 ; xmm3=data7
+ paddw xmm2, xmm4 ; xmm2=data5
+ paddw xmm5, xmm0 ; xmm5=data1
+
+ ; ---- Pass 2: process columns.
+
+; mov edx, POINTER [data(eax)] ; (DCTELEM *)
+
+ ; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72)
+ ; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73)
+
+ movdqa xmm4, xmm1 ; transpose coefficients(phase 1)
+ punpcklwd xmm1, xmm5 ; xmm1=(00 01 10 11 20 21 30 31)
+ punpckhwd xmm4, xmm5 ; xmm4=(40 41 50 51 60 61 70 71)
+ movdqa xmm0, xmm7 ; transpose coefficients(phase 1)
+ punpcklwd xmm7, xmm6 ; xmm7=(02 03 12 13 22 23 32 33)
+ punpckhwd xmm0, xmm6 ; xmm0=(42 43 52 53 62 63 72 73)
+
+ movdqa xmm5, XMMWORD [wk(0)] ; xmm5=col4
+ movdqa xmm6, XMMWORD [wk(1)] ; xmm6=col6
+
+ ; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76)
+ ; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77)
+
+ movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(02 03 12 13 22 23 32 33)
+ movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(42 43 52 53 62 63 72 73)
+
+ movdqa xmm7, xmm5 ; transpose coefficients(phase 1)
+ punpcklwd xmm5, xmm2 ; xmm5=(04 05 14 15 24 25 34 35)
+ punpckhwd xmm7, xmm2 ; xmm7=(44 45 54 55 64 65 74 75)
+ movdqa xmm0, xmm6 ; transpose coefficients(phase 1)
+ punpcklwd xmm6, xmm3 ; xmm6=(06 07 16 17 26 27 36 37)
+ punpckhwd xmm0, xmm3 ; xmm0=(46 47 56 57 66 67 76 77)
+
+ movdqa xmm2, xmm5 ; transpose coefficients(phase 2)
+ punpckldq xmm5, xmm6 ; xmm5=(04 05 06 07 14 15 16 17)
+ punpckhdq xmm2, xmm6 ; xmm2=(24 25 26 27 34 35 36 37)
+ movdqa xmm3, xmm7 ; transpose coefficients(phase 2)
+ punpckldq xmm7, xmm0 ; xmm7=(44 45 46 47 54 55 56 57)
+ punpckhdq xmm3, xmm0 ; xmm3=(64 65 66 67 74 75 76 77)
+
+ movdqa xmm6, XMMWORD [wk(0)] ; xmm6=(02 03 12 13 22 23 32 33)
+ movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(42 43 52 53 62 63 72 73)
+ movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(24 25 26 27 34 35 36 37)
+ movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(44 45 46 47 54 55 56 57)
+
+ movdqa xmm2, xmm1 ; transpose coefficients(phase 2)
+ punpckldq xmm1, xmm6 ; xmm1=(00 01 02 03 10 11 12 13)
+ punpckhdq xmm2, xmm6 ; xmm2=(20 21 22 23 30 31 32 33)
+ movdqa xmm7, xmm4 ; transpose coefficients(phase 2)
+ punpckldq xmm4, xmm0 ; xmm4=(40 41 42 43 50 51 52 53)
+ punpckhdq xmm7, xmm0 ; xmm7=(60 61 62 63 70 71 72 73)
+
+ movdqa xmm6, xmm1 ; transpose coefficients(phase 3)
+ punpcklqdq xmm1, xmm5 ; xmm1=(00 01 02 03 04 05 06 07)=data0
+ punpckhqdq xmm6, xmm5 ; xmm6=(10 11 12 13 14 15 16 17)=data1
+ movdqa xmm0, xmm7 ; transpose coefficients(phase 3)
+ punpcklqdq xmm7, xmm3 ; xmm7=(60 61 62 63 64 65 66 67)=data6
+ punpckhqdq xmm0, xmm3 ; xmm0=(70 71 72 73 74 75 76 77)=data7
+
+ movdqa xmm5, xmm6
+ movdqa xmm3, xmm1
+ psubw xmm6, xmm7 ; xmm6=data1-data6=tmp6
+ psubw xmm1, xmm0 ; xmm1=data0-data7=tmp7
+ paddw xmm5, xmm7 ; xmm5=data1+data6=tmp1
+ paddw xmm3, xmm0 ; xmm3=data0+data7=tmp0
+
+ movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(24 25 26 27 34 35 36 37)
+ movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(44 45 46 47 54 55 56 57)
+ movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=tmp6
+ movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=tmp7
+
+ movdqa xmm6, xmm2 ; transpose coefficients(phase 3)
+ punpcklqdq xmm2, xmm7 ; xmm2=(20 21 22 23 24 25 26 27)=data2
+ punpckhqdq xmm6, xmm7 ; xmm6=(30 31 32 33 34 35 36 37)=data3
+ movdqa xmm1, xmm4 ; transpose coefficients(phase 3)
+ punpcklqdq xmm4, xmm0 ; xmm4=(40 41 42 43 44 45 46 47)=data4
+ punpckhqdq xmm1, xmm0 ; xmm1=(50 51 52 53 54 55 56 57)=data5
+
+ movdqa xmm7, xmm6
+ movdqa xmm0, xmm2
+ paddw xmm6, xmm4 ; xmm6=data3+data4=tmp3
+ paddw xmm2, xmm1 ; xmm2=data2+data5=tmp2
+ psubw xmm7, xmm4 ; xmm7=data3-data4=tmp4
+ psubw xmm0, xmm1 ; xmm0=data2-data5=tmp5
+
+ ; -- Even part
+
+ movdqa xmm4, xmm3
+ movdqa xmm1, xmm5
+ psubw xmm3, xmm6 ; xmm3=tmp13
+ psubw xmm5, xmm2 ; xmm5=tmp12
+ paddw xmm4, xmm6 ; xmm4=tmp10
+ paddw xmm1, xmm2 ; xmm1=tmp11
+
+ paddw xmm5, xmm3
+ psllw xmm5, PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm5, [GOTOFF(ebx,PW_F0707)] ; xmm5=z1
+
+ movdqa xmm6, xmm4
+ movdqa xmm2, xmm3
+ psubw xmm4, xmm1 ; xmm4=data4
+ psubw xmm3, xmm5 ; xmm3=data6
+ paddw xmm6, xmm1 ; xmm6=data0
+ paddw xmm2, xmm5 ; xmm2=data2
+
+ movdqa XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm4
+ movdqa XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm3
+ movdqa XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm6
+ movdqa XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm2
+
+ ; -- Odd part
+
+ movdqa xmm1, XMMWORD [wk(0)] ; xmm1=tmp6
+ movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7
+
+ paddw xmm7, xmm0 ; xmm7=tmp10
+ paddw xmm0, xmm1 ; xmm0=tmp11
+ paddw xmm1, xmm5 ; xmm1=tmp12, xmm5=tmp7
+
+ psllw xmm7, PRE_MULTIPLY_SCALE_BITS
+ psllw xmm1, PRE_MULTIPLY_SCALE_BITS
+
+ psllw xmm0, PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm0, [GOTOFF(ebx,PW_F0707)] ; xmm0=z3
+
+ movdqa xmm4, xmm7 ; xmm4=tmp10
+ psubw xmm7, xmm1
+ pmulhw xmm7, [GOTOFF(ebx,PW_F0382)] ; xmm7=z5
+ pmulhw xmm4, [GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
+ pmulhw xmm1, [GOTOFF(ebx,PW_F1306)] ; xmm1=MULTIPLY(tmp12,FIX_1_306562)
+ paddw xmm4, xmm7 ; xmm4=z2
+ paddw xmm1, xmm7 ; xmm1=z4
+
+ movdqa xmm3, xmm5
+ psubw xmm5, xmm0 ; xmm5=z13
+ paddw xmm3, xmm0 ; xmm3=z11
+
+ movdqa xmm6, xmm5
+ movdqa xmm2, xmm3
+ psubw xmm5, xmm4 ; xmm5=data3
+ psubw xmm3, xmm1 ; xmm3=data7
+ paddw xmm6, xmm4 ; xmm6=data5
+ paddw xmm2, xmm1 ; xmm2=data1
+
+ movdqa XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm5
+ movdqa XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm3
+ movdqa XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm6
+ movdqa XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm2
+
+; pop edi ; unused
+; pop esi ; unused
+; pop edx ; need not be preserved
+; pop ecx ; unused
+ poppic ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
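
Unlike the MMX version, this routine needs no row/column loop: each pass handles the full 8x8 block in registers, and the three punpck "phases" form a classic merge transpose (16-bit, then 32-bit, then 64-bit interleaves). The plain-C model below mimics those primitives and prints the first transposed row; it is a sketch for illustration, not library code:

#include <stdio.h>
#include <string.h>
#include <stdint.h>

typedef int16_t vec8[8];

/* punpcklwd / punpckhwd: interleave 16-bit words from low / high halves */
static void unpack_wd(const vec8 a, const vec8 b, vec8 lo, vec8 hi)
{
  for (int i = 0; i < 4; i++) {
    lo[2 * i] = a[i];      lo[2 * i + 1] = b[i];
    hi[2 * i] = a[i + 4];  hi[2 * i + 1] = b[i + 4];
  }
}

/* punpckldq / punpckhdq: same interleave on 32-bit lanes (word pairs) */
static void unpack_dq(const vec8 a, const vec8 b, vec8 lo, vec8 hi)
{
  for (int i = 0; i < 2; i++) {
    memcpy(&lo[4 * i],     &a[2 * i],     2 * sizeof(int16_t));
    memcpy(&lo[4 * i + 2], &b[2 * i],     2 * sizeof(int16_t));
    memcpy(&hi[4 * i],     &a[2 * i + 4], 2 * sizeof(int16_t));
    memcpy(&hi[4 * i + 2], &b[2 * i + 4], 2 * sizeof(int16_t));
  }
}

/* punpcklqdq / punpckhqdq: 64-bit lanes (word quads) */
static void unpack_qdq(const vec8 a, const vec8 b, vec8 lo, vec8 hi)
{
  memcpy(&lo[0], &a[0], 4 * sizeof(int16_t));
  memcpy(&lo[4], &b[0], 4 * sizeof(int16_t));
  memcpy(&hi[0], &a[4], 4 * sizeof(int16_t));
  memcpy(&hi[4], &b[4], 4 * sizeof(int16_t));
}

int main(void)
{
  vec8 r[8], t1[8], t2[8], t3[8];
  for (int i = 0; i < 8; i++)
    for (int j = 0; j < 8; j++)
      r[i][j] = (int16_t)(10 * i + j);  /* element "ij", as in the comments */

  for (int i = 0; i < 4; i++)                /* phase 1: 16-bit interleave */
    unpack_wd(r[2 * i], r[2 * i + 1], t1[2 * i], t1[2 * i + 1]);
  for (int i = 0; i < 2; i++) {              /* phase 2: 32-bit interleave */
    unpack_dq(t1[4 * i],     t1[4 * i + 2], t2[4 * i],     t2[4 * i + 1]);
    unpack_dq(t1[4 * i + 1], t1[4 * i + 3], t2[4 * i + 2], t2[4 * i + 3]);
  }
  for (int i = 0; i < 4; i++)                /* phase 3: 64-bit interleave */
    unpack_qdq(t2[i], t2[i + 4], t3[2 * i], t3[2 * i + 1]);

  printf("row 0 of the transpose: ");
  for (int j = 0; j < 8; j++)
    printf("%02d ", t3[0][j]);               /* 00 10 20 30 40 50 60 70 */
  printf("\n");
  return 0;
}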
diff --git a/media/libjpeg/simd/i386/jfdctint-avx2.asm b/media/libjpeg/simd/i386/jfdctint-avx2.asm
new file mode 100644
index 0000000000..23cf733135
--- /dev/null
+++ b/media/libjpeg/simd/i386/jfdctint-avx2.asm
@@ -0,0 +1,331 @@
+;
+; jfdctint.asm - accurate integer FDCT (AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, 2018, 2020, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slower but more accurate integer implementation of the
+; forward DCT (Discrete Cosine Transform). The following code is based
+; directly on IJG's original jfdctint.c; see jfdctint.c for
+; more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 13
+%define PASS1_BITS 2
+
+%define DESCALE_P1 (CONST_BITS - PASS1_BITS)
+%define DESCALE_P2 (CONST_BITS + PASS1_BITS)
+
+%if CONST_BITS == 13
+F_0_298 equ 2446 ; FIX(0.298631336)
+F_0_390 equ 3196 ; FIX(0.390180644)
+F_0_541 equ 4433 ; FIX(0.541196100)
+F_0_765 equ 6270 ; FIX(0.765366865)
+F_0_899 equ 7373 ; FIX(0.899976223)
+F_1_175 equ 9633 ; FIX(1.175875602)
+F_1_501 equ 12299 ; FIX(1.501321110)
+F_1_847 equ 15137 ; FIX(1.847759065)
+F_1_961 equ 16069 ; FIX(1.961570560)
+F_2_053 equ 16819 ; FIX(2.053119869)
+F_2_562 equ 20995 ; FIX(2.562915447)
+F_3_072 equ 25172 ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
+F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS) ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS) ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS) ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS) ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS) ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS) ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026)
+%endif
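
Both branches of the %if produce identical tables: FIX(c) is c scaled to CONST_BITS fraction bits, and the DESCALE form merely re-derives the same value from a 30-fraction-bit integer, because NASM cannot evaluate floating-point expressions. A quick C check for one constant (illustrative only):

#include <stdio.h>

#define CONST_BITS  13
#define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))

int main(void)
{
  /* FIX(0.541196100) two ways: directly, and via the 30-bit constant */
  int direct  = (int)(0.541196100 * (1 << CONST_BITS) + 0.5);  /* 4433 */
  int derived = DESCALE(581104887, 30 - CONST_BITS);           /* 4433 */
  printf("%d %d\n", direct, derived);
  return 0;
}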
+
+; --------------------------------------------------------------------------
+; In-place 8x8x16-bit matrix transpose using AVX2 instructions
+; %1-%4: Input/output registers
+; %5-%8: Temp registers
+
+%macro dotranspose 8
+ ; %1=(00 01 02 03 04 05 06 07 40 41 42 43 44 45 46 47)
+ ; %2=(10 11 12 13 14 15 16 17 50 51 52 53 54 55 56 57)
+ ; %3=(20 21 22 23 24 25 26 27 60 61 62 63 64 65 66 67)
+ ; %4=(30 31 32 33 34 35 36 37 70 71 72 73 74 75 76 77)
+
+ vpunpcklwd %5, %1, %2
+ vpunpckhwd %6, %1, %2
+ vpunpcklwd %7, %3, %4
+ vpunpckhwd %8, %3, %4
+ ; transpose coefficients(phase 1)
+ ; %5=(00 10 01 11 02 12 03 13 40 50 41 51 42 52 43 53)
+ ; %6=(04 14 05 15 06 16 07 17 44 54 45 55 46 56 47 57)
+ ; %7=(20 30 21 31 22 32 23 33 60 70 61 71 62 72 63 73)
+ ; %8=(24 34 25 35 26 36 27 37 64 74 65 75 66 76 67 77)
+
+ vpunpckldq %1, %5, %7
+ vpunpckhdq %2, %5, %7
+ vpunpckldq %3, %6, %8
+ vpunpckhdq %4, %6, %8
+ ; transpose coefficients(phase 2)
+ ; %1=(00 10 20 30 01 11 21 31 40 50 60 70 41 51 61 71)
+ ; %2=(02 12 22 32 03 13 23 33 42 52 62 72 43 53 63 73)
+ ; %3=(04 14 24 34 05 15 25 35 44 54 64 74 45 55 65 75)
+ ; %4=(06 16 26 36 07 17 27 37 46 56 66 76 47 57 67 77)
+
+ vpermq %1, %1, 0x8D
+ vpermq %2, %2, 0x8D
+ vpermq %3, %3, 0xD8
+ vpermq %4, %4, 0xD8
+ ; transpose coefficients(phase 3)
+ ; %1=(01 11 21 31 41 51 61 71 00 10 20 30 40 50 60 70)
+ ; %2=(03 13 23 33 43 53 63 73 02 12 22 32 42 52 62 72)
+ ; %3=(04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75)
+ ; %4=(06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77)
+%endmacro
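
The phase-3 vpermq step selects 64-bit lanes by a 2-bit-per-lane immediate: 0x8D picks input lanes 1, 3, 0, 2 and 0xD8 picks 0, 2, 1, 3, which is what swaps the row halves into the ordering that dodct expects. A tiny C model of the selection (a sketch):

#include <stdio.h>
#include <stdint.h>

/* Model of vpermq ymm, ymm, imm8: output 64-bit lane k comes from input
 * lane (imm >> (2*k)) & 3.  Illustrative only. */
static void vpermq_model(const uint64_t src[4], uint64_t dst[4], int imm)
{
  for (int k = 0; k < 4; k++)
    dst[k] = src[(imm >> (2 * k)) & 3];
}

int main(void)
{
  uint64_t in[4] = { 0, 1, 2, 3 }, out[4];
  vpermq_model(in, out, 0x8D);        /* lanes 1, 3, 0, 2 -- as above */
  printf("%llu %llu %llu %llu\n",
         (unsigned long long)out[0], (unsigned long long)out[1],
         (unsigned long long)out[2], (unsigned long long)out[3]);
  return 0;
}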
+
+; --------------------------------------------------------------------------
+; In-place 8x8x16-bit accurate integer forward DCT using AVX2 instructions
+; %1-%4: Input/output registers
+; %5-%8: Temp registers
+; %9: Pass (1 or 2)
+
+%macro dodct 9
+ vpsubw %5, %1, %4 ; %5=data1_0-data6_7=tmp6_7
+ vpaddw %6, %1, %4 ; %6=data1_0+data6_7=tmp1_0
+ vpaddw %7, %2, %3 ; %7=data3_2+data4_5=tmp3_2
+ vpsubw %8, %2, %3 ; %8=data3_2-data4_5=tmp4_5
+
+ ; -- Even part
+
+ vperm2i128 %6, %6, %6, 0x01 ; %6=tmp0_1
+ vpaddw %1, %6, %7 ; %1=tmp0_1+tmp3_2=tmp10_11
+ vpsubw %6, %6, %7 ; %6=tmp0_1-tmp3_2=tmp13_12
+
+ vperm2i128 %7, %1, %1, 0x01 ; %7=tmp11_10
+ vpsignw %1, %1, [GOTOFF(ebx, PW_1_NEG1)] ; %1=tmp10_neg11
+ vpaddw %7, %7, %1 ; %7=(tmp10+tmp11)_(tmp10-tmp11)
+%if %9 == 1
+ vpsllw %1, %7, PASS1_BITS ; %1=data0_4
+%else
+ vpaddw %7, %7, [GOTOFF(ebx, PW_DESCALE_P2X)]
+ vpsraw %1, %7, PASS1_BITS ; %1=data0_4
+%endif
+
+ ; (Original)
+ ; z1 = (tmp12 + tmp13) * 0.541196100;
+ ; data2 = z1 + tmp13 * 0.765366865;
+ ; data6 = z1 + tmp12 * -1.847759065;
+ ;
+ ; (This implementation)
+ ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+ ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+ vperm2i128 %7, %6, %6, 0x01 ; %7=tmp12_13
+ vpunpcklwd %2, %6, %7
+ vpunpckhwd %6, %6, %7
+ vpmaddwd %2, %2, [GOTOFF(ebx, PW_F130_F054_MF130_F054)] ; %2=data2_6L
+ vpmaddwd %6, %6, [GOTOFF(ebx, PW_F130_F054_MF130_F054)] ; %6=data2_6H
+
+ vpaddd %2, %2, [GOTOFF(ebx, PD_DESCALE_P %+ %9)]
+ vpaddd %6, %6, [GOTOFF(ebx, PD_DESCALE_P %+ %9)]
+ vpsrad %2, %2, DESCALE_P %+ %9
+ vpsrad %6, %6, DESCALE_P %+ %9
+
+ vpackssdw %3, %2, %6 ; %3=data2_6
+
+ ; -- Odd part
+
+ vpaddw %7, %8, %5 ; %7=tmp4_5+tmp6_7=z3_4
+
+ ; (Original)
+ ; z5 = (z3 + z4) * 1.175875602;
+ ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ ; z3 += z5; z4 += z5;
+ ;
+ ; (This implementation)
+ ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+ vperm2i128 %2, %7, %7, 0x01 ; %2=z4_3
+ vpunpcklwd %6, %7, %2
+ vpunpckhwd %7, %7, %2
+ vpmaddwd %6, %6, [GOTOFF(ebx, PW_MF078_F117_F078_F117)] ; %6=z3_4L
+ vpmaddwd %7, %7, [GOTOFF(ebx, PW_MF078_F117_F078_F117)] ; %7=z3_4H
+
+ ; (Original)
+ ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6;
+ ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869;
+ ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110;
+ ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4;
+ ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4;
+ ;
+ ; (This implementation)
+ ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+ ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+ ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+ ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+ ; data7 = tmp4 + z3; data5 = tmp5 + z4;
+ ; data3 = tmp6 + z3; data1 = tmp7 + z4;
+
+ vperm2i128 %4, %5, %5, 0x01 ; %4=tmp7_6
+ vpunpcklwd %2, %8, %4
+ vpunpckhwd %4, %8, %4
+ vpmaddwd %2, %2, [GOTOFF(ebx, PW_MF060_MF089_MF050_MF256)] ; %2=tmp4_5L
+ vpmaddwd %4, %4, [GOTOFF(ebx, PW_MF060_MF089_MF050_MF256)] ; %4=tmp4_5H
+
+ vpaddd %2, %2, %6 ; %2=data7_5L
+ vpaddd %4, %4, %7 ; %4=data7_5H
+
+ vpaddd %2, %2, [GOTOFF(ebx, PD_DESCALE_P %+ %9)]
+ vpaddd %4, %4, [GOTOFF(ebx, PD_DESCALE_P %+ %9)]
+ vpsrad %2, %2, DESCALE_P %+ %9
+ vpsrad %4, %4, DESCALE_P %+ %9
+
+ vpackssdw %4, %2, %4 ; %4=data7_5
+
+ vperm2i128 %2, %8, %8, 0x01 ; %2=tmp5_4
+ vpunpcklwd %8, %5, %2
+ vpunpckhwd %5, %5, %2
+ vpmaddwd %8, %8, [GOTOFF(ebx, PW_F050_MF256_F060_MF089)] ; %8=tmp6_7L
+ vpmaddwd %5, %5, [GOTOFF(ebx, PW_F050_MF256_F060_MF089)] ; %5=tmp6_7H
+
+ vpaddd %8, %8, %6 ; %8=data3_1L
+ vpaddd %5, %5, %7 ; %5=data3_1H
+
+ vpaddd %8, %8, [GOTOFF(ebx, PD_DESCALE_P %+ %9)]
+ vpaddd %5, %5, [GOTOFF(ebx, PD_DESCALE_P %+ %9)]
+ vpsrad %8, %8, DESCALE_P %+ %9
+ vpsrad %5, %5, DESCALE_P %+ %9
+
+ vpackssdw %2, %8, %5 ; %2=data3_1
+%endmacro
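
The "(This implementation)" comments in dodct describe an exact integer refactoring: distributing z1 over both outputs turns each output into a two-term dot product, which vpmaddwd evaluates in one instruction against paired constants such as PW_F130_F054_MF130_F054 below. A C check of the data2/data6 case with the CONST_BITS == 13 values (a sketch, not library code):

#include <stdio.h>
#include <stdint.h>

#define F_0_541  4433
#define F_0_765  6270
#define F_1_847  15137

/* pmaddwd on one word pair: multiply adjacent 16-bit words, sum to 32 bits */
static int32_t pmaddwd(int16_t a0, int16_t a1, int16_t b0, int16_t b1)
{
  return (int32_t)a0 * b0 + (int32_t)a1 * b1;
}

int main(void)
{
  int16_t tmp12 = -311, tmp13 = 427;    /* arbitrary intermediates */

  /* Original formulation */
  int32_t z1    = (tmp12 + tmp13) * F_0_541;
  int32_t data2 = z1 + tmp13 * F_0_765;
  int32_t data6 = z1 + tmp12 * -F_1_847;

  /* pmaddwd formulation with the paired constants */
  int32_t data2p = pmaddwd(tmp13, tmp12, F_0_541 + F_0_765, F_0_541);
  int32_t data6p = pmaddwd(tmp13, tmp12, F_0_541, F_0_541 - F_1_847);

  printf("%d==%d  %d==%d\n", data2, data2p, data6, data6p);
  return 0;
}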
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_fdct_islow_avx2)
+
+EXTN(jconst_fdct_islow_avx2):
+
+PW_F130_F054_MF130_F054 times 4 dw (F_0_541 + F_0_765), F_0_541
+ times 4 dw (F_0_541 - F_1_847), F_0_541
+PW_MF078_F117_F078_F117 times 4 dw (F_1_175 - F_1_961), F_1_175
+ times 4 dw (F_1_175 - F_0_390), F_1_175
+PW_MF060_MF089_MF050_MF256 times 4 dw (F_0_298 - F_0_899), -F_0_899
+ times 4 dw (F_2_053 - F_2_562), -F_2_562
+PW_F050_MF256_F060_MF089 times 4 dw (F_3_072 - F_2_562), -F_2_562
+ times 4 dw (F_1_501 - F_0_899), -F_0_899
+PD_DESCALE_P1 times 8 dd 1 << (DESCALE_P1 - 1)
+PD_DESCALE_P2 times 8 dd 1 << (DESCALE_P2 - 1)
+PW_DESCALE_P2X times 16 dw 1 << (PASS1_BITS - 1)
+PW_1_NEG1 times 8 dw 1
+ times 8 dw -1
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_islow_avx2(DCTELEM *data)
+;
+
+%define data(b) (b) + 8 ; DCTELEM *data
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_fdct_islow_avx2)
+
+EXTN(jsimd_fdct_islow_avx2):
+ push ebp
+ mov ebp, esp
+ pushpic ebx
+; push ecx ; unused
+; push edx ; need not be preserved
+; push esi ; unused
+; push edi ; unused
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process rows.
+
+ mov edx, POINTER [data(ebp)] ; (DCTELEM *)
+
+ vmovdqu ymm4, YMMWORD [YMMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+ vmovdqu ymm5, YMMWORD [YMMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+ vmovdqu ymm6, YMMWORD [YMMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
+ vmovdqu ymm7, YMMWORD [YMMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
+ ; ymm4=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+ ; ymm5=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+ ; ymm6=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
+ ; ymm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
+
+ vperm2i128 ymm0, ymm4, ymm6, 0x20
+ vperm2i128 ymm1, ymm4, ymm6, 0x31
+ vperm2i128 ymm2, ymm5, ymm7, 0x20
+ vperm2i128 ymm3, ymm5, ymm7, 0x31
+ ; ymm0=(00 01 02 03 04 05 06 07 40 41 42 43 44 45 46 47)
+ ; ymm1=(10 11 12 13 14 15 16 17 50 51 52 53 54 55 56 57)
+ ; ymm2=(20 21 22 23 24 25 26 27 60 61 62 63 64 65 66 67)
+ ; ymm3=(30 31 32 33 34 35 36 37 70 71 72 73 74 75 76 77)
+
+ dotranspose ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7
+
+ dodct ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, 1
+ ; ymm0=data0_4, ymm1=data3_1, ymm2=data2_6, ymm3=data7_5
+
+ ; ---- Pass 2: process columns.
+
+ vperm2i128 ymm4, ymm1, ymm3, 0x20 ; ymm4=data3_7
+ vperm2i128 ymm1, ymm1, ymm3, 0x31 ; ymm1=data1_5
+
+ dotranspose ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7
+
+ dodct ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7, 2
+ ; ymm0=data0_4, ymm1=data3_1, ymm2=data2_6, ymm4=data7_5
+
+ vperm2i128 ymm3, ymm0, ymm1, 0x30 ; ymm3=data0_1
+ vperm2i128 ymm5, ymm2, ymm1, 0x20 ; ymm5=data2_3
+ vperm2i128 ymm6, ymm0, ymm4, 0x31 ; ymm6=data4_5
+ vperm2i128 ymm7, ymm2, ymm4, 0x21 ; ymm7=data6_7
+
+ vmovdqu YMMWORD [YMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], ymm3
+ vmovdqu YMMWORD [YMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], ymm5
+ vmovdqu YMMWORD [YMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], ymm6
+ vmovdqu YMMWORD [YMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], ymm7
+
+ vzeroupper
+; pop edi ; unused
+; pop esi ; unused
+; pop edx ; need not be preserved
+; pop ecx ; unused
+ poppic ebx
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
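
One AVX2-specific trick in dodct deserves a note: with tmp10 in the low 128-bit lane and tmp11 in the high lane, a lane swap (vperm2i128 with immediate 0x01) plus a sign flip of the high lane (vpsignw against PW_1_NEG1) lets a single vpaddw produce both tmp10+tmp11 and tmp10-tmp11 at once. A scalar C model of those three instructions (a sketch):

#include <stdio.h>
#include <stdint.h>

enum { LANE = 8 };   /* one 128-bit lane = eight 16-bit words */

int main(void)
{
  /* reg = tmp10_11: low lane holds tmp10, high lane holds tmp11 */
  int16_t reg[2 * LANE], swapped[2 * LANE], flipped[2 * LANE], out[2 * LANE];
  for (int i = 0; i < LANE; i++) {
    reg[i] = (int16_t)(100 + i);
    reg[LANE + i] = (int16_t)(7 * i);
  }

  for (int i = 0; i < LANE; i++) {       /* vperm2i128 ..., 0x01 */
    swapped[i] = reg[LANE + i];          /* -> tmp11_10 */
    swapped[LANE + i] = reg[i];
  }
  for (int i = 0; i < 2 * LANE; i++)     /* vpsignw with PW_1_NEG1 */
    flipped[i] = (i < LANE) ? reg[i] : (int16_t)-reg[i];  /* tmp10_neg11 */

  for (int i = 0; i < 2 * LANE; i++)     /* one vpaddw */
    out[i] = (int16_t)(swapped[i] + flipped[i]);
  /* low lane: tmp11 + tmp10; high lane: tmp10 - tmp11 */
  printf("%d %d\n", out[0], out[LANE]);
  return 0;
}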
diff --git a/media/libjpeg/simd/i386/jfdctint-mmx.asm b/media/libjpeg/simd/i386/jfdctint-mmx.asm
new file mode 100644
index 0000000000..34a43b9e5e
--- /dev/null
+++ b/media/libjpeg/simd/i386/jfdctint-mmx.asm
@@ -0,0 +1,620 @@
+;
+; jfdctint.asm - accurate integer FDCT (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, 2020, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slower but more accurate integer implementation of the
+; forward DCT (Discrete Cosine Transform). The following code is based
+; directly on IJG's original jfdctint.c; see jfdctint.c for
+; more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 13
+%define PASS1_BITS 2
+
+%define DESCALE_P1 (CONST_BITS - PASS1_BITS)
+%define DESCALE_P2 (CONST_BITS + PASS1_BITS)
+
+%if CONST_BITS == 13
+F_0_298 equ 2446 ; FIX(0.298631336)
+F_0_390 equ 3196 ; FIX(0.390180644)
+F_0_541 equ 4433 ; FIX(0.541196100)
+F_0_765 equ 6270 ; FIX(0.765366865)
+F_0_899 equ 7373 ; FIX(0.899976223)
+F_1_175 equ 9633 ; FIX(1.175875602)
+F_1_501 equ 12299 ; FIX(1.501321110)
+F_1_847 equ 15137 ; FIX(1.847759065)
+F_1_961 equ 16069 ; FIX(1.961570560)
+F_2_053 equ 16819 ; FIX(2.053119869)
+F_2_562 equ 20995 ; FIX(2.562915447)
+F_3_072 equ 25172 ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
+F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS) ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS) ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS) ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS) ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS) ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS) ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_fdct_islow_mmx)
+
+EXTN(jconst_fdct_islow_mmx):
+
+PW_F130_F054 times 2 dw (F_0_541 + F_0_765), F_0_541
+PW_F054_MF130 times 2 dw F_0_541, (F_0_541 - F_1_847)
+PW_MF078_F117 times 2 dw (F_1_175 - F_1_961), F_1_175
+PW_F117_F078 times 2 dw F_1_175, (F_1_175 - F_0_390)
+PW_MF060_MF089 times 2 dw (F_0_298 - F_0_899), -F_0_899
+PW_MF089_F060 times 2 dw -F_0_899, (F_1_501 - F_0_899)
+PW_MF050_MF256 times 2 dw (F_2_053 - F_2_562), -F_2_562
+PW_MF256_F050 times 2 dw -F_2_562, (F_3_072 - F_2_562)
+PD_DESCALE_P1 times 2 dd 1 << (DESCALE_P1 - 1)
+PD_DESCALE_P2 times 2 dd 1 << (DESCALE_P2 - 1)
+PW_DESCALE_P2X times 4 dw 1 << (PASS1_BITS - 1)
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_islow_mmx(DCTELEM *data)
+;
+
+%define data(b) (b) + 8 ; DCTELEM *data
+
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD ; mmword wk[WK_NUM]
+%define WK_NUM 2
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_fdct_islow_mmx)
+
+EXTN(jsimd_fdct_islow_mmx):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)]
+ pushpic ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+; push esi ; unused
+; push edi ; unused
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process rows.
+
+ mov edx, POINTER [data(eax)] ; (DCTELEM *)
+ mov ecx, DCTSIZE/4
+ alignx 16, 7
+.rowloop:
+
+ movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+ movq mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+ movq mm2, MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)]
+ movq mm3, MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)]
+
+ ; mm0=(20 21 22 23), mm2=(24 25 26 27)
+ ; mm1=(30 31 32 33), mm3=(34 35 36 37)
+
+ movq mm4, mm0 ; transpose coefficients(phase 1)
+ punpcklwd mm0, mm1 ; mm0=(20 30 21 31)
+ punpckhwd mm4, mm1 ; mm4=(22 32 23 33)
+ movq mm5, mm2 ; transpose coefficients(phase 1)
+ punpcklwd mm2, mm3 ; mm2=(24 34 25 35)
+ punpckhwd mm5, mm3 ; mm5=(26 36 27 37)
+
+ movq mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+ movq mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+ movq mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)]
+ movq mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)]
+
+ ; mm6=(00 01 02 03), mm1=(04 05 06 07)
+ ; mm7=(10 11 12 13), mm3=(14 15 16 17)
+
+ movq MMWORD [wk(0)], mm4 ; wk(0)=(22 32 23 33)
+ movq MMWORD [wk(1)], mm2 ; wk(1)=(24 34 25 35)
+
+ movq mm4, mm6 ; transpose coefficients(phase 1)
+ punpcklwd mm6, mm7 ; mm6=(00 10 01 11)
+ punpckhwd mm4, mm7 ; mm4=(02 12 03 13)
+ movq mm2, mm1 ; transpose coefficients(phase 1)
+ punpcklwd mm1, mm3 ; mm1=(04 14 05 15)
+ punpckhwd mm2, mm3 ; mm2=(06 16 07 17)
+
+ movq mm7, mm6 ; transpose coefficients(phase 2)
+ punpckldq mm6, mm0 ; mm6=(00 10 20 30)=data0
+ punpckhdq mm7, mm0 ; mm7=(01 11 21 31)=data1
+ movq mm3, mm2 ; transpose coefficients(phase 2)
+ punpckldq mm2, mm5 ; mm2=(06 16 26 36)=data6
+ punpckhdq mm3, mm5 ; mm3=(07 17 27 37)=data7
+
+ movq mm0, mm7
+ movq mm5, mm6
+ psubw mm7, mm2 ; mm7=data1-data6=tmp6
+ psubw mm6, mm3 ; mm6=data0-data7=tmp7
+ paddw mm0, mm2 ; mm0=data1+data6=tmp1
+ paddw mm5, mm3 ; mm5=data0+data7=tmp0
+
+ movq mm2, MMWORD [wk(0)] ; mm2=(22 32 23 33)
+ movq mm3, MMWORD [wk(1)] ; mm3=(24 34 25 35)
+ movq MMWORD [wk(0)], mm7 ; wk(0)=tmp6
+ movq MMWORD [wk(1)], mm6 ; wk(1)=tmp7
+
+ movq mm7, mm4 ; transpose coefficients(phase 2)
+ punpckldq mm4, mm2 ; mm4=(02 12 22 32)=data2
+ punpckhdq mm7, mm2 ; mm7=(03 13 23 33)=data3
+ movq mm6, mm1 ; transpose coefficients(phase 2)
+ punpckldq mm1, mm3 ; mm1=(04 14 24 34)=data4
+ punpckhdq mm6, mm3 ; mm6=(05 15 25 35)=data5
+
+ movq mm2, mm7
+ movq mm3, mm4
+ paddw mm7, mm1 ; mm7=data3+data4=tmp3
+ paddw mm4, mm6 ; mm4=data2+data5=tmp2
+ psubw mm2, mm1 ; mm2=data3-data4=tmp4
+ psubw mm3, mm6 ; mm3=data2-data5=tmp5
+
+ ; -- Even part
+
+ movq mm1, mm5
+ movq mm6, mm0
+ paddw mm5, mm7 ; mm5=tmp10
+ paddw mm0, mm4 ; mm0=tmp11
+ psubw mm1, mm7 ; mm1=tmp13
+ psubw mm6, mm4 ; mm6=tmp12
+
+ movq mm7, mm5
+ paddw mm5, mm0 ; mm5=tmp10+tmp11
+ psubw mm7, mm0 ; mm7=tmp10-tmp11
+
+ psllw mm5, PASS1_BITS ; mm5=data0
+ psllw mm7, PASS1_BITS ; mm7=data4
+
+ movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5
+ movq MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)], mm7
+
+ ; (Original)
+ ; z1 = (tmp12 + tmp13) * 0.541196100;
+ ; data2 = z1 + tmp13 * 0.765366865;
+ ; data6 = z1 + tmp12 * -1.847759065;
+ ;
+ ; (This implementation)
+ ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+ ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+ movq mm4, mm1 ; mm1=tmp13
+ movq mm0, mm1
+ punpcklwd mm4, mm6 ; mm6=tmp12
+ punpckhwd mm0, mm6
+ movq mm1, mm4
+ movq mm6, mm0
+ pmaddwd mm4, [GOTOFF(ebx,PW_F130_F054)] ; mm4=data2L
+ pmaddwd mm0, [GOTOFF(ebx,PW_F130_F054)] ; mm0=data2H
+ pmaddwd mm1, [GOTOFF(ebx,PW_F054_MF130)] ; mm1=data6L
+ pmaddwd mm6, [GOTOFF(ebx,PW_F054_MF130)] ; mm6=data6H
+
+ paddd mm4, [GOTOFF(ebx,PD_DESCALE_P1)]
+ paddd mm0, [GOTOFF(ebx,PD_DESCALE_P1)]
+ psrad mm4, DESCALE_P1
+ psrad mm0, DESCALE_P1
+ paddd mm1, [GOTOFF(ebx,PD_DESCALE_P1)]
+ paddd mm6, [GOTOFF(ebx,PD_DESCALE_P1)]
+ psrad mm1, DESCALE_P1
+ psrad mm6, DESCALE_P1
+
+ packssdw mm4, mm0 ; mm4=data2
+ packssdw mm1, mm6 ; mm1=data6
+
+ movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
+ movq MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)], mm1
+
+ ; -- Odd part
+
+ movq mm5, MMWORD [wk(0)] ; mm5=tmp6
+ movq mm7, MMWORD [wk(1)] ; mm7=tmp7
+
+ movq mm0, mm2 ; mm2=tmp4
+ movq mm6, mm3 ; mm3=tmp5
+ paddw mm0, mm5 ; mm0=z3
+ paddw mm6, mm7 ; mm6=z4
+
+ ; (Original)
+ ; z5 = (z3 + z4) * 1.175875602;
+ ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ ; z3 += z5; z4 += z5;
+ ;
+ ; (This implementation)
+ ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+ movq mm4, mm0
+ movq mm1, mm0
+ punpcklwd mm4, mm6
+ punpckhwd mm1, mm6
+ movq mm0, mm4
+ movq mm6, mm1
+ pmaddwd mm4, [GOTOFF(ebx,PW_MF078_F117)] ; mm4=z3L
+ pmaddwd mm1, [GOTOFF(ebx,PW_MF078_F117)] ; mm1=z3H
+ pmaddwd mm0, [GOTOFF(ebx,PW_F117_F078)] ; mm0=z4L
+ pmaddwd mm6, [GOTOFF(ebx,PW_F117_F078)] ; mm6=z4H
+
+ movq MMWORD [wk(0)], mm4 ; wk(0)=z3L
+ movq MMWORD [wk(1)], mm1 ; wk(1)=z3H
+
+ ; (Original)
+ ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6;
+ ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869;
+ ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110;
+ ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4;
+ ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4;
+ ;
+ ; (This implementation)
+ ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+ ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+ ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+ ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+ ; data7 = tmp4 + z3; data5 = tmp5 + z4;
+ ; data3 = tmp6 + z3; data1 = tmp7 + z4;
+
+ movq mm4, mm2
+ movq mm1, mm2
+ punpcklwd mm4, mm7
+ punpckhwd mm1, mm7
+ movq mm2, mm4
+ movq mm7, mm1
+ pmaddwd mm4, [GOTOFF(ebx,PW_MF060_MF089)] ; mm4=tmp4L
+ pmaddwd mm1, [GOTOFF(ebx,PW_MF060_MF089)] ; mm1=tmp4H
+ pmaddwd mm2, [GOTOFF(ebx,PW_MF089_F060)] ; mm2=tmp7L
+ pmaddwd mm7, [GOTOFF(ebx,PW_MF089_F060)] ; mm7=tmp7H
+
+ paddd mm4, MMWORD [wk(0)] ; mm4=data7L
+ paddd mm1, MMWORD [wk(1)] ; mm1=data7H
+ paddd mm2, mm0 ; mm2=data1L
+ paddd mm7, mm6 ; mm7=data1H
+
+ paddd mm4, [GOTOFF(ebx,PD_DESCALE_P1)]
+ paddd mm1, [GOTOFF(ebx,PD_DESCALE_P1)]
+ psrad mm4, DESCALE_P1
+ psrad mm1, DESCALE_P1
+ paddd mm2, [GOTOFF(ebx,PD_DESCALE_P1)]
+ paddd mm7, [GOTOFF(ebx,PD_DESCALE_P1)]
+ psrad mm2, DESCALE_P1
+ psrad mm7, DESCALE_P1
+
+ packssdw mm4, mm1 ; mm4=data7
+ packssdw mm2, mm7 ; mm2=data1
+
+ movq MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)], mm4
+ movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2
+
+ movq mm1, mm3
+ movq mm7, mm3
+ punpcklwd mm1, mm5
+ punpckhwd mm7, mm5
+ movq mm3, mm1
+ movq mm5, mm7
+ pmaddwd mm1, [GOTOFF(ebx,PW_MF050_MF256)] ; mm1=tmp5L
+ pmaddwd mm7, [GOTOFF(ebx,PW_MF050_MF256)] ; mm7=tmp5H
+ pmaddwd mm3, [GOTOFF(ebx,PW_MF256_F050)] ; mm3=tmp6L
+ pmaddwd mm5, [GOTOFF(ebx,PW_MF256_F050)] ; mm5=tmp6H
+
+ paddd mm1, mm0 ; mm1=data5L
+ paddd mm7, mm6 ; mm7=data5H
+ paddd mm3, MMWORD [wk(0)] ; mm3=data3L
+ paddd mm5, MMWORD [wk(1)] ; mm5=data3H
+
+ paddd mm1, [GOTOFF(ebx,PD_DESCALE_P1)]
+ paddd mm7, [GOTOFF(ebx,PD_DESCALE_P1)]
+ psrad mm1, DESCALE_P1
+ psrad mm7, DESCALE_P1
+ paddd mm3, [GOTOFF(ebx,PD_DESCALE_P1)]
+ paddd mm5, [GOTOFF(ebx,PD_DESCALE_P1)]
+ psrad mm3, DESCALE_P1
+ psrad mm5, DESCALE_P1
+
+ packssdw mm1, mm7 ; mm1=data5
+ packssdw mm3, mm5 ; mm3=data3
+
+ movq MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)], mm1
+ movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3
+
+ add edx, byte 4*DCTSIZE*SIZEOF_DCTELEM
+ dec ecx
+ jnz near .rowloop
+
+ ; ---- Pass 2: process columns.
+
+ mov edx, POINTER [data(eax)] ; (DCTELEM *)
+ mov ecx, DCTSIZE/4
+ alignx 16, 7
+.columnloop:
+
+ movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+ movq mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+ movq mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
+ movq mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
+
+ ; mm0=(02 12 22 32), mm2=(42 52 62 72)
+ ; mm1=(03 13 23 33), mm3=(43 53 63 73)
+
+ movq mm4, mm0 ; transpose coefficients(phase 1)
+ punpcklwd mm0, mm1 ; mm0=(02 03 12 13)
+ punpckhwd mm4, mm1 ; mm4=(22 23 32 33)
+ movq mm5, mm2 ; transpose coefficients(phase 1)
+ punpcklwd mm2, mm3 ; mm2=(42 43 52 53)
+ punpckhwd mm5, mm3 ; mm5=(62 63 72 73)
+
+ movq mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+ movq mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+ movq mm1, MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
+ movq mm3, MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
+
+ ; mm6=(00 10 20 30), mm1=(40 50 60 70)
+ ; mm7=(01 11 21 31), mm3=(41 51 61 71)
+
+ movq MMWORD [wk(0)], mm4 ; wk(0)=(22 23 32 33)
+ movq MMWORD [wk(1)], mm2 ; wk(1)=(42 43 52 53)
+
+ movq mm4, mm6 ; transpose coefficients(phase 1)
+ punpcklwd mm6, mm7 ; mm6=(00 01 10 11)
+ punpckhwd mm4, mm7 ; mm4=(20 21 30 31)
+ movq mm2, mm1 ; transpose coefficients(phase 1)
+ punpcklwd mm1, mm3 ; mm1=(40 41 50 51)
+ punpckhwd mm2, mm3 ; mm2=(60 61 70 71)
+
+ movq mm7, mm6 ; transpose coefficients(phase 2)
+ punpckldq mm6, mm0 ; mm6=(00 01 02 03)=data0
+ punpckhdq mm7, mm0 ; mm7=(10 11 12 13)=data1
+ movq mm3, mm2 ; transpose coefficients(phase 2)
+ punpckldq mm2, mm5 ; mm2=(60 61 62 63)=data6
+ punpckhdq mm3, mm5 ; mm3=(70 71 72 73)=data7
+
+ movq mm0, mm7
+ movq mm5, mm6
+ psubw mm7, mm2 ; mm7=data1-data6=tmp6
+ psubw mm6, mm3 ; mm6=data0-data7=tmp7
+ paddw mm0, mm2 ; mm0=data1+data6=tmp1
+ paddw mm5, mm3 ; mm5=data0+data7=tmp0
+
+ movq mm2, MMWORD [wk(0)] ; mm2=(22 23 32 33)
+ movq mm3, MMWORD [wk(1)] ; mm3=(42 43 52 53)
+ movq MMWORD [wk(0)], mm7 ; wk(0)=tmp6
+ movq MMWORD [wk(1)], mm6 ; wk(1)=tmp7
+
+ movq mm7, mm4 ; transpose coefficients(phase 2)
+ punpckldq mm4, mm2 ; mm4=(20 21 22 23)=data2
+ punpckhdq mm7, mm2 ; mm7=(30 31 32 33)=data3
+ movq mm6, mm1 ; transpose coefficients(phase 2)
+ punpckldq mm1, mm3 ; mm1=(40 41 42 43)=data4
+ punpckhdq mm6, mm3 ; mm6=(50 51 52 53)=data5
+
+ movq mm2, mm7
+ movq mm3, mm4
+ paddw mm7, mm1 ; mm7=data3+data4=tmp3
+ paddw mm4, mm6 ; mm4=data2+data5=tmp2
+ psubw mm2, mm1 ; mm2=data3-data4=tmp4
+ psubw mm3, mm6 ; mm3=data2-data5=tmp5
+
+ ; -- Even part
+
+ movq mm1, mm5
+ movq mm6, mm0
+ paddw mm5, mm7 ; mm5=tmp10
+ paddw mm0, mm4 ; mm0=tmp11
+ psubw mm1, mm7 ; mm1=tmp13
+ psubw mm6, mm4 ; mm6=tmp12
+
+ movq mm7, mm5
+ paddw mm5, mm0 ; mm5=tmp10+tmp11
+ psubw mm7, mm0 ; mm7=tmp10-tmp11
+
+ paddw mm5, [GOTOFF(ebx,PW_DESCALE_P2X)]
+ paddw mm7, [GOTOFF(ebx,PW_DESCALE_P2X)]
+ psraw mm5, PASS1_BITS ; mm5=data0
+ psraw mm7, PASS1_BITS ; mm7=data4
+
+ movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5
+ movq MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)], mm7
+
+ ; (Original)
+ ; z1 = (tmp12 + tmp13) * 0.541196100;
+ ; data2 = z1 + tmp13 * 0.765366865;
+ ; data6 = z1 + tmp12 * -1.847759065;
+ ;
+ ; (This implementation)
+ ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+ ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+ movq mm4, mm1 ; mm1=tmp13
+ movq mm0, mm1
+ punpcklwd mm4, mm6 ; mm6=tmp12
+ punpckhwd mm0, mm6
+ movq mm1, mm4
+ movq mm6, mm0
+ pmaddwd mm4, [GOTOFF(ebx,PW_F130_F054)] ; mm4=data2L
+ pmaddwd mm0, [GOTOFF(ebx,PW_F130_F054)] ; mm0=data2H
+ pmaddwd mm1, [GOTOFF(ebx,PW_F054_MF130)] ; mm1=data6L
+ pmaddwd mm6, [GOTOFF(ebx,PW_F054_MF130)] ; mm6=data6H
+
+ paddd mm4, [GOTOFF(ebx,PD_DESCALE_P2)]
+ paddd mm0, [GOTOFF(ebx,PD_DESCALE_P2)]
+ psrad mm4, DESCALE_P2
+ psrad mm0, DESCALE_P2
+ paddd mm1, [GOTOFF(ebx,PD_DESCALE_P2)]
+ paddd mm6, [GOTOFF(ebx,PD_DESCALE_P2)]
+ psrad mm1, DESCALE_P2
+ psrad mm6, DESCALE_P2
+
+ packssdw mm4, mm0 ; mm4=data2
+ packssdw mm1, mm6 ; mm1=data6
+
+ movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
+ movq MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)], mm1
+
+ ; -- Odd part
+
+ movq mm5, MMWORD [wk(0)] ; mm5=tmp6
+ movq mm7, MMWORD [wk(1)] ; mm7=tmp7
+
+ movq mm0, mm2 ; mm2=tmp4
+ movq mm6, mm3 ; mm3=tmp5
+ paddw mm0, mm5 ; mm0=z3
+ paddw mm6, mm7 ; mm6=z4
+
+ ; (Original)
+ ; z5 = (z3 + z4) * 1.175875602;
+ ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ ; z3 += z5; z4 += z5;
+ ;
+ ; (This implementation)
+ ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+ movq mm4, mm0
+ movq mm1, mm0
+ punpcklwd mm4, mm6
+ punpckhwd mm1, mm6
+ movq mm0, mm4
+ movq mm6, mm1
+ pmaddwd mm4, [GOTOFF(ebx,PW_MF078_F117)] ; mm4=z3L
+ pmaddwd mm1, [GOTOFF(ebx,PW_MF078_F117)] ; mm1=z3H
+ pmaddwd mm0, [GOTOFF(ebx,PW_F117_F078)] ; mm0=z4L
+ pmaddwd mm6, [GOTOFF(ebx,PW_F117_F078)] ; mm6=z4H
+
+ movq MMWORD [wk(0)], mm4 ; wk(0)=z3L
+ movq MMWORD [wk(1)], mm1 ; wk(1)=z3H
+
+ ; (Original)
+ ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6;
+ ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869;
+ ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110;
+ ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4;
+ ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4;
+ ;
+ ; (This implementation)
+ ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+ ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+ ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+ ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+ ; data7 = tmp4 + z3; data5 = tmp5 + z4;
+ ; data3 = tmp6 + z3; data1 = tmp7 + z4;
+
+ movq mm4, mm2
+ movq mm1, mm2
+ punpcklwd mm4, mm7
+ punpckhwd mm1, mm7
+ movq mm2, mm4
+ movq mm7, mm1
+ pmaddwd mm4, [GOTOFF(ebx,PW_MF060_MF089)] ; mm4=tmp4L
+ pmaddwd mm1, [GOTOFF(ebx,PW_MF060_MF089)] ; mm1=tmp4H
+ pmaddwd mm2, [GOTOFF(ebx,PW_MF089_F060)] ; mm2=tmp7L
+ pmaddwd mm7, [GOTOFF(ebx,PW_MF089_F060)] ; mm7=tmp7H
+
+ paddd mm4, MMWORD [wk(0)] ; mm4=data7L
+ paddd mm1, MMWORD [wk(1)] ; mm1=data7H
+ paddd mm2, mm0 ; mm2=data1L
+ paddd mm7, mm6 ; mm7=data1H
+
+ paddd mm4, [GOTOFF(ebx,PD_DESCALE_P2)]
+ paddd mm1, [GOTOFF(ebx,PD_DESCALE_P2)]
+ psrad mm4, DESCALE_P2
+ psrad mm1, DESCALE_P2
+ paddd mm2, [GOTOFF(ebx,PD_DESCALE_P2)]
+ paddd mm7, [GOTOFF(ebx,PD_DESCALE_P2)]
+ psrad mm2, DESCALE_P2
+ psrad mm7, DESCALE_P2
+
+ packssdw mm4, mm1 ; mm4=data7
+ packssdw mm2, mm7 ; mm2=data1
+
+ movq MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)], mm4
+ movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2
+
+ movq mm1, mm3
+ movq mm7, mm3
+ punpcklwd mm1, mm5
+ punpckhwd mm7, mm5
+ movq mm3, mm1
+ movq mm5, mm7
+ pmaddwd mm1, [GOTOFF(ebx,PW_MF050_MF256)] ; mm1=tmp5L
+ pmaddwd mm7, [GOTOFF(ebx,PW_MF050_MF256)] ; mm7=tmp5H
+ pmaddwd mm3, [GOTOFF(ebx,PW_MF256_F050)] ; mm3=tmp6L
+ pmaddwd mm5, [GOTOFF(ebx,PW_MF256_F050)] ; mm5=tmp6H
+
+ paddd mm1, mm0 ; mm1=data5L
+ paddd mm7, mm6 ; mm7=data5H
+ paddd mm3, MMWORD [wk(0)] ; mm3=data3L
+ paddd mm5, MMWORD [wk(1)] ; mm5=data3H
+
+ paddd mm1, [GOTOFF(ebx,PD_DESCALE_P2)]
+ paddd mm7, [GOTOFF(ebx,PD_DESCALE_P2)]
+ psrad mm1, DESCALE_P2
+ psrad mm7, DESCALE_P2
+ paddd mm3, [GOTOFF(ebx,PD_DESCALE_P2)]
+ paddd mm5, [GOTOFF(ebx,PD_DESCALE_P2)]
+ psrad mm3, DESCALE_P2
+ psrad mm5, DESCALE_P2
+
+ packssdw mm1, mm7 ; mm1=data5
+ packssdw mm3, mm5 ; mm3=data3
+
+ movq MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)], mm1
+ movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3
+
+ add edx, byte 4*SIZEOF_DCTELEM
+ dec ecx
+ jnz near .columnloop
+
+ emms ; empty MMX state
+
+; pop edi ; unused
+; pop esi ; unused
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ poppic ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
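
As in jfdctint.c, the two passes carry different scale factors on purpose: pass 1 keeps PASS1_BITS extra fraction bits in each 16-bit word (psllw on the multiply-free data0/data4 paths, a descale by CONST_BITS - PASS1_BITS after pmaddwd), and pass 2 removes them again with the rounded shifts against PW_DESCALE_P2X / PD_DESCALE_P2, leaving the usual 8x-scaled DCT output. A C sketch of that bookkeeping on the data0 path (illustrative values, not library code):

#include <stdio.h>
#include <stdint.h>

#define PASS1_BITS  2

/* Rounded right shift, as PD_DESCALE_P* / PW_DESCALE_P2X implement it */
#define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))

int main(void)
{
  int16_t tmp10 = 903, tmp11 = -488;          /* per-pass intermediates */

  /* Pass 1: exact, just keep PASS1_BITS extra fraction bits (psllw) */
  int16_t d0_pass1 = (int16_t)((tmp10 + tmp11) << PASS1_BITS);

  /* Pass 2: rounded shift drops them (paddw PW_DESCALE_P2X; psraw) */
  int16_t d0_pass2 = (int16_t)DESCALE(d0_pass1, PASS1_BITS);

  printf("%d -> %d\n", d0_pass1, d0_pass2);   /* 1660 -> 415 */
  return 0;
}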
diff --git a/media/libjpeg/simd/i386/jfdctint-sse2.asm b/media/libjpeg/simd/i386/jfdctint-sse2.asm
new file mode 100644
index 0000000000..6f8e18cb9d
--- /dev/null
+++ b/media/libjpeg/simd/i386/jfdctint-sse2.asm
@@ -0,0 +1,633 @@
+;
+; jfdctint.asm - accurate integer FDCT (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, 2020, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slower but more accurate integer implementation of the
+; forward DCT (Discrete Cosine Transform). The following code is based
+; directly on IJG's original jfdctint.c; see jfdctint.c for
+; more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 13
+%define PASS1_BITS 2
+
+%define DESCALE_P1 (CONST_BITS - PASS1_BITS)
+%define DESCALE_P2 (CONST_BITS + PASS1_BITS)
+
+%if CONST_BITS == 13
+F_0_298 equ 2446 ; FIX(0.298631336)
+F_0_390 equ 3196 ; FIX(0.390180644)
+F_0_541 equ 4433 ; FIX(0.541196100)
+F_0_765 equ 6270 ; FIX(0.765366865)
+F_0_899 equ 7373 ; FIX(0.899976223)
+F_1_175 equ 9633 ; FIX(1.175875602)
+F_1_501 equ 12299 ; FIX(1.501321110)
+F_1_847 equ 15137 ; FIX(1.847759065)
+F_1_961 equ 16069 ; FIX(1.961570560)
+F_2_053 equ 16819 ; FIX(2.053119869)
+F_2_562 equ 20995 ; FIX(2.562915447)
+F_3_072 equ 25172 ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
+F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS) ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS) ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS) ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS) ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS) ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS) ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026)
+%endif
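+
+; Worked example: with CONST_BITS = 13 the two branches above agree, e.g.
+;   FIX(0.298631336) = 0.298631336 * 2^13 = 2446.35...        ->  2446
+;   DESCALE(320652955, 30 - 13) = (320652955 + 2^16) >> 17    ->  2446
+; where 320652955 is 0.298631336 * 2^30 rounded to the nearest integer.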
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_fdct_islow_sse2)
+
+EXTN(jconst_fdct_islow_sse2):
+
+PW_F130_F054 times 4 dw (F_0_541 + F_0_765), F_0_541
+PW_F054_MF130 times 4 dw F_0_541, (F_0_541 - F_1_847)
+PW_MF078_F117 times 4 dw (F_1_175 - F_1_961), F_1_175
+PW_F117_F078 times 4 dw F_1_175, (F_1_175 - F_0_390)
+PW_MF060_MF089 times 4 dw (F_0_298 - F_0_899), -F_0_899
+PW_MF089_F060 times 4 dw -F_0_899, (F_1_501 - F_0_899)
+PW_MF050_MF256 times 4 dw (F_2_053 - F_2_562), -F_2_562
+PW_MF256_F050 times 4 dw -F_2_562, (F_3_072 - F_2_562)
+PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1 - 1)
+PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2 - 1)
+PW_DESCALE_P2X times 8 dw 1 << (PASS1_BITS - 1)
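+
+; PD_DESCALE_P1/P2 hold the 1 << (n-1) rounding bias of
+;   DESCALE(x, n) = (x + (1 << (n-1))) >> n
+; so that the psrad shifts below round to nearest instead of truncating
+; toward minus infinity; PW_DESCALE_P2X plays the same role for the
+; word-wide psraw shifts in pass 2.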
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_islow_sse2(DCTELEM *data)
+;
+
+%define data(b) (b) + 8 ; DCTELEM *data
+
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+ ; xmmword wk[WK_NUM]
+%define WK_NUM 6
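+
+; (The prologue below saves the pre-alignment stack pointer at
+; original_ebp, aligns the frame to a 16-byte boundary, and reserves
+; WK_NUM xmmword slots beneath it, so that the wk(i) scratch slots can
+; be accessed with aligned movdqa loads and stores.)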
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_fdct_islow_sse2)
+
+EXTN(jsimd_fdct_islow_sse2):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)]
+ pushpic ebx
+; push ecx ; unused
+; push edx ; need not be preserved
+; push esi ; unused
+; push edi ; unused
+
+ get_GOT ebx ; get GOT address
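+    ; (with the jsimdext.inc macros, GOTOFF(ebx,sym) resolves the
+    ; constant block relative to the GOT base loaded here when
+    ; assembling PIC code; in non-PIC builds get_GOT/pushpic/poppic
+    ; assemble to nothing and GOTOFF is a plain absolute reference)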
+
+ ; ---- Pass 1: process rows.
+
+ mov edx, POINTER [data(eax)] ; (DCTELEM *)
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+ movdqa xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+
+ ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
+ ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
+
+ movdqa xmm4, xmm0 ; transpose coefficients(phase 1)
+ punpcklwd xmm0, xmm1 ; xmm0=(00 10 01 11 02 12 03 13)
+ punpckhwd xmm4, xmm1 ; xmm4=(04 14 05 15 06 16 07 17)
+ movdqa xmm5, xmm2 ; transpose coefficients(phase 1)
+ punpcklwd xmm2, xmm3 ; xmm2=(20 30 21 31 22 32 23 33)
+ punpckhwd xmm5, xmm3 ; xmm5=(24 34 25 35 26 36 27 37)
+
+ movdqa xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
+ movdqa xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
+
+ ; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62)
+ ; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63)
+
+ movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33)
+ movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37)
+
+ movdqa xmm2, xmm6 ; transpose coefficients(phase 1)
+ punpcklwd xmm6, xmm7 ; xmm6=(40 50 41 51 42 52 43 53)
+ punpckhwd xmm2, xmm7 ; xmm2=(44 54 45 55 46 56 47 57)
+ movdqa xmm5, xmm1 ; transpose coefficients(phase 1)
+ punpcklwd xmm1, xmm3 ; xmm1=(60 70 61 71 62 72 63 73)
+ punpckhwd xmm5, xmm3 ; xmm5=(64 74 65 75 66 76 67 77)
+
+ movdqa xmm7, xmm6 ; transpose coefficients(phase 2)
+ punpckldq xmm6, xmm1 ; xmm6=(40 50 60 70 41 51 61 71)
+ punpckhdq xmm7, xmm1 ; xmm7=(42 52 62 72 43 53 63 73)
+ movdqa xmm3, xmm2 ; transpose coefficients(phase 2)
+ punpckldq xmm2, xmm5 ; xmm2=(44 54 64 74 45 55 65 75)
+ punpckhdq xmm3, xmm5 ; xmm3=(46 56 66 76 47 57 67 77)
+
+ movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33)
+ movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37)
+ movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=(42 52 62 72 43 53 63 73)
+ movdqa XMMWORD [wk(3)], xmm2 ; wk(3)=(44 54 64 74 45 55 65 75)
+
+ movdqa xmm7, xmm0 ; transpose coefficients(phase 2)
+ punpckldq xmm0, xmm1 ; xmm0=(00 10 20 30 01 11 21 31)
+ punpckhdq xmm7, xmm1 ; xmm7=(02 12 22 32 03 13 23 33)
+ movdqa xmm2, xmm4 ; transpose coefficients(phase 2)
+ punpckldq xmm4, xmm5 ; xmm4=(04 14 24 34 05 15 25 35)
+ punpckhdq xmm2, xmm5 ; xmm2=(06 16 26 36 07 17 27 37)
+
+ movdqa xmm1, xmm0 ; transpose coefficients(phase 3)
+ punpcklqdq xmm0, xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0
+ punpckhqdq xmm1, xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1
+ movdqa xmm5, xmm2 ; transpose coefficients(phase 3)
+ punpcklqdq xmm2, xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6
+ punpckhqdq xmm5, xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7
+
+ movdqa xmm6, xmm1
+ movdqa xmm3, xmm0
+ psubw xmm1, xmm2 ; xmm1=data1-data6=tmp6
+ psubw xmm0, xmm5 ; xmm0=data0-data7=tmp7
+ paddw xmm6, xmm2 ; xmm6=data1+data6=tmp1
+ paddw xmm3, xmm5 ; xmm3=data0+data7=tmp0
+
+ movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(42 52 62 72 43 53 63 73)
+ movdqa xmm5, XMMWORD [wk(3)] ; xmm5=(44 54 64 74 45 55 65 75)
+ movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6
+ movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7
+
+ movdqa xmm1, xmm7 ; transpose coefficients(phase 3)
+ punpcklqdq xmm7, xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2
+ punpckhqdq xmm1, xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3
+ movdqa xmm0, xmm4 ; transpose coefficients(phase 3)
+ punpcklqdq xmm4, xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4
+ punpckhqdq xmm0, xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5
+
+ movdqa xmm2, xmm1
+ movdqa xmm5, xmm7
+ paddw xmm1, xmm4 ; xmm1=data3+data4=tmp3
+ paddw xmm7, xmm0 ; xmm7=data2+data5=tmp2
+ psubw xmm2, xmm4 ; xmm2=data3-data4=tmp4
+ psubw xmm5, xmm0 ; xmm5=data2-data5=tmp5
+
+ ; -- Even part
+
+ movdqa xmm4, xmm3
+ movdqa xmm0, xmm6
+ paddw xmm3, xmm1 ; xmm3=tmp10
+ paddw xmm6, xmm7 ; xmm6=tmp11
+ psubw xmm4, xmm1 ; xmm4=tmp13
+ psubw xmm0, xmm7 ; xmm0=tmp12
+
+ movdqa xmm1, xmm3
+ paddw xmm3, xmm6 ; xmm3=tmp10+tmp11
+ psubw xmm1, xmm6 ; xmm1=tmp10-tmp11
+
+ psllw xmm3, PASS1_BITS ; xmm3=data0
+ psllw xmm1, PASS1_BITS ; xmm1=data4
+
+ movdqa XMMWORD [wk(2)], xmm3 ; wk(2)=data0
+ movdqa XMMWORD [wk(3)], xmm1 ; wk(3)=data4
+
+ ; (Original)
+ ; z1 = (tmp12 + tmp13) * 0.541196100;
+ ; data2 = z1 + tmp13 * 0.765366865;
+ ; data6 = z1 + tmp12 * -1.847759065;
+ ;
+ ; (This implementation)
+ ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+ ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
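+;
+; (Each pmaddwd below computes lo_word * lo_const + hi_word * hi_const
+; per dword lane; with tmp13 interleaved into the low words and tmp12
+; into the high words, the constant pair PW_F130_F054 =
+; ((F_0_541 + F_0_765), F_0_541) yields data2 in one multiply-add, and
+; PW_F054_MF130 likewise yields data6.)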
+
+ movdqa xmm7, xmm4 ; xmm4=tmp13
+ movdqa xmm6, xmm4
+ punpcklwd xmm7, xmm0 ; xmm0=tmp12
+ punpckhwd xmm6, xmm0
+ movdqa xmm4, xmm7
+ movdqa xmm0, xmm6
+ pmaddwd xmm7, [GOTOFF(ebx,PW_F130_F054)] ; xmm7=data2L
+ pmaddwd xmm6, [GOTOFF(ebx,PW_F130_F054)] ; xmm6=data2H
+ pmaddwd xmm4, [GOTOFF(ebx,PW_F054_MF130)] ; xmm4=data6L
+ pmaddwd xmm0, [GOTOFF(ebx,PW_F054_MF130)] ; xmm0=data6H
+
+ paddd xmm7, [GOTOFF(ebx,PD_DESCALE_P1)]
+ paddd xmm6, [GOTOFF(ebx,PD_DESCALE_P1)]
+ psrad xmm7, DESCALE_P1
+ psrad xmm6, DESCALE_P1
+ paddd xmm4, [GOTOFF(ebx,PD_DESCALE_P1)]
+ paddd xmm0, [GOTOFF(ebx,PD_DESCALE_P1)]
+ psrad xmm4, DESCALE_P1
+ psrad xmm0, DESCALE_P1
+
+ packssdw xmm7, xmm6 ; xmm7=data2
+ packssdw xmm4, xmm0 ; xmm4=data6
+
+ movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=data2
+ movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=data6
+
+ ; -- Odd part
+
+ movdqa xmm3, XMMWORD [wk(0)] ; xmm3=tmp6
+ movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp7
+
+ movdqa xmm6, xmm2 ; xmm2=tmp4
+ movdqa xmm0, xmm5 ; xmm5=tmp5
+ paddw xmm6, xmm3 ; xmm6=z3
+ paddw xmm0, xmm1 ; xmm0=z4
+
+ ; (Original)
+ ; z5 = (z3 + z4) * 1.175875602;
+ ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ ; z3 += z5; z4 += z5;
+ ;
+ ; (This implementation)
+ ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
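+;
+; (Both right-hand sides use the original z3/z4 values; the code below
+; keeps copies of the interleaved inputs so the two pmaddwd products
+; are formed from the same sources.)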
+
+ movdqa xmm7, xmm6
+ movdqa xmm4, xmm6
+ punpcklwd xmm7, xmm0
+ punpckhwd xmm4, xmm0
+ movdqa xmm6, xmm7
+ movdqa xmm0, xmm4
+ pmaddwd xmm7, [GOTOFF(ebx,PW_MF078_F117)] ; xmm7=z3L
+ pmaddwd xmm4, [GOTOFF(ebx,PW_MF078_F117)] ; xmm4=z3H
+ pmaddwd xmm6, [GOTOFF(ebx,PW_F117_F078)] ; xmm6=z4L
+ pmaddwd xmm0, [GOTOFF(ebx,PW_F117_F078)] ; xmm0=z4H
+
+ movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=z3L
+ movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=z3H
+
+ ; (Original)
+ ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6;
+ ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869;
+ ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110;
+ ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4;
+ ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4;
+ ;
+ ; (This implementation)
+ ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+ ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+ ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+ ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+ ; data7 = tmp4 + z3; data5 = tmp5 + z4;
+ ; data3 = tmp6 + z3; data1 = tmp7 + z4;
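+;
+; (Expanding the first line shows the folding of z1:
+;   tmp4 * 0.298631336 + z1
+;     = tmp4 * 0.298631336 - (tmp4 + tmp7) * 0.899976223
+;     = tmp4 * (0.298631336 - 0.899976223) + tmp7 * (-0.899976223)
+; which is exactly the constant pair PW_MF060_MF089, so each output is
+; again one pmaddwd plus the stored z3/z4 term.)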
+
+ movdqa xmm7, xmm2
+ movdqa xmm4, xmm2
+ punpcklwd xmm7, xmm1
+ punpckhwd xmm4, xmm1
+ movdqa xmm2, xmm7
+ movdqa xmm1, xmm4
+ pmaddwd xmm7, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm7=tmp4L
+ pmaddwd xmm4, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm4=tmp4H
+ pmaddwd xmm2, [GOTOFF(ebx,PW_MF089_F060)] ; xmm2=tmp7L
+ pmaddwd xmm1, [GOTOFF(ebx,PW_MF089_F060)] ; xmm1=tmp7H
+
+ paddd xmm7, XMMWORD [wk(0)] ; xmm7=data7L
+ paddd xmm4, XMMWORD [wk(1)] ; xmm4=data7H
+ paddd xmm2, xmm6 ; xmm2=data1L
+ paddd xmm1, xmm0 ; xmm1=data1H
+
+ paddd xmm7, [GOTOFF(ebx,PD_DESCALE_P1)]
+ paddd xmm4, [GOTOFF(ebx,PD_DESCALE_P1)]
+ psrad xmm7, DESCALE_P1
+ psrad xmm4, DESCALE_P1
+ paddd xmm2, [GOTOFF(ebx,PD_DESCALE_P1)]
+ paddd xmm1, [GOTOFF(ebx,PD_DESCALE_P1)]
+ psrad xmm2, DESCALE_P1
+ psrad xmm1, DESCALE_P1
+
+ packssdw xmm7, xmm4 ; xmm7=data7
+ packssdw xmm2, xmm1 ; xmm2=data1
+
+ movdqa xmm4, xmm5
+ movdqa xmm1, xmm5
+ punpcklwd xmm4, xmm3
+ punpckhwd xmm1, xmm3
+ movdqa xmm5, xmm4
+ movdqa xmm3, xmm1
+ pmaddwd xmm4, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm4=tmp5L
+ pmaddwd xmm1, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm1=tmp5H
+ pmaddwd xmm5, [GOTOFF(ebx,PW_MF256_F050)] ; xmm5=tmp6L
+ pmaddwd xmm3, [GOTOFF(ebx,PW_MF256_F050)] ; xmm3=tmp6H
+
+ paddd xmm4, xmm6 ; xmm4=data5L
+ paddd xmm1, xmm0 ; xmm1=data5H
+ paddd xmm5, XMMWORD [wk(0)] ; xmm5=data3L
+ paddd xmm3, XMMWORD [wk(1)] ; xmm3=data3H
+
+ paddd xmm4, [GOTOFF(ebx,PD_DESCALE_P1)]
+ paddd xmm1, [GOTOFF(ebx,PD_DESCALE_P1)]
+ psrad xmm4, DESCALE_P1
+ psrad xmm1, DESCALE_P1
+ paddd xmm5, [GOTOFF(ebx,PD_DESCALE_P1)]
+ paddd xmm3, [GOTOFF(ebx,PD_DESCALE_P1)]
+ psrad xmm5, DESCALE_P1
+ psrad xmm3, DESCALE_P1
+
+ packssdw xmm4, xmm1 ; xmm4=data5
+ packssdw xmm5, xmm3 ; xmm5=data3
+
+ ; ---- Pass 2: process columns.
+
+; mov edx, POINTER [data(eax)] ; (DCTELEM *)
+
+ movdqa xmm6, XMMWORD [wk(2)] ; xmm6=col0
+ movdqa xmm0, XMMWORD [wk(4)] ; xmm0=col2
+
+ ; xmm6=(00 10 20 30 40 50 60 70), xmm0=(02 12 22 32 42 52 62 72)
+ ; xmm2=(01 11 21 31 41 51 61 71), xmm5=(03 13 23 33 43 53 63 73)
+
+ movdqa xmm1, xmm6 ; transpose coefficients(phase 1)
+ punpcklwd xmm6, xmm2 ; xmm6=(00 01 10 11 20 21 30 31)
+ punpckhwd xmm1, xmm2 ; xmm1=(40 41 50 51 60 61 70 71)
+ movdqa xmm3, xmm0 ; transpose coefficients(phase 1)
+ punpcklwd xmm0, xmm5 ; xmm0=(02 03 12 13 22 23 32 33)
+ punpckhwd xmm3, xmm5 ; xmm3=(42 43 52 53 62 63 72 73)
+
+ movdqa xmm2, XMMWORD [wk(3)] ; xmm2=col4
+ movdqa xmm5, XMMWORD [wk(5)] ; xmm5=col6
+
+ ; xmm2=(04 14 24 34 44 54 64 74), xmm5=(06 16 26 36 46 56 66 76)
+ ; xmm4=(05 15 25 35 45 55 65 75), xmm7=(07 17 27 37 47 57 67 77)
+
+ movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=(02 03 12 13 22 23 32 33)
+ movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(42 43 52 53 62 63 72 73)
+
+ movdqa xmm0, xmm2 ; transpose coefficients(phase 1)
+ punpcklwd xmm2, xmm4 ; xmm2=(04 05 14 15 24 25 34 35)
+ punpckhwd xmm0, xmm4 ; xmm0=(44 45 54 55 64 65 74 75)
+ movdqa xmm3, xmm5 ; transpose coefficients(phase 1)
+ punpcklwd xmm5, xmm7 ; xmm5=(06 07 16 17 26 27 36 37)
+ punpckhwd xmm3, xmm7 ; xmm3=(46 47 56 57 66 67 76 77)
+
+ movdqa xmm4, xmm2 ; transpose coefficients(phase 2)
+ punpckldq xmm2, xmm5 ; xmm2=(04 05 06 07 14 15 16 17)
+ punpckhdq xmm4, xmm5 ; xmm4=(24 25 26 27 34 35 36 37)
+ movdqa xmm7, xmm0 ; transpose coefficients(phase 2)
+ punpckldq xmm0, xmm3 ; xmm0=(44 45 46 47 54 55 56 57)
+ punpckhdq xmm7, xmm3 ; xmm7=(64 65 66 67 74 75 76 77)
+
+ movdqa xmm5, XMMWORD [wk(0)] ; xmm5=(02 03 12 13 22 23 32 33)
+ movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53 62 63 72 73)
+ movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=(24 25 26 27 34 35 36 37)
+ movdqa XMMWORD [wk(3)], xmm0 ; wk(3)=(44 45 46 47 54 55 56 57)
+
+ movdqa xmm4, xmm6 ; transpose coefficients(phase 2)
+ punpckldq xmm6, xmm5 ; xmm6=(00 01 02 03 10 11 12 13)
+ punpckhdq xmm4, xmm5 ; xmm4=(20 21 22 23 30 31 32 33)
+ movdqa xmm0, xmm1 ; transpose coefficients(phase 2)
+ punpckldq xmm1, xmm3 ; xmm1=(40 41 42 43 50 51 52 53)
+ punpckhdq xmm0, xmm3 ; xmm0=(60 61 62 63 70 71 72 73)
+
+ movdqa xmm5, xmm6 ; transpose coefficients(phase 3)
+ punpcklqdq xmm6, xmm2 ; xmm6=(00 01 02 03 04 05 06 07)=data0
+ punpckhqdq xmm5, xmm2 ; xmm5=(10 11 12 13 14 15 16 17)=data1
+ movdqa xmm3, xmm0 ; transpose coefficients(phase 3)
+ punpcklqdq xmm0, xmm7 ; xmm0=(60 61 62 63 64 65 66 67)=data6
+ punpckhqdq xmm3, xmm7 ; xmm3=(70 71 72 73 74 75 76 77)=data7
+
+ movdqa xmm2, xmm5
+ movdqa xmm7, xmm6
+ psubw xmm5, xmm0 ; xmm5=data1-data6=tmp6
+ psubw xmm6, xmm3 ; xmm6=data0-data7=tmp7
+ paddw xmm2, xmm0 ; xmm2=data1+data6=tmp1
+ paddw xmm7, xmm3 ; xmm7=data0+data7=tmp0
+
+ movdqa xmm0, XMMWORD [wk(2)] ; xmm0=(24 25 26 27 34 35 36 37)
+ movdqa xmm3, XMMWORD [wk(3)] ; xmm3=(44 45 46 47 54 55 56 57)
+ movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=tmp6
+ movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
+
+ movdqa xmm5, xmm4 ; transpose coefficients(phase 3)
+ punpcklqdq xmm4, xmm0 ; xmm4=(20 21 22 23 24 25 26 27)=data2
+ punpckhqdq xmm5, xmm0 ; xmm5=(30 31 32 33 34 35 36 37)=data3
+ movdqa xmm6, xmm1 ; transpose coefficients(phase 3)
+ punpcklqdq xmm1, xmm3 ; xmm1=(40 41 42 43 44 45 46 47)=data4
+ punpckhqdq xmm6, xmm3 ; xmm6=(50 51 52 53 54 55 56 57)=data5
+
+ movdqa xmm0, xmm5
+ movdqa xmm3, xmm4
+ paddw xmm5, xmm1 ; xmm5=data3+data4=tmp3
+ paddw xmm4, xmm6 ; xmm4=data2+data5=tmp2
+ psubw xmm0, xmm1 ; xmm0=data3-data4=tmp4
+ psubw xmm3, xmm6 ; xmm3=data2-data5=tmp5
+
+ ; -- Even part
+
+ movdqa xmm1, xmm7
+ movdqa xmm6, xmm2
+ paddw xmm7, xmm5 ; xmm7=tmp10
+ paddw xmm2, xmm4 ; xmm2=tmp11
+ psubw xmm1, xmm5 ; xmm1=tmp13
+ psubw xmm6, xmm4 ; xmm6=tmp12
+
+ movdqa xmm5, xmm7
+ paddw xmm7, xmm2 ; xmm7=tmp10+tmp11
+ psubw xmm5, xmm2 ; xmm5=tmp10-tmp11
+
+ paddw xmm7, [GOTOFF(ebx,PW_DESCALE_P2X)]
+ paddw xmm5, [GOTOFF(ebx,PW_DESCALE_P2X)]
+ psraw xmm7, PASS1_BITS ; xmm7=data0
+ psraw xmm5, PASS1_BITS ; xmm5=data4
+
+ movdqa XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm7
+ movdqa XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm5
+
+ ; (Original)
+ ; z1 = (tmp12 + tmp13) * 0.541196100;
+ ; data2 = z1 + tmp13 * 0.765366865;
+ ; data6 = z1 + tmp12 * -1.847759065;
+ ;
+ ; (This implementation)
+ ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+ ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+ movdqa xmm4, xmm1 ; xmm1=tmp13
+ movdqa xmm2, xmm1
+ punpcklwd xmm4, xmm6 ; xmm6=tmp12
+ punpckhwd xmm2, xmm6
+ movdqa xmm1, xmm4
+ movdqa xmm6, xmm2
+ pmaddwd xmm4, [GOTOFF(ebx,PW_F130_F054)] ; xmm4=data2L
+ pmaddwd xmm2, [GOTOFF(ebx,PW_F130_F054)] ; xmm2=data2H
+ pmaddwd xmm1, [GOTOFF(ebx,PW_F054_MF130)] ; xmm1=data6L
+ pmaddwd xmm6, [GOTOFF(ebx,PW_F054_MF130)] ; xmm6=data6H
+
+ paddd xmm4, [GOTOFF(ebx,PD_DESCALE_P2)]
+ paddd xmm2, [GOTOFF(ebx,PD_DESCALE_P2)]
+ psrad xmm4, DESCALE_P2
+ psrad xmm2, DESCALE_P2
+ paddd xmm1, [GOTOFF(ebx,PD_DESCALE_P2)]
+ paddd xmm6, [GOTOFF(ebx,PD_DESCALE_P2)]
+ psrad xmm1, DESCALE_P2
+ psrad xmm6, DESCALE_P2
+
+ packssdw xmm4, xmm2 ; xmm4=data2
+ packssdw xmm1, xmm6 ; xmm1=data6
+
+ movdqa XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm4
+ movdqa XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm1
+
+ ; -- Odd part
+
+ movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp6
+ movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7
+
+ movdqa xmm2, xmm0 ; xmm0=tmp4
+ movdqa xmm6, xmm3 ; xmm3=tmp5
+ paddw xmm2, xmm7 ; xmm2=z3
+ paddw xmm6, xmm5 ; xmm6=z4
+
+ ; (Original)
+ ; z5 = (z3 + z4) * 1.175875602;
+ ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ ; z3 += z5; z4 += z5;
+ ;
+ ; (This implementation)
+ ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+ movdqa xmm4, xmm2
+ movdqa xmm1, xmm2
+ punpcklwd xmm4, xmm6
+ punpckhwd xmm1, xmm6
+ movdqa xmm2, xmm4
+ movdqa xmm6, xmm1
+ pmaddwd xmm4, [GOTOFF(ebx,PW_MF078_F117)] ; xmm4=z3L
+ pmaddwd xmm1, [GOTOFF(ebx,PW_MF078_F117)] ; xmm1=z3H
+ pmaddwd xmm2, [GOTOFF(ebx,PW_F117_F078)] ; xmm2=z4L
+ pmaddwd xmm6, [GOTOFF(ebx,PW_F117_F078)] ; xmm6=z4H
+
+ movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=z3L
+ movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=z3H
+
+ ; (Original)
+ ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6;
+ ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869;
+ ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110;
+ ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4;
+ ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4;
+ ;
+ ; (This implementation)
+ ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+ ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+ ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+ ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+ ; data7 = tmp4 + z3; data5 = tmp5 + z4;
+ ; data3 = tmp6 + z3; data1 = tmp7 + z4;
+
+ movdqa xmm4, xmm0
+ movdqa xmm1, xmm0
+ punpcklwd xmm4, xmm5
+ punpckhwd xmm1, xmm5
+ movdqa xmm0, xmm4
+ movdqa xmm5, xmm1
+ pmaddwd xmm4, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm4=tmp4L
+ pmaddwd xmm1, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm1=tmp4H
+ pmaddwd xmm0, [GOTOFF(ebx,PW_MF089_F060)] ; xmm0=tmp7L
+ pmaddwd xmm5, [GOTOFF(ebx,PW_MF089_F060)] ; xmm5=tmp7H
+
+ paddd xmm4, XMMWORD [wk(0)] ; xmm4=data7L
+ paddd xmm1, XMMWORD [wk(1)] ; xmm1=data7H
+ paddd xmm0, xmm2 ; xmm0=data1L
+ paddd xmm5, xmm6 ; xmm5=data1H
+
+ paddd xmm4, [GOTOFF(ebx,PD_DESCALE_P2)]
+ paddd xmm1, [GOTOFF(ebx,PD_DESCALE_P2)]
+ psrad xmm4, DESCALE_P2
+ psrad xmm1, DESCALE_P2
+ paddd xmm0, [GOTOFF(ebx,PD_DESCALE_P2)]
+ paddd xmm5, [GOTOFF(ebx,PD_DESCALE_P2)]
+ psrad xmm0, DESCALE_P2
+ psrad xmm5, DESCALE_P2
+
+ packssdw xmm4, xmm1 ; xmm4=data7
+ packssdw xmm0, xmm5 ; xmm0=data1
+
+ movdqa XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm4
+ movdqa XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm0
+
+ movdqa xmm1, xmm3
+ movdqa xmm5, xmm3
+ punpcklwd xmm1, xmm7
+ punpckhwd xmm5, xmm7
+ movdqa xmm3, xmm1
+ movdqa xmm7, xmm5
+ pmaddwd xmm1, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm1=tmp5L
+ pmaddwd xmm5, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm5=tmp5H
+ pmaddwd xmm3, [GOTOFF(ebx,PW_MF256_F050)] ; xmm3=tmp6L
+ pmaddwd xmm7, [GOTOFF(ebx,PW_MF256_F050)] ; xmm7=tmp6H
+
+ paddd xmm1, xmm2 ; xmm1=data5L
+ paddd xmm5, xmm6 ; xmm5=data5H
+ paddd xmm3, XMMWORD [wk(0)] ; xmm3=data3L
+ paddd xmm7, XMMWORD [wk(1)] ; xmm7=data3H
+
+ paddd xmm1, [GOTOFF(ebx,PD_DESCALE_P2)]
+ paddd xmm5, [GOTOFF(ebx,PD_DESCALE_P2)]
+ psrad xmm1, DESCALE_P2
+ psrad xmm5, DESCALE_P2
+ paddd xmm3, [GOTOFF(ebx,PD_DESCALE_P2)]
+ paddd xmm7, [GOTOFF(ebx,PD_DESCALE_P2)]
+ psrad xmm3, DESCALE_P2
+ psrad xmm7, DESCALE_P2
+
+ packssdw xmm1, xmm5 ; xmm1=data5
+ packssdw xmm3, xmm7 ; xmm3=data3
+
+ movdqa XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm1
+ movdqa XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm3
+
+; pop edi ; unused
+; pop esi ; unused
+; pop edx ; need not be preserved
+; pop ecx ; unused
+ poppic ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jidctflt-3dn.asm b/media/libjpeg/simd/i386/jidctflt-3dn.asm
new file mode 100644
index 0000000000..87951910d8
--- /dev/null
+++ b/media/libjpeg/simd/i386/jidctflt-3dn.asm
@@ -0,0 +1,451 @@
+;
+; jidctflt.asm - floating-point IDCT (3DNow! & MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and can
+; *not* be assembled with Microsoft's MASM or any MASM-compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the inverse DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jidctflt.c; see that file for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_idct_float_3dnow)
+
+EXTN(jconst_idct_float_3dnow):
+
+PD_1_414 times 2 dd 1.414213562373095048801689
+PD_1_847 times 2 dd 1.847759065022573512256366
+PD_1_082 times 2 dd 1.082392200292393968799446
+PD_2_613 times 2 dd 2.613125929752753055713286
+PD_RNDINT_MAGIC times 2 dd 100663296.0 ; (float)(0x00C00000 << 3)
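+; (Adding PD_RNDINT_MAGIC to a float x that is much smaller in magnitude
+; than the constant pins the result's exponent so that one unit in the
+; last place equals 8; the low mantissa word of each lane then holds
+; roundint(x/8), i.e. the rounding to int and the final /8 descale of
+; the IDCT happen in a single pfadd.  See the output stages below.)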
+PB_CENTERJSAMP times 8 db CENTERJSAMPLE
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_float_3dnow(void *dct_table, JCOEFPTR coef_block,
+; JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b) (b) + 8 ; void *dct_table
+%define coef_block(b) (b) + 12 ; JCOEFPTR coef_block
+%define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf
+%define output_col(b) (b) + 20 ; JDIMENSION output_col
+
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
+ ; mmword wk[WK_NUM]
+%define WK_NUM 2
+%define workspace wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT
+ ; FAST_FLOAT workspace[DCTSIZE2]
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_idct_float_3dnow)
+
+EXTN(jsimd_idct_float_3dnow):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [workspace]
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process columns from input, store into work array.
+
+; mov eax, [original_ebp]
+ mov edx, POINTER [dct_table(eax)] ; quantptr
+ mov esi, JCOEFPTR [coef_block(eax)] ; inptr
+ lea edi, [workspace] ; FAST_FLOAT *wsptr
+ mov ecx, DCTSIZE/2 ; ctr
+ alignx 16, 7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_FLOAT_3DNOW
+ mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ jnz short .columnDCT
+
+ pushpic ebx ; save GOT address
+ mov ebx, dword [DWBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ mov eax, dword [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ or ebx, dword [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ or eax, dword [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]
+ or ebx, dword [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ or eax, ebx
+ poppic ebx ; restore GOT address
+ jnz short .columnDCT
+
+ ; -- AC terms all zero
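+    ; (A block whose AC coefficients are all zero has a flat IDCT, so
+    ; it suffices to dequantize the DC terms of the two columns handled
+    ; per iteration and replicate them down the work array.)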
+
+ movd mm0, dword [DWBLOCK(0,0,esi,SIZEOF_JCOEF)]
+
+ punpcklwd mm0, mm0
+ psrad mm0, (DWORD_BIT-WORD_BIT)
+ pi2fd mm0, mm0
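+    ; (punpcklwd of a register with itself doubles each 16-bit
+    ; coefficient into a dword and the arithmetic shift by
+    ; DWORD_BIT-WORD_BIT = 16 sign-extends it, giving pi2fd proper
+    ; int32 input)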
+
+ pfmul mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+ movq mm1, mm0
+ punpckldq mm0, mm0
+ punpckhdq mm1, mm1
+
+ movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm0
+ movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm0
+ movq MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm0
+ movq MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
+ movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm1
+ movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm1
+ movq MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm1
+ movq MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1
+ jmp near .nextcolumn
+ alignx 16, 7
+%endif
+.columnDCT:
+
+ ; -- Even part
+
+ movd mm0, dword [DWBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ movd mm1, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ movd mm2, dword [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ movd mm3, dword [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+ punpcklwd mm0, mm0
+ punpcklwd mm1, mm1
+ psrad mm0, (DWORD_BIT-WORD_BIT)
+ psrad mm1, (DWORD_BIT-WORD_BIT)
+ pi2fd mm0, mm0
+ pi2fd mm1, mm1
+
+ pfmul mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+ pfmul mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+ punpcklwd mm2, mm2
+ punpcklwd mm3, mm3
+ psrad mm2, (DWORD_BIT-WORD_BIT)
+ psrad mm3, (DWORD_BIT-WORD_BIT)
+ pi2fd mm2, mm2
+ pi2fd mm3, mm3
+
+ pfmul mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+ pfmul mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+ movq mm4, mm0
+ movq mm5, mm1
+ pfsub mm0, mm2 ; mm0=tmp11
+ pfsub mm1, mm3
+ pfadd mm4, mm2 ; mm4=tmp10
+ pfadd mm5, mm3 ; mm5=tmp13
+
+ pfmul mm1, [GOTOFF(ebx,PD_1_414)]
+ pfsub mm1, mm5 ; mm1=tmp12
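+    ; (AAN even part: tmp12 = (dequantized in2 - in6) * sqrt(2) - tmp13;
+    ; the other even-part multiplies are folded into the scaled
+    ; dequantization table, so PD_1_414 is the only constant needed here)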
+
+ movq mm6, mm4
+ movq mm7, mm0
+ pfsub mm4, mm5 ; mm4=tmp3
+ pfsub mm0, mm1 ; mm0=tmp2
+ pfadd mm6, mm5 ; mm6=tmp0
+ pfadd mm7, mm1 ; mm7=tmp1
+
+ movq MMWORD [wk(1)], mm4 ; tmp3
+ movq MMWORD [wk(0)], mm0 ; tmp2
+
+ ; -- Odd part
+
+ movd mm2, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movd mm3, dword [DWBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ movd mm5, dword [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ movd mm1, dword [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+ punpcklwd mm2, mm2
+ punpcklwd mm3, mm3
+ psrad mm2, (DWORD_BIT-WORD_BIT)
+ psrad mm3, (DWORD_BIT-WORD_BIT)
+ pi2fd mm2, mm2
+ pi2fd mm3, mm3
+
+ pfmul mm2, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+ pfmul mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+ punpcklwd mm5, mm5
+ punpcklwd mm1, mm1
+ psrad mm5, (DWORD_BIT-WORD_BIT)
+ psrad mm1, (DWORD_BIT-WORD_BIT)
+ pi2fd mm5, mm5
+ pi2fd mm1, mm1
+
+ pfmul mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+ pfmul mm1, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+ movq mm4, mm2
+ movq mm0, mm5
+ pfadd mm2, mm1 ; mm2=z11
+ pfadd mm5, mm3 ; mm5=z13
+ pfsub mm4, mm1 ; mm4=z12
+ pfsub mm0, mm3 ; mm0=z10
+
+ movq mm1, mm2
+ pfsub mm2, mm5
+ pfadd mm1, mm5 ; mm1=tmp7
+
+ pfmul mm2, [GOTOFF(ebx,PD_1_414)] ; mm2=tmp11
+
+ movq mm3, mm0
+ pfadd mm0, mm4
+ pfmul mm0, [GOTOFF(ebx,PD_1_847)] ; mm0=z5
+ pfmul mm3, [GOTOFF(ebx,PD_2_613)] ; mm3=(z10 * 2.613125930)
+ pfmul mm4, [GOTOFF(ebx,PD_1_082)] ; mm4=(z12 * 1.082392200)
+ pfsubr mm3, mm0 ; mm3=tmp12
+ pfsub mm4, mm0 ; mm4=tmp10
+
+ ; -- Final output stage
+
+ pfsub mm3, mm1 ; mm3=tmp6
+ movq mm5, mm6
+ movq mm0, mm7
+ pfadd mm6, mm1 ; mm6=data0=(00 01)
+ pfadd mm7, mm3 ; mm7=data1=(10 11)
+ pfsub mm5, mm1 ; mm5=data7=(70 71)
+ pfsub mm0, mm3 ; mm0=data6=(60 61)
+ pfsub mm2, mm3 ; mm2=tmp5
+
+ movq mm1, mm6 ; transpose coefficients
+ punpckldq mm6, mm7 ; mm6=(00 10)
+ punpckhdq mm1, mm7 ; mm1=(01 11)
+ movq mm3, mm0 ; transpose coefficients
+ punpckldq mm0, mm5 ; mm0=(60 70)
+ punpckhdq mm3, mm5 ; mm3=(61 71)
+
+ movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm6
+ movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm1
+ movq MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
+ movq MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm3
+
+ movq mm7, MMWORD [wk(0)] ; mm7=tmp2
+ movq mm5, MMWORD [wk(1)] ; mm5=tmp3
+
+ pfadd mm4, mm2 ; mm4=tmp4
+ movq mm6, mm7
+ movq mm1, mm5
+ pfadd mm7, mm2 ; mm7=data2=(20 21)
+ pfadd mm5, mm4 ; mm5=data4=(40 41)
+ pfsub mm6, mm2 ; mm6=data5=(50 51)
+ pfsub mm1, mm4 ; mm1=data3=(30 31)
+
+ movq mm0, mm7 ; transpose coefficients
+ punpckldq mm7, mm1 ; mm7=(20 30)
+ punpckhdq mm0, mm1 ; mm0=(21 31)
+ movq mm3, mm5 ; transpose coefficients
+ punpckldq mm5, mm6 ; mm5=(40 50)
+ punpckhdq mm3, mm6 ; mm3=(41 51)
+
+ movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm7
+ movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm0
+ movq MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm5
+ movq MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm3
+
+.nextcolumn:
+ add esi, byte 2*SIZEOF_JCOEF ; coef_block
+ add edx, byte 2*SIZEOF_FLOAT_MULT_TYPE ; quantptr
+ add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr
+ dec ecx ; ctr
+ jnz near .columnloop
+
+ ; -- Prefetch the next coefficient block
+
+ prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
+ prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
+ prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
+ prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
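+    ; (after the four column iterations esi has advanced 8*SIZEOF_JCOEF,
+    ; so the (DCTSIZE2-8)*SIZEOF_JCOEF displacement lands on the start
+    ; of the next 64-coefficient block; four 32-byte prefetches cover
+    ; its 128 bytes)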
+
+ ; ---- Pass 2: process rows from work array, store into output array.
+
+ mov eax, [original_ebp]
+ lea esi, [workspace] ; FAST_FLOAT *wsptr
+ mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
+ mov eax, JDIMENSION [output_col(eax)]
+ mov ecx, DCTSIZE/2 ; ctr
+ alignx 16, 7
+.rowloop:
+
+ ; -- Even part
+
+ movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+ movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
+ movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
+ movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
+
+ movq mm4, mm0
+ movq mm5, mm1
+ pfsub mm0, mm2 ; mm0=tmp11
+ pfsub mm1, mm3
+ pfadd mm4, mm2 ; mm4=tmp10
+ pfadd mm5, mm3 ; mm5=tmp13
+
+ pfmul mm1, [GOTOFF(ebx,PD_1_414)]
+ pfsub mm1, mm5 ; mm1=tmp12
+
+ movq mm6, mm4
+ movq mm7, mm0
+ pfsub mm4, mm5 ; mm4=tmp3
+ pfsub mm0, mm1 ; mm0=tmp2
+ pfadd mm6, mm5 ; mm6=tmp0
+ pfadd mm7, mm1 ; mm7=tmp1
+
+ movq MMWORD [wk(1)], mm4 ; tmp3
+ movq MMWORD [wk(0)], mm0 ; tmp2
+
+ ; -- Odd part
+
+ movq mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+ movq mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
+ movq mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
+ movq mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
+
+ movq mm4, mm2
+ movq mm0, mm5
+ pfadd mm2, mm1 ; mm2=z11
+ pfadd mm5, mm3 ; mm5=z13
+ pfsub mm4, mm1 ; mm4=z12
+ pfsub mm0, mm3 ; mm0=z10
+
+ movq mm1, mm2
+ pfsub mm2, mm5
+ pfadd mm1, mm5 ; mm1=tmp7
+
+ pfmul mm2, [GOTOFF(ebx,PD_1_414)] ; mm2=tmp11
+
+ movq mm3, mm0
+ pfadd mm0, mm4
+ pfmul mm0, [GOTOFF(ebx,PD_1_847)] ; mm0=z5
+ pfmul mm3, [GOTOFF(ebx,PD_2_613)] ; mm3=(z10 * 2.613125930)
+ pfmul mm4, [GOTOFF(ebx,PD_1_082)] ; mm4=(z12 * 1.082392200)
+ pfsubr mm3, mm0 ; mm3=tmp12
+ pfsub mm4, mm0 ; mm4=tmp10
+
+ ; -- Final output stage
+
+ pfsub mm3, mm1 ; mm3=tmp6
+ movq mm5, mm6
+ movq mm0, mm7
+ pfadd mm6, mm1 ; mm6=data0=(00 10)
+ pfadd mm7, mm3 ; mm7=data1=(01 11)
+ pfsub mm5, mm1 ; mm5=data7=(07 17)
+ pfsub mm0, mm3 ; mm0=data6=(06 16)
+ pfsub mm2, mm3 ; mm2=tmp5
+
+ movq mm1, [GOTOFF(ebx,PD_RNDINT_MAGIC)] ; mm1=[PD_RNDINT_MAGIC]
+ pcmpeqd mm3, mm3
+ psrld mm3, WORD_BIT ; mm3={0xFFFF 0x0000 0xFFFF 0x0000}
+
+ pfadd mm6, mm1 ; mm6=roundint(data0/8)=(00 ** 10 **)
+ pfadd mm7, mm1 ; mm7=roundint(data1/8)=(01 ** 11 **)
+ pfadd mm0, mm1 ; mm0=roundint(data6/8)=(06 ** 16 **)
+ pfadd mm5, mm1 ; mm5=roundint(data7/8)=(07 ** 17 **)
+
+ pand mm6, mm3 ; mm6=(00 -- 10 --)
+ pslld mm7, WORD_BIT ; mm7=(-- 01 -- 11)
+ pand mm0, mm3 ; mm0=(06 -- 16 --)
+ pslld mm5, WORD_BIT ; mm5=(-- 07 -- 17)
+ por mm6, mm7 ; mm6=(00 01 10 11)
+ por mm0, mm5 ; mm0=(06 07 16 17)
+
+ movq mm1, MMWORD [wk(0)] ; mm1=tmp2
+ movq mm3, MMWORD [wk(1)] ; mm3=tmp3
+
+ pfadd mm4, mm2 ; mm4=tmp4
+ movq mm7, mm1
+ movq mm5, mm3
+ pfadd mm1, mm2 ; mm1=data2=(02 12)
+ pfadd mm3, mm4 ; mm3=data4=(04 14)
+ pfsub mm7, mm2 ; mm7=data5=(05 15)
+ pfsub mm5, mm4 ; mm5=data3=(03 13)
+
+ movq mm2, [GOTOFF(ebx,PD_RNDINT_MAGIC)] ; mm2=[PD_RNDINT_MAGIC]
+ pcmpeqd mm4, mm4
+ psrld mm4, WORD_BIT ; mm4={0xFFFF 0x0000 0xFFFF 0x0000}
+
+ pfadd mm3, mm2 ; mm3=roundint(data4/8)=(04 ** 14 **)
+ pfadd mm7, mm2 ; mm7=roundint(data5/8)=(05 ** 15 **)
+ pfadd mm1, mm2 ; mm1=roundint(data2/8)=(02 ** 12 **)
+ pfadd mm5, mm2 ; mm5=roundint(data3/8)=(03 ** 13 **)
+
+ pand mm3, mm4 ; mm3=(04 -- 14 --)
+ pslld mm7, WORD_BIT ; mm7=(-- 05 -- 15)
+ pand mm1, mm4 ; mm1=(02 -- 12 --)
+ pslld mm5, WORD_BIT ; mm5=(-- 03 -- 13)
+ por mm3, mm7 ; mm3=(04 05 14 15)
+ por mm1, mm5 ; mm1=(02 03 12 13)
+
+ movq mm2, [GOTOFF(ebx,PB_CENTERJSAMP)] ; mm2=[PB_CENTERJSAMP]
+
+ packsswb mm6, mm3 ; mm6=(00 01 10 11 04 05 14 15)
+ packsswb mm1, mm0 ; mm1=(02 03 12 13 06 07 16 17)
+ paddb mm6, mm2
+ paddb mm1, mm2
+
+ movq mm4, mm6 ; transpose coefficients(phase 2)
+ punpcklwd mm6, mm1 ; mm6=(00 01 02 03 10 11 12 13)
+ punpckhwd mm4, mm1 ; mm4=(04 05 06 07 14 15 16 17)
+
+ movq mm7, mm6 ; transpose coefficients(phase 3)
+ punpckldq mm6, mm4 ; mm6=(00 01 02 03 04 05 06 07)
+ punpckhdq mm7, mm4 ; mm7=(10 11 12 13 14 15 16 17)
+
+ pushpic ebx ; save GOT address
+
+ mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+ mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+ movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6
+ movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7
+
+ poppic ebx ; restore GOT address
+
+ add esi, byte 2*SIZEOF_FAST_FLOAT ; wsptr
+ add edi, byte 2*SIZEOF_JSAMPROW
+ dec ecx ; ctr
+ jnz near .rowloop
+
+ femms ; empty MMX/3DNow! state
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jidctflt-sse.asm b/media/libjpeg/simd/i386/jidctflt-sse.asm
new file mode 100644
index 0000000000..b27ecfdf46
--- /dev/null
+++ b/media/libjpeg/simd/i386/jidctflt-sse.asm
@@ -0,0 +1,571 @@
+;
+; jidctflt.asm - floating-point IDCT (SSE & MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and can
+; *not* be assembled with Microsoft's MASM or any MASM-compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the inverse DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jidctflt.c; see that file for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
+ shufps %1, %2, 0x44
+%endmacro
+
+%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
+ shufps %1, %2, 0xEE
+%endmacro
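+
+; (shufps picks two dwords from the destination and two from the source:
+; imm8 0x44 = 01 00 01 00b keeps elements 0,1 of %1 and appends elements
+; 0,1 of %2, while 0xEE = 11 10 11 10b takes elements 2,3 of each.
+; These act as 64-bit low/high unpacks on float data, for which SSE1 has
+; no direct instruction.)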
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_idct_float_sse)
+
+EXTN(jconst_idct_float_sse):
+
+PD_1_414 times 4 dd 1.414213562373095048801689
+PD_1_847 times 4 dd 1.847759065022573512256366
+PD_1_082 times 4 dd 1.082392200292393968799446
+PD_M2_613 times 4 dd -2.613125929752753055713286
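+; (negated relative to the 3DNow! version's PD_2_613: SSE has no
+; reversed subtract like pfsubr, so tmp12 is formed below as
+; z5 + z10 * (-2.613125930) using a plain addps)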
+PD_0_125 times 4 dd 0.125 ; 1/8
+PB_CENTERJSAMP times 8 db CENTERJSAMPLE
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_float_sse(void *dct_table, JCOEFPTR coef_block,
+; JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b) (b) + 8 ; void *dct_table
+%define coef_block(b) (b) + 12 ; JCOEFPTR coef_block
+%define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf
+%define output_col(b) (b) + 20 ; JDIMENSION output_col
+
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+ ; xmmword wk[WK_NUM]
+%define WK_NUM 2
+%define workspace wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT
+ ; FAST_FLOAT workspace[DCTSIZE2]
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_idct_float_sse)
+
+EXTN(jsimd_idct_float_sse):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [workspace]
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process columns from input, store into work array.
+
+; mov eax, [original_ebp]
+ mov edx, POINTER [dct_table(eax)] ; quantptr
+ mov esi, JCOEFPTR [coef_block(eax)] ; inptr
+ lea edi, [workspace] ; FAST_FLOAT *wsptr
+ mov ecx, DCTSIZE/4 ; ctr
+ alignx 16, 7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
+ mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ jnz near .columnDCT
+
+ movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ por mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ por mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ por mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ por mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+ por mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ por mm1, mm0
+ packsswb mm1, mm1
+ movd eax, mm1
+ test eax, eax
+ jnz short .columnDCT
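+    ; (this slice-wide test ORs AC rows 1-7 together in two MMX
+    ; accumulators, packs the result to bytes, and tests a single
+    ; dword, so one branch decides the DC-only shortcut for all four
+    ; columns)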
+
+ ; -- AC terms all zero
+
+ movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+
+ punpckhwd mm1, mm0 ; mm1=(** 02 ** 03)
+ punpcklwd mm0, mm0 ; mm0=(00 00 01 01)
+ psrad mm1, (DWORD_BIT-WORD_BIT) ; mm1=in0H=(02 03)
+ psrad mm0, (DWORD_BIT-WORD_BIT) ; mm0=in0L=(00 01)
+ cvtpi2ps xmm3, mm1 ; xmm3=(02 03 ** **)
+ cvtpi2ps xmm0, mm0 ; xmm0=(00 01 ** **)
+ movlhps xmm0, xmm3 ; xmm0=in0=(00 01 02 03)
+
+ mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+ movaps xmm1, xmm0
+ movaps xmm2, xmm0
+ movaps xmm3, xmm0
+
+ shufps xmm0, xmm0, 0x00 ; xmm0=(00 00 00 00)
+ shufps xmm1, xmm1, 0x55 ; xmm1=(01 01 01 01)
+ shufps xmm2, xmm2, 0xAA ; xmm2=(02 02 02 02)
+ shufps xmm3, xmm3, 0xFF ; xmm3=(03 03 03 03)
+
+ movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
+ movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
+ movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1
+ movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
+ movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2
+ movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2
+ movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
+ movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
+ jmp near .nextcolumn
+ alignx 16, 7
+%endif
+.columnDCT:
+
+ ; -- Even part
+
+ movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+ punpckhwd mm4, mm0 ; mm4=(** 02 ** 03)
+ punpcklwd mm0, mm0 ; mm0=(00 00 01 01)
+ punpckhwd mm5, mm1 ; mm5=(** 22 ** 23)
+ punpcklwd mm1, mm1 ; mm1=(20 20 21 21)
+
+ psrad mm4, (DWORD_BIT-WORD_BIT) ; mm4=in0H=(02 03)
+ psrad mm0, (DWORD_BIT-WORD_BIT) ; mm0=in0L=(00 01)
+ cvtpi2ps xmm4, mm4 ; xmm4=(02 03 ** **)
+ cvtpi2ps xmm0, mm0 ; xmm0=(00 01 ** **)
+ psrad mm5, (DWORD_BIT-WORD_BIT) ; mm5=in2H=(22 23)
+ psrad mm1, (DWORD_BIT-WORD_BIT) ; mm1=in2L=(20 21)
+ cvtpi2ps xmm5, mm5 ; xmm5=(22 23 ** **)
+ cvtpi2ps xmm1, mm1 ; xmm1=(20 21 ** **)
+
+ punpckhwd mm6, mm2 ; mm6=(** 42 ** 43)
+ punpcklwd mm2, mm2 ; mm2=(40 40 41 41)
+ punpckhwd mm7, mm3 ; mm7=(** 62 ** 63)
+ punpcklwd mm3, mm3 ; mm3=(60 60 61 61)
+
+ psrad mm6, (DWORD_BIT-WORD_BIT) ; mm6=in4H=(42 43)
+ psrad mm2, (DWORD_BIT-WORD_BIT) ; mm2=in4L=(40 41)
+ cvtpi2ps xmm6, mm6 ; xmm6=(42 43 ** **)
+ cvtpi2ps xmm2, mm2 ; xmm2=(40 41 ** **)
+ psrad mm7, (DWORD_BIT-WORD_BIT) ; mm7=in6H=(62 63)
+ psrad mm3, (DWORD_BIT-WORD_BIT) ; mm3=in6L=(60 61)
+ cvtpi2ps xmm7, mm7 ; xmm7=(62 63 ** **)
+ cvtpi2ps xmm3, mm3 ; xmm3=(60 61 ** **)
+
+ movlhps xmm0, xmm4 ; xmm0=in0=(00 01 02 03)
+ movlhps xmm1, xmm5 ; xmm1=in2=(20 21 22 23)
+ mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+ mulps xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+ movlhps xmm2, xmm6 ; xmm2=in4=(40 41 42 43)
+ movlhps xmm3, xmm7 ; xmm3=in6=(60 61 62 63)
+ mulps xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+ mulps xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+ movaps xmm4, xmm0
+ movaps xmm5, xmm1
+ subps xmm0, xmm2 ; xmm0=tmp11
+ subps xmm1, xmm3
+ addps xmm4, xmm2 ; xmm4=tmp10
+ addps xmm5, xmm3 ; xmm5=tmp13
+
+ mulps xmm1, [GOTOFF(ebx,PD_1_414)]
+ subps xmm1, xmm5 ; xmm1=tmp12
+
+ movaps xmm6, xmm4
+ movaps xmm7, xmm0
+ subps xmm4, xmm5 ; xmm4=tmp3
+ subps xmm0, xmm1 ; xmm0=tmp2
+ addps xmm6, xmm5 ; xmm6=tmp0
+ addps xmm7, xmm1 ; xmm7=tmp1
+
+ movaps XMMWORD [wk(1)], xmm4 ; tmp3
+ movaps XMMWORD [wk(0)], xmm0 ; tmp2
+
+ ; -- Odd part
+
+ movq mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movq mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ movq mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ movq mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+ punpckhwd mm6, mm4 ; mm6=(** 12 ** 13)
+ punpcklwd mm4, mm4 ; mm4=(10 10 11 11)
+ punpckhwd mm2, mm0 ; mm2=(** 32 ** 33)
+ punpcklwd mm0, mm0 ; mm0=(30 30 31 31)
+
+ psrad mm6, (DWORD_BIT-WORD_BIT) ; mm6=in1H=(12 13)
+ psrad mm4, (DWORD_BIT-WORD_BIT) ; mm4=in1L=(10 11)
+ cvtpi2ps xmm4, mm6 ; xmm4=(12 13 ** **)
+ cvtpi2ps xmm2, mm4 ; xmm2=(10 11 ** **)
+ psrad mm2, (DWORD_BIT-WORD_BIT) ; mm2=in3H=(32 33)
+ psrad mm0, (DWORD_BIT-WORD_BIT) ; mm0=in3L=(30 31)
+ cvtpi2ps xmm0, mm2 ; xmm0=(32 33 ** **)
+ cvtpi2ps xmm3, mm0 ; xmm3=(30 31 ** **)
+
+ punpckhwd mm7, mm5 ; mm7=(** 52 ** 53)
+ punpcklwd mm5, mm5 ; mm5=(50 50 51 51)
+ punpckhwd mm3, mm1 ; mm3=(** 72 ** 73)
+ punpcklwd mm1, mm1 ; mm1=(70 70 71 71)
+
+ movlhps xmm2, xmm4 ; xmm2=in1=(10 11 12 13)
+ movlhps xmm3, xmm0 ; xmm3=in3=(30 31 32 33)
+
+ psrad mm7, (DWORD_BIT-WORD_BIT) ; mm7=in5H=(52 53)
+ psrad mm5, (DWORD_BIT-WORD_BIT) ; mm5=in5L=(50 51)
+ cvtpi2ps xmm4, mm7 ; xmm4=(52 53 ** **)
+ cvtpi2ps xmm5, mm5 ; xmm5=(50 51 ** **)
+ psrad mm3, (DWORD_BIT-WORD_BIT) ; mm3=in7H=(72 73)
+ psrad mm1, (DWORD_BIT-WORD_BIT) ; mm1=in7L=(70 71)
+ cvtpi2ps xmm0, mm3 ; xmm0=(72 73 ** **)
+ cvtpi2ps xmm1, mm1 ; xmm1=(70 71 ** **)
+
+ mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+ mulps xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+ movlhps xmm5, xmm4 ; xmm5=in5=(50 51 52 53)
+ movlhps xmm1, xmm0 ; xmm1=in7=(70 71 72 73)
+ mulps xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+ mulps xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+ movaps xmm4, xmm2
+ movaps xmm0, xmm5
+ addps xmm2, xmm1 ; xmm2=z11
+ addps xmm5, xmm3 ; xmm5=z13
+ subps xmm4, xmm1 ; xmm4=z12
+ subps xmm0, xmm3 ; xmm0=z10
+
+ movaps xmm1, xmm2
+ subps xmm2, xmm5
+ addps xmm1, xmm5 ; xmm1=tmp7
+
+ mulps xmm2, [GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11
+
+ movaps xmm3, xmm0
+ addps xmm0, xmm4
+ mulps xmm0, [GOTOFF(ebx,PD_1_847)] ; xmm0=z5
+ mulps xmm3, [GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930)
+ mulps xmm4, [GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200)
+ addps xmm3, xmm0 ; xmm3=tmp12
+ subps xmm4, xmm0 ; xmm4=tmp10
+
+ ; -- Final output stage
+
+ subps xmm3, xmm1 ; xmm3=tmp6
+ movaps xmm5, xmm6
+ movaps xmm0, xmm7
+ addps xmm6, xmm1 ; xmm6=data0=(00 01 02 03)
+ addps xmm7, xmm3 ; xmm7=data1=(10 11 12 13)
+ subps xmm5, xmm1 ; xmm5=data7=(70 71 72 73)
+ subps xmm0, xmm3 ; xmm0=data6=(60 61 62 63)
+ subps xmm2, xmm3 ; xmm2=tmp5
+
+ movaps xmm1, xmm6 ; transpose coefficients(phase 1)
+ unpcklps xmm6, xmm7 ; xmm6=(00 10 01 11)
+ unpckhps xmm1, xmm7 ; xmm1=(02 12 03 13)
+ movaps xmm3, xmm0 ; transpose coefficients(phase 1)
+ unpcklps xmm0, xmm5 ; xmm0=(60 70 61 71)
+ unpckhps xmm3, xmm5 ; xmm3=(62 72 63 73)
+
+ movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2
+ movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3
+
+ movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71)
+ movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73)
+
+ addps xmm4, xmm2 ; xmm4=tmp4
+ movaps xmm0, xmm7
+ movaps xmm3, xmm5
+ addps xmm7, xmm2 ; xmm7=data2=(20 21 22 23)
+ addps xmm5, xmm4 ; xmm5=data4=(40 41 42 43)
+ subps xmm0, xmm2 ; xmm0=data5=(50 51 52 53)
+ subps xmm3, xmm4 ; xmm3=data3=(30 31 32 33)
+
+ movaps xmm2, xmm7 ; transpose coefficients(phase 1)
+ unpcklps xmm7, xmm3 ; xmm7=(20 30 21 31)
+ unpckhps xmm2, xmm3 ; xmm2=(22 32 23 33)
+ movaps xmm4, xmm5 ; transpose coefficients(phase 1)
+ unpcklps xmm5, xmm0 ; xmm5=(40 50 41 51)
+ unpckhps xmm4, xmm0 ; xmm4=(42 52 43 53)
+
+ movaps xmm3, xmm6 ; transpose coefficients(phase 2)
+ unpcklps2 xmm6, xmm7 ; xmm6=(00 10 20 30)
+ unpckhps2 xmm3, xmm7 ; xmm3=(01 11 21 31)
+ movaps xmm0, xmm1 ; transpose coefficients(phase 2)
+ unpcklps2 xmm1, xmm2 ; xmm1=(02 12 22 32)
+ unpckhps2 xmm0, xmm2 ; xmm0=(03 13 23 33)
+
+ movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71)
+ movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73)
+
+ movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6
+ movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
+ movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1
+ movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0
+
+ movaps xmm6, xmm5 ; transpose coefficients(phase 2)
+ unpcklps2 xmm5, xmm7 ; xmm5=(40 50 60 70)
+ unpckhps2 xmm6, xmm7 ; xmm6=(41 51 61 71)
+ movaps xmm3, xmm4 ; transpose coefficients(phase 2)
+ unpcklps2 xmm4, xmm2 ; xmm4=(42 52 62 72)
+ unpckhps2 xmm3, xmm2 ; xmm3=(43 53 63 73)
+
+ movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5
+ movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
+ movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4
+ movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
+
+.nextcolumn:
+ add esi, byte 4*SIZEOF_JCOEF ; coef_block
+ add edx, byte 4*SIZEOF_FLOAT_MULT_TYPE ; quantptr
+ add edi, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr
+ dec ecx ; ctr
+ jnz near .columnloop
+
+ ; -- Prefetch the next coefficient block
+
+ prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
+ prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
+ prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
+ prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
+
+ ; ---- Pass 2: process rows from work array, store into output array.
+
+ mov eax, [original_ebp]
+ lea esi, [workspace] ; FAST_FLOAT *wsptr
+ mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
+ mov eax, JDIMENSION [output_col(eax)]
+ mov ecx, DCTSIZE/4 ; ctr
+ alignx 16, 7
+.rowloop:
+
+ ; -- Even part
+
+ movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
+ movaps xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
+
+ movaps xmm4, xmm0
+ movaps xmm5, xmm1
+ subps xmm0, xmm2 ; xmm0=tmp11
+ subps xmm1, xmm3
+ addps xmm4, xmm2 ; xmm4=tmp10
+ addps xmm5, xmm3 ; xmm5=tmp13
+
+ mulps xmm1, [GOTOFF(ebx,PD_1_414)]
+ subps xmm1, xmm5 ; xmm1=tmp12
+
+ movaps xmm6, xmm4
+ movaps xmm7, xmm0
+ subps xmm4, xmm5 ; xmm4=tmp3
+ subps xmm0, xmm1 ; xmm0=tmp2
+ addps xmm6, xmm5 ; xmm6=tmp0
+ addps xmm7, xmm1 ; xmm7=tmp1
+
+ movaps XMMWORD [wk(1)], xmm4 ; tmp3
+ movaps XMMWORD [wk(0)], xmm0 ; tmp2
+
+ ; -- Odd part
+
+ movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
+ movaps xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
+
+ movaps xmm4, xmm2
+ movaps xmm0, xmm5
+ addps xmm2, xmm1 ; xmm2=z11
+ addps xmm5, xmm3 ; xmm5=z13
+ subps xmm4, xmm1 ; xmm4=z12
+ subps xmm0, xmm3 ; xmm0=z10
+
+ movaps xmm1, xmm2
+ subps xmm2, xmm5
+ addps xmm1, xmm5 ; xmm1=tmp7
+
+ mulps xmm2, [GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11
+
+ movaps xmm3, xmm0
+ addps xmm0, xmm4
+ mulps xmm0, [GOTOFF(ebx,PD_1_847)] ; xmm0=z5
+ mulps xmm3, [GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930)
+ mulps xmm4, [GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200)
+ addps xmm3, xmm0 ; xmm3=tmp12
+ subps xmm4, xmm0 ; xmm4=tmp10
+
+ ; -- Final output stage
+
+ subps xmm3, xmm1 ; xmm3=tmp6
+ movaps xmm5, xmm6
+ movaps xmm0, xmm7
+ addps xmm6, xmm1 ; xmm6=data0=(00 10 20 30)
+ addps xmm7, xmm3 ; xmm7=data1=(01 11 21 31)
+ subps xmm5, xmm1 ; xmm5=data7=(07 17 27 37)
+ subps xmm0, xmm3 ; xmm0=data6=(06 16 26 36)
+ subps xmm2, xmm3 ; xmm2=tmp5
+
+ movaps xmm1, [GOTOFF(ebx,PD_0_125)] ; xmm1=[PD_0_125]
+
+ mulps xmm6, xmm1 ; descale(1/8)
+ mulps xmm7, xmm1 ; descale(1/8)
+ mulps xmm5, xmm1 ; descale(1/8)
+ mulps xmm0, xmm1 ; descale(1/8)
+
+ movhlps xmm3, xmm6
+ movhlps xmm1, xmm7
+ cvtps2pi mm0, xmm6 ; round to int32, mm0=data0L=(00 10)
+ cvtps2pi mm1, xmm7 ; round to int32, mm1=data1L=(01 11)
+ cvtps2pi mm2, xmm3 ; round to int32, mm2=data0H=(20 30)
+ cvtps2pi mm3, xmm1 ; round to int32, mm3=data1H=(21 31)
+ packssdw mm0, mm2 ; mm0=data0=(00 10 20 30)
+ packssdw mm1, mm3 ; mm1=data1=(01 11 21 31)
+
+ movhlps xmm6, xmm5
+ movhlps xmm7, xmm0
+ cvtps2pi mm4, xmm5 ; round to int32, mm4=data7L=(07 17)
+ cvtps2pi mm5, xmm0 ; round to int32, mm5=data6L=(06 16)
+ cvtps2pi mm6, xmm6 ; round to int32, mm6=data7H=(27 37)
+ cvtps2pi mm7, xmm7 ; round to int32, mm7=data6H=(26 36)
+ packssdw mm4, mm6 ; mm4=data7=(07 17 27 37)
+ packssdw mm5, mm7 ; mm5=data6=(06 16 26 36)
+
+ packsswb mm0, mm5 ; mm0=(00 10 20 30 06 16 26 36)
+ packsswb mm1, mm4 ; mm1=(01 11 21 31 07 17 27 37)
+
+ movaps xmm3, XMMWORD [wk(0)] ; xmm3=tmp2
+ movaps xmm1, XMMWORD [wk(1)] ; xmm1=tmp3
+
+ movaps xmm6, [GOTOFF(ebx,PD_0_125)] ; xmm6=[PD_0_125]
+
+ addps xmm4, xmm2 ; xmm4=tmp4
+ movaps xmm5, xmm3
+ movaps xmm0, xmm1
+ addps xmm3, xmm2 ; xmm3=data2=(02 12 22 32)
+ addps xmm1, xmm4 ; xmm1=data4=(04 14 24 34)
+ subps xmm5, xmm2 ; xmm5=data5=(05 15 25 35)
+ subps xmm0, xmm4 ; xmm0=data3=(03 13 23 33)
+
+ mulps xmm3, xmm6 ; descale(1/8)
+ mulps xmm1, xmm6 ; descale(1/8)
+ mulps xmm5, xmm6 ; descale(1/8)
+ mulps xmm0, xmm6 ; descale(1/8)
+
+ movhlps xmm7, xmm3
+ movhlps xmm2, xmm1
+ cvtps2pi mm2, xmm3 ; round to int32, mm2=data2L=(02 12)
+ cvtps2pi mm3, xmm1 ; round to int32, mm3=data4L=(04 14)
+ cvtps2pi mm6, xmm7 ; round to int32, mm6=data2H=(22 32)
+ cvtps2pi mm7, xmm2 ; round to int32, mm7=data4H=(24 34)
+ packssdw mm2, mm6 ; mm2=data2=(02 12 22 32)
+ packssdw mm3, mm7 ; mm3=data4=(04 14 24 34)
+
+ movhlps xmm4, xmm5
+ movhlps xmm6, xmm0
+ cvtps2pi mm5, xmm5 ; round to int32, mm5=data5L=(05 15)
+ cvtps2pi mm4, xmm0 ; round to int32, mm4=data3L=(03 13)
+ cvtps2pi mm6, xmm4 ; round to int32, mm6=data5H=(25 35)
+ cvtps2pi mm7, xmm6 ; round to int32, mm7=data3H=(23 33)
+ packssdw mm5, mm6 ; mm5=data5=(05 15 25 35)
+ packssdw mm4, mm7 ; mm4=data3=(03 13 23 33)
+
+ movq mm6, [GOTOFF(ebx,PB_CENTERJSAMP)] ; mm6=[PB_CENTERJSAMP]
+
+ packsswb mm2, mm3 ; mm2=(02 12 22 32 04 14 24 34)
+ packsswb mm4, mm5 ; mm4=(03 13 23 33 05 15 25 35)
+
+ paddb mm0, mm6
+ paddb mm1, mm6
+ paddb mm2, mm6
+ paddb mm4, mm6
+
+ movq mm7, mm0 ; transpose coefficients(phase 1)
+ punpcklbw mm0, mm1 ; mm0=(00 01 10 11 20 21 30 31)
+ punpckhbw mm7, mm1 ; mm7=(06 07 16 17 26 27 36 37)
+ movq mm3, mm2 ; transpose coefficients(phase 1)
+ punpcklbw mm2, mm4 ; mm2=(02 03 12 13 22 23 32 33)
+ punpckhbw mm3, mm4 ; mm3=(04 05 14 15 24 25 34 35)
+
+ movq mm5, mm0 ; transpose coefficients(phase 2)
+ punpcklwd mm0, mm2 ; mm0=(00 01 02 03 10 11 12 13)
+ punpckhwd mm5, mm2 ; mm5=(20 21 22 23 30 31 32 33)
+ movq mm6, mm3 ; transpose coefficients(phase 2)
+ punpcklwd mm3, mm7 ; mm3=(04 05 06 07 14 15 16 17)
+ punpckhwd mm6, mm7 ; mm6=(24 25 26 27 34 35 36 37)
+
+ movq mm1, mm0 ; transpose coefficients(phase 3)
+ punpckldq mm0, mm3 ; mm0=(00 01 02 03 04 05 06 07)
+ punpckhdq mm1, mm3 ; mm1=(10 11 12 13 14 15 16 17)
+ movq mm4, mm5 ; transpose coefficients(phase 3)
+ punpckldq mm5, mm6 ; mm5=(20 21 22 23 24 25 26 27)
+ punpckhdq mm4, mm6 ; mm4=(30 31 32 33 34 35 36 37)
+
+ pushpic ebx ; save GOT address
+
+ mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+ mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+ movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm0
+ movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1
+ mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+ mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+ movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5
+ movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm4
+
+ poppic ebx ; restore GOT address
+
+ add esi, byte 4*SIZEOF_FAST_FLOAT ; wsptr
+ add edi, byte 4*SIZEOF_JSAMPROW
+ dec ecx ; ctr
+ jnz near .rowloop
+
+ emms ; empty MMX state
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jidctflt-sse2.asm b/media/libjpeg/simd/i386/jidctflt-sse2.asm
new file mode 100644
index 0000000000..c646eaef76
--- /dev/null
+++ b/media/libjpeg/simd/i386/jidctflt-sse2.asm
@@ -0,0 +1,497 @@
+;
+; jidctflt.asm - floating-point IDCT (SSE & SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and can
+; *not* be assembled with Microsoft's MASM or any MASM-compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the inverse DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jidctflt.c; see that file for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
+ shufps %1, %2, 0x44
+%endmacro
+
+%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
+ shufps %1, %2, 0xEE
+%endmacro
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_idct_float_sse2)
+
+EXTN(jconst_idct_float_sse2):
+
+PD_1_414 times 4 dd 1.414213562373095048801689
+PD_1_847 times 4 dd 1.847759065022573512256366
+PD_1_082 times 4 dd 1.082392200292393968799446
+PD_M2_613 times 4 dd -2.613125929752753055713286
+PD_RNDINT_MAGIC times 4 dd 100663296.0 ; (float)(0x00C00000 << 3)
+PB_CENTERJSAMP times 16 db CENTERJSAMPLE
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_float_sse2(void *dct_table, JCOEFPTR coef_block,
+; JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b) (b) + 8 ; void *dct_table
+%define coef_block(b) (b) + 12 ; JCOEFPTR coef_block
+%define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf
+%define output_col(b) (b) + 20 ; JDIMENSION output_col
+
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+ ; xmmword wk[WK_NUM]
+%define WK_NUM 2
+%define workspace wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT
+ ; FAST_FLOAT workspace[DCTSIZE2]
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_idct_float_sse2)
+
+EXTN(jsimd_idct_float_sse2):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [workspace]
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process columns from input, store into work array.
+
+; mov eax, [original_ebp]
+ mov edx, POINTER [dct_table(eax)] ; quantptr
+ mov esi, JCOEFPTR [coef_block(eax)] ; inptr
+ lea edi, [workspace] ; FAST_FLOAT *wsptr
+ mov ecx, DCTSIZE/4 ; ctr
+ alignx 16, 7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
+ mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ jnz near .columnDCT
+
+ movq xmm1, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movq xmm2, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ movq xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ movq xmm4, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ movq xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ movq xmm6, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+ movq xmm7, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ por xmm1, xmm2
+ por xmm3, xmm4
+ por xmm5, xmm6
+ por xmm1, xmm3
+ por xmm5, xmm7
+ por xmm1, xmm5
+ packsswb xmm1, xmm1
+ movd eax, xmm1
+ test eax, eax
+ jnz short .columnDCT
+
+ ; -- AC terms all zero
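+ ; (Quantized blocks are frequently sparse: when every AC coefficient
+ ; in these four columns is zero, each output column is just its
+ ; dequantized DC term broadcast to all eight rows, so the full 1-D
+ ; IDCT can be skipped.)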
+
+ movq xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+
+ punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
+ psrad xmm0, (DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)
+ cvtdq2ps xmm0, xmm0 ; xmm0=in0=(00 01 02 03)
+
+ mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+ movaps xmm1, xmm0
+ movaps xmm2, xmm0
+ movaps xmm3, xmm0
+
+ shufps xmm0, xmm0, 0x00 ; xmm0=(00 00 00 00)
+ shufps xmm1, xmm1, 0x55 ; xmm1=(01 01 01 01)
+ shufps xmm2, xmm2, 0xAA ; xmm2=(02 02 02 02)
+ shufps xmm3, xmm3, 0xFF ; xmm3=(03 03 03 03)
+
+ movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
+ movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
+ movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1
+ movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
+ movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2
+ movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2
+ movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
+ movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
+ jmp near .nextcolumn
+ alignx 16, 7
+%endif
+.columnDCT:
+
+ ; -- Even part
+
+ movq xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ movq xmm1, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ movq xmm2, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ movq xmm3, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+ punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
+ punpcklwd xmm1, xmm1 ; xmm1=(20 20 21 21 22 22 23 23)
+ psrad xmm0, (DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)
+ psrad xmm1, (DWORD_BIT-WORD_BIT) ; xmm1=in2=(20 21 22 23)
+ cvtdq2ps xmm0, xmm0 ; xmm0=in0=(00 01 02 03)
+ cvtdq2ps xmm1, xmm1 ; xmm1=in2=(20 21 22 23)
+
+ punpcklwd xmm2, xmm2 ; xmm2=(40 40 41 41 42 42 43 43)
+ punpcklwd xmm3, xmm3 ; xmm3=(60 60 61 61 62 62 63 63)
+ psrad xmm2, (DWORD_BIT-WORD_BIT) ; xmm2=in4=(40 41 42 43)
+ psrad xmm3, (DWORD_BIT-WORD_BIT) ; xmm3=in6=(60 61 62 63)
+ cvtdq2ps xmm2, xmm2 ; xmm2=in4=(40 41 42 43)
+ cvtdq2ps xmm3, xmm3 ; xmm3=in6=(60 61 62 63)
+
+ mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+ mulps xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+ mulps xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+ mulps xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+ movaps xmm4, xmm0
+ movaps xmm5, xmm1
+ subps xmm0, xmm2 ; xmm0=tmp11
+ subps xmm1, xmm3
+ addps xmm4, xmm2 ; xmm4=tmp10
+ addps xmm5, xmm3 ; xmm5=tmp13
+
+ mulps xmm1, [GOTOFF(ebx,PD_1_414)]
+ subps xmm1, xmm5 ; xmm1=tmp12
+
+ movaps xmm6, xmm4
+ movaps xmm7, xmm0
+ subps xmm4, xmm5 ; xmm4=tmp3
+ subps xmm0, xmm1 ; xmm0=tmp2
+ addps xmm6, xmm5 ; xmm6=tmp0
+ addps xmm7, xmm1 ; xmm7=tmp1
+
+ movaps XMMWORD [wk(1)], xmm4 ; tmp3
+ movaps XMMWORD [wk(0)], xmm0 ; tmp2
+
+ ; -- Odd part
+
+ movq xmm2, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movq xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ movq xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ movq xmm1, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+ punpcklwd xmm2, xmm2 ; xmm2=(10 10 11 11 12 12 13 13)
+ punpcklwd xmm3, xmm3 ; xmm3=(30 30 31 31 32 32 33 33)
+ psrad xmm2, (DWORD_BIT-WORD_BIT) ; xmm2=in1=(10 11 12 13)
+ psrad xmm3, (DWORD_BIT-WORD_BIT) ; xmm3=in3=(30 31 32 33)
+ cvtdq2ps xmm2, xmm2 ; xmm2=in1=(10 11 12 13)
+ cvtdq2ps xmm3, xmm3 ; xmm3=in3=(30 31 32 33)
+
+ punpcklwd xmm5, xmm5 ; xmm5=(50 50 51 51 52 52 53 53)
+ punpcklwd xmm1, xmm1 ; xmm1=(70 70 71 71 72 72 73 73)
+ psrad xmm5, (DWORD_BIT-WORD_BIT) ; xmm5=in5=(50 51 52 53)
+ psrad xmm1, (DWORD_BIT-WORD_BIT) ; xmm1=in7=(70 71 72 73)
+ cvtdq2ps xmm5, xmm5 ; xmm5=in5=(50 51 52 53)
+ cvtdq2ps xmm1, xmm1 ; xmm1=in7=(70 71 72 73)
+
+ mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+ mulps xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+ mulps xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+ mulps xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+ movaps xmm4, xmm2
+ movaps xmm0, xmm5
+ addps xmm2, xmm1 ; xmm2=z11
+ addps xmm5, xmm3 ; xmm5=z13
+ subps xmm4, xmm1 ; xmm4=z12
+ subps xmm0, xmm3 ; xmm0=z10
+
+ movaps xmm1, xmm2
+ subps xmm2, xmm5
+ addps xmm1, xmm5 ; xmm1=tmp7
+
+ mulps xmm2, [GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11
+
+ movaps xmm3, xmm0
+ addps xmm0, xmm4
+ mulps xmm0, [GOTOFF(ebx,PD_1_847)] ; xmm0=z5
+ mulps xmm3, [GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930)
+ mulps xmm4, [GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200)
+ addps xmm3, xmm0 ; xmm3=tmp12
+ subps xmm4, xmm0 ; xmm4=tmp10
+
+ ; -- Final output stage
+
+ subps xmm3, xmm1 ; xmm3=tmp6
+ movaps xmm5, xmm6
+ movaps xmm0, xmm7
+ addps xmm6, xmm1 ; xmm6=data0=(00 01 02 03)
+ addps xmm7, xmm3 ; xmm7=data1=(10 11 12 13)
+ subps xmm5, xmm1 ; xmm5=data7=(70 71 72 73)
+ subps xmm0, xmm3 ; xmm0=data6=(60 61 62 63)
+ subps xmm2, xmm3 ; xmm2=tmp5
+
+ movaps xmm1, xmm6 ; transpose coefficients(phase 1)
+ unpcklps xmm6, xmm7 ; xmm6=(00 10 01 11)
+ unpckhps xmm1, xmm7 ; xmm1=(02 12 03 13)
+ movaps xmm3, xmm0 ; transpose coefficients(phase 1)
+ unpcklps xmm0, xmm5 ; xmm0=(60 70 61 71)
+ unpckhps xmm3, xmm5 ; xmm3=(62 72 63 73)
+
+ movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2
+ movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3
+
+ movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71)
+ movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73)
+
+ addps xmm4, xmm2 ; xmm4=tmp4
+ movaps xmm0, xmm7
+ movaps xmm3, xmm5
+ addps xmm7, xmm2 ; xmm7=data2=(20 21 22 23)
+ addps xmm5, xmm4 ; xmm5=data4=(40 41 42 43)
+ subps xmm0, xmm2 ; xmm0=data5=(50 51 52 53)
+ subps xmm3, xmm4 ; xmm3=data3=(30 31 32 33)
+
+ movaps xmm2, xmm7 ; transpose coefficients(phase 1)
+ unpcklps xmm7, xmm3 ; xmm7=(20 30 21 31)
+ unpckhps xmm2, xmm3 ; xmm2=(22 32 23 33)
+ movaps xmm4, xmm5 ; transpose coefficients(phase 1)
+ unpcklps xmm5, xmm0 ; xmm5=(40 50 41 51)
+ unpckhps xmm4, xmm0 ; xmm4=(42 52 43 53)
+
+ movaps xmm3, xmm6 ; transpose coefficients(phase 2)
+ unpcklps2 xmm6, xmm7 ; xmm6=(00 10 20 30)
+ unpckhps2 xmm3, xmm7 ; xmm3=(01 11 21 31)
+ movaps xmm0, xmm1 ; transpose coefficients(phase 2)
+ unpcklps2 xmm1, xmm2 ; xmm1=(02 12 22 32)
+ unpckhps2 xmm0, xmm2 ; xmm0=(03 13 23 33)
+
+ movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71)
+ movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73)
+
+ movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6
+ movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
+ movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1
+ movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0
+
+ movaps xmm6, xmm5 ; transpose coefficients(phase 2)
+ unpcklps2 xmm5, xmm7 ; xmm5=(40 50 60 70)
+ unpckhps2 xmm6, xmm7 ; xmm6=(41 51 61 71)
+ movaps xmm3, xmm4 ; transpose coefficients(phase 2)
+ unpcklps2 xmm4, xmm2 ; xmm4=(42 52 62 72)
+ unpckhps2 xmm3, xmm2 ; xmm3=(43 53 63 73)
+
+ movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5
+ movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
+ movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4
+ movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
+
+.nextcolumn:
+ add esi, byte 4*SIZEOF_JCOEF ; coef_block
+ add edx, byte 4*SIZEOF_FLOAT_MULT_TYPE ; quantptr
+ add edi, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr
+ dec ecx ; ctr
+ jnz near .columnloop
+
+ ; -- Prefetch the next coefficient block
+
+ prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
+ prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
+ prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
+ prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
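+ ; (esi has advanced 16 bytes through the column loop, so these four
+ ; 32-byte non-temporal prefetches cover exactly the 128-byte JCOEF
+ ; block following the one just consumed, without displacing hotter
+ ; cache lines.)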
+
+ ; ---- Pass 2: process rows from work array, store into output array.
+
+ mov eax, [original_ebp]
+ lea esi, [workspace] ; FAST_FLOAT *wsptr
+ mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
+ mov eax, JDIMENSION [output_col(eax)]
+ mov ecx, DCTSIZE/4 ; ctr
+ alignx 16, 7
+.rowloop:
+
+ ; -- Even part
+
+ movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
+ movaps xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
+
+ movaps xmm4, xmm0
+ movaps xmm5, xmm1
+ subps xmm0, xmm2 ; xmm0=tmp11
+ subps xmm1, xmm3
+ addps xmm4, xmm2 ; xmm4=tmp10
+ addps xmm5, xmm3 ; xmm5=tmp13
+
+ mulps xmm1, [GOTOFF(ebx,PD_1_414)]
+ subps xmm1, xmm5 ; xmm1=tmp12
+
+ movaps xmm6, xmm4
+ movaps xmm7, xmm0
+ subps xmm4, xmm5 ; xmm4=tmp3
+ subps xmm0, xmm1 ; xmm0=tmp2
+ addps xmm6, xmm5 ; xmm6=tmp0
+ addps xmm7, xmm1 ; xmm7=tmp1
+
+ movaps XMMWORD [wk(1)], xmm4 ; tmp3
+ movaps XMMWORD [wk(0)], xmm0 ; tmp2
+
+ ; -- Odd part
+
+ movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
+ movaps xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
+
+ movaps xmm4, xmm2
+ movaps xmm0, xmm5
+ addps xmm2, xmm1 ; xmm2=z11
+ addps xmm5, xmm3 ; xmm5=z13
+ subps xmm4, xmm1 ; xmm4=z12
+ subps xmm0, xmm3 ; xmm0=z10
+
+ movaps xmm1, xmm2
+ subps xmm2, xmm5
+ addps xmm1, xmm5 ; xmm1=tmp7
+
+ mulps xmm2, [GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11
+
+ movaps xmm3, xmm0
+ addps xmm0, xmm4
+ mulps xmm0, [GOTOFF(ebx,PD_1_847)] ; xmm0=z5
+ mulps xmm3, [GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930)
+ mulps xmm4, [GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200)
+ addps xmm3, xmm0 ; xmm3=tmp12
+ subps xmm4, xmm0 ; xmm4=tmp10
+
+ ; -- Final output stage
+
+ subps xmm3, xmm1 ; xmm3=tmp6
+ movaps xmm5, xmm6
+ movaps xmm0, xmm7
+ addps xmm6, xmm1 ; xmm6=data0=(00 10 20 30)
+ addps xmm7, xmm3 ; xmm7=data1=(01 11 21 31)
+ subps xmm5, xmm1 ; xmm5=data7=(07 17 27 37)
+ subps xmm0, xmm3 ; xmm0=data6=(06 16 26 36)
+ subps xmm2, xmm3 ; xmm2=tmp5
+
+ movaps xmm1, [GOTOFF(ebx,PD_RNDINT_MAGIC)] ; xmm1=[PD_RNDINT_MAGIC]
+ pcmpeqd xmm3, xmm3
+ psrld xmm3, WORD_BIT ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
+
+ addps xmm6, xmm1 ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
+ addps xmm7, xmm1 ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
+ addps xmm0, xmm1 ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
+ addps xmm5, xmm1 ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
+
+ pand xmm6, xmm3 ; xmm6=(00 -- 10 -- 20 -- 30 --)
+ pslld xmm7, WORD_BIT ; xmm7=(-- 01 -- 11 -- 21 -- 31)
+ pand xmm0, xmm3 ; xmm0=(06 -- 16 -- 26 -- 36 --)
+ pslld xmm5, WORD_BIT ; xmm5=(-- 07 -- 17 -- 27 -- 37)
+ por xmm6, xmm7 ; xmm6=(00 01 10 11 20 21 30 31)
+ por xmm0, xmm5 ; xmm0=(06 07 16 17 26 27 36 37)
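+
+ ; (PD_RNDINT_MAGIC = 1.5 * 2^26, where one float ulp is 2^3, so each
+ ; addps above rounds its data to a multiple of 8 and leaves
+ ; round(data/8) in the low mantissa word: descale-by-8 and float->int
+ ; conversion in a single instruction.  The pand/pslld/por steps then
+ ; merge the 16-bit results of two rows into one register.)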
+
+ movaps xmm1, XMMWORD [wk(0)] ; xmm1=tmp2
+ movaps xmm3, XMMWORD [wk(1)] ; xmm3=tmp3
+
+ addps xmm4, xmm2 ; xmm4=tmp4
+ movaps xmm7, xmm1
+ movaps xmm5, xmm3
+ addps xmm1, xmm2 ; xmm1=data2=(02 12 22 32)
+ addps xmm3, xmm4 ; xmm3=data4=(04 14 24 34)
+ subps xmm7, xmm2 ; xmm7=data5=(05 15 25 35)
+ subps xmm5, xmm4 ; xmm5=data3=(03 13 23 33)
+
+ movaps xmm2, [GOTOFF(ebx,PD_RNDINT_MAGIC)] ; xmm2=[PD_RNDINT_MAGIC]
+ pcmpeqd xmm4, xmm4
+ psrld xmm4, WORD_BIT ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
+
+ addps xmm3, xmm2 ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
+ addps xmm7, xmm2 ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
+ addps xmm1, xmm2 ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
+ addps xmm5, xmm2 ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
+
+ pand xmm3, xmm4 ; xmm3=(04 -- 14 -- 24 -- 34 --)
+ pslld xmm7, WORD_BIT ; xmm7=(-- 05 -- 15 -- 25 -- 35)
+ pand xmm1, xmm4 ; xmm1=(02 -- 12 -- 22 -- 32 --)
+ pslld xmm5, WORD_BIT ; xmm5=(-- 03 -- 13 -- 23 -- 33)
+ por xmm3, xmm7 ; xmm3=(04 05 14 15 24 25 34 35)
+ por xmm1, xmm5 ; xmm1=(02 03 12 13 22 23 32 33)
+
+ movdqa xmm2, [GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm2=[PB_CENTERJSAMP]
+
+ packsswb xmm6, xmm3 ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
+ packsswb xmm1, xmm0 ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
+ paddb xmm6, xmm2
+ paddb xmm1, xmm2
+
+ movdqa xmm4, xmm6 ; transpose coefficients(phase 2)
+ punpcklwd xmm6, xmm1 ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+ punpckhwd xmm4, xmm1 ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+
+ movdqa xmm7, xmm6 ; transpose coefficients(phase 3)
+ punpckldq xmm6, xmm4 ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+ punpckhdq xmm7, xmm4 ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+
+ pshufd xmm5, xmm6, 0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+ pshufd xmm3, xmm7, 0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+
+ pushpic ebx ; save GOT address
+
+ mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+ mov ebx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
+ movq XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm7
+ mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+ mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5
+ movq XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm3
+
+ poppic ebx ; restore GOT address
+
+ add esi, byte 4*SIZEOF_FAST_FLOAT ; wsptr
+ add edi, byte 4*SIZEOF_JSAMPROW
+ dec ecx ; ctr
+ jnz near .rowloop
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jidctfst-mmx.asm b/media/libjpeg/simd/i386/jidctfst-mmx.asm
new file mode 100644
index 0000000000..24622d4369
--- /dev/null
+++ b/media/libjpeg/simd/i386/jidctfst-mmx.asm
@@ -0,0 +1,499 @@
+;
+; jidctfst.asm - fast integer IDCT (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast but less accurate integer implementation of
+; the inverse DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jidctfst.c; see jidctfst.c for
+; more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 8 ; 14 is also OK.
+%define PASS1_BITS 2
+
+%if IFAST_SCALE_BITS != PASS1_BITS
+%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
+%endif
+
+%if CONST_BITS == 8
+F_1_082 equ 277 ; FIX(1.082392200)
+F_1_414 equ 362 ; FIX(1.414213562)
+F_1_847 equ 473 ; FIX(1.847759065)
+F_2_613 equ 669 ; FIX(2.613125930)
+F_1_613 equ (F_2_613 - 256) ; FIX(2.613125930) - FIX(1)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
+F_1_082 equ DESCALE(1162209775, 30 - CONST_BITS) ; FIX(1.082392200)
+F_1_414 equ DESCALE(1518500249, 30 - CONST_BITS) ; FIX(1.414213562)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065)
+F_2_613 equ DESCALE(2805822602, 30 - CONST_BITS) ; FIX(2.613125930)
+F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1)
+%endif
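+
+; (The 10-digit literals above are the same constants with 30 fractional
+; bits, e.g. 1518500249 ~= 1.414213562 * 2^30; DESCALE rounds them down
+; to CONST_BITS fractional bits.)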
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
+
+%define PRE_MULTIPLY_SCALE_BITS 2
+%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
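+
+; Worked example with CONST_BITS == 8, so CONST_SHIFT == 6: a constant c
+; is stored as FIX(c) << 6 ~= c * 2^14, inputs are pre-shifted left by
+; PRE_MULTIPLY_SCALE_BITS == 2, and pmulhw keeps bits 31:16 of the
+; product, so ((x << 2) * (c * 2^14)) >> 16 = x * c in x's own scale.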
+
+ alignz 32
+ GLOBAL_DATA(jconst_idct_ifast_mmx)
+
+EXTN(jconst_idct_ifast_mmx):
+
+PW_F1414 times 4 dw F_1_414 << CONST_SHIFT
+PW_F1847 times 4 dw F_1_847 << CONST_SHIFT
+PW_MF1613 times 4 dw -F_1_613 << CONST_SHIFT
+PW_F1082 times 4 dw F_1_082 << CONST_SHIFT
+PB_CENTERJSAMP times 8 db CENTERJSAMPLE
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_ifast_mmx(void *dct_table, JCOEFPTR coef_block,
+; JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b) (b) + 8 ; void *dct_table
+%define coef_block(b) (b) + 12 ; JCOEFPTR coef_block
+%define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf
+%define output_col(b) (b) + 20 ; JDIMENSION output_col
+
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
+ ; mmword wk[WK_NUM]
+%define WK_NUM 2
+%define workspace wk(0) - DCTSIZE2 * SIZEOF_JCOEF
+ ; JCOEF workspace[DCTSIZE2]
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_idct_ifast_mmx)
+
+EXTN(jsimd_idct_ifast_mmx):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [workspace]
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process columns from input, store into work array.
+
+; mov eax, [original_ebp]
+ mov edx, POINTER [dct_table(eax)] ; quantptr
+ mov esi, JCOEFPTR [coef_block(eax)] ; inptr
+ lea edi, [workspace] ; JCOEF *wsptr
+ mov ecx, DCTSIZE/4 ; ctr
+ alignx 16, 7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_IFAST_MMX
+ mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ jnz short .columnDCT
+
+ movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ por mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ por mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ por mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ por mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+ por mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ por mm1, mm0
+ packsswb mm1, mm1
+ movd eax, mm1
+ test eax, eax
+ jnz short .columnDCT
+
+ ; -- AC terms all zero
+
+ movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+
+ movq mm2, mm0 ; mm0=in0=(00 01 02 03)
+ punpcklwd mm0, mm0 ; mm0=(00 00 01 01)
+ punpckhwd mm2, mm2 ; mm2=(02 02 03 03)
+
+ movq mm1, mm0
+ punpckldq mm0, mm0 ; mm0=(00 00 00 00)
+ punpckhdq mm1, mm1 ; mm1=(01 01 01 01)
+ movq mm3, mm2
+ punpckldq mm2, mm2 ; mm2=(02 02 02 02)
+ punpckhdq mm3, mm3 ; mm3=(03 03 03 03)
+
+ movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
+ movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0
+ movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
+ movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm1
+ movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
+ movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
+ movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
+ movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3
+ jmp near .nextcolumn
+ alignx 16, 7
+%endif
+.columnDCT:
+
+ ; -- Even part
+
+ movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+ pmullw mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+ movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+ pmullw mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+ pmullw mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+
+ movq mm4, mm0
+ movq mm5, mm1
+ psubw mm0, mm2 ; mm0=tmp11
+ psubw mm1, mm3
+ paddw mm4, mm2 ; mm4=tmp10
+ paddw mm5, mm3 ; mm5=tmp13
+
+ psllw mm1, PRE_MULTIPLY_SCALE_BITS
+ pmulhw mm1, [GOTOFF(ebx,PW_F1414)]
+ psubw mm1, mm5 ; mm1=tmp12
+
+ movq mm6, mm4
+ movq mm7, mm0
+ psubw mm4, mm5 ; mm4=tmp3
+ psubw mm0, mm1 ; mm0=tmp2
+ paddw mm6, mm5 ; mm6=tmp0
+ paddw mm7, mm1 ; mm7=tmp1
+
+ movq MMWORD [wk(1)], mm4 ; wk(1)=tmp3
+ movq MMWORD [wk(0)], mm0 ; wk(0)=tmp2
+
+ ; -- Odd part
+
+ movq mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movq mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ pmullw mm2, MMWORD [MMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+ pmullw mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+ movq mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ movq mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ pmullw mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+ pmullw mm1, MMWORD [MMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+
+ movq mm4, mm2
+ movq mm0, mm5
+ psubw mm2, mm1 ; mm2=z12
+ psubw mm5, mm3 ; mm5=z10
+ paddw mm4, mm1 ; mm4=z11
+ paddw mm0, mm3 ; mm0=z13
+
+ movq mm1, mm5 ; mm1=z10(unscaled)
+ psllw mm2, PRE_MULTIPLY_SCALE_BITS
+ psllw mm5, PRE_MULTIPLY_SCALE_BITS
+
+ movq mm3, mm4
+ psubw mm4, mm0
+ paddw mm3, mm0 ; mm3=tmp7
+
+ psllw mm4, PRE_MULTIPLY_SCALE_BITS
+ pmulhw mm4, [GOTOFF(ebx,PW_F1414)] ; mm4=tmp11
+
+ ; To avoid overflow...
+ ;
+ ; (Original)
+ ; tmp12 = -2.613125930 * z10 + z5;
+ ;
+ ; (This implementation)
+ ; tmp12 = (-1.613125930 - 1) * z10 + z5;
+ ; = -1.613125930 * z10 - z10 + z5;
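+ ;
+ ; Concretely, with CONST_BITS == 8 and CONST_SHIFT == 6:
+ ; FIX(2.613125930) << 6 = 669 << 6 = 42816 does not fit in a signed
+ ; 16-bit word for pmulhw, while FIX(1.613125930) << 6 = 413 << 6 =
+ ; 26432 does.  The remaining "- z10" term is applied exactly by
+ ; subtracting the unscaled copy of z10 kept in mm1.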
+
+ movq mm0, mm5
+ paddw mm5, mm2
+ pmulhw mm5, [GOTOFF(ebx,PW_F1847)] ; mm5=z5
+ pmulhw mm0, [GOTOFF(ebx,PW_MF1613)]
+ pmulhw mm2, [GOTOFF(ebx,PW_F1082)]
+ psubw mm0, mm1
+ psubw mm2, mm5 ; mm2=tmp10
+ paddw mm0, mm5 ; mm0=tmp12
+
+ ; -- Final output stage
+
+ psubw mm0, mm3 ; mm0=tmp6
+ movq mm1, mm6
+ movq mm5, mm7
+ paddw mm6, mm3 ; mm6=data0=(00 01 02 03)
+ paddw mm7, mm0 ; mm7=data1=(10 11 12 13)
+ psubw mm1, mm3 ; mm1=data7=(70 71 72 73)
+ psubw mm5, mm0 ; mm5=data6=(60 61 62 63)
+ psubw mm4, mm0 ; mm4=tmp5
+
+ movq mm3, mm6 ; transpose coefficients(phase 1)
+ punpcklwd mm6, mm7 ; mm6=(00 10 01 11)
+ punpckhwd mm3, mm7 ; mm3=(02 12 03 13)
+ movq mm0, mm5 ; transpose coefficients(phase 1)
+ punpcklwd mm5, mm1 ; mm5=(60 70 61 71)
+ punpckhwd mm0, mm1 ; mm0=(62 72 63 73)
+
+ movq mm7, MMWORD [wk(0)] ; mm7=tmp2
+ movq mm1, MMWORD [wk(1)] ; mm1=tmp3
+
+ movq MMWORD [wk(0)], mm5 ; wk(0)=(60 70 61 71)
+ movq MMWORD [wk(1)], mm0 ; wk(1)=(62 72 63 73)
+
+ paddw mm2, mm4 ; mm2=tmp4
+ movq mm5, mm7
+ movq mm0, mm1
+ paddw mm7, mm4 ; mm7=data2=(20 21 22 23)
+ paddw mm1, mm2 ; mm1=data4=(40 41 42 43)
+ psubw mm5, mm4 ; mm5=data5=(50 51 52 53)
+ psubw mm0, mm2 ; mm0=data3=(30 31 32 33)
+
+ movq mm4, mm7 ; transpose coefficients(phase 1)
+ punpcklwd mm7, mm0 ; mm7=(20 30 21 31)
+ punpckhwd mm4, mm0 ; mm4=(22 32 23 33)
+ movq mm2, mm1 ; transpose coefficients(phase 1)
+ punpcklwd mm1, mm5 ; mm1=(40 50 41 51)
+ punpckhwd mm2, mm5 ; mm2=(42 52 43 53)
+
+ movq mm0, mm6 ; transpose coefficients(phase 2)
+ punpckldq mm6, mm7 ; mm6=(00 10 20 30)
+ punpckhdq mm0, mm7 ; mm0=(01 11 21 31)
+ movq mm5, mm3 ; transpose coefficients(phase 2)
+ punpckldq mm3, mm4 ; mm3=(02 12 22 32)
+ punpckhdq mm5, mm4 ; mm5=(03 13 23 33)
+
+ movq mm7, MMWORD [wk(0)] ; mm7=(60 70 61 71)
+ movq mm4, MMWORD [wk(1)] ; mm4=(62 72 63 73)
+
+ movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm6
+ movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0
+ movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm3
+ movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm5
+
+ movq mm6, mm1 ; transpose coefficients(phase 2)
+ punpckldq mm1, mm7 ; mm1=(40 50 60 70)
+ punpckhdq mm6, mm7 ; mm6=(41 51 61 71)
+ movq mm0, mm2 ; transpose coefficients(phase 2)
+ punpckldq mm2, mm4 ; mm2=(42 52 62 72)
+ punpckhdq mm0, mm4 ; mm0=(43 53 63 73)
+
+ movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1
+ movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm6
+ movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
+ movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm0
+
+.nextcolumn:
+ add esi, byte 4*SIZEOF_JCOEF ; coef_block
+ add edx, byte 4*SIZEOF_IFAST_MULT_TYPE ; quantptr
+ add edi, byte 4*DCTSIZE*SIZEOF_JCOEF ; wsptr
+ dec ecx ; ctr
+ jnz near .columnloop
+
+ ; ---- Pass 2: process rows from work array, store into output array.
+
+ mov eax, [original_ebp]
+ lea esi, [workspace] ; JCOEF *wsptr
+ mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
+ mov eax, JDIMENSION [output_col(eax)]
+ mov ecx, DCTSIZE/4 ; ctr
+ alignx 16, 7
+.rowloop:
+
+ ; -- Even part
+
+ movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+ movq mm4, mm0
+ movq mm5, mm1
+ psubw mm0, mm2 ; mm0=tmp11
+ psubw mm1, mm3
+ paddw mm4, mm2 ; mm4=tmp10
+ paddw mm5, mm3 ; mm5=tmp13
+
+ psllw mm1, PRE_MULTIPLY_SCALE_BITS
+ pmulhw mm1, [GOTOFF(ebx,PW_F1414)]
+ psubw mm1, mm5 ; mm1=tmp12
+
+ movq mm6, mm4
+ movq mm7, mm0
+ psubw mm4, mm5 ; mm4=tmp3
+ psubw mm0, mm1 ; mm0=tmp2
+ paddw mm6, mm5 ; mm6=tmp0
+ paddw mm7, mm1 ; mm7=tmp1
+
+ movq MMWORD [wk(1)], mm4 ; wk(1)=tmp3
+ movq MMWORD [wk(0)], mm0 ; wk(0)=tmp2
+
+ ; -- Odd part
+
+ movq mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movq mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ movq mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ movq mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+ movq mm4, mm2
+ movq mm0, mm5
+ psubw mm2, mm1 ; mm2=z12
+ psubw mm5, mm3 ; mm5=z10
+ paddw mm4, mm1 ; mm4=z11
+ paddw mm0, mm3 ; mm0=z13
+
+ movq mm1, mm5 ; mm1=z10(unscaled)
+ psllw mm2, PRE_MULTIPLY_SCALE_BITS
+ psllw mm5, PRE_MULTIPLY_SCALE_BITS
+
+ movq mm3, mm4
+ psubw mm4, mm0
+ paddw mm3, mm0 ; mm3=tmp7
+
+ psllw mm4, PRE_MULTIPLY_SCALE_BITS
+ pmulhw mm4, [GOTOFF(ebx,PW_F1414)] ; mm4=tmp11
+
+ ; To avoid overflow...
+ ;
+ ; (Original)
+ ; tmp12 = -2.613125930 * z10 + z5;
+ ;
+ ; (This implementation)
+ ; tmp12 = (-1.613125930 - 1) * z10 + z5;
+ ; = -1.613125930 * z10 - z10 + z5;
+
+ movq mm0, mm5
+ paddw mm5, mm2
+ pmulhw mm5, [GOTOFF(ebx,PW_F1847)] ; mm5=z5
+ pmulhw mm0, [GOTOFF(ebx,PW_MF1613)]
+ pmulhw mm2, [GOTOFF(ebx,PW_F1082)]
+ psubw mm0, mm1
+ psubw mm2, mm5 ; mm2=tmp10
+ paddw mm0, mm5 ; mm0=tmp12
+
+ ; -- Final output stage
+
+ psubw mm0, mm3 ; mm0=tmp6
+ movq mm1, mm6
+ movq mm5, mm7
+ paddw mm6, mm3 ; mm6=data0=(00 10 20 30)
+ paddw mm7, mm0 ; mm7=data1=(01 11 21 31)
+ psraw mm6, (PASS1_BITS+3) ; descale
+ psraw mm7, (PASS1_BITS+3) ; descale
+ psubw mm1, mm3 ; mm1=data7=(07 17 27 37)
+ psubw mm5, mm0 ; mm5=data6=(06 16 26 36)
+ psraw mm1, (PASS1_BITS+3) ; descale
+ psraw mm5, (PASS1_BITS+3) ; descale
+ psubw mm4, mm0 ; mm4=tmp5
+
+ packsswb mm6, mm5 ; mm6=(00 10 20 30 06 16 26 36)
+ packsswb mm7, mm1 ; mm7=(01 11 21 31 07 17 27 37)
+
+ movq mm3, MMWORD [wk(0)] ; mm3=tmp2
+ movq mm0, MMWORD [wk(1)] ; mm0=tmp3
+
+ paddw mm2, mm4 ; mm2=tmp4
+ movq mm5, mm3
+ movq mm1, mm0
+ paddw mm3, mm4 ; mm3=data2=(02 12 22 32)
+ paddw mm0, mm2 ; mm0=data4=(04 14 24 34)
+ psraw mm3, (PASS1_BITS+3) ; descale
+ psraw mm0, (PASS1_BITS+3) ; descale
+ psubw mm5, mm4 ; mm5=data5=(05 15 25 35)
+ psubw mm1, mm2 ; mm1=data3=(03 13 23 33)
+ psraw mm5, (PASS1_BITS+3) ; descale
+ psraw mm1, (PASS1_BITS+3) ; descale
+
+ movq mm4, [GOTOFF(ebx,PB_CENTERJSAMP)] ; mm4=[PB_CENTERJSAMP]
+
+ packsswb mm3, mm0 ; mm3=(02 12 22 32 04 14 24 34)
+ packsswb mm1, mm5 ; mm1=(03 13 23 33 05 15 25 35)
+
+ paddb mm6, mm4
+ paddb mm7, mm4
+ paddb mm3, mm4
+ paddb mm1, mm4
+
+ movq mm2, mm6 ; transpose coefficients(phase 1)
+ punpcklbw mm6, mm7 ; mm6=(00 01 10 11 20 21 30 31)
+ punpckhbw mm2, mm7 ; mm2=(06 07 16 17 26 27 36 37)
+ movq mm0, mm3 ; transpose coefficients(phase 1)
+ punpcklbw mm3, mm1 ; mm3=(02 03 12 13 22 23 32 33)
+ punpckhbw mm0, mm1 ; mm0=(04 05 14 15 24 25 34 35)
+
+ movq mm5, mm6 ; transpose coefficients(phase 2)
+ punpcklwd mm6, mm3 ; mm6=(00 01 02 03 10 11 12 13)
+ punpckhwd mm5, mm3 ; mm5=(20 21 22 23 30 31 32 33)
+ movq mm4, mm0 ; transpose coefficients(phase 2)
+ punpcklwd mm0, mm2 ; mm0=(04 05 06 07 14 15 16 17)
+ punpckhwd mm4, mm2 ; mm4=(24 25 26 27 34 35 36 37)
+
+ movq mm7, mm6 ; transpose coefficients(phase 3)
+ punpckldq mm6, mm0 ; mm6=(00 01 02 03 04 05 06 07)
+ punpckhdq mm7, mm0 ; mm7=(10 11 12 13 14 15 16 17)
+ movq mm1, mm5 ; transpose coefficients(phase 3)
+ punpckldq mm5, mm4 ; mm5=(20 21 22 23 24 25 26 27)
+ punpckhdq mm1, mm4 ; mm1=(30 31 32 33 34 35 36 37)
+
+ pushpic ebx ; save GOT address
+
+ mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+ mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+ movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6
+ movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7
+ mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+ mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+ movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5
+ movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1
+
+ poppic ebx ; restore GOT address
+
+ add esi, byte 4*SIZEOF_JCOEF ; wsptr
+ add edi, byte 4*SIZEOF_JSAMPROW
+ dec ecx ; ctr
+ jnz near .rowloop
+
+ emms ; empty MMX state
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jidctfst-sse2.asm b/media/libjpeg/simd/i386/jidctfst-sse2.asm
new file mode 100644
index 0000000000..19704ffa48
--- /dev/null
+++ b/media/libjpeg/simd/i386/jidctfst-sse2.asm
@@ -0,0 +1,501 @@
+;
+; jidctfst.asm - fast integer IDCT (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast but less accurate integer implementation of
+; the inverse DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jidctfst.c; see jidctfst.c for
+; more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 8 ; 14 is also OK.
+%define PASS1_BITS 2
+
+%if IFAST_SCALE_BITS != PASS1_BITS
+%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
+%endif
+
+%if CONST_BITS == 8
+F_1_082 equ 277 ; FIX(1.082392200)
+F_1_414 equ 362 ; FIX(1.414213562)
+F_1_847 equ 473 ; FIX(1.847759065)
+F_2_613 equ 669 ; FIX(2.613125930)
+F_1_613 equ (F_2_613 - 256) ; FIX(2.613125930) - FIX(1)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
+F_1_082 equ DESCALE(1162209775, 30 - CONST_BITS) ; FIX(1.082392200)
+F_1_414 equ DESCALE(1518500249, 30 - CONST_BITS) ; FIX(1.414213562)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065)
+F_2_613 equ DESCALE(2805822602, 30 - CONST_BITS) ; FIX(2.613125930)
+F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1)
+%endif
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
+
+%define PRE_MULTIPLY_SCALE_BITS 2
+%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+
+ alignz 32
+ GLOBAL_DATA(jconst_idct_ifast_sse2)
+
+EXTN(jconst_idct_ifast_sse2):
+
+PW_F1414 times 8 dw F_1_414 << CONST_SHIFT
+PW_F1847 times 8 dw F_1_847 << CONST_SHIFT
+PW_MF1613 times 8 dw -F_1_613 << CONST_SHIFT
+PW_F1082 times 8 dw F_1_082 << CONST_SHIFT
+PB_CENTERJSAMP times 16 db CENTERJSAMPLE
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_ifast_sse2(void *dct_table, JCOEFPTR coef_block,
+; JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b) (b) + 8 ; void *dct_table
+%define coef_block(b) (b) + 12 ; JCOEFPTR coef_block
+%define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf
+%define output_col(b) (b) + 20 ; JDIMENSION output_col
+
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+ ; xmmword wk[WK_NUM]
+%define WK_NUM 2
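+
+; Note: unlike the MMX implementation, this version needs no DCTSIZE2
+; workspace; after pass 1 all eight columns remain in xmm registers
+; (with col1/col3 parked in the two wk slots), and pass 2 consumes
+; them directly.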
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_idct_ifast_sse2)
+
+EXTN(jsimd_idct_ifast_sse2):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)]
+ pushpic ebx
+; push ecx ; unused
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process columns from input.
+
+; mov eax, [original_ebp]
+ mov edx, POINTER [dct_table(eax)] ; quantptr
+ mov esi, JCOEFPTR [coef_block(eax)] ; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2
+ mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ jnz near .columnDCT
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ por xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ por xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ por xmm1, xmm0
+ packsswb xmm1, xmm1
+ packsswb xmm1, xmm1
+ movd eax, xmm1
+ test eax, eax
+ jnz short .columnDCT
+
+ ; -- AC terms all zero
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+
+ movdqa xmm7, xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07)
+ punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
+ punpckhwd xmm7, xmm7 ; xmm7=(04 04 05 05 06 06 07 07)
+
+ pshufd xmm6, xmm0, 0x00 ; xmm6=col0=(00 00 00 00 00 00 00 00)
+ pshufd xmm2, xmm0, 0x55 ; xmm2=col1=(01 01 01 01 01 01 01 01)
+ pshufd xmm5, xmm0, 0xAA ; xmm5=col2=(02 02 02 02 02 02 02 02)
+ pshufd xmm0, xmm0, 0xFF ; xmm0=col3=(03 03 03 03 03 03 03 03)
+ pshufd xmm1, xmm7, 0x00 ; xmm1=col4=(04 04 04 04 04 04 04 04)
+ pshufd xmm4, xmm7, 0x55 ; xmm4=col5=(05 05 05 05 05 05 05 05)
+ pshufd xmm3, xmm7, 0xAA ; xmm3=col6=(06 06 06 06 06 06 06 06)
+ pshufd xmm7, xmm7, 0xFF ; xmm7=col7=(07 07 07 07 07 07 07 07)
+
+ movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=col1
+ movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=col3
+ jmp near .column_end
+ alignx 16, 7
+%endif
+.columnDCT:
+
+ ; -- Even part
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+ pmullw xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+ movdqa xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+ pmullw xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+
+ movdqa xmm4, xmm0
+ movdqa xmm5, xmm1
+ psubw xmm0, xmm2 ; xmm0=tmp11
+ psubw xmm1, xmm3
+ paddw xmm4, xmm2 ; xmm4=tmp10
+ paddw xmm5, xmm3 ; xmm5=tmp13
+
+ psllw xmm1, PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm1, [GOTOFF(ebx,PW_F1414)]
+ psubw xmm1, xmm5 ; xmm1=tmp12
+
+ movdqa xmm6, xmm4
+ movdqa xmm7, xmm0
+ psubw xmm4, xmm5 ; xmm4=tmp3
+ psubw xmm0, xmm1 ; xmm0=tmp2
+ paddw xmm6, xmm5 ; xmm6=tmp0
+ paddw xmm7, xmm1 ; xmm7=tmp1
+
+ movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=tmp3
+ movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=tmp2
+
+ ; -- Odd part
+
+ movdqa xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+ pmullw xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+ movdqa xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+ pmullw xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+
+ movdqa xmm4, xmm2
+ movdqa xmm0, xmm5
+ psubw xmm2, xmm1 ; xmm2=z12
+ psubw xmm5, xmm3 ; xmm5=z10
+ paddw xmm4, xmm1 ; xmm4=z11
+ paddw xmm0, xmm3 ; xmm0=z13
+
+ movdqa xmm1, xmm5 ; xmm1=z10(unscaled)
+ psllw xmm2, PRE_MULTIPLY_SCALE_BITS
+ psllw xmm5, PRE_MULTIPLY_SCALE_BITS
+
+ movdqa xmm3, xmm4
+ psubw xmm4, xmm0
+ paddw xmm3, xmm0 ; xmm3=tmp7
+
+ psllw xmm4, PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm4, [GOTOFF(ebx,PW_F1414)] ; xmm4=tmp11
+
+ ; To avoid overflow...
+ ;
+ ; (Original)
+ ; tmp12 = -2.613125930 * z10 + z5;
+ ;
+ ; (This implementation)
+ ; tmp12 = (-1.613125930 - 1) * z10 + z5;
+ ; = -1.613125930 * z10 - z10 + z5;
+
+ movdqa xmm0, xmm5
+ paddw xmm5, xmm2
+ pmulhw xmm5, [GOTOFF(ebx,PW_F1847)] ; xmm5=z5
+ pmulhw xmm0, [GOTOFF(ebx,PW_MF1613)]
+ pmulhw xmm2, [GOTOFF(ebx,PW_F1082)]
+ psubw xmm0, xmm1
+ psubw xmm2, xmm5 ; xmm2=tmp10
+ paddw xmm0, xmm5 ; xmm0=tmp12
+
+ ; -- Final output stage
+
+ psubw xmm0, xmm3 ; xmm0=tmp6
+ movdqa xmm1, xmm6
+ movdqa xmm5, xmm7
+ paddw xmm6, xmm3 ; xmm6=data0=(00 01 02 03 04 05 06 07)
+ paddw xmm7, xmm0 ; xmm7=data1=(10 11 12 13 14 15 16 17)
+ psubw xmm1, xmm3 ; xmm1=data7=(70 71 72 73 74 75 76 77)
+ psubw xmm5, xmm0 ; xmm5=data6=(60 61 62 63 64 65 66 67)
+ psubw xmm4, xmm0 ; xmm4=tmp5
+
+ movdqa xmm3, xmm6 ; transpose coefficients(phase 1)
+ punpcklwd xmm6, xmm7 ; xmm6=(00 10 01 11 02 12 03 13)
+ punpckhwd xmm3, xmm7 ; xmm3=(04 14 05 15 06 16 07 17)
+ movdqa xmm0, xmm5 ; transpose coefficients(phase 1)
+ punpcklwd xmm5, xmm1 ; xmm5=(60 70 61 71 62 72 63 73)
+ punpckhwd xmm0, xmm1 ; xmm0=(64 74 65 75 66 76 67 77)
+
+ movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp2
+ movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp3
+
+ movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(60 70 61 71 62 72 63 73)
+ movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(64 74 65 75 66 76 67 77)
+
+ paddw xmm2, xmm4 ; xmm2=tmp4
+ movdqa xmm5, xmm7
+ movdqa xmm0, xmm1
+ paddw xmm7, xmm4 ; xmm7=data2=(20 21 22 23 24 25 26 27)
+ paddw xmm1, xmm2 ; xmm1=data4=(40 41 42 43 44 45 46 47)
+ psubw xmm5, xmm4 ; xmm5=data5=(50 51 52 53 54 55 56 57)
+ psubw xmm0, xmm2 ; xmm0=data3=(30 31 32 33 34 35 36 37)
+
+ movdqa xmm4, xmm7 ; transpose coefficients(phase 1)
+ punpcklwd xmm7, xmm0 ; xmm7=(20 30 21 31 22 32 23 33)
+ punpckhwd xmm4, xmm0 ; xmm4=(24 34 25 35 26 36 27 37)
+ movdqa xmm2, xmm1 ; transpose coefficients(phase 1)
+ punpcklwd xmm1, xmm5 ; xmm1=(40 50 41 51 42 52 43 53)
+ punpckhwd xmm2, xmm5 ; xmm2=(44 54 45 55 46 56 47 57)
+
+ movdqa xmm0, xmm3 ; transpose coefficients(phase 2)
+ punpckldq xmm3, xmm4 ; xmm3=(04 14 24 34 05 15 25 35)
+ punpckhdq xmm0, xmm4 ; xmm0=(06 16 26 36 07 17 27 37)
+ movdqa xmm5, xmm6 ; transpose coefficients(phase 2)
+ punpckldq xmm6, xmm7 ; xmm6=(00 10 20 30 01 11 21 31)
+ punpckhdq xmm5, xmm7 ; xmm5=(02 12 22 32 03 13 23 33)
+
+ movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(60 70 61 71 62 72 63 73)
+ movdqa xmm7, XMMWORD [wk(1)] ; xmm7=(64 74 65 75 66 76 67 77)
+
+ movdqa XMMWORD [wk(0)], xmm3 ; wk(0)=(04 14 24 34 05 15 25 35)
+ movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(06 16 26 36 07 17 27 37)
+
+ movdqa xmm3, xmm1 ; transpose coefficients(phase 2)
+ punpckldq xmm1, xmm4 ; xmm1=(40 50 60 70 41 51 61 71)
+ punpckhdq xmm3, xmm4 ; xmm3=(42 52 62 72 43 53 63 73)
+ movdqa xmm0, xmm2 ; transpose coefficients(phase 2)
+ punpckldq xmm2, xmm7 ; xmm2=(44 54 64 74 45 55 65 75)
+ punpckhdq xmm0, xmm7 ; xmm0=(46 56 66 76 47 57 67 77)
+
+ movdqa xmm4, xmm6 ; transpose coefficients(phase 3)
+ punpcklqdq xmm6, xmm1 ; xmm6=col0=(00 10 20 30 40 50 60 70)
+ punpckhqdq xmm4, xmm1 ; xmm4=col1=(01 11 21 31 41 51 61 71)
+ movdqa xmm7, xmm5 ; transpose coefficients(phase 3)
+ punpcklqdq xmm5, xmm3 ; xmm5=col2=(02 12 22 32 42 52 62 72)
+ punpckhqdq xmm7, xmm3 ; xmm7=col3=(03 13 23 33 43 53 63 73)
+
+ movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(04 14 24 34 05 15 25 35)
+ movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(06 16 26 36 07 17 27 37)
+
+ movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=col1
+ movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=col3
+
+ movdqa xmm4, xmm1 ; transpose coefficients(phase 3)
+ punpcklqdq xmm1, xmm2 ; xmm1=col4=(04 14 24 34 44 54 64 74)
+ punpckhqdq xmm4, xmm2 ; xmm4=col5=(05 15 25 35 45 55 65 75)
+ movdqa xmm7, xmm3 ; transpose coefficients(phase 3)
+ punpcklqdq xmm3, xmm0 ; xmm3=col6=(06 16 26 36 46 56 66 76)
+ punpckhqdq xmm7, xmm0 ; xmm7=col7=(07 17 27 37 47 57 67 77)
+.column_end:
+
+ ; -- Prefetch the next coefficient block
+
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+ ; ---- Pass 2: process rows from work array, store into output array.
+
+ mov eax, [original_ebp]
+ mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
+ mov eax, JDIMENSION [output_col(eax)]
+
+ ; -- Even part
+
+ ; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6
+
+ movdqa xmm2, xmm6
+ movdqa xmm0, xmm5
+ psubw xmm6, xmm1 ; xmm6=tmp11
+ psubw xmm5, xmm3
+ paddw xmm2, xmm1 ; xmm2=tmp10
+ paddw xmm0, xmm3 ; xmm0=tmp13
+
+ psllw xmm5, PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm5, [GOTOFF(ebx,PW_F1414)]
+ psubw xmm5, xmm0 ; xmm5=tmp12
+
+ movdqa xmm1, xmm2
+ movdqa xmm3, xmm6
+ psubw xmm2, xmm0 ; xmm2=tmp3
+ psubw xmm6, xmm5 ; xmm6=tmp2
+ paddw xmm1, xmm0 ; xmm1=tmp0
+ paddw xmm3, xmm5 ; xmm3=tmp1
+
+ movdqa xmm0, XMMWORD [wk(0)] ; xmm0=col1
+ movdqa xmm5, XMMWORD [wk(1)] ; xmm5=col3
+
+ movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp3
+ movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp2
+
+ ; -- Odd part
+
+ ; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7
+
+ movdqa xmm2, xmm0
+ movdqa xmm6, xmm4
+ psubw xmm0, xmm7 ; xmm0=z12
+ psubw xmm4, xmm5 ; xmm4=z10
+ paddw xmm2, xmm7 ; xmm2=z11
+ paddw xmm6, xmm5 ; xmm6=z13
+
+ movdqa xmm7, xmm4 ; xmm7=z10(unscaled)
+ psllw xmm0, PRE_MULTIPLY_SCALE_BITS
+ psllw xmm4, PRE_MULTIPLY_SCALE_BITS
+
+ movdqa xmm5, xmm2
+ psubw xmm2, xmm6
+ paddw xmm5, xmm6 ; xmm5=tmp7
+
+ psllw xmm2, PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm2, [GOTOFF(ebx,PW_F1414)] ; xmm2=tmp11
+
+ ; To avoid overflow...
+ ;
+ ; (Original)
+ ; tmp12 = -2.613125930 * z10 + z5;
+ ;
+ ; (This implementation)
+ ; tmp12 = (-1.613125930 - 1) * z10 + z5;
+ ; = -1.613125930 * z10 - z10 + z5;
+
+ movdqa xmm6, xmm4
+ paddw xmm4, xmm0
+ pmulhw xmm4, [GOTOFF(ebx,PW_F1847)] ; xmm4=z5
+ pmulhw xmm6, [GOTOFF(ebx,PW_MF1613)]
+ pmulhw xmm0, [GOTOFF(ebx,PW_F1082)]
+ psubw xmm6, xmm7
+ psubw xmm0, xmm4 ; xmm0=tmp10
+ paddw xmm6, xmm4 ; xmm6=tmp12
+
+ ; -- Final output stage
+
+ psubw xmm6, xmm5 ; xmm6=tmp6
+ movdqa xmm7, xmm1
+ movdqa xmm4, xmm3
+ paddw xmm1, xmm5 ; xmm1=data0=(00 10 20 30 40 50 60 70)
+ paddw xmm3, xmm6 ; xmm3=data1=(01 11 21 31 41 51 61 71)
+ psraw xmm1, (PASS1_BITS+3) ; descale
+ psraw xmm3, (PASS1_BITS+3) ; descale
+ psubw xmm7, xmm5 ; xmm7=data7=(07 17 27 37 47 57 67 77)
+ psubw xmm4, xmm6 ; xmm4=data6=(06 16 26 36 46 56 66 76)
+ psraw xmm7, (PASS1_BITS+3) ; descale
+ psraw xmm4, (PASS1_BITS+3) ; descale
+ psubw xmm2, xmm6 ; xmm2=tmp5
+
+ packsswb xmm1, xmm4 ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+ packsswb xmm3, xmm7 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+ movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp2
+ movdqa xmm6, XMMWORD [wk(0)] ; xmm6=tmp3
+
+ paddw xmm0, xmm2 ; xmm0=tmp4
+ movdqa xmm4, xmm5
+ movdqa xmm7, xmm6
+ paddw xmm5, xmm2 ; xmm5=data2=(02 12 22 32 42 52 62 72)
+ paddw xmm6, xmm0 ; xmm6=data4=(04 14 24 34 44 54 64 74)
+ psraw xmm5, (PASS1_BITS+3) ; descale
+ psraw xmm6, (PASS1_BITS+3) ; descale
+ psubw xmm4, xmm2 ; xmm4=data5=(05 15 25 35 45 55 65 75)
+ psubw xmm7, xmm0 ; xmm7=data3=(03 13 23 33 43 53 63 73)
+ psraw xmm4, (PASS1_BITS+3) ; descale
+ psraw xmm7, (PASS1_BITS+3) ; descale
+
+ movdqa xmm2, [GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm2=[PB_CENTERJSAMP]
+
+ packsswb xmm5, xmm6 ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
+ packsswb xmm7, xmm4 ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
+
+ paddb xmm1, xmm2
+ paddb xmm3, xmm2
+ paddb xmm5, xmm2
+ paddb xmm7, xmm2
+
+ movdqa xmm0, xmm1 ; transpose coefficients(phase 1)
+ punpcklbw xmm1, xmm3 ; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
+ punpckhbw xmm0, xmm3 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
+ movdqa xmm6, xmm5 ; transpose coefficients(phase 1)
+ punpcklbw xmm5, xmm7 ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
+ punpckhbw xmm6, xmm7 ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
+
+ movdqa xmm4, xmm1 ; transpose coefficients(phase 2)
+ punpcklwd xmm1, xmm5 ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+ punpckhwd xmm4, xmm5 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
+ movdqa xmm2, xmm6 ; transpose coefficients(phase 2)
+ punpcklwd xmm6, xmm0 ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+ punpckhwd xmm2, xmm0 ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
+
+ movdqa xmm3, xmm1 ; transpose coefficients(phase 3)
+ punpckldq xmm1, xmm6 ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+ punpckhdq xmm3, xmm6 ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+ movdqa xmm7, xmm4 ; transpose coefficients(phase 3)
+ punpckldq xmm4, xmm2 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
+ punpckhdq xmm7, xmm2 ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
+
+ pshufd xmm5, xmm1, 0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+ pshufd xmm0, xmm3, 0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+ pshufd xmm6, xmm4, 0x4E ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
+ pshufd xmm2, xmm7, 0x4E ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
+
+ mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+ mov esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm1
+ movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
+ mov edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW]
+ mov esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
+ movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm7
+
+ mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+ mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5
+ movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0
+ mov edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW]
+ mov esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
+ movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm2
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; unused
+ poppic ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jidctint-avx2.asm b/media/libjpeg/simd/i386/jidctint-avx2.asm
new file mode 100644
index 0000000000..199c7df3b6
--- /dev/null
+++ b/media/libjpeg/simd/i386/jidctint-avx2.asm
@@ -0,0 +1,453 @@
+;
+; jidctint.asm - accurate integer IDCT (AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, 2018, 2020, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slower but more accurate integer implementation of the
+; inverse DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jidctint.c; see jidctint.c for
+; more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 13
+%define PASS1_BITS 2
+
+%define DESCALE_P1 (CONST_BITS - PASS1_BITS)
+%define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3)
+
+%if CONST_BITS == 13
+F_0_298 equ 2446 ; FIX(0.298631336)
+F_0_390 equ 3196 ; FIX(0.390180644)
+F_0_541 equ 4433 ; FIX(0.541196100)
+F_0_765 equ 6270 ; FIX(0.765366865)
+F_0_899 equ 7373 ; FIX(0.899976223)
+F_1_175 equ 9633 ; FIX(1.175875602)
+F_1_501 equ 12299 ; FIX(1.501321110)
+F_1_847 equ 15137 ; FIX(1.847759065)
+F_1_961 equ 16069 ; FIX(1.961570560)
+F_2_053 equ 16819 ; FIX(2.053119869)
+F_2_562 equ 20995 ; FIX(2.562915447)
+F_3_072 equ 25172 ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
+F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS) ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS) ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS) ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS) ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS) ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS) ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+; In-place 8x8x16-bit inverse matrix transpose using AVX2 instructions
+; %1-%4: Input/output registers
+; %5-%8: Temp registers
+
+%macro dotranspose 8
+ ; %5=(00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71)
+ ; %6=(03 13 23 33 43 53 63 73 02 12 22 32 42 52 62 72)
+ ; %7=(04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75)
+ ; %8=(07 17 27 37 47 57 67 77 06 16 26 36 46 56 66 76)
+
+ vpermq %5, %1, 0xD8
+ vpermq %6, %2, 0x72
+ vpermq %7, %3, 0xD8
+ vpermq %8, %4, 0x72
+ ; transpose coefficients(phase 1)
+ ; %5=(00 10 20 30 01 11 21 31 40 50 60 70 41 51 61 71)
+ ; %6=(02 12 22 32 03 13 23 33 42 52 62 72 43 53 63 73)
+ ; %7=(04 14 24 34 05 15 25 35 44 54 64 74 45 55 65 75)
+ ; %8=(06 16 26 36 07 17 27 37 46 56 66 76 47 57 67 77)
+
+ vpunpcklwd %1, %5, %6
+ vpunpckhwd %2, %5, %6
+ vpunpcklwd %3, %7, %8
+ vpunpckhwd %4, %7, %8
+ ; transpose coefficients(phase 2)
+ ; %1=(00 02 10 12 20 22 30 32 40 42 50 52 60 62 70 72)
+ ; %2=(01 03 11 13 21 23 31 33 41 43 51 53 61 63 71 73)
+ ; %3=(04 06 14 16 24 26 34 36 44 46 54 56 64 66 74 76)
+ ; %4=(05 07 15 17 25 27 35 37 45 47 55 57 65 67 75 77)
+
+ vpunpcklwd %5, %1, %2
+ vpunpcklwd %6, %3, %4
+ vpunpckhwd %7, %1, %2
+ vpunpckhwd %8, %3, %4
+ ; transpose coefficients(phase 3)
+ ; %5=(00 01 02 03 10 11 12 13 40 41 42 43 50 51 52 53)
+ ; %6=(04 05 06 07 14 15 16 17 44 45 46 47 54 55 56 57)
+ ; %7=(20 21 22 23 30 31 32 33 60 61 62 63 70 71 72 73)
+ ; %8=(24 25 26 27 34 35 36 37 64 65 66 67 74 75 76 77)
+
+ vpunpcklqdq %1, %5, %6
+ vpunpckhqdq %2, %5, %6
+ vpunpcklqdq %3, %7, %8
+ vpunpckhqdq %4, %7, %8
+ ; transpose coefficients(phase 4)
+ ; %1=(00 01 02 03 04 05 06 07 40 41 42 43 44 45 46 47)
+ ; %2=(10 11 12 13 14 15 16 17 50 51 52 53 54 55 56 57)
+ ; %3=(20 21 22 23 24 25 26 27 60 61 62 63 64 65 66 67)
+ ; %4=(30 31 32 33 34 35 36 37 70 71 72 73 74 75 76 77)
+%endmacro
+
+; --------------------------------------------------------------------------
+; In-place 8x8x16-bit accurate integer inverse DCT using AVX2 instructions
+; %1-%4: Input/output registers
+; %5-%12: Temp registers
+; %13: Pass (1 or 2)
+
+%macro dodct 13
+ ; -- Even part
+
+ ; (Original)
+ ; z1 = (z2 + z3) * 0.541196100;
+ ; tmp2 = z1 + z3 * -1.847759065;
+ ; tmp3 = z1 + z2 * 0.765366865;
+ ;
+ ; (This implementation)
+ ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+ ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
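+ ;
+ ; The word interleaves below pair a z2 coefficient with a z3
+ ; coefficient in each dword, and vpmaddwd multiplies such a pair by a
+ ; packed (c1, c2) constant pair and sums the two products, so each
+ ; c1*z2 + c2*z3 term costs one instruction per 32-bit lane.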
+
+ vperm2i128 %6, %3, %3, 0x01 ; %6=in6_2
+ vpunpcklwd %5, %3, %6 ; %5=in26_62L
+ vpunpckhwd %6, %3, %6 ; %6=in26_62H
+ vpmaddwd %5, %5, [GOTOFF(ebx,PW_F130_F054_MF130_F054)] ; %5=tmp3_2L
+ vpmaddwd %6, %6, [GOTOFF(ebx,PW_F130_F054_MF130_F054)] ; %6=tmp3_2H
+
+ vperm2i128 %7, %1, %1, 0x01 ; %7=in4_0
+ vpsignw %1, %1, [GOTOFF(ebx,PW_1_NEG1)]
+ vpaddw %7, %7, %1 ; %7=(in0+in4)_(in0-in4)
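+ ; (vperm2i128 swaps the two 128-bit lanes, putting in4 opposite in0;
+ ; vpsignw leaves the in0 lane of %1 intact and negates the in4 lane,
+ ; so the vpaddw yields the butterfly sum in one lane and the
+ ; difference in the other without a separate subtract.)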
+
+ vpxor %1, %1, %1
+ vpunpcklwd %8, %1, %7 ; %8=tmp0_1L
+ vpunpckhwd %1, %1, %7 ; %1=tmp0_1H
+ vpsrad %8, %8, (16-CONST_BITS) ; vpsrad %8,16 & vpslld %8,CONST_BITS
+ vpsrad %1, %1, (16-CONST_BITS) ; vpsrad %1,16 & vpslld %1,CONST_BITS
+
+ vpsubd %3, %8, %5
+ vmovdqu %11, %3 ; %11=tmp0_1L-tmp3_2L=tmp13_12L
+ vpaddd %3, %8, %5
+ vmovdqu %9, %3 ; %9=tmp0_1L+tmp3_2L=tmp10_11L
+ vpsubd %3, %1, %6
+ vmovdqu %12, %3 ; %12=tmp0_1H-tmp3_2H=tmp13_12H
+ vpaddd %3, %1, %6
+ vmovdqu %10, %3 ; %10=tmp0_1H+tmp3_2H=tmp10_11H
+
+ ; -- Odd part
+
+ vpaddw %1, %4, %2 ; %1=in7_5+in3_1=z3_4
+
+ ; (Original)
+ ; z5 = (z3 + z4) * 1.175875602;
+ ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ ; z3 += z5; z4 += z5;
+ ;
+ ; (This implementation)
+ ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+ vperm2i128 %8, %1, %1, 0x01 ; %8=z4_3
+ vpunpcklwd %7, %1, %8 ; %7=z34_43L
+ vpunpckhwd %8, %1, %8 ; %8=z34_43H
+ vpmaddwd %7, %7, [GOTOFF(ebx,PW_MF078_F117_F078_F117)] ; %7=z3_4L
+ vpmaddwd %8, %8, [GOTOFF(ebx,PW_MF078_F117_F078_F117)] ; %8=z3_4H
+
+ ; (Original)
+ ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
+ ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
+ ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
+ ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ ; tmp0 += z1 + z3; tmp1 += z2 + z4;
+ ; tmp2 += z2 + z3; tmp3 += z1 + z4;
+ ;
+ ; (This implementation)
+ ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+ ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+ ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+ ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+ ; tmp0 += z3; tmp1 += z4;
+ ; tmp2 += z3; tmp3 += z4;
+
+ vperm2i128 %2, %2, %2, 0x01 ; %2=in1_3
+ vpunpcklwd %3, %4, %2 ; %3=in71_53L
+ vpunpckhwd %4, %4, %2 ; %4=in71_53H
+
+ vpmaddwd %5, %3, [GOTOFF(ebx,PW_MF060_MF089_MF050_MF256)] ; %5=tmp0_1L
+ vpmaddwd %6, %4, [GOTOFF(ebx,PW_MF060_MF089_MF050_MF256)] ; %6=tmp0_1H
+ vpaddd %5, %5, %7 ; %5=tmp0_1L+z3_4L=tmp0_1L
+ vpaddd %6, %6, %8 ; %6=tmp0_1H+z3_4H=tmp0_1H
+
+ vpmaddwd %3, %3, [GOTOFF(ebx,PW_MF089_F060_MF256_F050)] ; %3=tmp3_2L
+ vpmaddwd %4, %4, [GOTOFF(ebx,PW_MF089_F060_MF256_F050)] ; %4=tmp3_2H
+ vperm2i128 %7, %7, %7, 0x01 ; %7=z4_3L
+ vperm2i128 %8, %8, %8, 0x01 ; %8=z4_3H
+ vpaddd %7, %3, %7 ; %7=tmp3_2L+z4_3L=tmp3_2L
+ vpaddd %8, %4, %8 ; %8=tmp3_2H+z4_3H=tmp3_2H
+
+ ; -- Final output stage
+
+ vmovdqu %3, %9
+ vmovdqu %4, %10
+
+ vpaddd %1, %3, %7 ; %1=tmp10_11L+tmp3_2L=data0_1L
+ vpaddd %2, %4, %8 ; %2=tmp10_11H+tmp3_2H=data0_1H
+ vpaddd %1, %1, [GOTOFF(ebx,PD_DESCALE_P %+ %13)]
+ vpaddd %2, %2, [GOTOFF(ebx,PD_DESCALE_P %+ %13)]
+ vpsrad %1, %1, DESCALE_P %+ %13
+ vpsrad %2, %2, DESCALE_P %+ %13
+ vpackssdw %1, %1, %2 ; %1=data0_1
+
+ vpsubd %3, %3, %7 ; %3=tmp10_11L-tmp3_2L=data7_6L
+ vpsubd %4, %4, %8 ; %4=tmp10_11H-tmp3_2H=data7_6H
+ vpaddd %3, %3, [GOTOFF(ebx,PD_DESCALE_P %+ %13)]
+ vpaddd %4, %4, [GOTOFF(ebx,PD_DESCALE_P %+ %13)]
+ vpsrad %3, %3, DESCALE_P %+ %13
+ vpsrad %4, %4, DESCALE_P %+ %13
+ vpackssdw %4, %3, %4 ; %4=data7_6
+
+ vmovdqu %7, %11
+ vmovdqu %8, %12
+
+ vpaddd %2, %7, %5 ; %2=tmp13_12L+tmp0_1L=data3_2L
+ vpaddd %3, %8, %6 ; %3=tmp13_12H+tmp0_1H=data3_2H
+ vpaddd %2, %2, [GOTOFF(ebx,PD_DESCALE_P %+ %13)]
+ vpaddd %3, %3, [GOTOFF(ebx,PD_DESCALE_P %+ %13)]
+ vpsrad %2, %2, DESCALE_P %+ %13
+ vpsrad %3, %3, DESCALE_P %+ %13
+ vpackssdw %2, %2, %3 ; %2=data3_2
+
+ vpsubd %3, %7, %5 ; %3=tmp13_12L-tmp0_1L=data4_5L
+ vpsubd %6, %8, %6 ; %6=tmp13_12H-tmp0_1H=data4_5H
+ vpaddd %3, %3, [GOTOFF(ebx,PD_DESCALE_P %+ %13)]
+ vpaddd %6, %6, [GOTOFF(ebx,PD_DESCALE_P %+ %13)]
+ vpsrad %3, %3, DESCALE_P %+ %13
+ vpsrad %6, %6, DESCALE_P %+ %13
+ vpackssdw %3, %3, %6 ; %3=data4_5
+%endmacro
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_idct_islow_avx2)
+
+EXTN(jconst_idct_islow_avx2):
+
+PW_F130_F054_MF130_F054 times 4 dw (F_0_541 + F_0_765), F_0_541
+ times 4 dw (F_0_541 - F_1_847), F_0_541
+PW_MF078_F117_F078_F117 times 4 dw (F_1_175 - F_1_961), F_1_175
+ times 4 dw (F_1_175 - F_0_390), F_1_175
+PW_MF060_MF089_MF050_MF256 times 4 dw (F_0_298 - F_0_899), -F_0_899
+ times 4 dw (F_2_053 - F_2_562), -F_2_562
+PW_MF089_F060_MF256_F050 times 4 dw -F_0_899, (F_1_501 - F_0_899)
+ times 4 dw -F_2_562, (F_3_072 - F_2_562)
+PD_DESCALE_P1 times 8 dd 1 << (DESCALE_P1 - 1)
+PD_DESCALE_P2 times 8 dd 1 << (DESCALE_P2 - 1)
+PB_CENTERJSAMP times 32 db CENTERJSAMPLE
+PW_1_NEG1 times 8 dw 1
+ times 8 dw -1
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_islow_avx2(void *dct_table, JCOEFPTR coef_block,
+; JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b) (b) + 8 ; jpeg_component_info *compptr
+%define coef_block(b) (b) + 12 ; JCOEFPTR coef_block
+%define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf
+%define output_col(b) (b) + 20 ; JDIMENSION output_col
+
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD
+ ; ymmword wk[WK_NUM]
+%define WK_NUM 4
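+ ; wk(i) addresses scratch ymmwords just below the realigned frame
+ ; pointer: the prologue below aligns esp, saves the pre-alignment value
+ ; at the new [esp] so the epilogue's "pop esp" can restore it, and then
+ ; reserves WK_NUM slots.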
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_idct_islow_avx2)
+
+EXTN(jsimd_idct_islow_avx2):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)]
+ pushpic ebx
+; push ecx ; unused
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process columns.
+
+; mov eax, [original_ebp]
+ mov edx, POINTER [dct_table(eax)] ; quantptr
+ mov esi, JCOEFPTR [coef_block(eax)] ; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_ISLOW_AVX2
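+ ; Shortcut: OR rows 1-7 of the coefficient block together; if every AC
+ ; coefficient is zero, each column's IDCT reduces to its scaled DC term,
+ ; and the full column pass can be skipped.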
+ mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ jnz near .columnDCT
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ vpor xmm0, xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ vpor xmm1, xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ vpor xmm0, xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ vpor xmm1, xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+ vpor xmm0, xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ vpor xmm1, xmm1, xmm0
+ vpacksswb xmm1, xmm1, xmm1
+ vpacksswb xmm1, xmm1, xmm1
+ movd eax, xmm1
+ test eax, eax
+ jnz short .columnDCT
+
+ ; -- AC terms all zero
+
+ movdqa xmm5, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ vpmullw xmm5, xmm5, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ vpsllw xmm5, xmm5, PASS1_BITS
+
+ vpunpcklwd xmm4, xmm5, xmm5 ; xmm4=(00 00 01 01 02 02 03 03)
+ vpunpckhwd xmm5, xmm5, xmm5 ; xmm5=(04 04 05 05 06 06 07 07)
+ vinserti128 ymm4, ymm4, xmm5, 1
+
+ vpshufd ymm0, ymm4, 0x00 ; ymm0=col0_4=(00 00 00 00 00 00 00 00 04 04 04 04 04 04 04 04)
+ vpshufd ymm1, ymm4, 0x55 ; ymm1=col1_5=(01 01 01 01 01 01 01 01 05 05 05 05 05 05 05 05)
+ vpshufd ymm2, ymm4, 0xAA ; ymm2=col2_6=(02 02 02 02 02 02 02 02 06 06 06 06 06 06 06 06)
+ vpshufd ymm3, ymm4, 0xFF ; ymm3=col3_7=(03 03 03 03 03 03 03 03 07 07 07 07 07 07 07 07)
+
+ jmp near .column_end
+ alignx 16, 7
+%endif
+.columnDCT:
+
+ vmovdqu ymm4, YMMWORD [YMMBLOCK(0,0,esi,SIZEOF_JCOEF)] ; ymm4=in0_1
+ vmovdqu ymm5, YMMWORD [YMMBLOCK(2,0,esi,SIZEOF_JCOEF)] ; ymm5=in2_3
+ vmovdqu ymm6, YMMWORD [YMMBLOCK(4,0,esi,SIZEOF_JCOEF)] ; ymm6=in4_5
+ vmovdqu ymm7, YMMWORD [YMMBLOCK(6,0,esi,SIZEOF_JCOEF)] ; ymm7=in6_7
+ vpmullw ymm4, ymm4, YMMWORD [YMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ vpmullw ymm5, ymm5, YMMWORD [YMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ vpmullw ymm6, ymm6, YMMWORD [YMMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ vpmullw ymm7, ymm7, YMMWORD [YMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ vperm2i128 ymm0, ymm4, ymm6, 0x20 ; ymm0=in0_4
+ vperm2i128 ymm1, ymm5, ymm4, 0x31 ; ymm1=in3_1
+ vperm2i128 ymm2, ymm5, ymm7, 0x20 ; ymm2=in2_6
+ vperm2i128 ymm3, ymm7, ymm6, 0x31 ; ymm3=in7_5
+
+ dodct ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, YMMWORD [wk(0)], YMMWORD [wk(1)], YMMWORD [wk(2)], YMMWORD [wk(3)], 1
+ ; ymm0=data0_1, ymm1=data3_2, ymm2=data4_5, ymm3=data7_6
+
+ dotranspose ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7
+ ; ymm0=data0_4, ymm1=data1_5, ymm2=data2_6, ymm3=data3_7
+
+.column_end:
+
+ ; -- Prefetch the next coefficient block
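+ ; (prefetchnta uses the non-temporal hint to pull the next block toward
+ ; the core with minimal cache pollution, since each coefficient block is
+ ; only read once.)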
+
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+ ; ---- Pass 2: process rows.
+
+ mov eax, [original_ebp]
+ mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
+ mov eax, JDIMENSION [output_col(eax)]
+
+ vperm2i128 ymm4, ymm3, ymm1, 0x31 ; ymm4=in7_5
+ vperm2i128 ymm1, ymm3, ymm1, 0x20 ; ymm1=in3_1
+
+ dodct ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7, YMMWORD [wk(0)], YMMWORD [wk(1)], YMMWORD [wk(2)], YMMWORD [wk(3)], 2
+ ; ymm0=data0_1, ymm1=data3_2, ymm2=data4_5, ymm4=data7_6
+
+ dotranspose ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7
+ ; ymm0=data0_4, ymm1=data1_5, ymm2=data2_6, ymm4=data3_7
+
+ vpacksswb ymm0, ymm0, ymm1 ; ymm0=data01_45
+ vpacksswb ymm1, ymm2, ymm4 ; ymm1=data23_67
+ vpaddb ymm0, ymm0, [GOTOFF(ebx,PB_CENTERJSAMP)]
+ vpaddb ymm1, ymm1, [GOTOFF(ebx,PB_CENTERJSAMP)]
+
+ vextracti128 xmm6, ymm1, 1 ; xmm6=data67
+ vextracti128 xmm4, ymm0, 1 ; xmm4=data45
+ vextracti128 xmm2, ymm1, 0 ; xmm2=data23
+ vextracti128 xmm0, ymm0, 0 ; xmm0=data01
+
+ vpshufd xmm1, xmm0, 0x4E ; xmm1=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+ vpshufd xmm3, xmm2, 0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+ vpshufd xmm5, xmm4, 0x4E ; xmm5=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
+ vpshufd xmm7, xmm6, 0x4E ; xmm7=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
+
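+ ; (zero the upper YMM lanes before the legacy-SSE movq stores below to
+ ; avoid AVX/SSE transition penalties)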
+ vzeroupper
+
+ mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm0
+ movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm1
+
+ mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm2
+ movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
+
+ mov edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov esi, JSAMPROW [edi+5*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
+ movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm5
+
+ mov edx, JSAMPROW [edi+6*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
+ movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm7
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; unused
+ poppic ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jidctint-mmx.asm b/media/libjpeg/simd/i386/jidctint-mmx.asm
new file mode 100644
index 0000000000..f15c8d34bc
--- /dev/null
+++ b/media/libjpeg/simd/i386/jidctint-mmx.asm
@@ -0,0 +1,851 @@
+;
+; jidctint.asm - accurate integer IDCT (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, 2020, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slower but more accurate integer implementation of the
+; inverse DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jidctint.c; see jidctint.c for
+; more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 13
+%define PASS1_BITS 2
+
+%define DESCALE_P1 (CONST_BITS - PASS1_BITS)
+%define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3)
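+; Pass 1 descales by CONST_BITS - PASS1_BITS, leaving PASS1_BITS of extra
+; fractional precision in the intermediate data; pass 2 removes those
+; bits plus 3 more, the divide by 8 that completes the scaled 2-D IDCT
+; (see jidctint.c).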
+
+%if CONST_BITS == 13
+F_0_298 equ 2446 ; FIX(0.298631336)
+F_0_390 equ 3196 ; FIX(0.390180644)
+F_0_541 equ 4433 ; FIX(0.541196100)
+F_0_765 equ 6270 ; FIX(0.765366865)
+F_0_899 equ 7373 ; FIX(0.899976223)
+F_1_175 equ 9633 ; FIX(1.175875602)
+F_1_501 equ 12299 ; FIX(1.501321110)
+F_1_847 equ 15137 ; FIX(1.847759065)
+F_1_961 equ 16069 ; FIX(1.961570560)
+F_2_053 equ 16819 ; FIX(2.053119869)
+F_2_562 equ 20995 ; FIX(2.562915447)
+F_3_072 equ 25172 ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
+F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS) ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS) ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS) ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS) ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS) ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS) ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026)
+%endif
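+; FIX(x) denotes round(x * 2^CONST_BITS); for example,
+; FIX(0.541196100) = round(0.541196100 * 8192) = 4433 = F_0_541.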
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_idct_islow_mmx)
+
+EXTN(jconst_idct_islow_mmx):
+
+PW_F130_F054 times 2 dw (F_0_541 + F_0_765), F_0_541
+PW_F054_MF130 times 2 dw F_0_541, (F_0_541 - F_1_847)
+PW_MF078_F117 times 2 dw (F_1_175 - F_1_961), F_1_175
+PW_F117_F078 times 2 dw F_1_175, (F_1_175 - F_0_390)
+PW_MF060_MF089 times 2 dw (F_0_298 - F_0_899), -F_0_899
+PW_MF089_F060 times 2 dw -F_0_899, (F_1_501 - F_0_899)
+PW_MF050_MF256 times 2 dw (F_2_053 - F_2_562), -F_2_562
+PW_MF256_F050 times 2 dw -F_2_562, (F_3_072 - F_2_562)
+PD_DESCALE_P1 times 2 dd 1 << (DESCALE_P1 - 1)
+PD_DESCALE_P2 times 2 dd 1 << (DESCALE_P2 - 1)
+PB_CENTERJSAMP times 8 db CENTERJSAMPLE
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_islow_mmx(void *dct_table, JCOEFPTR coef_block,
+; JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b) (b) + 8 ; jpeg_component_info *compptr
+%define coef_block(b) (b) + 12 ; JCOEFPTR coef_block
+%define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf
+%define output_col(b) (b) + 20 ; JDIMENSION output_col
+
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
+ ; mmword wk[WK_NUM]
+%define WK_NUM 12
+%define workspace wk(0) - DCTSIZE2 * SIZEOF_JCOEF
+ ; JCOEF workspace[DCTSIZE2]
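+ ; (The 64-bit MMX registers cover only 4 columns at a time, so pass 1
+ ; spills its column results to this JCOEF workspace and pass 2 re-reads
+ ; them 4 rows at a time.)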
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_idct_islow_mmx)
+
+EXTN(jsimd_idct_islow_mmx):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [workspace]
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process columns from input, store into work array.
+
+; mov eax, [original_ebp]
+ mov edx, POINTER [dct_table(eax)] ; quantptr
+ mov esi, JCOEFPTR [coef_block(eax)] ; inptr
+ lea edi, [workspace] ; JCOEF *wsptr
+ mov ecx, DCTSIZE/4 ; ctr
+ alignx 16, 7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_ISLOW_MMX
+ mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ jnz short .columnDCT
+
+ movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ por mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ por mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ por mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ por mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+ por mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ por mm1, mm0
+ packsswb mm1, mm1
+ movd eax, mm1
+ test eax, eax
+ jnz short .columnDCT
+
+ ; -- AC terms all zero
+
+ movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ psllw mm0, PASS1_BITS
+
+ movq mm2, mm0 ; mm0=in0=(00 01 02 03)
+ punpcklwd mm0, mm0 ; mm0=(00 00 01 01)
+ punpckhwd mm2, mm2 ; mm2=(02 02 03 03)
+
+ movq mm1, mm0
+ punpckldq mm0, mm0 ; mm0=(00 00 00 00)
+ punpckhdq mm1, mm1 ; mm1=(01 01 01 01)
+ movq mm3, mm2
+ punpckldq mm2, mm2 ; mm2=(02 02 02 02)
+ punpckhdq mm3, mm3 ; mm3=(03 03 03 03)
+
+ movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
+ movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0
+ movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
+ movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm1
+ movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
+ movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
+ movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
+ movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3
+ jmp near .nextcolumn
+ alignx 16, 7
+%endif
+.columnDCT:
+
+ ; -- Even part
+
+ movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+ pmullw mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ ; (Original)
+ ; z1 = (z2 + z3) * 0.541196100;
+ ; tmp2 = z1 + z3 * -1.847759065;
+ ; tmp3 = z1 + z2 * 0.765366865;
+ ;
+ ; (This implementation)
+ ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+ ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
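+ ;
+ ; (Equivalent after substituting z1 = (z2 + z3) * 0.541196100 and
+ ; collecting terms; the paired constants feed pmaddwd, which multiplies
+ ; interleaved words and sums adjacent products into dwords.)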
+
+ movq mm4, mm1 ; mm1=in2=z2
+ movq mm5, mm1
+ punpcklwd mm4, mm3 ; mm3=in6=z3
+ punpckhwd mm5, mm3
+ movq mm1, mm4
+ movq mm3, mm5
+ pmaddwd mm4, [GOTOFF(ebx,PW_F130_F054)] ; mm4=tmp3L
+ pmaddwd mm5, [GOTOFF(ebx,PW_F130_F054)] ; mm5=tmp3H
+ pmaddwd mm1, [GOTOFF(ebx,PW_F054_MF130)] ; mm1=tmp2L
+ pmaddwd mm3, [GOTOFF(ebx,PW_F054_MF130)] ; mm3=tmp2H
+
+ movq mm6, mm0
+ paddw mm0, mm2 ; mm0=in0+in4
+ psubw mm6, mm2 ; mm6=in0-in4
+
+ pxor mm7, mm7
+ pxor mm2, mm2
+ punpcklwd mm7, mm0 ; mm7=tmp0L
+ punpckhwd mm2, mm0 ; mm2=tmp0H
+ psrad mm7, (16-CONST_BITS) ; psrad mm7,16 & pslld mm7,CONST_BITS
+ psrad mm2, (16-CONST_BITS) ; psrad mm2,16 & pslld mm2,CONST_BITS
+
+ movq mm0, mm7
+ paddd mm7, mm4 ; mm7=tmp10L
+ psubd mm0, mm4 ; mm0=tmp13L
+ movq mm4, mm2
+ paddd mm2, mm5 ; mm2=tmp10H
+ psubd mm4, mm5 ; mm4=tmp13H
+
+ movq MMWORD [wk(0)], mm7 ; wk(0)=tmp10L
+ movq MMWORD [wk(1)], mm2 ; wk(1)=tmp10H
+ movq MMWORD [wk(2)], mm0 ; wk(2)=tmp13L
+ movq MMWORD [wk(3)], mm4 ; wk(3)=tmp13H
+
+ pxor mm5, mm5
+ pxor mm7, mm7
+ punpcklwd mm5, mm6 ; mm5=tmp1L
+ punpckhwd mm7, mm6 ; mm7=tmp1H
+ psrad mm5, (16-CONST_BITS) ; psrad mm5,16 & pslld mm5,CONST_BITS
+ psrad mm7, (16-CONST_BITS) ; psrad mm7,16 & pslld mm7,CONST_BITS
+
+ movq mm2, mm5
+ paddd mm5, mm1 ; mm5=tmp11L
+ psubd mm2, mm1 ; mm2=tmp12L
+ movq mm0, mm7
+ paddd mm7, mm3 ; mm7=tmp11H
+ psubd mm0, mm3 ; mm0=tmp12H
+
+ movq MMWORD [wk(4)], mm5 ; wk(4)=tmp11L
+ movq MMWORD [wk(5)], mm7 ; wk(5)=tmp11H
+ movq MMWORD [wk(6)], mm2 ; wk(6)=tmp12L
+ movq MMWORD [wk(7)], mm0 ; wk(7)=tmp12H
+
+ ; -- Odd part
+
+ movq mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movq mm6, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ pmullw mm4, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw mm6, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ movq mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ pmullw mm1, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ movq mm5, mm6
+ movq mm7, mm4
+ paddw mm5, mm3 ; mm5=z3
+ paddw mm7, mm1 ; mm7=z4
+
+ ; (Original)
+ ; z5 = (z3 + z4) * 1.175875602;
+ ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ ; z3 += z5; z4 += z5;
+ ;
+ ; (This implementation)
+ ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+ movq mm2, mm5
+ movq mm0, mm5
+ punpcklwd mm2, mm7
+ punpckhwd mm0, mm7
+ movq mm5, mm2
+ movq mm7, mm0
+ pmaddwd mm2, [GOTOFF(ebx,PW_MF078_F117)] ; mm2=z3L
+ pmaddwd mm0, [GOTOFF(ebx,PW_MF078_F117)] ; mm0=z3H
+ pmaddwd mm5, [GOTOFF(ebx,PW_F117_F078)] ; mm5=z4L
+ pmaddwd mm7, [GOTOFF(ebx,PW_F117_F078)] ; mm7=z4H
+
+ movq MMWORD [wk(10)], mm2 ; wk(10)=z3L
+ movq MMWORD [wk(11)], mm0 ; wk(11)=z3H
+
+ ; (Original)
+ ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
+ ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
+ ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
+ ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ ; tmp0 += z1 + z3; tmp1 += z2 + z4;
+ ; tmp2 += z2 + z3; tmp3 += z1 + z4;
+ ;
+ ; (This implementation)
+ ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+ ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+ ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+ ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+ ; tmp0 += z3; tmp1 += z4;
+ ; tmp2 += z3; tmp3 += z4;
+
+ movq mm2, mm3
+ movq mm0, mm3
+ punpcklwd mm2, mm4
+ punpckhwd mm0, mm4
+ movq mm3, mm2
+ movq mm4, mm0
+ pmaddwd mm2, [GOTOFF(ebx,PW_MF060_MF089)] ; mm2=tmp0L
+ pmaddwd mm0, [GOTOFF(ebx,PW_MF060_MF089)] ; mm0=tmp0H
+ pmaddwd mm3, [GOTOFF(ebx,PW_MF089_F060)] ; mm3=tmp3L
+ pmaddwd mm4, [GOTOFF(ebx,PW_MF089_F060)] ; mm4=tmp3H
+
+ paddd mm2, MMWORD [wk(10)] ; mm2=tmp0L
+ paddd mm0, MMWORD [wk(11)] ; mm0=tmp0H
+ paddd mm3, mm5 ; mm3=tmp3L
+ paddd mm4, mm7 ; mm4=tmp3H
+
+ movq MMWORD [wk(8)], mm2 ; wk(8)=tmp0L
+ movq MMWORD [wk(9)], mm0 ; wk(9)=tmp0H
+
+ movq mm2, mm1
+ movq mm0, mm1
+ punpcklwd mm2, mm6
+ punpckhwd mm0, mm6
+ movq mm1, mm2
+ movq mm6, mm0
+ pmaddwd mm2, [GOTOFF(ebx,PW_MF050_MF256)] ; mm2=tmp1L
+ pmaddwd mm0, [GOTOFF(ebx,PW_MF050_MF256)] ; mm0=tmp1H
+ pmaddwd mm1, [GOTOFF(ebx,PW_MF256_F050)] ; mm1=tmp2L
+ pmaddwd mm6, [GOTOFF(ebx,PW_MF256_F050)] ; mm6=tmp2H
+
+ paddd mm2, mm5 ; mm2=tmp1L
+ paddd mm0, mm7 ; mm0=tmp1H
+ paddd mm1, MMWORD [wk(10)] ; mm1=tmp2L
+ paddd mm6, MMWORD [wk(11)] ; mm6=tmp2H
+
+ movq MMWORD [wk(10)], mm2 ; wk(10)=tmp1L
+ movq MMWORD [wk(11)], mm0 ; wk(11)=tmp1H
+
+ ; -- Final output stage
+
+ movq mm5, MMWORD [wk(0)] ; mm5=tmp10L
+ movq mm7, MMWORD [wk(1)] ; mm7=tmp10H
+
+ movq mm2, mm5
+ movq mm0, mm7
+ paddd mm5, mm3 ; mm5=data0L
+ paddd mm7, mm4 ; mm7=data0H
+ psubd mm2, mm3 ; mm2=data7L
+ psubd mm0, mm4 ; mm0=data7H
+
+ movq mm3, [GOTOFF(ebx,PD_DESCALE_P1)] ; mm3=[PD_DESCALE_P1]
+
+ paddd mm5, mm3
+ paddd mm7, mm3
+ psrad mm5, DESCALE_P1
+ psrad mm7, DESCALE_P1
+ paddd mm2, mm3
+ paddd mm0, mm3
+ psrad mm2, DESCALE_P1
+ psrad mm0, DESCALE_P1
+
+ packssdw mm5, mm7 ; mm5=data0=(00 01 02 03)
+ packssdw mm2, mm0 ; mm2=data7=(70 71 72 73)
+
+ movq mm4, MMWORD [wk(4)] ; mm4=tmp11L
+ movq mm3, MMWORD [wk(5)] ; mm3=tmp11H
+
+ movq mm7, mm4
+ movq mm0, mm3
+ paddd mm4, mm1 ; mm4=data1L
+ paddd mm3, mm6 ; mm3=data1H
+ psubd mm7, mm1 ; mm7=data6L
+ psubd mm0, mm6 ; mm0=data6H
+
+ movq mm1, [GOTOFF(ebx,PD_DESCALE_P1)] ; mm1=[PD_DESCALE_P1]
+
+ paddd mm4, mm1
+ paddd mm3, mm1
+ psrad mm4, DESCALE_P1
+ psrad mm3, DESCALE_P1
+ paddd mm7, mm1
+ paddd mm0, mm1
+ psrad mm7, DESCALE_P1
+ psrad mm0, DESCALE_P1
+
+ packssdw mm4, mm3 ; mm4=data1=(10 11 12 13)
+ packssdw mm7, mm0 ; mm7=data6=(60 61 62 63)
+
+ movq mm6, mm5 ; transpose coefficients(phase 1)
+ punpcklwd mm5, mm4 ; mm5=(00 10 01 11)
+ punpckhwd mm6, mm4 ; mm6=(02 12 03 13)
+ movq mm1, mm7 ; transpose coefficients(phase 1)
+ punpcklwd mm7, mm2 ; mm7=(60 70 61 71)
+ punpckhwd mm1, mm2 ; mm1=(62 72 63 73)
+
+ movq mm3, MMWORD [wk(6)] ; mm3=tmp12L
+ movq mm0, MMWORD [wk(7)] ; mm0=tmp12H
+ movq mm4, MMWORD [wk(10)] ; mm4=tmp1L
+ movq mm2, MMWORD [wk(11)] ; mm2=tmp1H
+
+ movq MMWORD [wk(0)], mm5 ; wk(0)=(00 10 01 11)
+ movq MMWORD [wk(1)], mm6 ; wk(1)=(02 12 03 13)
+ movq MMWORD [wk(4)], mm7 ; wk(4)=(60 70 61 71)
+ movq MMWORD [wk(5)], mm1 ; wk(5)=(62 72 63 73)
+
+ movq mm5, mm3
+ movq mm6, mm0
+ paddd mm3, mm4 ; mm3=data2L
+ paddd mm0, mm2 ; mm0=data2H
+ psubd mm5, mm4 ; mm5=data5L
+ psubd mm6, mm2 ; mm6=data5H
+
+ movq mm7, [GOTOFF(ebx,PD_DESCALE_P1)] ; mm7=[PD_DESCALE_P1]
+
+ paddd mm3, mm7
+ paddd mm0, mm7
+ psrad mm3, DESCALE_P1
+ psrad mm0, DESCALE_P1
+ paddd mm5, mm7
+ paddd mm6, mm7
+ psrad mm5, DESCALE_P1
+ psrad mm6, DESCALE_P1
+
+ packssdw mm3, mm0 ; mm3=data2=(20 21 22 23)
+ packssdw mm5, mm6 ; mm5=data5=(50 51 52 53)
+
+ movq mm1, MMWORD [wk(2)] ; mm1=tmp13L
+ movq mm4, MMWORD [wk(3)] ; mm4=tmp13H
+ movq mm2, MMWORD [wk(8)] ; mm2=tmp0L
+ movq mm7, MMWORD [wk(9)] ; mm7=tmp0H
+
+ movq mm0, mm1
+ movq mm6, mm4
+ paddd mm1, mm2 ; mm1=data3L
+ paddd mm4, mm7 ; mm4=data3H
+ psubd mm0, mm2 ; mm0=data4L
+ psubd mm6, mm7 ; mm6=data4H
+
+ movq mm2, [GOTOFF(ebx,PD_DESCALE_P1)] ; mm2=[PD_DESCALE_P1]
+
+ paddd mm1, mm2
+ paddd mm4, mm2
+ psrad mm1, DESCALE_P1
+ psrad mm4, DESCALE_P1
+ paddd mm0, mm2
+ paddd mm6, mm2
+ psrad mm0, DESCALE_P1
+ psrad mm6, DESCALE_P1
+
+ packssdw mm1, mm4 ; mm1=data3=(30 31 32 33)
+ packssdw mm0, mm6 ; mm0=data4=(40 41 42 43)
+
+ movq mm7, MMWORD [wk(0)] ; mm7=(00 10 01 11)
+ movq mm2, MMWORD [wk(1)] ; mm2=(02 12 03 13)
+
+ movq mm4, mm3 ; transpose coefficients(phase 1)
+ punpcklwd mm3, mm1 ; mm3=(20 30 21 31)
+ punpckhwd mm4, mm1 ; mm4=(22 32 23 33)
+ movq mm6, mm0 ; transpose coefficients(phase 1)
+ punpcklwd mm0, mm5 ; mm0=(40 50 41 51)
+ punpckhwd mm6, mm5 ; mm6=(42 52 43 53)
+
+ movq mm1, mm7 ; transpose coefficients(phase 2)
+ punpckldq mm7, mm3 ; mm7=(00 10 20 30)
+ punpckhdq mm1, mm3 ; mm1=(01 11 21 31)
+ movq mm5, mm2 ; transpose coefficients(phase 2)
+ punpckldq mm2, mm4 ; mm2=(02 12 22 32)
+ punpckhdq mm5, mm4 ; mm5=(03 13 23 33)
+
+ movq mm3, MMWORD [wk(4)] ; mm3=(60 70 61 71)
+ movq mm4, MMWORD [wk(5)] ; mm4=(62 72 63 73)
+
+ movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm7
+ movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
+ movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
+ movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm5
+
+ movq mm7, mm0 ; transpose coefficients(phase 2)
+ punpckldq mm0, mm3 ; mm0=(40 50 60 70)
+ punpckhdq mm7, mm3 ; mm7=(41 51 61 71)
+ movq mm1, mm6 ; transpose coefficients(phase 2)
+ punpckldq mm6, mm4 ; mm6=(42 52 62 72)
+ punpckhdq mm1, mm4 ; mm1=(43 53 63 73)
+
+ movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0
+ movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm7
+ movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm6
+ movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm1
+
+.nextcolumn:
+ add esi, byte 4*SIZEOF_JCOEF ; coef_block
+ add edx, byte 4*SIZEOF_ISLOW_MULT_TYPE ; quantptr
+ add edi, byte 4*DCTSIZE*SIZEOF_JCOEF ; wsptr
+ dec ecx ; ctr
+ jnz near .columnloop
+
+ ; ---- Pass 2: process rows from work array, store into output array.
+
+ mov eax, [original_ebp]
+ lea esi, [workspace] ; JCOEF *wsptr
+ mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
+ mov eax, JDIMENSION [output_col(eax)]
+ mov ecx, DCTSIZE/4 ; ctr
+ alignx 16, 7
+.rowloop:
+
+ ; -- Even part
+
+ movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+ ; (Original)
+ ; z1 = (z2 + z3) * 0.541196100;
+ ; tmp2 = z1 + z3 * -1.847759065;
+ ; tmp3 = z1 + z2 * 0.765366865;
+ ;
+ ; (This implementation)
+ ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+ ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+ movq mm4, mm1 ; mm1=in2=z2
+ movq mm5, mm1
+ punpcklwd mm4, mm3 ; mm3=in6=z3
+ punpckhwd mm5, mm3
+ movq mm1, mm4
+ movq mm3, mm5
+ pmaddwd mm4, [GOTOFF(ebx,PW_F130_F054)] ; mm4=tmp3L
+ pmaddwd mm5, [GOTOFF(ebx,PW_F130_F054)] ; mm5=tmp3H
+ pmaddwd mm1, [GOTOFF(ebx,PW_F054_MF130)] ; mm1=tmp2L
+ pmaddwd mm3, [GOTOFF(ebx,PW_F054_MF130)] ; mm3=tmp2H
+
+ movq mm6, mm0
+ paddw mm0, mm2 ; mm0=in0+in4
+ psubw mm6, mm2 ; mm6=in0-in4
+
+ pxor mm7, mm7
+ pxor mm2, mm2
+ punpcklwd mm7, mm0 ; mm7=tmp0L
+ punpckhwd mm2, mm0 ; mm2=tmp0H
+ psrad mm7, (16-CONST_BITS) ; psrad mm7,16 & pslld mm7,CONST_BITS
+ psrad mm2, (16-CONST_BITS) ; psrad mm2,16 & pslld mm2,CONST_BITS
+
+ movq mm0, mm7
+ paddd mm7, mm4 ; mm7=tmp10L
+ psubd mm0, mm4 ; mm0=tmp13L
+ movq mm4, mm2
+ paddd mm2, mm5 ; mm2=tmp10H
+ psubd mm4, mm5 ; mm4=tmp13H
+
+ movq MMWORD [wk(0)], mm7 ; wk(0)=tmp10L
+ movq MMWORD [wk(1)], mm2 ; wk(1)=tmp10H
+ movq MMWORD [wk(2)], mm0 ; wk(2)=tmp13L
+ movq MMWORD [wk(3)], mm4 ; wk(3)=tmp13H
+
+ pxor mm5, mm5
+ pxor mm7, mm7
+ punpcklwd mm5, mm6 ; mm5=tmp1L
+ punpckhwd mm7, mm6 ; mm7=tmp1H
+ psrad mm5, (16-CONST_BITS) ; psrad mm5,16 & pslld mm5,CONST_BITS
+ psrad mm7, (16-CONST_BITS) ; psrad mm7,16 & pslld mm7,CONST_BITS
+
+ movq mm2, mm5
+ paddd mm5, mm1 ; mm5=tmp11L
+ psubd mm2, mm1 ; mm2=tmp12L
+ movq mm0, mm7
+ paddd mm7, mm3 ; mm7=tmp11H
+ psubd mm0, mm3 ; mm0=tmp12H
+
+ movq MMWORD [wk(4)], mm5 ; wk(4)=tmp11L
+ movq MMWORD [wk(5)], mm7 ; wk(5)=tmp11H
+ movq MMWORD [wk(6)], mm2 ; wk(6)=tmp12L
+ movq MMWORD [wk(7)], mm0 ; wk(7)=tmp12H
+
+ ; -- Odd part
+
+ movq mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movq mm6, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ movq mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+ movq mm5, mm6
+ movq mm7, mm4
+ paddw mm5, mm3 ; mm5=z3
+ paddw mm7, mm1 ; mm7=z4
+
+ ; (Original)
+ ; z5 = (z3 + z4) * 1.175875602;
+ ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ ; z3 += z5; z4 += z5;
+ ;
+ ; (This implementation)
+ ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+ movq mm2, mm5
+ movq mm0, mm5
+ punpcklwd mm2, mm7
+ punpckhwd mm0, mm7
+ movq mm5, mm2
+ movq mm7, mm0
+ pmaddwd mm2, [GOTOFF(ebx,PW_MF078_F117)] ; mm2=z3L
+ pmaddwd mm0, [GOTOFF(ebx,PW_MF078_F117)] ; mm0=z3H
+ pmaddwd mm5, [GOTOFF(ebx,PW_F117_F078)] ; mm5=z4L
+ pmaddwd mm7, [GOTOFF(ebx,PW_F117_F078)] ; mm7=z4H
+
+ movq MMWORD [wk(10)], mm2 ; wk(10)=z3L
+ movq MMWORD [wk(11)], mm0 ; wk(11)=z3H
+
+ ; (Original)
+ ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
+ ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
+ ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
+ ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ ; tmp0 += z1 + z3; tmp1 += z2 + z4;
+ ; tmp2 += z2 + z3; tmp3 += z1 + z4;
+ ;
+ ; (This implementation)
+ ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+ ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+ ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+ ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+ ; tmp0 += z3; tmp1 += z4;
+ ; tmp2 += z3; tmp3 += z4;
+
+ movq mm2, mm3
+ movq mm0, mm3
+ punpcklwd mm2, mm4
+ punpckhwd mm0, mm4
+ movq mm3, mm2
+ movq mm4, mm0
+ pmaddwd mm2, [GOTOFF(ebx,PW_MF060_MF089)] ; mm2=tmp0L
+ pmaddwd mm0, [GOTOFF(ebx,PW_MF060_MF089)] ; mm0=tmp0H
+ pmaddwd mm3, [GOTOFF(ebx,PW_MF089_F060)] ; mm3=tmp3L
+ pmaddwd mm4, [GOTOFF(ebx,PW_MF089_F060)] ; mm4=tmp3H
+
+ paddd mm2, MMWORD [wk(10)] ; mm2=tmp0L
+ paddd mm0, MMWORD [wk(11)] ; mm0=tmp0H
+ paddd mm3, mm5 ; mm3=tmp3L
+ paddd mm4, mm7 ; mm4=tmp3H
+
+ movq MMWORD [wk(8)], mm2 ; wk(8)=tmp0L
+ movq MMWORD [wk(9)], mm0 ; wk(9)=tmp0H
+
+ movq mm2, mm1
+ movq mm0, mm1
+ punpcklwd mm2, mm6
+ punpckhwd mm0, mm6
+ movq mm1, mm2
+ movq mm6, mm0
+ pmaddwd mm2, [GOTOFF(ebx,PW_MF050_MF256)] ; mm2=tmp1L
+ pmaddwd mm0, [GOTOFF(ebx,PW_MF050_MF256)] ; mm0=tmp1H
+ pmaddwd mm1, [GOTOFF(ebx,PW_MF256_F050)] ; mm1=tmp2L
+ pmaddwd mm6, [GOTOFF(ebx,PW_MF256_F050)] ; mm6=tmp2H
+
+ paddd mm2, mm5 ; mm2=tmp1L
+ paddd mm0, mm7 ; mm0=tmp1H
+ paddd mm1, MMWORD [wk(10)] ; mm1=tmp2L
+ paddd mm6, MMWORD [wk(11)] ; mm6=tmp2H
+
+ movq MMWORD [wk(10)], mm2 ; wk(10)=tmp1L
+ movq MMWORD [wk(11)], mm0 ; wk(11)=tmp1H
+
+ ; -- Final output stage
+
+ movq mm5, MMWORD [wk(0)] ; mm5=tmp10L
+ movq mm7, MMWORD [wk(1)] ; mm7=tmp10H
+
+ movq mm2, mm5
+ movq mm0, mm7
+ paddd mm5, mm3 ; mm5=data0L
+ paddd mm7, mm4 ; mm7=data0H
+ psubd mm2, mm3 ; mm2=data7L
+ psubd mm0, mm4 ; mm0=data7H
+
+ movq mm3, [GOTOFF(ebx,PD_DESCALE_P2)] ; mm3=[PD_DESCALE_P2]
+
+ paddd mm5, mm3
+ paddd mm7, mm3
+ psrad mm5, DESCALE_P2
+ psrad mm7, DESCALE_P2
+ paddd mm2, mm3
+ paddd mm0, mm3
+ psrad mm2, DESCALE_P2
+ psrad mm0, DESCALE_P2
+
+ packssdw mm5, mm7 ; mm5=data0=(00 10 20 30)
+ packssdw mm2, mm0 ; mm2=data7=(07 17 27 37)
+
+ movq mm4, MMWORD [wk(4)] ; mm4=tmp11L
+ movq mm3, MMWORD [wk(5)] ; mm3=tmp11H
+
+ movq mm7, mm4
+ movq mm0, mm3
+ paddd mm4, mm1 ; mm4=data1L
+ paddd mm3, mm6 ; mm3=data1H
+ psubd mm7, mm1 ; mm7=data6L
+ psubd mm0, mm6 ; mm0=data6H
+
+ movq mm1, [GOTOFF(ebx,PD_DESCALE_P2)] ; mm1=[PD_DESCALE_P2]
+
+ paddd mm4, mm1
+ paddd mm3, mm1
+ psrad mm4, DESCALE_P2
+ psrad mm3, DESCALE_P2
+ paddd mm7, mm1
+ paddd mm0, mm1
+ psrad mm7, DESCALE_P2
+ psrad mm0, DESCALE_P2
+
+ packssdw mm4, mm3 ; mm4=data1=(01 11 21 31)
+ packssdw mm7, mm0 ; mm7=data6=(06 16 26 36)
+
+ packsswb mm5, mm7 ; mm5=(00 10 20 30 06 16 26 36)
+ packsswb mm4, mm2 ; mm4=(01 11 21 31 07 17 27 37)
+
+ movq mm6, MMWORD [wk(6)] ; mm6=tmp12L
+ movq mm1, MMWORD [wk(7)] ; mm1=tmp12H
+ movq mm3, MMWORD [wk(10)] ; mm3=tmp1L
+ movq mm0, MMWORD [wk(11)] ; mm0=tmp1H
+
+ movq MMWORD [wk(0)], mm5 ; wk(0)=(00 10 20 30 06 16 26 36)
+ movq MMWORD [wk(1)], mm4 ; wk(1)=(01 11 21 31 07 17 27 37)
+
+ movq mm7, mm6
+ movq mm2, mm1
+ paddd mm6, mm3 ; mm6=data2L
+ paddd mm1, mm0 ; mm1=data2H
+ psubd mm7, mm3 ; mm7=data5L
+ psubd mm2, mm0 ; mm2=data5H
+
+ movq mm5, [GOTOFF(ebx,PD_DESCALE_P2)] ; mm5=[PD_DESCALE_P2]
+
+ paddd mm6, mm5
+ paddd mm1, mm5
+ psrad mm6, DESCALE_P2
+ psrad mm1, DESCALE_P2
+ paddd mm7, mm5
+ paddd mm2, mm5
+ psrad mm7, DESCALE_P2
+ psrad mm2, DESCALE_P2
+
+ packssdw mm6, mm1 ; mm6=data2=(02 12 22 32)
+ packssdw mm7, mm2 ; mm7=data5=(05 15 25 35)
+
+ movq mm4, MMWORD [wk(2)] ; mm4=tmp13L
+ movq mm3, MMWORD [wk(3)] ; mm3=tmp13H
+ movq mm0, MMWORD [wk(8)] ; mm0=tmp0L
+ movq mm5, MMWORD [wk(9)] ; mm5=tmp0H
+
+ movq mm1, mm4
+ movq mm2, mm3
+ paddd mm4, mm0 ; mm4=data3L
+ paddd mm3, mm5 ; mm3=data3H
+ psubd mm1, mm0 ; mm1=data4L
+ psubd mm2, mm5 ; mm2=data4H
+
+ movq mm0, [GOTOFF(ebx,PD_DESCALE_P2)] ; mm0=[PD_DESCALE_P2]
+
+ paddd mm4, mm0
+ paddd mm3, mm0
+ psrad mm4, DESCALE_P2
+ psrad mm3, DESCALE_P2
+ paddd mm1, mm0
+ paddd mm2, mm0
+ psrad mm1, DESCALE_P2
+ psrad mm2, DESCALE_P2
+
+ movq mm5, [GOTOFF(ebx,PB_CENTERJSAMP)] ; mm5=[PB_CENTERJSAMP]
+
+ packssdw mm4, mm3 ; mm4=data3=(03 13 23 33)
+ packssdw mm1, mm2 ; mm1=data4=(04 14 24 34)
+
+ movq mm0, MMWORD [wk(0)] ; mm0=(00 10 20 30 06 16 26 36)
+ movq mm3, MMWORD [wk(1)] ; mm3=(01 11 21 31 07 17 27 37)
+
+ packsswb mm6, mm1 ; mm6=(02 12 22 32 04 14 24 34)
+ packsswb mm4, mm7 ; mm4=(03 13 23 33 05 15 25 35)
+
+ paddb mm0, mm5
+ paddb mm3, mm5
+ paddb mm6, mm5
+ paddb mm4, mm5
+
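+ ; The three interleave phases below transpose the packed bytes in
+ ; mm0/mm3/mm6/mm4 into four complete output rows.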
+ movq mm2, mm0 ; transpose coefficients(phase 1)
+ punpcklbw mm0, mm3 ; mm0=(00 01 10 11 20 21 30 31)
+ punpckhbw mm2, mm3 ; mm2=(06 07 16 17 26 27 36 37)
+ movq mm1, mm6 ; transpose coefficients(phase 1)
+ punpcklbw mm6, mm4 ; mm6=(02 03 12 13 22 23 32 33)
+ punpckhbw mm1, mm4 ; mm1=(04 05 14 15 24 25 34 35)
+
+ movq mm7, mm0 ; transpose coefficients(phase 2)
+ punpcklwd mm0, mm6 ; mm0=(00 01 02 03 10 11 12 13)
+ punpckhwd mm7, mm6 ; mm7=(20 21 22 23 30 31 32 33)
+ movq mm5, mm1 ; transpose coefficients(phase 2)
+ punpcklwd mm1, mm2 ; mm1=(04 05 06 07 14 15 16 17)
+ punpckhwd mm5, mm2 ; mm5=(24 25 26 27 34 35 36 37)
+
+ movq mm3, mm0 ; transpose coefficients(phase 3)
+ punpckldq mm0, mm1 ; mm0=(00 01 02 03 04 05 06 07)
+ punpckhdq mm3, mm1 ; mm3=(10 11 12 13 14 15 16 17)
+ movq mm4, mm7 ; transpose coefficients(phase 3)
+ punpckldq mm7, mm5 ; mm7=(20 21 22 23 24 25 26 27)
+ punpckhdq mm4, mm5 ; mm4=(30 31 32 33 34 35 36 37)
+
+ pushpic ebx ; save GOT address
+
+ mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+ mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+ movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm0
+ movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm3
+ mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+ mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+ movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm7
+ movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm4
+
+ poppic ebx ; restore GOT address
+
+ add esi, byte 4*SIZEOF_JCOEF ; wsptr
+ add edi, byte 4*SIZEOF_JSAMPROW
+ dec ecx ; ctr
+ jnz near .rowloop
+
+ emms ; empty MMX state
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jidctint-sse2.asm b/media/libjpeg/simd/i386/jidctint-sse2.asm
new file mode 100644
index 0000000000..43e320189b
--- /dev/null
+++ b/media/libjpeg/simd/i386/jidctint-sse2.asm
@@ -0,0 +1,858 @@
+;
+; jidctint.asm - accurate integer IDCT (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, 2020, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slower but more accurate integer implementation of the
+; inverse DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jidctint.c; see jidctint.c for
+; more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 13
+%define PASS1_BITS 2
+
+%define DESCALE_P1 (CONST_BITS - PASS1_BITS)
+%define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3)
+
+%if CONST_BITS == 13
+F_0_298 equ 2446 ; FIX(0.298631336)
+F_0_390 equ 3196 ; FIX(0.390180644)
+F_0_541 equ 4433 ; FIX(0.541196100)
+F_0_765 equ 6270 ; FIX(0.765366865)
+F_0_899 equ 7373 ; FIX(0.899976223)
+F_1_175 equ 9633 ; FIX(1.175875602)
+F_1_501 equ 12299 ; FIX(1.501321110)
+F_1_847 equ 15137 ; FIX(1.847759065)
+F_1_961 equ 16069 ; FIX(1.961570560)
+F_2_053 equ 16819 ; FIX(2.053119869)
+F_2_562 equ 20995 ; FIX(2.562915447)
+F_3_072 equ 25172 ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
+F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS) ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS) ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS) ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS) ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS) ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS) ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_idct_islow_sse2)
+
+EXTN(jconst_idct_islow_sse2):
+
+PW_F130_F054 times 4 dw (F_0_541 + F_0_765), F_0_541
+PW_F054_MF130 times 4 dw F_0_541, (F_0_541 - F_1_847)
+PW_MF078_F117 times 4 dw (F_1_175 - F_1_961), F_1_175
+PW_F117_F078 times 4 dw F_1_175, (F_1_175 - F_0_390)
+PW_MF060_MF089 times 4 dw (F_0_298 - F_0_899), -F_0_899
+PW_MF089_F060 times 4 dw -F_0_899, (F_1_501 - F_0_899)
+PW_MF050_MF256 times 4 dw (F_2_053 - F_2_562), -F_2_562
+PW_MF256_F050 times 4 dw -F_2_562, (F_3_072 - F_2_562)
+PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1 - 1)
+PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2 - 1)
+PB_CENTERJSAMP times 16 db CENTERJSAMPLE
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_islow_sse2(void *dct_table, JCOEFPTR coef_block,
+; JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b) (b) + 8 ; jpeg_component_info *compptr
+%define coef_block(b) (b) + 12 ; JCOEFPTR coef_block
+%define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf
+%define output_col(b) (b) + 20 ; JDIMENSION output_col
+
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+ ; xmmword wk[WK_NUM]
+%define WK_NUM 12
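+ ; (wk(0)-wk(7) hold the even-part intermediates within each pass;
+ ; wk(8)-wk(11) double as odd-part scratch and carry col1/col3/col5/col7
+ ; from pass 1 into pass 2.)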
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_idct_islow_sse2)
+
+EXTN(jsimd_idct_islow_sse2):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)]
+ pushpic ebx
+; push ecx ; unused
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process columns from input.
+
+; mov eax, [original_ebp]
+ mov edx, POINTER [dct_table(eax)] ; quantptr
+ mov esi, JCOEFPTR [coef_block(eax)] ; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2
+ mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ jnz near .columnDCT
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ por xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ por xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ por xmm1, xmm0
+ packsswb xmm1, xmm1
+ packsswb xmm1, xmm1
+ movd eax, xmm1
+ test eax, eax
+ jnz short .columnDCT
+
+ ; -- AC terms all zero
+
+ movdqa xmm5, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm5, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ psllw xmm5, PASS1_BITS
+
+ movdqa xmm4, xmm5 ; xmm5=in0=(00 01 02 03 04 05 06 07)
+ punpcklwd xmm5, xmm5 ; xmm5=(00 00 01 01 02 02 03 03)
+ punpckhwd xmm4, xmm4 ; xmm4=(04 04 05 05 06 06 07 07)
+
+ pshufd xmm7, xmm5, 0x00 ; xmm7=col0=(00 00 00 00 00 00 00 00)
+ pshufd xmm6, xmm5, 0x55 ; xmm6=col1=(01 01 01 01 01 01 01 01)
+ pshufd xmm1, xmm5, 0xAA ; xmm1=col2=(02 02 02 02 02 02 02 02)
+ pshufd xmm5, xmm5, 0xFF ; xmm5=col3=(03 03 03 03 03 03 03 03)
+ pshufd xmm0, xmm4, 0x00 ; xmm0=col4=(04 04 04 04 04 04 04 04)
+ pshufd xmm3, xmm4, 0x55 ; xmm3=col5=(05 05 05 05 05 05 05 05)
+ pshufd xmm2, xmm4, 0xAA ; xmm2=col6=(06 06 06 06 06 06 06 06)
+ pshufd xmm4, xmm4, 0xFF ; xmm4=col7=(07 07 07 07 07 07 07 07)
+
+ movdqa XMMWORD [wk(8)], xmm6 ; wk(8)=col1
+ movdqa XMMWORD [wk(9)], xmm5 ; wk(9)=col3
+ movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5
+ movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7
+ jmp near .column_end
+ alignx 16, 7
+%endif
+.columnDCT:
+
+ ; -- Even part
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ movdqa xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ ; (Original)
+ ; z1 = (z2 + z3) * 0.541196100;
+ ; tmp2 = z1 + z3 * -1.847759065;
+ ; tmp3 = z1 + z2 * 0.765366865;
+ ;
+ ; (This implementation)
+ ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+ ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+ movdqa xmm4, xmm1 ; xmm1=in2=z2
+ movdqa xmm5, xmm1
+ punpcklwd xmm4, xmm3 ; xmm3=in6=z3
+ punpckhwd xmm5, xmm3
+ movdqa xmm1, xmm4
+ movdqa xmm3, xmm5
+ pmaddwd xmm4, [GOTOFF(ebx,PW_F130_F054)] ; xmm4=tmp3L
+ pmaddwd xmm5, [GOTOFF(ebx,PW_F130_F054)] ; xmm5=tmp3H
+ pmaddwd xmm1, [GOTOFF(ebx,PW_F054_MF130)] ; xmm1=tmp2L
+ pmaddwd xmm3, [GOTOFF(ebx,PW_F054_MF130)] ; xmm3=tmp2H
+
+ movdqa xmm6, xmm0
+ paddw xmm0, xmm2 ; xmm0=in0+in4
+ psubw xmm6, xmm2 ; xmm6=in0-in4
+
+ pxor xmm7, xmm7
+ pxor xmm2, xmm2
+ punpcklwd xmm7, xmm0 ; xmm7=tmp0L
+ punpckhwd xmm2, xmm0 ; xmm2=tmp0H
+ psrad xmm7, (16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS
+ psrad xmm2, (16-CONST_BITS) ; psrad xmm2,16 & pslld xmm2,CONST_BITS
+
+ movdqa xmm0, xmm7
+ paddd xmm7, xmm4 ; xmm7=tmp10L
+ psubd xmm0, xmm4 ; xmm0=tmp13L
+ movdqa xmm4, xmm2
+ paddd xmm2, xmm5 ; xmm2=tmp10H
+ psubd xmm4, xmm5 ; xmm4=tmp13H
+
+ movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=tmp10L
+ movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=tmp10H
+ movdqa XMMWORD [wk(2)], xmm0 ; wk(2)=tmp13L
+ movdqa XMMWORD [wk(3)], xmm4 ; wk(3)=tmp13H
+
+ pxor xmm5, xmm5
+ pxor xmm7, xmm7
+ punpcklwd xmm5, xmm6 ; xmm5=tmp1L
+ punpckhwd xmm7, xmm6 ; xmm7=tmp1H
+ psrad xmm5, (16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS
+ psrad xmm7, (16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS
+
+ movdqa xmm2, xmm5
+ paddd xmm5, xmm1 ; xmm5=tmp11L
+ psubd xmm2, xmm1 ; xmm2=tmp12L
+ movdqa xmm0, xmm7
+ paddd xmm7, xmm3 ; xmm7=tmp11H
+ psubd xmm0, xmm3 ; xmm0=tmp12H
+
+ movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L
+ movdqa XMMWORD [wk(5)], xmm7 ; wk(5)=tmp11H
+ movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=tmp12L
+ movdqa XMMWORD [wk(7)], xmm0 ; wk(7)=tmp12H
+
+ ; -- Odd part
+
+ movdqa xmm4, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm6, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm4, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm6, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm1, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ movdqa xmm5, xmm6
+ movdqa xmm7, xmm4
+ paddw xmm5, xmm3 ; xmm5=z3
+ paddw xmm7, xmm1 ; xmm7=z4
+
+ ; (Original)
+ ; z5 = (z3 + z4) * 1.175875602;
+ ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ ; z3 += z5; z4 += z5;
+ ;
+ ; (This implementation)
+ ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+ movdqa xmm2, xmm5
+ movdqa xmm0, xmm5
+ punpcklwd xmm2, xmm7
+ punpckhwd xmm0, xmm7
+ movdqa xmm5, xmm2
+ movdqa xmm7, xmm0
+ pmaddwd xmm2, [GOTOFF(ebx,PW_MF078_F117)] ; xmm2=z3L
+ pmaddwd xmm0, [GOTOFF(ebx,PW_MF078_F117)] ; xmm0=z3H
+ pmaddwd xmm5, [GOTOFF(ebx,PW_F117_F078)] ; xmm5=z4L
+ pmaddwd xmm7, [GOTOFF(ebx,PW_F117_F078)] ; xmm7=z4H
+
+ movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=z3L
+ movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=z3H
+
+ ; (Original)
+ ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
+ ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
+ ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
+ ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ ; tmp0 += z1 + z3; tmp1 += z2 + z4;
+ ; tmp2 += z2 + z3; tmp3 += z1 + z4;
+ ;
+ ; (This implementation)
+ ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+ ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+ ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+ ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+ ; tmp0 += z3; tmp1 += z4;
+ ; tmp2 += z3; tmp3 += z4;
+
+ movdqa xmm2, xmm3
+ movdqa xmm0, xmm3
+ punpcklwd xmm2, xmm4
+ punpckhwd xmm0, xmm4
+ movdqa xmm3, xmm2
+ movdqa xmm4, xmm0
+ pmaddwd xmm2, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm2=tmp0L
+ pmaddwd xmm0, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm0=tmp0H
+ pmaddwd xmm3, [GOTOFF(ebx,PW_MF089_F060)] ; xmm3=tmp3L
+ pmaddwd xmm4, [GOTOFF(ebx,PW_MF089_F060)] ; xmm4=tmp3H
+
+ paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp0L
+ paddd xmm0, XMMWORD [wk(11)] ; xmm0=tmp0H
+ paddd xmm3, xmm5 ; xmm3=tmp3L
+ paddd xmm4, xmm7 ; xmm4=tmp3H
+
+ movdqa XMMWORD [wk(8)], xmm2 ; wk(8)=tmp0L
+ movdqa XMMWORD [wk(9)], xmm0 ; wk(9)=tmp0H
+
+ movdqa xmm2, xmm1
+ movdqa xmm0, xmm1
+ punpcklwd xmm2, xmm6
+ punpckhwd xmm0, xmm6
+ movdqa xmm1, xmm2
+ movdqa xmm6, xmm0
+ pmaddwd xmm2, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm2=tmp1L
+ pmaddwd xmm0, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm0=tmp1H
+ pmaddwd xmm1, [GOTOFF(ebx,PW_MF256_F050)] ; xmm1=tmp2L
+ pmaddwd xmm6, [GOTOFF(ebx,PW_MF256_F050)] ; xmm6=tmp2H
+
+ paddd xmm2, xmm5 ; xmm2=tmp1L
+ paddd xmm0, xmm7 ; xmm0=tmp1H
+ paddd xmm1, XMMWORD [wk(10)] ; xmm1=tmp2L
+ paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H
+
+ movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=tmp1L
+ movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=tmp1H
+
+ ; -- Final output stage
+
+ movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L
+ movdqa xmm7, XMMWORD [wk(1)] ; xmm7=tmp10H
+
+ movdqa xmm2, xmm5
+ movdqa xmm0, xmm7
+ paddd xmm5, xmm3 ; xmm5=data0L
+ paddd xmm7, xmm4 ; xmm7=data0H
+ psubd xmm2, xmm3 ; xmm2=data7L
+ psubd xmm0, xmm4 ; xmm0=data7H
+
+ movdqa xmm3, [GOTOFF(ebx,PD_DESCALE_P1)] ; xmm3=[PD_DESCALE_P1]
+
+ paddd xmm5, xmm3
+ paddd xmm7, xmm3
+ psrad xmm5, DESCALE_P1
+ psrad xmm7, DESCALE_P1
+ paddd xmm2, xmm3
+ paddd xmm0, xmm3
+ psrad xmm2, DESCALE_P1
+ psrad xmm0, DESCALE_P1
+
+ packssdw xmm5, xmm7 ; xmm5=data0=(00 01 02 03 04 05 06 07)
+ packssdw xmm2, xmm0 ; xmm2=data7=(70 71 72 73 74 75 76 77)
+
+ movdqa xmm4, XMMWORD [wk(4)] ; xmm4=tmp11L
+ movdqa xmm3, XMMWORD [wk(5)] ; xmm3=tmp11H
+
+ movdqa xmm7, xmm4
+ movdqa xmm0, xmm3
+ paddd xmm4, xmm1 ; xmm4=data1L
+ paddd xmm3, xmm6 ; xmm3=data1H
+ psubd xmm7, xmm1 ; xmm7=data6L
+ psubd xmm0, xmm6 ; xmm0=data6H
+
+ movdqa xmm1, [GOTOFF(ebx,PD_DESCALE_P1)] ; xmm1=[PD_DESCALE_P1]
+
+ paddd xmm4, xmm1
+ paddd xmm3, xmm1
+ psrad xmm4, DESCALE_P1
+ psrad xmm3, DESCALE_P1
+ paddd xmm7, xmm1
+ paddd xmm0, xmm1
+ psrad xmm7, DESCALE_P1
+ psrad xmm0, DESCALE_P1
+
+ packssdw xmm4, xmm3 ; xmm4=data1=(10 11 12 13 14 15 16 17)
+ packssdw xmm7, xmm0 ; xmm7=data6=(60 61 62 63 64 65 66 67)
+
+ movdqa xmm6, xmm5 ; transpose coefficients(phase 1)
+ punpcklwd xmm5, xmm4 ; xmm5=(00 10 01 11 02 12 03 13)
+ punpckhwd xmm6, xmm4 ; xmm6=(04 14 05 15 06 16 07 17)
+ movdqa xmm1, xmm7 ; transpose coefficients(phase 1)
+ punpcklwd xmm7, xmm2 ; xmm7=(60 70 61 71 62 72 63 73)
+ punpckhwd xmm1, xmm2 ; xmm1=(64 74 65 75 66 76 67 77)
+
+ movdqa xmm3, XMMWORD [wk(6)] ; xmm3=tmp12L
+ movdqa xmm0, XMMWORD [wk(7)] ; xmm0=tmp12H
+ movdqa xmm4, XMMWORD [wk(10)] ; xmm4=tmp1L
+ movdqa xmm2, XMMWORD [wk(11)] ; xmm2=tmp1H
+
+ movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 01 11 02 12 03 13)
+ movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=(04 14 05 15 06 16 07 17)
+ movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=(60 70 61 71 62 72 63 73)
+ movdqa XMMWORD [wk(5)], xmm1 ; wk(5)=(64 74 65 75 66 76 67 77)
+
+ movdqa xmm5, xmm3
+ movdqa xmm6, xmm0
+ paddd xmm3, xmm4 ; xmm3=data2L
+ paddd xmm0, xmm2 ; xmm0=data2H
+ psubd xmm5, xmm4 ; xmm5=data5L
+ psubd xmm6, xmm2 ; xmm6=data5H
+
+ movdqa xmm7, [GOTOFF(ebx,PD_DESCALE_P1)] ; xmm7=[PD_DESCALE_P1]
+
+ paddd xmm3, xmm7
+ paddd xmm0, xmm7
+ psrad xmm3, DESCALE_P1
+ psrad xmm0, DESCALE_P1
+ paddd xmm5, xmm7
+ paddd xmm6, xmm7
+ psrad xmm5, DESCALE_P1
+ psrad xmm6, DESCALE_P1
+
+ packssdw xmm3, xmm0 ; xmm3=data2=(20 21 22 23 24 25 26 27)
+ packssdw xmm5, xmm6 ; xmm5=data5=(50 51 52 53 54 55 56 57)
+
+ movdqa xmm1, XMMWORD [wk(2)] ; xmm1=tmp13L
+ movdqa xmm4, XMMWORD [wk(3)] ; xmm4=tmp13H
+ movdqa xmm2, XMMWORD [wk(8)] ; xmm2=tmp0L
+ movdqa xmm7, XMMWORD [wk(9)] ; xmm7=tmp0H
+
+ movdqa xmm0, xmm1
+ movdqa xmm6, xmm4
+ paddd xmm1, xmm2 ; xmm1=data3L
+ paddd xmm4, xmm7 ; xmm4=data3H
+ psubd xmm0, xmm2 ; xmm0=data4L
+ psubd xmm6, xmm7 ; xmm6=data4H
+
+ movdqa xmm2, [GOTOFF(ebx,PD_DESCALE_P1)] ; xmm2=[PD_DESCALE_P1]
+
+ paddd xmm1, xmm2
+ paddd xmm4, xmm2
+ psrad xmm1, DESCALE_P1
+ psrad xmm4, DESCALE_P1
+ paddd xmm0, xmm2
+ paddd xmm6, xmm2
+ psrad xmm0, DESCALE_P1
+ psrad xmm6, DESCALE_P1
+
+ packssdw xmm1, xmm4 ; xmm1=data3=(30 31 32 33 34 35 36 37)
+ packssdw xmm0, xmm6 ; xmm0=data4=(40 41 42 43 44 45 46 47)
+
+ movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 01 11 02 12 03 13)
+ movdqa xmm2, XMMWORD [wk(1)] ; xmm2=(04 14 05 15 06 16 07 17)
+
+ movdqa xmm4, xmm3 ; transpose coefficients(phase 1)
+ punpcklwd xmm3, xmm1 ; xmm3=(20 30 21 31 22 32 23 33)
+ punpckhwd xmm4, xmm1 ; xmm4=(24 34 25 35 26 36 27 37)
+ movdqa xmm6, xmm0 ; transpose coefficients(phase 1)
+ punpcklwd xmm0, xmm5 ; xmm0=(40 50 41 51 42 52 43 53)
+ punpckhwd xmm6, xmm5 ; xmm6=(44 54 45 55 46 56 47 57)
+
+ movdqa xmm1, xmm7 ; transpose coefficients(phase 2)
+ punpckldq xmm7, xmm3 ; xmm7=(00 10 20 30 01 11 21 31)
+ punpckhdq xmm1, xmm3 ; xmm1=(02 12 22 32 03 13 23 33)
+ movdqa xmm5, xmm2 ; transpose coefficients(phase 2)
+ punpckldq xmm2, xmm4 ; xmm2=(04 14 24 34 05 15 25 35)
+ punpckhdq xmm5, xmm4 ; xmm5=(06 16 26 36 07 17 27 37)
+
+ movdqa xmm3, XMMWORD [wk(4)] ; xmm3=(60 70 61 71 62 72 63 73)
+ movdqa xmm4, XMMWORD [wk(5)] ; xmm4=(64 74 65 75 66 76 67 77)
+
+ movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=(04 14 24 34 05 15 25 35)
+ movdqa XMMWORD [wk(7)], xmm5 ; wk(7)=(06 16 26 36 07 17 27 37)
+
+ movdqa xmm2, xmm0 ; transpose coefficients(phase 2)
+ punpckldq xmm0, xmm3 ; xmm0=(40 50 60 70 41 51 61 71)
+ punpckhdq xmm2, xmm3 ; xmm2=(42 52 62 72 43 53 63 73)
+ movdqa xmm5, xmm6 ; transpose coefficients(phase 2)
+ punpckldq xmm6, xmm4 ; xmm6=(44 54 64 74 45 55 65 75)
+ punpckhdq xmm5, xmm4 ; xmm5=(46 56 66 76 47 57 67 77)
+
+ movdqa xmm3, xmm7 ; transpose coefficients(phase 3)
+ punpcklqdq xmm7, xmm0 ; xmm7=col0=(00 10 20 30 40 50 60 70)
+ punpckhqdq xmm3, xmm0 ; xmm3=col1=(01 11 21 31 41 51 61 71)
+ movdqa xmm4, xmm1 ; transpose coefficients(phase 3)
+ punpcklqdq xmm1, xmm2 ; xmm1=col2=(02 12 22 32 42 52 62 72)
+ punpckhqdq xmm4, xmm2 ; xmm4=col3=(03 13 23 33 43 53 63 73)
+
+ movdqa xmm0, XMMWORD [wk(6)] ; xmm0=(04 14 24 34 05 15 25 35)
+ movdqa xmm2, XMMWORD [wk(7)] ; xmm2=(06 16 26 36 07 17 27 37)
+
+ movdqa XMMWORD [wk(8)], xmm3 ; wk(8)=col1
+ movdqa XMMWORD [wk(9)], xmm4 ; wk(9)=col3
+
+ movdqa xmm3, xmm0 ; transpose coefficients(phase 3)
+ punpcklqdq xmm0, xmm6 ; xmm0=col4=(04 14 24 34 44 54 64 74)
+ punpckhqdq xmm3, xmm6 ; xmm3=col5=(05 15 25 35 45 55 65 75)
+ movdqa xmm4, xmm2 ; transpose coefficients(phase 3)
+ punpcklqdq xmm2, xmm5 ; xmm2=col6=(06 16 26 36 46 56 66 76)
+ punpckhqdq xmm4, xmm5 ; xmm4=col7=(07 17 27 37 47 57 67 77)
+
+ movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5
+ movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7
+.column_end:
+
+ ; -- Prefetch the next coefficient block
+
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+ ; ---- Pass 2: process rows from work array, store into output array.
+
+ mov eax, [original_ebp]
+ mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
+ mov eax, JDIMENSION [output_col(eax)]
+
+ ; -- Even part
+
+ ; xmm7=col0, xmm1=col2, xmm0=col4, xmm2=col6
+
+ ; (Original)
+ ; z1 = (z2 + z3) * 0.541196100;
+ ; tmp2 = z1 + z3 * -1.847759065;
+ ; tmp3 = z1 + z2 * 0.765366865;
+ ;
+ ; (This implementation)
+ ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+ ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+ movdqa xmm6, xmm1 ; xmm1=in2=z2
+ movdqa xmm5, xmm1
+ punpcklwd xmm6, xmm2 ; xmm2=in6=z3
+ punpckhwd xmm5, xmm2
+ movdqa xmm1, xmm6
+ movdqa xmm2, xmm5
+ pmaddwd xmm6, [GOTOFF(ebx,PW_F130_F054)] ; xmm6=tmp3L
+ pmaddwd xmm5, [GOTOFF(ebx,PW_F130_F054)] ; xmm5=tmp3H
+ pmaddwd xmm1, [GOTOFF(ebx,PW_F054_MF130)] ; xmm1=tmp2L
+ pmaddwd xmm2, [GOTOFF(ebx,PW_F054_MF130)] ; xmm2=tmp2H
+
+ movdqa xmm3, xmm7
+ paddw xmm7, xmm0 ; xmm7=in0+in4
+ psubw xmm3, xmm0 ; xmm3=in0-in4
+
+ pxor xmm4, xmm4
+ pxor xmm0, xmm0
+ punpcklwd xmm4, xmm7 ; xmm4=tmp0L
+ punpckhwd xmm0, xmm7 ; xmm0=tmp0H
+ psrad xmm4, (16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS
+ psrad xmm0, (16-CONST_BITS) ; psrad xmm0,16 & pslld xmm0,CONST_BITS
+
+ movdqa xmm7, xmm4
+ paddd xmm4, xmm6 ; xmm4=tmp10L
+ psubd xmm7, xmm6 ; xmm7=tmp13L
+ movdqa xmm6, xmm0
+ paddd xmm0, xmm5 ; xmm0=tmp10H
+ psubd xmm6, xmm5 ; xmm6=tmp13H
+
+ movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=tmp10L
+ movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp10H
+ movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=tmp13L
+ movdqa XMMWORD [wk(3)], xmm6 ; wk(3)=tmp13H
+
+ pxor xmm5, xmm5
+ pxor xmm4, xmm4
+ punpcklwd xmm5, xmm3 ; xmm5=tmp1L
+ punpckhwd xmm4, xmm3 ; xmm4=tmp1H
+ psrad xmm5, (16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS
+ psrad xmm4, (16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS
+
+ movdqa xmm0, xmm5
+ paddd xmm5, xmm1 ; xmm5=tmp11L
+ psubd xmm0, xmm1 ; xmm0=tmp12L
+ movdqa xmm7, xmm4
+ paddd xmm4, xmm2 ; xmm4=tmp11H
+ psubd xmm7, xmm2 ; xmm7=tmp12H
+
+ movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L
+ movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=tmp11H
+ movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=tmp12L
+ movdqa XMMWORD [wk(7)], xmm7 ; wk(7)=tmp12H
+
+ ; -- Odd part
+
+ movdqa xmm6, XMMWORD [wk(9)] ; xmm6=col3
+ movdqa xmm3, XMMWORD [wk(8)] ; xmm3=col1
+ movdqa xmm1, XMMWORD [wk(11)] ; xmm1=col7
+ movdqa xmm2, XMMWORD [wk(10)] ; xmm2=col5
+
+ movdqa xmm5, xmm6
+ movdqa xmm4, xmm3
+ paddw xmm5, xmm1 ; xmm5=z3
+ paddw xmm4, xmm2 ; xmm4=z4
+
+ ; (Original)
+ ; z5 = (z3 + z4) * 1.175875602;
+ ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ ; z3 += z5; z4 += z5;
+ ;
+ ; (This implementation)
+ ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+ movdqa xmm0, xmm5
+ movdqa xmm7, xmm5
+ punpcklwd xmm0, xmm4
+ punpckhwd xmm7, xmm4
+ movdqa xmm5, xmm0
+ movdqa xmm4, xmm7
+ pmaddwd xmm0, [GOTOFF(ebx,PW_MF078_F117)] ; xmm0=z3L
+ pmaddwd xmm7, [GOTOFF(ebx,PW_MF078_F117)] ; xmm7=z3H
+ pmaddwd xmm5, [GOTOFF(ebx,PW_F117_F078)] ; xmm5=z4L
+ pmaddwd xmm4, [GOTOFF(ebx,PW_F117_F078)] ; xmm4=z4H
+
+ movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=z3L
+ movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=z3H
+
+ ; (Original)
+ ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
+ ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
+ ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
+ ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ ; tmp0 += z1 + z3; tmp1 += z2 + z4;
+ ; tmp2 += z2 + z3; tmp3 += z1 + z4;
+ ;
+ ; (This implementation)
+ ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+ ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+ ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+ ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+ ; tmp0 += z3; tmp1 += z4;
+ ; tmp2 += z3; tmp3 += z4;
+
+ movdqa xmm0, xmm1
+ movdqa xmm7, xmm1
+ punpcklwd xmm0, xmm3
+ punpckhwd xmm7, xmm3
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm7
+ pmaddwd xmm0, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm0=tmp0L
+ pmaddwd xmm7, [GOTOFF(ebx,PW_MF060_MF089)] ; xmm7=tmp0H
+ pmaddwd xmm1, [GOTOFF(ebx,PW_MF089_F060)] ; xmm1=tmp3L
+ pmaddwd xmm3, [GOTOFF(ebx,PW_MF089_F060)] ; xmm3=tmp3H
+
+ paddd xmm0, XMMWORD [wk(10)] ; xmm0=tmp0L
+ paddd xmm7, XMMWORD [wk(11)] ; xmm7=tmp0H
+ paddd xmm1, xmm5 ; xmm1=tmp3L
+ paddd xmm3, xmm4 ; xmm3=tmp3H
+
+ movdqa XMMWORD [wk(8)], xmm0 ; wk(8)=tmp0L
+ movdqa XMMWORD [wk(9)], xmm7 ; wk(9)=tmp0H
+
+ movdqa xmm0, xmm2
+ movdqa xmm7, xmm2
+ punpcklwd xmm0, xmm6
+ punpckhwd xmm7, xmm6
+ movdqa xmm2, xmm0
+ movdqa xmm6, xmm7
+ pmaddwd xmm0, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm0=tmp1L
+ pmaddwd xmm7, [GOTOFF(ebx,PW_MF050_MF256)] ; xmm7=tmp1H
+ pmaddwd xmm2, [GOTOFF(ebx,PW_MF256_F050)] ; xmm2=tmp2L
+ pmaddwd xmm6, [GOTOFF(ebx,PW_MF256_F050)] ; xmm6=tmp2H
+
+ paddd xmm0, xmm5 ; xmm0=tmp1L
+ paddd xmm7, xmm4 ; xmm7=tmp1H
+ paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp2L
+ paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H
+
+ movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=tmp1L
+ movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=tmp1H
+
+ ; -- Final output stage
+
+ movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L
+ movdqa xmm4, XMMWORD [wk(1)] ; xmm4=tmp10H
+
+ movdqa xmm0, xmm5
+ movdqa xmm7, xmm4
+ paddd xmm5, xmm1 ; xmm5=data0L
+ paddd xmm4, xmm3 ; xmm4=data0H
+ psubd xmm0, xmm1 ; xmm0=data7L
+ psubd xmm7, xmm3 ; xmm7=data7H
+
+ movdqa xmm1, [GOTOFF(ebx,PD_DESCALE_P2)] ; xmm1=[PD_DESCALE_P2]
+
+ paddd xmm5, xmm1
+ paddd xmm4, xmm1
+ psrad xmm5, DESCALE_P2
+ psrad xmm4, DESCALE_P2
+ paddd xmm0, xmm1
+ paddd xmm7, xmm1
+ psrad xmm0, DESCALE_P2
+ psrad xmm7, DESCALE_P2
+
+ packssdw xmm5, xmm4 ; xmm5=data0=(00 10 20 30 40 50 60 70)
+ packssdw xmm0, xmm7 ; xmm0=data7=(07 17 27 37 47 57 67 77)
+
+ movdqa xmm3, XMMWORD [wk(4)] ; xmm3=tmp11L
+ movdqa xmm1, XMMWORD [wk(5)] ; xmm1=tmp11H
+
+ movdqa xmm4, xmm3
+ movdqa xmm7, xmm1
+ paddd xmm3, xmm2 ; xmm3=data1L
+ paddd xmm1, xmm6 ; xmm1=data1H
+ psubd xmm4, xmm2 ; xmm4=data6L
+ psubd xmm7, xmm6 ; xmm7=data6H
+
+ movdqa xmm2, [GOTOFF(ebx,PD_DESCALE_P2)] ; xmm2=[PD_DESCALE_P2]
+
+ paddd xmm3, xmm2
+ paddd xmm1, xmm2
+ psrad xmm3, DESCALE_P2
+ psrad xmm1, DESCALE_P2
+ paddd xmm4, xmm2
+ paddd xmm7, xmm2
+ psrad xmm4, DESCALE_P2
+ psrad xmm7, DESCALE_P2
+
+ packssdw xmm3, xmm1 ; xmm3=data1=(01 11 21 31 41 51 61 71)
+ packssdw xmm4, xmm7 ; xmm4=data6=(06 16 26 36 46 56 66 76)
+
+ packsswb xmm5, xmm4 ; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+ packsswb xmm3, xmm0 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+ movdqa xmm6, XMMWORD [wk(6)] ; xmm6=tmp12L
+ movdqa xmm2, XMMWORD [wk(7)] ; xmm2=tmp12H
+ movdqa xmm1, XMMWORD [wk(10)] ; xmm1=tmp1L
+ movdqa xmm7, XMMWORD [wk(11)] ; xmm7=tmp1H
+
+ movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+ movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+ movdqa xmm4, xmm6
+ movdqa xmm0, xmm2
+ paddd xmm6, xmm1 ; xmm6=data2L
+ paddd xmm2, xmm7 ; xmm2=data2H
+ psubd xmm4, xmm1 ; xmm4=data5L
+ psubd xmm0, xmm7 ; xmm0=data5H
+
+ movdqa xmm5, [GOTOFF(ebx,PD_DESCALE_P2)] ; xmm5=[PD_DESCALE_P2]
+
+ paddd xmm6, xmm5
+ paddd xmm2, xmm5
+ psrad xmm6, DESCALE_P2
+ psrad xmm2, DESCALE_P2
+ paddd xmm4, xmm5
+ paddd xmm0, xmm5
+ psrad xmm4, DESCALE_P2
+ psrad xmm0, DESCALE_P2
+
+ packssdw xmm6, xmm2 ; xmm6=data2=(02 12 22 32 42 52 62 72)
+ packssdw xmm4, xmm0 ; xmm4=data5=(05 15 25 35 45 55 65 75)
+
+ movdqa xmm3, XMMWORD [wk(2)] ; xmm3=tmp13L
+ movdqa xmm1, XMMWORD [wk(3)] ; xmm1=tmp13H
+ movdqa xmm7, XMMWORD [wk(8)] ; xmm7=tmp0L
+ movdqa xmm5, XMMWORD [wk(9)] ; xmm5=tmp0H
+
+ movdqa xmm2, xmm3
+ movdqa xmm0, xmm1
+ paddd xmm3, xmm7 ; xmm3=data3L
+ paddd xmm1, xmm5 ; xmm1=data3H
+ psubd xmm2, xmm7 ; xmm2=data4L
+ psubd xmm0, xmm5 ; xmm0=data4H
+
+ movdqa xmm7, [GOTOFF(ebx,PD_DESCALE_P2)] ; xmm7=[PD_DESCALE_P2]
+
+ paddd xmm3, xmm7
+ paddd xmm1, xmm7
+ psrad xmm3, DESCALE_P2
+ psrad xmm1, DESCALE_P2
+ paddd xmm2, xmm7
+ paddd xmm0, xmm7
+ psrad xmm2, DESCALE_P2
+ psrad xmm0, DESCALE_P2
+
+ movdqa xmm5, [GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm5=[PB_CENTERJSAMP]
+
+ packssdw xmm3, xmm1 ; xmm3=data3=(03 13 23 33 43 53 63 73)
+ packssdw xmm2, xmm0 ; xmm2=data4=(04 14 24 34 44 54 64 74)
+
+ movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+ movdqa xmm1, XMMWORD [wk(1)] ; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+ packsswb xmm6, xmm2 ; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
+ packsswb xmm3, xmm4 ; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
+
+ paddb xmm7, xmm5
+ paddb xmm1, xmm5
+ paddb xmm6, xmm5
+ paddb xmm3, xmm5
+
+ movdqa xmm0, xmm7 ; transpose coefficients(phase 1)
+ punpcklbw xmm7, xmm1 ; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
+ punpckhbw xmm0, xmm1 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
+ movdqa xmm2, xmm6 ; transpose coefficients(phase 1)
+ punpcklbw xmm6, xmm3 ; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
+ punpckhbw xmm2, xmm3 ; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
+
+ movdqa xmm4, xmm7 ; transpose coefficients(phase 2)
+ punpcklwd xmm7, xmm6 ; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+ punpckhwd xmm4, xmm6 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
+ movdqa xmm5, xmm2 ; transpose coefficients(phase 2)
+ punpcklwd xmm2, xmm0 ; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+ punpckhwd xmm5, xmm0 ; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
+
+ movdqa xmm1, xmm7 ; transpose coefficients(phase 3)
+ punpckldq xmm7, xmm2 ; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+ punpckhdq xmm1, xmm2 ; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+ movdqa xmm3, xmm4 ; transpose coefficients(phase 3)
+ punpckldq xmm4, xmm5 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
+ punpckhdq xmm3, xmm5 ; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
+
+ pshufd xmm6, xmm7, 0x4E ; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+ pshufd xmm0, xmm1, 0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+ pshufd xmm2, xmm4, 0x4E ; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
+ pshufd xmm5, xmm3, 0x4E ; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
+
+ mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+ mov esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm7
+ movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm1
+ mov edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW]
+ mov esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
+ movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
+
+ mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+ mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
+ movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0
+ mov edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW]
+ mov esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm2
+ movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm5
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; unused
+ poppic ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jidctred-mmx.asm b/media/libjpeg/simd/i386/jidctred-mmx.asm
new file mode 100644
index 0000000000..e2307e1cb6
--- /dev/null
+++ b/media/libjpeg/simd/i386/jidctred-mmx.asm
@@ -0,0 +1,704 @@
+;
+; jidctred.asm - reduced-size IDCT (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains inverse-DCT routines that produce reduced-size
+; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
+; The following code is based directly on the IJG's original jidctred.c;
+; see jidctred.c for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 13
+%define PASS1_BITS 2
+
+%define DESCALE_P1_4 (CONST_BITS - PASS1_BITS + 1)
+%define DESCALE_P2_4 (CONST_BITS + PASS1_BITS + 3 + 1)
+%define DESCALE_P1_2 (CONST_BITS - PASS1_BITS + 2)
+%define DESCALE_P2_2 (CONST_BITS + PASS1_BITS + 3 + 2)
+
+%if CONST_BITS == 13
+F_0_211 equ 1730 ; FIX(0.211164243)
+F_0_509 equ 4176 ; FIX(0.509795579)
+F_0_601 equ 4926 ; FIX(0.601344887)
+F_0_720 equ 5906 ; FIX(0.720959822)
+F_0_765 equ 6270 ; FIX(0.765366865)
+F_0_850 equ 6967 ; FIX(0.850430095)
+F_0_899 equ 7373 ; FIX(0.899976223)
+F_1_061 equ 8697 ; FIX(1.061594337)
+F_1_272 equ 10426 ; FIX(1.272758580)
+F_1_451 equ 11893 ; FIX(1.451774981)
+F_1_847 equ 15137 ; FIX(1.847759065)
+F_2_172 equ 17799 ; FIX(2.172734803)
+F_2_562 equ 20995 ; FIX(2.562915447)
+F_3_624 equ 29692 ; FIX(3.624509785)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
+F_0_211 equ DESCALE( 226735879, 30 - CONST_BITS) ; FIX(0.211164243)
+F_0_509 equ DESCALE( 547388834, 30 - CONST_BITS) ; FIX(0.509795579)
+F_0_601 equ DESCALE( 645689155, 30 - CONST_BITS) ; FIX(0.601344887)
+F_0_720 equ DESCALE( 774124714, 30 - CONST_BITS) ; FIX(0.720959822)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865)
+F_0_850 equ DESCALE( 913142361, 30 - CONST_BITS) ; FIX(0.850430095)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223)
+F_1_061 equ DESCALE(1139878239, 30 - CONST_BITS) ; FIX(1.061594337)
+F_1_272 equ DESCALE(1366614119, 30 - CONST_BITS) ; FIX(1.272758580)
+F_1_451 equ DESCALE(1558831516, 30 - CONST_BITS) ; FIX(1.451774981)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065)
+F_2_172 equ DESCALE(2332956230, 30 - CONST_BITS) ; FIX(2.172734803)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447)
+F_3_624 equ DESCALE(3891787747, 30 - CONST_BITS) ; FIX(3.624509785)
+%endif
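+
+; FIX(x) here denotes x in CONST_BITS fixed point, i.e. round(x * 2^13);
+; for example, 1.847759065 * 8192 = 15136.84..., which rounds to the
+; 15137 used for F_1_847 above.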
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_idct_red_mmx)
+
+EXTN(jconst_idct_red_mmx):
+
+PW_F184_MF076 times 2 dw F_1_847, -F_0_765
+PW_F256_F089 times 2 dw F_2_562, F_0_899
+PW_F106_MF217 times 2 dw F_1_061, -F_2_172
+PW_MF060_MF050 times 2 dw -F_0_601, -F_0_509
+PW_F145_MF021 times 2 dw F_1_451, -F_0_211
+PW_F362_MF127 times 2 dw F_3_624, -F_1_272
+PW_F085_MF072 times 2 dw F_0_850, -F_0_720
+PD_DESCALE_P1_4 times 2 dd 1 << (DESCALE_P1_4 - 1)
+PD_DESCALE_P2_4 times 2 dd 1 << (DESCALE_P2_4 - 1)
+PD_DESCALE_P1_2 times 2 dd 1 << (DESCALE_P1_2 - 1)
+PD_DESCALE_P2_2 times 2 dd 1 << (DESCALE_P2_2 - 1)
+PB_CENTERJSAMP times 8 db CENTERJSAMPLE
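+
+; (The PD_DESCALE_* entries above are the 2^(n-1) rounding biases added
+; before an arithmetic right shift by n, and PB_CENTERJSAMP re-centers
+; the signed results around CENTERJSAMPLE for the unsigned sample
+; output.)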
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 4x4 output block.
+;
+; GLOBAL(void)
+; jsimd_idct_4x4_mmx(void *dct_table, JCOEFPTR coef_block,
+; JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b) (b) + 8 ; void *dct_table
+%define coef_block(b) (b) + 12 ; JCOEFPTR coef_block
+%define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf
+%define output_col(b) (b) + 20 ; JDIMENSION output_col
+
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
+ ; mmword wk[WK_NUM]
+%define WK_NUM 2
+%define workspace wk(0) - DCTSIZE2 * SIZEOF_JCOEF
+ ; JCOEF workspace[DCTSIZE2]
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_idct_4x4_mmx)
+
+EXTN(jsimd_idct_4x4_mmx):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [workspace]
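+ ; (Frame note: eax still holds the pre-alignment esp; it is stashed at
+ ; [ebp] so the code can later reach the stack arguments via
+ ; [original_ebp], while esp now sits below the aligned wk[] slots and
+ ; the JCOEF workspace.)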
+ pushpic ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process columns from input, store into work array.
+
+; mov eax, [original_ebp]
+ mov edx, POINTER [dct_table(eax)] ; quantptr
+ mov esi, JCOEFPTR [coef_block(eax)] ; inptr
+ lea edi, [workspace] ; JCOEF *wsptr
+ mov ecx, DCTSIZE/4 ; ctr
+ alignx 16, 7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_4X4_MMX
+ mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ jnz short .columnDCT
+
+ movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ por mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ por mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ por mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+ por mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ por mm0, mm1
+ packsswb mm0, mm0
+ movd eax, mm0
+ test eax, eax
+ jnz short .columnDCT
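+ ; (Only AC rows 1, 2, 3, 5, 6, and 7 are tested above: row 0 is the DC
+ ; row handled by the shortcut below, and row 4 does not contribute to
+ ; the 4x4 reduced output, per the original jidctred.c.)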
+
+ ; -- AC terms all zero
+
+ movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ psllw mm0, PASS1_BITS
+
+ movq mm2, mm0 ; mm0=in0=(00 01 02 03)
+ punpcklwd mm0, mm0 ; mm0=(00 00 01 01)
+ punpckhwd mm2, mm2 ; mm2=(02 02 03 03)
+
+ movq mm1, mm0
+ punpckldq mm0, mm0 ; mm0=(00 00 00 00)
+ punpckhdq mm1, mm1 ; mm1=(01 01 01 01)
+ movq mm3, mm2
+ punpckldq mm2, mm2 ; mm2=(02 02 02 02)
+ punpckhdq mm3, mm3 ; mm3=(03 03 03 03)
+
+ movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
+ movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
+ movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
+ movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
+ jmp near .nextcolumn
+ alignx 16, 7
+%endif
+.columnDCT:
+
+ ; -- Odd part
+
+ movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movq mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ pmullw mm0, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ movq mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ pmullw mm2, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ movq mm4, mm0
+ movq mm5, mm0
+ punpcklwd mm4, mm1
+ punpckhwd mm5, mm1
+ movq mm0, mm4
+ movq mm1, mm5
+ pmaddwd mm4, [GOTOFF(ebx,PW_F256_F089)] ; mm4=(tmp2L)
+ pmaddwd mm5, [GOTOFF(ebx,PW_F256_F089)] ; mm5=(tmp2H)
+ pmaddwd mm0, [GOTOFF(ebx,PW_F106_MF217)] ; mm0=(tmp0L)
+ pmaddwd mm1, [GOTOFF(ebx,PW_F106_MF217)] ; mm1=(tmp0H)
+
+ movq mm6, mm2
+ movq mm7, mm2
+ punpcklwd mm6, mm3
+ punpckhwd mm7, mm3
+ movq mm2, mm6
+ movq mm3, mm7
+ pmaddwd mm6, [GOTOFF(ebx,PW_MF060_MF050)] ; mm6=(tmp2L)
+ pmaddwd mm7, [GOTOFF(ebx,PW_MF060_MF050)] ; mm7=(tmp2H)
+ pmaddwd mm2, [GOTOFF(ebx,PW_F145_MF021)] ; mm2=(tmp0L)
+ pmaddwd mm3, [GOTOFF(ebx,PW_F145_MF021)] ; mm3=(tmp0H)
+
+ paddd mm6, mm4 ; mm6=tmp2L
+ paddd mm7, mm5 ; mm7=tmp2H
+ paddd mm2, mm0 ; mm2=tmp0L
+ paddd mm3, mm1 ; mm3=tmp0H
+
+ movq MMWORD [wk(0)], mm2 ; wk(0)=tmp0L
+ movq MMWORD [wk(1)], mm3 ; wk(1)=tmp0H
+
+ ; -- Even part
+
+ movq mm4, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ movq mm5, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ movq mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+ pmullw mm4, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw mm5, MMWORD [MMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw mm0, MMWORD [MMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ pxor mm1, mm1
+ pxor mm2, mm2
+ punpcklwd mm1, mm4 ; mm1=tmp0L
+ punpckhwd mm2, mm4 ; mm2=tmp0H
+ psrad mm1, (16-CONST_BITS-1) ; psrad mm1,16 & pslld mm1,CONST_BITS+1
+ psrad mm2, (16-CONST_BITS-1) ; psrad mm2,16 & pslld mm2,CONST_BITS+1
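+ ; (The punpck*wd with a zeroed register puts each 16-bit coefficient in
+ ; the high word of a dword, so the single psrad above both sign-extends
+ ; it and scales it by 2^(CONST_BITS+1), i.e. the in0 << (CONST_BITS+1)
+ ; term of the scalar 4x4 algorithm.)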
+
+ movq mm3, mm5 ; mm5=in2=z2
+ punpcklwd mm5, mm0 ; mm0=in6=z3
+ punpckhwd mm3, mm0
+ pmaddwd mm5, [GOTOFF(ebx,PW_F184_MF076)] ; mm5=tmp2L
+ pmaddwd mm3, [GOTOFF(ebx,PW_F184_MF076)] ; mm3=tmp2H
+
+ movq mm4, mm1
+ movq mm0, mm2
+ paddd mm1, mm5 ; mm1=tmp10L
+ paddd mm2, mm3 ; mm2=tmp10H
+ psubd mm4, mm5 ; mm4=tmp12L
+ psubd mm0, mm3 ; mm0=tmp12H
+
+ ; -- Final output stage
+
+ movq mm5, mm1
+ movq mm3, mm2
+ paddd mm1, mm6 ; mm1=data0L
+ paddd mm2, mm7 ; mm2=data0H
+ psubd mm5, mm6 ; mm5=data3L
+ psubd mm3, mm7 ; mm3=data3H
+
+ movq mm6, [GOTOFF(ebx,PD_DESCALE_P1_4)] ; mm6=[PD_DESCALE_P1_4]
+
+ paddd mm1, mm6
+ paddd mm2, mm6
+ psrad mm1, DESCALE_P1_4
+ psrad mm2, DESCALE_P1_4
+ paddd mm5, mm6
+ paddd mm3, mm6
+ psrad mm5, DESCALE_P1_4
+ psrad mm3, DESCALE_P1_4
+
+ packssdw mm1, mm2 ; mm1=data0=(00 01 02 03)
+ packssdw mm5, mm3 ; mm5=data3=(30 31 32 33)
+
+ movq mm7, MMWORD [wk(0)] ; mm7=tmp0L
+ movq mm6, MMWORD [wk(1)] ; mm6=tmp0H
+
+ movq mm2, mm4
+ movq mm3, mm0
+ paddd mm4, mm7 ; mm4=data1L
+ paddd mm0, mm6 ; mm0=data1H
+ psubd mm2, mm7 ; mm2=data2L
+ psubd mm3, mm6 ; mm3=data2H
+
+ movq mm7, [GOTOFF(ebx,PD_DESCALE_P1_4)] ; mm7=[PD_DESCALE_P1_4]
+
+ paddd mm4, mm7
+ paddd mm0, mm7
+ psrad mm4, DESCALE_P1_4
+ psrad mm0, DESCALE_P1_4
+ paddd mm2, mm7
+ paddd mm3, mm7
+ psrad mm2, DESCALE_P1_4
+ psrad mm3, DESCALE_P1_4
+
+ packssdw mm4, mm0 ; mm4=data1=(10 11 12 13)
+ packssdw mm2, mm3 ; mm2=data2=(20 21 22 23)
+
+ movq mm6, mm1 ; transpose coefficients(phase 1)
+ punpcklwd mm1, mm4 ; mm1=(00 10 01 11)
+ punpckhwd mm6, mm4 ; mm6=(02 12 03 13)
+ movq mm7, mm2 ; transpose coefficients(phase 1)
+ punpcklwd mm2, mm5 ; mm2=(20 30 21 31)
+ punpckhwd mm7, mm5 ; mm7=(22 32 23 33)
+
+ movq mm0, mm1 ; transpose coefficients(phase 2)
+ punpckldq mm1, mm2 ; mm1=(00 10 20 30)
+ punpckhdq mm0, mm2 ; mm0=(01 11 21 31)
+ movq mm3, mm6 ; transpose coefficients(phase 2)
+ punpckldq mm6, mm7 ; mm6=(02 12 22 32)
+ punpckhdq mm3, mm7 ; mm3=(03 13 23 33)
+
+ movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm1
+ movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0
+ movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm6
+ movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
+
+.nextcolumn:
+ add esi, byte 4*SIZEOF_JCOEF ; coef_block
+ add edx, byte 4*SIZEOF_ISLOW_MULT_TYPE ; quantptr
+ add edi, byte 4*DCTSIZE*SIZEOF_JCOEF ; wsptr
+ dec ecx ; ctr
+ jnz near .columnloop
+
+ ; ---- Pass 2: process rows from work array, store into output array.
+
+ mov eax, [original_ebp]
+ lea esi, [workspace] ; JCOEF *wsptr
+ mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
+ mov eax, JDIMENSION [output_col(eax)]
+
+ ; -- Odd part
+
+ movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movq mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ movq mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+ movq mm4, mm0
+ movq mm5, mm0
+ punpcklwd mm4, mm1
+ punpckhwd mm5, mm1
+ movq mm0, mm4
+ movq mm1, mm5
+ pmaddwd mm4, [GOTOFF(ebx,PW_F256_F089)] ; mm4=(tmp2L)
+ pmaddwd mm5, [GOTOFF(ebx,PW_F256_F089)] ; mm5=(tmp2H)
+ pmaddwd mm0, [GOTOFF(ebx,PW_F106_MF217)] ; mm0=(tmp0L)
+ pmaddwd mm1, [GOTOFF(ebx,PW_F106_MF217)] ; mm1=(tmp0H)
+
+ movq mm6, mm2
+ movq mm7, mm2
+ punpcklwd mm6, mm3
+ punpckhwd mm7, mm3
+ movq mm2, mm6
+ movq mm3, mm7
+ pmaddwd mm6, [GOTOFF(ebx,PW_MF060_MF050)] ; mm6=(tmp2L)
+ pmaddwd mm7, [GOTOFF(ebx,PW_MF060_MF050)] ; mm7=(tmp2H)
+ pmaddwd mm2, [GOTOFF(ebx,PW_F145_MF021)] ; mm2=(tmp0L)
+ pmaddwd mm3, [GOTOFF(ebx,PW_F145_MF021)] ; mm3=(tmp0H)
+
+ paddd mm6, mm4 ; mm6=tmp2L
+ paddd mm7, mm5 ; mm7=tmp2H
+ paddd mm2, mm0 ; mm2=tmp0L
+ paddd mm3, mm1 ; mm3=tmp0H
+
+ movq MMWORD [wk(0)], mm2 ; wk(0)=tmp0L
+ movq MMWORD [wk(1)], mm3 ; wk(1)=tmp0H
+
+ ; -- Even part
+
+ movq mm4, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ movq mm5, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ movq mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+ pxor mm1, mm1
+ pxor mm2, mm2
+ punpcklwd mm1, mm4 ; mm1=tmp0L
+ punpckhwd mm2, mm4 ; mm2=tmp0H
+ psrad mm1, (16-CONST_BITS-1) ; psrad mm1,16 & pslld mm1,CONST_BITS+1
+ psrad mm2, (16-CONST_BITS-1) ; psrad mm2,16 & pslld mm2,CONST_BITS+1
+
+ movq mm3, mm5 ; mm5=in2=z2
+ punpcklwd mm5, mm0 ; mm0=in6=z3
+ punpckhwd mm3, mm0
+ pmaddwd mm5, [GOTOFF(ebx,PW_F184_MF076)] ; mm5=tmp2L
+ pmaddwd mm3, [GOTOFF(ebx,PW_F184_MF076)] ; mm3=tmp2H
+
+ movq mm4, mm1
+ movq mm0, mm2
+ paddd mm1, mm5 ; mm1=tmp10L
+ paddd mm2, mm3 ; mm2=tmp10H
+ psubd mm4, mm5 ; mm4=tmp12L
+ psubd mm0, mm3 ; mm0=tmp12H
+
+ ; -- Final output stage
+
+ movq mm5, mm1
+ movq mm3, mm2
+ paddd mm1, mm6 ; mm1=data0L
+ paddd mm2, mm7 ; mm2=data0H
+ psubd mm5, mm6 ; mm5=data3L
+ psubd mm3, mm7 ; mm3=data3H
+
+ movq mm6, [GOTOFF(ebx,PD_DESCALE_P2_4)] ; mm6=[PD_DESCALE_P2_4]
+
+ paddd mm1, mm6
+ paddd mm2, mm6
+ psrad mm1, DESCALE_P2_4
+ psrad mm2, DESCALE_P2_4
+ paddd mm5, mm6
+ paddd mm3, mm6
+ psrad mm5, DESCALE_P2_4
+ psrad mm3, DESCALE_P2_4
+
+ packssdw mm1, mm2 ; mm1=data0=(00 10 20 30)
+ packssdw mm5, mm3 ; mm5=data3=(03 13 23 33)
+
+ movq mm7, MMWORD [wk(0)] ; mm7=tmp0L
+ movq mm6, MMWORD [wk(1)] ; mm6=tmp0H
+
+ movq mm2, mm4
+ movq mm3, mm0
+ paddd mm4, mm7 ; mm4=data1L
+ paddd mm0, mm6 ; mm0=data1H
+ psubd mm2, mm7 ; mm2=data2L
+ psubd mm3, mm6 ; mm3=data2H
+
+ movq mm7, [GOTOFF(ebx,PD_DESCALE_P2_4)] ; mm7=[PD_DESCALE_P2_4]
+
+ paddd mm4, mm7
+ paddd mm0, mm7
+ psrad mm4, DESCALE_P2_4
+ psrad mm0, DESCALE_P2_4
+ paddd mm2, mm7
+ paddd mm3, mm7
+ psrad mm2, DESCALE_P2_4
+ psrad mm3, DESCALE_P2_4
+
+ packssdw mm4, mm0 ; mm4=data1=(01 11 21 31)
+ packssdw mm2, mm3 ; mm2=data2=(02 12 22 32)
+
+ movq mm6, [GOTOFF(ebx,PB_CENTERJSAMP)] ; mm6=[PB_CENTERJSAMP]
+
+ packsswb mm1, mm2 ; mm1=(00 10 20 30 02 12 22 32)
+ packsswb mm4, mm5 ; mm4=(01 11 21 31 03 13 23 33)
+ paddb mm1, mm6
+ paddb mm4, mm6
+
+ movq mm7, mm1 ; transpose coefficients(phase 1)
+ punpcklbw mm1, mm4 ; mm1=(00 01 10 11 20 21 30 31)
+ punpckhbw mm7, mm4 ; mm7=(02 03 12 13 22 23 32 33)
+
+ movq mm0, mm1 ; transpose coefficients(phase 2)
+ punpcklwd mm1, mm7 ; mm1=(00 01 02 03 10 11 12 13)
+ punpckhwd mm0, mm7 ; mm0=(20 21 22 23 30 31 32 33)
+
+ mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+ mov esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+ movd dword [edx+eax*SIZEOF_JSAMPLE], mm1
+ movd dword [esi+eax*SIZEOF_JSAMPLE], mm0
+
+ psrlq mm1, 4*BYTE_BIT
+ psrlq mm0, 4*BYTE_BIT
+
+ mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+ mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+ movd dword [edx+eax*SIZEOF_JSAMPLE], mm1
+ movd dword [esi+eax*SIZEOF_JSAMPLE], mm0
+
+ emms ; empty MMX state
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ poppic ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 2x2 output block.
+;
+; GLOBAL(void)
+; jsimd_idct_2x2_mmx(void *dct_table, JCOEFPTR coef_block,
+; JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b) (b) + 8 ; void *dct_table
+%define coef_block(b) (b) + 12 ; JCOEFPTR coef_block
+%define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf
+%define output_col(b) (b) + 20 ; JDIMENSION output_col
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_idct_2x2_mmx)
+
+EXTN(jsimd_idct_2x2_mmx):
+ push ebp
+ mov ebp, esp
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process columns from input.
+
+ mov edx, POINTER [dct_table(ebp)] ; quantptr
+ mov esi, JCOEFPTR [coef_block(ebp)] ; inptr
+
+ ; | input: | result: |
+ ; | 00 01 ** 03 ** 05 ** 07 | |
+ ; | 10 11 ** 13 ** 15 ** 17 | |
+ ; | ** ** ** ** ** ** ** ** | |
+ ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
+ ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
+ ; | 50 51 ** 53 ** 55 ** 57 | |
+ ; | ** ** ** ** ** ** ** ** | |
+ ; | 70 71 ** 73 ** 75 ** 77 | |
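+ ;
+ ; (Only the rows and columns shown with coefficients contribute: the
+ ; 2-point even part uses just the DC term, the odd part combines
+ ; rows/columns 1, 3, 5, and 7, and A/B are the two rows produced by the
+ ; column pass, per the original jidctred.c.)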
+
+ ; -- Odd part
+
+ movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movq mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ pmullw mm0, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ movq mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ pmullw mm2, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ ; mm0=(10 11 ** 13), mm1=(30 31 ** 33)
+ ; mm2=(50 51 ** 53), mm3=(70 71 ** 73)
+
+ pcmpeqd mm7, mm7
+ pslld mm7, WORD_BIT ; mm7={0x0000 0xFFFF 0x0000 0xFFFF}
+
+ movq mm4, mm0 ; mm4=(10 11 ** 13)
+ movq mm5, mm2 ; mm5=(50 51 ** 53)
+ punpcklwd mm4, mm1 ; mm4=(10 30 11 31)
+ punpcklwd mm5, mm3 ; mm5=(50 70 51 71)
+ pmaddwd mm4, [GOTOFF(ebx,PW_F362_MF127)]
+ pmaddwd mm5, [GOTOFF(ebx,PW_F085_MF072)]
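+ ; (Each pmaddwd pairs two taps of the 4-tap odd filter: PW_F362_MF127
+ ; applies (3.624509785, -1.272758580) to the (row 1, row 3) words and
+ ; PW_F085_MF072 applies (0.850430095, -0.720959822) to the
+ ; (row 5, row 7) words; the paddd below sums the partial results into
+ ; tmp0.)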
+
+ psrld mm0, WORD_BIT ; mm0=(11 -- 13 --)
+ pand mm1, mm7 ; mm1=(-- 31 -- 33)
+ psrld mm2, WORD_BIT ; mm2=(51 -- 53 --)
+ pand mm3, mm7 ; mm3=(-- 71 -- 73)
+ por mm0, mm1 ; mm0=(11 31 13 33)
+ por mm2, mm3 ; mm2=(51 71 53 73)
+ pmaddwd mm0, [GOTOFF(ebx,PW_F362_MF127)]
+ pmaddwd mm2, [GOTOFF(ebx,PW_F085_MF072)]
+
+ paddd mm4, mm5 ; mm4=tmp0[col0 col1]
+
+ movq mm6, MMWORD [MMBLOCK(1,1,esi,SIZEOF_JCOEF)]
+ movq mm1, MMWORD [MMBLOCK(3,1,esi,SIZEOF_JCOEF)]
+ pmullw mm6, MMWORD [MMBLOCK(1,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw mm1, MMWORD [MMBLOCK(3,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ movq mm3, MMWORD [MMBLOCK(5,1,esi,SIZEOF_JCOEF)]
+ movq mm5, MMWORD [MMBLOCK(7,1,esi,SIZEOF_JCOEF)]
+ pmullw mm3, MMWORD [MMBLOCK(5,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw mm5, MMWORD [MMBLOCK(7,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ ; mm6=(** 15 ** 17), mm1=(** 35 ** 37)
+ ; mm3=(** 55 ** 57), mm5=(** 75 ** 77)
+
+ psrld mm6, WORD_BIT ; mm6=(15 -- 17 --)
+ pand mm1, mm7 ; mm1=(-- 35 -- 37)
+ psrld mm3, WORD_BIT ; mm3=(55 -- 57 --)
+ pand mm5, mm7 ; mm5=(-- 75 -- 77)
+ por mm6, mm1 ; mm6=(15 35 17 37)
+ por mm3, mm5 ; mm3=(55 75 57 77)
+ pmaddwd mm6, [GOTOFF(ebx,PW_F362_MF127)]
+ pmaddwd mm3, [GOTOFF(ebx,PW_F085_MF072)]
+
+ paddd mm0, mm2 ; mm0=tmp0[col1 col3]
+ paddd mm6, mm3 ; mm6=tmp0[col5 col7]
+
+ ; -- Even part
+
+ movq mm1, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ movq mm5, MMWORD [MMBLOCK(0,1,esi,SIZEOF_JCOEF)]
+ pmullw mm1, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw mm5, MMWORD [MMBLOCK(0,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ ; mm1=(00 01 ** 03), mm5=(** 05 ** 07)
+
+ movq mm2, mm1 ; mm2=(00 01 ** 03)
+ pslld mm1, WORD_BIT ; mm1=(-- 00 -- **)
+ psrad mm1, (WORD_BIT-CONST_BITS-2) ; mm1=tmp10[col0 ****]
+
+ pand mm2, mm7 ; mm2=(-- 01 -- 03)
+ pand mm5, mm7 ; mm5=(-- 05 -- 07)
+ psrad mm2, (WORD_BIT-CONST_BITS-2) ; mm2=tmp10[col1 col3]
+ psrad mm5, (WORD_BIT-CONST_BITS-2) ; mm5=tmp10[col5 col7]
+
+ ; -- Final output stage
+
+ movq mm3, mm1
+ paddd mm1, mm4 ; mm1=data0[col0 ****]=(A0 **)
+ psubd mm3, mm4 ; mm3=data1[col0 ****]=(B0 **)
+ punpckldq mm1, mm3 ; mm1=(A0 B0)
+
+ movq mm7, [GOTOFF(ebx,PD_DESCALE_P1_2)] ; mm7=[PD_DESCALE_P1_2]
+
+ movq mm4, mm2
+ movq mm3, mm5
+ paddd mm2, mm0 ; mm2=data0[col1 col3]=(A1 A3)
+ paddd mm5, mm6 ; mm5=data0[col5 col7]=(A5 A7)
+ psubd mm4, mm0 ; mm4=data1[col1 col3]=(B1 B3)
+ psubd mm3, mm6 ; mm3=data1[col5 col7]=(B5 B7)
+
+ paddd mm1, mm7
+ psrad mm1, DESCALE_P1_2
+
+ paddd mm2, mm7
+ paddd mm5, mm7
+ psrad mm2, DESCALE_P1_2
+ psrad mm5, DESCALE_P1_2
+ paddd mm4, mm7
+ paddd mm3, mm7
+ psrad mm4, DESCALE_P1_2
+ psrad mm3, DESCALE_P1_2
+
+ ; ---- Pass 2: process rows, store into output array.
+
+ mov edi, JSAMPARRAY [output_buf(ebp)] ; (JSAMPROW *)
+ mov eax, JDIMENSION [output_col(ebp)]
+
+ ; | input:| result:|
+ ; | A0 B0 | |
+ ; | A1 B1 | C0 C1 |
+ ; | A3 B3 | D0 D1 |
+ ; | A5 B5 | |
+ ; | A7 B7 | |
+
+ ; -- Odd part
+
+ packssdw mm2, mm4 ; mm2=(A1 A3 B1 B3)
+ packssdw mm5, mm3 ; mm5=(A5 A7 B5 B7)
+ pmaddwd mm2, [GOTOFF(ebx,PW_F362_MF127)]
+ pmaddwd mm5, [GOTOFF(ebx,PW_F085_MF072)]
+
+ paddd mm2, mm5 ; mm2=tmp0[row0 row1]
+
+ ; -- Even part
+
+ pslld mm1, (CONST_BITS+2) ; mm1=tmp10[row0 row1]
+
+ ; -- Final output stage
+
+ movq mm0, [GOTOFF(ebx,PD_DESCALE_P2_2)] ; mm0=[PD_DESCALE_P2_2]
+
+ movq mm6, mm1
+ paddd mm1, mm2 ; mm1=data0[row0 row1]=(C0 C1)
+ psubd mm6, mm2 ; mm6=data1[row0 row1]=(D0 D1)
+
+ paddd mm1, mm0
+ paddd mm6, mm0
+ psrad mm1, DESCALE_P2_2
+ psrad mm6, DESCALE_P2_2
+
+ movq mm7, mm1 ; transpose coefficients
+ punpckldq mm1, mm6 ; mm1=(C0 D0)
+ punpckhdq mm7, mm6 ; mm7=(C1 D1)
+
+ packssdw mm1, mm7 ; mm1=(C0 D0 C1 D1)
+ packsswb mm1, mm1 ; mm1=(C0 D0 C1 D1 C0 D0 C1 D1)
+ paddb mm1, [GOTOFF(ebx,PB_CENTERJSAMP)]
+
+ movd ecx, mm1
+ movd ebx, mm1 ; ebx=(C0 D0 C1 D1)
+ shr ecx, 2*BYTE_BIT ; ecx=(C1 D1 -- --)
+
+ mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+ mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+ mov word [edx+eax*SIZEOF_JSAMPLE], bx
+ mov word [esi+eax*SIZEOF_JSAMPLE], cx
+
+ emms ; empty MMX state
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jidctred-sse2.asm b/media/libjpeg/simd/i386/jidctred-sse2.asm
new file mode 100644
index 0000000000..6e56494e97
--- /dev/null
+++ b/media/libjpeg/simd/i386/jidctred-sse2.asm
@@ -0,0 +1,592 @@
+;
+; jidctred.asm - reduced-size IDCT (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains inverse-DCT routines that produce reduced-size
+; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
+; The following code is based directly on the IJG's original jidctred.c;
+; see jidctred.c for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 13
+%define PASS1_BITS 2
+
+%define DESCALE_P1_4 (CONST_BITS - PASS1_BITS + 1)
+%define DESCALE_P2_4 (CONST_BITS + PASS1_BITS + 3 + 1)
+%define DESCALE_P1_2 (CONST_BITS - PASS1_BITS + 2)
+%define DESCALE_P2_2 (CONST_BITS + PASS1_BITS + 3 + 2)
+
+%if CONST_BITS == 13
+F_0_211 equ 1730 ; FIX(0.211164243)
+F_0_509 equ 4176 ; FIX(0.509795579)
+F_0_601 equ 4926 ; FIX(0.601344887)
+F_0_720 equ 5906 ; FIX(0.720959822)
+F_0_765 equ 6270 ; FIX(0.765366865)
+F_0_850 equ 6967 ; FIX(0.850430095)
+F_0_899 equ 7373 ; FIX(0.899976223)
+F_1_061 equ 8697 ; FIX(1.061594337)
+F_1_272 equ 10426 ; FIX(1.272758580)
+F_1_451 equ 11893 ; FIX(1.451774981)
+F_1_847 equ 15137 ; FIX(1.847759065)
+F_2_172 equ 17799 ; FIX(2.172734803)
+F_2_562 equ 20995 ; FIX(2.562915447)
+F_3_624 equ 29692 ; FIX(3.624509785)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
+F_0_211 equ DESCALE( 226735879, 30 - CONST_BITS) ; FIX(0.211164243)
+F_0_509 equ DESCALE( 547388834, 30 - CONST_BITS) ; FIX(0.509795579)
+F_0_601 equ DESCALE( 645689155, 30 - CONST_BITS) ; FIX(0.601344887)
+F_0_720 equ DESCALE( 774124714, 30 - CONST_BITS) ; FIX(0.720959822)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865)
+F_0_850 equ DESCALE( 913142361, 30 - CONST_BITS) ; FIX(0.850430095)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223)
+F_1_061 equ DESCALE(1139878239, 30 - CONST_BITS) ; FIX(1.061594337)
+F_1_272 equ DESCALE(1366614119, 30 - CONST_BITS) ; FIX(1.272758580)
+F_1_451 equ DESCALE(1558831516, 30 - CONST_BITS) ; FIX(1.451774981)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065)
+F_2_172 equ DESCALE(2332956230, 30 - CONST_BITS) ; FIX(2.172734803)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447)
+F_3_624 equ DESCALE(3891787747, 30 - CONST_BITS) ; FIX(3.624509785)
+%endif
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_idct_red_sse2)
+
+EXTN(jconst_idct_red_sse2):
+
+PW_F184_MF076 times 4 dw F_1_847, -F_0_765
+PW_F256_F089 times 4 dw F_2_562, F_0_899
+PW_F106_MF217 times 4 dw F_1_061, -F_2_172
+PW_MF060_MF050 times 4 dw -F_0_601, -F_0_509
+PW_F145_MF021 times 4 dw F_1_451, -F_0_211
+PW_F362_MF127 times 4 dw F_3_624, -F_1_272
+PW_F085_MF072 times 4 dw F_0_850, -F_0_720
+PD_DESCALE_P1_4 times 4 dd 1 << (DESCALE_P1_4 - 1)
+PD_DESCALE_P2_4 times 4 dd 1 << (DESCALE_P2_4 - 1)
+PD_DESCALE_P1_2 times 4 dd 1 << (DESCALE_P1_2 - 1)
+PD_DESCALE_P2_2 times 4 dd 1 << (DESCALE_P2_2 - 1)
+PB_CENTERJSAMP times 16 db CENTERJSAMPLE
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 4x4 output block.
+;
+; GLOBAL(void)
+; jsimd_idct_4x4_sse2(void *dct_table, JCOEFPTR coef_block,
+; JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b) (b) + 8 ; void *dct_table
+%define coef_block(b) (b) + 12 ; JCOEFPTR coef_block
+%define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf
+%define output_col(b) (b) + 20 ; JDIMENSION output_col
+
+%define original_ebp ebp + 0
+%define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+ ; xmmword wk[WK_NUM]
+%define WK_NUM 2
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_idct_4x4_sse2)
+
+EXTN(jsimd_idct_4x4_sse2):
+ push ebp
+ mov eax, esp ; eax = original ebp
+ sub esp, byte 4
+ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [esp], eax
+ mov ebp, esp ; ebp = aligned ebp
+ lea esp, [wk(0)]
+ pushpic ebx
+; push ecx ; unused
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process columns from input.
+
+; mov eax, [original_ebp]
+ mov edx, POINTER [dct_table(eax)] ; quantptr
+ mov esi, JCOEFPTR [coef_block(eax)] ; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2
+ mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ jnz short .columnDCT
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ por xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+ por xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ por xmm0, xmm1
+ packsswb xmm0, xmm0
+ packsswb xmm0, xmm0
+ movd eax, xmm0
+ test eax, eax
+ jnz short .columnDCT
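+
+ ; (The two packsswb steps saturate the eight OR-ed words down into the
+ ; low dword, so the single 32-bit test above covers every column of the
+ ; OR-ed AC rows.)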
+
+ ; -- AC terms all zero
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ psllw xmm0, PASS1_BITS
+
+ movdqa xmm3, xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07)
+ punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
+ punpckhwd xmm3, xmm3 ; xmm3=(04 04 05 05 06 06 07 07)
+
+ pshufd xmm1, xmm0, 0x50 ; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01)
+ pshufd xmm0, xmm0, 0xFA ; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03)
+ pshufd xmm6, xmm3, 0x50 ; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05)
+ pshufd xmm3, xmm3, 0xFA ; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07)
+
+ jmp near .column_end
+ alignx 16, 7
+%endif
+.columnDCT:
+
+ ; -- Odd part
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ movdqa xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ movdqa xmm4, xmm0
+ movdqa xmm5, xmm0
+ punpcklwd xmm4, xmm1
+ punpckhwd xmm5, xmm1
+ movdqa xmm0, xmm4
+ movdqa xmm1, xmm5
+ pmaddwd xmm4, [GOTOFF(ebx,PW_F256_F089)] ; xmm4=(tmp2L)
+ pmaddwd xmm5, [GOTOFF(ebx,PW_F256_F089)] ; xmm5=(tmp2H)
+ pmaddwd xmm0, [GOTOFF(ebx,PW_F106_MF217)] ; xmm0=(tmp0L)
+ pmaddwd xmm1, [GOTOFF(ebx,PW_F106_MF217)] ; xmm1=(tmp0H)
+
+ movdqa xmm6, xmm2
+ movdqa xmm7, xmm2
+ punpcklwd xmm6, xmm3
+ punpckhwd xmm7, xmm3
+ movdqa xmm2, xmm6
+ movdqa xmm3, xmm7
+ pmaddwd xmm6, [GOTOFF(ebx,PW_MF060_MF050)] ; xmm6=(tmp2L)
+ pmaddwd xmm7, [GOTOFF(ebx,PW_MF060_MF050)] ; xmm7=(tmp2H)
+ pmaddwd xmm2, [GOTOFF(ebx,PW_F145_MF021)] ; xmm2=(tmp0L)
+ pmaddwd xmm3, [GOTOFF(ebx,PW_F145_MF021)] ; xmm3=(tmp0H)
+
+ paddd xmm6, xmm4 ; xmm6=tmp2L
+ paddd xmm7, xmm5 ; xmm7=tmp2H
+ paddd xmm2, xmm0 ; xmm2=tmp0L
+ paddd xmm3, xmm1 ; xmm3=tmp0H
+
+ movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp0L
+ movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=tmp0H
+
+ ; -- Even part
+
+ movdqa xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm5, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm4, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm5, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm0, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ pxor xmm1, xmm1
+ pxor xmm2, xmm2
+ punpcklwd xmm1, xmm4 ; xmm1=tmp0L
+ punpckhwd xmm2, xmm4 ; xmm2=tmp0H
+ psrad xmm1, (16-CONST_BITS-1) ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1
+ psrad xmm2, (16-CONST_BITS-1) ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1
+
+ movdqa xmm3, xmm5 ; xmm5=in2=z2
+ punpcklwd xmm5, xmm0 ; xmm0=in6=z3
+ punpckhwd xmm3, xmm0
+ pmaddwd xmm5, [GOTOFF(ebx,PW_F184_MF076)] ; xmm5=tmp2L
+ pmaddwd xmm3, [GOTOFF(ebx,PW_F184_MF076)] ; xmm3=tmp2H
+
+ movdqa xmm4, xmm1
+ movdqa xmm0, xmm2
+ paddd xmm1, xmm5 ; xmm1=tmp10L
+ paddd xmm2, xmm3 ; xmm2=tmp10H
+ psubd xmm4, xmm5 ; xmm4=tmp12L
+ psubd xmm0, xmm3 ; xmm0=tmp12H
+
+ ; -- Final output stage
+
+ movdqa xmm5, xmm1
+ movdqa xmm3, xmm2
+ paddd xmm1, xmm6 ; xmm1=data0L
+ paddd xmm2, xmm7 ; xmm2=data0H
+ psubd xmm5, xmm6 ; xmm5=data3L
+ psubd xmm3, xmm7 ; xmm3=data3H
+
+ movdqa xmm6, [GOTOFF(ebx,PD_DESCALE_P1_4)] ; xmm6=[PD_DESCALE_P1_4]
+
+ paddd xmm1, xmm6
+ paddd xmm2, xmm6
+ psrad xmm1, DESCALE_P1_4
+ psrad xmm2, DESCALE_P1_4
+ paddd xmm5, xmm6
+ paddd xmm3, xmm6
+ psrad xmm5, DESCALE_P1_4
+ psrad xmm3, DESCALE_P1_4
+
+ packssdw xmm1, xmm2 ; xmm1=data0=(00 01 02 03 04 05 06 07)
+ packssdw xmm5, xmm3 ; xmm5=data3=(30 31 32 33 34 35 36 37)
+
+ movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp0L
+ movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp0H
+
+ movdqa xmm2, xmm4
+ movdqa xmm3, xmm0
+ paddd xmm4, xmm7 ; xmm4=data1L
+ paddd xmm0, xmm6 ; xmm0=data1H
+ psubd xmm2, xmm7 ; xmm2=data2L
+ psubd xmm3, xmm6 ; xmm3=data2H
+
+ movdqa xmm7, [GOTOFF(ebx,PD_DESCALE_P1_4)] ; xmm7=[PD_DESCALE_P1_4]
+
+ paddd xmm4, xmm7
+ paddd xmm0, xmm7
+ psrad xmm4, DESCALE_P1_4
+ psrad xmm0, DESCALE_P1_4
+ paddd xmm2, xmm7
+ paddd xmm3, xmm7
+ psrad xmm2, DESCALE_P1_4
+ psrad xmm3, DESCALE_P1_4
+
+ packssdw xmm4, xmm0 ; xmm4=data1=(10 11 12 13 14 15 16 17)
+ packssdw xmm2, xmm3 ; xmm2=data2=(20 21 22 23 24 25 26 27)
+
+ movdqa xmm6, xmm1 ; transpose coefficients(phase 1)
+ punpcklwd xmm1, xmm4 ; xmm1=(00 10 01 11 02 12 03 13)
+ punpckhwd xmm6, xmm4 ; xmm6=(04 14 05 15 06 16 07 17)
+ movdqa xmm7, xmm2 ; transpose coefficients(phase 1)
+ punpcklwd xmm2, xmm5 ; xmm2=(20 30 21 31 22 32 23 33)
+ punpckhwd xmm7, xmm5 ; xmm7=(24 34 25 35 26 36 27 37)
+
+ movdqa xmm0, xmm1 ; transpose coefficients(phase 2)
+ punpckldq xmm1, xmm2 ; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31)
+ punpckhdq xmm0, xmm2 ; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33)
+ movdqa xmm3, xmm6 ; transpose coefficients(phase 2)
+ punpckldq xmm6, xmm7 ; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35)
+ punpckhdq xmm3, xmm7 ; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37)
+.column_end:
+
+ ; -- Prefetch the next coefficient block
+
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+ ; ---- Pass 2: process rows, store into output array.
+
+ mov eax, [original_ebp]
+ mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
+ mov eax, JDIMENSION [output_col(eax)]
+
+ ; -- Even part
+
+ pxor xmm4, xmm4
+ punpcklwd xmm4, xmm1 ; xmm4=tmp0
+ psrad xmm4, (16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1
+
+ ; -- Odd part
+
+ punpckhwd xmm1, xmm0
+ punpckhwd xmm6, xmm3
+ movdqa xmm5, xmm1
+ movdqa xmm2, xmm6
+ pmaddwd xmm1, [GOTOFF(ebx,PW_F256_F089)] ; xmm1=(tmp2)
+ pmaddwd xmm6, [GOTOFF(ebx,PW_MF060_MF050)] ; xmm6=(tmp2)
+ pmaddwd xmm5, [GOTOFF(ebx,PW_F106_MF217)] ; xmm5=(tmp0)
+ pmaddwd xmm2, [GOTOFF(ebx,PW_F145_MF021)] ; xmm2=(tmp0)
+
+ paddd xmm6, xmm1 ; xmm6=tmp2
+ paddd xmm2, xmm5 ; xmm2=tmp0
+
+ ; -- Even part
+
+ punpcklwd xmm0, xmm3
+ pmaddwd xmm0, [GOTOFF(ebx,PW_F184_MF076)] ; xmm0=tmp2
+
+ movdqa xmm7, xmm4
+ paddd xmm4, xmm0 ; xmm4=tmp10
+ psubd xmm7, xmm0 ; xmm7=tmp12
+
+ ; -- Final output stage
+
+ movdqa xmm1, [GOTOFF(ebx,PD_DESCALE_P2_4)] ; xmm1=[PD_DESCALE_P2_4]
+
+ movdqa xmm5, xmm4
+ movdqa xmm3, xmm7
+ paddd xmm4, xmm6 ; xmm4=data0=(00 10 20 30)
+ paddd xmm7, xmm2 ; xmm7=data1=(01 11 21 31)
+ psubd xmm5, xmm6 ; xmm5=data3=(03 13 23 33)
+ psubd xmm3, xmm2 ; xmm3=data2=(02 12 22 32)
+
+ paddd xmm4, xmm1
+ paddd xmm7, xmm1
+ psrad xmm4, DESCALE_P2_4
+ psrad xmm7, DESCALE_P2_4
+ paddd xmm5, xmm1
+ paddd xmm3, xmm1
+ psrad xmm5, DESCALE_P2_4
+ psrad xmm3, DESCALE_P2_4
+
+ packssdw xmm4, xmm3 ; xmm4=(00 10 20 30 02 12 22 32)
+ packssdw xmm7, xmm5 ; xmm7=(01 11 21 31 03 13 23 33)
+
+ movdqa xmm0, xmm4 ; transpose coefficients(phase 1)
+ punpcklwd xmm4, xmm7 ; xmm4=(00 01 10 11 20 21 30 31)
+ punpckhwd xmm0, xmm7 ; xmm0=(02 03 12 13 22 23 32 33)
+
+ movdqa xmm6, xmm4 ; transpose coefficients(phase 2)
+ punpckldq xmm4, xmm0 ; xmm4=(00 01 02 03 10 11 12 13)
+ punpckhdq xmm6, xmm0 ; xmm6=(20 21 22 23 30 31 32 33)
+
+ packsswb xmm4, xmm6 ; xmm4=(00 01 02 03 10 11 12 13 20 ..)
+ paddb xmm4, [GOTOFF(ebx,PB_CENTERJSAMP)]
+
+ pshufd xmm2, xmm4, 0x39 ; xmm2=(10 11 12 13 20 21 22 23 30 ..)
+ pshufd xmm1, xmm4, 0x4E ; xmm1=(20 21 22 23 30 31 32 33 00 ..)
+ pshufd xmm3, xmm4, 0x93 ; xmm3=(30 31 32 33 00 01 02 03 10 ..)
+
+ mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+ mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+ movd XMM_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
+ movd XMM_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm2
+ mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+ mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+ movd XMM_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm1
+ movd XMM_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; unused
+ poppic ebx
+ mov esp, ebp ; esp <- aligned ebp
+ pop esp ; esp <- original ebp
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 2x2 output block.
+;
+; GLOBAL(void)
+; jsimd_idct_2x2_sse2(void *dct_table, JCOEFPTR coef_block,
+; JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b) (b) + 8 ; void *dct_table
+%define coef_block(b) (b) + 12 ; JCOEFPTR coef_block
+%define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf
+%define output_col(b) (b) + 20 ; JDIMENSION output_col
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_idct_2x2_sse2)
+
+EXTN(jsimd_idct_2x2_sse2):
+ push ebp
+ mov ebp, esp
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ get_GOT ebx ; get GOT address
+
+ ; ---- Pass 1: process columns from input.
+
+ mov edx, POINTER [dct_table(ebp)] ; quantptr
+ mov esi, JCOEFPTR [coef_block(ebp)] ; inptr
+
+ ; | input: | result: |
+ ; | 00 01 ** 03 ** 05 ** 07 | |
+ ; | 10 11 ** 13 ** 15 ** 17 | |
+ ; | ** ** ** ** ** ** ** ** | |
+ ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
+ ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
+ ; | 50 51 ** 53 ** 55 ** 57 | |
+ ; | ** ** ** ** ** ** ** ** | |
+ ; | 70 71 ** 73 ** 75 ** 77 | |
+
+ ; -- Odd part
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ movdqa xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ ; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37)
+ ; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77)
+
+ pcmpeqd xmm7, xmm7
+ pslld xmm7, WORD_BIT ; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..}
+
+ movdqa xmm4, xmm0 ; xmm4=(10 11 ** 13 ** 15 ** 17)
+ movdqa xmm5, xmm2 ; xmm5=(50 51 ** 53 ** 55 ** 57)
+ punpcklwd xmm4, xmm1 ; xmm4=(10 30 11 31 ** ** 13 33)
+ punpcklwd xmm5, xmm3 ; xmm5=(50 70 51 71 ** ** 53 73)
+ pmaddwd xmm4, [GOTOFF(ebx,PW_F362_MF127)]
+ pmaddwd xmm5, [GOTOFF(ebx,PW_F085_MF072)]
+
+ psrld xmm0, WORD_BIT ; xmm0=(11 -- 13 -- 15 -- 17 --)
+ pand xmm1, xmm7 ; xmm1=(-- 31 -- 33 -- 35 -- 37)
+ psrld xmm2, WORD_BIT ; xmm2=(51 -- 53 -- 55 -- 57 --)
+ pand xmm3, xmm7 ; xmm3=(-- 71 -- 73 -- 75 -- 77)
+ por xmm0, xmm1 ; xmm0=(11 31 13 33 15 35 17 37)
+ por xmm2, xmm3 ; xmm2=(51 71 53 73 55 75 57 77)
+ pmaddwd xmm0, [GOTOFF(ebx,PW_F362_MF127)]
+ pmaddwd xmm2, [GOTOFF(ebx,PW_F085_MF072)]
+
+ paddd xmm4, xmm5 ; xmm4=tmp0[col0 col1 **** col3]
+ paddd xmm0, xmm2 ; xmm0=tmp0[col1 col3 col5 col7]
+
+ ; -- Even part
+
+ movdqa xmm6, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ pmullw xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ ; xmm6=(00 01 ** 03 ** 05 ** 07)
+
+ movdqa xmm1, xmm6 ; xmm1=(00 01 ** 03 ** 05 ** 07)
+ pslld xmm6, WORD_BIT ; xmm6=(-- 00 -- ** -- ** -- **)
+ pand xmm1, xmm7 ; xmm1=(-- 01 -- 03 -- 05 -- 07)
+ psrad xmm6, (WORD_BIT-CONST_BITS-2) ; xmm6=tmp10[col0 **** **** ****]
+ psrad xmm1, (WORD_BIT-CONST_BITS-2) ; xmm1=tmp10[col1 col3 col5 col7]
+
+ ; -- Final output stage
+
+ movdqa xmm3, xmm6
+ movdqa xmm5, xmm1
+ paddd xmm6, xmm4 ; xmm6=data0[col0 **** **** ****]=(A0 ** ** **)
+ paddd xmm1, xmm0 ; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7)
+ psubd xmm3, xmm4 ; xmm3=data1[col0 **** **** ****]=(B0 ** ** **)
+ psubd xmm5, xmm0 ; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7)
+
+ movdqa xmm2, [GOTOFF(ebx,PD_DESCALE_P1_2)] ; xmm2=[PD_DESCALE_P1_2]
+
+ punpckldq xmm6, xmm3 ; xmm6=(A0 B0 ** **)
+
+ movdqa xmm7, xmm1
+ punpcklqdq xmm1, xmm5 ; xmm1=(A1 A3 B1 B3)
+ punpckhqdq xmm7, xmm5 ; xmm7=(A5 A7 B5 B7)
+
+ paddd xmm6, xmm2
+ psrad xmm6, DESCALE_P1_2
+
+ paddd xmm1, xmm2
+ paddd xmm7, xmm2
+ psrad xmm1, DESCALE_P1_2
+ psrad xmm7, DESCALE_P1_2
+
+ ; -- Prefetch the next coefficient block
+
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+ prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+ ; ---- Pass 2: process rows, store into output array.
+
+ mov edi, JSAMPARRAY [output_buf(ebp)] ; (JSAMPROW *)
+ mov eax, JDIMENSION [output_col(ebp)]
+
+ ; | input:| result:|
+ ; | A0 B0 | |
+ ; | A1 B1 | C0 C1 |
+ ; | A3 B3 | D0 D1 |
+ ; | A5 B5 | |
+ ; | A7 B7 | |
+
+ ; -- Odd part
+
+ packssdw xmm1, xmm1 ; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3)
+ packssdw xmm7, xmm7 ; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7)
+ pmaddwd xmm1, [GOTOFF(ebx,PW_F362_MF127)]
+ pmaddwd xmm7, [GOTOFF(ebx,PW_F085_MF072)]
+
+ paddd xmm1, xmm7 ; xmm1=tmp0[row0 row1 row0 row1]
+
+ ; -- Even part
+
+ pslld xmm6, (CONST_BITS+2) ; xmm6=tmp10[row0 row1 **** ****]
+
+ ; -- Final output stage
+
+ movdqa xmm4, xmm6
+ paddd xmm6, xmm1 ; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **)
+ psubd xmm4, xmm1 ; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **)
+
+ punpckldq xmm6, xmm4 ; xmm6=(C0 D0 C1 D1)
+
+ paddd xmm6, [GOTOFF(ebx,PD_DESCALE_P2_2)]
+ psrad xmm6, DESCALE_P2_2
+
+ packssdw xmm6, xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1)
+ packsswb xmm6, xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..)
+ paddb xmm6, [GOTOFF(ebx,PB_CENTERJSAMP)]
+
+ pextrw ebx, xmm6, 0x00 ; ebx=(C0 D0 -- --)
+ pextrw ecx, xmm6, 0x01 ; ecx=(C1 D1 -- --)
+
+ mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+ mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+ mov word [edx+eax*SIZEOF_JSAMPLE], bx
+ mov word [esi+eax*SIZEOF_JSAMPLE], cx
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jquant-3dn.asm b/media/libjpeg/simd/i386/jquant-3dn.asm
new file mode 100644
index 0000000000..5cb60caa94
--- /dev/null
+++ b/media/libjpeg/simd/i386/jquant-3dn.asm
@@ -0,0 +1,230 @@
+;
+; jquant.asm - sample data conversion and quantization (3DNow! & MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_float_3dnow(JSAMPARRAY sample_data, JDIMENSION start_col,
+; FAST_FLOAT *workspace);
+;
+
+%define sample_data ebp + 8 ; JSAMPARRAY sample_data
+%define start_col ebp + 12 ; JDIMENSION start_col
+%define workspace ebp + 16 ; FAST_FLOAT *workspace
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_convsamp_float_3dnow)
+
+EXTN(jsimd_convsamp_float_3dnow):
+ push ebp
+ mov ebp, esp
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ pcmpeqw mm7, mm7
+ psllw mm7, 7
+ packsswb mm7, mm7 ; mm7 = PB_CENTERJSAMPLE (0x808080..)
+
+ mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *)
+ mov eax, JDIMENSION [start_col]
+ mov edi, POINTER [workspace] ; (DCTELEM *)
+ mov ecx, DCTSIZE/2
+ alignx 16, 7
+.convloop:
+ mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+
+ movq mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+ movq mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]
+
+ psubb mm0, mm7 ; mm0=(01234567)
+ psubb mm1, mm7 ; mm1=(89ABCDEF)
+
+ punpcklbw mm2, mm0 ; mm2=(*0*1*2*3)
+ punpckhbw mm0, mm0 ; mm0=(*4*5*6*7)
+ punpcklbw mm3, mm1 ; mm3=(*8*9*A*B)
+ punpckhbw mm1, mm1 ; mm1=(*C*D*E*F)
+
+ punpcklwd mm4, mm2 ; mm4=(***0***1)
+ punpckhwd mm2, mm2 ; mm2=(***2***3)
+ punpcklwd mm5, mm0 ; mm5=(***4***5)
+ punpckhwd mm0, mm0 ; mm0=(***6***7)
+
+ psrad mm4, (DWORD_BIT-BYTE_BIT) ; mm4=(01)
+ psrad mm2, (DWORD_BIT-BYTE_BIT) ; mm2=(23)
+ pi2fd mm4, mm4
+ pi2fd mm2, mm2
+ psrad mm5, (DWORD_BIT-BYTE_BIT) ; mm5=(45)
+ psrad mm0, (DWORD_BIT-BYTE_BIT) ; mm0=(67)
+ pi2fd mm5, mm5
+ pi2fd mm0, mm0
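+
+ ; (The punpck steps above move each centered sample byte into the top
+ ; byte of a dword, so psrad by DWORD_BIT-BYTE_BIT = 24 sign-extends it
+ ; to a full dword and pi2fd then converts each dword pair to
+ ; single-precision floats.)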
+
+ movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm4
+ movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm2
+ movq MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm5
+ movq MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
+
+ punpcklwd mm6, mm3 ; mm6=(***8***9)
+ punpckhwd mm3, mm3 ; mm3=(***A***B)
+ punpcklwd mm4, mm1 ; mm4=(***C***D)
+ punpckhwd mm1, mm1 ; mm1=(***E***F)
+
+ psrad mm6, (DWORD_BIT-BYTE_BIT) ; mm6=(89)
+ psrad mm3, (DWORD_BIT-BYTE_BIT) ; mm3=(AB)
+ pi2fd mm6, mm6
+ pi2fd mm3, mm3
+ psrad mm4, (DWORD_BIT-BYTE_BIT) ; mm4=(CD)
+ psrad mm1, (DWORD_BIT-BYTE_BIT) ; mm1=(EF)
+ pi2fd mm4, mm4
+ pi2fd mm1, mm1
+
+ movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm6
+ movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm3
+ movq MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm4
+ movq MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1
+
+ add esi, byte 2*SIZEOF_JSAMPROW
+ add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
+ dec ecx
+ jnz near .convloop
+
+ femms ; empty MMX/3DNow! state
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; GLOBAL(void)
+; jsimd_quantize_float_3dnow(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+; FAST_FLOAT *workspace);
+;
+
+%define coef_block ebp + 8 ; JCOEFPTR coef_block
+%define divisors ebp + 12 ; FAST_FLOAT *divisors
+%define workspace ebp + 16 ; FAST_FLOAT *workspace
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_quantize_float_3dnow)
+
+EXTN(jsimd_quantize_float_3dnow):
+ push ebp
+ mov ebp, esp
+; push ebx ; unused
+; push ecx ; unused
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov eax, 0x4B400000 ; (float)0x00C00000 (rndint_magic)
+ movd mm7, eax
+ punpckldq mm7, mm7 ; mm7={12582912.0F 12582912.0F}
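+
+ ; (Standard float-to-int rounding trick: 12582912.0 = 1.5 * 2^23, so
+ ; adding it to a small float leaves the rounded integer in the low
+ ; mantissa bits; the word shuffles below then extract the low 16 bits
+ ; of each dword directly, avoiding pf2id.)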
+
+ mov esi, POINTER [workspace]
+ mov edx, POINTER [divisors]
+ mov edi, JCOEFPTR [coef_block]
+ mov eax, DCTSIZE2/16
+ alignx 16, 7
+.quantloop:
+ movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+ movq mm1, MMWORD [MMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
+ pfmul mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+ pfmul mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+ movq mm2, MMWORD [MMBLOCK(0,2,esi,SIZEOF_FAST_FLOAT)]
+ movq mm3, MMWORD [MMBLOCK(0,3,esi,SIZEOF_FAST_FLOAT)]
+ pfmul mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)]
+ pfmul mm3, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)]
+
+ pfadd mm0, mm7 ; mm0=(00 ** 01 **)
+ pfadd mm1, mm7 ; mm1=(02 ** 03 **)
+ pfadd mm2, mm7 ; mm2=(04 ** 05 **)
+ pfadd mm3, mm7 ; mm3=(06 ** 07 **)
+
+ movq mm4, mm0
+ punpcklwd mm0, mm1 ; mm0=(00 02 ** **)
+ punpckhwd mm4, mm1 ; mm4=(01 03 ** **)
+ movq mm5, mm2
+ punpcklwd mm2, mm3 ; mm2=(04 06 ** **)
+ punpckhwd mm5, mm3 ; mm5=(05 07 ** **)
+
+ punpcklwd mm0, mm4 ; mm0=(00 01 02 03)
+ punpcklwd mm2, mm5 ; mm2=(04 05 06 07)
+
+ movq mm6, MMWORD [MMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+ movq mm1, MMWORD [MMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
+ pfmul mm6, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+ pfmul mm1, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+ movq mm3, MMWORD [MMBLOCK(1,2,esi,SIZEOF_FAST_FLOAT)]
+ movq mm4, MMWORD [MMBLOCK(1,3,esi,SIZEOF_FAST_FLOAT)]
+ pfmul mm3, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)]
+ pfmul mm4, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)]
+
+ pfadd mm6, mm7 ; mm6=(10 ** 11 **)
+ pfadd mm1, mm7 ; mm1=(12 ** 13 **)
+ pfadd mm3, mm7 ; mm3=(14 ** 15 **)
+ pfadd mm4, mm7 ; mm4=(16 ** 17 **)
+
+ movq mm5, mm6
+ punpcklwd mm6, mm1 ; mm6=(10 12 ** **)
+ punpckhwd mm5, mm1 ; mm5=(11 13 ** **)
+ movq mm1, mm3
+ punpcklwd mm3, mm4 ; mm3=(14 16 ** **)
+ punpckhwd mm1, mm4 ; mm1=(15 17 ** **)
+
+ punpcklwd mm6, mm5 ; mm6=(10 11 12 13)
+ punpcklwd mm3, mm1 ; mm3=(14 15 16 17)
+
+ movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
+ movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm2
+ movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm6
+ movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3
+
+ add esi, byte 16*SIZEOF_FAST_FLOAT
+ add edx, byte 16*SIZEOF_FAST_FLOAT
+ add edi, byte 16*SIZEOF_JCOEF
+ dec eax
+ jnz near .quantloop
+
+ femms ; empty MMX/3DNow! state
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; unused
+; pop ebx ; unused
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jquant-mmx.asm b/media/libjpeg/simd/i386/jquant-mmx.asm
new file mode 100644
index 0000000000..61305c625d
--- /dev/null
+++ b/media/libjpeg/simd/i386/jquant-mmx.asm
@@ -0,0 +1,276 @@
+;
+; jquant.asm - sample data conversion and quantization (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_mmx(JSAMPARRAY sample_data, JDIMENSION start_col,
+; DCTELEM *workspace);
+;
+
+%define sample_data ebp + 8 ; JSAMPARRAY sample_data
+%define start_col ebp + 12 ; JDIMENSION start_col
+%define workspace ebp + 16 ; DCTELEM *workspace
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_convsamp_mmx)
+
+EXTN(jsimd_convsamp_mmx):
+ push ebp
+ mov ebp, esp
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ pxor mm6, mm6 ; mm6=(all 0's)
+ pcmpeqw mm7, mm7
+ psllw mm7, 7 ; mm7={0xFF80 0xFF80 0xFF80 0xFF80}
+
+ mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *)
+ mov eax, JDIMENSION [start_col]
+ mov edi, POINTER [workspace] ; (DCTELEM *)
+ mov ecx, DCTSIZE/4
+ alignx 16, 7
+.convloop:
+ mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+
+ movq mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; mm0=(01234567)
+ movq mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE] ; mm1=(89ABCDEF)
+
+ mov ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+
+ movq mm2, MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; mm2=(GHIJKLMN)
+ movq mm3, MMWORD [edx+eax*SIZEOF_JSAMPLE] ; mm3=(OPQRSTUV)
+
+ movq mm4, mm0
+ punpcklbw mm0, mm6 ; mm0=(0123)
+ punpckhbw mm4, mm6 ; mm4=(4567)
+ movq mm5, mm1
+ punpcklbw mm1, mm6 ; mm1=(89AB)
+ punpckhbw mm5, mm6 ; mm5=(CDEF)
+
+ paddw mm0, mm7
+ paddw mm4, mm7
+ paddw mm1, mm7
+ paddw mm5, mm7
+
+ movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0
+ movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm4
+ movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_DCTELEM)], mm1
+ movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_DCTELEM)], mm5
+
+ movq mm0, mm2
+ punpcklbw mm2, mm6 ; mm2=(GHIJ)
+ punpckhbw mm0, mm6 ; mm0=(KLMN)
+ movq mm4, mm3
+ punpcklbw mm3, mm6 ; mm3=(OPQR)
+ punpckhbw mm4, mm6 ; mm4=(STUV)
+
+ paddw mm2, mm7
+ paddw mm0, mm7
+ paddw mm3, mm7
+ paddw mm4, mm7
+
+ movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_DCTELEM)], mm2
+ movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_DCTELEM)], mm0
+ movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_DCTELEM)], mm3
+ movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_DCTELEM)], mm4
+
+ add esi, byte 4*SIZEOF_JSAMPROW
+ add edi, byte 4*DCTSIZE*SIZEOF_DCTELEM
+ dec ecx
+ jnz short .convloop
+
+ emms ; empty MMX state
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; This implementation is based on an algorithm described in
+; "How to optimize for the Pentium family of microprocessors"
+; (http://www.agner.org/assem/).
+;
+; GLOBAL(void)
+; jsimd_quantize_mmx(JCOEFPTR coef_block, DCTELEM *divisors,
+; DCTELEM *workspace);
+;
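+; As a rough scalar sketch of the divisor tables defined below (names
+; follow the RECIPROCAL/CORRECTION/SCALE macros; the unsigned 16-bit
+; high-word multiplies are what the pmulhw sequences emulate here):
+;
+;   x = abs(workspace[i]);
+;   x = (UINT16)(((UINT32)(x + correction[i]) * reciprocal[i]) >> 16);
+;   x = (UINT16)(((UINT32)x * scale[i]) >> 16);
+;   coef_block[i] = (workspace[i] < 0) ? -x : x;
+;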
+
+%define RECIPROCAL(m, n, b) \
+ MMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM)
+%define CORRECTION(m, n, b) \
+ MMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM)
+%define SCALE(m, n, b) \
+ MMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM)
+%define SHIFT(m, n, b) \
+ MMBLOCK(DCTSIZE * 3 + (m), (n), (b), SIZEOF_DCTELEM)
+
+%define coef_block ebp + 8 ; JCOEFPTR coef_block
+%define divisors ebp + 12 ; DCTELEM *divisors
+%define workspace ebp + 16 ; DCTELEM *workspace
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_quantize_mmx)
+
+EXTN(jsimd_quantize_mmx):
+ push ebp
+ mov ebp, esp
+; push ebx ; unused
+; push ecx ; unused
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov esi, POINTER [workspace]
+ mov edx, POINTER [divisors]
+ mov edi, JCOEFPTR [coef_block]
+ mov ah, 2
+ alignx 16, 7
+.quantloop1:
+ mov al, DCTSIZE2/8/2
+ alignx 16, 7
+.quantloop2:
+ movq mm2, MMWORD [MMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
+ movq mm3, MMWORD [MMBLOCK(0,1,esi,SIZEOF_DCTELEM)]
+
+ movq mm0, mm2
+ movq mm1, mm3
+
+ psraw mm2, (WORD_BIT-1) ; -1 if value < 0, 0 otherwise
+ psraw mm3, (WORD_BIT-1)
+
+ pxor mm0, mm2 ; val = -val
+ pxor mm1, mm3
+ psubw mm0, mm2
+ psubw mm1, mm3
+
+ ;
+ ; MMX is an annoyingly crappy instruction set. It has two
+ ; misfeatures that are causing problems here:
+ ;
+ ; - All multiplications are signed.
+ ;
+ ; - The second operand for the shifts is not treated as packed.
+ ;
+ ;
+ ; We work around the first problem by implementing this algorithm:
+ ;
+ ; unsigned long unsigned_multiply(unsigned short x, unsigned short y)
+ ; {
+ ; enum { SHORT_BIT = 16 };
+ ; signed short sx = (signed short)x;
+ ; signed short sy = (signed short)y;
+ ; signed long sz;
+ ;
+ ; sz = (long)sx * (long)sy; /* signed multiply */
+ ;
+ ; if (sx < 0) sz += (long)sy << SHORT_BIT;
+ ; if (sy < 0) sz += (long)sx << SHORT_BIT;
+ ;
+ ; return (unsigned long)sz;
+ ; }
+ ;
+ ; (note that a negative sx adds _sy_ and vice versa)
+ ;
+ ; For the second problem, we replace the shift by a multiplication.
+ ; Unfortunately that means we have to deal with the signed issue again.
+ ;
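+ ; In the code below, the reciprocal always has its MSB set, so in
+ ; the model above "sy < 0" always holds and the correction term
+ ; reduces to adding the (already non-negative) input back after the
+ ; pmulhw -- which is exactly what the paddw following each pmulhw
+ ; does.
+ ;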
+
+ paddw mm0, MMWORD [CORRECTION(0,0,edx)] ; correction + roundfactor
+ paddw mm1, MMWORD [CORRECTION(0,1,edx)]
+
+ movq mm4, mm0 ; store current value for later
+ movq mm5, mm1
+ pmulhw mm0, MMWORD [RECIPROCAL(0,0,edx)] ; reciprocal
+ pmulhw mm1, MMWORD [RECIPROCAL(0,1,edx)]
+ paddw mm0, mm4 ; reciprocal is always negative (MSB=1),
+ paddw mm1, mm5 ; so we always need to add the initial value
+ ; (input value is never negative as we
+ ; inverted it at the start of this routine)
+
+ ; here it gets a bit tricky as both scale
+ ; and mm0/mm1 can be negative
+ movq mm6, MMWORD [SCALE(0,0,edx)] ; scale
+ movq mm7, MMWORD [SCALE(0,1,edx)]
+ movq mm4, mm0
+ movq mm5, mm1
+ pmulhw mm0, mm6
+ pmulhw mm1, mm7
+
+ psraw mm6, (WORD_BIT-1) ; determine if scale is negative
+ psraw mm7, (WORD_BIT-1)
+
+ pand mm6, mm4 ; and add input if it is
+ pand mm7, mm5
+ paddw mm0, mm6
+ paddw mm1, mm7
+
+ psraw mm4, (WORD_BIT-1) ; then check if negative input
+ psraw mm5, (WORD_BIT-1)
+
+ pand mm4, MMWORD [SCALE(0,0,edx)] ; and add scale if it is
+ pand mm5, MMWORD [SCALE(0,1,edx)]
+ paddw mm0, mm4
+ paddw mm1, mm5
+
+ pxor mm0, mm2 ; val = -val
+ pxor mm1, mm3
+ psubw mm0, mm2
+ psubw mm1, mm3
+
+ movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0
+ movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm1
+
+ add esi, byte 8*SIZEOF_DCTELEM
+ add edx, byte 8*SIZEOF_DCTELEM
+ add edi, byte 8*SIZEOF_JCOEF
+ dec al
+ jnz near .quantloop2
+ dec ah
+ jnz near .quantloop1 ; to avoid branch misprediction
+
+ emms ; empty MMX state
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; unused
+; pop ebx ; unused
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jquant-sse.asm b/media/libjpeg/simd/i386/jquant-sse.asm
new file mode 100644
index 0000000000..218adc976f
--- /dev/null
+++ b/media/libjpeg/simd/i386/jquant-sse.asm
@@ -0,0 +1,208 @@
+;
+; jquant.asm - sample data conversion and quantization (SSE & MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_float_sse(JSAMPARRAY sample_data, JDIMENSION start_col,
+; FAST_FLOAT *workspace);
+;
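+; As a rough scalar sketch (illustrative; CENTERJSAMPLE == 128):
+;
+;   workspace[i] = (FAST_FLOAT)((int)sample_data[row][start_col + i] -
+;                               CENTERJSAMPLE);
+;
+; Here the centering happens in the byte domain (psubb with the 0x80
+; pattern) before the bytes are sign-extended and converted to floats.
+;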
+
+%define sample_data ebp + 8 ; JSAMPARRAY sample_data
+%define start_col ebp + 12 ; JDIMENSION start_col
+%define workspace ebp + 16 ; FAST_FLOAT *workspace
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_convsamp_float_sse)
+
+EXTN(jsimd_convsamp_float_sse):
+ push ebp
+ mov ebp, esp
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ pcmpeqw mm7, mm7
+ psllw mm7, 7
+ packsswb mm7, mm7 ; mm7 = PB_CENTERJSAMPLE (0x808080..)
+
+ mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *)
+ mov eax, JDIMENSION [start_col]
+ mov edi, POINTER [workspace] ; (FAST_FLOAT *)
+ mov ecx, DCTSIZE/2
+ alignx 16, 7
+.convloop:
+ mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+
+ movq mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+ movq mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]
+
+ psubb mm0, mm7 ; mm0=(01234567)
+ psubb mm1, mm7 ; mm1=(89ABCDEF)
+
+ punpcklbw mm2, mm0 ; mm2=(*0*1*2*3)
+ punpckhbw mm0, mm0 ; mm0=(*4*5*6*7)
+ punpcklbw mm3, mm1 ; mm3=(*8*9*A*B)
+ punpckhbw mm1, mm1 ; mm1=(*C*D*E*F)
+
+ punpcklwd mm4, mm2 ; mm4=(***0***1)
+ punpckhwd mm2, mm2 ; mm2=(***2***3)
+ punpcklwd mm5, mm0 ; mm5=(***4***5)
+ punpckhwd mm0, mm0 ; mm0=(***6***7)
+
+ psrad mm4, (DWORD_BIT-BYTE_BIT) ; mm4=(01)
+ psrad mm2, (DWORD_BIT-BYTE_BIT) ; mm2=(23)
+ cvtpi2ps xmm0, mm4 ; xmm0=(01**)
+ cvtpi2ps xmm1, mm2 ; xmm1=(23**)
+ psrad mm5, (DWORD_BIT-BYTE_BIT) ; mm5=(45)
+ psrad mm0, (DWORD_BIT-BYTE_BIT) ; mm0=(67)
+ cvtpi2ps xmm2, mm5 ; xmm2=(45**)
+ cvtpi2ps xmm3, mm0 ; xmm3=(67**)
+
+ punpcklwd mm6, mm3 ; mm6=(***8***9)
+ punpckhwd mm3, mm3 ; mm3=(***A***B)
+ punpcklwd mm4, mm1 ; mm4=(***C***D)
+ punpckhwd mm1, mm1 ; mm1=(***E***F)
+
+ psrad mm6, (DWORD_BIT-BYTE_BIT) ; mm6=(89)
+ psrad mm3, (DWORD_BIT-BYTE_BIT) ; mm3=(AB)
+ cvtpi2ps xmm4, mm6 ; xmm4=(89**)
+ cvtpi2ps xmm5, mm3 ; xmm5=(AB**)
+ psrad mm4, (DWORD_BIT-BYTE_BIT) ; mm4=(CD)
+ psrad mm1, (DWORD_BIT-BYTE_BIT) ; mm1=(EF)
+ cvtpi2ps xmm6, mm4 ; xmm6=(CD**)
+ cvtpi2ps xmm7, mm1 ; xmm7=(EF**)
+
+ movlhps xmm0, xmm1 ; xmm0=(0123)
+ movlhps xmm2, xmm3 ; xmm2=(4567)
+ movlhps xmm4, xmm5 ; xmm4=(89AB)
+ movlhps xmm6, xmm7 ; xmm6=(CDEF)
+
+ movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
+ movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm2
+ movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm4
+ movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
+
+ add esi, byte 2*SIZEOF_JSAMPROW
+ add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
+ dec ecx
+ jnz near .convloop
+
+ emms ; empty MMX state
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; GLOBAL(void)
+; jsimd_quantize_float_sse(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+; FAST_FLOAT *workspace);
+;
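+; As a rough scalar sketch (illustrative; each divisor is the
+; precomputed reciprocal of the scaled quantization step):
+;
+;   coef_block[i] = (JCOEF)lrintf(workspace[i] * divisors[i]);
+;
+; The cvtps2pi instructions below round according to the current MXCSR
+; rounding mode, which is round-to-nearest-even by default.
+;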
+
+%define coef_block ebp + 8 ; JCOEFPTR coef_block
+%define divisors ebp + 12 ; FAST_FLOAT *divisors
+%define workspace ebp + 16 ; FAST_FLOAT *workspace
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_quantize_float_sse)
+
+EXTN(jsimd_quantize_float_sse):
+ push ebp
+ mov ebp, esp
+; push ebx ; unused
+; push ecx ; unused
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov esi, POINTER [workspace]
+ mov edx, POINTER [divisors]
+ mov edi, JCOEFPTR [coef_block]
+ mov eax, DCTSIZE2/16
+ alignx 16, 7
+.quantloop:
+ movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
+ mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+ mulps xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+ movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
+ mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+ mulps xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+
+ movhlps xmm4, xmm0
+ movhlps xmm5, xmm1
+
+ cvtps2pi mm0, xmm0
+ cvtps2pi mm1, xmm1
+ cvtps2pi mm4, xmm4
+ cvtps2pi mm5, xmm5
+
+ movhlps xmm6, xmm2
+ movhlps xmm7, xmm3
+
+ cvtps2pi mm2, xmm2
+ cvtps2pi mm3, xmm3
+ cvtps2pi mm6, xmm6
+ cvtps2pi mm7, xmm7
+
+ packssdw mm0, mm4
+ packssdw mm1, mm5
+ packssdw mm2, mm6
+ packssdw mm3, mm7
+
+ movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
+ movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1
+ movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm2
+ movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3
+
+ add esi, byte 16*SIZEOF_FAST_FLOAT
+ add edx, byte 16*SIZEOF_FAST_FLOAT
+ add edi, byte 16*SIZEOF_JCOEF
+ dec eax
+ jnz short .quantloop
+
+ emms ; empty MMX state
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; unused
+; pop ebx ; unused
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jquantf-sse2.asm b/media/libjpeg/simd/i386/jquantf-sse2.asm
new file mode 100644
index 0000000000..a881ab50f9
--- /dev/null
+++ b/media/libjpeg/simd/i386/jquantf-sse2.asm
@@ -0,0 +1,168 @@
+;
+; jquantf.asm - sample data conversion and quantization (SSE & SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_float_sse2(JSAMPARRAY sample_data, JDIMENSION start_col,
+; FAST_FLOAT *workspace);
+;
+
+%define sample_data ebp + 8 ; JSAMPARRAY sample_data
+%define start_col ebp + 12 ; JDIMENSION start_col
+%define workspace ebp + 16 ; FAST_FLOAT *workspace
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_convsamp_float_sse2)
+
+EXTN(jsimd_convsamp_float_sse2):
+ push ebp
+ mov ebp, esp
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ pcmpeqw xmm7, xmm7
+ psllw xmm7, 7
+ packsswb xmm7, xmm7 ; xmm7 = PB_CENTERJSAMPLE (0x808080..)
+
+ mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *)
+ mov eax, JDIMENSION [start_col]
+ mov edi, POINTER [workspace] ; (FAST_FLOAT *)
+ mov ecx, DCTSIZE/2
+ alignx 16, 7
+.convloop:
+ mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+
+ movq xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+ movq xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
+
+ psubb xmm0, xmm7 ; xmm0=(01234567)
+ psubb xmm1, xmm7 ; xmm1=(89ABCDEF)
+
+ punpcklbw xmm0, xmm0 ; xmm0=(*0*1*2*3*4*5*6*7)
+ punpcklbw xmm1, xmm1 ; xmm1=(*8*9*A*B*C*D*E*F)
+
+ punpcklwd xmm2, xmm0 ; xmm2=(***0***1***2***3)
+ punpckhwd xmm0, xmm0 ; xmm0=(***4***5***6***7)
+ punpcklwd xmm3, xmm1 ; xmm3=(***8***9***A***B)
+ punpckhwd xmm1, xmm1 ; xmm1=(***C***D***E***F)
+
+ psrad xmm2, (DWORD_BIT-BYTE_BIT) ; xmm2=(0123)
+ psrad xmm0, (DWORD_BIT-BYTE_BIT) ; xmm0=(4567)
+ cvtdq2ps xmm2, xmm2 ; xmm2=(0123)
+ cvtdq2ps xmm0, xmm0 ; xmm0=(4567)
+ psrad xmm3, (DWORD_BIT-BYTE_BIT) ; xmm3=(89AB)
+ psrad xmm1, (DWORD_BIT-BYTE_BIT) ; xmm1=(CDEF)
+ cvtdq2ps xmm3, xmm3 ; xmm3=(89AB)
+ cvtdq2ps xmm1, xmm1 ; xmm1=(CDEF)
+
+ movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm2
+ movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
+ movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
+ movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
+
+ add esi, byte 2*SIZEOF_JSAMPROW
+ add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
+ dec ecx
+ jnz short .convloop
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; GLOBAL(void)
+; jsimd_quantize_float_sse2(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+; FAST_FLOAT *workspace);
+;
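+; Same math as the SSE/MMX float path, but cvtps2dq converts four
+; floats per instruction (also rounding per MXCSR), and since no MMX
+; registers are touched, no emms is needed in this file.
+;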
+
+%define coef_block ebp + 8 ; JCOEFPTR coef_block
+%define divisors ebp + 12 ; FAST_FLOAT *divisors
+%define workspace ebp + 16 ; FAST_FLOAT *workspace
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_quantize_float_sse2)
+
+EXTN(jsimd_quantize_float_sse2):
+ push ebp
+ mov ebp, esp
+; push ebx ; unused
+; push ecx ; unused
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov esi, POINTER [workspace]
+ mov edx, POINTER [divisors]
+ mov edi, JCOEFPTR [coef_block]
+ mov eax, DCTSIZE2/16
+ alignx 16, 7
+.quantloop:
+ movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
+ mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+ mulps xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+ movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
+ mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+ mulps xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+
+ cvtps2dq xmm0, xmm0
+ cvtps2dq xmm1, xmm1
+ cvtps2dq xmm2, xmm2
+ cvtps2dq xmm3, xmm3
+
+ packssdw xmm0, xmm1
+ packssdw xmm2, xmm3
+
+ movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_JCOEF)], xmm0
+ movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_JCOEF)], xmm2
+
+ add esi, byte 16*SIZEOF_FAST_FLOAT
+ add edx, byte 16*SIZEOF_FAST_FLOAT
+ add edi, byte 16*SIZEOF_JCOEF
+ dec eax
+ jnz short .quantloop
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; unused
+; pop ebx ; unused
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jquanti-avx2.asm b/media/libjpeg/simd/i386/jquanti-avx2.asm
new file mode 100644
index 0000000000..5ed6bec246
--- /dev/null
+++ b/media/libjpeg/simd/i386/jquanti-avx2.asm
@@ -0,0 +1,188 @@
+;
+; jquanti.asm - sample data conversion and quantization (AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, 2018, D. R. Commander.
+; Copyright (C) 2016, Matthieu Darbois.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_avx2(JSAMPARRAY sample_data, JDIMENSION start_col,
+; DCTELEM *workspace);
+;
+
+%define sample_data ebp + 8 ; JSAMPARRAY sample_data
+%define start_col ebp + 12 ; JDIMENSION start_col
+%define workspace ebp + 16 ; DCTELEM *workspace
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_convsamp_avx2)
+
+EXTN(jsimd_convsamp_avx2):
+ push ebp
+ mov ebp, esp
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *)
+ mov eax, JDIMENSION [start_col]
+ mov edi, POINTER [workspace] ; (DCTELEM *)
+
+ mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ movq xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+ movq xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
+
+ mov ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ movq xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+ movq xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
+
+ mov ebx, JSAMPROW [esi+4*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov edx, JSAMPROW [esi+5*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ movq xmm4, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+ movq xmm5, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
+
+ mov ebx, JSAMPROW [esi+6*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov edx, JSAMPROW [esi+7*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ movq xmm6, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+ movq xmm7, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
+
+ vinserti128 ymm0, ymm0, xmm1, 1
+ vinserti128 ymm2, ymm2, xmm3, 1
+ vinserti128 ymm4, ymm4, xmm5, 1
+ vinserti128 ymm6, ymm6, xmm7, 1
+
+ vpxor ymm1, ymm1, ymm1 ; ymm1=(all 0's)
+ vpunpcklbw ymm0, ymm0, ymm1
+ vpunpcklbw ymm2, ymm2, ymm1
+ vpunpcklbw ymm4, ymm4, ymm1
+ vpunpcklbw ymm6, ymm6, ymm1
+
+ vpcmpeqw ymm7, ymm7, ymm7
+ vpsllw ymm7, ymm7, 7 ; ymm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+ vpaddw ymm0, ymm0, ymm7
+ vpaddw ymm2, ymm2, ymm7
+ vpaddw ymm4, ymm4, ymm7
+ vpaddw ymm6, ymm6, ymm7
+
+ vmovdqu YMMWORD [YMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], ymm0
+ vmovdqu YMMWORD [YMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], ymm2
+ vmovdqu YMMWORD [YMMBLOCK(4,0,edi,SIZEOF_DCTELEM)], ymm4
+ vmovdqu YMMWORD [YMMBLOCK(6,0,edi,SIZEOF_DCTELEM)], ymm6
+
+ vzeroupper
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; This implementation is based on an algorithm described in
+; "How to optimize for the Pentium family of microprocessors"
+; (http://www.agner.org/assem/).
+;
+; GLOBAL(void)
+; jsimd_quantize_avx2(JCOEFPTR coef_block, DCTELEM *divisors,
+; DCTELEM *workspace);
+;
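+; Unlike the MMX/SSE2 versions, this routine is branchless and
+; loop-free: the whole 8x8 block fits in four ymm registers, vpabsw
+; takes absolute values, vpmulhuw performs the unsigned high-word
+; multiplies directly (no signed-multiply workaround), and vpsignw
+; restores the original signs (zeroing outputs whose inputs were zero)
+; at the end.
+;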
+
+%define RECIPROCAL(m, n, b) \
+ YMMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM)
+%define CORRECTION(m, n, b) \
+ YMMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM)
+%define SCALE(m, n, b) \
+ YMMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM)
+
+%define coef_block ebp + 8 ; JCOEFPTR coef_block
+%define divisors ebp + 12 ; DCTELEM *divisors
+%define workspace ebp + 16 ; DCTELEM *workspace
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_quantize_avx2)
+
+EXTN(jsimd_quantize_avx2):
+ push ebp
+ mov ebp, esp
+; push ebx ; unused
+; push ecx ; unused
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov esi, POINTER [workspace]
+ mov edx, POINTER [divisors]
+ mov edi, JCOEFPTR [coef_block]
+
+ vmovdqu ymm4, [YMMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
+ vmovdqu ymm5, [YMMBLOCK(2,0,esi,SIZEOF_DCTELEM)]
+ vmovdqu ymm6, [YMMBLOCK(4,0,esi,SIZEOF_DCTELEM)]
+ vmovdqu ymm7, [YMMBLOCK(6,0,esi,SIZEOF_DCTELEM)]
+ vpabsw ymm0, ymm4
+ vpabsw ymm1, ymm5
+ vpabsw ymm2, ymm6
+ vpabsw ymm3, ymm7
+
+ vpaddw ymm0, YMMWORD [CORRECTION(0,0,edx)] ; correction + roundfactor
+ vpaddw ymm1, YMMWORD [CORRECTION(2,0,edx)]
+ vpaddw ymm2, YMMWORD [CORRECTION(4,0,edx)]
+ vpaddw ymm3, YMMWORD [CORRECTION(6,0,edx)]
+ vpmulhuw ymm0, YMMWORD [RECIPROCAL(0,0,edx)] ; reciprocal
+ vpmulhuw ymm1, YMMWORD [RECIPROCAL(2,0,edx)]
+ vpmulhuw ymm2, YMMWORD [RECIPROCAL(4,0,edx)]
+ vpmulhuw ymm3, YMMWORD [RECIPROCAL(6,0,edx)]
+ vpmulhuw ymm0, YMMWORD [SCALE(0,0,edx)] ; scale
+ vpmulhuw ymm1, YMMWORD [SCALE(2,0,edx)]
+ vpmulhuw ymm2, YMMWORD [SCALE(4,0,edx)]
+ vpmulhuw ymm3, YMMWORD [SCALE(6,0,edx)]
+
+ vpsignw ymm0, ymm0, ymm4
+ vpsignw ymm1, ymm1, ymm5
+ vpsignw ymm2, ymm2, ymm6
+ vpsignw ymm3, ymm3, ymm7
+
+ vmovdqu [YMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], ymm0
+ vmovdqu [YMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], ymm1
+ vmovdqu [YMMBLOCK(4,0,edi,SIZEOF_DCTELEM)], ymm2
+ vmovdqu [YMMBLOCK(6,0,edi,SIZEOF_DCTELEM)], ymm3
+
+ vzeroupper
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; unused
+; pop ebx ; unused
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jquanti-sse2.asm b/media/libjpeg/simd/i386/jquanti-sse2.asm
new file mode 100644
index 0000000000..0a509408aa
--- /dev/null
+++ b/media/libjpeg/simd/i386/jquanti-sse2.asm
@@ -0,0 +1,201 @@
+;
+; jquanti.asm - sample data conversion and quantization (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_sse2(JSAMPARRAY sample_data, JDIMENSION start_col,
+; DCTELEM *workspace);
+;
+
+%define sample_data ebp + 8 ; JSAMPARRAY sample_data
+%define start_col ebp + 12 ; JDIMENSION start_col
+%define workspace ebp + 16 ; DCTELEM *workspace
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_convsamp_sse2)
+
+EXTN(jsimd_convsamp_sse2):
+ push ebp
+ mov ebp, esp
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ pxor xmm6, xmm6 ; xmm6=(all 0's)
+ pcmpeqw xmm7, xmm7
+ psllw xmm7, 7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+ mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *)
+ mov eax, JDIMENSION [start_col]
+ mov edi, POINTER [workspace] ; (DCTELEM *)
+ mov ecx, DCTSIZE/4
+ alignx 16, 7
+.convloop:
+ mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+
+ movq xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; xmm0=(01234567)
+ movq xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE] ; xmm1=(89ABCDEF)
+
+ mov ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+
+ movq xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; xmm2=(GHIJKLMN)
+ movq xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE] ; xmm3=(OPQRSTUV)
+
+ punpcklbw xmm0, xmm6 ; xmm0=(01234567)
+ punpcklbw xmm1, xmm6 ; xmm1=(89ABCDEF)
+ paddw xmm0, xmm7
+ paddw xmm1, xmm7
+ punpcklbw xmm2, xmm6 ; xmm2=(GHIJKLMN)
+ punpcklbw xmm3, xmm6 ; xmm3=(OPQRSTUV)
+ paddw xmm2, xmm7
+ paddw xmm3, xmm7
+
+ movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
+ movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
+ movdqa XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
+ movdqa XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3
+
+ add esi, byte 4*SIZEOF_JSAMPROW
+ add edi, byte 4*DCTSIZE*SIZEOF_DCTELEM
+ dec ecx
+ jnz short .convloop
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ pop ebp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; This implementation is based on an algorithm described in
+; "How to optimize for the Pentium family of microprocessors"
+; (http://www.agner.org/assem/).
+;
+; GLOBAL(void)
+; jsimd_quantize_sse2(JCOEFPTR coef_block, DCTELEM *divisors,
+; DCTELEM *workspace);
+;
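+; The psraw/pxor/psubw triplets below are a branchless two's complement
+; absolute value; a rough scalar sketch:
+;
+;   mask = x >> 15;           /* arithmetic shift: -1 if x < 0, else 0 */
+;   abs  = (x ^ mask) - mask;
+;
+; The same masks are reused after the multiplies to restore the signs.
+;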
+
+%define RECIPROCAL(m, n, b) \
+ XMMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM)
+%define CORRECTION(m, n, b) \
+ XMMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM)
+%define SCALE(m, n, b) \
+ XMMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM)
+
+%define coef_block ebp + 8 ; JCOEFPTR coef_block
+%define divisors ebp + 12 ; DCTELEM *divisors
+%define workspace ebp + 16 ; DCTELEM *workspace
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_quantize_sse2)
+
+EXTN(jsimd_quantize_sse2):
+ push ebp
+ mov ebp, esp
+; push ebx ; unused
+; push ecx ; unused
+; push edx ; need not be preserved
+ push esi
+ push edi
+
+ mov esi, POINTER [workspace]
+ mov edx, POINTER [divisors]
+ mov edi, JCOEFPTR [coef_block]
+ mov eax, DCTSIZE2/32
+ alignx 16, 7
+.quantloop:
+ movdqa xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
+ movdqa xmm5, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_DCTELEM)]
+ movdqa xmm6, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_DCTELEM)]
+ movdqa xmm7, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_DCTELEM)]
+ movdqa xmm0, xmm4
+ movdqa xmm1, xmm5
+ movdqa xmm2, xmm6
+ movdqa xmm3, xmm7
+ psraw xmm4, (WORD_BIT-1)
+ psraw xmm5, (WORD_BIT-1)
+ psraw xmm6, (WORD_BIT-1)
+ psraw xmm7, (WORD_BIT-1)
+ pxor xmm0, xmm4
+ pxor xmm1, xmm5
+ pxor xmm2, xmm6
+ pxor xmm3, xmm7
+ psubw xmm0, xmm4 ; if (xmm0 < 0) xmm0 = -xmm0;
+ psubw xmm1, xmm5 ; if (xmm1 < 0) xmm1 = -xmm1;
+ psubw xmm2, xmm6 ; if (xmm2 < 0) xmm2 = -xmm2;
+ psubw xmm3, xmm7 ; if (xmm3 < 0) xmm3 = -xmm3;
+
+ paddw xmm0, XMMWORD [CORRECTION(0,0,edx)] ; correction + roundfactor
+ paddw xmm1, XMMWORD [CORRECTION(1,0,edx)]
+ paddw xmm2, XMMWORD [CORRECTION(2,0,edx)]
+ paddw xmm3, XMMWORD [CORRECTION(3,0,edx)]
+ pmulhuw xmm0, XMMWORD [RECIPROCAL(0,0,edx)] ; reciprocal
+ pmulhuw xmm1, XMMWORD [RECIPROCAL(1,0,edx)]
+ pmulhuw xmm2, XMMWORD [RECIPROCAL(2,0,edx)]
+ pmulhuw xmm3, XMMWORD [RECIPROCAL(3,0,edx)]
+ pmulhuw xmm0, XMMWORD [SCALE(0,0,edx)] ; scale
+ pmulhuw xmm1, XMMWORD [SCALE(1,0,edx)]
+ pmulhuw xmm2, XMMWORD [SCALE(2,0,edx)]
+ pmulhuw xmm3, XMMWORD [SCALE(3,0,edx)]
+
+ pxor xmm0, xmm4
+ pxor xmm1, xmm5
+ pxor xmm2, xmm6
+ pxor xmm3, xmm7
+ psubw xmm0, xmm4
+ psubw xmm1, xmm5
+ psubw xmm2, xmm6
+ psubw xmm3, xmm7
+ movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
+ movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
+ movdqa XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
+ movdqa XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3
+
+ add esi, byte 32*SIZEOF_DCTELEM
+ add edx, byte 32*SIZEOF_DCTELEM
+ add edi, byte 32*SIZEOF_JCOEF
+ dec eax
+ jnz near .quantloop
+
+ pop edi
+ pop esi
+; pop edx ; need not be preserved
+; pop ecx ; unused
+; pop ebx ; unused
+ pop ebp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/i386/jsimd.c b/media/libjpeg/simd/i386/jsimd.c
new file mode 100644
index 0000000000..80bc821ff4
--- /dev/null
+++ b/media/libjpeg/simd/i386/jsimd.c
@@ -0,0 +1,1246 @@
+/*
+ * jsimd_i386.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2009-2011, 2013-2014, 2016, 2018, 2022, D. R. Commander.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on a
+ * 32-bit x86 architecture.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "jconfigint.h"
+
+/*
+ * In the PIC cases, we have no guarantee that constants will keep
+ * their alignment. This macro allows us to verify it at runtime.
+ */
+#define IS_ALIGNED(ptr, order) (((unsigned)ptr & ((1 << order) - 1)) == 0)
+
+#define IS_ALIGNED_SSE(ptr) (IS_ALIGNED(ptr, 4)) /* 16 byte alignment */
+#define IS_ALIGNED_AVX(ptr) (IS_ALIGNED(ptr, 5)) /* 32 byte alignment */
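+
+/*
+ * For example (illustrative), IS_ALIGNED(ptr, 4) masks the low four
+ * address bits ((1 << 4) - 1 == 0xF), so it is true only for 16-byte-
+ * aligned pointers; "order" is the log2 of the required alignment.
+ */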
+
+static unsigned int simd_support = (unsigned int)(~0);
+static unsigned int simd_huffman = 1;
+
+/*
+ * Check what SIMD accelerations are supported.
+ *
+ * FIXME: This code is racy in a multi-threaded environment.
+ */
+LOCAL(void)
+init_simd(void)
+{
+#ifndef NO_GETENV
+ char env[2] = { 0 };
+#endif
+
+ if (simd_support != ~0U)
+ return;
+
+ simd_support = jpeg_simd_cpu_support();
+
+#ifndef NO_GETENV
+ /* Force different settings through environment variables */
+ if (!GETENV_S(env, 2, "JSIMD_FORCEMMX") && !strcmp(env, "1"))
+ simd_support &= JSIMD_MMX;
+ if (!GETENV_S(env, 2, "JSIMD_FORCE3DNOW") && !strcmp(env, "1"))
+ simd_support &= JSIMD_3DNOW | JSIMD_MMX;
+ if (!GETENV_S(env, 2, "JSIMD_FORCESSE") && !strcmp(env, "1"))
+ simd_support &= JSIMD_SSE | JSIMD_MMX;
+ if (!GETENV_S(env, 2, "JSIMD_FORCESSE2") && !strcmp(env, "1"))
+ simd_support &= JSIMD_SSE2;
+ if (!GETENV_S(env, 2, "JSIMD_FORCEAVX2") && !strcmp(env, "1"))
+ simd_support &= JSIMD_AVX2;
+ if (!GETENV_S(env, 2, "JSIMD_FORCENONE") && !strcmp(env, "1"))
+ simd_support = 0;
+ if (!GETENV_S(env, 2, "JSIMD_NOHUFFENC") && !strcmp(env, "1"))
+ simd_huffman = 0;
+#endif
+}
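+
+/*
+ * Example (illustrative, not part of the library API): when diagnosing
+ * a bug in a specific code path, one can run e.g.
+ *
+ *   JSIMD_FORCESSE2=1 ./cjpeg -outfile out.jpg in.ppm
+ *
+ * which masks simd_support down to the SSE2 bit, so only the SSE2
+ * routines (or the plain C fallbacks) can be dispatched.
+ */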
+
+GLOBAL(int)
+jsimd_can_rgb_ycc(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+ return 0;
+
+ if ((simd_support & JSIMD_AVX2) &&
+ IS_ALIGNED_AVX(jconst_rgb_ycc_convert_avx2))
+ return 1;
+ if ((simd_support & JSIMD_SSE2) &&
+ IS_ALIGNED_SSE(jconst_rgb_ycc_convert_sse2))
+ return 1;
+ if (simd_support & JSIMD_MMX)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_rgb_gray(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+ return 0;
+
+ if ((simd_support & JSIMD_AVX2) &&
+ IS_ALIGNED_AVX(jconst_rgb_gray_convert_avx2))
+ return 1;
+ if ((simd_support & JSIMD_SSE2) &&
+ IS_ALIGNED_SSE(jconst_rgb_gray_convert_sse2))
+ return 1;
+ if (simd_support & JSIMD_MMX)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+ return 0;
+
+ if ((simd_support & JSIMD_AVX2) &&
+ IS_ALIGNED_AVX(jconst_ycc_rgb_convert_avx2))
+ return 1;
+ if ((simd_support & JSIMD_SSE2) &&
+ IS_ALIGNED_SSE(jconst_ycc_rgb_convert_sse2))
+ return 1;
+ if (simd_support & JSIMD_MMX)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb565(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
+{
+ void (*avx2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+ void (*sse2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+ void (*mmxfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+ switch (cinfo->in_color_space) {
+ case JCS_EXT_RGB:
+ avx2fct = jsimd_extrgb_ycc_convert_avx2;
+ sse2fct = jsimd_extrgb_ycc_convert_sse2;
+ mmxfct = jsimd_extrgb_ycc_convert_mmx;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ avx2fct = jsimd_extrgbx_ycc_convert_avx2;
+ sse2fct = jsimd_extrgbx_ycc_convert_sse2;
+ mmxfct = jsimd_extrgbx_ycc_convert_mmx;
+ break;
+ case JCS_EXT_BGR:
+ avx2fct = jsimd_extbgr_ycc_convert_avx2;
+ sse2fct = jsimd_extbgr_ycc_convert_sse2;
+ mmxfct = jsimd_extbgr_ycc_convert_mmx;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ avx2fct = jsimd_extbgrx_ycc_convert_avx2;
+ sse2fct = jsimd_extbgrx_ycc_convert_sse2;
+ mmxfct = jsimd_extbgrx_ycc_convert_mmx;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ avx2fct = jsimd_extxbgr_ycc_convert_avx2;
+ sse2fct = jsimd_extxbgr_ycc_convert_sse2;
+ mmxfct = jsimd_extxbgr_ycc_convert_mmx;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ avx2fct = jsimd_extxrgb_ycc_convert_avx2;
+ sse2fct = jsimd_extxrgb_ycc_convert_sse2;
+ mmxfct = jsimd_extxrgb_ycc_convert_mmx;
+ break;
+ default:
+ avx2fct = jsimd_rgb_ycc_convert_avx2;
+ sse2fct = jsimd_rgb_ycc_convert_sse2;
+ mmxfct = jsimd_rgb_ycc_convert_mmx;
+ break;
+ }
+
+ if (simd_support & JSIMD_AVX2)
+ avx2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+ else if (simd_support & JSIMD_SSE2)
+ sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+ else
+ mmxfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
+{
+ void (*avx2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+ void (*sse2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+ void (*mmxfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+ switch (cinfo->in_color_space) {
+ case JCS_EXT_RGB:
+ avx2fct = jsimd_extrgb_gray_convert_avx2;
+ sse2fct = jsimd_extrgb_gray_convert_sse2;
+ mmxfct = jsimd_extrgb_gray_convert_mmx;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ avx2fct = jsimd_extrgbx_gray_convert_avx2;
+ sse2fct = jsimd_extrgbx_gray_convert_sse2;
+ mmxfct = jsimd_extrgbx_gray_convert_mmx;
+ break;
+ case JCS_EXT_BGR:
+ avx2fct = jsimd_extbgr_gray_convert_avx2;
+ sse2fct = jsimd_extbgr_gray_convert_sse2;
+ mmxfct = jsimd_extbgr_gray_convert_mmx;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ avx2fct = jsimd_extbgrx_gray_convert_avx2;
+ sse2fct = jsimd_extbgrx_gray_convert_sse2;
+ mmxfct = jsimd_extbgrx_gray_convert_mmx;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ avx2fct = jsimd_extxbgr_gray_convert_avx2;
+ sse2fct = jsimd_extxbgr_gray_convert_sse2;
+ mmxfct = jsimd_extxbgr_gray_convert_mmx;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ avx2fct = jsimd_extxrgb_gray_convert_avx2;
+ sse2fct = jsimd_extxrgb_gray_convert_sse2;
+ mmxfct = jsimd_extxrgb_gray_convert_mmx;
+ break;
+ default:
+ avx2fct = jsimd_rgb_gray_convert_avx2;
+ sse2fct = jsimd_rgb_gray_convert_sse2;
+ mmxfct = jsimd_rgb_gray_convert_mmx;
+ break;
+ }
+
+ if (simd_support & JSIMD_AVX2)
+ avx2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+ else if (simd_support & JSIMD_SSE2)
+ sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+ else
+ mmxfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
+{
+ void (*avx2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+ void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+ void (*mmxfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+
+ switch (cinfo->out_color_space) {
+ case JCS_EXT_RGB:
+ avx2fct = jsimd_ycc_extrgb_convert_avx2;
+ sse2fct = jsimd_ycc_extrgb_convert_sse2;
+ mmxfct = jsimd_ycc_extrgb_convert_mmx;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ avx2fct = jsimd_ycc_extrgbx_convert_avx2;
+ sse2fct = jsimd_ycc_extrgbx_convert_sse2;
+ mmxfct = jsimd_ycc_extrgbx_convert_mmx;
+ break;
+ case JCS_EXT_BGR:
+ avx2fct = jsimd_ycc_extbgr_convert_avx2;
+ sse2fct = jsimd_ycc_extbgr_convert_sse2;
+ mmxfct = jsimd_ycc_extbgr_convert_mmx;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ avx2fct = jsimd_ycc_extbgrx_convert_avx2;
+ sse2fct = jsimd_ycc_extbgrx_convert_sse2;
+ mmxfct = jsimd_ycc_extbgrx_convert_mmx;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ avx2fct = jsimd_ycc_extxbgr_convert_avx2;
+ sse2fct = jsimd_ycc_extxbgr_convert_sse2;
+ mmxfct = jsimd_ycc_extxbgr_convert_mmx;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ avx2fct = jsimd_ycc_extxrgb_convert_avx2;
+ sse2fct = jsimd_ycc_extxrgb_convert_sse2;
+ mmxfct = jsimd_ycc_extxrgb_convert_mmx;
+ break;
+ default:
+ avx2fct = jsimd_ycc_rgb_convert_avx2;
+ sse2fct = jsimd_ycc_rgb_convert_sse2;
+ mmxfct = jsimd_ycc_rgb_convert_mmx;
+ break;
+ }
+
+ if (simd_support & JSIMD_AVX2)
+ avx2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+ else if (simd_support & JSIMD_SSE2)
+ sse2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+ else
+ mmxfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
+{
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_downsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_AVX2)
+ return 1;
+ if (simd_support & JSIMD_SSE2)
+ return 1;
+ if (simd_support & JSIMD_MMX)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_downsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_AVX2)
+ return 1;
+ if (simd_support & JSIMD_SSE2)
+ return 1;
+ if (simd_support & JSIMD_MMX)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+ if (simd_support & JSIMD_AVX2)
+ jsimd_h2v2_downsample_avx2(cinfo->image_width, cinfo->max_v_samp_factor,
+ compptr->v_samp_factor,
+ compptr->width_in_blocks, input_data,
+ output_data);
+ else if (simd_support & JSIMD_SSE2)
+ jsimd_h2v2_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor,
+ compptr->v_samp_factor,
+ compptr->width_in_blocks, input_data,
+ output_data);
+ else
+ jsimd_h2v2_downsample_mmx(cinfo->image_width, cinfo->max_v_samp_factor,
+ compptr->v_samp_factor, compptr->width_in_blocks,
+ input_data, output_data);
+}
+
+GLOBAL(void)
+jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+ if (simd_support & JSIMD_AVX2)
+ jsimd_h2v1_downsample_avx2(cinfo->image_width, cinfo->max_v_samp_factor,
+ compptr->v_samp_factor,
+ compptr->width_in_blocks, input_data,
+ output_data);
+ else if (simd_support & JSIMD_SSE2)
+ jsimd_h2v1_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor,
+ compptr->v_samp_factor,
+ compptr->width_in_blocks, input_data,
+ output_data);
+ else
+ jsimd_h2v1_downsample_mmx(cinfo->image_width, cinfo->max_v_samp_factor,
+ compptr->v_samp_factor, compptr->width_in_blocks,
+ input_data, output_data);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_AVX2)
+ return 1;
+ if (simd_support & JSIMD_SSE2)
+ return 1;
+ if (simd_support & JSIMD_MMX)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_AVX2)
+ return 1;
+ if (simd_support & JSIMD_SSE2)
+ return 1;
+ if (simd_support & JSIMD_MMX)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ if (simd_support & JSIMD_AVX2)
+ jsimd_h2v2_upsample_avx2(cinfo->max_v_samp_factor, cinfo->output_width,
+ input_data, output_data_ptr);
+ else if (simd_support & JSIMD_SSE2)
+ jsimd_h2v2_upsample_sse2(cinfo->max_v_samp_factor, cinfo->output_width,
+ input_data, output_data_ptr);
+ else
+ jsimd_h2v2_upsample_mmx(cinfo->max_v_samp_factor, cinfo->output_width,
+ input_data, output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ if (simd_support & JSIMD_AVX2)
+ jsimd_h2v1_upsample_avx2(cinfo->max_v_samp_factor, cinfo->output_width,
+ input_data, output_data_ptr);
+ else if (simd_support & JSIMD_SSE2)
+ jsimd_h2v1_upsample_sse2(cinfo->max_v_samp_factor, cinfo->output_width,
+ input_data, output_data_ptr);
+ else
+ jsimd_h2v1_upsample_mmx(cinfo->max_v_samp_factor, cinfo->output_width,
+ input_data, output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if ((simd_support & JSIMD_AVX2) &&
+ IS_ALIGNED_AVX(jconst_fancy_upsample_avx2))
+ return 1;
+ if ((simd_support & JSIMD_SSE2) &&
+ IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
+ return 1;
+ if (simd_support & JSIMD_MMX)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if ((simd_support & JSIMD_AVX2) &&
+ IS_ALIGNED_AVX(jconst_fancy_upsample_avx2))
+ return 1;
+ if ((simd_support & JSIMD_SSE2) &&
+ IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
+ return 1;
+ if (simd_support & JSIMD_MMX)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ if (simd_support & JSIMD_AVX2)
+ jsimd_h2v2_fancy_upsample_avx2(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data,
+ output_data_ptr);
+ else if (simd_support & JSIMD_SSE2)
+ jsimd_h2v2_fancy_upsample_sse2(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data,
+ output_data_ptr);
+ else
+ jsimd_h2v2_fancy_upsample_mmx(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data,
+ output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ if (simd_support & JSIMD_AVX2)
+ jsimd_h2v1_fancy_upsample_avx2(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data,
+ output_data_ptr);
+ else if (simd_support & JSIMD_SSE2)
+ jsimd_h2v1_fancy_upsample_sse2(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data,
+ output_data_ptr);
+ else
+ jsimd_h2v1_fancy_upsample_mmx(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data,
+ output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if ((simd_support & JSIMD_AVX2) &&
+ IS_ALIGNED_AVX(jconst_merged_upsample_avx2))
+ return 1;
+ if ((simd_support & JSIMD_SSE2) &&
+ IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
+ return 1;
+ if (simd_support & JSIMD_MMX)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if ((simd_support & JSIMD_AVX2) &&
+ IS_ALIGNED_AVX(jconst_merged_upsample_avx2))
+ return 1;
+ if ((simd_support & JSIMD_SSE2) &&
+ IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
+ return 1;
+ if (simd_support & JSIMD_MMX)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+ void (*avx2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+ void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+ void (*mmxfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+ switch (cinfo->out_color_space) {
+ case JCS_EXT_RGB:
+ avx2fct = jsimd_h2v2_extrgb_merged_upsample_avx2;
+ sse2fct = jsimd_h2v2_extrgb_merged_upsample_sse2;
+ mmxfct = jsimd_h2v2_extrgb_merged_upsample_mmx;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ avx2fct = jsimd_h2v2_extrgbx_merged_upsample_avx2;
+ sse2fct = jsimd_h2v2_extrgbx_merged_upsample_sse2;
+ mmxfct = jsimd_h2v2_extrgbx_merged_upsample_mmx;
+ break;
+ case JCS_EXT_BGR:
+ avx2fct = jsimd_h2v2_extbgr_merged_upsample_avx2;
+ sse2fct = jsimd_h2v2_extbgr_merged_upsample_sse2;
+ mmxfct = jsimd_h2v2_extbgr_merged_upsample_mmx;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ avx2fct = jsimd_h2v2_extbgrx_merged_upsample_avx2;
+ sse2fct = jsimd_h2v2_extbgrx_merged_upsample_sse2;
+ mmxfct = jsimd_h2v2_extbgrx_merged_upsample_mmx;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ avx2fct = jsimd_h2v2_extxbgr_merged_upsample_avx2;
+ sse2fct = jsimd_h2v2_extxbgr_merged_upsample_sse2;
+ mmxfct = jsimd_h2v2_extxbgr_merged_upsample_mmx;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ avx2fct = jsimd_h2v2_extxrgb_merged_upsample_avx2;
+ sse2fct = jsimd_h2v2_extxrgb_merged_upsample_sse2;
+ mmxfct = jsimd_h2v2_extxrgb_merged_upsample_mmx;
+ break;
+ default:
+ avx2fct = jsimd_h2v2_merged_upsample_avx2;
+ sse2fct = jsimd_h2v2_merged_upsample_sse2;
+ mmxfct = jsimd_h2v2_merged_upsample_mmx;
+ break;
+ }
+
+ if (simd_support & JSIMD_AVX2)
+ avx2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+ else if (simd_support & JSIMD_SSE2)
+ sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+ else
+ mmxfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(void)
+jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+ void (*avx2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+ void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+ void (*mmxfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+ switch (cinfo->out_color_space) {
+ case JCS_EXT_RGB:
+ avx2fct = jsimd_h2v1_extrgb_merged_upsample_avx2;
+ sse2fct = jsimd_h2v1_extrgb_merged_upsample_sse2;
+ mmxfct = jsimd_h2v1_extrgb_merged_upsample_mmx;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ avx2fct = jsimd_h2v1_extrgbx_merged_upsample_avx2;
+ sse2fct = jsimd_h2v1_extrgbx_merged_upsample_sse2;
+ mmxfct = jsimd_h2v1_extrgbx_merged_upsample_mmx;
+ break;
+ case JCS_EXT_BGR:
+ avx2fct = jsimd_h2v1_extbgr_merged_upsample_avx2;
+ sse2fct = jsimd_h2v1_extbgr_merged_upsample_sse2;
+ mmxfct = jsimd_h2v1_extbgr_merged_upsample_mmx;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ avx2fct = jsimd_h2v1_extbgrx_merged_upsample_avx2;
+ sse2fct = jsimd_h2v1_extbgrx_merged_upsample_sse2;
+ mmxfct = jsimd_h2v1_extbgrx_merged_upsample_mmx;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ avx2fct = jsimd_h2v1_extxbgr_merged_upsample_avx2;
+ sse2fct = jsimd_h2v1_extxbgr_merged_upsample_sse2;
+ mmxfct = jsimd_h2v1_extxbgr_merged_upsample_mmx;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ avx2fct = jsimd_h2v1_extxrgb_merged_upsample_avx2;
+ sse2fct = jsimd_h2v1_extxrgb_merged_upsample_sse2;
+ mmxfct = jsimd_h2v1_extxrgb_merged_upsample_mmx;
+ break;
+ default:
+ avx2fct = jsimd_h2v1_merged_upsample_avx2;
+ sse2fct = jsimd_h2v1_merged_upsample_sse2;
+ mmxfct = jsimd_h2v1_merged_upsample_mmx;
+ break;
+ }
+
+ if (simd_support & JSIMD_AVX2)
+ avx2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+ else if (simd_support & JSIMD_SSE2)
+ sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+ else
+ mmxfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(int)
+jsimd_can_convsamp(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_AVX2)
+ return 1;
+ if (simd_support & JSIMD_SSE2)
+ return 1;
+ if (simd_support & JSIMD_MMX)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_convsamp_float(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(FAST_FLOAT) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_SSE2)
+ return 1;
+ if (simd_support & JSIMD_SSE)
+ return 1;
+ if (simd_support & JSIMD_3DNOW)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
+ DCTELEM *workspace)
+{
+ if (simd_support & JSIMD_AVX2)
+ jsimd_convsamp_avx2(sample_data, start_col, workspace);
+ else if (simd_support & JSIMD_SSE2)
+ jsimd_convsamp_sse2(sample_data, start_col, workspace);
+ else
+ jsimd_convsamp_mmx(sample_data, start_col, workspace);
+}
+
+GLOBAL(void)
+jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
+ FAST_FLOAT *workspace)
+{
+ if (simd_support & JSIMD_SSE2)
+ jsimd_convsamp_float_sse2(sample_data, start_col, workspace);
+ else if (simd_support & JSIMD_SSE)
+ jsimd_convsamp_float_sse(sample_data, start_col, workspace);
+ else
+ jsimd_convsamp_float_3dnow(sample_data, start_col, workspace);
+}
+
+GLOBAL(int)
+jsimd_can_fdct_islow(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
+ if ((simd_support & JSIMD_AVX2) && IS_ALIGNED_AVX(jconst_fdct_islow_avx2))
+ return 1;
+ if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_islow_sse2))
+ return 1;
+ if (simd_support & JSIMD_MMX)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_ifast(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
+ if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_ifast_sse2))
+ return 1;
+ if (simd_support & JSIMD_MMX)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_float(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(FAST_FLOAT) != 4)
+ return 0;
+
+ if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_fdct_float_sse))
+ return 1;
+ if (simd_support & JSIMD_3DNOW)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_fdct_islow(DCTELEM *data)
+{
+ if (simd_support & JSIMD_AVX2)
+ jsimd_fdct_islow_avx2(data);
+ else if (simd_support & JSIMD_SSE2)
+ jsimd_fdct_islow_sse2(data);
+ else
+ jsimd_fdct_islow_mmx(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_ifast(DCTELEM *data)
+{
+ if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_ifast_sse2))
+ jsimd_fdct_ifast_sse2(data);
+ else
+ jsimd_fdct_ifast_mmx(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_float(FAST_FLOAT *data)
+{
+ if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_fdct_float_sse))
+ jsimd_fdct_float_sse(data);
+ else if (simd_support & JSIMD_3DNOW)
+ jsimd_fdct_float_3dnow(data);
+}
+
+GLOBAL(int)
+jsimd_can_quantize(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_AVX2)
+ return 1;
+ if (simd_support & JSIMD_SSE2)
+ return 1;
+ if (simd_support & JSIMD_MMX)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_quantize_float(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (sizeof(FAST_FLOAT) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_SSE2)
+ return 1;
+ if (simd_support & JSIMD_SSE)
+ return 1;
+ if (simd_support & JSIMD_3DNOW)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
+{
+ if (simd_support & JSIMD_AVX2)
+ jsimd_quantize_avx2(coef_block, divisors, workspace);
+ else if (simd_support & JSIMD_SSE2)
+ jsimd_quantize_sse2(coef_block, divisors, workspace);
+ else
+ jsimd_quantize_mmx(coef_block, divisors, workspace);
+}
+
+GLOBAL(void)
+jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+ FAST_FLOAT *workspace)
+{
+ if (simd_support & JSIMD_SSE2)
+ jsimd_quantize_float_sse2(coef_block, divisors, workspace);
+ else if (simd_support & JSIMD_SSE)
+ jsimd_quantize_float_sse(coef_block, divisors, workspace);
+ else
+ jsimd_quantize_float_3dnow(coef_block, divisors, workspace);
+}
+
+GLOBAL(int)
+jsimd_can_idct_2x2(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(ISLOW_MULT_TYPE) != 2)
+ return 0;
+
+ if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
+ return 1;
+ if (simd_support & JSIMD_MMX)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_4x4(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(ISLOW_MULT_TYPE) != 2)
+ return 0;
+
+ if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
+ return 1;
+ if (simd_support & JSIMD_MMX)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
+ jsimd_idct_2x2_sse2(compptr->dct_table, coef_block, output_buf,
+ output_col);
+ else
+ jsimd_idct_2x2_mmx(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
+ jsimd_idct_4x4_sse2(compptr->dct_table, coef_block, output_buf,
+ output_col);
+ else
+ jsimd_idct_4x4_mmx(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(int)
+jsimd_can_idct_islow(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(ISLOW_MULT_TYPE) != 2)
+ return 0;
+
+ if ((simd_support & JSIMD_AVX2) && IS_ALIGNED_AVX(jconst_idct_islow_avx2))
+ return 1;
+ if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_islow_sse2))
+ return 1;
+ if (simd_support & JSIMD_MMX)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_ifast(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(IFAST_MULT_TYPE) != 2)
+ return 0;
+ if (IFAST_SCALE_BITS != 2)
+ return 0;
+
+ if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_ifast_sse2))
+ return 1;
+ if (simd_support & JSIMD_MMX)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_float(void)
+{
+ init_simd();
+
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(FAST_FLOAT) != 4)
+ return 0;
+ if (sizeof(FLOAT_MULT_TYPE) != 4)
+ return 0;
+
+ if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_float_sse2))
+ return 1;
+ if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_idct_float_sse))
+ return 1;
+ if (simd_support & JSIMD_3DNOW)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ if (simd_support & JSIMD_AVX2)
+ jsimd_idct_islow_avx2(compptr->dct_table, coef_block, output_buf,
+ output_col);
+ else if (simd_support & JSIMD_SSE2)
+ jsimd_idct_islow_sse2(compptr->dct_table, coef_block, output_buf,
+ output_col);
+ else
+ jsimd_idct_islow_mmx(compptr->dct_table, coef_block, output_buf,
+ output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_ifast_sse2))
+ jsimd_idct_ifast_sse2(compptr->dct_table, coef_block, output_buf,
+ output_col);
+ else
+ jsimd_idct_ifast_mmx(compptr->dct_table, coef_block, output_buf,
+ output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_float_sse2))
+ jsimd_idct_float_sse2(compptr->dct_table, coef_block, output_buf,
+ output_col);
+ else if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_idct_float_sse))
+ jsimd_idct_float_sse(compptr->dct_table, coef_block, output_buf,
+ output_col);
+ else
+ jsimd_idct_float_3dnow(compptr->dct_table, coef_block, output_buf,
+ output_col);
+}
+
+GLOBAL(int)
+jsimd_can_huff_encode_one_block(void)
+{
+ init_simd();
+
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+
+ if ((simd_support & JSIMD_SSE2) && simd_huffman &&
+ IS_ALIGNED_SSE(jconst_huff_encode_one_block))
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(JOCTET *)
+jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
+ int last_dc_val, c_derived_tbl *dctbl,
+ c_derived_tbl *actbl)
+{
+ return jsimd_huff_encode_one_block_sse2(state, buffer, block, last_dc_val,
+ dctbl, actbl);
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_first_prepare(void)
+{
+ init_simd();
+
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (SIZEOF_SIZE_T != 4)
+ return 0;
+ if (simd_support & JSIMD_SSE2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, JCOEF *values, size_t *zerobits)
+{
+ jsimd_encode_mcu_AC_first_prepare_sse2(block, jpeg_natural_order_start,
+ Sl, Al, values, zerobits);
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_refine_prepare(void)
+{
+ init_simd();
+
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (SIZEOF_SIZE_T != 4)
+ return 0;
+ if (simd_support & JSIMD_SSE2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, JCOEF *absvalues, size_t *bits)
+{
+ return jsimd_encode_mcu_AC_refine_prepare_sse2(block,
+ jpeg_natural_order_start,
+ Sl, Al, absvalues, bits);
+}
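
The jsimd_can_*()/jsimd_*() pairs above follow one pattern throughout: the core
library calls the "can" function once during setup and, only if it returns
nonzero, routes that operation through the matching dispatcher, which picks the
best instruction set recorded in simd_support (AVX2, then SSE2, then MMX). A
minimal caller-side sketch of that pairing; select_fdct() is a hypothetical
name, and the real selection lives in libjpeg-turbo's jcdctmgr.c:

/* Assumes the usual libjpeg headers plus "jsimd.h" are included. */
typedef void (*fdct_method)(DCTELEM *data);

static fdct_method select_fdct(void)
{
  /* Probe once; CPU capabilities cannot change at run time. */
  if (jsimd_can_fdct_islow())
    return jsimd_fdct_islow;   /* dispatcher above: AVX2 -> SSE2 -> MMX */
  return jpeg_fdct_islow;      /* portable C implementation (jfdctint.c) */
}
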
diff --git a/media/libjpeg/simd/i386/jsimdcpu.asm b/media/libjpeg/simd/i386/jsimdcpu.asm
new file mode 100644
index 0000000000..ddcafa9e21
--- /dev/null
+++ b/media/libjpeg/simd/i386/jsimdcpu.asm
@@ -0,0 +1,135 @@
+;
+; jsimdcpu.asm - SIMD instruction support check
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 32
+;
+; Check if the CPU supports SIMD instructions
+;
+; GLOBAL(unsigned int)
+; jpeg_simd_cpu_support(void)
+;
+
+ align 32
+ GLOBAL_FUNCTION(jpeg_simd_cpu_support)
+
+EXTN(jpeg_simd_cpu_support):
+ push ebx
+; push ecx ; need not be preserved
+; push edx ; need not be preserved
+; push esi ; unused
+ push edi
+
+ xor edi, edi ; simd support flag
+
+ pushfd
+ pop eax
+ mov edx, eax
+ xor eax, 1<<21 ; flip ID bit in EFLAGS
+ push eax
+ popfd
+ pushfd
+ pop eax
+ xor eax, edx
+ jz near .return ; CPUID is not supported
+
+ ; Check whether CPUID leaf 07H is supported
+ ; (leaf 07H is used to check for AVX2 instruction support)
+ xor eax, eax
+ cpuid
+ test eax, eax
+ jz near .return
+ cmp eax, 7
+ jl short .no_avx2 ; Maximum leaf < 07H
+
+ ; Check for AVX2 instruction support
+ mov eax, 7
+ xor ecx, ecx
+ cpuid
+ mov eax, ebx
+ test eax, 1<<5 ; bit5:AVX2
+ jz short .no_avx2
+
+ ; Check for AVX2 O/S support
+ mov eax, 1
+ xor ecx, ecx
+ cpuid
+ test ecx, 1<<27
+ jz short .no_avx2 ; O/S does not support XSAVE
+ test ecx, 1<<28
+ jz short .no_avx2 ; CPU does not support AVX
+
+ xor ecx, ecx
+ xgetbv
+ and eax, 6
+ cmp eax, 6 ; O/S does not manage XMM/YMM state
+ ; using XSAVE
+ jnz short .no_avx2
+
+ or edi, JSIMD_AVX2
+.no_avx2:
+
+ ; Check CPUID leaf 01H for MMX, SSE, and SSE2 support
+ xor eax, eax
+ inc eax
+ cpuid
+ mov eax, edx ; eax = Standard feature flags
+
+ ; Check for MMX instruction support
+ test eax, 1<<23 ; bit23:MMX
+ jz short .no_mmx
+ or edi, byte JSIMD_MMX
+.no_mmx:
+ test eax, 1<<25 ; bit25:SSE
+ jz short .no_sse
+ or edi, byte JSIMD_SSE
+.no_sse:
+ test eax, 1<<26 ; bit26:SSE2
+ jz short .no_sse2
+ or edi, byte JSIMD_SSE2
+.no_sse2:
+
+ ; Check for 3DNow! instruction support
+ mov eax, 0x80000000
+ cpuid
+ cmp eax, 0x80000000
+ jbe short .return
+
+ mov eax, 0x80000001
+ cpuid
+ mov eax, edx ; eax = Extended feature flags
+
+ test eax, 1<<31 ; bit31:3DNow!(vendor independent)
+ jz short .no_3dnow
+ or edi, byte JSIMD_3DNOW
+.no_3dnow:
+
+.return:
+ mov eax, edi
+
+ pop edi
+; pop esi ; unused
+; pop edx ; need not be preserved
+; pop ecx ; need not be preserved
+ pop ebx
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
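
The routine above is the standard CPUID probe sequence: the EFLAGS ID-bit
toggle proves CPUID exists, leaf 01H EDX yields the MMX/SSE/SSE2 bits, AVX2
additionally requires leaf 07H EBX bit 5 plus OSXSAVE/AVX in leaf 01H ECX and
an XGETBV check that the OS saves XMM/YMM state, and extended leaf 80000001H
yields 3DNow!. A hedged C sketch of the same logic, assuming GCC/Clang on x86
with a <cpuid.h> that provides __get_cpuid_count; the MY_* flag values are
illustrative, not the library's JSIMD_* constants:

#include <cpuid.h>

#define MY_MMX   0x01  /* illustrative flag values only */
#define MY_SSE   0x02
#define MY_SSE2  0x04
#define MY_AVX2  0x08

static unsigned int cpu_simd_flags(void)
{
  unsigned int eax, ebx, ecx, edx, flags = 0;

  if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
    return 0;                              /* CPUID leaf 01H unavailable */
  if (edx & (1u << 23)) flags |= MY_MMX;   /* bit 23: MMX */
  if (edx & (1u << 25)) flags |= MY_SSE;   /* bit 25: SSE */
  if (edx & (1u << 26)) flags |= MY_SSE2;  /* bit 26: SSE2 */

  if ((ecx & (1u << 27)) && (ecx & (1u << 28))) {  /* OSXSAVE and AVX */
    unsigned int eax7, ebx7, ecx7, edx7, xlo, xhi;
    if (__get_cpuid_count(7, 0, &eax7, &ebx7, &ecx7, &edx7) &&
        (ebx7 & (1u << 5))) {              /* leaf 07H EBX bit 5: AVX2 */
      __asm__("xgetbv" : "=a" (xlo), "=d" (xhi) : "c" (0));
      (void)xhi;
      if ((xlo & 6) == 6)                  /* OS saves XMM and YMM state */
        flags |= MY_AVX2;
    }
  }
  return flags;   /* the 3DNow! probe (leaf 80000001H) is omitted here */
}
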
diff --git a/media/libjpeg/simd/jccolext-mmx.asm b/media/libjpeg/simd/jccolext-mmx.asm
deleted file mode 100644
index 96a0372b1b..0000000000
--- a/media/libjpeg/simd/jccolext-mmx.asm
+++ /dev/null
@@ -1,476 +0,0 @@
-;
-; jccolext.asm - colorspace conversion (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jcolsamp.inc"
-
-; --------------------------------------------------------------------------
-;
-; Convert some rows of samples to the output colorspace.
-;
-; GLOBAL(void)
-; jsimd_rgb_ycc_convert_mmx (JDIMENSION img_width,
-; JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-; JDIMENSION output_row, int num_rows);
-;
-
-%define img_width(b) (b)+8 ; JDIMENSION img_width
-%define input_buf(b) (b)+12 ; JSAMPARRAY input_buf
-%define output_buf(b) (b)+16 ; JSAMPIMAGE output_buf
-%define output_row(b) (b)+20 ; JDIMENSION output_row
-%define num_rows(b) (b)+24 ; int num_rows
-
-%define original_ebp ebp+0
-%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM]
-%define WK_NUM 8
-%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
-
- align 16
- global EXTN(jsimd_rgb_ycc_convert_mmx)
-
-EXTN(jsimd_rgb_ycc_convert_mmx):
- push ebp
- mov eax,esp ; eax = original ebp
- sub esp, byte 4
- and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
- mov [esp],eax
- mov ebp,esp ; ebp = aligned ebp
- lea esp, [wk(0)]
- pushpic eax ; make a room for GOT address
- push ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- get_GOT ebx ; get GOT address
- movpic POINTER [gotptr], ebx ; save GOT address
-
- mov ecx, JDIMENSION [img_width(eax)] ; num_cols
- test ecx,ecx
- jz near .return
-
- push ecx
-
- mov esi, JSAMPIMAGE [output_buf(eax)]
- mov ecx, JDIMENSION [output_row(eax)]
- mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
- mov ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY]
- mov edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY]
- lea edi, [edi+ecx*SIZEOF_JSAMPROW]
- lea ebx, [ebx+ecx*SIZEOF_JSAMPROW]
- lea edx, [edx+ecx*SIZEOF_JSAMPROW]
-
- pop ecx
-
- mov esi, JSAMPARRAY [input_buf(eax)]
- mov eax, INT [num_rows(eax)]
- test eax,eax
- jle near .return
- alignx 16,7
-.rowloop:
- pushpic eax
- push edx
- push ebx
- push edi
- push esi
- push ecx ; col
-
- mov esi, JSAMPROW [esi] ; inptr
- mov edi, JSAMPROW [edi] ; outptr0
- mov ebx, JSAMPROW [ebx] ; outptr1
- mov edx, JSAMPROW [edx] ; outptr2
- movpic eax, POINTER [gotptr] ; load GOT address (eax)
-
- cmp ecx, byte SIZEOF_MMWORD
- jae short .columnloop
- alignx 16,7
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
-.column_ld1:
- push eax
- push edx
- lea ecx,[ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE
- test cl, SIZEOF_BYTE
- jz short .column_ld2
- sub ecx, byte SIZEOF_BYTE
- xor eax,eax
- mov al, BYTE [esi+ecx]
-.column_ld2:
- test cl, SIZEOF_WORD
- jz short .column_ld4
- sub ecx, byte SIZEOF_WORD
- xor edx,edx
- mov dx, WORD [esi+ecx]
- shl eax, WORD_BIT
- or eax,edx
-.column_ld4:
- movd mmA,eax
- pop edx
- pop eax
- test cl, SIZEOF_DWORD
- jz short .column_ld8
- sub ecx, byte SIZEOF_DWORD
- movd mmG, DWORD [esi+ecx]
- psllq mmA, DWORD_BIT
- por mmA,mmG
-.column_ld8:
- test cl, SIZEOF_MMWORD
- jz short .column_ld16
- movq mmG,mmA
- movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
- mov ecx, SIZEOF_MMWORD
- jmp short .rgb_ycc_cnv
-.column_ld16:
- test cl, 2*SIZEOF_MMWORD
- mov ecx, SIZEOF_MMWORD
- jz short .rgb_ycc_cnv
- movq mmF,mmA
- movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
- movq mmG, MMWORD [esi+1*SIZEOF_MMWORD]
- jmp short .rgb_ycc_cnv
- alignx 16,7
-
-.columnloop:
- movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
- movq mmG, MMWORD [esi+1*SIZEOF_MMWORD]
- movq mmF, MMWORD [esi+2*SIZEOF_MMWORD]
-
-.rgb_ycc_cnv:
- ; mmA=(00 10 20 01 11 21 02 12)
- ; mmG=(22 03 13 23 04 14 24 05)
- ; mmF=(15 25 06 16 26 07 17 27)
-
- movq mmD,mmA
- psllq mmA,4*BYTE_BIT ; mmA=(-- -- -- -- 00 10 20 01)
- psrlq mmD,4*BYTE_BIT ; mmD=(11 21 02 12 -- -- -- --)
-
- punpckhbw mmA,mmG ; mmA=(00 04 10 14 20 24 01 05)
- psllq mmG,4*BYTE_BIT ; mmG=(-- -- -- -- 22 03 13 23)
-
- punpcklbw mmD,mmF ; mmD=(11 15 21 25 02 06 12 16)
- punpckhbw mmG,mmF ; mmG=(22 26 03 07 13 17 23 27)
-
- movq mmE,mmA
- psllq mmA,4*BYTE_BIT ; mmA=(-- -- -- -- 00 04 10 14)
- psrlq mmE,4*BYTE_BIT ; mmE=(20 24 01 05 -- -- -- --)
-
- punpckhbw mmA,mmD ; mmA=(00 02 04 06 10 12 14 16)
- psllq mmD,4*BYTE_BIT ; mmD=(-- -- -- -- 11 15 21 25)
-
- punpcklbw mmE,mmG ; mmE=(20 22 24 26 01 03 05 07)
- punpckhbw mmD,mmG ; mmD=(11 13 15 17 21 23 25 27)
-
- pxor mmH,mmH
-
- movq mmC,mmA
- punpcklbw mmA,mmH ; mmA=(00 02 04 06)
- punpckhbw mmC,mmH ; mmC=(10 12 14 16)
-
- movq mmB,mmE
- punpcklbw mmE,mmH ; mmE=(20 22 24 26)
- punpckhbw mmB,mmH ; mmB=(01 03 05 07)
-
- movq mmF,mmD
- punpcklbw mmD,mmH ; mmD=(11 13 15 17)
- punpckhbw mmF,mmH ; mmF=(21 23 25 27)
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-.column_ld1:
- test cl, SIZEOF_MMWORD/8
- jz short .column_ld2
- sub ecx, byte SIZEOF_MMWORD/8
- movd mmA, DWORD [esi+ecx*RGB_PIXELSIZE]
-.column_ld2:
- test cl, SIZEOF_MMWORD/4
- jz short .column_ld4
- sub ecx, byte SIZEOF_MMWORD/4
- movq mmF,mmA
- movq mmA, MMWORD [esi+ecx*RGB_PIXELSIZE]
-.column_ld4:
- test cl, SIZEOF_MMWORD/2
- mov ecx, SIZEOF_MMWORD
- jz short .rgb_ycc_cnv
- movq mmD,mmA
- movq mmC,mmF
- movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
- movq mmF, MMWORD [esi+1*SIZEOF_MMWORD]
- jmp short .rgb_ycc_cnv
- alignx 16,7
-
-.columnloop:
- movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
- movq mmF, MMWORD [esi+1*SIZEOF_MMWORD]
- movq mmD, MMWORD [esi+2*SIZEOF_MMWORD]
- movq mmC, MMWORD [esi+3*SIZEOF_MMWORD]
-
-.rgb_ycc_cnv:
- ; mmA=(00 10 20 30 01 11 21 31)
- ; mmF=(02 12 22 32 03 13 23 33)
- ; mmD=(04 14 24 34 05 15 25 35)
- ; mmC=(06 16 26 36 07 17 27 37)
-
- movq mmB,mmA
- punpcklbw mmA,mmF ; mmA=(00 02 10 12 20 22 30 32)
- punpckhbw mmB,mmF ; mmB=(01 03 11 13 21 23 31 33)
-
- movq mmG,mmD
- punpcklbw mmD,mmC ; mmD=(04 06 14 16 24 26 34 36)
- punpckhbw mmG,mmC ; mmG=(05 07 15 17 25 27 35 37)
-
- movq mmE,mmA
- punpcklwd mmA,mmD ; mmA=(00 02 04 06 10 12 14 16)
- punpckhwd mmE,mmD ; mmE=(20 22 24 26 30 32 34 36)
-
- movq mmH,mmB
- punpcklwd mmB,mmG ; mmB=(01 03 05 07 11 13 15 17)
- punpckhwd mmH,mmG ; mmH=(21 23 25 27 31 33 35 37)
-
- pxor mmF,mmF
-
- movq mmC,mmA
- punpcklbw mmA,mmF ; mmA=(00 02 04 06)
- punpckhbw mmC,mmF ; mmC=(10 12 14 16)
-
- movq mmD,mmB
- punpcklbw mmB,mmF ; mmB=(01 03 05 07)
- punpckhbw mmD,mmF ; mmD=(11 13 15 17)
-
- movq mmG,mmE
- punpcklbw mmE,mmF ; mmE=(20 22 24 26)
- punpckhbw mmG,mmF ; mmG=(30 32 34 36)
-
- punpcklbw mmF,mmH
- punpckhbw mmH,mmH
- psrlw mmF,BYTE_BIT ; mmF=(21 23 25 27)
- psrlw mmH,BYTE_BIT ; mmH=(31 33 35 37)
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
- ; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE
- ; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO
-
- ; (Original)
- ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
- ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
- ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
- ;
- ; (This implementation)
- ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
- ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
- ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
-
- movq MMWORD [wk(0)], mm0 ; wk(0)=RE
- movq MMWORD [wk(1)], mm1 ; wk(1)=RO
- movq MMWORD [wk(2)], mm4 ; wk(2)=BE
- movq MMWORD [wk(3)], mm5 ; wk(3)=BO
-
- movq mm6,mm1
- punpcklwd mm1,mm3
- punpckhwd mm6,mm3
- movq mm7,mm1
- movq mm4,mm6
- pmaddwd mm1,[GOTOFF(eax,PW_F0299_F0337)] ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337)
- pmaddwd mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337)
- pmaddwd mm7,[GOTOFF(eax,PW_MF016_MF033)] ; mm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
- pmaddwd mm4,[GOTOFF(eax,PW_MF016_MF033)] ; mm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
-
- movq MMWORD [wk(4)], mm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
- movq MMWORD [wk(5)], mm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
-
- pxor mm1,mm1
- pxor mm6,mm6
- punpcklwd mm1,mm5 ; mm1=BOL
- punpckhwd mm6,mm5 ; mm6=BOH
- psrld mm1,1 ; mm1=BOL*FIX(0.500)
- psrld mm6,1 ; mm6=BOH*FIX(0.500)
-
- movq mm5,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm5=[PD_ONEHALFM1_CJ]
-
- paddd mm7,mm1
- paddd mm4,mm6
- paddd mm7,mm5
- paddd mm4,mm5
- psrld mm7,SCALEBITS ; mm7=CbOL
- psrld mm4,SCALEBITS ; mm4=CbOH
- packssdw mm7,mm4 ; mm7=CbO
-
- movq mm1, MMWORD [wk(2)] ; mm1=BE
-
- movq mm6,mm0
- punpcklwd mm0,mm2
- punpckhwd mm6,mm2
- movq mm5,mm0
- movq mm4,mm6
- pmaddwd mm0,[GOTOFF(eax,PW_F0299_F0337)] ; mm0=REL*FIX(0.299)+GEL*FIX(0.337)
- pmaddwd mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=REH*FIX(0.299)+GEH*FIX(0.337)
- pmaddwd mm5,[GOTOFF(eax,PW_MF016_MF033)] ; mm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
- pmaddwd mm4,[GOTOFF(eax,PW_MF016_MF033)] ; mm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
-
- movq MMWORD [wk(6)], mm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
- movq MMWORD [wk(7)], mm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
-
- pxor mm0,mm0
- pxor mm6,mm6
- punpcklwd mm0,mm1 ; mm0=BEL
- punpckhwd mm6,mm1 ; mm6=BEH
- psrld mm0,1 ; mm0=BEL*FIX(0.500)
- psrld mm6,1 ; mm6=BEH*FIX(0.500)
-
- movq mm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm1=[PD_ONEHALFM1_CJ]
-
- paddd mm5,mm0
- paddd mm4,mm6
- paddd mm5,mm1
- paddd mm4,mm1
- psrld mm5,SCALEBITS ; mm5=CbEL
- psrld mm4,SCALEBITS ; mm4=CbEH
- packssdw mm5,mm4 ; mm5=CbE
-
- psllw mm7,BYTE_BIT
- por mm5,mm7 ; mm5=Cb
- movq MMWORD [ebx], mm5 ; Save Cb
-
- movq mm0, MMWORD [wk(3)] ; mm0=BO
- movq mm6, MMWORD [wk(2)] ; mm6=BE
- movq mm1, MMWORD [wk(1)] ; mm1=RO
-
- movq mm4,mm0
- punpcklwd mm0,mm3
- punpckhwd mm4,mm3
- movq mm7,mm0
- movq mm5,mm4
- pmaddwd mm0,[GOTOFF(eax,PW_F0114_F0250)] ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250)
- pmaddwd mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250)
- pmaddwd mm7,[GOTOFF(eax,PW_MF008_MF041)] ; mm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
- pmaddwd mm5,[GOTOFF(eax,PW_MF008_MF041)] ; mm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
-
- movq mm3,[GOTOFF(eax,PD_ONEHALF)] ; mm3=[PD_ONEHALF]
-
- paddd mm0, MMWORD [wk(4)]
- paddd mm4, MMWORD [wk(5)]
- paddd mm0,mm3
- paddd mm4,mm3
- psrld mm0,SCALEBITS ; mm0=YOL
- psrld mm4,SCALEBITS ; mm4=YOH
- packssdw mm0,mm4 ; mm0=YO
-
- pxor mm3,mm3
- pxor mm4,mm4
- punpcklwd mm3,mm1 ; mm3=ROL
- punpckhwd mm4,mm1 ; mm4=ROH
- psrld mm3,1 ; mm3=ROL*FIX(0.500)
- psrld mm4,1 ; mm4=ROH*FIX(0.500)
-
- movq mm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm1=[PD_ONEHALFM1_CJ]
-
- paddd mm7,mm3
- paddd mm5,mm4
- paddd mm7,mm1
- paddd mm5,mm1
- psrld mm7,SCALEBITS ; mm7=CrOL
- psrld mm5,SCALEBITS ; mm5=CrOH
- packssdw mm7,mm5 ; mm7=CrO
-
- movq mm3, MMWORD [wk(0)] ; mm3=RE
-
- movq mm4,mm6
- punpcklwd mm6,mm2
- punpckhwd mm4,mm2
- movq mm1,mm6
- movq mm5,mm4
- pmaddwd mm6,[GOTOFF(eax,PW_F0114_F0250)] ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250)
- pmaddwd mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250)
- pmaddwd mm1,[GOTOFF(eax,PW_MF008_MF041)] ; mm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
- pmaddwd mm5,[GOTOFF(eax,PW_MF008_MF041)] ; mm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
-
- movq mm2,[GOTOFF(eax,PD_ONEHALF)] ; mm2=[PD_ONEHALF]
-
- paddd mm6, MMWORD [wk(6)]
- paddd mm4, MMWORD [wk(7)]
- paddd mm6,mm2
- paddd mm4,mm2
- psrld mm6,SCALEBITS ; mm6=YEL
- psrld mm4,SCALEBITS ; mm4=YEH
- packssdw mm6,mm4 ; mm6=YE
-
- psllw mm0,BYTE_BIT
- por mm6,mm0 ; mm6=Y
- movq MMWORD [edi], mm6 ; Save Y
-
- pxor mm2,mm2
- pxor mm4,mm4
- punpcklwd mm2,mm3 ; mm2=REL
- punpckhwd mm4,mm3 ; mm4=REH
- psrld mm2,1 ; mm2=REL*FIX(0.500)
- psrld mm4,1 ; mm4=REH*FIX(0.500)
-
- movq mm0,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm0=[PD_ONEHALFM1_CJ]
-
- paddd mm1,mm2
- paddd mm5,mm4
- paddd mm1,mm0
- paddd mm5,mm0
- psrld mm1,SCALEBITS ; mm1=CrEL
- psrld mm5,SCALEBITS ; mm5=CrEH
- packssdw mm1,mm5 ; mm1=CrE
-
- psllw mm7,BYTE_BIT
- por mm1,mm7 ; mm1=Cr
- movq MMWORD [edx], mm1 ; Save Cr
-
- sub ecx, byte SIZEOF_MMWORD
- add esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; inptr
- add edi, byte SIZEOF_MMWORD ; outptr0
- add ebx, byte SIZEOF_MMWORD ; outptr1
- add edx, byte SIZEOF_MMWORD ; outptr2
- cmp ecx, byte SIZEOF_MMWORD
- jae near .columnloop
- test ecx,ecx
- jnz near .column_ld1
-
- pop ecx ; col
- pop esi
- pop edi
- pop ebx
- pop edx
- poppic eax
-
- add esi, byte SIZEOF_JSAMPROW ; input_buf
- add edi, byte SIZEOF_JSAMPROW
- add ebx, byte SIZEOF_JSAMPROW
- add edx, byte SIZEOF_JSAMPROW
- dec eax ; num_rows
- jg near .rowloop
-
- emms ; empty MMX state
-
-.return:
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- pop ebx
- mov esp,ebp ; esp <- aligned ebp
- pop esp ; esp <- original ebp
- pop ebp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
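
The "(Original)" vs. "(This implementation)" comment in the deleted code above
is the key to the whole kernel: pmaddwd multiplies signed 16-bit words, and
FIX(0.58700) = 38470 does not fit in one, so 0.587*G is computed as
0.337*G + 0.250*G (both coefficients fit); likewise the 0.5 coefficients in Cb
and Cr are produced with a one-bit shift rather than a multiply. A scalar
sketch of the fixed-point arithmetic being vectorized, assuming the usual
libjpeg SCALEBITS/FIX conventions; rgb_to_y() is an illustrative name:

#define SCALEBITS 16
#define ONE_HALF  (1L << (SCALEBITS - 1))
#define FIX(x)    ((long)((x) * (1L << SCALEBITS) + 0.5))

static unsigned char rgb_to_y(int r, int g, int b)
{
  /* 0.587*G is split as (0.587 - 0.250)*G + 0.250*G so that every
   * coefficient fits in a signed 16-bit word for pmaddwd:
   * FIX(0.58700) = 38470 overflows, FIX(0.25000) = 16384 does not. */
  long y = FIX(0.29900) * r +
           (FIX(0.58700) - FIX(0.25000)) * g +
           FIX(0.11400) * b +
           FIX(0.25000) * g + ONE_HALF;
  return (unsigned char)(y >> SCALEBITS);
}
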
diff --git a/media/libjpeg/simd/jccolext-sse2-64.asm b/media/libjpeg/simd/jccolext-sse2-64.asm
deleted file mode 100644
index 8e4642d3bc..0000000000
--- a/media/libjpeg/simd/jccolext-sse2-64.asm
+++ /dev/null
@@ -1,486 +0,0 @@
-;
-; jccolext.asm - colorspace conversion (64-bit SSE2)
-;
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jcolsamp.inc"
-
-; --------------------------------------------------------------------------
-;
-; Convert some rows of samples to the output colorspace.
-;
-; GLOBAL(void)
-; jsimd_rgb_ycc_convert_sse2 (JDIMENSION img_width,
-; JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-; JDIMENSION output_row, int num_rows);
-;
-
-; r10 = JDIMENSION img_width
-; r11 = JSAMPARRAY input_buf
-; r12 = JSAMPIMAGE output_buf
-; r13 = JDIMENSION output_row
-; r14 = int num_rows
-
-%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 8
-
- align 16
-
- global EXTN(jsimd_rgb_ycc_convert_sse2)
-
-EXTN(jsimd_rgb_ycc_convert_sse2):
- push rbp
- mov rax,rsp ; rax = original rbp
- sub rsp, byte 4
- and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [rsp],rax
- mov rbp,rsp ; rbp = aligned rbp
- lea rsp, [wk(0)]
- collect_args
- push rbx
-
- mov ecx, r10d
- test rcx,rcx
- jz near .return
-
- push rcx
-
- mov rsi, r12
- mov ecx, r13d
- mov rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
- mov rbx, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY]
- mov rdx, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY]
- lea rdi, [rdi+rcx*SIZEOF_JSAMPROW]
- lea rbx, [rbx+rcx*SIZEOF_JSAMPROW]
- lea rdx, [rdx+rcx*SIZEOF_JSAMPROW]
-
- pop rcx
-
- mov rsi, r11
- mov eax, r14d
- test rax,rax
- jle near .return
-.rowloop:
- push rdx
- push rbx
- push rdi
- push rsi
- push rcx ; col
-
- mov rsi, JSAMPROW [rsi] ; inptr
- mov rdi, JSAMPROW [rdi] ; outptr0
- mov rbx, JSAMPROW [rbx] ; outptr1
- mov rdx, JSAMPROW [rdx] ; outptr2
-
- cmp rcx, byte SIZEOF_XMMWORD
- jae near .columnloop
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
-.column_ld1:
- push rax
- push rdx
- lea rcx,[rcx+rcx*2] ; imul ecx,RGB_PIXELSIZE
- test cl, SIZEOF_BYTE
- jz short .column_ld2
- sub rcx, byte SIZEOF_BYTE
- movzx rax, BYTE [rsi+rcx]
-.column_ld2:
- test cl, SIZEOF_WORD
- jz short .column_ld4
- sub rcx, byte SIZEOF_WORD
- movzx rdx, WORD [rsi+rcx]
- shl rax, WORD_BIT
- or rax,rdx
-.column_ld4:
- movd xmmA,eax
- pop rdx
- pop rax
- test cl, SIZEOF_DWORD
- jz short .column_ld8
- sub rcx, byte SIZEOF_DWORD
- movd xmmF, XMM_DWORD [rsi+rcx]
- pslldq xmmA, SIZEOF_DWORD
- por xmmA,xmmF
-.column_ld8:
- test cl, SIZEOF_MMWORD
- jz short .column_ld16
- sub rcx, byte SIZEOF_MMWORD
- movq xmmB, XMM_MMWORD [rsi+rcx]
- pslldq xmmA, SIZEOF_MMWORD
- por xmmA,xmmB
-.column_ld16:
- test cl, SIZEOF_XMMWORD
- jz short .column_ld32
- movdqa xmmF,xmmA
- movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
- mov rcx, SIZEOF_XMMWORD
- jmp short .rgb_ycc_cnv
-.column_ld32:
- test cl, 2*SIZEOF_XMMWORD
- mov rcx, SIZEOF_XMMWORD
- jz short .rgb_ycc_cnv
- movdqa xmmB,xmmA
- movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
- movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
- jmp short .rgb_ycc_cnv
-
-.columnloop:
- movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
- movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
- movdqu xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]
-
-.rgb_ycc_cnv:
- ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
- ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
- ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
-
- movdqa xmmG,xmmA
- pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
- psrldq xmmG,8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
-
- punpckhbw xmmA,xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
- pslldq xmmF,8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
-
- punpcklbw xmmG,xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
- punpckhbw xmmF,xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
-
- movdqa xmmD,xmmA
- pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
- psrldq xmmD,8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
-
- punpckhbw xmmA,xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
- pslldq xmmG,8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
-
- punpcklbw xmmD,xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
- punpckhbw xmmG,xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
-
- movdqa xmmE,xmmA
- pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
- psrldq xmmE,8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
-
- punpckhbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
- pslldq xmmD,8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
-
- punpcklbw xmmE,xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
- punpckhbw xmmD,xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
-
- pxor xmmH,xmmH
-
- movdqa xmmC,xmmA
- punpcklbw xmmA,xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E)
- punpckhbw xmmC,xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E)
-
- movdqa xmmB,xmmE
- punpcklbw xmmE,xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E)
- punpckhbw xmmB,xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F)
-
- movdqa xmmF,xmmD
- punpcklbw xmmD,xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F)
- punpckhbw xmmF,xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F)
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-.column_ld1:
- test cl, SIZEOF_XMMWORD/16
- jz short .column_ld2
- sub rcx, byte SIZEOF_XMMWORD/16
- movd xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
-.column_ld2:
- test cl, SIZEOF_XMMWORD/8
- jz short .column_ld4
- sub rcx, byte SIZEOF_XMMWORD/8
- movq xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
- pslldq xmmA, SIZEOF_MMWORD
- por xmmA,xmmE
-.column_ld4:
- test cl, SIZEOF_XMMWORD/4
- jz short .column_ld8
- sub rcx, byte SIZEOF_XMMWORD/4
- movdqa xmmE,xmmA
- movdqu xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
-.column_ld8:
- test cl, SIZEOF_XMMWORD/2
- mov rcx, SIZEOF_XMMWORD
- jz short .rgb_ycc_cnv
- movdqa xmmF,xmmA
- movdqa xmmH,xmmE
- movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
- movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
- jmp short .rgb_ycc_cnv
-
-.columnloop:
- movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
- movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
- movdqu xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
- movdqu xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]
-
-.rgb_ycc_cnv:
- ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
- ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
- ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
- ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
-
- movdqa xmmD,xmmA
- punpcklbw xmmA,xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
- punpckhbw xmmD,xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
-
- movdqa xmmC,xmmF
- punpcklbw xmmF,xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
- punpckhbw xmmC,xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
-
- movdqa xmmB,xmmA
- punpcklwd xmmA,xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
- punpckhwd xmmB,xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
-
- movdqa xmmG,xmmD
- punpcklwd xmmD,xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
- punpckhwd xmmG,xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
-
- movdqa xmmE,xmmA
- punpcklbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
- punpckhbw xmmE,xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
-
- movdqa xmmH,xmmB
- punpcklbw xmmB,xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
- punpckhbw xmmH,xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
-
- pxor xmmF,xmmF
-
- movdqa xmmC,xmmA
- punpcklbw xmmA,xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E)
- punpckhbw xmmC,xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E)
-
- movdqa xmmD,xmmB
- punpcklbw xmmB,xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F)
- punpckhbw xmmD,xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F)
-
- movdqa xmmG,xmmE
- punpcklbw xmmE,xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E)
- punpckhbw xmmG,xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E)
-
- punpcklbw xmmF,xmmH
- punpckhbw xmmH,xmmH
- psrlw xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
- psrlw xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
- ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
- ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
-
- ; (Original)
- ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
- ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
- ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
- ;
- ; (This implementation)
- ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
- ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
- ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
-
- movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=RE
- movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=RO
- movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=BE
- movdqa XMMWORD [wk(3)], xmm5 ; wk(3)=BO
-
- movdqa xmm6,xmm1
- punpcklwd xmm1,xmm3
- punpckhwd xmm6,xmm3
- movdqa xmm7,xmm1
- movdqa xmm4,xmm6
- pmaddwd xmm1,[rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
- pmaddwd xmm6,[rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
- pmaddwd xmm7,[rel PW_MF016_MF033] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
- pmaddwd xmm4,[rel PW_MF016_MF033] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
-
- movdqa XMMWORD [wk(4)], xmm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
- movdqa XMMWORD [wk(5)], xmm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
-
- pxor xmm1,xmm1
- pxor xmm6,xmm6
- punpcklwd xmm1,xmm5 ; xmm1=BOL
- punpckhwd xmm6,xmm5 ; xmm6=BOH
- psrld xmm1,1 ; xmm1=BOL*FIX(0.500)
- psrld xmm6,1 ; xmm6=BOH*FIX(0.500)
-
- movdqa xmm5,[rel PD_ONEHALFM1_CJ] ; xmm5=[PD_ONEHALFM1_CJ]
-
- paddd xmm7,xmm1
- paddd xmm4,xmm6
- paddd xmm7,xmm5
- paddd xmm4,xmm5
- psrld xmm7,SCALEBITS ; xmm7=CbOL
- psrld xmm4,SCALEBITS ; xmm4=CbOH
- packssdw xmm7,xmm4 ; xmm7=CbO
-
- movdqa xmm1, XMMWORD [wk(2)] ; xmm1=BE
-
- movdqa xmm6,xmm0
- punpcklwd xmm0,xmm2
- punpckhwd xmm6,xmm2
- movdqa xmm5,xmm0
- movdqa xmm4,xmm6
- pmaddwd xmm0,[rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
- pmaddwd xmm6,[rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
- pmaddwd xmm5,[rel PW_MF016_MF033] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
- pmaddwd xmm4,[rel PW_MF016_MF033] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
-
- movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
- movdqa XMMWORD [wk(7)], xmm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
-
- pxor xmm0,xmm0
- pxor xmm6,xmm6
- punpcklwd xmm0,xmm1 ; xmm0=BEL
- punpckhwd xmm6,xmm1 ; xmm6=BEH
- psrld xmm0,1 ; xmm0=BEL*FIX(0.500)
- psrld xmm6,1 ; xmm6=BEH*FIX(0.500)
-
- movdqa xmm1,[rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ]
-
- paddd xmm5,xmm0
- paddd xmm4,xmm6
- paddd xmm5,xmm1
- paddd xmm4,xmm1
- psrld xmm5,SCALEBITS ; xmm5=CbEL
- psrld xmm4,SCALEBITS ; xmm4=CbEH
- packssdw xmm5,xmm4 ; xmm5=CbE
-
- psllw xmm7,BYTE_BIT
- por xmm5,xmm7 ; xmm5=Cb
- movdqa XMMWORD [rbx], xmm5 ; Save Cb
-
- movdqa xmm0, XMMWORD [wk(3)] ; xmm0=BO
- movdqa xmm6, XMMWORD [wk(2)] ; xmm6=BE
- movdqa xmm1, XMMWORD [wk(1)] ; xmm1=RO
-
- movdqa xmm4,xmm0
- punpcklwd xmm0,xmm3
- punpckhwd xmm4,xmm3
- movdqa xmm7,xmm0
- movdqa xmm5,xmm4
- pmaddwd xmm0,[rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
- pmaddwd xmm4,[rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
- pmaddwd xmm7,[rel PW_MF008_MF041] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
- pmaddwd xmm5,[rel PW_MF008_MF041] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
-
- movdqa xmm3,[rel PD_ONEHALF] ; xmm3=[PD_ONEHALF]
-
- paddd xmm0, XMMWORD [wk(4)]
- paddd xmm4, XMMWORD [wk(5)]
- paddd xmm0,xmm3
- paddd xmm4,xmm3
- psrld xmm0,SCALEBITS ; xmm0=YOL
- psrld xmm4,SCALEBITS ; xmm4=YOH
- packssdw xmm0,xmm4 ; xmm0=YO
-
- pxor xmm3,xmm3
- pxor xmm4,xmm4
- punpcklwd xmm3,xmm1 ; xmm3=ROL
- punpckhwd xmm4,xmm1 ; xmm4=ROH
- psrld xmm3,1 ; xmm3=ROL*FIX(0.500)
- psrld xmm4,1 ; xmm4=ROH*FIX(0.500)
-
- movdqa xmm1,[rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ]
-
- paddd xmm7,xmm3
- paddd xmm5,xmm4
- paddd xmm7,xmm1
- paddd xmm5,xmm1
- psrld xmm7,SCALEBITS ; xmm7=CrOL
- psrld xmm5,SCALEBITS ; xmm5=CrOH
- packssdw xmm7,xmm5 ; xmm7=CrO
-
- movdqa xmm3, XMMWORD [wk(0)] ; xmm3=RE
-
- movdqa xmm4,xmm6
- punpcklwd xmm6,xmm2
- punpckhwd xmm4,xmm2
- movdqa xmm1,xmm6
- movdqa xmm5,xmm4
- pmaddwd xmm6,[rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
- pmaddwd xmm4,[rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
- pmaddwd xmm1,[rel PW_MF008_MF041] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
- pmaddwd xmm5,[rel PW_MF008_MF041] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
-
- movdqa xmm2,[rel PD_ONEHALF] ; xmm2=[PD_ONEHALF]
-
- paddd xmm6, XMMWORD [wk(6)]
- paddd xmm4, XMMWORD [wk(7)]
- paddd xmm6,xmm2
- paddd xmm4,xmm2
- psrld xmm6,SCALEBITS ; xmm6=YEL
- psrld xmm4,SCALEBITS ; xmm4=YEH
- packssdw xmm6,xmm4 ; xmm6=YE
-
- psllw xmm0,BYTE_BIT
- por xmm6,xmm0 ; xmm6=Y
- movdqa XMMWORD [rdi], xmm6 ; Save Y
-
- pxor xmm2,xmm2
- pxor xmm4,xmm4
- punpcklwd xmm2,xmm3 ; xmm2=REL
- punpckhwd xmm4,xmm3 ; xmm4=REH
- psrld xmm2,1 ; xmm2=REL*FIX(0.500)
- psrld xmm4,1 ; xmm4=REH*FIX(0.500)
-
- movdqa xmm0,[rel PD_ONEHALFM1_CJ] ; xmm0=[PD_ONEHALFM1_CJ]
-
- paddd xmm1,xmm2
- paddd xmm5,xmm4
- paddd xmm1,xmm0
- paddd xmm5,xmm0
- psrld xmm1,SCALEBITS ; xmm1=CrEL
- psrld xmm5,SCALEBITS ; xmm5=CrEH
- packssdw xmm1,xmm5 ; xmm1=CrE
-
- psllw xmm7,BYTE_BIT
- por xmm1,xmm7 ; xmm1=Cr
- movdqa XMMWORD [rdx], xmm1 ; Save Cr
-
- sub rcx, byte SIZEOF_XMMWORD
- add rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr
- add rdi, byte SIZEOF_XMMWORD ; outptr0
- add rbx, byte SIZEOF_XMMWORD ; outptr1
- add rdx, byte SIZEOF_XMMWORD ; outptr2
- cmp rcx, byte SIZEOF_XMMWORD
- jae near .columnloop
- test rcx,rcx
- jnz near .column_ld1
-
- pop rcx ; col
- pop rsi
- pop rdi
- pop rbx
- pop rdx
-
- add rsi, byte SIZEOF_JSAMPROW ; input_buf
- add rdi, byte SIZEOF_JSAMPROW
- add rbx, byte SIZEOF_JSAMPROW
- add rdx, byte SIZEOF_JSAMPROW
- dec rax ; num_rows
- jg near .rowloop
-
-.return:
- pop rbx
- uncollect_args
- mov rsp,rbp ; rsp <- aligned rbp
- pop rsp ; rsp <- original rbp
- pop rbp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
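
The .column_ld1 through .column_ld32 ladder in the file above is worth calling
out: a residual pixel count smaller than one XMM register is handled with at
most four fixed-size loads (1, 2, 4, and 8 bytes) selected by the bits of the
byte count, with shifts packing the chunks into a single register, instead of a
scalar byte loop. A plain C stand-in for the idea, under the assumption that a
zero-padded 16-byte buffer is an acceptable substitute for the register;
load_tail() is an illustrative name:

#include <stdint.h>
#include <string.h>

static void load_tail(uint8_t dst[16], const uint8_t *src, unsigned n)
{
  unsigned rem = n;                       /* n < 16 residual bytes */
  memset(dst, 0, 16);
  /* Peel chunks from the end, one per set bit of n, largest last. */
  if (rem & 1) { rem -= 1; dst[rem] = src[rem]; }
  if (rem & 2) { rem -= 2; memcpy(dst + rem, src + rem, 2); }
  if (rem & 4) { rem -= 4; memcpy(dst + rem, src + rem, 4); }
  if (rem & 8) { rem -= 8; memcpy(dst + rem, src + rem, 8); }
}
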
diff --git a/media/libjpeg/simd/jccolext-sse2.asm b/media/libjpeg/simd/jccolext-sse2.asm
deleted file mode 100644
index cc38e98a18..0000000000
--- a/media/libjpeg/simd/jccolext-sse2.asm
+++ /dev/null
@@ -1,503 +0,0 @@
-;
-; jccolext.asm - colorspace conversion (SSE2)
-;
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jcolsamp.inc"
-
-; --------------------------------------------------------------------------
-;
-; Convert some rows of samples to the output colorspace.
-;
-; GLOBAL(void)
-; jsimd_rgb_ycc_convert_sse2 (JDIMENSION img_width,
-; JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-; JDIMENSION output_row, int num_rows);
-;
-
-%define img_width(b) (b)+8 ; JDIMENSION img_width
-%define input_buf(b) (b)+12 ; JSAMPARRAY input_buf
-%define output_buf(b) (b)+16 ; JSAMPIMAGE output_buf
-%define output_row(b) (b)+20 ; JDIMENSION output_row
-%define num_rows(b) (b)+24 ; int num_rows
-
-%define original_ebp ebp+0
-%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 8
-%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
-
- align 16
-
- global EXTN(jsimd_rgb_ycc_convert_sse2)
-
-EXTN(jsimd_rgb_ycc_convert_sse2):
- push ebp
- mov eax,esp ; eax = original ebp
- sub esp, byte 4
- and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [esp],eax
- mov ebp,esp ; ebp = aligned ebp
- lea esp, [wk(0)]
- pushpic eax ; make a room for GOT address
- push ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- get_GOT ebx ; get GOT address
- movpic POINTER [gotptr], ebx ; save GOT address
-
- mov ecx, JDIMENSION [img_width(eax)]
- test ecx,ecx
- jz near .return
-
- push ecx
-
- mov esi, JSAMPIMAGE [output_buf(eax)]
- mov ecx, JDIMENSION [output_row(eax)]
- mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
- mov ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY]
- mov edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY]
- lea edi, [edi+ecx*SIZEOF_JSAMPROW]
- lea ebx, [ebx+ecx*SIZEOF_JSAMPROW]
- lea edx, [edx+ecx*SIZEOF_JSAMPROW]
-
- pop ecx
-
- mov esi, JSAMPARRAY [input_buf(eax)]
- mov eax, INT [num_rows(eax)]
- test eax,eax
- jle near .return
- alignx 16,7
-.rowloop:
- pushpic eax
- push edx
- push ebx
- push edi
- push esi
- push ecx ; col
-
- mov esi, JSAMPROW [esi] ; inptr
- mov edi, JSAMPROW [edi] ; outptr0
- mov ebx, JSAMPROW [ebx] ; outptr1
- mov edx, JSAMPROW [edx] ; outptr2
- movpic eax, POINTER [gotptr] ; load GOT address (eax)
-
- cmp ecx, byte SIZEOF_XMMWORD
- jae near .columnloop
- alignx 16,7
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
-.column_ld1:
- push eax
- push edx
- lea ecx,[ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE
- test cl, SIZEOF_BYTE
- jz short .column_ld2
- sub ecx, byte SIZEOF_BYTE
- movzx eax, BYTE [esi+ecx]
-.column_ld2:
- test cl, SIZEOF_WORD
- jz short .column_ld4
- sub ecx, byte SIZEOF_WORD
- movzx edx, WORD [esi+ecx]
- shl eax, WORD_BIT
- or eax,edx
-.column_ld4:
- movd xmmA,eax
- pop edx
- pop eax
- test cl, SIZEOF_DWORD
- jz short .column_ld8
- sub ecx, byte SIZEOF_DWORD
- movd xmmF, XMM_DWORD [esi+ecx]
- pslldq xmmA, SIZEOF_DWORD
- por xmmA,xmmF
-.column_ld8:
- test cl, SIZEOF_MMWORD
- jz short .column_ld16
- sub ecx, byte SIZEOF_MMWORD
- movq xmmB, XMM_MMWORD [esi+ecx]
- pslldq xmmA, SIZEOF_MMWORD
- por xmmA,xmmB
-.column_ld16:
- test cl, SIZEOF_XMMWORD
- jz short .column_ld32
- movdqa xmmF,xmmA
- movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
- mov ecx, SIZEOF_XMMWORD
- jmp short .rgb_ycc_cnv
-.column_ld32:
- test cl, 2*SIZEOF_XMMWORD
- mov ecx, SIZEOF_XMMWORD
- jz short .rgb_ycc_cnv
- movdqa xmmB,xmmA
- movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
- movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
- jmp short .rgb_ycc_cnv
- alignx 16,7
-
-.columnloop:
- movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
- movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
- movdqu xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD]
-
-.rgb_ycc_cnv:
- ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
- ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
- ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
-
- movdqa xmmG,xmmA
- pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
- psrldq xmmG,8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
-
- punpckhbw xmmA,xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
- pslldq xmmF,8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
-
- punpcklbw xmmG,xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
- punpckhbw xmmF,xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
-
- movdqa xmmD,xmmA
- pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
- psrldq xmmD,8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
-
- punpckhbw xmmA,xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
- pslldq xmmG,8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
-
- punpcklbw xmmD,xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
- punpckhbw xmmG,xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
-
- movdqa xmmE,xmmA
- pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
- psrldq xmmE,8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
-
- punpckhbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
- pslldq xmmD,8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
-
- punpcklbw xmmE,xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
- punpckhbw xmmD,xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
-
- pxor xmmH,xmmH
-
- movdqa xmmC,xmmA
- punpcklbw xmmA,xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E)
- punpckhbw xmmC,xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E)
-
- movdqa xmmB,xmmE
- punpcklbw xmmE,xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E)
- punpckhbw xmmB,xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F)
-
- movdqa xmmF,xmmD
- punpcklbw xmmD,xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F)
- punpckhbw xmmF,xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F)
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-.column_ld1:
- test cl, SIZEOF_XMMWORD/16
- jz short .column_ld2
- sub ecx, byte SIZEOF_XMMWORD/16
- movd xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE]
-.column_ld2:
- test cl, SIZEOF_XMMWORD/8
- jz short .column_ld4
- sub ecx, byte SIZEOF_XMMWORD/8
- movq xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
- pslldq xmmA, SIZEOF_MMWORD
- por xmmA,xmmE
-.column_ld4:
- test cl, SIZEOF_XMMWORD/4
- jz short .column_ld8
- sub ecx, byte SIZEOF_XMMWORD/4
- movdqa xmmE,xmmA
- movdqu xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
-.column_ld8:
- test cl, SIZEOF_XMMWORD/2
- mov ecx, SIZEOF_XMMWORD
- jz short .rgb_ycc_cnv
- movdqa xmmF,xmmA
- movdqa xmmH,xmmE
- movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
- movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
- jmp short .rgb_ycc_cnv
- alignx 16,7
-
-.columnloop:
- movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
- movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
- movdqu xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD]
- movdqu xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD]
-
-.rgb_ycc_cnv:
- ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
- ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
- ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
- ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
-
- movdqa xmmD,xmmA
- punpcklbw xmmA,xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
- punpckhbw xmmD,xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
-
- movdqa xmmC,xmmF
- punpcklbw xmmF,xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
- punpckhbw xmmC,xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
-
- movdqa xmmB,xmmA
- punpcklwd xmmA,xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
- punpckhwd xmmB,xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
-
- movdqa xmmG,xmmD
- punpcklwd xmmD,xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
- punpckhwd xmmG,xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
-
- movdqa xmmE,xmmA
- punpcklbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
- punpckhbw xmmE,xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
-
- movdqa xmmH,xmmB
- punpcklbw xmmB,xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
- punpckhbw xmmH,xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
-
- pxor xmmF,xmmF
-
- movdqa xmmC,xmmA
- punpcklbw xmmA,xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E)
- punpckhbw xmmC,xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E)
-
- movdqa xmmD,xmmB
- punpcklbw xmmB,xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F)
- punpckhbw xmmD,xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F)
-
- movdqa xmmG,xmmE
- punpcklbw xmmE,xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E)
- punpckhbw xmmG,xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E)
-
- punpcklbw xmmF,xmmH
- punpckhbw xmmH,xmmH
- psrlw xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
- psrlw xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
- ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
- ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
-
- ; (Original)
- ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
- ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
- ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
- ;
- ; (This implementation)
- ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
- ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
- ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
-
- movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=RE
- movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=RO
- movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=BE
- movdqa XMMWORD [wk(3)], xmm5 ; wk(3)=BO
-
- movdqa xmm6,xmm1
- punpcklwd xmm1,xmm3
- punpckhwd xmm6,xmm3
- movdqa xmm7,xmm1
- movdqa xmm4,xmm6
- pmaddwd xmm1,[GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
- pmaddwd xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
- pmaddwd xmm7,[GOTOFF(eax,PW_MF016_MF033)] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
- pmaddwd xmm4,[GOTOFF(eax,PW_MF016_MF033)] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
-
- movdqa XMMWORD [wk(4)], xmm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
- movdqa XMMWORD [wk(5)], xmm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
-
- pxor xmm1,xmm1
- pxor xmm6,xmm6
- punpcklwd xmm1,xmm5 ; xmm1=BOL
- punpckhwd xmm6,xmm5 ; xmm6=BOH
- psrld xmm1,1 ; xmm1=BOL*FIX(0.500)
- psrld xmm6,1 ; xmm6=BOH*FIX(0.500)
-
- movdqa xmm5,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm5=[PD_ONEHALFM1_CJ]
-
- paddd xmm7,xmm1
- paddd xmm4,xmm6
- paddd xmm7,xmm5
- paddd xmm4,xmm5
- psrld xmm7,SCALEBITS ; xmm7=CbOL
- psrld xmm4,SCALEBITS ; xmm4=CbOH
- packssdw xmm7,xmm4 ; xmm7=CbO
-
- movdqa xmm1, XMMWORD [wk(2)] ; xmm1=BE
-
- movdqa xmm6,xmm0
- punpcklwd xmm0,xmm2
- punpckhwd xmm6,xmm2
- movdqa xmm5,xmm0
- movdqa xmm4,xmm6
- pmaddwd xmm0,[GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
- pmaddwd xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
- pmaddwd xmm5,[GOTOFF(eax,PW_MF016_MF033)] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
- pmaddwd xmm4,[GOTOFF(eax,PW_MF016_MF033)] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
-
- movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
- movdqa XMMWORD [wk(7)], xmm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
-
- pxor xmm0,xmm0
- pxor xmm6,xmm6
- punpcklwd xmm0,xmm1 ; xmm0=BEL
- punpckhwd xmm6,xmm1 ; xmm6=BEH
- psrld xmm0,1 ; xmm0=BEL*FIX(0.500)
- psrld xmm6,1 ; xmm6=BEH*FIX(0.500)
-
- movdqa xmm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ]
-
- paddd xmm5,xmm0
- paddd xmm4,xmm6
- paddd xmm5,xmm1
- paddd xmm4,xmm1
- psrld xmm5,SCALEBITS ; xmm5=CbEL
- psrld xmm4,SCALEBITS ; xmm4=CbEH
- packssdw xmm5,xmm4 ; xmm5=CbE
-
- psllw xmm7,BYTE_BIT
- por xmm5,xmm7 ; xmm5=Cb
- movdqa XMMWORD [ebx], xmm5 ; Save Cb
-
- movdqa xmm0, XMMWORD [wk(3)] ; xmm0=BO
- movdqa xmm6, XMMWORD [wk(2)] ; xmm6=BE
- movdqa xmm1, XMMWORD [wk(1)] ; xmm1=RO
-
- movdqa xmm4,xmm0
- punpcklwd xmm0,xmm3
- punpckhwd xmm4,xmm3
- movdqa xmm7,xmm0
- movdqa xmm5,xmm4
- pmaddwd xmm0,[GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
- pmaddwd xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
- pmaddwd xmm7,[GOTOFF(eax,PW_MF008_MF041)] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
- pmaddwd xmm5,[GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
-
- movdqa xmm3,[GOTOFF(eax,PD_ONEHALF)] ; xmm3=[PD_ONEHALF]
-
- paddd xmm0, XMMWORD [wk(4)]
- paddd xmm4, XMMWORD [wk(5)]
- paddd xmm0,xmm3
- paddd xmm4,xmm3
- psrld xmm0,SCALEBITS ; xmm0=YOL
- psrld xmm4,SCALEBITS ; xmm4=YOH
- packssdw xmm0,xmm4 ; xmm0=YO
-
- pxor xmm3,xmm3
- pxor xmm4,xmm4
- punpcklwd xmm3,xmm1 ; xmm3=ROL
- punpckhwd xmm4,xmm1 ; xmm4=ROH
- psrld xmm3,1 ; xmm3=ROL*FIX(0.500)
- psrld xmm4,1 ; xmm4=ROH*FIX(0.500)
-
- movdqa xmm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ]
-
- paddd xmm7,xmm3
- paddd xmm5,xmm4
- paddd xmm7,xmm1
- paddd xmm5,xmm1
- psrld xmm7,SCALEBITS ; xmm7=CrOL
- psrld xmm5,SCALEBITS ; xmm5=CrOH
- packssdw xmm7,xmm5 ; xmm7=CrO
-
- movdqa xmm3, XMMWORD [wk(0)] ; xmm3=RE
-
- movdqa xmm4,xmm6
- punpcklwd xmm6,xmm2
- punpckhwd xmm4,xmm2
- movdqa xmm1,xmm6
- movdqa xmm5,xmm4
- pmaddwd xmm6,[GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
- pmaddwd xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
- pmaddwd xmm1,[GOTOFF(eax,PW_MF008_MF041)] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
- pmaddwd xmm5,[GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
-
- movdqa xmm2,[GOTOFF(eax,PD_ONEHALF)] ; xmm2=[PD_ONEHALF]
-
- paddd xmm6, XMMWORD [wk(6)]
- paddd xmm4, XMMWORD [wk(7)]
- paddd xmm6,xmm2
- paddd xmm4,xmm2
- psrld xmm6,SCALEBITS ; xmm6=YEL
- psrld xmm4,SCALEBITS ; xmm4=YEH
- packssdw xmm6,xmm4 ; xmm6=YE
-
- psllw xmm0,BYTE_BIT
- por xmm6,xmm0 ; xmm6=Y
- movdqa XMMWORD [edi], xmm6 ; Save Y
-
- pxor xmm2,xmm2
- pxor xmm4,xmm4
- punpcklwd xmm2,xmm3 ; xmm2=REL
- punpckhwd xmm4,xmm3 ; xmm4=REH
- psrld xmm2,1 ; xmm2=REL*FIX(0.500)
- psrld xmm4,1 ; xmm4=REH*FIX(0.500)
-
- movdqa xmm0,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm0=[PD_ONEHALFM1_CJ]
-
- paddd xmm1,xmm2
- paddd xmm5,xmm4
- paddd xmm1,xmm0
- paddd xmm5,xmm0
- psrld xmm1,SCALEBITS ; xmm1=CrEL
- psrld xmm5,SCALEBITS ; xmm5=CrEH
- packssdw xmm1,xmm5 ; xmm1=CrE
-
- psllw xmm7,BYTE_BIT
- por xmm1,xmm7 ; xmm1=Cr
- movdqa XMMWORD [edx], xmm1 ; Save Cr
-
- sub ecx, byte SIZEOF_XMMWORD
- add esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr
- add edi, byte SIZEOF_XMMWORD ; outptr0
- add ebx, byte SIZEOF_XMMWORD ; outptr1
- add edx, byte SIZEOF_XMMWORD ; outptr2
- cmp ecx, byte SIZEOF_XMMWORD
- jae near .columnloop
- test ecx,ecx
- jnz near .column_ld1
-
- pop ecx ; col
- pop esi
- pop edi
- pop ebx
- pop edx
- poppic eax
-
- add esi, byte SIZEOF_JSAMPROW ; input_buf
- add edi, byte SIZEOF_JSAMPROW
- add ebx, byte SIZEOF_JSAMPROW
- add edx, byte SIZEOF_JSAMPROW
- dec eax ; num_rows
- jg near .rowloop
-
-.return:
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- pop ebx
- mov esp,ebp ; esp <- aligned ebp
- pop esp ; esp <- original ebp
- pop ebp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
diff --git a/media/libjpeg/simd/jccolor-altivec.c b/media/libjpeg/simd/jccolor-altivec.c
deleted file mode 100644
index ec473320e5..0000000000
--- a/media/libjpeg/simd/jccolor-altivec.c
+++ /dev/null
@@ -1,104 +0,0 @@
-/*
- * AltiVec optimizations for libjpeg-turbo
- *
- * Copyright (C) 2014, D. R. Commander. All Rights Reserved.
- *
- * This software is provided 'as-is', without any express or implied
- * warranty. In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- * claim that you wrote the original software. If you use this software
- * in a product, an acknowledgment in the product documentation would be
- * appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- * misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-/* RGB --> YCC CONVERSION */
-
-#include "jsimd_altivec.h"
-
-
-#define F_0_081 5329 /* FIX(0.08131) */
-#define F_0_114 7471 /* FIX(0.11400) */
-#define F_0_168 11059 /* FIX(0.16874) */
-#define F_0_250 16384 /* FIX(0.25000) */
-#define F_0_299 19595 /* FIX(0.29900) */
-#define F_0_331 21709 /* FIX(0.33126) */
-#define F_0_418 27439 /* FIX(0.41869) */
-#define F_0_500 32768 /* FIX(0.50000) */
-#define F_0_587 38470 /* FIX(0.58700) */
-#define F_0_337 (F_0_587 - F_0_250) /* FIX(0.58700) - FIX(0.25000) */
-
-#define SCALEBITS 16
-#define ONE_HALF (1 << (SCALEBITS - 1))
-
-
-#define RGBG_INDEX0 {0,1,3,4,6,7,9,10,2,1,5,4,8,7,11,10}
-#define RGBG_INDEX1 {12,13,15,16,18,19,21,22,14,13,17,16,20,19,23,22}
-#define RGBG_INDEX2 {8,9,11,12,14,15,17,18,10,9,13,12,16,15,19,18}
-#define RGBG_INDEX3 {4,5,7,8,10,11,13,14,6,5,9,8,12,11,15,14}
-#include "jccolext-altivec.c"
-#undef RGB_PIXELSIZE
-
-#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-#define jsimd_rgb_ycc_convert_altivec jsimd_extrgb_ycc_convert_altivec
-#include "jccolext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGBG_INDEX0
-#undef RGBG_INDEX1
-#undef RGBG_INDEX2
-#undef RGBG_INDEX3
-#undef jsimd_rgb_ycc_convert_altivec
-
-#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-#define RGBG_INDEX {0,1,4,5,8,9,12,13,2,1,6,5,10,9,14,13}
-#define jsimd_rgb_ycc_convert_altivec jsimd_extrgbx_ycc_convert_altivec
-#include "jccolext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGBG_INDEX
-#undef jsimd_rgb_ycc_convert_altivec
-
-#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-#define RGBG_INDEX0 {2,1,5,4,8,7,11,10,0,1,3,4,6,7,9,10}
-#define RGBG_INDEX1 {14,13,17,16,20,19,23,22,12,13,15,16,18,19,21,22}
-#define RGBG_INDEX2 {10,9,13,12,16,15,19,18,8,9,11,12,14,15,17,18}
-#define RGBG_INDEX3 {6,5,9,8,12,11,15,14,4,5,7,8,10,11,13,14}
-#define jsimd_rgb_ycc_convert_altivec jsimd_extbgr_ycc_convert_altivec
-#include "jccolext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGBG_INDEX0
-#undef RGBG_INDEX1
-#undef RGBG_INDEX2
-#undef RGBG_INDEX3
-#undef jsimd_rgb_ycc_convert_altivec
-
-#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-#define RGBG_INDEX {2,1,6,5,10,9,14,13,0,1,4,5,8,9,12,13}
-#define jsimd_rgb_ycc_convert_altivec jsimd_extbgrx_ycc_convert_altivec
-#include "jccolext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGBG_INDEX
-#undef jsimd_rgb_ycc_convert_altivec
-
-#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-#define RGBG_INDEX {3,2,7,6,11,10,15,14,1,2,5,6,9,10,13,14}
-#define jsimd_rgb_ycc_convert_altivec jsimd_extxbgr_ycc_convert_altivec
-#include "jccolext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGBG_INDEX
-#undef jsimd_rgb_ycc_convert_altivec
-
-#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-#define RGBG_INDEX {1,2,5,6,9,10,13,14,3,2,7,6,11,10,15,14}
-#define jsimd_rgb_ycc_convert_altivec jsimd_extxrgb_ycc_convert_altivec
-#include "jccolext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGBG_INDEX
-#undef jsimd_rgb_ycc_convert_altivec
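
The deleted wrapper above is a compact example of the template-instantiation
idiom used throughout the SIMD layer: one implementation file (here
jccolext-altivec.c) is #included once per pixel format, with RGB_PIXELSIZE,
the RGBG_INDEX* shuffle tables, and the public symbol name redefined each
time; the NASM wrappers that follow (jccolor-mmx.asm and friends) do the same
with %define/%include. A minimal sketch of the idiom with hypothetical file
and macro names:

/* convert_tmpl.c -- hypothetical template; compiled only via #include */
void CONVERT_FN(const unsigned char *in, unsigned char *out, int npix)
{
  for (int i = 0; i < npix; i++)
    out[i] = in[i * RGB_PIXELSIZE];      /* stand-in conversion body */
}

/* includer: stamps out one converter per pixel format */
#define RGB_PIXELSIZE 3
#define CONVERT_FN convert_rgb
#include "convert_tmpl.c"
#undef CONVERT_FN
#undef RGB_PIXELSIZE

#define RGB_PIXELSIZE 4
#define CONVERT_FN convert_rgbx
#include "convert_tmpl.c"
#undef CONVERT_FN
#undef RGB_PIXELSIZE
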
diff --git a/media/libjpeg/simd/jccolor-mmx.asm b/media/libjpeg/simd/jccolor-mmx.asm
deleted file mode 100644
index c4e6d88be3..0000000000
--- a/media/libjpeg/simd/jccolor-mmx.asm
+++ /dev/null
@@ -1,122 +0,0 @@
-;
-; jccolor.asm - colorspace conversion (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-
-%define SCALEBITS 16
-
-F_0_081 equ 5329 ; FIX(0.08131)
-F_0_114 equ 7471 ; FIX(0.11400)
-F_0_168 equ 11059 ; FIX(0.16874)
-F_0_250 equ 16384 ; FIX(0.25000)
-F_0_299 equ 19595 ; FIX(0.29900)
-F_0_331 equ 21709 ; FIX(0.33126)
-F_0_418 equ 27439 ; FIX(0.41869)
-F_0_587 equ 38470 ; FIX(0.58700)
-F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
- alignz 16
- global EXTN(jconst_rgb_ycc_convert_mmx)
-
-EXTN(jconst_rgb_ycc_convert_mmx):
-
-PW_F0299_F0337 times 2 dw F_0_299, F_0_337
-PW_F0114_F0250 times 2 dw F_0_114, F_0_250
-PW_MF016_MF033 times 2 dw -F_0_168,-F_0_331
-PW_MF008_MF041 times 2 dw -F_0_081,-F_0_418
-PD_ONEHALFM1_CJ times 2 dd (1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS)
-PD_ONEHALF times 2 dd (1 << (SCALEBITS-1))
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
-
-%include "jccolext-mmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGB_RED
-%define RGB_GREEN EXT_RGB_GREEN
-%define RGB_BLUE EXT_RGB_BLUE
-%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-%define jsimd_rgb_ycc_convert_mmx jsimd_extrgb_ycc_convert_mmx
-%include "jccolext-mmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGBX_RED
-%define RGB_GREEN EXT_RGBX_GREEN
-%define RGB_BLUE EXT_RGBX_BLUE
-%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-%define jsimd_rgb_ycc_convert_mmx jsimd_extrgbx_ycc_convert_mmx
-%include "jccolext-mmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGR_RED
-%define RGB_GREEN EXT_BGR_GREEN
-%define RGB_BLUE EXT_BGR_BLUE
-%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-%define jsimd_rgb_ycc_convert_mmx jsimd_extbgr_ycc_convert_mmx
-%include "jccolext-mmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGRX_RED
-%define RGB_GREEN EXT_BGRX_GREEN
-%define RGB_BLUE EXT_BGRX_BLUE
-%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-%define jsimd_rgb_ycc_convert_mmx jsimd_extbgrx_ycc_convert_mmx
-%include "jccolext-mmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XBGR_RED
-%define RGB_GREEN EXT_XBGR_GREEN
-%define RGB_BLUE EXT_XBGR_BLUE
-%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-%define jsimd_rgb_ycc_convert_mmx jsimd_extxbgr_ycc_convert_mmx
-%include "jccolext-mmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XRGB_RED
-%define RGB_GREEN EXT_XRGB_GREEN
-%define RGB_BLUE EXT_XRGB_BLUE
-%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-%define jsimd_rgb_ycc_convert_mmx jsimd_extxrgb_ycc_convert_mmx
-%include "jccolext-mmx.asm"
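The constant pool above packs the fixed-point coefficients two to a doubleword (PW_F0299_F0337, PW_F0114_F0250, ...) because pmaddwd multiplies adjacent signed 16-bit words and sums the products, so one instruction per lane yields 0.299*R + 0.337*G. That also appears to be why Y is decomposed as 0.299R + 0.337G + 0.114B + 0.250G: FIX(0.587) = 38470 does not fit in a signed 16-bit word. PD_ONEHALFM1_CJ folds the Cb/Cr rounding term and the CENTERJSAMPLE bias into one addend. A small check of the derivations, assuming the usual libjpeg-style FIX() macro:

    #include <assert.h>
    #include <stdint.h>

    #define SCALEBITS     16
    #define CENTERJSAMPLE 128
    #define FIX(x)  ((int32_t)((x) * (1L << SCALEBITS) + 0.5))

    int main(void)
    {
      assert(FIX(0.29900) == 19595);                 /* F_0_299 */
      assert(FIX(0.58700) - FIX(0.25000) == 22086);  /* F_0_337 */
      assert(FIX(0.58700) == 38470);   /* > 32767: unusable as a pmaddwd word */
      assert(FIX(0.11400) == 7471);                  /* F_0_114 */
      /* PD_ONEHALFM1_CJ = ONE_HALF - 1 + (CENTERJSAMPLE << SCALEBITS) */
      assert((1 << (SCALEBITS - 1)) - 1 + (CENTERJSAMPLE << SCALEBITS)
             == 8421375);
      return 0;
    }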
diff --git a/media/libjpeg/simd/jccolor-sse2-64.asm b/media/libjpeg/simd/jccolor-sse2-64.asm
deleted file mode 100644
index bd2188b4c0..0000000000
--- a/media/libjpeg/simd/jccolor-sse2-64.asm
+++ /dev/null
@@ -1,121 +0,0 @@
-;
-; jccolor.asm - colorspace conversion (64-bit SSE2)
-;
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler); it
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-
-%define SCALEBITS 16
-
-F_0_081 equ 5329 ; FIX(0.08131)
-F_0_114 equ 7471 ; FIX(0.11400)
-F_0_168 equ 11059 ; FIX(0.16874)
-F_0_250 equ 16384 ; FIX(0.25000)
-F_0_299 equ 19595 ; FIX(0.29900)
-F_0_331 equ 21709 ; FIX(0.33126)
-F_0_418 equ 27439 ; FIX(0.41869)
-F_0_587 equ 38470 ; FIX(0.58700)
-F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
- alignz 16
- global EXTN(jconst_rgb_ycc_convert_sse2)
-
-EXTN(jconst_rgb_ycc_convert_sse2):
-
-PW_F0299_F0337 times 4 dw F_0_299, F_0_337
-PW_F0114_F0250 times 4 dw F_0_114, F_0_250
-PW_MF016_MF033 times 4 dw -F_0_168,-F_0_331
-PW_MF008_MF041 times 4 dw -F_0_081,-F_0_418
-PD_ONEHALFM1_CJ times 4 dd (1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS)
-PD_ONEHALF times 4 dd (1 << (SCALEBITS-1))
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 64
-
-%include "jccolext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGB_RED
-%define RGB_GREEN EXT_RGB_GREEN
-%define RGB_BLUE EXT_RGB_BLUE
-%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-%define jsimd_rgb_ycc_convert_sse2 jsimd_extrgb_ycc_convert_sse2
-%include "jccolext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGBX_RED
-%define RGB_GREEN EXT_RGBX_GREEN
-%define RGB_BLUE EXT_RGBX_BLUE
-%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-%define jsimd_rgb_ycc_convert_sse2 jsimd_extrgbx_ycc_convert_sse2
-%include "jccolext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGR_RED
-%define RGB_GREEN EXT_BGR_GREEN
-%define RGB_BLUE EXT_BGR_BLUE
-%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-%define jsimd_rgb_ycc_convert_sse2 jsimd_extbgr_ycc_convert_sse2
-%include "jccolext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGRX_RED
-%define RGB_GREEN EXT_BGRX_GREEN
-%define RGB_BLUE EXT_BGRX_BLUE
-%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-%define jsimd_rgb_ycc_convert_sse2 jsimd_extbgrx_ycc_convert_sse2
-%include "jccolext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XBGR_RED
-%define RGB_GREEN EXT_XBGR_GREEN
-%define RGB_BLUE EXT_XBGR_BLUE
-%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-%define jsimd_rgb_ycc_convert_sse2 jsimd_extxbgr_ycc_convert_sse2
-%include "jccolext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XRGB_RED
-%define RGB_GREEN EXT_XRGB_GREEN
-%define RGB_BLUE EXT_XRGB_BLUE
-%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-%define jsimd_rgb_ycc_convert_sse2 jsimd_extxrgb_ycc_convert_sse2
-%include "jccolext-sse2-64.asm"
diff --git a/media/libjpeg/simd/jccolor-sse2.asm b/media/libjpeg/simd/jccolor-sse2.asm
deleted file mode 100644
index 13124d13d7..0000000000
--- a/media/libjpeg/simd/jccolor-sse2.asm
+++ /dev/null
@@ -1,121 +0,0 @@
-;
-; jccolor.asm - colorspace conversion (SSE2)
-;
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler); it
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-
-%define SCALEBITS 16
-
-F_0_081 equ 5329 ; FIX(0.08131)
-F_0_114 equ 7471 ; FIX(0.11400)
-F_0_168 equ 11059 ; FIX(0.16874)
-F_0_250 equ 16384 ; FIX(0.25000)
-F_0_299 equ 19595 ; FIX(0.29900)
-F_0_331 equ 21709 ; FIX(0.33126)
-F_0_418 equ 27439 ; FIX(0.41869)
-F_0_587 equ 38470 ; FIX(0.58700)
-F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
- alignz 16
- global EXTN(jconst_rgb_ycc_convert_sse2)
-
-EXTN(jconst_rgb_ycc_convert_sse2):
-
-PW_F0299_F0337 times 4 dw F_0_299, F_0_337
-PW_F0114_F0250 times 4 dw F_0_114, F_0_250
-PW_MF016_MF033 times 4 dw -F_0_168,-F_0_331
-PW_MF008_MF041 times 4 dw -F_0_081,-F_0_418
-PD_ONEHALFM1_CJ times 4 dd (1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS)
-PD_ONEHALF times 4 dd (1 << (SCALEBITS-1))
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
-
-%include "jccolext-sse2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGB_RED
-%define RGB_GREEN EXT_RGB_GREEN
-%define RGB_BLUE EXT_RGB_BLUE
-%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-%define jsimd_rgb_ycc_convert_sse2 jsimd_extrgb_ycc_convert_sse2
-%include "jccolext-sse2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGBX_RED
-%define RGB_GREEN EXT_RGBX_GREEN
-%define RGB_BLUE EXT_RGBX_BLUE
-%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-%define jsimd_rgb_ycc_convert_sse2 jsimd_extrgbx_ycc_convert_sse2
-%include "jccolext-sse2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGR_RED
-%define RGB_GREEN EXT_BGR_GREEN
-%define RGB_BLUE EXT_BGR_BLUE
-%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-%define jsimd_rgb_ycc_convert_sse2 jsimd_extbgr_ycc_convert_sse2
-%include "jccolext-sse2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGRX_RED
-%define RGB_GREEN EXT_BGRX_GREEN
-%define RGB_BLUE EXT_BGRX_BLUE
-%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-%define jsimd_rgb_ycc_convert_sse2 jsimd_extbgrx_ycc_convert_sse2
-%include "jccolext-sse2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XBGR_RED
-%define RGB_GREEN EXT_XBGR_GREEN
-%define RGB_BLUE EXT_XBGR_BLUE
-%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-%define jsimd_rgb_ycc_convert_sse2 jsimd_extxbgr_ycc_convert_sse2
-%include "jccolext-sse2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XRGB_RED
-%define RGB_GREEN EXT_XRGB_GREEN
-%define RGB_BLUE EXT_XRGB_BLUE
-%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-%define jsimd_rgb_ycc_convert_sse2 jsimd_extxrgb_ycc_convert_sse2
-%include "jccolext-sse2.asm"
diff --git a/media/libjpeg/simd/jcgray-altivec.c b/media/libjpeg/simd/jcgray-altivec.c
deleted file mode 100644
index 684df5ef1e..0000000000
--- a/media/libjpeg/simd/jcgray-altivec.c
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- * AltiVec optimizations for libjpeg-turbo
- *
- * Copyright (C) 2014, D. R. Commander. All Rights Reserved.
- *
- * This software is provided 'as-is', without any express or implied
- * warranty. In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- * claim that you wrote the original software. If you use this software
- * in a product, an acknowledgment in the product documentation would be
- * appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- * misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-/* RGB --> GRAYSCALE CONVERSION */
-
-#include "jsimd_altivec.h"
-
-
-#define F_0_114 7471 /* FIX(0.11400) */
-#define F_0_250 16384 /* FIX(0.25000) */
-#define F_0_299 19595 /* FIX(0.29900) */
-#define F_0_587 38470 /* FIX(0.58700) */
-#define F_0_337 (F_0_587 - F_0_250) /* FIX(0.58700) - FIX(0.25000) */
-
-#define SCALEBITS 16
-#define ONE_HALF (1 << (SCALEBITS - 1))
-
-
-#define RGBG_INDEX0 {0,1,3,4,6,7,9,10,2,1,5,4,8,7,11,10}
-#define RGBG_INDEX1 {12,13,15,16,18,19,21,22,14,13,17,16,20,19,23,22}
-#define RGBG_INDEX2 {8,9,11,12,14,15,17,18,10,9,13,12,16,15,19,18}
-#define RGBG_INDEX3 {4,5,7,8,10,11,13,14,6,5,9,8,12,11,15,14}
-#include "jcgryext-altivec.c"
-#undef RGB_PIXELSIZE
-
-#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-#define jsimd_rgb_gray_convert_altivec jsimd_extrgb_gray_convert_altivec
-#include "jcgryext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGBG_INDEX0
-#undef RGBG_INDEX1
-#undef RGBG_INDEX2
-#undef RGBG_INDEX3
-#undef jsimd_rgb_gray_convert_altivec
-
-#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-#define RGBG_INDEX {0,1,4,5,8,9,12,13,2,1,6,5,10,9,14,13}
-#define jsimd_rgb_gray_convert_altivec jsimd_extrgbx_gray_convert_altivec
-#include "jcgryext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGBG_INDEX
-#undef jsimd_rgb_gray_convert_altivec
-
-#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-#define RGBG_INDEX0 {2,1,5,4,8,7,11,10,0,1,3,4,6,7,9,10}
-#define RGBG_INDEX1 {14,13,17,16,20,19,23,22,12,13,15,16,18,19,21,22}
-#define RGBG_INDEX2 {10,9,13,12,16,15,19,18,8,9,11,12,14,15,17,18}
-#define RGBG_INDEX3 {6,5,9,8,12,11,15,14,4,5,7,8,10,11,13,14}
-#define jsimd_rgb_gray_convert_altivec jsimd_extbgr_gray_convert_altivec
-#include "jcgryext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGBG_INDEX0
-#undef RGBG_INDEX1
-#undef RGBG_INDEX2
-#undef RGBG_INDEX3
-#undef jsimd_rgb_gray_convert_altivec
-
-#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-#define RGBG_INDEX {2,1,6,5,10,9,14,13,0,1,4,5,8,9,12,13}
-#define jsimd_rgb_gray_convert_altivec jsimd_extbgrx_gray_convert_altivec
-#include "jcgryext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGBG_INDEX
-#undef jsimd_rgb_gray_convert_altivec
-
-#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-#define RGBG_INDEX {3,2,7,6,11,10,15,14,1,2,5,6,9,10,13,14}
-#define jsimd_rgb_gray_convert_altivec jsimd_extxbgr_gray_convert_altivec
-#include "jcgryext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGBG_INDEX
-#undef jsimd_rgb_gray_convert_altivec
-
-#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-#define RGBG_INDEX {1,2,5,6,9,10,13,14,3,2,7,6,11,10,15,14}
-#define jsimd_rgb_gray_convert_altivec jsimd_extxrgb_gray_convert_altivec
-#include "jcgryext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGBG_INDEX
-#undef jsimd_rgb_gray_convert_altivec
diff --git a/media/libjpeg/simd/jcgray-mmx.asm b/media/libjpeg/simd/jcgray-mmx.asm
deleted file mode 100644
index 0819b6ca01..0000000000
--- a/media/libjpeg/simd/jcgray-mmx.asm
+++ /dev/null
@@ -1,115 +0,0 @@
-;
-; jcgray.asm - grayscale colorspace conversion (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2011, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler); it
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-
-%define SCALEBITS 16
-
-F_0_114 equ 7471 ; FIX(0.11400)
-F_0_250 equ 16384 ; FIX(0.25000)
-F_0_299 equ 19595 ; FIX(0.29900)
-F_0_587 equ 38470 ; FIX(0.58700)
-F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
- alignz 16
- global EXTN(jconst_rgb_gray_convert_mmx)
-
-EXTN(jconst_rgb_gray_convert_mmx):
-
-PW_F0299_F0337 times 2 dw F_0_299, F_0_337
-PW_F0114_F0250 times 2 dw F_0_114, F_0_250
-PD_ONEHALF times 2 dd (1 << (SCALEBITS-1))
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
-
-%include "jcgryext-mmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGB_RED
-%define RGB_GREEN EXT_RGB_GREEN
-%define RGB_BLUE EXT_RGB_BLUE
-%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-%define jsimd_rgb_gray_convert_mmx jsimd_extrgb_gray_convert_mmx
-%include "jcgryext-mmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGBX_RED
-%define RGB_GREEN EXT_RGBX_GREEN
-%define RGB_BLUE EXT_RGBX_BLUE
-%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-%define jsimd_rgb_gray_convert_mmx jsimd_extrgbx_gray_convert_mmx
-%include "jcgryext-mmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGR_RED
-%define RGB_GREEN EXT_BGR_GREEN
-%define RGB_BLUE EXT_BGR_BLUE
-%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-%define jsimd_rgb_gray_convert_mmx jsimd_extbgr_gray_convert_mmx
-%include "jcgryext-mmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGRX_RED
-%define RGB_GREEN EXT_BGRX_GREEN
-%define RGB_BLUE EXT_BGRX_BLUE
-%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-%define jsimd_rgb_gray_convert_mmx jsimd_extbgrx_gray_convert_mmx
-%include "jcgryext-mmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XBGR_RED
-%define RGB_GREEN EXT_XBGR_GREEN
-%define RGB_BLUE EXT_XBGR_BLUE
-%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-%define jsimd_rgb_gray_convert_mmx jsimd_extxbgr_gray_convert_mmx
-%include "jcgryext-mmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XRGB_RED
-%define RGB_GREEN EXT_XRGB_GREEN
-%define RGB_BLUE EXT_XRGB_BLUE
-%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-%define jsimd_rgb_gray_convert_mmx jsimd_extxrgb_gray_convert_mmx
-%include "jcgryext-mmx.asm"
diff --git a/media/libjpeg/simd/jcgray-sse2-64.asm b/media/libjpeg/simd/jcgray-sse2-64.asm
deleted file mode 100644
index bafd302aa5..0000000000
--- a/media/libjpeg/simd/jcgray-sse2-64.asm
+++ /dev/null
@@ -1,114 +0,0 @@
-;
-; jcgray.asm - grayscale colorspace conversion (64-bit SSE2)
-;
-; Copyright (C) 2011, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler); it
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-
-%define SCALEBITS 16
-
-F_0_114 equ 7471 ; FIX(0.11400)
-F_0_250 equ 16384 ; FIX(0.25000)
-F_0_299 equ 19595 ; FIX(0.29900)
-F_0_587 equ 38470 ; FIX(0.58700)
-F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
- alignz 16
- global EXTN(jconst_rgb_gray_convert_sse2)
-
-EXTN(jconst_rgb_gray_convert_sse2):
-
-PW_F0299_F0337 times 4 dw F_0_299, F_0_337
-PW_F0114_F0250 times 4 dw F_0_114, F_0_250
-PD_ONEHALF times 4 dd (1 << (SCALEBITS-1))
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 64
-
-%include "jcgryext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGB_RED
-%define RGB_GREEN EXT_RGB_GREEN
-%define RGB_BLUE EXT_RGB_BLUE
-%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-%define jsimd_rgb_gray_convert_sse2 jsimd_extrgb_gray_convert_sse2
-%include "jcgryext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGBX_RED
-%define RGB_GREEN EXT_RGBX_GREEN
-%define RGB_BLUE EXT_RGBX_BLUE
-%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-%define jsimd_rgb_gray_convert_sse2 jsimd_extrgbx_gray_convert_sse2
-%include "jcgryext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGR_RED
-%define RGB_GREEN EXT_BGR_GREEN
-%define RGB_BLUE EXT_BGR_BLUE
-%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-%define jsimd_rgb_gray_convert_sse2 jsimd_extbgr_gray_convert_sse2
-%include "jcgryext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGRX_RED
-%define RGB_GREEN EXT_BGRX_GREEN
-%define RGB_BLUE EXT_BGRX_BLUE
-%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-%define jsimd_rgb_gray_convert_sse2 jsimd_extbgrx_gray_convert_sse2
-%include "jcgryext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XBGR_RED
-%define RGB_GREEN EXT_XBGR_GREEN
-%define RGB_BLUE EXT_XBGR_BLUE
-%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-%define jsimd_rgb_gray_convert_sse2 jsimd_extxbgr_gray_convert_sse2
-%include "jcgryext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XRGB_RED
-%define RGB_GREEN EXT_XRGB_GREEN
-%define RGB_BLUE EXT_XRGB_BLUE
-%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-%define jsimd_rgb_gray_convert_sse2 jsimd_extxrgb_gray_convert_sse2
-%include "jcgryext-sse2-64.asm"
diff --git a/media/libjpeg/simd/jcgray-sse2.asm b/media/libjpeg/simd/jcgray-sse2.asm
deleted file mode 100644
index 5b0b466953..0000000000
--- a/media/libjpeg/simd/jcgray-sse2.asm
+++ /dev/null
@@ -1,114 +0,0 @@
-;
-; jcgray.asm - grayscale colorspace conversion (SSE2)
-;
-; Copyright (C) 2011, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler); it
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-
-%define SCALEBITS 16
-
-F_0_114 equ 7471 ; FIX(0.11400)
-F_0_250 equ 16384 ; FIX(0.25000)
-F_0_299 equ 19595 ; FIX(0.29900)
-F_0_587 equ 38470 ; FIX(0.58700)
-F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
- alignz 16
- global EXTN(jconst_rgb_gray_convert_sse2)
-
-EXTN(jconst_rgb_gray_convert_sse2):
-
-PW_F0299_F0337 times 4 dw F_0_299, F_0_337
-PW_F0114_F0250 times 4 dw F_0_114, F_0_250
-PD_ONEHALF times 4 dd (1 << (SCALEBITS-1))
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
-
-%include "jcgryext-sse2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGB_RED
-%define RGB_GREEN EXT_RGB_GREEN
-%define RGB_BLUE EXT_RGB_BLUE
-%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-%define jsimd_rgb_gray_convert_sse2 jsimd_extrgb_gray_convert_sse2
-%include "jcgryext-sse2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGBX_RED
-%define RGB_GREEN EXT_RGBX_GREEN
-%define RGB_BLUE EXT_RGBX_BLUE
-%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-%define jsimd_rgb_gray_convert_sse2 jsimd_extrgbx_gray_convert_sse2
-%include "jcgryext-sse2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGR_RED
-%define RGB_GREEN EXT_BGR_GREEN
-%define RGB_BLUE EXT_BGR_BLUE
-%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-%define jsimd_rgb_gray_convert_sse2 jsimd_extbgr_gray_convert_sse2
-%include "jcgryext-sse2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGRX_RED
-%define RGB_GREEN EXT_BGRX_GREEN
-%define RGB_BLUE EXT_BGRX_BLUE
-%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-%define jsimd_rgb_gray_convert_sse2 jsimd_extbgrx_gray_convert_sse2
-%include "jcgryext-sse2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XBGR_RED
-%define RGB_GREEN EXT_XBGR_GREEN
-%define RGB_BLUE EXT_XBGR_BLUE
-%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-%define jsimd_rgb_gray_convert_sse2 jsimd_extxbgr_gray_convert_sse2
-%include "jcgryext-sse2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XRGB_RED
-%define RGB_GREEN EXT_XRGB_GREEN
-%define RGB_BLUE EXT_XRGB_BLUE
-%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-%define jsimd_rgb_gray_convert_sse2 jsimd_extxrgb_gray_convert_sse2
-%include "jcgryext-sse2.asm"
diff --git a/media/libjpeg/simd/jcgryext-mmx.asm b/media/libjpeg/simd/jcgryext-mmx.asm
deleted file mode 100644
index 1c1b8d8bc4..0000000000
--- a/media/libjpeg/simd/jcgryext-mmx.asm
+++ /dev/null
@@ -1,356 +0,0 @@
-;
-; jcgryext.asm - grayscale colorspace conversion (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2011, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler); it
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jcolsamp.inc"
-
-; --------------------------------------------------------------------------
-;
-; Convert some rows of samples to the output colorspace.
-;
-; GLOBAL(void)
-; jsimd_rgb_gray_convert_mmx (JDIMENSION img_width,
-; JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-; JDIMENSION output_row, int num_rows);
-;
-
-%define img_width(b) (b)+8 ; JDIMENSION img_width
-%define input_buf(b) (b)+12 ; JSAMPARRAY input_buf
-%define output_buf(b) (b)+16 ; JSAMPIMAGE output_buf
-%define output_row(b) (b)+20 ; JDIMENSION output_row
-%define num_rows(b) (b)+24 ; int num_rows
-
-%define original_ebp ebp+0
-%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM]
-%define WK_NUM 2
-%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
-
- align 16
- global EXTN(jsimd_rgb_gray_convert_mmx)
-
-EXTN(jsimd_rgb_gray_convert_mmx):
- push ebp
- mov eax,esp ; eax = original ebp
- sub esp, byte 4
- and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
- mov [esp],eax
- mov ebp,esp ; ebp = aligned ebp
- lea esp, [wk(0)]
- pushpic eax ; make room for the GOT address
- push ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- get_GOT ebx ; get GOT address
- movpic POINTER [gotptr], ebx ; save GOT address
-
- mov ecx, JDIMENSION [img_width(eax)] ; num_cols
- test ecx,ecx
- jz near .return
-
- push ecx
-
- mov esi, JSAMPIMAGE [output_buf(eax)]
- mov ecx, JDIMENSION [output_row(eax)]
- mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
- lea edi, [edi+ecx*SIZEOF_JSAMPROW]
-
- pop ecx
-
- mov esi, JSAMPARRAY [input_buf(eax)]
- mov eax, INT [num_rows(eax)]
- test eax,eax
- jle near .return
- alignx 16,7
-.rowloop:
- pushpic eax
- push edi
- push esi
- push ecx ; col
-
- mov esi, JSAMPROW [esi] ; inptr
- mov edi, JSAMPROW [edi] ; outptr0
- movpic eax, POINTER [gotptr] ; load GOT address (eax)
-
- cmp ecx, byte SIZEOF_MMWORD
- jae short .columnloop
- alignx 16,7
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
-.column_ld1:
- push eax
- push edx
- lea ecx,[ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE
- test cl, SIZEOF_BYTE
- jz short .column_ld2
- sub ecx, byte SIZEOF_BYTE
- xor eax,eax
- mov al, BYTE [esi+ecx]
-.column_ld2:
- test cl, SIZEOF_WORD
- jz short .column_ld4
- sub ecx, byte SIZEOF_WORD
- xor edx,edx
- mov dx, WORD [esi+ecx]
- shl eax, WORD_BIT
- or eax,edx
-.column_ld4:
- movd mmA,eax
- pop edx
- pop eax
- test cl, SIZEOF_DWORD
- jz short .column_ld8
- sub ecx, byte SIZEOF_DWORD
- movd mmG, DWORD [esi+ecx]
- psllq mmA, DWORD_BIT
- por mmA,mmG
-.column_ld8:
- test cl, SIZEOF_MMWORD
- jz short .column_ld16
- movq mmG,mmA
- movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
- mov ecx, SIZEOF_MMWORD
- jmp short .rgb_gray_cnv
-.column_ld16:
- test cl, 2*SIZEOF_MMWORD
- mov ecx, SIZEOF_MMWORD
- jz short .rgb_gray_cnv
- movq mmF,mmA
- movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
- movq mmG, MMWORD [esi+1*SIZEOF_MMWORD]
- jmp short .rgb_gray_cnv
- alignx 16,7
-
-.columnloop:
- movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
- movq mmG, MMWORD [esi+1*SIZEOF_MMWORD]
- movq mmF, MMWORD [esi+2*SIZEOF_MMWORD]
-
-.rgb_gray_cnv:
- ; mmA=(00 10 20 01 11 21 02 12)
- ; mmG=(22 03 13 23 04 14 24 05)
- ; mmF=(15 25 06 16 26 07 17 27)
-
- movq mmD,mmA
- psllq mmA,4*BYTE_BIT ; mmA=(-- -- -- -- 00 10 20 01)
- psrlq mmD,4*BYTE_BIT ; mmD=(11 21 02 12 -- -- -- --)
-
- punpckhbw mmA,mmG ; mmA=(00 04 10 14 20 24 01 05)
- psllq mmG,4*BYTE_BIT ; mmG=(-- -- -- -- 22 03 13 23)
-
- punpcklbw mmD,mmF ; mmD=(11 15 21 25 02 06 12 16)
- punpckhbw mmG,mmF ; mmG=(22 26 03 07 13 17 23 27)
-
- movq mmE,mmA
- psllq mmA,4*BYTE_BIT ; mmA=(-- -- -- -- 00 04 10 14)
- psrlq mmE,4*BYTE_BIT ; mmE=(20 24 01 05 -- -- -- --)
-
- punpckhbw mmA,mmD ; mmA=(00 02 04 06 10 12 14 16)
- psllq mmD,4*BYTE_BIT ; mmD=(-- -- -- -- 11 15 21 25)
-
- punpcklbw mmE,mmG ; mmE=(20 22 24 26 01 03 05 07)
- punpckhbw mmD,mmG ; mmD=(11 13 15 17 21 23 25 27)
-
- pxor mmH,mmH
-
- movq mmC,mmA
- punpcklbw mmA,mmH ; mmA=(00 02 04 06)
- punpckhbw mmC,mmH ; mmC=(10 12 14 16)
-
- movq mmB,mmE
- punpcklbw mmE,mmH ; mmE=(20 22 24 26)
- punpckhbw mmB,mmH ; mmB=(01 03 05 07)
-
- movq mmF,mmD
- punpcklbw mmD,mmH ; mmD=(11 13 15 17)
- punpckhbw mmF,mmH ; mmF=(21 23 25 27)
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-.column_ld1:
- test cl, SIZEOF_MMWORD/8
- jz short .column_ld2
- sub ecx, byte SIZEOF_MMWORD/8
- movd mmA, DWORD [esi+ecx*RGB_PIXELSIZE]
-.column_ld2:
- test cl, SIZEOF_MMWORD/4
- jz short .column_ld4
- sub ecx, byte SIZEOF_MMWORD/4
- movq mmF,mmA
- movq mmA, MMWORD [esi+ecx*RGB_PIXELSIZE]
-.column_ld4:
- test cl, SIZEOF_MMWORD/2
- mov ecx, SIZEOF_MMWORD
- jz short .rgb_gray_cnv
- movq mmD,mmA
- movq mmC,mmF
- movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
- movq mmF, MMWORD [esi+1*SIZEOF_MMWORD]
- jmp short .rgb_gray_cnv
- alignx 16,7
-
-.columnloop:
- movq mmA, MMWORD [esi+0*SIZEOF_MMWORD]
- movq mmF, MMWORD [esi+1*SIZEOF_MMWORD]
- movq mmD, MMWORD [esi+2*SIZEOF_MMWORD]
- movq mmC, MMWORD [esi+3*SIZEOF_MMWORD]
-
-.rgb_gray_cnv:
- ; mmA=(00 10 20 30 01 11 21 31)
- ; mmF=(02 12 22 32 03 13 23 33)
- ; mmD=(04 14 24 34 05 15 25 35)
- ; mmC=(06 16 26 36 07 17 27 37)
-
- movq mmB,mmA
- punpcklbw mmA,mmF ; mmA=(00 02 10 12 20 22 30 32)
- punpckhbw mmB,mmF ; mmB=(01 03 11 13 21 23 31 33)
-
- movq mmG,mmD
- punpcklbw mmD,mmC ; mmD=(04 06 14 16 24 26 34 36)
- punpckhbw mmG,mmC ; mmG=(05 07 15 17 25 27 35 37)
-
- movq mmE,mmA
- punpcklwd mmA,mmD ; mmA=(00 02 04 06 10 12 14 16)
- punpckhwd mmE,mmD ; mmE=(20 22 24 26 30 32 34 36)
-
- movq mmH,mmB
- punpcklwd mmB,mmG ; mmB=(01 03 05 07 11 13 15 17)
- punpckhwd mmH,mmG ; mmH=(21 23 25 27 31 33 35 37)
-
- pxor mmF,mmF
-
- movq mmC,mmA
- punpcklbw mmA,mmF ; mmA=(00 02 04 06)
- punpckhbw mmC,mmF ; mmC=(10 12 14 16)
-
- movq mmD,mmB
- punpcklbw mmB,mmF ; mmB=(01 03 05 07)
- punpckhbw mmD,mmF ; mmD=(11 13 15 17)
-
- movq mmG,mmE
- punpcklbw mmE,mmF ; mmE=(20 22 24 26)
- punpckhbw mmG,mmF ; mmG=(30 32 34 36)
-
- punpcklbw mmF,mmH
- punpckhbw mmH,mmH
- psrlw mmF,BYTE_BIT ; mmF=(21 23 25 27)
- psrlw mmH,BYTE_BIT ; mmH=(31 33 35 37)
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
- ; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE
- ; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO
-
- ; (Original)
- ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
- ;
- ; (This implementation)
- ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
-
- movq mm6,mm1
- punpcklwd mm1,mm3
- punpckhwd mm6,mm3
- pmaddwd mm1,[GOTOFF(eax,PW_F0299_F0337)] ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337)
- pmaddwd mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337)
-
- movq mm7, mm6 ; mm7=ROH*FIX(0.299)+GOH*FIX(0.337)
-
- movq mm6,mm0
- punpcklwd mm0,mm2
- punpckhwd mm6,mm2
- pmaddwd mm0,[GOTOFF(eax,PW_F0299_F0337)] ; mm0=REL*FIX(0.299)+GEL*FIX(0.337)
- pmaddwd mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=REH*FIX(0.299)+GEH*FIX(0.337)
-
- movq MMWORD [wk(0)], mm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
- movq MMWORD [wk(1)], mm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
-
- movq mm0, mm5 ; mm0=BO
- movq mm6, mm4 ; mm6=BE
-
- movq mm4,mm0
- punpcklwd mm0,mm3
- punpckhwd mm4,mm3
- pmaddwd mm0,[GOTOFF(eax,PW_F0114_F0250)] ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250)
- pmaddwd mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250)
-
- movq mm3,[GOTOFF(eax,PD_ONEHALF)] ; mm3=[PD_ONEHALF]
-
- paddd mm0, mm1
- paddd mm4, mm7
- paddd mm0,mm3
- paddd mm4,mm3
- psrld mm0,SCALEBITS ; mm0=YOL
- psrld mm4,SCALEBITS ; mm4=YOH
- packssdw mm0,mm4 ; mm0=YO
-
- movq mm4,mm6
- punpcklwd mm6,mm2
- punpckhwd mm4,mm2
- pmaddwd mm6,[GOTOFF(eax,PW_F0114_F0250)] ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250)
- pmaddwd mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250)
-
- movq mm2,[GOTOFF(eax,PD_ONEHALF)] ; mm2=[PD_ONEHALF]
-
- paddd mm6, MMWORD [wk(0)]
- paddd mm4, MMWORD [wk(1)]
- paddd mm6,mm2
- paddd mm4,mm2
- psrld mm6,SCALEBITS ; mm6=YEL
- psrld mm4,SCALEBITS ; mm4=YEH
- packssdw mm6,mm4 ; mm6=YE
-
- psllw mm0,BYTE_BIT
- por mm6,mm0 ; mm6=Y
- movq MMWORD [edi], mm6 ; Save Y
-
- sub ecx, byte SIZEOF_MMWORD
- add esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; inptr
- add edi, byte SIZEOF_MMWORD ; outptr0
- cmp ecx, byte SIZEOF_MMWORD
- jae near .columnloop
- test ecx,ecx
- jnz near .column_ld1
-
- pop ecx ; col
- pop esi
- pop edi
- poppic eax
-
- add esi, byte SIZEOF_JSAMPROW ; input_buf
- add edi, byte SIZEOF_JSAMPROW
- dec eax ; num_rows
- jg near .rowloop
-
- emms ; empty MMX state
-
-.return:
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- pop ebx
- mov esp,ebp ; esp <- aligned ebp
- pop esp ; esp <- original ebp
- pop ebp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
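The .column_ld1/.column_ld2/... ladder in the routine above handles a row remainder narrower than one MMWORD: it tests the leftover byte count one power of two at a time (test cl, SIZEOF_BYTE/WORD/DWORD/...) and merges each chunk into the register with shifts, so no load ever runs past the end of the row. The net effect, modeled in plain C (illustrative helper, not the deleted code):

    #include <string.h>
    #include <stdint.h>

    /* Copy an nbytes tail (nbytes < 16) using only power-of-two chunks,
       left-packed at dst[0].  The asm performs the same decomposition
       smallest chunk first, shifting earlier chunks up as each larger
       one is OR'd in underneath. */
    static void load_tail(uint8_t dst[16], const uint8_t *src,
                          unsigned nbytes)
    {
      unsigned off = 0;
      for (unsigned chunk = 8; chunk != 0; chunk >>= 1) {
        if (nbytes & chunk) {        /* mirrors `test cl, SIZEOF_*` */
          memcpy(dst + off, src + off, chunk);
          off += chunk;
        }
      }
    }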
diff --git a/media/libjpeg/simd/jcgryext-sse2-64.asm b/media/libjpeg/simd/jcgryext-sse2-64.asm
deleted file mode 100644
index 541355af86..0000000000
--- a/media/libjpeg/simd/jcgryext-sse2-64.asm
+++ /dev/null
@@ -1,365 +0,0 @@
-;
-; jcgryext.asm - grayscale colorspace conversion (64-bit SSE2)
-;
-; Copyright (C) 2011, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler); it
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jcolsamp.inc"
-
-; --------------------------------------------------------------------------
-;
-; Convert some rows of samples to the output colorspace.
-;
-; GLOBAL(void)
-; jsimd_rgb_gray_convert_sse2 (JDIMENSION img_width,
-; JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-; JDIMENSION output_row, int num_rows);
-;
-
-; r10 = JDIMENSION img_width
-; r11 = JSAMPARRAY input_buf
-; r12 = JSAMPIMAGE output_buf
-; r13 = JDIMENSION output_row
-; r14 = int num_rows
-
-%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 2
-
- align 16
-
- global EXTN(jsimd_rgb_gray_convert_sse2)
-
-EXTN(jsimd_rgb_gray_convert_sse2):
- push rbp
- mov rax,rsp ; rax = original rbp
- sub rsp, byte 4
- and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [rsp],rax
- mov rbp,rsp ; rbp = aligned rbp
- lea rsp, [wk(0)]
- collect_args
- push rbx
-
- mov ecx, r10d
- test rcx,rcx
- jz near .return
-
- push rcx
-
- mov rsi, r12
- mov ecx, r13d
- mov rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
- lea rdi, [rdi+rcx*SIZEOF_JSAMPROW]
-
- pop rcx
-
- mov rsi, r11
- mov eax, r14d
- test rax,rax
- jle near .return
-.rowloop:
- push rdi
- push rsi
- push rcx ; col
-
- mov rsi, JSAMPROW [rsi] ; inptr
- mov rdi, JSAMPROW [rdi] ; outptr0
-
- cmp rcx, byte SIZEOF_XMMWORD
- jae near .columnloop
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
-.column_ld1:
- push rax
- push rdx
- lea rcx,[rcx+rcx*2] ; imul ecx,RGB_PIXELSIZE
- test cl, SIZEOF_BYTE
- jz short .column_ld2
- sub rcx, byte SIZEOF_BYTE
- movzx rax, BYTE [rsi+rcx]
-.column_ld2:
- test cl, SIZEOF_WORD
- jz short .column_ld4
- sub rcx, byte SIZEOF_WORD
- movzx rdx, WORD [rsi+rcx]
- shl rax, WORD_BIT
- or rax,rdx
-.column_ld4:
- movd xmmA,eax
- pop rdx
- pop rax
- test cl, SIZEOF_DWORD
- jz short .column_ld8
- sub rcx, byte SIZEOF_DWORD
- movd xmmF, XMM_DWORD [rsi+rcx]
- pslldq xmmA, SIZEOF_DWORD
- por xmmA,xmmF
-.column_ld8:
- test cl, SIZEOF_MMWORD
- jz short .column_ld16
- sub rcx, byte SIZEOF_MMWORD
- movq xmmB, XMM_MMWORD [rsi+rcx]
- pslldq xmmA, SIZEOF_MMWORD
- por xmmA,xmmB
-.column_ld16:
- test cl, SIZEOF_XMMWORD
- jz short .column_ld32
- movdqa xmmF,xmmA
- movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
- mov rcx, SIZEOF_XMMWORD
- jmp short .rgb_gray_cnv
-.column_ld32:
- test cl, 2*SIZEOF_XMMWORD
- mov rcx, SIZEOF_XMMWORD
- jz short .rgb_gray_cnv
- movdqa xmmB,xmmA
- movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
- movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
- jmp short .rgb_gray_cnv
-
-.columnloop:
- movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
- movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
- movdqu xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]
-
-.rgb_gray_cnv:
- ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
- ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
- ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
-
- movdqa xmmG,xmmA
- pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
- psrldq xmmG,8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
-
- punpckhbw xmmA,xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
- pslldq xmmF,8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
-
- punpcklbw xmmG,xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
- punpckhbw xmmF,xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
-
- movdqa xmmD,xmmA
- pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
- psrldq xmmD,8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
-
- punpckhbw xmmA,xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
- pslldq xmmG,8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
-
- punpcklbw xmmD,xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
- punpckhbw xmmG,xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
-
- movdqa xmmE,xmmA
- pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
- psrldq xmmE,8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
-
- punpckhbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
- pslldq xmmD,8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
-
- punpcklbw xmmE,xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
- punpckhbw xmmD,xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
-
- pxor xmmH,xmmH
-
- movdqa xmmC,xmmA
- punpcklbw xmmA,xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E)
- punpckhbw xmmC,xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E)
-
- movdqa xmmB,xmmE
- punpcklbw xmmE,xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E)
- punpckhbw xmmB,xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F)
-
- movdqa xmmF,xmmD
- punpcklbw xmmD,xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F)
- punpckhbw xmmF,xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F)
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-.column_ld1:
- test cl, SIZEOF_XMMWORD/16
- jz short .column_ld2
- sub rcx, byte SIZEOF_XMMWORD/16
- movd xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
-.column_ld2:
- test cl, SIZEOF_XMMWORD/8
- jz short .column_ld4
- sub rcx, byte SIZEOF_XMMWORD/8
- movq xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
- pslldq xmmA, SIZEOF_MMWORD
- por xmmA,xmmE
-.column_ld4:
- test cl, SIZEOF_XMMWORD/4
- jz short .column_ld8
- sub rcx, byte SIZEOF_XMMWORD/4
- movdqa xmmE,xmmA
- movdqu xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
-.column_ld8:
- test cl, SIZEOF_XMMWORD/2
- mov rcx, SIZEOF_XMMWORD
- jz short .rgb_gray_cnv
- movdqa xmmF,xmmA
- movdqa xmmH,xmmE
- movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
- movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
- jmp short .rgb_gray_cnv
-
-.columnloop:
- movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
- movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
- movdqu xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
- movdqu xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]
-
-.rgb_gray_cnv:
- ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
- ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
- ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
- ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
-
- movdqa xmmD,xmmA
- punpcklbw xmmA,xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
- punpckhbw xmmD,xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
-
- movdqa xmmC,xmmF
- punpcklbw xmmF,xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
- punpckhbw xmmC,xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
-
- movdqa xmmB,xmmA
- punpcklwd xmmA,xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
- punpckhwd xmmB,xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
-
- movdqa xmmG,xmmD
- punpcklwd xmmD,xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
- punpckhwd xmmG,xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
-
- movdqa xmmE,xmmA
- punpcklbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
- punpckhbw xmmE,xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
-
- movdqa xmmH,xmmB
- punpcklbw xmmB,xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
- punpckhbw xmmH,xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
-
- pxor xmmF,xmmF
-
- movdqa xmmC,xmmA
- punpcklbw xmmA,xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E)
- punpckhbw xmmC,xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E)
-
- movdqa xmmD,xmmB
- punpcklbw xmmB,xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F)
- punpckhbw xmmD,xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F)
-
- movdqa xmmG,xmmE
- punpcklbw xmmE,xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E)
- punpckhbw xmmG,xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E)
-
- punpcklbw xmmF,xmmH
- punpckhbw xmmH,xmmH
- psrlw xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
- psrlw xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
- ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
- ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
-
- ; (Original)
- ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
- ;
- ; (This implementation)
- ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
-
- movdqa xmm6,xmm1
- punpcklwd xmm1,xmm3
- punpckhwd xmm6,xmm3
- pmaddwd xmm1,[rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
- pmaddwd xmm6,[rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
-
- movdqa xmm7, xmm6 ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337)
-
- movdqa xmm6,xmm0
- punpcklwd xmm0,xmm2
- punpckhwd xmm6,xmm2
- pmaddwd xmm0,[rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
- pmaddwd xmm6,[rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
-
- movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
- movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
-
- movdqa xmm0, xmm5 ; xmm0=BO
- movdqa xmm6, xmm4 ; xmm6=BE
-
- movdqa xmm4,xmm0
- punpcklwd xmm0,xmm3
- punpckhwd xmm4,xmm3
- pmaddwd xmm0,[rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
- pmaddwd xmm4,[rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
-
- movdqa xmm3,[rel PD_ONEHALF] ; xmm3=[PD_ONEHALF]
-
- paddd xmm0, xmm1
- paddd xmm4, xmm7
- paddd xmm0,xmm3
- paddd xmm4,xmm3
- psrld xmm0,SCALEBITS ; xmm0=YOL
- psrld xmm4,SCALEBITS ; xmm4=YOH
- packssdw xmm0,xmm4 ; xmm0=YO
-
- movdqa xmm4,xmm6
- punpcklwd xmm6,xmm2
- punpckhwd xmm4,xmm2
- pmaddwd xmm6,[rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
- pmaddwd xmm4,[rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
-
- movdqa xmm2,[rel PD_ONEHALF] ; xmm2=[PD_ONEHALF]
-
- paddd xmm6, XMMWORD [wk(0)]
- paddd xmm4, XMMWORD [wk(1)]
- paddd xmm6,xmm2
- paddd xmm4,xmm2
- psrld xmm6,SCALEBITS ; xmm6=YEL
- psrld xmm4,SCALEBITS ; xmm4=YEH
- packssdw xmm6,xmm4 ; xmm6=YE
-
- psllw xmm0,BYTE_BIT
- por xmm6,xmm0 ; xmm6=Y
- movdqa XMMWORD [rdi], xmm6 ; Save Y
-
- sub rcx, byte SIZEOF_XMMWORD
- add rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr
- add rdi, byte SIZEOF_XMMWORD ; outptr0
- cmp rcx, byte SIZEOF_XMMWORD
- jae near .columnloop
- test rcx,rcx
- jnz near .column_ld1
-
- pop rcx ; col
- pop rsi
- pop rdi
-
- add rsi, byte SIZEOF_JSAMPROW ; input_buf
- add rdi, byte SIZEOF_JSAMPROW
- dec rax ; num_rows
- jg near .rowloop
-
-.return:
- pop rbx
- uncollect_args
- mov rsp,rbp ; rsp <- aligned rbp
- pop rsp ; rsp <- original rbp
- pop rbp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
diff --git a/media/libjpeg/simd/jcgryext-sse2.asm b/media/libjpeg/simd/jcgryext-sse2.asm
deleted file mode 100644
index cd16dd1928..0000000000
--- a/media/libjpeg/simd/jcgryext-sse2.asm
+++ /dev/null
@@ -1,384 +0,0 @@
-;
-; jcgryext.asm - grayscale colorspace conversion (SSE2)
-;
-; Copyright (C) 2011, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler); it
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jcolsamp.inc"
-
-; --------------------------------------------------------------------------
-;
-; Convert some rows of samples to the output colorspace.
-;
-; GLOBAL(void)
-; jsimd_rgb_gray_convert_sse2 (JDIMENSION img_width,
-; JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-; JDIMENSION output_row, int num_rows);
-;
-
-%define img_width(b) (b)+8 ; JDIMENSION img_width
-%define input_buf(b) (b)+12 ; JSAMPARRAY input_buf
-%define output_buf(b) (b)+16 ; JSAMPIMAGE output_buf
-%define output_row(b) (b)+20 ; JDIMENSION output_row
-%define num_rows(b) (b)+24 ; int num_rows
-
-%define original_ebp ebp+0
-%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 2
-%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
-
- align 16
-
- global EXTN(jsimd_rgb_gray_convert_sse2)
-
-EXTN(jsimd_rgb_gray_convert_sse2):
- push ebp
- mov eax,esp ; eax = original ebp
- sub esp, byte 4
- and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [esp],eax
- mov ebp,esp ; ebp = aligned ebp
- lea esp, [wk(0)]
- pushpic eax ; make room for the GOT address
- push ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- get_GOT ebx ; get GOT address
- movpic POINTER [gotptr], ebx ; save GOT address
-
- mov ecx, JDIMENSION [img_width(eax)]
- test ecx,ecx
- jz near .return
-
- push ecx
-
- mov esi, JSAMPIMAGE [output_buf(eax)]
- mov ecx, JDIMENSION [output_row(eax)]
- mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
- lea edi, [edi+ecx*SIZEOF_JSAMPROW]
-
- pop ecx
-
- mov esi, JSAMPARRAY [input_buf(eax)]
- mov eax, INT [num_rows(eax)]
- test eax,eax
- jle near .return
- alignx 16,7
-.rowloop:
- pushpic eax
- push edi
- push esi
- push ecx ; col
-
- mov esi, JSAMPROW [esi] ; inptr
- mov edi, JSAMPROW [edi] ; outptr0
- movpic eax, POINTER [gotptr] ; load GOT address (eax)
-
- cmp ecx, byte SIZEOF_XMMWORD
- jae near .columnloop
- alignx 16,7
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
-.column_ld1:
- push eax
- push edx
- lea ecx,[ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE
- test cl, SIZEOF_BYTE
- jz short .column_ld2
- sub ecx, byte SIZEOF_BYTE
- movzx eax, BYTE [esi+ecx]
-.column_ld2:
- test cl, SIZEOF_WORD
- jz short .column_ld4
- sub ecx, byte SIZEOF_WORD
- movzx edx, WORD [esi+ecx]
- shl eax, WORD_BIT
- or eax,edx
-.column_ld4:
- movd xmmA,eax
- pop edx
- pop eax
- test cl, SIZEOF_DWORD
- jz short .column_ld8
- sub ecx, byte SIZEOF_DWORD
- movd xmmF, XMM_DWORD [esi+ecx]
- pslldq xmmA, SIZEOF_DWORD
- por xmmA,xmmF
-.column_ld8:
- test cl, SIZEOF_MMWORD
- jz short .column_ld16
- sub ecx, byte SIZEOF_MMWORD
- movq xmmB, XMM_MMWORD [esi+ecx]
- pslldq xmmA, SIZEOF_MMWORD
- por xmmA,xmmB
-.column_ld16:
- test cl, SIZEOF_XMMWORD
- jz short .column_ld32
- movdqa xmmF,xmmA
- movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
- mov ecx, SIZEOF_XMMWORD
- jmp short .rgb_gray_cnv
-.column_ld32:
- test cl, 2*SIZEOF_XMMWORD
- mov ecx, SIZEOF_XMMWORD
- jz short .rgb_gray_cnv
- movdqa xmmB,xmmA
- movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
- movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
- jmp short .rgb_gray_cnv
- alignx 16,7
-
-.columnloop:
- movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
- movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
- movdqu xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD]
-
-.rgb_gray_cnv:
- ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
- ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
- ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
-
- movdqa xmmG,xmmA
- pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
- psrldq xmmG,8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
-
- punpckhbw xmmA,xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
- pslldq xmmF,8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
-
- punpcklbw xmmG,xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
- punpckhbw xmmF,xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
-
- movdqa xmmD,xmmA
- pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
- psrldq xmmD,8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
-
- punpckhbw xmmA,xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
- pslldq xmmG,8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
-
- punpcklbw xmmD,xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
- punpckhbw xmmG,xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
-
- movdqa xmmE,xmmA
- pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
- psrldq xmmE,8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
-
- punpckhbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
- pslldq xmmD,8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
-
- punpcklbw xmmE,xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
- punpckhbw xmmD,xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
-
- pxor xmmH,xmmH
-
- movdqa xmmC,xmmA
- punpcklbw xmmA,xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E)
- punpckhbw xmmC,xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E)
-
- movdqa xmmB,xmmE
- punpcklbw xmmE,xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E)
- punpckhbw xmmB,xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F)
-
- movdqa xmmF,xmmD
- punpcklbw xmmD,xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F)
- punpckhbw xmmF,xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F)
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-.column_ld1:
- test cl, SIZEOF_XMMWORD/16
- jz short .column_ld2
- sub ecx, byte SIZEOF_XMMWORD/16
- movd xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE]
-.column_ld2:
- test cl, SIZEOF_XMMWORD/8
- jz short .column_ld4
- sub ecx, byte SIZEOF_XMMWORD/8
- movq xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
- pslldq xmmA, SIZEOF_MMWORD
- por xmmA,xmmE
-.column_ld4:
- test cl, SIZEOF_XMMWORD/4
- jz short .column_ld8
- sub ecx, byte SIZEOF_XMMWORD/4
- movdqa xmmE,xmmA
- movdqu xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
-.column_ld8:
- test cl, SIZEOF_XMMWORD/2
- mov ecx, SIZEOF_XMMWORD
- jz short .rgb_gray_cnv
- movdqa xmmF,xmmA
- movdqa xmmH,xmmE
- movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
- movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
- jmp short .rgb_gray_cnv
- alignx 16,7
-
-.columnloop:
- movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
- movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
- movdqu xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD]
- movdqu xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD]
-
-.rgb_gray_cnv:
- ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
- ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
- ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
- ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
-
- movdqa xmmD,xmmA
- punpcklbw xmmA,xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
- punpckhbw xmmD,xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
-
- movdqa xmmC,xmmF
- punpcklbw xmmF,xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
- punpckhbw xmmC,xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
-
- movdqa xmmB,xmmA
- punpcklwd xmmA,xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
- punpckhwd xmmB,xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
-
- movdqa xmmG,xmmD
- punpcklwd xmmD,xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
- punpckhwd xmmG,xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
-
- movdqa xmmE,xmmA
- punpcklbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
- punpckhbw xmmE,xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
-
- movdqa xmmH,xmmB
- punpcklbw xmmB,xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
- punpckhbw xmmH,xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
-
- pxor xmmF,xmmF
-
- movdqa xmmC,xmmA
- punpcklbw xmmA,xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E)
- punpckhbw xmmC,xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E)
-
- movdqa xmmD,xmmB
- punpcklbw xmmB,xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F)
- punpckhbw xmmD,xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F)
-
- movdqa xmmG,xmmE
- punpcklbw xmmE,xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E)
- punpckhbw xmmG,xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E)
-
- punpcklbw xmmF,xmmH
- punpckhbw xmmH,xmmH
- psrlw xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
- psrlw xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
- ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
- ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
-
- ; (Original)
- ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
- ;
- ; (This implementation)
- ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
-
- movdqa xmm6,xmm1
- punpcklwd xmm1,xmm3
- punpckhwd xmm6,xmm3
- pmaddwd xmm1,[GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
- pmaddwd xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
-
- movdqa xmm7, xmm6 ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337)
-
- movdqa xmm6,xmm0
- punpcklwd xmm0,xmm2
- punpckhwd xmm6,xmm2
- pmaddwd xmm0,[GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
- pmaddwd xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
-
- movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
- movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
-
- movdqa xmm0, xmm5 ; xmm0=BO
- movdqa xmm6, xmm4 ; xmm6=BE
-
- movdqa xmm4,xmm0
- punpcklwd xmm0,xmm3
- punpckhwd xmm4,xmm3
- pmaddwd xmm0,[GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
- pmaddwd xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
-
- movdqa xmm3,[GOTOFF(eax,PD_ONEHALF)] ; xmm3=[PD_ONEHALF]
-
- paddd xmm0, xmm1
- paddd xmm4, xmm7
- paddd xmm0,xmm3
- paddd xmm4,xmm3
- psrld xmm0,SCALEBITS ; xmm0=YOL
- psrld xmm4,SCALEBITS ; xmm4=YOH
- packssdw xmm0,xmm4 ; xmm0=YO
-
- movdqa xmm4,xmm6
- punpcklwd xmm6,xmm2
- punpckhwd xmm4,xmm2
- pmaddwd xmm6,[GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
- pmaddwd xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
-
- movdqa xmm2,[GOTOFF(eax,PD_ONEHALF)] ; xmm2=[PD_ONEHALF]
-
- paddd xmm6, XMMWORD [wk(0)]
- paddd xmm4, XMMWORD [wk(1)]
- paddd xmm6,xmm2
- paddd xmm4,xmm2
- psrld xmm6,SCALEBITS ; xmm6=YEL
- psrld xmm4,SCALEBITS ; xmm4=YEH
- packssdw xmm6,xmm4 ; xmm6=YE
-
- psllw xmm0,BYTE_BIT
- por xmm6,xmm0 ; xmm6=Y
- movdqa XMMWORD [edi], xmm6 ; Save Y
-
- sub ecx, byte SIZEOF_XMMWORD
- add esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr
- add edi, byte SIZEOF_XMMWORD ; outptr0
- cmp ecx, byte SIZEOF_XMMWORD
- jae near .columnloop
- test ecx,ecx
- jnz near .column_ld1
-
- pop ecx ; col
- pop esi
- pop edi
- poppic eax
-
- add esi, byte SIZEOF_JSAMPROW ; input_buf
- add edi, byte SIZEOF_JSAMPROW
- dec eax ; num_rows
- jg near .rowloop
-
-.return:
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- pop ebx
- mov esp,ebp ; esp <- aligned ebp
- pop esp ; esp <- original ebp
- pop ebp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
diff --git a/media/libjpeg/simd/jchuff-sse2-64.asm b/media/libjpeg/simd/jchuff-sse2-64.asm
deleted file mode 100644
index b1144d1cdd..0000000000
--- a/media/libjpeg/simd/jchuff-sse2-64.asm
+++ /dev/null
@@ -1,360 +0,0 @@
-;
-; jchuff-sse2-64.asm - Huffman entropy encoding (64-bit SSE2)
-;
-; Copyright (C) 2009-2011, 2014-2016, D. R. Commander.
-; Copyright (C) 2015, Matthieu Darbois.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler) and
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains an SSE2 implementation for Huffman coding of one block.
-; The following code is based directly on jchuff.c; see jchuff.c for more
-; details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
- alignz 16
- global EXTN(jconst_huff_encode_one_block)
-
-EXTN(jconst_huff_encode_one_block):
-
-%include "jpeg_nbits_table.inc"
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 64
-
-; These macros perform the same task as the emit_bits() function in the
-; original libjpeg code. In addition to reducing overhead by explicitly
-; inlining the code, additional performance is achieved by taking into
-; account the size of the bit buffer and waiting until it is almost full
-; before emptying it. This mostly benefits 64-bit platforms, since 6
-; bytes can be stored in a 64-bit bit buffer before it has to be emptied.
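
A rough C restatement of this strategy (hypothetical names, not the exact jchuff.c code): bits accumulate in a 64-bit buffer, and whole bytes are flushed only once more than 47 bits are pending, stuffing a zero byte after any 0xFF as the JPEG spec requires.

    #include <stdint.h>

    typedef struct { uint64_t buf; int bits; } bitstate;

    static uint8_t *checkbuf47(bitstate *s, uint8_t *out) {
      while (s->bits > 47) {            /* flush only when nearly full */
        s->bits -= 8;
        uint8_t c = (uint8_t)(s->buf >> s->bits);
        *out++ = c;                     /* EMIT_BYTE */
        if (c == 0xFF) *out++ = 0;      /* byte-stuff after 0xFF */
      }
      return out;
    }

    static uint8_t *emit_bits(bitstate *s, uint8_t *out, unsigned code, int size) {
      out = checkbuf47(s, out);         /* CHECKBUF47, then PUT_BITS */
      s->bits += size;
      s->buf = (s->buf << size) | code;
      return out;
    }
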
-
-%macro EMIT_BYTE 0
- sub put_bits, 8 ; put_bits -= 8;
- mov rdx, put_buffer
- mov ecx, put_bits
- shr rdx, cl ; c = (JOCTET)GETJOCTET(put_buffer >> put_bits);
- mov byte [buffer], dl ; *buffer++ = c;
- add buffer, 1
- cmp dl, 0xFF ; need to stuff a zero byte?
- jne %%.EMIT_BYTE_END
- mov byte [buffer], 0 ; *buffer++ = 0;
- add buffer, 1
-%%.EMIT_BYTE_END:
-%endmacro
-
-%macro PUT_BITS 1
- add put_bits, ecx ; put_bits += size;
- shl put_buffer, cl ; put_buffer = (put_buffer << size);
- or put_buffer, %1
-%endmacro
-
-%macro CHECKBUF31 0
- cmp put_bits, 32 ; if (put_bits > 31) {
- jl %%.CHECKBUF31_END
- EMIT_BYTE
- EMIT_BYTE
- EMIT_BYTE
- EMIT_BYTE
-%%.CHECKBUF31_END:
-%endmacro
-
-%macro CHECKBUF47 0
- cmp put_bits, 48 ; if (put_bits > 47) {
- jl %%.CHECKBUF47_END
- EMIT_BYTE
- EMIT_BYTE
- EMIT_BYTE
- EMIT_BYTE
- EMIT_BYTE
- EMIT_BYTE
-%%.CHECKBUF47_END:
-%endmacro
-
-%macro EMIT_BITS 2
- CHECKBUF47
- mov ecx, %2
- PUT_BITS %1
-%endmacro
-
-%macro kloop_prepare 37 ;(ko, jno0, ..., jno31, xmm0, xmm1, xmm2, xmm3)
- pxor xmm8, xmm8 ; __m128i neg = _mm_setzero_si128();
- pxor xmm9, xmm9 ; __m128i neg = _mm_setzero_si128();
- pxor xmm10, xmm10 ; __m128i neg = _mm_setzero_si128();
- pxor xmm11, xmm11 ; __m128i neg = _mm_setzero_si128();
- pinsrw %34, word [r12 + %2 * SIZEOF_WORD], 0 ; xmm_shadow[0] = block[jno0];
- pinsrw %35, word [r12 + %10 * SIZEOF_WORD], 0 ; xmm_shadow[8] = block[jno8];
- pinsrw %36, word [r12 + %18 * SIZEOF_WORD], 0 ; xmm_shadow[16] = block[jno16];
- pinsrw %37, word [r12 + %26 * SIZEOF_WORD], 0 ; xmm_shadow[24] = block[jno24];
- pinsrw %34, word [r12 + %3 * SIZEOF_WORD], 1 ; xmm_shadow[1] = block[jno1];
- pinsrw %35, word [r12 + %11 * SIZEOF_WORD], 1 ; xmm_shadow[9] = block[jno9];
- pinsrw %36, word [r12 + %19 * SIZEOF_WORD], 1 ; xmm_shadow[17] = block[jno17];
- pinsrw %37, word [r12 + %27 * SIZEOF_WORD], 1 ; xmm_shadow[25] = block[jno25];
- pinsrw %34, word [r12 + %4 * SIZEOF_WORD], 2 ; xmm_shadow[2] = block[jno2];
- pinsrw %35, word [r12 + %12 * SIZEOF_WORD], 2 ; xmm_shadow[10] = block[jno10];
- pinsrw %36, word [r12 + %20 * SIZEOF_WORD], 2 ; xmm_shadow[18] = block[jno18];
- pinsrw %37, word [r12 + %28 * SIZEOF_WORD], 2 ; xmm_shadow[26] = block[jno26];
- pinsrw %34, word [r12 + %5 * SIZEOF_WORD], 3 ; xmm_shadow[3] = block[jno3];
- pinsrw %35, word [r12 + %13 * SIZEOF_WORD], 3 ; xmm_shadow[11] = block[jno11];
- pinsrw %36, word [r12 + %21 * SIZEOF_WORD], 3 ; xmm_shadow[19] = block[jno19];
- pinsrw %37, word [r12 + %29 * SIZEOF_WORD], 3 ; xmm_shadow[27] = block[jno27];
- pinsrw %34, word [r12 + %6 * SIZEOF_WORD], 4 ; xmm_shadow[4] = block[jno4];
- pinsrw %35, word [r12 + %14 * SIZEOF_WORD], 4 ; xmm_shadow[12] = block[jno12];
- pinsrw %36, word [r12 + %22 * SIZEOF_WORD], 4 ; xmm_shadow[20] = block[jno20];
- pinsrw %37, word [r12 + %30 * SIZEOF_WORD], 4 ; xmm_shadow[28] = block[jno28];
- pinsrw %34, word [r12 + %7 * SIZEOF_WORD], 5 ; xmm_shadow[5] = block[jno5];
- pinsrw %35, word [r12 + %15 * SIZEOF_WORD], 5 ; xmm_shadow[13] = block[jno13];
- pinsrw %36, word [r12 + %23 * SIZEOF_WORD], 5 ; xmm_shadow[21] = block[jno21];
- pinsrw %37, word [r12 + %31 * SIZEOF_WORD], 5 ; xmm_shadow[29] = block[jno29];
- pinsrw %34, word [r12 + %8 * SIZEOF_WORD], 6 ; xmm_shadow[6] = block[jno6];
- pinsrw %35, word [r12 + %16 * SIZEOF_WORD], 6 ; xmm_shadow[14] = block[jno14];
- pinsrw %36, word [r12 + %24 * SIZEOF_WORD], 6 ; xmm_shadow[22] = block[jno22];
- pinsrw %37, word [r12 + %32 * SIZEOF_WORD], 6 ; xmm_shadow[30] = block[jno30];
- pinsrw %34, word [r12 + %9 * SIZEOF_WORD], 7 ; xmm_shadow[7] = block[jno7];
- pinsrw %35, word [r12 + %17 * SIZEOF_WORD], 7 ; xmm_shadow[15] = block[jno15];
- pinsrw %36, word [r12 + %25 * SIZEOF_WORD], 7 ; xmm_shadow[23] = block[jno23];
-%if %1 != 32
- pinsrw %37, word [r12 + %33 * SIZEOF_WORD], 7 ; xmm_shadow[31] = block[jno31];
-%else
- pinsrw %37, ebx, 7 ; xmm_shadow[31] = block[jno31];
-%endif
- pcmpgtw xmm8, %34 ; neg = _mm_cmpgt_epi16(neg, x1);
- pcmpgtw xmm9, %35 ; neg = _mm_cmpgt_epi16(neg, x1);
- pcmpgtw xmm10, %36 ; neg = _mm_cmpgt_epi16(neg, x1);
- pcmpgtw xmm11, %37 ; neg = _mm_cmpgt_epi16(neg, x1);
- paddw %34, xmm8 ; x1 = _mm_add_epi16(x1, neg);
- paddw %35, xmm9 ; x1 = _mm_add_epi16(x1, neg);
- paddw %36, xmm10 ; x1 = _mm_add_epi16(x1, neg);
- paddw %37, xmm11 ; x1 = _mm_add_epi16(x1, neg);
- pxor %34, xmm8 ; x1 = _mm_xor_si128(x1, neg);
- pxor %35, xmm9 ; x1 = _mm_xor_si128(x1, neg);
- pxor %36, xmm10 ; x1 = _mm_xor_si128(x1, neg);
- pxor %37, xmm11 ; x1 = _mm_xor_si128(x1, neg);
- pxor xmm8, %34 ; neg = _mm_xor_si128(neg, x1);
- pxor xmm9, %35 ; neg = _mm_xor_si128(neg, x1);
- pxor xmm10, %36 ; neg = _mm_xor_si128(neg, x1);
- pxor xmm11, %37 ; neg = _mm_xor_si128(neg, x1);
- movdqa XMMWORD [t1 + %1 * SIZEOF_WORD], %34 ; _mm_storeu_si128((__m128i *)(t1 + ko), x1);
- movdqa XMMWORD [t1 + (%1 + 8) * SIZEOF_WORD], %35 ; _mm_storeu_si128((__m128i *)(t1 + ko + 8), x1);
- movdqa XMMWORD [t1 + (%1 + 16) * SIZEOF_WORD], %36 ; _mm_storeu_si128((__m128i *)(t1 + ko + 16), x1);
- movdqa XMMWORD [t1 + (%1 + 24) * SIZEOF_WORD], %37 ; _mm_storeu_si128((__m128i *)(t1 + ko + 24), x1);
- movdqa XMMWORD [t2 + %1 * SIZEOF_WORD], xmm8 ; _mm_storeu_si128((__m128i *)(t2 + ko), neg);
- movdqa XMMWORD [t2 + (%1 + 8) * SIZEOF_WORD], xmm9 ; _mm_storeu_si128((__m128i *)(t2 + ko + 8), neg);
- movdqa XMMWORD [t2 + (%1 + 16) * SIZEOF_WORD], xmm10 ; _mm_storeu_si128((__m128i *)(t2 + ko + 16), neg);
- movdqa XMMWORD [t2 + (%1 + 24) * SIZEOF_WORD], xmm11 ; _mm_storeu_si128((__m128i *)(t2 + ko + 24), neg);
-%endmacro
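
In scalar terms, each kloop_prepare invocation gathers 32 coefficients in zigzag order (the jno arguments) and stores their magnitudes to t1 and the to-be-emitted bit patterns to t2. A rough C equivalent (ignoring the dummy final index used to pad the second call):

    #include <stdint.h>

    /* Sketch: ko is 0 or 32; jno[] holds the 32 zigzag indices. */
    static void kloop_prepare(const int16_t *block, const int *jno, int ko,
                              int16_t *t1, int16_t *t2) {
      for (int k = 0; k < 32; k++) {
        int16_t x = block[jno[k]];
        int16_t neg = x < 0 ? -1 : 0;           /* pcmpgtw against zero */
        int16_t a = (int16_t)((x + neg) ^ neg); /* branchless abs */
        t1[ko + k] = a;                         /* magnitude */
        t2[ko + k] = (int16_t)(a ^ neg);        /* a if x >= 0, ~a if x < 0 */
      }
    }
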
-
-;
-; Encode a single block's worth of coefficients.
-;
-; GLOBAL(JOCTET*)
-; jsimd_huff_encode_one_block_sse2 (working_state *state, JOCTET *buffer,
-; JCOEFPTR block, int last_dc_val,
-; c_derived_tbl *dctbl, c_derived_tbl *actbl)
-;
-
-; r10 = working_state *state
-; r11 = JOCTET *buffer
-; r12 = JCOEFPTR block
-; r13 = int last_dc_val
-; r14 = c_derived_tbl *dctbl
-; r15 = c_derived_tbl *actbl
-
-%define t1 rbp-(DCTSIZE2*SIZEOF_WORD)
-%define t2 t1-(DCTSIZE2*SIZEOF_WORD)
-%define put_buffer r8
-%define put_bits r9d
-%define buffer rax
-
- align 16
- global EXTN(jsimd_huff_encode_one_block_sse2)
-
-EXTN(jsimd_huff_encode_one_block_sse2):
- push rbp
- mov rax,rsp ; rax = original rbp
- sub rsp, byte 4
- and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [rsp],rax
- mov rbp,rsp ; rbp = aligned rbp
- lea rsp, [t2]
- collect_args
-%ifdef WIN64
- movaps XMMWORD [rsp-1*SIZEOF_XMMWORD], xmm8
- movaps XMMWORD [rsp-2*SIZEOF_XMMWORD], xmm9
- movaps XMMWORD [rsp-3*SIZEOF_XMMWORD], xmm10
- movaps XMMWORD [rsp-4*SIZEOF_XMMWORD], xmm11
- sub rsp, 4*SIZEOF_XMMWORD
-%endif
- push rbx
-
- mov buffer, r11 ; r11 is now scratch
-
- mov put_buffer, MMWORD [r10+16] ; put_buffer = state->cur.put_buffer;
- mov put_bits, DWORD [r10+24] ; put_bits = state->cur.put_bits;
- push r10 ; r10 is now scratch
-
- ; Encode the DC coefficient difference per section F.1.2.1
- movsx edi, word [r12] ; temp = temp2 = block[0] - last_dc_val;
- sub edi, r13d ; r13 is not used anymore
- mov ebx, edi
-
- ; This is a well-known technique for obtaining the absolute value
- ; without a branch. It is derived from an assembly language technique
- ; presented in "How to Optimize for the Pentium Processors",
- ; Copyright (c) 1996, 1997 by Agner Fog.
- mov esi, edi
- sar esi, 31 ; temp3 = temp >> (CHAR_BIT * sizeof(int) - 1);
- xor edi, esi ; temp ^= temp3;
- sub edi, esi ; temp -= temp3;
-
- ; For a negative input, want temp2 = bitwise complement of abs(input)
- ; This code assumes we are on a two's complement machine
- add ebx, esi ; temp2 += temp3;
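
The same computation in C, for reference (this is exactly what the comments above describe):

    /* temp = block[0] - last_dc_val on entry. */
    static void dc_prepare(int temp, int *magnitude, int *emit_value) {
      int temp2 = temp;
      int temp3 = temp >> 31;          /* 0 if temp >= 0, -1 if temp < 0 */
      temp = (temp ^ temp3) - temp3;   /* abs(temp), no branch */
      temp2 += temp3;                  /* temp < 0: temp - 1, whose low bits are
                                          ~abs(temp) in two's complement */
      *magnitude = temp;
      *emit_value = temp2;
    }
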
-
- ; Find the number of bits needed for the magnitude of the coefficient
- lea r11, [rel jpeg_nbits_table]
- movzx rdi, byte [r11 + rdi] ; nbits = JPEG_NBITS(temp);
- ; Emit the Huffman-coded symbol for the number of bits
- mov r11d, INT [r14 + rdi * 4] ; code = dctbl->ehufco[nbits];
- movzx esi, byte [r14 + rdi + 1024] ; size = dctbl->ehufsi[nbits];
- EMIT_BITS r11, esi ; EMIT_BITS(code, size)
-
- ; Mask off any extra bits in code
- mov esi, 1
- mov ecx, edi
- shl esi, cl
- dec esi
- and ebx, esi ; temp2 &= (((JLONG) 1)<<nbits) - 1;
-
- ; Emit that number of bits of the value, if positive,
- ; or the complement of its magnitude, if negative.
- EMIT_BITS rbx, edi ; EMIT_BITS(temp2, nbits)
-
- ; Prepare data
- xor ebx, ebx
- kloop_prepare 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, \
- 18, 11, 4, 5, 12, 19, 26, 33, 40, 48, 41, 34, \
- 27, 20, 13, 6, 7, 14, 21, 28, 35, \
- xmm0, xmm1, xmm2, xmm3
- kloop_prepare 32, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, \
- 30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, \
- 53, 60, 61, 54, 47, 55, 62, 63, 63, \
- xmm4, xmm5, xmm6, xmm7
-
- pxor xmm8, xmm8
- pcmpeqw xmm0, xmm8 ; tmp0 = _mm_cmpeq_epi16(tmp0, zero);
- pcmpeqw xmm1, xmm8 ; tmp1 = _mm_cmpeq_epi16(tmp1, zero);
- pcmpeqw xmm2, xmm8 ; tmp2 = _mm_cmpeq_epi16(tmp2, zero);
- pcmpeqw xmm3, xmm8 ; tmp3 = _mm_cmpeq_epi16(tmp3, zero);
- pcmpeqw xmm4, xmm8 ; tmp4 = _mm_cmpeq_epi16(tmp4, zero);
- pcmpeqw xmm5, xmm8 ; tmp5 = _mm_cmpeq_epi16(tmp5, zero);
- pcmpeqw xmm6, xmm8 ; tmp6 = _mm_cmpeq_epi16(tmp6, zero);
- pcmpeqw xmm7, xmm8 ; tmp7 = _mm_cmpeq_epi16(tmp7, zero);
- packsswb xmm0, xmm1 ; tmp0 = _mm_packs_epi16(tmp0, tmp1);
- packsswb xmm2, xmm3 ; tmp2 = _mm_packs_epi16(tmp2, tmp3);
- packsswb xmm4, xmm5 ; tmp4 = _mm_packs_epi16(tmp4, tmp5);
- packsswb xmm6, xmm7 ; tmp6 = _mm_packs_epi16(tmp6, tmp7);
- pmovmskb r11d, xmm0 ; index = ((uint64_t)_mm_movemask_epi8(tmp0)) << 0;
- pmovmskb r12d, xmm2 ; index = ((uint64_t)_mm_movemask_epi8(tmp2)) << 16;
- pmovmskb r13d, xmm4 ; index = ((uint64_t)_mm_movemask_epi8(tmp4)) << 32;
- pmovmskb r14d, xmm6 ; index = ((uint64_t)_mm_movemask_epi8(tmp6)) << 48;
- shl r12, 16
- shl r14, 16
- or r11, r12
- or r13, r14
- shl r13, 32
- or r11, r13
- not r11 ; index = ~index;
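
The pmovmskb results above are merged into a single 64-bit mask with one bit per coefficient (set where t1[k] != 0), so the AC loop below can skip zero runs with a count-trailing-zeros instead of testing all 63 positions. A skeleton of that control flow (zeromask/t1/t2 stand for the values built above):

    /* Skeleton of the mask-driven AC loop; the emit steps are elided. */
    uint64_t index = ~zeromask;            /* bit k set => t1[k] != 0 */
    int k = 0;
    while (index) {
      int r = __builtin_ctzll(index);      /* bsf: length of the zero run */
      k += r;
      index >>= r;
      while (r > 15)                       /* runs of 16+ use the 0xF0 symbol */
        r -= 16;                           /* (emit code_0xf0 here) */
      /* emit actbl symbol (r << 4) + nbits, then nbits bits of t2[k] */
      index >>= 1;
      k++;
    }
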
-
- ;mov MMWORD [ t1 + DCTSIZE2 * SIZEOF_WORD ], r11
- ;jmp .EFN
-
- mov r13d, INT [r15 + 240 * 4] ; code_0xf0 = actbl->ehufco[0xf0];
- movzx r14d, byte [r15 + 1024 + 240] ; size_0xf0 = actbl->ehufsi[0xf0];
- lea rsi, [t1]
-.BLOOP:
- bsf r12, r11 ; r = __builtin_ctzl(index);
- jz .ELOOP
- mov rcx, r12
- lea rsi, [rsi+r12*2] ; k += r;
- shr r11, cl ; index >>= r;
- movzx rdi, word [rsi] ; temp = t1[k];
- lea rbx, [rel jpeg_nbits_table]
- movzx rdi, byte [rbx + rdi] ; nbits = JPEG_NBITS(temp);
-.BRLOOP:
- cmp r12, 16 ; while (r > 15) {
- jl .ERLOOP
- EMIT_BITS r13, r14d ; EMIT_BITS(code_0xf0, size_0xf0)
- sub r12, 16 ; r -= 16;
- jmp .BRLOOP
-.ERLOOP:
- ; Emit Huffman symbol for run length / number of bits
- CHECKBUF31 ; uses rcx, rdx
-
- shl r12, 4 ; temp3 = (r << 4) + nbits;
- add r12, rdi
- mov ebx, INT [r15 + r12 * 4] ; code = actbl->ehufco[temp3];
- movzx ecx, byte [r15 + r12 + 1024] ; size = actbl->ehufsi[temp3];
- PUT_BITS rbx
-
- ;EMIT_CODE(code, size)
-
- movsx ebx, word [rsi-DCTSIZE2*2] ; temp2 = t2[k];
- ; Mask off any extra bits in code
- mov rcx, rdi
- mov rdx, 1
- shl rdx, cl
- dec rdx
- and rbx, rdx ; temp2 &= (((JLONG) 1)<<nbits) - 1;
- PUT_BITS rbx ; PUT_BITS(temp2, nbits)
-
- shr r11, 1 ; index >>= 1;
- add rsi, 2 ; ++k;
- jmp .BLOOP
-.ELOOP:
- ; If the last coef(s) were zero, emit an end-of-block code
- lea rdi, [t1 + (DCTSIZE2-1) * 2] ; r = DCTSIZE2-1-k;
- cmp rdi, rsi ; if (r > 0) {
- je .EFN
- mov ebx, INT [r15] ; code = actbl->ehufco[0];
- movzx r12d, byte [r15 + 1024] ; size = actbl->ehufsi[0];
- EMIT_BITS rbx, r12d
-.EFN:
- pop r10
- ; Save put_buffer & put_bits
- mov MMWORD [r10+16], put_buffer ; state->cur.put_buffer = put_buffer;
- mov DWORD [r10+24], put_bits ; state->cur.put_bits = put_bits;
-
- pop rbx
-%ifdef WIN64
- movaps xmm11, XMMWORD [rsp+0*SIZEOF_XMMWORD]
- movaps xmm10, XMMWORD [rsp+1*SIZEOF_XMMWORD]
- movaps xmm9, XMMWORD [rsp+2*SIZEOF_XMMWORD]
- movaps xmm8, XMMWORD [rsp+3*SIZEOF_XMMWORD]
- add rsp, 4*SIZEOF_XMMWORD
-%endif
- uncollect_args
- mov rsp,rbp ; rsp <- aligned rbp
- pop rsp ; rsp <- original rbp
- pop rbp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
diff --git a/media/libjpeg/simd/jchuff-sse2.asm b/media/libjpeg/simd/jchuff-sse2.asm
deleted file mode 100644
index 36d1f2db66..0000000000
--- a/media/libjpeg/simd/jchuff-sse2.asm
+++ /dev/null
@@ -1,426 +0,0 @@
-;
-; jchuff-sse2.asm - Huffman entropy encoding (SSE2)
-;
-; Copyright (C) 2009-2011, 2014-2016, D. R. Commander.
-; Copyright (C) 2015, Matthieu Darbois.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler) and
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains an SSE2 implementation for Huffman coding of one block.
-; The following code is based directly on jchuff.c; see jchuff.c for more
-; details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
- alignz 16
- global EXTN(jconst_huff_encode_one_block)
-
-EXTN(jconst_huff_encode_one_block):
-
-%include "jpeg_nbits_table.inc"
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
-
-; These macros perform the same task as the emit_bits() function in the
-; original libjpeg code. In addition to reducing overhead by explicitly
-; inlining the code, additional performance is achieved by taking into
-; account the size of the bit buffer and waiting until it is almost full
-; before emptying it. This mostly benefits 64-bit platforms, since 6
-; bytes can be stored in a 64-bit bit buffer before it has to be emptied.
-
-%macro EMIT_BYTE 0
- sub put_bits, 8 ; put_bits -= 8;
- mov edx, put_buffer
- mov ecx, put_bits
- shr edx, cl ; c = (JOCTET)GETJOCTET(put_buffer >> put_bits);
- mov byte [eax], dl ; *buffer++ = c;
- add eax, 1
- cmp dl, 0xFF ; need to stuff a zero byte?
- jne %%.EMIT_BYTE_END
- mov byte [eax], 0 ; *buffer++ = 0;
- add eax, 1
-%%.EMIT_BYTE_END:
-%endmacro
-
-%macro PUT_BITS 1
- add put_bits, ecx ; put_bits += size;
- shl put_buffer, cl ; put_buffer = (put_buffer << size);
- or put_buffer, %1
-%endmacro
-
-%macro CHECKBUF15 0
- cmp put_bits, 16 ; if (put_bits > 15) {
- jl %%.CHECKBUF15_END
- mov eax, POINTER [esp+buffer]
- EMIT_BYTE
- EMIT_BYTE
- mov POINTER [esp+buffer], eax
-%%.CHECKBUF15_END:
-%endmacro
-
-%macro EMIT_BITS 1
- PUT_BITS %1
- CHECKBUF15
-%endmacro
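
Note the reversed order relative to the 64-bit variant: PUT_BITS first, then CHECKBUF15. Since put_buffer is a 32-bit register here and a single PUT_BITS adds at most 16 bits, keeping no more than 15 bits pending after each emit guarantees the buffer never overflows (15 + 16 = 31 bits worst case).
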
-
-%macro kloop_prepare 37 ;(ko, jno0, ..., jno31, xmm0, xmm1, xmm2, xmm3)
- pxor xmm4, xmm4 ; __m128i neg = _mm_setzero_si128();
- pxor xmm5, xmm5 ; __m128i neg = _mm_setzero_si128();
- pxor xmm6, xmm6 ; __m128i neg = _mm_setzero_si128();
- pxor xmm7, xmm7 ; __m128i neg = _mm_setzero_si128();
- pinsrw %34, word [esi + %2 * SIZEOF_WORD], 0 ; xmm_shadow[0] = block[jno0];
- pinsrw %35, word [esi + %10 * SIZEOF_WORD], 0 ; xmm_shadow[8] = block[jno8];
- pinsrw %36, word [esi + %18 * SIZEOF_WORD], 0 ; xmm_shadow[16] = block[jno16];
- pinsrw %37, word [esi + %26 * SIZEOF_WORD], 0 ; xmm_shadow[24] = block[jno24];
- pinsrw %34, word [esi + %3 * SIZEOF_WORD], 1 ; xmm_shadow[1] = block[jno1];
- pinsrw %35, word [esi + %11 * SIZEOF_WORD], 1 ; xmm_shadow[9] = block[jno9];
- pinsrw %36, word [esi + %19 * SIZEOF_WORD], 1 ; xmm_shadow[17] = block[jno17];
- pinsrw %37, word [esi + %27 * SIZEOF_WORD], 1 ; xmm_shadow[25] = block[jno25];
- pinsrw %34, word [esi + %4 * SIZEOF_WORD], 2 ; xmm_shadow[2] = block[jno2];
- pinsrw %35, word [esi + %12 * SIZEOF_WORD], 2 ; xmm_shadow[10] = block[jno10];
- pinsrw %36, word [esi + %20 * SIZEOF_WORD], 2 ; xmm_shadow[18] = block[jno18];
- pinsrw %37, word [esi + %28 * SIZEOF_WORD], 2 ; xmm_shadow[26] = block[jno26];
- pinsrw %34, word [esi + %5 * SIZEOF_WORD], 3 ; xmm_shadow[3] = block[jno3];
- pinsrw %35, word [esi + %13 * SIZEOF_WORD], 3 ; xmm_shadow[11] = block[jno11];
- pinsrw %36, word [esi + %21 * SIZEOF_WORD], 3 ; xmm_shadow[19] = block[jno19];
- pinsrw %37, word [esi + %29 * SIZEOF_WORD], 3 ; xmm_shadow[27] = block[jno27];
- pinsrw %34, word [esi + %6 * SIZEOF_WORD], 4 ; xmm_shadow[4] = block[jno4];
- pinsrw %35, word [esi + %14 * SIZEOF_WORD], 4 ; xmm_shadow[12] = block[jno12];
- pinsrw %36, word [esi + %22 * SIZEOF_WORD], 4 ; xmm_shadow[20] = block[jno20];
- pinsrw %37, word [esi + %30 * SIZEOF_WORD], 4 ; xmm_shadow[28] = block[jno28];
- pinsrw %34, word [esi + %7 * SIZEOF_WORD], 5 ; xmm_shadow[5] = block[jno5];
- pinsrw %35, word [esi + %15 * SIZEOF_WORD], 5 ; xmm_shadow[13] = block[jno13];
- pinsrw %36, word [esi + %23 * SIZEOF_WORD], 5 ; xmm_shadow[21] = block[jno21];
- pinsrw %37, word [esi + %31 * SIZEOF_WORD], 5 ; xmm_shadow[29] = block[jno29];
- pinsrw %34, word [esi + %8 * SIZEOF_WORD], 6 ; xmm_shadow[6] = block[jno6];
- pinsrw %35, word [esi + %16 * SIZEOF_WORD], 6 ; xmm_shadow[14] = block[jno14];
- pinsrw %36, word [esi + %24 * SIZEOF_WORD], 6 ; xmm_shadow[22] = block[jno22];
- pinsrw %37, word [esi + %32 * SIZEOF_WORD], 6 ; xmm_shadow[30] = block[jno30];
- pinsrw %34, word [esi + %9 * SIZEOF_WORD], 7 ; xmm_shadow[7] = block[jno7];
- pinsrw %35, word [esi + %17 * SIZEOF_WORD], 7 ; xmm_shadow[15] = block[jno15];
- pinsrw %36, word [esi + %25 * SIZEOF_WORD], 7 ; xmm_shadow[23] = block[jno23];
-%if %1 != 32
- pinsrw %37, word [esi + %33 * SIZEOF_WORD], 7 ; xmm_shadow[31] = block[jno31];
-%else
- pinsrw %37, ecx, 7 ; xmm_shadow[31] = block[jno31];
-%endif
- pcmpgtw xmm4, %34 ; neg = _mm_cmpgt_epi16(neg, x1);
- pcmpgtw xmm5, %35 ; neg = _mm_cmpgt_epi16(neg, x1);
- pcmpgtw xmm6, %36 ; neg = _mm_cmpgt_epi16(neg, x1);
- pcmpgtw xmm7, %37 ; neg = _mm_cmpgt_epi16(neg, x1);
- paddw %34, xmm4 ; x1 = _mm_add_epi16(x1, neg);
- paddw %35, xmm5 ; x1 = _mm_add_epi16(x1, neg);
- paddw %36, xmm6 ; x1 = _mm_add_epi16(x1, neg);
- paddw %37, xmm7 ; x1 = _mm_add_epi16(x1, neg);
- pxor %34, xmm4 ; x1 = _mm_xor_si128(x1, neg);
- pxor %35, xmm5 ; x1 = _mm_xor_si128(x1, neg);
- pxor %36, xmm6 ; x1 = _mm_xor_si128(x1, neg);
- pxor %37, xmm7 ; x1 = _mm_xor_si128(x1, neg);
- pxor xmm4, %34 ; neg = _mm_xor_si128(neg, x1);
- pxor xmm5, %35 ; neg = _mm_xor_si128(neg, x1);
- pxor xmm6, %36 ; neg = _mm_xor_si128(neg, x1);
- pxor xmm7, %37 ; neg = _mm_xor_si128(neg, x1);
- movdqa XMMWORD [esp + t1 + %1 * SIZEOF_WORD], %34 ; _mm_storeu_si128((__m128i *)(t1 + ko), x1);
- movdqa XMMWORD [esp + t1 + (%1 + 8) * SIZEOF_WORD], %35 ; _mm_storeu_si128((__m128i *)(t1 + ko + 8), x1);
- movdqa XMMWORD [esp + t1 + (%1 + 16) * SIZEOF_WORD], %36 ; _mm_storeu_si128((__m128i *)(t1 + ko + 16), x1);
- movdqa XMMWORD [esp + t1 + (%1 + 24) * SIZEOF_WORD], %37 ; _mm_storeu_si128((__m128i *)(t1 + ko + 24), x1);
- movdqa XMMWORD [esp + t2 + %1 * SIZEOF_WORD], xmm4 ; _mm_storeu_si128((__m128i *)(t2 + ko), neg);
- movdqa XMMWORD [esp + t2 + (%1 + 8) * SIZEOF_WORD], xmm5 ; _mm_storeu_si128((__m128i *)(t2 + ko + 8), neg);
- movdqa XMMWORD [esp + t2 + (%1 + 16) * SIZEOF_WORD], xmm6 ; _mm_storeu_si128((__m128i *)(t2 + ko + 16), neg);
- movdqa XMMWORD [esp + t2 + (%1 + 24) * SIZEOF_WORD], xmm7 ; _mm_storeu_si128((__m128i *)(t2 + ko + 24), neg);
-%endmacro
-
-;
-; Encode a single block's worth of coefficients.
-;
-; GLOBAL(JOCTET*)
-; jsimd_huff_encode_one_block_sse2 (working_state *state, JOCTET *buffer,
-; JCOEFPTR block, int last_dc_val,
-; c_derived_tbl *dctbl, c_derived_tbl *actbl)
-;
-
-; eax + 8 = working_state *state
-; eax + 12 = JOCTET *buffer
-; eax + 16 = JCOEFPTR block
-; eax + 20 = int last_dc_val
-; eax + 24 = c_derived_tbl *dctbl
-; eax + 28 = c_derived_tbl *actbl
-
-%define pad 6*SIZEOF_DWORD ; Align to 16 bytes
-%define t1 pad
-%define t2 t1+(DCTSIZE2*SIZEOF_WORD)
-%define block t2+(DCTSIZE2*SIZEOF_WORD)
-%define actbl block+SIZEOF_DWORD
-%define buffer actbl+SIZEOF_DWORD
-%define temp buffer+SIZEOF_DWORD
-%define temp2 temp+SIZEOF_DWORD
-%define temp3 temp2+SIZEOF_DWORD
-%define temp4 temp3+SIZEOF_DWORD
-%define temp5 temp4+SIZEOF_DWORD
-%define gotptr temp5+SIZEOF_DWORD ; void *gotptr
-%define put_buffer ebx
-%define put_bits edi
-
- align 16
- global EXTN(jsimd_huff_encode_one_block_sse2)
-
-EXTN(jsimd_huff_encode_one_block_sse2):
- push ebp
- mov eax,esp ; eax = original ebp
- sub esp, byte 4
- and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [esp],eax
- mov ebp,esp ; ebp = aligned ebp
- sub esp, temp5+9*SIZEOF_DWORD-pad
- push ebx
- push ecx
-; push edx ; need not be preserved
- push esi
- push edi
- push ebp
-
- mov esi, POINTER [eax+8] ; (working_state *state)
- mov put_buffer, DWORD [esi+8] ; put_buffer = state->cur.put_buffer;
- mov put_bits, DWORD [esi+12] ; put_bits = state->cur.put_bits;
- push esi ; esi is now scratch
-
- get_GOT edx ; get GOT address
- movpic POINTER [esp+gotptr], edx ; save GOT address
-
- mov ecx, POINTER [eax+28]
- mov edx, POINTER [eax+16]
- mov esi, POINTER [eax+12]
- mov POINTER [esp+actbl], ecx
- mov POINTER [esp+block], edx
- mov POINTER [esp+buffer], esi
-
- ; Encode the DC coefficient difference per section F.1.2.1
- mov esi, POINTER [esp+block] ; block
- movsx ecx, word [esi] ; temp = temp2 = block[0] - last_dc_val;
- sub ecx, DWORD [eax+20]
- mov esi, ecx
-
- ; This is a well-known technique for obtaining the absolute value
- ; without a branch. It is derived from an assembly language technique
- ; presented in "How to Optimize for the Pentium Processors",
- ; Copyright (c) 1996, 1997 by Agner Fog.
- mov edx, ecx
- sar edx, 31 ; temp3 = temp >> (CHAR_BIT * sizeof(int) - 1);
- xor ecx, edx ; temp ^= temp3;
- sub ecx, edx ; temp -= temp3;
-
- ; For a negative input, want temp2 = bitwise complement of abs(input)
- ; This code assumes we are on a two's complement machine
- add esi, edx ; temp2 += temp3;
- mov DWORD [esp+temp], esi ; backup temp2 in temp
-
- ; Find the number of bits needed for the magnitude of the coefficient
- movpic ebp, POINTER [esp+gotptr] ; load GOT address (ebp)
- movzx edx, byte [GOTOFF(ebp, jpeg_nbits_table + ecx)] ; nbits = JPEG_NBITS(temp);
- mov DWORD [esp+temp2], edx ; backup nbits in temp2
-
- ; Emit the Huffman-coded symbol for the number of bits
- mov ebp, POINTER [eax+24] ; After this point, arguments are not accessible anymore
- mov eax, INT [ebp + edx * 4] ; code = dctbl->ehufco[nbits];
- movzx ecx, byte [ebp + edx + 1024] ; size = dctbl->ehufsi[nbits];
- EMIT_BITS eax ; EMIT_BITS(code, size)
-
- mov ecx, DWORD [esp+temp2] ; restore nbits
-
- ; Mask off any extra bits in code
- mov eax, 1
- shl eax, cl
- dec eax
- and eax, DWORD [esp+temp] ; temp2 &= (((JLONG) 1)<<nbits) - 1;
-
- ; Emit that number of bits of the value, if positive,
- ; or the complement of its magnitude, if negative.
- EMIT_BITS eax ; EMIT_BITS(temp2, nbits)
-
- ; Prepare data
- xor ecx, ecx
- mov esi, POINTER [esp+block]
- kloop_prepare 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, \
- 18, 11, 4, 5, 12, 19, 26, 33, 40, 48, 41, 34, \
- 27, 20, 13, 6, 7, 14, 21, 28, 35, \
- xmm0, xmm1, xmm2, xmm3
- kloop_prepare 32, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, \
- 30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, \
- 53, 60, 61, 54, 47, 55, 62, 63, 63, \
- xmm0, xmm1, xmm2, xmm3
-
- pxor xmm7, xmm7
- movdqa xmm0, XMMWORD [esp + t1 + 0 * SIZEOF_WORD] ; __m128i tmp0 = _mm_loadu_si128((__m128i *)(t1 + 0));
- movdqa xmm1, XMMWORD [esp + t1 + 8 * SIZEOF_WORD] ; __m128i tmp1 = _mm_loadu_si128((__m128i *)(t1 + 8));
- movdqa xmm2, XMMWORD [esp + t1 + 16 * SIZEOF_WORD] ; __m128i tmp2 = _mm_loadu_si128((__m128i *)(t1 + 16));
- movdqa xmm3, XMMWORD [esp + t1 + 24 * SIZEOF_WORD] ; __m128i tmp3 = _mm_loadu_si128((__m128i *)(t1 + 24));
- pcmpeqw xmm0, xmm7 ; tmp0 = _mm_cmpeq_epi16(tmp0, zero);
- pcmpeqw xmm1, xmm7 ; tmp1 = _mm_cmpeq_epi16(tmp1, zero);
- pcmpeqw xmm2, xmm7 ; tmp2 = _mm_cmpeq_epi16(tmp2, zero);
- pcmpeqw xmm3, xmm7 ; tmp3 = _mm_cmpeq_epi16(tmp3, zero);
- packsswb xmm0, xmm1 ; tmp0 = _mm_packs_epi16(tmp0, tmp1);
- packsswb xmm2, xmm3 ; tmp2 = _mm_packs_epi16(tmp2, tmp3);
- pmovmskb edx, xmm0 ; index = ((uint64_t)_mm_movemask_epi8(tmp0)) << 0;
- pmovmskb ecx, xmm2 ; index = ((uint64_t)_mm_movemask_epi8(tmp2)) << 16;
- shl ecx, 16
- or edx, ecx
- not edx ; index = ~index;
-
- lea esi, [esp+t1]
- mov ebp, POINTER [esp+actbl] ; ebp = actbl
-
-.BLOOP:
- bsf ecx, edx ; r = __builtin_ctzl(index);
- jz .ELOOP
- lea esi, [esi+ecx*2] ; k += r;
- shr edx, cl ; index >>= r;
- mov DWORD [esp+temp3], edx
-.BRLOOP:
- cmp ecx, 16 ; while (r > 15) {
- jl .ERLOOP
- sub ecx, 16 ; r -= 16;
- mov DWORD [esp+temp], ecx
- mov eax, INT [ebp + 240 * 4] ; code_0xf0 = actbl->ehufco[0xf0];
- movzx ecx, byte [ebp + 1024 + 240] ; size_0xf0 = actbl->ehufsi[0xf0];
- EMIT_BITS eax ; EMIT_BITS(code_0xf0, size_0xf0)
- mov ecx, DWORD [esp+temp]
- jmp .BRLOOP
-.ERLOOP:
- movsx eax, word [esi] ; temp = t1[k];
- movpic edx, POINTER [esp+gotptr] ; load GOT address (edx)
- movzx eax, byte [GOTOFF(edx, jpeg_nbits_table + eax)] ; nbits = JPEG_NBITS(temp);
- mov DWORD [esp+temp2], eax
- ; Emit Huffman symbol for run length / number of bits
- shl ecx, 4 ; temp3 = (r << 4) + nbits;
- add ecx, eax
- mov eax, INT [ebp + ecx * 4] ; code = actbl->ehufco[temp3];
- movzx ecx, byte [ebp + ecx + 1024] ; size = actbl->ehufsi[temp3];
- EMIT_BITS eax
-
- movsx edx, word [esi+DCTSIZE2*2] ; temp2 = t2[k];
- ; Mask off any extra bits in code
- mov ecx, DWORD [esp+temp2]
- mov eax, 1
- shl eax, cl
- dec eax
- and eax, edx ; temp2 &= (((JLONG) 1)<<nbits) - 1;
- EMIT_BITS eax ; PUT_BITS(temp2, nbits)
- mov edx, DWORD [esp+temp3]
- add esi, 2 ; ++k;
- shr edx, 1 ; index >>= 1;
-
- jmp .BLOOP
-.ELOOP:
- movdqa xmm0, XMMWORD [esp + t1 + 32 * SIZEOF_WORD] ; __m128i tmp0 = _mm_loadu_si128((__m128i *)(t1 + 32));
- movdqa xmm1, XMMWORD [esp + t1 + 40 * SIZEOF_WORD] ; __m128i tmp1 = _mm_loadu_si128((__m128i *)(t1 + 40));
- movdqa xmm2, XMMWORD [esp + t1 + 48 * SIZEOF_WORD] ; __m128i tmp2 = _mm_loadu_si128((__m128i *)(t1 + 48));
- movdqa xmm3, XMMWORD [esp + t1 + 56 * SIZEOF_WORD] ; __m128i tmp3 = _mm_loadu_si128((__m128i *)(t1 + 56));
- pcmpeqw xmm0, xmm7 ; tmp0 = _mm_cmpeq_epi16(tmp0, zero);
- pcmpeqw xmm1, xmm7 ; tmp1 = _mm_cmpeq_epi16(tmp1, zero);
- pcmpeqw xmm2, xmm7 ; tmp2 = _mm_cmpeq_epi16(tmp2, zero);
- pcmpeqw xmm3, xmm7 ; tmp3 = _mm_cmpeq_epi16(tmp3, zero);
- packsswb xmm0, xmm1 ; tmp0 = _mm_packs_epi16(tmp0, tmp1);
- packsswb xmm2, xmm3 ; tmp2 = _mm_packs_epi16(tmp2, tmp3);
- pmovmskb edx, xmm0 ; index = ((uint64_t)_mm_movemask_epi8(tmp0)) << 0;
- pmovmskb ecx, xmm2 ; index = ((uint64_t)_mm_movemask_epi8(tmp2)) << 16;
- shl ecx, 16
- or edx, ecx
- not edx ; index = ~index;
-
- lea eax, [esp + t1 + (DCTSIZE2/2) * 2]
- sub eax, esi
- shr eax, 1
- bsf ecx, edx ; r = __builtin_ctzl(index);
- jz .ELOOP2
- shr edx, cl ; index >>= r;
- add ecx, eax
- lea esi, [esi+ecx*2] ; k += r;
- mov DWORD [esp+temp3], edx
- jmp .BRLOOP2
-.BLOOP2:
- bsf ecx, edx ; r = __builtin_ctzl(index);
- jz .ELOOP2
- lea esi, [esi+ecx*2] ; k += r;
- shr edx, cl ; index >>= r;
- mov DWORD [esp+temp3], edx
-.BRLOOP2:
- cmp ecx, 16 ; while (r > 15) {
- jl .ERLOOP2
- sub ecx, 16 ; r -= 16;
- mov DWORD [esp+temp], ecx
- mov eax, INT [ebp + 240 * 4] ; code_0xf0 = actbl->ehufco[0xf0];
- movzx ecx, byte [ebp + 1024 + 240] ; size_0xf0 = actbl->ehufsi[0xf0];
- EMIT_BITS eax ; EMIT_BITS(code_0xf0, size_0xf0)
- mov ecx, DWORD [esp+temp]
- jmp .BRLOOP2
-.ERLOOP2:
- movsx eax, word [esi] ; temp = t1[k];
- bsr eax, eax ; nbits = 32 - __builtin_clz(temp);
- inc eax
- mov DWORD [esp+temp2], eax
- ; Emit Huffman symbol for run length / number of bits
- shl ecx, 4 ; temp3 = (r << 4) + nbits;
- add ecx, eax
- mov eax, INT [ebp + ecx * 4] ; code = actbl->ehufco[temp3];
- movzx ecx, byte [ebp + ecx + 1024] ; size = actbl->ehufsi[temp3];
- EMIT_BITS eax
-
- movsx edx, word [esi+DCTSIZE2*2] ; temp2 = t2[k];
- ; Mask off any extra bits in code
- mov ecx, DWORD [esp+temp2]
- mov eax, 1
- shl eax, cl
- dec eax
- and eax, edx ; temp2 &= (((JLONG) 1)<<nbits) - 1;
- EMIT_BITS eax ; PUT_BITS(temp2, nbits)
- mov edx, DWORD [esp+temp3]
- add esi, 2 ; ++k;
- shr edx, 1 ; index >>= 1;
-
- jmp .BLOOP2
-.ELOOP2:
- ; If the last coef(s) were zero, emit an end-of-block code
- lea edx, [esp + t1 + (DCTSIZE2-1) * 2] ; r = DCTSIZE2-1-k;
- cmp edx, esi ; if (r > 0) {
- je .EFN
- mov eax, INT [ebp] ; code = actbl->ehufco[0];
- movzx ecx, byte [ebp + 1024] ; size = actbl->ehufsi[0];
- EMIT_BITS eax
-.EFN:
- mov eax, [esp+buffer]
- pop esi
- ; Save put_buffer & put_bits
- mov DWORD [esi+8], put_buffer ; state->cur.put_buffer = put_buffer;
- mov DWORD [esi+12], put_bits ; state->cur.put_bits = put_bits;
-
- pop ebp
- pop edi
- pop esi
-; pop edx ; need not be preserved
- pop ecx
- pop ebx
- mov esp,ebp ; esp <- aligned ebp
- pop esp ; esp <- original ebp
- pop ebp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
diff --git a/media/libjpeg/simd/jcolsamp.inc b/media/libjpeg/simd/jcolsamp.inc
deleted file mode 100644
index 3be446e847..0000000000
--- a/media/libjpeg/simd/jcolsamp.inc
+++ /dev/null
@@ -1,104 +0,0 @@
-;
-; jcolsamp.inc - private declarations for color conversion & up/downsampling
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; [TAB8]
-
-; --------------------------------------------------------------------------
-
-; pseudo-registers to make ordering of RGB configurable
-;
-%if RGB_RED == 0
-%define mmA mm0
-%define mmB mm1
-%define xmmA xmm0
-%define xmmB xmm1
-%elif RGB_GREEN == 0
-%define mmA mm2
-%define mmB mm3
-%define xmmA xmm2
-%define xmmB xmm3
-%elif RGB_BLUE == 0
-%define mmA mm4
-%define mmB mm5
-%define xmmA xmm4
-%define xmmB xmm5
-%else
-%define mmA mm6
-%define mmB mm7
-%define xmmA xmm6
-%define xmmB xmm7
-%endif
-
-%if RGB_RED == 1
-%define mmC mm0
-%define mmD mm1
-%define xmmC xmm0
-%define xmmD xmm1
-%elif RGB_GREEN == 1
-%define mmC mm2
-%define mmD mm3
-%define xmmC xmm2
-%define xmmD xmm3
-%elif RGB_BLUE == 1
-%define mmC mm4
-%define mmD mm5
-%define xmmC xmm4
-%define xmmD xmm5
-%else
-%define mmC mm6
-%define mmD mm7
-%define xmmC xmm6
-%define xmmD xmm7
-%endif
-
-%if RGB_RED == 2
-%define mmE mm0
-%define mmF mm1
-%define xmmE xmm0
-%define xmmF xmm1
-%elif RGB_GREEN == 2
-%define mmE mm2
-%define mmF mm3
-%define xmmE xmm2
-%define xmmF xmm3
-%elif RGB_BLUE == 2
-%define mmE mm4
-%define mmF mm5
-%define xmmE xmm4
-%define xmmF xmm5
-%else
-%define mmE mm6
-%define mmF mm7
-%define xmmE xmm6
-%define xmmF xmm7
-%endif
-
-%if RGB_RED == 3
-%define mmG mm0
-%define mmH mm1
-%define xmmG xmm0
-%define xmmH xmm1
-%elif RGB_GREEN == 3
-%define mmG mm2
-%define mmH mm3
-%define xmmG xmm2
-%define xmmH xmm3
-%elif RGB_BLUE == 3
-%define mmG mm4
-%define mmH mm5
-%define xmmG xmm4
-%define xmmH xmm5
-%else
-%define mmG mm6
-%define mmH mm7
-%define xmmG xmm6
-%define xmmH xmm7
-%endif
-
-; --------------------------------------------------------------------------
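
For instance, with the plain RGB layout (RGB_RED == 0, RGB_GREEN == 1, RGB_BLUE == 2) these resolve to xmmA/xmmB = xmm0/xmm1 for red, xmmC/xmmD = xmm2/xmm3 for green, xmmE/xmmF = xmm4/xmm5 for blue, and xmmG/xmmH = xmm6/xmm7 for the unused fourth byte position; with BGR the A and E pairs swap. The conversion kernels can thus be written once against the pseudo-registers and assembled separately for each pixel order.
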
diff --git a/media/libjpeg/simd/jcsample-mmx.asm b/media/libjpeg/simd/jcsample-mmx.asm
deleted file mode 100644
index 6cd544e74d..0000000000
--- a/media/libjpeg/simd/jcsample-mmx.asm
+++ /dev/null
@@ -1,323 +0,0 @@
-;
-; jcsample.asm - downsampling (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler) and
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
-;
-; Downsample pixel values of a single component.
-; This version handles the common case of 2:1 horizontal and 1:1 vertical,
-; without smoothing.
-;
-; GLOBAL(void)
-; jsimd_h2v1_downsample_mmx (JDIMENSION image_width, int max_v_samp_factor,
-; JDIMENSION v_samp_factor, JDIMENSION width_blocks,
-; JSAMPARRAY input_data, JSAMPARRAY output_data);
-;
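
In scalar terms this pass averages horizontal sample pairs with a 0/1 bias that alternates per output column (the {0, 1, 0, 1} pattern loaded below), so rounding does not drift in one direction. A rough C equivalent:

    typedef unsigned char JSAMPLE;
    static void h2v1_downsample_row(const JSAMPLE *inptr, JSAMPLE *outptr,
                                    unsigned output_cols) {
      for (unsigned col = 0; col < output_cols; col++)   /* bias = col & 1 */
        outptr[col] = (JSAMPLE)((inptr[2*col] + inptr[2*col+1] + (col & 1)) >> 1);
    }
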
-
-%define img_width(b) (b)+8 ; JDIMENSION image_width
-%define max_v_samp(b) (b)+12 ; int max_v_samp_factor
-%define v_samp(b) (b)+16 ; JDIMENSION v_samp_factor
-%define width_blks(b) (b)+20 ; JDIMENSION width_blocks
-%define input_data(b) (b)+24 ; JSAMPARRAY input_data
-%define output_data(b) (b)+28 ; JSAMPARRAY output_data
-
- align 16
- global EXTN(jsimd_h2v1_downsample_mmx)
-
-EXTN(jsimd_h2v1_downsample_mmx):
- push ebp
- mov ebp,esp
-; push ebx ; unused
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- mov ecx, JDIMENSION [width_blks(ebp)]
- shl ecx,3 ; imul ecx,DCTSIZE (ecx = output_cols)
- jz near .return
-
- mov edx, JDIMENSION [img_width(ebp)]
-
- ; -- expand_right_edge
-
- push ecx
- shl ecx,1 ; output_cols * 2
- sub ecx,edx
- jle short .expand_end
-
- mov eax, INT [max_v_samp(ebp)]
- test eax,eax
- jle short .expand_end
-
- cld
- mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
- alignx 16,7
-.expandloop:
- push eax
- push ecx
-
- mov edi, JSAMPROW [esi]
- add edi,edx
- mov al, JSAMPLE [edi-1]
-
- rep stosb
-
- pop ecx
- pop eax
-
- add esi, byte SIZEOF_JSAMPROW
- dec eax
- jg short .expandloop
-
-.expand_end:
- pop ecx ; output_cols
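
The expand_right_edge step above (the rep stosb loop) pads every input row out to output_cols * 2 samples by replicating the last real column, so the column loop can always consume full MMWORDs. In C, roughly:

    typedef unsigned char JSAMPLE;
    static void expand_right_edge(JSAMPLE **rows, int num_rows,
                                  unsigned image_width, unsigned padded_width) {
      for (int r = 0; r < num_rows; r++) {
        JSAMPLE last = rows[r][image_width - 1];
        for (unsigned c = image_width; c < padded_width; c++)
          rows[r][c] = last;                 /* replicate the edge sample */
      }
    }
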
-
- ; -- h2v1_downsample
-
- mov eax, JDIMENSION [v_samp(ebp)] ; rowctr
- test eax,eax
- jle near .return
-
- mov edx, 0x00010000 ; bias pattern
- movd mm7,edx
- pcmpeqw mm6,mm6
- punpckldq mm7,mm7 ; mm7={0, 1, 0, 1}
- psrlw mm6,BYTE_BIT ; mm6={0xFF 0x00 0xFF 0x00 ..}
-
- mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
- mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
- alignx 16,7
-.rowloop:
- push ecx
- push edi
- push esi
-
- mov esi, JSAMPROW [esi] ; inptr
- mov edi, JSAMPROW [edi] ; outptr
- alignx 16,7
-.columnloop:
-
- movq mm0, MMWORD [esi+0*SIZEOF_MMWORD]
- movq mm1, MMWORD [esi+1*SIZEOF_MMWORD]
- movq mm2,mm0
- movq mm3,mm1
-
- pand mm0,mm6
- psrlw mm2,BYTE_BIT
- pand mm1,mm6
- psrlw mm3,BYTE_BIT
-
- paddw mm0,mm2
- paddw mm1,mm3
- paddw mm0,mm7
- paddw mm1,mm7
- psrlw mm0,1
- psrlw mm1,1
-
- packuswb mm0,mm1
-
- movq MMWORD [edi+0*SIZEOF_MMWORD], mm0
-
- add esi, byte 2*SIZEOF_MMWORD ; inptr
- add edi, byte 1*SIZEOF_MMWORD ; outptr
- sub ecx, byte SIZEOF_MMWORD ; outcol
- jnz short .columnloop
-
- pop esi
- pop edi
- pop ecx
-
- add esi, byte SIZEOF_JSAMPROW ; input_data
- add edi, byte SIZEOF_JSAMPROW ; output_data
- dec eax ; rowctr
- jg short .rowloop
-
- emms ; empty MMX state
-
-.return:
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
-; pop ebx ; unused
- pop ebp
- ret
-
-; --------------------------------------------------------------------------
-;
-; Downsample pixel values of a single component.
-; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
-; without smoothing.
-;
-; GLOBAL(void)
-; jsimd_h2v2_downsample_mmx (JDIMENSION image_width, int max_v_samp_factor,
-; JDIMENSION v_samp_factor, JDIMENSION width_blocks,
-; JSAMPARRAY input_data, JSAMPARRAY output_data);
-;
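
Scalar equivalent: each output sample is the average of a 2x2 input block, with the bias alternating 1, 2 (the {1, 2, 1, 2} pattern below) to balance rounding. Roughly:

    typedef unsigned char JSAMPLE;
    static void h2v2_downsample_row(const JSAMPLE *inptr0, const JSAMPLE *inptr1,
                                    JSAMPLE *outptr, unsigned output_cols) {
      for (unsigned col = 0; col < output_cols; col++) {
        int sum = inptr0[2*col] + inptr0[2*col+1] +
                  inptr1[2*col] + inptr1[2*col+1];
        outptr[col] = (JSAMPLE)((sum + 1 + (col & 1)) >> 2);  /* bias 1 or 2 */
      }
    }
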
-
-%define img_width(b) (b)+8 ; JDIMENSION image_width
-%define max_v_samp(b) (b)+12 ; int max_v_samp_factor
-%define v_samp(b) (b)+16 ; JDIMENSION v_samp_factor
-%define width_blks(b) (b)+20 ; JDIMENSION width_blocks
-%define input_data(b) (b)+24 ; JSAMPARRAY input_data
-%define output_data(b) (b)+28 ; JSAMPARRAY output_data
-
- align 16
- global EXTN(jsimd_h2v2_downsample_mmx)
-
-EXTN(jsimd_h2v2_downsample_mmx):
- push ebp
- mov ebp,esp
-; push ebx ; unused
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- mov ecx, JDIMENSION [width_blks(ebp)]
- shl ecx,3 ; imul ecx,DCTSIZE (ecx = output_cols)
- jz near .return
-
- mov edx, JDIMENSION [img_width(ebp)]
-
- ; -- expand_right_edge
-
- push ecx
- shl ecx,1 ; output_cols * 2
- sub ecx,edx
- jle short .expand_end
-
- mov eax, INT [max_v_samp(ebp)]
- test eax,eax
- jle short .expand_end
-
- cld
- mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
- alignx 16,7
-.expandloop:
- push eax
- push ecx
-
- mov edi, JSAMPROW [esi]
- add edi,edx
- mov al, JSAMPLE [edi-1]
-
- rep stosb
-
- pop ecx
- pop eax
-
- add esi, byte SIZEOF_JSAMPROW
- dec eax
- jg short .expandloop
-
-.expand_end:
- pop ecx ; output_cols
-
- ; -- h2v2_downsample
-
- mov eax, JDIMENSION [v_samp(ebp)] ; rowctr
- test eax,eax
- jle near .return
-
- mov edx, 0x00020001 ; bias pattern
- movd mm7,edx
- pcmpeqw mm6,mm6
- punpckldq mm7,mm7 ; mm7={1, 2, 1, 2}
- psrlw mm6,BYTE_BIT ; mm6={0xFF 0x00 0xFF 0x00 ..}
-
- mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
- mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
- alignx 16,7
-.rowloop:
- push ecx
- push edi
- push esi
-
- mov edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0
- mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1
- mov edi, JSAMPROW [edi] ; outptr
- alignx 16,7
-.columnloop:
-
- movq mm0, MMWORD [edx+0*SIZEOF_MMWORD]
- movq mm1, MMWORD [esi+0*SIZEOF_MMWORD]
- movq mm2, MMWORD [edx+1*SIZEOF_MMWORD]
- movq mm3, MMWORD [esi+1*SIZEOF_MMWORD]
-
- movq mm4,mm0
- movq mm5,mm1
- pand mm0,mm6
- psrlw mm4,BYTE_BIT
- pand mm1,mm6
- psrlw mm5,BYTE_BIT
- paddw mm0,mm4
- paddw mm1,mm5
-
- movq mm4,mm2
- movq mm5,mm3
- pand mm2,mm6
- psrlw mm4,BYTE_BIT
- pand mm3,mm6
- psrlw mm5,BYTE_BIT
- paddw mm2,mm4
- paddw mm3,mm5
-
- paddw mm0,mm1
- paddw mm2,mm3
- paddw mm0,mm7
- paddw mm2,mm7
- psrlw mm0,2
- psrlw mm2,2
-
- packuswb mm0,mm2
-
- movq MMWORD [edi+0*SIZEOF_MMWORD], mm0
-
- add edx, byte 2*SIZEOF_MMWORD ; inptr0
- add esi, byte 2*SIZEOF_MMWORD ; inptr1
- add edi, byte 1*SIZEOF_MMWORD ; outptr
- sub ecx, byte SIZEOF_MMWORD ; outcol
- jnz near .columnloop
-
- pop esi
- pop edi
- pop ecx
-
- add esi, byte 2*SIZEOF_JSAMPROW ; input_data
- add edi, byte 1*SIZEOF_JSAMPROW ; output_data
- dec eax ; rowctr
- jg near .rowloop
-
- emms ; empty MMX state
-
-.return:
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
-; pop ebx ; unused
- pop ebp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
diff --git a/media/libjpeg/simd/jcsample-sse2-64.asm b/media/libjpeg/simd/jcsample-sse2-64.asm
deleted file mode 100644
index 40ee15fcbb..0000000000
--- a/media/libjpeg/simd/jcsample-sse2-64.asm
+++ /dev/null
@@ -1,329 +0,0 @@
-;
-; jcsample.asm - downsampling (64-bit SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler) and
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 64
-;
-; Downsample pixel values of a single component.
-; This version handles the common case of 2:1 horizontal and 1:1 vertical,
-; without smoothing.
-;
-; GLOBAL(void)
-; jsimd_h2v1_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
-; JDIMENSION v_samp_factor, JDIMENSION width_blocks,
-; JSAMPARRAY input_data, JSAMPARRAY output_data);
-;
-
-; r10 = JDIMENSION image_width
-; r11 = int max_v_samp_factor
-; r12 = JDIMENSION v_samp_factor
-; r13 = JDIMENSION width_blocks
-; r14 = JSAMPARRAY input_data
-; r15 = JSAMPARRAY output_data
-
- align 16
- global EXTN(jsimd_h2v1_downsample_sse2)
-
-EXTN(jsimd_h2v1_downsample_sse2):
- push rbp
- mov rax,rsp
- mov rbp,rsp
- collect_args
-
- mov ecx, r13d
- shl rcx,3 ; imul rcx,DCTSIZE (rcx = output_cols)
- jz near .return
-
- mov edx, r10d
-
- ; -- expand_right_edge
-
- push rcx
- shl rcx,1 ; output_cols * 2
- sub rcx,rdx
- jle short .expand_end
-
- mov rax, r11
- test rax,rax
- jle short .expand_end
-
- cld
- mov rsi, r14 ; input_data
-.expandloop:
- push rax
- push rcx
-
- mov rdi, JSAMPROW [rsi]
- add rdi,rdx
- mov al, JSAMPLE [rdi-1]
-
- rep stosb
-
- pop rcx
- pop rax
-
- add rsi, byte SIZEOF_JSAMPROW
- dec rax
- jg short .expandloop
-
-.expand_end:
- pop rcx ; output_cols
-
- ; -- h2v1_downsample
-
- mov eax, r12d ; rowctr
- test eax,eax
- jle near .return
-
- mov rdx, 0x00010000 ; bias pattern
- movd xmm7,edx
- pcmpeqw xmm6,xmm6
- pshufd xmm7,xmm7,0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
- psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
-
- mov rsi, r14 ; input_data
- mov rdi, r15 ; output_data
-.rowloop:
- push rcx
- push rdi
- push rsi
-
- mov rsi, JSAMPROW [rsi] ; inptr
- mov rdi, JSAMPROW [rdi] ; outptr
-
- cmp rcx, byte SIZEOF_XMMWORD
- jae short .columnloop
-
-.columnloop_r8:
- movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
- pxor xmm1,xmm1
- mov rcx, SIZEOF_XMMWORD
- jmp short .downsample
-
-.columnloop:
- movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
- movdqa xmm1, XMMWORD [rsi+1*SIZEOF_XMMWORD]
-
-.downsample:
- movdqa xmm2,xmm0
- movdqa xmm3,xmm1
-
- pand xmm0,xmm6
- psrlw xmm2,BYTE_BIT
- pand xmm1,xmm6
- psrlw xmm3,BYTE_BIT
-
- paddw xmm0,xmm2
- paddw xmm1,xmm3
- paddw xmm0,xmm7
- paddw xmm1,xmm7
- psrlw xmm0,1
- psrlw xmm1,1
-
- packuswb xmm0,xmm1
-
- movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
-
- sub rcx, byte SIZEOF_XMMWORD ; outcol
- add rsi, byte 2*SIZEOF_XMMWORD ; inptr
- add rdi, byte 1*SIZEOF_XMMWORD ; outptr
- cmp rcx, byte SIZEOF_XMMWORD
- jae short .columnloop
- test rcx,rcx
- jnz short .columnloop_r8
-
- pop rsi
- pop rdi
- pop rcx
-
- add rsi, byte SIZEOF_JSAMPROW ; input_data
- add rdi, byte SIZEOF_JSAMPROW ; output_data
- dec rax ; rowctr
- jg near .rowloop
-
-.return:
- uncollect_args
- pop rbp
- ret
-
-; --------------------------------------------------------------------------
-;
-; Downsample pixel values of a single component.
-; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
-; without smoothing.
-;
-; GLOBAL(void)
-; jsimd_h2v2_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
-; JDIMENSION v_samp_factor, JDIMENSION width_blocks,
-; JSAMPARRAY input_data, JSAMPARRAY output_data);
-;
-
-; r10 = JDIMENSION image_width
-; r11 = int max_v_samp_factor
-; r12 = JDIMENSION v_samp_factor
-; r13 = JDIMENSION width_blocks
-; r14 = JSAMPARRAY input_data
-; r15 = JSAMPARRAY output_data
-
- align 16
- global EXTN(jsimd_h2v2_downsample_sse2)
-
-EXTN(jsimd_h2v2_downsample_sse2):
- push rbp
- mov rax,rsp
- mov rbp,rsp
- collect_args
-
- mov ecx, r13d
- shl rcx,3 ; imul rcx,DCTSIZE (rcx = output_cols)
- jz near .return
-
- mov edx, r10d
-
- ; -- expand_right_edge
-
- push rcx
- shl rcx,1 ; output_cols * 2
- sub rcx,rdx
- jle short .expand_end
-
- mov rax, r11
- test rax,rax
- jle short .expand_end
-
- cld
- mov rsi, r14 ; input_data
-.expandloop:
- push rax
- push rcx
-
- mov rdi, JSAMPROW [rsi]
- add rdi,rdx
- mov al, JSAMPLE [rdi-1]
-
- rep stosb
-
- pop rcx
- pop rax
-
- add rsi, byte SIZEOF_JSAMPROW
- dec rax
- jg short .expandloop
-
-.expand_end:
- pop rcx ; output_cols
-
- ; -- h2v2_downsample
-
- mov eax, r12d ; rowctr
- test rax,rax
- jle near .return
-
- mov rdx, 0x00020001 ; bias pattern
- movd xmm7,edx
- pcmpeqw xmm6,xmm6
- pshufd xmm7,xmm7,0x00 ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
- psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
-
- mov rsi, r14 ; input_data
- mov rdi, r15 ; output_data
-.rowloop:
- push rcx
- push rdi
- push rsi
-
- mov rdx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; inptr0
- mov rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; inptr1
- mov rdi, JSAMPROW [rdi] ; outptr
-
- cmp rcx, byte SIZEOF_XMMWORD
- jae short .columnloop
-
-.columnloop_r8:
- movdqa xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
- movdqa xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
- pxor xmm2,xmm2
- pxor xmm3,xmm3
- mov rcx, SIZEOF_XMMWORD
- jmp short .downsample
-
-.columnloop:
- movdqa xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
- movdqa xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
- movdqa xmm2, XMMWORD [rdx+1*SIZEOF_XMMWORD]
- movdqa xmm3, XMMWORD [rsi+1*SIZEOF_XMMWORD]
-
-.downsample:
- movdqa xmm4,xmm0
- movdqa xmm5,xmm1
- pand xmm0,xmm6
- psrlw xmm4,BYTE_BIT
- pand xmm1,xmm6
- psrlw xmm5,BYTE_BIT
- paddw xmm0,xmm4
- paddw xmm1,xmm5
-
- movdqa xmm4,xmm2
- movdqa xmm5,xmm3
- pand xmm2,xmm6
- psrlw xmm4,BYTE_BIT
- pand xmm3,xmm6
- psrlw xmm5,BYTE_BIT
- paddw xmm2,xmm4
- paddw xmm3,xmm5
-
- paddw xmm0,xmm1
- paddw xmm2,xmm3
- paddw xmm0,xmm7
- paddw xmm2,xmm7
- psrlw xmm0,2
- psrlw xmm2,2
-
- packuswb xmm0,xmm2
-
- movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
-
- sub rcx, byte SIZEOF_XMMWORD ; outcol
- add rdx, byte 2*SIZEOF_XMMWORD ; inptr0
- add rsi, byte 2*SIZEOF_XMMWORD ; inptr1
- add rdi, byte 1*SIZEOF_XMMWORD ; outptr
- cmp rcx, byte SIZEOF_XMMWORD
- jae near .columnloop
- test rcx,rcx
- jnz near .columnloop_r8
-
- pop rsi
- pop rdi
- pop rcx
-
- add rsi, byte 2*SIZEOF_JSAMPROW ; input_data
- add rdi, byte 1*SIZEOF_JSAMPROW ; output_data
- dec rax ; rowctr
- jg near .rowloop
-
-.return:
- uncollect_args
- pop rbp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
diff --git a/media/libjpeg/simd/jcsample-sse2.asm b/media/libjpeg/simd/jcsample-sse2.asm
deleted file mode 100644
index 83c9d152a7..0000000000
--- a/media/libjpeg/simd/jcsample-sse2.asm
+++ /dev/null
@@ -1,350 +0,0 @@
-;
-; jcsample.asm - downsampling (SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler) and
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
-;
-; Downsample pixel values of a single component.
-; This version handles the common case of 2:1 horizontal and 1:1 vertical,
-; without smoothing.
-;
-; GLOBAL(void)
-; jsimd_h2v1_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
-; JDIMENSION v_samp_factor, JDIMENSION width_blocks,
-; JSAMPARRAY input_data, JSAMPARRAY output_data);
-;
-
-%define img_width(b) (b)+8 ; JDIMENSION image_width
-%define max_v_samp(b) (b)+12 ; int max_v_samp_factor
-%define v_samp(b) (b)+16 ; JDIMENSION v_samp_factor
-%define width_blks(b) (b)+20 ; JDIMENSION width_blocks
-%define input_data(b) (b)+24 ; JSAMPARRAY input_data
-%define output_data(b) (b)+28 ; JSAMPARRAY output_data
-
- align 16
- global EXTN(jsimd_h2v1_downsample_sse2)
-
-EXTN(jsimd_h2v1_downsample_sse2):
- push ebp
- mov ebp,esp
-; push ebx ; unused
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- mov ecx, JDIMENSION [width_blks(ebp)]
- shl ecx,3 ; imul ecx,DCTSIZE (ecx = output_cols)
- jz near .return
-
- mov edx, JDIMENSION [img_width(ebp)]
-
- ; -- expand_right_edge
-
- push ecx
- shl ecx,1 ; output_cols * 2
- sub ecx,edx
- jle short .expand_end
-
- mov eax, INT [max_v_samp(ebp)]
- test eax,eax
- jle short .expand_end
-
- cld
- mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
- alignx 16,7
-.expandloop:
- push eax
- push ecx
-
- mov edi, JSAMPROW [esi]
- add edi,edx
- mov al, JSAMPLE [edi-1]
-
- rep stosb
-
- pop ecx
- pop eax
-
- add esi, byte SIZEOF_JSAMPROW
- dec eax
- jg short .expandloop
-
-.expand_end:
- pop ecx ; output_cols
-
- ; -- h2v1_downsample
-
- mov eax, JDIMENSION [v_samp(ebp)] ; rowctr
- test eax,eax
- jle near .return
-
- mov edx, 0x00010000 ; bias pattern
- movd xmm7,edx
- pcmpeqw xmm6,xmm6
- pshufd xmm7,xmm7,0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
- psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
-
- mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
- mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
- alignx 16,7
-.rowloop:
- push ecx
- push edi
- push esi
-
- mov esi, JSAMPROW [esi] ; inptr
- mov edi, JSAMPROW [edi] ; outptr
-
- cmp ecx, byte SIZEOF_XMMWORD
- jae short .columnloop
- alignx 16,7
-
-.columnloop_r8:
- movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
- pxor xmm1,xmm1
- mov ecx, SIZEOF_XMMWORD
- jmp short .downsample
- alignx 16,7
-
-.columnloop:
- movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
- movdqa xmm1, XMMWORD [esi+1*SIZEOF_XMMWORD]
-
-.downsample:
- movdqa xmm2,xmm0
- movdqa xmm3,xmm1
-
- pand xmm0,xmm6
- psrlw xmm2,BYTE_BIT
- pand xmm1,xmm6
- psrlw xmm3,BYTE_BIT
-
- paddw xmm0,xmm2
- paddw xmm1,xmm3
- paddw xmm0,xmm7
- paddw xmm1,xmm7
- psrlw xmm0,1
- psrlw xmm1,1
-
- packuswb xmm0,xmm1
-
- movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
-
- sub ecx, byte SIZEOF_XMMWORD ; outcol
- add esi, byte 2*SIZEOF_XMMWORD ; inptr
- add edi, byte 1*SIZEOF_XMMWORD ; outptr
- cmp ecx, byte SIZEOF_XMMWORD
- jae short .columnloop
- test ecx,ecx
- jnz short .columnloop_r8
-
- pop esi
- pop edi
- pop ecx
-
- add esi, byte SIZEOF_JSAMPROW ; input_data
- add edi, byte SIZEOF_JSAMPROW ; output_data
- dec eax ; rowctr
- jg near .rowloop
-
-.return:
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
-; pop ebx ; unused
- pop ebp
- ret
-
-; --------------------------------------------------------------------------
-;
-; Downsample pixel values of a single component.
-; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
-; without smoothing.
-;
-; GLOBAL(void)
-; jsimd_h2v2_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
-; JDIMENSION v_samp_factor, JDIMENSION width_blocks,
-; JSAMPARRAY input_data, JSAMPARRAY output_data);
-;
-
-%define img_width(b) (b)+8 ; JDIMENSION image_width
-%define max_v_samp(b) (b)+12 ; int max_v_samp_factor
-%define v_samp(b) (b)+16 ; JDIMENSION v_samp_factor
-%define width_blks(b) (b)+20 ; JDIMENSION width_blocks
-%define input_data(b) (b)+24 ; JSAMPARRAY input_data
-%define output_data(b) (b)+28 ; JSAMPARRAY output_data
-
- align 16
- global EXTN(jsimd_h2v2_downsample_sse2)
-
-EXTN(jsimd_h2v2_downsample_sse2):
- push ebp
- mov ebp,esp
-; push ebx ; unused
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- mov ecx, JDIMENSION [width_blks(ebp)]
- shl ecx,3 ; imul ecx,DCTSIZE (ecx = output_cols)
- jz near .return
-
- mov edx, JDIMENSION [img_width(ebp)]
-
- ; -- expand_right_edge
-
- push ecx
- shl ecx,1 ; output_cols * 2
- sub ecx,edx
- jle short .expand_end
-
- mov eax, INT [max_v_samp(ebp)]
- test eax,eax
- jle short .expand_end
-
- cld
- mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
- alignx 16,7
-.expandloop:
- push eax
- push ecx
-
- mov edi, JSAMPROW [esi]
- add edi,edx
- mov al, JSAMPLE [edi-1]
-
- rep stosb
-
- pop ecx
- pop eax
-
- add esi, byte SIZEOF_JSAMPROW
- dec eax
- jg short .expandloop
-
-.expand_end:
- pop ecx ; output_cols
-
- ; -- h2v2_downsample
-
- mov eax, JDIMENSION [v_samp(ebp)] ; rowctr
- test eax,eax
- jle near .return
-
- mov edx, 0x00020001 ; bias pattern
- movd xmm7,edx
- pcmpeqw xmm6,xmm6
- pshufd xmm7,xmm7,0x00 ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
- psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
-
- mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
- mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
- alignx 16,7
-.rowloop:
- push ecx
- push edi
- push esi
-
- mov edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0
- mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1
- mov edi, JSAMPROW [edi] ; outptr
-
- cmp ecx, byte SIZEOF_XMMWORD
- jae short .columnloop
- alignx 16,7
-
-.columnloop_r8:
- movdqa xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
- movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
- pxor xmm2,xmm2
- pxor xmm3,xmm3
- mov ecx, SIZEOF_XMMWORD
- jmp short .downsample
- alignx 16,7
-
-.columnloop:
- movdqa xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
- movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
- movdqa xmm2, XMMWORD [edx+1*SIZEOF_XMMWORD]
- movdqa xmm3, XMMWORD [esi+1*SIZEOF_XMMWORD]
-
-.downsample:
- movdqa xmm4,xmm0
- movdqa xmm5,xmm1
- pand xmm0,xmm6
- psrlw xmm4,BYTE_BIT
- pand xmm1,xmm6
- psrlw xmm5,BYTE_BIT
- paddw xmm0,xmm4
- paddw xmm1,xmm5
-
- movdqa xmm4,xmm2
- movdqa xmm5,xmm3
- pand xmm2,xmm6
- psrlw xmm4,BYTE_BIT
- pand xmm3,xmm6
- psrlw xmm5,BYTE_BIT
- paddw xmm2,xmm4
- paddw xmm3,xmm5
-
- paddw xmm0,xmm1
- paddw xmm2,xmm3
- paddw xmm0,xmm7
- paddw xmm2,xmm7
- psrlw xmm0,2
- psrlw xmm2,2
-
- packuswb xmm0,xmm2
-
- movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
-
- sub ecx, byte SIZEOF_XMMWORD ; outcol
- add edx, byte 2*SIZEOF_XMMWORD ; inptr0
- add esi, byte 2*SIZEOF_XMMWORD ; inptr1
- add edi, byte 1*SIZEOF_XMMWORD ; outptr
- cmp ecx, byte SIZEOF_XMMWORD
- jae near .columnloop
- test ecx,ecx
- jnz near .columnloop_r8
-
- pop esi
- pop edi
- pop ecx
-
- add esi, byte 2*SIZEOF_JSAMPROW ; input_data
- add edi, byte 1*SIZEOF_JSAMPROW ; output_data
- dec eax ; rowctr
- jg near .rowloop
-
-.return:
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
-; pop ebx ; unused
- pop ebp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
diff --git a/media/libjpeg/simd/jdcolext-mmx.asm b/media/libjpeg/simd/jdcolext-mmx.asm
deleted file mode 100644
index 21e34f6786..0000000000
--- a/media/libjpeg/simd/jdcolext-mmx.asm
+++ /dev/null
@@ -1,404 +0,0 @@
-;
-; jdcolext.asm - colorspace conversion (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler) and
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jcolsamp.inc"
-
-; --------------------------------------------------------------------------
-;
-; Convert some rows of samples to the output colorspace.
-;
-; GLOBAL(void)
-; jsimd_ycc_rgb_convert_mmx (JDIMENSION out_width,
-; JSAMPIMAGE input_buf, JDIMENSION input_row,
-; JSAMPARRAY output_buf, int num_rows)
-;
-
-%define out_width(b) (b)+8 ; JDIMENSION out_width
-%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf
-%define input_row(b) (b)+16 ; JDIMENSION input_row
-%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf
-%define num_rows(b) (b)+24 ; int num_rows
-
-%define original_ebp ebp+0
-%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM]
-%define WK_NUM 2
-%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
-
- align 16
- global EXTN(jsimd_ycc_rgb_convert_mmx)
-
-EXTN(jsimd_ycc_rgb_convert_mmx):
- push ebp
- mov eax,esp ; eax = original ebp
- sub esp, byte 4
- and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
- mov [esp],eax
- mov ebp,esp ; ebp = aligned ebp
- lea esp, [wk(0)]
- pushpic eax ; make room for the GOT address
- push ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- get_GOT ebx ; get GOT address
- movpic POINTER [gotptr], ebx ; save GOT address
-
- mov ecx, JDIMENSION [out_width(eax)] ; num_cols
- test ecx,ecx
- jz near .return
-
- push ecx
-
- mov edi, JSAMPIMAGE [input_buf(eax)]
- mov ecx, JDIMENSION [input_row(eax)]
- mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
- mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
- mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
- lea esi, [esi+ecx*SIZEOF_JSAMPROW]
- lea ebx, [ebx+ecx*SIZEOF_JSAMPROW]
- lea edx, [edx+ecx*SIZEOF_JSAMPROW]
-
- pop ecx
-
- mov edi, JSAMPARRAY [output_buf(eax)]
- mov eax, INT [num_rows(eax)]
- test eax,eax
- jle near .return
- alignx 16,7
-.rowloop:
- push eax
- push edi
- push edx
- push ebx
- push esi
- push ecx ; col
-
- mov esi, JSAMPROW [esi] ; inptr0
- mov ebx, JSAMPROW [ebx] ; inptr1
- mov edx, JSAMPROW [edx] ; inptr2
- mov edi, JSAMPROW [edi] ; outptr
- movpic eax, POINTER [gotptr] ; load GOT address (eax)
- alignx 16,7
-.columnloop:
-
- movq mm5, MMWORD [ebx] ; mm5=Cb(01234567)
- movq mm1, MMWORD [edx] ; mm1=Cr(01234567)
-
- pcmpeqw mm4,mm4
- pcmpeqw mm7,mm7
- psrlw mm4,BYTE_BIT
- psllw mm7,7 ; mm7={0xFF80 0xFF80 0xFF80 0xFF80}
- movq mm0,mm4 ; mm0=mm4={0xFF 0x00 0xFF 0x00 ..}
-
- pand mm4,mm5 ; mm4=Cb(0246)=CbE
- psrlw mm5,BYTE_BIT ; mm5=Cb(1357)=CbO
- pand mm0,mm1 ; mm0=Cr(0246)=CrE
- psrlw mm1,BYTE_BIT ; mm1=Cr(1357)=CrO
-
- paddw mm4,mm7
- paddw mm5,mm7
- paddw mm0,mm7
- paddw mm1,mm7
-
- ; (Original)
- ; R = Y + 1.40200 * Cr
- ; G = Y - 0.34414 * Cb - 0.71414 * Cr
- ; B = Y + 1.77200 * Cb
- ;
- ; (This implementation)
- ; R = Y + 0.40200 * Cr + Cr
- ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
- ; B = Y - 0.22800 * Cb + Cb + Cb
-
- movq mm2,mm4 ; mm2=CbE
- movq mm3,mm5 ; mm3=CbO
- paddw mm4,mm4 ; mm4=2*CbE
- paddw mm5,mm5 ; mm5=2*CbO
- movq mm6,mm0 ; mm6=CrE
- movq mm7,mm1 ; mm7=CrO
- paddw mm0,mm0 ; mm0=2*CrE
- paddw mm1,mm1 ; mm1=2*CrO
-
- pmulhw mm4,[GOTOFF(eax,PW_MF0228)] ; mm4=(2*CbE * -FIX(0.22800))
- pmulhw mm5,[GOTOFF(eax,PW_MF0228)] ; mm5=(2*CbO * -FIX(0.22800))
- pmulhw mm0,[GOTOFF(eax,PW_F0402)] ; mm0=(2*CrE * FIX(0.40200))
- pmulhw mm1,[GOTOFF(eax,PW_F0402)] ; mm1=(2*CrO * FIX(0.40200))
-
- paddw mm4,[GOTOFF(eax,PW_ONE)]
- paddw mm5,[GOTOFF(eax,PW_ONE)]
- psraw mm4,1 ; mm4=(CbE * -FIX(0.22800))
- psraw mm5,1 ; mm5=(CbO * -FIX(0.22800))
- paddw mm0,[GOTOFF(eax,PW_ONE)]
- paddw mm1,[GOTOFF(eax,PW_ONE)]
- psraw mm0,1 ; mm0=(CrE * FIX(0.40200))
- psraw mm1,1 ; mm1=(CrO * FIX(0.40200))
-
- paddw mm4,mm2
- paddw mm5,mm3
- paddw mm4,mm2 ; mm4=(CbE * FIX(1.77200))=(B-Y)E
- paddw mm5,mm3 ; mm5=(CbO * FIX(1.77200))=(B-Y)O
- paddw mm0,mm6 ; mm0=(CrE * FIX(1.40200))=(R-Y)E
- paddw mm1,mm7 ; mm1=(CrO * FIX(1.40200))=(R-Y)O
-
- movq MMWORD [wk(0)], mm4 ; wk(0)=(B-Y)E
- movq MMWORD [wk(1)], mm5 ; wk(1)=(B-Y)O
-
- movq mm4,mm2
- movq mm5,mm3
- punpcklwd mm2,mm6
- punpckhwd mm4,mm6
- pmaddwd mm2,[GOTOFF(eax,PW_MF0344_F0285)]
- pmaddwd mm4,[GOTOFF(eax,PW_MF0344_F0285)]
- punpcklwd mm3,mm7
- punpckhwd mm5,mm7
- pmaddwd mm3,[GOTOFF(eax,PW_MF0344_F0285)]
- pmaddwd mm5,[GOTOFF(eax,PW_MF0344_F0285)]
-
- paddd mm2,[GOTOFF(eax,PD_ONEHALF)]
- paddd mm4,[GOTOFF(eax,PD_ONEHALF)]
- psrad mm2,SCALEBITS
- psrad mm4,SCALEBITS
- paddd mm3,[GOTOFF(eax,PD_ONEHALF)]
- paddd mm5,[GOTOFF(eax,PD_ONEHALF)]
- psrad mm3,SCALEBITS
- psrad mm5,SCALEBITS
-
- packssdw mm2,mm4 ; mm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
- packssdw mm3,mm5 ; mm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
- psubw mm2,mm6 ; mm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
- psubw mm3,mm7 ; mm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
-
- movq mm5, MMWORD [esi] ; mm5=Y(01234567)
-
- pcmpeqw mm4,mm4
- psrlw mm4,BYTE_BIT ; mm4={0xFF 0x00 0xFF 0x00 ..}
- pand mm4,mm5 ; mm4=Y(0246)=YE
- psrlw mm5,BYTE_BIT ; mm5=Y(1357)=YO
-
- paddw mm0,mm4 ; mm0=((R-Y)E+YE)=RE=(R0 R2 R4 R6)
- paddw mm1,mm5 ; mm1=((R-Y)O+YO)=RO=(R1 R3 R5 R7)
- packuswb mm0,mm0 ; mm0=(R0 R2 R4 R6 ** ** ** **)
- packuswb mm1,mm1 ; mm1=(R1 R3 R5 R7 ** ** ** **)
-
- paddw mm2,mm4 ; mm2=((G-Y)E+YE)=GE=(G0 G2 G4 G6)
- paddw mm3,mm5 ; mm3=((G-Y)O+YO)=GO=(G1 G3 G5 G7)
- packuswb mm2,mm2 ; mm2=(G0 G2 G4 G6 ** ** ** **)
- packuswb mm3,mm3 ; mm3=(G1 G3 G5 G7 ** ** ** **)
-
- paddw mm4, MMWORD [wk(0)] ; mm4=(YE+(B-Y)E)=BE=(B0 B2 B4 B6)
- paddw mm5, MMWORD [wk(1)] ; mm5=(YO+(B-Y)O)=BO=(B1 B3 B5 B7)
- packuswb mm4,mm4 ; mm4=(B0 B2 B4 B6 ** ** ** **)
- packuswb mm5,mm5 ; mm5=(B1 B3 B5 B7 ** ** ** **)
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
- ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
- ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
- ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
- ; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **)
-
- punpcklbw mmA,mmC ; mmA=(00 10 02 12 04 14 06 16)
- punpcklbw mmE,mmB ; mmE=(20 01 22 03 24 05 26 07)
- punpcklbw mmD,mmF ; mmD=(11 21 13 23 15 25 17 27)
-
- movq mmG,mmA
- movq mmH,mmA
- punpcklwd mmA,mmE ; mmA=(00 10 20 01 02 12 22 03)
- punpckhwd mmG,mmE ; mmG=(04 14 24 05 06 16 26 07)
-
- psrlq mmH,2*BYTE_BIT ; mmH=(02 12 04 14 06 16 -- --)
- psrlq mmE,2*BYTE_BIT ; mmE=(22 03 24 05 26 07 -- --)
-
- movq mmC,mmD
- movq mmB,mmD
- punpcklwd mmD,mmH ; mmD=(11 21 02 12 13 23 04 14)
- punpckhwd mmC,mmH ; mmC=(15 25 06 16 17 27 -- --)
-
- psrlq mmB,2*BYTE_BIT ; mmB=(13 23 15 25 17 27 -- --)
-
- movq mmF,mmE
- punpcklwd mmE,mmB ; mmE=(22 03 13 23 24 05 15 25)
- punpckhwd mmF,mmB ; mmF=(26 07 17 27 -- -- -- --)
-
- punpckldq mmA,mmD ; mmA=(00 10 20 01 11 21 02 12)
- punpckldq mmE,mmG ; mmE=(22 03 13 23 04 14 24 05)
- punpckldq mmC,mmF ; mmC=(15 25 06 16 26 07 17 27)
-
- cmp ecx, byte SIZEOF_MMWORD
- jb short .column_st16
-
- movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
- movq MMWORD [edi+1*SIZEOF_MMWORD], mmE
- movq MMWORD [edi+2*SIZEOF_MMWORD], mmC
-
- sub ecx, byte SIZEOF_MMWORD
- jz short .nextrow
-
- add esi, byte SIZEOF_MMWORD ; inptr0
- add ebx, byte SIZEOF_MMWORD ; inptr1
- add edx, byte SIZEOF_MMWORD ; inptr2
- add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; outptr
- jmp near .columnloop
- alignx 16,7
-
-.column_st16:
- lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
- cmp ecx, byte 2*SIZEOF_MMWORD
- jb short .column_st8
- movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
- movq MMWORD [edi+1*SIZEOF_MMWORD], mmE
- movq mmA,mmC
- sub ecx, byte 2*SIZEOF_MMWORD
- add edi, byte 2*SIZEOF_MMWORD
- jmp short .column_st4
-.column_st8:
- cmp ecx, byte SIZEOF_MMWORD
- jb short .column_st4
- movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
- movq mmA,mmE
- sub ecx, byte SIZEOF_MMWORD
- add edi, byte SIZEOF_MMWORD
-.column_st4:
- movd eax,mmA
- cmp ecx, byte SIZEOF_DWORD
- jb short .column_st2
- mov DWORD [edi+0*SIZEOF_DWORD], eax
- psrlq mmA,DWORD_BIT
- movd eax,mmA
- sub ecx, byte SIZEOF_DWORD
- add edi, byte SIZEOF_DWORD
-.column_st2:
- cmp ecx, byte SIZEOF_WORD
- jb short .column_st1
- mov WORD [edi+0*SIZEOF_WORD], ax
- shr eax,WORD_BIT
- sub ecx, byte SIZEOF_WORD
- add edi, byte SIZEOF_WORD
-.column_st1:
- cmp ecx, byte SIZEOF_BYTE
- jb short .nextrow
- mov BYTE [edi+0*SIZEOF_BYTE], al
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-%ifdef RGBX_FILLER_0XFF
- pcmpeqb mm6,mm6 ; mm6=(X0 X2 X4 X6 ** ** ** **)
- pcmpeqb mm7,mm7 ; mm7=(X1 X3 X5 X7 ** ** ** **)
-%else
- pxor mm6,mm6 ; mm6=(X0 X2 X4 X6 ** ** ** **)
- pxor mm7,mm7 ; mm7=(X1 X3 X5 X7 ** ** ** **)
-%endif
- ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
- ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
- ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
- ; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **)
-
- punpcklbw mmA,mmC ; mmA=(00 10 02 12 04 14 06 16)
- punpcklbw mmE,mmG ; mmE=(20 30 22 32 24 34 26 36)
- punpcklbw mmB,mmD ; mmB=(01 11 03 13 05 15 07 17)
- punpcklbw mmF,mmH ; mmF=(21 31 23 33 25 35 27 37)
-
- movq mmC,mmA
- punpcklwd mmA,mmE ; mmA=(00 10 20 30 02 12 22 32)
- punpckhwd mmC,mmE ; mmC=(04 14 24 34 06 16 26 36)
- movq mmG,mmB
- punpcklwd mmB,mmF ; mmB=(01 11 21 31 03 13 23 33)
- punpckhwd mmG,mmF ; mmG=(05 15 25 35 07 17 27 37)
-
- movq mmD,mmA
- punpckldq mmA,mmB ; mmA=(00 10 20 30 01 11 21 31)
- punpckhdq mmD,mmB ; mmD=(02 12 22 32 03 13 23 33)
- movq mmH,mmC
- punpckldq mmC,mmG ; mmC=(04 14 24 34 05 15 25 35)
- punpckhdq mmH,mmG ; mmH=(06 16 26 36 07 17 27 37)
-
- cmp ecx, byte SIZEOF_MMWORD
- jb short .column_st16
-
- movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
- movq MMWORD [edi+1*SIZEOF_MMWORD], mmD
- movq MMWORD [edi+2*SIZEOF_MMWORD], mmC
- movq MMWORD [edi+3*SIZEOF_MMWORD], mmH
-
- sub ecx, byte SIZEOF_MMWORD
- jz short .nextrow
-
- add esi, byte SIZEOF_MMWORD ; inptr0
- add ebx, byte SIZEOF_MMWORD ; inptr1
- add edx, byte SIZEOF_MMWORD ; inptr2
- add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; outptr
- jmp near .columnloop
- alignx 16,7
-
-.column_st16:
- cmp ecx, byte SIZEOF_MMWORD/2
- jb short .column_st8
- movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
- movq MMWORD [edi+1*SIZEOF_MMWORD], mmD
- movq mmA,mmC
- movq mmD,mmH
- sub ecx, byte SIZEOF_MMWORD/2
- add edi, byte 2*SIZEOF_MMWORD
-.column_st8:
- cmp ecx, byte SIZEOF_MMWORD/4
- jb short .column_st4
- movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
- movq mmA,mmD
- sub ecx, byte SIZEOF_MMWORD/4
- add edi, byte 1*SIZEOF_MMWORD
-.column_st4:
- cmp ecx, byte SIZEOF_MMWORD/8
- jb short .nextrow
- movd DWORD [edi+0*SIZEOF_DWORD], mmA
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
- alignx 16,7
-
-.nextrow:
- pop ecx
- pop esi
- pop ebx
- pop edx
- pop edi
- pop eax
-
- add esi, byte SIZEOF_JSAMPROW
- add ebx, byte SIZEOF_JSAMPROW
- add edx, byte SIZEOF_JSAMPROW
- add edi, byte SIZEOF_JSAMPROW ; output_buf
- dec eax ; num_rows
- jg near .rowloop
-
- emms ; empty MMX state
-
-.return:
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- pop ebx
- mov esp,ebp ; esp <- aligned ebp
- pop esp ; esp <- original ebp
- pop ebp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
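The (Original) / (This implementation) comment block in the converter above is the heart of the trick: pmulhw keeps only the high 16 bits of a signed 16x16 multiply, i.e. (a*b) >> 16, so every constant must have magnitude below 1.0. Coefficients of 1.0 or more (1.402, 1.772) are therefore split into whole-number adds plus a fractional pmulhw, and the operand is doubled before the multiply and halved with rounding afterwards (the paddw PW_ONE / psraw 1 pair) to recover the lost low bit. A scalar C model of the red channel, a sketch under that reading rather than library code:

    #include <stdint.h>

    #define FIX(x)  ((int32_t)((x) * 65536.0 + 0.5))  /* 16.16 fixed point */

    /* Model of pmulhw: high 16 bits of a signed 16x16-bit multiply. */
    static int16_t pmulhw(int16_t a, int16_t b)
    {
      return (int16_t)(((int32_t)a * b) >> 16);
    }

    /* R - Y = 1.40200*Cr, computed as Cr + 0.40200*Cr.  'cr' is the
       centered chroma sample in [-128,127] (the 0xFF80 add above). */
    static int16_t r_minus_y(int16_t cr)
    {
      int16_t f = pmulhw((int16_t)(2 * cr), (int16_t)FIX(0.40200));
      f = (int16_t)((f + 1) >> 1);      /* paddw PW_ONE; psraw 1 */
      return (int16_t)(f + cr);         /* + Cr: the integer part */
    }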
diff --git a/media/libjpeg/simd/jdcolext-sse2-64.asm b/media/libjpeg/simd/jdcolext-sse2-64.asm
deleted file mode 100644
index 4634066c45..0000000000
--- a/media/libjpeg/simd/jdcolext-sse2-64.asm
+++ /dev/null
@@ -1,440 +0,0 @@
-;
-; jdcolext.asm - colorspace conversion (64-bit SSE2)
-;
-; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, 2012, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; and can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jcolsamp.inc"
-
-; --------------------------------------------------------------------------
-;
-; Convert some rows of samples to the output colorspace.
-;
-; GLOBAL(void)
-; jsimd_ycc_rgb_convert_sse2 (JDIMENSION out_width,
-; JSAMPIMAGE input_buf, JDIMENSION input_row,
-; JSAMPARRAY output_buf, int num_rows)
-;
-
-; r10 = JDIMENSION out_width
-; r11 = JSAMPIMAGE input_buf
-; r12 = JDIMENSION input_row
-; r13 = JSAMPARRAY output_buf
-; r14 = int num_rows
-
-%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 2
-
- align 16
- global EXTN(jsimd_ycc_rgb_convert_sse2)
-
-EXTN(jsimd_ycc_rgb_convert_sse2):
- push rbp
- mov rax,rsp ; rax = original rbp
- sub rsp, byte 4
- and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [rsp],rax
- mov rbp,rsp ; rbp = aligned rbp
- lea rsp, [wk(0)]
- collect_args
- push rbx
-
- mov ecx, r10d ; num_cols
- test rcx,rcx
- jz near .return
-
- push rcx
-
- mov rdi, r11
- mov ecx, r12d
- mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
- mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
- mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
- lea rsi, [rsi+rcx*SIZEOF_JSAMPROW]
- lea rbx, [rbx+rcx*SIZEOF_JSAMPROW]
- lea rdx, [rdx+rcx*SIZEOF_JSAMPROW]
-
- pop rcx
-
- mov rdi, r13
- mov eax, r14d
- test rax,rax
- jle near .return
-.rowloop:
- push rax
- push rdi
- push rdx
- push rbx
- push rsi
- push rcx ; col
-
- mov rsi, JSAMPROW [rsi] ; inptr0
- mov rbx, JSAMPROW [rbx] ; inptr1
- mov rdx, JSAMPROW [rdx] ; inptr2
- mov rdi, JSAMPROW [rdi] ; outptr
-.columnloop:
-
- movdqa xmm5, XMMWORD [rbx] ; xmm5=Cb(0123456789ABCDEF)
- movdqa xmm1, XMMWORD [rdx] ; xmm1=Cr(0123456789ABCDEF)
-
- pcmpeqw xmm4,xmm4
- pcmpeqw xmm7,xmm7
- psrlw xmm4,BYTE_BIT
- psllw xmm7,7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
- movdqa xmm0,xmm4 ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..}
-
- pand xmm4,xmm5 ; xmm4=Cb(02468ACE)=CbE
- psrlw xmm5,BYTE_BIT ; xmm5=Cb(13579BDF)=CbO
- pand xmm0,xmm1 ; xmm0=Cr(02468ACE)=CrE
- psrlw xmm1,BYTE_BIT ; xmm1=Cr(13579BDF)=CrO
-
- paddw xmm4,xmm7
- paddw xmm5,xmm7
- paddw xmm0,xmm7
- paddw xmm1,xmm7
-
- ; (Original)
- ; R = Y + 1.40200 * Cr
- ; G = Y - 0.34414 * Cb - 0.71414 * Cr
- ; B = Y + 1.77200 * Cb
- ;
- ; (This implementation)
- ; R = Y + 0.40200 * Cr + Cr
- ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
- ; B = Y - 0.22800 * Cb + Cb + Cb
-
- movdqa xmm2,xmm4 ; xmm2=CbE
- movdqa xmm3,xmm5 ; xmm3=CbO
- paddw xmm4,xmm4 ; xmm4=2*CbE
- paddw xmm5,xmm5 ; xmm5=2*CbO
- movdqa xmm6,xmm0 ; xmm6=CrE
- movdqa xmm7,xmm1 ; xmm7=CrO
- paddw xmm0,xmm0 ; xmm0=2*CrE
- paddw xmm1,xmm1 ; xmm1=2*CrO
-
- pmulhw xmm4,[rel PW_MF0228] ; xmm4=(2*CbE * -FIX(0.22800))
- pmulhw xmm5,[rel PW_MF0228] ; xmm5=(2*CbO * -FIX(0.22800))
- pmulhw xmm0,[rel PW_F0402] ; xmm0=(2*CrE * FIX(0.40200))
- pmulhw xmm1,[rel PW_F0402] ; xmm1=(2*CrO * FIX(0.40200))
-
- paddw xmm4,[rel PW_ONE]
- paddw xmm5,[rel PW_ONE]
- psraw xmm4,1 ; xmm4=(CbE * -FIX(0.22800))
- psraw xmm5,1 ; xmm5=(CbO * -FIX(0.22800))
- paddw xmm0,[rel PW_ONE]
- paddw xmm1,[rel PW_ONE]
- psraw xmm0,1 ; xmm0=(CrE * FIX(0.40200))
- psraw xmm1,1 ; xmm1=(CrO * FIX(0.40200))
-
- paddw xmm4,xmm2
- paddw xmm5,xmm3
- paddw xmm4,xmm2 ; xmm4=(CbE * FIX(1.77200))=(B-Y)E
- paddw xmm5,xmm3 ; xmm5=(CbO * FIX(1.77200))=(B-Y)O
- paddw xmm0,xmm6 ; xmm0=(CrE * FIX(1.40200))=(R-Y)E
- paddw xmm1,xmm7 ; xmm1=(CrO * FIX(1.40200))=(R-Y)O
-
- movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=(B-Y)E
- movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(B-Y)O
-
- movdqa xmm4,xmm2
- movdqa xmm5,xmm3
- punpcklwd xmm2,xmm6
- punpckhwd xmm4,xmm6
- pmaddwd xmm2,[rel PW_MF0344_F0285]
- pmaddwd xmm4,[rel PW_MF0344_F0285]
- punpcklwd xmm3,xmm7
- punpckhwd xmm5,xmm7
- pmaddwd xmm3,[rel PW_MF0344_F0285]
- pmaddwd xmm5,[rel PW_MF0344_F0285]
-
- paddd xmm2,[rel PD_ONEHALF]
- paddd xmm4,[rel PD_ONEHALF]
- psrad xmm2,SCALEBITS
- psrad xmm4,SCALEBITS
- paddd xmm3,[rel PD_ONEHALF]
- paddd xmm5,[rel PD_ONEHALF]
- psrad xmm3,SCALEBITS
- psrad xmm5,SCALEBITS
-
- packssdw xmm2,xmm4 ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
- packssdw xmm3,xmm5 ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
- psubw xmm2,xmm6 ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
- psubw xmm3,xmm7 ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
-
- movdqa xmm5, XMMWORD [rsi] ; xmm5=Y(0123456789ABCDEF)
-
- pcmpeqw xmm4,xmm4
- psrlw xmm4,BYTE_BIT ; xmm4={0xFF 0x00 0xFF 0x00 ..}
- pand xmm4,xmm5 ; xmm4=Y(02468ACE)=YE
- psrlw xmm5,BYTE_BIT ; xmm5=Y(13579BDF)=YO
-
- paddw xmm0,xmm4 ; xmm0=((R-Y)E+YE)=RE=R(02468ACE)
- paddw xmm1,xmm5 ; xmm1=((R-Y)O+YO)=RO=R(13579BDF)
- packuswb xmm0,xmm0 ; xmm0=R(02468ACE********)
- packuswb xmm1,xmm1 ; xmm1=R(13579BDF********)
-
- paddw xmm2,xmm4 ; xmm2=((G-Y)E+YE)=GE=G(02468ACE)
- paddw xmm3,xmm5 ; xmm3=((G-Y)O+YO)=GO=G(13579BDF)
- packuswb xmm2,xmm2 ; xmm2=G(02468ACE********)
- packuswb xmm3,xmm3 ; xmm3=G(13579BDF********)
-
- paddw xmm4, XMMWORD [wk(0)] ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE)
- paddw xmm5, XMMWORD [wk(1)] ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF)
- packuswb xmm4,xmm4 ; xmm4=B(02468ACE********)
- packuswb xmm5,xmm5 ; xmm5=B(13579BDF********)
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
- ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
- ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
- ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
- ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
-
- punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
- punpcklbw xmmE,xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
- punpcklbw xmmD,xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
-
- movdqa xmmG,xmmA
- movdqa xmmH,xmmA
- punpcklwd xmmA,xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
- punpckhwd xmmG,xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
-
- psrldq xmmH,2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
- psrldq xmmE,2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
-
- movdqa xmmC,xmmD
- movdqa xmmB,xmmD
- punpcklwd xmmD,xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
- punpckhwd xmmC,xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
-
- psrldq xmmB,2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
-
- movdqa xmmF,xmmE
- punpcklwd xmmE,xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
- punpckhwd xmmF,xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
-
- pshufd xmmH,xmmA,0x4E ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
- movdqa xmmB,xmmE
- punpckldq xmmA,xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
- punpckldq xmmE,xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
- punpckhdq xmmD,xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
-
- pshufd xmmH,xmmG,0x4E ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
- movdqa xmmB,xmmF
- punpckldq xmmG,xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
- punpckldq xmmF,xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
- punpckhdq xmmC,xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
-
- punpcklqdq xmmA,xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
- punpcklqdq xmmD,xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
- punpcklqdq xmmF,xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
-
- cmp rcx, byte SIZEOF_XMMWORD
- jb short .column_st32
-
- test rdi, SIZEOF_XMMWORD-1
- jnz short .out1
- ; --(aligned)-------------------
- movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
- movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
- movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
- jmp short .out0
-.out1: ; --(unaligned)-----------------
- movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
- movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
- movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
-.out0:
- add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
- sub rcx, byte SIZEOF_XMMWORD
- jz near .nextrow
-
- add rsi, byte SIZEOF_XMMWORD ; inptr0
- add rbx, byte SIZEOF_XMMWORD ; inptr1
- add rdx, byte SIZEOF_XMMWORD ; inptr2
- jmp near .columnloop
-
-.column_st32:
- lea rcx, [rcx+rcx*2] ; imul rcx, RGB_PIXELSIZE
- cmp rcx, byte 2*SIZEOF_XMMWORD
- jb short .column_st16
- movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
- movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
- add rdi, byte 2*SIZEOF_XMMWORD ; outptr
- movdqa xmmA,xmmF
- sub rcx, byte 2*SIZEOF_XMMWORD
- jmp short .column_st15
-.column_st16:
- cmp rcx, byte SIZEOF_XMMWORD
- jb short .column_st15
- movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
- add rdi, byte SIZEOF_XMMWORD ; outptr
- movdqa xmmA,xmmD
- sub rcx, byte SIZEOF_XMMWORD
-.column_st15:
- ; Store the lower 8 bytes of xmmA to the output when it has enough
- ; space.
- cmp rcx, byte SIZEOF_MMWORD
- jb short .column_st7
- movq XMM_MMWORD [rdi], xmmA
- add rdi, byte SIZEOF_MMWORD
- sub rcx, byte SIZEOF_MMWORD
- psrldq xmmA, SIZEOF_MMWORD
-.column_st7:
- ; Store the lower 4 bytes of xmmA to the output when it has enough
- ; space.
- cmp rcx, byte SIZEOF_DWORD
- jb short .column_st3
- movd XMM_DWORD [rdi], xmmA
- add rdi, byte SIZEOF_DWORD
- sub rcx, byte SIZEOF_DWORD
- psrldq xmmA, SIZEOF_DWORD
-.column_st3:
- ; Store the lower 2 bytes of rax to the output when it has enough
- ; space.
- movd eax, xmmA
- cmp rcx, byte SIZEOF_WORD
- jb short .column_st1
- mov WORD [rdi], ax
- add rdi, byte SIZEOF_WORD
- sub rcx, byte SIZEOF_WORD
- shr rax, 16
-.column_st1:
- ; Store the lower 1 byte of rax to the output when it has enough
- ; space.
- test rcx, rcx
- jz short .nextrow
- mov BYTE [rdi], al
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-%ifdef RGBX_FILLER_0XFF
- pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********)
- pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********)
-%else
- pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********)
- pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********)
-%endif
- ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
- ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
- ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
- ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
-
- punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
- punpcklbw xmmE,xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
- punpcklbw xmmB,xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
- punpcklbw xmmF,xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
-
- movdqa xmmC,xmmA
- punpcklwd xmmA,xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
- punpckhwd xmmC,xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
- movdqa xmmG,xmmB
- punpcklwd xmmB,xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
- punpckhwd xmmG,xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
-
- movdqa xmmD,xmmA
- punpckldq xmmA,xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
- punpckhdq xmmD,xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
- movdqa xmmH,xmmC
- punpckldq xmmC,xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
- punpckhdq xmmH,xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
-
- cmp rcx, byte SIZEOF_XMMWORD
- jb short .column_st32
-
- test rdi, SIZEOF_XMMWORD-1
- jnz short .out1
- ; --(aligned)-------------------
- movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
- movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
- movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
- movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
- jmp short .out0
-.out1: ; --(unaligned)-----------------
- movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
- movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
- movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
- movdqu XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
-.out0:
- add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
- sub rcx, byte SIZEOF_XMMWORD
- jz near .nextrow
-
- add rsi, byte SIZEOF_XMMWORD ; inptr0
- add rbx, byte SIZEOF_XMMWORD ; inptr1
- add rdx, byte SIZEOF_XMMWORD ; inptr2
- jmp near .columnloop
-
-.column_st32:
- cmp rcx, byte SIZEOF_XMMWORD/2
- jb short .column_st16
- movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
- movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
- add rdi, byte 2*SIZEOF_XMMWORD ; outptr
- movdqa xmmA,xmmC
- movdqa xmmD,xmmH
- sub rcx, byte SIZEOF_XMMWORD/2
-.column_st16:
- cmp rcx, byte SIZEOF_XMMWORD/4
- jb short .column_st15
- movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
- add rdi, byte SIZEOF_XMMWORD ; outptr
- movdqa xmmA,xmmD
- sub rcx, byte SIZEOF_XMMWORD/4
-.column_st15:
- ; Store two pixels (8 bytes) of xmmA to the output when it has enough
- ; space.
- cmp rcx, byte SIZEOF_XMMWORD/8
- jb short .column_st7
- movq MMWORD [rdi], xmmA
- add rdi, byte SIZEOF_XMMWORD/8*4
- sub rcx, byte SIZEOF_XMMWORD/8
- psrldq xmmA, SIZEOF_XMMWORD/8*4
-.column_st7:
- ; Store one pixel (4 bytes) of xmmA to the output when it has enough
- ; space.
- test rcx, rcx
- jz short .nextrow
- movd XMM_DWORD [rdi], xmmA
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
-.nextrow:
- pop rcx
- pop rsi
- pop rbx
- pop rdx
- pop rdi
- pop rax
-
- add rsi, byte SIZEOF_JSAMPROW
- add rbx, byte SIZEOF_JSAMPROW
- add rdx, byte SIZEOF_JSAMPROW
- add rdi, byte SIZEOF_JSAMPROW ; output_buf
- dec rax ; num_rows
- jg near .rowloop
-
- sfence ; flush the write buffer
-
-.return:
- pop rbx
- uncollect_args
- mov rsp,rbp ; rsp <- aligned rbp
- pop rsp ; rsp <- original rbp
- pop rbp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
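The .column_st15 / .column_st7 / .column_st3 / .column_st1 chain above handles the final partial vector of a row: it flushes whatever is left with progressively narrower stores (8, 4, 2, then 1 bytes), shifting the packed register right after each one, so the loop never writes past the end of the output row. The same idea in scalar C, as an illustrative sketch:

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Write the first 'nbytes' (0..15) bytes of a packed 16-byte vector
       without touching anything beyond the end of the destination row. */
    static void store_tail(uint8_t *dst, const uint8_t vec[16], size_t nbytes)
    {
      size_t off = 0;
      if (nbytes - off >= 8) { memcpy(dst + off, vec + off, 8); off += 8; }
      if (nbytes - off >= 4) { memcpy(dst + off, vec + off, 4); off += 4; }
      if (nbytes - off >= 2) { memcpy(dst + off, vec + off, 2); off += 2; }
      if (nbytes - off >= 1) { dst[off] = vec[off]; }
    }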
diff --git a/media/libjpeg/simd/jdcolext-sse2.asm b/media/libjpeg/simd/jdcolext-sse2.asm
deleted file mode 100644
index 682aef35fc..0000000000
--- a/media/libjpeg/simd/jdcolext-sse2.asm
+++ /dev/null
@@ -1,459 +0,0 @@
-;
-; jdcolext.asm - colorspace conversion (SSE2)
-;
-; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2012, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; and can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jcolsamp.inc"
-
-; --------------------------------------------------------------------------
-;
-; Convert some rows of samples to the output colorspace.
-;
-; GLOBAL(void)
-; jsimd_ycc_rgb_convert_sse2 (JDIMENSION out_width,
-; JSAMPIMAGE input_buf, JDIMENSION input_row,
-; JSAMPARRAY output_buf, int num_rows)
-;
-
-%define out_width(b) (b)+8 ; JDIMENSION out_width
-%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf
-%define input_row(b) (b)+16 ; JDIMENSION input_row
-%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf
-%define num_rows(b) (b)+24 ; int num_rows
-
-%define original_ebp ebp+0
-%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 2
-%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
-
- align 16
- global EXTN(jsimd_ycc_rgb_convert_sse2)
-
-EXTN(jsimd_ycc_rgb_convert_sse2):
- push ebp
- mov eax,esp ; eax = original ebp
- sub esp, byte 4
- and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [esp],eax
- mov ebp,esp ; ebp = aligned ebp
- lea esp, [wk(0)]
- pushpic eax ; make room for the GOT address
- push ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- get_GOT ebx ; get GOT address
- movpic POINTER [gotptr], ebx ; save GOT address
-
- mov ecx, JDIMENSION [out_width(eax)] ; num_cols
- test ecx,ecx
- jz near .return
-
- push ecx
-
- mov edi, JSAMPIMAGE [input_buf(eax)]
- mov ecx, JDIMENSION [input_row(eax)]
- mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
- mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
- mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
- lea esi, [esi+ecx*SIZEOF_JSAMPROW]
- lea ebx, [ebx+ecx*SIZEOF_JSAMPROW]
- lea edx, [edx+ecx*SIZEOF_JSAMPROW]
-
- pop ecx
-
- mov edi, JSAMPARRAY [output_buf(eax)]
- mov eax, INT [num_rows(eax)]
- test eax,eax
- jle near .return
- alignx 16,7
-.rowloop:
- push eax
- push edi
- push edx
- push ebx
- push esi
- push ecx ; col
-
- mov esi, JSAMPROW [esi] ; inptr0
- mov ebx, JSAMPROW [ebx] ; inptr1
- mov edx, JSAMPROW [edx] ; inptr2
- mov edi, JSAMPROW [edi] ; outptr
- movpic eax, POINTER [gotptr] ; load GOT address (eax)
- alignx 16,7
-.columnloop:
-
- movdqa xmm5, XMMWORD [ebx] ; xmm5=Cb(0123456789ABCDEF)
- movdqa xmm1, XMMWORD [edx] ; xmm1=Cr(0123456789ABCDEF)
-
- pcmpeqw xmm4,xmm4
- pcmpeqw xmm7,xmm7
- psrlw xmm4,BYTE_BIT
- psllw xmm7,7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
- movdqa xmm0,xmm4 ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..}
-
- pand xmm4,xmm5 ; xmm4=Cb(02468ACE)=CbE
- psrlw xmm5,BYTE_BIT ; xmm5=Cb(13579BDF)=CbO
- pand xmm0,xmm1 ; xmm0=Cr(02468ACE)=CrE
- psrlw xmm1,BYTE_BIT ; xmm1=Cr(13579BDF)=CrO
-
- paddw xmm4,xmm7
- paddw xmm5,xmm7
- paddw xmm0,xmm7
- paddw xmm1,xmm7
-
- ; (Original)
- ; R = Y + 1.40200 * Cr
- ; G = Y - 0.34414 * Cb - 0.71414 * Cr
- ; B = Y + 1.77200 * Cb
- ;
- ; (This implementation)
- ; R = Y + 0.40200 * Cr + Cr
- ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
- ; B = Y - 0.22800 * Cb + Cb + Cb
-
- movdqa xmm2,xmm4 ; xmm2=CbE
- movdqa xmm3,xmm5 ; xmm3=CbO
- paddw xmm4,xmm4 ; xmm4=2*CbE
- paddw xmm5,xmm5 ; xmm5=2*CbO
- movdqa xmm6,xmm0 ; xmm6=CrE
- movdqa xmm7,xmm1 ; xmm7=CrO
- paddw xmm0,xmm0 ; xmm0=2*CrE
- paddw xmm1,xmm1 ; xmm1=2*CrO
-
- pmulhw xmm4,[GOTOFF(eax,PW_MF0228)] ; xmm4=(2*CbE * -FIX(0.22800))
- pmulhw xmm5,[GOTOFF(eax,PW_MF0228)] ; xmm5=(2*CbO * -FIX(0.22800))
- pmulhw xmm0,[GOTOFF(eax,PW_F0402)] ; xmm0=(2*CrE * FIX(0.40200))
- pmulhw xmm1,[GOTOFF(eax,PW_F0402)] ; xmm1=(2*CrO * FIX(0.40200))
-
- paddw xmm4,[GOTOFF(eax,PW_ONE)]
- paddw xmm5,[GOTOFF(eax,PW_ONE)]
- psraw xmm4,1 ; xmm4=(CbE * -FIX(0.22800))
- psraw xmm5,1 ; xmm5=(CbO * -FIX(0.22800))
- paddw xmm0,[GOTOFF(eax,PW_ONE)]
- paddw xmm1,[GOTOFF(eax,PW_ONE)]
- psraw xmm0,1 ; xmm0=(CrE * FIX(0.40200))
- psraw xmm1,1 ; xmm1=(CrO * FIX(0.40200))
-
- paddw xmm4,xmm2
- paddw xmm5,xmm3
- paddw xmm4,xmm2 ; xmm4=(CbE * FIX(1.77200))=(B-Y)E
- paddw xmm5,xmm3 ; xmm5=(CbO * FIX(1.77200))=(B-Y)O
- paddw xmm0,xmm6 ; xmm0=(CrE * FIX(1.40200))=(R-Y)E
- paddw xmm1,xmm7 ; xmm1=(CrO * FIX(1.40200))=(R-Y)O
-
- movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=(B-Y)E
- movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(B-Y)O
-
- movdqa xmm4,xmm2
- movdqa xmm5,xmm3
- punpcklwd xmm2,xmm6
- punpckhwd xmm4,xmm6
- pmaddwd xmm2,[GOTOFF(eax,PW_MF0344_F0285)]
- pmaddwd xmm4,[GOTOFF(eax,PW_MF0344_F0285)]
- punpcklwd xmm3,xmm7
- punpckhwd xmm5,xmm7
- pmaddwd xmm3,[GOTOFF(eax,PW_MF0344_F0285)]
- pmaddwd xmm5,[GOTOFF(eax,PW_MF0344_F0285)]
-
- paddd xmm2,[GOTOFF(eax,PD_ONEHALF)]
- paddd xmm4,[GOTOFF(eax,PD_ONEHALF)]
- psrad xmm2,SCALEBITS
- psrad xmm4,SCALEBITS
- paddd xmm3,[GOTOFF(eax,PD_ONEHALF)]
- paddd xmm5,[GOTOFF(eax,PD_ONEHALF)]
- psrad xmm3,SCALEBITS
- psrad xmm5,SCALEBITS
-
- packssdw xmm2,xmm4 ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
- packssdw xmm3,xmm5 ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
- psubw xmm2,xmm6 ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
- psubw xmm3,xmm7 ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
-
- movdqa xmm5, XMMWORD [esi] ; xmm5=Y(0123456789ABCDEF)
-
- pcmpeqw xmm4,xmm4
- psrlw xmm4,BYTE_BIT ; xmm4={0xFF 0x00 0xFF 0x00 ..}
- pand xmm4,xmm5 ; xmm4=Y(02468ACE)=YE
- psrlw xmm5,BYTE_BIT ; xmm5=Y(13579BDF)=YO
-
- paddw xmm0,xmm4 ; xmm0=((R-Y)E+YE)=RE=R(02468ACE)
- paddw xmm1,xmm5 ; xmm1=((R-Y)O+YO)=RO=R(13579BDF)
- packuswb xmm0,xmm0 ; xmm0=R(02468ACE********)
- packuswb xmm1,xmm1 ; xmm1=R(13579BDF********)
-
- paddw xmm2,xmm4 ; xmm2=((G-Y)E+YE)=GE=G(02468ACE)
- paddw xmm3,xmm5 ; xmm3=((G-Y)O+YO)=GO=G(13579BDF)
- packuswb xmm2,xmm2 ; xmm2=G(02468ACE********)
- packuswb xmm3,xmm3 ; xmm3=G(13579BDF********)
-
- paddw xmm4, XMMWORD [wk(0)] ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE)
- paddw xmm5, XMMWORD [wk(1)] ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF)
- packuswb xmm4,xmm4 ; xmm4=B(02468ACE********)
- packuswb xmm5,xmm5 ; xmm5=B(13579BDF********)
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
- ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
- ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
- ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
- ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
-
- punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
- punpcklbw xmmE,xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
- punpcklbw xmmD,xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
-
- movdqa xmmG,xmmA
- movdqa xmmH,xmmA
- punpcklwd xmmA,xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
- punpckhwd xmmG,xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
-
- psrldq xmmH,2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
- psrldq xmmE,2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
-
- movdqa xmmC,xmmD
- movdqa xmmB,xmmD
- punpcklwd xmmD,xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
- punpckhwd xmmC,xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
-
- psrldq xmmB,2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
-
- movdqa xmmF,xmmE
- punpcklwd xmmE,xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
- punpckhwd xmmF,xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
-
- pshufd xmmH,xmmA,0x4E ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
- movdqa xmmB,xmmE
- punpckldq xmmA,xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
- punpckldq xmmE,xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
- punpckhdq xmmD,xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
-
- pshufd xmmH,xmmG,0x4E ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
- movdqa xmmB,xmmF
- punpckldq xmmG,xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
- punpckldq xmmF,xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
- punpckhdq xmmC,xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
-
- punpcklqdq xmmA,xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
- punpcklqdq xmmD,xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
- punpcklqdq xmmF,xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
-
- cmp ecx, byte SIZEOF_XMMWORD
- jb short .column_st32
-
- test edi, SIZEOF_XMMWORD-1
- jnz short .out1
- ; --(aligned)-------------------
- movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
- movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
- movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
- jmp short .out0
-.out1: ; --(unaligned)-----------------
- movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
- movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
- movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
-.out0:
- add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
- sub ecx, byte SIZEOF_XMMWORD
- jz near .nextrow
-
- add esi, byte SIZEOF_XMMWORD ; inptr0
- add ebx, byte SIZEOF_XMMWORD ; inptr1
- add edx, byte SIZEOF_XMMWORD ; inptr2
- jmp near .columnloop
- alignx 16,7
-
-.column_st32:
- lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
- cmp ecx, byte 2*SIZEOF_XMMWORD
- jb short .column_st16
- movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
- movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
- add edi, byte 2*SIZEOF_XMMWORD ; outptr
- movdqa xmmA,xmmF
- sub ecx, byte 2*SIZEOF_XMMWORD
- jmp short .column_st15
-.column_st16:
- cmp ecx, byte SIZEOF_XMMWORD
- jb short .column_st15
- movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
- add edi, byte SIZEOF_XMMWORD ; outptr
- movdqa xmmA,xmmD
- sub ecx, byte SIZEOF_XMMWORD
-.column_st15:
- ; Store the lower 8 bytes of xmmA to the output when it has enough
- ; space.
- cmp ecx, byte SIZEOF_MMWORD
- jb short .column_st7
- movq XMM_MMWORD [edi], xmmA
- add edi, byte SIZEOF_MMWORD
- sub ecx, byte SIZEOF_MMWORD
- psrldq xmmA, SIZEOF_MMWORD
-.column_st7:
- ; Store the lower 4 bytes of xmmA to the output when it has enough
- ; space.
- cmp ecx, byte SIZEOF_DWORD
- jb short .column_st3
- movd XMM_DWORD [edi], xmmA
- add edi, byte SIZEOF_DWORD
- sub ecx, byte SIZEOF_DWORD
- psrldq xmmA, SIZEOF_DWORD
-.column_st3:
- ; Store the lower 2 bytes of eax to the output when it has enough
- ; space.
- movd eax, xmmA
- cmp ecx, byte SIZEOF_WORD
- jb short .column_st1
- mov WORD [edi], ax
- add edi, byte SIZEOF_WORD
- sub ecx, byte SIZEOF_WORD
- shr eax, 16
-.column_st1:
- ; Store the lower 1 byte of eax to the output when it has enough
- ; space.
- test ecx, ecx
- jz short .nextrow
- mov BYTE [edi], al
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-%ifdef RGBX_FILLER_0XFF
- pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********)
- pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********)
-%else
- pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********)
- pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********)
-%endif
- ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
- ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
- ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
- ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
-
- punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
- punpcklbw xmmE,xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
- punpcklbw xmmB,xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
- punpcklbw xmmF,xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
-
- movdqa xmmC,xmmA
- punpcklwd xmmA,xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
- punpckhwd xmmC,xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
- movdqa xmmG,xmmB
- punpcklwd xmmB,xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
- punpckhwd xmmG,xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
-
- movdqa xmmD,xmmA
- punpckldq xmmA,xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
- punpckhdq xmmD,xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
- movdqa xmmH,xmmC
- punpckldq xmmC,xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
- punpckhdq xmmH,xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
-
- cmp ecx, byte SIZEOF_XMMWORD
- jb short .column_st32
-
- test edi, SIZEOF_XMMWORD-1
- jnz short .out1
- ; --(aligned)-------------------
- movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
- movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
- movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
- movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
- jmp short .out0
-.out1: ; --(unaligned)-----------------
- movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
- movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
- movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
- movdqu XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
-.out0:
- add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
- sub ecx, byte SIZEOF_XMMWORD
- jz near .nextrow
-
- add esi, byte SIZEOF_XMMWORD ; inptr0
- add ebx, byte SIZEOF_XMMWORD ; inptr1
- add edx, byte SIZEOF_XMMWORD ; inptr2
- jmp near .columnloop
- alignx 16,7
-
-.column_st32:
- cmp ecx, byte SIZEOF_XMMWORD/2
- jb short .column_st16
- movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
- movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
- add edi, byte 2*SIZEOF_XMMWORD ; outptr
- movdqa xmmA,xmmC
- movdqa xmmD,xmmH
- sub ecx, byte SIZEOF_XMMWORD/2
-.column_st16:
- cmp ecx, byte SIZEOF_XMMWORD/4
- jb short .column_st15
- movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
- add edi, byte SIZEOF_XMMWORD ; outptr
- movdqa xmmA,xmmD
- sub ecx, byte SIZEOF_XMMWORD/4
-.column_st15:
- ; Store two pixels (8 bytes) of xmmA to the output when it has enough
- ; space.
- cmp ecx, byte SIZEOF_XMMWORD/8
- jb short .column_st7
- movq XMM_MMWORD [edi], xmmA
- add edi, byte SIZEOF_XMMWORD/8*4
- sub ecx, byte SIZEOF_XMMWORD/8
- psrldq xmmA, SIZEOF_XMMWORD/8*4
-.column_st7:
- ; Store one pixel (4 bytes) of xmmA to the output when it has enough
- ; space.
- test ecx, ecx
- jz short .nextrow
- movd XMM_DWORD [edi], xmmA
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
- alignx 16,7
-
-.nextrow:
- pop ecx
- pop esi
- pop ebx
- pop edx
- pop edi
- pop eax
-
- add esi, byte SIZEOF_JSAMPROW
- add ebx, byte SIZEOF_JSAMPROW
- add edx, byte SIZEOF_JSAMPROW
- add edi, byte SIZEOF_JSAMPROW ; output_buf
- dec eax ; num_rows
- jg near .rowloop
-
- sfence ; flush the write buffer
-
-.return:
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- pop ebx
- mov esp,ebp ; esp <- aligned ebp
- pop esp ; esp <- original ebp
- pop ebp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
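For full vectors, both SSE2 converters above choose between two store paths: non-temporal movntdq when outptr is 16-byte aligned, so the pixel data bypasses the cache, and plain unaligned movdqu otherwise; a single sfence after the row loop then orders the streaming stores before the function returns. Expressed with SSE2 intrinsics, the split looks roughly like this (a sketch; the deleted files are pure NASM, not intrinsics):

    #include <emmintrin.h>  /* SSE2 intrinsics */
    #include <stdint.h>

    static void store_vectors(uint8_t *dst, const __m128i *src, int nvec)
    {
      if (((uintptr_t)dst & 15) == 0) {
        for (int i = 0; i < nvec; i++)       /* movntdq path */
          _mm_stream_si128((__m128i *)dst + i, src[i]);
      } else {
        for (int i = 0; i < nvec; i++)       /* movdqu path */
          _mm_storeu_si128((__m128i *)dst + i, src[i]);
      }
    }

    /* After the last row, the caller issues _mm_sfence() -- the
       equivalent of the sfence instruction above. */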
diff --git a/media/libjpeg/simd/jdcolor-altivec.c b/media/libjpeg/simd/jdcolor-altivec.c
deleted file mode 100644
index 0dc4c427c2..0000000000
--- a/media/libjpeg/simd/jdcolor-altivec.c
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * AltiVec optimizations for libjpeg-turbo
- *
- * Copyright (C) 2015, D. R. Commander. All Rights Reserved.
- *
- * This software is provided 'as-is', without any express or implied
- * warranty. In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- * claim that you wrote the original software. If you use this software
- * in a product, an acknowledgment in the product documentation would be
- * appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- * misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-/* YCC --> RGB CONVERSION */
-
-#include "jsimd_altivec.h"
-
-
-#define F_0_344 22554 /* FIX(0.34414) */
-#define F_0_714 46802 /* FIX(0.71414) */
-#define F_1_402 91881 /* FIX(1.40200) */
-#define F_1_772 116130 /* FIX(1.77200) */
-#define F_0_402 (F_1_402 - 65536) /* FIX(1.40200) - FIX(1) */
-#define F_0_285 (65536 - F_0_714) /* FIX(1) - FIX(0.71414) */
-#define F_0_228 (131072 - F_1_772) /* FIX(2) - FIX(1.77200) */
-
-#define SCALEBITS 16
-#define ONE_HALF (1 << (SCALEBITS - 1))
-
-#define RGB_INDEX0 {0,1,8,2,3,10,4,5,12,6,7,14,16,17,24,18}
-#define RGB_INDEX1 {3,10,4,5,12,6,7,14,16,17,24,18,19,26,20,21}
-#define RGB_INDEX2 {12,6,7,14,16,17,24,18,19,26,20,21,28,22,23,30}
-#include "jdcolext-altivec.c"
-#undef RGB_PIXELSIZE
-
-#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extrgb_convert_altivec
-#include "jdcolext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGB_INDEX0
-#undef RGB_INDEX1
-#undef RGB_INDEX2
-#undef jsimd_ycc_rgb_convert_altivec
-
-#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-#define RGB_INDEX {0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15}
-#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extrgbx_convert_altivec
-#include "jdcolext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGB_INDEX
-#undef jsimd_ycc_rgb_convert_altivec
-
-#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-#define RGB_INDEX0 {8,1,0,10,3,2,12,5,4,14,7,6,24,17,16,26}
-#define RGB_INDEX1 {3,2,12,5,4,14,7,6,24,17,16,26,19,18,28,21}
-#define RGB_INDEX2 {4,14,7,6,24,17,16,26,19,18,28,21,20,30,23,22}
-#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extbgr_convert_altivec
-#include "jdcolext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGB_INDEX0
-#undef RGB_INDEX1
-#undef RGB_INDEX2
-#undef jsimd_ycc_rgb_convert_altivec
-
-#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-#define RGB_INDEX {8,1,0,9,10,3,2,11,12,5,4,13,14,7,6,15}
-#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extbgrx_convert_altivec
-#include "jdcolext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGB_INDEX
-#undef jsimd_ycc_rgb_convert_altivec
-
-#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-#define RGB_INDEX {9,8,1,0,11,10,3,2,13,12,5,4,15,14,7,6}
-#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extxbgr_convert_altivec
-#include "jdcolext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGB_INDEX
-#undef jsimd_ycc_rgb_convert_altivec
-
-#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-#define RGB_INDEX {9,0,1,8,11,2,3,10,13,4,5,12,15,6,7,14}
-#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extxrgb_convert_altivec
-#include "jdcolext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGB_INDEX
-#undef jsimd_ycc_rgb_convert_altivec
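jdcolor-altivec.c above defines no function body of its own: it is a dispatcher that re-includes jdcolext-altivec.c once per output pixel format, each pass redefining RGB_PIXELSIZE, the RGB_INDEX* permutation constants, and the function name, so the compiler stamps out one specialized converter per format (the NASM files below do the same with %define plus repeated %include). A self-contained C sketch of that specialization idea, using a macro instead of a separate include file so it compiles on its own; fill_rgb and fill_rgbx are hypothetical names, not libjpeg API:

    #include <stdint.h>

    /* One generic body, instantiated per pixel format.  PIXELSIZE is a
       compile-time constant, so the filler branch folds away. */
    #define DEFINE_FILL(NAME, PIXELSIZE)                                \
      static void NAME(uint8_t *out, int npix,                          \
                       uint8_t r, uint8_t g, uint8_t b)                 \
      {                                                                 \
        for (int i = 0; i < npix; i++) {                                \
          out[0] = r; out[1] = g; out[2] = b;                           \
          if ((PIXELSIZE) == 4) out[3] = 0xFF; /* X filler byte */      \
          out += (PIXELSIZE);                                           \
        }                                                               \
      }

    DEFINE_FILL(fill_rgb, 3)   /* RGB:  3 bytes per pixel */
    DEFINE_FILL(fill_rgbx, 4)  /* RGBX: 4 bytes per pixel */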
diff --git a/media/libjpeg/simd/jdcolor-mmx.asm b/media/libjpeg/simd/jdcolor-mmx.asm
deleted file mode 100644
index 4e58031dd0..0000000000
--- a/media/libjpeg/simd/jdcolor-mmx.asm
+++ /dev/null
@@ -1,119 +0,0 @@
-;
-; jdcolor.asm - colorspace conversion (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; and can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-
-%define SCALEBITS 16
-
-F_0_344 equ 22554 ; FIX(0.34414)
-F_0_714 equ 46802 ; FIX(0.71414)
-F_1_402 equ 91881 ; FIX(1.40200)
-F_1_772 equ 116130 ; FIX(1.77200)
-F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1)
-F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414)
-F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
- alignz 16
- global EXTN(jconst_ycc_rgb_convert_mmx)
-
-EXTN(jconst_ycc_rgb_convert_mmx):
-
-PW_F0402 times 4 dw F_0_402
-PW_MF0228 times 4 dw -F_0_228
-PW_MF0344_F0285 times 2 dw -F_0_344, F_0_285
-PW_ONE times 4 dw 1
-PD_ONEHALF times 2 dd 1 << (SCALEBITS-1)
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
-
-%include "jdcolext-mmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGB_RED
-%define RGB_GREEN EXT_RGB_GREEN
-%define RGB_BLUE EXT_RGB_BLUE
-%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extrgb_convert_mmx
-%include "jdcolext-mmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGBX_RED
-%define RGB_GREEN EXT_RGBX_GREEN
-%define RGB_BLUE EXT_RGBX_BLUE
-%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extrgbx_convert_mmx
-%include "jdcolext-mmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGR_RED
-%define RGB_GREEN EXT_BGR_GREEN
-%define RGB_BLUE EXT_BGR_BLUE
-%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extbgr_convert_mmx
-%include "jdcolext-mmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGRX_RED
-%define RGB_GREEN EXT_BGRX_GREEN
-%define RGB_BLUE EXT_BGRX_BLUE
-%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extbgrx_convert_mmx
-%include "jdcolext-mmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XBGR_RED
-%define RGB_GREEN EXT_XBGR_GREEN
-%define RGB_BLUE EXT_XBGR_BLUE
-%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extxbgr_convert_mmx
-%include "jdcolext-mmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XRGB_RED
-%define RGB_GREEN EXT_XRGB_GREEN
-%define RGB_BLUE EXT_XRGB_BLUE
-%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extxrgb_convert_mmx
-%include "jdcolext-mmx.asm"
diff --git a/media/libjpeg/simd/jdcolor-sse2-64.asm b/media/libjpeg/simd/jdcolor-sse2-64.asm
deleted file mode 100644
index d2bf210007..0000000000
--- a/media/libjpeg/simd/jdcolor-sse2-64.asm
+++ /dev/null
@@ -1,119 +0,0 @@
-;
-; jdcolor.asm - colorspace conversion (64-bit SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; and can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-
-%define SCALEBITS 16
-
-F_0_344 equ 22554 ; FIX(0.34414)
-F_0_714 equ 46802 ; FIX(0.71414)
-F_1_402 equ 91881 ; FIX(1.40200)
-F_1_772 equ 116130 ; FIX(1.77200)
-F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1)
-F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414)
-F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
- alignz 16
- global EXTN(jconst_ycc_rgb_convert_sse2)
-
-EXTN(jconst_ycc_rgb_convert_sse2):
-
-PW_F0402 times 8 dw F_0_402
-PW_MF0228 times 8 dw -F_0_228
-PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285
-PW_ONE times 8 dw 1
-PD_ONEHALF times 4 dd 1 << (SCALEBITS-1)
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 64
-
-%include "jdcolext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGB_RED
-%define RGB_GREEN EXT_RGB_GREEN
-%define RGB_BLUE EXT_RGB_BLUE
-%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgb_convert_sse2
-%include "jdcolext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGBX_RED
-%define RGB_GREEN EXT_RGBX_GREEN
-%define RGB_BLUE EXT_RGBX_BLUE
-%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgbx_convert_sse2
-%include "jdcolext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGR_RED
-%define RGB_GREEN EXT_BGR_GREEN
-%define RGB_BLUE EXT_BGR_BLUE
-%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgr_convert_sse2
-%include "jdcolext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGRX_RED
-%define RGB_GREEN EXT_BGRX_GREEN
-%define RGB_BLUE EXT_BGRX_BLUE
-%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgrx_convert_sse2
-%include "jdcolext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XBGR_RED
-%define RGB_GREEN EXT_XBGR_GREEN
-%define RGB_BLUE EXT_XBGR_BLUE
-%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxbgr_convert_sse2
-%include "jdcolext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XRGB_RED
-%define RGB_GREEN EXT_XRGB_GREEN
-%define RGB_BLUE EXT_XRGB_BLUE
-%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxrgb_convert_sse2
-%include "jdcolext-sse2-64.asm"
diff --git a/media/libjpeg/simd/jdcolor-sse2.asm b/media/libjpeg/simd/jdcolor-sse2.asm
deleted file mode 100644
index 7ff5d05d0c..0000000000
--- a/media/libjpeg/simd/jdcolor-sse2.asm
+++ /dev/null
@@ -1,119 +0,0 @@
-;
-; jdcolor.asm - colorspace conversion (SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; and can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-
-%define SCALEBITS 16
-
-F_0_344 equ 22554 ; FIX(0.34414)
-F_0_714 equ 46802 ; FIX(0.71414)
-F_1_402 equ 91881 ; FIX(1.40200)
-F_1_772 equ 116130 ; FIX(1.77200)
-F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1)
-F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414)
-F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
- alignz 16
- global EXTN(jconst_ycc_rgb_convert_sse2)
-
-EXTN(jconst_ycc_rgb_convert_sse2):
-
-PW_F0402 times 8 dw F_0_402
-PW_MF0228 times 8 dw -F_0_228
-PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285
-PW_ONE times 8 dw 1
-PD_ONEHALF times 4 dd 1 << (SCALEBITS-1)
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
-
-%include "jdcolext-sse2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGB_RED
-%define RGB_GREEN EXT_RGB_GREEN
-%define RGB_BLUE EXT_RGB_BLUE
-%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgb_convert_sse2
-%include "jdcolext-sse2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGBX_RED
-%define RGB_GREEN EXT_RGBX_GREEN
-%define RGB_BLUE EXT_RGBX_BLUE
-%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgbx_convert_sse2
-%include "jdcolext-sse2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGR_RED
-%define RGB_GREEN EXT_BGR_GREEN
-%define RGB_BLUE EXT_BGR_BLUE
-%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgr_convert_sse2
-%include "jdcolext-sse2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGRX_RED
-%define RGB_GREEN EXT_BGRX_GREEN
-%define RGB_BLUE EXT_BGRX_BLUE
-%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgrx_convert_sse2
-%include "jdcolext-sse2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XBGR_RED
-%define RGB_GREEN EXT_XBGR_GREEN
-%define RGB_BLUE EXT_XBGR_BLUE
-%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxbgr_convert_sse2
-%include "jdcolext-sse2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XRGB_RED
-%define RGB_GREEN EXT_XRGB_GREEN
-%define RGB_BLUE EXT_XRGB_BLUE
-%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxrgb_convert_sse2
-%include "jdcolext-sse2.asm"
diff --git a/media/libjpeg/simd/jdmerge-altivec.c b/media/libjpeg/simd/jdmerge-altivec.c
deleted file mode 100644
index 6a35f2019c..0000000000
--- a/media/libjpeg/simd/jdmerge-altivec.c
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * AltiVec optimizations for libjpeg-turbo
- *
- * Copyright (C) 2015, D. R. Commander. All Rights Reserved.
- *
- * This software is provided 'as-is', without any express or implied
- * warranty. In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- * claim that you wrote the original software. If you use this software
- * in a product, an acknowledgment in the product documentation would be
- * appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- * misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-/* MERGED YCC --> RGB CONVERSION AND UPSAMPLING */
-
-#include "jsimd_altivec.h"
-
-
-#define F_0_344 22554 /* FIX(0.34414) */
-#define F_0_714 46802 /* FIX(0.71414) */
-#define F_1_402 91881 /* FIX(1.40200) */
-#define F_1_772 116130 /* FIX(1.77200) */
-#define F_0_402 (F_1_402 - 65536) /* FIX(1.40200) - FIX(1) */
-#define F_0_285 (65536 - F_0_714) /* FIX(1) - FIX(0.71414) */
-#define F_0_228 (131072 - F_1_772) /* FIX(2) - FIX(1.77200) */
-
-#define SCALEBITS 16
-#define ONE_HALF (1 << (SCALEBITS - 1))
-
-#define RGB_INDEX0 {0,1,8,2,3,10,4,5,12,6,7,14,16,17,24,18}
-#define RGB_INDEX1 {3,10,4,5,12,6,7,14,16,17,24,18,19,26,20,21}
-#define RGB_INDEX2 {12,6,7,14,16,17,24,18,19,26,20,21,28,22,23,30}
-#include "jdmrgext-altivec.c"
-#undef RGB_PIXELSIZE
-
-#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-#define jsimd_h2v1_merged_upsample_altivec jsimd_h2v1_extrgb_merged_upsample_altivec
-#define jsimd_h2v2_merged_upsample_altivec jsimd_h2v2_extrgb_merged_upsample_altivec
-#include "jdmrgext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGB_INDEX0
-#undef RGB_INDEX1
-#undef RGB_INDEX2
-#undef jsimd_h2v1_merged_upsample_altivec
-#undef jsimd_h2v2_merged_upsample_altivec
-
-#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-#define RGB_INDEX {0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15}
-#define jsimd_h2v1_merged_upsample_altivec jsimd_h2v1_extrgbx_merged_upsample_altivec
-#define jsimd_h2v2_merged_upsample_altivec jsimd_h2v2_extrgbx_merged_upsample_altivec
-#include "jdmrgext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGB_INDEX
-#undef jsimd_h2v1_merged_upsample_altivec
-#undef jsimd_h2v2_merged_upsample_altivec
-
-#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-#define RGB_INDEX0 {8,1,0,10,3,2,12,5,4,14,7,6,24,17,16,26}
-#define RGB_INDEX1 {3,2,12,5,4,14,7,6,24,17,16,26,19,18,28,21}
-#define RGB_INDEX2 {4,14,7,6,24,17,16,26,19,18,28,21,20,30,23,22}
-#define jsimd_h2v1_merged_upsample_altivec jsimd_h2v1_extbgr_merged_upsample_altivec
-#define jsimd_h2v2_merged_upsample_altivec jsimd_h2v2_extbgr_merged_upsample_altivec
-#include "jdmrgext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGB_INDEX0
-#undef RGB_INDEX1
-#undef RGB_INDEX2
-#undef jsimd_h2v1_merged_upsample_altivec
-#undef jsimd_h2v2_merged_upsample_altivec
-
-#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-#define RGB_INDEX {8,1,0,9,10,3,2,11,12,5,4,13,14,7,6,15}
-#define jsimd_h2v1_merged_upsample_altivec jsimd_h2v1_extbgrx_merged_upsample_altivec
-#define jsimd_h2v2_merged_upsample_altivec jsimd_h2v2_extbgrx_merged_upsample_altivec
-#include "jdmrgext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGB_INDEX
-#undef jsimd_h2v1_merged_upsample_altivec
-#undef jsimd_h2v2_merged_upsample_altivec
-
-#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-#define RGB_INDEX {9,8,1,0,11,10,3,2,13,12,5,4,15,14,7,6}
-#define jsimd_h2v1_merged_upsample_altivec jsimd_h2v1_extxbgr_merged_upsample_altivec
-#define jsimd_h2v2_merged_upsample_altivec jsimd_h2v2_extxbgr_merged_upsample_altivec
-#include "jdmrgext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGB_INDEX
-#undef jsimd_h2v1_merged_upsample_altivec
-#undef jsimd_h2v2_merged_upsample_altivec
-
-#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-#define RGB_INDEX {9,0,1,8,11,2,3,10,13,4,5,12,15,6,7,14}
-#define jsimd_h2v1_merged_upsample_altivec jsimd_h2v1_extxrgb_merged_upsample_altivec
-#define jsimd_h2v2_merged_upsample_altivec jsimd_h2v2_extxrgb_merged_upsample_altivec
-#include "jdmrgext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGB_INDEX
-#undef jsimd_h2v1_merged_upsample_altivec
-#undef jsimd_h2v2_merged_upsample_altivec
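
The deleted file expresses the conversion coefficients as 16-bit binary fractions: F_0_344 = 22554 is FIX(0.34414) with SCALEBITS = 16, and the derived constants F_0_402/F_0_285/F_0_228 fold the integer part out of the multipliers. A small self-check, assuming FIX(x) rounds x * 2^16 as the comments indicate:

/* Verify that the constants above are FIX(x) = round(x * 2^16), and that
 * the derived values match the definitions in the deleted file (a sketch). */
#include <assert.h>
#include <stdio.h>

#define SCALEBITS 16
#define FIX(x)  ((long)((x) * (1L << SCALEBITS) + 0.5))

int main(void) {
  assert(FIX(0.34414) == 22554);    /* F_0_344 */
  assert(FIX(0.71414) == 46802);    /* F_0_714 */
  assert(FIX(1.40200) == 91881);    /* F_1_402 */
  assert(FIX(1.77200) == 116130);   /* F_1_772 */
  /* Derived constants drop the integer part so each multiplier fits a
   * signed 16-bit word: */
  assert(FIX(1.40200) - 65536 == 26345);    /* F_0_402 = FIX(1.402) - FIX(1) */
  assert(65536 - FIX(0.71414) == 18734);    /* F_0_285 = FIX(1) - FIX(0.71414) */
  assert(131072 - FIX(1.77200) == 14942);   /* F_0_228 = FIX(2) - FIX(1.772) */
  puts("all fixed-point constants check out");
  return 0;
}
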
diff --git a/media/libjpeg/simd/jdmerge-mmx.asm b/media/libjpeg/simd/jdmerge-mmx.asm
deleted file mode 100644
index ee58bff1c6..0000000000
--- a/media/libjpeg/simd/jdmerge-mmx.asm
+++ /dev/null
@@ -1,125 +0,0 @@
-;
-; jdmerge.asm - merged upsampling/color conversion (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; and can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-
-%define SCALEBITS 16
-
-F_0_344 equ 22554 ; FIX(0.34414)
-F_0_714 equ 46802 ; FIX(0.71414)
-F_1_402 equ 91881 ; FIX(1.40200)
-F_1_772 equ 116130 ; FIX(1.77200)
-F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1)
-F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414)
-F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
- alignz 16
- global EXTN(jconst_merged_upsample_mmx)
-
-EXTN(jconst_merged_upsample_mmx):
-
-PW_F0402 times 4 dw F_0_402
-PW_MF0228 times 4 dw -F_0_228
-PW_MF0344_F0285 times 2 dw -F_0_344, F_0_285
-PW_ONE times 4 dw 1
-PD_ONEHALF times 2 dd 1 << (SCALEBITS-1)
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
-
-%include "jdmrgext-mmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGB_RED
-%define RGB_GREEN EXT_RGB_GREEN
-%define RGB_BLUE EXT_RGB_BLUE
-%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extrgb_merged_upsample_mmx
-%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extrgb_merged_upsample_mmx
-%include "jdmrgext-mmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGBX_RED
-%define RGB_GREEN EXT_RGBX_GREEN
-%define RGB_BLUE EXT_RGBX_BLUE
-%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extrgbx_merged_upsample_mmx
-%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extrgbx_merged_upsample_mmx
-%include "jdmrgext-mmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGR_RED
-%define RGB_GREEN EXT_BGR_GREEN
-%define RGB_BLUE EXT_BGR_BLUE
-%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extbgr_merged_upsample_mmx
-%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extbgr_merged_upsample_mmx
-%include "jdmrgext-mmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGRX_RED
-%define RGB_GREEN EXT_BGRX_GREEN
-%define RGB_BLUE EXT_BGRX_BLUE
-%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extbgrx_merged_upsample_mmx
-%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extbgrx_merged_upsample_mmx
-%include "jdmrgext-mmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XBGR_RED
-%define RGB_GREEN EXT_XBGR_GREEN
-%define RGB_BLUE EXT_XBGR_BLUE
-%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extxbgr_merged_upsample_mmx
-%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extxbgr_merged_upsample_mmx
-%include "jdmrgext-mmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XRGB_RED
-%define RGB_GREEN EXT_XRGB_GREEN
-%define RGB_BLUE EXT_XRGB_BLUE
-%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extxrgb_merged_upsample_mmx
-%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extxrgb_merged_upsample_mmx
-%include "jdmrgext-mmx.asm"
diff --git a/media/libjpeg/simd/jdmerge-sse2-64.asm b/media/libjpeg/simd/jdmerge-sse2-64.asm
deleted file mode 100644
index 244bd40234..0000000000
--- a/media/libjpeg/simd/jdmerge-sse2-64.asm
+++ /dev/null
@@ -1,125 +0,0 @@
-;
-; jdmerge.asm - merged upsampling/color conversion (64-bit SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; and can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-
-%define SCALEBITS 16
-
-F_0_344 equ 22554 ; FIX(0.34414)
-F_0_714 equ 46802 ; FIX(0.71414)
-F_1_402 equ 91881 ; FIX(1.40200)
-F_1_772 equ 116130 ; FIX(1.77200)
-F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1)
-F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414)
-F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
- alignz 16
- global EXTN(jconst_merged_upsample_sse2)
-
-EXTN(jconst_merged_upsample_sse2):
-
-PW_F0402 times 8 dw F_0_402
-PW_MF0228 times 8 dw -F_0_228
-PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285
-PW_ONE times 8 dw 1
-PD_ONEHALF times 4 dd 1 << (SCALEBITS-1)
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 64
-
-%include "jdmrgext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGB_RED
-%define RGB_GREEN EXT_RGB_GREEN
-%define RGB_BLUE EXT_RGB_BLUE
-%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extrgb_merged_upsample_sse2
-%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extrgb_merged_upsample_sse2
-%include "jdmrgext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGBX_RED
-%define RGB_GREEN EXT_RGBX_GREEN
-%define RGB_BLUE EXT_RGBX_BLUE
-%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extrgbx_merged_upsample_sse2
-%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extrgbx_merged_upsample_sse2
-%include "jdmrgext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGR_RED
-%define RGB_GREEN EXT_BGR_GREEN
-%define RGB_BLUE EXT_BGR_BLUE
-%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extbgr_merged_upsample_sse2
-%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extbgr_merged_upsample_sse2
-%include "jdmrgext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGRX_RED
-%define RGB_GREEN EXT_BGRX_GREEN
-%define RGB_BLUE EXT_BGRX_BLUE
-%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extbgrx_merged_upsample_sse2
-%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extbgrx_merged_upsample_sse2
-%include "jdmrgext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XBGR_RED
-%define RGB_GREEN EXT_XBGR_GREEN
-%define RGB_BLUE EXT_XBGR_BLUE
-%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extxbgr_merged_upsample_sse2
-%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extxbgr_merged_upsample_sse2
-%include "jdmrgext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XRGB_RED
-%define RGB_GREEN EXT_XRGB_GREEN
-%define RGB_BLUE EXT_XRGB_BLUE
-%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extxrgb_merged_upsample_sse2
-%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extxrgb_merged_upsample_sse2
-%include "jdmrgext-sse2-64.asm"
diff --git a/media/libjpeg/simd/jdmerge-sse2.asm b/media/libjpeg/simd/jdmerge-sse2.asm
deleted file mode 100644
index 236de5a385..0000000000
--- a/media/libjpeg/simd/jdmerge-sse2.asm
+++ /dev/null
@@ -1,125 +0,0 @@
-;
-; jdmerge.asm - merged upsampling/color conversion (SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; and can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-
-%define SCALEBITS 16
-
-F_0_344 equ 22554 ; FIX(0.34414)
-F_0_714 equ 46802 ; FIX(0.71414)
-F_1_402 equ 91881 ; FIX(1.40200)
-F_1_772 equ 116130 ; FIX(1.77200)
-F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1)
-F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414)
-F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
- alignz 16
- global EXTN(jconst_merged_upsample_sse2)
-
-EXTN(jconst_merged_upsample_sse2):
-
-PW_F0402 times 8 dw F_0_402
-PW_MF0228 times 8 dw -F_0_228
-PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285
-PW_ONE times 8 dw 1
-PD_ONEHALF times 4 dd 1 << (SCALEBITS-1)
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
-
-%include "jdmrgext-sse2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGB_RED
-%define RGB_GREEN EXT_RGB_GREEN
-%define RGB_BLUE EXT_RGB_BLUE
-%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extrgb_merged_upsample_sse2
-%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extrgb_merged_upsample_sse2
-%include "jdmrgext-sse2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGBX_RED
-%define RGB_GREEN EXT_RGBX_GREEN
-%define RGB_BLUE EXT_RGBX_BLUE
-%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extrgbx_merged_upsample_sse2
-%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extrgbx_merged_upsample_sse2
-%include "jdmrgext-sse2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGR_RED
-%define RGB_GREEN EXT_BGR_GREEN
-%define RGB_BLUE EXT_BGR_BLUE
-%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extbgr_merged_upsample_sse2
-%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extbgr_merged_upsample_sse2
-%include "jdmrgext-sse2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGRX_RED
-%define RGB_GREEN EXT_BGRX_GREEN
-%define RGB_BLUE EXT_BGRX_BLUE
-%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extbgrx_merged_upsample_sse2
-%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extbgrx_merged_upsample_sse2
-%include "jdmrgext-sse2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XBGR_RED
-%define RGB_GREEN EXT_XBGR_GREEN
-%define RGB_BLUE EXT_XBGR_BLUE
-%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extxbgr_merged_upsample_sse2
-%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extxbgr_merged_upsample_sse2
-%include "jdmrgext-sse2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XRGB_RED
-%define RGB_GREEN EXT_XRGB_GREEN
-%define RGB_BLUE EXT_XRGB_BLUE
-%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extxrgb_merged_upsample_sse2
-%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extxrgb_merged_upsample_sse2
-%include "jdmrgext-sse2.asm"
diff --git a/media/libjpeg/simd/jdmrgext-mmx.asm b/media/libjpeg/simd/jdmrgext-mmx.asm
deleted file mode 100644
index 63f45cf373..0000000000
--- a/media/libjpeg/simd/jdmrgext-mmx.asm
+++ /dev/null
@@ -1,463 +0,0 @@
-;
-; jdmrgext.asm - merged upsampling/color conversion (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; and can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jcolsamp.inc"
-
-; --------------------------------------------------------------------------
-;
-; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
-;
-; GLOBAL(void)
-; jsimd_h2v1_merged_upsample_mmx (JDIMENSION output_width,
-; JSAMPIMAGE input_buf,
-; JDIMENSION in_row_group_ctr,
-; JSAMPARRAY output_buf);
-;
-
-%define output_width(b) (b)+8 ; JDIMENSION output_width
-%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf
-%define in_row_group_ctr(b) (b)+16 ; JDIMENSION in_row_group_ctr
-%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf
-
-%define original_ebp ebp+0
-%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM]
-%define WK_NUM 3
-%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
-
- align 16
- global EXTN(jsimd_h2v1_merged_upsample_mmx)
-
-EXTN(jsimd_h2v1_merged_upsample_mmx):
- push ebp
- mov eax,esp ; eax = original ebp
- sub esp, byte 4
- and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
- mov [esp],eax
- mov ebp,esp ; ebp = aligned ebp
- lea esp, [wk(0)]
- pushpic eax ; make room for GOT address
- push ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- get_GOT ebx ; get GOT address
- movpic POINTER [gotptr], ebx ; save GOT address
-
- mov ecx, JDIMENSION [output_width(eax)] ; col
- test ecx,ecx
- jz near .return
-
- push ecx
-
- mov edi, JSAMPIMAGE [input_buf(eax)]
- mov ecx, JDIMENSION [in_row_group_ctr(eax)]
- mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
- mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
- mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
- mov edi, JSAMPARRAY [output_buf(eax)]
- mov esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW] ; inptr0
- mov ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW] ; inptr1
- mov edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW] ; inptr2
- mov edi, JSAMPROW [edi] ; outptr
-
- pop ecx ; col
-
- alignx 16,7
-.columnloop:
- movpic eax, POINTER [gotptr] ; load GOT address (eax)
-
- movq mm6, MMWORD [ebx] ; mm6=Cb(01234567)
- movq mm7, MMWORD [edx] ; mm7=Cr(01234567)
-
- pxor mm1,mm1 ; mm1=(all 0's)
- pcmpeqw mm3,mm3
- psllw mm3,7 ; mm3={0xFF80 0xFF80 0xFF80 0xFF80}
-
- movq mm4,mm6
- punpckhbw mm6,mm1 ; mm6=Cb(4567)=CbH
- punpcklbw mm4,mm1 ; mm4=Cb(0123)=CbL
- movq mm0,mm7
- punpckhbw mm7,mm1 ; mm7=Cr(4567)=CrH
- punpcklbw mm0,mm1 ; mm0=Cr(0123)=CrL
-
- paddw mm6,mm3
- paddw mm4,mm3
- paddw mm7,mm3
- paddw mm0,mm3
-
- ; (Original)
- ; R = Y + 1.40200 * Cr
- ; G = Y - 0.34414 * Cb - 0.71414 * Cr
- ; B = Y + 1.77200 * Cb
- ;
- ; (This implementation)
- ; R = Y + 0.40200 * Cr + Cr
- ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
- ; B = Y - 0.22800 * Cb + Cb + Cb
-
- movq mm5,mm6 ; mm5=CbH
- movq mm2,mm4 ; mm2=CbL
- paddw mm6,mm6 ; mm6=2*CbH
- paddw mm4,mm4 ; mm4=2*CbL
- movq mm1,mm7 ; mm1=CrH
- movq mm3,mm0 ; mm3=CrL
- paddw mm7,mm7 ; mm7=2*CrH
- paddw mm0,mm0 ; mm0=2*CrL
-
- pmulhw mm6,[GOTOFF(eax,PW_MF0228)] ; mm6=(2*CbH * -FIX(0.22800))
- pmulhw mm4,[GOTOFF(eax,PW_MF0228)] ; mm4=(2*CbL * -FIX(0.22800))
- pmulhw mm7,[GOTOFF(eax,PW_F0402)] ; mm7=(2*CrH * FIX(0.40200))
- pmulhw mm0,[GOTOFF(eax,PW_F0402)] ; mm0=(2*CrL * FIX(0.40200))
-
- paddw mm6,[GOTOFF(eax,PW_ONE)]
- paddw mm4,[GOTOFF(eax,PW_ONE)]
- psraw mm6,1 ; mm6=(CbH * -FIX(0.22800))
- psraw mm4,1 ; mm4=(CbL * -FIX(0.22800))
- paddw mm7,[GOTOFF(eax,PW_ONE)]
- paddw mm0,[GOTOFF(eax,PW_ONE)]
- psraw mm7,1 ; mm7=(CrH * FIX(0.40200))
- psraw mm0,1 ; mm0=(CrL * FIX(0.40200))
-
- paddw mm6,mm5
- paddw mm4,mm2
- paddw mm6,mm5 ; mm6=(CbH * FIX(1.77200))=(B-Y)H
- paddw mm4,mm2 ; mm4=(CbL * FIX(1.77200))=(B-Y)L
- paddw mm7,mm1 ; mm7=(CrH * FIX(1.40200))=(R-Y)H
- paddw mm0,mm3 ; mm0=(CrL * FIX(1.40200))=(R-Y)L
-
- movq MMWORD [wk(0)], mm6 ; wk(0)=(B-Y)H
- movq MMWORD [wk(1)], mm7 ; wk(1)=(R-Y)H
-
- movq mm6,mm5
- movq mm7,mm2
- punpcklwd mm5,mm1
- punpckhwd mm6,mm1
- pmaddwd mm5,[GOTOFF(eax,PW_MF0344_F0285)]
- pmaddwd mm6,[GOTOFF(eax,PW_MF0344_F0285)]
- punpcklwd mm2,mm3
- punpckhwd mm7,mm3
- pmaddwd mm2,[GOTOFF(eax,PW_MF0344_F0285)]
- pmaddwd mm7,[GOTOFF(eax,PW_MF0344_F0285)]
-
- paddd mm5,[GOTOFF(eax,PD_ONEHALF)]
- paddd mm6,[GOTOFF(eax,PD_ONEHALF)]
- psrad mm5,SCALEBITS
- psrad mm6,SCALEBITS
- paddd mm2,[GOTOFF(eax,PD_ONEHALF)]
- paddd mm7,[GOTOFF(eax,PD_ONEHALF)]
- psrad mm2,SCALEBITS
- psrad mm7,SCALEBITS
-
- packssdw mm5,mm6 ; mm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
- packssdw mm2,mm7 ; mm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
- psubw mm5,mm1 ; mm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
- psubw mm2,mm3 ; mm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
-
- movq MMWORD [wk(2)], mm5 ; wk(2)=(G-Y)H
-
- mov al,2 ; Yctr
- jmp short .Yloop_1st
- alignx 16,7
-
-.Yloop_2nd:
- movq mm0, MMWORD [wk(1)] ; mm0=(R-Y)H
- movq mm2, MMWORD [wk(2)] ; mm2=(G-Y)H
- movq mm4, MMWORD [wk(0)] ; mm4=(B-Y)H
- alignx 16,7
-
-.Yloop_1st:
- movq mm7, MMWORD [esi] ; mm7=Y(01234567)
-
- pcmpeqw mm6,mm6
- psrlw mm6,BYTE_BIT ; mm6={0xFF 0x00 0xFF 0x00 ..}
- pand mm6,mm7 ; mm6=Y(0246)=YE
- psrlw mm7,BYTE_BIT ; mm7=Y(1357)=YO
-
- movq mm1,mm0 ; mm1=mm0=(R-Y)(L/H)
- movq mm3,mm2 ; mm3=mm2=(G-Y)(L/H)
- movq mm5,mm4 ; mm5=mm4=(B-Y)(L/H)
-
- paddw mm0,mm6 ; mm0=((R-Y)+YE)=RE=(R0 R2 R4 R6)
- paddw mm1,mm7 ; mm1=((R-Y)+YO)=RO=(R1 R3 R5 R7)
- packuswb mm0,mm0 ; mm0=(R0 R2 R4 R6 ** ** ** **)
- packuswb mm1,mm1 ; mm1=(R1 R3 R5 R7 ** ** ** **)
-
- paddw mm2,mm6 ; mm2=((G-Y)+YE)=GE=(G0 G2 G4 G6)
- paddw mm3,mm7 ; mm3=((G-Y)+YO)=GO=(G1 G3 G5 G7)
- packuswb mm2,mm2 ; mm2=(G0 G2 G4 G6 ** ** ** **)
- packuswb mm3,mm3 ; mm3=(G1 G3 G5 G7 ** ** ** **)
-
- paddw mm4,mm6 ; mm4=((B-Y)+YE)=BE=(B0 B2 B4 B6)
- paddw mm5,mm7 ; mm5=((B-Y)+YO)=BO=(B1 B3 B5 B7)
- packuswb mm4,mm4 ; mm4=(B0 B2 B4 B6 ** ** ** **)
- packuswb mm5,mm5 ; mm5=(B1 B3 B5 B7 ** ** ** **)
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
- ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
- ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
- ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
- ; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **)
-
- punpcklbw mmA,mmC ; mmA=(00 10 02 12 04 14 06 16)
- punpcklbw mmE,mmB ; mmE=(20 01 22 03 24 05 26 07)
- punpcklbw mmD,mmF ; mmD=(11 21 13 23 15 25 17 27)
-
- movq mmG,mmA
- movq mmH,mmA
- punpcklwd mmA,mmE ; mmA=(00 10 20 01 02 12 22 03)
- punpckhwd mmG,mmE ; mmG=(04 14 24 05 06 16 26 07)
-
- psrlq mmH,2*BYTE_BIT ; mmH=(02 12 04 14 06 16 -- --)
- psrlq mmE,2*BYTE_BIT ; mmE=(22 03 24 05 26 07 -- --)
-
- movq mmC,mmD
- movq mmB,mmD
- punpcklwd mmD,mmH ; mmD=(11 21 02 12 13 23 04 14)
- punpckhwd mmC,mmH ; mmC=(15 25 06 16 17 27 -- --)
-
- psrlq mmB,2*BYTE_BIT ; mmB=(13 23 15 25 17 27 -- --)
-
- movq mmF,mmE
- punpcklwd mmE,mmB ; mmE=(22 03 13 23 24 05 15 25)
- punpckhwd mmF,mmB ; mmF=(26 07 17 27 -- -- -- --)
-
- punpckldq mmA,mmD ; mmA=(00 10 20 01 11 21 02 12)
- punpckldq mmE,mmG ; mmE=(22 03 13 23 04 14 24 05)
- punpckldq mmC,mmF ; mmC=(15 25 06 16 26 07 17 27)
-
- cmp ecx, byte SIZEOF_MMWORD
- jb short .column_st16
-
- movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
- movq MMWORD [edi+1*SIZEOF_MMWORD], mmE
- movq MMWORD [edi+2*SIZEOF_MMWORD], mmC
-
- sub ecx, byte SIZEOF_MMWORD
- jz near .endcolumn
-
- add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; outptr
- add esi, byte SIZEOF_MMWORD ; inptr0
- dec al ; Yctr
- jnz near .Yloop_2nd
-
- add ebx, byte SIZEOF_MMWORD ; inptr1
- add edx, byte SIZEOF_MMWORD ; inptr2
- jmp near .columnloop
- alignx 16,7
-
-.column_st16:
- lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
- cmp ecx, byte 2*SIZEOF_MMWORD
- jb short .column_st8
- movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
- movq MMWORD [edi+1*SIZEOF_MMWORD], mmE
- movq mmA,mmC
- sub ecx, byte 2*SIZEOF_MMWORD
- add edi, byte 2*SIZEOF_MMWORD
- jmp short .column_st4
-.column_st8:
- cmp ecx, byte SIZEOF_MMWORD
- jb short .column_st4
- movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
- movq mmA,mmE
- sub ecx, byte SIZEOF_MMWORD
- add edi, byte SIZEOF_MMWORD
-.column_st4:
- movd eax,mmA
- cmp ecx, byte SIZEOF_DWORD
- jb short .column_st2
- mov DWORD [edi+0*SIZEOF_DWORD], eax
- psrlq mmA,DWORD_BIT
- movd eax,mmA
- sub ecx, byte SIZEOF_DWORD
- add edi, byte SIZEOF_DWORD
-.column_st2:
- cmp ecx, byte SIZEOF_WORD
- jb short .column_st1
- mov WORD [edi+0*SIZEOF_WORD], ax
- shr eax,WORD_BIT
- sub ecx, byte SIZEOF_WORD
- add edi, byte SIZEOF_WORD
-.column_st1:
- cmp ecx, byte SIZEOF_BYTE
- jb short .endcolumn
- mov BYTE [edi+0*SIZEOF_BYTE], al
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-%ifdef RGBX_FILLER_0XFF
- pcmpeqb mm6,mm6 ; mm6=(X0 X2 X4 X6 ** ** ** **)
- pcmpeqb mm7,mm7 ; mm7=(X1 X3 X5 X7 ** ** ** **)
-%else
- pxor mm6,mm6 ; mm6=(X0 X2 X4 X6 ** ** ** **)
- pxor mm7,mm7 ; mm7=(X1 X3 X5 X7 ** ** ** **)
-%endif
- ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
- ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
- ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
- ; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **)
-
- punpcklbw mmA,mmC ; mmA=(00 10 02 12 04 14 06 16)
- punpcklbw mmE,mmG ; mmE=(20 30 22 32 24 34 26 36)
- punpcklbw mmB,mmD ; mmB=(01 11 03 13 05 15 07 17)
- punpcklbw mmF,mmH ; mmF=(21 31 23 33 25 35 27 37)
-
- movq mmC,mmA
- punpcklwd mmA,mmE ; mmA=(00 10 20 30 02 12 22 32)
- punpckhwd mmC,mmE ; mmC=(04 14 24 34 06 16 26 36)
- movq mmG,mmB
- punpcklwd mmB,mmF ; mmB=(01 11 21 31 03 13 23 33)
- punpckhwd mmG,mmF ; mmG=(05 15 25 35 07 17 27 37)
-
- movq mmD,mmA
- punpckldq mmA,mmB ; mmA=(00 10 20 30 01 11 21 31)
- punpckhdq mmD,mmB ; mmD=(02 12 22 32 03 13 23 33)
- movq mmH,mmC
- punpckldq mmC,mmG ; mmC=(04 14 24 34 05 15 25 35)
- punpckhdq mmH,mmG ; mmH=(06 16 26 36 07 17 27 37)
-
- cmp ecx, byte SIZEOF_MMWORD
- jb short .column_st16
-
- movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
- movq MMWORD [edi+1*SIZEOF_MMWORD], mmD
- movq MMWORD [edi+2*SIZEOF_MMWORD], mmC
- movq MMWORD [edi+3*SIZEOF_MMWORD], mmH
-
- sub ecx, byte SIZEOF_MMWORD
- jz short .endcolumn
-
- add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; outptr
- add esi, byte SIZEOF_MMWORD ; inptr0
- dec al ; Yctr
- jnz near .Yloop_2nd
-
- add ebx, byte SIZEOF_MMWORD ; inptr1
- add edx, byte SIZEOF_MMWORD ; inptr2
- jmp near .columnloop
- alignx 16,7
-
-.column_st16:
- cmp ecx, byte SIZEOF_MMWORD/2
- jb short .column_st8
- movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
- movq MMWORD [edi+1*SIZEOF_MMWORD], mmD
- movq mmA,mmC
- movq mmD,mmH
- sub ecx, byte SIZEOF_MMWORD/2
- add edi, byte 2*SIZEOF_MMWORD
-.column_st8:
- cmp ecx, byte SIZEOF_MMWORD/4
- jb short .column_st4
- movq MMWORD [edi+0*SIZEOF_MMWORD], mmA
- movq mmA,mmD
- sub ecx, byte SIZEOF_MMWORD/4
- add edi, byte 1*SIZEOF_MMWORD
-.column_st4:
- cmp ecx, byte SIZEOF_MMWORD/8
- jb short .endcolumn
- movd DWORD [edi+0*SIZEOF_DWORD], mmA
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
-.endcolumn:
- emms ; empty MMX state
-
-.return:
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- pop ebx
- mov esp,ebp ; esp <- aligned ebp
- pop esp ; esp <- original ebp
- pop ebp
- ret
-
-; --------------------------------------------------------------------------
-;
-; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
-;
-; GLOBAL(void)
-; jsimd_h2v2_merged_upsample_mmx (JDIMENSION output_width,
-; JSAMPIMAGE input_buf,
-; JDIMENSION in_row_group_ctr,
-; JSAMPARRAY output_buf);
-;
-
-%define output_width(b) (b)+8 ; JDIMENSION output_width
-%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf
-%define in_row_group_ctr(b) (b)+16 ; JDIMENSION in_row_group_ctr
-%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf
-
- align 16
- global EXTN(jsimd_h2v2_merged_upsample_mmx)
-
-EXTN(jsimd_h2v2_merged_upsample_mmx):
- push ebp
- mov ebp,esp
- push ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- mov eax, JDIMENSION [output_width(ebp)]
-
- mov edi, JSAMPIMAGE [input_buf(ebp)]
- mov ecx, JDIMENSION [in_row_group_ctr(ebp)]
- mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
- mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
- mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
- mov edi, JSAMPARRAY [output_buf(ebp)]
- lea esi, [esi+ecx*SIZEOF_JSAMPROW]
-
- push edx ; inptr2
- push ebx ; inptr1
- push esi ; inptr00
- mov ebx,esp
-
- push edi ; output_buf (outptr0)
- push ecx ; in_row_group_ctr
- push ebx ; input_buf
- push eax ; output_width
-
- call near EXTN(jsimd_h2v1_merged_upsample_mmx)
-
- add esi, byte SIZEOF_JSAMPROW ; inptr01
- add edi, byte SIZEOF_JSAMPROW ; outptr1
- mov POINTER [ebx+0*SIZEOF_POINTER], esi
- mov POINTER [ebx-1*SIZEOF_POINTER], edi
-
- call near EXTN(jsimd_h2v1_merged_upsample_mmx)
-
- add esp, byte 7*SIZEOF_DWORD
-
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- pop ebx
- pop ebp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
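
Note that jsimd_h2v2_merged_upsample_mmx above contains no conversion code of its own: one chroma row drives two output rows, so it builds an argument block on the stack and calls the h2v1 kernel twice, advancing only the Y-row and output-row pointers in between. The same shape in C (h2v1_merged_upsample() is assumed to be a scalar kernel like the one sketched earlier):

/* h2v2 as two h2v1 passes: same Cb/Cr row, successive Y and output rows. */
void h2v1_merged_upsample(int width, const unsigned char *y,
                          const unsigned char *cb, const unsigned char *cr,
                          unsigned char *rgb);

void h2v2_merged_upsample(int width, const unsigned char *y_rows[2],
                          const unsigned char *cb, const unsigned char *cr,
                          unsigned char *rgb_rows[2]) {
  /* The chroma row is reused; only the Y and output rows advance. */
  h2v1_merged_upsample(width, y_rows[0], cb, cr, rgb_rows[0]);
  h2v1_merged_upsample(width, y_rows[1], cb, cr, rgb_rows[1]);
}
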
diff --git a/media/libjpeg/simd/jdmrgext-sse2-64.asm b/media/libjpeg/simd/jdmrgext-sse2-64.asm
deleted file mode 100644
index ad74c5ff4d..0000000000
--- a/media/libjpeg/simd/jdmrgext-sse2-64.asm
+++ /dev/null
@@ -1,537 +0,0 @@
-;
-; jdmrgext.asm - merged upsampling/color conversion (64-bit SSE2)
-;
-; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, 2012, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; and can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jcolsamp.inc"
-
-; --------------------------------------------------------------------------
-;
-; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
-;
-; GLOBAL(void)
-; jsimd_h2v1_merged_upsample_sse2 (JDIMENSION output_width,
-; JSAMPIMAGE input_buf,
-; JDIMENSION in_row_group_ctr,
-; JSAMPARRAY output_buf);
-;
-
-; r10 = JDIMENSION output_width
-; r11 = JSAMPIMAGE input_buf
-; r12 = JDIMENSION in_row_group_ctr
-; r13 = JSAMPARRAY output_buf
-
-%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 3
-
- align 16
- global EXTN(jsimd_h2v1_merged_upsample_sse2)
-
-EXTN(jsimd_h2v1_merged_upsample_sse2):
- push rbp
- mov rax,rsp ; rax = original rbp
- sub rsp, byte 4
- and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [rsp],rax
- mov rbp,rsp ; rbp = aligned rbp
- lea rsp, [wk(0)]
- collect_args
- push rbx
-
- mov ecx, r10d ; col
- test rcx,rcx
- jz near .return
-
- push rcx
-
- mov rdi, r11
- mov ecx, r12d
- mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
- mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
- mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
- mov rdi, r13
- mov rsi, JSAMPROW [rsi+rcx*SIZEOF_JSAMPROW] ; inptr0
- mov rbx, JSAMPROW [rbx+rcx*SIZEOF_JSAMPROW] ; inptr1
- mov rdx, JSAMPROW [rdx+rcx*SIZEOF_JSAMPROW] ; inptr2
- mov rdi, JSAMPROW [rdi] ; outptr
-
- pop rcx ; col
-
-.columnloop:
-
- movdqa xmm6, XMMWORD [rbx] ; xmm6=Cb(0123456789ABCDEF)
- movdqa xmm7, XMMWORD [rdx] ; xmm7=Cr(0123456789ABCDEF)
-
- pxor xmm1,xmm1 ; xmm1=(all 0's)
- pcmpeqw xmm3,xmm3
- psllw xmm3,7 ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
-
- movdqa xmm4,xmm6
- punpckhbw xmm6,xmm1 ; xmm6=Cb(89ABCDEF)=CbH
- punpcklbw xmm4,xmm1 ; xmm4=Cb(01234567)=CbL
- movdqa xmm0,xmm7
- punpckhbw xmm7,xmm1 ; xmm7=Cr(89ABCDEF)=CrH
- punpcklbw xmm0,xmm1 ; xmm0=Cr(01234567)=CrL
-
- paddw xmm6,xmm3
- paddw xmm4,xmm3
- paddw xmm7,xmm3
- paddw xmm0,xmm3
-
- ; (Original)
- ; R = Y + 1.40200 * Cr
- ; G = Y - 0.34414 * Cb - 0.71414 * Cr
- ; B = Y + 1.77200 * Cb
- ;
- ; (This implementation)
- ; R = Y + 0.40200 * Cr + Cr
- ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
- ; B = Y - 0.22800 * Cb + Cb + Cb
-
- movdqa xmm5,xmm6 ; xmm5=CbH
- movdqa xmm2,xmm4 ; xmm2=CbL
- paddw xmm6,xmm6 ; xmm6=2*CbH
- paddw xmm4,xmm4 ; xmm4=2*CbL
- movdqa xmm1,xmm7 ; xmm1=CrH
- movdqa xmm3,xmm0 ; xmm3=CrL
- paddw xmm7,xmm7 ; xmm7=2*CrH
- paddw xmm0,xmm0 ; xmm0=2*CrL
-
- pmulhw xmm6,[rel PW_MF0228] ; xmm6=(2*CbH * -FIX(0.22800))
- pmulhw xmm4,[rel PW_MF0228] ; xmm4=(2*CbL * -FIX(0.22800))
- pmulhw xmm7,[rel PW_F0402] ; xmm7=(2*CrH * FIX(0.40200))
- pmulhw xmm0,[rel PW_F0402] ; xmm0=(2*CrL * FIX(0.40200))
-
- paddw xmm6,[rel PW_ONE]
- paddw xmm4,[rel PW_ONE]
- psraw xmm6,1 ; xmm6=(CbH * -FIX(0.22800))
- psraw xmm4,1 ; xmm4=(CbL * -FIX(0.22800))
- paddw xmm7,[rel PW_ONE]
- paddw xmm0,[rel PW_ONE]
- psraw xmm7,1 ; xmm7=(CrH * FIX(0.40200))
- psraw xmm0,1 ; xmm0=(CrL * FIX(0.40200))
-
- paddw xmm6,xmm5
- paddw xmm4,xmm2
- paddw xmm6,xmm5 ; xmm6=(CbH * FIX(1.77200))=(B-Y)H
- paddw xmm4,xmm2 ; xmm4=(CbL * FIX(1.77200))=(B-Y)L
- paddw xmm7,xmm1 ; xmm7=(CrH * FIX(1.40200))=(R-Y)H
- paddw xmm0,xmm3 ; xmm0=(CrL * FIX(1.40200))=(R-Y)L
-
- movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=(B-Y)H
- movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(R-Y)H
-
- movdqa xmm6,xmm5
- movdqa xmm7,xmm2
- punpcklwd xmm5,xmm1
- punpckhwd xmm6,xmm1
- pmaddwd xmm5,[rel PW_MF0344_F0285]
- pmaddwd xmm6,[rel PW_MF0344_F0285]
- punpcklwd xmm2,xmm3
- punpckhwd xmm7,xmm3
- pmaddwd xmm2,[rel PW_MF0344_F0285]
- pmaddwd xmm7,[rel PW_MF0344_F0285]
-
- paddd xmm5,[rel PD_ONEHALF]
- paddd xmm6,[rel PD_ONEHALF]
- psrad xmm5,SCALEBITS
- psrad xmm6,SCALEBITS
- paddd xmm2,[rel PD_ONEHALF]
- paddd xmm7,[rel PD_ONEHALF]
- psrad xmm2,SCALEBITS
- psrad xmm7,SCALEBITS
-
- packssdw xmm5,xmm6 ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
- packssdw xmm2,xmm7 ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
- psubw xmm5,xmm1 ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
- psubw xmm2,xmm3 ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
-
- movdqa XMMWORD [wk(2)], xmm5 ; wk(2)=(G-Y)H
-
- mov al,2 ; Yctr
- jmp short .Yloop_1st
-
-.Yloop_2nd:
- movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(R-Y)H
- movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(G-Y)H
- movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(B-Y)H
-
-.Yloop_1st:
- movdqa xmm7, XMMWORD [rsi] ; xmm7=Y(0123456789ABCDEF)
-
- pcmpeqw xmm6,xmm6
- psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
- pand xmm6,xmm7 ; xmm6=Y(02468ACE)=YE
- psrlw xmm7,BYTE_BIT ; xmm7=Y(13579BDF)=YO
-
- movdqa xmm1,xmm0 ; xmm1=xmm0=(R-Y)(L/H)
- movdqa xmm3,xmm2 ; xmm3=xmm2=(G-Y)(L/H)
- movdqa xmm5,xmm4 ; xmm5=xmm4=(B-Y)(L/H)
-
- paddw xmm0,xmm6 ; xmm0=((R-Y)+YE)=RE=R(02468ACE)
- paddw xmm1,xmm7 ; xmm1=((R-Y)+YO)=RO=R(13579BDF)
- packuswb xmm0,xmm0 ; xmm0=R(02468ACE********)
- packuswb xmm1,xmm1 ; xmm1=R(13579BDF********)
-
- paddw xmm2,xmm6 ; xmm2=((G-Y)+YE)=GE=G(02468ACE)
- paddw xmm3,xmm7 ; xmm3=((G-Y)+YO)=GO=G(13579BDF)
- packuswb xmm2,xmm2 ; xmm2=G(02468ACE********)
- packuswb xmm3,xmm3 ; xmm3=G(13579BDF********)
-
- paddw xmm4,xmm6 ; xmm4=((B-Y)+YE)=BE=B(02468ACE)
- paddw xmm5,xmm7 ; xmm5=((B-Y)+YO)=BO=B(13579BDF)
- packuswb xmm4,xmm4 ; xmm4=B(02468ACE********)
- packuswb xmm5,xmm5 ; xmm5=B(13579BDF********)
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
- ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
- ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
- ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
- ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
-
- punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
- punpcklbw xmmE,xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
- punpcklbw xmmD,xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
-
- movdqa xmmG,xmmA
- movdqa xmmH,xmmA
- punpcklwd xmmA,xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
- punpckhwd xmmG,xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
-
- psrldq xmmH,2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
- psrldq xmmE,2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
-
- movdqa xmmC,xmmD
- movdqa xmmB,xmmD
- punpcklwd xmmD,xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
- punpckhwd xmmC,xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
-
- psrldq xmmB,2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
-
- movdqa xmmF,xmmE
- punpcklwd xmmE,xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
- punpckhwd xmmF,xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
-
- pshufd xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
- movdqa xmmB,xmmE
- punpckldq xmmA,xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
- punpckldq xmmE,xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
- punpckhdq xmmD,xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
-
- pshufd xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
- movdqa xmmB,xmmF
- punpckldq xmmG,xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
- punpckldq xmmF,xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
- punpckhdq xmmC,xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
-
- punpcklqdq xmmA,xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
- punpcklqdq xmmD,xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
- punpcklqdq xmmF,xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
-
- cmp rcx, byte SIZEOF_XMMWORD
- jb short .column_st32
-
- test rdi, SIZEOF_XMMWORD-1
- jnz short .out1
- ; --(aligned)-------------------
- movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
- movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
- movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
- jmp short .out0
-.out1: ; --(unaligned)-----------------
- movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
- movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
- movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
-.out0:
- add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
- sub rcx, byte SIZEOF_XMMWORD
- jz near .endcolumn
-
- add rsi, byte SIZEOF_XMMWORD ; inptr0
- dec al ; Yctr
- jnz near .Yloop_2nd
-
- add rbx, byte SIZEOF_XMMWORD ; inptr1
- add rdx, byte SIZEOF_XMMWORD ; inptr2
- jmp near .columnloop
-
-.column_st32:
- lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE
- cmp rcx, byte 2*SIZEOF_XMMWORD
- jb short .column_st16
- movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
- movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
- add rdi, byte 2*SIZEOF_XMMWORD ; outptr
- movdqa xmmA,xmmF
- sub rcx, byte 2*SIZEOF_XMMWORD
- jmp short .column_st15
-.column_st16:
- cmp rcx, byte SIZEOF_XMMWORD
- jb short .column_st15
- movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
- add rdi, byte SIZEOF_XMMWORD ; outptr
- movdqa xmmA,xmmD
- sub rcx, byte SIZEOF_XMMWORD
-.column_st15:
- ; Store the lower 8 bytes of xmmA to the output when it has enough
- ; space.
- cmp rcx, byte SIZEOF_MMWORD
- jb short .column_st7
- movq XMM_MMWORD [rdi], xmmA
- add rdi, byte SIZEOF_MMWORD
- sub rcx, byte SIZEOF_MMWORD
- psrldq xmmA, SIZEOF_MMWORD
-.column_st7:
- ; Store the lower 4 bytes of xmmA to the output when it has enough
- ; space.
- cmp rcx, byte SIZEOF_DWORD
- jb short .column_st3
- movd XMM_DWORD [rdi], xmmA
- add rdi, byte SIZEOF_DWORD
- sub rcx, byte SIZEOF_DWORD
- psrldq xmmA, SIZEOF_DWORD
-.column_st3:
- ; Store the lower 2 bytes of rax to the output when it has enough
- ; space.
- movd eax, xmmA
- cmp rcx, byte SIZEOF_WORD
- jb short .column_st1
- mov WORD [rdi], ax
- add rdi, byte SIZEOF_WORD
- sub rcx, byte SIZEOF_WORD
- shr rax, 16
-.column_st1:
- ; Store the lower 1 byte of rax to the output when it has enough
- ; space.
- test rcx, rcx
- jz short .endcolumn
- mov BYTE [rdi], al
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-%ifdef RGBX_FILLER_0XFF
- pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********)
- pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********)
-%else
- pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********)
- pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********)
-%endif
- ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
- ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
- ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
- ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
-
- punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
- punpcklbw xmmE,xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
- punpcklbw xmmB,xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
- punpcklbw xmmF,xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
-
- movdqa xmmC,xmmA
- punpcklwd xmmA,xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
- punpckhwd xmmC,xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
- movdqa xmmG,xmmB
- punpcklwd xmmB,xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
- punpckhwd xmmG,xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
-
- movdqa xmmD,xmmA
- punpckldq xmmA,xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
- punpckhdq xmmD,xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
- movdqa xmmH,xmmC
- punpckldq xmmC,xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
- punpckhdq xmmH,xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
-
- cmp rcx, byte SIZEOF_XMMWORD
- jb short .column_st32
-
- test rdi, SIZEOF_XMMWORD-1
- jnz short .out1
- ; --(aligned)-------------------
- movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
- movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
- movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
- movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
- jmp short .out0
-.out1: ; --(unaligned)-----------------
- movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
- movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
- movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
- movdqu XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
-.out0:
- add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
- sub rcx, byte SIZEOF_XMMWORD
- jz near .endcolumn
-
- add rsi, byte SIZEOF_XMMWORD ; inptr0
- dec al ; Yctr
- jnz near .Yloop_2nd
-
- add rbx, byte SIZEOF_XMMWORD ; inptr1
- add rdx, byte SIZEOF_XMMWORD ; inptr2
- jmp near .columnloop
-
-.column_st32:
- cmp rcx, byte SIZEOF_XMMWORD/2
- jb short .column_st16
- movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
- movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
- add rdi, byte 2*SIZEOF_XMMWORD ; outptr
- movdqa xmmA,xmmC
- movdqa xmmD,xmmH
- sub rcx, byte SIZEOF_XMMWORD/2
-.column_st16:
- cmp rcx, byte SIZEOF_XMMWORD/4
- jb short .column_st15
- movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
- add rdi, byte SIZEOF_XMMWORD ; outptr
- movdqa xmmA,xmmD
- sub rcx, byte SIZEOF_XMMWORD/4
-.column_st15:
- ; Store two pixels (8 bytes) of xmmA to the output when it has enough
- ; space.
- cmp rcx, byte SIZEOF_XMMWORD/8
- jb short .column_st7
- movq XMM_MMWORD [rdi], xmmA
- add rdi, byte SIZEOF_XMMWORD/8*4
- sub rcx, byte SIZEOF_XMMWORD/8
- psrldq xmmA, SIZEOF_XMMWORD/8*4
-.column_st7:
- ; Store one pixel (4 bytes) of xmmA to the output when it has enough
- ; space.
- test rcx, rcx
- jz short .endcolumn
- movd XMM_DWORD [rdi], xmmA
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
-.endcolumn:
- sfence ; flush the write buffer
-
-.return:
- pop rbx
- uncollect_args
- mov rsp,rbp ; rsp <- aligned rbp
- pop rsp ; rsp <- original rbp
- pop rbp
- ret
-
-; --------------------------------------------------------------------------
-;
-; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
-;
-; GLOBAL(void)
-; jsimd_h2v2_merged_upsample_sse2 (JDIMENSION output_width,
-; JSAMPIMAGE input_buf,
-; JDIMENSION in_row_group_ctr,
-; JSAMPARRAY output_buf);
-;
-
-; r10 = JDIMENSION output_width
-; r11 = JSAMPIMAGE input_buf
-; r12 = JDIMENSION in_row_group_ctr
-; r13 = JSAMPARRAY output_buf
-
- align 16
- global EXTN(jsimd_h2v2_merged_upsample_sse2)
-
-EXTN(jsimd_h2v2_merged_upsample_sse2):
- push rbp
- mov rax,rsp
- mov rbp,rsp
- collect_args
- push rbx
-
- mov eax, r10d
-
- mov rdi, r11
- mov ecx, r12d
- mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
- mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
- mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
- mov rdi, r13
- lea rsi, [rsi+rcx*SIZEOF_JSAMPROW]
-
- push rdx ; inptr2
- push rbx ; inptr1
- push rsi ; inptr00
- mov rbx,rsp
-
- push rdi
- push rcx
- push rax
-
- %ifdef WIN64
- mov r8, rcx
- mov r9, rdi
- mov rcx, rax
- mov rdx, rbx
- %else
- mov rdx, rcx
- mov rcx, rdi
- mov rdi, rax
- mov rsi, rbx
- %endif
-
- call EXTN(jsimd_h2v1_merged_upsample_sse2)
-
- pop rax
- pop rcx
- pop rdi
- pop rsi
- pop rbx
- pop rdx
-
- add rdi, byte SIZEOF_JSAMPROW ; outptr1
- add rsi, byte SIZEOF_JSAMPROW ; inptr01
-
- push rdx ; inptr2
- push rbx ; inptr1
- push rsi ; inptr00
- mov rbx,rsp
-
- push rdi
- push rcx
- push rax
-
- %ifdef WIN64
- mov r8, rcx
- mov r9, rdi
- mov rcx, rax
- mov rdx, rbx
- %else
- mov rdx, rcx
- mov rcx, rdi
- mov rdi, rax
- mov rsi, rbx
- %endif
-
- call EXTN(jsimd_h2v1_merged_upsample_sse2)
-
- pop rax
- pop rcx
- pop rdi
- pop rsi
- pop rbx
- pop rdx
-
- pop rbx
- uncollect_args
- pop rbp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
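
The store paths above pick movntdq (non-temporal) when the destination is 16-byte aligned and movdqu otherwise, then drain any partial vector through the .column_st* cascade in 8/4/2/1-byte steps and flush the write-combining buffer with sfence. A C sketch of that policy using SSE2 intrinsics (memcpy stands in for the movq/movd/mov register shuffles of the tail cascade):

/* Full vectors: streaming store if aligned, unaligned store otherwise.
 * Sub-vector tail: progressively smaller copies, as in .column_st15 etc. */
#include <emmintrin.h>
#include <stdint.h>
#include <string.h>

void store_bytes(unsigned char *dst, const unsigned char *src, size_t n) {
  while (n >= 16) {
    __m128i v = _mm_loadu_si128((const __m128i *)src);
    if (((uintptr_t)dst & 15) == 0)
      _mm_stream_si128((__m128i *)dst, v);   /* movntdq: bypass the cache */
    else
      _mm_storeu_si128((__m128i *)dst, v);   /* movdqu: unaligned store */
    dst += 16; src += 16; n -= 16;
  }
  /* Tail cascade: 8, 4, 2, then 1 byte, mirroring .column_st7/st3/st1. */
  for (size_t step = 8; step; step >>= 1)
    if (n >= step) {
      memcpy(dst, src, step);
      dst += step; src += step; n -= step;
    }
  _mm_sfence();   /* order the non-temporal stores, as the asm does */
}
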
diff --git a/media/libjpeg/simd/jdmrgext-sse2.asm b/media/libjpeg/simd/jdmrgext-sse2.asm
deleted file mode 100644
index b50f698b49..0000000000
--- a/media/libjpeg/simd/jdmrgext-sse2.asm
+++ /dev/null
@@ -1,518 +0,0 @@
-;
-; jdmrgext.asm - merged upsampling/color conversion (SSE2)
-;
-; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2012, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; and can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jcolsamp.inc"
-
-; --------------------------------------------------------------------------
-;
-; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
-;
-; GLOBAL(void)
-; jsimd_h2v1_merged_upsample_sse2 (JDIMENSION output_width,
-; JSAMPIMAGE input_buf,
-; JDIMENSION in_row_group_ctr,
-; JSAMPARRAY output_buf);
-;
-
-%define output_width(b) (b)+8 ; JDIMENSION output_width
-%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf
-%define in_row_group_ctr(b) (b)+16 ; JDIMENSION in_row_group_ctr
-%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf
-
-%define original_ebp ebp+0
-%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 3
-%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr
-
- align 16
- global EXTN(jsimd_h2v1_merged_upsample_sse2)
-
-EXTN(jsimd_h2v1_merged_upsample_sse2):
- push ebp
- mov eax,esp ; eax = original ebp
- sub esp, byte 4
- and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [esp],eax
- mov ebp,esp ; ebp = aligned ebp
- lea esp, [wk(0)]
- pushpic eax ; make room for GOT address
- push ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- get_GOT ebx ; get GOT address
- movpic POINTER [gotptr], ebx ; save GOT address
-
- mov ecx, JDIMENSION [output_width(eax)] ; col
- test ecx,ecx
- jz near .return
-
- push ecx
-
- mov edi, JSAMPIMAGE [input_buf(eax)]
- mov ecx, JDIMENSION [in_row_group_ctr(eax)]
- mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
- mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
- mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
- mov edi, JSAMPARRAY [output_buf(eax)]
- mov esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW] ; inptr0
- mov ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW] ; inptr1
- mov edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW] ; inptr2
- mov edi, JSAMPROW [edi] ; outptr
-
- pop ecx ; col
-
- alignx 16,7
-.columnloop:
- movpic eax, POINTER [gotptr] ; load GOT address (eax)
-
- movdqa xmm6, XMMWORD [ebx] ; xmm6=Cb(0123456789ABCDEF)
- movdqa xmm7, XMMWORD [edx] ; xmm7=Cr(0123456789ABCDEF)
-
- pxor xmm1,xmm1 ; xmm1=(all 0's)
- pcmpeqw xmm3,xmm3
- psllw xmm3,7 ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
-
- movdqa xmm4,xmm6
- punpckhbw xmm6,xmm1 ; xmm6=Cb(89ABCDEF)=CbH
- punpcklbw xmm4,xmm1 ; xmm4=Cb(01234567)=CbL
- movdqa xmm0,xmm7
- punpckhbw xmm7,xmm1 ; xmm7=Cr(89ABCDEF)=CrH
- punpcklbw xmm0,xmm1 ; xmm0=Cr(01234567)=CrL
-
- paddw xmm6,xmm3
- paddw xmm4,xmm3
- paddw xmm7,xmm3
- paddw xmm0,xmm3
-
- ; (Original)
- ; R = Y + 1.40200 * Cr
- ; G = Y - 0.34414 * Cb - 0.71414 * Cr
- ; B = Y + 1.77200 * Cb
- ;
- ; (This implementation)
- ; R = Y + 0.40200 * Cr + Cr
- ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
- ; B = Y - 0.22800 * Cb + Cb + Cb
-
- movdqa xmm5,xmm6 ; xmm5=CbH
- movdqa xmm2,xmm4 ; xmm2=CbL
- paddw xmm6,xmm6 ; xmm6=2*CbH
- paddw xmm4,xmm4 ; xmm4=2*CbL
- movdqa xmm1,xmm7 ; xmm1=CrH
- movdqa xmm3,xmm0 ; xmm3=CrL
- paddw xmm7,xmm7 ; xmm7=2*CrH
- paddw xmm0,xmm0 ; xmm0=2*CrL
-
- pmulhw xmm6,[GOTOFF(eax,PW_MF0228)] ; xmm6=(2*CbH * -FIX(0.22800))
- pmulhw xmm4,[GOTOFF(eax,PW_MF0228)] ; xmm4=(2*CbL * -FIX(0.22800))
- pmulhw xmm7,[GOTOFF(eax,PW_F0402)] ; xmm7=(2*CrH * FIX(0.40200))
- pmulhw xmm0,[GOTOFF(eax,PW_F0402)] ; xmm0=(2*CrL * FIX(0.40200))
-
- paddw xmm6,[GOTOFF(eax,PW_ONE)]
- paddw xmm4,[GOTOFF(eax,PW_ONE)]
- psraw xmm6,1 ; xmm6=(CbH * -FIX(0.22800))
- psraw xmm4,1 ; xmm4=(CbL * -FIX(0.22800))
- paddw xmm7,[GOTOFF(eax,PW_ONE)]
- paddw xmm0,[GOTOFF(eax,PW_ONE)]
- psraw xmm7,1 ; xmm7=(CrH * FIX(0.40200))
- psraw xmm0,1 ; xmm0=(CrL * FIX(0.40200))
-
- paddw xmm6,xmm5
- paddw xmm4,xmm2
- paddw xmm6,xmm5 ; xmm6=(CbH * FIX(1.77200))=(B-Y)H
- paddw xmm4,xmm2 ; xmm4=(CbL * FIX(1.77200))=(B-Y)L
- paddw xmm7,xmm1 ; xmm7=(CrH * FIX(1.40200))=(R-Y)H
- paddw xmm0,xmm3 ; xmm0=(CrL * FIX(1.40200))=(R-Y)L
-
- movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=(B-Y)H
- movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(R-Y)H
-
- movdqa xmm6,xmm5
- movdqa xmm7,xmm2
- punpcklwd xmm5,xmm1
- punpckhwd xmm6,xmm1
- pmaddwd xmm5,[GOTOFF(eax,PW_MF0344_F0285)]
- pmaddwd xmm6,[GOTOFF(eax,PW_MF0344_F0285)]
- punpcklwd xmm2,xmm3
- punpckhwd xmm7,xmm3
- pmaddwd xmm2,[GOTOFF(eax,PW_MF0344_F0285)]
- pmaddwd xmm7,[GOTOFF(eax,PW_MF0344_F0285)]
-
- paddd xmm5,[GOTOFF(eax,PD_ONEHALF)]
- paddd xmm6,[GOTOFF(eax,PD_ONEHALF)]
- psrad xmm5,SCALEBITS
- psrad xmm6,SCALEBITS
- paddd xmm2,[GOTOFF(eax,PD_ONEHALF)]
- paddd xmm7,[GOTOFF(eax,PD_ONEHALF)]
- psrad xmm2,SCALEBITS
- psrad xmm7,SCALEBITS
-
- packssdw xmm5,xmm6 ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
- packssdw xmm2,xmm7 ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
- psubw xmm5,xmm1 ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
- psubw xmm2,xmm3 ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
-
- movdqa XMMWORD [wk(2)], xmm5 ; wk(2)=(G-Y)H
-
- mov al,2 ; Yctr
- jmp short .Yloop_1st
- alignx 16,7
-
-.Yloop_2nd:
- movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(R-Y)H
- movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(G-Y)H
- movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(B-Y)H
- alignx 16,7
-
-.Yloop_1st:
- movdqa xmm7, XMMWORD [esi] ; xmm7=Y(0123456789ABCDEF)
-
- pcmpeqw xmm6,xmm6
- psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
- pand xmm6,xmm7 ; xmm6=Y(02468ACE)=YE
- psrlw xmm7,BYTE_BIT ; xmm7=Y(13579BDF)=YO
-
- movdqa xmm1,xmm0 ; xmm1=xmm0=(R-Y)(L/H)
- movdqa xmm3,xmm2 ; xmm3=xmm2=(G-Y)(L/H)
- movdqa xmm5,xmm4 ; xmm5=xmm4=(B-Y)(L/H)
-
- paddw xmm0,xmm6 ; xmm0=((R-Y)+YE)=RE=R(02468ACE)
- paddw xmm1,xmm7 ; xmm1=((R-Y)+YO)=RO=R(13579BDF)
- packuswb xmm0,xmm0 ; xmm0=R(02468ACE********)
- packuswb xmm1,xmm1 ; xmm1=R(13579BDF********)
-
- paddw xmm2,xmm6 ; xmm2=((G-Y)+YE)=GE=G(02468ACE)
- paddw xmm3,xmm7 ; xmm3=((G-Y)+YO)=GO=G(13579BDF)
- packuswb xmm2,xmm2 ; xmm2=G(02468ACE********)
- packuswb xmm3,xmm3 ; xmm3=G(13579BDF********)
-
- paddw xmm4,xmm6 ; xmm4=((B-Y)+YE)=BE=B(02468ACE)
- paddw xmm5,xmm7 ; xmm5=((B-Y)+YO)=BO=B(13579BDF)
- packuswb xmm4,xmm4 ; xmm4=B(02468ACE********)
- packuswb xmm5,xmm5 ; xmm5=B(13579BDF********)
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
- ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
- ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
- ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
- ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
-
- punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
- punpcklbw xmmE,xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
- punpcklbw xmmD,xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
-
- movdqa xmmG,xmmA
- movdqa xmmH,xmmA
- punpcklwd xmmA,xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
- punpckhwd xmmG,xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
-
- psrldq xmmH,2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
- psrldq xmmE,2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
-
- movdqa xmmC,xmmD
- movdqa xmmB,xmmD
- punpcklwd xmmD,xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
- punpckhwd xmmC,xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
-
- psrldq xmmB,2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
-
- movdqa xmmF,xmmE
- punpcklwd xmmE,xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
- punpckhwd xmmF,xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
-
- pshufd xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
- movdqa xmmB,xmmE
- punpckldq xmmA,xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
- punpckldq xmmE,xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
- punpckhdq xmmD,xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
-
- pshufd xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
- movdqa xmmB,xmmF
- punpckldq xmmG,xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
- punpckldq xmmF,xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
- punpckhdq xmmC,xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
-
- punpcklqdq xmmA,xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
- punpcklqdq xmmD,xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
- punpcklqdq xmmF,xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
-
- cmp ecx, byte SIZEOF_XMMWORD
- jb short .column_st32
-
- test edi, SIZEOF_XMMWORD-1
- jnz short .out1
- ; --(aligned)-------------------
- movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
- movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
- movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
- jmp short .out0
-.out1: ; --(unaligned)-----------------
- movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
- movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
- movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
-.out0:
- add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
- sub ecx, byte SIZEOF_XMMWORD
- jz near .endcolumn
-
- add esi, byte SIZEOF_XMMWORD ; inptr0
- dec al ; Yctr
- jnz near .Yloop_2nd
-
- add ebx, byte SIZEOF_XMMWORD ; inptr1
- add edx, byte SIZEOF_XMMWORD ; inptr2
- jmp near .columnloop
- alignx 16,7
-
-.column_st32:
- lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
- cmp ecx, byte 2*SIZEOF_XMMWORD
- jb short .column_st16
- movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
- movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
- add edi, byte 2*SIZEOF_XMMWORD ; outptr
- movdqa xmmA,xmmF
- sub ecx, byte 2*SIZEOF_XMMWORD
- jmp short .column_st15
-.column_st16:
- cmp ecx, byte SIZEOF_XMMWORD
- jb short .column_st15
- movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
- add edi, byte SIZEOF_XMMWORD ; outptr
- movdqa xmmA,xmmD
- sub ecx, byte SIZEOF_XMMWORD
-.column_st15:
- ; Store the lower 8 bytes of xmmA to the output when it has enough
- ; space.
- cmp ecx, byte SIZEOF_MMWORD
- jb short .column_st7
- movq XMM_MMWORD [edi], xmmA
- add edi, byte SIZEOF_MMWORD
- sub ecx, byte SIZEOF_MMWORD
- psrldq xmmA, SIZEOF_MMWORD
-.column_st7:
- ; Store the lower 4 bytes of xmmA to the output when it has enough
- ; space.
- cmp ecx, byte SIZEOF_DWORD
- jb short .column_st3
- movd XMM_DWORD [edi], xmmA
- add edi, byte SIZEOF_DWORD
- sub ecx, byte SIZEOF_DWORD
- psrldq xmmA, SIZEOF_DWORD
-.column_st3:
-        ; Store the lower 2 bytes of eax to the output if there is
-        ; enough space.
- movd eax, xmmA
- cmp ecx, byte SIZEOF_WORD
- jb short .column_st1
- mov WORD [edi], ax
- add edi, byte SIZEOF_WORD
- sub ecx, byte SIZEOF_WORD
- shr eax, 16
-.column_st1:
-        ; Store the low byte of eax to the output if there is enough
-        ; space.
- test ecx, ecx
- jz short .endcolumn
- mov BYTE [edi], al
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-%ifdef RGBX_FILLER_0XFF
- pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********)
- pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********)
-%else
- pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********)
- pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********)
-%endif
- ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
- ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
- ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
- ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
-
- punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
- punpcklbw xmmE,xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
- punpcklbw xmmB,xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
- punpcklbw xmmF,xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
-
- movdqa xmmC,xmmA
- punpcklwd xmmA,xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
- punpckhwd xmmC,xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
- movdqa xmmG,xmmB
- punpcklwd xmmB,xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
- punpckhwd xmmG,xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
-
- movdqa xmmD,xmmA
- punpckldq xmmA,xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
- punpckhdq xmmD,xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
- movdqa xmmH,xmmC
- punpckldq xmmC,xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
- punpckhdq xmmH,xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
-
- cmp ecx, byte SIZEOF_XMMWORD
- jb short .column_st32
-
- test edi, SIZEOF_XMMWORD-1
- jnz short .out1
- ; --(aligned)-------------------
- movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
- movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
- movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
- movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
- jmp short .out0
-.out1: ; --(unaligned)-----------------
- movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
- movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
- movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
- movdqu XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
-.out0:
- add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
- sub ecx, byte SIZEOF_XMMWORD
- jz near .endcolumn
-
- add esi, byte SIZEOF_XMMWORD ; inptr0
- dec al ; Yctr
- jnz near .Yloop_2nd
-
- add ebx, byte SIZEOF_XMMWORD ; inptr1
- add edx, byte SIZEOF_XMMWORD ; inptr2
- jmp near .columnloop
- alignx 16,7
-
-.column_st32:
- cmp ecx, byte SIZEOF_XMMWORD/2
- jb short .column_st16
- movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
- movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
- add edi, byte 2*SIZEOF_XMMWORD ; outptr
- movdqa xmmA,xmmC
- movdqa xmmD,xmmH
- sub ecx, byte SIZEOF_XMMWORD/2
-.column_st16:
- cmp ecx, byte SIZEOF_XMMWORD/4
- jb short .column_st15
- movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
- add edi, byte SIZEOF_XMMWORD ; outptr
- movdqa xmmA,xmmD
- sub ecx, byte SIZEOF_XMMWORD/4
-.column_st15:
-        ; Store two pixels (8 bytes) of xmmA to the output if there is
-        ; enough space.
- cmp ecx, byte SIZEOF_XMMWORD/8
- jb short .column_st7
- movq XMM_MMWORD [edi], xmmA
- add edi, byte SIZEOF_XMMWORD/8*4
- sub ecx, byte SIZEOF_XMMWORD/8
- psrldq xmmA, SIZEOF_XMMWORD/8*4
-.column_st7:
-        ; Store one pixel (4 bytes) of xmmA to the output if there is
-        ; enough space.
- test ecx, ecx
- jz short .endcolumn
- movd XMM_DWORD [edi], xmmA
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
-.endcolumn:
- sfence ; flush the write buffer
-
-.return:
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- pop ebx
- mov esp,ebp ; esp <- aligned ebp
- pop esp ; esp <- original ebp
- pop ebp
- ret
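
A note on the store path above: on the aligned branch the rows are written with
movntdq, a weakly ordered non-temporal store that bypasses the cache, so the
sfence at .endcolumn is needed to make those stores globally visible before the
routine returns. The same pairing expressed with C intrinsics (an illustration
of the instruction pair, not code from this file):

    #include <emmintrin.h>

    /* Illustration of the movntdq + sfence pairing used above. */
    static void stream_row(__m128i *dst, __m128i pixels)
    {
      _mm_stream_si128(dst, pixels);  /* non-temporal store (movntdq) */
      _mm_sfence();                   /* make the NT store globally visible */
    }
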
-
-; --------------------------------------------------------------------------
-;
-; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
-;
-; GLOBAL(void)
-; jsimd_h2v2_merged_upsample_sse2 (JDIMENSION output_width,
-; JSAMPIMAGE input_buf,
-; JDIMENSION in_row_group_ctr,
-; JSAMPARRAY output_buf);
-;
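
The wrapper below handles the 2:1-vertical case by calling the h2v1 routine
above twice, once per output row. It builds a three-entry input_buf on the
stack whose luma entry is pre-offset by the row-group counter (so the kernel's
index lands on luma row 2*ctr), then patches the luma entry and the output_buf
argument in place between the two calls. A rough C equivalent, given as a
sketch that assumes the h2v1 kernel indexes each plane's row array by
in_row_group_ctr; this is not the library's actual C code:

    #include "jpeglib.h"  /* JDIMENSION, JSAMPIMAGE, JSAMPARRAY */

    extern void jsimd_h2v1_merged_upsample_sse2(JDIMENSION output_width,
                                                JSAMPIMAGE input_buf,
                                                JDIMENSION in_row_group_ctr,
                                                JSAMPARRAY output_buf);

    static void h2v2_merged_upsample_sketch(JDIMENSION output_width,
                                            JSAMPIMAGE input_buf,
                                            JDIMENSION in_row_group_ctr,
                                            JSAMPARRAY output_buf)
    {
      /* Offsetting the luma row array by the group counter makes the
         kernel's index in_row_group_ctr land on luma row 2*ctr. */
      JSAMPARRAY fake_input_buf[3];
      fake_input_buf[0] = input_buf[0] + in_row_group_ctr;
      fake_input_buf[1] = input_buf[1];  /* one Cb row per row group */
      fake_input_buf[2] = input_buf[2];  /* one Cr row per row group */

      jsimd_h2v1_merged_upsample_sse2(output_width, fake_input_buf,
                                      in_row_group_ctr, output_buf);

      fake_input_buf[0]++;  /* advance to luma row 2*ctr+1 */
      output_buf++;         /* advance to the second output row */
      jsimd_h2v1_merged_upsample_sse2(output_width, fake_input_buf,
                                      in_row_group_ctr, output_buf);
    }
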
-
-%define output_width(b) (b)+8 ; JDIMENSION output_width
-%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf
-%define in_row_group_ctr(b) (b)+16 ; JDIMENSION in_row_group_ctr
-%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf
-
- align 16
- global EXTN(jsimd_h2v2_merged_upsample_sse2)
-
-EXTN(jsimd_h2v2_merged_upsample_sse2):
- push ebp
- mov ebp,esp
- push ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- mov eax, POINTER [output_width(ebp)]
-
- mov edi, JSAMPIMAGE [input_buf(ebp)]
- mov ecx, JDIMENSION [in_row_group_ctr(ebp)]
- mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
- mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
- mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
- mov edi, JSAMPARRAY [output_buf(ebp)]
- lea esi, [esi+ecx*SIZEOF_JSAMPROW]
-
- push edx ; inptr2
- push ebx ; inptr1
- push esi ; inptr00
- mov ebx,esp
-
- push edi ; output_buf (outptr0)
- push ecx ; in_row_group_ctr
- push ebx ; input_buf
- push eax ; output_width
-
- call near EXTN(jsimd_h2v1_merged_upsample_sse2)
-
- add esi, byte SIZEOF_JSAMPROW ; inptr01
- add edi, byte SIZEOF_JSAMPROW ; outptr1
- mov POINTER [ebx+0*SIZEOF_POINTER], esi
- mov POINTER [ebx-1*SIZEOF_POINTER], edi
-
- call near EXTN(jsimd_h2v1_merged_upsample_sse2)
-
- add esp, byte 7*SIZEOF_DWORD
-
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- pop ebx
- pop ebp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
diff --git a/media/libjpeg/simd/jdsample-mmx.asm b/media/libjpeg/simd/jdsample-mmx.asm
deleted file mode 100644
index 5e4fa7ae22..0000000000
--- a/media/libjpeg/simd/jdsample-mmx.asm
+++ /dev/null
@@ -1,736 +0,0 @@
-;
-; jdsample.asm - upsampling (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler); it can
-; *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
- alignz 16
- global EXTN(jconst_fancy_upsample_mmx)
-
-EXTN(jconst_fancy_upsample_mmx):
-
-PW_ONE times 4 dw 1
-PW_TWO times 4 dw 2
-PW_THREE times 4 dw 3
-PW_SEVEN times 4 dw 7
-PW_EIGHT times 4 dw 8
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
-;
-; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
-;
-; The upsampling algorithm is linear interpolation between pixel centers,
-; also known as a "triangle filter". This is a good compromise between
-; speed and visual quality. The centers of the output pixels are 1/4 and 3/4
-; of the way between input pixel centers.
-;
-; GLOBAL(void)
-; jsimd_h2v1_fancy_upsample_mmx (int max_v_samp_factor,
-; JDIMENSION downsampled_width,
-; JSAMPARRAY input_data,
-; JSAMPARRAY *output_data_ptr);
-;
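
In scalar terms, the arithmetic below (the PW_THREE multiply plus the
PW_ONE/PW_TWO rounding biases and a shift by 2) produces, for each input
sample, one output weighted 3/4 toward its left neighbor pair and one weighted
3/4 toward its right. A minimal C model, assuming 8-bit samples and replicated
edge samples, which is exactly what the mm6/mm7 edge masking below arranges:

    #include <stddef.h>
    #include <stdint.h>

    /* Sketch of one row of h2v1 fancy (triangle-filter) upsampling. */
    static void h2v1_fancy_upsample_row(const uint8_t *s, uint8_t *out,
                                        size_t width)
    {
      for (size_t i = 0; i < width; i++) {
        uint8_t left  = (i == 0)         ? s[0]         : s[i - 1];
        uint8_t right = (i == width - 1) ? s[width - 1] : s[i + 1];
        out[2 * i]     = (uint8_t)((3 * s[i] + left  + 1) >> 2);
        out[2 * i + 1] = (uint8_t)((3 * s[i] + right + 2) >> 2);
      }
    }

With the edges replicated this way, the first and last outputs reduce to exact
copies of the edge sample ((4*s + 1) >> 2 and (4*s + 2) >> 2 both equal s),
matching the scalar reference behavior.
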
-
-%define max_v_samp(b) (b)+8 ; int max_v_samp_factor
-%define downsamp_width(b) (b)+12 ; JDIMENSION downsampled_width
-%define input_data(b) (b)+16 ; JSAMPARRAY input_data
-%define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr
-
- align 16
- global EXTN(jsimd_h2v1_fancy_upsample_mmx)
-
-EXTN(jsimd_h2v1_fancy_upsample_mmx):
- push ebp
- mov ebp,esp
- pushpic ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- get_GOT ebx ; get GOT address
-
- mov eax, JDIMENSION [downsamp_width(ebp)] ; colctr
- test eax,eax
- jz near .return
-
- mov ecx, INT [max_v_samp(ebp)] ; rowctr
- test ecx,ecx
- jz near .return
-
- mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
- mov edi, POINTER [output_data_ptr(ebp)]
- mov edi, JSAMPARRAY [edi] ; output_data
- alignx 16,7
-.rowloop:
- push eax ; colctr
- push edi
- push esi
-
- mov esi, JSAMPROW [esi] ; inptr
- mov edi, JSAMPROW [edi] ; outptr
-
- test eax, SIZEOF_MMWORD-1
- jz short .skip
- mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
- mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
-.skip:
- pxor mm0,mm0 ; mm0=(all 0's)
- pcmpeqb mm7,mm7
- psrlq mm7,(SIZEOF_MMWORD-1)*BYTE_BIT
- pand mm7, MMWORD [esi+0*SIZEOF_MMWORD]
-
- add eax, byte SIZEOF_MMWORD-1
- and eax, byte -SIZEOF_MMWORD
- cmp eax, byte SIZEOF_MMWORD
- ja short .columnloop
- alignx 16,7
-
-.columnloop_last:
- pcmpeqb mm6,mm6
- psllq mm6,(SIZEOF_MMWORD-1)*BYTE_BIT
- pand mm6, MMWORD [esi+0*SIZEOF_MMWORD]
- jmp short .upsample
- alignx 16,7
-
-.columnloop:
- movq mm6, MMWORD [esi+1*SIZEOF_MMWORD]
- psllq mm6,(SIZEOF_MMWORD-1)*BYTE_BIT
-
-.upsample:
- movq mm1, MMWORD [esi+0*SIZEOF_MMWORD]
- movq mm2,mm1
- movq mm3,mm1 ; mm1=( 0 1 2 3 4 5 6 7)
- psllq mm2,BYTE_BIT ; mm2=( - 0 1 2 3 4 5 6)
- psrlq mm3,BYTE_BIT ; mm3=( 1 2 3 4 5 6 7 -)
-
- por mm2,mm7 ; mm2=(-1 0 1 2 3 4 5 6)
- por mm3,mm6 ; mm3=( 1 2 3 4 5 6 7 8)
-
- movq mm7,mm1
- psrlq mm7,(SIZEOF_MMWORD-1)*BYTE_BIT ; mm7=( 7 - - - - - - -)
-
- movq mm4,mm1
- punpcklbw mm1,mm0 ; mm1=( 0 1 2 3)
- punpckhbw mm4,mm0 ; mm4=( 4 5 6 7)
- movq mm5,mm2
- punpcklbw mm2,mm0 ; mm2=(-1 0 1 2)
- punpckhbw mm5,mm0 ; mm5=( 3 4 5 6)
- movq mm6,mm3
- punpcklbw mm3,mm0 ; mm3=( 1 2 3 4)
- punpckhbw mm6,mm0 ; mm6=( 5 6 7 8)
-
- pmullw mm1,[GOTOFF(ebx,PW_THREE)]
- pmullw mm4,[GOTOFF(ebx,PW_THREE)]
- paddw mm2,[GOTOFF(ebx,PW_ONE)]
- paddw mm5,[GOTOFF(ebx,PW_ONE)]
- paddw mm3,[GOTOFF(ebx,PW_TWO)]
- paddw mm6,[GOTOFF(ebx,PW_TWO)]
-
- paddw mm2,mm1
- paddw mm5,mm4
- psrlw mm2,2 ; mm2=OutLE=( 0 2 4 6)
- psrlw mm5,2 ; mm5=OutHE=( 8 10 12 14)
- paddw mm3,mm1
- paddw mm6,mm4
- psrlw mm3,2 ; mm3=OutLO=( 1 3 5 7)
- psrlw mm6,2 ; mm6=OutHO=( 9 11 13 15)
-
- psllw mm3,BYTE_BIT
- psllw mm6,BYTE_BIT
- por mm2,mm3 ; mm2=OutL=( 0 1 2 3 4 5 6 7)
- por mm5,mm6 ; mm5=OutH=( 8 9 10 11 12 13 14 15)
-
- movq MMWORD [edi+0*SIZEOF_MMWORD], mm2
- movq MMWORD [edi+1*SIZEOF_MMWORD], mm5
-
- sub eax, byte SIZEOF_MMWORD
- add esi, byte 1*SIZEOF_MMWORD ; inptr
- add edi, byte 2*SIZEOF_MMWORD ; outptr
- cmp eax, byte SIZEOF_MMWORD
- ja near .columnloop
- test eax,eax
- jnz near .columnloop_last
-
- pop esi
- pop edi
- pop eax
-
- add esi, byte SIZEOF_JSAMPROW ; input_data
- add edi, byte SIZEOF_JSAMPROW ; output_data
- dec ecx ; rowctr
- jg near .rowloop
-
- emms ; empty MMX state
-
-.return:
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- poppic ebx
- pop ebp
- ret
-
-; --------------------------------------------------------------------------
-;
-; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
-; Again a triangle filter; see comments for h2v1 case, above.
-;
-; GLOBAL(void)
-; jsimd_h2v2_fancy_upsample_mmx (int max_v_samp_factor,
-; JDIMENSION downsampled_width,
-; JSAMPARRAY input_data,
-; JSAMPARRAY *output_data_ptr);
-;
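
The 2:2 case applies the same triangle filter in two passes: a vertical pass
that mixes the nearer and farther input rows 3:1, then the horizontal pass
applied to the 10-bit intermediate sums, for a total weight of 16 (hence the
PW_SEVEN/PW_EIGHT biases and the shift by 4). A scalar sketch for one output
row; rowadj is the row above when producing the upper output row and the row
below for the lower one, with edge columns replicated as in the code below:

    #include <stddef.h>
    #include <stdint.h>

    /* Sketch of one output row of h2v2 fancy upsampling. */
    static void h2v2_fancy_upsample_row(const uint8_t *row0,
                                        const uint8_t *rowadj,
                                        uint8_t *out, size_t width)
    {
      for (size_t i = 0; i < width; i++) {
        int cur   = 3 * row0[i] + rowadj[i];   /* vertical pass, 0..1020 */
        int left  = (i == 0) ? cur
                             : 3 * row0[i - 1] + rowadj[i - 1];
        int right = (i == width - 1) ? cur
                                     : 3 * row0[i + 1] + rowadj[i + 1];
        out[2 * i]     = (uint8_t)((3 * cur + left  + 8) >> 4);
        out[2 * i + 1] = (uint8_t)((3 * cur + right + 7) >> 4);
      }
    }
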
-
-%define max_v_samp(b) (b)+8 ; int max_v_samp_factor
-%define downsamp_width(b) (b)+12 ; JDIMENSION downsampled_width
-%define input_data(b) (b)+16 ; JSAMPARRAY input_data
-%define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr
-
-%define original_ebp ebp+0
-%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM]
-%define WK_NUM 4
-%define gotptr wk(0)-SIZEOF_POINTER ; void *gotptr
-
- align 16
- global EXTN(jsimd_h2v2_fancy_upsample_mmx)
-
-EXTN(jsimd_h2v2_fancy_upsample_mmx):
- push ebp
- mov eax,esp ; eax = original ebp
- sub esp, byte 4
- and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
- mov [esp],eax
- mov ebp,esp ; ebp = aligned ebp
- lea esp, [wk(0)]
-        pushpic eax             ; make room for the GOT address
- push ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- get_GOT ebx ; get GOT address
- movpic POINTER [gotptr], ebx ; save GOT address
-
- mov edx,eax ; edx = original ebp
- mov eax, JDIMENSION [downsamp_width(edx)] ; colctr
- test eax,eax
- jz near .return
-
- mov ecx, INT [max_v_samp(edx)] ; rowctr
- test ecx,ecx
- jz near .return
-
- mov esi, JSAMPARRAY [input_data(edx)] ; input_data
- mov edi, POINTER [output_data_ptr(edx)]
- mov edi, JSAMPARRAY [edi] ; output_data
- alignx 16,7
-.rowloop:
- push eax ; colctr
- push ecx
- push edi
- push esi
-
- mov ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW] ; inptr1(above)
- mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0
- mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1(below)
- mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0
- mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1
-
- test eax, SIZEOF_MMWORD-1
- jz short .skip
- push edx
- mov dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
- mov JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
- mov dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
- mov JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
- mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
- mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
- pop edx
-.skip:
- ; -- process the first column block
-
- movq mm0, MMWORD [ebx+0*SIZEOF_MMWORD] ; mm0=row[ 0][0]
- movq mm1, MMWORD [ecx+0*SIZEOF_MMWORD] ; mm1=row[-1][0]
- movq mm2, MMWORD [esi+0*SIZEOF_MMWORD] ; mm2=row[+1][0]
-
- pushpic ebx
- movpic ebx, POINTER [gotptr] ; load GOT address
-
- pxor mm3,mm3 ; mm3=(all 0's)
- movq mm4,mm0
- punpcklbw mm0,mm3 ; mm0=row[ 0][0]( 0 1 2 3)
- punpckhbw mm4,mm3 ; mm4=row[ 0][0]( 4 5 6 7)
- movq mm5,mm1
- punpcklbw mm1,mm3 ; mm1=row[-1][0]( 0 1 2 3)
- punpckhbw mm5,mm3 ; mm5=row[-1][0]( 4 5 6 7)
- movq mm6,mm2
- punpcklbw mm2,mm3 ; mm2=row[+1][0]( 0 1 2 3)
- punpckhbw mm6,mm3 ; mm6=row[+1][0]( 4 5 6 7)
-
- pmullw mm0,[GOTOFF(ebx,PW_THREE)]
- pmullw mm4,[GOTOFF(ebx,PW_THREE)]
-
- pcmpeqb mm7,mm7
- psrlq mm7,(SIZEOF_MMWORD-2)*BYTE_BIT
-
- paddw mm1,mm0 ; mm1=Int0L=( 0 1 2 3)
- paddw mm5,mm4 ; mm5=Int0H=( 4 5 6 7)
- paddw mm2,mm0 ; mm2=Int1L=( 0 1 2 3)
- paddw mm6,mm4 ; mm6=Int1H=( 4 5 6 7)
-
- movq MMWORD [edx+0*SIZEOF_MMWORD], mm1 ; temporarily save
- movq MMWORD [edx+1*SIZEOF_MMWORD], mm5 ; the intermediate data
- movq MMWORD [edi+0*SIZEOF_MMWORD], mm2
- movq MMWORD [edi+1*SIZEOF_MMWORD], mm6
-
- pand mm1,mm7 ; mm1=( 0 - - -)
- pand mm2,mm7 ; mm2=( 0 - - -)
-
- movq MMWORD [wk(0)], mm1
- movq MMWORD [wk(1)], mm2
-
- poppic ebx
-
- add eax, byte SIZEOF_MMWORD-1
- and eax, byte -SIZEOF_MMWORD
- cmp eax, byte SIZEOF_MMWORD
- ja short .columnloop
- alignx 16,7
-
-.columnloop_last:
- ; -- process the last column block
-
- pushpic ebx
- movpic ebx, POINTER [gotptr] ; load GOT address
-
- pcmpeqb mm1,mm1
- psllq mm1,(SIZEOF_MMWORD-2)*BYTE_BIT
- movq mm2,mm1
-
- pand mm1, MMWORD [edx+1*SIZEOF_MMWORD] ; mm1=( - - - 7)
- pand mm2, MMWORD [edi+1*SIZEOF_MMWORD] ; mm2=( - - - 7)
-
- movq MMWORD [wk(2)], mm1
- movq MMWORD [wk(3)], mm2
-
- jmp short .upsample
- alignx 16,7
-
-.columnloop:
- ; -- process the next column block
-
- movq mm0, MMWORD [ebx+1*SIZEOF_MMWORD] ; mm0=row[ 0][1]
- movq mm1, MMWORD [ecx+1*SIZEOF_MMWORD] ; mm1=row[-1][1]
- movq mm2, MMWORD [esi+1*SIZEOF_MMWORD] ; mm2=row[+1][1]
-
- pushpic ebx
- movpic ebx, POINTER [gotptr] ; load GOT address
-
- pxor mm3,mm3 ; mm3=(all 0's)
- movq mm4,mm0
- punpcklbw mm0,mm3 ; mm0=row[ 0][1]( 0 1 2 3)
- punpckhbw mm4,mm3 ; mm4=row[ 0][1]( 4 5 6 7)
- movq mm5,mm1
- punpcklbw mm1,mm3 ; mm1=row[-1][1]( 0 1 2 3)
- punpckhbw mm5,mm3 ; mm5=row[-1][1]( 4 5 6 7)
- movq mm6,mm2
- punpcklbw mm2,mm3 ; mm2=row[+1][1]( 0 1 2 3)
- punpckhbw mm6,mm3 ; mm6=row[+1][1]( 4 5 6 7)
-
- pmullw mm0,[GOTOFF(ebx,PW_THREE)]
- pmullw mm4,[GOTOFF(ebx,PW_THREE)]
-
- paddw mm1,mm0 ; mm1=Int0L=( 0 1 2 3)
- paddw mm5,mm4 ; mm5=Int0H=( 4 5 6 7)
- paddw mm2,mm0 ; mm2=Int1L=( 0 1 2 3)
- paddw mm6,mm4 ; mm6=Int1H=( 4 5 6 7)
-
- movq MMWORD [edx+2*SIZEOF_MMWORD], mm1 ; temporarily save
- movq MMWORD [edx+3*SIZEOF_MMWORD], mm5 ; the intermediate data
- movq MMWORD [edi+2*SIZEOF_MMWORD], mm2
- movq MMWORD [edi+3*SIZEOF_MMWORD], mm6
-
- psllq mm1,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm1=( - - - 0)
- psllq mm2,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm2=( - - - 0)
-
- movq MMWORD [wk(2)], mm1
- movq MMWORD [wk(3)], mm2
-
-.upsample:
- ; -- process the upper row
-
- movq mm7, MMWORD [edx+0*SIZEOF_MMWORD] ; mm7=Int0L=( 0 1 2 3)
- movq mm3, MMWORD [edx+1*SIZEOF_MMWORD] ; mm3=Int0H=( 4 5 6 7)
-
- movq mm0,mm7
- movq mm4,mm3
- psrlq mm0,2*BYTE_BIT ; mm0=( 1 2 3 -)
- psllq mm4,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm4=( - - - 4)
- movq mm5,mm7
- movq mm6,mm3
- psrlq mm5,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm5=( 3 - - -)
- psllq mm6,2*BYTE_BIT ; mm6=( - 4 5 6)
-
- por mm0,mm4 ; mm0=( 1 2 3 4)
- por mm5,mm6 ; mm5=( 3 4 5 6)
-
- movq mm1,mm7
- movq mm2,mm3
- psllq mm1,2*BYTE_BIT ; mm1=( - 0 1 2)
- psrlq mm2,2*BYTE_BIT ; mm2=( 5 6 7 -)
- movq mm4,mm3
- psrlq mm4,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm4=( 7 - - -)
-
- por mm1, MMWORD [wk(0)] ; mm1=(-1 0 1 2)
- por mm2, MMWORD [wk(2)] ; mm2=( 5 6 7 8)
-
- movq MMWORD [wk(0)], mm4
-
- pmullw mm7,[GOTOFF(ebx,PW_THREE)]
- pmullw mm3,[GOTOFF(ebx,PW_THREE)]
- paddw mm1,[GOTOFF(ebx,PW_EIGHT)]
- paddw mm5,[GOTOFF(ebx,PW_EIGHT)]
- paddw mm0,[GOTOFF(ebx,PW_SEVEN)]
- paddw mm2,[GOTOFF(ebx,PW_SEVEN)]
-
- paddw mm1,mm7
- paddw mm5,mm3
- psrlw mm1,4 ; mm1=Out0LE=( 0 2 4 6)
- psrlw mm5,4 ; mm5=Out0HE=( 8 10 12 14)
- paddw mm0,mm7
- paddw mm2,mm3
- psrlw mm0,4 ; mm0=Out0LO=( 1 3 5 7)
- psrlw mm2,4 ; mm2=Out0HO=( 9 11 13 15)
-
- psllw mm0,BYTE_BIT
- psllw mm2,BYTE_BIT
- por mm1,mm0 ; mm1=Out0L=( 0 1 2 3 4 5 6 7)
- por mm5,mm2 ; mm5=Out0H=( 8 9 10 11 12 13 14 15)
-
- movq MMWORD [edx+0*SIZEOF_MMWORD], mm1
- movq MMWORD [edx+1*SIZEOF_MMWORD], mm5
-
- ; -- process the lower row
-
- movq mm6, MMWORD [edi+0*SIZEOF_MMWORD] ; mm6=Int1L=( 0 1 2 3)
- movq mm4, MMWORD [edi+1*SIZEOF_MMWORD] ; mm4=Int1H=( 4 5 6 7)
-
- movq mm7,mm6
- movq mm3,mm4
- psrlq mm7,2*BYTE_BIT ; mm7=( 1 2 3 -)
- psllq mm3,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm3=( - - - 4)
- movq mm0,mm6
- movq mm2,mm4
- psrlq mm0,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm0=( 3 - - -)
- psllq mm2,2*BYTE_BIT ; mm2=( - 4 5 6)
-
- por mm7,mm3 ; mm7=( 1 2 3 4)
- por mm0,mm2 ; mm0=( 3 4 5 6)
-
- movq mm1,mm6
- movq mm5,mm4
- psllq mm1,2*BYTE_BIT ; mm1=( - 0 1 2)
- psrlq mm5,2*BYTE_BIT ; mm5=( 5 6 7 -)
- movq mm3,mm4
- psrlq mm3,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm3=( 7 - - -)
-
- por mm1, MMWORD [wk(1)] ; mm1=(-1 0 1 2)
- por mm5, MMWORD [wk(3)] ; mm5=( 5 6 7 8)
-
- movq MMWORD [wk(1)], mm3
-
- pmullw mm6,[GOTOFF(ebx,PW_THREE)]
- pmullw mm4,[GOTOFF(ebx,PW_THREE)]
- paddw mm1,[GOTOFF(ebx,PW_EIGHT)]
- paddw mm0,[GOTOFF(ebx,PW_EIGHT)]
- paddw mm7,[GOTOFF(ebx,PW_SEVEN)]
- paddw mm5,[GOTOFF(ebx,PW_SEVEN)]
-
- paddw mm1,mm6
- paddw mm0,mm4
- psrlw mm1,4 ; mm1=Out1LE=( 0 2 4 6)
- psrlw mm0,4 ; mm0=Out1HE=( 8 10 12 14)
- paddw mm7,mm6
- paddw mm5,mm4
- psrlw mm7,4 ; mm7=Out1LO=( 1 3 5 7)
- psrlw mm5,4 ; mm5=Out1HO=( 9 11 13 15)
-
- psllw mm7,BYTE_BIT
- psllw mm5,BYTE_BIT
- por mm1,mm7 ; mm1=Out1L=( 0 1 2 3 4 5 6 7)
- por mm0,mm5 ; mm0=Out1H=( 8 9 10 11 12 13 14 15)
-
- movq MMWORD [edi+0*SIZEOF_MMWORD], mm1
- movq MMWORD [edi+1*SIZEOF_MMWORD], mm0
-
- poppic ebx
-
- sub eax, byte SIZEOF_MMWORD
- add ecx, byte 1*SIZEOF_MMWORD ; inptr1(above)
- add ebx, byte 1*SIZEOF_MMWORD ; inptr0
- add esi, byte 1*SIZEOF_MMWORD ; inptr1(below)
- add edx, byte 2*SIZEOF_MMWORD ; outptr0
- add edi, byte 2*SIZEOF_MMWORD ; outptr1
- cmp eax, byte SIZEOF_MMWORD
- ja near .columnloop
- test eax,eax
- jnz near .columnloop_last
-
- pop esi
- pop edi
- pop ecx
- pop eax
-
- add esi, byte 1*SIZEOF_JSAMPROW ; input_data
- add edi, byte 2*SIZEOF_JSAMPROW ; output_data
- sub ecx, byte 2 ; rowctr
- jg near .rowloop
-
- emms ; empty MMX state
-
-.return:
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- pop ebx
- mov esp,ebp ; esp <- aligned ebp
- pop esp ; esp <- original ebp
- pop ebp
- ret
-
-; --------------------------------------------------------------------------
-;
-; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
-; It's still a box filter.
-;
-; GLOBAL(void)
-; jsimd_h2v1_upsample_mmx (int max_v_samp_factor,
-; JDIMENSION output_width,
-; JSAMPARRAY input_data,
-; JSAMPARRAY *output_data_ptr);
-;
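
For the plain box filter there is no arithmetic at all: each input byte is
written twice, which is what the punpcklbw/punpckhbw self-unpacks below achieve
eight bytes at a time. A scalar sketch:

    #include <stddef.h>
    #include <stdint.h>

    /* Sketch of one row of h2v1 box upsampling: pixel doubling. */
    static void h2v1_upsample_row(const uint8_t *s, uint8_t *out, size_t width)
    {
      for (size_t i = 0; i < width; i++)
        out[2 * i] = out[2 * i + 1] = s[i];
    }
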
-
-%define max_v_samp(b) (b)+8 ; int max_v_samp_factor
-%define output_width(b) (b)+12 ; JDIMENSION output_width
-%define input_data(b) (b)+16 ; JSAMPARRAY input_data
-%define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr
-
- align 16
- global EXTN(jsimd_h2v1_upsample_mmx)
-
-EXTN(jsimd_h2v1_upsample_mmx):
- push ebp
- mov ebp,esp
-; push ebx ; unused
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- mov edx, JDIMENSION [output_width(ebp)]
- add edx, byte (2*SIZEOF_MMWORD)-1
- and edx, byte -(2*SIZEOF_MMWORD)
- jz short .return
-
- mov ecx, INT [max_v_samp(ebp)] ; rowctr
- test ecx,ecx
- jz short .return
-
- mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
- mov edi, POINTER [output_data_ptr(ebp)]
- mov edi, JSAMPARRAY [edi] ; output_data
- alignx 16,7
-.rowloop:
- push edi
- push esi
-
- mov esi, JSAMPROW [esi] ; inptr
- mov edi, JSAMPROW [edi] ; outptr
- mov eax,edx ; colctr
- alignx 16,7
-.columnloop:
-
- movq mm0, MMWORD [esi+0*SIZEOF_MMWORD]
-
- movq mm1,mm0
- punpcklbw mm0,mm0
- punpckhbw mm1,mm1
-
- movq MMWORD [edi+0*SIZEOF_MMWORD], mm0
- movq MMWORD [edi+1*SIZEOF_MMWORD], mm1
-
- sub eax, byte 2*SIZEOF_MMWORD
- jz short .nextrow
-
- movq mm2, MMWORD [esi+1*SIZEOF_MMWORD]
-
- movq mm3,mm2
- punpcklbw mm2,mm2
- punpckhbw mm3,mm3
-
- movq MMWORD [edi+2*SIZEOF_MMWORD], mm2
- movq MMWORD [edi+3*SIZEOF_MMWORD], mm3
-
- sub eax, byte 2*SIZEOF_MMWORD
- jz short .nextrow
-
- add esi, byte 2*SIZEOF_MMWORD ; inptr
- add edi, byte 4*SIZEOF_MMWORD ; outptr
- jmp short .columnloop
- alignx 16,7
-
-.nextrow:
- pop esi
- pop edi
-
- add esi, byte SIZEOF_JSAMPROW ; input_data
- add edi, byte SIZEOF_JSAMPROW ; output_data
- dec ecx ; rowctr
- jg short .rowloop
-
- emms ; empty MMX state
-
-.return:
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
-; pop ebx ; unused
- pop ebp
- ret
-
-; --------------------------------------------------------------------------
-;
-; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
-; It's still a box filter.
-;
-; GLOBAL(void)
-; jsimd_h2v2_upsample_mmx (int max_v_samp_factor,
-; JDIMENSION output_width,
-; JSAMPARRAY input_data,
-; JSAMPARRAY *output_data_ptr);
-;
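
The 2:2 box filter is the same horizontal doubling with the result written to
two output rows, exactly as the code below stores each mm register to both
outptr0 and outptr1. In terms of the previous sketch (h2v1_upsample_row is the
hypothetical helper defined there):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Sketch: double horizontally once, then duplicate the row vertically. */
    static void h2v2_upsample_rows(const uint8_t *in, uint8_t *out_row0,
                                   uint8_t *out_row1, size_t width)
    {
      h2v1_upsample_row(in, out_row0, width);
      memcpy(out_row1, out_row0, 2 * width);
    }
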
-
-%define max_v_samp(b) (b)+8 ; int max_v_samp_factor
-%define output_width(b) (b)+12 ; JDIMENSION output_width
-%define input_data(b) (b)+16 ; JSAMPARRAY input_data
-%define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr
-
- align 16
- global EXTN(jsimd_h2v2_upsample_mmx)
-
-EXTN(jsimd_h2v2_upsample_mmx):
- push ebp
- mov ebp,esp
- push ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- mov edx, JDIMENSION [output_width(ebp)]
- add edx, byte (2*SIZEOF_MMWORD)-1
- and edx, byte -(2*SIZEOF_MMWORD)
- jz near .return
-
- mov ecx, INT [max_v_samp(ebp)] ; rowctr
- test ecx,ecx
- jz short .return
-
- mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
- mov edi, POINTER [output_data_ptr(ebp)]
- mov edi, JSAMPARRAY [edi] ; output_data
- alignx 16,7
-.rowloop:
- push edi
- push esi
-
- mov esi, JSAMPROW [esi] ; inptr
- mov ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0
- mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1
- mov eax,edx ; colctr
- alignx 16,7
-.columnloop:
-
- movq mm0, MMWORD [esi+0*SIZEOF_MMWORD]
-
- movq mm1,mm0
- punpcklbw mm0,mm0
- punpckhbw mm1,mm1
-
- movq MMWORD [ebx+0*SIZEOF_MMWORD], mm0
- movq MMWORD [ebx+1*SIZEOF_MMWORD], mm1
- movq MMWORD [edi+0*SIZEOF_MMWORD], mm0
- movq MMWORD [edi+1*SIZEOF_MMWORD], mm1
-
- sub eax, byte 2*SIZEOF_MMWORD
- jz short .nextrow
-
- movq mm2, MMWORD [esi+1*SIZEOF_MMWORD]
-
- movq mm3,mm2
- punpcklbw mm2,mm2
- punpckhbw mm3,mm3
-
- movq MMWORD [ebx+2*SIZEOF_MMWORD], mm2
- movq MMWORD [ebx+3*SIZEOF_MMWORD], mm3
- movq MMWORD [edi+2*SIZEOF_MMWORD], mm2
- movq MMWORD [edi+3*SIZEOF_MMWORD], mm3
-
- sub eax, byte 2*SIZEOF_MMWORD
- jz short .nextrow
-
- add esi, byte 2*SIZEOF_MMWORD ; inptr
- add ebx, byte 4*SIZEOF_MMWORD ; outptr0
- add edi, byte 4*SIZEOF_MMWORD ; outptr1
- jmp short .columnloop
- alignx 16,7
-
-.nextrow:
- pop esi
- pop edi
-
- add esi, byte 1*SIZEOF_JSAMPROW ; input_data
- add edi, byte 2*SIZEOF_JSAMPROW ; output_data
- sub ecx, byte 2 ; rowctr
- jg short .rowloop
-
- emms ; empty MMX state
-
-.return:
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- pop ebx
- pop ebp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
diff --git a/media/libjpeg/simd/jdsample-sse2-64.asm b/media/libjpeg/simd/jdsample-sse2-64.asm
deleted file mode 100644
index 1faaed648a..0000000000
--- a/media/libjpeg/simd/jdsample-sse2-64.asm
+++ /dev/null
@@ -1,670 +0,0 @@
-;
-; jdsample.asm - upsampling (64-bit SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler); it can
-; *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
- alignz 16
- global EXTN(jconst_fancy_upsample_sse2)
-
-EXTN(jconst_fancy_upsample_sse2):
-
-PW_ONE times 8 dw 1
-PW_TWO times 8 dw 2
-PW_THREE times 8 dw 3
-PW_SEVEN times 8 dw 7
-PW_EIGHT times 8 dw 8
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 64
-;
-; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
-;
-; The upsampling algorithm is linear interpolation between pixel centers,
-; also known as a "triangle filter". This is a good compromise between
-; speed and visual quality. The centers of the output pixels are 1/4 and 3/4
-; of the way between input pixel centers.
-;
-; GLOBAL(void)
-; jsimd_h2v1_fancy_upsample_sse2 (int max_v_samp_factor,
-; JDIMENSION downsampled_width,
-; JSAMPARRAY input_data,
-; JSAMPARRAY *output_data_ptr);
-;
-
-; r10 = int max_v_samp_factor
-; r11 = JDIMENSION downsampled_width
-; r12 = JSAMPARRAY input_data
-; r13 = JSAMPARRAY *output_data_ptr
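
(The register comments above reflect the convention established by the
collect_args macro from jsimdext.inc: it copies the incoming argument registers
of the host ABI into r10-r13 (rdi/rsi/rdx/rcx on SysV/AMD64; rcx/rdx/r8/r9 on
Win64), so a single function body serves both calling conventions. This is an
editorial gloss of the macro's behavior, not text quoted from it.)
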
-
- align 16
- global EXTN(jsimd_h2v1_fancy_upsample_sse2)
-
-EXTN(jsimd_h2v1_fancy_upsample_sse2):
- push rbp
- mov rax,rsp
- mov rbp,rsp
- collect_args
-
- mov eax, r11d ; colctr
- test rax,rax
- jz near .return
-
- mov rcx, r10 ; rowctr
- test rcx,rcx
- jz near .return
-
- mov rsi, r12 ; input_data
- mov rdi, r13
- mov rdi, JSAMPARRAY [rdi] ; output_data
-.rowloop:
- push rax ; colctr
- push rdi
- push rsi
-
- mov rsi, JSAMPROW [rsi] ; inptr
- mov rdi, JSAMPROW [rdi] ; outptr
-
- test rax, SIZEOF_XMMWORD-1
- jz short .skip
- mov dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
- mov JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
-.skip:
- pxor xmm0,xmm0 ; xmm0=(all 0's)
- pcmpeqb xmm7,xmm7
- psrldq xmm7,(SIZEOF_XMMWORD-1)
- pand xmm7, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-
- add rax, byte SIZEOF_XMMWORD-1
- and rax, byte -SIZEOF_XMMWORD
- cmp rax, byte SIZEOF_XMMWORD
- ja short .columnloop
-
-.columnloop_last:
- pcmpeqb xmm6,xmm6
- pslldq xmm6,(SIZEOF_XMMWORD-1)
- pand xmm6, XMMWORD [rsi+0*SIZEOF_XMMWORD]
- jmp short .upsample
-
-.columnloop:
- movdqa xmm6, XMMWORD [rsi+1*SIZEOF_XMMWORD]
- pslldq xmm6,(SIZEOF_XMMWORD-1)
-
-.upsample:
- movdqa xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
- movdqa xmm2,xmm1
- movdqa xmm3,xmm1 ; xmm1=( 0 1 2 ... 13 14 15)
- pslldq xmm2,1 ; xmm2=(-- 0 1 ... 12 13 14)
- psrldq xmm3,1 ; xmm3=( 1 2 3 ... 14 15 --)
-
- por xmm2,xmm7 ; xmm2=(-1 0 1 ... 12 13 14)
- por xmm3,xmm6 ; xmm3=( 1 2 3 ... 14 15 16)
-
- movdqa xmm7,xmm1
- psrldq xmm7,(SIZEOF_XMMWORD-1) ; xmm7=(15 -- -- ... -- -- --)
-
- movdqa xmm4,xmm1
- punpcklbw xmm1,xmm0 ; xmm1=( 0 1 2 3 4 5 6 7)
- punpckhbw xmm4,xmm0 ; xmm4=( 8 9 10 11 12 13 14 15)
- movdqa xmm5,xmm2
- punpcklbw xmm2,xmm0 ; xmm2=(-1 0 1 2 3 4 5 6)
- punpckhbw xmm5,xmm0 ; xmm5=( 7 8 9 10 11 12 13 14)
- movdqa xmm6,xmm3
- punpcklbw xmm3,xmm0 ; xmm3=( 1 2 3 4 5 6 7 8)
- punpckhbw xmm6,xmm0 ; xmm6=( 9 10 11 12 13 14 15 16)
-
- pmullw xmm1,[rel PW_THREE]
- pmullw xmm4,[rel PW_THREE]
- paddw xmm2,[rel PW_ONE]
- paddw xmm5,[rel PW_ONE]
- paddw xmm3,[rel PW_TWO]
- paddw xmm6,[rel PW_TWO]
-
- paddw xmm2,xmm1
- paddw xmm5,xmm4
- psrlw xmm2,2 ; xmm2=OutLE=( 0 2 4 6 8 10 12 14)
- psrlw xmm5,2 ; xmm5=OutHE=(16 18 20 22 24 26 28 30)
- paddw xmm3,xmm1
- paddw xmm6,xmm4
- psrlw xmm3,2 ; xmm3=OutLO=( 1 3 5 7 9 11 13 15)
- psrlw xmm6,2 ; xmm6=OutHO=(17 19 21 23 25 27 29 31)
-
- psllw xmm3,BYTE_BIT
- psllw xmm6,BYTE_BIT
- por xmm2,xmm3 ; xmm2=OutL=( 0 1 2 ... 13 14 15)
- por xmm5,xmm6 ; xmm5=OutH=(16 17 18 ... 29 30 31)
-
- movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
- movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm5
-
- sub rax, byte SIZEOF_XMMWORD
- add rsi, byte 1*SIZEOF_XMMWORD ; inptr
- add rdi, byte 2*SIZEOF_XMMWORD ; outptr
- cmp rax, byte SIZEOF_XMMWORD
- ja near .columnloop
- test eax,eax
- jnz near .columnloop_last
-
- pop rsi
- pop rdi
- pop rax
-
- add rsi, byte SIZEOF_JSAMPROW ; input_data
- add rdi, byte SIZEOF_JSAMPROW ; output_data
- dec rcx ; rowctr
- jg near .rowloop
-
-.return:
- uncollect_args
- pop rbp
- ret
-
-; --------------------------------------------------------------------------
-;
-; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
-; Again a triangle filter; see comments for h2v1 case, above.
-;
-; GLOBAL(void)
-; jsimd_h2v2_fancy_upsample_sse2 (int max_v_samp_factor,
-; JDIMENSION downsampled_width,
-; JSAMPARRAY input_data,
-; JSAMPARRAY *output_data_ptr);
-;
-
-; r10 = int max_v_samp_factor
-; r11 = JDIMENSION downsampled_width
-; r12 = JSAMPARRAY input_data
-; r13 = JSAMPARRAY *output_data_ptr
-
-%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 4
-
- align 16
- global EXTN(jsimd_h2v2_fancy_upsample_sse2)
-
-EXTN(jsimd_h2v2_fancy_upsample_sse2):
- push rbp
- mov rax,rsp ; rax = original rbp
- sub rsp, byte 4
- and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [rsp],rax
- mov rbp,rsp ; rbp = aligned rbp
- lea rsp, [wk(0)]
- collect_args
- push rbx
-
- mov eax, r11d ; colctr
- test rax,rax
- jz near .return
-
- mov rcx, r10 ; rowctr
- test rcx,rcx
- jz near .return
-
- mov rsi, r12 ; input_data
- mov rdi, r13
- mov rdi, JSAMPARRAY [rdi] ; output_data
-.rowloop:
- push rax ; colctr
- push rcx
- push rdi
- push rsi
-
- mov rcx, JSAMPROW [rsi-1*SIZEOF_JSAMPROW] ; inptr1(above)
- mov rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; inptr0
- mov rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; inptr1(below)
- mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0
- mov rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1
-
- test rax, SIZEOF_XMMWORD-1
- jz short .skip
- push rdx
- mov dl, JSAMPLE [rcx+(rax-1)*SIZEOF_JSAMPLE]
- mov JSAMPLE [rcx+rax*SIZEOF_JSAMPLE], dl
- mov dl, JSAMPLE [rbx+(rax-1)*SIZEOF_JSAMPLE]
- mov JSAMPLE [rbx+rax*SIZEOF_JSAMPLE], dl
- mov dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
- mov JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
- pop rdx
-.skip:
- ; -- process the first column block
-
- movdqa xmm0, XMMWORD [rbx+0*SIZEOF_XMMWORD] ; xmm0=row[ 0][0]
- movdqa xmm1, XMMWORD [rcx+0*SIZEOF_XMMWORD] ; xmm1=row[-1][0]
- movdqa xmm2, XMMWORD [rsi+0*SIZEOF_XMMWORD] ; xmm2=row[+1][0]
-
- pxor xmm3,xmm3 ; xmm3=(all 0's)
- movdqa xmm4,xmm0
- punpcklbw xmm0,xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7)
- punpckhbw xmm4,xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15)
- movdqa xmm5,xmm1
- punpcklbw xmm1,xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7)
- punpckhbw xmm5,xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15)
- movdqa xmm6,xmm2
- punpcklbw xmm2,xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7)
- punpckhbw xmm6,xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15)
-
- pmullw xmm0,[rel PW_THREE]
- pmullw xmm4,[rel PW_THREE]
-
- pcmpeqb xmm7,xmm7
- psrldq xmm7,(SIZEOF_XMMWORD-2)
-
- paddw xmm1,xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7)
- paddw xmm5,xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15)
- paddw xmm2,xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7)
- paddw xmm6,xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15)
-
- movdqa XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1 ; temporarily save
- movdqa XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5 ; the intermediate data
- movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
- movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm6
-
- pand xmm1,xmm7 ; xmm1=( 0 -- -- -- -- -- -- --)
- pand xmm2,xmm7 ; xmm2=( 0 -- -- -- -- -- -- --)
-
- movdqa XMMWORD [wk(0)], xmm1
- movdqa XMMWORD [wk(1)], xmm2
-
- add rax, byte SIZEOF_XMMWORD-1
- and rax, byte -SIZEOF_XMMWORD
- cmp rax, byte SIZEOF_XMMWORD
- ja short .columnloop
-
-.columnloop_last:
- ; -- process the last column block
-
- pcmpeqb xmm1,xmm1
- pslldq xmm1,(SIZEOF_XMMWORD-2)
- movdqa xmm2,xmm1
-
- pand xmm1, XMMWORD [rdx+1*SIZEOF_XMMWORD]
- pand xmm2, XMMWORD [rdi+1*SIZEOF_XMMWORD]
-
- movdqa XMMWORD [wk(2)], xmm1 ; xmm1=(-- -- -- -- -- -- -- 15)
- movdqa XMMWORD [wk(3)], xmm2 ; xmm2=(-- -- -- -- -- -- -- 15)
-
- jmp near .upsample
-
-.columnloop:
- ; -- process the next column block
-
- movdqa xmm0, XMMWORD [rbx+1*SIZEOF_XMMWORD] ; xmm0=row[ 0][1]
- movdqa xmm1, XMMWORD [rcx+1*SIZEOF_XMMWORD] ; xmm1=row[-1][1]
- movdqa xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD] ; xmm2=row[+1][1]
-
- pxor xmm3,xmm3 ; xmm3=(all 0's)
- movdqa xmm4,xmm0
- punpcklbw xmm0,xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7)
- punpckhbw xmm4,xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15)
- movdqa xmm5,xmm1
- punpcklbw xmm1,xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7)
- punpckhbw xmm5,xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15)
- movdqa xmm6,xmm2
- punpcklbw xmm2,xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7)
- punpckhbw xmm6,xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15)
-
- pmullw xmm0,[rel PW_THREE]
- pmullw xmm4,[rel PW_THREE]
-
- paddw xmm1,xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7)
- paddw xmm5,xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15)
- paddw xmm2,xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7)
- paddw xmm6,xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15)
-
- movdqa XMMWORD [rdx+2*SIZEOF_XMMWORD], xmm1 ; temporarily save
- movdqa XMMWORD [rdx+3*SIZEOF_XMMWORD], xmm5 ; the intermediate data
- movdqa XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
- movdqa XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm6
-
- pslldq xmm1,(SIZEOF_XMMWORD-2) ; xmm1=(-- -- -- -- -- -- -- 0)
- pslldq xmm2,(SIZEOF_XMMWORD-2) ; xmm2=(-- -- -- -- -- -- -- 0)
-
- movdqa XMMWORD [wk(2)], xmm1
- movdqa XMMWORD [wk(3)], xmm2
-
-.upsample:
- ; -- process the upper row
-
- movdqa xmm7, XMMWORD [rdx+0*SIZEOF_XMMWORD]
- movdqa xmm3, XMMWORD [rdx+1*SIZEOF_XMMWORD]
-
- movdqa xmm0,xmm7 ; xmm7=Int0L=( 0 1 2 3 4 5 6 7)
- movdqa xmm4,xmm3 ; xmm3=Int0H=( 8 9 10 11 12 13 14 15)
- psrldq xmm0,2 ; xmm0=( 1 2 3 4 5 6 7 --)
- pslldq xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(-- -- -- -- -- -- -- 8)
- movdqa xmm5,xmm7
- movdqa xmm6,xmm3
- psrldq xmm5,(SIZEOF_XMMWORD-2) ; xmm5=( 7 -- -- -- -- -- -- --)
- pslldq xmm6,2 ; xmm6=(-- 8 9 10 11 12 13 14)
-
- por xmm0,xmm4 ; xmm0=( 1 2 3 4 5 6 7 8)
- por xmm5,xmm6 ; xmm5=( 7 8 9 10 11 12 13 14)
-
- movdqa xmm1,xmm7
- movdqa xmm2,xmm3
- pslldq xmm1,2 ; xmm1=(-- 0 1 2 3 4 5 6)
- psrldq xmm2,2 ; xmm2=( 9 10 11 12 13 14 15 --)
- movdqa xmm4,xmm3
- psrldq xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(15 -- -- -- -- -- -- --)
-
- por xmm1, XMMWORD [wk(0)] ; xmm1=(-1 0 1 2 3 4 5 6)
- por xmm2, XMMWORD [wk(2)] ; xmm2=( 9 10 11 12 13 14 15 16)
-
- movdqa XMMWORD [wk(0)], xmm4
-
- pmullw xmm7,[rel PW_THREE]
- pmullw xmm3,[rel PW_THREE]
- paddw xmm1,[rel PW_EIGHT]
- paddw xmm5,[rel PW_EIGHT]
- paddw xmm0,[rel PW_SEVEN]
- paddw xmm2,[rel PW_SEVEN]
-
- paddw xmm1,xmm7
- paddw xmm5,xmm3
- psrlw xmm1,4 ; xmm1=Out0LE=( 0 2 4 6 8 10 12 14)
- psrlw xmm5,4 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
- paddw xmm0,xmm7
- paddw xmm2,xmm3
- psrlw xmm0,4 ; xmm0=Out0LO=( 1 3 5 7 9 11 13 15)
- psrlw xmm2,4 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31)
-
- psllw xmm0,BYTE_BIT
- psllw xmm2,BYTE_BIT
- por xmm1,xmm0 ; xmm1=Out0L=( 0 1 2 ... 13 14 15)
- por xmm5,xmm2 ; xmm5=Out0H=(16 17 18 ... 29 30 31)
-
- movdqa XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1
- movdqa XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5
-
- ; -- process the lower row
-
- movdqa xmm6, XMMWORD [rdi+0*SIZEOF_XMMWORD]
- movdqa xmm4, XMMWORD [rdi+1*SIZEOF_XMMWORD]
-
- movdqa xmm7,xmm6 ; xmm6=Int1L=( 0 1 2 3 4 5 6 7)
- movdqa xmm3,xmm4 ; xmm4=Int1H=( 8 9 10 11 12 13 14 15)
- psrldq xmm7,2 ; xmm7=( 1 2 3 4 5 6 7 --)
- pslldq xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(-- -- -- -- -- -- -- 8)
- movdqa xmm0,xmm6
- movdqa xmm2,xmm4
- psrldq xmm0,(SIZEOF_XMMWORD-2) ; xmm0=( 7 -- -- -- -- -- -- --)
- pslldq xmm2,2 ; xmm2=(-- 8 9 10 11 12 13 14)
-
- por xmm7,xmm3 ; xmm7=( 1 2 3 4 5 6 7 8)
- por xmm0,xmm2 ; xmm0=( 7 8 9 10 11 12 13 14)
-
- movdqa xmm1,xmm6
- movdqa xmm5,xmm4
- pslldq xmm1,2 ; xmm1=(-- 0 1 2 3 4 5 6)
- psrldq xmm5,2 ; xmm5=( 9 10 11 12 13 14 15 --)
- movdqa xmm3,xmm4
- psrldq xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(15 -- -- -- -- -- -- --)
-
- por xmm1, XMMWORD [wk(1)] ; xmm1=(-1 0 1 2 3 4 5 6)
- por xmm5, XMMWORD [wk(3)] ; xmm5=( 9 10 11 12 13 14 15 16)
-
- movdqa XMMWORD [wk(1)], xmm3
-
- pmullw xmm6,[rel PW_THREE]
- pmullw xmm4,[rel PW_THREE]
- paddw xmm1,[rel PW_EIGHT]
- paddw xmm0,[rel PW_EIGHT]
- paddw xmm7,[rel PW_SEVEN]
- paddw xmm5,[rel PW_SEVEN]
-
- paddw xmm1,xmm6
- paddw xmm0,xmm4
- psrlw xmm1,4 ; xmm1=Out1LE=( 0 2 4 6 8 10 12 14)
- psrlw xmm0,4 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
- paddw xmm7,xmm6
- paddw xmm5,xmm4
- psrlw xmm7,4 ; xmm7=Out1LO=( 1 3 5 7 9 11 13 15)
- psrlw xmm5,4 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31)
-
- psllw xmm7,BYTE_BIT
- psllw xmm5,BYTE_BIT
- por xmm1,xmm7 ; xmm1=Out1L=( 0 1 2 ... 13 14 15)
- por xmm0,xmm5 ; xmm0=Out1H=(16 17 18 ... 29 30 31)
-
- movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm1
- movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm0
-
- sub rax, byte SIZEOF_XMMWORD
- add rcx, byte 1*SIZEOF_XMMWORD ; inptr1(above)
- add rbx, byte 1*SIZEOF_XMMWORD ; inptr0
- add rsi, byte 1*SIZEOF_XMMWORD ; inptr1(below)
- add rdx, byte 2*SIZEOF_XMMWORD ; outptr0
- add rdi, byte 2*SIZEOF_XMMWORD ; outptr1
- cmp rax, byte SIZEOF_XMMWORD
- ja near .columnloop
- test rax,rax
- jnz near .columnloop_last
-
- pop rsi
- pop rdi
- pop rcx
- pop rax
-
- add rsi, byte 1*SIZEOF_JSAMPROW ; input_data
- add rdi, byte 2*SIZEOF_JSAMPROW ; output_data
- sub rcx, byte 2 ; rowctr
- jg near .rowloop
-
-.return:
- pop rbx
- uncollect_args
- mov rsp,rbp ; rsp <- aligned rbp
- pop rsp ; rsp <- original rbp
- pop rbp
- ret
-
-; --------------------------------------------------------------------------
-;
-; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
-; It's still a box filter.
-;
-; GLOBAL(void)
-; jsimd_h2v1_upsample_sse2 (int max_v_samp_factor,
-; JDIMENSION output_width,
-; JSAMPARRAY input_data,
-; JSAMPARRAY *output_data_ptr);
-;
-
-; r10 = int max_v_samp_factor
-; r11 = JDIMENSION output_width
-; r12 = JSAMPARRAY input_data
-; r13 = JSAMPARRAY *output_data_ptr
-
- align 16
- global EXTN(jsimd_h2v1_upsample_sse2)
-
-EXTN(jsimd_h2v1_upsample_sse2):
- push rbp
- mov rax,rsp
- mov rbp,rsp
- collect_args
-
- mov edx, r11d
- add rdx, byte (2*SIZEOF_XMMWORD)-1
- and rdx, byte -(2*SIZEOF_XMMWORD)
- jz near .return
-
- mov rcx, r10 ; rowctr
- test rcx,rcx
- jz short .return
-
- mov rsi, r12 ; input_data
- mov rdi, r13
- mov rdi, JSAMPARRAY [rdi] ; output_data
-.rowloop:
- push rdi
- push rsi
-
- mov rsi, JSAMPROW [rsi] ; inptr
- mov rdi, JSAMPROW [rdi] ; outptr
- mov rax,rdx ; colctr
-.columnloop:
-
- movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-
- movdqa xmm1,xmm0
- punpcklbw xmm0,xmm0
- punpckhbw xmm1,xmm1
-
- movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
- movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
-
- sub rax, byte 2*SIZEOF_XMMWORD
- jz short .nextrow
-
- movdqa xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]
-
- movdqa xmm3,xmm2
- punpcklbw xmm2,xmm2
- punpckhbw xmm3,xmm3
-
- movdqa XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
- movdqa XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3
-
- sub rax, byte 2*SIZEOF_XMMWORD
- jz short .nextrow
-
- add rsi, byte 2*SIZEOF_XMMWORD ; inptr
- add rdi, byte 4*SIZEOF_XMMWORD ; outptr
- jmp short .columnloop
-
-.nextrow:
- pop rsi
- pop rdi
-
- add rsi, byte SIZEOF_JSAMPROW ; input_data
- add rdi, byte SIZEOF_JSAMPROW ; output_data
- dec rcx ; rowctr
- jg short .rowloop
-
-.return:
- uncollect_args
- pop rbp
- ret
-
-; --------------------------------------------------------------------------
-;
-; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
-; It's still a box filter.
-;
-; GLOBAL(void)
-; jsimd_h2v2_upsample_sse2 (int max_v_samp_factor,
-; JDIMENSION output_width,
-; JSAMPARRAY input_data,
-; JSAMPARRAY *output_data_ptr);
-;
-
-; r10 = int max_v_samp_factor
-; r11 = JDIMENSION output_width
-; r12 = JSAMPARRAY input_data
-; r13 = JSAMPARRAY *output_data_ptr
-
- align 16
- global EXTN(jsimd_h2v2_upsample_sse2)
-
-EXTN(jsimd_h2v2_upsample_sse2):
- push rbp
- mov rax,rsp
- mov rbp,rsp
- collect_args
- push rbx
-
- mov edx, r11d
- add rdx, byte (2*SIZEOF_XMMWORD)-1
- and rdx, byte -(2*SIZEOF_XMMWORD)
- jz near .return
-
- mov rcx, r10 ; rowctr
- test rcx,rcx
- jz near .return
-
- mov rsi, r12 ; input_data
- mov rdi, r13
- mov rdi, JSAMPARRAY [rdi] ; output_data
-.rowloop:
- push rdi
- push rsi
-
- mov rsi, JSAMPROW [rsi] ; inptr
- mov rbx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0
- mov rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1
- mov rax,rdx ; colctr
-.columnloop:
-
- movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-
- movdqa xmm1,xmm0
- punpcklbw xmm0,xmm0
- punpckhbw xmm1,xmm1
-
- movdqa XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0
- movdqa XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1
- movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
- movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
-
- sub rax, byte 2*SIZEOF_XMMWORD
- jz short .nextrow
-
- movdqa xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]
-
- movdqa xmm3,xmm2
- punpcklbw xmm2,xmm2
- punpckhbw xmm3,xmm3
-
- movdqa XMMWORD [rbx+2*SIZEOF_XMMWORD], xmm2
- movdqa XMMWORD [rbx+3*SIZEOF_XMMWORD], xmm3
- movdqa XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
- movdqa XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3
-
- sub rax, byte 2*SIZEOF_XMMWORD
- jz short .nextrow
-
- add rsi, byte 2*SIZEOF_XMMWORD ; inptr
- add rbx, byte 4*SIZEOF_XMMWORD ; outptr0
- add rdi, byte 4*SIZEOF_XMMWORD ; outptr1
- jmp short .columnloop
-
-.nextrow:
- pop rsi
- pop rdi
-
- add rsi, byte 1*SIZEOF_JSAMPROW ; input_data
- add rdi, byte 2*SIZEOF_JSAMPROW ; output_data
- sub rcx, byte 2 ; rowctr
- jg near .rowloop
-
-.return:
- pop rbx
- uncollect_args
- pop rbp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
diff --git a/media/libjpeg/simd/jdsample-sse2.asm b/media/libjpeg/simd/jdsample-sse2.asm
deleted file mode 100644
index 1d0059e803..0000000000
--- a/media/libjpeg/simd/jdsample-sse2.asm
+++ /dev/null
@@ -1,728 +0,0 @@
-;
-; jdsample.asm - upsampling (SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler); it can
-; *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
- alignz 16
- global EXTN(jconst_fancy_upsample_sse2)
-
-EXTN(jconst_fancy_upsample_sse2):
-
-PW_ONE times 8 dw 1
-PW_TWO times 8 dw 2
-PW_THREE times 8 dw 3
-PW_SEVEN times 8 dw 7
-PW_EIGHT times 8 dw 8
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
-;
-; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
-;
-; The upsampling algorithm is linear interpolation between pixel centers,
-; also known as a "triangle filter". This is a good compromise between
-; speed and visual quality. The centers of the output pixels are 1/4 and 3/4
-; of the way between input pixel centers.
-;
-; GLOBAL(void)
-; jsimd_h2v1_fancy_upsample_sse2 (int max_v_samp_factor,
-; JDIMENSION downsampled_width,
-; JSAMPARRAY input_data,
-; JSAMPARRAY *output_data_ptr);
-;
-
-%define max_v_samp(b) (b)+8 ; int max_v_samp_factor
-%define downsamp_width(b) (b)+12 ; JDIMENSION downsampled_width
-%define input_data(b) (b)+16 ; JSAMPARRAY input_data
-%define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr
-
- align 16
- global EXTN(jsimd_h2v1_fancy_upsample_sse2)
-
-EXTN(jsimd_h2v1_fancy_upsample_sse2):
- push ebp
- mov ebp,esp
- pushpic ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- get_GOT ebx ; get GOT address
-
- mov eax, JDIMENSION [downsamp_width(ebp)] ; colctr
- test eax,eax
- jz near .return
-
- mov ecx, INT [max_v_samp(ebp)] ; rowctr
- test ecx,ecx
- jz near .return
-
- mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
- mov edi, POINTER [output_data_ptr(ebp)]
- mov edi, JSAMPARRAY [edi] ; output_data
- alignx 16,7
-.rowloop:
- push eax ; colctr
- push edi
- push esi
-
- mov esi, JSAMPROW [esi] ; inptr
- mov edi, JSAMPROW [edi] ; outptr
-
- test eax, SIZEOF_XMMWORD-1
- jz short .skip
- mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
- mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
-.skip:
- pxor xmm0,xmm0 ; xmm0=(all 0's)
- pcmpeqb xmm7,xmm7
- psrldq xmm7,(SIZEOF_XMMWORD-1)
- pand xmm7, XMMWORD [esi+0*SIZEOF_XMMWORD]
-
- add eax, byte SIZEOF_XMMWORD-1
- and eax, byte -SIZEOF_XMMWORD
- cmp eax, byte SIZEOF_XMMWORD
- ja short .columnloop
- alignx 16,7
-
-.columnloop_last:
- pcmpeqb xmm6,xmm6
- pslldq xmm6,(SIZEOF_XMMWORD-1)
- pand xmm6, XMMWORD [esi+0*SIZEOF_XMMWORD]
- jmp short .upsample
- alignx 16,7
-
-.columnloop:
- movdqa xmm6, XMMWORD [esi+1*SIZEOF_XMMWORD]
- pslldq xmm6,(SIZEOF_XMMWORD-1)
-
-.upsample:
- movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
- movdqa xmm2,xmm1
- movdqa xmm3,xmm1 ; xmm1=( 0 1 2 ... 13 14 15)
- pslldq xmm2,1 ; xmm2=(-- 0 1 ... 12 13 14)
- psrldq xmm3,1 ; xmm3=( 1 2 3 ... 14 15 --)
-
- por xmm2,xmm7 ; xmm2=(-1 0 1 ... 12 13 14)
- por xmm3,xmm6 ; xmm3=( 1 2 3 ... 14 15 16)
-
- movdqa xmm7,xmm1
- psrldq xmm7,(SIZEOF_XMMWORD-1) ; xmm7=(15 -- -- ... -- -- --)
-
- movdqa xmm4,xmm1
- punpcklbw xmm1,xmm0 ; xmm1=( 0 1 2 3 4 5 6 7)
- punpckhbw xmm4,xmm0 ; xmm4=( 8 9 10 11 12 13 14 15)
- movdqa xmm5,xmm2
- punpcklbw xmm2,xmm0 ; xmm2=(-1 0 1 2 3 4 5 6)
- punpckhbw xmm5,xmm0 ; xmm5=( 7 8 9 10 11 12 13 14)
- movdqa xmm6,xmm3
- punpcklbw xmm3,xmm0 ; xmm3=( 1 2 3 4 5 6 7 8)
- punpckhbw xmm6,xmm0 ; xmm6=( 9 10 11 12 13 14 15 16)
-
- pmullw xmm1,[GOTOFF(ebx,PW_THREE)]
- pmullw xmm4,[GOTOFF(ebx,PW_THREE)]
- paddw xmm2,[GOTOFF(ebx,PW_ONE)]
- paddw xmm5,[GOTOFF(ebx,PW_ONE)]
- paddw xmm3,[GOTOFF(ebx,PW_TWO)]
- paddw xmm6,[GOTOFF(ebx,PW_TWO)]
-
- paddw xmm2,xmm1
- paddw xmm5,xmm4
- psrlw xmm2,2 ; xmm2=OutLE=( 0 2 4 6 8 10 12 14)
- psrlw xmm5,2 ; xmm5=OutHE=(16 18 20 22 24 26 28 30)
- paddw xmm3,xmm1
- paddw xmm6,xmm4
- psrlw xmm3,2 ; xmm3=OutLO=( 1 3 5 7 9 11 13 15)
- psrlw xmm6,2 ; xmm6=OutHO=(17 19 21 23 25 27 29 31)
-
- psllw xmm3,BYTE_BIT
- psllw xmm6,BYTE_BIT
- por xmm2,xmm3 ; xmm2=OutL=( 0 1 2 ... 13 14 15)
- por xmm5,xmm6 ; xmm5=OutH=(16 17 18 ... 29 30 31)
-
- movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2
- movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm5
-
- sub eax, byte SIZEOF_XMMWORD
- add esi, byte 1*SIZEOF_XMMWORD ; inptr
- add edi, byte 2*SIZEOF_XMMWORD ; outptr
- cmp eax, byte SIZEOF_XMMWORD
- ja near .columnloop
- test eax,eax
- jnz near .columnloop_last
-
- pop esi
- pop edi
- pop eax
-
- add esi, byte SIZEOF_JSAMPROW ; input_data
- add edi, byte SIZEOF_JSAMPROW ; output_data
- dec ecx ; rowctr
- jg near .rowloop
-
-.return:
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- poppic ebx
- pop ebp
- ret
-
-; --------------------------------------------------------------------------
-;
-; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
-; Again a triangle filter; see comments for h2v1 case, above.
-;
-; GLOBAL(void)
-; jsimd_h2v2_fancy_upsample_sse2 (int max_v_samp_factor,
-; JDIMENSION downsampled_width,
-; JSAMPARRAY input_data,
-; JSAMPARRAY *output_data_ptr);
-;
-
-%define max_v_samp(b) (b)+8 ; int max_v_samp_factor
-%define downsamp_width(b) (b)+12 ; JDIMENSION downsampled_width
-%define input_data(b) (b)+16 ; JSAMPARRAY input_data
-%define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr
-
-%define original_ebp ebp+0
-%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 4
-%define gotptr wk(0)-SIZEOF_POINTER ; void *gotptr
-
- align 16
- global EXTN(jsimd_h2v2_fancy_upsample_sse2)
-
-EXTN(jsimd_h2v2_fancy_upsample_sse2):
- push ebp
- mov eax,esp ; eax = original ebp
- sub esp, byte 4
- and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [esp],eax
- mov ebp,esp ; ebp = aligned ebp
- lea esp, [wk(0)]
-        pushpic eax             ; make room for the GOT address
- push ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- get_GOT ebx ; get GOT address
- movpic POINTER [gotptr], ebx ; save GOT address
-
- mov edx,eax ; edx = original ebp
- mov eax, JDIMENSION [downsamp_width(edx)] ; colctr
- test eax,eax
- jz near .return
-
- mov ecx, INT [max_v_samp(edx)] ; rowctr
- test ecx,ecx
- jz near .return
-
- mov esi, JSAMPARRAY [input_data(edx)] ; input_data
- mov edi, POINTER [output_data_ptr(edx)]
- mov edi, JSAMPARRAY [edi] ; output_data
- alignx 16,7
-.rowloop:
- push eax ; colctr
- push ecx
- push edi
- push esi
-
- mov ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW] ; inptr1(above)
- mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0
- mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1(below)
- mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0
- mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1
-
- test eax, SIZEOF_XMMWORD-1
- jz short .skip
- push edx
- mov dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
- mov JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
- mov dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
- mov JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
- mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
- mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
- pop edx
-.skip:
- ; -- process the first column block
-
- movdqa xmm0, XMMWORD [ebx+0*SIZEOF_XMMWORD] ; xmm0=row[ 0][0]
- movdqa xmm1, XMMWORD [ecx+0*SIZEOF_XMMWORD] ; xmm1=row[-1][0]
- movdqa xmm2, XMMWORD [esi+0*SIZEOF_XMMWORD] ; xmm2=row[+1][0]
-
- pushpic ebx
- movpic ebx, POINTER [gotptr] ; load GOT address
-
- pxor xmm3,xmm3 ; xmm3=(all 0's)
- movdqa xmm4,xmm0
- punpcklbw xmm0,xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7)
- punpckhbw xmm4,xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15)
- movdqa xmm5,xmm1
- punpcklbw xmm1,xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7)
- punpckhbw xmm5,xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15)
- movdqa xmm6,xmm2
- punpcklbw xmm2,xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7)
- punpckhbw xmm6,xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15)
-
- pmullw xmm0,[GOTOFF(ebx,PW_THREE)]
- pmullw xmm4,[GOTOFF(ebx,PW_THREE)]
-
- pcmpeqb xmm7,xmm7
- psrldq xmm7,(SIZEOF_XMMWORD-2)
-
- paddw xmm1,xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7)
- paddw xmm5,xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15)
- paddw xmm2,xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7)
- paddw xmm6,xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15)
-
- movdqa XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1 ; temporarily save
- movdqa XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5 ; the intermediate data
- movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2
- movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm6
-
- pand xmm1,xmm7 ; xmm1=( 0 -- -- -- -- -- -- --)
- pand xmm2,xmm7 ; xmm2=( 0 -- -- -- -- -- -- --)
-
- movdqa XMMWORD [wk(0)], xmm1
- movdqa XMMWORD [wk(1)], xmm2
-
- poppic ebx
-
- add eax, byte SIZEOF_XMMWORD-1
- and eax, byte -SIZEOF_XMMWORD
- cmp eax, byte SIZEOF_XMMWORD
- ja short .columnloop
- alignx 16,7
-
-.columnloop_last:
- ; -- process the last column block
-
- pushpic ebx
- movpic ebx, POINTER [gotptr] ; load GOT address
-
- pcmpeqb xmm1,xmm1
- pslldq xmm1,(SIZEOF_XMMWORD-2)
- movdqa xmm2,xmm1
-
- pand xmm1, XMMWORD [edx+1*SIZEOF_XMMWORD]
- pand xmm2, XMMWORD [edi+1*SIZEOF_XMMWORD]
-
- movdqa XMMWORD [wk(2)], xmm1 ; xmm1=(-- -- -- -- -- -- -- 15)
- movdqa XMMWORD [wk(3)], xmm2 ; xmm2=(-- -- -- -- -- -- -- 15)
-
- jmp near .upsample
- alignx 16,7
-
-.columnloop:
- ; -- process the next column block
-
- movdqa xmm0, XMMWORD [ebx+1*SIZEOF_XMMWORD] ; xmm0=row[ 0][1]
- movdqa xmm1, XMMWORD [ecx+1*SIZEOF_XMMWORD] ; xmm1=row[-1][1]
- movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD] ; xmm2=row[+1][1]
-
- pushpic ebx
- movpic ebx, POINTER [gotptr] ; load GOT address
-
- pxor xmm3,xmm3 ; xmm3=(all 0's)
- movdqa xmm4,xmm0
- punpcklbw xmm0,xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7)
- punpckhbw xmm4,xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15)
- movdqa xmm5,xmm1
- punpcklbw xmm1,xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7)
- punpckhbw xmm5,xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15)
- movdqa xmm6,xmm2
- punpcklbw xmm2,xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7)
- punpckhbw xmm6,xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15)
-
- pmullw xmm0,[GOTOFF(ebx,PW_THREE)]
- pmullw xmm4,[GOTOFF(ebx,PW_THREE)]
-
- paddw xmm1,xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7)
- paddw xmm5,xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15)
- paddw xmm2,xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7)
- paddw xmm6,xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15)
-
- movdqa XMMWORD [edx+2*SIZEOF_XMMWORD], xmm1 ; temporarily save
- movdqa XMMWORD [edx+3*SIZEOF_XMMWORD], xmm5 ; the intermediate data
- movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
- movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm6
-
- pslldq xmm1,(SIZEOF_XMMWORD-2) ; xmm1=(-- -- -- -- -- -- -- 0)
- pslldq xmm2,(SIZEOF_XMMWORD-2) ; xmm2=(-- -- -- -- -- -- -- 0)
-
- movdqa XMMWORD [wk(2)], xmm1
- movdqa XMMWORD [wk(3)], xmm2
-
-.upsample:
- ; -- process the upper row
-
- movdqa xmm7, XMMWORD [edx+0*SIZEOF_XMMWORD]
- movdqa xmm3, XMMWORD [edx+1*SIZEOF_XMMWORD]
-
- movdqa xmm0,xmm7 ; xmm7=Int0L=( 0 1 2 3 4 5 6 7)
- movdqa xmm4,xmm3 ; xmm3=Int0H=( 8 9 10 11 12 13 14 15)
- psrldq xmm0,2 ; xmm0=( 1 2 3 4 5 6 7 --)
- pslldq xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(-- -- -- -- -- -- -- 8)
- movdqa xmm5,xmm7
- movdqa xmm6,xmm3
- psrldq xmm5,(SIZEOF_XMMWORD-2) ; xmm5=( 7 -- -- -- -- -- -- --)
- pslldq xmm6,2 ; xmm6=(-- 8 9 10 11 12 13 14)
-
- por xmm0,xmm4 ; xmm0=( 1 2 3 4 5 6 7 8)
- por xmm5,xmm6 ; xmm5=( 7 8 9 10 11 12 13 14)
-
- movdqa xmm1,xmm7
- movdqa xmm2,xmm3
- pslldq xmm1,2 ; xmm1=(-- 0 1 2 3 4 5 6)
- psrldq xmm2,2 ; xmm2=( 9 10 11 12 13 14 15 --)
- movdqa xmm4,xmm3
- psrldq xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(15 -- -- -- -- -- -- --)
-
- por xmm1, XMMWORD [wk(0)] ; xmm1=(-1 0 1 2 3 4 5 6)
- por xmm2, XMMWORD [wk(2)] ; xmm2=( 9 10 11 12 13 14 15 16)
-
- movdqa XMMWORD [wk(0)], xmm4
-
- pmullw xmm7,[GOTOFF(ebx,PW_THREE)]
- pmullw xmm3,[GOTOFF(ebx,PW_THREE)]
- paddw xmm1,[GOTOFF(ebx,PW_EIGHT)]
- paddw xmm5,[GOTOFF(ebx,PW_EIGHT)]
- paddw xmm0,[GOTOFF(ebx,PW_SEVEN)]
- paddw xmm2,[GOTOFF(ebx,PW_SEVEN)]
-
- paddw xmm1,xmm7
- paddw xmm5,xmm3
- psrlw xmm1,4 ; xmm1=Out0LE=( 0 2 4 6 8 10 12 14)
- psrlw xmm5,4 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
- paddw xmm0,xmm7
- paddw xmm2,xmm3
- psrlw xmm0,4 ; xmm0=Out0LO=( 1 3 5 7 9 11 13 15)
- psrlw xmm2,4 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31)
-
- psllw xmm0,BYTE_BIT
- psllw xmm2,BYTE_BIT
- por xmm1,xmm0 ; xmm1=Out0L=( 0 1 2 ... 13 14 15)
- por xmm5,xmm2 ; xmm5=Out0H=(16 17 18 ... 29 30 31)
-
- movdqa XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1
- movdqa XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5
-
- ; -- process the lower row
-
- movdqa xmm6, XMMWORD [edi+0*SIZEOF_XMMWORD]
- movdqa xmm4, XMMWORD [edi+1*SIZEOF_XMMWORD]
-
- movdqa xmm7,xmm6 ; xmm6=Int1L=( 0 1 2 3 4 5 6 7)
- movdqa xmm3,xmm4 ; xmm4=Int1H=( 8 9 10 11 12 13 14 15)
- psrldq xmm7,2 ; xmm7=( 1 2 3 4 5 6 7 --)
- pslldq xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(-- -- -- -- -- -- -- 8)
- movdqa xmm0,xmm6
- movdqa xmm2,xmm4
- psrldq xmm0,(SIZEOF_XMMWORD-2) ; xmm0=( 7 -- -- -- -- -- -- --)
- pslldq xmm2,2 ; xmm2=(-- 8 9 10 11 12 13 14)
-
- por xmm7,xmm3 ; xmm7=( 1 2 3 4 5 6 7 8)
- por xmm0,xmm2 ; xmm0=( 7 8 9 10 11 12 13 14)
-
- movdqa xmm1,xmm6
- movdqa xmm5,xmm4
- pslldq xmm1,2 ; xmm1=(-- 0 1 2 3 4 5 6)
- psrldq xmm5,2 ; xmm5=( 9 10 11 12 13 14 15 --)
- movdqa xmm3,xmm4
- psrldq xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(15 -- -- -- -- -- -- --)
-
- por xmm1, XMMWORD [wk(1)] ; xmm1=(-1 0 1 2 3 4 5 6)
- por xmm5, XMMWORD [wk(3)] ; xmm5=( 9 10 11 12 13 14 15 16)
-
- movdqa XMMWORD [wk(1)], xmm3
-
- pmullw xmm6,[GOTOFF(ebx,PW_THREE)]
- pmullw xmm4,[GOTOFF(ebx,PW_THREE)]
- paddw xmm1,[GOTOFF(ebx,PW_EIGHT)]
- paddw xmm0,[GOTOFF(ebx,PW_EIGHT)]
- paddw xmm7,[GOTOFF(ebx,PW_SEVEN)]
- paddw xmm5,[GOTOFF(ebx,PW_SEVEN)]
-
- paddw xmm1,xmm6
- paddw xmm0,xmm4
- psrlw xmm1,4 ; xmm1=Out1LE=( 0 2 4 6 8 10 12 14)
- psrlw xmm0,4 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
- paddw xmm7,xmm6
- paddw xmm5,xmm4
- psrlw xmm7,4 ; xmm7=Out1LO=( 1 3 5 7 9 11 13 15)
- psrlw xmm5,4 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31)
-
- psllw xmm7,BYTE_BIT
- psllw xmm5,BYTE_BIT
- por xmm1,xmm7 ; xmm1=Out1L=( 0 1 2 ... 13 14 15)
- por xmm0,xmm5 ; xmm0=Out1H=(16 17 18 ... 29 30 31)
-
- movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm1
- movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm0
-
- poppic ebx
-
- sub eax, byte SIZEOF_XMMWORD
- add ecx, byte 1*SIZEOF_XMMWORD ; inptr1(above)
- add ebx, byte 1*SIZEOF_XMMWORD ; inptr0
- add esi, byte 1*SIZEOF_XMMWORD ; inptr1(below)
- add edx, byte 2*SIZEOF_XMMWORD ; outptr0
- add edi, byte 2*SIZEOF_XMMWORD ; outptr1
- cmp eax, byte SIZEOF_XMMWORD
- ja near .columnloop
- test eax,eax
- jnz near .columnloop_last
-
- pop esi
- pop edi
- pop ecx
- pop eax
-
- add esi, byte 1*SIZEOF_JSAMPROW ; input_data
- add edi, byte 2*SIZEOF_JSAMPROW ; output_data
- sub ecx, byte 2 ; rowctr
- jg near .rowloop
-
-.return:
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- pop ebx
- mov esp,ebp ; esp <- aligned ebp
- pop esp ; esp <- original ebp
- pop ebp
- ret
-
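For reference: the routine ending above is the h2v2 "fancy" upsampler, judging by its three input rows and its PW_THREE/PW_SEVEN/PW_EIGHT triangle-filter weights. A scalar C sketch of the same arithmetic, assuming the behavior of h2v2_fancy_upsample() in jdsample.c that this SIMD code mirrors; the helper name and edge handling here are illustrative, not from the library:

    #include <stdint.h>

    /* One output row: "above" is the adjacent input row (row[-1] or
     * row[+1]), "center" is row[0].  The vertical blend (3*center +
     * above) is what the pmullw PW_THREE / paddw steps compute; the
     * horizontal blend adds the left neighbor with +8 rounding for even
     * outputs and the right neighbor with +7 rounding for odd outputs
     * (PW_EIGHT / PW_SEVEN), then shifts right by 4. */
    static void h2v2_fancy_row(const uint8_t *above, const uint8_t *center,
                               uint8_t *out, int in_width)
    {
      for (int x = 0; x < in_width; x++) {
        int cur   = 3 * center[x] + above[x];
        int left  = (x > 0)            ? 3 * center[x-1] + above[x-1] : cur;
        int right = (x < in_width - 1) ? 3 * center[x+1] + above[x+1] : cur;
        out[2*x]     = (uint8_t)((3 * cur + left  + 8) >> 4);
        out[2*x + 1] = (uint8_t)((3 * cur + right + 7) >> 4);
      }
    }

Each output pixel is thus a 9:3:3:1 blend of its four nearest input samples, with edge columns replicated; the wk(0..3) slots in the assembly carry exactly those edge words across column blocks.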
-; --------------------------------------------------------------------------
-;
-; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
-; It's still a box filter.
-;
-; GLOBAL(void)
-; jsimd_h2v1_upsample_sse2 (int max_v_samp_factor,
-; JDIMENSION output_width,
-; JSAMPARRAY input_data,
-; JSAMPARRAY *output_data_ptr);
-;
-
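A minimal scalar sketch of what this routine computes: straight pixel replication, which the punpcklbw/punpckhbw self-interleave below performs 16 bytes at a time. The helper name is illustrative, not from the library:

    #include <stdint.h>

    static void h2v1_upsample_row(const uint8_t *in, uint8_t *out,
                                  int out_width)
    {
      for (int x = 0; x < out_width; x++)
        out[x] = in[x / 2];   /* each input sample is emitted twice */
    }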
-%define max_v_samp(b) (b)+8 ; int max_v_samp_factor
-%define output_width(b) (b)+12 ; JDIMENSION output_width
-%define input_data(b) (b)+16 ; JSAMPARRAY input_data
-%define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr
-
- align 16
- global EXTN(jsimd_h2v1_upsample_sse2)
-
-EXTN(jsimd_h2v1_upsample_sse2):
- push ebp
- mov ebp,esp
-; push ebx ; unused
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- mov edx, JDIMENSION [output_width(ebp)]
- add edx, byte (2*SIZEOF_XMMWORD)-1
- and edx, byte -(2*SIZEOF_XMMWORD)
- jz short .return
-
- mov ecx, INT [max_v_samp(ebp)] ; rowctr
- test ecx,ecx
- jz short .return
-
- mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
- mov edi, POINTER [output_data_ptr(ebp)]
- mov edi, JSAMPARRAY [edi] ; output_data
- alignx 16,7
-.rowloop:
- push edi
- push esi
-
- mov esi, JSAMPROW [esi] ; inptr
- mov edi, JSAMPROW [edi] ; outptr
- mov eax,edx ; colctr
- alignx 16,7
-.columnloop:
-
- movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
-
- movdqa xmm1,xmm0
- punpcklbw xmm0,xmm0
- punpckhbw xmm1,xmm1
-
- movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
- movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
-
- sub eax, byte 2*SIZEOF_XMMWORD
- jz short .nextrow
-
- movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]
-
- movdqa xmm3,xmm2
- punpcklbw xmm2,xmm2
- punpckhbw xmm3,xmm3
-
- movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
- movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3
-
- sub eax, byte 2*SIZEOF_XMMWORD
- jz short .nextrow
-
- add esi, byte 2*SIZEOF_XMMWORD ; inptr
- add edi, byte 4*SIZEOF_XMMWORD ; outptr
- jmp short .columnloop
- alignx 16,7
-
-.nextrow:
- pop esi
- pop edi
-
- add esi, byte SIZEOF_JSAMPROW ; input_data
- add edi, byte SIZEOF_JSAMPROW ; output_data
- dec ecx ; rowctr
- jg short .rowloop
-
-.return:
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
-; pop ebx ; unused
- pop ebp
- ret
-
-; --------------------------------------------------------------------------
-;
-; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
-; It's still a box filter.
-;
-; GLOBAL(void)
-; jsimd_h2v2_upsample_sse2 (int max_v_samp_factor,
-; JDIMENSION output_width,
-; JSAMPARRAY input_data,
-; JSAMPARRAY *output_data_ptr);
-;
-
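The h2v2 variant is the same box filter extended vertically: it widens one input row and stores it to two output rows (note the duplicated movdqa stores to ebx/outptr0 and edi/outptr1 below). A hedged scalar sketch with illustrative names:

    #include <stdint.h>
    #include <string.h>

    static void h2v2_upsample_rows(const uint8_t *in, uint8_t *out0,
                                   uint8_t *out1, int out_width)
    {
      for (int x = 0; x < out_width; x++)
        out0[x] = in[x / 2];                  /* 2:1 horizontal replication */
      memcpy(out1, out0, (size_t)out_width);  /* 2:1 vertical replication */
    }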
-%define max_v_samp(b) (b)+8 ; int max_v_samp_factor
-%define output_width(b) (b)+12 ; JDIMENSION output_width
-%define input_data(b) (b)+16 ; JSAMPARRAY input_data
-%define output_data_ptr(b) (b)+20 ; JSAMPARRAY *output_data_ptr
-
- align 16
- global EXTN(jsimd_h2v2_upsample_sse2)
-
-EXTN(jsimd_h2v2_upsample_sse2):
- push ebp
- mov ebp,esp
- push ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- mov edx, JDIMENSION [output_width(ebp)]
- add edx, byte (2*SIZEOF_XMMWORD)-1
- and edx, byte -(2*SIZEOF_XMMWORD)
- jz near .return
-
- mov ecx, INT [max_v_samp(ebp)] ; rowctr
- test ecx,ecx
- jz near .return
-
- mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
- mov edi, POINTER [output_data_ptr(ebp)]
- mov edi, JSAMPARRAY [edi] ; output_data
- alignx 16,7
-.rowloop:
- push edi
- push esi
-
- mov esi, JSAMPROW [esi] ; inptr
- mov ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0
- mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1
- mov eax,edx ; colctr
- alignx 16,7
-.columnloop:
-
- movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
-
- movdqa xmm1,xmm0
- punpcklbw xmm0,xmm0
- punpckhbw xmm1,xmm1
-
- movdqa XMMWORD [ebx+0*SIZEOF_XMMWORD], xmm0
- movdqa XMMWORD [ebx+1*SIZEOF_XMMWORD], xmm1
- movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
- movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
-
- sub eax, byte 2*SIZEOF_XMMWORD
- jz short .nextrow
-
- movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]
-
- movdqa xmm3,xmm2
- punpcklbw xmm2,xmm2
- punpckhbw xmm3,xmm3
-
- movdqa XMMWORD [ebx+2*SIZEOF_XMMWORD], xmm2
- movdqa XMMWORD [ebx+3*SIZEOF_XMMWORD], xmm3
- movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
- movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3
-
- sub eax, byte 2*SIZEOF_XMMWORD
- jz short .nextrow
-
- add esi, byte 2*SIZEOF_XMMWORD ; inptr
- add ebx, byte 4*SIZEOF_XMMWORD ; outptr0
- add edi, byte 4*SIZEOF_XMMWORD ; outptr1
- jmp short .columnloop
- alignx 16,7
-
-.nextrow:
- pop esi
- pop edi
-
- add esi, byte 1*SIZEOF_JSAMPROW ; input_data
- add edi, byte 2*SIZEOF_JSAMPROW ; output_data
- sub ecx, byte 2 ; rowctr
- jg short .rowloop
-
-.return:
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- pop ebx
- pop ebp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
diff --git a/media/libjpeg/simd/jfdctflt-3dn.asm b/media/libjpeg/simd/jfdctflt-3dn.asm
deleted file mode 100644
index 219161819a..0000000000
--- a/media/libjpeg/simd/jfdctflt-3dn.asm
+++ /dev/null
@@ -1,319 +0,0 @@
-;
-; jfdctflt.asm - floating-point FDCT (3DNow!)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler) and
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a floating-point implementation of the forward DCT
-; (Discrete Cosine Transform). The following code is based directly on
-; the IJG's original jfdctflt.c; see jfdctflt.c for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
- alignz 16
- global EXTN(jconst_fdct_float_3dnow)
-
-EXTN(jconst_fdct_float_3dnow):
-
-PD_0_382 times 2 dd 0.382683432365089771728460
-PD_0_707 times 2 dd 0.707106781186547524400844
-PD_0_541 times 2 dd 0.541196100146196984399723
-PD_1_306 times 2 dd 1.306562964876376527856643
-
- alignz 16
-
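These four constants are the rotation factors of the AA&N (Arai, Agui, Nakajima) scaled DCT. For orientation, a scalar C version of one 1-D pass, following the flowgraph of IJG's jfdctflt.c that this file parallelizes two lanes at a time (the z/tmp names match the register comments in the code below):

    /* One in-place 1-D pass over 8 floats; the 2-D transform applies
     * this to the 8 rows (Pass 1) and then the 8 columns (Pass 2). */
    static void fdct_float_1d(float *d)
    {
      float tmp0 = d[0] + d[7], tmp7 = d[0] - d[7];
      float tmp1 = d[1] + d[6], tmp6 = d[1] - d[6];
      float tmp2 = d[2] + d[5], tmp5 = d[2] - d[5];
      float tmp3 = d[3] + d[4], tmp4 = d[3] - d[4];

      /* Even part */
      float tmp10 = tmp0 + tmp3, tmp13 = tmp0 - tmp3;
      float tmp11 = tmp1 + tmp2, tmp12 = tmp1 - tmp2;
      d[0] = tmp10 + tmp11;
      d[4] = tmp10 - tmp11;
      float z1 = (tmp12 + tmp13) * 0.707106781f;   /* PD_0_707 */
      d[2] = tmp13 + z1;
      d[6] = tmp13 - z1;

      /* Odd part */
      tmp10 = tmp4 + tmp5; tmp11 = tmp5 + tmp6; tmp12 = tmp6 + tmp7;
      float z5 = (tmp10 - tmp12) * 0.382683433f;   /* PD_0_382 */
      float z2 = 0.541196100f * tmp10 + z5;        /* PD_0_541 */
      float z4 = 1.306562965f * tmp12 + z5;        /* PD_1_306 */
      float z3 = tmp11 * 0.707106781f;             /* PD_0_707 */
      float z11 = tmp7 + z3, z13 = tmp7 - z3;
      d[5] = z13 + z2;
      d[3] = z13 - z2;
      d[1] = z11 + z4;
      d[7] = z11 - z4;
    }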
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
-;
-; Perform the forward DCT on one block of samples.
-;
-; GLOBAL(void)
-; jsimd_fdct_float_3dnow (FAST_FLOAT *data)
-;
-
-%define data(b) (b)+8 ; FAST_FLOAT *data
-
-%define original_ebp ebp+0
-%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM]
-%define WK_NUM 2
-
- align 16
- global EXTN(jsimd_fdct_float_3dnow)
-
-EXTN(jsimd_fdct_float_3dnow):
- push ebp
- mov eax,esp ; eax = original ebp
- sub esp, byte 4
- and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
- mov [esp],eax
- mov ebp,esp ; ebp = aligned ebp
- lea esp, [wk(0)]
- pushpic ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
-; push esi ; unused
-; push edi ; unused
-
- get_GOT ebx ; get GOT address
-
- ; ---- Pass 1: process rows.
-
- mov edx, POINTER [data(eax)] ; (FAST_FLOAT *)
- mov ecx, DCTSIZE/2
- alignx 16,7
-.rowloop:
-
- movq mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
- movq mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
- movq mm2, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)]
- movq mm3, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)]
-
- ; mm0=(00 01), mm1=(10 11), mm2=(06 07), mm3=(16 17)
-
- movq mm4,mm0 ; transpose coefficients
- punpckldq mm0,mm1 ; mm0=(00 10)=data0
- punpckhdq mm4,mm1 ; mm4=(01 11)=data1
- movq mm5,mm2 ; transpose coefficients
- punpckldq mm2,mm3 ; mm2=(06 16)=data6
- punpckhdq mm5,mm3 ; mm5=(07 17)=data7
-
- movq mm6,mm4
- movq mm7,mm0
- pfsub mm4,mm2 ; mm4=data1-data6=tmp6
- pfsub mm0,mm5 ; mm0=data0-data7=tmp7
- pfadd mm6,mm2 ; mm6=data1+data6=tmp1
- pfadd mm7,mm5 ; mm7=data0+data7=tmp0
-
- movq mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
- movq mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
- movq mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)]
- movq mm5, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)]
-
- ; mm1=(02 03), mm3=(12 13), mm2=(04 05), mm5=(14 15)
-
- movq MMWORD [wk(0)], mm4 ; wk(0)=tmp6
- movq MMWORD [wk(1)], mm0 ; wk(1)=tmp7
-
- movq mm4,mm1 ; transpose coefficients
- punpckldq mm1,mm3 ; mm1=(02 12)=data2
- punpckhdq mm4,mm3 ; mm4=(03 13)=data3
- movq mm0,mm2 ; transpose coefficients
- punpckldq mm2,mm5 ; mm2=(04 14)=data4
- punpckhdq mm0,mm5 ; mm0=(05 15)=data5
-
- movq mm3,mm4
- movq mm5,mm1
- pfadd mm4,mm2 ; mm4=data3+data4=tmp3
- pfadd mm1,mm0 ; mm1=data2+data5=tmp2
- pfsub mm3,mm2 ; mm3=data3-data4=tmp4
- pfsub mm5,mm0 ; mm5=data2-data5=tmp5
-
- ; -- Even part
-
- movq mm2,mm7
- movq mm0,mm6
- pfsub mm7,mm4 ; mm7=tmp13
- pfsub mm6,mm1 ; mm6=tmp12
- pfadd mm2,mm4 ; mm2=tmp10
- pfadd mm0,mm1 ; mm0=tmp11
-
- pfadd mm6,mm7
- pfmul mm6,[GOTOFF(ebx,PD_0_707)] ; mm6=z1
-
- movq mm4,mm2
- movq mm1,mm7
- pfsub mm2,mm0 ; mm2=data4
- pfsub mm7,mm6 ; mm7=data6
- pfadd mm4,mm0 ; mm4=data0
- pfadd mm1,mm6 ; mm1=data2
-
- movq MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)], mm2
- movq MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)], mm7
- movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4
- movq MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], mm1
-
- ; -- Odd part
-
- movq mm0, MMWORD [wk(0)] ; mm0=tmp6
- movq mm6, MMWORD [wk(1)] ; mm6=tmp7
-
- pfadd mm3,mm5 ; mm3=tmp10
- pfadd mm5,mm0 ; mm5=tmp11
- pfadd mm0,mm6 ; mm0=tmp12, mm6=tmp7
-
- pfmul mm5,[GOTOFF(ebx,PD_0_707)] ; mm5=z3
-
- movq mm2,mm3 ; mm2=tmp10
- pfsub mm3,mm0
- pfmul mm3,[GOTOFF(ebx,PD_0_382)] ; mm3=z5
- pfmul mm2,[GOTOFF(ebx,PD_0_541)] ; mm2=MULTIPLY(tmp10,FIX_0_54119610)
- pfmul mm0,[GOTOFF(ebx,PD_1_306)] ; mm0=MULTIPLY(tmp12,FIX_1_30656296)
- pfadd mm2,mm3 ; mm2=z2
- pfadd mm0,mm3 ; mm0=z4
-
- movq mm7,mm6
- pfsub mm6,mm5 ; mm6=z13
- pfadd mm7,mm5 ; mm7=z11
-
- movq mm4,mm6
- movq mm1,mm7
- pfsub mm6,mm2 ; mm6=data3
- pfsub mm7,mm0 ; mm7=data7
- pfadd mm4,mm2 ; mm4=data5
- pfadd mm1,mm0 ; mm1=data1
-
- movq MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], mm6
- movq MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)], mm7
- movq MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)], mm4
- movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1
-
- add edx, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
- dec ecx
- jnz near .rowloop
-
- ; ---- Pass 2: process columns.
-
- mov edx, POINTER [data(eax)] ; (FAST_FLOAT *)
- mov ecx, DCTSIZE/2
- alignx 16,7
-.columnloop:
-
- movq mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
- movq mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
- movq mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)]
- movq mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)]
-
- ; mm0=(00 10), mm1=(01 11), mm2=(60 70), mm3=(61 71)
-
- movq mm4,mm0 ; transpose coefficients
- punpckldq mm0,mm1 ; mm0=(00 01)=data0
- punpckhdq mm4,mm1 ; mm4=(10 11)=data1
- movq mm5,mm2 ; transpose coefficients
- punpckldq mm2,mm3 ; mm2=(60 61)=data6
- punpckhdq mm5,mm3 ; mm5=(70 71)=data7
-
- movq mm6,mm4
- movq mm7,mm0
- pfsub mm4,mm2 ; mm4=data1-data6=tmp6
- pfsub mm0,mm5 ; mm0=data0-data7=tmp7
- pfadd mm6,mm2 ; mm6=data1+data6=tmp1
- pfadd mm7,mm5 ; mm7=data0+data7=tmp0
-
- movq mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
- movq mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
- movq mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)]
- movq mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)]
-
- ; mm1=(20 30), mm3=(21 31), mm2=(40 50), mm5=(41 51)
-
- movq MMWORD [wk(0)], mm4 ; wk(0)=tmp6
- movq MMWORD [wk(1)], mm0 ; wk(1)=tmp7
-
- movq mm4,mm1 ; transpose coefficients
- punpckldq mm1,mm3 ; mm1=(20 21)=data2
- punpckhdq mm4,mm3 ; mm4=(30 31)=data3
- movq mm0,mm2 ; transpose coefficients
- punpckldq mm2,mm5 ; mm2=(40 41)=data4
- punpckhdq mm0,mm5 ; mm0=(50 51)=data5
-
- movq mm3,mm4
- movq mm5,mm1
- pfadd mm4,mm2 ; mm4=data3+data4=tmp3
- pfadd mm1,mm0 ; mm1=data2+data5=tmp2
- pfsub mm3,mm2 ; mm3=data3-data4=tmp4
- pfsub mm5,mm0 ; mm5=data2-data5=tmp5
-
- ; -- Even part
-
- movq mm2,mm7
- movq mm0,mm6
- pfsub mm7,mm4 ; mm7=tmp13
- pfsub mm6,mm1 ; mm6=tmp12
- pfadd mm2,mm4 ; mm2=tmp10
- pfadd mm0,mm1 ; mm0=tmp11
-
- pfadd mm6,mm7
- pfmul mm6,[GOTOFF(ebx,PD_0_707)] ; mm6=z1
-
- movq mm4,mm2
- movq mm1,mm7
- pfsub mm2,mm0 ; mm2=data4
- pfsub mm7,mm6 ; mm7=data6
- pfadd mm4,mm0 ; mm4=data0
- pfadd mm1,mm6 ; mm1=data2
-
- movq MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], mm2
- movq MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], mm7
- movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4
- movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], mm1
-
- ; -- Odd part
-
- movq mm0, MMWORD [wk(0)] ; mm0=tmp6
- movq mm6, MMWORD [wk(1)] ; mm6=tmp7
-
- pfadd mm3,mm5 ; mm3=tmp10
- pfadd mm5,mm0 ; mm5=tmp11
- pfadd mm0,mm6 ; mm0=tmp12, mm6=tmp7
-
- pfmul mm5,[GOTOFF(ebx,PD_0_707)] ; mm5=z3
-
- movq mm2,mm3 ; mm2=tmp10
- pfsub mm3,mm0
- pfmul mm3,[GOTOFF(ebx,PD_0_382)] ; mm3=z5
- pfmul mm2,[GOTOFF(ebx,PD_0_541)] ; mm2=MULTIPLY(tmp10,FIX_0_54119610)
- pfmul mm0,[GOTOFF(ebx,PD_1_306)] ; mm0=MULTIPLY(tmp12,FIX_1_30656296)
- pfadd mm2,mm3 ; mm2=z2
- pfadd mm0,mm3 ; mm0=z4
-
- movq mm7,mm6
- pfsub mm6,mm5 ; mm6=z13
- pfadd mm7,mm5 ; mm7=z11
-
- movq mm4,mm6
- movq mm1,mm7
- pfsub mm6,mm2 ; mm6=data3
- pfsub mm7,mm0 ; mm7=data7
- pfadd mm4,mm2 ; mm4=data5
- pfadd mm1,mm0 ; mm1=data1
-
- movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], mm6
- movq MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], mm7
- movq MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], mm4
- movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1
-
- add edx, byte 2*SIZEOF_FAST_FLOAT
- dec ecx
- jnz near .columnloop
-
- femms ; empty MMX/3DNow! state
-
-; pop edi ; unused
-; pop esi ; unused
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- poppic ebx
- mov esp,ebp ; esp <- aligned ebp
- pop esp ; esp <- original ebp
- pop ebp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
diff --git a/media/libjpeg/simd/jfdctflt-sse-64.asm b/media/libjpeg/simd/jfdctflt-sse-64.asm
deleted file mode 100644
index 4b64ea4bb5..0000000000
--- a/media/libjpeg/simd/jfdctflt-sse-64.asm
+++ /dev/null
@@ -1,357 +0,0 @@
-;
-; jfdctflt.asm - floating-point FDCT (64-bit SSE)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler) and
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a floating-point implementation of the forward DCT
-; (Discrete Cosine Transform). The following code is based directly on
-; the IJG's original jfdctflt.c; see jfdctflt.c for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
- shufps %1,%2,0x44
-%endmacro
-
-%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
- shufps %1,%2,0xEE
-%endmacro
-
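The two macros above emulate the missing "unpack low/high 64-bit halves" operations for packed floats via shufps immediates. An equivalent intrinsics sketch (the immediates select a0,a1,b0,b1 and a2,a3,b2,b3 respectively):

    #include <xmmintrin.h>   /* SSE */

    static inline __m128 unpcklps2(__m128 a, __m128 b)
    {
      return _mm_shuffle_ps(a, b, 0x44);   /* (a0 a1 b0 b1) */
    }

    static inline __m128 unpckhps2(__m128 a, __m128 b)
    {
      return _mm_shuffle_ps(a, b, 0xEE);   /* (a2 a3 b2 b3) */
    }

Phase 2 of the transpose below uses them to move 64-bit row halves into place after the 32-bit unpcklps/unpckhps interleaves of phase 1.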
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
- alignz 16
- global EXTN(jconst_fdct_float_sse)
-
-EXTN(jconst_fdct_float_sse):
-
-PD_0_382 times 4 dd 0.382683432365089771728460
-PD_0_707 times 4 dd 0.707106781186547524400844
-PD_0_541 times 4 dd 0.541196100146196984399723
-PD_1_306 times 4 dd 1.306562964876376527856643
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 64
-;
-; Perform the forward DCT on one block of samples.
-;
-; GLOBAL(void)
-; jsimd_fdct_float_sse (FAST_FLOAT *data)
-;
-
-; r10 = FAST_FLOAT *data
-
-%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 2
-
- align 16
- global EXTN(jsimd_fdct_float_sse)
-
-EXTN(jsimd_fdct_float_sse):
- push rbp
- mov rax,rsp ; rax = original rbp
- sub rsp, byte 4
- and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [rsp],rax
- mov rbp,rsp ; rbp = aligned rbp
- lea rsp, [wk(0)]
- collect_args
-
- ; ---- Pass 1: process rows.
-
- mov rdx, r10 ; (FAST_FLOAT *)
- mov rcx, DCTSIZE/4
-.rowloop:
-
- movaps xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)]
- movaps xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)]
- movaps xmm2, XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)]
- movaps xmm3, XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)]
-
- ; xmm0=(20 21 22 23), xmm2=(24 25 26 27)
- ; xmm1=(30 31 32 33), xmm3=(34 35 36 37)
-
- movaps xmm4,xmm0 ; transpose coefficients(phase 1)
- unpcklps xmm0,xmm1 ; xmm0=(20 30 21 31)
- unpckhps xmm4,xmm1 ; xmm4=(22 32 23 33)
- movaps xmm5,xmm2 ; transpose coefficients(phase 1)
- unpcklps xmm2,xmm3 ; xmm2=(24 34 25 35)
- unpckhps xmm5,xmm3 ; xmm5=(26 36 27 37)
-
- movaps xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
- movaps xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
- movaps xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)]
- movaps xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)]
-
- ; xmm6=(00 01 02 03), xmm1=(04 05 06 07)
- ; xmm7=(10 11 12 13), xmm3=(14 15 16 17)
-
- movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 32 23 33)
- movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(24 34 25 35)
-
- movaps xmm4,xmm6 ; transpose coefficients(phase 1)
- unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11)
- unpckhps xmm4,xmm7 ; xmm4=(02 12 03 13)
- movaps xmm2,xmm1 ; transpose coefficients(phase 1)
- unpcklps xmm1,xmm3 ; xmm1=(04 14 05 15)
- unpckhps xmm2,xmm3 ; xmm2=(06 16 07 17)
-
- movaps xmm7,xmm6 ; transpose coefficients(phase 2)
- unpcklps2 xmm6,xmm0 ; xmm6=(00 10 20 30)=data0
- unpckhps2 xmm7,xmm0 ; xmm7=(01 11 21 31)=data1
- movaps xmm3,xmm2 ; transpose coefficients(phase 2)
- unpcklps2 xmm2,xmm5 ; xmm2=(06 16 26 36)=data6
- unpckhps2 xmm3,xmm5 ; xmm3=(07 17 27 37)=data7
-
- movaps xmm0,xmm7
- movaps xmm5,xmm6
- subps xmm7,xmm2 ; xmm7=data1-data6=tmp6
- subps xmm6,xmm3 ; xmm6=data0-data7=tmp7
- addps xmm0,xmm2 ; xmm0=data1+data6=tmp1
- addps xmm5,xmm3 ; xmm5=data0+data7=tmp0
-
- movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 32 23 33)
- movaps xmm3, XMMWORD [wk(1)] ; xmm3=(24 34 25 35)
- movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6
- movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
-
- movaps xmm7,xmm4 ; transpose coefficients(phase 2)
- unpcklps2 xmm4,xmm2 ; xmm4=(02 12 22 32)=data2
- unpckhps2 xmm7,xmm2 ; xmm7=(03 13 23 33)=data3
- movaps xmm6,xmm1 ; transpose coefficients(phase 2)
- unpcklps2 xmm1,xmm3 ; xmm1=(04 14 24 34)=data4
- unpckhps2 xmm6,xmm3 ; xmm6=(05 15 25 35)=data5
-
- movaps xmm2,xmm7
- movaps xmm3,xmm4
- addps xmm7,xmm1 ; xmm7=data3+data4=tmp3
- addps xmm4,xmm6 ; xmm4=data2+data5=tmp2
- subps xmm2,xmm1 ; xmm2=data3-data4=tmp4
- subps xmm3,xmm6 ; xmm3=data2-data5=tmp5
-
- ; -- Even part
-
- movaps xmm1,xmm5
- movaps xmm6,xmm0
- subps xmm5,xmm7 ; xmm5=tmp13
- subps xmm0,xmm4 ; xmm0=tmp12
- addps xmm1,xmm7 ; xmm1=tmp10
- addps xmm6,xmm4 ; xmm6=tmp11
-
- addps xmm0,xmm5
- mulps xmm0,[rel PD_0_707] ; xmm0=z1
-
- movaps xmm7,xmm1
- movaps xmm4,xmm5
- subps xmm1,xmm6 ; xmm1=data4
- subps xmm5,xmm0 ; xmm5=data6
- addps xmm7,xmm6 ; xmm7=data0
- addps xmm4,xmm0 ; xmm4=data2
-
- movaps XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)], xmm1
- movaps XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)], xmm5
- movaps XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
- movaps XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
-
- ; -- Odd part
-
- movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6
- movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7
-
- addps xmm2,xmm3 ; xmm2=tmp10
- addps xmm3,xmm6 ; xmm3=tmp11
- addps xmm6,xmm0 ; xmm6=tmp12, xmm0=tmp7
-
- mulps xmm3,[rel PD_0_707] ; xmm3=z3
-
- movaps xmm1,xmm2 ; xmm1=tmp10
- subps xmm2,xmm6
- mulps xmm2,[rel PD_0_382] ; xmm2=z5
- mulps xmm1,[rel PD_0_541] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
- mulps xmm6,[rel PD_1_306] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
- addps xmm1,xmm2 ; xmm1=z2
- addps xmm6,xmm2 ; xmm6=z4
-
- movaps xmm5,xmm0
- subps xmm0,xmm3 ; xmm0=z13
- addps xmm5,xmm3 ; xmm5=z11
-
- movaps xmm7,xmm0
- movaps xmm4,xmm5
- subps xmm0,xmm1 ; xmm0=data3
- subps xmm5,xmm6 ; xmm5=data7
- addps xmm7,xmm1 ; xmm7=data5
- addps xmm4,xmm6 ; xmm4=data1
-
- movaps XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0
- movaps XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)], xmm5
- movaps XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)], xmm7
- movaps XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
-
- add rdx, 4*DCTSIZE*SIZEOF_FAST_FLOAT
- dec rcx
- jnz near .rowloop
-
- ; ---- Pass 2: process columns.
-
- mov rdx, r10 ; (FAST_FLOAT *)
- mov rcx, DCTSIZE/4
-.columnloop:
-
- movaps xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)]
- movaps xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)]
- movaps xmm2, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)]
- movaps xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)]
-
- ; xmm0=(02 12 22 32), xmm2=(42 52 62 72)
- ; xmm1=(03 13 23 33), xmm3=(43 53 63 73)
-
- movaps xmm4,xmm0 ; transpose coefficients(phase 1)
- unpcklps xmm0,xmm1 ; xmm0=(02 03 12 13)
- unpckhps xmm4,xmm1 ; xmm4=(22 23 32 33)
- movaps xmm5,xmm2 ; transpose coefficients(phase 1)
- unpcklps xmm2,xmm3 ; xmm2=(42 43 52 53)
- unpckhps xmm5,xmm3 ; xmm5=(62 63 72 73)
-
- movaps xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
- movaps xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
- movaps xmm1, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)]
- movaps xmm3, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)]
-
- ; xmm6=(00 10 20 30), xmm1=(40 50 60 70)
- ; xmm7=(01 11 21 31), xmm3=(41 51 61 71)
-
- movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 23 32 33)
- movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(42 43 52 53)
-
- movaps xmm4,xmm6 ; transpose coefficients(phase 1)
- unpcklps xmm6,xmm7 ; xmm6=(00 01 10 11)
- unpckhps xmm4,xmm7 ; xmm4=(20 21 30 31)
- movaps xmm2,xmm1 ; transpose coefficients(phase 1)
- unpcklps xmm1,xmm3 ; xmm1=(40 41 50 51)
- unpckhps xmm2,xmm3 ; xmm2=(60 61 70 71)
-
- movaps xmm7,xmm6 ; transpose coefficients(phase 2)
- unpcklps2 xmm6,xmm0 ; xmm6=(00 01 02 03)=data0
- unpckhps2 xmm7,xmm0 ; xmm7=(10 11 12 13)=data1
- movaps xmm3,xmm2 ; transpose coefficients(phase 2)
- unpcklps2 xmm2,xmm5 ; xmm2=(60 61 62 63)=data6
- unpckhps2 xmm3,xmm5 ; xmm3=(70 71 72 73)=data7
-
- movaps xmm0,xmm7
- movaps xmm5,xmm6
- subps xmm7,xmm2 ; xmm7=data1-data6=tmp6
- subps xmm6,xmm3 ; xmm6=data0-data7=tmp7
- addps xmm0,xmm2 ; xmm0=data1+data6=tmp1
- addps xmm5,xmm3 ; xmm5=data0+data7=tmp0
-
- movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 23 32 33)
- movaps xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53)
- movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6
- movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
-
- movaps xmm7,xmm4 ; transpose coefficients(phase 2)
- unpcklps2 xmm4,xmm2 ; xmm4=(20 21 22 23)=data2
- unpckhps2 xmm7,xmm2 ; xmm7=(30 31 32 33)=data3
- movaps xmm6,xmm1 ; transpose coefficients(phase 2)
- unpcklps2 xmm1,xmm3 ; xmm1=(40 41 42 43)=data4
- unpckhps2 xmm6,xmm3 ; xmm6=(50 51 52 53)=data5
-
- movaps xmm2,xmm7
- movaps xmm3,xmm4
- addps xmm7,xmm1 ; xmm7=data3+data4=tmp3
- addps xmm4,xmm6 ; xmm4=data2+data5=tmp2
- subps xmm2,xmm1 ; xmm2=data3-data4=tmp4
- subps xmm3,xmm6 ; xmm3=data2-data5=tmp5
-
- ; -- Even part
-
- movaps xmm1,xmm5
- movaps xmm6,xmm0
- subps xmm5,xmm7 ; xmm5=tmp13
- subps xmm0,xmm4 ; xmm0=tmp12
- addps xmm1,xmm7 ; xmm1=tmp10
- addps xmm6,xmm4 ; xmm6=tmp11
-
- addps xmm0,xmm5
- mulps xmm0,[rel PD_0_707] ; xmm0=z1
-
- movaps xmm7,xmm1
- movaps xmm4,xmm5
- subps xmm1,xmm6 ; xmm1=data4
- subps xmm5,xmm0 ; xmm5=data6
- addps xmm7,xmm6 ; xmm7=data0
- addps xmm4,xmm0 ; xmm4=data2
-
- movaps XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)], xmm1
- movaps XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)], xmm5
- movaps XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
- movaps XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
-
- ; -- Odd part
-
- movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6
- movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7
-
- addps xmm2,xmm3 ; xmm2=tmp10
- addps xmm3,xmm6 ; xmm3=tmp11
- addps xmm6,xmm0 ; xmm6=tmp12, xmm0=tmp7
-
- mulps xmm3,[rel PD_0_707] ; xmm3=z3
-
- movaps xmm1,xmm2 ; xmm1=tmp10
- subps xmm2,xmm6
- mulps xmm2,[rel PD_0_382] ; xmm2=z5
- mulps xmm1,[rel PD_0_541] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
- mulps xmm6,[rel PD_1_306] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
- addps xmm1,xmm2 ; xmm1=z2
- addps xmm6,xmm2 ; xmm6=z4
-
- movaps xmm5,xmm0
- subps xmm0,xmm3 ; xmm0=z13
- addps xmm5,xmm3 ; xmm5=z11
-
- movaps xmm7,xmm0
- movaps xmm4,xmm5
- subps xmm0,xmm1 ; xmm0=data3
- subps xmm5,xmm6 ; xmm5=data7
- addps xmm7,xmm1 ; xmm7=data5
- addps xmm4,xmm6 ; xmm4=data1
-
- movaps XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0
- movaps XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)], xmm5
- movaps XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
- movaps XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
-
- add rdx, byte 4*SIZEOF_FAST_FLOAT
- dec rcx
- jnz near .columnloop
-
- uncollect_args
- mov rsp,rbp ; rsp <- aligned rbp
- pop rsp ; rsp <- original rbp
- pop rbp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
diff --git a/media/libjpeg/simd/jfdctflt-sse.asm b/media/libjpeg/simd/jfdctflt-sse.asm
deleted file mode 100644
index e7ede26c0c..0000000000
--- a/media/libjpeg/simd/jfdctflt-sse.asm
+++ /dev/null
@@ -1,369 +0,0 @@
-;
-; jfdctflt.asm - floating-point FDCT (SSE)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler) and
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a floating-point implementation of the forward DCT
-; (Discrete Cosine Transform). The following code is based directly on
-; the IJG's original jfdctflt.c; see jfdctflt.c for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
- shufps %1,%2,0x44
-%endmacro
-
-%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
- shufps %1,%2,0xEE
-%endmacro
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
- alignz 16
- global EXTN(jconst_fdct_float_sse)
-
-EXTN(jconst_fdct_float_sse):
-
-PD_0_382 times 4 dd 0.382683432365089771728460
-PD_0_707 times 4 dd 0.707106781186547524400844
-PD_0_541 times 4 dd 0.541196100146196984399723
-PD_1_306 times 4 dd 1.306562964876376527856643
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
-;
-; Perform the forward DCT on one block of samples.
-;
-; GLOBAL(void)
-; jsimd_fdct_float_sse (FAST_FLOAT *data)
-;
-
-%define data(b) (b)+8 ; FAST_FLOAT *data
-
-%define original_ebp ebp+0
-%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 2
-
- align 16
- global EXTN(jsimd_fdct_float_sse)
-
-EXTN(jsimd_fdct_float_sse):
- push ebp
- mov eax,esp ; eax = original ebp
- sub esp, byte 4
- and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [esp],eax
- mov ebp,esp ; ebp = aligned ebp
- lea esp, [wk(0)]
- pushpic ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
-; push esi ; unused
-; push edi ; unused
-
- get_GOT ebx ; get GOT address
-
- ; ---- Pass 1: process rows.
-
- mov edx, POINTER [data(eax)] ; (FAST_FLOAT *)
- mov ecx, DCTSIZE/4
- alignx 16,7
-.rowloop:
-
- movaps xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
- movaps xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
- movaps xmm2, XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)]
- movaps xmm3, XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)]
-
- ; xmm0=(20 21 22 23), xmm2=(24 25 26 27)
- ; xmm1=(30 31 32 33), xmm3=(34 35 36 37)
-
- movaps xmm4,xmm0 ; transpose coefficients(phase 1)
- unpcklps xmm0,xmm1 ; xmm0=(20 30 21 31)
- unpckhps xmm4,xmm1 ; xmm4=(22 32 23 33)
- movaps xmm5,xmm2 ; transpose coefficients(phase 1)
- unpcklps xmm2,xmm3 ; xmm2=(24 34 25 35)
- unpckhps xmm5,xmm3 ; xmm5=(26 36 27 37)
-
- movaps xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
- movaps xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
- movaps xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
- movaps xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
-
- ; xmm6=(00 01 02 03), xmm1=(04 05 06 07)
- ; xmm7=(10 11 12 13), xmm3=(14 15 16 17)
-
- movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 32 23 33)
- movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(24 34 25 35)
-
- movaps xmm4,xmm6 ; transpose coefficients(phase 1)
- unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11)
- unpckhps xmm4,xmm7 ; xmm4=(02 12 03 13)
- movaps xmm2,xmm1 ; transpose coefficients(phase 1)
- unpcklps xmm1,xmm3 ; xmm1=(04 14 05 15)
- unpckhps xmm2,xmm3 ; xmm2=(06 16 07 17)
-
- movaps xmm7,xmm6 ; transpose coefficients(phase 2)
- unpcklps2 xmm6,xmm0 ; xmm6=(00 10 20 30)=data0
- unpckhps2 xmm7,xmm0 ; xmm7=(01 11 21 31)=data1
- movaps xmm3,xmm2 ; transpose coefficients(phase 2)
- unpcklps2 xmm2,xmm5 ; xmm2=(06 16 26 36)=data6
- unpckhps2 xmm3,xmm5 ; xmm3=(07 17 27 37)=data7
-
- movaps xmm0,xmm7
- movaps xmm5,xmm6
- subps xmm7,xmm2 ; xmm7=data1-data6=tmp6
- subps xmm6,xmm3 ; xmm6=data0-data7=tmp7
- addps xmm0,xmm2 ; xmm0=data1+data6=tmp1
- addps xmm5,xmm3 ; xmm5=data0+data7=tmp0
-
- movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 32 23 33)
- movaps xmm3, XMMWORD [wk(1)] ; xmm3=(24 34 25 35)
- movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6
- movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
-
- movaps xmm7,xmm4 ; transpose coefficients(phase 2)
- unpcklps2 xmm4,xmm2 ; xmm4=(02 12 22 32)=data2
- unpckhps2 xmm7,xmm2 ; xmm7=(03 13 23 33)=data3
- movaps xmm6,xmm1 ; transpose coefficients(phase 2)
- unpcklps2 xmm1,xmm3 ; xmm1=(04 14 24 34)=data4
- unpckhps2 xmm6,xmm3 ; xmm6=(05 15 25 35)=data5
-
- movaps xmm2,xmm7
- movaps xmm3,xmm4
- addps xmm7,xmm1 ; xmm7=data3+data4=tmp3
- addps xmm4,xmm6 ; xmm4=data2+data5=tmp2
- subps xmm2,xmm1 ; xmm2=data3-data4=tmp4
- subps xmm3,xmm6 ; xmm3=data2-data5=tmp5
-
- ; -- Even part
-
- movaps xmm1,xmm5
- movaps xmm6,xmm0
- subps xmm5,xmm7 ; xmm5=tmp13
- subps xmm0,xmm4 ; xmm0=tmp12
- addps xmm1,xmm7 ; xmm1=tmp10
- addps xmm6,xmm4 ; xmm6=tmp11
-
- addps xmm0,xmm5
- mulps xmm0,[GOTOFF(ebx,PD_0_707)] ; xmm0=z1
-
- movaps xmm7,xmm1
- movaps xmm4,xmm5
- subps xmm1,xmm6 ; xmm1=data4
- subps xmm5,xmm0 ; xmm5=data6
- addps xmm7,xmm6 ; xmm7=data0
- addps xmm4,xmm0 ; xmm4=data2
-
- movaps XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], xmm1
- movaps XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)], xmm5
- movaps XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7
- movaps XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4
-
- ; -- Odd part
-
- movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6
- movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7
-
- addps xmm2,xmm3 ; xmm2=tmp10
- addps xmm3,xmm6 ; xmm3=tmp11
- addps xmm6,xmm0 ; xmm6=tmp12, xmm0=tmp7
-
- mulps xmm3,[GOTOFF(ebx,PD_0_707)] ; xmm3=z3
-
- movaps xmm1,xmm2 ; xmm1=tmp10
- subps xmm2,xmm6
- mulps xmm2,[GOTOFF(ebx,PD_0_382)] ; xmm2=z5
- mulps xmm1,[GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
- mulps xmm6,[GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
- addps xmm1,xmm2 ; xmm1=z2
- addps xmm6,xmm2 ; xmm6=z4
-
- movaps xmm5,xmm0
- subps xmm0,xmm3 ; xmm0=z13
- addps xmm5,xmm3 ; xmm5=z11
-
- movaps xmm7,xmm0
- movaps xmm4,xmm5
- subps xmm0,xmm1 ; xmm0=data3
- subps xmm5,xmm6 ; xmm5=data7
- addps xmm7,xmm1 ; xmm7=data5
- addps xmm4,xmm6 ; xmm4=data1
-
- movaps XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0
- movaps XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)], xmm5
- movaps XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], xmm7
- movaps XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4
-
- add edx, 4*DCTSIZE*SIZEOF_FAST_FLOAT
- dec ecx
- jnz near .rowloop
-
- ; ---- Pass 2: process columns.
-
- mov edx, POINTER [data(eax)] ; (FAST_FLOAT *)
- mov ecx, DCTSIZE/4
- alignx 16,7
-.columnloop:
-
- movaps xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
- movaps xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
- movaps xmm2, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)]
- movaps xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)]
-
- ; xmm0=(02 12 22 32), xmm2=(42 52 62 72)
- ; xmm1=(03 13 23 33), xmm3=(43 53 63 73)
-
- movaps xmm4,xmm0 ; transpose coefficients(phase 1)
- unpcklps xmm0,xmm1 ; xmm0=(02 03 12 13)
- unpckhps xmm4,xmm1 ; xmm4=(22 23 32 33)
- movaps xmm5,xmm2 ; transpose coefficients(phase 1)
- unpcklps xmm2,xmm3 ; xmm2=(42 43 52 53)
- unpckhps xmm5,xmm3 ; xmm5=(62 63 72 73)
-
- movaps xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
- movaps xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
- movaps xmm1, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)]
- movaps xmm3, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)]
-
- ; xmm6=(00 10 20 30), xmm1=(40 50 60 70)
- ; xmm7=(01 11 21 31), xmm3=(41 51 61 71)
-
- movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 23 32 33)
- movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(42 43 52 53)
-
- movaps xmm4,xmm6 ; transpose coefficients(phase 1)
- unpcklps xmm6,xmm7 ; xmm6=(00 01 10 11)
- unpckhps xmm4,xmm7 ; xmm4=(20 21 30 31)
- movaps xmm2,xmm1 ; transpose coefficients(phase 1)
- unpcklps xmm1,xmm3 ; xmm1=(40 41 50 51)
- unpckhps xmm2,xmm3 ; xmm2=(60 61 70 71)
-
- movaps xmm7,xmm6 ; transpose coefficients(phase 2)
- unpcklps2 xmm6,xmm0 ; xmm6=(00 01 02 03)=data0
- unpckhps2 xmm7,xmm0 ; xmm7=(10 11 12 13)=data1
- movaps xmm3,xmm2 ; transpose coefficients(phase 2)
- unpcklps2 xmm2,xmm5 ; xmm2=(60 61 62 63)=data6
- unpckhps2 xmm3,xmm5 ; xmm3=(70 71 72 73)=data7
-
- movaps xmm0,xmm7
- movaps xmm5,xmm6
- subps xmm7,xmm2 ; xmm7=data1-data6=tmp6
- subps xmm6,xmm3 ; xmm6=data0-data7=tmp7
- addps xmm0,xmm2 ; xmm0=data1+data6=tmp1
- addps xmm5,xmm3 ; xmm5=data0+data7=tmp0
-
- movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 23 32 33)
- movaps xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53)
- movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6
- movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
-
- movaps xmm7,xmm4 ; transpose coefficients(phase 2)
- unpcklps2 xmm4,xmm2 ; xmm4=(20 21 22 23)=data2
- unpckhps2 xmm7,xmm2 ; xmm7=(30 31 32 33)=data3
- movaps xmm6,xmm1 ; transpose coefficients(phase 2)
- unpcklps2 xmm1,xmm3 ; xmm1=(40 41 42 43)=data4
- unpckhps2 xmm6,xmm3 ; xmm6=(50 51 52 53)=data5
-
- movaps xmm2,xmm7
- movaps xmm3,xmm4
- addps xmm7,xmm1 ; xmm7=data3+data4=tmp3
- addps xmm4,xmm6 ; xmm4=data2+data5=tmp2
- subps xmm2,xmm1 ; xmm2=data3-data4=tmp4
- subps xmm3,xmm6 ; xmm3=data2-data5=tmp5
-
- ; -- Even part
-
- movaps xmm1,xmm5
- movaps xmm6,xmm0
- subps xmm5,xmm7 ; xmm5=tmp13
- subps xmm0,xmm4 ; xmm0=tmp12
- addps xmm1,xmm7 ; xmm1=tmp10
- addps xmm6,xmm4 ; xmm6=tmp11
-
- addps xmm0,xmm5
- mulps xmm0,[GOTOFF(ebx,PD_0_707)] ; xmm0=z1
-
- movaps xmm7,xmm1
- movaps xmm4,xmm5
- subps xmm1,xmm6 ; xmm1=data4
- subps xmm5,xmm0 ; xmm5=data6
- addps xmm7,xmm6 ; xmm7=data0
- addps xmm4,xmm0 ; xmm4=data2
-
- movaps XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], xmm1
- movaps XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], xmm5
- movaps XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7
- movaps XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4
-
- ; -- Odd part
-
- movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6
- movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7
-
- addps xmm2,xmm3 ; xmm2=tmp10
- addps xmm3,xmm6 ; xmm3=tmp11
- addps xmm6,xmm0 ; xmm6=tmp12, xmm0=tmp7
-
- mulps xmm3,[GOTOFF(ebx,PD_0_707)] ; xmm3=z3
-
- movaps xmm1,xmm2 ; xmm1=tmp10
- subps xmm2,xmm6
- mulps xmm2,[GOTOFF(ebx,PD_0_382)] ; xmm2=z5
- mulps xmm1,[GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
- mulps xmm6,[GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
- addps xmm1,xmm2 ; xmm1=z2
- addps xmm6,xmm2 ; xmm6=z4
-
- movaps xmm5,xmm0
- subps xmm0,xmm3 ; xmm0=z13
- addps xmm5,xmm3 ; xmm5=z11
-
- movaps xmm7,xmm0
- movaps xmm4,xmm5
- subps xmm0,xmm1 ; xmm0=data3
- subps xmm5,xmm6 ; xmm5=data7
- addps xmm7,xmm1 ; xmm7=data5
- addps xmm4,xmm6 ; xmm4=data1
-
- movaps XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0
- movaps XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], xmm5
- movaps XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], xmm7
- movaps XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4
-
- add edx, byte 4*SIZEOF_FAST_FLOAT
- dec ecx
- jnz near .columnloop
-
-; pop edi ; unused
-; pop esi ; unused
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- poppic ebx
- mov esp,ebp ; esp <- aligned ebp
- pop esp ; esp <- original ebp
- pop ebp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
diff --git a/media/libjpeg/simd/jfdctfst-mmx.asm b/media/libjpeg/simd/jfdctfst-mmx.asm
deleted file mode 100644
index eb2eb9c50d..0000000000
--- a/media/libjpeg/simd/jfdctfst-mmx.asm
+++ /dev/null
@@ -1,396 +0,0 @@
-;
-; jfdctfst.asm - fast integer FDCT (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler) and
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a fast, not so accurate integer implementation of
-; the forward DCT (Discrete Cosine Transform). The following code is
-; based directly on the IJG's original jfdctfst.c; see jfdctfst.c
-; for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS 8 ; 14 is also OK.
-
-%if CONST_BITS == 8
-F_0_382 equ 98 ; FIX(0.382683433)
-F_0_541 equ 139 ; FIX(0.541196100)
-F_0_707 equ 181 ; FIX(0.707106781)
-F_1_306 equ 334 ; FIX(1.306562965)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
-F_0_382 equ DESCALE( 410903207,30-CONST_BITS) ; FIX(0.382683433)
-F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100)
-F_0_707 equ DESCALE( 759250124,30-CONST_BITS) ; FIX(0.707106781)
-F_1_306 equ DESCALE(1402911301,30-CONST_BITS) ; FIX(1.306562965)
-%endif
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
-; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
-; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
-
-%define PRE_MULTIPLY_SCALE_BITS 2
-%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
-
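In other words, operands are pre-shifted left by 2, constants are stored shifted left by CONST_SHIFT = 6, and pmulhw keeps the high 16 bits of the 32-bit product, discarding exactly 2 + 6 + 8 = 16 fraction bits. A scalar sketch of one such multiply (illustrative helper; same overflow constraint as the assembly, i.e. |x| must stay below 2^13):

    #include <stdint.h>

    /* mul_fix(x, 181) ~= x * 0.707106781; e.g. mul_fix(1000, 181) == 707. */
    static int16_t mul_fix(int16_t x, int16_t f /* FIX(c), e.g. F_0_707 = 181 */)
    {
      int16_t a = (int16_t)(x << 2);             /* psllw PRE_MULTIPLY_SCALE_BITS */
      int16_t b = (int16_t)(f << 6);             /* constant << CONST_SHIFT */
      return (int16_t)(((int32_t)a * b) >> 16);  /* pmulhw: high word of product */
    }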
- alignz 16
- global EXTN(jconst_fdct_ifast_mmx)
-
-EXTN(jconst_fdct_ifast_mmx):
-
-PW_F0707 times 4 dw F_0_707 << CONST_SHIFT
-PW_F0382 times 4 dw F_0_382 << CONST_SHIFT
-PW_F0541 times 4 dw F_0_541 << CONST_SHIFT
-PW_F1306 times 4 dw F_1_306 << CONST_SHIFT
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
-;
-; Perform the forward DCT on one block of samples.
-;
-; GLOBAL(void)
-; jsimd_fdct_ifast_mmx (DCTELEM *data)
-;
-
-%define data(b) (b)+8 ; DCTELEM *data
-
-%define original_ebp ebp+0
-%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM]
-%define WK_NUM 2
-
- align 16
- global EXTN(jsimd_fdct_ifast_mmx)
-
-EXTN(jsimd_fdct_ifast_mmx):
- push ebp
- mov eax,esp ; eax = original ebp
- sub esp, byte 4
- and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
- mov [esp],eax
- mov ebp,esp ; ebp = aligned ebp
- lea esp, [wk(0)]
- pushpic ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
-; push esi ; unused
-; push edi ; unused
-
- get_GOT ebx ; get GOT address
-
- ; ---- Pass 1: process rows.
-
- mov edx, POINTER [data(eax)] ; (DCTELEM *)
- mov ecx, DCTSIZE/4
- alignx 16,7
-.rowloop:
-
- movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
- movq mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
- movq mm2, MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)]
- movq mm3, MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)]
-
- ; mm0=(20 21 22 23), mm2=(24 25 26 27)
- ; mm1=(30 31 32 33), mm3=(34 35 36 37)
-
- movq mm4,mm0 ; transpose coefficients(phase 1)
- punpcklwd mm0,mm1 ; mm0=(20 30 21 31)
- punpckhwd mm4,mm1 ; mm4=(22 32 23 33)
- movq mm5,mm2 ; transpose coefficients(phase 1)
- punpcklwd mm2,mm3 ; mm2=(24 34 25 35)
- punpckhwd mm5,mm3 ; mm5=(26 36 27 37)
-
- movq mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
- movq mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
- movq mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)]
- movq mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)]
-
- ; mm6=(00 01 02 03), mm1=(04 05 06 07)
- ; mm7=(10 11 12 13), mm3=(14 15 16 17)
-
- movq MMWORD [wk(0)], mm4 ; wk(0)=(22 32 23 33)
- movq MMWORD [wk(1)], mm2 ; wk(1)=(24 34 25 35)
-
- movq mm4,mm6 ; transpose coefficients(phase 1)
- punpcklwd mm6,mm7 ; mm6=(00 10 01 11)
- punpckhwd mm4,mm7 ; mm4=(02 12 03 13)
- movq mm2,mm1 ; transpose coefficients(phase 1)
- punpcklwd mm1,mm3 ; mm1=(04 14 05 15)
- punpckhwd mm2,mm3 ; mm2=(06 16 07 17)
-
- movq mm7,mm6 ; transpose coefficients(phase 2)
- punpckldq mm6,mm0 ; mm6=(00 10 20 30)=data0
- punpckhdq mm7,mm0 ; mm7=(01 11 21 31)=data1
- movq mm3,mm2 ; transpose coefficients(phase 2)
- punpckldq mm2,mm5 ; mm2=(06 16 26 36)=data6
- punpckhdq mm3,mm5 ; mm3=(07 17 27 37)=data7
-
- movq mm0,mm7
- movq mm5,mm6
- psubw mm7,mm2 ; mm7=data1-data6=tmp6
- psubw mm6,mm3 ; mm6=data0-data7=tmp7
- paddw mm0,mm2 ; mm0=data1+data6=tmp1
- paddw mm5,mm3 ; mm5=data0+data7=tmp0
-
- movq mm2, MMWORD [wk(0)] ; mm2=(22 32 23 33)
- movq mm3, MMWORD [wk(1)] ; mm3=(24 34 25 35)
- movq MMWORD [wk(0)], mm7 ; wk(0)=tmp6
- movq MMWORD [wk(1)], mm6 ; wk(1)=tmp7
-
- movq mm7,mm4 ; transpose coefficients(phase 2)
- punpckldq mm4,mm2 ; mm4=(02 12 22 32)=data2
- punpckhdq mm7,mm2 ; mm7=(03 13 23 33)=data3
- movq mm6,mm1 ; transpose coefficients(phase 2)
- punpckldq mm1,mm3 ; mm1=(04 14 24 34)=data4
- punpckhdq mm6,mm3 ; mm6=(05 15 25 35)=data5
-
- movq mm2,mm7
- movq mm3,mm4
- paddw mm7,mm1 ; mm7=data3+data4=tmp3
- paddw mm4,mm6 ; mm4=data2+data5=tmp2
- psubw mm2,mm1 ; mm2=data3-data4=tmp4
- psubw mm3,mm6 ; mm3=data2-data5=tmp5
-
- ; -- Even part
-
- movq mm1,mm5
- movq mm6,mm0
- psubw mm5,mm7 ; mm5=tmp13
- psubw mm0,mm4 ; mm0=tmp12
- paddw mm1,mm7 ; mm1=tmp10
- paddw mm6,mm4 ; mm6=tmp11
-
- paddw mm0,mm5
- psllw mm0,PRE_MULTIPLY_SCALE_BITS
- pmulhw mm0,[GOTOFF(ebx,PW_F0707)] ; mm0=z1
-
- movq mm7,mm1
- movq mm4,mm5
- psubw mm1,mm6 ; mm1=data4
- psubw mm5,mm0 ; mm5=data6
- paddw mm7,mm6 ; mm7=data0
- paddw mm4,mm0 ; mm4=data2
-
- movq MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)], mm1
- movq MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)], mm5
- movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm7
- movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
-
- ; -- Odd part
-
- movq mm6, MMWORD [wk(0)] ; mm6=tmp6
- movq mm0, MMWORD [wk(1)] ; mm0=tmp7
-
- paddw mm2,mm3 ; mm2=tmp10
- paddw mm3,mm6 ; mm3=tmp11
- paddw mm6,mm0 ; mm6=tmp12, mm0=tmp7
-
- psllw mm2,PRE_MULTIPLY_SCALE_BITS
- psllw mm6,PRE_MULTIPLY_SCALE_BITS
-
- psllw mm3,PRE_MULTIPLY_SCALE_BITS
- pmulhw mm3,[GOTOFF(ebx,PW_F0707)] ; mm3=z3
-
- movq mm1,mm2 ; mm1=tmp10
- psubw mm2,mm6
- pmulhw mm2,[GOTOFF(ebx,PW_F0382)] ; mm2=z5
- pmulhw mm1,[GOTOFF(ebx,PW_F0541)] ; mm1=MULTIPLY(tmp10,FIX_0_54119610)
- pmulhw mm6,[GOTOFF(ebx,PW_F1306)] ; mm6=MULTIPLY(tmp12,FIX_1_30656296)
- paddw mm1,mm2 ; mm1=z2
- paddw mm6,mm2 ; mm6=z4
-
- movq mm5,mm0
- psubw mm0,mm3 ; mm0=z13
- paddw mm5,mm3 ; mm5=z11
-
- movq mm7,mm0
- movq mm4,mm5
- psubw mm0,mm1 ; mm0=data3
- psubw mm5,mm6 ; mm5=data7
- paddw mm7,mm1 ; mm7=data5
- paddw mm4,mm6 ; mm4=data1
-
- movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm0
- movq MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)], mm5
- movq MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)], mm7
- movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm4
-
- add edx, byte 4*DCTSIZE*SIZEOF_DCTELEM
- dec ecx
- jnz near .rowloop
-
- ; ---- Pass 2: process columns.
-
- mov edx, POINTER [data(eax)] ; (DCTELEM *)
- mov ecx, DCTSIZE/4
- alignx 16,7
-.columnloop:
-
- movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
- movq mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
- movq mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
- movq mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
-
- ; mm0=(02 12 22 32), mm2=(42 52 62 72)
- ; mm1=(03 13 23 33), mm3=(43 53 63 73)
-
- movq mm4,mm0 ; transpose coefficients(phase 1)
- punpcklwd mm0,mm1 ; mm0=(02 03 12 13)
- punpckhwd mm4,mm1 ; mm4=(22 23 32 33)
- movq mm5,mm2 ; transpose coefficients(phase 1)
- punpcklwd mm2,mm3 ; mm2=(42 43 52 53)
- punpckhwd mm5,mm3 ; mm5=(62 63 72 73)
-
- movq mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
- movq mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
- movq mm1, MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
- movq mm3, MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
-
- ; mm6=(00 10 20 30), mm1=(40 50 60 70)
- ; mm7=(01 11 21 31), mm3=(41 51 61 71)
-
- movq MMWORD [wk(0)], mm4 ; wk(0)=(22 23 32 33)
- movq MMWORD [wk(1)], mm2 ; wk(1)=(42 43 52 53)
-
- movq mm4,mm6 ; transpose coefficients(phase 1)
- punpcklwd mm6,mm7 ; mm6=(00 01 10 11)
- punpckhwd mm4,mm7 ; mm4=(20 21 30 31)
- movq mm2,mm1 ; transpose coefficients(phase 1)
- punpcklwd mm1,mm3 ; mm1=(40 41 50 51)
- punpckhwd mm2,mm3 ; mm2=(60 61 70 71)
-
- movq mm7,mm6 ; transpose coefficients(phase 2)
- punpckldq mm6,mm0 ; mm6=(00 01 02 03)=data0
- punpckhdq mm7,mm0 ; mm7=(10 11 12 13)=data1
- movq mm3,mm2 ; transpose coefficients(phase 2)
- punpckldq mm2,mm5 ; mm2=(60 61 62 63)=data6
- punpckhdq mm3,mm5 ; mm3=(70 71 72 73)=data7
-
- movq mm0,mm7
- movq mm5,mm6
- psubw mm7,mm2 ; mm7=data1-data6=tmp6
- psubw mm6,mm3 ; mm6=data0-data7=tmp7
- paddw mm0,mm2 ; mm0=data1+data6=tmp1
- paddw mm5,mm3 ; mm5=data0+data7=tmp0
-
- movq mm2, MMWORD [wk(0)] ; mm2=(22 23 32 33)
- movq mm3, MMWORD [wk(1)] ; mm3=(42 43 52 53)
- movq MMWORD [wk(0)], mm7 ; wk(0)=tmp6
- movq MMWORD [wk(1)], mm6 ; wk(1)=tmp7
-
- movq mm7,mm4 ; transpose coefficients(phase 2)
- punpckldq mm4,mm2 ; mm4=(20 21 22 23)=data2
- punpckhdq mm7,mm2 ; mm7=(30 31 32 33)=data3
- movq mm6,mm1 ; transpose coefficients(phase 2)
- punpckldq mm1,mm3 ; mm1=(40 41 42 43)=data4
- punpckhdq mm6,mm3 ; mm6=(50 51 52 53)=data5
-
- movq mm2,mm7
- movq mm3,mm4
- paddw mm7,mm1 ; mm7=data3+data4=tmp3
- paddw mm4,mm6 ; mm4=data2+data5=tmp2
- psubw mm2,mm1 ; mm2=data3-data4=tmp4
- psubw mm3,mm6 ; mm3=data2-data5=tmp5
-
- ; -- Even part
-
- movq mm1,mm5
- movq mm6,mm0
- psubw mm5,mm7 ; mm5=tmp13
- psubw mm0,mm4 ; mm0=tmp12
- paddw mm1,mm7 ; mm1=tmp10
- paddw mm6,mm4 ; mm6=tmp11
-
- paddw mm0,mm5
- psllw mm0,PRE_MULTIPLY_SCALE_BITS
- pmulhw mm0,[GOTOFF(ebx,PW_F0707)] ; mm0=z1
-
- movq mm7,mm1
- movq mm4,mm5
- psubw mm1,mm6 ; mm1=data4
- psubw mm5,mm0 ; mm5=data6
- paddw mm7,mm6 ; mm7=data0
- paddw mm4,mm0 ; mm4=data2
-
- movq MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)], mm1
- movq MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)], mm5
- movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm7
- movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
-
- ; -- Odd part
-
- movq mm6, MMWORD [wk(0)] ; mm6=tmp6
- movq mm0, MMWORD [wk(1)] ; mm0=tmp7
-
- paddw mm2,mm3 ; mm2=tmp10
- paddw mm3,mm6 ; mm3=tmp11
- paddw mm6,mm0 ; mm6=tmp12, mm0=tmp7
-
- psllw mm2,PRE_MULTIPLY_SCALE_BITS
- psllw mm6,PRE_MULTIPLY_SCALE_BITS
-
- psllw mm3,PRE_MULTIPLY_SCALE_BITS
- pmulhw mm3,[GOTOFF(ebx,PW_F0707)] ; mm3=z3
-
- movq mm1,mm2 ; mm1=tmp10
- psubw mm2,mm6
- pmulhw mm2,[GOTOFF(ebx,PW_F0382)] ; mm2=z5
- pmulhw mm1,[GOTOFF(ebx,PW_F0541)] ; mm1=MULTIPLY(tmp10,FIX_0_54119610)
- pmulhw mm6,[GOTOFF(ebx,PW_F1306)] ; mm6=MULTIPLY(tmp12,FIX_1_30656296)
- paddw mm1,mm2 ; mm1=z2
- paddw mm6,mm2 ; mm6=z4
-
- movq mm5,mm0
- psubw mm0,mm3 ; mm0=z13
- paddw mm5,mm3 ; mm5=z11
-
- movq mm7,mm0
- movq mm4,mm5
- psubw mm0,mm1 ; mm0=data3
- psubw mm5,mm6 ; mm5=data7
- paddw mm7,mm1 ; mm7=data5
- paddw mm4,mm6 ; mm4=data1
-
- movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm0
- movq MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)], mm5
- movq MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)], mm7
- movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm4
-
- add edx, byte 4*SIZEOF_DCTELEM
- dec ecx
- jnz near .columnloop
-
- emms ; empty MMX state
-
-; pop edi ; unused
-; pop esi ; unused
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- poppic ebx
- mov esp,ebp ; esp <- aligned ebp
- pop esp ; esp <- original ebp
- pop ebp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
diff --git a/media/libjpeg/simd/jfdctfst-sse2-64.asm b/media/libjpeg/simd/jfdctfst-sse2-64.asm
deleted file mode 100644
index 4c96685427..0000000000
--- a/media/libjpeg/simd/jfdctfst-sse2-64.asm
+++ /dev/null
@@ -1,391 +0,0 @@
-;
-; jfdctfst.asm - fast integer FDCT (64-bit SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler) and
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a fast, not so accurate integer implementation of
-; the forward DCT (Discrete Cosine Transform). The following code is
-; based directly on the IJG's original jfdctfst.c; see jfdctfst.c
-; for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS 8 ; 14 is also OK.
-
-%if CONST_BITS == 8
-F_0_382 equ 98 ; FIX(0.382683433)
-F_0_541 equ 139 ; FIX(0.541196100)
-F_0_707 equ 181 ; FIX(0.707106781)
-F_1_306 equ 334 ; FIX(1.306562965)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
-F_0_382 equ DESCALE( 410903207,30-CONST_BITS) ; FIX(0.382683433)
-F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100)
-F_0_707 equ DESCALE( 759250124,30-CONST_BITS) ; FIX(0.707106781)
-F_1_306 equ DESCALE(1402911301,30-CONST_BITS) ; FIX(1.306562965)
-%endif
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
-; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
-; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
-
-%define PRE_MULTIPLY_SCALE_BITS 2
-%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
-
- alignz 16
- global EXTN(jconst_fdct_ifast_sse2)
-
-EXTN(jconst_fdct_ifast_sse2):
-
-PW_F0707 times 8 dw F_0_707 << CONST_SHIFT
-PW_F0382 times 8 dw F_0_382 << CONST_SHIFT
-PW_F0541 times 8 dw F_0_541 << CONST_SHIFT
-PW_F1306 times 8 dw F_1_306 << CONST_SHIFT
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 64
-;
-; Perform the forward DCT on one block of samples.
-;
-; GLOBAL(void)
-; jsimd_fdct_ifast_sse2 (DCTELEM *data)
-;
-
-; r10 = DCTELEM *data
-
-%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 2
-
- align 16
- global EXTN(jsimd_fdct_ifast_sse2)
-
-EXTN(jsimd_fdct_ifast_sse2):
- push rbp
- mov rax,rsp ; rax = original rbp
- sub rsp, byte 4
- and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [rsp],rax
- mov rbp,rsp ; rbp = aligned rbp
- lea rsp, [wk(0)]
- collect_args
-
- ; ---- Pass 1: process rows.
-
- mov rdx, r10 ; (DCTELEM *)
-
- movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)]
- movdqa xmm1, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)]
- movdqa xmm2, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)]
- movdqa xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)]
-
- ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
- ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
-
- movdqa xmm4,xmm0 ; transpose coefficients(phase 1)
- punpcklwd xmm0,xmm1 ; xmm0=(00 10 01 11 02 12 03 13)
- punpckhwd xmm4,xmm1 ; xmm4=(04 14 05 15 06 16 07 17)
- movdqa xmm5,xmm2 ; transpose coefficients(phase 1)
- punpcklwd xmm2,xmm3 ; xmm2=(20 30 21 31 22 32 23 33)
- punpckhwd xmm5,xmm3 ; xmm5=(24 34 25 35 26 36 27 37)
-
- movdqa xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)]
- movdqa xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)]
- movdqa xmm1, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)]
- movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)]
-
- ; xmm6=(40 41 42 43 44 45 46 47), xmm1=(60 61 62 63 64 65 66 67)
- ; xmm7=(50 51 52 53 54 55 56 57), xmm3=(70 71 72 73 74 75 76 77)
-
- movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33)
- movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37)
-
- movdqa xmm2,xmm6 ; transpose coefficients(phase 1)
- punpcklwd xmm6,xmm7 ; xmm6=(40 50 41 51 42 52 43 53)
- punpckhwd xmm2,xmm7 ; xmm2=(44 54 45 55 46 56 47 57)
- movdqa xmm5,xmm1 ; transpose coefficients(phase 1)
- punpcklwd xmm1,xmm3 ; xmm1=(60 70 61 71 62 72 63 73)
- punpckhwd xmm5,xmm3 ; xmm5=(64 74 65 75 66 76 67 77)
-
- movdqa xmm7,xmm6 ; transpose coefficients(phase 2)
- punpckldq xmm6,xmm1 ; xmm6=(40 50 60 70 41 51 61 71)
- punpckhdq xmm7,xmm1 ; xmm7=(42 52 62 72 43 53 63 73)
- movdqa xmm3,xmm2 ; transpose coefficients(phase 2)
- punpckldq xmm2,xmm5 ; xmm2=(44 54 64 74 45 55 65 75)
- punpckhdq xmm3,xmm5 ; xmm3=(46 56 66 76 47 57 67 77)
-
- movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33)
- movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37)
- movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(42 52 62 72 43 53 63 73)
- movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=(44 54 64 74 45 55 65 75)
-
- movdqa xmm7,xmm0 ; transpose coefficients(phase 2)
- punpckldq xmm0,xmm1 ; xmm0=(00 10 20 30 01 11 21 31)
- punpckhdq xmm7,xmm1 ; xmm7=(02 12 22 32 03 13 23 33)
- movdqa xmm2,xmm4 ; transpose coefficients(phase 2)
- punpckldq xmm4,xmm5 ; xmm4=(04 14 24 34 05 15 25 35)
- punpckhdq xmm2,xmm5 ; xmm2=(06 16 26 36 07 17 27 37)
-
- movdqa xmm1,xmm0 ; transpose coefficients(phase 3)
- punpcklqdq xmm0,xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0
- punpckhqdq xmm1,xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1
- movdqa xmm5,xmm2 ; transpose coefficients(phase 3)
- punpcklqdq xmm2,xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6
- punpckhqdq xmm5,xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7
-
- movdqa xmm6,xmm1
- movdqa xmm3,xmm0
- psubw xmm1,xmm2 ; xmm1=data1-data6=tmp6
- psubw xmm0,xmm5 ; xmm0=data0-data7=tmp7
- paddw xmm6,xmm2 ; xmm6=data1+data6=tmp1
- paddw xmm3,xmm5 ; xmm3=data0+data7=tmp0
-
- movdqa xmm2, XMMWORD [wk(0)] ; xmm2=(42 52 62 72 43 53 63 73)
- movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(44 54 64 74 45 55 65 75)
- movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6
- movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7
-
- movdqa xmm1,xmm7 ; transpose coefficients(phase 3)
- punpcklqdq xmm7,xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2
- punpckhqdq xmm1,xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3
- movdqa xmm0,xmm4 ; transpose coefficients(phase 3)
- punpcklqdq xmm4,xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4
- punpckhqdq xmm0,xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5
-
- movdqa xmm2,xmm1
- movdqa xmm5,xmm7
- paddw xmm1,xmm4 ; xmm1=data3+data4=tmp3
- paddw xmm7,xmm0 ; xmm7=data2+data5=tmp2
- psubw xmm2,xmm4 ; xmm2=data3-data4=tmp4
- psubw xmm5,xmm0 ; xmm5=data2-data5=tmp5
-
- ; -- Even part
-
- movdqa xmm4,xmm3
- movdqa xmm0,xmm6
- psubw xmm3,xmm1 ; xmm3=tmp13
- psubw xmm6,xmm7 ; xmm6=tmp12
- paddw xmm4,xmm1 ; xmm4=tmp10
- paddw xmm0,xmm7 ; xmm0=tmp11
-
- paddw xmm6,xmm3
- psllw xmm6,PRE_MULTIPLY_SCALE_BITS
- pmulhw xmm6,[rel PW_F0707] ; xmm6=z1
-
- movdqa xmm1,xmm4
- movdqa xmm7,xmm3
- psubw xmm4,xmm0 ; xmm4=data4
- psubw xmm3,xmm6 ; xmm3=data6
- paddw xmm1,xmm0 ; xmm1=data0
- paddw xmm7,xmm6 ; xmm7=data2
-
- movdqa xmm0, XMMWORD [wk(0)] ; xmm0=tmp6
- movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp7
- movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=data4
- movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=data6
-
- ; -- Odd part
-
- paddw xmm2,xmm5 ; xmm2=tmp10
- paddw xmm5,xmm0 ; xmm5=tmp11
- paddw xmm0,xmm6 ; xmm0=tmp12, xmm6=tmp7
-
- psllw xmm2,PRE_MULTIPLY_SCALE_BITS
- psllw xmm0,PRE_MULTIPLY_SCALE_BITS
-
- psllw xmm5,PRE_MULTIPLY_SCALE_BITS
- pmulhw xmm5,[rel PW_F0707] ; xmm5=z3
-
- movdqa xmm4,xmm2 ; xmm4=tmp10
- psubw xmm2,xmm0
- pmulhw xmm2,[rel PW_F0382] ; xmm2=z5
- pmulhw xmm4,[rel PW_F0541] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
- pmulhw xmm0,[rel PW_F1306] ; xmm0=MULTIPLY(tmp12,FIX_1_306562)
- paddw xmm4,xmm2 ; xmm4=z2
- paddw xmm0,xmm2 ; xmm0=z4
-
- movdqa xmm3,xmm6
- psubw xmm6,xmm5 ; xmm6=z13
- paddw xmm3,xmm5 ; xmm3=z11
-
- movdqa xmm2,xmm6
- movdqa xmm5,xmm3
- psubw xmm6,xmm4 ; xmm6=data3
- psubw xmm3,xmm0 ; xmm3=data7
- paddw xmm2,xmm4 ; xmm2=data5
- paddw xmm5,xmm0 ; xmm5=data1
-
- ; ---- Pass 2: process columns.
-
- ; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72)
- ; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73)
-
- movdqa xmm4,xmm1 ; transpose coefficients(phase 1)
- punpcklwd xmm1,xmm5 ; xmm1=(00 01 10 11 20 21 30 31)
- punpckhwd xmm4,xmm5 ; xmm4=(40 41 50 51 60 61 70 71)
- movdqa xmm0,xmm7 ; transpose coefficients(phase 1)
- punpcklwd xmm7,xmm6 ; xmm7=(02 03 12 13 22 23 32 33)
- punpckhwd xmm0,xmm6 ; xmm0=(42 43 52 53 62 63 72 73)
-
- movdqa xmm5, XMMWORD [wk(0)] ; xmm5=col4
- movdqa xmm6, XMMWORD [wk(1)] ; xmm6=col6
-
- ; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76)
- ; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77)
-
- movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(02 03 12 13 22 23 32 33)
- movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(42 43 52 53 62 63 72 73)
-
- movdqa xmm7,xmm5 ; transpose coefficients(phase 1)
- punpcklwd xmm5,xmm2 ; xmm5=(04 05 14 15 24 25 34 35)
- punpckhwd xmm7,xmm2 ; xmm7=(44 45 54 55 64 65 74 75)
- movdqa xmm0,xmm6 ; transpose coefficients(phase 1)
- punpcklwd xmm6,xmm3 ; xmm6=(06 07 16 17 26 27 36 37)
- punpckhwd xmm0,xmm3 ; xmm0=(46 47 56 57 66 67 76 77)
-
- movdqa xmm2,xmm5 ; transpose coefficients(phase 2)
- punpckldq xmm5,xmm6 ; xmm5=(04 05 06 07 14 15 16 17)
- punpckhdq xmm2,xmm6 ; xmm2=(24 25 26 27 34 35 36 37)
- movdqa xmm3,xmm7 ; transpose coefficients(phase 2)
- punpckldq xmm7,xmm0 ; xmm7=(44 45 46 47 54 55 56 57)
- punpckhdq xmm3,xmm0 ; xmm3=(64 65 66 67 74 75 76 77)
-
- movdqa xmm6, XMMWORD [wk(0)] ; xmm6=(02 03 12 13 22 23 32 33)
- movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(42 43 52 53 62 63 72 73)
- movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(24 25 26 27 34 35 36 37)
- movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(44 45 46 47 54 55 56 57)
-
- movdqa xmm2,xmm1 ; transpose coefficients(phase 2)
- punpckldq xmm1,xmm6 ; xmm1=(00 01 02 03 10 11 12 13)
- punpckhdq xmm2,xmm6 ; xmm2=(20 21 22 23 30 31 32 33)
- movdqa xmm7,xmm4 ; transpose coefficients(phase 2)
- punpckldq xmm4,xmm0 ; xmm4=(40 41 42 43 50 51 52 53)
- punpckhdq xmm7,xmm0 ; xmm7=(60 61 62 63 70 71 72 73)
-
- movdqa xmm6,xmm1 ; transpose coefficients(phase 3)
- punpcklqdq xmm1,xmm5 ; xmm1=(00 01 02 03 04 05 06 07)=data0
- punpckhqdq xmm6,xmm5 ; xmm6=(10 11 12 13 14 15 16 17)=data1
- movdqa xmm0,xmm7 ; transpose coefficients(phase 3)
- punpcklqdq xmm7,xmm3 ; xmm7=(60 61 62 63 64 65 66 67)=data6
- punpckhqdq xmm0,xmm3 ; xmm0=(70 71 72 73 74 75 76 77)=data7
-
- movdqa xmm5,xmm6
- movdqa xmm3,xmm1
- psubw xmm6,xmm7 ; xmm6=data1-data6=tmp6
- psubw xmm1,xmm0 ; xmm1=data0-data7=tmp7
- paddw xmm5,xmm7 ; xmm5=data1+data6=tmp1
- paddw xmm3,xmm0 ; xmm3=data0+data7=tmp0
-
- movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(24 25 26 27 34 35 36 37)
- movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(44 45 46 47 54 55 56 57)
- movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=tmp6
- movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=tmp7
-
- movdqa xmm6,xmm2 ; transpose coefficients(phase 3)
- punpcklqdq xmm2,xmm7 ; xmm2=(20 21 22 23 24 25 26 27)=data2
- punpckhqdq xmm6,xmm7 ; xmm6=(30 31 32 33 34 35 36 37)=data3
- movdqa xmm1,xmm4 ; transpose coefficients(phase 3)
- punpcklqdq xmm4,xmm0 ; xmm4=(40 41 42 43 44 45 46 47)=data4
- punpckhqdq xmm1,xmm0 ; xmm1=(50 51 52 53 54 55 56 57)=data5
-
- movdqa xmm7,xmm6
- movdqa xmm0,xmm2
- paddw xmm6,xmm4 ; xmm6=data3+data4=tmp3
- paddw xmm2,xmm1 ; xmm2=data2+data5=tmp2
- psubw xmm7,xmm4 ; xmm7=data3-data4=tmp4
- psubw xmm0,xmm1 ; xmm0=data2-data5=tmp5
-
- ; -- Even part
-
- movdqa xmm4,xmm3
- movdqa xmm1,xmm5
- psubw xmm3,xmm6 ; xmm3=tmp13
- psubw xmm5,xmm2 ; xmm5=tmp12
- paddw xmm4,xmm6 ; xmm4=tmp10
- paddw xmm1,xmm2 ; xmm1=tmp11
-
- paddw xmm5,xmm3
- psllw xmm5,PRE_MULTIPLY_SCALE_BITS
- pmulhw xmm5,[rel PW_F0707] ; xmm5=z1
-
- movdqa xmm6,xmm4
- movdqa xmm2,xmm3
- psubw xmm4,xmm1 ; xmm4=data4
- psubw xmm3,xmm5 ; xmm3=data6
- paddw xmm6,xmm1 ; xmm6=data0
- paddw xmm2,xmm5 ; xmm2=data2
-
- movdqa XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm4
- movdqa XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm3
- movdqa XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)], xmm6
- movdqa XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)], xmm2
-
- ; -- Odd part
-
- movdqa xmm1, XMMWORD [wk(0)] ; xmm1=tmp6
- movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7
-
- paddw xmm7,xmm0 ; xmm7=tmp10
- paddw xmm0,xmm1 ; xmm0=tmp11
- paddw xmm1,xmm5 ; xmm1=tmp12, xmm5=tmp7
-
- psllw xmm7,PRE_MULTIPLY_SCALE_BITS
- psllw xmm1,PRE_MULTIPLY_SCALE_BITS
-
- psllw xmm0,PRE_MULTIPLY_SCALE_BITS
- pmulhw xmm0,[rel PW_F0707] ; xmm0=z3
-
- movdqa xmm4,xmm7 ; xmm4=tmp10
- psubw xmm7,xmm1
- pmulhw xmm7,[rel PW_F0382] ; xmm7=z5
- pmulhw xmm4,[rel PW_F0541] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
- pmulhw xmm1,[rel PW_F1306] ; xmm1=MULTIPLY(tmp12,FIX_1_306562)
- paddw xmm4,xmm7 ; xmm4=z2
- paddw xmm1,xmm7 ; xmm1=z4
-
- movdqa xmm3,xmm5
- psubw xmm5,xmm0 ; xmm5=z13
- paddw xmm3,xmm0 ; xmm3=z11
-
- movdqa xmm6,xmm5
- movdqa xmm2,xmm3
- psubw xmm5,xmm4 ; xmm5=data3
- psubw xmm3,xmm1 ; xmm3=data7
- paddw xmm6,xmm4 ; xmm6=data5
- paddw xmm2,xmm1 ; xmm2=data1
-
- movdqa XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm5
- movdqa XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm3
- movdqa XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm6
- movdqa XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm2
-
- uncollect_args
- mov rsp,rbp ; rsp <- aligned rbp
- pop rsp ; rsp <- original rbp
- pop rbp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
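
The ifast SSE2 code above leans on the pmulhw contract spelled out in its SEG_CONST comments: pmulhw returns only the high 16 bits of a signed 16x16-bit product, so the data are pre-shifted left by PRE_MULTIPLY_SCALE_BITS and the constants by CONST_SHIFT, making the three shift amounts sum to 16. A minimal scalar model in C (a sketch for illustration only; the pmulhw() helper and the sample value are ours, not libjpeg-turbo's):

#include <stdint.h>
#include <stdio.h>

#define CONST_BITS 8
#define PRE_MULTIPLY_SCALE_BITS 2
#define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)

/* pmulhw keeps the high 16 bits of a signed 16x16-bit product. */
static int16_t pmulhw(int16_t a, int16_t b)
{
  return (int16_t)(((int32_t)a * (int32_t)b) >> 16);
}

int main(void)
{
  int16_t x = 1000;                /* a DCT temporary, e.g. tmp12+tmp13 */
  int16_t c = 181 << CONST_SHIFT;  /* F_0_707 = FIX(0.707106781), CONST_BITS=8 */
  int16_t z1 = pmulhw((int16_t)(x << PRE_MULTIPLY_SCALE_BITS), c);
  /* Net effect: (x * 181) >> 8, i.e. x * 0.707... truncated; prints 707. */
  printf("%d\n", z1);
  return 0;
}
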
diff --git a/media/libjpeg/simd/jfdctfst-sse2.asm b/media/libjpeg/simd/jfdctfst-sse2.asm
deleted file mode 100644
index 54856a2363..0000000000
--- a/media/libjpeg/simd/jfdctfst-sse2.asm
+++ /dev/null
@@ -1,403 +0,0 @@
-;
-; jfdctfst.asm - fast integer FDCT (SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a fast, not so accurate integer implementation of
-; the forward DCT (Discrete Cosine Transform). The following code is
-; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c
-; for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS 8 ; 14 is also OK.
-
-%if CONST_BITS == 8
-F_0_382 equ 98 ; FIX(0.382683433)
-F_0_541 equ 139 ; FIX(0.541196100)
-F_0_707 equ 181 ; FIX(0.707106781)
-F_1_306 equ 334 ; FIX(1.306562965)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
-F_0_382 equ DESCALE( 410903207,30-CONST_BITS) ; FIX(0.382683433)
-F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100)
-F_0_707 equ DESCALE( 759250124,30-CONST_BITS) ; FIX(0.707106781)
-F_1_306 equ DESCALE(1402911301,30-CONST_BITS) ; FIX(1.306562965)
-%endif
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
-; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
-; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
-
-%define PRE_MULTIPLY_SCALE_BITS 2
-%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
-
- alignz 16
- global EXTN(jconst_fdct_ifast_sse2)
-
-EXTN(jconst_fdct_ifast_sse2):
-
-PW_F0707 times 8 dw F_0_707 << CONST_SHIFT
-PW_F0382 times 8 dw F_0_382 << CONST_SHIFT
-PW_F0541 times 8 dw F_0_541 << CONST_SHIFT
-PW_F1306 times 8 dw F_1_306 << CONST_SHIFT
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
-;
-; Perform the forward DCT on one block of samples.
-;
-; GLOBAL(void)
-; jsimd_fdct_ifast_sse2 (DCTELEM *data)
-;
-
-%define data(b) (b)+8 ; DCTELEM *data
-
-%define original_ebp ebp+0
-%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 2
-
- align 16
- global EXTN(jsimd_fdct_ifast_sse2)
-
-EXTN(jsimd_fdct_ifast_sse2):
- push ebp
- mov eax,esp ; eax = original ebp
- sub esp, byte 4
- and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [esp],eax
- mov ebp,esp ; ebp = aligned ebp
- lea esp, [wk(0)]
- pushpic ebx
-; push ecx ; unused
-; push edx ; need not be preserved
-; push esi ; unused
-; push edi ; unused
-
- get_GOT ebx ; get GOT address
-
- ; ---- Pass 1: process rows.
-
- mov edx, POINTER [data(eax)] ; (DCTELEM *)
-
- movdqa xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
- movdqa xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
- movdqa xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
- movdqa xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
-
- ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
- ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
-
- movdqa xmm4,xmm0 ; transpose coefficients(phase 1)
- punpcklwd xmm0,xmm1 ; xmm0=(00 10 01 11 02 12 03 13)
- punpckhwd xmm4,xmm1 ; xmm4=(04 14 05 15 06 16 07 17)
- movdqa xmm5,xmm2 ; transpose coefficients(phase 1)
- punpcklwd xmm2,xmm3 ; xmm2=(20 30 21 31 22 32 23 33)
- punpckhwd xmm5,xmm3 ; xmm5=(24 34 25 35 26 36 27 37)
-
- movdqa xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
- movdqa xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
- movdqa xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
- movdqa xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
-
- ; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62)
- ; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63)
-
- movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33)
- movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37)
-
- movdqa xmm2,xmm6 ; transpose coefficients(phase 1)
- punpcklwd xmm6,xmm7 ; xmm6=(40 50 41 51 42 52 43 53)
- punpckhwd xmm2,xmm7 ; xmm2=(44 54 45 55 46 56 47 57)
- movdqa xmm5,xmm1 ; transpose coefficients(phase 1)
- punpcklwd xmm1,xmm3 ; xmm1=(60 70 61 71 62 72 63 73)
- punpckhwd xmm5,xmm3 ; xmm5=(64 74 65 75 66 76 67 77)
-
- movdqa xmm7,xmm6 ; transpose coefficients(phase 2)
- punpckldq xmm6,xmm1 ; xmm6=(40 50 60 70 41 51 61 71)
- punpckhdq xmm7,xmm1 ; xmm7=(42 52 62 72 43 53 63 73)
- movdqa xmm3,xmm2 ; transpose coefficients(phase 2)
- punpckldq xmm2,xmm5 ; xmm2=(44 54 64 74 45 55 65 75)
- punpckhdq xmm3,xmm5 ; xmm3=(46 56 66 76 47 57 67 77)
-
- movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33)
- movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37)
- movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(42 52 62 72 43 53 63 73)
- movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=(44 54 64 74 45 55 65 75)
-
- movdqa xmm7,xmm0 ; transpose coefficients(phase 2)
- punpckldq xmm0,xmm1 ; xmm0=(00 10 20 30 01 11 21 31)
- punpckhdq xmm7,xmm1 ; xmm7=(02 12 22 32 03 13 23 33)
- movdqa xmm2,xmm4 ; transpose coefficients(phase 2)
- punpckldq xmm4,xmm5 ; xmm4=(04 14 24 34 05 15 25 35)
- punpckhdq xmm2,xmm5 ; xmm2=(06 16 26 36 07 17 27 37)
-
- movdqa xmm1,xmm0 ; transpose coefficients(phase 3)
- punpcklqdq xmm0,xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0
- punpckhqdq xmm1,xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1
- movdqa xmm5,xmm2 ; transpose coefficients(phase 3)
- punpcklqdq xmm2,xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6
- punpckhqdq xmm5,xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7
-
- movdqa xmm6,xmm1
- movdqa xmm3,xmm0
- psubw xmm1,xmm2 ; xmm1=data1-data6=tmp6
- psubw xmm0,xmm5 ; xmm0=data0-data7=tmp7
- paddw xmm6,xmm2 ; xmm6=data1+data6=tmp1
- paddw xmm3,xmm5 ; xmm3=data0+data7=tmp0
-
- movdqa xmm2, XMMWORD [wk(0)] ; xmm2=(42 52 62 72 43 53 63 73)
- movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(44 54 64 74 45 55 65 75)
- movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6
- movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7
-
- movdqa xmm1,xmm7 ; transpose coefficients(phase 3)
- punpcklqdq xmm7,xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2
- punpckhqdq xmm1,xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3
- movdqa xmm0,xmm4 ; transpose coefficients(phase 3)
- punpcklqdq xmm4,xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4
- punpckhqdq xmm0,xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5
-
- movdqa xmm2,xmm1
- movdqa xmm5,xmm7
- paddw xmm1,xmm4 ; xmm1=data3+data4=tmp3
- paddw xmm7,xmm0 ; xmm7=data2+data5=tmp2
- psubw xmm2,xmm4 ; xmm2=data3-data4=tmp4
- psubw xmm5,xmm0 ; xmm5=data2-data5=tmp5
-
- ; -- Even part
-
- movdqa xmm4,xmm3
- movdqa xmm0,xmm6
- psubw xmm3,xmm1 ; xmm3=tmp13
- psubw xmm6,xmm7 ; xmm6=tmp12
- paddw xmm4,xmm1 ; xmm4=tmp10
- paddw xmm0,xmm7 ; xmm0=tmp11
-
- paddw xmm6,xmm3
- psllw xmm6,PRE_MULTIPLY_SCALE_BITS
- pmulhw xmm6,[GOTOFF(ebx,PW_F0707)] ; xmm6=z1
-
- movdqa xmm1,xmm4
- movdqa xmm7,xmm3
- psubw xmm4,xmm0 ; xmm4=data4
- psubw xmm3,xmm6 ; xmm3=data6
- paddw xmm1,xmm0 ; xmm1=data0
- paddw xmm7,xmm6 ; xmm7=data2
-
- movdqa xmm0, XMMWORD [wk(0)] ; xmm0=tmp6
- movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp7
- movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=data4
- movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=data6
-
- ; -- Odd part
-
- paddw xmm2,xmm5 ; xmm2=tmp10
- paddw xmm5,xmm0 ; xmm5=tmp11
- paddw xmm0,xmm6 ; xmm0=tmp12, xmm6=tmp7
-
- psllw xmm2,PRE_MULTIPLY_SCALE_BITS
- psllw xmm0,PRE_MULTIPLY_SCALE_BITS
-
- psllw xmm5,PRE_MULTIPLY_SCALE_BITS
- pmulhw xmm5,[GOTOFF(ebx,PW_F0707)] ; xmm5=z3
-
- movdqa xmm4,xmm2 ; xmm4=tmp10
- psubw xmm2,xmm0
- pmulhw xmm2,[GOTOFF(ebx,PW_F0382)] ; xmm2=z5
- pmulhw xmm4,[GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
- pmulhw xmm0,[GOTOFF(ebx,PW_F1306)] ; xmm0=MULTIPLY(tmp12,FIX_1_306562)
- paddw xmm4,xmm2 ; xmm4=z2
- paddw xmm0,xmm2 ; xmm0=z4
-
- movdqa xmm3,xmm6
- psubw xmm6,xmm5 ; xmm6=z13
- paddw xmm3,xmm5 ; xmm3=z11
-
- movdqa xmm2,xmm6
- movdqa xmm5,xmm3
- psubw xmm6,xmm4 ; xmm6=data3
- psubw xmm3,xmm0 ; xmm3=data7
- paddw xmm2,xmm4 ; xmm2=data5
- paddw xmm5,xmm0 ; xmm5=data1
-
- ; ---- Pass 2: process columns.
-
-; mov edx, POINTER [data(eax)] ; (DCTELEM *)
-
- ; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72)
- ; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73)
-
- movdqa xmm4,xmm1 ; transpose coefficients(phase 1)
- punpcklwd xmm1,xmm5 ; xmm1=(00 01 10 11 20 21 30 31)
- punpckhwd xmm4,xmm5 ; xmm4=(40 41 50 51 60 61 70 71)
- movdqa xmm0,xmm7 ; transpose coefficients(phase 1)
- punpcklwd xmm7,xmm6 ; xmm7=(02 03 12 13 22 23 32 33)
- punpckhwd xmm0,xmm6 ; xmm0=(42 43 52 53 62 63 72 73)
-
- movdqa xmm5, XMMWORD [wk(0)] ; xmm5=col4
- movdqa xmm6, XMMWORD [wk(1)] ; xmm6=col6
-
- ; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76)
- ; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77)
-
- movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(02 03 12 13 22 23 32 33)
- movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(42 43 52 53 62 63 72 73)
-
- movdqa xmm7,xmm5 ; transpose coefficients(phase 1)
- punpcklwd xmm5,xmm2 ; xmm5=(04 05 14 15 24 25 34 35)
- punpckhwd xmm7,xmm2 ; xmm7=(44 45 54 55 64 65 74 75)
- movdqa xmm0,xmm6 ; transpose coefficients(phase 1)
- punpcklwd xmm6,xmm3 ; xmm6=(06 07 16 17 26 27 36 37)
- punpckhwd xmm0,xmm3 ; xmm0=(46 47 56 57 66 67 76 77)
-
- movdqa xmm2,xmm5 ; transpose coefficients(phase 2)
- punpckldq xmm5,xmm6 ; xmm5=(04 05 06 07 14 15 16 17)
- punpckhdq xmm2,xmm6 ; xmm2=(24 25 26 27 34 35 36 37)
- movdqa xmm3,xmm7 ; transpose coefficients(phase 2)
- punpckldq xmm7,xmm0 ; xmm7=(44 45 46 47 54 55 56 57)
- punpckhdq xmm3,xmm0 ; xmm3=(64 65 66 67 74 75 76 77)
-
- movdqa xmm6, XMMWORD [wk(0)] ; xmm6=(02 03 12 13 22 23 32 33)
- movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(42 43 52 53 62 63 72 73)
- movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(24 25 26 27 34 35 36 37)
- movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(44 45 46 47 54 55 56 57)
-
- movdqa xmm2,xmm1 ; transpose coefficients(phase 2)
- punpckldq xmm1,xmm6 ; xmm1=(00 01 02 03 10 11 12 13)
- punpckhdq xmm2,xmm6 ; xmm2=(20 21 22 23 30 31 32 33)
- movdqa xmm7,xmm4 ; transpose coefficients(phase 2)
- punpckldq xmm4,xmm0 ; xmm4=(40 41 42 43 50 51 52 53)
- punpckhdq xmm7,xmm0 ; xmm7=(60 61 62 63 70 71 72 73)
-
- movdqa xmm6,xmm1 ; transpose coefficients(phase 3)
- punpcklqdq xmm1,xmm5 ; xmm1=(00 01 02 03 04 05 06 07)=data0
- punpckhqdq xmm6,xmm5 ; xmm6=(10 11 12 13 14 15 16 17)=data1
- movdqa xmm0,xmm7 ; transpose coefficients(phase 3)
- punpcklqdq xmm7,xmm3 ; xmm7=(60 61 62 63 64 65 66 67)=data6
- punpckhqdq xmm0,xmm3 ; xmm0=(70 71 72 73 74 75 76 77)=data7
-
- movdqa xmm5,xmm6
- movdqa xmm3,xmm1
- psubw xmm6,xmm7 ; xmm6=data1-data6=tmp6
- psubw xmm1,xmm0 ; xmm1=data0-data7=tmp7
- paddw xmm5,xmm7 ; xmm5=data1+data6=tmp1
- paddw xmm3,xmm0 ; xmm3=data0+data7=tmp0
-
- movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(24 25 26 27 34 35 36 37)
- movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(44 45 46 47 54 55 56 57)
- movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=tmp6
- movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=tmp7
-
- movdqa xmm6,xmm2 ; transpose coefficients(phase 3)
- punpcklqdq xmm2,xmm7 ; xmm2=(20 21 22 23 24 25 26 27)=data2
- punpckhqdq xmm6,xmm7 ; xmm6=(30 31 32 33 34 35 36 37)=data3
- movdqa xmm1,xmm4 ; transpose coefficients(phase 3)
- punpcklqdq xmm4,xmm0 ; xmm4=(40 41 42 43 44 45 46 47)=data4
- punpckhqdq xmm1,xmm0 ; xmm1=(50 51 52 53 54 55 56 57)=data5
-
- movdqa xmm7,xmm6
- movdqa xmm0,xmm2
- paddw xmm6,xmm4 ; xmm6=data3+data4=tmp3
- paddw xmm2,xmm1 ; xmm2=data2+data5=tmp2
- psubw xmm7,xmm4 ; xmm7=data3-data4=tmp4
- psubw xmm0,xmm1 ; xmm0=data2-data5=tmp5
-
- ; -- Even part
-
- movdqa xmm4,xmm3
- movdqa xmm1,xmm5
- psubw xmm3,xmm6 ; xmm3=tmp13
- psubw xmm5,xmm2 ; xmm5=tmp12
- paddw xmm4,xmm6 ; xmm4=tmp10
- paddw xmm1,xmm2 ; xmm1=tmp11
-
- paddw xmm5,xmm3
- psllw xmm5,PRE_MULTIPLY_SCALE_BITS
- pmulhw xmm5,[GOTOFF(ebx,PW_F0707)] ; xmm5=z1
-
- movdqa xmm6,xmm4
- movdqa xmm2,xmm3
- psubw xmm4,xmm1 ; xmm4=data4
- psubw xmm3,xmm5 ; xmm3=data6
- paddw xmm6,xmm1 ; xmm6=data0
- paddw xmm2,xmm5 ; xmm2=data2
-
- movdqa XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm4
- movdqa XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm3
- movdqa XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm6
- movdqa XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm2
-
- ; -- Odd part
-
- movdqa xmm1, XMMWORD [wk(0)] ; xmm1=tmp6
- movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7
-
- paddw xmm7,xmm0 ; xmm7=tmp10
- paddw xmm0,xmm1 ; xmm0=tmp11
- paddw xmm1,xmm5 ; xmm1=tmp12, xmm5=tmp7
-
- psllw xmm7,PRE_MULTIPLY_SCALE_BITS
- psllw xmm1,PRE_MULTIPLY_SCALE_BITS
-
- psllw xmm0,PRE_MULTIPLY_SCALE_BITS
- pmulhw xmm0,[GOTOFF(ebx,PW_F0707)] ; xmm0=z3
-
- movdqa xmm4,xmm7 ; xmm4=tmp10
- psubw xmm7,xmm1
- pmulhw xmm7,[GOTOFF(ebx,PW_F0382)] ; xmm7=z5
- pmulhw xmm4,[GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
- pmulhw xmm1,[GOTOFF(ebx,PW_F1306)] ; xmm1=MULTIPLY(tmp12,FIX_1_306562)
- paddw xmm4,xmm7 ; xmm4=z2
- paddw xmm1,xmm7 ; xmm1=z4
-
- movdqa xmm3,xmm5
- psubw xmm5,xmm0 ; xmm5=z13
- paddw xmm3,xmm0 ; xmm3=z11
-
- movdqa xmm6,xmm5
- movdqa xmm2,xmm3
- psubw xmm5,xmm4 ; xmm5=data3
- psubw xmm3,xmm1 ; xmm3=data7
- paddw xmm6,xmm4 ; xmm6=data5
- paddw xmm2,xmm1 ; xmm2=data1
-
- movdqa XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm5
- movdqa XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm3
- movdqa XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm6
- movdqa XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm2
-
-; pop edi ; unused
-; pop esi ; unused
-; pop edx ; need not be preserved
-; pop ecx ; unused
- poppic ebx
- mov esp,ebp ; esp <- aligned ebp
- pop esp ; esp <- original ebp
- pop ebp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
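
This 32-bit file is the same ifast transform as the 64-bit one above; the differences are confined to the ABI (the data pointer is fetched from the stack via data(eax) instead of arriving in r10) and to PIC constant addressing (get_GOT/GOTOFF(ebx,...) instead of rel). Both files also share the %else branch that rebuilds the constant table from 30-bit fixed-point values with DESCALE, a round-to-nearest right shift. A small C check of that macro (a sketch; the main() harness is ours):

#include <stdio.h>

/* DESCALE(x,n): shift right n bits, rounding to nearest. */
#define DESCALE(x, n) (((x) + (1L << ((n) - 1))) >> (n))

int main(void)
{
  /* 759250124 ~= 0.707106781 * 2^30; reduce to CONST_BITS = 8 precision. */
  long f_0_707 = DESCALE(759250124L, 30 - 8);
  printf("%ld\n", f_0_707);  /* prints 181, matching the CONST_BITS==8 table */
  return 0;
}
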
diff --git a/media/libjpeg/simd/jfdctint-altivec.c b/media/libjpeg/simd/jfdctint-altivec.c
deleted file mode 100644
index e6e8a5687e..0000000000
--- a/media/libjpeg/simd/jfdctint-altivec.c
+++ /dev/null
@@ -1,262 +0,0 @@
-/*
- * AltiVec optimizations for libjpeg-turbo
- *
- * Copyright (C) 2014, D. R. Commander. All Rights Reserved.
- *
- * This software is provided 'as-is', without any express or implied
- * warranty. In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- * claim that you wrote the original software. If you use this software
- * in a product, an acknowledgment in the product documentation would be
- * appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- * misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-/* SLOW INTEGER FORWARD DCT */
-
-#include "jsimd_altivec.h"
-
-
-#define F_0_298 2446 /* FIX(0.298631336) */
-#define F_0_390 3196 /* FIX(0.390180644) */
-#define F_0_541 4433 /* FIX(0.541196100) */
-#define F_0_765 6270 /* FIX(0.765366865) */
-#define F_0_899 7373 /* FIX(0.899976223) */
-#define F_1_175 9633 /* FIX(1.175875602) */
-#define F_1_501 12299 /* FIX(1.501321110) */
-#define F_1_847 15137 /* FIX(1.847759065) */
-#define F_1_961 16069 /* FIX(1.961570560) */
-#define F_2_053 16819 /* FIX(2.053119869) */
-#define F_2_562 20995 /* FIX(2.562915447) */
-#define F_3_072 25172 /* FIX(3.072711026) */
-
-#define CONST_BITS 13
-#define PASS1_BITS 2
-#define DESCALE_P1 (CONST_BITS - PASS1_BITS)
-#define DESCALE_P2 (CONST_BITS + PASS1_BITS)
-
-
-#define DO_FDCT_COMMON(PASS) \
-{ \
- /* (Original) \
- * z1 = (tmp12 + tmp13) * 0.541196100; \
- * data2 = z1 + tmp13 * 0.765366865; \
- * data6 = z1 + tmp12 * -1.847759065; \
- * \
- * (This implementation) \
- * data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; \
- * data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); \
- */ \
- \
- tmp1312l = vec_mergeh(tmp13, tmp12); \
- tmp1312h = vec_mergel(tmp13, tmp12); \
- \
- out2l = vec_msums(tmp1312l, pw_f130_f054, pd_descale_p##PASS); \
- out2h = vec_msums(tmp1312h, pw_f130_f054, pd_descale_p##PASS); \
- out6l = vec_msums(tmp1312l, pw_f054_mf130, pd_descale_p##PASS); \
- out6h = vec_msums(tmp1312h, pw_f054_mf130, pd_descale_p##PASS); \
- \
- out2l = vec_sra(out2l, descale_p##PASS); \
- out2h = vec_sra(out2h, descale_p##PASS); \
- out6l = vec_sra(out6l, descale_p##PASS); \
- out6h = vec_sra(out6h, descale_p##PASS); \
- \
- out2 = vec_pack(out2l, out2h); \
- out6 = vec_pack(out6l, out6h); \
- \
- /* Odd part */ \
- \
- z3 = vec_add(tmp4, tmp6); \
- z4 = vec_add(tmp5, tmp7); \
- \
- /* (Original) \
- * z5 = (z3 + z4) * 1.175875602; \
- * z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; \
- * z3 += z5; z4 += z5; \
- * \
- * (This implementation) \
- * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \
- * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \
- */ \
- \
- z34l = vec_mergeh(z3, z4); \
- z34h = vec_mergel(z3, z4); \
- \
- z3l = vec_msums(z34l, pw_mf078_f117, pd_descale_p##PASS); \
- z3h = vec_msums(z34h, pw_mf078_f117, pd_descale_p##PASS); \
- z4l = vec_msums(z34l, pw_f117_f078, pd_descale_p##PASS); \
- z4h = vec_msums(z34h, pw_f117_f078, pd_descale_p##PASS); \
- \
- /* (Original) \
- * z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; \
- * tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; \
- * tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; \
- * z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; \
- * data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; \
- * data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; \
- * \
- * (This implementation) \
- * tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; \
- * tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; \
- * tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); \
- * tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); \
- * data7 = tmp4 + z3; data5 = tmp5 + z4; \
- * data3 = tmp6 + z3; data1 = tmp7 + z4; \
- */ \
- \
- tmp47l = vec_mergeh(tmp4, tmp7); \
- tmp47h = vec_mergel(tmp4, tmp7); \
- \
- out7l = vec_msums(tmp47l, pw_mf060_mf089, z3l); \
- out7h = vec_msums(tmp47h, pw_mf060_mf089, z3h); \
- out1l = vec_msums(tmp47l, pw_mf089_f060, z4l); \
- out1h = vec_msums(tmp47h, pw_mf089_f060, z4h); \
- \
- out7l = vec_sra(out7l, descale_p##PASS); \
- out7h = vec_sra(out7h, descale_p##PASS); \
- out1l = vec_sra(out1l, descale_p##PASS); \
- out1h = vec_sra(out1h, descale_p##PASS); \
- \
- out7 = vec_pack(out7l, out7h); \
- out1 = vec_pack(out1l, out1h); \
- \
- tmp56l = vec_mergeh(tmp5, tmp6); \
- tmp56h = vec_mergel(tmp5, tmp6); \
- \
- out5l = vec_msums(tmp56l, pw_mf050_mf256, z4l); \
- out5h = vec_msums(tmp56h, pw_mf050_mf256, z4h); \
- out3l = vec_msums(tmp56l, pw_mf256_f050, z3l); \
- out3h = vec_msums(tmp56h, pw_mf256_f050, z3h); \
- \
- out5l = vec_sra(out5l, descale_p##PASS); \
- out5h = vec_sra(out5h, descale_p##PASS); \
- out3l = vec_sra(out3l, descale_p##PASS); \
- out3h = vec_sra(out3h, descale_p##PASS); \
- \
- out5 = vec_pack(out5l, out5h); \
- out3 = vec_pack(out3l, out3h); \
-}
-
-#define DO_FDCT_PASS1() \
-{ \
- /* Even part */ \
- \
- tmp10 = vec_add(tmp0, tmp3); \
- tmp13 = vec_sub(tmp0, tmp3); \
- tmp11 = vec_add(tmp1, tmp2); \
- tmp12 = vec_sub(tmp1, tmp2); \
- \
- out0 = vec_add(tmp10, tmp11); \
- out0 = vec_sl(out0, pass1_bits); \
- out4 = vec_sub(tmp10, tmp11); \
- out4 = vec_sl(out4, pass1_bits); \
- \
- DO_FDCT_COMMON(1); \
-}
-
-#define DO_FDCT_PASS2() \
-{ \
- /* Even part */ \
- \
- tmp10 = vec_add(tmp0, tmp3); \
- tmp13 = vec_sub(tmp0, tmp3); \
- tmp11 = vec_add(tmp1, tmp2); \
- tmp12 = vec_sub(tmp1, tmp2); \
- \
- out0 = vec_add(tmp10, tmp11); \
- out0 = vec_add(out0, pw_descale_p2x); \
- out0 = vec_sra(out0, pass1_bits); \
- out4 = vec_sub(tmp10, tmp11); \
- out4 = vec_add(out4, pw_descale_p2x); \
- out4 = vec_sra(out4, pass1_bits); \
- \
- DO_FDCT_COMMON(2); \
-}
-
-
-void
-jsimd_fdct_islow_altivec (DCTELEM *data)
-{
- __vector short row0, row1, row2, row3, row4, row5, row6, row7,
- col0, col1, col2, col3, col4, col5, col6, col7,
- tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
- tmp47l, tmp47h, tmp56l, tmp56h, tmp1312l, tmp1312h,
- z3, z4, z34l, z34h,
- out0, out1, out2, out3, out4, out5, out6, out7;
- __vector int z3l, z3h, z4l, z4h,
- out1l, out1h, out2l, out2h, out3l, out3h, out5l, out5h, out6l, out6h,
- out7l, out7h;
-
- /* Constants */
- __vector short
- pw_f130_f054 = { __4X2(F_0_541 + F_0_765, F_0_541) },
- pw_f054_mf130 = { __4X2(F_0_541, F_0_541 - F_1_847) },
- pw_mf078_f117 = { __4X2(F_1_175 - F_1_961, F_1_175) },
- pw_f117_f078 = { __4X2(F_1_175, F_1_175 - F_0_390) },
- pw_mf060_mf089 = { __4X2(F_0_298 - F_0_899, -F_0_899) },
- pw_mf089_f060 = { __4X2(-F_0_899, F_1_501 - F_0_899) },
- pw_mf050_mf256 = { __4X2(F_2_053 - F_2_562, -F_2_562) },
- pw_mf256_f050 = { __4X2(-F_2_562, F_3_072 - F_2_562) },
- pw_descale_p2x = { __8X(1 << (PASS1_BITS - 1)) };
- __vector unsigned short pass1_bits = { __8X(PASS1_BITS) };
- __vector int pd_descale_p1 = { __4X(1 << (DESCALE_P1 - 1)) },
- pd_descale_p2 = { __4X(1 << (DESCALE_P2 - 1)) };
- __vector unsigned int descale_p1 = { __4X(DESCALE_P1) },
- descale_p2 = { __4X(DESCALE_P2) };
-
- /* Pass 1: process rows */
-
- row0 = vec_ld(0, data);
- row1 = vec_ld(16, data);
- row2 = vec_ld(32, data);
- row3 = vec_ld(48, data);
- row4 = vec_ld(64, data);
- row5 = vec_ld(80, data);
- row6 = vec_ld(96, data);
- row7 = vec_ld(112, data);
-
- TRANSPOSE(row, col);
-
- tmp0 = vec_add(col0, col7);
- tmp7 = vec_sub(col0, col7);
- tmp1 = vec_add(col1, col6);
- tmp6 = vec_sub(col1, col6);
- tmp2 = vec_add(col2, col5);
- tmp5 = vec_sub(col2, col5);
- tmp3 = vec_add(col3, col4);
- tmp4 = vec_sub(col3, col4);
-
- DO_FDCT_PASS1();
-
- /* Pass 2: process columns */
-
- TRANSPOSE(out, row);
-
- tmp0 = vec_add(row0, row7);
- tmp7 = vec_sub(row0, row7);
- tmp1 = vec_add(row1, row6);
- tmp6 = vec_sub(row1, row6);
- tmp2 = vec_add(row2, row5);
- tmp5 = vec_sub(row2, row5);
- tmp3 = vec_add(row3, row4);
- tmp4 = vec_sub(row3, row4);
-
- DO_FDCT_PASS2();
-
- vec_st(out0, 0, data);
- vec_st(out1, 16, data);
- vec_st(out2, 32, data);
- vec_st(out3, 48, data);
- vec_st(out4, 64, data);
- vec_st(out5, 80, data);
- vec_st(out6, 96, data);
- vec_st(out7, 112, data);
-}
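
The (Original)/(This implementation) comments in DO_FDCT_COMMON above record an algebraic refactoring: the shared z1 term is distributed into the two outputs so that each output becomes a single two-term dot product over an interleaved (tmp13, tmp12) pair, which is exactly the shape vec_msums evaluates in one instruction. A scalar C check that the two forms agree (a sketch; the sample inputs are arbitrary):

#include <stdio.h>

#define F_0_541 4433   /* FIX(0.541196100) at CONST_BITS=13 */
#define F_0_765 6270   /* FIX(0.765366865) */
#define F_1_847 15137  /* FIX(1.847759065) */

int main(void)
{
  long tmp12 = 37, tmp13 = -105;  /* arbitrary even-part temporaries */

  /* Original formulation: one shared multiply, two corrections. */
  long z1 = (tmp12 + tmp13) * F_0_541;
  long data2_orig = z1 + tmp13 * F_0_765;
  long data6_orig = z1 - tmp12 * F_1_847;

  /* Refactored formulation: one dot product per output. */
  long data2 = tmp13 * (F_0_541 + F_0_765) + tmp12 * F_0_541;
  long data6 = tmp13 * F_0_541 + tmp12 * (F_0_541 - F_1_847);

  printf("%d %d\n", data2 == data2_orig, data6 == data6_orig);  /* 1 1 */
  return 0;
}
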
diff --git a/media/libjpeg/simd/jfdctint-mmx.asm b/media/libjpeg/simd/jfdctint-mmx.asm
deleted file mode 100644
index 9142ad8816..0000000000
--- a/media/libjpeg/simd/jfdctint-mmx.asm
+++ /dev/null
@@ -1,621 +0,0 @@
-;
-; jfdctint.asm - accurate integer FDCT (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a slow-but-accurate integer implementation of the
-; forward DCT (Discrete Cosine Transform). The following code is based
-; directly on the IJG's original jfdctint.c; see the jfdctint.c for
-; more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS 13
-%define PASS1_BITS 2
-
-%define DESCALE_P1 (CONST_BITS-PASS1_BITS)
-%define DESCALE_P2 (CONST_BITS+PASS1_BITS)
-
-%if CONST_BITS == 13
-F_0_298 equ 2446 ; FIX(0.298631336)
-F_0_390 equ 3196 ; FIX(0.390180644)
-F_0_541 equ 4433 ; FIX(0.541196100)
-F_0_765 equ 6270 ; FIX(0.765366865)
-F_0_899 equ 7373 ; FIX(0.899976223)
-F_1_175 equ 9633 ; FIX(1.175875602)
-F_1_501 equ 12299 ; FIX(1.501321110)
-F_1_847 equ 15137 ; FIX(1.847759065)
-F_1_961 equ 16069 ; FIX(1.961570560)
-F_2_053 equ 16819 ; FIX(2.053119869)
-F_2_562 equ 20995 ; FIX(2.562915447)
-F_3_072 equ 25172 ; FIX(3.072711026)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
-F_0_298 equ DESCALE( 320652955,30-CONST_BITS) ; FIX(0.298631336)
-F_0_390 equ DESCALE( 418953276,30-CONST_BITS) ; FIX(0.390180644)
-F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100)
-F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865)
-F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223)
-F_1_175 equ DESCALE(1262586813,30-CONST_BITS) ; FIX(1.175875602)
-F_1_501 equ DESCALE(1612031267,30-CONST_BITS) ; FIX(1.501321110)
-F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065)
-F_1_961 equ DESCALE(2106220350,30-CONST_BITS) ; FIX(1.961570560)
-F_2_053 equ DESCALE(2204520673,30-CONST_BITS) ; FIX(2.053119869)
-F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447)
-F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026)
-%endif
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
- alignz 16
- global EXTN(jconst_fdct_islow_mmx)
-
-EXTN(jconst_fdct_islow_mmx):
-
-PW_F130_F054 times 2 dw (F_0_541+F_0_765), F_0_541
-PW_F054_MF130 times 2 dw F_0_541, (F_0_541-F_1_847)
-PW_MF078_F117 times 2 dw (F_1_175-F_1_961), F_1_175
-PW_F117_F078 times 2 dw F_1_175, (F_1_175-F_0_390)
-PW_MF060_MF089 times 2 dw (F_0_298-F_0_899),-F_0_899
-PW_MF089_F060 times 2 dw -F_0_899, (F_1_501-F_0_899)
-PW_MF050_MF256 times 2 dw (F_2_053-F_2_562),-F_2_562
-PW_MF256_F050 times 2 dw -F_2_562, (F_3_072-F_2_562)
-PD_DESCALE_P1 times 2 dd 1 << (DESCALE_P1-1)
-PD_DESCALE_P2 times 2 dd 1 << (DESCALE_P2-1)
-PW_DESCALE_P2X times 4 dw 1 << (PASS1_BITS-1)
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
-;
-; Perform the forward DCT on one block of samples.
-;
-; GLOBAL(void)
-; jsimd_fdct_islow_mmx (DCTELEM *data)
-;
-
-%define data(b) (b)+8 ; DCTELEM *data
-
-%define original_ebp ebp+0
-%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM]
-%define WK_NUM 2
-
- align 16
- global EXTN(jsimd_fdct_islow_mmx)
-
-EXTN(jsimd_fdct_islow_mmx):
- push ebp
- mov eax,esp ; eax = original ebp
- sub esp, byte 4
- and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
- mov [esp],eax
- mov ebp,esp ; ebp = aligned ebp
- lea esp, [wk(0)]
- pushpic ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
-; push esi ; unused
-; push edi ; unused
-
- get_GOT ebx ; get GOT address
-
- ; ---- Pass 1: process rows.
-
- mov edx, POINTER [data(eax)] ; (DCTELEM *)
- mov ecx, DCTSIZE/4
- alignx 16,7
-.rowloop:
-
- movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
- movq mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
- movq mm2, MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)]
- movq mm3, MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)]
-
- ; mm0=(20 21 22 23), mm2=(24 25 26 27)
- ; mm1=(30 31 32 33), mm3=(34 35 36 37)
-
- movq mm4,mm0 ; transpose coefficients(phase 1)
- punpcklwd mm0,mm1 ; mm0=(20 30 21 31)
- punpckhwd mm4,mm1 ; mm4=(22 32 23 33)
- movq mm5,mm2 ; transpose coefficients(phase 1)
- punpcklwd mm2,mm3 ; mm2=(24 34 25 35)
- punpckhwd mm5,mm3 ; mm5=(26 36 27 37)
-
- movq mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
- movq mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
- movq mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)]
- movq mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)]
-
- ; mm6=(00 01 02 03), mm1=(04 05 06 07)
- ; mm7=(10 11 12 13), mm3=(14 15 16 17)
-
- movq MMWORD [wk(0)], mm4 ; wk(0)=(22 32 23 33)
- movq MMWORD [wk(1)], mm2 ; wk(1)=(24 34 25 35)
-
- movq mm4,mm6 ; transpose coefficients(phase 1)
- punpcklwd mm6,mm7 ; mm6=(00 10 01 11)
- punpckhwd mm4,mm7 ; mm4=(02 12 03 13)
- movq mm2,mm1 ; transpose coefficients(phase 1)
- punpcklwd mm1,mm3 ; mm1=(04 14 05 15)
- punpckhwd mm2,mm3 ; mm2=(06 16 07 17)
-
- movq mm7,mm6 ; transpose coefficients(phase 2)
- punpckldq mm6,mm0 ; mm6=(00 10 20 30)=data0
- punpckhdq mm7,mm0 ; mm7=(01 11 21 31)=data1
- movq mm3,mm2 ; transpose coefficients(phase 2)
- punpckldq mm2,mm5 ; mm2=(06 16 26 36)=data6
- punpckhdq mm3,mm5 ; mm3=(07 17 27 37)=data7
-
- movq mm0,mm7
- movq mm5,mm6
- psubw mm7,mm2 ; mm7=data1-data6=tmp6
- psubw mm6,mm3 ; mm6=data0-data7=tmp7
- paddw mm0,mm2 ; mm0=data1+data6=tmp1
- paddw mm5,mm3 ; mm5=data0+data7=tmp0
-
- movq mm2, MMWORD [wk(0)] ; mm2=(22 32 23 33)
- movq mm3, MMWORD [wk(1)] ; mm3=(24 34 25 35)
- movq MMWORD [wk(0)], mm7 ; wk(0)=tmp6
- movq MMWORD [wk(1)], mm6 ; wk(1)=tmp7
-
- movq mm7,mm4 ; transpose coefficients(phase 2)
- punpckldq mm4,mm2 ; mm4=(02 12 22 32)=data2
- punpckhdq mm7,mm2 ; mm7=(03 13 23 33)=data3
- movq mm6,mm1 ; transpose coefficients(phase 2)
- punpckldq mm1,mm3 ; mm1=(04 14 24 34)=data4
- punpckhdq mm6,mm3 ; mm6=(05 15 25 35)=data5
-
- movq mm2,mm7
- movq mm3,mm4
- paddw mm7,mm1 ; mm7=data3+data4=tmp3
- paddw mm4,mm6 ; mm4=data2+data5=tmp2
- psubw mm2,mm1 ; mm2=data3-data4=tmp4
- psubw mm3,mm6 ; mm3=data2-data5=tmp5
-
- ; -- Even part
-
- movq mm1,mm5
- movq mm6,mm0
- paddw mm5,mm7 ; mm5=tmp10
- paddw mm0,mm4 ; mm0=tmp11
- psubw mm1,mm7 ; mm1=tmp13
- psubw mm6,mm4 ; mm6=tmp12
-
- movq mm7,mm5
- paddw mm5,mm0 ; mm5=tmp10+tmp11
- psubw mm7,mm0 ; mm7=tmp10-tmp11
-
- psllw mm5,PASS1_BITS ; mm5=data0
- psllw mm7,PASS1_BITS ; mm7=data4
-
- movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5
- movq MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)], mm7
-
- ; (Original)
- ; z1 = (tmp12 + tmp13) * 0.541196100;
- ; data2 = z1 + tmp13 * 0.765366865;
- ; data6 = z1 + tmp12 * -1.847759065;
- ;
- ; (This implementation)
- ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
- ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
-
- movq mm4,mm1 ; mm1=tmp13
- movq mm0,mm1
- punpcklwd mm4,mm6 ; mm6=tmp12
- punpckhwd mm0,mm6
- movq mm1,mm4
- movq mm6,mm0
- pmaddwd mm4,[GOTOFF(ebx,PW_F130_F054)] ; mm4=data2L
- pmaddwd mm0,[GOTOFF(ebx,PW_F130_F054)] ; mm0=data2H
- pmaddwd mm1,[GOTOFF(ebx,PW_F054_MF130)] ; mm1=data6L
- pmaddwd mm6,[GOTOFF(ebx,PW_F054_MF130)] ; mm6=data6H
-
- paddd mm4,[GOTOFF(ebx,PD_DESCALE_P1)]
- paddd mm0,[GOTOFF(ebx,PD_DESCALE_P1)]
- psrad mm4,DESCALE_P1
- psrad mm0,DESCALE_P1
- paddd mm1,[GOTOFF(ebx,PD_DESCALE_P1)]
- paddd mm6,[GOTOFF(ebx,PD_DESCALE_P1)]
- psrad mm1,DESCALE_P1
- psrad mm6,DESCALE_P1
-
- packssdw mm4,mm0 ; mm4=data2
- packssdw mm1,mm6 ; mm1=data6
-
- movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
- movq MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)], mm1
-
- ; -- Odd part
-
- movq mm5, MMWORD [wk(0)] ; mm5=tmp6
- movq mm7, MMWORD [wk(1)] ; mm7=tmp7
-
- movq mm0,mm2 ; mm2=tmp4
- movq mm6,mm3 ; mm3=tmp5
- paddw mm0,mm5 ; mm0=z3
- paddw mm6,mm7 ; mm6=z4
-
- ; (Original)
- ; z5 = (z3 + z4) * 1.175875602;
- ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
- ; z3 += z5; z4 += z5;
- ;
- ; (This implementation)
- ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
- ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
- movq mm4,mm0
- movq mm1,mm0
- punpcklwd mm4,mm6
- punpckhwd mm1,mm6
- movq mm0,mm4
- movq mm6,mm1
- pmaddwd mm4,[GOTOFF(ebx,PW_MF078_F117)] ; mm4=z3L
- pmaddwd mm1,[GOTOFF(ebx,PW_MF078_F117)] ; mm1=z3H
- pmaddwd mm0,[GOTOFF(ebx,PW_F117_F078)] ; mm0=z4L
- pmaddwd mm6,[GOTOFF(ebx,PW_F117_F078)] ; mm6=z4H
-
- movq MMWORD [wk(0)], mm4 ; wk(0)=z3L
- movq MMWORD [wk(1)], mm1 ; wk(1)=z3H
-
- ; (Original)
- ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6;
- ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869;
- ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110;
- ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
- ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4;
- ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4;
- ;
- ; (This implementation)
- ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
- ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
- ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
- ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
- ; data7 = tmp4 + z3; data5 = tmp5 + z4;
- ; data3 = tmp6 + z3; data1 = tmp7 + z4;
-
- movq mm4,mm2
- movq mm1,mm2
- punpcklwd mm4,mm7
- punpckhwd mm1,mm7
- movq mm2,mm4
- movq mm7,mm1
- pmaddwd mm4,[GOTOFF(ebx,PW_MF060_MF089)] ; mm4=tmp4L
- pmaddwd mm1,[GOTOFF(ebx,PW_MF060_MF089)] ; mm1=tmp4H
- pmaddwd mm2,[GOTOFF(ebx,PW_MF089_F060)] ; mm2=tmp7L
- pmaddwd mm7,[GOTOFF(ebx,PW_MF089_F060)] ; mm7=tmp7H
-
- paddd mm4, MMWORD [wk(0)] ; mm4=data7L
- paddd mm1, MMWORD [wk(1)] ; mm1=data7H
- paddd mm2,mm0 ; mm2=data1L
- paddd mm7,mm6 ; mm7=data1H
-
- paddd mm4,[GOTOFF(ebx,PD_DESCALE_P1)]
- paddd mm1,[GOTOFF(ebx,PD_DESCALE_P1)]
- psrad mm4,DESCALE_P1
- psrad mm1,DESCALE_P1
- paddd mm2,[GOTOFF(ebx,PD_DESCALE_P1)]
- paddd mm7,[GOTOFF(ebx,PD_DESCALE_P1)]
- psrad mm2,DESCALE_P1
- psrad mm7,DESCALE_P1
-
- packssdw mm4,mm1 ; mm4=data7
- packssdw mm2,mm7 ; mm2=data1
-
- movq MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)], mm4
- movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2
-
- movq mm1,mm3
- movq mm7,mm3
- punpcklwd mm1,mm5
- punpckhwd mm7,mm5
- movq mm3,mm1
- movq mm5,mm7
- pmaddwd mm1,[GOTOFF(ebx,PW_MF050_MF256)] ; mm1=tmp5L
- pmaddwd mm7,[GOTOFF(ebx,PW_MF050_MF256)] ; mm7=tmp5H
- pmaddwd mm3,[GOTOFF(ebx,PW_MF256_F050)] ; mm3=tmp6L
- pmaddwd mm5,[GOTOFF(ebx,PW_MF256_F050)] ; mm5=tmp6H
-
- paddd mm1,mm0 ; mm1=data5L
- paddd mm7,mm6 ; mm7=data5H
- paddd mm3, MMWORD [wk(0)] ; mm3=data3L
- paddd mm5, MMWORD [wk(1)] ; mm5=data3H
-
- paddd mm1,[GOTOFF(ebx,PD_DESCALE_P1)]
- paddd mm7,[GOTOFF(ebx,PD_DESCALE_P1)]
- psrad mm1,DESCALE_P1
- psrad mm7,DESCALE_P1
- paddd mm3,[GOTOFF(ebx,PD_DESCALE_P1)]
- paddd mm5,[GOTOFF(ebx,PD_DESCALE_P1)]
- psrad mm3,DESCALE_P1
- psrad mm5,DESCALE_P1
-
- packssdw mm1,mm7 ; mm1=data5
- packssdw mm3,mm5 ; mm3=data3
-
- movq MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)], mm1
- movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3
-
- add edx, byte 4*DCTSIZE*SIZEOF_DCTELEM
- dec ecx
- jnz near .rowloop
-
- ; ---- Pass 2: process columns.
-
- mov edx, POINTER [data(eax)] ; (DCTELEM *)
- mov ecx, DCTSIZE/4
- alignx 16,7
-.columnloop:
-
- movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
- movq mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
- movq mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
- movq mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
-
- ; mm0=(02 12 22 32), mm2=(42 52 62 72)
- ; mm1=(03 13 23 33), mm3=(43 53 63 73)
-
- movq mm4,mm0 ; transpose coefficients(phase 1)
- punpcklwd mm0,mm1 ; mm0=(02 03 12 13)
- punpckhwd mm4,mm1 ; mm4=(22 23 32 33)
- movq mm5,mm2 ; transpose coefficients(phase 1)
- punpcklwd mm2,mm3 ; mm2=(42 43 52 53)
- punpckhwd mm5,mm3 ; mm5=(62 63 72 73)
-
- movq mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
- movq mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
- movq mm1, MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
- movq mm3, MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
-
- ; mm6=(00 10 20 30), mm1=(40 50 60 70)
- ; mm7=(01 11 21 31), mm3=(41 51 61 71)
-
- movq MMWORD [wk(0)], mm4 ; wk(0)=(22 23 32 33)
- movq MMWORD [wk(1)], mm2 ; wk(1)=(42 43 52 53)
-
- movq mm4,mm6 ; transpose coefficients(phase 1)
- punpcklwd mm6,mm7 ; mm6=(00 01 10 11)
- punpckhwd mm4,mm7 ; mm4=(20 21 30 31)
- movq mm2,mm1 ; transpose coefficients(phase 1)
- punpcklwd mm1,mm3 ; mm1=(40 41 50 51)
- punpckhwd mm2,mm3 ; mm2=(60 61 70 71)
-
- movq mm7,mm6 ; transpose coefficients(phase 2)
- punpckldq mm6,mm0 ; mm6=(00 01 02 03)=data0
- punpckhdq mm7,mm0 ; mm7=(10 11 12 13)=data1
- movq mm3,mm2 ; transpose coefficients(phase 2)
- punpckldq mm2,mm5 ; mm2=(60 61 62 63)=data6
- punpckhdq mm3,mm5 ; mm3=(70 71 72 73)=data7
-
- movq mm0,mm7
- movq mm5,mm6
- psubw mm7,mm2 ; mm7=data1-data6=tmp6
- psubw mm6,mm3 ; mm6=data0-data7=tmp7
- paddw mm0,mm2 ; mm0=data1+data6=tmp1
- paddw mm5,mm3 ; mm5=data0+data7=tmp0
-
- movq mm2, MMWORD [wk(0)] ; mm2=(22 23 32 33)
- movq mm3, MMWORD [wk(1)] ; mm3=(42 43 52 53)
- movq MMWORD [wk(0)], mm7 ; wk(0)=tmp6
- movq MMWORD [wk(1)], mm6 ; wk(1)=tmp7
-
- movq mm7,mm4 ; transpose coefficients(phase 2)
- punpckldq mm4,mm2 ; mm4=(20 21 22 23)=data2
- punpckhdq mm7,mm2 ; mm7=(30 31 32 33)=data3
- movq mm6,mm1 ; transpose coefficients(phase 2)
- punpckldq mm1,mm3 ; mm1=(40 41 42 43)=data4
- punpckhdq mm6,mm3 ; mm6=(50 51 52 53)=data5
-
- movq mm2,mm7
- movq mm3,mm4
- paddw mm7,mm1 ; mm7=data3+data4=tmp3
- paddw mm4,mm6 ; mm4=data2+data5=tmp2
- psubw mm2,mm1 ; mm2=data3-data4=tmp4
- psubw mm3,mm6 ; mm3=data2-data5=tmp5
-
- ; -- Even part
-
- movq mm1,mm5
- movq mm6,mm0
- paddw mm5,mm7 ; mm5=tmp10
- paddw mm0,mm4 ; mm0=tmp11
- psubw mm1,mm7 ; mm1=tmp13
- psubw mm6,mm4 ; mm6=tmp12
-
- movq mm7,mm5
- paddw mm5,mm0 ; mm5=tmp10+tmp11
- psubw mm7,mm0 ; mm7=tmp10-tmp11
-
- paddw mm5,[GOTOFF(ebx,PW_DESCALE_P2X)]
- paddw mm7,[GOTOFF(ebx,PW_DESCALE_P2X)]
- psraw mm5,PASS1_BITS ; mm5=data0
- psraw mm7,PASS1_BITS ; mm7=data4
-
- movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5
- movq MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)], mm7
-
- ; (Original)
- ; z1 = (tmp12 + tmp13) * 0.541196100;
- ; data2 = z1 + tmp13 * 0.765366865;
- ; data6 = z1 + tmp12 * -1.847759065;
- ;
- ; (This implementation)
- ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
- ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
-
- movq mm4,mm1 ; mm1=tmp13
- movq mm0,mm1
- punpcklwd mm4,mm6 ; mm6=tmp12
- punpckhwd mm0,mm6
- movq mm1,mm4
- movq mm6,mm0
- pmaddwd mm4,[GOTOFF(ebx,PW_F130_F054)] ; mm4=data2L
- pmaddwd mm0,[GOTOFF(ebx,PW_F130_F054)] ; mm0=data2H
- pmaddwd mm1,[GOTOFF(ebx,PW_F054_MF130)] ; mm1=data6L
- pmaddwd mm6,[GOTOFF(ebx,PW_F054_MF130)] ; mm6=data6H
-
- paddd mm4,[GOTOFF(ebx,PD_DESCALE_P2)]
- paddd mm0,[GOTOFF(ebx,PD_DESCALE_P2)]
- psrad mm4,DESCALE_P2
- psrad mm0,DESCALE_P2
- paddd mm1,[GOTOFF(ebx,PD_DESCALE_P2)]
- paddd mm6,[GOTOFF(ebx,PD_DESCALE_P2)]
- psrad mm1,DESCALE_P2
- psrad mm6,DESCALE_P2
-
- packssdw mm4,mm0 ; mm4=data2
- packssdw mm1,mm6 ; mm1=data6
-
- movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
- movq MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)], mm1
-
- ; -- Odd part
-
- movq mm5, MMWORD [wk(0)] ; mm5=tmp6
- movq mm7, MMWORD [wk(1)] ; mm7=tmp7
-
- movq mm0,mm2 ; mm2=tmp4
- movq mm6,mm3 ; mm3=tmp5
- paddw mm0,mm5 ; mm0=z3
- paddw mm6,mm7 ; mm6=z4
-
- ; (Original)
- ; z5 = (z3 + z4) * 1.175875602;
- ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
- ; z3 += z5; z4 += z5;
- ;
- ; (This implementation)
- ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
- ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
- movq mm4,mm0
- movq mm1,mm0
- punpcklwd mm4,mm6
- punpckhwd mm1,mm6
- movq mm0,mm4
- movq mm6,mm1
- pmaddwd mm4,[GOTOFF(ebx,PW_MF078_F117)] ; mm4=z3L
- pmaddwd mm1,[GOTOFF(ebx,PW_MF078_F117)] ; mm1=z3H
- pmaddwd mm0,[GOTOFF(ebx,PW_F117_F078)] ; mm0=z4L
- pmaddwd mm6,[GOTOFF(ebx,PW_F117_F078)] ; mm6=z4H
-
- movq MMWORD [wk(0)], mm4 ; wk(0)=z3L
- movq MMWORD [wk(1)], mm1 ; wk(1)=z3H
-
- ; (Original)
- ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6;
- ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869;
- ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110;
- ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
- ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4;
- ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4;
- ;
- ; (This implementation)
- ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
- ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
- ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
- ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
- ; data7 = tmp4 + z3; data5 = tmp5 + z4;
- ; data3 = tmp6 + z3; data1 = tmp7 + z4;
-
- movq mm4,mm2
- movq mm1,mm2
- punpcklwd mm4,mm7
- punpckhwd mm1,mm7
- movq mm2,mm4
- movq mm7,mm1
- pmaddwd mm4,[GOTOFF(ebx,PW_MF060_MF089)] ; mm4=tmp4L
- pmaddwd mm1,[GOTOFF(ebx,PW_MF060_MF089)] ; mm1=tmp4H
- pmaddwd mm2,[GOTOFF(ebx,PW_MF089_F060)] ; mm2=tmp7L
- pmaddwd mm7,[GOTOFF(ebx,PW_MF089_F060)] ; mm7=tmp7H
-
- paddd mm4, MMWORD [wk(0)] ; mm4=data7L
- paddd mm1, MMWORD [wk(1)] ; mm1=data7H
- paddd mm2,mm0 ; mm2=data1L
- paddd mm7,mm6 ; mm7=data1H
-
- paddd mm4,[GOTOFF(ebx,PD_DESCALE_P2)]
- paddd mm1,[GOTOFF(ebx,PD_DESCALE_P2)]
- psrad mm4,DESCALE_P2
- psrad mm1,DESCALE_P2
- paddd mm2,[GOTOFF(ebx,PD_DESCALE_P2)]
- paddd mm7,[GOTOFF(ebx,PD_DESCALE_P2)]
- psrad mm2,DESCALE_P2
- psrad mm7,DESCALE_P2
-
- packssdw mm4,mm1 ; mm4=data7
- packssdw mm2,mm7 ; mm2=data1
-
- movq MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)], mm4
- movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2
-
- movq mm1,mm3
- movq mm7,mm3
- punpcklwd mm1,mm5
- punpckhwd mm7,mm5
- movq mm3,mm1
- movq mm5,mm7
- pmaddwd mm1,[GOTOFF(ebx,PW_MF050_MF256)] ; mm1=tmp5L
- pmaddwd mm7,[GOTOFF(ebx,PW_MF050_MF256)] ; mm7=tmp5H
- pmaddwd mm3,[GOTOFF(ebx,PW_MF256_F050)] ; mm3=tmp6L
- pmaddwd mm5,[GOTOFF(ebx,PW_MF256_F050)] ; mm5=tmp6H
-
- paddd mm1,mm0 ; mm1=data5L
- paddd mm7,mm6 ; mm7=data5H
- paddd mm3, MMWORD [wk(0)] ; mm3=data3L
- paddd mm5, MMWORD [wk(1)] ; mm5=data3H
-
- paddd mm1,[GOTOFF(ebx,PD_DESCALE_P2)]
- paddd mm7,[GOTOFF(ebx,PD_DESCALE_P2)]
- psrad mm1,DESCALE_P2
- psrad mm7,DESCALE_P2
- paddd mm3,[GOTOFF(ebx,PD_DESCALE_P2)]
- paddd mm5,[GOTOFF(ebx,PD_DESCALE_P2)]
- psrad mm3,DESCALE_P2
- psrad mm5,DESCALE_P2
-
- packssdw mm1,mm7 ; mm1=data5
- packssdw mm3,mm5 ; mm3=data3
-
- movq MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)], mm1
- movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3
-
- add edx, byte 4*SIZEOF_DCTELEM
- dec ecx
- jnz near .columnloop
-
- emms ; empty MMX state
-
-; pop edi ; unused
-; pop esi ; unused
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- poppic ebx
- mov esp,ebp ; esp <- aligned ebp
- pop esp ; esp <- original ebp
- pop ebp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
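
Unlike the SSE2 files, the MMX version above walks the block in 4-sample strips (DCTSIZE/4 loop iterations per pass), since a 64-bit MMX register holds only four DCTELEMs. Its multiply-accumulate step is pmaddwd over word pairs interleaved by punpcklwd/punpckhwd, followed by a rounding add of PD_DESCALE_P1 and an arithmetic right shift. A scalar model of that sequence (a sketch under the file's CONST_BITS=13 / PASS1_BITS=2 settings; pmaddwd_pair() is an illustrative helper, not a library function):

#include <stdint.h>
#include <stdio.h>

#define CONST_BITS 13
#define PASS1_BITS 2
#define DESCALE_P1 (CONST_BITS - PASS1_BITS)

#define F_0_541 4433
#define F_0_765 6270

/* pmaddwd: multiply adjacent signed 16-bit pairs, add into 32 bits. */
static int32_t pmaddwd_pair(int16_t a0, int16_t a1, int16_t b0, int16_t b1)
{
  return (int32_t)a0 * b0 + (int32_t)a1 * b1;
}

int main(void)
{
  int16_t tmp13 = -105, tmp12 = 37;  /* one interleaved (tmp13, tmp12) pair */
  /* PW_F130_F054 stores (F_0_541 + F_0_765, F_0_541) in each word pair. */
  int32_t data2 = pmaddwd_pair(tmp13, tmp12, F_0_541 + F_0_765, F_0_541);
  data2 += 1 << (DESCALE_P1 - 1);    /* paddd PD_DESCALE_P1: rounding bias */
  data2 >>= DESCALE_P1;              /* psrad: arithmetic shift, like >> here */
  printf("%d\n", data2);             /* prints -469 */
  return 0;
}
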
diff --git a/media/libjpeg/simd/jfdctint-sse2-64.asm b/media/libjpeg/simd/jfdctint-sse2-64.asm
deleted file mode 100644
index 9a0ca0fd28..0000000000
--- a/media/libjpeg/simd/jfdctint-sse2-64.asm
+++ /dev/null
@@ -1,621 +0,0 @@
-;
-; jfdctint.asm - accurate integer FDCT (64-bit SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a slow-but-accurate integer implementation of the
-; forward DCT (Discrete Cosine Transform). The following code is based
-; directly on the IJG's original jfdctint.c; see the jfdctint.c for
-; more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS 13
-%define PASS1_BITS 2
-
-%define DESCALE_P1 (CONST_BITS-PASS1_BITS)
-%define DESCALE_P2 (CONST_BITS+PASS1_BITS)
-
-%if CONST_BITS == 13
-F_0_298 equ 2446 ; FIX(0.298631336)
-F_0_390 equ 3196 ; FIX(0.390180644)
-F_0_541 equ 4433 ; FIX(0.541196100)
-F_0_765 equ 6270 ; FIX(0.765366865)
-F_0_899 equ 7373 ; FIX(0.899976223)
-F_1_175 equ 9633 ; FIX(1.175875602)
-F_1_501 equ 12299 ; FIX(1.501321110)
-F_1_847 equ 15137 ; FIX(1.847759065)
-F_1_961 equ 16069 ; FIX(1.961570560)
-F_2_053 equ 16819 ; FIX(2.053119869)
-F_2_562 equ 20995 ; FIX(2.562915447)
-F_3_072 equ 25172 ; FIX(3.072711026)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
-F_0_298 equ DESCALE( 320652955,30-CONST_BITS) ; FIX(0.298631336)
-F_0_390 equ DESCALE( 418953276,30-CONST_BITS) ; FIX(0.390180644)
-F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100)
-F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865)
-F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223)
-F_1_175 equ DESCALE(1262586813,30-CONST_BITS) ; FIX(1.175875602)
-F_1_501 equ DESCALE(1612031267,30-CONST_BITS) ; FIX(1.501321110)
-F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065)
-F_1_961 equ DESCALE(2106220350,30-CONST_BITS) ; FIX(1.961570560)
-F_2_053 equ DESCALE(2204520673,30-CONST_BITS) ; FIX(2.053119869)
-F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447)
-F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026)
-%endif
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
- alignz 16
- global EXTN(jconst_fdct_islow_sse2)
-
-EXTN(jconst_fdct_islow_sse2):
-
-PW_F130_F054 times 4 dw (F_0_541+F_0_765), F_0_541
-PW_F054_MF130 times 4 dw F_0_541, (F_0_541-F_1_847)
-PW_MF078_F117 times 4 dw (F_1_175-F_1_961), F_1_175
-PW_F117_F078 times 4 dw F_1_175, (F_1_175-F_0_390)
-PW_MF060_MF089 times 4 dw (F_0_298-F_0_899),-F_0_899
-PW_MF089_F060 times 4 dw -F_0_899, (F_1_501-F_0_899)
-PW_MF050_MF256 times 4 dw (F_2_053-F_2_562),-F_2_562
-PW_MF256_F050 times 4 dw -F_2_562, (F_3_072-F_2_562)
-PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1-1)
-PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2-1)
-PW_DESCALE_P2X times 8 dw 1 << (PASS1_BITS-1)
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 64
-;
-; Perform the forward DCT on one block of samples.
-;
-; GLOBAL(void)
-; jsimd_fdct_islow_sse2 (DCTELEM *data)
-;
-
-; r10 = DCTELEM *data
-
-%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 6
-
- align 16
- global EXTN(jsimd_fdct_islow_sse2)
-
-EXTN(jsimd_fdct_islow_sse2):
- push rbp
- mov rax,rsp ; rax = original rbp
- sub rsp, byte 4
- and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [rsp],rax
- mov rbp,rsp ; rbp = aligned rbp
- lea rsp, [wk(0)]
- collect_args
-
- ; ---- Pass 1: process rows.
-
- mov rdx, r10 ; (DCTELEM *)
-
- movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)]
- movdqa xmm1, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)]
- movdqa xmm2, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)]
- movdqa xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)]
-
- ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
- ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
-
- movdqa xmm4,xmm0 ; transpose coefficients(phase 1)
- punpcklwd xmm0,xmm1 ; xmm0=(00 10 01 11 02 12 03 13)
- punpckhwd xmm4,xmm1 ; xmm4=(04 14 05 15 06 16 07 17)
- movdqa xmm5,xmm2 ; transpose coefficients(phase 1)
- punpcklwd xmm2,xmm3 ; xmm2=(20 30 21 31 22 32 23 33)
- punpckhwd xmm5,xmm3 ; xmm5=(24 34 25 35 26 36 27 37)
-
- movdqa xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)]
- movdqa xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)]
- movdqa xmm1, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)]
- movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)]
-
- ; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62)
- ; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63)
-
- movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33)
- movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37)
-
- movdqa xmm2,xmm6 ; transpose coefficients(phase 1)
- punpcklwd xmm6,xmm7 ; xmm6=(40 50 41 51 42 52 43 53)
- punpckhwd xmm2,xmm7 ; xmm2=(44 54 45 55 46 56 47 57)
- movdqa xmm5,xmm1 ; transpose coefficients(phase 1)
- punpcklwd xmm1,xmm3 ; xmm1=(60 70 61 71 62 72 63 73)
- punpckhwd xmm5,xmm3 ; xmm5=(64 74 65 75 66 76 67 77)
-
- movdqa xmm7,xmm6 ; transpose coefficients(phase 2)
- punpckldq xmm6,xmm1 ; xmm6=(40 50 60 70 41 51 61 71)
- punpckhdq xmm7,xmm1 ; xmm7=(42 52 62 72 43 53 63 73)
- movdqa xmm3,xmm2 ; transpose coefficients(phase 2)
- punpckldq xmm2,xmm5 ; xmm2=(44 54 64 74 45 55 65 75)
- punpckhdq xmm3,xmm5 ; xmm3=(46 56 66 76 47 57 67 77)
-
- movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33)
- movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37)
- movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=(42 52 62 72 43 53 63 73)
- movdqa XMMWORD [wk(3)], xmm2 ; wk(3)=(44 54 64 74 45 55 65 75)
-
- movdqa xmm7,xmm0 ; transpose coefficients(phase 2)
- punpckldq xmm0,xmm1 ; xmm0=(00 10 20 30 01 11 21 31)
- punpckhdq xmm7,xmm1 ; xmm7=(02 12 22 32 03 13 23 33)
- movdqa xmm2,xmm4 ; transpose coefficients(phase 2)
- punpckldq xmm4,xmm5 ; xmm4=(04 14 24 34 05 15 25 35)
- punpckhdq xmm2,xmm5 ; xmm2=(06 16 26 36 07 17 27 37)
-
- movdqa xmm1,xmm0 ; transpose coefficients(phase 3)
- punpcklqdq xmm0,xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0
- punpckhqdq xmm1,xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1
- movdqa xmm5,xmm2 ; transpose coefficients(phase 3)
- punpcklqdq xmm2,xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6
- punpckhqdq xmm5,xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7
-
- movdqa xmm6,xmm1
- movdqa xmm3,xmm0
- psubw xmm1,xmm2 ; xmm1=data1-data6=tmp6
- psubw xmm0,xmm5 ; xmm0=data0-data7=tmp7
- paddw xmm6,xmm2 ; xmm6=data1+data6=tmp1
- paddw xmm3,xmm5 ; xmm3=data0+data7=tmp0
-
- movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(42 52 62 72 43 53 63 73)
- movdqa xmm5, XMMWORD [wk(3)] ; xmm5=(44 54 64 74 45 55 65 75)
- movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6
- movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7
-
- movdqa xmm1,xmm7 ; transpose coefficients(phase 3)
- punpcklqdq xmm7,xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2
- punpckhqdq xmm1,xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3
- movdqa xmm0,xmm4 ; transpose coefficients(phase 3)
- punpcklqdq xmm4,xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4
- punpckhqdq xmm0,xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5
-
- movdqa xmm2,xmm1
- movdqa xmm5,xmm7
- paddw xmm1,xmm4 ; xmm1=data3+data4=tmp3
- paddw xmm7,xmm0 ; xmm7=data2+data5=tmp2
- psubw xmm2,xmm4 ; xmm2=data3-data4=tmp4
- psubw xmm5,xmm0 ; xmm5=data2-data5=tmp5
-
- ; -- Even part
-
- movdqa xmm4,xmm3
- movdqa xmm0,xmm6
- paddw xmm3,xmm1 ; xmm3=tmp10
- paddw xmm6,xmm7 ; xmm6=tmp11
- psubw xmm4,xmm1 ; xmm4=tmp13
- psubw xmm0,xmm7 ; xmm0=tmp12
-
- movdqa xmm1,xmm3
- paddw xmm3,xmm6 ; xmm3=tmp10+tmp11
- psubw xmm1,xmm6 ; xmm1=tmp10-tmp11
-
- psllw xmm3,PASS1_BITS ; xmm3=data0
- psllw xmm1,PASS1_BITS ; xmm1=data4
-
- movdqa XMMWORD [wk(2)], xmm3 ; wk(2)=data0
- movdqa XMMWORD [wk(3)], xmm1 ; wk(3)=data4
-
- ; (Original)
- ; z1 = (tmp12 + tmp13) * 0.541196100;
- ; data2 = z1 + tmp13 * 0.765366865;
- ; data6 = z1 + tmp12 * -1.847759065;
- ;
- ; (This implementation)
- ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
- ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
-
- movdqa xmm7,xmm4 ; xmm4=tmp13
- movdqa xmm6,xmm4
- punpcklwd xmm7,xmm0 ; xmm0=tmp12
- punpckhwd xmm6,xmm0
- movdqa xmm4,xmm7
- movdqa xmm0,xmm6
- pmaddwd xmm7,[rel PW_F130_F054] ; xmm7=data2L
- pmaddwd xmm6,[rel PW_F130_F054] ; xmm6=data2H
- pmaddwd xmm4,[rel PW_F054_MF130] ; xmm4=data6L
- pmaddwd xmm0,[rel PW_F054_MF130] ; xmm0=data6H
-
- paddd xmm7,[rel PD_DESCALE_P1]
- paddd xmm6,[rel PD_DESCALE_P1]
- psrad xmm7,DESCALE_P1
- psrad xmm6,DESCALE_P1
- paddd xmm4,[rel PD_DESCALE_P1]
- paddd xmm0,[rel PD_DESCALE_P1]
- psrad xmm4,DESCALE_P1
- psrad xmm0,DESCALE_P1
-
- packssdw xmm7,xmm6 ; xmm7=data2
- packssdw xmm4,xmm0 ; xmm4=data6
-
- movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=data2
- movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=data6
-
- ; -- Odd part
-
- movdqa xmm3, XMMWORD [wk(0)] ; xmm3=tmp6
- movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp7
-
- movdqa xmm6,xmm2 ; xmm2=tmp4
- movdqa xmm0,xmm5 ; xmm5=tmp5
- paddw xmm6,xmm3 ; xmm6=z3
- paddw xmm0,xmm1 ; xmm0=z4
-
- ; (Original)
- ; z5 = (z3 + z4) * 1.175875602;
- ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
- ; z3 += z5; z4 += z5;
- ;
- ; (This implementation)
- ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
- ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
- movdqa xmm7,xmm6
- movdqa xmm4,xmm6
- punpcklwd xmm7,xmm0
- punpckhwd xmm4,xmm0
- movdqa xmm6,xmm7
- movdqa xmm0,xmm4
- pmaddwd xmm7,[rel PW_MF078_F117] ; xmm7=z3L
- pmaddwd xmm4,[rel PW_MF078_F117] ; xmm4=z3H
- pmaddwd xmm6,[rel PW_F117_F078] ; xmm6=z4L
- pmaddwd xmm0,[rel PW_F117_F078] ; xmm0=z4H
-
- movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=z3L
- movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=z3H
-
- ; (Original)
- ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6;
- ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869;
- ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110;
- ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
- ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4;
- ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4;
- ;
- ; (This implementation)
- ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
- ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
- ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
- ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
- ; data7 = tmp4 + z3; data5 = tmp5 + z4;
- ; data3 = tmp6 + z3; data1 = tmp7 + z4;
-
- movdqa xmm7,xmm2
- movdqa xmm4,xmm2
- punpcklwd xmm7,xmm1
- punpckhwd xmm4,xmm1
- movdqa xmm2,xmm7
- movdqa xmm1,xmm4
- pmaddwd xmm7,[rel PW_MF060_MF089] ; xmm7=tmp4L
- pmaddwd xmm4,[rel PW_MF060_MF089] ; xmm4=tmp4H
- pmaddwd xmm2,[rel PW_MF089_F060] ; xmm2=tmp7L
- pmaddwd xmm1,[rel PW_MF089_F060] ; xmm1=tmp7H
-
- paddd xmm7, XMMWORD [wk(0)] ; xmm7=data7L
- paddd xmm4, XMMWORD [wk(1)] ; xmm4=data7H
- paddd xmm2,xmm6 ; xmm2=data1L
- paddd xmm1,xmm0 ; xmm1=data1H
-
- paddd xmm7,[rel PD_DESCALE_P1]
- paddd xmm4,[rel PD_DESCALE_P1]
- psrad xmm7,DESCALE_P1
- psrad xmm4,DESCALE_P1
- paddd xmm2,[rel PD_DESCALE_P1]
- paddd xmm1,[rel PD_DESCALE_P1]
- psrad xmm2,DESCALE_P1
- psrad xmm1,DESCALE_P1
-
- packssdw xmm7,xmm4 ; xmm7=data7
- packssdw xmm2,xmm1 ; xmm2=data1
-
- movdqa xmm4,xmm5
- movdqa xmm1,xmm5
- punpcklwd xmm4,xmm3
- punpckhwd xmm1,xmm3
- movdqa xmm5,xmm4
- movdqa xmm3,xmm1
- pmaddwd xmm4,[rel PW_MF050_MF256] ; xmm4=tmp5L
- pmaddwd xmm1,[rel PW_MF050_MF256] ; xmm1=tmp5H
- pmaddwd xmm5,[rel PW_MF256_F050] ; xmm5=tmp6L
- pmaddwd xmm3,[rel PW_MF256_F050] ; xmm3=tmp6H
-
- paddd xmm4,xmm6 ; xmm4=data5L
- paddd xmm1,xmm0 ; xmm1=data5H
- paddd xmm5, XMMWORD [wk(0)] ; xmm5=data3L
- paddd xmm3, XMMWORD [wk(1)] ; xmm3=data3H
-
- paddd xmm4,[rel PD_DESCALE_P1]
- paddd xmm1,[rel PD_DESCALE_P1]
- psrad xmm4,DESCALE_P1
- psrad xmm1,DESCALE_P1
- paddd xmm5,[rel PD_DESCALE_P1]
- paddd xmm3,[rel PD_DESCALE_P1]
- psrad xmm5,DESCALE_P1
- psrad xmm3,DESCALE_P1
-
- packssdw xmm4,xmm1 ; xmm4=data5
- packssdw xmm5,xmm3 ; xmm5=data3
-
- ; ---- Pass 2: process columns.
-
- movdqa xmm6, XMMWORD [wk(2)] ; xmm6=col0
- movdqa xmm0, XMMWORD [wk(4)] ; xmm0=col2
-
- ; xmm6=(00 10 20 30 40 50 60 70), xmm0=(02 12 22 32 42 52 62 72)
- ; xmm2=(01 11 21 31 41 51 61 71), xmm5=(03 13 23 33 43 53 63 73)
-
- movdqa xmm1,xmm6 ; transpose coefficients(phase 1)
- punpcklwd xmm6,xmm2 ; xmm6=(00 01 10 11 20 21 30 31)
- punpckhwd xmm1,xmm2 ; xmm1=(40 41 50 51 60 61 70 71)
- movdqa xmm3,xmm0 ; transpose coefficients(phase 1)
- punpcklwd xmm0,xmm5 ; xmm0=(02 03 12 13 22 23 32 33)
- punpckhwd xmm3,xmm5 ; xmm3=(42 43 52 53 62 63 72 73)
-
- movdqa xmm2, XMMWORD [wk(3)] ; xmm2=col4
- movdqa xmm5, XMMWORD [wk(5)] ; xmm5=col6
-
- ; xmm2=(04 14 24 34 44 54 64 74), xmm5=(06 16 26 36 46 56 66 76)
- ; xmm4=(05 15 25 35 45 55 65 75), xmm7=(07 17 27 37 47 57 67 77)
-
- movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=(02 03 12 13 22 23 32 33)
- movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(42 43 52 53 62 63 72 73)
-
- movdqa xmm0,xmm2 ; transpose coefficients(phase 1)
- punpcklwd xmm2,xmm4 ; xmm2=(04 05 14 15 24 25 34 35)
- punpckhwd xmm0,xmm4 ; xmm0=(44 45 54 55 64 65 74 75)
- movdqa xmm3,xmm5 ; transpose coefficients(phase 1)
- punpcklwd xmm5,xmm7 ; xmm5=(06 07 16 17 26 27 36 37)
- punpckhwd xmm3,xmm7 ; xmm3=(46 47 56 57 66 67 76 77)
-
- movdqa xmm4,xmm2 ; transpose coefficients(phase 2)
- punpckldq xmm2,xmm5 ; xmm2=(04 05 06 07 14 15 16 17)
- punpckhdq xmm4,xmm5 ; xmm4=(24 25 26 27 34 35 36 37)
- movdqa xmm7,xmm0 ; transpose coefficients(phase 2)
- punpckldq xmm0,xmm3 ; xmm0=(44 45 46 47 54 55 56 57)
- punpckhdq xmm7,xmm3 ; xmm7=(64 65 66 67 74 75 76 77)
-
- movdqa xmm5, XMMWORD [wk(0)] ; xmm5=(02 03 12 13 22 23 32 33)
- movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53 62 63 72 73)
- movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=(24 25 26 27 34 35 36 37)
- movdqa XMMWORD [wk(3)], xmm0 ; wk(3)=(44 45 46 47 54 55 56 57)
-
- movdqa xmm4,xmm6 ; transpose coefficients(phase 2)
- punpckldq xmm6,xmm5 ; xmm6=(00 01 02 03 10 11 12 13)
- punpckhdq xmm4,xmm5 ; xmm4=(20 21 22 23 30 31 32 33)
- movdqa xmm0,xmm1 ; transpose coefficients(phase 2)
- punpckldq xmm1,xmm3 ; xmm1=(40 41 42 43 50 51 52 53)
- punpckhdq xmm0,xmm3 ; xmm0=(60 61 62 63 70 71 72 73)
-
- movdqa xmm5,xmm6 ; transpose coefficients(phase 3)
- punpcklqdq xmm6,xmm2 ; xmm6=(00 01 02 03 04 05 06 07)=data0
- punpckhqdq xmm5,xmm2 ; xmm5=(10 11 12 13 14 15 16 17)=data1
- movdqa xmm3,xmm0 ; transpose coefficients(phase 3)
- punpcklqdq xmm0,xmm7 ; xmm0=(60 61 62 63 64 65 66 67)=data6
- punpckhqdq xmm3,xmm7 ; xmm3=(70 71 72 73 74 75 76 77)=data7
-
- movdqa xmm2,xmm5
- movdqa xmm7,xmm6
- psubw xmm5,xmm0 ; xmm5=data1-data6=tmp6
- psubw xmm6,xmm3 ; xmm6=data0-data7=tmp7
- paddw xmm2,xmm0 ; xmm2=data1+data6=tmp1
- paddw xmm7,xmm3 ; xmm7=data0+data7=tmp0
-
- movdqa xmm0, XMMWORD [wk(2)] ; xmm0=(24 25 26 27 34 35 36 37)
- movdqa xmm3, XMMWORD [wk(3)] ; xmm3=(44 45 46 47 54 55 56 57)
- movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=tmp6
- movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
-
- movdqa xmm5,xmm4 ; transpose coefficients(phase 3)
- punpcklqdq xmm4,xmm0 ; xmm4=(20 21 22 23 24 25 26 27)=data2
- punpckhqdq xmm5,xmm0 ; xmm5=(30 31 32 33 34 35 36 37)=data3
- movdqa xmm6,xmm1 ; transpose coefficients(phase 3)
- punpcklqdq xmm1,xmm3 ; xmm1=(40 41 42 43 44 45 46 47)=data4
- punpckhqdq xmm6,xmm3 ; xmm6=(50 51 52 53 54 55 56 57)=data5
-
- movdqa xmm0,xmm5
- movdqa xmm3,xmm4
- paddw xmm5,xmm1 ; xmm5=data3+data4=tmp3
- paddw xmm4,xmm6 ; xmm4=data2+data5=tmp2
- psubw xmm0,xmm1 ; xmm0=data3-data4=tmp4
- psubw xmm3,xmm6 ; xmm3=data2-data5=tmp5
-
- ; -- Even part
-
- movdqa xmm1,xmm7
- movdqa xmm6,xmm2
- paddw xmm7,xmm5 ; xmm7=tmp10
- paddw xmm2,xmm4 ; xmm2=tmp11
- psubw xmm1,xmm5 ; xmm1=tmp13
- psubw xmm6,xmm4 ; xmm6=tmp12
-
- movdqa xmm5,xmm7
- paddw xmm7,xmm2 ; xmm7=tmp10+tmp11
- psubw xmm5,xmm2 ; xmm5=tmp10-tmp11
-
- paddw xmm7,[rel PW_DESCALE_P2X]
- paddw xmm5,[rel PW_DESCALE_P2X]
- psraw xmm7,PASS1_BITS ; xmm7=data0
- psraw xmm5,PASS1_BITS ; xmm5=data4
-
- movdqa XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)], xmm7
- movdqa XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm5
-
- ; (Original)
- ; z1 = (tmp12 + tmp13) * 0.541196100;
- ; data2 = z1 + tmp13 * 0.765366865;
- ; data6 = z1 + tmp12 * -1.847759065;
- ;
- ; (This implementation)
- ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
- ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
-
- movdqa xmm4,xmm1 ; xmm1=tmp13
- movdqa xmm2,xmm1
- punpcklwd xmm4,xmm6 ; xmm6=tmp12
- punpckhwd xmm2,xmm6
- movdqa xmm1,xmm4
- movdqa xmm6,xmm2
- pmaddwd xmm4,[rel PW_F130_F054] ; xmm4=data2L
- pmaddwd xmm2,[rel PW_F130_F054] ; xmm2=data2H
- pmaddwd xmm1,[rel PW_F054_MF130] ; xmm1=data6L
- pmaddwd xmm6,[rel PW_F054_MF130] ; xmm6=data6H
-
- paddd xmm4,[rel PD_DESCALE_P2]
- paddd xmm2,[rel PD_DESCALE_P2]
- psrad xmm4,DESCALE_P2
- psrad xmm2,DESCALE_P2
- paddd xmm1,[rel PD_DESCALE_P2]
- paddd xmm6,[rel PD_DESCALE_P2]
- psrad xmm1,DESCALE_P2
- psrad xmm6,DESCALE_P2
-
- packssdw xmm4,xmm2 ; xmm4=data2
- packssdw xmm1,xmm6 ; xmm1=data6
-
- movdqa XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)], xmm4
- movdqa XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm1
-
- ; -- Odd part
-
- movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp6
- movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7
-
- movdqa xmm2,xmm0 ; xmm0=tmp4
- movdqa xmm6,xmm3 ; xmm3=tmp5
- paddw xmm2,xmm7 ; xmm2=z3
- paddw xmm6,xmm5 ; xmm6=z4
-
- ; (Original)
- ; z5 = (z3 + z4) * 1.175875602;
- ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
- ; z3 += z5; z4 += z5;
- ;
- ; (This implementation)
- ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
- ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
- movdqa xmm4,xmm2
- movdqa xmm1,xmm2
- punpcklwd xmm4,xmm6
- punpckhwd xmm1,xmm6
- movdqa xmm2,xmm4
- movdqa xmm6,xmm1
- pmaddwd xmm4,[rel PW_MF078_F117] ; xmm4=z3L
- pmaddwd xmm1,[rel PW_MF078_F117] ; xmm1=z3H
- pmaddwd xmm2,[rel PW_F117_F078] ; xmm2=z4L
- pmaddwd xmm6,[rel PW_F117_F078] ; xmm6=z4H
-
- movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=z3L
- movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=z3H
-
- ; (Original)
- ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6;
- ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869;
- ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110;
- ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
- ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4;
- ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4;
- ;
- ; (This implementation)
- ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
- ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
- ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
- ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
- ; data7 = tmp4 + z3; data5 = tmp5 + z4;
- ; data3 = tmp6 + z3; data1 = tmp7 + z4;
-
- movdqa xmm4,xmm0
- movdqa xmm1,xmm0
- punpcklwd xmm4,xmm5
- punpckhwd xmm1,xmm5
- movdqa xmm0,xmm4
- movdqa xmm5,xmm1
- pmaddwd xmm4,[rel PW_MF060_MF089] ; xmm4=tmp4L
- pmaddwd xmm1,[rel PW_MF060_MF089] ; xmm1=tmp4H
- pmaddwd xmm0,[rel PW_MF089_F060] ; xmm0=tmp7L
- pmaddwd xmm5,[rel PW_MF089_F060] ; xmm5=tmp7H
-
- paddd xmm4, XMMWORD [wk(0)] ; xmm4=data7L
- paddd xmm1, XMMWORD [wk(1)] ; xmm1=data7H
- paddd xmm0,xmm2 ; xmm0=data1L
- paddd xmm5,xmm6 ; xmm5=data1H
-
- paddd xmm4,[rel PD_DESCALE_P2]
- paddd xmm1,[rel PD_DESCALE_P2]
- psrad xmm4,DESCALE_P2
- psrad xmm1,DESCALE_P2
- paddd xmm0,[rel PD_DESCALE_P2]
- paddd xmm5,[rel PD_DESCALE_P2]
- psrad xmm0,DESCALE_P2
- psrad xmm5,DESCALE_P2
-
- packssdw xmm4,xmm1 ; xmm4=data7
- packssdw xmm0,xmm5 ; xmm0=data1
-
- movdqa XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm4
- movdqa XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm0
-
- movdqa xmm1,xmm3
- movdqa xmm5,xmm3
- punpcklwd xmm1,xmm7
- punpckhwd xmm5,xmm7
- movdqa xmm3,xmm1
- movdqa xmm7,xmm5
- pmaddwd xmm1,[rel PW_MF050_MF256] ; xmm1=tmp5L
- pmaddwd xmm5,[rel PW_MF050_MF256] ; xmm5=tmp5H
- pmaddwd xmm3,[rel PW_MF256_F050] ; xmm3=tmp6L
- pmaddwd xmm7,[rel PW_MF256_F050] ; xmm7=tmp6H
-
- paddd xmm1,xmm2 ; xmm1=data5L
- paddd xmm5,xmm6 ; xmm5=data5H
- paddd xmm3, XMMWORD [wk(0)] ; xmm3=data3L
- paddd xmm7, XMMWORD [wk(1)] ; xmm7=data3H
-
- paddd xmm1,[rel PD_DESCALE_P2]
- paddd xmm5,[rel PD_DESCALE_P2]
- psrad xmm1,DESCALE_P2
- psrad xmm5,DESCALE_P2
- paddd xmm3,[rel PD_DESCALE_P2]
- paddd xmm7,[rel PD_DESCALE_P2]
- psrad xmm3,DESCALE_P2
- psrad xmm7,DESCALE_P2
-
- packssdw xmm1,xmm5 ; xmm1=data5
- packssdw xmm3,xmm7 ; xmm3=data3
-
- movdqa XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm1
- movdqa XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm3
-
- uncollect_args
- mov rsp,rbp ; rsp <- aligned rbp
- pop rsp ; rsp <- original rbp
- pop rbp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
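Note on the listing above: the "(This implementation)" comment blocks document a constant folding. Instead of computing z1 = (tmp12 + tmp13) * 0.541196100 and then applying two corrections, each output is rewritten as a two-term dot product, so one pmaddwd against a packed constant pair such as PW_F130_F054 = (F_0_541+F_0_765, F_0_541), applied to the (tmp13, tmp12) word pairs built by punpcklwd/punpckhwd, yields a whole 32-bit result lane. A minimal scalar C sketch of that folding (not part of the diff; constant names copied from the listing, final DESCALE shift omitted):

    #include <stdint.h>

    #define F_0_541  4433    /* FIX(0.541196100), CONST_BITS == 13 */
    #define F_0_765  6270    /* FIX(0.765366865) */
    #define F_1_847  15137   /* FIX(1.847759065) */

    /* PMADDWD per lane: multiply adjacent signed words, add the two
     * products into one signed dword. */
    static int32_t pmaddwd_lane(int16_t a0, int16_t a1,
                                int16_t b0, int16_t b1)
    {
        return (int32_t)a0 * b0 + (int32_t)a1 * b1;
    }

    /* Folded even-part rotation, as in the comments above:
     * data2 = tmp13*(0.541196100 + 0.765366865) + tmp12*0.541196100
     * data6 = tmp13*0.541196100 + tmp12*(0.541196100 - 1.847759065) */
    static void even_rotation(int16_t tmp12, int16_t tmp13,
                              int32_t *data2, int32_t *data6)
    {
        *data2 = pmaddwd_lane(tmp13, tmp12, F_0_541 + F_0_765, F_0_541);
        *data6 = pmaddwd_lane(tmp13, tmp12, F_0_541, F_0_541 - F_1_847);
    }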
diff --git a/media/libjpeg/simd/jfdctint-sse2.asm b/media/libjpeg/simd/jfdctint-sse2.asm
deleted file mode 100644
index db9d0bbe44..0000000000
--- a/media/libjpeg/simd/jfdctint-sse2.asm
+++ /dev/null
@@ -1,633 +0,0 @@
-;
-; jfdctint.asm - accurate integer FDCT (SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler) and
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a slow-but-accurate integer implementation of the
-; forward DCT (Discrete Cosine Transform). The following code is based
-; directly on the IJG's original jfdctint.c; see jfdctint.c for
-; more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS 13
-%define PASS1_BITS 2
-
-%define DESCALE_P1 (CONST_BITS-PASS1_BITS)
-%define DESCALE_P2 (CONST_BITS+PASS1_BITS)
-
-%if CONST_BITS == 13
-F_0_298 equ 2446 ; FIX(0.298631336)
-F_0_390 equ 3196 ; FIX(0.390180644)
-F_0_541 equ 4433 ; FIX(0.541196100)
-F_0_765 equ 6270 ; FIX(0.765366865)
-F_0_899 equ 7373 ; FIX(0.899976223)
-F_1_175 equ 9633 ; FIX(1.175875602)
-F_1_501 equ 12299 ; FIX(1.501321110)
-F_1_847 equ 15137 ; FIX(1.847759065)
-F_1_961 equ 16069 ; FIX(1.961570560)
-F_2_053 equ 16819 ; FIX(2.053119869)
-F_2_562 equ 20995 ; FIX(2.562915447)
-F_3_072 equ 25172 ; FIX(3.072711026)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
-F_0_298 equ DESCALE( 320652955,30-CONST_BITS) ; FIX(0.298631336)
-F_0_390 equ DESCALE( 418953276,30-CONST_BITS) ; FIX(0.390180644)
-F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100)
-F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865)
-F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223)
-F_1_175 equ DESCALE(1262586813,30-CONST_BITS) ; FIX(1.175875602)
-F_1_501 equ DESCALE(1612031267,30-CONST_BITS) ; FIX(1.501321110)
-F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065)
-F_1_961 equ DESCALE(2106220350,30-CONST_BITS) ; FIX(1.961570560)
-F_2_053 equ DESCALE(2204520673,30-CONST_BITS) ; FIX(2.053119869)
-F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447)
-F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026)
-%endif
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
- alignz 16
- global EXTN(jconst_fdct_islow_sse2)
-
-EXTN(jconst_fdct_islow_sse2):
-
-PW_F130_F054 times 4 dw (F_0_541+F_0_765), F_0_541
-PW_F054_MF130 times 4 dw F_0_541, (F_0_541-F_1_847)
-PW_MF078_F117 times 4 dw (F_1_175-F_1_961), F_1_175
-PW_F117_F078 times 4 dw F_1_175, (F_1_175-F_0_390)
-PW_MF060_MF089 times 4 dw (F_0_298-F_0_899),-F_0_899
-PW_MF089_F060 times 4 dw -F_0_899, (F_1_501-F_0_899)
-PW_MF050_MF256 times 4 dw (F_2_053-F_2_562),-F_2_562
-PW_MF256_F050 times 4 dw -F_2_562, (F_3_072-F_2_562)
-PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1-1)
-PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2-1)
-PW_DESCALE_P2X times 8 dw 1 << (PASS1_BITS-1)
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
-;
-; Perform the forward DCT on one block of samples.
-;
-; GLOBAL(void)
-; jsimd_fdct_islow_sse2 (DCTELEM *data)
-;
-
-%define data(b) (b)+8 ; DCTELEM *data
-
-%define original_ebp ebp+0
-%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 6
-
- align 16
- global EXTN(jsimd_fdct_islow_sse2)
-
-EXTN(jsimd_fdct_islow_sse2):
- push ebp
- mov eax,esp ; eax = original ebp
- sub esp, byte 4
- and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [esp],eax
- mov ebp,esp ; ebp = aligned ebp
- lea esp, [wk(0)]
- pushpic ebx
-; push ecx ; unused
-; push edx ; need not be preserved
-; push esi ; unused
-; push edi ; unused
-
- get_GOT ebx ; get GOT address
-
- ; ---- Pass 1: process rows.
-
- mov edx, POINTER [data(eax)] ; (DCTELEM *)
-
- movdqa xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
- movdqa xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
- movdqa xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
- movdqa xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
-
- ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
- ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
-
- movdqa xmm4,xmm0 ; transpose coefficients(phase 1)
- punpcklwd xmm0,xmm1 ; xmm0=(00 10 01 11 02 12 03 13)
- punpckhwd xmm4,xmm1 ; xmm4=(04 14 05 15 06 16 07 17)
- movdqa xmm5,xmm2 ; transpose coefficients(phase 1)
- punpcklwd xmm2,xmm3 ; xmm2=(20 30 21 31 22 32 23 33)
- punpckhwd xmm5,xmm3 ; xmm5=(24 34 25 35 26 36 27 37)
-
- movdqa xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
- movdqa xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
- movdqa xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
- movdqa xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
-
- ; xmm6=(40 41 42 43 44 45 46 47), xmm1=(60 61 62 63 64 65 66 67)
- ; xmm7=(50 51 52 53 54 55 56 57), xmm3=(70 71 72 73 74 75 76 77)
-
- movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33)
- movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37)
-
- movdqa xmm2,xmm6 ; transpose coefficients(phase 1)
- punpcklwd xmm6,xmm7 ; xmm6=(40 50 41 51 42 52 43 53)
- punpckhwd xmm2,xmm7 ; xmm2=(44 54 45 55 46 56 47 57)
- movdqa xmm5,xmm1 ; transpose coefficients(phase 1)
- punpcklwd xmm1,xmm3 ; xmm1=(60 70 61 71 62 72 63 73)
- punpckhwd xmm5,xmm3 ; xmm5=(64 74 65 75 66 76 67 77)
-
- movdqa xmm7,xmm6 ; transpose coefficients(phase 2)
- punpckldq xmm6,xmm1 ; xmm6=(40 50 60 70 41 51 61 71)
- punpckhdq xmm7,xmm1 ; xmm7=(42 52 62 72 43 53 63 73)
- movdqa xmm3,xmm2 ; transpose coefficients(phase 2)
- punpckldq xmm2,xmm5 ; xmm2=(44 54 64 74 45 55 65 75)
- punpckhdq xmm3,xmm5 ; xmm3=(46 56 66 76 47 57 67 77)
-
- movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33)
- movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37)
- movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=(42 52 62 72 43 53 63 73)
- movdqa XMMWORD [wk(3)], xmm2 ; wk(3)=(44 54 64 74 45 55 65 75)
-
- movdqa xmm7,xmm0 ; transpose coefficients(phase 2)
- punpckldq xmm0,xmm1 ; xmm0=(00 10 20 30 01 11 21 31)
- punpckhdq xmm7,xmm1 ; xmm7=(02 12 22 32 03 13 23 33)
- movdqa xmm2,xmm4 ; transpose coefficients(phase 2)
- punpckldq xmm4,xmm5 ; xmm4=(04 14 24 34 05 15 25 35)
- punpckhdq xmm2,xmm5 ; xmm2=(06 16 26 36 07 17 27 37)
-
- movdqa xmm1,xmm0 ; transpose coefficients(phase 3)
- punpcklqdq xmm0,xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0
- punpckhqdq xmm1,xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1
- movdqa xmm5,xmm2 ; transpose coefficients(phase 3)
- punpcklqdq xmm2,xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6
- punpckhqdq xmm5,xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7
-
- movdqa xmm6,xmm1
- movdqa xmm3,xmm0
- psubw xmm1,xmm2 ; xmm1=data1-data6=tmp6
- psubw xmm0,xmm5 ; xmm0=data0-data7=tmp7
- paddw xmm6,xmm2 ; xmm6=data1+data6=tmp1
- paddw xmm3,xmm5 ; xmm3=data0+data7=tmp0
-
- movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(42 52 62 72 43 53 63 73)
- movdqa xmm5, XMMWORD [wk(3)] ; xmm5=(44 54 64 74 45 55 65 75)
- movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6
- movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7
-
- movdqa xmm1,xmm7 ; transpose coefficients(phase 3)
- punpcklqdq xmm7,xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2
- punpckhqdq xmm1,xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3
- movdqa xmm0,xmm4 ; transpose coefficients(phase 3)
- punpcklqdq xmm4,xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4
- punpckhqdq xmm0,xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5
-
- movdqa xmm2,xmm1
- movdqa xmm5,xmm7
- paddw xmm1,xmm4 ; xmm1=data3+data4=tmp3
- paddw xmm7,xmm0 ; xmm7=data2+data5=tmp2
- psubw xmm2,xmm4 ; xmm2=data3-data4=tmp4
- psubw xmm5,xmm0 ; xmm5=data2-data5=tmp5
-
- ; -- Even part
-
- movdqa xmm4,xmm3
- movdqa xmm0,xmm6
- paddw xmm3,xmm1 ; xmm3=tmp10
- paddw xmm6,xmm7 ; xmm6=tmp11
- psubw xmm4,xmm1 ; xmm4=tmp13
- psubw xmm0,xmm7 ; xmm0=tmp12
-
- movdqa xmm1,xmm3
- paddw xmm3,xmm6 ; xmm3=tmp10+tmp11
- psubw xmm1,xmm6 ; xmm1=tmp10-tmp11
-
- psllw xmm3,PASS1_BITS ; xmm3=data0
- psllw xmm1,PASS1_BITS ; xmm1=data4
-
- movdqa XMMWORD [wk(2)], xmm3 ; wk(2)=data0
- movdqa XMMWORD [wk(3)], xmm1 ; wk(3)=data4
-
- ; (Original)
- ; z1 = (tmp12 + tmp13) * 0.541196100;
- ; data2 = z1 + tmp13 * 0.765366865;
- ; data6 = z1 + tmp12 * -1.847759065;
- ;
- ; (This implementation)
- ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
- ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
-
- movdqa xmm7,xmm4 ; xmm4=tmp13
- movdqa xmm6,xmm4
- punpcklwd xmm7,xmm0 ; xmm0=tmp12
- punpckhwd xmm6,xmm0
- movdqa xmm4,xmm7
- movdqa xmm0,xmm6
- pmaddwd xmm7,[GOTOFF(ebx,PW_F130_F054)] ; xmm7=data2L
- pmaddwd xmm6,[GOTOFF(ebx,PW_F130_F054)] ; xmm6=data2H
- pmaddwd xmm4,[GOTOFF(ebx,PW_F054_MF130)] ; xmm4=data6L
- pmaddwd xmm0,[GOTOFF(ebx,PW_F054_MF130)] ; xmm0=data6H
-
- paddd xmm7,[GOTOFF(ebx,PD_DESCALE_P1)]
- paddd xmm6,[GOTOFF(ebx,PD_DESCALE_P1)]
- psrad xmm7,DESCALE_P1
- psrad xmm6,DESCALE_P1
- paddd xmm4,[GOTOFF(ebx,PD_DESCALE_P1)]
- paddd xmm0,[GOTOFF(ebx,PD_DESCALE_P1)]
- psrad xmm4,DESCALE_P1
- psrad xmm0,DESCALE_P1
-
- packssdw xmm7,xmm6 ; xmm7=data2
- packssdw xmm4,xmm0 ; xmm4=data6
-
- movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=data2
- movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=data6
-
- ; -- Odd part
-
- movdqa xmm3, XMMWORD [wk(0)] ; xmm3=tmp6
- movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp7
-
- movdqa xmm6,xmm2 ; xmm2=tmp4
- movdqa xmm0,xmm5 ; xmm5=tmp5
- paddw xmm6,xmm3 ; xmm6=z3
- paddw xmm0,xmm1 ; xmm0=z4
-
- ; (Original)
- ; z5 = (z3 + z4) * 1.175875602;
- ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
- ; z3 += z5; z4 += z5;
- ;
- ; (This implementation)
- ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
- ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
- movdqa xmm7,xmm6
- movdqa xmm4,xmm6
- punpcklwd xmm7,xmm0
- punpckhwd xmm4,xmm0
- movdqa xmm6,xmm7
- movdqa xmm0,xmm4
- pmaddwd xmm7,[GOTOFF(ebx,PW_MF078_F117)] ; xmm7=z3L
- pmaddwd xmm4,[GOTOFF(ebx,PW_MF078_F117)] ; xmm4=z3H
- pmaddwd xmm6,[GOTOFF(ebx,PW_F117_F078)] ; xmm6=z4L
- pmaddwd xmm0,[GOTOFF(ebx,PW_F117_F078)] ; xmm0=z4H
-
- movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=z3L
- movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=z3H
-
- ; (Original)
- ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6;
- ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869;
- ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110;
- ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
- ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4;
- ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4;
- ;
- ; (This implementation)
- ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
- ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
- ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
- ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
- ; data7 = tmp4 + z3; data5 = tmp5 + z4;
- ; data3 = tmp6 + z3; data1 = tmp7 + z4;
-
- movdqa xmm7,xmm2
- movdqa xmm4,xmm2
- punpcklwd xmm7,xmm1
- punpckhwd xmm4,xmm1
- movdqa xmm2,xmm7
- movdqa xmm1,xmm4
- pmaddwd xmm7,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm7=tmp4L
- pmaddwd xmm4,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm4=tmp4H
- pmaddwd xmm2,[GOTOFF(ebx,PW_MF089_F060)] ; xmm2=tmp7L
- pmaddwd xmm1,[GOTOFF(ebx,PW_MF089_F060)] ; xmm1=tmp7H
-
- paddd xmm7, XMMWORD [wk(0)] ; xmm7=data7L
- paddd xmm4, XMMWORD [wk(1)] ; xmm4=data7H
- paddd xmm2,xmm6 ; xmm2=data1L
- paddd xmm1,xmm0 ; xmm1=data1H
-
- paddd xmm7,[GOTOFF(ebx,PD_DESCALE_P1)]
- paddd xmm4,[GOTOFF(ebx,PD_DESCALE_P1)]
- psrad xmm7,DESCALE_P1
- psrad xmm4,DESCALE_P1
- paddd xmm2,[GOTOFF(ebx,PD_DESCALE_P1)]
- paddd xmm1,[GOTOFF(ebx,PD_DESCALE_P1)]
- psrad xmm2,DESCALE_P1
- psrad xmm1,DESCALE_P1
-
- packssdw xmm7,xmm4 ; xmm7=data7
- packssdw xmm2,xmm1 ; xmm2=data1
-
- movdqa xmm4,xmm5
- movdqa xmm1,xmm5
- punpcklwd xmm4,xmm3
- punpckhwd xmm1,xmm3
- movdqa xmm5,xmm4
- movdqa xmm3,xmm1
- pmaddwd xmm4,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm4=tmp5L
- pmaddwd xmm1,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm1=tmp5H
- pmaddwd xmm5,[GOTOFF(ebx,PW_MF256_F050)] ; xmm5=tmp6L
- pmaddwd xmm3,[GOTOFF(ebx,PW_MF256_F050)] ; xmm3=tmp6H
-
- paddd xmm4,xmm6 ; xmm4=data5L
- paddd xmm1,xmm0 ; xmm1=data5H
- paddd xmm5, XMMWORD [wk(0)] ; xmm5=data3L
- paddd xmm3, XMMWORD [wk(1)] ; xmm3=data3H
-
- paddd xmm4,[GOTOFF(ebx,PD_DESCALE_P1)]
- paddd xmm1,[GOTOFF(ebx,PD_DESCALE_P1)]
- psrad xmm4,DESCALE_P1
- psrad xmm1,DESCALE_P1
- paddd xmm5,[GOTOFF(ebx,PD_DESCALE_P1)]
- paddd xmm3,[GOTOFF(ebx,PD_DESCALE_P1)]
- psrad xmm5,DESCALE_P1
- psrad xmm3,DESCALE_P1
-
- packssdw xmm4,xmm1 ; xmm4=data5
- packssdw xmm5,xmm3 ; xmm5=data3
-
- ; ---- Pass 2: process columns.
-
-; mov edx, POINTER [data(eax)] ; (DCTELEM *)
-
- movdqa xmm6, XMMWORD [wk(2)] ; xmm6=col0
- movdqa xmm0, XMMWORD [wk(4)] ; xmm0=col2
-
- ; xmm6=(00 10 20 30 40 50 60 70), xmm0=(02 12 22 32 42 52 62 72)
- ; xmm2=(01 11 21 31 41 51 61 71), xmm5=(03 13 23 33 43 53 63 73)
-
- movdqa xmm1,xmm6 ; transpose coefficients(phase 1)
- punpcklwd xmm6,xmm2 ; xmm6=(00 01 10 11 20 21 30 31)
- punpckhwd xmm1,xmm2 ; xmm1=(40 41 50 51 60 61 70 71)
- movdqa xmm3,xmm0 ; transpose coefficients(phase 1)
- punpcklwd xmm0,xmm5 ; xmm0=(02 03 12 13 22 23 32 33)
- punpckhwd xmm3,xmm5 ; xmm3=(42 43 52 53 62 63 72 73)
-
- movdqa xmm2, XMMWORD [wk(3)] ; xmm2=col4
- movdqa xmm5, XMMWORD [wk(5)] ; xmm5=col6
-
- ; xmm2=(04 14 24 34 44 54 64 74), xmm5=(06 16 26 36 46 56 66 76)
- ; xmm4=(05 15 25 35 45 55 65 75), xmm7=(07 17 27 37 47 57 67 77)
-
- movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=(02 03 12 13 22 23 32 33)
- movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(42 43 52 53 62 63 72 73)
-
- movdqa xmm0,xmm2 ; transpose coefficients(phase 1)
- punpcklwd xmm2,xmm4 ; xmm2=(04 05 14 15 24 25 34 35)
- punpckhwd xmm0,xmm4 ; xmm0=(44 45 54 55 64 65 74 75)
- movdqa xmm3,xmm5 ; transpose coefficients(phase 1)
- punpcklwd xmm5,xmm7 ; xmm5=(06 07 16 17 26 27 36 37)
- punpckhwd xmm3,xmm7 ; xmm3=(46 47 56 57 66 67 76 77)
-
- movdqa xmm4,xmm2 ; transpose coefficients(phase 2)
- punpckldq xmm2,xmm5 ; xmm2=(04 05 06 07 14 15 16 17)
- punpckhdq xmm4,xmm5 ; xmm4=(24 25 26 27 34 35 36 37)
- movdqa xmm7,xmm0 ; transpose coefficients(phase 2)
- punpckldq xmm0,xmm3 ; xmm0=(44 45 46 47 54 55 56 57)
- punpckhdq xmm7,xmm3 ; xmm7=(64 65 66 67 74 75 76 77)
-
- movdqa xmm5, XMMWORD [wk(0)] ; xmm5=(02 03 12 13 22 23 32 33)
- movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53 62 63 72 73)
- movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=(24 25 26 27 34 35 36 37)
- movdqa XMMWORD [wk(3)], xmm0 ; wk(3)=(44 45 46 47 54 55 56 57)
-
- movdqa xmm4,xmm6 ; transpose coefficients(phase 2)
- punpckldq xmm6,xmm5 ; xmm6=(00 01 02 03 10 11 12 13)
- punpckhdq xmm4,xmm5 ; xmm4=(20 21 22 23 30 31 32 33)
- movdqa xmm0,xmm1 ; transpose coefficients(phase 2)
- punpckldq xmm1,xmm3 ; xmm1=(40 41 42 43 50 51 52 53)
- punpckhdq xmm0,xmm3 ; xmm0=(60 61 62 63 70 71 72 73)
-
- movdqa xmm5,xmm6 ; transpose coefficients(phase 3)
- punpcklqdq xmm6,xmm2 ; xmm6=(00 01 02 03 04 05 06 07)=data0
- punpckhqdq xmm5,xmm2 ; xmm5=(10 11 12 13 14 15 16 17)=data1
- movdqa xmm3,xmm0 ; transpose coefficients(phase 3)
- punpcklqdq xmm0,xmm7 ; xmm0=(60 61 62 63 64 65 66 67)=data6
- punpckhqdq xmm3,xmm7 ; xmm3=(70 71 72 73 74 75 76 77)=data7
-
- movdqa xmm2,xmm5
- movdqa xmm7,xmm6
- psubw xmm5,xmm0 ; xmm5=data1-data6=tmp6
- psubw xmm6,xmm3 ; xmm6=data0-data7=tmp7
- paddw xmm2,xmm0 ; xmm2=data1+data6=tmp1
- paddw xmm7,xmm3 ; xmm7=data0+data7=tmp0
-
- movdqa xmm0, XMMWORD [wk(2)] ; xmm0=(24 25 26 27 34 35 36 37)
- movdqa xmm3, XMMWORD [wk(3)] ; xmm3=(44 45 46 47 54 55 56 57)
- movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=tmp6
- movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
-
- movdqa xmm5,xmm4 ; transpose coefficients(phase 3)
- punpcklqdq xmm4,xmm0 ; xmm4=(20 21 22 23 24 25 26 27)=data2
- punpckhqdq xmm5,xmm0 ; xmm5=(30 31 32 33 34 35 36 37)=data3
- movdqa xmm6,xmm1 ; transpose coefficients(phase 3)
- punpcklqdq xmm1,xmm3 ; xmm1=(40 41 42 43 44 45 46 47)=data4
- punpckhqdq xmm6,xmm3 ; xmm6=(50 51 52 53 54 55 56 57)=data5
-
- movdqa xmm0,xmm5
- movdqa xmm3,xmm4
- paddw xmm5,xmm1 ; xmm5=data3+data4=tmp3
- paddw xmm4,xmm6 ; xmm4=data2+data5=tmp2
- psubw xmm0,xmm1 ; xmm0=data3-data4=tmp4
- psubw xmm3,xmm6 ; xmm3=data2-data5=tmp5
-
- ; -- Even part
-
- movdqa xmm1,xmm7
- movdqa xmm6,xmm2
- paddw xmm7,xmm5 ; xmm7=tmp10
- paddw xmm2,xmm4 ; xmm2=tmp11
- psubw xmm1,xmm5 ; xmm1=tmp13
- psubw xmm6,xmm4 ; xmm6=tmp12
-
- movdqa xmm5,xmm7
- paddw xmm7,xmm2 ; xmm7=tmp10+tmp11
- psubw xmm5,xmm2 ; xmm5=tmp10-tmp11
-
- paddw xmm7,[GOTOFF(ebx,PW_DESCALE_P2X)]
- paddw xmm5,[GOTOFF(ebx,PW_DESCALE_P2X)]
- psraw xmm7,PASS1_BITS ; xmm7=data0
- psraw xmm5,PASS1_BITS ; xmm5=data4
-
- movdqa XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm7
- movdqa XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm5
-
- ; (Original)
- ; z1 = (tmp12 + tmp13) * 0.541196100;
- ; data2 = z1 + tmp13 * 0.765366865;
- ; data6 = z1 + tmp12 * -1.847759065;
- ;
- ; (This implementation)
- ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
- ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
-
- movdqa xmm4,xmm1 ; xmm1=tmp13
- movdqa xmm2,xmm1
- punpcklwd xmm4,xmm6 ; xmm6=tmp12
- punpckhwd xmm2,xmm6
- movdqa xmm1,xmm4
- movdqa xmm6,xmm2
- pmaddwd xmm4,[GOTOFF(ebx,PW_F130_F054)] ; xmm4=data2L
- pmaddwd xmm2,[GOTOFF(ebx,PW_F130_F054)] ; xmm2=data2H
- pmaddwd xmm1,[GOTOFF(ebx,PW_F054_MF130)] ; xmm1=data6L
- pmaddwd xmm6,[GOTOFF(ebx,PW_F054_MF130)] ; xmm6=data6H
-
- paddd xmm4,[GOTOFF(ebx,PD_DESCALE_P2)]
- paddd xmm2,[GOTOFF(ebx,PD_DESCALE_P2)]
- psrad xmm4,DESCALE_P2
- psrad xmm2,DESCALE_P2
- paddd xmm1,[GOTOFF(ebx,PD_DESCALE_P2)]
- paddd xmm6,[GOTOFF(ebx,PD_DESCALE_P2)]
- psrad xmm1,DESCALE_P2
- psrad xmm6,DESCALE_P2
-
- packssdw xmm4,xmm2 ; xmm4=data2
- packssdw xmm1,xmm6 ; xmm1=data6
-
- movdqa XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm4
- movdqa XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm1
-
- ; -- Odd part
-
- movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp6
- movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7
-
- movdqa xmm2,xmm0 ; xmm0=tmp4
- movdqa xmm6,xmm3 ; xmm3=tmp5
- paddw xmm2,xmm7 ; xmm2=z3
- paddw xmm6,xmm5 ; xmm6=z4
-
- ; (Original)
- ; z5 = (z3 + z4) * 1.175875602;
- ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
- ; z3 += z5; z4 += z5;
- ;
- ; (This implementation)
- ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
- ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
- movdqa xmm4,xmm2
- movdqa xmm1,xmm2
- punpcklwd xmm4,xmm6
- punpckhwd xmm1,xmm6
- movdqa xmm2,xmm4
- movdqa xmm6,xmm1
- pmaddwd xmm4,[GOTOFF(ebx,PW_MF078_F117)] ; xmm4=z3L
- pmaddwd xmm1,[GOTOFF(ebx,PW_MF078_F117)] ; xmm1=z3H
- pmaddwd xmm2,[GOTOFF(ebx,PW_F117_F078)] ; xmm2=z4L
- pmaddwd xmm6,[GOTOFF(ebx,PW_F117_F078)] ; xmm6=z4H
-
- movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=z3L
- movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=z3H
-
- ; (Original)
- ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6;
- ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869;
- ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110;
- ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
- ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4;
- ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4;
- ;
- ; (This implementation)
- ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
- ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
- ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
- ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
- ; data7 = tmp4 + z3; data5 = tmp5 + z4;
- ; data3 = tmp6 + z3; data1 = tmp7 + z4;
-
- movdqa xmm4,xmm0
- movdqa xmm1,xmm0
- punpcklwd xmm4,xmm5
- punpckhwd xmm1,xmm5
- movdqa xmm0,xmm4
- movdqa xmm5,xmm1
- pmaddwd xmm4,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm4=tmp4L
- pmaddwd xmm1,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm1=tmp4H
- pmaddwd xmm0,[GOTOFF(ebx,PW_MF089_F060)] ; xmm0=tmp7L
- pmaddwd xmm5,[GOTOFF(ebx,PW_MF089_F060)] ; xmm5=tmp7H
-
- paddd xmm4, XMMWORD [wk(0)] ; xmm4=data7L
- paddd xmm1, XMMWORD [wk(1)] ; xmm1=data7H
- paddd xmm0,xmm2 ; xmm0=data1L
- paddd xmm5,xmm6 ; xmm5=data1H
-
- paddd xmm4,[GOTOFF(ebx,PD_DESCALE_P2)]
- paddd xmm1,[GOTOFF(ebx,PD_DESCALE_P2)]
- psrad xmm4,DESCALE_P2
- psrad xmm1,DESCALE_P2
- paddd xmm0,[GOTOFF(ebx,PD_DESCALE_P2)]
- paddd xmm5,[GOTOFF(ebx,PD_DESCALE_P2)]
- psrad xmm0,DESCALE_P2
- psrad xmm5,DESCALE_P2
-
- packssdw xmm4,xmm1 ; xmm4=data7
- packssdw xmm0,xmm5 ; xmm0=data1
-
- movdqa XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm4
- movdqa XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm0
-
- movdqa xmm1,xmm3
- movdqa xmm5,xmm3
- punpcklwd xmm1,xmm7
- punpckhwd xmm5,xmm7
- movdqa xmm3,xmm1
- movdqa xmm7,xmm5
- pmaddwd xmm1,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm1=tmp5L
- pmaddwd xmm5,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm5=tmp5H
- pmaddwd xmm3,[GOTOFF(ebx,PW_MF256_F050)] ; xmm3=tmp6L
- pmaddwd xmm7,[GOTOFF(ebx,PW_MF256_F050)] ; xmm7=tmp6H
-
- paddd xmm1,xmm2 ; xmm1=data5L
- paddd xmm5,xmm6 ; xmm5=data5H
- paddd xmm3, XMMWORD [wk(0)] ; xmm3=data3L
- paddd xmm7, XMMWORD [wk(1)] ; xmm7=data3H
-
- paddd xmm1,[GOTOFF(ebx,PD_DESCALE_P2)]
- paddd xmm5,[GOTOFF(ebx,PD_DESCALE_P2)]
- psrad xmm1,DESCALE_P2
- psrad xmm5,DESCALE_P2
- paddd xmm3,[GOTOFF(ebx,PD_DESCALE_P2)]
- paddd xmm7,[GOTOFF(ebx,PD_DESCALE_P2)]
- psrad xmm3,DESCALE_P2
- psrad xmm7,DESCALE_P2
-
- packssdw xmm1,xmm5 ; xmm1=data5
- packssdw xmm3,xmm7 ; xmm3=data3
-
- movdqa XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm1
- movdqa XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm3
-
-; pop edi ; unused
-; pop esi ; unused
-; pop edx ; need not be preserved
-; pop ecx ; unused
- poppic ebx
- mov esp,ebp ; esp <- aligned ebp
- pop esp ; esp <- original ebp
- pop ebp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
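Note on the descale constants shared by both FDCT listings above: PD_DESCALE_P1, PD_DESCALE_P2, and PW_DESCALE_P2X each hold half of the final divisor, so the paddd/psrad (or paddw/psraw) pair implements round-to-nearest rather than the round-toward-negative-infinity of a bare arithmetic shift. A small C sketch (not part of the diff; values taken from the %define block above):

    #include <stdint.h>

    #define CONST_BITS  13
    #define PASS1_BITS  2
    #define DESCALE_P1  (CONST_BITS - PASS1_BITS)   /* pass-1 shift == 11 */

    /* paddd [PD_DESCALE_P1] ; psrad DESCALE_P1 */
    static int32_t descale_pass1(int32_t x)
    {
        return (x + (1 << (DESCALE_P1 - 1))) >> DESCALE_P1;
    }

    /* paddw [PW_DESCALE_P2X] ; psraw PASS1_BITS
     * (the pass-2 data0/data4 path, which never widens to dwords) */
    static int16_t descale_pass2(int16_t x)
    {
        return (int16_t)((x + (1 << (PASS1_BITS - 1))) >> PASS1_BITS);
    }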
diff --git a/media/libjpeg/simd/jidctflt-3dn.asm b/media/libjpeg/simd/jidctflt-3dn.asm
deleted file mode 100644
index 99356f20a4..0000000000
--- a/media/libjpeg/simd/jidctflt-3dn.asm
+++ /dev/null
@@ -1,451 +0,0 @@
-;
-; jidctflt.asm - floating-point IDCT (3DNow! & MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler) and
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a floating-point implementation of the inverse DCT
-; (Discrete Cosine Transform). The following code is based directly on
-; the IJG's original jidctflt.c; see jidctflt.c for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
- alignz 16
- global EXTN(jconst_idct_float_3dnow)
-
-EXTN(jconst_idct_float_3dnow):
-
-PD_1_414 times 2 dd 1.414213562373095048801689
-PD_1_847 times 2 dd 1.847759065022573512256366
-PD_1_082 times 2 dd 1.082392200292393968799446
-PD_2_613 times 2 dd 2.613125929752753055713286
-PD_RNDINT_MAGIC times 2 dd 100663296.0 ; (float)(0x00C00000 << 3)
-PB_CENTERJSAMP times 8 db CENTERJSAMPLE
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
-;
-; Perform dequantization and inverse DCT on one block of coefficients.
-;
-; GLOBAL(void)
-; jsimd_idct_float_3dnow (void *dct_table, JCOEFPTR coef_block,
-; JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-%define dct_table(b) (b)+8 ; void *dct_table
-%define coef_block(b) (b)+12 ; JCOEFPTR coef_block
-%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf
-%define output_col(b) (b)+20 ; JDIMENSION output_col
-
-%define original_ebp ebp+0
-%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM]
-%define WK_NUM 2
-%define workspace wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
- ; FAST_FLOAT workspace[DCTSIZE2]
-
- align 16
- global EXTN(jsimd_idct_float_3dnow)
-
-EXTN(jsimd_idct_float_3dnow):
- push ebp
- mov eax,esp ; eax = original ebp
- sub esp, byte 4
- and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
- mov [esp],eax
- mov ebp,esp ; ebp = aligned ebp
- lea esp, [workspace]
- push ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- get_GOT ebx ; get GOT address
-
- ; ---- Pass 1: process columns from input, store into work array.
-
-; mov eax, [original_ebp]
- mov edx, POINTER [dct_table(eax)] ; quantptr
- mov esi, JCOEFPTR [coef_block(eax)] ; inptr
- lea edi, [workspace] ; FAST_FLOAT *wsptr
- mov ecx, DCTSIZE/2 ; ctr
- alignx 16,7
-.columnloop:
-%ifndef NO_ZERO_COLUMN_TEST_FLOAT_3DNOW
- mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
- or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
- jnz short .columnDCT
-
- pushpic ebx ; save GOT address
- mov ebx, DWORD [DWBLOCK(3,0,esi,SIZEOF_JCOEF)]
- mov eax, DWORD [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
- or ebx, DWORD [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
- or eax, DWORD [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]
- or ebx, DWORD [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]
- or eax,ebx
- poppic ebx ; restore GOT address
- jnz short .columnDCT
-
- ; -- AC terms all zero
-
- movd mm0, DWORD [DWBLOCK(0,0,esi,SIZEOF_JCOEF)]
-
- punpcklwd mm0,mm0
- psrad mm0,(DWORD_BIT-WORD_BIT)
- pi2fd mm0,mm0
-
- pfmul mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
- movq mm1,mm0
- punpckldq mm0,mm0
- punpckhdq mm1,mm1
-
- movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm0
- movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm0
- movq MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm0
- movq MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
- movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm1
- movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm1
- movq MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm1
- movq MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1
- jmp near .nextcolumn
- alignx 16,7
-%endif
-.columnDCT:
-
- ; -- Even part
-
- movd mm0, DWORD [DWBLOCK(0,0,esi,SIZEOF_JCOEF)]
- movd mm1, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
- movd mm2, DWORD [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
- movd mm3, DWORD [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]
-
- punpcklwd mm0,mm0
- punpcklwd mm1,mm1
- psrad mm0,(DWORD_BIT-WORD_BIT)
- psrad mm1,(DWORD_BIT-WORD_BIT)
- pi2fd mm0,mm0
- pi2fd mm1,mm1
-
- pfmul mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
- pfmul mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
- punpcklwd mm2,mm2
- punpcklwd mm3,mm3
- psrad mm2,(DWORD_BIT-WORD_BIT)
- psrad mm3,(DWORD_BIT-WORD_BIT)
- pi2fd mm2,mm2
- pi2fd mm3,mm3
-
- pfmul mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
- pfmul mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
- movq mm4,mm0
- movq mm5,mm1
- pfsub mm0,mm2 ; mm0=tmp11
- pfsub mm1,mm3
- pfadd mm4,mm2 ; mm4=tmp10
- pfadd mm5,mm3 ; mm5=tmp13
-
- pfmul mm1,[GOTOFF(ebx,PD_1_414)]
- pfsub mm1,mm5 ; mm1=tmp12
-
- movq mm6,mm4
- movq mm7,mm0
- pfsub mm4,mm5 ; mm4=tmp3
- pfsub mm0,mm1 ; mm0=tmp2
- pfadd mm6,mm5 ; mm6=tmp0
- pfadd mm7,mm1 ; mm7=tmp1
-
- movq MMWORD [wk(1)], mm4 ; tmp3
- movq MMWORD [wk(0)], mm0 ; tmp2
-
- ; -- Odd part
-
- movd mm2, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
- movd mm3, DWORD [DWBLOCK(3,0,esi,SIZEOF_JCOEF)]
- movd mm5, DWORD [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
- movd mm1, DWORD [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]
-
- punpcklwd mm2,mm2
- punpcklwd mm3,mm3
- psrad mm2,(DWORD_BIT-WORD_BIT)
- psrad mm3,(DWORD_BIT-WORD_BIT)
- pi2fd mm2,mm2
- pi2fd mm3,mm3
-
- pfmul mm2, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
- pfmul mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
- punpcklwd mm5,mm5
- punpcklwd mm1,mm1
- psrad mm5,(DWORD_BIT-WORD_BIT)
- psrad mm1,(DWORD_BIT-WORD_BIT)
- pi2fd mm5,mm5
- pi2fd mm1,mm1
-
- pfmul mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
- pfmul mm1, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
- movq mm4,mm2
- movq mm0,mm5
- pfadd mm2,mm1 ; mm2=z11
- pfadd mm5,mm3 ; mm5=z13
- pfsub mm4,mm1 ; mm4=z12
- pfsub mm0,mm3 ; mm0=z10
-
- movq mm1,mm2
- pfsub mm2,mm5
- pfadd mm1,mm5 ; mm1=tmp7
-
- pfmul mm2,[GOTOFF(ebx,PD_1_414)] ; mm2=tmp11
-
- movq mm3,mm0
- pfadd mm0,mm4
- pfmul mm0,[GOTOFF(ebx,PD_1_847)] ; mm0=z5
- pfmul mm3,[GOTOFF(ebx,PD_2_613)] ; mm3=(z10 * 2.613125930)
- pfmul mm4,[GOTOFF(ebx,PD_1_082)] ; mm4=(z12 * 1.082392200)
- pfsubr mm3,mm0 ; mm3=tmp12
- pfsub mm4,mm0 ; mm4=tmp10
-
- ; -- Final output stage
-
- pfsub mm3,mm1 ; mm3=tmp6
- movq mm5,mm6
- movq mm0,mm7
- pfadd mm6,mm1 ; mm6=data0=(00 01)
- pfadd mm7,mm3 ; mm7=data1=(10 11)
- pfsub mm5,mm1 ; mm5=data7=(70 71)
- pfsub mm0,mm3 ; mm0=data6=(60 61)
- pfsub mm2,mm3 ; mm2=tmp5
-
- movq mm1,mm6 ; transpose coefficients
- punpckldq mm6,mm7 ; mm6=(00 10)
- punpckhdq mm1,mm7 ; mm1=(01 11)
- movq mm3,mm0 ; transpose coefficients
- punpckldq mm0,mm5 ; mm0=(60 70)
- punpckhdq mm3,mm5 ; mm3=(61 71)
-
- movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm6
- movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm1
- movq MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
- movq MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm3
-
- movq mm7, MMWORD [wk(0)] ; mm7=tmp2
- movq mm5, MMWORD [wk(1)] ; mm5=tmp3
-
- pfadd mm4,mm2 ; mm4=tmp4
- movq mm6,mm7
- movq mm1,mm5
- pfadd mm7,mm2 ; mm7=data2=(20 21)
- pfadd mm5,mm4 ; mm5=data4=(40 41)
- pfsub mm6,mm2 ; mm6=data5=(50 51)
- pfsub mm1,mm4 ; mm1=data3=(30 31)
-
- movq mm0,mm7 ; transpose coefficients
- punpckldq mm7,mm1 ; mm7=(20 30)
- punpckhdq mm0,mm1 ; mm0=(21 31)
- movq mm3,mm5 ; transpose coefficients
- punpckldq mm5,mm6 ; mm5=(40 50)
- punpckhdq mm3,mm6 ; mm3=(41 51)
-
- movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm7
- movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm0
- movq MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm5
- movq MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm3
-
-.nextcolumn:
- add esi, byte 2*SIZEOF_JCOEF ; coef_block
- add edx, byte 2*SIZEOF_FLOAT_MULT_TYPE ; quantptr
- add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr
- dec ecx ; ctr
- jnz near .columnloop
-
- ; -- Prefetch the next coefficient block
-
- prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
- prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
- prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
- prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
-
- ; ---- Pass 2: process rows from work array, store into output array.
-
- mov eax, [original_ebp]
- lea esi, [workspace] ; FAST_FLOAT *wsptr
- mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
- mov eax, JDIMENSION [output_col(eax)]
- mov ecx, DCTSIZE/2 ; ctr
- alignx 16,7
-.rowloop:
-
- ; -- Even part
-
- movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
- movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
- movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
- movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
-
- movq mm4,mm0
- movq mm5,mm1
- pfsub mm0,mm2 ; mm0=tmp11
- pfsub mm1,mm3
- pfadd mm4,mm2 ; mm4=tmp10
- pfadd mm5,mm3 ; mm5=tmp13
-
- pfmul mm1,[GOTOFF(ebx,PD_1_414)]
- pfsub mm1,mm5 ; mm1=tmp12
-
- movq mm6,mm4
- movq mm7,mm0
- pfsub mm4,mm5 ; mm4=tmp3
- pfsub mm0,mm1 ; mm0=tmp2
- pfadd mm6,mm5 ; mm6=tmp0
- pfadd mm7,mm1 ; mm7=tmp1
-
- movq MMWORD [wk(1)], mm4 ; tmp3
- movq MMWORD [wk(0)], mm0 ; tmp2
-
- ; -- Odd part
-
- movq mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
- movq mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
- movq mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
- movq mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
-
- movq mm4,mm2
- movq mm0,mm5
- pfadd mm2,mm1 ; mm2=z11
- pfadd mm5,mm3 ; mm5=z13
- pfsub mm4,mm1 ; mm4=z12
- pfsub mm0,mm3 ; mm0=z10
-
- movq mm1,mm2
- pfsub mm2,mm5
- pfadd mm1,mm5 ; mm1=tmp7
-
- pfmul mm2,[GOTOFF(ebx,PD_1_414)] ; mm2=tmp11
-
- movq mm3,mm0
- pfadd mm0,mm4
- pfmul mm0,[GOTOFF(ebx,PD_1_847)] ; mm0=z5
- pfmul mm3,[GOTOFF(ebx,PD_2_613)] ; mm3=(z10 * 2.613125930)
- pfmul mm4,[GOTOFF(ebx,PD_1_082)] ; mm4=(z12 * 1.082392200)
- pfsubr mm3,mm0 ; mm3=tmp12
- pfsub mm4,mm0 ; mm4=tmp10
-
- ; -- Final output stage
-
- pfsub mm3,mm1 ; mm3=tmp6
- movq mm5,mm6
- movq mm0,mm7
- pfadd mm6,mm1 ; mm6=data0=(00 10)
- pfadd mm7,mm3 ; mm7=data1=(01 11)
- pfsub mm5,mm1 ; mm5=data7=(07 17)
- pfsub mm0,mm3 ; mm0=data6=(06 16)
- pfsub mm2,mm3 ; mm2=tmp5
-
- movq mm1,[GOTOFF(ebx,PD_RNDINT_MAGIC)] ; mm1=[PD_RNDINT_MAGIC]
- pcmpeqd mm3,mm3
- psrld mm3,WORD_BIT ; mm3={0xFFFF 0x0000 0xFFFF 0x0000}
-
- pfadd mm6,mm1 ; mm6=roundint(data0/8)=(00 ** 10 **)
- pfadd mm7,mm1 ; mm7=roundint(data1/8)=(01 ** 11 **)
- pfadd mm0,mm1 ; mm0=roundint(data6/8)=(06 ** 16 **)
- pfadd mm5,mm1 ; mm5=roundint(data7/8)=(07 ** 17 **)
-
- pand mm6,mm3 ; mm6=(00 -- 10 --)
- pslld mm7,WORD_BIT ; mm7=(-- 01 -- 11)
- pand mm0,mm3 ; mm0=(06 -- 16 --)
- pslld mm5,WORD_BIT ; mm5=(-- 07 -- 17)
- por mm6,mm7 ; mm6=(00 01 10 11)
- por mm0,mm5 ; mm0=(06 07 16 17)
-
- movq mm1, MMWORD [wk(0)] ; mm1=tmp2
- movq mm3, MMWORD [wk(1)] ; mm3=tmp3
-
- pfadd mm4,mm2 ; mm4=tmp4
- movq mm7,mm1
- movq mm5,mm3
- pfadd mm1,mm2 ; mm1=data2=(02 12)
- pfadd mm3,mm4 ; mm3=data4=(04 14)
- pfsub mm7,mm2 ; mm7=data5=(05 15)
- pfsub mm5,mm4 ; mm5=data3=(03 13)
-
- movq mm2,[GOTOFF(ebx,PD_RNDINT_MAGIC)] ; mm2=[PD_RNDINT_MAGIC]
- pcmpeqd mm4,mm4
- psrld mm4,WORD_BIT ; mm4={0xFFFF 0x0000 0xFFFF 0x0000}
-
- pfadd mm3,mm2 ; mm3=roundint(data4/8)=(04 ** 14 **)
- pfadd mm7,mm2 ; mm7=roundint(data5/8)=(05 ** 15 **)
- pfadd mm1,mm2 ; mm1=roundint(data2/8)=(02 ** 12 **)
- pfadd mm5,mm2 ; mm5=roundint(data3/8)=(03 ** 13 **)
-
- pand mm3,mm4 ; mm3=(04 -- 14 --)
- pslld mm7,WORD_BIT ; mm7=(-- 05 -- 15)
- pand mm1,mm4 ; mm1=(02 -- 12 --)
- pslld mm5,WORD_BIT ; mm5=(-- 03 -- 13)
- por mm3,mm7 ; mm3=(04 05 14 15)
- por mm1,mm5 ; mm1=(02 03 12 13)
-
- movq mm2,[GOTOFF(ebx,PB_CENTERJSAMP)] ; mm2=[PB_CENTERJSAMP]
-
- packsswb mm6,mm3 ; mm6=(00 01 10 11 04 05 14 15)
- packsswb mm1,mm0 ; mm1=(02 03 12 13 06 07 16 17)
- paddb mm6,mm2
- paddb mm1,mm2
-
- movq mm4,mm6 ; transpose coefficients(phase 2)
- punpcklwd mm6,mm1 ; mm6=(00 01 02 03 10 11 12 13)
- punpckhwd mm4,mm1 ; mm4=(04 05 06 07 14 15 16 17)
-
- movq mm7,mm6 ; transpose coefficients(phase 3)
- punpckldq mm6,mm4 ; mm6=(00 01 02 03 04 05 06 07)
- punpckhdq mm7,mm4 ; mm7=(10 11 12 13 14 15 16 17)
-
- pushpic ebx ; save GOT address
-
- mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
- mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
- movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6
- movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7
-
- poppic ebx ; restore GOT address
-
- add esi, byte 2*SIZEOF_FAST_FLOAT ; wsptr
- add edi, byte 2*SIZEOF_JSAMPROW
- dec ecx ; ctr
- jnz near .rowloop
-
- femms ; empty MMX/3DNow! state
-
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- pop ebx
- mov esp,ebp ; esp <- aligned ebp
- pop esp ; esp <- original ebp
- pop ebp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
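Note on PD_RNDINT_MAGIC in the listing above: 100663296.0 = (float)(0x00C00000 << 3) = 3 * 2^25, and at that magnitude a float's ULP is 8. pfadd-ing the constant therefore rounds the sum to a multiple of 8 and leaves round(x/8) sitting in the low mantissa bits, which the subsequent pand/pslld/por sequence extracts as 16-bit samples; the trick absorbs the IDCT's final division by 8 for free, matching the "roundint(dataN/8)" comments. A standalone C demonstration (not part of the diff; assumes 32-bit IEEE floats and the default round-to-nearest mode):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Round x to the nearest integer and divide by 8 in a single float
     * add, mirroring the pfadd [PD_RNDINT_MAGIC] sequence above. */
    static int16_t round_div8(float x)
    {
        float y = x + 100663296.0f;   /* 3 * 2^25; ULP at this magnitude is 8 */
        uint32_t bits;
        memcpy(&bits, &y, sizeof(bits));
        return (int16_t)bits;         /* low 16 mantissa bits == round(x / 8) */
    }

    int main(void)
    {
        printf("%d %d %d\n",
               round_div8(80.0f),      /* 10 */
               round_div8(-84.0f),     /* -10: -10.5 ties to even */
               round_div8(1000.0f));   /* 125 */
        return 0;
    }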
diff --git a/media/libjpeg/simd/jidctflt-sse.asm b/media/libjpeg/simd/jidctflt-sse.asm
deleted file mode 100644
index 4d4af2fffc..0000000000
--- a/media/libjpeg/simd/jidctflt-sse.asm
+++ /dev/null
@@ -1,571 +0,0 @@
-;
-; jidctflt.asm - floating-point IDCT (SSE & MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler) and
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a floating-point implementation of the inverse DCT
-; (Discrete Cosine Transform). The following code is based directly on
-; the IJG's original jidctflt.c; see jidctflt.c for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
- shufps %1,%2,0x44
-%endmacro
-
-%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
- shufps %1,%2,0xEE
-%endmacro
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
- alignz 16
- global EXTN(jconst_idct_float_sse)
-
-EXTN(jconst_idct_float_sse):
-
-PD_1_414 times 4 dd 1.414213562373095048801689
-PD_1_847 times 4 dd 1.847759065022573512256366
-PD_1_082 times 4 dd 1.082392200292393968799446
-PD_M2_613 times 4 dd -2.613125929752753055713286
-PD_0_125 times 4 dd 0.125 ; 1/8
-PB_CENTERJSAMP times 8 db CENTERJSAMPLE
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
-;
-; Perform dequantization and inverse DCT on one block of coefficients.
-;
-; GLOBAL(void)
-; jsimd_idct_float_sse (void *dct_table, JCOEFPTR coef_block,
-; JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-%define dct_table(b) (b)+8 ; void *dct_table
-%define coef_block(b) (b)+12 ; JCOEFPTR coef_block
-%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf
-%define output_col(b) (b)+20 ; JDIMENSION output_col
-
-%define original_ebp ebp+0
-%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 2
-%define workspace wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
- ; FAST_FLOAT workspace[DCTSIZE2]
-
- align 16
- global EXTN(jsimd_idct_float_sse)
-
-EXTN(jsimd_idct_float_sse):
- push ebp
- mov eax,esp ; eax = original ebp
- sub esp, byte 4
- and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [esp],eax
- mov ebp,esp ; ebp = aligned ebp
- lea esp, [workspace]
- push ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- get_GOT ebx ; get GOT address
-
- ; ---- Pass 1: process columns from input, store into work array.
-
-; mov eax, [original_ebp]
- mov edx, POINTER [dct_table(eax)] ; quantptr
- mov esi, JCOEFPTR [coef_block(eax)] ; inptr
- lea edi, [workspace] ; FAST_FLOAT *wsptr
- mov ecx, DCTSIZE/4 ; ctr
- alignx 16,7
-.columnloop:
-%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
- mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
- or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
- jnz near .columnDCT
-
- movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
- movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
- por mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
- por mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
- por mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
- por mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
- por mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
- por mm1,mm0
- packsswb mm1,mm1
- movd eax,mm1
- test eax,eax
- jnz short .columnDCT
-
- ; -- AC terms all zero
-
- movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-
- punpckhwd mm1,mm0 ; mm1=(** 02 ** 03)
- punpcklwd mm0,mm0 ; mm0=(00 00 01 01)
- psrad mm1,(DWORD_BIT-WORD_BIT) ; mm1=in0H=(02 03)
- psrad mm0,(DWORD_BIT-WORD_BIT) ; mm0=in0L=(00 01)
- cvtpi2ps xmm3,mm1 ; xmm3=(02 03 ** **)
- cvtpi2ps xmm0,mm0 ; xmm0=(00 01 ** **)
- movlhps xmm0,xmm3 ; xmm0=in0=(00 01 02 03)
-
- mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
- movaps xmm1,xmm0
- movaps xmm2,xmm0
- movaps xmm3,xmm0
-
- shufps xmm0,xmm0,0x00 ; xmm0=(00 00 00 00)
- shufps xmm1,xmm1,0x55 ; xmm1=(01 01 01 01)
- shufps xmm2,xmm2,0xAA ; xmm2=(02 02 02 02)
- shufps xmm3,xmm3,0xFF ; xmm3=(03 03 03 03)
-
- movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
- movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
- movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1
- movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
- movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2
- movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2
- movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
- movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
- jmp near .nextcolumn
- alignx 16,7
-%endif
-.columnDCT:
-
- ; -- Even part
-
- movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
- movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
- movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
- movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-
- punpckhwd mm4,mm0 ; mm4=(** 02 ** 03)
- punpcklwd mm0,mm0 ; mm0=(00 00 01 01)
- punpckhwd mm5,mm1 ; mm5=(** 22 ** 23)
- punpcklwd mm1,mm1 ; mm1=(20 20 21 21)
-
- psrad mm4,(DWORD_BIT-WORD_BIT) ; mm4=in0H=(02 03)
- psrad mm0,(DWORD_BIT-WORD_BIT) ; mm0=in0L=(00 01)
- cvtpi2ps xmm4,mm4 ; xmm4=(02 03 ** **)
- cvtpi2ps xmm0,mm0 ; xmm0=(00 01 ** **)
- psrad mm5,(DWORD_BIT-WORD_BIT) ; mm5=in2H=(22 23)
- psrad mm1,(DWORD_BIT-WORD_BIT) ; mm1=in2L=(20 21)
- cvtpi2ps xmm5,mm5 ; xmm5=(22 23 ** **)
- cvtpi2ps xmm1,mm1 ; xmm1=(20 21 ** **)
-
- punpckhwd mm6,mm2 ; mm6=(** 42 ** 43)
- punpcklwd mm2,mm2 ; mm2=(40 40 41 41)
- punpckhwd mm7,mm3 ; mm7=(** 62 ** 63)
- punpcklwd mm3,mm3 ; mm3=(60 60 61 61)
-
- psrad mm6,(DWORD_BIT-WORD_BIT) ; mm6=in4H=(42 43)
- psrad mm2,(DWORD_BIT-WORD_BIT) ; mm2=in4L=(40 41)
- cvtpi2ps xmm6,mm6 ; xmm6=(42 43 ** **)
- cvtpi2ps xmm2,mm2 ; xmm2=(40 41 ** **)
- psrad mm7,(DWORD_BIT-WORD_BIT) ; mm7=in6H=(62 63)
- psrad mm3,(DWORD_BIT-WORD_BIT) ; mm3=in6L=(60 61)
- cvtpi2ps xmm7,mm7 ; xmm7=(62 63 ** **)
- cvtpi2ps xmm3,mm3 ; xmm3=(60 61 ** **)
-
- movlhps xmm0,xmm4 ; xmm0=in0=(00 01 02 03)
- movlhps xmm1,xmm5 ; xmm1=in2=(20 21 22 23)
- mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
- mulps xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
- movlhps xmm2,xmm6 ; xmm2=in4=(40 41 42 43)
- movlhps xmm3,xmm7 ; xmm3=in6=(60 61 62 63)
- mulps xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
- mulps xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
- movaps xmm4,xmm0
- movaps xmm5,xmm1
- subps xmm0,xmm2 ; xmm0=tmp11
- subps xmm1,xmm3
- addps xmm4,xmm2 ; xmm4=tmp10
- addps xmm5,xmm3 ; xmm5=tmp13
-
- mulps xmm1,[GOTOFF(ebx,PD_1_414)]
- subps xmm1,xmm5 ; xmm1=tmp12
-
- movaps xmm6,xmm4
- movaps xmm7,xmm0
- subps xmm4,xmm5 ; xmm4=tmp3
- subps xmm0,xmm1 ; xmm0=tmp2
- addps xmm6,xmm5 ; xmm6=tmp0
- addps xmm7,xmm1 ; xmm7=tmp1
-
- movaps XMMWORD [wk(1)], xmm4 ; tmp3
- movaps XMMWORD [wk(0)], xmm0 ; tmp2
-
- ; -- Odd part
-
- movq mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
- movq mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
- movq mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
- movq mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-
- punpckhwd mm6,mm4 ; mm6=(** 12 ** 13)
- punpcklwd mm4,mm4 ; mm4=(10 10 11 11)
- punpckhwd mm2,mm0 ; mm2=(** 32 ** 33)
- punpcklwd mm0,mm0 ; mm0=(30 30 31 31)
-
- psrad mm6,(DWORD_BIT-WORD_BIT) ; mm6=in1H=(12 13)
- psrad mm4,(DWORD_BIT-WORD_BIT) ; mm4=in1L=(10 11)
- cvtpi2ps xmm4,mm6 ; xmm4=(12 13 ** **)
- cvtpi2ps xmm2,mm4 ; xmm2=(10 11 ** **)
- psrad mm2,(DWORD_BIT-WORD_BIT) ; mm2=in3H=(32 33)
- psrad mm0,(DWORD_BIT-WORD_BIT) ; mm0=in3L=(30 31)
- cvtpi2ps xmm0,mm2 ; xmm0=(32 33 ** **)
- cvtpi2ps xmm3,mm0 ; xmm3=(30 31 ** **)
-
- punpckhwd mm7,mm5 ; mm7=(** 52 ** 53)
- punpcklwd mm5,mm5 ; mm5=(50 50 51 51)
- punpckhwd mm3,mm1 ; mm3=(** 72 ** 73)
- punpcklwd mm1,mm1 ; mm1=(70 70 71 71)
-
- movlhps xmm2,xmm4 ; xmm2=in1=(10 11 12 13)
- movlhps xmm3,xmm0 ; xmm3=in3=(30 31 32 33)
-
- psrad mm7,(DWORD_BIT-WORD_BIT) ; mm7=in5H=(52 53)
- psrad mm5,(DWORD_BIT-WORD_BIT) ; mm5=in5L=(50 51)
- cvtpi2ps xmm4,mm7 ; xmm4=(52 53 ** **)
- cvtpi2ps xmm5,mm5 ; xmm5=(50 51 ** **)
- psrad mm3,(DWORD_BIT-WORD_BIT) ; mm3=in7H=(72 73)
- psrad mm1,(DWORD_BIT-WORD_BIT) ; mm1=in7L=(70 71)
- cvtpi2ps xmm0,mm3 ; xmm0=(72 73 ** **)
- cvtpi2ps xmm1,mm1 ; xmm1=(70 71 ** **)
-
- mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
- mulps xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
- movlhps xmm5,xmm4 ; xmm5=in5=(50 51 52 53)
- movlhps xmm1,xmm0 ; xmm1=in7=(70 71 72 73)
- mulps xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
- mulps xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
- movaps xmm4,xmm2
- movaps xmm0,xmm5
- addps xmm2,xmm1 ; xmm2=z11
- addps xmm5,xmm3 ; xmm5=z13
- subps xmm4,xmm1 ; xmm4=z12
- subps xmm0,xmm3 ; xmm0=z10
-
- movaps xmm1,xmm2
- subps xmm2,xmm5
- addps xmm1,xmm5 ; xmm1=tmp7
-
- mulps xmm2,[GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11
-
- movaps xmm3,xmm0
- addps xmm0,xmm4
- mulps xmm0,[GOTOFF(ebx,PD_1_847)] ; xmm0=z5
- mulps xmm3,[GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930)
- mulps xmm4,[GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200)
- addps xmm3,xmm0 ; xmm3=tmp12
- subps xmm4,xmm0 ; xmm4=tmp10
-
- ; -- Final output stage
-
- subps xmm3,xmm1 ; xmm3=tmp6
- movaps xmm5,xmm6
- movaps xmm0,xmm7
- addps xmm6,xmm1 ; xmm6=data0=(00 01 02 03)
- addps xmm7,xmm3 ; xmm7=data1=(10 11 12 13)
- subps xmm5,xmm1 ; xmm5=data7=(70 71 72 73)
- subps xmm0,xmm3 ; xmm0=data6=(60 61 62 63)
- subps xmm2,xmm3 ; xmm2=tmp5
-
- movaps xmm1,xmm6 ; transpose coefficients(phase 1)
- unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11)
- unpckhps xmm1,xmm7 ; xmm1=(02 12 03 13)
- movaps xmm3,xmm0 ; transpose coefficients(phase 1)
- unpcklps xmm0,xmm5 ; xmm0=(60 70 61 71)
- unpckhps xmm3,xmm5 ; xmm3=(62 72 63 73)
-
- movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2
- movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3
-
- movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71)
- movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73)
-
- addps xmm4,xmm2 ; xmm4=tmp4
- movaps xmm0,xmm7
- movaps xmm3,xmm5
- addps xmm7,xmm2 ; xmm7=data2=(20 21 22 23)
- addps xmm5,xmm4 ; xmm5=data4=(40 41 42 43)
- subps xmm0,xmm2 ; xmm0=data5=(50 51 52 53)
- subps xmm3,xmm4 ; xmm3=data3=(30 31 32 33)
-
- movaps xmm2,xmm7 ; transpose coefficients(phase 1)
- unpcklps xmm7,xmm3 ; xmm7=(20 30 21 31)
- unpckhps xmm2,xmm3 ; xmm2=(22 32 23 33)
- movaps xmm4,xmm5 ; transpose coefficients(phase 1)
- unpcklps xmm5,xmm0 ; xmm5=(40 50 41 51)
- unpckhps xmm4,xmm0 ; xmm4=(42 52 43 53)
-
- movaps xmm3,xmm6 ; transpose coefficients(phase 2)
- unpcklps2 xmm6,xmm7 ; xmm6=(00 10 20 30)
- unpckhps2 xmm3,xmm7 ; xmm3=(01 11 21 31)
- movaps xmm0,xmm1 ; transpose coefficients(phase 2)
- unpcklps2 xmm1,xmm2 ; xmm1=(02 12 22 32)
- unpckhps2 xmm0,xmm2 ; xmm0=(03 13 23 33)
-
- movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71)
- movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73)
-
- movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6
- movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
- movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1
- movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0
-
- movaps xmm6,xmm5 ; transpose coefficients(phase 2)
- unpcklps2 xmm5,xmm7 ; xmm5=(40 50 60 70)
- unpckhps2 xmm6,xmm7 ; xmm6=(41 51 61 71)
- movaps xmm3,xmm4 ; transpose coefficients(phase 2)
- unpcklps2 xmm4,xmm2 ; xmm4=(42 52 62 72)
- unpckhps2 xmm3,xmm2 ; xmm3=(43 53 63 73)
-
- movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5
- movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
- movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4
- movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
-
-.nextcolumn:
- add esi, byte 4*SIZEOF_JCOEF ; coef_block
- add edx, byte 4*SIZEOF_FLOAT_MULT_TYPE ; quantptr
- add edi, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr
- dec ecx ; ctr
- jnz near .columnloop
-
- ; -- Prefetch the next coefficient block
-
- prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
- prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
- prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
- prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
-
- ; ---- Pass 2: process rows from work array, store into output array.
-
- mov eax, [original_ebp]
- lea esi, [workspace] ; FAST_FLOAT *wsptr
- mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
- mov eax, JDIMENSION [output_col(eax)]
- mov ecx, DCTSIZE/4 ; ctr
- alignx 16,7
-.rowloop:
-
- ; -- Even part
-
- movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
- movaps xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
- movaps xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
- movaps xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
-
- movaps xmm4,xmm0
- movaps xmm5,xmm1
- subps xmm0,xmm2 ; xmm0=tmp11
- subps xmm1,xmm3
- addps xmm4,xmm2 ; xmm4=tmp10
- addps xmm5,xmm3 ; xmm5=tmp13
-
- mulps xmm1,[GOTOFF(ebx,PD_1_414)]
- subps xmm1,xmm5 ; xmm1=tmp12
-
- movaps xmm6,xmm4
- movaps xmm7,xmm0
- subps xmm4,xmm5 ; xmm4=tmp3
- subps xmm0,xmm1 ; xmm0=tmp2
- addps xmm6,xmm5 ; xmm6=tmp0
- addps xmm7,xmm1 ; xmm7=tmp1
-
- movaps XMMWORD [wk(1)], xmm4 ; tmp3
- movaps XMMWORD [wk(0)], xmm0 ; tmp2
-
- ; -- Odd part
-
- movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
- movaps xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
- movaps xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
- movaps xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
-
- movaps xmm4,xmm2
- movaps xmm0,xmm5
- addps xmm2,xmm1 ; xmm2=z11
- addps xmm5,xmm3 ; xmm5=z13
- subps xmm4,xmm1 ; xmm4=z12
- subps xmm0,xmm3 ; xmm0=z10
-
- movaps xmm1,xmm2
- subps xmm2,xmm5
- addps xmm1,xmm5 ; xmm1=tmp7
-
- mulps xmm2,[GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11
-
- movaps xmm3,xmm0
- addps xmm0,xmm4
- mulps xmm0,[GOTOFF(ebx,PD_1_847)] ; xmm0=z5
- mulps xmm3,[GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930)
- mulps xmm4,[GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200)
- addps xmm3,xmm0 ; xmm3=tmp12
- subps xmm4,xmm0 ; xmm4=tmp10
-
- ; -- Final output stage
-
- subps xmm3,xmm1 ; xmm3=tmp6
- movaps xmm5,xmm6
- movaps xmm0,xmm7
- addps xmm6,xmm1 ; xmm6=data0=(00 10 20 30)
- addps xmm7,xmm3 ; xmm7=data1=(01 11 21 31)
- subps xmm5,xmm1 ; xmm5=data7=(07 17 27 37)
- subps xmm0,xmm3 ; xmm0=data6=(06 16 26 36)
- subps xmm2,xmm3 ; xmm2=tmp5
-
- movaps xmm1,[GOTOFF(ebx,PD_0_125)] ; xmm1=[PD_0_125]
-
- mulps xmm6,xmm1 ; descale(1/8)
- mulps xmm7,xmm1 ; descale(1/8)
- mulps xmm5,xmm1 ; descale(1/8)
- mulps xmm0,xmm1 ; descale(1/8)
-
- movhlps xmm3,xmm6
- movhlps xmm1,xmm7
- cvtps2pi mm0,xmm6 ; round to int32, mm0=data0L=(00 10)
- cvtps2pi mm1,xmm7 ; round to int32, mm1=data1L=(01 11)
- cvtps2pi mm2,xmm3 ; round to int32, mm2=data0H=(20 30)
- cvtps2pi mm3,xmm1 ; round to int32, mm3=data1H=(21 31)
- packssdw mm0,mm2 ; mm0=data0=(00 10 20 30)
- packssdw mm1,mm3 ; mm1=data1=(01 11 21 31)
-
- movhlps xmm6,xmm5
- movhlps xmm7,xmm0
- cvtps2pi mm4,xmm5 ; round to int32, mm4=data7L=(07 17)
- cvtps2pi mm5,xmm0 ; round to int32, mm5=data6L=(06 16)
- cvtps2pi mm6,xmm6 ; round to int32, mm6=data7H=(27 37)
- cvtps2pi mm7,xmm7 ; round to int32, mm7=data6H=(26 36)
- packssdw mm4,mm6 ; mm4=data7=(07 17 27 37)
- packssdw mm5,mm7 ; mm5=data6=(06 16 26 36)
-
- packsswb mm0,mm5 ; mm0=(00 10 20 30 06 16 26 36)
- packsswb mm1,mm4 ; mm1=(01 11 21 31 07 17 27 37)
-
- movaps xmm3, XMMWORD [wk(0)] ; xmm3=tmp2
- movaps xmm1, XMMWORD [wk(1)] ; xmm1=tmp3
-
- movaps xmm6,[GOTOFF(ebx,PD_0_125)] ; xmm6=[PD_0_125]
-
- addps xmm4,xmm2 ; xmm4=tmp4
- movaps xmm5,xmm3
- movaps xmm0,xmm1
- addps xmm3,xmm2 ; xmm3=data2=(02 12 22 32)
- addps xmm1,xmm4 ; xmm1=data4=(04 14 24 34)
- subps xmm5,xmm2 ; xmm5=data5=(05 15 25 35)
- subps xmm0,xmm4 ; xmm0=data3=(03 13 23 33)
-
- mulps xmm3,xmm6 ; descale(1/8)
- mulps xmm1,xmm6 ; descale(1/8)
- mulps xmm5,xmm6 ; descale(1/8)
- mulps xmm0,xmm6 ; descale(1/8)
-
- movhlps xmm7,xmm3
- movhlps xmm2,xmm1
- cvtps2pi mm2,xmm3 ; round to int32, mm2=data2L=(02 12)
- cvtps2pi mm3,xmm1 ; round to int32, mm3=data4L=(04 14)
- cvtps2pi mm6,xmm7 ; round to int32, mm6=data2H=(22 32)
- cvtps2pi mm7,xmm2 ; round to int32, mm7=data4H=(24 34)
- packssdw mm2,mm6 ; mm2=data2=(02 12 22 32)
- packssdw mm3,mm7 ; mm3=data4=(04 14 24 34)
-
- movhlps xmm4,xmm5
- movhlps xmm6,xmm0
- cvtps2pi mm5,xmm5 ; round to int32, mm5=data5L=(05 15)
- cvtps2pi mm4,xmm0 ; round to int32, mm4=data3L=(03 13)
- cvtps2pi mm6,xmm4 ; round to int32, mm6=data5H=(25 35)
- cvtps2pi mm7,xmm6 ; round to int32, mm7=data3H=(23 33)
- packssdw mm5,mm6 ; mm5=data5=(05 15 25 35)
- packssdw mm4,mm7 ; mm4=data3=(03 13 23 33)
-
- movq mm6,[GOTOFF(ebx,PB_CENTERJSAMP)] ; mm6=[PB_CENTERJSAMP]
-
- packsswb mm2,mm3 ; mm2=(02 12 22 32 04 14 24 34)
- packsswb mm4,mm5 ; mm4=(03 13 23 33 05 15 25 35)
-
- paddb mm0,mm6
- paddb mm1,mm6
- paddb mm2,mm6
- paddb mm4,mm6
-
- movq mm7,mm0 ; transpose coefficients(phase 1)
- punpcklbw mm0,mm1 ; mm0=(00 01 10 11 20 21 30 31)
- punpckhbw mm7,mm1 ; mm7=(06 07 16 17 26 27 36 37)
- movq mm3,mm2 ; transpose coefficients(phase 1)
- punpcklbw mm2,mm4 ; mm2=(02 03 12 13 22 23 32 33)
- punpckhbw mm3,mm4 ; mm3=(04 05 14 15 24 25 34 35)
-
- movq mm5,mm0 ; transpose coefficients(phase 2)
- punpcklwd mm0,mm2 ; mm0=(00 01 02 03 10 11 12 13)
- punpckhwd mm5,mm2 ; mm5=(20 21 22 23 30 31 32 33)
- movq mm6,mm3 ; transpose coefficients(phase 2)
- punpcklwd mm3,mm7 ; mm3=(04 05 06 07 14 15 16 17)
- punpckhwd mm6,mm7 ; mm6=(24 25 26 27 34 35 36 37)
-
- movq mm1,mm0 ; transpose coefficients(phase 3)
- punpckldq mm0,mm3 ; mm0=(00 01 02 03 04 05 06 07)
- punpckhdq mm1,mm3 ; mm1=(10 11 12 13 14 15 16 17)
- movq mm4,mm5 ; transpose coefficients(phase 3)
- punpckldq mm5,mm6 ; mm5=(20 21 22 23 24 25 26 27)
- punpckhdq mm4,mm6 ; mm4=(30 31 32 33 34 35 36 37)
-
- pushpic ebx ; save GOT address
-
- mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
- mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
- movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm0
- movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1
- mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
- mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
- movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5
- movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm4
-
- poppic ebx ; restore GOT address
-
- add esi, byte 4*SIZEOF_FAST_FLOAT ; wsptr
- add edi, byte 4*SIZEOF_JSAMPROW
- dec ecx ; ctr
- jnz near .rowloop
-
- emms ; empty MMX state
-
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- pop ebx
- mov esp,ebp ; esp <- aligned ebp
- pop esp ; esp <- original ebp
- pop ebp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
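Editorial note on the files deleted here: the three jidctflt-*.asm files all vectorize the same AAN floating-point butterfly from the scalar jidctflt.c that their headers cite, four columns or rows per iteration. As a reading aid, the following is a hedged scalar C sketch of one 8-point 1-D pass, a paraphrase rather than the library's source; the intermediate names are chosen to match the register comments above (tmp10..tmp13, z5, z10..z13).

    /* Scalar sketch (not the library's source) of one 8-point AAN float IDCT
     * pass, matching the register comments in the asm above.  in[] is one
     * dequantized column; out[] receives the transformed column. */
    static void idct_1d_float_sketch(const float in[8], float out[8])
    {
      /* -- Even part */
      float tmp10 = in[0] + in[4];
      float tmp11 = in[0] - in[4];
      float tmp13 = in[2] + in[6];
      float tmp12 = (in[2] - in[6]) * 1.414213562f - tmp13;  /* PD_1_414 */

      float tmp0 = tmp10 + tmp13, tmp3 = tmp10 - tmp13;
      float tmp1 = tmp11 + tmp12, tmp2 = tmp11 - tmp12;

      /* -- Odd part */
      float z11 = in[1] + in[7], z12 = in[1] - in[7];
      float z13 = in[5] + in[3], z10 = in[5] - in[3];

      float tmp7 = z11 + z13;
      float t11  = (z11 - z13) * 1.414213562f;               /* PD_1_414  */
      float z5   = (z10 + z12) * 1.847759065f;               /* PD_1_847  */
      float t12  = -2.613125930f * z10 + z5;                 /* PD_M2_613 */
      float t10  = 1.082392200f * z12 - z5;                  /* PD_1_082  */

      float tmp6 = t12 - tmp7;
      float tmp5 = t11 - tmp6;
      float tmp4 = t10 + tmp5;

      /* -- Final output stage */
      out[0] = tmp0 + tmp7;  out[7] = tmp0 - tmp7;
      out[1] = tmp1 + tmp6;  out[6] = tmp1 - tmp6;
      out[2] = tmp2 + tmp5;  out[5] = tmp2 - tmp5;
      out[4] = tmp3 + tmp4;  out[3] = tmp3 - tmp4;
    }

Dequantization happens before this butterfly (the mulps against the FLOAT_MULT_TYPE table), the transpose is interleaved between the two passes, and pass 2 additionally scales by 1/8 (PD_0_125 in the file above, PD_RNDINT_MAGIC in the SSE2 variants below) and re-centers samples with PB_CENTERJSAMP.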
diff --git a/media/libjpeg/simd/jidctflt-sse2-64.asm b/media/libjpeg/simd/jidctflt-sse2-64.asm
deleted file mode 100644
index bdda05d97c..0000000000
--- a/media/libjpeg/simd/jidctflt-sse2-64.asm
+++ /dev/null
@@ -1,482 +0,0 @@
-;
-; jidctflt.asm - floating-point IDCT (64-bit SSE & SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler); it can
-; *not* be assembled with Microsoft's MASM or any compatible assembler
-; (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a floating-point implementation of the inverse DCT
-; (Discrete Cosine Transform). The following code is based directly on
-; the IJG's original jidctflt.c; see that file for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
- shufps %1,%2,0x44
-%endmacro
-
-%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
- shufps %1,%2,0xEE
-%endmacro
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
- alignz 16
- global EXTN(jconst_idct_float_sse2)
-
-EXTN(jconst_idct_float_sse2):
-
-PD_1_414 times 4 dd 1.414213562373095048801689
-PD_1_847 times 4 dd 1.847759065022573512256366
-PD_1_082 times 4 dd 1.082392200292393968799446
-PD_M2_613 times 4 dd -2.613125929752753055713286
-PD_RNDINT_MAGIC times 4 dd 100663296.0 ; (float)(0x00C00000 << 3)
-PB_CENTERJSAMP times 16 db CENTERJSAMPLE
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 64
-;
-; Perform dequantization and inverse DCT on one block of coefficients.
-;
-; GLOBAL(void)
-; jsimd_idct_float_sse2 (void *dct_table, JCOEFPTR coef_block,
-; JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-; r10 = void *dct_table
-; r11 = JCOEFPTR coef_block
-; r12 = JSAMPARRAY output_buf
-; r13 = JDIMENSION output_col
-
-%define original_rbp rbp+0
-%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 2
-%define workspace wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
- ; FAST_FLOAT workspace[DCTSIZE2]
-
- align 16
- global EXTN(jsimd_idct_float_sse2)
-
-EXTN(jsimd_idct_float_sse2):
- push rbp
- mov rax,rsp ; rax = original rbp
- sub rsp, byte 4
- and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [rsp],rax
- mov rbp,rsp ; rbp = aligned rbp
- lea rsp, [workspace]
- collect_args
- push rbx
-
- ; ---- Pass 1: process columns from input, store into work array.
-
- mov rdx, r10 ; quantptr
- mov rsi, r11 ; inptr
- lea rdi, [workspace] ; FAST_FLOAT *wsptr
- mov rcx, DCTSIZE/4 ; ctr
-.columnloop:
-%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
- mov eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
- or eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
- jnz near .columnDCT
-
- movq xmm1, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
- movq xmm2, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
- movq xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
- movq xmm4, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
- movq xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
- movq xmm6, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
- movq xmm7, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
- por xmm1,xmm2
- por xmm3,xmm4
- por xmm5,xmm6
- por xmm1,xmm3
- por xmm5,xmm7
- por xmm1,xmm5
- packsswb xmm1,xmm1
- movd eax,xmm1
- test rax,rax
- jnz short .columnDCT
-
- ; -- AC terms all zero
-
- movq xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
-
- punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
- psrad xmm0,(DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)
- cvtdq2ps xmm0,xmm0 ; xmm0=in0=(00 01 02 03)
-
- mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
-
- movaps xmm1,xmm0
- movaps xmm2,xmm0
- movaps xmm3,xmm0
-
- shufps xmm0,xmm0,0x00 ; xmm0=(00 00 00 00)
- shufps xmm1,xmm1,0x55 ; xmm1=(01 01 01 01)
- shufps xmm2,xmm2,0xAA ; xmm2=(02 02 02 02)
- shufps xmm3,xmm3,0xFF ; xmm3=(03 03 03 03)
-
- movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
- movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
- movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
- movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
- movaps XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
- movaps XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm2
- movaps XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
- movaps XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
- jmp near .nextcolumn
-%endif
-.columnDCT:
-
- ; -- Even part
-
- movq xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
- movq xmm1, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
- movq xmm2, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
- movq xmm3, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
-
- punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
- punpcklwd xmm1,xmm1 ; xmm1=(20 20 21 21 22 22 23 23)
- psrad xmm0,(DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)
- psrad xmm1,(DWORD_BIT-WORD_BIT) ; xmm1=in2=(20 21 22 23)
- cvtdq2ps xmm0,xmm0 ; xmm0=in0=(00 01 02 03)
- cvtdq2ps xmm1,xmm1 ; xmm1=in2=(20 21 22 23)
-
- punpcklwd xmm2,xmm2 ; xmm2=(40 40 41 41 42 42 43 43)
- punpcklwd xmm3,xmm3 ; xmm3=(60 60 61 61 62 62 63 63)
- psrad xmm2,(DWORD_BIT-WORD_BIT) ; xmm2=in4=(40 41 42 43)
- psrad xmm3,(DWORD_BIT-WORD_BIT) ; xmm3=in6=(60 61 62 63)
- cvtdq2ps xmm2,xmm2 ; xmm2=in4=(40 41 42 43)
- cvtdq2ps xmm3,xmm3 ; xmm3=in6=(60 61 62 63)
-
- mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
- mulps xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
- mulps xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
- mulps xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
-
- movaps xmm4,xmm0
- movaps xmm5,xmm1
- subps xmm0,xmm2 ; xmm0=tmp11
- subps xmm1,xmm3
- addps xmm4,xmm2 ; xmm4=tmp10
- addps xmm5,xmm3 ; xmm5=tmp13
-
- mulps xmm1,[rel PD_1_414]
- subps xmm1,xmm5 ; xmm1=tmp12
-
- movaps xmm6,xmm4
- movaps xmm7,xmm0
- subps xmm4,xmm5 ; xmm4=tmp3
- subps xmm0,xmm1 ; xmm0=tmp2
- addps xmm6,xmm5 ; xmm6=tmp0
- addps xmm7,xmm1 ; xmm7=tmp1
-
- movaps XMMWORD [wk(1)], xmm4 ; tmp3
- movaps XMMWORD [wk(0)], xmm0 ; tmp2
-
- ; -- Odd part
-
- movq xmm2, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
- movq xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
- movq xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
- movq xmm1, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
-
- punpcklwd xmm2,xmm2 ; xmm2=(10 10 11 11 12 12 13 13)
- punpcklwd xmm3,xmm3 ; xmm3=(30 30 31 31 32 32 33 33)
- psrad xmm2,(DWORD_BIT-WORD_BIT) ; xmm2=in1=(10 11 12 13)
- psrad xmm3,(DWORD_BIT-WORD_BIT) ; xmm3=in3=(30 31 32 33)
- cvtdq2ps xmm2,xmm2 ; xmm2=in1=(10 11 12 13)
- cvtdq2ps xmm3,xmm3 ; xmm3=in3=(30 31 32 33)
-
- punpcklwd xmm5,xmm5 ; xmm5=(50 50 51 51 52 52 53 53)
- punpcklwd xmm1,xmm1 ; xmm1=(70 70 71 71 72 72 73 73)
- psrad xmm5,(DWORD_BIT-WORD_BIT) ; xmm5=in5=(50 51 52 53)
- psrad xmm1,(DWORD_BIT-WORD_BIT) ; xmm1=in7=(70 71 72 73)
- cvtdq2ps xmm5,xmm5 ; xmm5=in5=(50 51 52 53)
- cvtdq2ps xmm1,xmm1 ; xmm1=in7=(70 71 72 73)
-
- mulps xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
- mulps xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
- mulps xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
- mulps xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
-
- movaps xmm4,xmm2
- movaps xmm0,xmm5
- addps xmm2,xmm1 ; xmm2=z11
- addps xmm5,xmm3 ; xmm5=z13
- subps xmm4,xmm1 ; xmm4=z12
- subps xmm0,xmm3 ; xmm0=z10
-
- movaps xmm1,xmm2
- subps xmm2,xmm5
- addps xmm1,xmm5 ; xmm1=tmp7
-
- mulps xmm2,[rel PD_1_414] ; xmm2=tmp11
-
- movaps xmm3,xmm0
- addps xmm0,xmm4
- mulps xmm0,[rel PD_1_847] ; xmm0=z5
- mulps xmm3,[rel PD_M2_613] ; xmm3=(z10 * -2.613125930)
- mulps xmm4,[rel PD_1_082] ; xmm4=(z12 * 1.082392200)
- addps xmm3,xmm0 ; xmm3=tmp12
- subps xmm4,xmm0 ; xmm4=tmp10
-
- ; -- Final output stage
-
- subps xmm3,xmm1 ; xmm3=tmp6
- movaps xmm5,xmm6
- movaps xmm0,xmm7
- addps xmm6,xmm1 ; xmm6=data0=(00 01 02 03)
- addps xmm7,xmm3 ; xmm7=data1=(10 11 12 13)
- subps xmm5,xmm1 ; xmm5=data7=(70 71 72 73)
- subps xmm0,xmm3 ; xmm0=data6=(60 61 62 63)
- subps xmm2,xmm3 ; xmm2=tmp5
-
- movaps xmm1,xmm6 ; transpose coefficients(phase 1)
- unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11)
- unpckhps xmm1,xmm7 ; xmm1=(02 12 03 13)
- movaps xmm3,xmm0 ; transpose coefficients(phase 1)
- unpcklps xmm0,xmm5 ; xmm0=(60 70 61 71)
- unpckhps xmm3,xmm5 ; xmm3=(62 72 63 73)
-
- movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2
- movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3
-
- movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71)
- movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73)
-
- addps xmm4,xmm2 ; xmm4=tmp4
- movaps xmm0,xmm7
- movaps xmm3,xmm5
- addps xmm7,xmm2 ; xmm7=data2=(20 21 22 23)
- addps xmm5,xmm4 ; xmm5=data4=(40 41 42 43)
- subps xmm0,xmm2 ; xmm0=data5=(50 51 52 53)
- subps xmm3,xmm4 ; xmm3=data3=(30 31 32 33)
-
- movaps xmm2,xmm7 ; transpose coefficients(phase 1)
- unpcklps xmm7,xmm3 ; xmm7=(20 30 21 31)
- unpckhps xmm2,xmm3 ; xmm2=(22 32 23 33)
- movaps xmm4,xmm5 ; transpose coefficients(phase 1)
- unpcklps xmm5,xmm0 ; xmm5=(40 50 41 51)
- unpckhps xmm4,xmm0 ; xmm4=(42 52 43 53)
-
- movaps xmm3,xmm6 ; transpose coefficients(phase 2)
- unpcklps2 xmm6,xmm7 ; xmm6=(00 10 20 30)
- unpckhps2 xmm3,xmm7 ; xmm3=(01 11 21 31)
- movaps xmm0,xmm1 ; transpose coefficients(phase 2)
- unpcklps2 xmm1,xmm2 ; xmm1=(02 12 22 32)
- unpckhps2 xmm0,xmm2 ; xmm0=(03 13 23 33)
-
- movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71)
- movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73)
-
- movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm6
- movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
- movaps XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
- movaps XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
-
- movaps xmm6,xmm5 ; transpose coefficients(phase 2)
- unpcklps2 xmm5,xmm7 ; xmm5=(40 50 60 70)
- unpckhps2 xmm6,xmm7 ; xmm6=(41 51 61 71)
- movaps xmm3,xmm4 ; transpose coefficients(phase 2)
- unpcklps2 xmm4,xmm2 ; xmm4=(42 52 62 72)
- unpckhps2 xmm3,xmm2 ; xmm3=(43 53 63 73)
-
- movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm5
- movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm6
- movaps XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm4
- movaps XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
-
-.nextcolumn:
- add rsi, byte 4*SIZEOF_JCOEF ; coef_block
- add rdx, byte 4*SIZEOF_FLOAT_MULT_TYPE ; quantptr
- add rdi, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr
- dec rcx ; ctr
- jnz near .columnloop
-
- ; -- Prefetch the next coefficient block
-
- prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
- prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
- prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
- prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
-
- ; ---- Pass 2: process rows from work array, store into output array.
-
- mov rax, [original_rbp]
- lea rsi, [workspace] ; FAST_FLOAT *wsptr
- mov rdi, r12 ; (JSAMPROW *)
- mov eax, r13d
- mov rcx, DCTSIZE/4 ; ctr
-.rowloop:
-
- ; -- Even part
-
- movaps xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
- movaps xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_FAST_FLOAT)]
- movaps xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_FAST_FLOAT)]
- movaps xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_FAST_FLOAT)]
-
- movaps xmm4,xmm0
- movaps xmm5,xmm1
- subps xmm0,xmm2 ; xmm0=tmp11
- subps xmm1,xmm3
- addps xmm4,xmm2 ; xmm4=tmp10
- addps xmm5,xmm3 ; xmm5=tmp13
-
- mulps xmm1,[rel PD_1_414]
- subps xmm1,xmm5 ; xmm1=tmp12
-
- movaps xmm6,xmm4
- movaps xmm7,xmm0
- subps xmm4,xmm5 ; xmm4=tmp3
- subps xmm0,xmm1 ; xmm0=tmp2
- addps xmm6,xmm5 ; xmm6=tmp0
- addps xmm7,xmm1 ; xmm7=tmp1
-
- movaps XMMWORD [wk(1)], xmm4 ; tmp3
- movaps XMMWORD [wk(0)], xmm0 ; tmp2
-
- ; -- Odd part
-
- movaps xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
- movaps xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_FAST_FLOAT)]
- movaps xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_FAST_FLOAT)]
- movaps xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_FAST_FLOAT)]
-
- movaps xmm4,xmm2
- movaps xmm0,xmm5
- addps xmm2,xmm1 ; xmm2=z11
- addps xmm5,xmm3 ; xmm5=z13
- subps xmm4,xmm1 ; xmm4=z12
- subps xmm0,xmm3 ; xmm0=z10
-
- movaps xmm1,xmm2
- subps xmm2,xmm5
- addps xmm1,xmm5 ; xmm1=tmp7
-
- mulps xmm2,[rel PD_1_414] ; xmm2=tmp11
-
- movaps xmm3,xmm0
- addps xmm0,xmm4
- mulps xmm0,[rel PD_1_847] ; xmm0=z5
- mulps xmm3,[rel PD_M2_613] ; xmm3=(z10 * -2.613125930)
- mulps xmm4,[rel PD_1_082] ; xmm4=(z12 * 1.082392200)
- addps xmm3,xmm0 ; xmm3=tmp12
- subps xmm4,xmm0 ; xmm4=tmp10
-
- ; -- Final output stage
-
- subps xmm3,xmm1 ; xmm3=tmp6
- movaps xmm5,xmm6
- movaps xmm0,xmm7
- addps xmm6,xmm1 ; xmm6=data0=(00 10 20 30)
- addps xmm7,xmm3 ; xmm7=data1=(01 11 21 31)
- subps xmm5,xmm1 ; xmm5=data7=(07 17 27 37)
- subps xmm0,xmm3 ; xmm0=data6=(06 16 26 36)
- subps xmm2,xmm3 ; xmm2=tmp5
-
- movaps xmm1,[rel PD_RNDINT_MAGIC] ; xmm1=[rel PD_RNDINT_MAGIC]
- pcmpeqd xmm3,xmm3
- psrld xmm3,WORD_BIT ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
-
- addps xmm6,xmm1 ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
- addps xmm7,xmm1 ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
- addps xmm0,xmm1 ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
- addps xmm5,xmm1 ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
-
- pand xmm6,xmm3 ; xmm6=(00 -- 10 -- 20 -- 30 --)
- pslld xmm7,WORD_BIT ; xmm7=(-- 01 -- 11 -- 21 -- 31)
- pand xmm0,xmm3 ; xmm0=(06 -- 16 -- 26 -- 36 --)
- pslld xmm5,WORD_BIT ; xmm5=(-- 07 -- 17 -- 27 -- 37)
- por xmm6,xmm7 ; xmm6=(00 01 10 11 20 21 30 31)
- por xmm0,xmm5 ; xmm0=(06 07 16 17 26 27 36 37)
-
- movaps xmm1, XMMWORD [wk(0)] ; xmm1=tmp2
- movaps xmm3, XMMWORD [wk(1)] ; xmm3=tmp3
-
- addps xmm4,xmm2 ; xmm4=tmp4
- movaps xmm7,xmm1
- movaps xmm5,xmm3
- addps xmm1,xmm2 ; xmm1=data2=(02 12 22 32)
- addps xmm3,xmm4 ; xmm3=data4=(04 14 24 34)
- subps xmm7,xmm2 ; xmm7=data5=(05 15 25 35)
- subps xmm5,xmm4 ; xmm5=data3=(03 13 23 33)
-
- movaps xmm2,[rel PD_RNDINT_MAGIC] ; xmm2=[rel PD_RNDINT_MAGIC]
- pcmpeqd xmm4,xmm4
- psrld xmm4,WORD_BIT ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
-
- addps xmm3,xmm2 ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
- addps xmm7,xmm2 ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
- addps xmm1,xmm2 ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
- addps xmm5,xmm2 ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
-
- pand xmm3,xmm4 ; xmm3=(04 -- 14 -- 24 -- 34 --)
- pslld xmm7,WORD_BIT ; xmm7=(-- 05 -- 15 -- 25 -- 35)
- pand xmm1,xmm4 ; xmm1=(02 -- 12 -- 22 -- 32 --)
- pslld xmm5,WORD_BIT ; xmm5=(-- 03 -- 13 -- 23 -- 33)
- por xmm3,xmm7 ; xmm3=(04 05 14 15 24 25 34 35)
- por xmm1,xmm5 ; xmm1=(02 03 12 13 22 23 32 33)
-
- movdqa xmm2,[rel PB_CENTERJSAMP] ; xmm2=[rel PB_CENTERJSAMP]
-
- packsswb xmm6,xmm3 ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
- packsswb xmm1,xmm0 ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
- paddb xmm6,xmm2
- paddb xmm1,xmm2
-
- movdqa xmm4,xmm6 ; transpose coefficients(phase 2)
- punpcklwd xmm6,xmm1 ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
- punpckhwd xmm4,xmm1 ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
-
- movdqa xmm7,xmm6 ; transpose coefficients(phase 3)
- punpckldq xmm6,xmm4 ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
- punpckhdq xmm7,xmm4 ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
-
- pshufd xmm5,xmm6,0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
- pshufd xmm3,xmm7,0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
-
- mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
- mov rbx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
- movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
- movq XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm7
- mov rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
- mov rbx, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
- movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
- movq XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm3
-
- add rsi, byte 4*SIZEOF_FAST_FLOAT ; wsptr
- add rdi, byte 4*SIZEOF_JSAMPROW
- dec rcx ; ctr
- jnz near .rowloop
-
- pop rbx
- uncollect_args
- mov rsp,rbp ; rsp <- aligned rbp
- pop rsp ; rsp <- original rbp
- pop rbp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
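Editorial note: unlike the SSE/MMX file above, which converts floats back to integers with cvtps2pi, the two SSE2 variants in this patch round and descale in one step with PD_RNDINT_MAGIC. A hedged C sketch of the trick follows; the helper name is mine, not the library's.

    #include <stdint.h>
    #include <string.h>

    /* Adding 1.5 * 2^26 (= (float)(0x00C00000 << 3)) makes the float adder's
     * nearest-even rounding quantize x to a multiple of 8; the low 16 bits of
     * the sum's bit pattern are then round(x/8) in two's complement, which
     * the pand/pslld/por sequences above extract directly. */
    static int16_t round_div8(float x)     /* hypothetical helper name */
    {
      const float magic = 100663296.0f;    /* PD_RNDINT_MAGIC */
      float y = x + magic;
      uint32_t bits;
      memcpy(&bits, &y, sizeof(bits));     /* reinterpret the bits, no convert */
      return (int16_t)(bits & 0xFFFFu);    /* low word == round(x / 8) */
    }

For example, round_div8(17.0f) yields 2 and round_div8(-17.0f) yields -2; exact halfway cases round to even under the default MXCSR rounding mode, which the asm assumes.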
diff --git a/media/libjpeg/simd/jidctflt-sse2.asm b/media/libjpeg/simd/jidctflt-sse2.asm
deleted file mode 100644
index a15a9c1111..0000000000
--- a/media/libjpeg/simd/jidctflt-sse2.asm
+++ /dev/null
@@ -1,497 +0,0 @@
-;
-; jidctflt.asm - floating-point IDCT (SSE & SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler); it can
-; *not* be assembled with Microsoft's MASM or any compatible assembler
-; (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a floating-point implementation of the inverse DCT
-; (Discrete Cosine Transform). The following code is based directly on
-; the IJG's original jidctflt.c; see that file for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
- shufps %1,%2,0x44
-%endmacro
-
-%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
- shufps %1,%2,0xEE
-%endmacro
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
- alignz 16
- global EXTN(jconst_idct_float_sse2)
-
-EXTN(jconst_idct_float_sse2):
-
-PD_1_414 times 4 dd 1.414213562373095048801689
-PD_1_847 times 4 dd 1.847759065022573512256366
-PD_1_082 times 4 dd 1.082392200292393968799446
-PD_M2_613 times 4 dd -2.613125929752753055713286
-PD_RNDINT_MAGIC times 4 dd 100663296.0 ; (float)(0x00C00000 << 3)
-PB_CENTERJSAMP times 16 db CENTERJSAMPLE
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
-;
-; Perform dequantization and inverse DCT on one block of coefficients.
-;
-; GLOBAL(void)
-; jsimd_idct_float_sse2 (void *dct_table, JCOEFPTR coef_block,
-; JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-%define dct_table(b) (b)+8 ; void *dct_table
-%define coef_block(b) (b)+12 ; JCOEFPTR coef_block
-%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf
-%define output_col(b) (b)+20 ; JDIMENSION output_col
-
-%define original_ebp ebp+0
-%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 2
-%define workspace wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
- ; FAST_FLOAT workspace[DCTSIZE2]
-
- align 16
- global EXTN(jsimd_idct_float_sse2)
-
-EXTN(jsimd_idct_float_sse2):
- push ebp
- mov eax,esp ; eax = original ebp
- sub esp, byte 4
- and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [esp],eax
- mov ebp,esp ; ebp = aligned ebp
- lea esp, [workspace]
- push ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- get_GOT ebx ; get GOT address
-
- ; ---- Pass 1: process columns from input, store into work array.
-
-; mov eax, [original_ebp]
- mov edx, POINTER [dct_table(eax)] ; quantptr
- mov esi, JCOEFPTR [coef_block(eax)] ; inptr
- lea edi, [workspace] ; FAST_FLOAT *wsptr
- mov ecx, DCTSIZE/4 ; ctr
- alignx 16,7
-.columnloop:
-%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
- mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
- or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
- jnz near .columnDCT
-
- movq xmm1, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
- movq xmm2, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
- movq xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
- movq xmm4, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
- movq xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
- movq xmm6, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
- movq xmm7, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
- por xmm1,xmm2
- por xmm3,xmm4
- por xmm5,xmm6
- por xmm1,xmm3
- por xmm5,xmm7
- por xmm1,xmm5
- packsswb xmm1,xmm1
- movd eax,xmm1
- test eax,eax
- jnz short .columnDCT
-
- ; -- AC terms all zero
-
- movq xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-
- punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
- psrad xmm0,(DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)
- cvtdq2ps xmm0,xmm0 ; xmm0=in0=(00 01 02 03)
-
- mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
- movaps xmm1,xmm0
- movaps xmm2,xmm0
- movaps xmm3,xmm0
-
- shufps xmm0,xmm0,0x00 ; xmm0=(00 00 00 00)
- shufps xmm1,xmm1,0x55 ; xmm1=(01 01 01 01)
- shufps xmm2,xmm2,0xAA ; xmm2=(02 02 02 02)
- shufps xmm3,xmm3,0xFF ; xmm3=(03 03 03 03)
-
- movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
- movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
- movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1
- movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
- movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2
- movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2
- movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
- movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
- jmp near .nextcolumn
- alignx 16,7
-%endif
-.columnDCT:
-
- ; -- Even part
-
- movq xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
- movq xmm1, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
- movq xmm2, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
- movq xmm3, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-
- punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
- punpcklwd xmm1,xmm1 ; xmm1=(20 20 21 21 22 22 23 23)
- psrad xmm0,(DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)
- psrad xmm1,(DWORD_BIT-WORD_BIT) ; xmm1=in2=(20 21 22 23)
- cvtdq2ps xmm0,xmm0 ; xmm0=in0=(00 01 02 03)
- cvtdq2ps xmm1,xmm1 ; xmm1=in2=(20 21 22 23)
-
- punpcklwd xmm2,xmm2 ; xmm2=(40 40 41 41 42 42 43 43)
- punpcklwd xmm3,xmm3 ; xmm3=(60 60 61 61 62 62 63 63)
- psrad xmm2,(DWORD_BIT-WORD_BIT) ; xmm2=in4=(40 41 42 43)
- psrad xmm3,(DWORD_BIT-WORD_BIT) ; xmm3=in6=(60 61 62 63)
- cvtdq2ps xmm2,xmm2 ; xmm2=in4=(40 41 42 43)
- cvtdq2ps xmm3,xmm3 ; xmm3=in6=(60 61 62 63)
-
- mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
- mulps xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
- mulps xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
- mulps xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
- movaps xmm4,xmm0
- movaps xmm5,xmm1
- subps xmm0,xmm2 ; xmm0=tmp11
- subps xmm1,xmm3
- addps xmm4,xmm2 ; xmm4=tmp10
- addps xmm5,xmm3 ; xmm5=tmp13
-
- mulps xmm1,[GOTOFF(ebx,PD_1_414)]
- subps xmm1,xmm5 ; xmm1=tmp12
-
- movaps xmm6,xmm4
- movaps xmm7,xmm0
- subps xmm4,xmm5 ; xmm4=tmp3
- subps xmm0,xmm1 ; xmm0=tmp2
- addps xmm6,xmm5 ; xmm6=tmp0
- addps xmm7,xmm1 ; xmm7=tmp1
-
- movaps XMMWORD [wk(1)], xmm4 ; tmp3
- movaps XMMWORD [wk(0)], xmm0 ; tmp2
-
- ; -- Odd part
-
- movq xmm2, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
- movq xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
- movq xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
- movq xmm1, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-
- punpcklwd xmm2,xmm2 ; xmm2=(10 10 11 11 12 12 13 13)
- punpcklwd xmm3,xmm3 ; xmm3=(30 30 31 31 32 32 33 33)
- psrad xmm2,(DWORD_BIT-WORD_BIT) ; xmm2=in1=(10 11 12 13)
- psrad xmm3,(DWORD_BIT-WORD_BIT) ; xmm3=in3=(30 31 32 33)
- cvtdq2ps xmm2,xmm2 ; xmm2=in1=(10 11 12 13)
- cvtdq2ps xmm3,xmm3 ; xmm3=in3=(30 31 32 33)
-
- punpcklwd xmm5,xmm5 ; xmm5=(50 50 51 51 52 52 53 53)
- punpcklwd xmm1,xmm1 ; xmm1=(70 70 71 71 72 72 73 73)
- psrad xmm5,(DWORD_BIT-WORD_BIT) ; xmm5=in5=(50 51 52 53)
- psrad xmm1,(DWORD_BIT-WORD_BIT) ; xmm1=in7=(70 71 72 73)
- cvtdq2ps xmm5,xmm5 ; xmm5=in5=(50 51 52 53)
- cvtdq2ps xmm1,xmm1 ; xmm1=in7=(70 71 72 73)
-
- mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
- mulps xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
- mulps xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
- mulps xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
- movaps xmm4,xmm2
- movaps xmm0,xmm5
- addps xmm2,xmm1 ; xmm2=z11
- addps xmm5,xmm3 ; xmm5=z13
- subps xmm4,xmm1 ; xmm4=z12
- subps xmm0,xmm3 ; xmm0=z10
-
- movaps xmm1,xmm2
- subps xmm2,xmm5
- addps xmm1,xmm5 ; xmm1=tmp7
-
- mulps xmm2,[GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11
-
- movaps xmm3,xmm0
- addps xmm0,xmm4
- mulps xmm0,[GOTOFF(ebx,PD_1_847)] ; xmm0=z5
- mulps xmm3,[GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930)
- mulps xmm4,[GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200)
- addps xmm3,xmm0 ; xmm3=tmp12
- subps xmm4,xmm0 ; xmm4=tmp10
-
- ; -- Final output stage
-
- subps xmm3,xmm1 ; xmm3=tmp6
- movaps xmm5,xmm6
- movaps xmm0,xmm7
- addps xmm6,xmm1 ; xmm6=data0=(00 01 02 03)
- addps xmm7,xmm3 ; xmm7=data1=(10 11 12 13)
- subps xmm5,xmm1 ; xmm5=data7=(70 71 72 73)
- subps xmm0,xmm3 ; xmm0=data6=(60 61 62 63)
- subps xmm2,xmm3 ; xmm2=tmp5
-
- movaps xmm1,xmm6 ; transpose coefficients(phase 1)
- unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11)
- unpckhps xmm1,xmm7 ; xmm1=(02 12 03 13)
- movaps xmm3,xmm0 ; transpose coefficients(phase 1)
- unpcklps xmm0,xmm5 ; xmm0=(60 70 61 71)
- unpckhps xmm3,xmm5 ; xmm3=(62 72 63 73)
-
- movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2
- movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3
-
- movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71)
- movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73)
-
- addps xmm4,xmm2 ; xmm4=tmp4
- movaps xmm0,xmm7
- movaps xmm3,xmm5
- addps xmm7,xmm2 ; xmm7=data2=(20 21 22 23)
- addps xmm5,xmm4 ; xmm5=data4=(40 41 42 43)
- subps xmm0,xmm2 ; xmm0=data5=(50 51 52 53)
- subps xmm3,xmm4 ; xmm3=data3=(30 31 32 33)
-
- movaps xmm2,xmm7 ; transpose coefficients(phase 1)
- unpcklps xmm7,xmm3 ; xmm7=(20 30 21 31)
- unpckhps xmm2,xmm3 ; xmm2=(22 32 23 33)
- movaps xmm4,xmm5 ; transpose coefficients(phase 1)
- unpcklps xmm5,xmm0 ; xmm5=(40 50 41 51)
- unpckhps xmm4,xmm0 ; xmm4=(42 52 43 53)
-
- movaps xmm3,xmm6 ; transpose coefficients(phase 2)
- unpcklps2 xmm6,xmm7 ; xmm6=(00 10 20 30)
- unpckhps2 xmm3,xmm7 ; xmm3=(01 11 21 31)
- movaps xmm0,xmm1 ; transpose coefficients(phase 2)
- unpcklps2 xmm1,xmm2 ; xmm1=(02 12 22 32)
- unpckhps2 xmm0,xmm2 ; xmm0=(03 13 23 33)
-
- movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71)
- movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73)
-
- movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6
- movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
- movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1
- movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0
-
- movaps xmm6,xmm5 ; transpose coefficients(phase 2)
- unpcklps2 xmm5,xmm7 ; xmm5=(40 50 60 70)
- unpckhps2 xmm6,xmm7 ; xmm6=(41 51 61 71)
- movaps xmm3,xmm4 ; transpose coefficients(phase 2)
- unpcklps2 xmm4,xmm2 ; xmm4=(42 52 62 72)
- unpckhps2 xmm3,xmm2 ; xmm3=(43 53 63 73)
-
- movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5
- movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
- movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4
- movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
-
-.nextcolumn:
- add esi, byte 4*SIZEOF_JCOEF ; coef_block
- add edx, byte 4*SIZEOF_FLOAT_MULT_TYPE ; quantptr
- add edi, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr
- dec ecx ; ctr
- jnz near .columnloop
-
- ; -- Prefetch the next coefficient block
-
- prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
- prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
- prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
- prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
-
- ; ---- Pass 2: process rows from work array, store into output array.
-
- mov eax, [original_ebp]
- lea esi, [workspace] ; FAST_FLOAT *wsptr
- mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
- mov eax, JDIMENSION [output_col(eax)]
- mov ecx, DCTSIZE/4 ; ctr
- alignx 16,7
-.rowloop:
-
- ; -- Even part
-
- movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
- movaps xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
- movaps xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
- movaps xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
-
- movaps xmm4,xmm0
- movaps xmm5,xmm1
- subps xmm0,xmm2 ; xmm0=tmp11
- subps xmm1,xmm3
- addps xmm4,xmm2 ; xmm4=tmp10
- addps xmm5,xmm3 ; xmm5=tmp13
-
- mulps xmm1,[GOTOFF(ebx,PD_1_414)]
- subps xmm1,xmm5 ; xmm1=tmp12
-
- movaps xmm6,xmm4
- movaps xmm7,xmm0
- subps xmm4,xmm5 ; xmm4=tmp3
- subps xmm0,xmm1 ; xmm0=tmp2
- addps xmm6,xmm5 ; xmm6=tmp0
- addps xmm7,xmm1 ; xmm7=tmp1
-
- movaps XMMWORD [wk(1)], xmm4 ; tmp3
- movaps XMMWORD [wk(0)], xmm0 ; tmp2
-
- ; -- Odd part
-
- movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
- movaps xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
- movaps xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
- movaps xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
-
- movaps xmm4,xmm2
- movaps xmm0,xmm5
- addps xmm2,xmm1 ; xmm2=z11
- addps xmm5,xmm3 ; xmm5=z13
- subps xmm4,xmm1 ; xmm4=z12
- subps xmm0,xmm3 ; xmm0=z10
-
- movaps xmm1,xmm2
- subps xmm2,xmm5
- addps xmm1,xmm5 ; xmm1=tmp7
-
- mulps xmm2,[GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11
-
- movaps xmm3,xmm0
- addps xmm0,xmm4
- mulps xmm0,[GOTOFF(ebx,PD_1_847)] ; xmm0=z5
- mulps xmm3,[GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930)
- mulps xmm4,[GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200)
- addps xmm3,xmm0 ; xmm3=tmp12
- subps xmm4,xmm0 ; xmm4=tmp10
-
- ; -- Final output stage
-
- subps xmm3,xmm1 ; xmm3=tmp6
- movaps xmm5,xmm6
- movaps xmm0,xmm7
- addps xmm6,xmm1 ; xmm6=data0=(00 10 20 30)
- addps xmm7,xmm3 ; xmm7=data1=(01 11 21 31)
- subps xmm5,xmm1 ; xmm5=data7=(07 17 27 37)
- subps xmm0,xmm3 ; xmm0=data6=(06 16 26 36)
- subps xmm2,xmm3 ; xmm2=tmp5
-
- movaps xmm1,[GOTOFF(ebx,PD_RNDINT_MAGIC)] ; xmm1=[PD_RNDINT_MAGIC]
- pcmpeqd xmm3,xmm3
- psrld xmm3,WORD_BIT ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
-
- addps xmm6,xmm1 ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
- addps xmm7,xmm1 ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
- addps xmm0,xmm1 ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
- addps xmm5,xmm1 ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
-
- pand xmm6,xmm3 ; xmm6=(00 -- 10 -- 20 -- 30 --)
- pslld xmm7,WORD_BIT ; xmm7=(-- 01 -- 11 -- 21 -- 31)
- pand xmm0,xmm3 ; xmm0=(06 -- 16 -- 26 -- 36 --)
- pslld xmm5,WORD_BIT ; xmm5=(-- 07 -- 17 -- 27 -- 37)
- por xmm6,xmm7 ; xmm6=(00 01 10 11 20 21 30 31)
- por xmm0,xmm5 ; xmm0=(06 07 16 17 26 27 36 37)
-
- movaps xmm1, XMMWORD [wk(0)] ; xmm1=tmp2
- movaps xmm3, XMMWORD [wk(1)] ; xmm3=tmp3
-
- addps xmm4,xmm2 ; xmm4=tmp4
- movaps xmm7,xmm1
- movaps xmm5,xmm3
- addps xmm1,xmm2 ; xmm1=data2=(02 12 22 32)
- addps xmm3,xmm4 ; xmm3=data4=(04 14 24 34)
- subps xmm7,xmm2 ; xmm7=data5=(05 15 25 35)
- subps xmm5,xmm4 ; xmm5=data3=(03 13 23 33)
-
- movaps xmm2,[GOTOFF(ebx,PD_RNDINT_MAGIC)] ; xmm2=[PD_RNDINT_MAGIC]
- pcmpeqd xmm4,xmm4
- psrld xmm4,WORD_BIT ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
-
- addps xmm3,xmm2 ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
- addps xmm7,xmm2 ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
- addps xmm1,xmm2 ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
- addps xmm5,xmm2 ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
-
- pand xmm3,xmm4 ; xmm3=(04 -- 14 -- 24 -- 34 --)
- pslld xmm7,WORD_BIT ; xmm7=(-- 05 -- 15 -- 25 -- 35)
- pand xmm1,xmm4 ; xmm1=(02 -- 12 -- 22 -- 32 --)
- pslld xmm5,WORD_BIT ; xmm5=(-- 03 -- 13 -- 23 -- 33)
- por xmm3,xmm7 ; xmm3=(04 05 14 15 24 25 34 35)
- por xmm1,xmm5 ; xmm1=(02 03 12 13 22 23 32 33)
-
- movdqa xmm2,[GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm2=[PB_CENTERJSAMP]
-
- packsswb xmm6,xmm3 ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
- packsswb xmm1,xmm0 ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
- paddb xmm6,xmm2
- paddb xmm1,xmm2
-
- movdqa xmm4,xmm6 ; transpose coefficients(phase 2)
- punpcklwd xmm6,xmm1 ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
- punpckhwd xmm4,xmm1 ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
-
- movdqa xmm7,xmm6 ; transpose coefficients(phase 3)
- punpckldq xmm6,xmm4 ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
- punpckhdq xmm7,xmm4 ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
-
- pshufd xmm5,xmm6,0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
- pshufd xmm3,xmm7,0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
-
- pushpic ebx ; save GOT address
-
- mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
- mov ebx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
- movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
- movq XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm7
- mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
- mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
- movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5
- movq XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm3
-
- poppic ebx ; restore GOT address
-
- add esi, byte 4*SIZEOF_FAST_FLOAT ; wsptr
- add edi, byte 4*SIZEOF_JSAMPROW
- dec ecx ; ctr
- jnz near .rowloop
-
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- pop ebx
- mov esp,ebp ; esp <- aligned ebp
- pop esp ; esp <- original ebp
- pop ebp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
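Editorial note: all of the 32-bit routines in this patch share the same prologue: save the caller's frame, round esp down to a 16-byte boundary so the movaps traffic on wk()/workspace cannot fault, stash the original frame pointer at the new stack top, and carve the workspace below it. A C illustration of that alignment step, assuming nothing beyond the arithmetic itself:

    #include <stdint.h>

    /* Equivalent of "and esp, byte (-SIZEOF_XMMWORD)": round a raw address
     * down to a 16-byte boundary so 128-bit aligned accesses are safe.
     * Illustration only; the .asm applies this to esp itself and keeps the
     * original value at [esp] so the epilogue can restore it. */
    static inline void *align_down_16(void *p)
    {
      return (void *)((uintptr_t)p & ~(uintptr_t)15);
    }

The get_GOT/GOTOFF and pushpic/poppic pairs serve the other 32-bit constraint: position-independent builds cannot address the PD_*/PB_* constants absolutely, so ebx holds the GOT base and is spilled around the row stores that need it as a scratch register.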
diff --git a/media/libjpeg/simd/jidctfst-mmx.asm b/media/libjpeg/simd/jidctfst-mmx.asm
deleted file mode 100644
index 6e95bfbcaf..0000000000
--- a/media/libjpeg/simd/jidctfst-mmx.asm
+++ /dev/null
@@ -1,499 +0,0 @@
-;
-; jidctfst.asm - fast integer IDCT (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler); it can
-; *not* be assembled with Microsoft's MASM or any compatible assembler
-; (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a fast but less accurate integer implementation of
-; the inverse DCT (Discrete Cosine Transform). The following code is
-; based directly on the IJG's original jidctfst.c; see that file
-; for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS 8 ; 14 is also OK.
-%define PASS1_BITS 2
-
-%if IFAST_SCALE_BITS != PASS1_BITS
-%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
-%endif
-
-%if CONST_BITS == 8
-F_1_082 equ 277 ; FIX(1.082392200)
-F_1_414 equ 362 ; FIX(1.414213562)
-F_1_847 equ 473 ; FIX(1.847759065)
-F_2_613 equ 669 ; FIX(2.613125930)
-F_1_613 equ (F_2_613 - 256) ; FIX(2.613125930) - FIX(1)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
-F_1_082 equ DESCALE(1162209775,30-CONST_BITS) ; FIX(1.082392200)
-F_1_414 equ DESCALE(1518500249,30-CONST_BITS) ; FIX(1.414213562)
-F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065)
-F_2_613 equ DESCALE(2805822602,30-CONST_BITS) ; FIX(2.613125930)
-F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1)
-%endif
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
-; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
-; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
-
-%define PRE_MULTIPLY_SCALE_BITS 2
-%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
-
- alignz 16
- global EXTN(jconst_idct_ifast_mmx)
-
-EXTN(jconst_idct_ifast_mmx):
-
-PW_F1414 times 4 dw F_1_414 << CONST_SHIFT
-PW_F1847 times 4 dw F_1_847 << CONST_SHIFT
-PW_MF1613 times 4 dw -F_1_613 << CONST_SHIFT
-PW_F1082 times 4 dw F_1_082 << CONST_SHIFT
-PB_CENTERJSAMP times 8 db CENTERJSAMPLE
-
- alignz 16
-
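Editorial note: the ifast constants above encode the AAN multipliers in CONST_BITS=8 fixed point (FIX(x) = round(x * 2^8), so F_1_414 == 362), pre-shifted by CONST_SHIFT so that a pmulhw against a sample pre-shifted by PRE_MULTIPLY_SCALE_BITS reduces to a multiply by the real-valued constant, since CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16. A hedged C sketch of one lane (names mine):

    #include <stdint.h>

    #define CONST_BITS              8
    #define PRE_MULTIPLY_SCALE_BITS 2
    #define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)  /* == 6 */

    /* One lane of pmulhw: the high 16 bits of the signed 32-bit product. */
    static int16_t pmulhw1(int16_t a, int16_t b)
    {
      return (int16_t)(((int32_t)a * (int32_t)b) >> 16);
    }

    /* Multiply a sample by an AAN constant c = F / 2^CONST_BITS.  The two
     * pre-shifts plus pmulhw's >>16 leave exactly x * F >> 8, assuming |x|
     * is small enough that x << 2 fits in 16 bits, which the
     * "PRE_MULTIPLY_SCALE_BITS <= 2" bound above is chosen to guarantee. */
    static int16_t mul_fix(int16_t x, int16_t F)
    {
      return pmulhw1((int16_t)(x << PRE_MULTIPLY_SCALE_BITS),
                     (int16_t)(F << CONST_SHIFT));
    }
    /* e.g. mul_fix(1000, 362) == 1414, i.e. roughly 1000 * 1.414213562 */

The same 16-bit budget explains F_1_613: FIX(2.613125930) << CONST_SHIFT would overflow a signed word, so the odd part multiplies by FIX(2.613125930) - FIX(1) and subtracts the unscaled z10 separately, exactly as the "To avoid overflow" comment later in this file derives.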
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
-;
-; Perform dequantization and inverse DCT on one block of coefficients.
-;
-; GLOBAL(void)
-; jsimd_idct_ifast_mmx (void *dct_table, JCOEFPTR coef_block,
-; JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-%define dct_table(b) (b)+8 ; void *dct_table
-%define coef_block(b) (b)+12 ; JCOEFPTR coef_block
-%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf
-%define output_col(b) (b)+20 ; JDIMENSION output_col
-
-%define original_ebp ebp+0
-%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM]
-%define WK_NUM 2
-%define workspace wk(0)-DCTSIZE2*SIZEOF_JCOEF
- ; JCOEF workspace[DCTSIZE2]
-
- align 16
- global EXTN(jsimd_idct_ifast_mmx)
-
-EXTN(jsimd_idct_ifast_mmx):
- push ebp
- mov eax,esp ; eax = original ebp
- sub esp, byte 4
- and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
- mov [esp],eax
- mov ebp,esp ; ebp = aligned ebp
- lea esp, [workspace]
- push ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- get_GOT ebx ; get GOT address
-
- ; ---- Pass 1: process columns from input, store into work array.
-
-; mov eax, [original_ebp]
- mov edx, POINTER [dct_table(eax)] ; quantptr
- mov esi, JCOEFPTR [coef_block(eax)] ; inptr
- lea edi, [workspace] ; JCOEF *wsptr
- mov ecx, DCTSIZE/4 ; ctr
- alignx 16,7
-.columnloop:
-%ifndef NO_ZERO_COLUMN_TEST_IFAST_MMX
- mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
- or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
- jnz short .columnDCT
-
- movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
- movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
- por mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
- por mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
- por mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
- por mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
- por mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
- por mm1,mm0
- packsswb mm1,mm1
- movd eax,mm1
- test eax,eax
- jnz short .columnDCT
-
- ; -- AC terms all zero
-
- movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
- pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-
- movq mm2,mm0 ; mm0=in0=(00 01 02 03)
- punpcklwd mm0,mm0 ; mm0=(00 00 01 01)
- punpckhwd mm2,mm2 ; mm2=(02 02 03 03)
-
- movq mm1,mm0
- punpckldq mm0,mm0 ; mm0=(00 00 00 00)
- punpckhdq mm1,mm1 ; mm1=(01 01 01 01)
- movq mm3,mm2
- punpckldq mm2,mm2 ; mm2=(02 02 02 02)
- punpckhdq mm3,mm3 ; mm3=(03 03 03 03)
-
- movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
- movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0
- movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
- movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm1
- movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
- movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
- movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
- movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3
- jmp near .nextcolumn
- alignx 16,7
-%endif
-.columnDCT:
-
- ; -- Even part
-
- movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
- movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
- pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
- pmullw mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)]
- movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
- movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
- pmullw mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)]
- pmullw mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-
- movq mm4,mm0
- movq mm5,mm1
- psubw mm0,mm2 ; mm0=tmp11
- psubw mm1,mm3
- paddw mm4,mm2 ; mm4=tmp10
- paddw mm5,mm3 ; mm5=tmp13
-
- psllw mm1,PRE_MULTIPLY_SCALE_BITS
- pmulhw mm1,[GOTOFF(ebx,PW_F1414)]
- psubw mm1,mm5 ; mm1=tmp12
-
- movq mm6,mm4
- movq mm7,mm0
- psubw mm4,mm5 ; mm4=tmp3
- psubw mm0,mm1 ; mm0=tmp2
- paddw mm6,mm5 ; mm6=tmp0
- paddw mm7,mm1 ; mm7=tmp1
-
- movq MMWORD [wk(1)], mm4 ; wk(1)=tmp3
- movq MMWORD [wk(0)], mm0 ; wk(0)=tmp2
-
- ; -- Odd part
-
- movq mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
- movq mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
- pmullw mm2, MMWORD [MMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)]
- pmullw mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)]
- movq mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
- movq mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
- pmullw mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)]
- pmullw mm1, MMWORD [MMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-
- movq mm4,mm2
- movq mm0,mm5
- psubw mm2,mm1 ; mm2=z12
- psubw mm5,mm3 ; mm5=z10
- paddw mm4,mm1 ; mm4=z11
- paddw mm0,mm3 ; mm0=z13
-
- movq mm1,mm5 ; mm1=z10(unscaled)
- psllw mm2,PRE_MULTIPLY_SCALE_BITS
- psllw mm5,PRE_MULTIPLY_SCALE_BITS
-
- movq mm3,mm4
- psubw mm4,mm0
- paddw mm3,mm0 ; mm3=tmp7
-
- psllw mm4,PRE_MULTIPLY_SCALE_BITS
- pmulhw mm4,[GOTOFF(ebx,PW_F1414)] ; mm4=tmp11
-
- ; To avoid overflow...
- ;
- ; (Original)
- ; tmp12 = -2.613125930 * z10 + z5;
- ;
- ; (This implementation)
- ; tmp12 = (-1.613125930 - 1) * z10 + z5;
- ; = -1.613125930 * z10 - z10 + z5;
-
- movq mm0,mm5
- paddw mm5,mm2
- pmulhw mm5,[GOTOFF(ebx,PW_F1847)] ; mm5=z5
- pmulhw mm0,[GOTOFF(ebx,PW_MF1613)]
- pmulhw mm2,[GOTOFF(ebx,PW_F1082)]
- psubw mm0,mm1
- psubw mm2,mm5 ; mm2=tmp10
- paddw mm0,mm5 ; mm0=tmp12
-
- ; -- Final output stage
-
- psubw mm0,mm3 ; mm0=tmp6
- movq mm1,mm6
- movq mm5,mm7
- paddw mm6,mm3 ; mm6=data0=(00 01 02 03)
- paddw mm7,mm0 ; mm7=data1=(10 11 12 13)
- psubw mm1,mm3 ; mm1=data7=(70 71 72 73)
- psubw mm5,mm0 ; mm5=data6=(60 61 62 63)
- psubw mm4,mm0 ; mm4=tmp5
-
- movq mm3,mm6 ; transpose coefficients(phase 1)
- punpcklwd mm6,mm7 ; mm6=(00 10 01 11)
- punpckhwd mm3,mm7 ; mm3=(02 12 03 13)
- movq mm0,mm5 ; transpose coefficients(phase 1)
- punpcklwd mm5,mm1 ; mm5=(60 70 61 71)
- punpckhwd mm0,mm1 ; mm0=(62 72 63 73)
-
- movq mm7, MMWORD [wk(0)] ; mm7=tmp2
- movq mm1, MMWORD [wk(1)] ; mm1=tmp3
-
- movq MMWORD [wk(0)], mm5 ; wk(0)=(60 70 61 71)
- movq MMWORD [wk(1)], mm0 ; wk(1)=(62 72 63 73)
-
- paddw mm2,mm4 ; mm2=tmp4
- movq mm5,mm7
- movq mm0,mm1
- paddw mm7,mm4 ; mm7=data2=(20 21 22 23)
- paddw mm1,mm2 ; mm1=data4=(40 41 42 43)
- psubw mm5,mm4 ; mm5=data5=(50 51 52 53)
- psubw mm0,mm2 ; mm0=data3=(30 31 32 33)
-
- movq mm4,mm7 ; transpose coefficients(phase 1)
- punpcklwd mm7,mm0 ; mm7=(20 30 21 31)
- punpckhwd mm4,mm0 ; mm4=(22 32 23 33)
- movq mm2,mm1 ; transpose coefficients(phase 1)
- punpcklwd mm1,mm5 ; mm1=(40 50 41 51)
- punpckhwd mm2,mm5 ; mm2=(42 52 43 53)
-
- movq mm0,mm6 ; transpose coefficients(phase 2)
- punpckldq mm6,mm7 ; mm6=(00 10 20 30)
- punpckhdq mm0,mm7 ; mm0=(01 11 21 31)
- movq mm5,mm3 ; transpose coefficients(phase 2)
- punpckldq mm3,mm4 ; mm3=(02 12 22 32)
- punpckhdq mm5,mm4 ; mm5=(03 13 23 33)
-
- movq mm7, MMWORD [wk(0)] ; mm7=(60 70 61 71)
- movq mm4, MMWORD [wk(1)] ; mm4=(62 72 63 73)
-
- movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm6
- movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0
- movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm3
- movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm5
-
- movq mm6,mm1 ; transpose coefficients(phase 2)
- punpckldq mm1,mm7 ; mm1=(40 50 60 70)
- punpckhdq mm6,mm7 ; mm6=(41 51 61 71)
- movq mm0,mm2 ; transpose coefficients(phase 2)
- punpckldq mm2,mm4 ; mm2=(42 52 62 72)
- punpckhdq mm0,mm4 ; mm0=(43 53 63 73)
-
- movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1
- movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm6
- movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
- movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm0
-
-.nextcolumn:
- add esi, byte 4*SIZEOF_JCOEF ; coef_block
- add edx, byte 4*SIZEOF_IFAST_MULT_TYPE ; quantptr
- add edi, byte 4*DCTSIZE*SIZEOF_JCOEF ; wsptr
- dec ecx ; ctr
- jnz near .columnloop
-
- ; ---- Pass 2: process rows from work array, store into output array.
-
- mov eax, [original_ebp]
- lea esi, [workspace] ; JCOEF *wsptr
- mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
- mov eax, JDIMENSION [output_col(eax)]
- mov ecx, DCTSIZE/4 ; ctr
- alignx 16,7
-.rowloop:
-
- ; -- Even part
-
- movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
- movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
- movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
- movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-
- movq mm4,mm0
- movq mm5,mm1
- psubw mm0,mm2 ; mm0=tmp11
- psubw mm1,mm3
- paddw mm4,mm2 ; mm4=tmp10
- paddw mm5,mm3 ; mm5=tmp13
-
- psllw mm1,PRE_MULTIPLY_SCALE_BITS
- pmulhw mm1,[GOTOFF(ebx,PW_F1414)]
- psubw mm1,mm5 ; mm1=tmp12
-
- movq mm6,mm4
- movq mm7,mm0
- psubw mm4,mm5 ; mm4=tmp3
- psubw mm0,mm1 ; mm0=tmp2
- paddw mm6,mm5 ; mm6=tmp0
- paddw mm7,mm1 ; mm7=tmp1
-
- movq MMWORD [wk(1)], mm4 ; wk(1)=tmp3
- movq MMWORD [wk(0)], mm0 ; wk(0)=tmp2
-
- ; -- Odd part
-
- movq mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
- movq mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
- movq mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
- movq mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-
- movq mm4,mm2
- movq mm0,mm5
- psubw mm2,mm1 ; mm2=z12
- psubw mm5,mm3 ; mm5=z10
- paddw mm4,mm1 ; mm4=z11
- paddw mm0,mm3 ; mm0=z13
-
- movq mm1,mm5 ; mm1=z10(unscaled)
- psllw mm2,PRE_MULTIPLY_SCALE_BITS
- psllw mm5,PRE_MULTIPLY_SCALE_BITS
-
- movq mm3,mm4
- psubw mm4,mm0
- paddw mm3,mm0 ; mm3=tmp7
-
- psllw mm4,PRE_MULTIPLY_SCALE_BITS
- pmulhw mm4,[GOTOFF(ebx,PW_F1414)] ; mm4=tmp11
-
- ; To avoid overflow...
- ;
- ; (Original)
- ; tmp12 = -2.613125930 * z10 + z5;
- ;
- ; (This implementation)
- ; tmp12 = (-1.613125930 - 1) * z10 + z5;
- ; = -1.613125930 * z10 - z10 + z5;
-
- movq mm0,mm5
- paddw mm5,mm2
- pmulhw mm5,[GOTOFF(ebx,PW_F1847)] ; mm5=z5
- pmulhw mm0,[GOTOFF(ebx,PW_MF1613)]
- pmulhw mm2,[GOTOFF(ebx,PW_F1082)]
- psubw mm0,mm1
- psubw mm2,mm5 ; mm2=tmp10
- paddw mm0,mm5 ; mm0=tmp12
-
- ; -- Final output stage
-
- psubw mm0,mm3 ; mm0=tmp6
- movq mm1,mm6
- movq mm5,mm7
- paddw mm6,mm3 ; mm6=data0=(00 10 20 30)
- paddw mm7,mm0 ; mm7=data1=(01 11 21 31)
- psraw mm6,(PASS1_BITS+3) ; descale
- psraw mm7,(PASS1_BITS+3) ; descale
- psubw mm1,mm3 ; mm1=data7=(07 17 27 37)
- psubw mm5,mm0 ; mm5=data6=(06 16 26 36)
- psraw mm1,(PASS1_BITS+3) ; descale
- psraw mm5,(PASS1_BITS+3) ; descale
- psubw mm4,mm0 ; mm4=tmp5
-
- packsswb mm6,mm5 ; mm6=(00 10 20 30 06 16 26 36)
- packsswb mm7,mm1 ; mm7=(01 11 21 31 07 17 27 37)
-
- movq mm3, MMWORD [wk(0)] ; mm3=tmp2
- movq mm0, MMWORD [wk(1)] ; mm0=tmp3
-
- paddw mm2,mm4 ; mm2=tmp4
- movq mm5,mm3
- movq mm1,mm0
- paddw mm3,mm4 ; mm3=data2=(02 12 22 32)
- paddw mm0,mm2 ; mm0=data4=(04 14 24 34)
- psraw mm3,(PASS1_BITS+3) ; descale
- psraw mm0,(PASS1_BITS+3) ; descale
- psubw mm5,mm4 ; mm5=data5=(05 15 25 35)
- psubw mm1,mm2 ; mm1=data3=(03 13 23 33)
- psraw mm5,(PASS1_BITS+3) ; descale
- psraw mm1,(PASS1_BITS+3) ; descale
-
- movq mm4,[GOTOFF(ebx,PB_CENTERJSAMP)] ; mm4=[PB_CENTERJSAMP]
-
- packsswb mm3,mm0 ; mm3=(02 12 22 32 04 14 24 34)
- packsswb mm1,mm5 ; mm1=(03 13 23 33 05 15 25 35)
-
- paddb mm6,mm4
- paddb mm7,mm4
- paddb mm3,mm4
- paddb mm1,mm4
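The packsswb/paddb pair above is the sample range limit: packsswb saturates each descaled 16-bit result to [-128, 127], and adding PB_CENTERJSAMP bytewise (with wraparound) re-centers that to [0, 255]. A scalar sketch, assuming CENTERJSAMPLE == 128:

#include <stdint.h>

/* Scalar equivalent of packsswb followed by paddb CENTERJSAMPLE (sketch,
 * assuming 8-bit samples with CENTERJSAMPLE == 128). */
static uint8_t range_limit(int16_t x) {
  if (x < -128) x = -128;      /* packsswb saturation */
  if (x >  127) x =  127;
  return (uint8_t)(x + 128);   /* paddb: wraps mod 256, lands in 0..255 */
}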
-
- movq mm2,mm6 ; transpose coefficients(phase 1)
- punpcklbw mm6,mm7 ; mm6=(00 01 10 11 20 21 30 31)
- punpckhbw mm2,mm7 ; mm2=(06 07 16 17 26 27 36 37)
- movq mm0,mm3 ; transpose coefficients(phase 1)
- punpcklbw mm3,mm1 ; mm3=(02 03 12 13 22 23 32 33)
- punpckhbw mm0,mm1 ; mm0=(04 05 14 15 24 25 34 35)
-
- movq mm5,mm6 ; transpose coefficients(phase 2)
- punpcklwd mm6,mm3 ; mm6=(00 01 02 03 10 11 12 13)
- punpckhwd mm5,mm3 ; mm5=(20 21 22 23 30 31 32 33)
- movq mm4,mm0 ; transpose coefficients(phase 2)
- punpcklwd mm0,mm2 ; mm0=(04 05 06 07 14 15 16 17)
- punpckhwd mm4,mm2 ; mm4=(24 25 26 27 34 35 36 37)
-
- movq mm7,mm6 ; transpose coefficients(phase 3)
- punpckldq mm6,mm0 ; mm6=(00 01 02 03 04 05 06 07)
- punpckhdq mm7,mm0 ; mm7=(10 11 12 13 14 15 16 17)
- movq mm1,mm5 ; transpose coefficients(phase 3)
- punpckldq mm5,mm4 ; mm5=(20 21 22 23 24 25 26 27)
- punpckhdq mm1,mm4 ; mm1=(30 31 32 33 34 35 36 37)
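The three punpck phases above rebuild pixel rows from the column-ordered words, four rows at a time in this MMX code. Conceptually the net effect is the byte-matrix transpose below (a reference sketch, not the SIMD code):

#include <stdint.h>

/* Reference for what the punpcklbw/punpcklwd/punpckldq cascade achieves:
 * element (r, c) of the packed input ends up at (c, r) of the output. */
static void transpose8x8(const uint8_t in[8][8], uint8_t out[8][8]) {
  for (int r = 0; r < 8; r++)
    for (int c = 0; c < 8; c++)
      out[c][r] = in[r][c];
}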
-
- pushpic ebx ; save GOT address
-
- mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
- mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
- movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6
- movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7
- mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
- mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
- movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5
- movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1
-
- poppic ebx ; restore GOT address
-
- add esi, byte 4*SIZEOF_JCOEF ; wsptr
- add edi, byte 4*SIZEOF_JSAMPROW
- dec ecx ; ctr
- jnz near .rowloop
-
- emms ; empty MMX state
-
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- pop ebx
- mov esp,ebp ; esp <- aligned ebp
- pop esp ; esp <- original ebp
- pop ebp
- ret
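The epilogue above undoes the prologue's stack trick: esp was rounded down to an MMWORD boundary and the caller's esp stored inside the aligned frame, so "pop esp" restores it exactly. The alignment step in portable terms (a sketch, not a translation of the asm):

#include <stdint.h>

/* Round a pointer down to a power-of-two boundary, as the prologue does
 * with "and esp, -SIZEOF_MMWORD" (SIZEOF_MMWORD == 8 assumed here). */
static void *align_down(void *p, uintptr_t a) {
  return (void *)((uintptr_t)p & ~(a - 1));
}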
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
diff --git a/media/libjpeg/simd/jidctfst-sse2-64.asm b/media/libjpeg/simd/jidctfst-sse2-64.asm
deleted file mode 100644
index 48846426d2..0000000000
--- a/media/libjpeg/simd/jidctfst-sse2-64.asm
+++ /dev/null
@@ -1,491 +0,0 @@
-;
-; jidctfst.asm - fast integer IDCT (64-bit SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler) and
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a fast, not so accurate integer implementation of
-; the inverse DCT (Discrete Cosine Transform). The following code is
-; based directly on the IJG's original jidctfst.c; see jidctfst.c
-; for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS 8 ; 14 is also OK.
-%define PASS1_BITS 2
-
-%if IFAST_SCALE_BITS != PASS1_BITS
-%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
-%endif
-
-%if CONST_BITS == 8
-F_1_082 equ 277 ; FIX(1.082392200)
-F_1_414 equ 362 ; FIX(1.414213562)
-F_1_847 equ 473 ; FIX(1.847759065)
-F_2_613 equ 669 ; FIX(2.613125930)
-F_1_613 equ (F_2_613 - 256) ; FIX(2.613125930) - FIX(1)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
-F_1_082 equ DESCALE(1162209775,30-CONST_BITS) ; FIX(1.082392200)
-F_1_414 equ DESCALE(1518500249,30-CONST_BITS) ; FIX(1.414213562)
-F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065)
-F_2_613 equ DESCALE(2805822602,30-CONST_BITS) ; FIX(2.613125930)
-F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1)
-%endif
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
-; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
-; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
-
-%define PRE_MULTIPLY_SCALE_BITS 2
-%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
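The two conditions above make pmulhw land on an integer product: with CONST_BITS = 8, PRE_MULTIPLY_SCALE_BITS = 2 and CONST_SHIFT = 6, the pre-shifted input and pre-shifted constant together carry exactly the 16 bits that pmulhw discards. A worked check in C (standalone sketch):

#include <stdio.h>
#include <stdint.h>

int main(void) {
  int16_t x = 100;
  int16_t c = 362 << 6;    /* F_1_414 = FIX(1.414213562) << CONST_SHIFT */
  int16_t hi = (int16_t)(((int32_t)(x << 2) * c) >> 16);  /* psllw + pmulhw */
  printf("%d vs %.2f\n", hi, 1.414213562 * x);  /* 141 vs 141.42 */
  return 0;
}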
-
- alignz 16
- global EXTN(jconst_idct_ifast_sse2)
-
-EXTN(jconst_idct_ifast_sse2):
-
-PW_F1414 times 8 dw F_1_414 << CONST_SHIFT
-PW_F1847 times 8 dw F_1_847 << CONST_SHIFT
-PW_MF1613 times 8 dw -F_1_613 << CONST_SHIFT
-PW_F1082 times 8 dw F_1_082 << CONST_SHIFT
-PB_CENTERJSAMP times 16 db CENTERJSAMPLE
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 64
-;
-; Perform dequantization and inverse DCT on one block of coefficients.
-;
-; GLOBAL(void)
-; jsimd_idct_ifast_sse2 (void *dct_table, JCOEFPTR coef_block,
-; JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-; r10 = jpeg_component_info *compptr
-; r11 = JCOEFPTR coef_block
-; r12 = JSAMPARRAY output_buf
-; r13 = JDIMENSION output_col
-
-%define original_rbp rbp+0
-%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 2
-
- align 16
- global EXTN(jsimd_idct_ifast_sse2)
-
-EXTN(jsimd_idct_ifast_sse2):
- push rbp
- mov rax,rsp ; rax = original rbp
- sub rsp, byte 4
- and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [rsp],rax
- mov rbp,rsp ; rbp = aligned rbp
- lea rsp, [wk(0)]
- collect_args
-
- ; ---- Pass 1: process columns from input.
-
- mov rdx, r10 ; quantptr
- mov rsi, r11 ; inptr
-
-%ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2
- mov eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
- or eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
- jnz near .columnDCT
-
- movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
- movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
- por xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
- por xmm1, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
- por xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
- por xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
- por xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
- por xmm1,xmm0
- packsswb xmm1,xmm1
- packsswb xmm1,xmm1
- movd eax,xmm1
- test rax,rax
- jnz short .columnDCT
-
- ; -- AC terms all zero
-
- movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
- pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-
- movdqa xmm7,xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07)
- punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
- punpckhwd xmm7,xmm7 ; xmm7=(04 04 05 05 06 06 07 07)
-
- pshufd xmm6,xmm0,0x00 ; xmm6=col0=(00 00 00 00 00 00 00 00)
- pshufd xmm2,xmm0,0x55 ; xmm2=col1=(01 01 01 01 01 01 01 01)
- pshufd xmm5,xmm0,0xAA ; xmm5=col2=(02 02 02 02 02 02 02 02)
- pshufd xmm0,xmm0,0xFF ; xmm0=col3=(03 03 03 03 03 03 03 03)
- pshufd xmm1,xmm7,0x00 ; xmm1=col4=(04 04 04 04 04 04 04 04)
- pshufd xmm4,xmm7,0x55 ; xmm4=col5=(05 05 05 05 05 05 05 05)
- pshufd xmm3,xmm7,0xAA ; xmm3=col6=(06 06 06 06 06 06 06 06)
- pshufd xmm7,xmm7,0xFF ; xmm7=col7=(07 07 07 07 07 07 07 07)
-
- movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=col1
- movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=col3
- jmp near .column_end
-%endif
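The block above is a shortcut for sparse data: when every AC row of the block is zero, the 1-D column IDCT degenerates to replicating each column's dequantized DC term, so the pshufd broadcasts replace the whole transform. A scalar sketch of pass 1 under that condition (hypothetical helper, not from this file):

#include <stdint.h>

/* Pass-1 shortcut sketch: with rows 1..7 all zero, column c's 1-D IDCT
 * is its dequantized row-0 coefficient replicated down the column. */
static void columns_dc_only(const int16_t coef[64], const int16_t quant[64],
                            int16_t cols[64]) {
  for (int c = 0; c < 8; c++) {
    int16_t dc = (int16_t)(coef[c] * quant[c]);
    for (int r = 0; r < 8; r++)
      cols[r * 8 + c] = dc;
  }
}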
-.columnDCT:
-
- ; -- Even part
-
- movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
- movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
- pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
- pmullw xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
- movdqa xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
- movdqa xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
- pmullw xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
- pmullw xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
-
- movdqa xmm4,xmm0
- movdqa xmm5,xmm1
- psubw xmm0,xmm2 ; xmm0=tmp11
- psubw xmm1,xmm3
- paddw xmm4,xmm2 ; xmm4=tmp10
- paddw xmm5,xmm3 ; xmm5=tmp13
-
- psllw xmm1,PRE_MULTIPLY_SCALE_BITS
- pmulhw xmm1,[rel PW_F1414]
- psubw xmm1,xmm5 ; xmm1=tmp12
-
- movdqa xmm6,xmm4
- movdqa xmm7,xmm0
- psubw xmm4,xmm5 ; xmm4=tmp3
- psubw xmm0,xmm1 ; xmm0=tmp2
- paddw xmm6,xmm5 ; xmm6=tmp0
- paddw xmm7,xmm1 ; xmm7=tmp1
-
- movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=tmp3
- movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=tmp2
-
- ; -- Odd part
-
- movdqa xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
- movdqa xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
- pmullw xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
- pmullw xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
- movdqa xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
- movdqa xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
- pmullw xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
- pmullw xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
-
- movdqa xmm4,xmm2
- movdqa xmm0,xmm5
- psubw xmm2,xmm1 ; xmm2=z12
- psubw xmm5,xmm3 ; xmm5=z10
- paddw xmm4,xmm1 ; xmm4=z11
- paddw xmm0,xmm3 ; xmm0=z13
-
- movdqa xmm1,xmm5 ; xmm1=z10(unscaled)
- psllw xmm2,PRE_MULTIPLY_SCALE_BITS
- psllw xmm5,PRE_MULTIPLY_SCALE_BITS
-
- movdqa xmm3,xmm4
- psubw xmm4,xmm0
- paddw xmm3,xmm0 ; xmm3=tmp7
-
- psllw xmm4,PRE_MULTIPLY_SCALE_BITS
- pmulhw xmm4,[rel PW_F1414] ; xmm4=tmp11
-
- ; To avoid overflow...
- ;
- ; (Original)
- ; tmp12 = -2.613125930 * z10 + z5;
- ;
- ; (This implementation)
- ; tmp12 = (-1.613125930 - 1) * z10 + z5;
- ; = -1.613125930 * z10 - z10 + z5;
-
- movdqa xmm0,xmm5
- paddw xmm5,xmm2
- pmulhw xmm5,[rel PW_F1847] ; xmm5=z5
- pmulhw xmm0,[rel PW_MF1613]
- pmulhw xmm2,[rel PW_F1082]
- psubw xmm0,xmm1
- psubw xmm2,xmm5 ; xmm2=tmp10
- paddw xmm0,xmm5 ; xmm0=tmp12
-
- ; -- Final output stage
-
- psubw xmm0,xmm3 ; xmm0=tmp6
- movdqa xmm1,xmm6
- movdqa xmm5,xmm7
- paddw xmm6,xmm3 ; xmm6=data0=(00 01 02 03 04 05 06 07)
- paddw xmm7,xmm0 ; xmm7=data1=(10 11 12 13 14 15 16 17)
- psubw xmm1,xmm3 ; xmm1=data7=(70 71 72 73 74 75 76 77)
- psubw xmm5,xmm0 ; xmm5=data6=(60 61 62 63 64 65 66 67)
- psubw xmm4,xmm0 ; xmm4=tmp5
-
- movdqa xmm3,xmm6 ; transpose coefficients(phase 1)
- punpcklwd xmm6,xmm7 ; xmm6=(00 10 01 11 02 12 03 13)
- punpckhwd xmm3,xmm7 ; xmm3=(04 14 05 15 06 16 07 17)
- movdqa xmm0,xmm5 ; transpose coefficients(phase 1)
- punpcklwd xmm5,xmm1 ; xmm5=(60 70 61 71 62 72 63 73)
- punpckhwd xmm0,xmm1 ; xmm0=(64 74 65 75 66 76 67 77)
-
- movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp2
- movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp3
-
- movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(60 70 61 71 62 72 63 73)
- movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(64 74 65 75 66 76 67 77)
-
- paddw xmm2,xmm4 ; xmm2=tmp4
- movdqa xmm5,xmm7
- movdqa xmm0,xmm1
- paddw xmm7,xmm4 ; xmm7=data2=(20 21 22 23 24 25 26 27)
- paddw xmm1,xmm2 ; xmm1=data4=(40 41 42 43 44 45 46 47)
- psubw xmm5,xmm4 ; xmm5=data5=(50 51 52 53 54 55 56 57)
- psubw xmm0,xmm2 ; xmm0=data3=(30 31 32 33 34 35 36 37)
-
- movdqa xmm4,xmm7 ; transpose coefficients(phase 1)
- punpcklwd xmm7,xmm0 ; xmm7=(20 30 21 31 22 32 23 33)
- punpckhwd xmm4,xmm0 ; xmm4=(24 34 25 35 26 36 27 37)
- movdqa xmm2,xmm1 ; transpose coefficients(phase 1)
- punpcklwd xmm1,xmm5 ; xmm1=(40 50 41 51 42 52 43 53)
- punpckhwd xmm2,xmm5 ; xmm2=(44 54 45 55 46 56 47 57)
-
- movdqa xmm0,xmm3 ; transpose coefficients(phase 2)
- punpckldq xmm3,xmm4 ; xmm3=(04 14 24 34 05 15 25 35)
- punpckhdq xmm0,xmm4 ; xmm0=(06 16 26 36 07 17 27 37)
- movdqa xmm5,xmm6 ; transpose coefficients(phase 2)
- punpckldq xmm6,xmm7 ; xmm6=(00 10 20 30 01 11 21 31)
- punpckhdq xmm5,xmm7 ; xmm5=(02 12 22 32 03 13 23 33)
-
- movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(60 70 61 71 62 72 63 73)
- movdqa xmm7, XMMWORD [wk(1)] ; xmm7=(64 74 65 75 66 76 67 77)
-
- movdqa XMMWORD [wk(0)], xmm3 ; wk(0)=(04 14 24 34 05 15 25 35)
- movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(06 16 26 36 07 17 27 37)
-
- movdqa xmm3,xmm1 ; transpose coefficients(phase 2)
- punpckldq xmm1,xmm4 ; xmm1=(40 50 60 70 41 51 61 71)
- punpckhdq xmm3,xmm4 ; xmm3=(42 52 62 72 43 53 63 73)
- movdqa xmm0,xmm2 ; transpose coefficients(phase 2)
- punpckldq xmm2,xmm7 ; xmm2=(44 54 64 74 45 55 65 75)
- punpckhdq xmm0,xmm7 ; xmm0=(46 56 66 76 47 57 67 77)
-
- movdqa xmm4,xmm6 ; transpose coefficients(phase 3)
- punpcklqdq xmm6,xmm1 ; xmm6=col0=(00 10 20 30 40 50 60 70)
- punpckhqdq xmm4,xmm1 ; xmm4=col1=(01 11 21 31 41 51 61 71)
- movdqa xmm7,xmm5 ; transpose coefficients(phase 3)
- punpcklqdq xmm5,xmm3 ; xmm5=col2=(02 12 22 32 42 52 62 72)
- punpckhqdq xmm7,xmm3 ; xmm7=col3=(03 13 23 33 43 53 63 73)
-
- movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(04 14 24 34 05 15 25 35)
- movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(06 16 26 36 07 17 27 37)
-
- movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=col1
- movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=col3
-
- movdqa xmm4,xmm1 ; transpose coefficients(phase 3)
- punpcklqdq xmm1,xmm2 ; xmm1=col4=(04 14 24 34 44 54 64 74)
- punpckhqdq xmm4,xmm2 ; xmm4=col5=(05 15 25 35 45 55 65 75)
- movdqa xmm7,xmm3 ; transpose coefficients(phase 3)
- punpcklqdq xmm3,xmm0 ; xmm3=col6=(06 16 26 36 46 56 66 76)
- punpckhqdq xmm7,xmm0 ; xmm7=col7=(07 17 27 37 47 57 67 77)
-.column_end:
-
- ; -- Prefetch the next coefficient block
-
- prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
- prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
- prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
- prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
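The four prefetchnta lines hint the next 128-byte coefficient block (DCTSIZE2 JCOEFs) into cache non-temporally while pass 2 runs on the current one. An intrinsic-level analogue, assuming a GCC/Clang-style builtin:

#include <stdint.h>

/* Non-temporal prefetch of the next coefficient block, 32 bytes at a
 * time as above (sketch; __builtin_prefetch is GCC/Clang-specific). */
static void prefetch_next_block(const int16_t *coef_block) {
  const char *next = (const char *)(coef_block + 64);   /* DCTSIZE2 */
  for (int i = 0; i < 4; i++)
    __builtin_prefetch(next + i * 32, 0, 0);  /* read access, non-temporal */
}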
-
- ; ---- Pass 2: process rows from work array, store into output array.
-
- mov rax, [original_rbp]
- mov rdi, r12 ; (JSAMPROW *)
- mov eax, r13d
-
- ; -- Even part
-
- ; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6
-
- movdqa xmm2,xmm6
- movdqa xmm0,xmm5
- psubw xmm6,xmm1 ; xmm6=tmp11
- psubw xmm5,xmm3
- paddw xmm2,xmm1 ; xmm2=tmp10
- paddw xmm0,xmm3 ; xmm0=tmp13
-
- psllw xmm5,PRE_MULTIPLY_SCALE_BITS
- pmulhw xmm5,[rel PW_F1414]
- psubw xmm5,xmm0 ; xmm5=tmp12
-
- movdqa xmm1,xmm2
- movdqa xmm3,xmm6
- psubw xmm2,xmm0 ; xmm2=tmp3
- psubw xmm6,xmm5 ; xmm6=tmp2
- paddw xmm1,xmm0 ; xmm1=tmp0
- paddw xmm3,xmm5 ; xmm3=tmp1
-
- movdqa xmm0, XMMWORD [wk(0)] ; xmm0=col1
- movdqa xmm5, XMMWORD [wk(1)] ; xmm5=col3
-
- movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp3
- movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp2
-
- ; -- Odd part
-
- ; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7
-
- movdqa xmm2,xmm0
- movdqa xmm6,xmm4
- psubw xmm0,xmm7 ; xmm0=z12
- psubw xmm4,xmm5 ; xmm4=z10
- paddw xmm2,xmm7 ; xmm2=z11
- paddw xmm6,xmm5 ; xmm6=z13
-
- movdqa xmm7,xmm4 ; xmm7=z10(unscaled)
- psllw xmm0,PRE_MULTIPLY_SCALE_BITS
- psllw xmm4,PRE_MULTIPLY_SCALE_BITS
-
- movdqa xmm5,xmm2
- psubw xmm2,xmm6
- paddw xmm5,xmm6 ; xmm5=tmp7
-
- psllw xmm2,PRE_MULTIPLY_SCALE_BITS
- pmulhw xmm2,[rel PW_F1414] ; xmm2=tmp11
-
- ; To avoid overflow...
- ;
- ; (Original)
- ; tmp12 = -2.613125930 * z10 + z5;
- ;
- ; (This implementation)
- ; tmp12 = (-1.613125930 - 1) * z10 + z5;
- ; = -1.613125930 * z10 - z10 + z5;
-
- movdqa xmm6,xmm4
- paddw xmm4,xmm0
- pmulhw xmm4,[rel PW_F1847] ; xmm4=z5
- pmulhw xmm6,[rel PW_MF1613]
- pmulhw xmm0,[rel PW_F1082]
- psubw xmm6,xmm7
- psubw xmm0,xmm4 ; xmm0=tmp10
- paddw xmm6,xmm4 ; xmm6=tmp12
-
- ; -- Final output stage
-
- psubw xmm6,xmm5 ; xmm6=tmp6
- movdqa xmm7,xmm1
- movdqa xmm4,xmm3
- paddw xmm1,xmm5 ; xmm1=data0=(00 10 20 30 40 50 60 70)
- paddw xmm3,xmm6 ; xmm3=data1=(01 11 21 31 41 51 61 71)
- psraw xmm1,(PASS1_BITS+3) ; descale
- psraw xmm3,(PASS1_BITS+3) ; descale
- psubw xmm7,xmm5 ; xmm7=data7=(07 17 27 37 47 57 67 77)
- psubw xmm4,xmm6 ; xmm4=data6=(06 16 26 36 46 56 66 76)
- psraw xmm7,(PASS1_BITS+3) ; descale
- psraw xmm4,(PASS1_BITS+3) ; descale
- psubw xmm2,xmm6 ; xmm2=tmp5
-
- packsswb xmm1,xmm4 ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
- packsswb xmm3,xmm7 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
-
- movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp2
- movdqa xmm6, XMMWORD [wk(0)] ; xmm6=tmp3
-
- paddw xmm0,xmm2 ; xmm0=tmp4
- movdqa xmm4,xmm5
- movdqa xmm7,xmm6
- paddw xmm5,xmm2 ; xmm5=data2=(02 12 22 32 42 52 62 72)
- paddw xmm6,xmm0 ; xmm6=data4=(04 14 24 34 44 54 64 74)
- psraw xmm5,(PASS1_BITS+3) ; descale
- psraw xmm6,(PASS1_BITS+3) ; descale
- psubw xmm4,xmm2 ; xmm4=data5=(05 15 25 35 45 55 65 75)
- psubw xmm7,xmm0 ; xmm7=data3=(03 13 23 33 43 53 63 73)
- psraw xmm4,(PASS1_BITS+3) ; descale
- psraw xmm7,(PASS1_BITS+3) ; descale
-
- movdqa xmm2,[rel PB_CENTERJSAMP] ; xmm2=[rel PB_CENTERJSAMP]
-
- packsswb xmm5,xmm6 ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
- packsswb xmm7,xmm4 ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
-
- paddb xmm1,xmm2
- paddb xmm3,xmm2
- paddb xmm5,xmm2
- paddb xmm7,xmm2
-
- movdqa xmm0,xmm1 ; transpose coefficients(phase 1)
- punpcklbw xmm1,xmm3 ; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
- punpckhbw xmm0,xmm3 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
- movdqa xmm6,xmm5 ; transpose coefficients(phase 1)
- punpcklbw xmm5,xmm7 ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
- punpckhbw xmm6,xmm7 ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
-
- movdqa xmm4,xmm1 ; transpose coefficients(phase 2)
- punpcklwd xmm1,xmm5 ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
- punpckhwd xmm4,xmm5 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
- movdqa xmm2,xmm6 ; transpose coefficients(phase 2)
- punpcklwd xmm6,xmm0 ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
- punpckhwd xmm2,xmm0 ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
-
- movdqa xmm3,xmm1 ; transpose coefficients(phase 3)
- punpckldq xmm1,xmm6 ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
- punpckhdq xmm3,xmm6 ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
- movdqa xmm7,xmm4 ; transpose coefficients(phase 3)
- punpckldq xmm4,xmm2 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
- punpckhdq xmm7,xmm2 ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
-
- pshufd xmm5,xmm1,0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
- pshufd xmm0,xmm3,0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
- pshufd xmm6,xmm4,0x4E ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
- pshufd xmm2,xmm7,0x4E ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
-
- mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
- mov rsi, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
- movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1
- movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
- mov rdx, JSAMPROW [rdi+4*SIZEOF_JSAMPROW]
- mov rsi, JSAMPROW [rdi+6*SIZEOF_JSAMPROW]
- movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
- movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm7
-
- mov rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
- mov rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
- movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
- movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0
- mov rdx, JSAMPROW [rdi+5*SIZEOF_JSAMPROW]
- mov rsi, JSAMPROW [rdi+7*SIZEOF_JSAMPROW]
- movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
- movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2
-
- uncollect_args
- mov rsp,rbp ; rsp <- aligned rbp
- pop rsp ; rsp <- original rbp
- pop rbp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
diff --git a/media/libjpeg/simd/jidctfst-sse2.asm b/media/libjpeg/simd/jidctfst-sse2.asm
deleted file mode 100644
index f591e55f0f..0000000000
--- a/media/libjpeg/simd/jidctfst-sse2.asm
+++ /dev/null
@@ -1,501 +0,0 @@
-;
-; jidctfst.asm - fast integer IDCT (SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler) and
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a fast, not so accurate integer implementation of
-; the inverse DCT (Discrete Cosine Transform). The following code is
-; based directly on the IJG's original jidctfst.c; see jidctfst.c
-; for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS 8 ; 14 is also OK.
-%define PASS1_BITS 2
-
-%if IFAST_SCALE_BITS != PASS1_BITS
-%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
-%endif
-
-%if CONST_BITS == 8
-F_1_082 equ 277 ; FIX(1.082392200)
-F_1_414 equ 362 ; FIX(1.414213562)
-F_1_847 equ 473 ; FIX(1.847759065)
-F_2_613 equ 669 ; FIX(2.613125930)
-F_1_613 equ (F_2_613 - 256) ; FIX(2.613125930) - FIX(1)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
-F_1_082 equ DESCALE(1162209775,30-CONST_BITS) ; FIX(1.082392200)
-F_1_414 equ DESCALE(1518500249,30-CONST_BITS) ; FIX(1.414213562)
-F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065)
-F_2_613 equ DESCALE(2805822602,30-CONST_BITS) ; FIX(2.613125930)
-F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1)
-%endif
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
-; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
-; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
-
-%define PRE_MULTIPLY_SCALE_BITS 2
-%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
-
- alignz 16
- global EXTN(jconst_idct_ifast_sse2)
-
-EXTN(jconst_idct_ifast_sse2):
-
-PW_F1414 times 8 dw F_1_414 << CONST_SHIFT
-PW_F1847 times 8 dw F_1_847 << CONST_SHIFT
-PW_MF1613 times 8 dw -F_1_613 << CONST_SHIFT
-PW_F1082 times 8 dw F_1_082 << CONST_SHIFT
-PB_CENTERJSAMP times 16 db CENTERJSAMPLE
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
-;
-; Perform dequantization and inverse DCT on one block of coefficients.
-;
-; GLOBAL(void)
-; jsimd_idct_ifast_sse2 (void *dct_table, JCOEFPTR coef_block,
-; JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-%define dct_table(b) (b)+8 ; jpeg_component_info *compptr
-%define coef_block(b) (b)+12 ; JCOEFPTR coef_block
-%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf
-%define output_col(b) (b)+20 ; JDIMENSION output_col
-
-%define original_ebp ebp+0
-%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 2
-
- align 16
- global EXTN(jsimd_idct_ifast_sse2)
-
-EXTN(jsimd_idct_ifast_sse2):
- push ebp
- mov eax,esp ; eax = original ebp
- sub esp, byte 4
- and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [esp],eax
- mov ebp,esp ; ebp = aligned ebp
- lea esp, [wk(0)]
- pushpic ebx
-; push ecx ; unused
-; push edx ; need not be preserved
- push esi
- push edi
-
- get_GOT ebx ; get GOT address
-
- ; ---- Pass 1: process columns from input.
-
-; mov eax, [original_ebp]
- mov edx, POINTER [dct_table(eax)] ; quantptr
- mov esi, JCOEFPTR [coef_block(eax)] ; inptr
-
-%ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2
- mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
- or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
- jnz near .columnDCT
-
- movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
- movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
- por xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
- por xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
- por xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
- por xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
- por xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
- por xmm1,xmm0
- packsswb xmm1,xmm1
- packsswb xmm1,xmm1
- movd eax,xmm1
- test eax,eax
- jnz short .columnDCT
-
- ; -- AC terms all zero
-
- movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
- pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
- movdqa xmm7,xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07)
- punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
- punpckhwd xmm7,xmm7 ; xmm7=(04 04 05 05 06 06 07 07)
-
- pshufd xmm6,xmm0,0x00 ; xmm6=col0=(00 00 00 00 00 00 00 00)
- pshufd xmm2,xmm0,0x55 ; xmm2=col1=(01 01 01 01 01 01 01 01)
- pshufd xmm5,xmm0,0xAA ; xmm5=col2=(02 02 02 02 02 02 02 02)
- pshufd xmm0,xmm0,0xFF ; xmm0=col3=(03 03 03 03 03 03 03 03)
- pshufd xmm1,xmm7,0x00 ; xmm1=col4=(04 04 04 04 04 04 04 04)
- pshufd xmm4,xmm7,0x55 ; xmm4=col5=(05 05 05 05 05 05 05 05)
- pshufd xmm3,xmm7,0xAA ; xmm3=col6=(06 06 06 06 06 06 06 06)
- pshufd xmm7,xmm7,0xFF ; xmm7=col7=(07 07 07 07 07 07 07 07)
-
- movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=col1
- movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=col3
- jmp near .column_end
- alignx 16,7
-%endif
-.columnDCT:
-
- ; -- Even part
-
- movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
- movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
- pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
- pmullw xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)]
- movdqa xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
- movdqa xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
- pmullw xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)]
- pmullw xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-
- movdqa xmm4,xmm0
- movdqa xmm5,xmm1
- psubw xmm0,xmm2 ; xmm0=tmp11
- psubw xmm1,xmm3
- paddw xmm4,xmm2 ; xmm4=tmp10
- paddw xmm5,xmm3 ; xmm5=tmp13
-
- psllw xmm1,PRE_MULTIPLY_SCALE_BITS
- pmulhw xmm1,[GOTOFF(ebx,PW_F1414)]
- psubw xmm1,xmm5 ; xmm1=tmp12
-
- movdqa xmm6,xmm4
- movdqa xmm7,xmm0
- psubw xmm4,xmm5 ; xmm4=tmp3
- psubw xmm0,xmm1 ; xmm0=tmp2
- paddw xmm6,xmm5 ; xmm6=tmp0
- paddw xmm7,xmm1 ; xmm7=tmp1
-
- movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=tmp3
- movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=tmp2
-
- ; -- Odd part
-
- movdqa xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
- movdqa xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
- pmullw xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)]
- pmullw xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)]
- movdqa xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
- movdqa xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
- pmullw xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)]
- pmullw xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-
- movdqa xmm4,xmm2
- movdqa xmm0,xmm5
- psubw xmm2,xmm1 ; xmm2=z12
- psubw xmm5,xmm3 ; xmm5=z10
- paddw xmm4,xmm1 ; xmm4=z11
- paddw xmm0,xmm3 ; xmm0=z13
-
- movdqa xmm1,xmm5 ; xmm1=z10(unscaled)
- psllw xmm2,PRE_MULTIPLY_SCALE_BITS
- psllw xmm5,PRE_MULTIPLY_SCALE_BITS
-
- movdqa xmm3,xmm4
- psubw xmm4,xmm0
- paddw xmm3,xmm0 ; xmm3=tmp7
-
- psllw xmm4,PRE_MULTIPLY_SCALE_BITS
- pmulhw xmm4,[GOTOFF(ebx,PW_F1414)] ; xmm4=tmp11
-
- ; To avoid overflow...
- ;
- ; (Original)
- ; tmp12 = -2.613125930 * z10 + z5;
- ;
- ; (This implementation)
- ; tmp12 = (-1.613125930 - 1) * z10 + z5;
- ; = -1.613125930 * z10 - z10 + z5;
-
- movdqa xmm0,xmm5
- paddw xmm5,xmm2
- pmulhw xmm5,[GOTOFF(ebx,PW_F1847)] ; xmm5=z5
- pmulhw xmm0,[GOTOFF(ebx,PW_MF1613)]
- pmulhw xmm2,[GOTOFF(ebx,PW_F1082)]
- psubw xmm0,xmm1
- psubw xmm2,xmm5 ; xmm2=tmp10
- paddw xmm0,xmm5 ; xmm0=tmp12
-
- ; -- Final output stage
-
- psubw xmm0,xmm3 ; xmm0=tmp6
- movdqa xmm1,xmm6
- movdqa xmm5,xmm7
- paddw xmm6,xmm3 ; xmm6=data0=(00 01 02 03 04 05 06 07)
- paddw xmm7,xmm0 ; xmm7=data1=(10 11 12 13 14 15 16 17)
- psubw xmm1,xmm3 ; xmm1=data7=(70 71 72 73 74 75 76 77)
- psubw xmm5,xmm0 ; xmm5=data6=(60 61 62 63 64 65 66 67)
- psubw xmm4,xmm0 ; xmm4=tmp5
-
- movdqa xmm3,xmm6 ; transpose coefficients(phase 1)
- punpcklwd xmm6,xmm7 ; xmm6=(00 10 01 11 02 12 03 13)
- punpckhwd xmm3,xmm7 ; xmm3=(04 14 05 15 06 16 07 17)
- movdqa xmm0,xmm5 ; transpose coefficients(phase 1)
- punpcklwd xmm5,xmm1 ; xmm5=(60 70 61 71 62 72 63 73)
- punpckhwd xmm0,xmm1 ; xmm0=(64 74 65 75 66 76 67 77)
-
- movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp2
- movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp3
-
- movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(60 70 61 71 62 72 63 73)
- movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(64 74 65 75 66 76 67 77)
-
- paddw xmm2,xmm4 ; xmm2=tmp4
- movdqa xmm5,xmm7
- movdqa xmm0,xmm1
- paddw xmm7,xmm4 ; xmm7=data2=(20 21 22 23 24 25 26 27)
- paddw xmm1,xmm2 ; xmm1=data4=(40 41 42 43 44 45 46 47)
- psubw xmm5,xmm4 ; xmm5=data5=(50 51 52 53 54 55 56 57)
- psubw xmm0,xmm2 ; xmm0=data3=(30 31 32 33 34 35 36 37)
-
- movdqa xmm4,xmm7 ; transpose coefficients(phase 1)
- punpcklwd xmm7,xmm0 ; xmm7=(20 30 21 31 22 32 23 33)
- punpckhwd xmm4,xmm0 ; xmm4=(24 34 25 35 26 36 27 37)
- movdqa xmm2,xmm1 ; transpose coefficients(phase 1)
- punpcklwd xmm1,xmm5 ; xmm1=(40 50 41 51 42 52 43 53)
- punpckhwd xmm2,xmm5 ; xmm2=(44 54 45 55 46 56 47 57)
-
- movdqa xmm0,xmm3 ; transpose coefficients(phase 2)
- punpckldq xmm3,xmm4 ; xmm3=(04 14 24 34 05 15 25 35)
- punpckhdq xmm0,xmm4 ; xmm0=(06 16 26 36 07 17 27 37)
- movdqa xmm5,xmm6 ; transpose coefficients(phase 2)
- punpckldq xmm6,xmm7 ; xmm6=(00 10 20 30 01 11 21 31)
- punpckhdq xmm5,xmm7 ; xmm5=(02 12 22 32 03 13 23 33)
-
- movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(60 70 61 71 62 72 63 73)
- movdqa xmm7, XMMWORD [wk(1)] ; xmm7=(64 74 65 75 66 76 67 77)
-
- movdqa XMMWORD [wk(0)], xmm3 ; wk(0)=(04 14 24 34 05 15 25 35)
- movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(06 16 26 36 07 17 27 37)
-
- movdqa xmm3,xmm1 ; transpose coefficients(phase 2)
- punpckldq xmm1,xmm4 ; xmm1=(40 50 60 70 41 51 61 71)
- punpckhdq xmm3,xmm4 ; xmm3=(42 52 62 72 43 53 63 73)
- movdqa xmm0,xmm2 ; transpose coefficients(phase 2)
- punpckldq xmm2,xmm7 ; xmm2=(44 54 64 74 45 55 65 75)
- punpckhdq xmm0,xmm7 ; xmm0=(46 56 66 76 47 57 67 77)
-
- movdqa xmm4,xmm6 ; transpose coefficients(phase 3)
- punpcklqdq xmm6,xmm1 ; xmm6=col0=(00 10 20 30 40 50 60 70)
- punpckhqdq xmm4,xmm1 ; xmm4=col1=(01 11 21 31 41 51 61 71)
- movdqa xmm7,xmm5 ; transpose coefficients(phase 3)
- punpcklqdq xmm5,xmm3 ; xmm5=col2=(02 12 22 32 42 52 62 72)
- punpckhqdq xmm7,xmm3 ; xmm7=col3=(03 13 23 33 43 53 63 73)
-
- movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(04 14 24 34 05 15 25 35)
- movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(06 16 26 36 07 17 27 37)
-
- movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=col1
- movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=col3
-
- movdqa xmm4,xmm1 ; transpose coefficients(phase 3)
- punpcklqdq xmm1,xmm2 ; xmm1=col4=(04 14 24 34 44 54 64 74)
- punpckhqdq xmm4,xmm2 ; xmm4=col5=(05 15 25 35 45 55 65 75)
- movdqa xmm7,xmm3 ; transpose coefficients(phase 3)
- punpcklqdq xmm3,xmm0 ; xmm3=col6=(06 16 26 36 46 56 66 76)
- punpckhqdq xmm7,xmm0 ; xmm7=col7=(07 17 27 37 47 57 67 77)
-.column_end:
-
- ; -- Prefetch the next coefficient block
-
- prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
- prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
- prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
- prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
-
- ; ---- Pass 2: process rows from work array, store into output array.
-
- mov eax, [original_ebp]
- mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
- mov eax, JDIMENSION [output_col(eax)]
-
- ; -- Even part
-
- ; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6
-
- movdqa xmm2,xmm6
- movdqa xmm0,xmm5
- psubw xmm6,xmm1 ; xmm6=tmp11
- psubw xmm5,xmm3
- paddw xmm2,xmm1 ; xmm2=tmp10
- paddw xmm0,xmm3 ; xmm0=tmp13
-
- psllw xmm5,PRE_MULTIPLY_SCALE_BITS
- pmulhw xmm5,[GOTOFF(ebx,PW_F1414)]
- psubw xmm5,xmm0 ; xmm5=tmp12
-
- movdqa xmm1,xmm2
- movdqa xmm3,xmm6
- psubw xmm2,xmm0 ; xmm2=tmp3
- psubw xmm6,xmm5 ; xmm6=tmp2
- paddw xmm1,xmm0 ; xmm1=tmp0
- paddw xmm3,xmm5 ; xmm3=tmp1
-
- movdqa xmm0, XMMWORD [wk(0)] ; xmm0=col1
- movdqa xmm5, XMMWORD [wk(1)] ; xmm5=col3
-
- movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp3
- movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp2
-
- ; -- Odd part
-
- ; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7
-
- movdqa xmm2,xmm0
- movdqa xmm6,xmm4
- psubw xmm0,xmm7 ; xmm0=z12
- psubw xmm4,xmm5 ; xmm4=z10
- paddw xmm2,xmm7 ; xmm2=z11
- paddw xmm6,xmm5 ; xmm6=z13
-
- movdqa xmm7,xmm4 ; xmm7=z10(unscaled)
- psllw xmm0,PRE_MULTIPLY_SCALE_BITS
- psllw xmm4,PRE_MULTIPLY_SCALE_BITS
-
- movdqa xmm5,xmm2
- psubw xmm2,xmm6
- paddw xmm5,xmm6 ; xmm5=tmp7
-
- psllw xmm2,PRE_MULTIPLY_SCALE_BITS
- pmulhw xmm2,[GOTOFF(ebx,PW_F1414)] ; xmm2=tmp11
-
- ; To avoid overflow...
- ;
- ; (Original)
- ; tmp12 = -2.613125930 * z10 + z5;
- ;
- ; (This implementation)
- ; tmp12 = (-1.613125930 - 1) * z10 + z5;
- ; = -1.613125930 * z10 - z10 + z5;
-
- movdqa xmm6,xmm4
- paddw xmm4,xmm0
- pmulhw xmm4,[GOTOFF(ebx,PW_F1847)] ; xmm4=z5
- pmulhw xmm6,[GOTOFF(ebx,PW_MF1613)]
- pmulhw xmm0,[GOTOFF(ebx,PW_F1082)]
- psubw xmm6,xmm7
- psubw xmm0,xmm4 ; xmm0=tmp10
- paddw xmm6,xmm4 ; xmm6=tmp12
-
- ; -- Final output stage
-
- psubw xmm6,xmm5 ; xmm6=tmp6
- movdqa xmm7,xmm1
- movdqa xmm4,xmm3
- paddw xmm1,xmm5 ; xmm1=data0=(00 10 20 30 40 50 60 70)
- paddw xmm3,xmm6 ; xmm3=data1=(01 11 21 31 41 51 61 71)
- psraw xmm1,(PASS1_BITS+3) ; descale
- psraw xmm3,(PASS1_BITS+3) ; descale
- psubw xmm7,xmm5 ; xmm7=data7=(07 17 27 37 47 57 67 77)
- psubw xmm4,xmm6 ; xmm4=data6=(06 16 26 36 46 56 66 76)
- psraw xmm7,(PASS1_BITS+3) ; descale
- psraw xmm4,(PASS1_BITS+3) ; descale
- psubw xmm2,xmm6 ; xmm2=tmp5
-
- packsswb xmm1,xmm4 ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
- packsswb xmm3,xmm7 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
-
- movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp2
- movdqa xmm6, XMMWORD [wk(0)] ; xmm6=tmp3
-
- paddw xmm0,xmm2 ; xmm0=tmp4
- movdqa xmm4,xmm5
- movdqa xmm7,xmm6
- paddw xmm5,xmm2 ; xmm5=data2=(02 12 22 32 42 52 62 72)
- paddw xmm6,xmm0 ; xmm6=data4=(04 14 24 34 44 54 64 74)
- psraw xmm5,(PASS1_BITS+3) ; descale
- psraw xmm6,(PASS1_BITS+3) ; descale
- psubw xmm4,xmm2 ; xmm4=data5=(05 15 25 35 45 55 65 75)
- psubw xmm7,xmm0 ; xmm7=data3=(03 13 23 33 43 53 63 73)
- psraw xmm4,(PASS1_BITS+3) ; descale
- psraw xmm7,(PASS1_BITS+3) ; descale
-
- movdqa xmm2,[GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm2=[PB_CENTERJSAMP]
-
- packsswb xmm5,xmm6 ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
- packsswb xmm7,xmm4 ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
-
- paddb xmm1,xmm2
- paddb xmm3,xmm2
- paddb xmm5,xmm2
- paddb xmm7,xmm2
-
- movdqa xmm0,xmm1 ; transpose coefficients(phase 1)
- punpcklbw xmm1,xmm3 ; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
- punpckhbw xmm0,xmm3 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
- movdqa xmm6,xmm5 ; transpose coefficients(phase 1)
- punpcklbw xmm5,xmm7 ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
- punpckhbw xmm6,xmm7 ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
-
- movdqa xmm4,xmm1 ; transpose coefficients(phase 2)
- punpcklwd xmm1,xmm5 ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
- punpckhwd xmm4,xmm5 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
- movdqa xmm2,xmm6 ; transpose coefficients(phase 2)
- punpcklwd xmm6,xmm0 ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
- punpckhwd xmm2,xmm0 ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
-
- movdqa xmm3,xmm1 ; transpose coefficients(phase 3)
- punpckldq xmm1,xmm6 ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
- punpckhdq xmm3,xmm6 ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
- movdqa xmm7,xmm4 ; transpose coefficients(phase 3)
- punpckldq xmm4,xmm2 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
- punpckhdq xmm7,xmm2 ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
-
- pshufd xmm5,xmm1,0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
- pshufd xmm0,xmm3,0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
- pshufd xmm6,xmm4,0x4E ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
- pshufd xmm2,xmm7,0x4E ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
-
- mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
- mov esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
- movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm1
- movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
- mov edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW]
- mov esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW]
- movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
- movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm7
-
- mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
- mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
- movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5
- movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0
- mov edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW]
- mov esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW]
- movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
- movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm2
-
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; unused
- poppic ebx
- mov esp,ebp ; esp <- aligned ebp
- pop esp ; esp <- original ebp
- pop ebp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
diff --git a/media/libjpeg/simd/jidctint-mmx.asm b/media/libjpeg/simd/jidctint-mmx.asm
deleted file mode 100644
index 5bd198120b..0000000000
--- a/media/libjpeg/simd/jidctint-mmx.asm
+++ /dev/null
@@ -1,851 +0,0 @@
-;
-; jidctint.asm - accurate integer IDCT (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler) and
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a slow-but-accurate integer implementation of the
-; inverse DCT (Discrete Cosine Transform). The following code is based
-; directly on the IJG's original jidctint.c; see jidctint.c for
-; more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS 13
-%define PASS1_BITS 2
-
-%define DESCALE_P1 (CONST_BITS-PASS1_BITS)
-%define DESCALE_P2 (CONST_BITS+PASS1_BITS+3)
-
-%if CONST_BITS == 13
-F_0_298 equ 2446 ; FIX(0.298631336)
-F_0_390 equ 3196 ; FIX(0.390180644)
-F_0_541 equ 4433 ; FIX(0.541196100)
-F_0_765 equ 6270 ; FIX(0.765366865)
-F_0_899 equ 7373 ; FIX(0.899976223)
-F_1_175 equ 9633 ; FIX(1.175875602)
-F_1_501 equ 12299 ; FIX(1.501321110)
-F_1_847 equ 15137 ; FIX(1.847759065)
-F_1_961 equ 16069 ; FIX(1.961570560)
-F_2_053 equ 16819 ; FIX(2.053119869)
-F_2_562 equ 20995 ; FIX(2.562915447)
-F_3_072 equ 25172 ; FIX(3.072711026)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
-F_0_298 equ DESCALE( 320652955,30-CONST_BITS) ; FIX(0.298631336)
-F_0_390 equ DESCALE( 418953276,30-CONST_BITS) ; FIX(0.390180644)
-F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100)
-F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865)
-F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223)
-F_1_175 equ DESCALE(1262586813,30-CONST_BITS) ; FIX(1.175875602)
-F_1_501 equ DESCALE(1612031267,30-CONST_BITS) ; FIX(1.501321110)
-F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065)
-F_1_961 equ DESCALE(2106220350,30-CONST_BITS) ; FIX(1.961570560)
-F_2_053 equ DESCALE(2204520673,30-CONST_BITS) ; FIX(2.053119869)
-F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447)
-F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026)
-%endif
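DESCALE(x,n) is a round-to-nearest right shift: add half of 2^n, then shift. It reproduces the CONST_BITS == 13 table entries above from the 30-bit values; a standalone check:

#include <stdio.h>

#define DESCALE(x, n) (((x) + (1L << ((n) - 1))) >> (n))

int main(void) {
  printf("%ld\n", DESCALE(1984016188L, 30 - 13));  /* 15137 == F_1_847 */
  return 0;
}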
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
- alignz 16
- global EXTN(jconst_idct_islow_mmx)
-
-EXTN(jconst_idct_islow_mmx):
-
-PW_F130_F054 times 2 dw (F_0_541+F_0_765), F_0_541
-PW_F054_MF130 times 2 dw F_0_541, (F_0_541-F_1_847)
-PW_MF078_F117 times 2 dw (F_1_175-F_1_961), F_1_175
-PW_F117_F078 times 2 dw F_1_175, (F_1_175-F_0_390)
-PW_MF060_MF089 times 2 dw (F_0_298-F_0_899),-F_0_899
-PW_MF089_F060 times 2 dw -F_0_899, (F_1_501-F_0_899)
-PW_MF050_MF256 times 2 dw (F_2_053-F_2_562),-F_2_562
-PW_MF256_F050 times 2 dw -F_2_562, (F_3_072-F_2_562)
-PD_DESCALE_P1 times 2 dd 1 << (DESCALE_P1-1)
-PD_DESCALE_P2 times 2 dd 1 << (DESCALE_P2-1)
-PB_CENTERJSAMP times 8 db CENTERJSAMPLE
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
-;
-; Perform dequantization and inverse DCT on one block of coefficients.
-;
-; GLOBAL(void)
-; jsimd_idct_islow_mmx (void *dct_table, JCOEFPTR coef_block,
-; JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-%define dct_table(b) (b)+8 ; jpeg_component_info *compptr
-%define coef_block(b) (b)+12 ; JCOEFPTR coef_block
-%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf
-%define output_col(b) (b)+20 ; JDIMENSION output_col
-
-%define original_ebp ebp+0
-%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM]
-%define WK_NUM 12
-%define workspace wk(0)-DCTSIZE2*SIZEOF_JCOEF
- ; JCOEF workspace[DCTSIZE2]
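This kernel keeps a JCOEF workspace on the aligned stack and runs the 2-D IDCT as two 1-D passes: columns into the workspace, then workspace rows into samples. A skeleton of the control flow (idct_col/idct_row are placeholder names, not functions in this file):

#include <stdint.h>

/* Two-pass 8x8 IDCT skeleton; pass bodies elided, structure as in the asm. */
static void idct_2d(const int16_t coef[64], const int16_t quant[64],
                    uint8_t *output_buf[8], unsigned output_col) {
  int16_t workspace[64];
  for (int c = 0; c < 8; c += 4) {
    /* Pass 1: dequantize + 1-D IDCT of 4 columns -> workspace */
  }
  for (int r = 0; r < 8; r += 4) {
    /* Pass 2: 1-D IDCT of 4 workspace rows -> output_buf[r..r+3] + output_col */
  }
  (void)coef; (void)quant; (void)output_buf; (void)output_col; (void)workspace;
}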
-
- align 16
- global EXTN(jsimd_idct_islow_mmx)
-
-EXTN(jsimd_idct_islow_mmx):
- push ebp
- mov eax,esp ; eax = original ebp
- sub esp, byte 4
- and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
- mov [esp],eax
- mov ebp,esp ; ebp = aligned ebp
- lea esp, [workspace]
- push ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- get_GOT ebx ; get GOT address
-
- ; ---- Pass 1: process columns from input, store into work array.
-
-; mov eax, [original_ebp]
- mov edx, POINTER [dct_table(eax)] ; quantptr
- mov esi, JCOEFPTR [coef_block(eax)] ; inptr
- lea edi, [workspace] ; JCOEF *wsptr
- mov ecx, DCTSIZE/4 ; ctr
- alignx 16,7
-.columnloop:
-%ifndef NO_ZERO_COLUMN_TEST_ISLOW_MMX
- mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
- or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
- jnz short .columnDCT
-
- movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
- movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
- por mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
- por mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
- por mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
- por mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
- por mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
- por mm1,mm0
- packsswb mm1,mm1
- movd eax,mm1
- test eax,eax
- jnz short .columnDCT
-
- ; -- AC terms all zero
-
- movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
- pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
- psllw mm0,PASS1_BITS
-
- movq mm2,mm0 ; mm0=in0=(00 01 02 03)
- punpcklwd mm0,mm0 ; mm0=(00 00 01 01)
- punpckhwd mm2,mm2 ; mm2=(02 02 03 03)
-
- movq mm1,mm0
- punpckldq mm0,mm0 ; mm0=(00 00 00 00)
- punpckhdq mm1,mm1 ; mm1=(01 01 01 01)
- movq mm3,mm2
- punpckldq mm2,mm2 ; mm2=(02 02 02 02)
- punpckhdq mm3,mm3 ; mm3=(03 03 03 03)
-
- movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
- movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0
- movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
- movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm1
- movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
- movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
- movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
- movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3
- jmp near .nextcolumn
- alignx 16,7
-%endif
-.columnDCT:
-
- ; -- Even part
-
- movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
- movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
- pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
- movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
- pmullw mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
- ; (Original)
- ; z1 = (z2 + z3) * 0.541196100;
- ; tmp2 = z1 + z3 * -1.847759065;
- ; tmp3 = z1 + z2 * 0.765366865;
- ;
- ; (This implementation)
- ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
- ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
-
- movq mm4,mm1 ; mm1=in2=z2
- movq mm5,mm1
- punpcklwd mm4,mm3 ; mm3=in6=z3
- punpckhwd mm5,mm3
- movq mm1,mm4
- movq mm3,mm5
- pmaddwd mm4,[GOTOFF(ebx,PW_F130_F054)] ; mm4=tmp3L
- pmaddwd mm5,[GOTOFF(ebx,PW_F130_F054)] ; mm5=tmp3H
- pmaddwd mm1,[GOTOFF(ebx,PW_F054_MF130)] ; mm1=tmp2L
- pmaddwd mm3,[GOTOFF(ebx,PW_F054_MF130)] ; mm3=tmp2H
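The rewrite in the comment above folds z1's shared multiply into per-operand constants, so tmp2 and tmp3 each become one dot product over an interleaved (z2, z3) pair, which is exactly what one pmaddwd computes per 32-bit lane. The algebra, checked in plain C (sketch):

#include <stdio.h>

int main(void) {
  double z2 = 3, z3 = -7;
  double z1 = (z2 + z3) * 0.541196100;                /* original form */
  double a = z1 + z3 * -1.847759065;                  /* tmp2 */
  double b = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
  printf("%f %f\n", a, b);                            /* identical */
  return 0;
}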
-
- movq mm6,mm0
- paddw mm0,mm2 ; mm0=in0+in4
- psubw mm6,mm2 ; mm6=in0-in4
-
- pxor mm7,mm7
- pxor mm2,mm2
- punpcklwd mm7,mm0 ; mm7=tmp0L
- punpckhwd mm2,mm0 ; mm2=tmp0H
- psrad mm7,(16-CONST_BITS) ; psrad mm7,16 & pslld mm7,CONST_BITS
- psrad mm2,(16-CONST_BITS) ; psrad mm2,16 & pslld mm2,CONST_BITS
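The pxor/punpcklwd/psrad idiom above widens words to dwords and pre-scales in one step: with the word in the top half of the lane, an arithmetic shift right by (16 - CONST_BITS) equals sign extension followed by scaling up by 2^CONST_BITS, as the inline comments note. Scalar check (sketch; assumes arithmetic >> on negative int, true of mainstream compilers):

#include <stdio.h>
#include <stdint.h>

int main(void) {
  const int CONST_BITS = 13;
  int16_t w = -123;
  int32_t lane = (int32_t)((uint32_t)(uint16_t)w << 16);  /* punpcklwd with 0 */
  printf("%d %d\n", lane >> (16 - CONST_BITS),            /* psrad */
         (int32_t)w * (1 << CONST_BITS));                 /* same value */
  return 0;
}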
-
- movq mm0,mm7
- paddd mm7,mm4 ; mm7=tmp10L
- psubd mm0,mm4 ; mm0=tmp13L
- movq mm4,mm2
- paddd mm2,mm5 ; mm2=tmp10H
- psubd mm4,mm5 ; mm4=tmp13H
-
- movq MMWORD [wk(0)], mm7 ; wk(0)=tmp10L
- movq MMWORD [wk(1)], mm2 ; wk(1)=tmp10H
- movq MMWORD [wk(2)], mm0 ; wk(2)=tmp13L
- movq MMWORD [wk(3)], mm4 ; wk(3)=tmp13H
-
- pxor mm5,mm5
- pxor mm7,mm7
- punpcklwd mm5,mm6 ; mm5=tmp1L
- punpckhwd mm7,mm6 ; mm7=tmp1H
- psrad mm5,(16-CONST_BITS) ; psrad mm5,16 & pslld mm5,CONST_BITS
- psrad mm7,(16-CONST_BITS) ; psrad mm7,16 & pslld mm7,CONST_BITS
-
- movq mm2,mm5
- paddd mm5,mm1 ; mm5=tmp11L
- psubd mm2,mm1 ; mm2=tmp12L
- movq mm0,mm7
- paddd mm7,mm3 ; mm7=tmp11H
- psubd mm0,mm3 ; mm0=tmp12H
-
- movq MMWORD [wk(4)], mm5 ; wk(4)=tmp11L
- movq MMWORD [wk(5)], mm7 ; wk(5)=tmp11H
- movq MMWORD [wk(6)], mm2 ; wk(6)=tmp12L
- movq MMWORD [wk(7)], mm0 ; wk(7)=tmp12H
-
- ; -- Odd part
-
- movq mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
- movq mm6, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
- pmullw mm4, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw mm6, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- movq mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
- movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
- pmullw mm1, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
- movq mm5,mm6
- movq mm7,mm4
- paddw mm5,mm3 ; mm5=z3
- paddw mm7,mm1 ; mm7=z4
-
- ; (Original)
- ; z5 = (z3 + z4) * 1.175875602;
- ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
- ; z3 += z5; z4 += z5;
- ;
- ; (This implementation)
- ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
- ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
- movq mm2,mm5
- movq mm0,mm5
- punpcklwd mm2,mm7
- punpckhwd mm0,mm7
- movq mm5,mm2
- movq mm7,mm0
- pmaddwd mm2,[GOTOFF(ebx,PW_MF078_F117)] ; mm2=z3L
- pmaddwd mm0,[GOTOFF(ebx,PW_MF078_F117)] ; mm0=z3H
- pmaddwd mm5,[GOTOFF(ebx,PW_F117_F078)] ; mm5=z4L
- pmaddwd mm7,[GOTOFF(ebx,PW_F117_F078)] ; mm7=z4H
-
- movq MMWORD [wk(10)], mm2 ; wk(10)=z3L
- movq MMWORD [wk(11)], mm0 ; wk(11)=z3H
-
- ; (Original)
- ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
- ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
- ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
- ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
- ; tmp0 += z1 + z3; tmp1 += z2 + z4;
- ; tmp2 += z2 + z3; tmp3 += z1 + z4;
- ;
- ; (This implementation)
- ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
- ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
- ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
- ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
- ; tmp0 += z3; tmp1 += z4;
- ; tmp2 += z3; tmp3 += z4;
-
- movq mm2,mm3
- movq mm0,mm3
- punpcklwd mm2,mm4
- punpckhwd mm0,mm4
- movq mm3,mm2
- movq mm4,mm0
- pmaddwd mm2,[GOTOFF(ebx,PW_MF060_MF089)] ; mm2=tmp0L
- pmaddwd mm0,[GOTOFF(ebx,PW_MF060_MF089)] ; mm0=tmp0H
- pmaddwd mm3,[GOTOFF(ebx,PW_MF089_F060)] ; mm3=tmp3L
- pmaddwd mm4,[GOTOFF(ebx,PW_MF089_F060)] ; mm4=tmp3H
-
- paddd mm2, MMWORD [wk(10)] ; mm2=tmp0L
- paddd mm0, MMWORD [wk(11)] ; mm0=tmp0H
- paddd mm3,mm5 ; mm3=tmp3L
- paddd mm4,mm7 ; mm4=tmp3H
-
- movq MMWORD [wk(8)], mm2 ; wk(8)=tmp0L
- movq MMWORD [wk(9)], mm0 ; wk(9)=tmp0H
-
- movq mm2,mm1
- movq mm0,mm1
- punpcklwd mm2,mm6
- punpckhwd mm0,mm6
- movq mm1,mm2
- movq mm6,mm0
- pmaddwd mm2,[GOTOFF(ebx,PW_MF050_MF256)] ; mm2=tmp1L
- pmaddwd mm0,[GOTOFF(ebx,PW_MF050_MF256)] ; mm0=tmp1H
- pmaddwd mm1,[GOTOFF(ebx,PW_MF256_F050)] ; mm1=tmp2L
- pmaddwd mm6,[GOTOFF(ebx,PW_MF256_F050)] ; mm6=tmp2H
-
- paddd mm2,mm5 ; mm2=tmp1L
- paddd mm0,mm7 ; mm0=tmp1H
- paddd mm1, MMWORD [wk(10)] ; mm1=tmp2L
- paddd mm6, MMWORD [wk(11)] ; mm6=tmp2H
-
- movq MMWORD [wk(10)], mm2 ; wk(10)=tmp1L
- movq MMWORD [wk(11)], mm0 ; wk(11)=tmp1H
-
- ; -- Final output stage
-
- movq mm5, MMWORD [wk(0)] ; mm5=tmp10L
- movq mm7, MMWORD [wk(1)] ; mm7=tmp10H
-
- movq mm2,mm5
- movq mm0,mm7
- paddd mm5,mm3 ; mm5=data0L
- paddd mm7,mm4 ; mm7=data0H
- psubd mm2,mm3 ; mm2=data7L
- psubd mm0,mm4 ; mm0=data7H
-
- movq mm3,[GOTOFF(ebx,PD_DESCALE_P1)] ; mm3=[PD_DESCALE_P1]
-
- paddd mm5,mm3
- paddd mm7,mm3
- psrad mm5,DESCALE_P1
- psrad mm7,DESCALE_P1
- paddd mm2,mm3
- paddd mm0,mm3
- psrad mm2,DESCALE_P1
- psrad mm0,DESCALE_P1
-
- packssdw mm5,mm7 ; mm5=data0=(00 01 02 03)
- packssdw mm2,mm0 ; mm2=data7=(70 71 72 73)
-
- movq mm4, MMWORD [wk(4)] ; mm4=tmp11L
- movq mm3, MMWORD [wk(5)] ; mm3=tmp11H
-
- movq mm7,mm4
- movq mm0,mm3
- paddd mm4,mm1 ; mm4=data1L
- paddd mm3,mm6 ; mm3=data1H
- psubd mm7,mm1 ; mm7=data6L
- psubd mm0,mm6 ; mm0=data6H
-
- movq mm1,[GOTOFF(ebx,PD_DESCALE_P1)] ; mm1=[PD_DESCALE_P1]
-
- paddd mm4,mm1
- paddd mm3,mm1
- psrad mm4,DESCALE_P1
- psrad mm3,DESCALE_P1
- paddd mm7,mm1
- paddd mm0,mm1
- psrad mm7,DESCALE_P1
- psrad mm0,DESCALE_P1
-
- packssdw mm4,mm3 ; mm4=data1=(10 11 12 13)
- packssdw mm7,mm0 ; mm7=data6=(60 61 62 63)
-
- movq mm6,mm5 ; transpose coefficients(phase 1)
- punpcklwd mm5,mm4 ; mm5=(00 10 01 11)
- punpckhwd mm6,mm4 ; mm6=(02 12 03 13)
- movq mm1,mm7 ; transpose coefficients(phase 1)
- punpcklwd mm7,mm2 ; mm7=(60 70 61 71)
- punpckhwd mm1,mm2 ; mm1=(62 72 63 73)
-
- movq mm3, MMWORD [wk(6)] ; mm3=tmp12L
- movq mm0, MMWORD [wk(7)] ; mm0=tmp12H
- movq mm4, MMWORD [wk(10)] ; mm4=tmp1L
- movq mm2, MMWORD [wk(11)] ; mm2=tmp1H
-
- movq MMWORD [wk(0)], mm5 ; wk(0)=(00 10 01 11)
- movq MMWORD [wk(1)], mm6 ; wk(1)=(02 12 03 13)
- movq MMWORD [wk(4)], mm7 ; wk(4)=(60 70 61 71)
- movq MMWORD [wk(5)], mm1 ; wk(5)=(62 72 63 73)
-
- movq mm5,mm3
- movq mm6,mm0
- paddd mm3,mm4 ; mm3=data2L
- paddd mm0,mm2 ; mm0=data2H
- psubd mm5,mm4 ; mm5=data5L
- psubd mm6,mm2 ; mm6=data5H
-
- movq mm7,[GOTOFF(ebx,PD_DESCALE_P1)] ; mm7=[PD_DESCALE_P1]
-
- paddd mm3,mm7
- paddd mm0,mm7
- psrad mm3,DESCALE_P1
- psrad mm0,DESCALE_P1
- paddd mm5,mm7
- paddd mm6,mm7
- psrad mm5,DESCALE_P1
- psrad mm6,DESCALE_P1
-
- packssdw mm3,mm0 ; mm3=data2=(20 21 22 23)
- packssdw mm5,mm6 ; mm5=data5=(50 51 52 53)
-
- movq mm1, MMWORD [wk(2)] ; mm1=tmp13L
- movq mm4, MMWORD [wk(3)] ; mm4=tmp13H
- movq mm2, MMWORD [wk(8)] ; mm2=tmp0L
- movq mm7, MMWORD [wk(9)] ; mm7=tmp0H
-
- movq mm0,mm1
- movq mm6,mm4
- paddd mm1,mm2 ; mm1=data3L
- paddd mm4,mm7 ; mm4=data3H
- psubd mm0,mm2 ; mm0=data4L
- psubd mm6,mm7 ; mm6=data4H
-
- movq mm2,[GOTOFF(ebx,PD_DESCALE_P1)] ; mm2=[PD_DESCALE_P1]
-
- paddd mm1,mm2
- paddd mm4,mm2
- psrad mm1,DESCALE_P1
- psrad mm4,DESCALE_P1
- paddd mm0,mm2
- paddd mm6,mm2
- psrad mm0,DESCALE_P1
- psrad mm6,DESCALE_P1
-
- packssdw mm1,mm4 ; mm1=data3=(30 31 32 33)
- packssdw mm0,mm6 ; mm0=data4=(40 41 42 43)
-
- movq mm7, MMWORD [wk(0)] ; mm7=(00 10 01 11)
- movq mm2, MMWORD [wk(1)] ; mm2=(02 12 03 13)
-
- movq mm4,mm3 ; transpose coefficients(phase 1)
- punpcklwd mm3,mm1 ; mm3=(20 30 21 31)
- punpckhwd mm4,mm1 ; mm4=(22 32 23 33)
- movq mm6,mm0 ; transpose coefficients(phase 1)
- punpcklwd mm0,mm5 ; mm0=(40 50 41 51)
- punpckhwd mm6,mm5 ; mm6=(42 52 43 53)
-
- movq mm1,mm7 ; transpose coefficients(phase 2)
- punpckldq mm7,mm3 ; mm7=(00 10 20 30)
- punpckhdq mm1,mm3 ; mm1=(01 11 21 31)
- movq mm5,mm2 ; transpose coefficients(phase 2)
- punpckldq mm2,mm4 ; mm2=(02 12 22 32)
- punpckhdq mm5,mm4 ; mm5=(03 13 23 33)
-
- movq mm3, MMWORD [wk(4)] ; mm3=(60 70 61 71)
- movq mm4, MMWORD [wk(5)] ; mm4=(62 72 63 73)
-
- movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm7
- movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
- movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
- movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm5
-
- movq mm7,mm0 ; transpose coefficients(phase 2)
- punpckldq mm0,mm3 ; mm0=(40 50 60 70)
- punpckhdq mm7,mm3 ; mm7=(41 51 61 71)
- movq mm1,mm6 ; transpose coefficients(phase 2)
- punpckldq mm6,mm4 ; mm6=(42 52 62 72)
- punpckhdq mm1,mm4 ; mm1=(43 53 63 73)
-
- movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0
- movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm7
- movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm6
- movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm1
-
-.nextcolumn:
- add esi, byte 4*SIZEOF_JCOEF ; coef_block
- add edx, byte 4*SIZEOF_ISLOW_MULT_TYPE ; quantptr
- add edi, byte 4*DCTSIZE*SIZEOF_JCOEF ; wsptr
- dec ecx ; ctr
- jnz near .columnloop
-
- ; ---- Pass 2: process rows from work array, store into output array.
-
- mov eax, [original_ebp]
- lea esi, [workspace] ; JCOEF *wsptr
- mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
- mov eax, JDIMENSION [output_col(eax)]
- mov ecx, DCTSIZE/4 ; ctr
- alignx 16,7
-.rowloop:
-
- ; -- Even part
-
- movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
- movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
- movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
- movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-
- ; (Original)
- ; z1 = (z2 + z3) * 0.541196100;
- ; tmp2 = z1 + z3 * -1.847759065;
- ; tmp3 = z1 + z2 * 0.765366865;
- ;
- ; (This implementation)
- ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
- ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
-
- movq mm4,mm1 ; mm1=in2=z2
- movq mm5,mm1
- punpcklwd mm4,mm3 ; mm3=in6=z3
- punpckhwd mm5,mm3
- movq mm1,mm4
- movq mm3,mm5
- pmaddwd mm4,[GOTOFF(ebx,PW_F130_F054)] ; mm4=tmp3L
- pmaddwd mm5,[GOTOFF(ebx,PW_F130_F054)] ; mm5=tmp3H
- pmaddwd mm1,[GOTOFF(ebx,PW_F054_MF130)] ; mm1=tmp2L
- pmaddwd mm3,[GOTOFF(ebx,PW_F054_MF130)] ; mm3=tmp2H
-
- movq mm6,mm0
- paddw mm0,mm2 ; mm0=in0+in4
- psubw mm6,mm2 ; mm6=in0-in4
-
- pxor mm7,mm7
- pxor mm2,mm2
- punpcklwd mm7,mm0 ; mm7=tmp0L
- punpckhwd mm2,mm0 ; mm2=tmp0H
- psrad mm7,(16-CONST_BITS) ; psrad mm7,16 & pslld mm7,CONST_BITS
- psrad mm2,(16-CONST_BITS) ; psrad mm2,16 & pslld mm2,CONST_BITS
-
- movq mm0,mm7
- paddd mm7,mm4 ; mm7=tmp10L
- psubd mm0,mm4 ; mm0=tmp13L
- movq mm4,mm2
- paddd mm2,mm5 ; mm2=tmp10H
- psubd mm4,mm5 ; mm4=tmp13H
-
- movq MMWORD [wk(0)], mm7 ; wk(0)=tmp10L
- movq MMWORD [wk(1)], mm2 ; wk(1)=tmp10H
- movq MMWORD [wk(2)], mm0 ; wk(2)=tmp13L
- movq MMWORD [wk(3)], mm4 ; wk(3)=tmp13H
-
- pxor mm5,mm5
- pxor mm7,mm7
- punpcklwd mm5,mm6 ; mm5=tmp1L
- punpckhwd mm7,mm6 ; mm7=tmp1H
- psrad mm5,(16-CONST_BITS) ; psrad mm5,16 & pslld mm5,CONST_BITS
- psrad mm7,(16-CONST_BITS) ; psrad mm7,16 & pslld mm7,CONST_BITS
-
- movq mm2,mm5
- paddd mm5,mm1 ; mm5=tmp11L
- psubd mm2,mm1 ; mm2=tmp12L
- movq mm0,mm7
- paddd mm7,mm3 ; mm7=tmp11H
- psubd mm0,mm3 ; mm0=tmp12H
-
- movq MMWORD [wk(4)], mm5 ; wk(4)=tmp11L
- movq MMWORD [wk(5)], mm7 ; wk(5)=tmp11H
- movq MMWORD [wk(6)], mm2 ; wk(6)=tmp12L
- movq MMWORD [wk(7)], mm0 ; wk(7)=tmp12H
-
- ; -- Odd part
-
- movq mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
- movq mm6, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
- movq mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
- movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-
- movq mm5,mm6
- movq mm7,mm4
- paddw mm5,mm3 ; mm5=z3
- paddw mm7,mm1 ; mm7=z4
-
- ; (Original)
- ; z5 = (z3 + z4) * 1.175875602;
- ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
- ; z3 += z5; z4 += z5;
- ;
- ; (This implementation)
- ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
- ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
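A subtlety in the comment above: "z3" on the right-hand side of the second line still denotes the original z3, not the updated value; the two outputs are computed independently, which is what the PW_MF078_F117 and PW_F117_F078 constant pairs encode. Spelled out in C (FIX() as in the sketch above):

    /* z3/z4 pair in independent dot-product form; both lines read the
       original inputs, matching the two pmaddwd constant pairs. */
    static void odd_z_pair(int32_t z3, int32_t z4,
                           int32_t *z3_out, int32_t *z4_out)
    {
      *z3_out = z3 * (FIX(1.175875602) - FIX(1.961570560)) +
                z4 * FIX(1.175875602);
      *z4_out = z3 * FIX(1.175875602) +
                z4 * (FIX(1.175875602) - FIX(0.390180644));
    }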
-
- movq mm2,mm5
- movq mm0,mm5
- punpcklwd mm2,mm7
- punpckhwd mm0,mm7
- movq mm5,mm2
- movq mm7,mm0
- pmaddwd mm2,[GOTOFF(ebx,PW_MF078_F117)] ; mm2=z3L
- pmaddwd mm0,[GOTOFF(ebx,PW_MF078_F117)] ; mm0=z3H
- pmaddwd mm5,[GOTOFF(ebx,PW_F117_F078)] ; mm5=z4L
- pmaddwd mm7,[GOTOFF(ebx,PW_F117_F078)] ; mm7=z4H
-
- movq MMWORD [wk(10)], mm2 ; wk(10)=z3L
- movq MMWORD [wk(11)], mm0 ; wk(11)=z3H
-
- ; (Original)
- ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
- ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
- ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
- ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
- ; tmp0 += z1 + z3; tmp1 += z2 + z4;
- ; tmp2 += z2 + z3; tmp3 += z1 + z4;
- ;
- ; (This implementation)
- ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
- ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
- ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
- ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
- ; tmp0 += z3; tmp1 += z4;
- ; tmp2 += z3; tmp3 += z4;
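The same distribution applies to the four odd-part terms: substituting z1 = tmp0 + tmp3 and z2 = tmp1 + tmp2 into the original expressions leaves four independent two-term dot products plus the shared z3/z4 corrections. In C (FIX() as above):

    static void odd_tmps(int32_t t0, int32_t t1, int32_t t2, int32_t t3,
                         int32_t z3, int32_t z4,
                         int32_t *o0, int32_t *o1, int32_t *o2, int32_t *o3)
    {
      *o0 =  t0 * (FIX(0.298631336) - FIX(0.899976223)) - t3 * FIX(0.899976223) + z3;
      *o1 =  t1 * (FIX(2.053119869) - FIX(2.562915447)) - t2 * FIX(2.562915447) + z4;
      *o2 = -t1 * FIX(2.562915447) + t2 * (FIX(3.072711026) - FIX(2.562915447)) + z3;
      *o3 = -t0 * FIX(0.899976223) + t3 * (FIX(1.501321110) - FIX(0.899976223)) + z4;
    }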
-
- movq mm2,mm3
- movq mm0,mm3
- punpcklwd mm2,mm4
- punpckhwd mm0,mm4
- movq mm3,mm2
- movq mm4,mm0
- pmaddwd mm2,[GOTOFF(ebx,PW_MF060_MF089)] ; mm2=tmp0L
- pmaddwd mm0,[GOTOFF(ebx,PW_MF060_MF089)] ; mm0=tmp0H
- pmaddwd mm3,[GOTOFF(ebx,PW_MF089_F060)] ; mm3=tmp3L
- pmaddwd mm4,[GOTOFF(ebx,PW_MF089_F060)] ; mm4=tmp3H
-
- paddd mm2, MMWORD [wk(10)] ; mm2=tmp0L
- paddd mm0, MMWORD [wk(11)] ; mm0=tmp0H
- paddd mm3,mm5 ; mm3=tmp3L
- paddd mm4,mm7 ; mm4=tmp3H
-
- movq MMWORD [wk(8)], mm2 ; wk(8)=tmp0L
- movq MMWORD [wk(9)], mm0 ; wk(9)=tmp0H
-
- movq mm2,mm1
- movq mm0,mm1
- punpcklwd mm2,mm6
- punpckhwd mm0,mm6
- movq mm1,mm2
- movq mm6,mm0
- pmaddwd mm2,[GOTOFF(ebx,PW_MF050_MF256)] ; mm2=tmp1L
- pmaddwd mm0,[GOTOFF(ebx,PW_MF050_MF256)] ; mm0=tmp1H
- pmaddwd mm1,[GOTOFF(ebx,PW_MF256_F050)] ; mm1=tmp2L
- pmaddwd mm6,[GOTOFF(ebx,PW_MF256_F050)] ; mm6=tmp2H
-
- paddd mm2,mm5 ; mm2=tmp1L
- paddd mm0,mm7 ; mm0=tmp1H
- paddd mm1, MMWORD [wk(10)] ; mm1=tmp2L
- paddd mm6, MMWORD [wk(11)] ; mm6=tmp2H
-
- movq MMWORD [wk(10)], mm2 ; wk(10)=tmp1L
- movq MMWORD [wk(11)], mm0 ; wk(11)=tmp1H
-
- ; -- Final output stage
-
- movq mm5, MMWORD [wk(0)] ; mm5=tmp10L
- movq mm7, MMWORD [wk(1)] ; mm7=tmp10H
-
- movq mm2,mm5
- movq mm0,mm7
- paddd mm5,mm3 ; mm5=data0L
- paddd mm7,mm4 ; mm7=data0H
- psubd mm2,mm3 ; mm2=data7L
- psubd mm0,mm4 ; mm0=data7H
-
- movq mm3,[GOTOFF(ebx,PD_DESCALE_P2)] ; mm3=[PD_DESCALE_P2]
-
- paddd mm5,mm3
- paddd mm7,mm3
- psrad mm5,DESCALE_P2
- psrad mm7,DESCALE_P2
- paddd mm2,mm3
- paddd mm0,mm3
- psrad mm2,DESCALE_P2
- psrad mm0,DESCALE_P2
-
- packssdw mm5,mm7 ; mm5=data0=(00 10 20 30)
- packssdw mm2,mm0 ; mm2=data7=(07 17 27 37)
-
- movq mm4, MMWORD [wk(4)] ; mm4=tmp11L
- movq mm3, MMWORD [wk(5)] ; mm3=tmp11H
-
- movq mm7,mm4
- movq mm0,mm3
- paddd mm4,mm1 ; mm4=data1L
- paddd mm3,mm6 ; mm3=data1H
- psubd mm7,mm1 ; mm7=data6L
- psubd mm0,mm6 ; mm0=data6H
-
- movq mm1,[GOTOFF(ebx,PD_DESCALE_P2)] ; mm1=[PD_DESCALE_P2]
-
- paddd mm4,mm1
- paddd mm3,mm1
- psrad mm4,DESCALE_P2
- psrad mm3,DESCALE_P2
- paddd mm7,mm1
- paddd mm0,mm1
- psrad mm7,DESCALE_P2
- psrad mm0,DESCALE_P2
-
- packssdw mm4,mm3 ; mm4=data1=(01 11 21 31)
- packssdw mm7,mm0 ; mm7=data6=(06 16 26 36)
-
- packsswb mm5,mm7 ; mm5=(00 10 20 30 06 16 26 36)
- packsswb mm4,mm2 ; mm4=(01 11 21 31 07 17 27 37)
-
- movq mm6, MMWORD [wk(6)] ; mm6=tmp12L
- movq mm1, MMWORD [wk(7)] ; mm1=tmp12H
- movq mm3, MMWORD [wk(10)] ; mm3=tmp1L
- movq mm0, MMWORD [wk(11)] ; mm0=tmp1H
-
- movq MMWORD [wk(0)], mm5 ; wk(0)=(00 10 20 30 06 16 26 36)
- movq MMWORD [wk(1)], mm4 ; wk(1)=(01 11 21 31 07 17 27 37)
-
- movq mm7,mm6
- movq mm2,mm1
- paddd mm6,mm3 ; mm6=data2L
- paddd mm1,mm0 ; mm1=data2H
- psubd mm7,mm3 ; mm7=data5L
- psubd mm2,mm0 ; mm2=data5H
-
- movq mm5,[GOTOFF(ebx,PD_DESCALE_P2)] ; mm5=[PD_DESCALE_P2]
-
- paddd mm6,mm5
- paddd mm1,mm5
- psrad mm6,DESCALE_P2
- psrad mm1,DESCALE_P2
- paddd mm7,mm5
- paddd mm2,mm5
- psrad mm7,DESCALE_P2
- psrad mm2,DESCALE_P2
-
- packssdw mm6,mm1 ; mm6=data2=(02 12 22 32)
- packssdw mm7,mm2 ; mm7=data5=(05 15 25 35)
-
- movq mm4, MMWORD [wk(2)] ; mm4=tmp13L
- movq mm3, MMWORD [wk(3)] ; mm3=tmp13H
- movq mm0, MMWORD [wk(8)] ; mm0=tmp0L
- movq mm5, MMWORD [wk(9)] ; mm5=tmp0H
-
- movq mm1,mm4
- movq mm2,mm3
- paddd mm4,mm0 ; mm4=data3L
- paddd mm3,mm5 ; mm3=data3H
- psubd mm1,mm0 ; mm1=data4L
- psubd mm2,mm5 ; mm2=data4H
-
- movq mm0,[GOTOFF(ebx,PD_DESCALE_P2)] ; mm0=[PD_DESCALE_P2]
-
- paddd mm4,mm0
- paddd mm3,mm0
- psrad mm4,DESCALE_P2
- psrad mm3,DESCALE_P2
- paddd mm1,mm0
- paddd mm2,mm0
- psrad mm1,DESCALE_P2
- psrad mm2,DESCALE_P2
-
- movq mm5,[GOTOFF(ebx,PB_CENTERJSAMP)] ; mm5=[PB_CENTERJSAMP]
-
- packssdw mm4,mm3 ; mm4=data3=(03 13 23 33)
- packssdw mm1,mm2 ; mm1=data4=(04 14 24 34)
-
- movq mm0, MMWORD [wk(0)] ; mm0=(00 10 20 30 06 16 26 36)
- movq mm3, MMWORD [wk(1)] ; mm3=(01 11 21 31 07 17 27 37)
-
- packsswb mm6,mm1 ; mm6=(02 12 22 32 04 14 24 34)
- packsswb mm4,mm7 ; mm4=(03 13 23 33 05 15 25 35)
-
- paddb mm0,mm5
- paddb mm3,mm5
- paddb mm6,mm5
- paddb mm4,mm5
-
- movq mm2,mm0 ; transpose coefficients(phase 1)
- punpcklbw mm0,mm3 ; mm0=(00 01 10 11 20 21 30 31)
- punpckhbw mm2,mm3 ; mm2=(06 07 16 17 26 27 36 37)
- movq mm1,mm6 ; transpose coefficients(phase 1)
- punpcklbw mm6,mm4 ; mm6=(02 03 12 13 22 23 32 33)
- punpckhbw mm1,mm4 ; mm1=(04 05 14 15 24 25 34 35)
-
- movq mm7,mm0 ; transpose coefficients(phase 2)
- punpcklwd mm0,mm6 ; mm0=(00 01 02 03 10 11 12 13)
- punpckhwd mm7,mm6 ; mm7=(20 21 22 23 30 31 32 33)
- movq mm5,mm1 ; transpose coefficients(phase 2)
- punpcklwd mm1,mm2 ; mm1=(04 05 06 07 14 15 16 17)
- punpckhwd mm5,mm2 ; mm5=(24 25 26 27 34 35 36 37)
-
- movq mm3,mm0 ; transpose coefficients(phase 3)
- punpckldq mm0,mm1 ; mm0=(00 01 02 03 04 05 06 07)
- punpckhdq mm3,mm1 ; mm3=(10 11 12 13 14 15 16 17)
- movq mm4,mm7 ; transpose coefficients(phase 3)
- punpckldq mm7,mm5 ; mm7=(20 21 22 23 24 25 26 27)
- punpckhdq mm4,mm5 ; mm4=(30 31 32 33 34 35 36 37)
-
- pushpic ebx ; save GOT address
-
- mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
- mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
- movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm0
- movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm3
- mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
- mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
- movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm7
- movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm4
-
- poppic ebx ; restore GOT address
-
- add esi, byte 4*SIZEOF_JCOEF ; wsptr
- add edi, byte 4*SIZEOF_JSAMPROW
- dec ecx ; ctr
- jnz near .rowloop
-
- emms ; empty MMX state
-
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- pop ebx
- mov esp,ebp ; esp <- aligned ebp
- pop esp ; esp <- original ebp
- pop ebp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
diff --git a/media/libjpeg/simd/jidctint-sse2-64.asm b/media/libjpeg/simd/jidctint-sse2-64.asm
deleted file mode 100644
index afe1d6a73b..0000000000
--- a/media/libjpeg/simd/jidctint-sse2-64.asm
+++ /dev/null
@@ -1,847 +0,0 @@
-;
-; jidctint.asm - accurate integer IDCT (64-bit SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler) and
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a slow-but-accurate integer implementation of the
-; inverse DCT (Discrete Cosine Transform). The following code is based
-; directly on the IJG's original jidctint.c; see jidctint.c for
-; more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS 13
-%define PASS1_BITS 2
-
-%define DESCALE_P1 (CONST_BITS-PASS1_BITS)
-%define DESCALE_P2 (CONST_BITS+PASS1_BITS+3)
-
-%if CONST_BITS == 13
-F_0_298 equ 2446 ; FIX(0.298631336)
-F_0_390 equ 3196 ; FIX(0.390180644)
-F_0_541 equ 4433 ; FIX(0.541196100)
-F_0_765 equ 6270 ; FIX(0.765366865)
-F_0_899 equ 7373 ; FIX(0.899976223)
-F_1_175 equ 9633 ; FIX(1.175875602)
-F_1_501 equ 12299 ; FIX(1.501321110)
-F_1_847 equ 15137 ; FIX(1.847759065)
-F_1_961 equ 16069 ; FIX(1.961570560)
-F_2_053 equ 16819 ; FIX(2.053119869)
-F_2_562 equ 20995 ; FIX(2.562915447)
-F_3_072 equ 25172 ; FIX(3.072711026)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
-F_0_298 equ DESCALE( 320652955,30-CONST_BITS) ; FIX(0.298631336)
-F_0_390 equ DESCALE( 418953276,30-CONST_BITS) ; FIX(0.390180644)
-F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100)
-F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865)
-F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223)
-F_1_175 equ DESCALE(1262586813,30-CONST_BITS) ; FIX(1.175875602)
-F_1_501 equ DESCALE(1612031267,30-CONST_BITS) ; FIX(1.501321110)
-F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065)
-F_1_961 equ DESCALE(2106220350,30-CONST_BITS) ; FIX(1.961570560)
-F_2_053 equ DESCALE(2204520673,30-CONST_BITS) ; FIX(2.053119869)
-F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447)
-F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026)
-%endif
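The CONST_BITS == 13 table above is FIX(x) = round(x * 2^13) precomputed; a small self-contained check (illustrative only, not part of the patch):

    #include <assert.h>
    #include <stdint.h>

    #define CONST_BITS 13
    #define FIX(x) ((int32_t)((x) * (1 << CONST_BITS) + 0.5))

    int main(void)
    {
      assert(FIX(0.298631336) ==  2446);   /* F_0_298 */
      assert(FIX(0.541196100) ==  4433);   /* F_0_541 */
      assert(FIX(1.847759065) == 15137);   /* F_1_847 */
      assert(FIX(3.072711026) == 25172);   /* F_3_072 */
      return 0;
    }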
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
- alignz 16
- global EXTN(jconst_idct_islow_sse2)
-
-EXTN(jconst_idct_islow_sse2):
-
-PW_F130_F054 times 4 dw (F_0_541+F_0_765), F_0_541
-PW_F054_MF130 times 4 dw F_0_541, (F_0_541-F_1_847)
-PW_MF078_F117 times 4 dw (F_1_175-F_1_961), F_1_175
-PW_F117_F078 times 4 dw F_1_175, (F_1_175-F_0_390)
-PW_MF060_MF089 times 4 dw (F_0_298-F_0_899),-F_0_899
-PW_MF089_F060 times 4 dw -F_0_899, (F_1_501-F_0_899)
-PW_MF050_MF256 times 4 dw (F_2_053-F_2_562),-F_2_562
-PW_MF256_F050 times 4 dw -F_2_562, (F_3_072-F_2_562)
-PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1-1)
-PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2-1)
-PB_CENTERJSAMP times 16 db CENTERJSAMPLE
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 64
-;
-; Perform dequantization and inverse DCT on one block of coefficients.
-;
-; GLOBAL(void)
-; jsimd_idct_islow_sse2 (void *dct_table, JCOEFPTR coef_block,
-; JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-; r10 = jpeg_component_info *compptr
-; r11 = JCOEFPTR coef_block
-; r12 = JSAMPARRAY output_buf
-; r13 = JDIMENSION output_col
-
-%define original_rbp rbp+0
-%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 12
-
- align 16
- global EXTN(jsimd_idct_islow_sse2)
-
-EXTN(jsimd_idct_islow_sse2):
- push rbp
- mov rax,rsp ; rax = original rbp
- sub rsp, byte 4
- and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [rsp],rax
- mov rbp,rsp ; rbp = aligned rbp
- lea rsp, [wk(0)]
- collect_args
-
- ; ---- Pass 1: process columns from input.
-
- mov rdx, r10 ; quantptr
- mov rsi, r11 ; inptr
-
-%ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2
- mov eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
- or eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
- jnz near .columnDCT
-
- movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
- movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
- por xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
- por xmm1, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
- por xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
- por xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
- por xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
- por xmm1,xmm0
- packsswb xmm1,xmm1
- packsswb xmm1,xmm1
- movd eax,xmm1
- test rax,rax
- jnz short .columnDCT
-
- ; -- AC terms all zero
-
- movdqa xmm5, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
- pmullw xmm5, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-
- psllw xmm5,PASS1_BITS
-
- movdqa xmm4,xmm5 ; xmm5=in0=(00 01 02 03 04 05 06 07)
- punpcklwd xmm5,xmm5 ; xmm5=(00 00 01 01 02 02 03 03)
- punpckhwd xmm4,xmm4 ; xmm4=(04 04 05 05 06 06 07 07)
-
- pshufd xmm7,xmm5,0x00 ; xmm7=col0=(00 00 00 00 00 00 00 00)
- pshufd xmm6,xmm5,0x55 ; xmm6=col1=(01 01 01 01 01 01 01 01)
- pshufd xmm1,xmm5,0xAA ; xmm1=col2=(02 02 02 02 02 02 02 02)
- pshufd xmm5,xmm5,0xFF ; xmm5=col3=(03 03 03 03 03 03 03 03)
- pshufd xmm0,xmm4,0x00 ; xmm0=col4=(04 04 04 04 04 04 04 04)
- pshufd xmm3,xmm4,0x55 ; xmm3=col5=(05 05 05 05 05 05 05 05)
- pshufd xmm2,xmm4,0xAA ; xmm2=col6=(06 06 06 06 06 06 06 06)
- pshufd xmm4,xmm4,0xFF ; xmm4=col7=(07 07 07 07 07 07 07 07)
-
- movdqa XMMWORD [wk(8)], xmm6 ; wk(8)=col1
- movdqa XMMWORD [wk(9)], xmm5 ; wk(9)=col3
- movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5
- movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7
- jmp near .column_end
-%endif
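The guarded block above is a fast path for sparse blocks: rows 1-7 are OR-ed together and collapsed by two packsswb steps so a single test catches any nonzero AC coefficient, and if none is found, each pass-1 output column is a constant, so the column IDCT reduces to dequantize, shift, and broadcast. Conceptually, in C (a sketch; the asm keeps the broadcast columns in registers rather than in a workspace):

    #include <stdint.h>

    #define PASS1_BITS 2

    /* DC-only shortcut: with all AC terms zero, column c of the pass-1
       result is (coef[c] * quant[c]) << PASS1_BITS replicated 8 times,
       which is what the pmullw/psllw/pshufd sequence produces. */
    static void dc_only_pass1(const int16_t coef_row0[8],
                              const int16_t quant_row0[8],
                              int16_t workspace[8][8])
    {
      for (int c = 0; c < 8; c++) {
        int16_t v = (int16_t)((coef_row0[c] * quant_row0[c]) << PASS1_BITS);
        for (int r = 0; r < 8; r++)
          workspace[r][c] = v;
      }
    }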
-.columnDCT:
-
- ; -- Even part
-
- movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
- movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
- pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
- movdqa xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
- movdqa xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
- pmullw xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-
- ; (Original)
- ; z1 = (z2 + z3) * 0.541196100;
- ; tmp2 = z1 + z3 * -1.847759065;
- ; tmp3 = z1 + z2 * 0.765366865;
- ;
- ; (This implementation)
- ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
- ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
-
- movdqa xmm4,xmm1 ; xmm1=in2=z2
- movdqa xmm5,xmm1
- punpcklwd xmm4,xmm3 ; xmm3=in6=z3
- punpckhwd xmm5,xmm3
- movdqa xmm1,xmm4
- movdqa xmm3,xmm5
- pmaddwd xmm4,[rel PW_F130_F054] ; xmm4=tmp3L
- pmaddwd xmm5,[rel PW_F130_F054] ; xmm5=tmp3H
- pmaddwd xmm1,[rel PW_F054_MF130] ; xmm1=tmp2L
- pmaddwd xmm3,[rel PW_F054_MF130] ; xmm3=tmp2H
-
- movdqa xmm6,xmm0
- paddw xmm0,xmm2 ; xmm0=in0+in4
- psubw xmm6,xmm2 ; xmm6=in0-in4
-
- pxor xmm7,xmm7
- pxor xmm2,xmm2
- punpcklwd xmm7,xmm0 ; xmm7=tmp0L
- punpckhwd xmm2,xmm0 ; xmm2=tmp0H
- psrad xmm7,(16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS
- psrad xmm2,(16-CONST_BITS) ; psrad xmm2,16 & pslld xmm2,CONST_BITS
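The pxor/punpcklwd/psrad idiom above widens and scales in one step: interleaving a zero register below the 16-bit values leaves each word in the high half of a 32-bit lane, and an arithmetic shift by 16 - CONST_BITS yields the sign-extended value times 2^CONST_BITS, as the inline comments note. Per lane, in C (assuming arithmetic right shift of signed values, as psrad performs):

    #include <stdint.h>

    #define CONST_BITS 13

    /* Sign-extend a 16-bit value to 32 bits and scale by 2^CONST_BITS. */
    static int32_t widen_and_scale(int16_t v)
    {
      int32_t lane = (int32_t)((uint32_t)(uint16_t)v << 16); /* punpck*wd with 0 */
      return lane >> (16 - CONST_BITS);                      /* psrad */
    }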
-
- movdqa xmm0,xmm7
- paddd xmm7,xmm4 ; xmm7=tmp10L
- psubd xmm0,xmm4 ; xmm0=tmp13L
- movdqa xmm4,xmm2
- paddd xmm2,xmm5 ; xmm2=tmp10H
- psubd xmm4,xmm5 ; xmm4=tmp13H
-
- movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=tmp10L
- movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=tmp10H
- movdqa XMMWORD [wk(2)], xmm0 ; wk(2)=tmp13L
- movdqa XMMWORD [wk(3)], xmm4 ; wk(3)=tmp13H
-
- pxor xmm5,xmm5
- pxor xmm7,xmm7
- punpcklwd xmm5,xmm6 ; xmm5=tmp1L
- punpckhwd xmm7,xmm6 ; xmm7=tmp1H
- psrad xmm5,(16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS
- psrad xmm7,(16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS
-
- movdqa xmm2,xmm5
- paddd xmm5,xmm1 ; xmm5=tmp11L
- psubd xmm2,xmm1 ; xmm2=tmp12L
- movdqa xmm0,xmm7
- paddd xmm7,xmm3 ; xmm7=tmp11H
- psubd xmm0,xmm3 ; xmm0=tmp12H
-
- movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L
- movdqa XMMWORD [wk(5)], xmm7 ; wk(5)=tmp11H
- movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=tmp12L
- movdqa XMMWORD [wk(7)], xmm0 ; wk(7)=tmp12H
-
- ; -- Odd part
-
- movdqa xmm4, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
- movdqa xmm6, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
- pmullw xmm4, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw xmm6, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
- movdqa xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
- movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
- pmullw xmm1, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-
- movdqa xmm5,xmm6
- movdqa xmm7,xmm4
- paddw xmm5,xmm3 ; xmm5=z3
- paddw xmm7,xmm1 ; xmm7=z4
-
- ; (Original)
- ; z5 = (z3 + z4) * 1.175875602;
- ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
- ; z3 += z5; z4 += z5;
- ;
- ; (This implementation)
- ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
- ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
- movdqa xmm2,xmm5
- movdqa xmm0,xmm5
- punpcklwd xmm2,xmm7
- punpckhwd xmm0,xmm7
- movdqa xmm5,xmm2
- movdqa xmm7,xmm0
- pmaddwd xmm2,[rel PW_MF078_F117] ; xmm2=z3L
- pmaddwd xmm0,[rel PW_MF078_F117] ; xmm0=z3H
- pmaddwd xmm5,[rel PW_F117_F078] ; xmm5=z4L
- pmaddwd xmm7,[rel PW_F117_F078] ; xmm7=z4H
-
- movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=z3L
- movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=z3H
-
- ; (Original)
- ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
- ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
- ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
- ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
- ; tmp0 += z1 + z3; tmp1 += z2 + z4;
- ; tmp2 += z2 + z3; tmp3 += z1 + z4;
- ;
- ; (This implementation)
- ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
- ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
- ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
- ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
- ; tmp0 += z3; tmp1 += z4;
- ; tmp2 += z3; tmp3 += z4;
-
- movdqa xmm2,xmm3
- movdqa xmm0,xmm3
- punpcklwd xmm2,xmm4
- punpckhwd xmm0,xmm4
- movdqa xmm3,xmm2
- movdqa xmm4,xmm0
- pmaddwd xmm2,[rel PW_MF060_MF089] ; xmm2=tmp0L
- pmaddwd xmm0,[rel PW_MF060_MF089] ; xmm0=tmp0H
- pmaddwd xmm3,[rel PW_MF089_F060] ; xmm3=tmp3L
- pmaddwd xmm4,[rel PW_MF089_F060] ; xmm4=tmp3H
-
- paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp0L
- paddd xmm0, XMMWORD [wk(11)] ; xmm0=tmp0H
- paddd xmm3,xmm5 ; xmm3=tmp3L
- paddd xmm4,xmm7 ; xmm4=tmp3H
-
- movdqa XMMWORD [wk(8)], xmm2 ; wk(8)=tmp0L
- movdqa XMMWORD [wk(9)], xmm0 ; wk(9)=tmp0H
-
- movdqa xmm2,xmm1
- movdqa xmm0,xmm1
- punpcklwd xmm2,xmm6
- punpckhwd xmm0,xmm6
- movdqa xmm1,xmm2
- movdqa xmm6,xmm0
- pmaddwd xmm2,[rel PW_MF050_MF256] ; xmm2=tmp1L
- pmaddwd xmm0,[rel PW_MF050_MF256] ; xmm0=tmp1H
- pmaddwd xmm1,[rel PW_MF256_F050] ; xmm1=tmp2L
- pmaddwd xmm6,[rel PW_MF256_F050] ; xmm6=tmp2H
-
- paddd xmm2,xmm5 ; xmm2=tmp1L
- paddd xmm0,xmm7 ; xmm0=tmp1H
- paddd xmm1, XMMWORD [wk(10)] ; xmm1=tmp2L
- paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H
-
- movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=tmp1L
- movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=tmp1H
-
- ; -- Final output stage
-
- movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L
- movdqa xmm7, XMMWORD [wk(1)] ; xmm7=tmp10H
-
- movdqa xmm2,xmm5
- movdqa xmm0,xmm7
- paddd xmm5,xmm3 ; xmm5=data0L
- paddd xmm7,xmm4 ; xmm7=data0H
- psubd xmm2,xmm3 ; xmm2=data7L
- psubd xmm0,xmm4 ; xmm0=data7H
-
- movdqa xmm3,[rel PD_DESCALE_P1] ; xmm3=[rel PD_DESCALE_P1]
-
- paddd xmm5,xmm3
- paddd xmm7,xmm3
- psrad xmm5,DESCALE_P1
- psrad xmm7,DESCALE_P1
- paddd xmm2,xmm3
- paddd xmm0,xmm3
- psrad xmm2,DESCALE_P1
- psrad xmm0,DESCALE_P1
-
- packssdw xmm5,xmm7 ; xmm5=data0=(00 01 02 03 04 05 06 07)
- packssdw xmm2,xmm0 ; xmm2=data7=(70 71 72 73 74 75 76 77)
-
- movdqa xmm4, XMMWORD [wk(4)] ; xmm4=tmp11L
- movdqa xmm3, XMMWORD [wk(5)] ; xmm3=tmp11H
-
- movdqa xmm7,xmm4
- movdqa xmm0,xmm3
- paddd xmm4,xmm1 ; xmm4=data1L
- paddd xmm3,xmm6 ; xmm3=data1H
- psubd xmm7,xmm1 ; xmm7=data6L
- psubd xmm0,xmm6 ; xmm0=data6H
-
- movdqa xmm1,[rel PD_DESCALE_P1] ; xmm1=[rel PD_DESCALE_P1]
-
- paddd xmm4,xmm1
- paddd xmm3,xmm1
- psrad xmm4,DESCALE_P1
- psrad xmm3,DESCALE_P1
- paddd xmm7,xmm1
- paddd xmm0,xmm1
- psrad xmm7,DESCALE_P1
- psrad xmm0,DESCALE_P1
-
- packssdw xmm4,xmm3 ; xmm4=data1=(10 11 12 13 14 15 16 17)
- packssdw xmm7,xmm0 ; xmm7=data6=(60 61 62 63 64 65 66 67)
-
- movdqa xmm6,xmm5 ; transpose coefficients(phase 1)
- punpcklwd xmm5,xmm4 ; xmm5=(00 10 01 11 02 12 03 13)
- punpckhwd xmm6,xmm4 ; xmm6=(04 14 05 15 06 16 07 17)
- movdqa xmm1,xmm7 ; transpose coefficients(phase 1)
- punpcklwd xmm7,xmm2 ; xmm7=(60 70 61 71 62 72 63 73)
- punpckhwd xmm1,xmm2 ; xmm1=(64 74 65 75 66 76 67 77)
-
- movdqa xmm3, XMMWORD [wk(6)] ; xmm3=tmp12L
- movdqa xmm0, XMMWORD [wk(7)] ; xmm0=tmp12H
- movdqa xmm4, XMMWORD [wk(10)] ; xmm4=tmp1L
- movdqa xmm2, XMMWORD [wk(11)] ; xmm2=tmp1H
-
- movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 01 11 02 12 03 13)
- movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=(04 14 05 15 06 16 07 17)
- movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=(60 70 61 71 62 72 63 73)
- movdqa XMMWORD [wk(5)], xmm1 ; wk(5)=(64 74 65 75 66 76 67 77)
-
- movdqa xmm5,xmm3
- movdqa xmm6,xmm0
- paddd xmm3,xmm4 ; xmm3=data2L
- paddd xmm0,xmm2 ; xmm0=data2H
- psubd xmm5,xmm4 ; xmm5=data5L
- psubd xmm6,xmm2 ; xmm6=data5H
-
- movdqa xmm7,[rel PD_DESCALE_P1] ; xmm7=[rel PD_DESCALE_P1]
-
- paddd xmm3,xmm7
- paddd xmm0,xmm7
- psrad xmm3,DESCALE_P1
- psrad xmm0,DESCALE_P1
- paddd xmm5,xmm7
- paddd xmm6,xmm7
- psrad xmm5,DESCALE_P1
- psrad xmm6,DESCALE_P1
-
- packssdw xmm3,xmm0 ; xmm3=data2=(20 21 22 23 24 25 26 27)
- packssdw xmm5,xmm6 ; xmm5=data5=(50 51 52 53 54 55 56 57)
-
- movdqa xmm1, XMMWORD [wk(2)] ; xmm1=tmp13L
- movdqa xmm4, XMMWORD [wk(3)] ; xmm4=tmp13H
- movdqa xmm2, XMMWORD [wk(8)] ; xmm2=tmp0L
- movdqa xmm7, XMMWORD [wk(9)] ; xmm7=tmp0H
-
- movdqa xmm0,xmm1
- movdqa xmm6,xmm4
- paddd xmm1,xmm2 ; xmm1=data3L
- paddd xmm4,xmm7 ; xmm4=data3H
- psubd xmm0,xmm2 ; xmm0=data4L
- psubd xmm6,xmm7 ; xmm6=data4H
-
- movdqa xmm2,[rel PD_DESCALE_P1] ; xmm2=[rel PD_DESCALE_P1]
-
- paddd xmm1,xmm2
- paddd xmm4,xmm2
- psrad xmm1,DESCALE_P1
- psrad xmm4,DESCALE_P1
- paddd xmm0,xmm2
- paddd xmm6,xmm2
- psrad xmm0,DESCALE_P1
- psrad xmm6,DESCALE_P1
-
- packssdw xmm1,xmm4 ; xmm1=data3=(30 31 32 33 34 35 36 37)
- packssdw xmm0,xmm6 ; xmm0=data4=(40 41 42 43 44 45 46 47)
-
- movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 01 11 02 12 03 13)
- movdqa xmm2, XMMWORD [wk(1)] ; xmm2=(04 14 05 15 06 16 07 17)
-
- movdqa xmm4,xmm3 ; transpose coefficients(phase 1)
- punpcklwd xmm3,xmm1 ; xmm3=(20 30 21 31 22 32 23 33)
- punpckhwd xmm4,xmm1 ; xmm4=(24 34 25 35 26 36 27 37)
- movdqa xmm6,xmm0 ; transpose coefficients(phase 1)
- punpcklwd xmm0,xmm5 ; xmm0=(40 50 41 51 42 52 43 53)
- punpckhwd xmm6,xmm5 ; xmm6=(44 54 45 55 46 56 47 57)
-
- movdqa xmm1,xmm7 ; transpose coefficients(phase 2)
- punpckldq xmm7,xmm3 ; xmm7=(00 10 20 30 01 11 21 31)
- punpckhdq xmm1,xmm3 ; xmm1=(02 12 22 32 03 13 23 33)
- movdqa xmm5,xmm2 ; transpose coefficients(phase 2)
- punpckldq xmm2,xmm4 ; xmm2=(04 14 24 34 05 15 25 35)
- punpckhdq xmm5,xmm4 ; xmm5=(06 16 26 36 07 17 27 37)
-
- movdqa xmm3, XMMWORD [wk(4)] ; xmm3=(60 70 61 71 62 72 63 73)
- movdqa xmm4, XMMWORD [wk(5)] ; xmm4=(64 74 65 75 66 76 67 77)
-
- movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=(04 14 24 34 05 15 25 35)
- movdqa XMMWORD [wk(7)], xmm5 ; wk(7)=(06 16 26 36 07 17 27 37)
-
- movdqa xmm2,xmm0 ; transpose coefficients(phase 2)
- punpckldq xmm0,xmm3 ; xmm0=(40 50 60 70 41 51 61 71)
- punpckhdq xmm2,xmm3 ; xmm2=(42 52 62 72 43 53 63 73)
- movdqa xmm5,xmm6 ; transpose coefficients(phase 2)
- punpckldq xmm6,xmm4 ; xmm6=(44 54 64 74 45 55 65 75)
- punpckhdq xmm5,xmm4 ; xmm5=(46 56 66 76 47 57 67 77)
-
- movdqa xmm3,xmm7 ; transpose coefficients(phase 3)
- punpcklqdq xmm7,xmm0 ; xmm7=col0=(00 10 20 30 40 50 60 70)
- punpckhqdq xmm3,xmm0 ; xmm3=col1=(01 11 21 31 41 51 61 71)
- movdqa xmm4,xmm1 ; transpose coefficients(phase 3)
- punpcklqdq xmm1,xmm2 ; xmm1=col2=(02 12 22 32 42 52 62 72)
- punpckhqdq xmm4,xmm2 ; xmm4=col3=(03 13 23 33 43 53 63 73)
-
- movdqa xmm0, XMMWORD [wk(6)] ; xmm0=(04 14 24 34 05 15 25 35)
- movdqa xmm2, XMMWORD [wk(7)] ; xmm2=(06 16 26 36 07 17 27 37)
-
- movdqa XMMWORD [wk(8)], xmm3 ; wk(8)=col1
- movdqa XMMWORD [wk(9)], xmm4 ; wk(9)=col3
-
- movdqa xmm3,xmm0 ; transpose coefficients(phase 3)
- punpcklqdq xmm0,xmm6 ; xmm0=col4=(04 14 24 34 44 54 64 74)
- punpckhqdq xmm3,xmm6 ; xmm3=col5=(05 15 25 35 45 55 65 75)
- movdqa xmm4,xmm2 ; transpose coefficients(phase 3)
- punpcklqdq xmm2,xmm5 ; xmm2=col6=(06 16 26 36 46 56 66 76)
- punpckhqdq xmm4,xmm5 ; xmm4=col7=(07 17 27 37 47 57 67 77)
-
- movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5
- movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7
-.column_end:
-
- ; -- Prefetch the next coefficient block
-
- prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
- prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
- prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
- prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
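These four prefetchnta lines cover the 128 bytes (DCTSIZE2 JCOEFs) of the next coefficient block while pass 2 is still running. An intrinsic equivalent, as an illustration (not code from this patch):

    #include <xmmintrin.h>

    /* Hint the next 64-coefficient block into the non-temporal cache level. */
    static void prefetch_next_block(const short *coef_block)
    {
      const char *next = (const char *)(coef_block + 64);  /* DCTSIZE2 ahead */
      _mm_prefetch(next + 0 * 32, _MM_HINT_NTA);
      _mm_prefetch(next + 1 * 32, _MM_HINT_NTA);
      _mm_prefetch(next + 2 * 32, _MM_HINT_NTA);
      _mm_prefetch(next + 3 * 32, _MM_HINT_NTA);
    }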
-
- ; ---- Pass 2: process rows from work array, store into output array.
-
- mov rax, [original_rbp]
- mov rdi, r12 ; (JSAMPROW *)
- mov eax, r13d
-
- ; -- Even part
-
- ; xmm7=col0, xmm1=col2, xmm0=col4, xmm2=col6
-
- ; (Original)
- ; z1 = (z2 + z3) * 0.541196100;
- ; tmp2 = z1 + z3 * -1.847759065;
- ; tmp3 = z1 + z2 * 0.765366865;
- ;
- ; (This implementation)
- ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
- ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
-
- movdqa xmm6,xmm1 ; xmm1=in2=z2
- movdqa xmm5,xmm1
- punpcklwd xmm6,xmm2 ; xmm2=in6=z3
- punpckhwd xmm5,xmm2
- movdqa xmm1,xmm6
- movdqa xmm2,xmm5
- pmaddwd xmm6,[rel PW_F130_F054] ; xmm6=tmp3L
- pmaddwd xmm5,[rel PW_F130_F054] ; xmm5=tmp3H
- pmaddwd xmm1,[rel PW_F054_MF130] ; xmm1=tmp2L
- pmaddwd xmm2,[rel PW_F054_MF130] ; xmm2=tmp2H
-
- movdqa xmm3,xmm7
- paddw xmm7,xmm0 ; xmm7=in0+in4
- psubw xmm3,xmm0 ; xmm3=in0-in4
-
- pxor xmm4,xmm4
- pxor xmm0,xmm0
- punpcklwd xmm4,xmm7 ; xmm4=tmp0L
- punpckhwd xmm0,xmm7 ; xmm0=tmp0H
- psrad xmm4,(16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS
- psrad xmm0,(16-CONST_BITS) ; psrad xmm0,16 & pslld xmm0,CONST_BITS
-
- movdqa xmm7,xmm4
- paddd xmm4,xmm6 ; xmm4=tmp10L
- psubd xmm7,xmm6 ; xmm7=tmp13L
- movdqa xmm6,xmm0
- paddd xmm0,xmm5 ; xmm0=tmp10H
- psubd xmm6,xmm5 ; xmm6=tmp13H
-
- movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=tmp10L
- movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp10H
- movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=tmp13L
- movdqa XMMWORD [wk(3)], xmm6 ; wk(3)=tmp13H
-
- pxor xmm5,xmm5
- pxor xmm4,xmm4
- punpcklwd xmm5,xmm3 ; xmm5=tmp1L
- punpckhwd xmm4,xmm3 ; xmm4=tmp1H
- psrad xmm5,(16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS
- psrad xmm4,(16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS
-
- movdqa xmm0,xmm5
- paddd xmm5,xmm1 ; xmm5=tmp11L
- psubd xmm0,xmm1 ; xmm0=tmp12L
- movdqa xmm7,xmm4
- paddd xmm4,xmm2 ; xmm4=tmp11H
- psubd xmm7,xmm2 ; xmm7=tmp12H
-
- movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L
- movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=tmp11H
- movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=tmp12L
- movdqa XMMWORD [wk(7)], xmm7 ; wk(7)=tmp12H
-
- ; -- Odd part
-
- movdqa xmm6, XMMWORD [wk(9)] ; xmm6=col3
- movdqa xmm3, XMMWORD [wk(8)] ; xmm3=col1
- movdqa xmm1, XMMWORD [wk(11)] ; xmm1=col7
- movdqa xmm2, XMMWORD [wk(10)] ; xmm2=col5
-
- movdqa xmm5,xmm6
- movdqa xmm4,xmm3
- paddw xmm5,xmm1 ; xmm5=z3
- paddw xmm4,xmm2 ; xmm4=z4
-
- ; (Original)
- ; z5 = (z3 + z4) * 1.175875602;
- ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
- ; z3 += z5; z4 += z5;
- ;
- ; (This implementation)
- ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
- ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
- movdqa xmm0,xmm5
- movdqa xmm7,xmm5
- punpcklwd xmm0,xmm4
- punpckhwd xmm7,xmm4
- movdqa xmm5,xmm0
- movdqa xmm4,xmm7
- pmaddwd xmm0,[rel PW_MF078_F117] ; xmm0=z3L
- pmaddwd xmm7,[rel PW_MF078_F117] ; xmm7=z3H
- pmaddwd xmm5,[rel PW_F117_F078] ; xmm5=z4L
- pmaddwd xmm4,[rel PW_F117_F078] ; xmm4=z4H
-
- movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=z3L
- movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=z3H
-
- ; (Original)
- ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
- ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
- ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
- ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
- ; tmp0 += z1 + z3; tmp1 += z2 + z4;
- ; tmp2 += z2 + z3; tmp3 += z1 + z4;
- ;
- ; (This implementation)
- ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
- ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
- ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
- ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
- ; tmp0 += z3; tmp1 += z4;
- ; tmp2 += z3; tmp3 += z4;
-
- movdqa xmm0,xmm1
- movdqa xmm7,xmm1
- punpcklwd xmm0,xmm3
- punpckhwd xmm7,xmm3
- movdqa xmm1,xmm0
- movdqa xmm3,xmm7
- pmaddwd xmm0,[rel PW_MF060_MF089] ; xmm0=tmp0L
- pmaddwd xmm7,[rel PW_MF060_MF089] ; xmm7=tmp0H
- pmaddwd xmm1,[rel PW_MF089_F060] ; xmm1=tmp3L
- pmaddwd xmm3,[rel PW_MF089_F060] ; xmm3=tmp3H
-
- paddd xmm0, XMMWORD [wk(10)] ; xmm0=tmp0L
- paddd xmm7, XMMWORD [wk(11)] ; xmm7=tmp0H
- paddd xmm1,xmm5 ; xmm1=tmp3L
- paddd xmm3,xmm4 ; xmm3=tmp3H
-
- movdqa XMMWORD [wk(8)], xmm0 ; wk(8)=tmp0L
- movdqa XMMWORD [wk(9)], xmm7 ; wk(9)=tmp0H
-
- movdqa xmm0,xmm2
- movdqa xmm7,xmm2
- punpcklwd xmm0,xmm6
- punpckhwd xmm7,xmm6
- movdqa xmm2,xmm0
- movdqa xmm6,xmm7
- pmaddwd xmm0,[rel PW_MF050_MF256] ; xmm0=tmp1L
- pmaddwd xmm7,[rel PW_MF050_MF256] ; xmm7=tmp1H
- pmaddwd xmm2,[rel PW_MF256_F050] ; xmm2=tmp2L
- pmaddwd xmm6,[rel PW_MF256_F050] ; xmm6=tmp2H
-
- paddd xmm0,xmm5 ; xmm0=tmp1L
- paddd xmm7,xmm4 ; xmm7=tmp1H
- paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp2L
- paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H
-
- movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=tmp1L
- movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=tmp1H
-
- ; -- Final output stage
-
- movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L
- movdqa xmm4, XMMWORD [wk(1)] ; xmm4=tmp10H
-
- movdqa xmm0,xmm5
- movdqa xmm7,xmm4
- paddd xmm5,xmm1 ; xmm5=data0L
- paddd xmm4,xmm3 ; xmm4=data0H
- psubd xmm0,xmm1 ; xmm0=data7L
- psubd xmm7,xmm3 ; xmm7=data7H
-
- movdqa xmm1,[rel PD_DESCALE_P2] ; xmm1=[rel PD_DESCALE_P2]
-
- paddd xmm5,xmm1
- paddd xmm4,xmm1
- psrad xmm5,DESCALE_P2
- psrad xmm4,DESCALE_P2
- paddd xmm0,xmm1
- paddd xmm7,xmm1
- psrad xmm0,DESCALE_P2
- psrad xmm7,DESCALE_P2
-
- packssdw xmm5,xmm4 ; xmm5=data0=(00 10 20 30 40 50 60 70)
- packssdw xmm0,xmm7 ; xmm0=data7=(07 17 27 37 47 57 67 77)
-
- movdqa xmm3, XMMWORD [wk(4)] ; xmm3=tmp11L
- movdqa xmm1, XMMWORD [wk(5)] ; xmm1=tmp11H
-
- movdqa xmm4,xmm3
- movdqa xmm7,xmm1
- paddd xmm3,xmm2 ; xmm3=data1L
- paddd xmm1,xmm6 ; xmm1=data1H
- psubd xmm4,xmm2 ; xmm4=data6L
- psubd xmm7,xmm6 ; xmm7=data6H
-
- movdqa xmm2,[rel PD_DESCALE_P2] ; xmm2=[rel PD_DESCALE_P2]
-
- paddd xmm3,xmm2
- paddd xmm1,xmm2
- psrad xmm3,DESCALE_P2
- psrad xmm1,DESCALE_P2
- paddd xmm4,xmm2
- paddd xmm7,xmm2
- psrad xmm4,DESCALE_P2
- psrad xmm7,DESCALE_P2
-
- packssdw xmm3,xmm1 ; xmm3=data1=(01 11 21 31 41 51 61 71)
- packssdw xmm4,xmm7 ; xmm4=data6=(06 16 26 36 46 56 66 76)
-
- packsswb xmm5,xmm4 ; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
- packsswb xmm3,xmm0 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
-
- movdqa xmm6, XMMWORD [wk(6)] ; xmm6=tmp12L
- movdqa xmm2, XMMWORD [wk(7)] ; xmm2=tmp12H
- movdqa xmm1, XMMWORD [wk(10)] ; xmm1=tmp1L
- movdqa xmm7, XMMWORD [wk(11)] ; xmm7=tmp1H
-
- movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
- movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
-
- movdqa xmm4,xmm6
- movdqa xmm0,xmm2
- paddd xmm6,xmm1 ; xmm6=data2L
- paddd xmm2,xmm7 ; xmm2=data2H
- psubd xmm4,xmm1 ; xmm4=data5L
- psubd xmm0,xmm7 ; xmm0=data5H
-
- movdqa xmm5,[rel PD_DESCALE_P2] ; xmm5=[rel PD_DESCALE_P2]
-
- paddd xmm6,xmm5
- paddd xmm2,xmm5
- psrad xmm6,DESCALE_P2
- psrad xmm2,DESCALE_P2
- paddd xmm4,xmm5
- paddd xmm0,xmm5
- psrad xmm4,DESCALE_P2
- psrad xmm0,DESCALE_P2
-
- packssdw xmm6,xmm2 ; xmm6=data2=(02 12 22 32 42 52 62 72)
- packssdw xmm4,xmm0 ; xmm4=data5=(05 15 25 35 45 55 65 75)
-
- movdqa xmm3, XMMWORD [wk(2)] ; xmm3=tmp13L
- movdqa xmm1, XMMWORD [wk(3)] ; xmm1=tmp13H
- movdqa xmm7, XMMWORD [wk(8)] ; xmm7=tmp0L
- movdqa xmm5, XMMWORD [wk(9)] ; xmm5=tmp0H
-
- movdqa xmm2,xmm3
- movdqa xmm0,xmm1
- paddd xmm3,xmm7 ; xmm3=data3L
- paddd xmm1,xmm5 ; xmm1=data3H
- psubd xmm2,xmm7 ; xmm2=data4L
- psubd xmm0,xmm5 ; xmm0=data4H
-
- movdqa xmm7,[rel PD_DESCALE_P2] ; xmm7=[rel PD_DESCALE_P2]
-
- paddd xmm3,xmm7
- paddd xmm1,xmm7
- psrad xmm3,DESCALE_P2
- psrad xmm1,DESCALE_P2
- paddd xmm2,xmm7
- paddd xmm0,xmm7
- psrad xmm2,DESCALE_P2
- psrad xmm0,DESCALE_P2
-
- movdqa xmm5,[rel PB_CENTERJSAMP] ; xmm5=[rel PB_CENTERJSAMP]
-
- packssdw xmm3,xmm1 ; xmm3=data3=(03 13 23 33 43 53 63 73)
- packssdw xmm2,xmm0 ; xmm2=data4=(04 14 24 34 44 54 64 74)
-
- movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
- movdqa xmm1, XMMWORD [wk(1)] ; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
-
- packsswb xmm6,xmm2 ; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
- packsswb xmm3,xmm4 ; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
-
- paddb xmm7,xmm5
- paddb xmm1,xmm5
- paddb xmm6,xmm5
- paddb xmm3,xmm5
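The packssdw/packsswb/paddb sequence stands in for the range_limit table of jidctint.c: the saturating packs clamp each descaled result to the signed 8-bit range, and adding CENTERJSAMPLE lands it in 0..255. Per sample, in C:

    #include <stdint.h>

    #define CENTERJSAMPLE 128

    static uint8_t range_limit(int32_t v)   /* v = descaled IDCT output */
    {
      if (v < -128) v = -128;               /* packssdw/packsswb saturation */
      if (v >  127) v =  127;
      return (uint8_t)(v + CENTERJSAMPLE);  /* paddb PB_CENTERJSAMP */
    }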
-
- movdqa xmm0,xmm7 ; transpose coefficients(phase 1)
- punpcklbw xmm7,xmm1 ; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
- punpckhbw xmm0,xmm1 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
- movdqa xmm2,xmm6 ; transpose coefficients(phase 1)
- punpcklbw xmm6,xmm3 ; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
- punpckhbw xmm2,xmm3 ; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
-
- movdqa xmm4,xmm7 ; transpose coefficients(phase 2)
- punpcklwd xmm7,xmm6 ; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
- punpckhwd xmm4,xmm6 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
- movdqa xmm5,xmm2 ; transpose coefficients(phase 2)
- punpcklwd xmm2,xmm0 ; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
- punpckhwd xmm5,xmm0 ; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
-
- movdqa xmm1,xmm7 ; transpose coefficients(phase 3)
- punpckldq xmm7,xmm2 ; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
- punpckhdq xmm1,xmm2 ; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
- movdqa xmm3,xmm4 ; transpose coefficients(phase 3)
- punpckldq xmm4,xmm5 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
- punpckhdq xmm3,xmm5 ; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
-
- pshufd xmm6,xmm7,0x4E ; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
- pshufd xmm0,xmm1,0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
- pshufd xmm2,xmm4,0x4E ; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
- pshufd xmm5,xmm3,0x4E ; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
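The three punpck phases above form the classic interleave transpose: each phase merges pairs of rows at twice the previous granularity (bytes, then words, then dwords), so log2(8) = 3 phases transpose the 8x8 tile, and the final pshufd 0x4E swaps qword halves to expose the odd-numbered rows. A scalar model of one byte-interleave step (a simplification):

    #include <stdint.h>

    /* One punpcklbw/punpckhbw pair on two 16-byte rows: interleave the low
       halves into lo[] and the high halves into hi[]. */
    static void interleave_bytes(const uint8_t a[16], const uint8_t b[16],
                                 uint8_t lo[16], uint8_t hi[16])
    {
      for (int i = 0; i < 8; i++) {
        lo[2 * i]     = a[i];      lo[2 * i + 1] = b[i];
        hi[2 * i]     = a[i + 8];  hi[2 * i + 1] = b[i + 8];
      }
    }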
-
- mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
- mov rsi, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
- movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm7
- movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm1
- mov rdx, JSAMPROW [rdi+4*SIZEOF_JSAMPROW]
- mov rsi, JSAMPROW [rdi+6*SIZEOF_JSAMPROW]
- movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
- movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
-
- mov rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
- mov rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
- movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
- movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0
- mov rdx, JSAMPROW [rdi+5*SIZEOF_JSAMPROW]
- mov rsi, JSAMPROW [rdi+7*SIZEOF_JSAMPROW]
- movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm2
- movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm5
-
- uncollect_args
- mov rsp,rbp ; rsp <- aligned rbp
- pop rsp ; rsp <- original rbp
- pop rbp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
diff --git a/media/libjpeg/simd/jidctint-sse2.asm b/media/libjpeg/simd/jidctint-sse2.asm
deleted file mode 100644
index 6c7e7d9b4f..0000000000
--- a/media/libjpeg/simd/jidctint-sse2.asm
+++ /dev/null
@@ -1,858 +0,0 @@
-;
-; jidctint.asm - accurate integer IDCT (SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler) and
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a slow-but-accurate integer implementation of the
-; inverse DCT (Discrete Cosine Transform). The following code is based
-; directly on the IJG's original jidctint.c; see jidctint.c for
-; more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS 13
-%define PASS1_BITS 2
-
-%define DESCALE_P1 (CONST_BITS-PASS1_BITS)
-%define DESCALE_P2 (CONST_BITS+PASS1_BITS+3)
-
-%if CONST_BITS == 13
-F_0_298 equ 2446 ; FIX(0.298631336)
-F_0_390 equ 3196 ; FIX(0.390180644)
-F_0_541 equ 4433 ; FIX(0.541196100)
-F_0_765 equ 6270 ; FIX(0.765366865)
-F_0_899 equ 7373 ; FIX(0.899976223)
-F_1_175 equ 9633 ; FIX(1.175875602)
-F_1_501 equ 12299 ; FIX(1.501321110)
-F_1_847 equ 15137 ; FIX(1.847759065)
-F_1_961 equ 16069 ; FIX(1.961570560)
-F_2_053 equ 16819 ; FIX(2.053119869)
-F_2_562 equ 20995 ; FIX(2.562915447)
-F_3_072 equ 25172 ; FIX(3.072711026)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
-F_0_298 equ DESCALE( 320652955,30-CONST_BITS) ; FIX(0.298631336)
-F_0_390 equ DESCALE( 418953276,30-CONST_BITS) ; FIX(0.390180644)
-F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100)
-F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865)
-F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223)
-F_1_175 equ DESCALE(1262586813,30-CONST_BITS) ; FIX(1.175875602)
-F_1_501 equ DESCALE(1612031267,30-CONST_BITS) ; FIX(1.501321110)
-F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065)
-F_1_961 equ DESCALE(2106220350,30-CONST_BITS) ; FIX(1.961570560)
-F_2_053 equ DESCALE(2204520673,30-CONST_BITS) ; FIX(2.053119869)
-F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447)
-F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026)
-%endif
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
- alignz 16
- global EXTN(jconst_idct_islow_sse2)
-
-EXTN(jconst_idct_islow_sse2):
-
-PW_F130_F054 times 4 dw (F_0_541+F_0_765), F_0_541
-PW_F054_MF130 times 4 dw F_0_541, (F_0_541-F_1_847)
-PW_MF078_F117 times 4 dw (F_1_175-F_1_961), F_1_175
-PW_F117_F078 times 4 dw F_1_175, (F_1_175-F_0_390)
-PW_MF060_MF089 times 4 dw (F_0_298-F_0_899),-F_0_899
-PW_MF089_F060 times 4 dw -F_0_899, (F_1_501-F_0_899)
-PW_MF050_MF256 times 4 dw (F_2_053-F_2_562),-F_2_562
-PW_MF256_F050 times 4 dw -F_2_562, (F_3_072-F_2_562)
-PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1-1)
-PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2-1)
-PB_CENTERJSAMP times 16 db CENTERJSAMPLE
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
-;
-; Perform dequantization and inverse DCT on one block of coefficients.
-;
-; GLOBAL(void)
-; jsimd_idct_islow_sse2 (void *dct_table, JCOEFPTR coef_block,
-; JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-%define dct_table(b) (b)+8 ; jpeg_component_info *compptr
-%define coef_block(b) (b)+12 ; JCOEFPTR coef_block
-%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf
-%define output_col(b) (b)+20 ; JDIMENSION output_col
-
-%define original_ebp ebp+0
-%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 12
-
- align 16
- global EXTN(jsimd_idct_islow_sse2)
-
-EXTN(jsimd_idct_islow_sse2):
- push ebp
- mov eax,esp ; eax = original ebp
- sub esp, byte 4
- and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [esp],eax
- mov ebp,esp ; ebp = aligned ebp
- lea esp, [wk(0)]
- pushpic ebx
-; push ecx ; unused
-; push edx ; need not be preserved
- push esi
- push edi
-
- get_GOT ebx ; get GOT address
-
- ; ---- Pass 1: process columns from input.
-
-; mov eax, [original_ebp]
- mov edx, POINTER [dct_table(eax)] ; quantptr
- mov esi, JCOEFPTR [coef_block(eax)] ; inptr
-
-%ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2
- mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
- or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
- jnz near .columnDCT
-
- movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
- movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
- por xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
- por xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
- por xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
- por xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
- por xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
- por xmm1,xmm0
- packsswb xmm1,xmm1
- packsswb xmm1,xmm1
- movd eax,xmm1
- test eax,eax
- jnz short .columnDCT
-
- ; -- AC terms all zero
-
- movdqa xmm5, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
- pmullw xmm5, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
- psllw xmm5,PASS1_BITS
-
- movdqa xmm4,xmm5 ; xmm5=in0=(00 01 02 03 04 05 06 07)
- punpcklwd xmm5,xmm5 ; xmm5=(00 00 01 01 02 02 03 03)
- punpckhwd xmm4,xmm4 ; xmm4=(04 04 05 05 06 06 07 07)
-
- pshufd xmm7,xmm5,0x00 ; xmm7=col0=(00 00 00 00 00 00 00 00)
- pshufd xmm6,xmm5,0x55 ; xmm6=col1=(01 01 01 01 01 01 01 01)
- pshufd xmm1,xmm5,0xAA ; xmm1=col2=(02 02 02 02 02 02 02 02)
- pshufd xmm5,xmm5,0xFF ; xmm5=col3=(03 03 03 03 03 03 03 03)
- pshufd xmm0,xmm4,0x00 ; xmm0=col4=(04 04 04 04 04 04 04 04)
- pshufd xmm3,xmm4,0x55 ; xmm3=col5=(05 05 05 05 05 05 05 05)
- pshufd xmm2,xmm4,0xAA ; xmm2=col6=(06 06 06 06 06 06 06 06)
- pshufd xmm4,xmm4,0xFF ; xmm4=col7=(07 07 07 07 07 07 07 07)
-
- movdqa XMMWORD [wk(8)], xmm6 ; wk(8)=col1
- movdqa XMMWORD [wk(9)], xmm5 ; wk(9)=col3
- movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5
- movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7
- jmp near .column_end
- alignx 16,7
-%endif
-.columnDCT:
-
- ; -- Even part
-
- movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
- movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
- pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- movdqa xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
- movdqa xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
- pmullw xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
- ; (Original)
- ; z1 = (z2 + z3) * 0.541196100;
- ; tmp2 = z1 + z3 * -1.847759065;
- ; tmp3 = z1 + z2 * 0.765366865;
- ;
- ; (This implementation)
- ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
- ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
-
- movdqa xmm4,xmm1 ; xmm1=in2=z2
- movdqa xmm5,xmm1
- punpcklwd xmm4,xmm3 ; xmm3=in6=z3
- punpckhwd xmm5,xmm3
- movdqa xmm1,xmm4
- movdqa xmm3,xmm5
- pmaddwd xmm4,[GOTOFF(ebx,PW_F130_F054)] ; xmm4=tmp3L
- pmaddwd xmm5,[GOTOFF(ebx,PW_F130_F054)] ; xmm5=tmp3H
- pmaddwd xmm1,[GOTOFF(ebx,PW_F054_MF130)] ; xmm1=tmp2L
- pmaddwd xmm3,[GOTOFF(ebx,PW_F054_MF130)] ; xmm3=tmp2H
-
- movdqa xmm6,xmm0
- paddw xmm0,xmm2 ; xmm0=in0+in4
- psubw xmm6,xmm2 ; xmm6=in0-in4
-
- pxor xmm7,xmm7
- pxor xmm2,xmm2
- punpcklwd xmm7,xmm0 ; xmm7=tmp0L
- punpckhwd xmm2,xmm0 ; xmm2=tmp0H
- psrad xmm7,(16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS
- psrad xmm2,(16-CONST_BITS) ; psrad xmm2,16 & pslld xmm2,CONST_BITS
-
- movdqa xmm0,xmm7
- paddd xmm7,xmm4 ; xmm7=tmp10L
- psubd xmm0,xmm4 ; xmm0=tmp13L
- movdqa xmm4,xmm2
- paddd xmm2,xmm5 ; xmm2=tmp10H
- psubd xmm4,xmm5 ; xmm4=tmp13H
-
- movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=tmp10L
- movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=tmp10H
- movdqa XMMWORD [wk(2)], xmm0 ; wk(2)=tmp13L
- movdqa XMMWORD [wk(3)], xmm4 ; wk(3)=tmp13H
-
- pxor xmm5,xmm5
- pxor xmm7,xmm7
- punpcklwd xmm5,xmm6 ; xmm5=tmp1L
- punpckhwd xmm7,xmm6 ; xmm7=tmp1H
- psrad xmm5,(16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS
- psrad xmm7,(16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS
-
- movdqa xmm2,xmm5
- paddd xmm5,xmm1 ; xmm5=tmp11L
- psubd xmm2,xmm1 ; xmm2=tmp12L
- movdqa xmm0,xmm7
- paddd xmm7,xmm3 ; xmm7=tmp11H
- psubd xmm0,xmm3 ; xmm0=tmp12H
-
- movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L
- movdqa XMMWORD [wk(5)], xmm7 ; wk(5)=tmp11H
- movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=tmp12L
- movdqa XMMWORD [wk(7)], xmm0 ; wk(7)=tmp12H
-
- ; -- Odd part
-
- movdqa xmm4, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
- movdqa xmm6, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
- pmullw xmm4, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw xmm6, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- movdqa xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
- movdqa xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
- pmullw xmm1, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
- movdqa xmm5,xmm6
- movdqa xmm7,xmm4
- paddw xmm5,xmm3 ; xmm5=z3
- paddw xmm7,xmm1 ; xmm7=z4
-
- ; (Original)
- ; z5 = (z3 + z4) * 1.175875602;
- ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
- ; z3 += z5; z4 += z5;
- ;
- ; (This implementation)
- ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
- ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
- movdqa xmm2,xmm5
- movdqa xmm0,xmm5
- punpcklwd xmm2,xmm7
- punpckhwd xmm0,xmm7
- movdqa xmm5,xmm2
- movdqa xmm7,xmm0
- pmaddwd xmm2,[GOTOFF(ebx,PW_MF078_F117)] ; xmm2=z3L
- pmaddwd xmm0,[GOTOFF(ebx,PW_MF078_F117)] ; xmm0=z3H
- pmaddwd xmm5,[GOTOFF(ebx,PW_F117_F078)] ; xmm5=z4L
- pmaddwd xmm7,[GOTOFF(ebx,PW_F117_F078)] ; xmm7=z4H
-
- movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=z3L
- movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=z3H
-
- ; (Original)
- ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
- ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
- ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
- ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
- ; tmp0 += z1 + z3; tmp1 += z2 + z4;
- ; tmp2 += z2 + z3; tmp3 += z1 + z4;
- ;
- ; (This implementation)
- ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
- ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
- ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
- ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
- ; tmp0 += z3; tmp1 += z4;
- ; tmp2 += z3; tmp3 += z4;
-
- movdqa xmm2,xmm3
- movdqa xmm0,xmm3
- punpcklwd xmm2,xmm4
- punpckhwd xmm0,xmm4
- movdqa xmm3,xmm2
- movdqa xmm4,xmm0
- pmaddwd xmm2,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm2=tmp0L
- pmaddwd xmm0,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm0=tmp0H
- pmaddwd xmm3,[GOTOFF(ebx,PW_MF089_F060)] ; xmm3=tmp3L
- pmaddwd xmm4,[GOTOFF(ebx,PW_MF089_F060)] ; xmm4=tmp3H
-
- paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp0L
- paddd xmm0, XMMWORD [wk(11)] ; xmm0=tmp0H
- paddd xmm3,xmm5 ; xmm3=tmp3L
- paddd xmm4,xmm7 ; xmm4=tmp3H
-
- movdqa XMMWORD [wk(8)], xmm2 ; wk(8)=tmp0L
- movdqa XMMWORD [wk(9)], xmm0 ; wk(9)=tmp0H
-
- movdqa xmm2,xmm1
- movdqa xmm0,xmm1
- punpcklwd xmm2,xmm6
- punpckhwd xmm0,xmm6
- movdqa xmm1,xmm2
- movdqa xmm6,xmm0
- pmaddwd xmm2,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm2=tmp1L
- pmaddwd xmm0,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm0=tmp1H
- pmaddwd xmm1,[GOTOFF(ebx,PW_MF256_F050)] ; xmm1=tmp2L
- pmaddwd xmm6,[GOTOFF(ebx,PW_MF256_F050)] ; xmm6=tmp2H
-
- paddd xmm2,xmm5 ; xmm2=tmp1L
- paddd xmm0,xmm7 ; xmm0=tmp1H
- paddd xmm1, XMMWORD [wk(10)] ; xmm1=tmp2L
- paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H
-
- movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=tmp1L
- movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=tmp1H
-
- ; -- Final output stage
-
- movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L
- movdqa xmm7, XMMWORD [wk(1)] ; xmm7=tmp10H
-
- movdqa xmm2,xmm5
- movdqa xmm0,xmm7
- paddd xmm5,xmm3 ; xmm5=data0L
- paddd xmm7,xmm4 ; xmm7=data0H
- psubd xmm2,xmm3 ; xmm2=data7L
- psubd xmm0,xmm4 ; xmm0=data7H
-
- movdqa xmm3,[GOTOFF(ebx,PD_DESCALE_P1)] ; xmm3=[PD_DESCALE_P1]
-
- paddd xmm5,xmm3
- paddd xmm7,xmm3
- psrad xmm5,DESCALE_P1
- psrad xmm7,DESCALE_P1
- paddd xmm2,xmm3
- paddd xmm0,xmm3
- psrad xmm2,DESCALE_P1
- psrad xmm0,DESCALE_P1
-
- packssdw xmm5,xmm7 ; xmm5=data0=(00 01 02 03 04 05 06 07)
- packssdw xmm2,xmm0 ; xmm2=data7=(70 71 72 73 74 75 76 77)
-
- movdqa xmm4, XMMWORD [wk(4)] ; xmm4=tmp11L
- movdqa xmm3, XMMWORD [wk(5)] ; xmm3=tmp11H
-
- movdqa xmm7,xmm4
- movdqa xmm0,xmm3
- paddd xmm4,xmm1 ; xmm4=data1L
- paddd xmm3,xmm6 ; xmm3=data1H
- psubd xmm7,xmm1 ; xmm7=data6L
- psubd xmm0,xmm6 ; xmm0=data6H
-
- movdqa xmm1,[GOTOFF(ebx,PD_DESCALE_P1)] ; xmm1=[PD_DESCALE_P1]
-
- paddd xmm4,xmm1
- paddd xmm3,xmm1
- psrad xmm4,DESCALE_P1
- psrad xmm3,DESCALE_P1
- paddd xmm7,xmm1
- paddd xmm0,xmm1
- psrad xmm7,DESCALE_P1
- psrad xmm0,DESCALE_P1
-
- packssdw xmm4,xmm3 ; xmm4=data1=(10 11 12 13 14 15 16 17)
- packssdw xmm7,xmm0 ; xmm7=data6=(60 61 62 63 64 65 66 67)
-
- movdqa xmm6,xmm5 ; transpose coefficients(phase 1)
- punpcklwd xmm5,xmm4 ; xmm5=(00 10 01 11 02 12 03 13)
- punpckhwd xmm6,xmm4 ; xmm6=(04 14 05 15 06 16 07 17)
- movdqa xmm1,xmm7 ; transpose coefficients(phase 1)
- punpcklwd xmm7,xmm2 ; xmm7=(60 70 61 71 62 72 63 73)
- punpckhwd xmm1,xmm2 ; xmm1=(64 74 65 75 66 76 67 77)
-
- movdqa xmm3, XMMWORD [wk(6)] ; xmm3=tmp12L
- movdqa xmm0, XMMWORD [wk(7)] ; xmm0=tmp12H
- movdqa xmm4, XMMWORD [wk(10)] ; xmm4=tmp1L
- movdqa xmm2, XMMWORD [wk(11)] ; xmm2=tmp1H
-
- movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 01 11 02 12 03 13)
- movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=(04 14 05 15 06 16 07 17)
- movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=(60 70 61 71 62 72 63 73)
- movdqa XMMWORD [wk(5)], xmm1 ; wk(5)=(64 74 65 75 66 76 67 77)
-
- movdqa xmm5,xmm3
- movdqa xmm6,xmm0
- paddd xmm3,xmm4 ; xmm3=data2L
- paddd xmm0,xmm2 ; xmm0=data2H
- psubd xmm5,xmm4 ; xmm5=data5L
- psubd xmm6,xmm2 ; xmm6=data5H
-
- movdqa xmm7,[GOTOFF(ebx,PD_DESCALE_P1)] ; xmm7=[PD_DESCALE_P1]
-
- paddd xmm3,xmm7
- paddd xmm0,xmm7
- psrad xmm3,DESCALE_P1
- psrad xmm0,DESCALE_P1
- paddd xmm5,xmm7
- paddd xmm6,xmm7
- psrad xmm5,DESCALE_P1
- psrad xmm6,DESCALE_P1
-
- packssdw xmm3,xmm0 ; xmm3=data2=(20 21 22 23 24 25 26 27)
- packssdw xmm5,xmm6 ; xmm5=data5=(50 51 52 53 54 55 56 57)
-
- movdqa xmm1, XMMWORD [wk(2)] ; xmm1=tmp13L
- movdqa xmm4, XMMWORD [wk(3)] ; xmm4=tmp13H
- movdqa xmm2, XMMWORD [wk(8)] ; xmm2=tmp0L
- movdqa xmm7, XMMWORD [wk(9)] ; xmm7=tmp0H
-
- movdqa xmm0,xmm1
- movdqa xmm6,xmm4
- paddd xmm1,xmm2 ; xmm1=data3L
- paddd xmm4,xmm7 ; xmm4=data3H
- psubd xmm0,xmm2 ; xmm0=data4L
- psubd xmm6,xmm7 ; xmm6=data4H
-
- movdqa xmm2,[GOTOFF(ebx,PD_DESCALE_P1)] ; xmm2=[PD_DESCALE_P1]
-
- paddd xmm1,xmm2
- paddd xmm4,xmm2
- psrad xmm1,DESCALE_P1
- psrad xmm4,DESCALE_P1
- paddd xmm0,xmm2
- paddd xmm6,xmm2
- psrad xmm0,DESCALE_P1
- psrad xmm6,DESCALE_P1
-
- packssdw xmm1,xmm4 ; xmm1=data3=(30 31 32 33 34 35 36 37)
- packssdw xmm0,xmm6 ; xmm0=data4=(40 41 42 43 44 45 46 47)
-
- movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 01 11 02 12 03 13)
- movdqa xmm2, XMMWORD [wk(1)] ; xmm2=(04 14 05 15 06 16 07 17)
-
- movdqa xmm4,xmm3 ; transpose coefficients(phase 1)
- punpcklwd xmm3,xmm1 ; xmm3=(20 30 21 31 22 32 23 33)
- punpckhwd xmm4,xmm1 ; xmm4=(24 34 25 35 26 36 27 37)
- movdqa xmm6,xmm0 ; transpose coefficients(phase 1)
- punpcklwd xmm0,xmm5 ; xmm0=(40 50 41 51 42 52 43 53)
- punpckhwd xmm6,xmm5 ; xmm6=(44 54 45 55 46 56 47 57)
-
- movdqa xmm1,xmm7 ; transpose coefficients(phase 2)
- punpckldq xmm7,xmm3 ; xmm7=(00 10 20 30 01 11 21 31)
- punpckhdq xmm1,xmm3 ; xmm1=(02 12 22 32 03 13 23 33)
- movdqa xmm5,xmm2 ; transpose coefficients(phase 2)
- punpckldq xmm2,xmm4 ; xmm2=(04 14 24 34 05 15 25 35)
- punpckhdq xmm5,xmm4 ; xmm5=(06 16 26 36 07 17 27 37)
-
- movdqa xmm3, XMMWORD [wk(4)] ; xmm3=(60 70 61 71 62 72 63 73)
- movdqa xmm4, XMMWORD [wk(5)] ; xmm4=(64 74 65 75 66 76 67 77)
-
- movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=(04 14 24 34 05 15 25 35)
- movdqa XMMWORD [wk(7)], xmm5 ; wk(7)=(06 16 26 36 07 17 27 37)
-
- movdqa xmm2,xmm0 ; transpose coefficients(phase 2)
- punpckldq xmm0,xmm3 ; xmm0=(40 50 60 70 41 51 61 71)
- punpckhdq xmm2,xmm3 ; xmm2=(42 52 62 72 43 53 63 73)
- movdqa xmm5,xmm6 ; transpose coefficients(phase 2)
- punpckldq xmm6,xmm4 ; xmm6=(44 54 64 74 45 55 65 75)
- punpckhdq xmm5,xmm4 ; xmm5=(46 56 66 76 47 57 67 77)
-
- movdqa xmm3,xmm7 ; transpose coefficients(phase 3)
- punpcklqdq xmm7,xmm0 ; xmm7=col0=(00 10 20 30 40 50 60 70)
- punpckhqdq xmm3,xmm0 ; xmm3=col1=(01 11 21 31 41 51 61 71)
- movdqa xmm4,xmm1 ; transpose coefficients(phase 3)
- punpcklqdq xmm1,xmm2 ; xmm1=col2=(02 12 22 32 42 52 62 72)
- punpckhqdq xmm4,xmm2 ; xmm4=col3=(03 13 23 33 43 53 63 73)
-
- movdqa xmm0, XMMWORD [wk(6)] ; xmm0=(04 14 24 34 05 15 25 35)
- movdqa xmm2, XMMWORD [wk(7)] ; xmm2=(06 16 26 36 07 17 27 37)
-
- movdqa XMMWORD [wk(8)], xmm3 ; wk(8)=col1
- movdqa XMMWORD [wk(9)], xmm4 ; wk(9)=col3
-
- movdqa xmm3,xmm0 ; transpose coefficients(phase 3)
- punpcklqdq xmm0,xmm6 ; xmm0=col4=(04 14 24 34 44 54 64 74)
- punpckhqdq xmm3,xmm6 ; xmm3=col5=(05 15 25 35 45 55 65 75)
- movdqa xmm4,xmm2 ; transpose coefficients(phase 3)
- punpcklqdq xmm2,xmm5 ; xmm2=col6=(06 16 26 36 46 56 66 76)
- punpckhqdq xmm4,xmm5 ; xmm4=col7=(07 17 27 37 47 57 67 77)
-
- movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5
- movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7
-.column_end:
-
- ; -- Prefetch the next coefficient block
-
- prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
- prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
- prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
- prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
-
- ; ---- Pass 2: process rows from work array, store into output array.
-
- mov eax, [original_ebp]
- mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
- mov eax, JDIMENSION [output_col(eax)]
-
- ; -- Even part
-
- ; xmm7=col0, xmm1=col2, xmm0=col4, xmm2=col6
-
- ; (Original)
- ; z1 = (z2 + z3) * 0.541196100;
- ; tmp2 = z1 + z3 * -1.847759065;
- ; tmp3 = z1 + z2 * 0.765366865;
- ;
- ; (This implementation)
- ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
- ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
-
- movdqa xmm6,xmm1 ; xmm1=in2=z2
- movdqa xmm5,xmm1
- punpcklwd xmm6,xmm2 ; xmm2=in6=z3
- punpckhwd xmm5,xmm2
- movdqa xmm1,xmm6
- movdqa xmm2,xmm5
- pmaddwd xmm6,[GOTOFF(ebx,PW_F130_F054)] ; xmm6=tmp3L
- pmaddwd xmm5,[GOTOFF(ebx,PW_F130_F054)] ; xmm5=tmp3H
- pmaddwd xmm1,[GOTOFF(ebx,PW_F054_MF130)] ; xmm1=tmp2L
- pmaddwd xmm2,[GOTOFF(ebx,PW_F054_MF130)] ; xmm2=tmp2H
-
- movdqa xmm3,xmm7
- paddw xmm7,xmm0 ; xmm7=in0+in4
- psubw xmm3,xmm0 ; xmm3=in0-in4
-
- pxor xmm4,xmm4
- pxor xmm0,xmm0
- punpcklwd xmm4,xmm7 ; xmm4=tmp0L
- punpckhwd xmm0,xmm7 ; xmm0=tmp0H
- psrad xmm4,(16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS
- psrad xmm0,(16-CONST_BITS) ; psrad xmm0,16 & pslld xmm0,CONST_BITS
-
- movdqa xmm7,xmm4
- paddd xmm4,xmm6 ; xmm4=tmp10L
- psubd xmm7,xmm6 ; xmm7=tmp13L
- movdqa xmm6,xmm0
- paddd xmm0,xmm5 ; xmm0=tmp10H
- psubd xmm6,xmm5 ; xmm6=tmp13H
-
- movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=tmp10L
- movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp10H
- movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=tmp13L
- movdqa XMMWORD [wk(3)], xmm6 ; wk(3)=tmp13H
-
- pxor xmm5,xmm5
- pxor xmm4,xmm4
- punpcklwd xmm5,xmm3 ; xmm5=tmp1L
- punpckhwd xmm4,xmm3 ; xmm4=tmp1H
- psrad xmm5,(16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS
- psrad xmm4,(16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS
-
- movdqa xmm0,xmm5
- paddd xmm5,xmm1 ; xmm5=tmp11L
- psubd xmm0,xmm1 ; xmm0=tmp12L
- movdqa xmm7,xmm4
- paddd xmm4,xmm2 ; xmm4=tmp11H
- psubd xmm7,xmm2 ; xmm7=tmp12H
-
- movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L
- movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=tmp11H
- movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=tmp12L
- movdqa XMMWORD [wk(7)], xmm7 ; wk(7)=tmp12H
-
- ; -- Odd part
-
- movdqa xmm6, XMMWORD [wk(9)] ; xmm6=col3
- movdqa xmm3, XMMWORD [wk(8)] ; xmm3=col1
- movdqa xmm1, XMMWORD [wk(11)] ; xmm1=col7
- movdqa xmm2, XMMWORD [wk(10)] ; xmm2=col5
-
- movdqa xmm5,xmm6
- movdqa xmm4,xmm3
- paddw xmm5,xmm1 ; xmm5=z3
- paddw xmm4,xmm2 ; xmm4=z4
-
- ; (Original)
- ; z5 = (z3 + z4) * 1.175875602;
- ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
- ; z3 += z5; z4 += z5;
- ;
- ; (This implementation)
- ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
- ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
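
Each pmaddwd below multiplies two adjacent signed 16-bit words by a packed
constant pair and sums the products into one signed 32-bit lane, which is why
the constants are interleaved as (c0, c1) pairs.  A per-lane C model
(illustrative only):

    #include <stdint.h>

    /* One pmaddwd lane: a*c0 + b*c1 with 16-bit inputs, 32-bit result.
       With (a, b) = (z3, z4) and (c0, c1) = (F_1_175 - F_1_961, F_1_175)
       this computes the new z3 in a single step. */
    static int32_t pmaddwd_lane(int16_t a, int16_t b, int16_t c0, int16_t c1)
    {
      return (int32_t)a * c0 + (int32_t)b * c1;
    }
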
- movdqa xmm0,xmm5
- movdqa xmm7,xmm5
- punpcklwd xmm0,xmm4
- punpckhwd xmm7,xmm4
- movdqa xmm5,xmm0
- movdqa xmm4,xmm7
- pmaddwd xmm0,[GOTOFF(ebx,PW_MF078_F117)] ; xmm0=z3L
- pmaddwd xmm7,[GOTOFF(ebx,PW_MF078_F117)] ; xmm7=z3H
- pmaddwd xmm5,[GOTOFF(ebx,PW_F117_F078)] ; xmm5=z4L
- pmaddwd xmm4,[GOTOFF(ebx,PW_F117_F078)] ; xmm4=z4H
-
- movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=z3L
- movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=z3H
-
- ; (Original)
- ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
- ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
- ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
- ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
- ; tmp0 += z1 + z3; tmp1 += z2 + z4;
- ; tmp2 += z2 + z3; tmp3 += z1 + z4;
- ;
- ; (This implementation)
- ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
- ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
- ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
- ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
- ; tmp0 += z3; tmp1 += z4;
- ; tmp2 += z3; tmp3 += z4;
-
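
A scalar sketch of the regrouped odd part, transcribed from the comment above
(hedged: variable names follow the comment rather than any real source file;
z3 and z4 are the rotated values computed in the previous step):

    /* Each output needs one multiply-accumulate pair plus one add,
       matching the pmaddwd/paddd sequence that follows.  Inputs are the
       coefficients from rows/columns 1, 3, 5 and 7. */
    static void odd_part(double in1, double in3, double in5, double in7,
                         double z3, double z4, double tmp[4])
    {
      tmp[0] = in7 * (0.298631336 - 0.899976223) + in1 * -0.899976223 + z3;
      tmp[1] = in5 * (2.053119869 - 2.562915447) + in3 * -2.562915447 + z4;
      tmp[2] = in5 * -2.562915447 + in3 * (3.072711026 - 2.562915447) + z3;
      tmp[3] = in7 * -0.899976223 + in1 * (1.501321110 - 0.899976223) + z4;
    }
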
- movdqa xmm0,xmm1
- movdqa xmm7,xmm1
- punpcklwd xmm0,xmm3
- punpckhwd xmm7,xmm3
- movdqa xmm1,xmm0
- movdqa xmm3,xmm7
- pmaddwd xmm0,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm0=tmp0L
- pmaddwd xmm7,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm7=tmp0H
- pmaddwd xmm1,[GOTOFF(ebx,PW_MF089_F060)] ; xmm1=tmp3L
- pmaddwd xmm3,[GOTOFF(ebx,PW_MF089_F060)] ; xmm3=tmp3H
-
- paddd xmm0, XMMWORD [wk(10)] ; xmm0=tmp0L
- paddd xmm7, XMMWORD [wk(11)] ; xmm7=tmp0H
- paddd xmm1,xmm5 ; xmm1=tmp3L
- paddd xmm3,xmm4 ; xmm3=tmp3H
-
- movdqa XMMWORD [wk(8)], xmm0 ; wk(8)=tmp0L
- movdqa XMMWORD [wk(9)], xmm7 ; wk(9)=tmp0H
-
- movdqa xmm0,xmm2
- movdqa xmm7,xmm2
- punpcklwd xmm0,xmm6
- punpckhwd xmm7,xmm6
- movdqa xmm2,xmm0
- movdqa xmm6,xmm7
- pmaddwd xmm0,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm0=tmp1L
- pmaddwd xmm7,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm7=tmp1H
- pmaddwd xmm2,[GOTOFF(ebx,PW_MF256_F050)] ; xmm2=tmp2L
- pmaddwd xmm6,[GOTOFF(ebx,PW_MF256_F050)] ; xmm6=tmp2H
-
- paddd xmm0,xmm5 ; xmm0=tmp1L
- paddd xmm7,xmm4 ; xmm7=tmp1H
- paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp2L
- paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H
-
- movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=tmp1L
- movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=tmp1H
-
- ; -- Final output stage
-
- movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L
- movdqa xmm4, XMMWORD [wk(1)] ; xmm4=tmp10H
-
- movdqa xmm0,xmm5
- movdqa xmm7,xmm4
- paddd xmm5,xmm1 ; xmm5=data0L
- paddd xmm4,xmm3 ; xmm4=data0H
- psubd xmm0,xmm1 ; xmm0=data7L
- psubd xmm7,xmm3 ; xmm7=data7H
-
- movdqa xmm1,[GOTOFF(ebx,PD_DESCALE_P2)] ; xmm1=[PD_DESCALE_P2]
-
- paddd xmm5,xmm1
- paddd xmm4,xmm1
- psrad xmm5,DESCALE_P2
- psrad xmm4,DESCALE_P2
- paddd xmm0,xmm1
- paddd xmm7,xmm1
- psrad xmm0,DESCALE_P2
- psrad xmm7,DESCALE_P2
-
- packssdw xmm5,xmm4 ; xmm5=data0=(00 10 20 30 40 50 60 70)
- packssdw xmm0,xmm7 ; xmm0=data7=(07 17 27 37 47 57 67 77)
-
- movdqa xmm3, XMMWORD [wk(4)] ; xmm3=tmp11L
- movdqa xmm1, XMMWORD [wk(5)] ; xmm1=tmp11H
-
- movdqa xmm4,xmm3
- movdqa xmm7,xmm1
- paddd xmm3,xmm2 ; xmm3=data1L
- paddd xmm1,xmm6 ; xmm1=data1H
- psubd xmm4,xmm2 ; xmm4=data6L
- psubd xmm7,xmm6 ; xmm7=data6H
-
- movdqa xmm2,[GOTOFF(ebx,PD_DESCALE_P2)] ; xmm2=[PD_DESCALE_P2]
-
- paddd xmm3,xmm2
- paddd xmm1,xmm2
- psrad xmm3,DESCALE_P2
- psrad xmm1,DESCALE_P2
- paddd xmm4,xmm2
- paddd xmm7,xmm2
- psrad xmm4,DESCALE_P2
- psrad xmm7,DESCALE_P2
-
- packssdw xmm3,xmm1 ; xmm3=data1=(01 11 21 31 41 51 61 71)
- packssdw xmm4,xmm7 ; xmm4=data6=(06 16 26 36 46 56 66 76)
-
- packsswb xmm5,xmm4 ; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
- packsswb xmm3,xmm0 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
-
- movdqa xmm6, XMMWORD [wk(6)] ; xmm6=tmp12L
- movdqa xmm2, XMMWORD [wk(7)] ; xmm2=tmp12H
- movdqa xmm1, XMMWORD [wk(10)] ; xmm1=tmp1L
- movdqa xmm7, XMMWORD [wk(11)] ; xmm7=tmp1H
-
- movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
- movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
-
- movdqa xmm4,xmm6
- movdqa xmm0,xmm2
- paddd xmm6,xmm1 ; xmm6=data2L
- paddd xmm2,xmm7 ; xmm2=data2H
- psubd xmm4,xmm1 ; xmm4=data5L
- psubd xmm0,xmm7 ; xmm0=data5H
-
- movdqa xmm5,[GOTOFF(ebx,PD_DESCALE_P2)] ; xmm5=[PD_DESCALE_P2]
-
- paddd xmm6,xmm5
- paddd xmm2,xmm5
- psrad xmm6,DESCALE_P2
- psrad xmm2,DESCALE_P2
- paddd xmm4,xmm5
- paddd xmm0,xmm5
- psrad xmm4,DESCALE_P2
- psrad xmm0,DESCALE_P2
-
- packssdw xmm6,xmm2 ; xmm6=data2=(02 12 22 32 42 52 62 72)
- packssdw xmm4,xmm0 ; xmm4=data5=(05 15 25 35 45 55 65 75)
-
- movdqa xmm3, XMMWORD [wk(2)] ; xmm3=tmp13L
- movdqa xmm1, XMMWORD [wk(3)] ; xmm1=tmp13H
- movdqa xmm7, XMMWORD [wk(8)] ; xmm7=tmp0L
- movdqa xmm5, XMMWORD [wk(9)] ; xmm5=tmp0H
-
- movdqa xmm2,xmm3
- movdqa xmm0,xmm1
- paddd xmm3,xmm7 ; xmm3=data3L
- paddd xmm1,xmm5 ; xmm1=data3H
- psubd xmm2,xmm7 ; xmm2=data4L
- psubd xmm0,xmm5 ; xmm0=data4H
-
- movdqa xmm7,[GOTOFF(ebx,PD_DESCALE_P2)] ; xmm7=[PD_DESCALE_P2]
-
- paddd xmm3,xmm7
- paddd xmm1,xmm7
- psrad xmm3,DESCALE_P2
- psrad xmm1,DESCALE_P2
- paddd xmm2,xmm7
- paddd xmm0,xmm7
- psrad xmm2,DESCALE_P2
- psrad xmm0,DESCALE_P2
-
- movdqa xmm5,[GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm5=[PB_CENTERJSAMP]
-
- packssdw xmm3,xmm1 ; xmm3=data3=(03 13 23 33 43 53 63 73)
- packssdw xmm2,xmm0 ; xmm2=data4=(04 14 24 34 44 54 64 74)
-
- movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
- movdqa xmm1, XMMWORD [wk(1)] ; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
-
- packsswb xmm6,xmm2 ; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
- packsswb xmm3,xmm4 ; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
-
- paddb xmm7,xmm5
- paddb xmm1,xmm5
- paddb xmm6,xmm5
- paddb xmm3,xmm5
-
- movdqa xmm0,xmm7 ; transpose coefficients(phase 1)
- punpcklbw xmm7,xmm1 ; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
- punpckhbw xmm0,xmm1 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
- movdqa xmm2,xmm6 ; transpose coefficients(phase 1)
- punpcklbw xmm6,xmm3 ; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
- punpckhbw xmm2,xmm3 ; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
-
- movdqa xmm4,xmm7 ; transpose coefficients(phase 2)
- punpcklwd xmm7,xmm6 ; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
- punpckhwd xmm4,xmm6 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
- movdqa xmm5,xmm2 ; transpose coefficients(phase 2)
- punpcklwd xmm2,xmm0 ; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
- punpckhwd xmm5,xmm0 ; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
-
- movdqa xmm1,xmm7 ; transpose coefficients(phase 3)
- punpckldq xmm7,xmm2 ; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
- punpckhdq xmm1,xmm2 ; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
- movdqa xmm3,xmm4 ; transpose coefficients(phase 3)
- punpckldq xmm4,xmm5 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
- punpckhdq xmm3,xmm5 ; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
-
- pshufd xmm6,xmm7,0x4E ; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
- pshufd xmm0,xmm1,0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
- pshufd xmm2,xmm4,0x4E ; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
- pshufd xmm5,xmm3,0x4E ; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
-
- mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
- mov esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
- movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm7
- movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm1
- mov edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW]
- mov esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW]
- movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
- movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
-
- mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
- mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
- movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
- movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0
- mov edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW]
- mov esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW]
- movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm2
- movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm5
-
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; unused
- poppic ebx
- mov esp,ebp ; esp <- aligned ebp
- pop esp ; esp <- original ebp
- pop ebp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
diff --git a/media/libjpeg/simd/jidctred-mmx.asm b/media/libjpeg/simd/jidctred-mmx.asm
deleted file mode 100644
index ba054e31a0..0000000000
--- a/media/libjpeg/simd/jidctred-mmx.asm
+++ /dev/null
@@ -1,705 +0,0 @@
-;
-; jidctred.asm - reduced-size IDCT (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler); it can
-; *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains inverse-DCT routines that produce reduced-size
-; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
-; The following code is based directly on the IJG's original jidctred.c;
-; see jidctred.c for more details.
-;
-; [TAB8]
-
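
These reduced-size routines are reached when the decompressor is asked for a
scaled-down image; a minimal usage sketch in C (error handling and the data
source are omitted, and the helper name is illustrative):

    #include <jpeglib.h>

    /* Requesting 1/2 scale makes libjpeg emit 4x4 pixels per 8x8 DCT
       block, selecting the 4x4 routines in this file; 1/4 scale selects
       the 2x2 routines. */
    static void request_reduced_size(struct jpeg_decompress_struct *cinfo)
    {
      cinfo->scale_num = 1;
      cinfo->scale_denom = 4;            /* 8x8 blocks -> 2x2 pixels */
      jpeg_start_decompress(cinfo);
    }
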
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS 13
-%define PASS1_BITS 2
-
-%define DESCALE_P1_4 (CONST_BITS-PASS1_BITS+1)
-%define DESCALE_P2_4 (CONST_BITS+PASS1_BITS+3+1)
-%define DESCALE_P1_2 (CONST_BITS-PASS1_BITS+2)
-%define DESCALE_P2_2 (CONST_BITS+PASS1_BITS+3+2)
-
-%if CONST_BITS == 13
-F_0_211 equ 1730 ; FIX(0.211164243)
-F_0_509 equ 4176 ; FIX(0.509795579)
-F_0_601 equ 4926 ; FIX(0.601344887)
-F_0_720 equ 5906 ; FIX(0.720959822)
-F_0_765 equ 6270 ; FIX(0.765366865)
-F_0_850 equ 6967 ; FIX(0.850430095)
-F_0_899 equ 7373 ; FIX(0.899976223)
-F_1_061 equ 8697 ; FIX(1.061594337)
-F_1_272 equ 10426 ; FIX(1.272758580)
-F_1_451 equ 11893 ; FIX(1.451774981)
-F_1_847 equ 15137 ; FIX(1.847759065)
-F_2_172 equ 17799 ; FIX(2.172734803)
-F_2_562 equ 20995 ; FIX(2.562915447)
-F_3_624 equ 29692 ; FIX(3.624509785)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
-F_0_211 equ DESCALE( 226735879,30-CONST_BITS) ; FIX(0.211164243)
-F_0_509 equ DESCALE( 547388834,30-CONST_BITS) ; FIX(0.509795579)
-F_0_601 equ DESCALE( 645689155,30-CONST_BITS) ; FIX(0.601344887)
-F_0_720 equ DESCALE( 774124714,30-CONST_BITS) ; FIX(0.720959822)
-F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865)
-F_0_850 equ DESCALE( 913142361,30-CONST_BITS) ; FIX(0.850430095)
-F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223)
-F_1_061 equ DESCALE(1139878239,30-CONST_BITS) ; FIX(1.061594337)
-F_1_272 equ DESCALE(1366614119,30-CONST_BITS) ; FIX(1.272758580)
-F_1_451 equ DESCALE(1558831516,30-CONST_BITS) ; FIX(1.451774981)
-F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065)
-F_2_172 equ DESCALE(2332956230,30-CONST_BITS) ; FIX(2.172734803)
-F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447)
-F_3_624 equ DESCALE(3891787747,30-CONST_BITS) ; FIX(3.624509785)
-%endif
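
The FIX() values above are 13-bit-fraction fixed-point constants; a C
restatement of the two conventions, mirroring the definitions above (a sketch):

    #include <stdint.h>

    #define CONST_BITS 13

    /* FIX(x) scales x by 2^13 with rounding, e.g. FIX(1.847759065) == 15137,
       matching F_1_847 above. */
    #define FIX(x)  ((int32_t)((x) * (1 << CONST_BITS) + 0.5))

    /* DESCALE(x,n) divides by 2^n with rounding, as in the %else branch. */
    #define DESCALE(x, n)  (((x) + ((int32_t)1 << ((n) - 1))) >> (n))
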
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
- alignz 16
- global EXTN(jconst_idct_red_mmx)
-
-EXTN(jconst_idct_red_mmx):
-
-PW_F184_MF076 times 2 dw F_1_847,-F_0_765
-PW_F256_F089 times 2 dw F_2_562, F_0_899
-PW_F106_MF217 times 2 dw F_1_061,-F_2_172
-PW_MF060_MF050 times 2 dw -F_0_601,-F_0_509
-PW_F145_MF021 times 2 dw F_1_451,-F_0_211
-PW_F362_MF127 times 2 dw F_3_624,-F_1_272
-PW_F085_MF072 times 2 dw F_0_850,-F_0_720
-PD_DESCALE_P1_4 times 2 dd 1 << (DESCALE_P1_4-1)
-PD_DESCALE_P2_4 times 2 dd 1 << (DESCALE_P2_4-1)
-PD_DESCALE_P1_2 times 2 dd 1 << (DESCALE_P1_2-1)
-PD_DESCALE_P2_2 times 2 dd 1 << (DESCALE_P2_2-1)
-PB_CENTERJSAMP times 8 db CENTERJSAMPLE
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
-;
-; Perform dequantization and inverse DCT on one block of coefficients,
-; producing a reduced-size 4x4 output block.
-;
-; GLOBAL(void)
-; jsimd_idct_4x4_mmx (void *dct_table, JCOEFPTR coef_block,
-; JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-%define dct_table(b) (b)+8 ; void *dct_table
-%define coef_block(b) (b)+12 ; JCOEFPTR coef_block
-%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf
-%define output_col(b) (b)+20 ; JDIMENSION output_col
-
-%define original_ebp ebp+0
-%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM]
-%define WK_NUM 2
-%define workspace wk(0)-DCTSIZE2*SIZEOF_JCOEF
- ; JCOEF workspace[DCTSIZE2]
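
The prologue below keeps two frames: an MMWORD-aligned one holding wk[] and the
workspace, plus the caller's original ebp saved just underneath.  The alignment
step is the usual round-down-by-masking trick; a C model (illustrative only):

    #include <stdint.h>

    /* Round a pointer down to a power-of-two boundary, as the
       `and esp, -SIZEOF_MMWORD` below does for the stack pointer. */
    static void *align_down(void *p, uintptr_t align)
    {
      return (void *)((uintptr_t)p & ~(align - 1));
    }
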
-
- align 16
- global EXTN(jsimd_idct_4x4_mmx)
-
-EXTN(jsimd_idct_4x4_mmx):
- push ebp
- mov eax,esp ; eax = original ebp
- sub esp, byte 4
- and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
- mov [esp],eax
- mov ebp,esp ; ebp = aligned ebp
- lea esp, [workspace]
- pushpic ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- get_GOT ebx ; get GOT address
-
- ; ---- Pass 1: process columns from input, store into work array.
-
-; mov eax, [original_ebp]
- mov edx, POINTER [dct_table(eax)] ; quantptr
- mov esi, JCOEFPTR [coef_block(eax)] ; inptr
- lea edi, [workspace] ; JCOEF *wsptr
- mov ecx, DCTSIZE/4 ; ctr
- alignx 16,7
-.columnloop:
-%ifndef NO_ZERO_COLUMN_TEST_4X4_MMX
- mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
- or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
- jnz short .columnDCT
-
- movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
- movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
- por mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
- por mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
- por mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
- por mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
- por mm0,mm1
- packsswb mm0,mm0
- movd eax,mm0
- test eax,eax
- jnz short .columnDCT
-
- ; -- AC terms all zero
-
- movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
- pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
- psllw mm0,PASS1_BITS
-
- movq mm2,mm0 ; mm0=in0=(00 01 02 03)
- punpcklwd mm0,mm0 ; mm0=(00 00 01 01)
- punpckhwd mm2,mm2 ; mm2=(02 02 03 03)
-
- movq mm1,mm0
- punpckldq mm0,mm0 ; mm0=(00 00 00 00)
- punpckhdq mm1,mm1 ; mm1=(01 01 01 01)
- movq mm3,mm2
- punpckldq mm2,mm2 ; mm2=(02 02 02 02)
- punpckhdq mm3,mm3 ; mm3=(03 03 03 03)
-
- movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
- movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
- movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
- movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
- jmp near .nextcolumn
- alignx 16,7
-%endif
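
The block above is a fast path: when every AC coefficient in the column group
is zero, the 1-D IDCT collapses to a constant, so only the DC term has to be
dequantized and replicated.  A scalar sketch (names are illustrative):

    /* If col[1..7] are all zero, each output of the column IDCT equals
       the dequantized DC value, pre-scaled by PASS1_BITS for pass 2. */
    static void idct_col_dc_only(const short *col, const short *quant,
                                 short *out, int nrows, int pass1_bits)
    {
      short dc = (short)((col[0] * quant[0]) << pass1_bits);
      for (int i = 0; i < nrows; i++)
        out[i] = dc;
    }
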
-.columnDCT:
-
- ; -- Odd part
-
- movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
- movq mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
- pmullw mm0, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- movq mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
- movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
- pmullw mm2, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
- movq mm4,mm0
- movq mm5,mm0
- punpcklwd mm4,mm1
- punpckhwd mm5,mm1
- movq mm0,mm4
- movq mm1,mm5
- pmaddwd mm4,[GOTOFF(ebx,PW_F256_F089)] ; mm4=(tmp2L)
- pmaddwd mm5,[GOTOFF(ebx,PW_F256_F089)] ; mm5=(tmp2H)
- pmaddwd mm0,[GOTOFF(ebx,PW_F106_MF217)] ; mm0=(tmp0L)
- pmaddwd mm1,[GOTOFF(ebx,PW_F106_MF217)] ; mm1=(tmp0H)
-
- movq mm6,mm2
- movq mm7,mm2
- punpcklwd mm6,mm3
- punpckhwd mm7,mm3
- movq mm2,mm6
- movq mm3,mm7
- pmaddwd mm6,[GOTOFF(ebx,PW_MF060_MF050)] ; mm6=(tmp2L)
- pmaddwd mm7,[GOTOFF(ebx,PW_MF060_MF050)] ; mm7=(tmp2H)
- pmaddwd mm2,[GOTOFF(ebx,PW_F145_MF021)] ; mm2=(tmp0L)
- pmaddwd mm3,[GOTOFF(ebx,PW_F145_MF021)] ; mm3=(tmp0H)
-
- paddd mm6,mm4 ; mm6=tmp2L
- paddd mm7,mm5 ; mm7=tmp2H
- paddd mm2,mm0 ; mm2=tmp0L
- paddd mm3,mm1 ; mm3=tmp0H
-
- movq MMWORD [wk(0)], mm2 ; wk(0)=tmp0L
- movq MMWORD [wk(1)], mm3 ; wk(1)=tmp0H
-
- ; -- Even part
-
- movq mm4, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
- movq mm5, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
- movq mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
- pmullw mm4, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw mm5, MMWORD [MMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw mm0, MMWORD [MMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
- pxor mm1,mm1
- pxor mm2,mm2
- punpcklwd mm1,mm4 ; mm1=tmp0L
- punpckhwd mm2,mm4 ; mm2=tmp0H
- psrad mm1,(16-CONST_BITS-1) ; psrad mm1,16 & pslld mm1,CONST_BITS+1
- psrad mm2,(16-CONST_BITS-1) ; psrad mm2,16 & pslld mm2,CONST_BITS+1
-
- movq mm3,mm5 ; mm5=in2=z2
- punpcklwd mm5,mm0 ; mm0=in6=z3
- punpckhwd mm3,mm0
- pmaddwd mm5,[GOTOFF(ebx,PW_F184_MF076)] ; mm5=tmp2L
- pmaddwd mm3,[GOTOFF(ebx,PW_F184_MF076)] ; mm3=tmp2H
-
- movq mm4,mm1
- movq mm0,mm2
- paddd mm1,mm5 ; mm1=tmp10L
- paddd mm2,mm3 ; mm2=tmp10H
- psubd mm4,mm5 ; mm4=tmp12L
- psubd mm0,mm3 ; mm0=tmp12H
-
- ; -- Final output stage
-
- movq mm5,mm1
- movq mm3,mm2
- paddd mm1,mm6 ; mm1=data0L
- paddd mm2,mm7 ; mm2=data0H
- psubd mm5,mm6 ; mm5=data3L
- psubd mm3,mm7 ; mm3=data3H
-
- movq mm6,[GOTOFF(ebx,PD_DESCALE_P1_4)] ; mm6=[PD_DESCALE_P1_4]
-
- paddd mm1,mm6
- paddd mm2,mm6
- psrad mm1,DESCALE_P1_4
- psrad mm2,DESCALE_P1_4
- paddd mm5,mm6
- paddd mm3,mm6
- psrad mm5,DESCALE_P1_4
- psrad mm3,DESCALE_P1_4
-
- packssdw mm1,mm2 ; mm1=data0=(00 01 02 03)
- packssdw mm5,mm3 ; mm5=data3=(30 31 32 33)
-
- movq mm7, MMWORD [wk(0)] ; mm7=tmp0L
- movq mm6, MMWORD [wk(1)] ; mm6=tmp0H
-
- movq mm2,mm4
- movq mm3,mm0
- paddd mm4,mm7 ; mm4=data1L
- paddd mm0,mm6 ; mm0=data1H
- psubd mm2,mm7 ; mm2=data2L
- psubd mm3,mm6 ; mm3=data2H
-
- movq mm7,[GOTOFF(ebx,PD_DESCALE_P1_4)] ; mm7=[PD_DESCALE_P1_4]
-
- paddd mm4,mm7
- paddd mm0,mm7
- psrad mm4,DESCALE_P1_4
- psrad mm0,DESCALE_P1_4
- paddd mm2,mm7
- paddd mm3,mm7
- psrad mm2,DESCALE_P1_4
- psrad mm3,DESCALE_P1_4
-
- packssdw mm4,mm0 ; mm4=data1=(10 11 12 13)
- packssdw mm2,mm3 ; mm2=data2=(20 21 22 23)
-
- movq mm6,mm1 ; transpose coefficients(phase 1)
- punpcklwd mm1,mm4 ; mm1=(00 10 01 11)
- punpckhwd mm6,mm4 ; mm6=(02 12 03 13)
- movq mm7,mm2 ; transpose coefficients(phase 1)
- punpcklwd mm2,mm5 ; mm2=(20 30 21 31)
- punpckhwd mm7,mm5 ; mm7=(22 32 23 33)
-
- movq mm0,mm1 ; transpose coefficients(phase 2)
- punpckldq mm1,mm2 ; mm1=(00 10 20 30)
- punpckhdq mm0,mm2 ; mm0=(01 11 21 31)
- movq mm3,mm6 ; transpose coefficients(phase 2)
- punpckldq mm6,mm7 ; mm6=(02 12 22 32)
- punpckhdq mm3,mm7 ; mm3=(03 13 23 33)
-
- movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm1
- movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0
- movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm6
- movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
-
-.nextcolumn:
- add esi, byte 4*SIZEOF_JCOEF ; coef_block
- add edx, byte 4*SIZEOF_ISLOW_MULT_TYPE ; quantptr
- add edi, byte 4*DCTSIZE*SIZEOF_JCOEF ; wsptr
- dec ecx ; ctr
- jnz near .columnloop
-
- ; ---- Pass 2: process rows from work array, store into output array.
-
- mov eax, [original_ebp]
- lea esi, [workspace] ; JCOEF *wsptr
- mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
- mov eax, JDIMENSION [output_col(eax)]
-
- ; -- Odd part
-
- movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
- movq mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
- movq mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
- movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-
- movq mm4,mm0
- movq mm5,mm0
- punpcklwd mm4,mm1
- punpckhwd mm5,mm1
- movq mm0,mm4
- movq mm1,mm5
- pmaddwd mm4,[GOTOFF(ebx,PW_F256_F089)] ; mm4=(tmp2L)
- pmaddwd mm5,[GOTOFF(ebx,PW_F256_F089)] ; mm5=(tmp2H)
- pmaddwd mm0,[GOTOFF(ebx,PW_F106_MF217)] ; mm0=(tmp0L)
- pmaddwd mm1,[GOTOFF(ebx,PW_F106_MF217)] ; mm1=(tmp0H)
-
- movq mm6,mm2
- movq mm7,mm2
- punpcklwd mm6,mm3
- punpckhwd mm7,mm3
- movq mm2,mm6
- movq mm3,mm7
- pmaddwd mm6,[GOTOFF(ebx,PW_MF060_MF050)] ; mm6=(tmp2L)
- pmaddwd mm7,[GOTOFF(ebx,PW_MF060_MF050)] ; mm7=(tmp2H)
- pmaddwd mm2,[GOTOFF(ebx,PW_F145_MF021)] ; mm2=(tmp0L)
- pmaddwd mm3,[GOTOFF(ebx,PW_F145_MF021)] ; mm3=(tmp0H)
-
- paddd mm6,mm4 ; mm6=tmp2L
- paddd mm7,mm5 ; mm7=tmp2H
- paddd mm2,mm0 ; mm2=tmp0L
- paddd mm3,mm1 ; mm3=tmp0H
-
- movq MMWORD [wk(0)], mm2 ; wk(0)=tmp0L
- movq MMWORD [wk(1)], mm3 ; wk(1)=tmp0H
-
- ; -- Even part
-
- movq mm4, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
- movq mm5, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
- movq mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-
- pxor mm1,mm1
- pxor mm2,mm2
- punpcklwd mm1,mm4 ; mm1=tmp0L
- punpckhwd mm2,mm4 ; mm2=tmp0H
- psrad mm1,(16-CONST_BITS-1) ; psrad mm1,16 & pslld mm1,CONST_BITS+1
- psrad mm2,(16-CONST_BITS-1) ; psrad mm2,16 & pslld mm2,CONST_BITS+1
-
- movq mm3,mm5 ; mm5=in2=z2
- punpcklwd mm5,mm0 ; mm0=in6=z3
- punpckhwd mm3,mm0
- pmaddwd mm5,[GOTOFF(ebx,PW_F184_MF076)] ; mm5=tmp2L
- pmaddwd mm3,[GOTOFF(ebx,PW_F184_MF076)] ; mm3=tmp2H
-
- movq mm4,mm1
- movq mm0,mm2
- paddd mm1,mm5 ; mm1=tmp10L
- paddd mm2,mm3 ; mm2=tmp10H
- psubd mm4,mm5 ; mm4=tmp12L
- psubd mm0,mm3 ; mm0=tmp12H
-
- ; -- Final output stage
-
- movq mm5,mm1
- movq mm3,mm2
- paddd mm1,mm6 ; mm1=data0L
- paddd mm2,mm7 ; mm2=data0H
- psubd mm5,mm6 ; mm5=data3L
- psubd mm3,mm7 ; mm3=data3H
-
- movq mm6,[GOTOFF(ebx,PD_DESCALE_P2_4)] ; mm6=[PD_DESCALE_P2_4]
-
- paddd mm1,mm6
- paddd mm2,mm6
- psrad mm1,DESCALE_P2_4
- psrad mm2,DESCALE_P2_4
- paddd mm5,mm6
- paddd mm3,mm6
- psrad mm5,DESCALE_P2_4
- psrad mm3,DESCALE_P2_4
-
- packssdw mm1,mm2 ; mm1=data0=(00 10 20 30)
- packssdw mm5,mm3 ; mm5=data3=(03 13 23 33)
-
- movq mm7, MMWORD [wk(0)] ; mm7=tmp0L
- movq mm6, MMWORD [wk(1)] ; mm6=tmp0H
-
- movq mm2,mm4
- movq mm3,mm0
- paddd mm4,mm7 ; mm4=data1L
- paddd mm0,mm6 ; mm0=data1H
- psubd mm2,mm7 ; mm2=data2L
- psubd mm3,mm6 ; mm3=data2H
-
- movq mm7,[GOTOFF(ebx,PD_DESCALE_P2_4)] ; mm7=[PD_DESCALE_P2_4]
-
- paddd mm4,mm7
- paddd mm0,mm7
- psrad mm4,DESCALE_P2_4
- psrad mm0,DESCALE_P2_4
- paddd mm2,mm7
- paddd mm3,mm7
- psrad mm2,DESCALE_P2_4
- psrad mm3,DESCALE_P2_4
-
- packssdw mm4,mm0 ; mm4=data1=(01 11 21 31)
- packssdw mm2,mm3 ; mm2=data2=(02 12 22 32)
-
- movq mm6,[GOTOFF(ebx,PB_CENTERJSAMP)] ; mm6=[PB_CENTERJSAMP]
-
- packsswb mm1,mm2 ; mm1=(00 10 20 30 02 12 22 32)
- packsswb mm4,mm5 ; mm4=(01 11 21 31 03 13 23 33)
- paddb mm1,mm6
- paddb mm4,mm6
-
- movq mm7,mm1 ; transpose coefficients(phase 1)
- punpcklbw mm1,mm4 ; mm1=(00 01 10 11 20 21 30 31)
- punpckhbw mm7,mm4 ; mm7=(02 03 12 13 22 23 32 33)
-
- movq mm0,mm1 ; transpose coefficients(phase 2)
- punpcklwd mm1,mm7 ; mm1=(00 01 02 03 10 11 12 13)
- punpckhwd mm0,mm7 ; mm0=(20 21 22 23 30 31 32 33)
-
- mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
- mov esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
- movd DWORD [edx+eax*SIZEOF_JSAMPLE], mm1
- movd DWORD [esi+eax*SIZEOF_JSAMPLE], mm0
-
- psrlq mm1,4*BYTE_BIT
- psrlq mm0,4*BYTE_BIT
-
- mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
- mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
- movd DWORD [edx+eax*SIZEOF_JSAMPLE], mm1
- movd DWORD [esi+eax*SIZEOF_JSAMPLE], mm0
-
- emms ; empty MMX state
-
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- poppic ebx
- mov esp,ebp ; esp <- aligned ebp
- pop esp ; esp <- original ebp
- pop ebp
- ret
-
-
-; --------------------------------------------------------------------------
-;
-; Perform dequantization and inverse DCT on one block of coefficients,
-; producing a reduced-size 2x2 output block.
-;
-; GLOBAL(void)
-; jsimd_idct_2x2_mmx (void *dct_table, JCOEFPTR coef_block,
-; JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-%define dct_table(b) (b)+8 ; void *dct_table
-%define coef_block(b) (b)+12 ; JCOEFPTR coef_block
-%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf
-%define output_col(b) (b)+20 ; JDIMENSION output_col
-
- align 16
- global EXTN(jsimd_idct_2x2_mmx)
-
-EXTN(jsimd_idct_2x2_mmx):
- push ebp
- mov ebp,esp
- push ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- get_GOT ebx ; get GOT address
-
- ; ---- Pass 1: process columns from input.
-
- mov edx, POINTER [dct_table(ebp)] ; quantptr
- mov esi, JCOEFPTR [coef_block(ebp)] ; inptr
-
- ; | input: | result: |
- ; | 00 01 ** 03 ** 05 ** 07 | |
- ; | 10 11 ** 13 ** 15 ** 17 | |
- ; | ** ** ** ** ** ** ** ** | |
- ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
- ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
- ; | 50 51 ** 53 ** 55 ** 57 | |
- ; | ** ** ** ** ** ** ** ** | |
- ; | 70 71 ** 73 ** 75 ** 77 | |
-
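
As the diagram shows, only rows 0, 1, 3, 5 and 7 (and the matching columns)
contribute to the 2x2 result.  A scalar sketch of one column using the FIX()
constants defined above (dequantization omitted; names are illustrative):

    #include <stdint.h>

    /* One 2x2 column: the odd rows form a single weighted sum, and the
       two outputs are DC +/- that sum (before descaling). */
    static void idct_2x2_col(const int16_t in[8], int32_t *A, int32_t *B)
    {
      int32_t tmp0 = in[1] *  29692     /*  FIX(3.624509785) */
                   + in[3] * -10426     /* -FIX(1.272758580) */
                   + in[5] *   6967     /*  FIX(0.850430095) */
                   + in[7] *  -5906;    /* -FIX(0.720959822) */
      int32_t tmp10 = (int32_t)in[0] << (13 + 2);   /* CONST_BITS+2 */
      *A = tmp10 + tmp0;
      *B = tmp10 - tmp0;
    }
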
- ; -- Odd part
-
- movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
- movq mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
- pmullw mm0, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- movq mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
- movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
- pmullw mm2, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
- ; mm0=(10 11 ** 13), mm1=(30 31 ** 33)
- ; mm2=(50 51 ** 53), mm3=(70 71 ** 73)
-
- pcmpeqd mm7,mm7
- pslld mm7,WORD_BIT ; mm7={0x0000 0xFFFF 0x0000 0xFFFF}
-
- movq mm4,mm0 ; mm4=(10 11 ** 13)
- movq mm5,mm2 ; mm5=(50 51 ** 53)
- punpcklwd mm4,mm1 ; mm4=(10 30 11 31)
- punpcklwd mm5,mm3 ; mm5=(50 70 51 71)
- pmaddwd mm4,[GOTOFF(ebx,PW_F362_MF127)]
- pmaddwd mm5,[GOTOFF(ebx,PW_F085_MF072)]
-
- psrld mm0,WORD_BIT ; mm0=(11 -- 13 --)
- pand mm1,mm7 ; mm1=(-- 31 -- 33)
- psrld mm2,WORD_BIT ; mm2=(51 -- 53 --)
- pand mm3,mm7 ; mm3=(-- 71 -- 73)
- por mm0,mm1 ; mm0=(11 31 13 33)
- por mm2,mm3 ; mm2=(51 71 53 73)
- pmaddwd mm0,[GOTOFF(ebx,PW_F362_MF127)]
- pmaddwd mm2,[GOTOFF(ebx,PW_F085_MF072)]
-
- paddd mm4,mm5 ; mm4=tmp0[col0 col1]
-
- movq mm6, MMWORD [MMBLOCK(1,1,esi,SIZEOF_JCOEF)]
- movq mm1, MMWORD [MMBLOCK(3,1,esi,SIZEOF_JCOEF)]
- pmullw mm6, MMWORD [MMBLOCK(1,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw mm1, MMWORD [MMBLOCK(3,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
- movq mm3, MMWORD [MMBLOCK(5,1,esi,SIZEOF_JCOEF)]
- movq mm5, MMWORD [MMBLOCK(7,1,esi,SIZEOF_JCOEF)]
- pmullw mm3, MMWORD [MMBLOCK(5,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw mm5, MMWORD [MMBLOCK(7,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
- ; mm6=(** 15 ** 17), mm1=(** 35 ** 37)
- ; mm3=(** 55 ** 57), mm5=(** 75 ** 77)
-
- psrld mm6,WORD_BIT ; mm6=(15 -- 17 --)
- pand mm1,mm7 ; mm1=(-- 35 -- 37)
- psrld mm3,WORD_BIT ; mm3=(55 -- 57 --)
- pand mm5,mm7 ; mm5=(-- 75 -- 77)
- por mm6,mm1 ; mm6=(15 35 17 37)
- por mm3,mm5 ; mm3=(55 75 57 77)
- pmaddwd mm6,[GOTOFF(ebx,PW_F362_MF127)]
- pmaddwd mm3,[GOTOFF(ebx,PW_F085_MF072)]
-
- paddd mm0,mm2 ; mm0=tmp0[col1 col3]
- paddd mm6,mm3 ; mm6=tmp0[col5 col7]
-
- ; -- Even part
-
- movq mm1, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
- movq mm5, MMWORD [MMBLOCK(0,1,esi,SIZEOF_JCOEF)]
- pmullw mm1, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw mm5, MMWORD [MMBLOCK(0,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
- ; mm1=(00 01 ** 03), mm5=(** 05 ** 07)
-
- movq mm2,mm1 ; mm2=(00 01 ** 03)
- pslld mm1,WORD_BIT ; mm1=(-- 00 -- **)
- psrad mm1,(WORD_BIT-CONST_BITS-2) ; mm1=tmp10[col0 ****]
-
- pand mm2,mm7 ; mm2=(-- 01 -- 03)
- pand mm5,mm7 ; mm5=(-- 05 -- 07)
- psrad mm2,(WORD_BIT-CONST_BITS-2) ; mm2=tmp10[col1 col3]
- psrad mm5,(WORD_BIT-CONST_BITS-2) ; mm5=tmp10[col5 col7]
-
- ; -- Final output stage
-
- movq mm3,mm1
- paddd mm1,mm4 ; mm1=data0[col0 ****]=(A0 **)
- psubd mm3,mm4 ; mm3=data1[col0 ****]=(B0 **)
- punpckldq mm1,mm3 ; mm1=(A0 B0)
-
- movq mm7,[GOTOFF(ebx,PD_DESCALE_P1_2)] ; mm7=[PD_DESCALE_P1_2]
-
- movq mm4,mm2
- movq mm3,mm5
- paddd mm2,mm0 ; mm2=data0[col1 col3]=(A1 A3)
- paddd mm5,mm6 ; mm5=data0[col5 col7]=(A5 A7)
- psubd mm4,mm0 ; mm4=data1[col1 col3]=(B1 B3)
- psubd mm3,mm6 ; mm3=data1[col5 col7]=(B5 B7)
-
- paddd mm1,mm7
- psrad mm1,DESCALE_P1_2
-
- paddd mm2,mm7
- paddd mm5,mm7
- psrad mm2,DESCALE_P1_2
- psrad mm5,DESCALE_P1_2
- paddd mm4,mm7
- paddd mm3,mm7
- psrad mm4,DESCALE_P1_2
- psrad mm3,DESCALE_P1_2
-
- ; ---- Pass 2: process rows, store into output array.
-
- mov edi, JSAMPARRAY [output_buf(ebp)] ; (JSAMPROW *)
- mov eax, JDIMENSION [output_col(ebp)]
-
- ; | input:| result:|
- ; | A0 B0 | |
- ; | A1 B1 | C0 C1 |
- ; | A3 B3 | D0 D1 |
- ; | A5 B5 | |
- ; | A7 B7 | |
-
- ; -- Odd part
-
- packssdw mm2,mm4 ; mm2=(A1 A3 B1 B3)
- packssdw mm5,mm3 ; mm5=(A5 A7 B5 B7)
- pmaddwd mm2,[GOTOFF(ebx,PW_F362_MF127)]
- pmaddwd mm5,[GOTOFF(ebx,PW_F085_MF072)]
-
- paddd mm2,mm5 ; mm2=tmp0[row0 row1]
-
- ; -- Even part
-
- pslld mm1,(CONST_BITS+2) ; mm1=tmp10[row0 row1]
-
- ; -- Final output stage
-
- movq mm0,[GOTOFF(ebx,PD_DESCALE_P2_2)] ; mm0=[PD_DESCALE_P2_2]
-
- movq mm6,mm1
- paddd mm1,mm2 ; mm1=data0[row0 row1]=(C0 C1)
- psubd mm6,mm2 ; mm6=data1[row0 row1]=(D0 D1)
-
- paddd mm1,mm0
- paddd mm6,mm0
- psrad mm1,DESCALE_P2_2
- psrad mm6,DESCALE_P2_2
-
- movq mm7,mm1 ; transpose coefficients
- punpckldq mm1,mm6 ; mm1=(C0 D0)
- punpckhdq mm7,mm6 ; mm7=(C1 D1)
-
- packssdw mm1,mm7 ; mm1=(C0 D0 C1 D1)
- packsswb mm1,mm1 ; mm1=(C0 D0 C1 D1 C0 D0 C1 D1)
- paddb mm1,[GOTOFF(ebx,PB_CENTERJSAMP)]
-
- movd ecx,mm1
- movd ebx,mm1 ; ebx=(C0 D0 C1 D1)
- shr ecx,2*BYTE_BIT ; ecx=(C1 D1 -- --)
-
- mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
- mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
- mov WORD [edx+eax*SIZEOF_JSAMPLE], bx
- mov WORD [esi+eax*SIZEOF_JSAMPLE], cx
-
- emms ; empty MMX state
-
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- pop ebx
- pop ebp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
diff --git a/media/libjpeg/simd/jidctred-sse2-64.asm b/media/libjpeg/simd/jidctred-sse2-64.asm
deleted file mode 100644
index a54bbe24e0..0000000000
--- a/media/libjpeg/simd/jidctred-sse2-64.asm
+++ /dev/null
@@ -1,575 +0,0 @@
-;
-; jidctred.asm - reduced-size IDCT (64-bit SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler); it can
-; *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains inverse-DCT routines that produce reduced-size
-; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
-; The following code is based directly on the IJG's original jidctred.c;
-; see jidctred.c for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS 13
-%define PASS1_BITS 2
-
-%define DESCALE_P1_4 (CONST_BITS-PASS1_BITS+1)
-%define DESCALE_P2_4 (CONST_BITS+PASS1_BITS+3+1)
-%define DESCALE_P1_2 (CONST_BITS-PASS1_BITS+2)
-%define DESCALE_P2_2 (CONST_BITS+PASS1_BITS+3+2)
-
-%if CONST_BITS == 13
-F_0_211 equ 1730 ; FIX(0.211164243)
-F_0_509 equ 4176 ; FIX(0.509795579)
-F_0_601 equ 4926 ; FIX(0.601344887)
-F_0_720 equ 5906 ; FIX(0.720959822)
-F_0_765 equ 6270 ; FIX(0.765366865)
-F_0_850 equ 6967 ; FIX(0.850430095)
-F_0_899 equ 7373 ; FIX(0.899976223)
-F_1_061 equ 8697 ; FIX(1.061594337)
-F_1_272 equ 10426 ; FIX(1.272758580)
-F_1_451 equ 11893 ; FIX(1.451774981)
-F_1_847 equ 15137 ; FIX(1.847759065)
-F_2_172 equ 17799 ; FIX(2.172734803)
-F_2_562 equ 20995 ; FIX(2.562915447)
-F_3_624 equ 29692 ; FIX(3.624509785)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
-F_0_211 equ DESCALE( 226735879,30-CONST_BITS) ; FIX(0.211164243)
-F_0_509 equ DESCALE( 547388834,30-CONST_BITS) ; FIX(0.509795579)
-F_0_601 equ DESCALE( 645689155,30-CONST_BITS) ; FIX(0.601344887)
-F_0_720 equ DESCALE( 774124714,30-CONST_BITS) ; FIX(0.720959822)
-F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865)
-F_0_850 equ DESCALE( 913142361,30-CONST_BITS) ; FIX(0.850430095)
-F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223)
-F_1_061 equ DESCALE(1139878239,30-CONST_BITS) ; FIX(1.061594337)
-F_1_272 equ DESCALE(1366614119,30-CONST_BITS) ; FIX(1.272758580)
-F_1_451 equ DESCALE(1558831516,30-CONST_BITS) ; FIX(1.451774981)
-F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065)
-F_2_172 equ DESCALE(2332956230,30-CONST_BITS) ; FIX(2.172734803)
-F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447)
-F_3_624 equ DESCALE(3891787747,30-CONST_BITS) ; FIX(3.624509785)
-%endif
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
- alignz 16
- global EXTN(jconst_idct_red_sse2)
-
-EXTN(jconst_idct_red_sse2):
-
-PW_F184_MF076 times 4 dw F_1_847,-F_0_765
-PW_F256_F089 times 4 dw F_2_562, F_0_899
-PW_F106_MF217 times 4 dw F_1_061,-F_2_172
-PW_MF060_MF050 times 4 dw -F_0_601,-F_0_509
-PW_F145_MF021 times 4 dw F_1_451,-F_0_211
-PW_F362_MF127 times 4 dw F_3_624,-F_1_272
-PW_F085_MF072 times 4 dw F_0_850,-F_0_720
-PD_DESCALE_P1_4 times 4 dd 1 << (DESCALE_P1_4-1)
-PD_DESCALE_P2_4 times 4 dd 1 << (DESCALE_P2_4-1)
-PD_DESCALE_P1_2 times 4 dd 1 << (DESCALE_P1_2-1)
-PD_DESCALE_P2_2 times 4 dd 1 << (DESCALE_P2_2-1)
-PB_CENTERJSAMP times 16 db CENTERJSAMPLE
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 64
-;
-; Perform dequantization and inverse DCT on one block of coefficients,
-; producing a reduced-size 4x4 output block.
-;
-; GLOBAL(void)
-; jsimd_idct_4x4_sse2 (void *dct_table, JCOEFPTR coef_block,
-; JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-; r10 = void *dct_table
-; r11 = JCOEFPTR coef_block
-; r12 = JSAMPARRAY output_buf
-; r13 = JDIMENSION output_col
-
-%define original_rbp rbp+0
-%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 2
-
- align 16
- global EXTN(jsimd_idct_4x4_sse2)
-
-EXTN(jsimd_idct_4x4_sse2):
- push rbp
- mov rax,rsp ; rax = original rbp
- sub rsp, byte 4
- and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [rsp],rax
- mov rbp,rsp ; rbp = aligned rbp
- lea rsp, [wk(0)]
- collect_args
-
- ; ---- Pass 1: process columns from input.
-
- mov rdx, r10 ; quantptr
- mov rsi, r11 ; inptr
-
-%ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2
- mov eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
- or eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
- jnz short .columnDCT
-
- movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
- movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
- por xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
- por xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
- por xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
- por xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
- por xmm0,xmm1
- packsswb xmm0,xmm0
- packsswb xmm0,xmm0
- movd eax,xmm0
- test rax,rax
- jnz short .columnDCT
-
- ; -- AC terms all zero
-
- movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
- pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-
- psllw xmm0,PASS1_BITS
-
- movdqa xmm3,xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07)
- punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
- punpckhwd xmm3,xmm3 ; xmm3=(04 04 05 05 06 06 07 07)
-
- pshufd xmm1,xmm0,0x50 ; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01)
- pshufd xmm0,xmm0,0xFA ; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03)
- pshufd xmm6,xmm3,0x50 ; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05)
- pshufd xmm3,xmm3,0xFA ; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07)
-
- jmp near .column_end
-%endif
-.columnDCT:
-
- ; -- Odd part
-
- movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
- movdqa xmm1, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
- pmullw xmm0, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
- movdqa xmm2, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
- movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
- pmullw xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-
- movdqa xmm4,xmm0
- movdqa xmm5,xmm0
- punpcklwd xmm4,xmm1
- punpckhwd xmm5,xmm1
- movdqa xmm0,xmm4
- movdqa xmm1,xmm5
- pmaddwd xmm4,[rel PW_F256_F089] ; xmm4=(tmp2L)
- pmaddwd xmm5,[rel PW_F256_F089] ; xmm5=(tmp2H)
- pmaddwd xmm0,[rel PW_F106_MF217] ; xmm0=(tmp0L)
- pmaddwd xmm1,[rel PW_F106_MF217] ; xmm1=(tmp0H)
-
- movdqa xmm6,xmm2
- movdqa xmm7,xmm2
- punpcklwd xmm6,xmm3
- punpckhwd xmm7,xmm3
- movdqa xmm2,xmm6
- movdqa xmm3,xmm7
- pmaddwd xmm6,[rel PW_MF060_MF050] ; xmm6=(tmp2L)
- pmaddwd xmm7,[rel PW_MF060_MF050] ; xmm7=(tmp2H)
- pmaddwd xmm2,[rel PW_F145_MF021] ; xmm2=(tmp0L)
- pmaddwd xmm3,[rel PW_F145_MF021] ; xmm3=(tmp0H)
-
- paddd xmm6,xmm4 ; xmm6=tmp2L
- paddd xmm7,xmm5 ; xmm7=tmp2H
- paddd xmm2,xmm0 ; xmm2=tmp0L
- paddd xmm3,xmm1 ; xmm3=tmp0H
-
- movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp0L
- movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=tmp0H
-
- ; -- Even part
-
- movdqa xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
- movdqa xmm5, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
- movdqa xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
- pmullw xmm4, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw xmm5, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw xmm0, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-
- pxor xmm1,xmm1
- pxor xmm2,xmm2
- punpcklwd xmm1,xmm4 ; xmm1=tmp0L
- punpckhwd xmm2,xmm4 ; xmm2=tmp0H
- psrad xmm1,(16-CONST_BITS-1) ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1
- psrad xmm2,(16-CONST_BITS-1) ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1
-
- movdqa xmm3,xmm5 ; xmm5=in2=z2
- punpcklwd xmm5,xmm0 ; xmm0=in6=z3
- punpckhwd xmm3,xmm0
- pmaddwd xmm5,[rel PW_F184_MF076] ; xmm5=tmp2L
- pmaddwd xmm3,[rel PW_F184_MF076] ; xmm3=tmp2H
-
- movdqa xmm4,xmm1
- movdqa xmm0,xmm2
- paddd xmm1,xmm5 ; xmm1=tmp10L
- paddd xmm2,xmm3 ; xmm2=tmp10H
- psubd xmm4,xmm5 ; xmm4=tmp12L
- psubd xmm0,xmm3 ; xmm0=tmp12H
-
- ; -- Final output stage
-
- movdqa xmm5,xmm1
- movdqa xmm3,xmm2
- paddd xmm1,xmm6 ; xmm1=data0L
- paddd xmm2,xmm7 ; xmm2=data0H
- psubd xmm5,xmm6 ; xmm5=data3L
- psubd xmm3,xmm7 ; xmm3=data3H
-
- movdqa xmm6,[rel PD_DESCALE_P1_4] ; xmm6=[rel PD_DESCALE_P1_4]
-
- paddd xmm1,xmm6
- paddd xmm2,xmm6
- psrad xmm1,DESCALE_P1_4
- psrad xmm2,DESCALE_P1_4
- paddd xmm5,xmm6
- paddd xmm3,xmm6
- psrad xmm5,DESCALE_P1_4
- psrad xmm3,DESCALE_P1_4
-
- packssdw xmm1,xmm2 ; xmm1=data0=(00 01 02 03 04 05 06 07)
- packssdw xmm5,xmm3 ; xmm5=data3=(30 31 32 33 34 35 36 37)
-
- movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp0L
- movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp0H
-
- movdqa xmm2,xmm4
- movdqa xmm3,xmm0
- paddd xmm4,xmm7 ; xmm4=data1L
- paddd xmm0,xmm6 ; xmm0=data1H
- psubd xmm2,xmm7 ; xmm2=data2L
- psubd xmm3,xmm6 ; xmm3=data2H
-
- movdqa xmm7,[rel PD_DESCALE_P1_4] ; xmm7=[rel PD_DESCALE_P1_4]
-
- paddd xmm4,xmm7
- paddd xmm0,xmm7
- psrad xmm4,DESCALE_P1_4
- psrad xmm0,DESCALE_P1_4
- paddd xmm2,xmm7
- paddd xmm3,xmm7
- psrad xmm2,DESCALE_P1_4
- psrad xmm3,DESCALE_P1_4
-
- packssdw xmm4,xmm0 ; xmm4=data1=(10 11 12 13 14 15 16 17)
- packssdw xmm2,xmm3 ; xmm2=data2=(20 21 22 23 24 25 26 27)
-
- movdqa xmm6,xmm1 ; transpose coefficients(phase 1)
- punpcklwd xmm1,xmm4 ; xmm1=(00 10 01 11 02 12 03 13)
- punpckhwd xmm6,xmm4 ; xmm6=(04 14 05 15 06 16 07 17)
- movdqa xmm7,xmm2 ; transpose coefficients(phase 1)
- punpcklwd xmm2,xmm5 ; xmm2=(20 30 21 31 22 32 23 33)
- punpckhwd xmm7,xmm5 ; xmm7=(24 34 25 35 26 36 27 37)
-
- movdqa xmm0,xmm1 ; transpose coefficients(phase 2)
- punpckldq xmm1,xmm2 ; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31)
- punpckhdq xmm0,xmm2 ; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33)
- movdqa xmm3,xmm6 ; transpose coefficients(phase 2)
- punpckldq xmm6,xmm7 ; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35)
- punpckhdq xmm3,xmm7 ; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37)
-.column_end:
-
- ; -- Prefetch the next coefficient block
-
- prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
- prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
- prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
- prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
-
- ; ---- Pass 2: process rows, store into output array.
-
- mov rax, [original_rbp]
- mov rdi, r12 ; (JSAMPROW *)
- mov eax, r13d
-
- ; -- Even part
-
- pxor xmm4,xmm4
- punpcklwd xmm4,xmm1 ; xmm4=tmp0
- psrad xmm4,(16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1
-
- ; -- Odd part
-
- punpckhwd xmm1,xmm0
- punpckhwd xmm6,xmm3
- movdqa xmm5,xmm1
- movdqa xmm2,xmm6
- pmaddwd xmm1,[rel PW_F256_F089] ; xmm1=(tmp2)
- pmaddwd xmm6,[rel PW_MF060_MF050] ; xmm6=(tmp2)
- pmaddwd xmm5,[rel PW_F106_MF217] ; xmm5=(tmp0)
- pmaddwd xmm2,[rel PW_F145_MF021] ; xmm2=(tmp0)
-
- paddd xmm6,xmm1 ; xmm6=tmp2
- paddd xmm2,xmm5 ; xmm2=tmp0
-
- ; -- Even part
-
- punpcklwd xmm0,xmm3
- pmaddwd xmm0,[rel PW_F184_MF076] ; xmm0=tmp2
-
- movdqa xmm7,xmm4
- paddd xmm4,xmm0 ; xmm4=tmp10
- psubd xmm7,xmm0 ; xmm7=tmp12
-
- ; -- Final output stage
-
- movdqa xmm1,[rel PD_DESCALE_P2_4] ; xmm1=[rel PD_DESCALE_P2_4]
-
- movdqa xmm5,xmm4
- movdqa xmm3,xmm7
- paddd xmm4,xmm6 ; xmm4=data0=(00 10 20 30)
- paddd xmm7,xmm2 ; xmm7=data1=(01 11 21 31)
- psubd xmm5,xmm6 ; xmm5=data3=(03 13 23 33)
- psubd xmm3,xmm2 ; xmm3=data2=(02 12 22 32)
-
- paddd xmm4,xmm1
- paddd xmm7,xmm1
- psrad xmm4,DESCALE_P2_4
- psrad xmm7,DESCALE_P2_4
- paddd xmm5,xmm1
- paddd xmm3,xmm1
- psrad xmm5,DESCALE_P2_4
- psrad xmm3,DESCALE_P2_4
-
- packssdw xmm4,xmm3 ; xmm4=(00 10 20 30 02 12 22 32)
- packssdw xmm7,xmm5 ; xmm7=(01 11 21 31 03 13 23 33)
-
- movdqa xmm0,xmm4 ; transpose coefficients(phase 1)
- punpcklwd xmm4,xmm7 ; xmm4=(00 01 10 11 20 21 30 31)
- punpckhwd xmm0,xmm7 ; xmm0=(02 03 12 13 22 23 32 33)
-
- movdqa xmm6,xmm4 ; transpose coefficients(phase 2)
- punpckldq xmm4,xmm0 ; xmm4=(00 01 02 03 10 11 12 13)
- punpckhdq xmm6,xmm0 ; xmm6=(20 21 22 23 30 31 32 33)
-
- packsswb xmm4,xmm6 ; xmm4=(00 01 02 03 10 11 12 13 20 ..)
- paddb xmm4,[rel PB_CENTERJSAMP]
-
- pshufd xmm2,xmm4,0x39 ; xmm2=(10 11 12 13 20 21 22 23 30 ..)
- pshufd xmm1,xmm4,0x4E ; xmm1=(20 21 22 23 30 31 32 33 00 ..)
- pshufd xmm3,xmm4,0x93 ; xmm3=(30 31 32 33 00 01 02 03 10 ..)
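
The three pshufd immediates rotate the packed result by one, two and three
dwords so each row can be stored with a plain movd.  A C model of the shuffle
(illustrative):

    #include <stdint.h>

    /* pshufd: destination dword i = source dword ((imm >> 2*i) & 3).
       imm = 0x39 selects (1,2,3,0), 0x4E selects (2,3,0,1) and 0x93
       selects (3,0,1,2) -- rotations by one, two and three dwords. */
    static void pshufd_model(uint32_t dst[4], const uint32_t src[4], int imm)
    {
      for (int i = 0; i < 4; i++)
        dst[i] = src[(imm >> (2 * i)) & 3];
    }
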
-
- mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
- mov rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
- movd XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
- movd XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2
- mov rdx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
- mov rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
- movd XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1
- movd XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
-
- uncollect_args
- mov rsp,rbp ; rsp <- aligned rbp
- pop rsp ; rsp <- original rbp
- pop rbp
- ret
-
-
-; --------------------------------------------------------------------------
-;
-; Perform dequantization and inverse DCT on one block of coefficients,
-; producing a reduced-size 2x2 output block.
-;
-; GLOBAL(void)
-; jsimd_idct_2x2_sse2 (void *dct_table, JCOEFPTR coef_block,
-; JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-; r10 = void *dct_table
-; r11 = JCOEFPTR coef_block
-; r12 = JSAMPARRAY output_buf
-; r13 = JDIMENSION output_col
-
- align 16
- global EXTN(jsimd_idct_2x2_sse2)
-
-EXTN(jsimd_idct_2x2_sse2):
- push rbp
- mov rax,rsp
- mov rbp,rsp
- collect_args
- push rbx
-
- ; ---- Pass 1: process columns from input.
-
- mov rdx, r10 ; quantptr
- mov rsi, r11 ; inptr
-
- ; | input: | result: |
- ; | 00 01 ** 03 ** 05 ** 07 | |
- ; | 10 11 ** 13 ** 15 ** 17 | |
- ; | ** ** ** ** ** ** ** ** | |
- ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
- ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
- ; | 50 51 ** 53 ** 55 ** 57 | |
- ; | ** ** ** ** ** ** ** ** | |
- ; | 70 71 ** 73 ** 75 ** 77 | |
-
- ; -- Odd part
-
- movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
- movdqa xmm1, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
- pmullw xmm0, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
- movdqa xmm2, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
- movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
- pmullw xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-
- ; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37)
- ; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77)
-
- pcmpeqd xmm7,xmm7
- pslld xmm7,WORD_BIT ; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..}
-
- movdqa xmm4,xmm0 ; xmm4=(10 11 ** 13 ** 15 ** 17)
- movdqa xmm5,xmm2 ; xmm5=(50 51 ** 53 ** 55 ** 57)
- punpcklwd xmm4,xmm1 ; xmm4=(10 30 11 31 ** ** 13 33)
- punpcklwd xmm5,xmm3 ; xmm5=(50 70 51 71 ** ** 53 73)
- pmaddwd xmm4,[rel PW_F362_MF127]
- pmaddwd xmm5,[rel PW_F085_MF072]
-
- psrld xmm0,WORD_BIT ; xmm0=(11 -- 13 -- 15 -- 17 --)
- pand xmm1,xmm7 ; xmm1=(-- 31 -- 33 -- 35 -- 37)
- psrld xmm2,WORD_BIT ; xmm2=(51 -- 53 -- 55 -- 57 --)
- pand xmm3,xmm7 ; xmm3=(-- 71 -- 73 -- 75 -- 77)
- por xmm0,xmm1 ; xmm0=(11 31 13 33 15 35 17 37)
- por xmm2,xmm3 ; xmm2=(51 71 53 73 55 75 57 77)
- pmaddwd xmm0,[rel PW_F362_MF127]
- pmaddwd xmm2,[rel PW_F085_MF072]
-
- paddd xmm4,xmm5 ; xmm4=tmp0[col0 col1 **** col3]
- paddd xmm0,xmm2 ; xmm0=tmp0[col1 col3 col5 col7]
-
- ; -- Even part
-
- movdqa xmm6, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
- pmullw xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-
- ; xmm6=(00 01 ** 03 ** 05 ** 07)
-
- movdqa xmm1,xmm6 ; xmm1=(00 01 ** 03 ** 05 ** 07)
- pslld xmm6,WORD_BIT ; xmm6=(-- 00 -- ** -- ** -- **)
- pand xmm1,xmm7 ; xmm1=(-- 01 -- 03 -- 05 -- 07)
- psrad xmm6,(WORD_BIT-CONST_BITS-2) ; xmm6=tmp10[col0 **** **** ****]
- psrad xmm1,(WORD_BIT-CONST_BITS-2) ; xmm1=tmp10[col1 col3 col5 col7]
-
- ; -- Final output stage
-
- movdqa xmm3,xmm6
- movdqa xmm5,xmm1
- paddd xmm6,xmm4 ; xmm6=data0[col0 **** **** ****]=(A0 ** ** **)
- paddd xmm1,xmm0 ; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7)
- psubd xmm3,xmm4 ; xmm3=data1[col0 **** **** ****]=(B0 ** ** **)
- psubd xmm5,xmm0 ; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7)
-
- movdqa xmm2,[rel PD_DESCALE_P1_2] ; xmm2=[rel PD_DESCALE_P1_2]
-
- punpckldq xmm6,xmm3 ; xmm6=(A0 B0 ** **)
-
- movdqa xmm7,xmm1
- punpcklqdq xmm1,xmm5 ; xmm1=(A1 A3 B1 B3)
- punpckhqdq xmm7,xmm5 ; xmm7=(A5 A7 B5 B7)
-
- paddd xmm6,xmm2
- psrad xmm6,DESCALE_P1_2
-
- paddd xmm1,xmm2
- paddd xmm7,xmm2
- psrad xmm1,DESCALE_P1_2
- psrad xmm7,DESCALE_P1_2
-
- ; -- Prefetch the next coefficient block
-
- prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
- prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
- prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
- prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
-
- ; ---- Pass 2: process rows, store into output array.
-
- mov rdi, r12 ; (JSAMPROW *)
- mov eax, r13d
-
- ; | input:| result:|
- ; | A0 B0 | |
- ; | A1 B1 | C0 C1 |
- ; | A3 B3 | D0 D1 |
- ; | A5 B5 | |
- ; | A7 B7 | |
-
- ; -- Odd part
-
- packssdw xmm1,xmm1 ; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3)
- packssdw xmm7,xmm7 ; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7)
- pmaddwd xmm1,[rel PW_F362_MF127]
- pmaddwd xmm7,[rel PW_F085_MF072]
-
- paddd xmm1,xmm7 ; xmm1=tmp0[row0 row1 row0 row1]
-
- ; -- Even part
-
- pslld xmm6,(CONST_BITS+2) ; xmm6=tmp10[row0 row1 **** ****]
-
- ; -- Final output stage
-
- movdqa xmm4,xmm6
- paddd xmm6,xmm1 ; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **)
- psubd xmm4,xmm1 ; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **)
-
- punpckldq xmm6,xmm4 ; xmm6=(C0 D0 C1 D1)
-
- paddd xmm6,[rel PD_DESCALE_P2_2]
- psrad xmm6,DESCALE_P2_2
-
- packssdw xmm6,xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1)
- packsswb xmm6,xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..)
- paddb xmm6,[rel PB_CENTERJSAMP]
-
- pextrw ebx,xmm6,0x00 ; ebx=(C0 D0 -- --)
- pextrw ecx,xmm6,0x01 ; ecx=(C1 D1 -- --)
-
- mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
- mov rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
- mov WORD [rdx+rax*SIZEOF_JSAMPLE], bx
- mov WORD [rsi+rax*SIZEOF_JSAMPLE], cx
-
- pop rbx
- uncollect_args
- pop rbp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
diff --git a/media/libjpeg/simd/jidctred-sse2.asm b/media/libjpeg/simd/jidctred-sse2.asm
deleted file mode 100644
index 232d9838d6..0000000000
--- a/media/libjpeg/simd/jidctred-sse2.asm
+++ /dev/null
@@ -1,593 +0,0 @@
-;
-; jidctred.asm - reduced-size IDCT (SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler); it can
-; *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains inverse-DCT routines that produce reduced-size
-; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
-; The following code is based directly on the IJG's original jidctred.c;
-; see jidctred.c for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS 13
-%define PASS1_BITS 2
-
-%define DESCALE_P1_4 (CONST_BITS-PASS1_BITS+1)
-%define DESCALE_P2_4 (CONST_BITS+PASS1_BITS+3+1)
-%define DESCALE_P1_2 (CONST_BITS-PASS1_BITS+2)
-%define DESCALE_P2_2 (CONST_BITS+PASS1_BITS+3+2)
-
-%if CONST_BITS == 13
-F_0_211 equ 1730 ; FIX(0.211164243)
-F_0_509 equ 4176 ; FIX(0.509795579)
-F_0_601 equ 4926 ; FIX(0.601344887)
-F_0_720 equ 5906 ; FIX(0.720959822)
-F_0_765 equ 6270 ; FIX(0.765366865)
-F_0_850 equ 6967 ; FIX(0.850430095)
-F_0_899 equ 7373 ; FIX(0.899976223)
-F_1_061 equ 8697 ; FIX(1.061594337)
-F_1_272 equ 10426 ; FIX(1.272758580)
-F_1_451 equ 11893 ; FIX(1.451774981)
-F_1_847 equ 15137 ; FIX(1.847759065)
-F_2_172 equ 17799 ; FIX(2.172734803)
-F_2_562 equ 20995 ; FIX(2.562915447)
-F_3_624 equ 29692 ; FIX(3.624509785)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
-F_0_211 equ DESCALE( 226735879,30-CONST_BITS) ; FIX(0.211164243)
-F_0_509 equ DESCALE( 547388834,30-CONST_BITS) ; FIX(0.509795579)
-F_0_601 equ DESCALE( 645689155,30-CONST_BITS) ; FIX(0.601344887)
-F_0_720 equ DESCALE( 774124714,30-CONST_BITS) ; FIX(0.720959822)
-F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865)
-F_0_850 equ DESCALE( 913142361,30-CONST_BITS) ; FIX(0.850430095)
-F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223)
-F_1_061 equ DESCALE(1139878239,30-CONST_BITS) ; FIX(1.061594337)
-F_1_272 equ DESCALE(1366614119,30-CONST_BITS) ; FIX(1.272758580)
-F_1_451 equ DESCALE(1558831516,30-CONST_BITS) ; FIX(1.451774981)
-F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065)
-F_2_172 equ DESCALE(2332956230,30-CONST_BITS) ; FIX(2.172734803)
-F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447)
-F_3_624 equ DESCALE(3891787747,30-CONST_BITS) ; FIX(3.624509785)
-%endif
-
-; --------------------------------------------------------------------------
- SECTION SEG_CONST
-
- alignz 16
- global EXTN(jconst_idct_red_sse2)
-
-EXTN(jconst_idct_red_sse2):
-
-PW_F184_MF076 times 4 dw F_1_847,-F_0_765
-PW_F256_F089 times 4 dw F_2_562, F_0_899
-PW_F106_MF217 times 4 dw F_1_061,-F_2_172
-PW_MF060_MF050 times 4 dw -F_0_601,-F_0_509
-PW_F145_MF021 times 4 dw F_1_451,-F_0_211
-PW_F362_MF127 times 4 dw F_3_624,-F_1_272
-PW_F085_MF072 times 4 dw F_0_850,-F_0_720
-PD_DESCALE_P1_4 times 4 dd 1 << (DESCALE_P1_4-1)
-PD_DESCALE_P2_4 times 4 dd 1 << (DESCALE_P2_4-1)
-PD_DESCALE_P1_2 times 4 dd 1 << (DESCALE_P1_2-1)
-PD_DESCALE_P2_2 times 4 dd 1 << (DESCALE_P2_2-1)
-PB_CENTERJSAMP times 16 db CENTERJSAMPLE
-
- alignz 16
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
-;
-; Perform dequantization and inverse DCT on one block of coefficients,
-; producing a reduced-size 4x4 output block.
-;
-; GLOBAL(void)
-; jsimd_idct_4x4_sse2 (void *dct_table, JCOEFPTR coef_block,
-; JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-%define dct_table(b) (b)+8 ; void *dct_table
-%define coef_block(b) (b)+12 ; JCOEFPTR coef_block
-%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf
-%define output_col(b) (b)+20 ; JDIMENSION output_col
-
-%define original_ebp ebp+0
-%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM 2
-
- align 16
- global EXTN(jsimd_idct_4x4_sse2)
-
-EXTN(jsimd_idct_4x4_sse2):
- push ebp
- mov eax,esp ; eax = original ebp
- sub esp, byte 4
- and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [esp],eax
- mov ebp,esp ; ebp = aligned ebp
- lea esp, [wk(0)]
- pushpic ebx
-; push ecx ; unused
-; push edx ; need not be preserved
- push esi
- push edi
-
- get_GOT ebx ; get GOT address
-
- ; ---- Pass 1: process columns from input.
-
-; mov eax, [original_ebp]
- mov edx, POINTER [dct_table(eax)] ; quantptr
- mov esi, JCOEFPTR [coef_block(eax)] ; inptr
-
-%ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2
- mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
- or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
- jnz short .columnDCT
-
- movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
- movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
- por xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
- por xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
- por xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
- por xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
- por xmm0,xmm1
- packsswb xmm0,xmm0
- packsswb xmm0,xmm0
- movd eax,xmm0
- test eax,eax
- jnz short .columnDCT
-
- ; -- AC terms all zero
-
- movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
- pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
- psllw xmm0,PASS1_BITS
-
- movdqa xmm3,xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07)
- punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
- punpckhwd xmm3,xmm3 ; xmm3=(04 04 05 05 06 06 07 07)
-
- pshufd xmm1,xmm0,0x50 ; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01)
- pshufd xmm0,xmm0,0xFA ; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03)
- pshufd xmm6,xmm3,0x50 ; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05)
- pshufd xmm3,xmm3,0xFA ; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07)
-
- jmp near .column_end
- alignx 16,7
-%endif
-.columnDCT:
-
- ; -- Odd part
-
- movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
- movdqa xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
- pmullw xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- movdqa xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
- movdqa xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
- pmullw xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
- movdqa xmm4,xmm0
- movdqa xmm5,xmm0
- punpcklwd xmm4,xmm1
- punpckhwd xmm5,xmm1
- movdqa xmm0,xmm4
- movdqa xmm1,xmm5
- pmaddwd xmm4,[GOTOFF(ebx,PW_F256_F089)] ; xmm4=(tmp2L)
- pmaddwd xmm5,[GOTOFF(ebx,PW_F256_F089)] ; xmm5=(tmp2H)
- pmaddwd xmm0,[GOTOFF(ebx,PW_F106_MF217)] ; xmm0=(tmp0L)
- pmaddwd xmm1,[GOTOFF(ebx,PW_F106_MF217)] ; xmm1=(tmp0H)
-
- movdqa xmm6,xmm2
- movdqa xmm7,xmm2
- punpcklwd xmm6,xmm3
- punpckhwd xmm7,xmm3
- movdqa xmm2,xmm6
- movdqa xmm3,xmm7
- pmaddwd xmm6,[GOTOFF(ebx,PW_MF060_MF050)] ; xmm6=(tmp2L)
- pmaddwd xmm7,[GOTOFF(ebx,PW_MF060_MF050)] ; xmm7=(tmp2H)
- pmaddwd xmm2,[GOTOFF(ebx,PW_F145_MF021)] ; xmm2=(tmp0L)
- pmaddwd xmm3,[GOTOFF(ebx,PW_F145_MF021)] ; xmm3=(tmp0H)
-
- paddd xmm6,xmm4 ; xmm6=tmp2L
- paddd xmm7,xmm5 ; xmm7=tmp2H
- paddd xmm2,xmm0 ; xmm2=tmp0L
- paddd xmm3,xmm1 ; xmm3=tmp0H
-
- movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp0L
- movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=tmp0H
-
- ; -- Even part
-
- movdqa xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
- movdqa xmm5, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
- movdqa xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
- pmullw xmm4, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw xmm5, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw xmm0, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
- pxor xmm1,xmm1
- pxor xmm2,xmm2
- punpcklwd xmm1,xmm4 ; xmm1=tmp0L
- punpckhwd xmm2,xmm4 ; xmm2=tmp0H
- psrad xmm1,(16-CONST_BITS-1) ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1
- psrad xmm2,(16-CONST_BITS-1) ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1
-
- movdqa xmm3,xmm5 ; xmm5=in2=z2
- punpcklwd xmm5,xmm0 ; xmm0=in6=z3
- punpckhwd xmm3,xmm0
- pmaddwd xmm5,[GOTOFF(ebx,PW_F184_MF076)] ; xmm5=tmp2L
- pmaddwd xmm3,[GOTOFF(ebx,PW_F184_MF076)] ; xmm3=tmp2H
-
- movdqa xmm4,xmm1
- movdqa xmm0,xmm2
- paddd xmm1,xmm5 ; xmm1=tmp10L
- paddd xmm2,xmm3 ; xmm2=tmp10H
- psubd xmm4,xmm5 ; xmm4=tmp12L
- psubd xmm0,xmm3 ; xmm0=tmp12H
-
- ; -- Final output stage
-
- movdqa xmm5,xmm1
- movdqa xmm3,xmm2
- paddd xmm1,xmm6 ; xmm1=data0L
- paddd xmm2,xmm7 ; xmm2=data0H
- psubd xmm5,xmm6 ; xmm5=data3L
- psubd xmm3,xmm7 ; xmm3=data3H
-
- movdqa xmm6,[GOTOFF(ebx,PD_DESCALE_P1_4)] ; xmm6=[PD_DESCALE_P1_4]
-
- paddd xmm1,xmm6
- paddd xmm2,xmm6
- psrad xmm1,DESCALE_P1_4
- psrad xmm2,DESCALE_P1_4
- paddd xmm5,xmm6
- paddd xmm3,xmm6
- psrad xmm5,DESCALE_P1_4
- psrad xmm3,DESCALE_P1_4
-
- packssdw xmm1,xmm2 ; xmm1=data0=(00 01 02 03 04 05 06 07)
- packssdw xmm5,xmm3 ; xmm5=data3=(30 31 32 33 34 35 36 37)
-
- movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp0L
- movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp0H
-
- movdqa xmm2,xmm4
- movdqa xmm3,xmm0
- paddd xmm4,xmm7 ; xmm4=data1L
- paddd xmm0,xmm6 ; xmm0=data1H
- psubd xmm2,xmm7 ; xmm2=data2L
- psubd xmm3,xmm6 ; xmm3=data2H
-
- movdqa xmm7,[GOTOFF(ebx,PD_DESCALE_P1_4)] ; xmm7=[PD_DESCALE_P1_4]
-
- paddd xmm4,xmm7
- paddd xmm0,xmm7
- psrad xmm4,DESCALE_P1_4
- psrad xmm0,DESCALE_P1_4
- paddd xmm2,xmm7
- paddd xmm3,xmm7
- psrad xmm2,DESCALE_P1_4
- psrad xmm3,DESCALE_P1_4
-
- packssdw xmm4,xmm0 ; xmm4=data1=(10 11 12 13 14 15 16 17)
- packssdw xmm2,xmm3 ; xmm2=data2=(20 21 22 23 24 25 26 27)
-
- movdqa xmm6,xmm1 ; transpose coefficients(phase 1)
- punpcklwd xmm1,xmm4 ; xmm1=(00 10 01 11 02 12 03 13)
- punpckhwd xmm6,xmm4 ; xmm6=(04 14 05 15 06 16 07 17)
- movdqa xmm7,xmm2 ; transpose coefficients(phase 1)
- punpcklwd xmm2,xmm5 ; xmm2=(20 30 21 31 22 32 23 33)
- punpckhwd xmm7,xmm5 ; xmm7=(24 34 25 35 26 36 27 37)
-
- movdqa xmm0,xmm1 ; transpose coefficients(phase 2)
- punpckldq xmm1,xmm2 ; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31)
- punpckhdq xmm0,xmm2 ; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33)
- movdqa xmm3,xmm6 ; transpose coefficients(phase 2)
- punpckldq xmm6,xmm7 ; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35)
- punpckhdq xmm3,xmm7 ; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37)
-.column_end:
-
- ; -- Prefetch the next coefficient block
-
- prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
- prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
- prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
- prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
-
- ; ---- Pass 2: process rows, store into output array.
-
- mov eax, [original_ebp]
- mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
- mov eax, JDIMENSION [output_col(eax)]
-
- ; -- Even part
-
- pxor xmm4,xmm4
- punpcklwd xmm4,xmm1 ; xmm4=tmp0
- psrad xmm4,(16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1
-
- ; -- Odd part
-
- punpckhwd xmm1,xmm0
- punpckhwd xmm6,xmm3
- movdqa xmm5,xmm1
- movdqa xmm2,xmm6
- pmaddwd xmm1,[GOTOFF(ebx,PW_F256_F089)] ; xmm1=(tmp2)
- pmaddwd xmm6,[GOTOFF(ebx,PW_MF060_MF050)] ; xmm6=(tmp2)
- pmaddwd xmm5,[GOTOFF(ebx,PW_F106_MF217)] ; xmm5=(tmp0)
- pmaddwd xmm2,[GOTOFF(ebx,PW_F145_MF021)] ; xmm2=(tmp0)
-
- paddd xmm6,xmm1 ; xmm6=tmp2
- paddd xmm2,xmm5 ; xmm2=tmp0
-
- ; -- Even part
-
- punpcklwd xmm0,xmm3
- pmaddwd xmm0,[GOTOFF(ebx,PW_F184_MF076)] ; xmm0=tmp2
-
- movdqa xmm7,xmm4
- paddd xmm4,xmm0 ; xmm4=tmp10
- psubd xmm7,xmm0 ; xmm7=tmp12
-
- ; -- Final output stage
-
- movdqa xmm1,[GOTOFF(ebx,PD_DESCALE_P2_4)] ; xmm1=[PD_DESCALE_P2_4]
-
- movdqa xmm5,xmm4
- movdqa xmm3,xmm7
- paddd xmm4,xmm6 ; xmm4=data0=(00 10 20 30)
- paddd xmm7,xmm2 ; xmm7=data1=(01 11 21 31)
- psubd xmm5,xmm6 ; xmm5=data3=(03 13 23 33)
- psubd xmm3,xmm2 ; xmm3=data2=(02 12 22 32)
-
- paddd xmm4,xmm1
- paddd xmm7,xmm1
- psrad xmm4,DESCALE_P2_4
- psrad xmm7,DESCALE_P2_4
- paddd xmm5,xmm1
- paddd xmm3,xmm1
- psrad xmm5,DESCALE_P2_4
- psrad xmm3,DESCALE_P2_4
-
- packssdw xmm4,xmm3 ; xmm4=(00 10 20 30 02 12 22 32)
- packssdw xmm7,xmm5 ; xmm7=(01 11 21 31 03 13 23 33)
-
- movdqa xmm0,xmm4 ; transpose coefficients(phase 1)
- punpcklwd xmm4,xmm7 ; xmm4=(00 01 10 11 20 21 30 31)
- punpckhwd xmm0,xmm7 ; xmm0=(02 03 12 13 22 23 32 33)
-
- movdqa xmm6,xmm4 ; transpose coefficients(phase 2)
- punpckldq xmm4,xmm0 ; xmm4=(00 01 02 03 10 11 12 13)
- punpckhdq xmm6,xmm0 ; xmm6=(20 21 22 23 30 31 32 33)
-
- packsswb xmm4,xmm6 ; xmm4=(00 01 02 03 10 11 12 13 20 ..)
- paddb xmm4,[GOTOFF(ebx,PB_CENTERJSAMP)]
-
- pshufd xmm2,xmm4,0x39 ; xmm2=(10 11 12 13 20 21 22 23 30 ..)
- pshufd xmm1,xmm4,0x4E ; xmm1=(20 21 22 23 30 31 32 33 00 ..)
- pshufd xmm3,xmm4,0x93 ; xmm3=(30 31 32 33 00 01 02 03 10 ..)
-
- mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
- mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
- movd XMM_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
- movd XMM_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm2
- mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
- mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
- movd XMM_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm1
- movd XMM_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
-
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; unused
- poppic ebx
- mov esp,ebp ; esp <- aligned ebp
- pop esp ; esp <- original ebp
- pop ebp
- ret
-
-
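A note on the shortcut at the top of the pass-1 code above: the NO_ZERO_COLUMN_TEST path ORs together rows 1, 2, 3, 5, 6 and 7 of the coefficient block; row 4 is skipped because the 4-point kernel never reads it. When every tested row is zero, each column's 1-D transform is a constant, and pass 1 collapses to scaling the dequantized DC row (the psllw/punpcklwd/pshufd sequence above). A minimal C sketch of the same test, assuming a row-major int16_t[64] coefficient block (the helper name is illustrative, not part of this patch):

#include <stdint.h>

/* Nonzero when all AC rows that the 4x4 kernel reads are zero, so that
 * pass 1 can reduce to scaling the DC row.  Row 4 is neither read nor
 * tested, matching the assembly above. */
static int ac_rows_all_zero_4x4(const int16_t coef[64])
{
  static const int rows[6] = { 1, 2, 3, 5, 6, 7 };
  int i, c;

  for (i = 0; i < 6; i++)
    for (c = 0; c < 8; c++)
      if (coef[8 * rows[i] + c] != 0)
        return 0;
  return 1;
}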
-; --------------------------------------------------------------------------
-;
-; Perform dequantization and inverse DCT on one block of coefficients,
-; producing a reduced-size 2x2 output block.
-;
-; GLOBAL(void)
-; jsimd_idct_2x2_sse2 (void *dct_table, JCOEFPTR coef_block,
-; JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-%define dct_table(b) (b)+8 ; void *dct_table
-%define coef_block(b) (b)+12 ; JCOEFPTR coef_block
-%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf
-%define output_col(b) (b)+20 ; JDIMENSION output_col
-
- align 16
- global EXTN(jsimd_idct_2x2_sse2)
-
-EXTN(jsimd_idct_2x2_sse2):
- push ebp
- mov ebp,esp
- push ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- get_GOT ebx ; get GOT address
-
- ; ---- Pass 1: process columns from input.
-
- mov edx, POINTER [dct_table(ebp)] ; quantptr
- mov esi, JCOEFPTR [coef_block(ebp)] ; inptr
-
- ; | input: | result: |
- ; | 00 01 ** 03 ** 05 ** 07 | |
- ; | 10 11 ** 13 ** 15 ** 17 | |
- ; | ** ** ** ** ** ** ** ** | |
- ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
- ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
- ; | 50 51 ** 53 ** 55 ** 57 | |
- ; | ** ** ** ** ** ** ** ** | |
- ; | 70 71 ** 73 ** 75 ** 77 | |
-
- ; -- Odd part
-
- movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
- movdqa xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
- pmullw xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- movdqa xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
- movdqa xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
- pmullw xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
- pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
- ; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37)
- ; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77)
-
- pcmpeqd xmm7,xmm7
- pslld xmm7,WORD_BIT ; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..}
-
- movdqa xmm4,xmm0 ; xmm4=(10 11 ** 13 ** 15 ** 17)
- movdqa xmm5,xmm2 ; xmm5=(50 51 ** 53 ** 55 ** 57)
- punpcklwd xmm4,xmm1 ; xmm4=(10 30 11 31 ** ** 13 33)
- punpcklwd xmm5,xmm3 ; xmm5=(50 70 51 71 ** ** 53 73)
- pmaddwd xmm4,[GOTOFF(ebx,PW_F362_MF127)]
- pmaddwd xmm5,[GOTOFF(ebx,PW_F085_MF072)]
-
- psrld xmm0,WORD_BIT ; xmm0=(11 -- 13 -- 15 -- 17 --)
- pand xmm1,xmm7 ; xmm1=(-- 31 -- 33 -- 35 -- 37)
- psrld xmm2,WORD_BIT ; xmm2=(51 -- 53 -- 55 -- 57 --)
- pand xmm3,xmm7 ; xmm3=(-- 71 -- 73 -- 75 -- 77)
- por xmm0,xmm1 ; xmm0=(11 31 13 33 15 35 17 37)
- por xmm2,xmm3 ; xmm2=(51 71 53 73 55 75 57 77)
- pmaddwd xmm0,[GOTOFF(ebx,PW_F362_MF127)]
- pmaddwd xmm2,[GOTOFF(ebx,PW_F085_MF072)]
-
- paddd xmm4,xmm5 ; xmm4=tmp0[col0 col1 **** col3]
- paddd xmm0,xmm2 ; xmm0=tmp0[col1 col3 col5 col7]
-
- ; -- Even part
-
- movdqa xmm6, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
- pmullw xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
- ; xmm6=(00 01 ** 03 ** 05 ** 07)
-
- movdqa xmm1,xmm6 ; xmm1=(00 01 ** 03 ** 05 ** 07)
- pslld xmm6,WORD_BIT ; xmm6=(-- 00 -- ** -- ** -- **)
- pand xmm1,xmm7 ; xmm1=(-- 01 -- 03 -- 05 -- 07)
- psrad xmm6,(WORD_BIT-CONST_BITS-2) ; xmm6=tmp10[col0 **** **** ****]
- psrad xmm1,(WORD_BIT-CONST_BITS-2) ; xmm1=tmp10[col1 col3 col5 col7]
-
- ; -- Final output stage
-
- movdqa xmm3,xmm6
- movdqa xmm5,xmm1
- paddd xmm6,xmm4 ; xmm6=data0[col0 **** **** ****]=(A0 ** ** **)
- paddd xmm1,xmm0 ; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7)
- psubd xmm3,xmm4 ; xmm3=data1[col0 **** **** ****]=(B0 ** ** **)
- psubd xmm5,xmm0 ; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7)
-
- movdqa xmm2,[GOTOFF(ebx,PD_DESCALE_P1_2)] ; xmm2=[PD_DESCALE_P1_2]
-
- punpckldq xmm6,xmm3 ; xmm6=(A0 B0 ** **)
-
- movdqa xmm7,xmm1
- punpcklqdq xmm1,xmm5 ; xmm1=(A1 A3 B1 B3)
- punpckhqdq xmm7,xmm5 ; xmm7=(A5 A7 B5 B7)
-
- paddd xmm6,xmm2
- psrad xmm6,DESCALE_P1_2
-
- paddd xmm1,xmm2
- paddd xmm7,xmm2
- psrad xmm1,DESCALE_P1_2
- psrad xmm7,DESCALE_P1_2
-
- ; -- Prefetch the next coefficient block
-
- prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
- prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
- prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
- prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
-
- ; ---- Pass 2: process rows, store into output array.
-
- mov edi, JSAMPARRAY [output_buf(ebp)] ; (JSAMPROW *)
- mov eax, JDIMENSION [output_col(ebp)]
-
- ; | input:| result:|
- ; | A0 B0 | |
- ; | A1 B1 | C0 C1 |
- ; | A3 B3 | D0 D1 |
- ; | A5 B5 | |
- ; | A7 B7 | |
-
- ; -- Odd part
-
- packssdw xmm1,xmm1 ; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3)
- packssdw xmm7,xmm7 ; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7)
- pmaddwd xmm1,[GOTOFF(ebx,PW_F362_MF127)]
- pmaddwd xmm7,[GOTOFF(ebx,PW_F085_MF072)]
-
- paddd xmm1,xmm7 ; xmm1=tmp0[row0 row1 row0 row1]
-
- ; -- Even part
-
- pslld xmm6,(CONST_BITS+2) ; xmm6=tmp10[row0 row1 **** ****]
-
- ; -- Final output stage
-
- movdqa xmm4,xmm6
- paddd xmm6,xmm1 ; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **)
- psubd xmm4,xmm1 ; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **)
-
- punpckldq xmm6,xmm4 ; xmm6=(C0 D0 C1 D1)
-
- paddd xmm6,[GOTOFF(ebx,PD_DESCALE_P2_2)]
- psrad xmm6,DESCALE_P2_2
-
- packssdw xmm6,xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1)
- packsswb xmm6,xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..)
- paddb xmm6,[GOTOFF(ebx,PB_CENTERJSAMP)]
-
- pextrw ebx,xmm6,0x00 ; ebx=(C0 D0 -- --)
- pextrw ecx,xmm6,0x01 ; ecx=(C1 D1 -- --)
-
- mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
- mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
- mov WORD [edx+eax*SIZEOF_JSAMPLE], bx
- mov WORD [esi+eax*SIZEOF_JSAMPLE], cx
-
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- pop ebx
- pop ebp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
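For reference, the arithmetic that jsimd_idct_2x2_sse2 implements in 13-bit fixed point can be written as a small floating-point model. fold(u) below reproduces the constant pairs used above (PW_F362_MF127 = 3.6245/-1.2727 and PW_F085_MF072 = 0.8504/-0.7210, before scaling by 2^13); it evaluates to zero for the even frequencies 2, 4 and 6, which is why the assembly never loads those rows or columns. The final division by 128 corresponds to the combined DESCALE_P1_2/DESCALE_P2_2 shifts, and the +128 to the paddb with PB_CENTERJSAMP. A sketch under those assumptions (names are illustrative; this is a model of the dataflow, not the patched code):

#include <math.h>
#include <stdint.h>

#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif

/* Folds the 8-point cosine basis at frequency u onto a 2-sample grid;
 * gives 3.6245, -1.2727, 0.8504, -0.7210 for u = 1, 3, 5, 7 and zero
 * for u = 2, 4, 6. */
static double fold(int u)
{
  double s = 0.0;
  int x;

  for (x = 0; x < 4; x++)
    s += cos((2 * x + 1) * u * M_PI / 16.0);
  return sqrt(2.0) * s;
}

/* coef: 8x8 quantized coefficients, row-major; quant: dequantization
 * multipliers; out: 2x2 samples, centered on 128 like JSAMPLEs. */
static void idct_2x2_model(const int16_t coef[64], const double quant[64],
                           uint8_t out[2][2])
{
  static const int odd[4] = { 1, 3, 5, 7 };
  double w[2][8] = { { 0.0 } };
  int c, i, r, s;

  /* Pass 1: columns 0, 1, 3, 5, 7 -- the only ones pass 2 reads. */
  for (c = 0; c < 8; c += (c == 0) ? 1 : 2) {
    double dc = coef[c] * quant[c], t = 0.0;
    for (i = 0; i < 4; i++)
      t += fold(odd[i]) * coef[8 * odd[i] + c] * quant[8 * odd[i] + c];
    w[0][c] = 4.0 * dc + t;   /* row A in the diagram above */
    w[1][c] = 4.0 * dc - t;   /* row B */
  }
  /* Pass 2: rows; 1/128 matches the total descaling of the two passes. */
  for (r = 0; r < 2; r++) {
    double t = 0.0;
    for (i = 0; i < 4; i++)
      t += fold(odd[i]) * w[r][odd[i]];
    for (s = 0; s < 2; s++) {
      double v = (4.0 * w[r][0] + (s ? -t : t)) / 128.0 + 128.0;
      out[r][s] = (uint8_t)(v < 0.0 ? 0.0 : v > 255.0 ? 255.0 : v + 0.5);
    }
  }
}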
diff --git a/media/libjpeg/simd/jpeg_nbits_table.inc b/media/libjpeg/simd/jpeg_nbits_table.inc
deleted file mode 100644
index cbc69904e1..0000000000
--- a/media/libjpeg/simd/jpeg_nbits_table.inc
+++ /dev/null
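The file deleted below is a 64 KB lookup table: jpeg_nbits_table[v] is the bit length of v (0 -> 0, 1 -> 1, 2..3 -> 2, 4..7 -> 3, and so on), i.e. the magnitude-category size the Huffman encoders need for each coefficient value. A minimal scalar equivalent, for reference only (this patch just deletes the table; the function name is illustrative):

#include <stdint.h>

/* Number of bits needed to represent v -- what each table entry encodes. */
static int jpeg_nbits(uint16_t v)
{
  int n = 0;

  while (v) {
    n++;
    v >>= 1;
  }
  return n;
}

With GCC or Clang the same value can be computed branch-free for nonzero v as 32 - __builtin_clz(v).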
@@ -1,4097 +0,0 @@
-jpeg_nbits_table db \
- 0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, \
- 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, \
- 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, \
- 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, \
- 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, \
- 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, \
- 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, \
- 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, \
- 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, \
- 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, \
- 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, \
- 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, \
- 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, \
- 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, \
- 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, \
- 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, \
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
[... 406 further identical removed lines of "14" entries (16 per line) elided ...]
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
  [... 593 further identical removed lines ("15" × 16, line-continued) elided ...]
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
  [... 346 further identical removed lines ("16" × 16, line-continued) elided ...]
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
[... 942 further identical deleted lines omitted: each repeats "16" sixteen times, continuing the all-16 run of this deleted table ...]
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
diff --git a/media/libjpeg/simd/jquant-3dn.asm b/media/libjpeg/simd/jquant-3dn.asm
deleted file mode 100644
index 0b4164b261..0000000000
--- a/media/libjpeg/simd/jquant-3dn.asm
+++ /dev/null
@@ -1,232 +0,0 @@
-;
-; jquant.asm - sample data conversion and quantization (3DNow! & MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
-;
-; Load data into workspace, applying unsigned->signed conversion
-;
-; GLOBAL(void)
-; jsimd_convsamp_float_3dnow (JSAMPARRAY sample_data, JDIMENSION start_col,
-; FAST_FLOAT *workspace);
-;
-
-%define sample_data ebp+8 ; JSAMPARRAY sample_data
-%define start_col ebp+12 ; JDIMENSION start_col
-%define workspace ebp+16 ; FAST_FLOAT *workspace
-
- align 16
- global EXTN(jsimd_convsamp_float_3dnow)
-
-EXTN(jsimd_convsamp_float_3dnow):
- push ebp
- mov ebp,esp
- push ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- pcmpeqw mm7,mm7
- psllw mm7,7
- packsswb mm7,mm7 ; mm7 = PB_CENTERJSAMPLE (0x808080..)
-
- mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *)
- mov eax, JDIMENSION [start_col]
- mov edi, POINTER [workspace] ; (FAST_FLOAT *)
- mov ecx, DCTSIZE/2
- alignx 16,7
-.convloop:
- mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
- mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
-
- movq mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]
- movq mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]
-
- psubb mm0,mm7 ; mm0=(01234567)
- psubb mm1,mm7 ; mm1=(89ABCDEF)
-
- punpcklbw mm2,mm0 ; mm2=(*0*1*2*3)
- punpckhbw mm0,mm0 ; mm0=(*4*5*6*7)
- punpcklbw mm3,mm1 ; mm3=(*8*9*A*B)
- punpckhbw mm1,mm1 ; mm1=(*C*D*E*F)
-
- punpcklwd mm4,mm2 ; mm4=(***0***1)
- punpckhwd mm2,mm2 ; mm2=(***2***3)
- punpcklwd mm5,mm0 ; mm5=(***4***5)
- punpckhwd mm0,mm0 ; mm0=(***6***7)
-
- psrad mm4,(DWORD_BIT-BYTE_BIT) ; mm4=(01)
- psrad mm2,(DWORD_BIT-BYTE_BIT) ; mm2=(23)
- pi2fd mm4,mm4
- pi2fd mm2,mm2
- psrad mm5,(DWORD_BIT-BYTE_BIT) ; mm5=(45)
- psrad mm0,(DWORD_BIT-BYTE_BIT) ; mm0=(67)
- pi2fd mm5,mm5
- pi2fd mm0,mm0
-
- movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm4
- movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm2
- movq MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm5
- movq MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
-
- punpcklwd mm6,mm3 ; mm6=(***8***9)
- punpckhwd mm3,mm3 ; mm3=(***A***B)
- punpcklwd mm4,mm1 ; mm4=(***C***D)
- punpckhwd mm1,mm1 ; mm1=(***E***F)
-
- psrad mm6,(DWORD_BIT-BYTE_BIT) ; mm6=(89)
- psrad mm3,(DWORD_BIT-BYTE_BIT) ; mm3=(AB)
- pi2fd mm6,mm6
- pi2fd mm3,mm3
- psrad mm4,(DWORD_BIT-BYTE_BIT) ; mm4=(CD)
- psrad mm1,(DWORD_BIT-BYTE_BIT) ; mm1=(EF)
- pi2fd mm4,mm4
- pi2fd mm1,mm1
-
- movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm6
- movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm3
- movq MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm4
- movq MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1
-
- add esi, byte 2*SIZEOF_JSAMPROW
- add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
- dec ecx
- jnz near .convloop
-
- femms ; empty MMX/3DNow! state
-
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- pop ebx
- pop ebp
- ret
-
-
-; --------------------------------------------------------------------------
-;
-; Quantize/descale the coefficients, and store into coef_block
-;
-; GLOBAL(void)
-; jsimd_quantize_float_3dnow (JCOEFPTR coef_block, FAST_FLOAT *divisors,
-; FAST_FLOAT *workspace);
-;
-
-%define coef_block ebp+8 ; JCOEFPTR coef_block
-%define divisors ebp+12 ; FAST_FLOAT *divisors
-%define workspace ebp+16 ; FAST_FLOAT *workspace
-
- align 16
- global EXTN(jsimd_quantize_float_3dnow)
-
-EXTN(jsimd_quantize_float_3dnow):
- push ebp
- mov ebp,esp
-; push ebx ; unused
-; push ecx ; unused
-; push edx ; need not be preserved
- push esi
- push edi
-
- mov eax, 0x4B400000 ; (float)0x00C00000 (rndint_magic)
- movd mm7,eax
- punpckldq mm7,mm7 ; mm7={12582912.0F 12582912.0F}
-
- mov esi, POINTER [workspace]
- mov edx, POINTER [divisors]
- mov edi, JCOEFPTR [coef_block]
- mov eax, DCTSIZE2/16
- alignx 16,7
-.quantloop:
- movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
- movq mm1, MMWORD [MMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
- pfmul mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
- pfmul mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
- movq mm2, MMWORD [MMBLOCK(0,2,esi,SIZEOF_FAST_FLOAT)]
- movq mm3, MMWORD [MMBLOCK(0,3,esi,SIZEOF_FAST_FLOAT)]
- pfmul mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)]
- pfmul mm3, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)]
-
- pfadd mm0,mm7 ; mm0=(00 ** 01 **)
- pfadd mm1,mm7 ; mm1=(02 ** 03 **)
- pfadd mm2,mm7 ; mm2=(04 ** 05 **)
- pfadd mm3,mm7 ; mm3=(06 ** 07 **)
-
- movq mm4,mm0
- punpcklwd mm0,mm1 ; mm0=(00 02 ** **)
- punpckhwd mm4,mm1 ; mm4=(01 03 ** **)
- movq mm5,mm2
- punpcklwd mm2,mm3 ; mm2=(04 06 ** **)
- punpckhwd mm5,mm3 ; mm5=(05 07 ** **)
-
- punpcklwd mm0,mm4 ; mm0=(00 01 02 03)
- punpcklwd mm2,mm5 ; mm2=(04 05 06 07)
-
- movq mm6, MMWORD [MMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
- movq mm1, MMWORD [MMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
- pfmul mm6, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
- pfmul mm1, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
- movq mm3, MMWORD [MMBLOCK(1,2,esi,SIZEOF_FAST_FLOAT)]
- movq mm4, MMWORD [MMBLOCK(1,3,esi,SIZEOF_FAST_FLOAT)]
- pfmul mm3, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)]
- pfmul mm4, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)]
-
- pfadd mm6,mm7 ; mm6=(10 ** 11 **)
- pfadd mm1,mm7 ; mm1=(12 ** 13 **)
- pfadd mm3,mm7 ; mm3=(14 ** 15 **)
- pfadd mm4,mm7 ; mm4=(16 ** 17 **)
-
- movq mm5,mm6
- punpcklwd mm6,mm1 ; mm6=(10 12 ** **)
- punpckhwd mm5,mm1 ; mm5=(11 13 ** **)
- movq mm1,mm3
- punpcklwd mm3,mm4 ; mm3=(14 16 ** **)
- punpckhwd mm1,mm4 ; mm1=(15 17 ** **)
-
- punpcklwd mm6,mm5 ; mm6=(10 11 12 13)
- punpcklwd mm3,mm1 ; mm3=(14 15 16 17)
-
- movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
- movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm2
- movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm6
- movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3
-
- add esi, byte 16*SIZEOF_FAST_FLOAT
- add edx, byte 16*SIZEOF_FAST_FLOAT
- add edi, byte 16*SIZEOF_JCOEF
- dec eax
- jnz near .quantloop
-
- femms ; empty MMX/3DNow! state
-
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; unused
-; pop ebx ; unused
- pop ebp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
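
The float path above relies on a classic bit trick rather than a dedicated float-to-int instruction: the rndint_magic constant it loads (0x4B400000 == 12582912.0f == 1.5 * 2^23) is added to each product, which forces the rounded integer into the low mantissa bits, and the punpcklwd/punpckhwd shuffles then harvest those low 16 bits from each 32-bit lane. A minimal scalar C sketch of the trick (not part of the diff; assumes the default round-to-nearest FPU mode and a result that fits in 16 bits):

    #include <stdint.h>
    #include <string.h>

    static int16_t round_via_magic(float x)
    {
      float biased = x + 12582912.0f;       /* 1.5 * 2^23 (rndint_magic) */
      uint32_t bits;
      memcpy(&bits, &biased, sizeof(bits)); /* reinterpret without UB */
      return (int16_t)bits;                 /* low 16 mantissa bits hold round(x) */
    }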
diff --git a/media/libjpeg/simd/jquant-mmx.asm b/media/libjpeg/simd/jquant-mmx.asm
deleted file mode 100644
index aed6071bfc..0000000000
--- a/media/libjpeg/simd/jquant-mmx.asm
+++ /dev/null
@@ -1,273 +0,0 @@
-;
-; jquant.asm - sample data conversion and quantization (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
-;
-; Load data into workspace, applying unsigned->signed conversion
-;
-; GLOBAL(void)
-; jsimd_convsamp_mmx (JSAMPARRAY sample_data, JDIMENSION start_col,
-; DCTELEM *workspace);
-;
-
-%define sample_data ebp+8 ; JSAMPARRAY sample_data
-%define start_col ebp+12 ; JDIMENSION start_col
-%define workspace ebp+16 ; DCTELEM *workspace
-
- align 16
- global EXTN(jsimd_convsamp_mmx)
-
-EXTN(jsimd_convsamp_mmx):
- push ebp
- mov ebp,esp
- push ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- pxor mm6,mm6 ; mm6=(all 0's)
- pcmpeqw mm7,mm7
- psllw mm7,7 ; mm7={0xFF80 0xFF80 0xFF80 0xFF80}
-
- mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *)
- mov eax, JDIMENSION [start_col]
- mov edi, POINTER [workspace] ; (DCTELEM *)
- mov ecx, DCTSIZE/4
- alignx 16,7
-.convloop:
- mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
- mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
-
- movq mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; mm0=(01234567)
- movq mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE] ; mm1=(89ABCDEF)
-
- mov ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *)
- mov edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *)
-
- movq mm2, MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; mm2=(GHIJKLMN)
- movq mm3, MMWORD [edx+eax*SIZEOF_JSAMPLE] ; mm3=(OPQRSTUV)
-
- movq mm4,mm0
- punpcklbw mm0,mm6 ; mm0=(0123)
- punpckhbw mm4,mm6 ; mm4=(4567)
- movq mm5,mm1
- punpcklbw mm1,mm6 ; mm1=(89AB)
- punpckhbw mm5,mm6 ; mm5=(CDEF)
-
- paddw mm0,mm7
- paddw mm4,mm7
- paddw mm1,mm7
- paddw mm5,mm7
-
- movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0
- movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm4
- movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_DCTELEM)], mm1
- movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_DCTELEM)], mm5
-
- movq mm0,mm2
- punpcklbw mm2,mm6 ; mm2=(GHIJ)
- punpckhbw mm0,mm6 ; mm0=(KLMN)
- movq mm4,mm3
- punpcklbw mm3,mm6 ; mm3=(OPQR)
- punpckhbw mm4,mm6 ; mm4=(STUV)
-
- paddw mm2,mm7
- paddw mm0,mm7
- paddw mm3,mm7
- paddw mm4,mm7
-
- movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_DCTELEM)], mm2
- movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_DCTELEM)], mm0
- movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_DCTELEM)], mm3
- movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_DCTELEM)], mm4
-
- add esi, byte 4*SIZEOF_JSAMPROW
- add edi, byte 4*DCTSIZE*SIZEOF_DCTELEM
- dec ecx
- jnz short .convloop
-
- emms ; empty MMX state
-
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- pop ebx
- pop ebp
- ret
-
-; --------------------------------------------------------------------------
-;
-; Quantize/descale the coefficients, and store into coef_block
-;
-; This implementation is based on an algorithm described in
-; "How to optimize for the Pentium family of microprocessors"
-; (http://www.agner.org/assem/).
-;
-; GLOBAL(void)
-; jsimd_quantize_mmx (JCOEFPTR coef_block, DCTELEM *divisors,
-; DCTELEM *workspace);
-;
-
-%define RECIPROCAL(m,n,b) MMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
-%define CORRECTION(m,n,b) MMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
-%define SCALE(m,n,b) MMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)
-%define SHIFT(m,n,b) MMBLOCK(DCTSIZE*3+(m),(n),(b),SIZEOF_DCTELEM)
-
-%define coef_block ebp+8 ; JCOEFPTR coef_block
-%define divisors ebp+12 ; DCTELEM *divisors
-%define workspace ebp+16 ; DCTELEM *workspace
-
- align 16
- global EXTN(jsimd_quantize_mmx)
-
-EXTN(jsimd_quantize_mmx):
- push ebp
- mov ebp,esp
-; push ebx ; unused
-; push ecx ; unused
-; push edx ; need not be preserved
- push esi
- push edi
-
- mov esi, POINTER [workspace]
- mov edx, POINTER [divisors]
- mov edi, JCOEFPTR [coef_block]
- mov ah, 2
- alignx 16,7
-.quantloop1:
- mov al, DCTSIZE2/8/2
- alignx 16,7
-.quantloop2:
- movq mm2, MMWORD [MMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
- movq mm3, MMWORD [MMBLOCK(0,1,esi,SIZEOF_DCTELEM)]
-
- movq mm0,mm2
- movq mm1,mm3
-
- psraw mm2,(WORD_BIT-1) ; -1 if value < 0, 0 otherwise
- psraw mm3,(WORD_BIT-1)
-
- pxor mm0,mm2 ; val = -val
- pxor mm1,mm3
- psubw mm0,mm2
- psubw mm1,mm3
-
- ;
- ; MMX is an annoyingly crappy instruction set. It has two
- ; misfeatures that are causing problems here:
- ;
- ; - All multiplications are signed.
- ;
- ; - The second operand for the shifts is not treated as packed.
- ;
- ;
- ; We work around the first problem by implementing this algorithm:
- ;
- ; unsigned long unsigned_multiply(unsigned short x, unsigned short y)
- ; {
- ; enum { SHORT_BIT = 16 };
- ; signed short sx = (signed short) x;
- ; signed short sy = (signed short) y;
- ; signed long sz;
- ;
- ; sz = (long) sx * (long) sy; /* signed multiply */
- ;
- ; if (sx < 0) sz += (long) sy << SHORT_BIT;
- ; if (sy < 0) sz += (long) sx << SHORT_BIT;
- ;
- ; return (unsigned long) sz;
- ; }
- ;
- ; (note that a negative sx adds _sy_ and vice versa)
- ;
- ; For the second problem, we replace the shift by a multiplication.
- ; Unfortunately that means we have to deal with the signed issue again.
- ;
-
- paddw mm0, MMWORD [CORRECTION(0,0,edx)] ; correction + roundfactor
- paddw mm1, MMWORD [CORRECTION(0,1,edx)]
-
- movq mm4,mm0 ; store current value for later
- movq mm5,mm1
- pmulhw mm0, MMWORD [RECIPROCAL(0,0,edx)] ; reciprocal
- pmulhw mm1, MMWORD [RECIPROCAL(0,1,edx)]
- paddw mm0,mm4 ; reciprocal is always negative (MSB=1),
- paddw mm1,mm5 ; so we always need to add the initial value
- ; (input value is never negative as we
- ; inverted it at the start of this routine)
-
- ; here it gets a bit tricky as both scale
- ; and mm0/mm1 can be negative
- movq mm6, MMWORD [SCALE(0,0,edx)] ; scale
- movq mm7, MMWORD [SCALE(0,1,edx)]
- movq mm4,mm0
- movq mm5,mm1
- pmulhw mm0,mm6
- pmulhw mm1,mm7
-
- psraw mm6,(WORD_BIT-1) ; determine if scale is negative
- psraw mm7,(WORD_BIT-1)
-
- pand mm6,mm4 ; and add input if it is
- pand mm7,mm5
- paddw mm0,mm6
- paddw mm1,mm7
-
- psraw mm4,(WORD_BIT-1) ; then check if negative input
- psraw mm5,(WORD_BIT-1)
-
- pand mm4, MMWORD [SCALE(0,0,edx)] ; and add scale if it is
- pand mm5, MMWORD [SCALE(0,1,edx)]
- paddw mm0,mm4
- paddw mm1,mm5
-
- pxor mm0,mm2 ; val = -val
- pxor mm1,mm3
- psubw mm0,mm2
- psubw mm1,mm3
-
- movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0
- movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm1
-
- add esi, byte 8*SIZEOF_DCTELEM
- add edx, byte 8*SIZEOF_DCTELEM
- add edi, byte 8*SIZEOF_JCOEF
- dec al
- jnz near .quantloop2
- dec ah
- jnz near .quantloop1 ; to avoid branch misprediction
-
- emms ; empty MMX state
-
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; unused
-; pop ebx ; unused
- pop ebp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
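
The pxor/psubw pairs at the top and bottom of the quantize loop above are the branch-free sign-magnitude trick: psraw by WORD_BIT-1 yields an all-ones mask for negative words, and (x ^ mask) - mask then negates exactly those lanes, both to take the absolute value on entry and to restore the sign on exit. A scalar equivalent (illustrative only; the SIMD code performs this on four words at a time):

    #include <stdint.h>

    static int16_t abs_branchless(int16_t x, int16_t *sign_mask)
    {
      int16_t s = (int16_t)(x < 0 ? -1 : 0); /* what psraw x,(WORD_BIT-1) computes */
      *sign_mask = s;
      return (int16_t)((x ^ s) - s);         /* -x if negative, x otherwise */
    }

    static int16_t restore_sign(int16_t mag, int16_t sign_mask)
    {
      return (int16_t)((mag ^ sign_mask) - sign_mask); /* same transform undoes it */
    }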
diff --git a/media/libjpeg/simd/jquant-sse.asm b/media/libjpeg/simd/jquant-sse.asm
deleted file mode 100644
index 1baf88f257..0000000000
--- a/media/libjpeg/simd/jquant-sse.asm
+++ /dev/null
@@ -1,210 +0,0 @@
-;
-; jquant.asm - sample data conversion and quantization (SSE & MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
-;
-; Load data into workspace, applying unsigned->signed conversion
-;
-; GLOBAL(void)
-; jsimd_convsamp_float_sse (JSAMPARRAY sample_data, JDIMENSION start_col,
-; FAST_FLOAT *workspace);
-;
-
-%define sample_data ebp+8 ; JSAMPARRAY sample_data
-%define start_col ebp+12 ; JDIMENSION start_col
-%define workspace ebp+16 ; FAST_FLOAT *workspace
-
- align 16
- global EXTN(jsimd_convsamp_float_sse)
-
-EXTN(jsimd_convsamp_float_sse):
- push ebp
- mov ebp,esp
- push ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- pcmpeqw mm7,mm7
- psllw mm7,7
- packsswb mm7,mm7 ; mm7 = PB_CENTERJSAMPLE (0x808080..)
-
- mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *)
- mov eax, JDIMENSION [start_col]
- mov edi, POINTER [workspace] ; (FAST_FLOAT *)
- mov ecx, DCTSIZE/2
- alignx 16,7
-.convloop:
- mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
- mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
-
- movq mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]
- movq mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]
-
- psubb mm0,mm7 ; mm0=(01234567)
- psubb mm1,mm7 ; mm1=(89ABCDEF)
-
- punpcklbw mm2,mm0 ; mm2=(*0*1*2*3)
- punpckhbw mm0,mm0 ; mm0=(*4*5*6*7)
- punpcklbw mm3,mm1 ; mm3=(*8*9*A*B)
- punpckhbw mm1,mm1 ; mm1=(*C*D*E*F)
-
- punpcklwd mm4,mm2 ; mm4=(***0***1)
- punpckhwd mm2,mm2 ; mm2=(***2***3)
- punpcklwd mm5,mm0 ; mm5=(***4***5)
- punpckhwd mm0,mm0 ; mm0=(***6***7)
-
- psrad mm4,(DWORD_BIT-BYTE_BIT) ; mm4=(01)
- psrad mm2,(DWORD_BIT-BYTE_BIT) ; mm2=(23)
- cvtpi2ps xmm0,mm4 ; xmm0=(01**)
- cvtpi2ps xmm1,mm2 ; xmm1=(23**)
- psrad mm5,(DWORD_BIT-BYTE_BIT) ; mm5=(45)
- psrad mm0,(DWORD_BIT-BYTE_BIT) ; mm0=(67)
- cvtpi2ps xmm2,mm5 ; xmm2=(45**)
- cvtpi2ps xmm3,mm0 ; xmm3=(67**)
-
- punpcklwd mm6,mm3 ; mm6=(***8***9)
- punpckhwd mm3,mm3 ; mm3=(***A***B)
- punpcklwd mm4,mm1 ; mm4=(***C***D)
- punpckhwd mm1,mm1 ; mm1=(***E***F)
-
- psrad mm6,(DWORD_BIT-BYTE_BIT) ; mm6=(89)
- psrad mm3,(DWORD_BIT-BYTE_BIT) ; mm3=(AB)
- cvtpi2ps xmm4,mm6 ; xmm4=(89**)
- cvtpi2ps xmm5,mm3 ; xmm5=(AB**)
- psrad mm4,(DWORD_BIT-BYTE_BIT) ; mm4=(CD)
- psrad mm1,(DWORD_BIT-BYTE_BIT) ; mm1=(EF)
- cvtpi2ps xmm6,mm4 ; xmm6=(CD**)
- cvtpi2ps xmm7,mm1 ; xmm7=(EF**)
-
- movlhps xmm0,xmm1 ; xmm0=(0123)
- movlhps xmm2,xmm3 ; xmm2=(4567)
- movlhps xmm4,xmm5 ; xmm4=(89AB)
- movlhps xmm6,xmm7 ; xmm6=(CDEF)
-
- movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
- movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm2
- movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm4
- movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
-
- add esi, byte 2*SIZEOF_JSAMPROW
- add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
- dec ecx
- jnz near .convloop
-
- emms ; empty MMX state
-
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- pop ebx
- pop ebp
- ret
-
-
-; --------------------------------------------------------------------------
-;
-; Quantize/descale the coefficients, and store into coef_block
-;
-; GLOBAL(void)
-; jsimd_quantize_float_sse (JCOEFPTR coef_block, FAST_FLOAT *divisors,
-; FAST_FLOAT *workspace);
-;
-
-%define coef_block ebp+8 ; JCOEFPTR coef_block
-%define divisors ebp+12 ; FAST_FLOAT *divisors
-%define workspace ebp+16 ; FAST_FLOAT *workspace
-
- align 16
- global EXTN(jsimd_quantize_float_sse)
-
-EXTN(jsimd_quantize_float_sse):
- push ebp
- mov ebp,esp
-; push ebx ; unused
-; push ecx ; unused
-; push edx ; need not be preserved
- push esi
- push edi
-
- mov esi, POINTER [workspace]
- mov edx, POINTER [divisors]
- mov edi, JCOEFPTR [coef_block]
- mov eax, DCTSIZE2/16
- alignx 16,7
-.quantloop:
- movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
- movaps xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
- mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
- mulps xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
- movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
- movaps xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
- mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
- mulps xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
-
- movhlps xmm4,xmm0
- movhlps xmm5,xmm1
-
- cvtps2pi mm0,xmm0
- cvtps2pi mm1,xmm1
- cvtps2pi mm4,xmm4
- cvtps2pi mm5,xmm5
-
- movhlps xmm6,xmm2
- movhlps xmm7,xmm3
-
- cvtps2pi mm2,xmm2
- cvtps2pi mm3,xmm3
- cvtps2pi mm6,xmm6
- cvtps2pi mm7,xmm7
-
- packssdw mm0,mm4
- packssdw mm1,mm5
- packssdw mm2,mm6
- packssdw mm3,mm7
-
- movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
- movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1
- movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm2
- movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3
-
- add esi, byte 16*SIZEOF_FAST_FLOAT
- add edx, byte 16*SIZEOF_FAST_FLOAT
- add edi, byte 16*SIZEOF_JCOEF
- dec eax
- jnz short .quantloop
-
- emms ; empty MMX state
-
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; unused
-; pop ebx ; unused
- pop ebp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
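
All of the convsamp variants in these deleted files compute the same thing: copy one 8x8 block of unsigned samples into a signed workspace, re-centered around zero by subtracting CENTERJSAMPLE (128), which is what the psubb against the replicated 0x80 constant does eight or sixteen bytes at a time. A simplified scalar sketch with a hypothetical flattened signature (the real routines take a JSAMPARRAY and write either DCTELEM or FAST_FLOAT workspaces):

    typedef unsigned char JSAMPLE;

    static void convsamp_scalar(const JSAMPLE *rows[8], unsigned int start_col,
                                float *workspace)
    {
      for (int r = 0; r < 8; r++)
        for (int c = 0; c < 8; c++)
          *workspace++ = (float)rows[r][start_col + c] - 128.0f; /* CENTERJSAMPLE */
    }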
diff --git a/media/libjpeg/simd/jquantf-sse2-64.asm b/media/libjpeg/simd/jquantf-sse2-64.asm
deleted file mode 100644
index ef5c1f959e..0000000000
--- a/media/libjpeg/simd/jquantf-sse2-64.asm
+++ /dev/null
@@ -1,157 +0,0 @@
-;
-; jquantf.asm - sample data conversion and quantization (64-bit SSE & SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 64
-;
-; Load data into workspace, applying unsigned->signed conversion
-;
-; GLOBAL(void)
-; jsimd_convsamp_float_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
-; FAST_FLOAT *workspace);
-;
-
-; r10 = JSAMPARRAY sample_data
-; r11 = JDIMENSION start_col
-; r12 = FAST_FLOAT *workspace
-
- align 16
- global EXTN(jsimd_convsamp_float_sse2)
-
-EXTN(jsimd_convsamp_float_sse2):
- push rbp
- mov rax,rsp
- mov rbp,rsp
- collect_args
- push rbx
-
- pcmpeqw xmm7,xmm7
- psllw xmm7,7
- packsswb xmm7,xmm7 ; xmm7 = PB_CENTERJSAMPLE (0x808080..)
-
- mov rsi, r10
- mov eax, r11d
- mov rdi, r12
- mov rcx, DCTSIZE/2
-.convloop:
- mov rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
- mov rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
-
- movq xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]
- movq xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]
-
- psubb xmm0,xmm7 ; xmm0=(01234567)
- psubb xmm1,xmm7 ; xmm1=(89ABCDEF)
-
- punpcklbw xmm0,xmm0 ; xmm0=(*0*1*2*3*4*5*6*7)
- punpcklbw xmm1,xmm1 ; xmm1=(*8*9*A*B*C*D*E*F)
-
- punpcklwd xmm2,xmm0 ; xmm2=(***0***1***2***3)
- punpckhwd xmm0,xmm0 ; xmm0=(***4***5***6***7)
- punpcklwd xmm3,xmm1 ; xmm3=(***8***9***A***B)
- punpckhwd xmm1,xmm1 ; xmm1=(***C***D***E***F)
-
- psrad xmm2,(DWORD_BIT-BYTE_BIT) ; xmm2=(0123)
- psrad xmm0,(DWORD_BIT-BYTE_BIT) ; xmm0=(4567)
- cvtdq2ps xmm2,xmm2 ; xmm2=(0123)
- cvtdq2ps xmm0,xmm0 ; xmm0=(4567)
- psrad xmm3,(DWORD_BIT-BYTE_BIT) ; xmm3=(89AB)
- psrad xmm1,(DWORD_BIT-BYTE_BIT) ; xmm1=(CDEF)
- cvtdq2ps xmm3,xmm3 ; xmm3=(89AB)
- cvtdq2ps xmm1,xmm1 ; xmm1=(CDEF)
-
- movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
- movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
- movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
- movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
-
- add rsi, byte 2*SIZEOF_JSAMPROW
- add rdi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
- dec rcx
- jnz short .convloop
-
- pop rbx
- uncollect_args
- pop rbp
- ret
-
-
-; --------------------------------------------------------------------------
-;
-; Quantize/descale the coefficients, and store into coef_block
-;
-; GLOBAL(void)
-; jsimd_quantize_float_sse2 (JCOEFPTR coef_block, FAST_FLOAT *divisors,
-; FAST_FLOAT *workspace);
-;
-
-; r10 = JCOEFPTR coef_block
-; r11 = FAST_FLOAT *divisors
-; r12 = FAST_FLOAT *workspace
-
- align 16
- global EXTN(jsimd_quantize_float_sse2)
-
-EXTN(jsimd_quantize_float_sse2):
- push rbp
- mov rax,rsp
- mov rbp,rsp
- collect_args
-
- mov rsi, r12
- mov rdx, r11
- mov rdi, r10
- mov rax, DCTSIZE2/16
-.quantloop:
- movaps xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
- movaps xmm1, XMMWORD [XMMBLOCK(0,1,rsi,SIZEOF_FAST_FLOAT)]
- mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
- mulps xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)]
- movaps xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
- movaps xmm3, XMMWORD [XMMBLOCK(1,1,rsi,SIZEOF_FAST_FLOAT)]
- mulps xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
- mulps xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)]
-
- cvtps2dq xmm0,xmm0
- cvtps2dq xmm1,xmm1
- cvtps2dq xmm2,xmm2
- cvtps2dq xmm3,xmm3
-
- packssdw xmm0,xmm1
- packssdw xmm2,xmm3
-
- movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_JCOEF)], xmm0
- movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_JCOEF)], xmm2
-
- add rsi, byte 16*SIZEOF_FAST_FLOAT
- add rdx, byte 16*SIZEOF_FAST_FLOAT
- add rdi, byte 16*SIZEOF_JCOEF
- dec rax
- jnz short .quantloop
-
- uncollect_args
- pop rbp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
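
Arithmetically, the float quantizer above is simple: multiply each coefficient by a precomputed reciprocal divisor, round to nearest, and saturate into a 16-bit JCOEF; the mulps / cvtps2dq / packssdw sequence handles 16 coefficients per loop iteration. A scalar sketch of the same computation (illustrative, not the shipped code):

    #include <math.h>
    #include <stdint.h>

    static void quantize_float_scalar(int16_t coef_block[64],
                                      const float divisors[64],
                                      const float workspace[64])
    {
      for (int i = 0; i < 64; i++) {
        long v = lrintf(workspace[i] * divisors[i]); /* round to nearest */
        if (v >  32767) v =  32767;                  /* packssdw saturates */
        if (v < -32768) v = -32768;
        coef_block[i] = (int16_t)v;
      }
    }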
diff --git a/media/libjpeg/simd/jquantf-sse2.asm b/media/libjpeg/simd/jquantf-sse2.asm
deleted file mode 100644
index 1cbc267400..0000000000
--- a/media/libjpeg/simd/jquantf-sse2.asm
+++ /dev/null
@@ -1,170 +0,0 @@
-;
-; jquantf.asm - sample data conversion and quantization (SSE & SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
-;
-; Load data into workspace, applying unsigned->signed conversion
-;
-; GLOBAL(void)
-; jsimd_convsamp_float_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
-; FAST_FLOAT *workspace);
-;
-
-%define sample_data ebp+8 ; JSAMPARRAY sample_data
-%define start_col ebp+12 ; JDIMENSION start_col
-%define workspace ebp+16 ; FAST_FLOAT *workspace
-
- align 16
- global EXTN(jsimd_convsamp_float_sse2)
-
-EXTN(jsimd_convsamp_float_sse2):
- push ebp
- mov ebp,esp
- push ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- pcmpeqw xmm7,xmm7
- psllw xmm7,7
- packsswb xmm7,xmm7 ; xmm7 = PB_CENTERJSAMPLE (0x808080..)
-
- mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *)
- mov eax, JDIMENSION [start_col]
- mov edi, POINTER [workspace] ; (FAST_FLOAT *)
- mov ecx, DCTSIZE/2
- alignx 16,7
-.convloop:
- mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
- mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
-
- movq xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
- movq xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
-
- psubb xmm0,xmm7 ; xmm0=(01234567)
- psubb xmm1,xmm7 ; xmm1=(89ABCDEF)
-
- punpcklbw xmm0,xmm0 ; xmm0=(*0*1*2*3*4*5*6*7)
- punpcklbw xmm1,xmm1 ; xmm1=(*8*9*A*B*C*D*E*F)
-
- punpcklwd xmm2,xmm0 ; xmm2=(***0***1***2***3)
- punpckhwd xmm0,xmm0 ; xmm0=(***4***5***6***7)
- punpcklwd xmm3,xmm1 ; xmm3=(***8***9***A***B)
- punpckhwd xmm1,xmm1 ; xmm1=(***C***D***E***F)
-
- psrad xmm2,(DWORD_BIT-BYTE_BIT) ; xmm2=(0123)
- psrad xmm0,(DWORD_BIT-BYTE_BIT) ; xmm0=(4567)
- cvtdq2ps xmm2,xmm2 ; xmm2=(0123)
- cvtdq2ps xmm0,xmm0 ; xmm0=(4567)
- psrad xmm3,(DWORD_BIT-BYTE_BIT) ; xmm3=(89AB)
- psrad xmm1,(DWORD_BIT-BYTE_BIT) ; xmm1=(CDEF)
- cvtdq2ps xmm3,xmm3 ; xmm3=(89AB)
- cvtdq2ps xmm1,xmm1 ; xmm1=(CDEF)
-
- movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm2
- movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
- movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
- movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
-
- add esi, byte 2*SIZEOF_JSAMPROW
- add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
- dec ecx
- jnz short .convloop
-
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- pop ebx
- pop ebp
- ret
-
-
-; --------------------------------------------------------------------------
-;
-; Quantize/descale the coefficients, and store into coef_block
-;
-; GLOBAL(void)
-; jsimd_quantize_float_sse2 (JCOEFPTR coef_block, FAST_FLOAT *divisors,
-; FAST_FLOAT *workspace);
-;
-
-%define coef_block ebp+8 ; JCOEFPTR coef_block
-%define divisors ebp+12 ; FAST_FLOAT *divisors
-%define workspace ebp+16 ; FAST_FLOAT *workspace
-
- align 16
- global EXTN(jsimd_quantize_float_sse2)
-
-EXTN(jsimd_quantize_float_sse2):
- push ebp
- mov ebp,esp
-; push ebx ; unused
-; push ecx ; unused
-; push edx ; need not be preserved
- push esi
- push edi
-
- mov esi, POINTER [workspace]
- mov edx, POINTER [divisors]
- mov edi, JCOEFPTR [coef_block]
- mov eax, DCTSIZE2/16
- alignx 16,7
-.quantloop:
- movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
- movaps xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
- mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
- mulps xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
- movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
- movaps xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
- mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
- mulps xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
-
- cvtps2dq xmm0,xmm0
- cvtps2dq xmm1,xmm1
- cvtps2dq xmm2,xmm2
- cvtps2dq xmm3,xmm3
-
- packssdw xmm0,xmm1
- packssdw xmm2,xmm3
-
- movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_JCOEF)], xmm0
- movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_JCOEF)], xmm2
-
- add esi, byte 16*SIZEOF_FAST_FLOAT
- add edx, byte 16*SIZEOF_FAST_FLOAT
- add edi, byte 16*SIZEOF_JCOEF
- dec eax
- jnz short .quantloop
-
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; unused
-; pop ebx ; unused
- pop ebp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
diff --git a/media/libjpeg/simd/jquanti-sse2-64.asm b/media/libjpeg/simd/jquanti-sse2-64.asm
deleted file mode 100644
index 66c4e51907..0000000000
--- a/media/libjpeg/simd/jquanti-sse2-64.asm
+++ /dev/null
@@ -1,186 +0,0 @@
-;
-; jquanti.asm - sample data conversion and quantization (64-bit SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 64
-;
-; Load data into workspace, applying unsigned->signed conversion
-;
-; GLOBAL(void)
-; jsimd_convsamp_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
-; DCTELEM *workspace);
-;
-
-; r10 = JSAMPARRAY sample_data
-; r11 = JDIMENSION start_col
-; r12 = DCTELEM *workspace
-
- align 16
- global EXTN(jsimd_convsamp_sse2)
-
-EXTN(jsimd_convsamp_sse2):
- push rbp
- mov rax,rsp
- mov rbp,rsp
- collect_args
- push rbx
-
- pxor xmm6,xmm6 ; xmm6=(all 0's)
- pcmpeqw xmm7,xmm7
- psllw xmm7,7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
-
- mov rsi, r10
- mov eax, r11d
- mov rdi, r12
- mov rcx, DCTSIZE/4
-.convloop:
- mov rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
- mov rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
-
- movq xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE] ; xmm0=(01234567)
- movq xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE] ; xmm1=(89ABCDEF)
-
- mov rbx, JSAMPROW [rsi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *)
- mov rdx, JSAMPROW [rsi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *)
-
- movq xmm2, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE] ; xmm2=(GHIJKLMN)
- movq xmm3, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE] ; xmm3=(OPQRSTUV)
-
- punpcklbw xmm0,xmm6 ; xmm0=(01234567)
- punpcklbw xmm1,xmm6 ; xmm1=(89ABCDEF)
- paddw xmm0,xmm7
- paddw xmm1,xmm7
- punpcklbw xmm2,xmm6 ; xmm2=(GHIJKLMN)
- punpcklbw xmm3,xmm6 ; xmm3=(OPQRSTUV)
- paddw xmm2,xmm7
- paddw xmm3,xmm7
-
- movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
- movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
- movdqa XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
- movdqa XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3
-
- add rsi, byte 4*SIZEOF_JSAMPROW
- add rdi, byte 4*DCTSIZE*SIZEOF_DCTELEM
- dec rcx
- jnz short .convloop
-
- pop rbx
- uncollect_args
- pop rbp
- ret
-
-; --------------------------------------------------------------------------
-;
-; Quantize/descale the coefficients, and store into coef_block
-;
-; This implementation is based on an algorithm described in
-; "How to optimize for the Pentium family of microprocessors"
-; (http://www.agner.org/assem/).
-;
-; GLOBAL(void)
-; jsimd_quantize_sse2 (JCOEFPTR coef_block, DCTELEM *divisors,
-; DCTELEM *workspace);
-;
-
-%define RECIPROCAL(m,n,b) XMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
-%define CORRECTION(m,n,b) XMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
-%define SCALE(m,n,b) XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)
-
-; r10 = JCOEFPTR coef_block
-; r11 = DCTELEM *divisors
-; r12 = DCTELEM *workspace
-
- align 16
- global EXTN(jsimd_quantize_sse2)
-
-EXTN(jsimd_quantize_sse2):
- push rbp
- mov rax,rsp
- mov rbp,rsp
- collect_args
-
- mov rsi, r12
- mov rdx, r11
- mov rdi, r10
- mov rax, DCTSIZE2/32
-.quantloop:
- movdqa xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_DCTELEM)]
- movdqa xmm5, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_DCTELEM)]
- movdqa xmm6, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_DCTELEM)]
- movdqa xmm7, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_DCTELEM)]
- movdqa xmm0,xmm4
- movdqa xmm1,xmm5
- movdqa xmm2,xmm6
- movdqa xmm3,xmm7
- psraw xmm4,(WORD_BIT-1)
- psraw xmm5,(WORD_BIT-1)
- psraw xmm6,(WORD_BIT-1)
- psraw xmm7,(WORD_BIT-1)
- pxor xmm0,xmm4
- pxor xmm1,xmm5
- pxor xmm2,xmm6
- pxor xmm3,xmm7
- psubw xmm0,xmm4 ; if (xmm0 < 0) xmm0 = -xmm0;
- psubw xmm1,xmm5 ; if (xmm1 < 0) xmm1 = -xmm1;
- psubw xmm2,xmm6 ; if (xmm2 < 0) xmm2 = -xmm2;
- psubw xmm3,xmm7 ; if (xmm3 < 0) xmm3 = -xmm3;
-
- paddw xmm0, XMMWORD [CORRECTION(0,0,rdx)] ; correction + roundfactor
- paddw xmm1, XMMWORD [CORRECTION(1,0,rdx)]
- paddw xmm2, XMMWORD [CORRECTION(2,0,rdx)]
- paddw xmm3, XMMWORD [CORRECTION(3,0,rdx)]
- pmulhuw xmm0, XMMWORD [RECIPROCAL(0,0,rdx)] ; reciprocal
- pmulhuw xmm1, XMMWORD [RECIPROCAL(1,0,rdx)]
- pmulhuw xmm2, XMMWORD [RECIPROCAL(2,0,rdx)]
- pmulhuw xmm3, XMMWORD [RECIPROCAL(3,0,rdx)]
- pmulhuw xmm0, XMMWORD [SCALE(0,0,rdx)] ; scale
- pmulhuw xmm1, XMMWORD [SCALE(1,0,rdx)]
- pmulhuw xmm2, XMMWORD [SCALE(2,0,rdx)]
- pmulhuw xmm3, XMMWORD [SCALE(3,0,rdx)]
-
- pxor xmm0,xmm4
- pxor xmm1,xmm5
- pxor xmm2,xmm6
- pxor xmm3,xmm7
- psubw xmm0,xmm4
- psubw xmm1,xmm5
- psubw xmm2,xmm6
- psubw xmm3,xmm7
- movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
- movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
- movdqa XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
- movdqa XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3
-
- add rsi, byte 32*SIZEOF_DCTELEM
- add rdx, byte 32*SIZEOF_DCTELEM
- add rdi, byte 32*SIZEOF_JCOEF
- dec rax
- jnz near .quantloop
-
- uncollect_args
- pop rbp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
diff --git a/media/libjpeg/simd/jquanti-sse2.asm b/media/libjpeg/simd/jquanti-sse2.asm
deleted file mode 100644
index aea8604e22..0000000000
--- a/media/libjpeg/simd/jquanti-sse2.asm
+++ /dev/null
@@ -1,199 +0,0 @@
-;
-; jquanti.asm - sample data conversion and quantization (SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
-;
-; Load data into workspace, applying unsigned->signed conversion
-;
-; GLOBAL(void)
-; jsimd_convsamp_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
-; DCTELEM *workspace);
-;
-
-%define sample_data ebp+8 ; JSAMPARRAY sample_data
-%define start_col ebp+12 ; JDIMENSION start_col
-%define workspace ebp+16 ; DCTELEM *workspace
-
- align 16
- global EXTN(jsimd_convsamp_sse2)
-
-EXTN(jsimd_convsamp_sse2):
- push ebp
- mov ebp,esp
- push ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
- push esi
- push edi
-
- pxor xmm6,xmm6 ; xmm6=(all 0's)
- pcmpeqw xmm7,xmm7
- psllw xmm7,7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
-
- mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *)
- mov eax, JDIMENSION [start_col]
- mov edi, POINTER [workspace] ; (DCTELEM *)
- mov ecx, DCTSIZE/4
- alignx 16,7
-.convloop:
- mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
- mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
-
- movq xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; xmm0=(01234567)
- movq xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE] ; xmm1=(89ABCDEF)
-
- mov ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *)
- mov edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *)
-
- movq xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; xmm2=(GHIJKLMN)
- movq xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE] ; xmm3=(OPQRSTUV)
-
- punpcklbw xmm0,xmm6 ; xmm0=(01234567)
- punpcklbw xmm1,xmm6 ; xmm1=(89ABCDEF)
- paddw xmm0,xmm7
- paddw xmm1,xmm7
- punpcklbw xmm2,xmm6 ; xmm2=(GHIJKLMN)
- punpcklbw xmm3,xmm6 ; xmm3=(OPQRSTUV)
- paddw xmm2,xmm7
- paddw xmm3,xmm7
-
- movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
- movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
- movdqa XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
- movdqa XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3
-
- add esi, byte 4*SIZEOF_JSAMPROW
- add edi, byte 4*DCTSIZE*SIZEOF_DCTELEM
- dec ecx
- jnz short .convloop
-
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- pop ebx
- pop ebp
- ret
-
-; --------------------------------------------------------------------------
-;
-; Quantize/descale the coefficients, and store into coef_block
-;
-; This implementation is based on an algorithm described in
-; "How to optimize for the Pentium family of microprocessors"
-; (http://www.agner.org/assem/).
-;
-; GLOBAL(void)
-; jsimd_quantize_sse2 (JCOEFPTR coef_block, DCTELEM *divisors,
-; DCTELEM *workspace);
-;
-
-%define RECIPROCAL(m,n,b) XMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
-%define CORRECTION(m,n,b) XMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
-%define SCALE(m,n,b) XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)
-
-%define coef_block ebp+8 ; JCOEFPTR coef_block
-%define divisors ebp+12 ; DCTELEM *divisors
-%define workspace ebp+16 ; DCTELEM *workspace
-
- align 16
- global EXTN(jsimd_quantize_sse2)
-
-EXTN(jsimd_quantize_sse2):
- push ebp
- mov ebp,esp
-; push ebx ; unused
-; push ecx ; unused
-; push edx ; need not be preserved
- push esi
- push edi
-
- mov esi, POINTER [workspace]
- mov edx, POINTER [divisors]
- mov edi, JCOEFPTR [coef_block]
- mov eax, DCTSIZE2/32
- alignx 16,7
-.quantloop:
- movdqa xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
- movdqa xmm5, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_DCTELEM)]
- movdqa xmm6, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_DCTELEM)]
- movdqa xmm7, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_DCTELEM)]
- movdqa xmm0,xmm4
- movdqa xmm1,xmm5
- movdqa xmm2,xmm6
- movdqa xmm3,xmm7
- psraw xmm4,(WORD_BIT-1)
- psraw xmm5,(WORD_BIT-1)
- psraw xmm6,(WORD_BIT-1)
- psraw xmm7,(WORD_BIT-1)
- pxor xmm0,xmm4
- pxor xmm1,xmm5
- pxor xmm2,xmm6
- pxor xmm3,xmm7
- psubw xmm0,xmm4 ; if (xmm0 < 0) xmm0 = -xmm0;
- psubw xmm1,xmm5 ; if (xmm1 < 0) xmm1 = -xmm1;
- psubw xmm2,xmm6 ; if (xmm2 < 0) xmm2 = -xmm2;
- psubw xmm3,xmm7 ; if (xmm3 < 0) xmm3 = -xmm3;
-
- paddw xmm0, XMMWORD [CORRECTION(0,0,edx)] ; correction + roundfactor
- paddw xmm1, XMMWORD [CORRECTION(1,0,edx)]
- paddw xmm2, XMMWORD [CORRECTION(2,0,edx)]
- paddw xmm3, XMMWORD [CORRECTION(3,0,edx)]
- pmulhuw xmm0, XMMWORD [RECIPROCAL(0,0,edx)] ; reciprocal
- pmulhuw xmm1, XMMWORD [RECIPROCAL(1,0,edx)]
- pmulhuw xmm2, XMMWORD [RECIPROCAL(2,0,edx)]
- pmulhuw xmm3, XMMWORD [RECIPROCAL(3,0,edx)]
- pmulhuw xmm0, XMMWORD [SCALE(0,0,edx)] ; scale
- pmulhuw xmm1, XMMWORD [SCALE(1,0,edx)]
- pmulhuw xmm2, XMMWORD [SCALE(2,0,edx)]
- pmulhuw xmm3, XMMWORD [SCALE(3,0,edx)]
-
- pxor xmm0,xmm4
- pxor xmm1,xmm5
- pxor xmm2,xmm6
- pxor xmm3,xmm7
- psubw xmm0,xmm4
- psubw xmm1,xmm5
- psubw xmm2,xmm6
- psubw xmm3,xmm7
- movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
- movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
- movdqa XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
- movdqa XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3
-
- add esi, byte 32*SIZEOF_DCTELEM
- add edx, byte 32*SIZEOF_DCTELEM
- add edi, byte 32*SIZEOF_JCOEF
- dec eax
- jnz near .quantloop
-
- pop edi
- pop esi
-; pop edx ; need not be preserved
-; pop ecx ; unused
-; pop ebx ; unused
- pop ebp
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
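
As the RECIPROCAL/CORRECTION/SCALE macros above encode, the divisors argument to the integer quantizers is really three stacked 64-entry tables (four, counting the SHIFT table the MMX variant defines) laid out at DCTSIZE2-sized offsets. A scalar sketch of one coefficient's dataflow, modeling pmulhuw as a 16x16 -> high-16 multiply; the actual table contents are built elsewhere (jcdctmgr.c), so treat this as an illustration of the dataflow rather than the exact fixed-point contract:

    #include <stdint.h>

    static int16_t quantize_int_scalar(int16_t coef,
                                       const uint16_t divisors[192], int i)
    {
      const uint16_t *recip = divisors;       /* DCTSIZE2*0: reciprocal  */
      const uint16_t *corr  = divisors + 64;  /* DCTSIZE2*1: correction  */
      const uint16_t *scale = divisors + 128; /* DCTSIZE2*2: scale       */
      uint16_t mag = (uint16_t)(coef < 0 ? -coef : coef);
      uint32_t t;

      mag = (uint16_t)(mag + corr[i]);        /* correction + roundfactor */
      t = ((uint32_t)mag * recip[i]) >> 16;   /* pmulhuw by reciprocal    */
      t = (t * scale[i]) >> 16;               /* pmulhuw by scale         */
      return (int16_t)(coef < 0 ? -(int32_t)t : (int32_t)t); /* restore sign */
    }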
diff --git a/media/libjpeg/simd/jsimd.h b/media/libjpeg/simd/jsimd.h
index dc6ec430db..64747c6360 100644
--- a/media/libjpeg/simd/jsimd.h
+++ b/media/libjpeg/simd/jsimd.h
@@ -2,10 +2,12 @@
* simd/jsimd.h
*
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2011, 2014-2016, D. R. Commander.
+ * Copyright (C) 2011, 2014-2016, 2018, 2020, D. R. Commander.
* Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
* Copyright (C) 2014, Linaro Limited.
- * Copyright (C) 2015-2016, Matthieu Darbois.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ * Copyright (C) 2020, Arm Limited.
*
* Based on the x86 SIMD extension for IJG JPEG library,
* Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -15,857 +17,1242 @@
/* Bitmask for supported acceleration methods */
-#define JSIMD_NONE 0x00
-#define JSIMD_MMX 0x01
-#define JSIMD_3DNOW 0x02
-#define JSIMD_SSE 0x04
-#define JSIMD_SSE2 0x08
-#define JSIMD_ARM_NEON 0x10
-#define JSIMD_MIPS_DSPR2 0x20
-#define JSIMD_ALTIVEC 0x40
+#define JSIMD_NONE 0x00
+#define JSIMD_MMX 0x01
+#define JSIMD_3DNOW 0x02
+#define JSIMD_SSE 0x04
+#define JSIMD_SSE2 0x08
+#define JSIMD_NEON 0x10
+#define JSIMD_DSPR2 0x20
+#define JSIMD_ALTIVEC 0x40
+#define JSIMD_AVX2 0x80
+#define JSIMD_MMI 0x100
/* SIMD Ext: retrieve SIMD/CPU information */
-EXTERN(unsigned int) jpeg_simd_cpu_support (void);
+EXTERN(unsigned int) jpeg_simd_cpu_support(void);
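
Since jpeg_simd_cpu_support() returns this bitmask, callers test for a specific extension with a simple AND. A hedged usage sketch (the macros and function are declared above; the printf scaffolding is purely illustrative):

    #include <stdio.h>

    /* Assumes jsimd.h's bitmask macros and jpeg_simd_cpu_support() are in scope. */
    static void report_simd(void)
    {
      unsigned int simd = jpeg_simd_cpu_support();
      if (simd & JSIMD_SSE2)  printf("SSE2 acceleration available\n");
      if (simd & JSIMD_AVX2)  printf("AVX2 acceleration available\n");
      if (simd & JSIMD_NEON)  printf("Neon acceleration available\n");
      if (simd == JSIMD_NONE) printf("no SIMD acceleration\n");
    }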
/* RGB & extended RGB --> YCC Colorspace Conversion */
EXTERN(void) jsimd_rgb_ycc_convert_mmx
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extrgb_ycc_convert_mmx
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extrgbx_ycc_convert_mmx
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extbgr_ycc_convert_mmx
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extbgrx_ycc_convert_mmx
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extxbgr_ycc_convert_mmx
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extxrgb_ycc_convert_mmx
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
extern const int jconst_rgb_ycc_convert_sse2[];
EXTERN(void) jsimd_rgb_ycc_convert_sse2
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extrgb_ycc_convert_sse2
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extrgbx_ycc_convert_sse2
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extbgr_ycc_convert_sse2
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extbgrx_ycc_convert_sse2
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extxbgr_ycc_convert_sse2
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extxrgb_ycc_convert_sse2
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+
+extern const int jconst_rgb_ycc_convert_avx2[];
+EXTERN(void) jsimd_rgb_ycc_convert_avx2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_ycc_convert_avx2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_ycc_convert_avx2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_ycc_convert_avx2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_ycc_convert_avx2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_ycc_convert_avx2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_ycc_convert_avx2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_rgb_ycc_convert_neon
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extrgb_ycc_convert_neon
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extrgbx_ycc_convert_neon
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extbgr_ycc_convert_neon
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extbgrx_ycc_convert_neon
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extxbgr_ycc_convert_neon
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extxrgb_ycc_convert_neon
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+
+#ifndef NEON_INTRINSICS
EXTERN(void) jsimd_extrgb_ycc_convert_neon_slowld3
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extbgr_ycc_convert_neon_slowld3
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
-
-EXTERN(void) jsimd_rgb_ycc_convert_mips_dspr2
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
-EXTERN(void) jsimd_extrgb_ycc_convert_mips_dspr2
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
-EXTERN(void) jsimd_extrgbx_ycc_convert_mips_dspr2
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
-EXTERN(void) jsimd_extbgr_ycc_convert_mips_dspr2
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
-EXTERN(void) jsimd_extbgrx_ycc_convert_mips_dspr2
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
-EXTERN(void) jsimd_extxbgr_ycc_convert_mips_dspr2
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
-EXTERN(void) jsimd_extxrgb_ycc_convert_mips_dspr2
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+
+#endif
+
+EXTERN(void) jsimd_rgb_ycc_convert_dspr2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_ycc_convert_dspr2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_ycc_convert_dspr2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_ycc_convert_dspr2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_ycc_convert_dspr2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_ycc_convert_dspr2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_ycc_convert_dspr2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+
+EXTERN(void) jsimd_rgb_ycc_convert_mmi
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_ycc_convert_mmi
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_ycc_convert_mmi
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_ycc_convert_mmi
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_ycc_convert_mmi
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_ycc_convert_mmi
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_ycc_convert_mmi
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_rgb_ycc_convert_altivec
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extrgb_ycc_convert_altivec
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extrgbx_ycc_convert_altivec
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extbgr_ycc_convert_altivec
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extbgrx_ycc_convert_altivec
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extxbgr_ycc_convert_altivec
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extxrgb_ycc_convert_altivec
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
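Every rgb_ycc kernel declared above, whatever the instruction set, computes the same JFIF (ITU-R BT.601) forward transform; the ext* variants differ only in the memory order of the RGB samples they fetch. A minimal scalar sketch of that math, with hypothetical names and floating point for clarity (the real kernels use scaled fixed-point tables):

    /* Scalar reference for the SIMD rgb_ycc kernels (illustrative only). */
    static void rgb_to_ycc_pixel(unsigned char r, unsigned char g,
                                 unsigned char b, unsigned char *y,
                                 unsigned char *cb, unsigned char *cr)
    {
      *y  = (unsigned char)( 0.29900 * r + 0.58700 * g + 0.11400 * b);
      *cb = (unsigned char)(-0.16874 * r - 0.33126 * g + 0.50000 * b + 128.0);
      *cr = (unsigned char)( 0.50000 * r - 0.41869 * g - 0.08131 * b + 128.0);
    }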
/* RGB & extended RGB --> Grayscale Colorspace Conversion */
EXTERN(void) jsimd_rgb_gray_convert_mmx
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extrgb_gray_convert_mmx
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extrgbx_gray_convert_mmx
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extbgr_gray_convert_mmx
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extbgrx_gray_convert_mmx
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extxbgr_gray_convert_mmx
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extxrgb_gray_convert_mmx
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
extern const int jconst_rgb_gray_convert_sse2[];
EXTERN(void) jsimd_rgb_gray_convert_sse2
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extrgb_gray_convert_sse2
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extrgbx_gray_convert_sse2
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extbgr_gray_convert_sse2
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extbgrx_gray_convert_sse2
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extxbgr_gray_convert_sse2
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extxrgb_gray_convert_sse2
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
-
-EXTERN(void) jsimd_rgb_gray_convert_mips_dspr2
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
-EXTERN(void) jsimd_extrgb_gray_convert_mips_dspr2
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
-EXTERN(void) jsimd_extrgbx_gray_convert_mips_dspr2
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
-EXTERN(void) jsimd_extbgr_gray_convert_mips_dspr2
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
-EXTERN(void) jsimd_extbgrx_gray_convert_mips_dspr2
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
-EXTERN(void) jsimd_extxbgr_gray_convert_mips_dspr2
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
-EXTERN(void) jsimd_extxrgb_gray_convert_mips_dspr2
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+
+extern const int jconst_rgb_gray_convert_avx2[];
+EXTERN(void) jsimd_rgb_gray_convert_avx2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_gray_convert_avx2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_gray_convert_avx2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_gray_convert_avx2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_gray_convert_avx2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_gray_convert_avx2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_gray_convert_avx2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+
+EXTERN(void) jsimd_rgb_gray_convert_neon
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_gray_convert_neon
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_gray_convert_neon
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_gray_convert_neon
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_gray_convert_neon
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_gray_convert_neon
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_gray_convert_neon
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+
+EXTERN(void) jsimd_rgb_gray_convert_dspr2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_gray_convert_dspr2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_gray_convert_dspr2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_gray_convert_dspr2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_gray_convert_dspr2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_gray_convert_dspr2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_gray_convert_dspr2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+
+EXTERN(void) jsimd_rgb_gray_convert_mmi
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_gray_convert_mmi
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_gray_convert_mmi
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_gray_convert_mmi
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_gray_convert_mmi
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_gray_convert_mmi
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_gray_convert_mmi
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_rgb_gray_convert_altivec
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extrgb_gray_convert_altivec
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extrgbx_gray_convert_altivec
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extbgr_gray_convert_altivec
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extbgrx_gray_convert_altivec
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extxbgr_gray_convert_altivec
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
EXTERN(void) jsimd_extxrgb_gray_convert_altivec
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
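The gray kernels keep only the luma term of the transform above. At run time the per-architecture jsimd.c glue picks among these declarations once CPU features are probed; a rough sketch of that dispatch (simd_support and the JSIMD_* flags follow libjpeg-turbo's convention, but treat the exact spellings here as assumptions):

    /* Hypothetical dispatcher: route to the widest supported kernel. */
    void rgb_gray_convert_dispatch(JDIMENSION img_width, JSAMPARRAY input_buf,
                                   JSAMPIMAGE output_buf, JDIMENSION output_row,
                                   int num_rows)
    {
      if (simd_support & JSIMD_AVX2)
        jsimd_rgb_gray_convert_avx2(img_width, input_buf, output_buf,
                                    output_row, num_rows);
      else if (simd_support & JSIMD_SSE2)
        jsimd_rgb_gray_convert_sse2(img_width, input_buf, output_buf,
                                    output_row, num_rows);
      else
        jsimd_rgb_gray_convert_mmx(img_width, input_buf, output_buf,
                                   output_row, num_rows);
    }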
/* YCC --> RGB & extended RGB Colorspace Conversion */
EXTERN(void) jsimd_ycc_rgb_convert_mmx
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
EXTERN(void) jsimd_ycc_extrgb_convert_mmx
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
EXTERN(void) jsimd_ycc_extrgbx_convert_mmx
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
EXTERN(void) jsimd_ycc_extbgr_convert_mmx
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
EXTERN(void) jsimd_ycc_extbgrx_convert_mmx
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
EXTERN(void) jsimd_ycc_extxbgr_convert_mmx
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
EXTERN(void) jsimd_ycc_extxrgb_convert_mmx
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
extern const int jconst_ycc_rgb_convert_sse2[];
EXTERN(void) jsimd_ycc_rgb_convert_sse2
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
EXTERN(void) jsimd_ycc_extrgb_convert_sse2
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
EXTERN(void) jsimd_ycc_extrgbx_convert_sse2
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
EXTERN(void) jsimd_ycc_extbgr_convert_sse2
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
EXTERN(void) jsimd_ycc_extbgrx_convert_sse2
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
EXTERN(void) jsimd_ycc_extxbgr_convert_sse2
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
EXTERN(void) jsimd_ycc_extxrgb_convert_sse2
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+
+extern const int jconst_ycc_rgb_convert_avx2[];
+EXTERN(void) jsimd_ycc_rgb_convert_avx2
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgb_convert_avx2
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgbx_convert_avx2
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgr_convert_avx2
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgrx_convert_avx2
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxbgr_convert_avx2
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxrgb_convert_avx2
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
EXTERN(void) jsimd_ycc_rgb_convert_neon
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
EXTERN(void) jsimd_ycc_extrgb_convert_neon
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
EXTERN(void) jsimd_ycc_extrgbx_convert_neon
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
EXTERN(void) jsimd_ycc_extbgr_convert_neon
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
EXTERN(void) jsimd_ycc_extbgrx_convert_neon
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
EXTERN(void) jsimd_ycc_extxbgr_convert_neon
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
EXTERN(void) jsimd_ycc_extxrgb_convert_neon
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
EXTERN(void) jsimd_ycc_rgb565_convert_neon
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+
+#ifndef NEON_INTRINSICS
EXTERN(void) jsimd_ycc_extrgb_convert_neon_slowst3
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
EXTERN(void) jsimd_ycc_extbgr_convert_neon_slowst3
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
-
-EXTERN(void) jsimd_ycc_rgb_convert_mips_dspr2
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
-EXTERN(void) jsimd_ycc_extrgb_convert_mips_dspr2
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
-EXTERN(void) jsimd_ycc_extrgbx_convert_mips_dspr2
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
-EXTERN(void) jsimd_ycc_extbgr_convert_mips_dspr2
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
-EXTERN(void) jsimd_ycc_extbgrx_convert_mips_dspr2
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
-EXTERN(void) jsimd_ycc_extxbgr_convert_mips_dspr2
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
-EXTERN(void) jsimd_ycc_extxrgb_convert_mips_dspr2
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+
+#endif
+
+EXTERN(void) jsimd_ycc_rgb_convert_dspr2
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgb_convert_dspr2
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgbx_convert_dspr2
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgr_convert_dspr2
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgrx_convert_dspr2
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxbgr_convert_dspr2
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxrgb_convert_dspr2
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+
+EXTERN(void) jsimd_ycc_rgb_convert_mmi
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgb_convert_mmi
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgbx_convert_mmi
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgr_convert_mmi
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgrx_convert_mmi
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxbgr_convert_mmi
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxrgb_convert_mmi
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
EXTERN(void) jsimd_ycc_rgb_convert_altivec
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
EXTERN(void) jsimd_ycc_extrgb_convert_altivec
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
EXTERN(void) jsimd_ycc_extrgbx_convert_altivec
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
EXTERN(void) jsimd_ycc_extbgr_convert_altivec
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
EXTERN(void) jsimd_ycc_extbgrx_convert_altivec
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
EXTERN(void) jsimd_ycc_extxbgr_convert_altivec
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
EXTERN(void) jsimd_ycc_extxrgb_convert_altivec
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
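The decompression-side ycc_rgb kernels apply the inverse JFIF transform, clamping each result to the sample range. A scalar sketch with hypothetical helper names (the SIMD kernels use fixed-point arithmetic and a range-limit table rather than per-pixel clamping):

    static unsigned char clamp255(double v)
    {
      return (unsigned char)(v < 0.0 ? 0.0 : (v > 255.0 ? 255.0 : v));
    }

    static void ycc_to_rgb_pixel(unsigned char y, unsigned char cb,
                                 unsigned char cr, unsigned char *r,
                                 unsigned char *g, unsigned char *b)
    {
      double dcb = cb - 128.0, dcr = cr - 128.0;    /* re-center chroma */
      *r = clamp255(y + 1.40200 * dcr);
      *g = clamp255(y - 0.34414 * dcb - 0.71414 * dcr);
      *b = clamp255(y + 1.77200 * dcb);
    }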
/* NULL Colorspace Conversion */
-EXTERN(void) jsimd_c_null_convert_mips_dspr2
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows, int num_components);
+EXTERN(void) jsimd_c_null_convert_dspr2
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows, int num_components);
/* h2v1 Downsampling */
EXTERN(void) jsimd_h2v1_downsample_mmx
- (JDIMENSION image_width, int max_v_samp_factor,
- JDIMENSION v_samp_factor, JDIMENSION width_blocks,
- JSAMPARRAY input_data, JSAMPARRAY output_data);
+ (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+ JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
EXTERN(void) jsimd_h2v1_downsample_sse2
- (JDIMENSION image_width, int max_v_samp_factor,
- JDIMENSION v_samp_factor, JDIMENSION width_blocks,
- JSAMPARRAY input_data, JSAMPARRAY output_data);
+ (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+ JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
+
+EXTERN(void) jsimd_h2v1_downsample_avx2
+ (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+ JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
EXTERN(void) jsimd_h2v1_downsample_neon
- (JDIMENSION image_width, int max_v_samp_factor,
- JDIMENSION v_samp_factor, JDIMENSION width_blocks,
- JSAMPARRAY input_data, JSAMPARRAY output_data);
+ (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+ JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
-EXTERN(void) jsimd_h2v1_downsample_mips_dspr2
- (JDIMENSION image_width, int max_v_samp_factor,
- JDIMENSION v_samp_factor, JDIMENSION width_blocks,
- JSAMPARRAY input_data, JSAMPARRAY output_data);
+EXTERN(void) jsimd_h2v1_downsample_dspr2
+ (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+ JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
EXTERN(void) jsimd_h2v1_downsample_altivec
- (JDIMENSION image_width, int max_v_samp_factor,
- JDIMENSION v_samp_factor, JDIMENSION width_blocks,
- JSAMPARRAY input_data, JSAMPARRAY output_data);
+ (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+ JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
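h2v1 downsampling halves the chroma resolution horizontally by averaging sample pairs; libjpeg alternates the rounding bias so truncation does not drift in one direction. A one-row scalar sketch (names illustrative):

    /* Average horizontal pairs into out_width output samples. */
    static void h2v1_downsample_row(const unsigned char *in,
                                    unsigned char *out, int out_width)
    {
      int bias = 0;                       /* alternate 0,1,0,1,... */
      for (int i = 0; i < out_width; i++) {
        out[i] = (unsigned char)((in[2 * i] + in[2 * i + 1] + bias) >> 1);
        bias ^= 1;
      }
    }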
/* h2v2 Downsampling */
EXTERN(void) jsimd_h2v2_downsample_mmx
- (JDIMENSION image_width, int max_v_samp_factor,
- JDIMENSION v_samp_factor, JDIMENSION width_blocks,
- JSAMPARRAY input_data, JSAMPARRAY output_data);
+ (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+ JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
EXTERN(void) jsimd_h2v2_downsample_sse2
- (JDIMENSION image_width, int max_v_samp_factor,
- JDIMENSION v_samp_factor, JDIMENSION width_blocks,
- JSAMPARRAY input_data, JSAMPARRAY output_data);
+ (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+ JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
+
+EXTERN(void) jsimd_h2v2_downsample_avx2
+ (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+ JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
EXTERN(void) jsimd_h2v2_downsample_neon
- (JDIMENSION image_width, int max_v_samp_factor,
- JDIMENSION v_samp_factor, JDIMENSION width_blocks,
- JSAMPARRAY input_data, JSAMPARRAY output_data);
+ (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+ JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
+
+EXTERN(void) jsimd_h2v2_downsample_dspr2
+ (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+ JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
-EXTERN(void) jsimd_h2v2_downsample_mips_dspr2
- (JDIMENSION image_width, int max_v_samp_factor,
- JDIMENSION v_samp_factor, JDIMENSION width_blocks,
- JSAMPARRAY input_data, JSAMPARRAY output_data);
+EXTERN(void) jsimd_h2v2_downsample_mmi
+ (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+ JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
EXTERN(void) jsimd_h2v2_downsample_altivec
- (JDIMENSION image_width, int max_v_samp_factor,
- JDIMENSION v_samp_factor, JDIMENSION width_blocks,
- JSAMPARRAY input_data, JSAMPARRAY output_data);
+ (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+ JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
/* h2v2 Smooth Downsampling */
-EXTERN(void) jsimd_h2v2_smooth_downsample_mips_dspr2
- (JSAMPARRAY input_data, JSAMPARRAY output_data,
- JDIMENSION v_samp_factor, int max_v_samp_factor,
- int smoothing_factor, JDIMENSION width_blocks,
- JDIMENSION image_width);
+EXTERN(void) jsimd_h2v2_smooth_downsample_dspr2
+ (JSAMPARRAY input_data, JSAMPARRAY output_data, JDIMENSION v_samp_factor,
+ int max_v_samp_factor, int smoothing_factor, JDIMENSION width_in_blocks,
+ JDIMENSION image_width);
/* Upsampling */
EXTERN(void) jsimd_h2v1_upsample_mmx
- (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr);
+ (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
EXTERN(void) jsimd_h2v2_upsample_mmx
- (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr);
+ (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
EXTERN(void) jsimd_h2v1_upsample_sse2
- (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr);
+ (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
EXTERN(void) jsimd_h2v2_upsample_sse2
- (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr);
-
-EXTERN(void) jsimd_h2v1_upsample_mips_dspr2
- (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr);
-EXTERN(void) jsimd_h2v2_upsample_mips_dspr2
- (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr);
-
-EXTERN(void) jsimd_int_upsample_mips_dspr2
- (UINT8 h_expand, UINT8 v_expand, JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr, JDIMENSION output_width,
- int max_v_samp_factor);
+ (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+
+EXTERN(void) jsimd_h2v1_upsample_avx2
+ (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_upsample_avx2
+ (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+
+EXTERN(void) jsimd_h2v1_upsample_neon
+ (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_upsample_neon
+ (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+
+EXTERN(void) jsimd_h2v1_upsample_dspr2
+ (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_upsample_dspr2
+ (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+
+EXTERN(void) jsimd_int_upsample_dspr2
+ (UINT8 h_expand, UINT8 v_expand, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr, JDIMENSION output_width,
+ int max_v_samp_factor);
EXTERN(void) jsimd_h2v1_upsample_altivec
- (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr);
+ (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
EXTERN(void) jsimd_h2v2_upsample_altivec
- (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr);
+ (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
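Plain upsampling is pixel replication: h2v1 doubles each chroma sample horizontally, and h2v2 additionally duplicates whole rows. A one-row sketch (names illustrative):

    /* Replicate each input sample into two adjacent output samples. */
    static void h2v1_upsample_row(const unsigned char *in,
                                  unsigned char *out, int in_width)
    {
      for (int i = 0; i < in_width; i++) {
        out[2 * i]     = in[i];
        out[2 * i + 1] = in[i];
      }
    }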
/* Fancy Upsampling */
EXTERN(void) jsimd_h2v1_fancy_upsample_mmx
- (int max_v_samp_factor, JDIMENSION downsampled_width,
- JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+ (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
EXTERN(void) jsimd_h2v2_fancy_upsample_mmx
- (int max_v_samp_factor, JDIMENSION downsampled_width,
- JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+ (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
extern const int jconst_fancy_upsample_sse2[];
EXTERN(void) jsimd_h2v1_fancy_upsample_sse2
- (int max_v_samp_factor, JDIMENSION downsampled_width,
- JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+ (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
EXTERN(void) jsimd_h2v2_fancy_upsample_sse2
- (int max_v_samp_factor, JDIMENSION downsampled_width,
- JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+ (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
-EXTERN(void) jsimd_h2v1_fancy_upsample_neon
- (int max_v_samp_factor, JDIMENSION downsampled_width,
- JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+extern const int jconst_fancy_upsample_avx2[];
+EXTERN(void) jsimd_h2v1_fancy_upsample_avx2
+ (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_fancy_upsample_avx2
+ (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
-EXTERN(void) jsimd_h2v1_fancy_upsample_mips_dspr2
- (int max_v_samp_factor, JDIMENSION downsampled_width,
- JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
-EXTERN(void) jsimd_h2v2_fancy_upsample_mips_dspr2
- (int max_v_samp_factor, JDIMENSION downsampled_width,
- JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v1_fancy_upsample_neon
+ (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_fancy_upsample_neon
+ (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h1v2_fancy_upsample_neon
+ (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+
+EXTERN(void) jsimd_h2v1_fancy_upsample_dspr2
+ (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_fancy_upsample_dspr2
+ (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+
+EXTERN(void) jsimd_h2v1_fancy_upsample_mmi
+ (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_fancy_upsample_mmi
+ (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
EXTERN(void) jsimd_h2v1_fancy_upsample_altivec
- (int max_v_samp_factor, JDIMENSION downsampled_width,
- JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+ (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
EXTERN(void) jsimd_h2v2_fancy_upsample_altivec
- (int max_v_samp_factor, JDIMENSION downsampled_width,
- JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+ (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
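Fancy upsampling replaces replication with a triangle filter: each output sample weights its nearest input 3x and its next-nearest 1x, which smooths chroma edges while staying cheap enough for SIMD. An interior-sample sketch using libjpeg's usual rounding offsets (edge samples need special-casing, omitted here):

    /* Triangle-filter one interior input sample into two outputs. */
    static void h2v1_fancy_pair(const unsigned char *in,
                                unsigned char *out, int i)
    {
      out[2 * i]     = (unsigned char)((in[i] * 3 + in[i - 1] + 1) >> 2);
      out[2 * i + 1] = (unsigned char)((in[i] * 3 + in[i + 1] + 2) >> 2);
    }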
/* Merged Upsampling */
EXTERN(void) jsimd_h2v1_merged_upsample_mmx
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_mmx
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_mmx
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_mmx
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_mmx
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_mmx
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_mmx
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v2_merged_upsample_mmx
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_mmx
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_mmx
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_mmx
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_mmx
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_mmx
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_mmx
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
extern const int jconst_merged_upsample_sse2[];
EXTERN(void) jsimd_h2v1_merged_upsample_sse2
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_sse2
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_sse2
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_sse2
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_sse2
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_sse2
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_sse2
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v2_merged_upsample_sse2
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_sse2
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_sse2
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_sse2
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_sse2
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_sse2
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_sse2
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
-
-EXTERN(void) jsimd_h2v1_merged_upsample_mips_dspr2
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range);
-EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_mips_dspr2
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range);
-EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_mips_dspr2
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range);
-EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_mips_dspr2
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range);
-EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_mips_dspr2
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range);
-EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_mips_dspr2
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range);
-EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_mips_dspr2
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range);
-
-EXTERN(void) jsimd_h2v2_merged_upsample_mips_dspr2
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range);
-EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_mips_dspr2
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range);
-EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_mips_dspr2
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range);
-EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_mips_dspr2
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range);
-EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_mips_dspr2
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range);
-EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_mips_dspr2
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range);
-EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_mips_dspr2
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+
+extern const int jconst_merged_upsample_avx2[];
+EXTERN(void) jsimd_h2v1_merged_upsample_avx2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_avx2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_avx2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_avx2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_avx2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_avx2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_avx2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+
+EXTERN(void) jsimd_h2v2_merged_upsample_avx2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_avx2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_avx2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_avx2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_avx2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_avx2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_avx2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+
+EXTERN(void) jsimd_h2v1_merged_upsample_neon
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_neon
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_neon
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_neon
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_neon
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_neon
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_neon
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+
+EXTERN(void) jsimd_h2v2_merged_upsample_neon
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_neon
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_neon
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_neon
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_neon
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_neon
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_neon
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+
+EXTERN(void) jsimd_h2v1_merged_upsample_dspr2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_dspr2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_dspr2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_dspr2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_dspr2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_dspr2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_dspr2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf, JSAMPLE *range);
+
+EXTERN(void) jsimd_h2v2_merged_upsample_dspr2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_dspr2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_dspr2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_dspr2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_dspr2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_dspr2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_dspr2
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf, JSAMPLE *range);
+
+EXTERN(void) jsimd_h2v1_merged_upsample_mmi
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_mmi
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_mmi
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_mmi
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_mmi
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_mmi
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_mmi
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+
+EXTERN(void) jsimd_h2v2_merged_upsample_mmi
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_mmi
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_mmi
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_mmi
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_mmi
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_mmi
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_mmi
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v1_merged_upsample_altivec
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_altivec
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_altivec
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_altivec
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_altivec
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_altivec
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_altivec
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v2_merged_upsample_altivec
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_altivec
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_altivec
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_altivec
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_altivec
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_altivec
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_altivec
- (JDIMENSION output_width, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
/* Sample Conversion */
EXTERN(void) jsimd_convsamp_mmx
- (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
+ (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
EXTERN(void) jsimd_convsamp_sse2
- (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
+ (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
+
+EXTERN(void) jsimd_convsamp_avx2
+ (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
EXTERN(void) jsimd_convsamp_neon
- (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
+ (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
-EXTERN(void) jsimd_convsamp_mips_dspr2
- (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
+EXTERN(void) jsimd_convsamp_dspr2
+ (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
EXTERN(void) jsimd_convsamp_altivec
- (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
+ (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
/* Floating Point Sample Conversion */
EXTERN(void) jsimd_convsamp_float_3dnow
- (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace);
+ (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace);
EXTERN(void) jsimd_convsamp_float_sse
- (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace);
+ (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace);
EXTERN(void) jsimd_convsamp_float_sse2
- (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace);
+ (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace);
-EXTERN(void) jsimd_convsamp_float_mips_dspr2
- (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace);
+EXTERN(void) jsimd_convsamp_float_dspr2
+ (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace);
-/* Slow Integer Forward DCT */
-EXTERN(void) jsimd_fdct_islow_mmx (DCTELEM *data);
+/* Accurate Integer Forward DCT */
+EXTERN(void) jsimd_fdct_islow_mmx(DCTELEM *data);
extern const int jconst_fdct_islow_sse2[];
-EXTERN(void) jsimd_fdct_islow_sse2 (DCTELEM *data);
+EXTERN(void) jsimd_fdct_islow_sse2(DCTELEM *data);
+
+extern const int jconst_fdct_islow_avx2[];
+EXTERN(void) jsimd_fdct_islow_avx2(DCTELEM *data);
+
+EXTERN(void) jsimd_fdct_islow_neon(DCTELEM *data);
-EXTERN(void) jsimd_fdct_islow_neon (DCTELEM *data);
+EXTERN(void) jsimd_fdct_islow_dspr2(DCTELEM *data);
-EXTERN(void) jsimd_fdct_islow_mips_dspr2 (DCTELEM *data);
+EXTERN(void) jsimd_fdct_islow_mmi(DCTELEM *data);
-EXTERN(void) jsimd_fdct_islow_altivec (DCTELEM *data);
+EXTERN(void) jsimd_fdct_islow_altivec(DCTELEM *data);
/* Fast Integer Forward DCT */
-EXTERN(void) jsimd_fdct_ifast_mmx (DCTELEM *data);
+EXTERN(void) jsimd_fdct_ifast_mmx(DCTELEM *data);
extern const int jconst_fdct_ifast_sse2[];
-EXTERN(void) jsimd_fdct_ifast_sse2 (DCTELEM *data);
+EXTERN(void) jsimd_fdct_ifast_sse2(DCTELEM *data);
-EXTERN(void) jsimd_fdct_ifast_neon (DCTELEM *data);
+EXTERN(void) jsimd_fdct_ifast_neon(DCTELEM *data);
-EXTERN(void) jsimd_fdct_ifast_mips_dspr2 (DCTELEM *data);
+EXTERN(void) jsimd_fdct_ifast_dspr2(DCTELEM *data);
-EXTERN(void) jsimd_fdct_ifast_altivec (DCTELEM *data);
+EXTERN(void) jsimd_fdct_ifast_mmi(DCTELEM *data);
+
+EXTERN(void) jsimd_fdct_ifast_altivec(DCTELEM *data);
/* Floating Point Forward DCT */
-EXTERN(void) jsimd_fdct_float_3dnow (FAST_FLOAT *data);
+EXTERN(void) jsimd_fdct_float_3dnow(FAST_FLOAT *data);
extern const int jconst_fdct_float_sse[];
-EXTERN(void) jsimd_fdct_float_sse (FAST_FLOAT *data);
+EXTERN(void) jsimd_fdct_float_sse(FAST_FLOAT *data);
/* Quantization */
EXTERN(void) jsimd_quantize_mmx
- (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
+ (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
EXTERN(void) jsimd_quantize_sse2
- (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
+ (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
+
+EXTERN(void) jsimd_quantize_avx2
+ (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
EXTERN(void) jsimd_quantize_neon
- (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
+ (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
+
+EXTERN(void) jsimd_quantize_dspr2
+ (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
-EXTERN(void) jsimd_quantize_mips_dspr2
- (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
+EXTERN(void) jsimd_quantize_mmi
+ (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
EXTERN(void) jsimd_quantize_altivec
- (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
+ (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
/* Floating Point Quantization */
EXTERN(void) jsimd_quantize_float_3dnow
- (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace);
+ (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace);
EXTERN(void) jsimd_quantize_float_sse
- (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace);
+ (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace);
EXTERN(void) jsimd_quantize_float_sse2
- (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace);
+ (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace);
-EXTERN(void) jsimd_quantize_float_mips_dspr2
- (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace);
+EXTERN(void) jsimd_quantize_float_dspr2
+ (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace);
/* Scaled Inverse DCT */
EXTERN(void) jsimd_idct_2x2_mmx
- (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col);
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
EXTERN(void) jsimd_idct_4x4_mmx
- (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col);
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
extern const int jconst_idct_red_sse2[];
EXTERN(void) jsimd_idct_2x2_sse2
- (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col);
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
EXTERN(void) jsimd_idct_4x4_sse2
- (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col);
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
EXTERN(void) jsimd_idct_2x2_neon
- (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col);
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
EXTERN(void) jsimd_idct_4x4_neon
- (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col);
-
-EXTERN(void) jsimd_idct_2x2_mips_dspr2
- (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col);
-EXTERN(void) jsimd_idct_4x4_mips_dspr2
- (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col, int *workspace);
-EXTERN(void) jsimd_idct_6x6_mips_dspr2
- (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col);
-EXTERN(void) jsimd_idct_12x12_pass1_mips_dspr2
- (JCOEFPTR coef_block, void *dct_table, int *workspace);
-EXTERN(void) jsimd_idct_12x12_pass2_mips_dspr2
- (int *workspace, int *output);
-
-/* Slow Integer Inverse DCT */
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
+
+EXTERN(void) jsimd_idct_2x2_dspr2
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
+EXTERN(void) jsimd_idct_4x4_dspr2
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col, int *workspace);
+EXTERN(void) jsimd_idct_6x6_dspr2
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
+EXTERN(void) jsimd_idct_12x12_pass1_dspr2
+ (JCOEFPTR coef_block, void *dct_table, int *workspace);
+EXTERN(void) jsimd_idct_12x12_pass2_dspr2
+ (int *workspace, int *output);
+
+/* Accurate Integer Inverse DCT */
EXTERN(void) jsimd_idct_islow_mmx
- (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col);
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
extern const int jconst_idct_islow_sse2[];
EXTERN(void) jsimd_idct_islow_sse2
- (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col);
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
+
+extern const int jconst_idct_islow_avx2[];
+EXTERN(void) jsimd_idct_islow_avx2
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
EXTERN(void) jsimd_idct_islow_neon
- (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col);
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
+
+EXTERN(void) jsimd_idct_islow_dspr2
+ (void *dct_table, JCOEFPTR coef_block, int *output_buf, JSAMPLE *output_col);
-EXTERN(void) jsimd_idct_islow_mips_dspr2
- (void *dct_table, JCOEFPTR coef_block, int *output_buf,
- JSAMPLE *output_col);
+EXTERN(void) jsimd_idct_islow_mmi
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
EXTERN(void) jsimd_idct_islow_altivec
- (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col);
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
/* Fast Integer Inverse DCT */
EXTERN(void) jsimd_idct_ifast_mmx
- (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col);
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
extern const int jconst_idct_ifast_sse2[];
EXTERN(void) jsimd_idct_ifast_sse2
- (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col);
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
EXTERN(void) jsimd_idct_ifast_neon
- (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col);
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
+
+EXTERN(void) jsimd_idct_ifast_cols_dspr2
+ (JCOEF *inptr, IFAST_MULT_TYPE *quantptr, DCTELEM *wsptr,
+ const int *idct_coefs);
+EXTERN(void) jsimd_idct_ifast_rows_dspr2
+ (DCTELEM *wsptr, JSAMPARRAY output_buf, JDIMENSION output_col,
+ const int *idct_coefs);
-EXTERN(void) jsimd_idct_ifast_cols_mips_dspr2
- (JCOEF *inptr, IFAST_MULT_TYPE *quantptr, DCTELEM *wsptr,
- const int *idct_coefs);
-EXTERN(void) jsimd_idct_ifast_rows_mips_dspr2
- (DCTELEM *wsptr, JSAMPARRAY output_buf, JDIMENSION output_col,
- const int *idct_coefs);
+EXTERN(void) jsimd_idct_ifast_mmi
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
EXTERN(void) jsimd_idct_ifast_altivec
- (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col);
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
/* Floating Point Inverse DCT */
EXTERN(void) jsimd_idct_float_3dnow
- (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col);
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
extern const int jconst_idct_float_sse[];
EXTERN(void) jsimd_idct_float_sse
- (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col);
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
extern const int jconst_idct_float_sse2[];
EXTERN(void) jsimd_idct_float_sse2
- (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col);
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
/* Huffman coding */
extern const int jconst_huff_encode_one_block[];
-EXTERN(JOCTET*) jsimd_huff_encode_one_block_sse2
- (void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val,
- c_derived_tbl *dctbl, c_derived_tbl *actbl);
+EXTERN(JOCTET *) jsimd_huff_encode_one_block_sse2
+ (void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val,
+ c_derived_tbl *dctbl, c_derived_tbl *actbl);
+
+EXTERN(JOCTET *) jsimd_huff_encode_one_block_neon
+ (void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val,
+ c_derived_tbl *dctbl, c_derived_tbl *actbl);
+
+#ifndef NEON_INTRINSICS
+
+EXTERN(JOCTET *) jsimd_huff_encode_one_block_neon_slowtbl
+ (void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val,
+ c_derived_tbl *dctbl, c_derived_tbl *actbl);
+
+#endif
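+
The slowtbl variant above is guarded because it exists only in the hand-written GAS implementation; when the NEON intrinsics implementation is built (NEON_INTRINSICS defined), only the plain kernel is available. A sketch of how a dispatcher might choose between the two, assuming the simd_features flag word and JSIMD_FASTTBL bit defined by the deleted jsimd_arm64.c further down:

    /* Sketch only: mirrors the selection logic of the old jsimd_arm64.c
       dispatcher; simd_features/JSIMD_FASTTBL are its file-local flag word
       and feature bit. */
    GLOBAL(JOCTET *)
    jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
                                int last_dc_val, c_derived_tbl *dctbl,
                                c_derived_tbl *actbl)
    {
    #ifndef NEON_INTRINSICS
      if (!(simd_features & JSIMD_FASTTBL))
        return jsimd_huff_encode_one_block_neon_slowtbl(state, buffer, block,
                                                        last_dc_val, dctbl,
                                                        actbl);
    #endif
      return jsimd_huff_encode_one_block_neon(state, buffer, block,
                                              last_dc_val, dctbl, actbl);
    }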
+
+/* Progressive Huffman encoding */
+EXTERN(void) jsimd_encode_mcu_AC_first_prepare_sse2
+ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+ JCOEF *values, size_t *zerobits);
+
+EXTERN(void) jsimd_encode_mcu_AC_first_prepare_neon
+ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+ JCOEF *values, size_t *zerobits);
-EXTERN(JOCTET*) jsimd_huff_encode_one_block_neon
- (void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val,
- c_derived_tbl *dctbl, c_derived_tbl *actbl);
+EXTERN(int) jsimd_encode_mcu_AC_refine_prepare_sse2
+ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+ JCOEF *absvalues, size_t *bits);
-EXTERN(JOCTET*) jsimd_huff_encode_one_block_neon_slowtbl
- (void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val,
- c_derived_tbl *dctbl, c_derived_tbl *actbl);
+EXTERN(int) jsimd_encode_mcu_AC_refine_prepare_neon
+ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+ JCOEF *absvalues, size_t *bits);
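
For readers unfamiliar with the IJG conventions used throughout these prototypes: EXTERN, GLOBAL, and LOCAL are linkage wrapper macros defined in jmorecfg.h, kept so that unusual linkers can redefine them. Their usual expansion:

    /* Linkage wrappers from jmorecfg.h (IJG convention): */
    #define EXTERN(type)  extern type   /* prototype for an entry point */
    #define GLOBAL(type)  type          /* definition with external linkage */
    #define LOCAL(type)   static type   /* file-private helper */

    /* So a prototype such as
         EXTERN(int) jsimd_encode_mcu_AC_refine_prepare_neon(...);
       declares an ordinary external function, here implemented in SIMD
       assembly or intrinsics. */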
diff --git a/media/libjpeg/simd/jsimd_arm.c b/media/libjpeg/simd/jsimd_arm.c
deleted file mode 100644
index 61cd073e1b..0000000000
--- a/media/libjpeg/simd/jsimd_arm.c
+++ /dev/null
@@ -1,727 +0,0 @@
-/*
- * jsimd_arm.c
- *
- * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2009-2011, 2013-2014, 2016, D. R. Commander.
- * Copyright (C) 2015-2016, Matthieu Darbois.
- *
- * Based on the x86 SIMD extension for IJG JPEG library,
- * Copyright (C) 1999-2006, MIYASAKA Masaru.
- * For conditions of distribution and use, see copyright notice in jsimdext.inc
- *
- * This file contains the interface between the "normal" portions
- * of the library and the SIMD implementations when running on a
- * 32-bit ARM architecture.
- */
-
-#define JPEG_INTERNALS
-#include "../jinclude.h"
-#include "../jpeglib.h"
-#include "../jsimd.h"
-#include "../jdct.h"
-#include "../jsimddct.h"
-#include "jsimd.h"
-
-#include <stdio.h>
-#include <string.h>
-#include <ctype.h>
-
-static unsigned int simd_support = ~0;
-static unsigned int simd_huffman = 1;
-
-#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
-
-#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT (1024 * 1024)
-
-LOCAL(int)
-check_feature (char *buffer, char *feature)
-{
- char *p;
- if (*feature == 0)
- return 0;
- if (strncmp(buffer, "Features", 8) != 0)
- return 0;
- buffer += 8;
- while (isspace(*buffer))
- buffer++;
-
- /* Check if 'feature' is present in the buffer as a separate word */
- while ((p = strstr(buffer, feature))) {
- if (p > buffer && !isspace(*(p - 1))) {
- buffer++;
- continue;
- }
- p += strlen(feature);
- if (*p != 0 && !isspace(*p)) {
- buffer++;
- continue;
- }
- return 1;
- }
- return 0;
-}
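-
check_feature() above first requires the line to begin with "Features" and then accepts the feature name only as a whole whitespace-delimited token, so "neon" does not match inside "neonfoo". A self-contained sketch of just the token test (has_word is a hypothetical name, not part of the library):

    #include <ctype.h>
    #include <stdio.h>
    #include <string.h>

    /* Word-boundary substring search, as in check_feature() above. */
    static int has_word(const char *s, const char *word)
    {
      const char *p = s;
      size_t n = strlen(word);

      if (n == 0)
        return 0;
      while ((p = strstr(p, word))) {
        int left_ok  = (p == s) || isspace((unsigned char)p[-1]);
        int right_ok = (p[n] == '\0') || isspace((unsigned char)p[n]);
        if (left_ok && right_ok)
          return 1;
        p++;                    /* partial match; keep scanning */
      }
      return 0;
    }

    int main(void)
    {
      printf("%d\n", has_word("half thumb fastmult vfp edsp neon", "neon"));
      /* -> 1 */
      printf("%d\n", has_word("half thumb neonfoo vfpv3", "neon"));
      /* -> 0 */
      return 0;
    }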
-
-LOCAL(int)
-parse_proc_cpuinfo (int bufsize)
-{
- char *buffer = (char *)malloc(bufsize);
- FILE *fd;
- simd_support = 0;
-
- if (!buffer)
- return 0;
-
- fd = fopen("/proc/cpuinfo", "r");
- if (fd) {
- while (fgets(buffer, bufsize, fd)) {
- if (!strchr(buffer, '\n') && !feof(fd)) {
- /* "impossible" happened - insufficient size of the buffer! */
- fclose(fd);
- free(buffer);
- return 0;
- }
- if (check_feature(buffer, "neon"))
- simd_support |= JSIMD_ARM_NEON;
- }
- fclose(fd);
- }
- free(buffer);
- return 1;
-}
-
-#endif
-
-/*
- * Check what SIMD accelerations are supported.
- *
- * FIXME: This code is racy in a multi-threaded environment.
- */
-LOCAL(void)
-init_simd (void)
-{
- char *env = NULL;
-#if !defined(__ARM_NEON__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__))
- int bufsize = 1024; /* an initial guess for the line buffer size limit */
-#endif
-
- if (simd_support != ~0U)
- return;
-
- simd_support = 0;
-
-#if defined(__ARM_NEON__)
- simd_support |= JSIMD_ARM_NEON;
-#elif defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
-  /* We still have a chance to use NEON, regardless of the -mcpu/-mfpu
-   * options passed to gcc, by performing runtime detection via
-   * /proc/cpuinfo parsing on Linux/Android. */
- while (!parse_proc_cpuinfo(bufsize)) {
- bufsize *= 2;
- if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT)
- break;
- }
-#endif
-
- /* Force different settings through environment variables */
- env = getenv("JSIMD_FORCENEON");
- if ((env != NULL) && (strcmp(env, "1") == 0))
- simd_support = JSIMD_ARM_NEON;
- env = getenv("JSIMD_FORCENONE");
- if ((env != NULL) && (strcmp(env, "1") == 0))
- simd_support = 0;
- env = getenv("JSIMD_NOHUFFENC");
- if ((env != NULL) && (strcmp(env, "1") == 0))
- simd_huffman = 0;
-}
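-
The three environment overrides are read once, inside the first init_simd() call, so they must be set before the first compress or decompress operation touches a SIMD code path (the 64-bit dispatcher below adds JSIMD_FASTLD3/JSIMD_FASTST3 on the same pattern). A hypothetical debugging snippet, assuming a POSIX setenv():

    #include <stdlib.h>

    /* Call before the first jpeg_start_compress()/jpeg_start_decompress(). */
    void disable_simd_for_debugging(void)
    {
      setenv("JSIMD_FORCENONE", "1", 1);  /* run only the portable C paths */
      /* setenv("JSIMD_FORCENEON", "1", 1);  force NEON, skip detection   */
      /* setenv("JSIMD_NOHUFFENC", "1", 1);  keep SIMD, C Huffman coding  */
    }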
-
-GLOBAL(int)
-jsimd_can_rgb_ycc (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
- return 0;
-
- if (simd_support & JSIMD_ARM_NEON)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_rgb_gray (void)
-{
- init_simd();
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_ycc_rgb (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
- return 0;
-
- if (simd_support & JSIMD_ARM_NEON)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_ycc_rgb565 (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_ARM_NEON)
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
- JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows)
-{
- void (*neonfct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
-
- switch(cinfo->in_color_space) {
- case JCS_EXT_RGB:
- neonfct=jsimd_extrgb_ycc_convert_neon;
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- neonfct=jsimd_extrgbx_ycc_convert_neon;
- break;
- case JCS_EXT_BGR:
- neonfct=jsimd_extbgr_ycc_convert_neon;
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- neonfct=jsimd_extbgrx_ycc_convert_neon;
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- neonfct=jsimd_extxbgr_ycc_convert_neon;
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- neonfct=jsimd_extxrgb_ycc_convert_neon;
- break;
- default:
- neonfct=jsimd_extrgb_ycc_convert_neon;
- break;
- }
-
- neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
-}
-
-GLOBAL(void)
-jsimd_rgb_gray_convert (j_compress_ptr cinfo,
- JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows)
-{
-}
-
-GLOBAL(void)
-jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
-{
- void (*neonfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
-
- switch(cinfo->out_color_space) {
- case JCS_EXT_RGB:
- neonfct=jsimd_ycc_extrgb_convert_neon;
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- neonfct=jsimd_ycc_extrgbx_convert_neon;
- break;
- case JCS_EXT_BGR:
- neonfct=jsimd_ycc_extbgr_convert_neon;
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- neonfct=jsimd_ycc_extbgrx_convert_neon;
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- neonfct=jsimd_ycc_extxbgr_convert_neon;
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- neonfct=jsimd_ycc_extxrgb_convert_neon;
- break;
- default:
- neonfct=jsimd_ycc_extrgb_convert_neon;
- break;
- }
-
- neonfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
-}
-
-GLOBAL(void)
-jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
-{
- jsimd_ycc_rgb565_convert_neon(cinfo->output_width, input_buf, input_row,
- output_buf, num_rows);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_downsample (void)
-{
- init_simd();
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_downsample (void)
-{
- init_simd();
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY output_data)
-{
-}
-
-GLOBAL(void)
-jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY output_data)
-{
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_upsample (void)
-{
- init_simd();
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_upsample (void)
-{
- init_simd();
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_upsample (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
-{
-}
-
-GLOBAL(void)
-jsimd_h2v1_upsample (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
-{
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_fancy_upsample (void)
-{
- init_simd();
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_fancy_upsample (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_ARM_NEON)
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
-{
-}
-
-GLOBAL(void)
-jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
-{
- jsimd_h2v1_fancy_upsample_neon(cinfo->max_v_samp_factor,
- compptr->downsampled_width, input_data,
- output_data_ptr);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_merged_upsample (void)
-{
- init_simd();
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_merged_upsample (void)
-{
- init_simd();
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr,
- JSAMPARRAY output_buf)
-{
-}
-
-GLOBAL(void)
-jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr,
- JSAMPARRAY output_buf)
-{
-}
-
-GLOBAL(int)
-jsimd_can_convsamp (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(DCTELEM) != 2)
- return 0;
-
- if (simd_support & JSIMD_ARM_NEON)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_convsamp_float (void)
-{
- init_simd();
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
- DCTELEM *workspace)
-{
- jsimd_convsamp_neon(sample_data, start_col, workspace);
-}
-
-GLOBAL(void)
-jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
- FAST_FLOAT *workspace)
-{
-}
-
-GLOBAL(int)
-jsimd_can_fdct_islow (void)
-{
- init_simd();
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_fdct_ifast (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(DCTELEM) != 2)
- return 0;
-
- if (simd_support & JSIMD_ARM_NEON)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_fdct_float (void)
-{
- init_simd();
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_fdct_islow (DCTELEM *data)
-{
-}
-
-GLOBAL(void)
-jsimd_fdct_ifast (DCTELEM *data)
-{
- jsimd_fdct_ifast_neon(data);
-}
-
-GLOBAL(void)
-jsimd_fdct_float (FAST_FLOAT *data)
-{
-}
-
-GLOBAL(int)
-jsimd_can_quantize (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (sizeof(DCTELEM) != 2)
- return 0;
-
- if (simd_support & JSIMD_ARM_NEON)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_quantize_float (void)
-{
- init_simd();
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_quantize (JCOEFPTR coef_block, DCTELEM *divisors,
- DCTELEM *workspace)
-{
- jsimd_quantize_neon(coef_block, divisors, workspace);
-}
-
-GLOBAL(void)
-jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors,
- FAST_FLOAT *workspace)
-{
-}
-
-GLOBAL(int)
-jsimd_can_idct_2x2 (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(ISLOW_MULT_TYPE) != 2)
- return 0;
-
- if (simd_support & JSIMD_ARM_NEON)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_4x4 (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(ISLOW_MULT_TYPE) != 2)
- return 0;
-
- if (simd_support & JSIMD_ARM_NEON)
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
- jsimd_idct_2x2_neon(compptr->dct_table, coef_block, output_buf,
- output_col);
-}
-
-GLOBAL(void)
-jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
- jsimd_idct_4x4_neon(compptr->dct_table, coef_block, output_buf,
- output_col);
-}
-
-GLOBAL(int)
-jsimd_can_idct_islow (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(ISLOW_MULT_TYPE) != 2)
- return 0;
-
- if (simd_support & JSIMD_ARM_NEON)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_ifast (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(IFAST_MULT_TYPE) != 2)
- return 0;
- if (IFAST_SCALE_BITS != 2)
- return 0;
-
- if (simd_support & JSIMD_ARM_NEON)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_float (void)
-{
- init_simd();
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
- jsimd_idct_islow_neon(compptr->dct_table, coef_block, output_buf,
- output_col);
-}
-
-GLOBAL(void)
-jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
- jsimd_idct_ifast_neon(compptr->dct_table, coef_block, output_buf,
- output_col);
-}
-
-GLOBAL(void)
-jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
-}
-
-GLOBAL(int)
-jsimd_can_huff_encode_one_block (void)
-{
- init_simd();
-
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
-
- if (simd_support & JSIMD_ARM_NEON && simd_huffman)
- return 1;
-
- return 0;
-}
-
-GLOBAL(JOCTET*)
-jsimd_huff_encode_one_block (void *state, JOCTET *buffer, JCOEFPTR block,
- int last_dc_val, c_derived_tbl *dctbl,
- c_derived_tbl *actbl)
-{
- return jsimd_huff_encode_one_block_neon(state, buffer, block, last_dc_val,
- dctbl, actbl);
-}
diff --git a/media/libjpeg/simd/jsimd_arm64.c b/media/libjpeg/simd/jsimd_arm64.c
deleted file mode 100644
index 09449bb6fd..0000000000
--- a/media/libjpeg/simd/jsimd_arm64.c
+++ /dev/null
@@ -1,802 +0,0 @@
-/*
- * jsimd_arm64.c
- *
- * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2009-2011, 2013-2014, 2016, D. R. Commander.
- * Copyright (C) 2015-2016, Matthieu Darbois.
- *
- * Based on the x86 SIMD extension for IJG JPEG library,
- * Copyright (C) 1999-2006, MIYASAKA Masaru.
- * For conditions of distribution and use, see copyright notice in jsimdext.inc
- *
- * This file contains the interface between the "normal" portions
- * of the library and the SIMD implementations when running on a
- * 64-bit ARM architecture.
- */
-
-#define JPEG_INTERNALS
-#include "../jinclude.h"
-#include "../jpeglib.h"
-#include "../jsimd.h"
-#include "../jdct.h"
-#include "../jsimddct.h"
-#include "jsimd.h"
-
-#include <stdio.h>
-#include <string.h>
-#include <ctype.h>
-
-#define JSIMD_FASTLD3 1
-#define JSIMD_FASTST3 2
-#define JSIMD_FASTTBL 4
-
-static unsigned int simd_support = ~0;
-static unsigned int simd_huffman = 1;
-static unsigned int simd_features = JSIMD_FASTLD3 | JSIMD_FASTST3 |
- JSIMD_FASTTBL;
-
-#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
-
-#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT (1024 * 1024)
-
-LOCAL(int)
-check_cpuinfo (char *buffer, const char *field, char *value)
-{
- char *p;
- if (*value == 0)
- return 0;
- if (strncmp(buffer, field, strlen(field)) != 0)
- return 0;
- buffer += strlen(field);
- while (isspace(*buffer))
- buffer++;
-
- /* Check if 'value' is present in the buffer as a separate word */
- while ((p = strstr(buffer, value))) {
- if (p > buffer && !isspace(*(p - 1))) {
- buffer++;
- continue;
- }
- p += strlen(value);
- if (*p != 0 && !isspace(*p)) {
- buffer++;
- continue;
- }
- return 1;
- }
- return 0;
-}
-
-LOCAL(int)
-parse_proc_cpuinfo (int bufsize)
-{
- char *buffer = (char *)malloc(bufsize);
- FILE *fd;
-
- if (!buffer)
- return 0;
-
- fd = fopen("/proc/cpuinfo", "r");
- if (fd) {
- while (fgets(buffer, bufsize, fd)) {
- if (!strchr(buffer, '\n') && !feof(fd)) {
- /* "impossible" happened - insufficient size of the buffer! */
- fclose(fd);
- free(buffer);
- return 0;
- }
- if (check_cpuinfo(buffer, "CPU part", "0xd03") ||
- check_cpuinfo(buffer, "CPU part", "0xd07"))
- /* The Cortex-A53 has a slow tbl implementation. We can gain a few
- percent speedup by disabling the use of that instruction. The
- speedup on Cortex-A57 is more subtle but still measurable. */
- simd_features &= ~JSIMD_FASTTBL;
- else if (check_cpuinfo(buffer, "CPU part", "0x0a1"))
- /* The SIMD version of Huffman encoding is slower than the C version on
-           Cavium ThunderX.  Also, ld3 and st3 are abysmally slow on that
- CPU. */
- simd_huffman = simd_features = 0;
- }
- fclose(fd);
- }
- free(buffer);
- return 1;
-}
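-
check_cpuinfo() matches a "field: value" pair the same way check_feature() does in the 32-bit file above. The part numbers tested here are ARM MIDR values: 0xd03 is Cortex-A53, 0xd07 Cortex-A57, and 0x0a1 the Cavium ThunderX core. A runnable illustration of the effect (the matching shown is a simplification of check_cpuinfo()):

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
      const char *line = "CPU part\t: 0xd03";  /* one /proc/cpuinfo line */

      /* Simplified field/value test in the spirit of check_cpuinfo(). */
      if (strncmp(line, "CPU part", 8) == 0 && strstr(line, "0xd03"))
        puts("Cortex-A53: clear JSIMD_FASTTBL (slow tbl instruction)");
      return 0;
    }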
-
-#endif
-
-/*
- * Check what SIMD accelerations are supported.
- *
- * FIXME: This code is racy in a multi-threaded environment.
- */
-
-/*
- * ARMv8 architectures support NEON extensions by default.
- * Unlike ARMv7, where NEON support was optional, it is a mandatory
- * part of ARMv8.
- */
-
-
-LOCAL(void)
-init_simd (void)
-{
- char *env = NULL;
-#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
- int bufsize = 1024; /* an initial guess for the line buffer size limit */
-#endif
-
- if (simd_support != ~0U)
- return;
-
- simd_support = 0;
-
- simd_support |= JSIMD_ARM_NEON;
-#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
- while (!parse_proc_cpuinfo(bufsize)) {
- bufsize *= 2;
- if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT)
- break;
- }
-#endif
-
- /* Force different settings through environment variables */
- env = getenv("JSIMD_FORCENEON");
- if ((env != NULL) && (strcmp(env, "1") == 0))
- simd_support = JSIMD_ARM_NEON;
- env = getenv("JSIMD_FORCENONE");
- if ((env != NULL) && (strcmp(env, "1") == 0))
- simd_support = 0;
- env = getenv("JSIMD_NOHUFFENC");
- if ((env != NULL) && (strcmp(env, "1") == 0))
- simd_huffman = 0;
- env = getenv("JSIMD_FASTLD3");
- if ((env != NULL) && (strcmp(env, "1") == 0))
- simd_features |= JSIMD_FASTLD3;
- if ((env != NULL) && (strcmp(env, "0") == 0))
- simd_features &= ~JSIMD_FASTLD3;
- env = getenv("JSIMD_FASTST3");
- if ((env != NULL) && (strcmp(env, "1") == 0))
- simd_features |= JSIMD_FASTST3;
- if ((env != NULL) && (strcmp(env, "0") == 0))
- simd_features &= ~JSIMD_FASTST3;
-}
-
-GLOBAL(int)
-jsimd_can_rgb_ycc (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
- return 0;
-
- if (simd_support & JSIMD_ARM_NEON)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_rgb_gray (void)
-{
- init_simd();
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_ycc_rgb (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
- return 0;
-
- if (simd_support & JSIMD_ARM_NEON)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_ycc_rgb565 (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_ARM_NEON)
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
- JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows)
-{
- void (*neonfct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
-
- switch(cinfo->in_color_space) {
- case JCS_EXT_RGB:
- if (simd_features & JSIMD_FASTLD3)
- neonfct=jsimd_extrgb_ycc_convert_neon;
- else
- neonfct=jsimd_extrgb_ycc_convert_neon_slowld3;
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- neonfct=jsimd_extrgbx_ycc_convert_neon;
- break;
- case JCS_EXT_BGR:
- if (simd_features & JSIMD_FASTLD3)
- neonfct=jsimd_extbgr_ycc_convert_neon;
- else
- neonfct=jsimd_extbgr_ycc_convert_neon_slowld3;
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- neonfct=jsimd_extbgrx_ycc_convert_neon;
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- neonfct=jsimd_extxbgr_ycc_convert_neon;
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- neonfct=jsimd_extxrgb_ycc_convert_neon;
- break;
- default:
- if (simd_features & JSIMD_FASTLD3)
- neonfct=jsimd_extrgb_ycc_convert_neon;
- else
- neonfct=jsimd_extrgb_ycc_convert_neon_slowld3;
- break;
- }
-
- neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
-}
-
-GLOBAL(void)
-jsimd_rgb_gray_convert (j_compress_ptr cinfo,
- JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows)
-{
-}
-
-GLOBAL(void)
-jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
-{
- void (*neonfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
-
- switch(cinfo->out_color_space) {
- case JCS_EXT_RGB:
- if (simd_features & JSIMD_FASTST3)
- neonfct=jsimd_ycc_extrgb_convert_neon;
- else
- neonfct=jsimd_ycc_extrgb_convert_neon_slowst3;
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- neonfct=jsimd_ycc_extrgbx_convert_neon;
- break;
- case JCS_EXT_BGR:
- if (simd_features & JSIMD_FASTST3)
- neonfct=jsimd_ycc_extbgr_convert_neon;
- else
- neonfct=jsimd_ycc_extbgr_convert_neon_slowst3;
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- neonfct=jsimd_ycc_extbgrx_convert_neon;
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- neonfct=jsimd_ycc_extxbgr_convert_neon;
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- neonfct=jsimd_ycc_extxrgb_convert_neon;
- break;
- default:
- if (simd_features & JSIMD_FASTST3)
- neonfct=jsimd_ycc_extrgb_convert_neon;
- else
- neonfct=jsimd_ycc_extrgb_convert_neon_slowst3;
- break;
- }
-
- neonfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
-}
-
-GLOBAL(void)
-jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
-{
- jsimd_ycc_rgb565_convert_neon(cinfo->output_width, input_buf, input_row,
- output_buf, num_rows);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_downsample (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_ARM_NEON)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_downsample (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_ARM_NEON)
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY output_data)
-{
- jsimd_h2v2_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor,
- compptr->v_samp_factor, compptr->width_in_blocks,
- input_data, output_data);
-}
-
-GLOBAL(void)
-jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY output_data)
-{
- jsimd_h2v1_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor,
- compptr->v_samp_factor, compptr->width_in_blocks,
- input_data, output_data);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_upsample (void)
-{
- init_simd();
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_upsample (void)
-{
- init_simd();
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_upsample (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
-{
-}
-
-GLOBAL(void)
-jsimd_h2v1_upsample (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
-{
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_fancy_upsample (void)
-{
- init_simd();
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_fancy_upsample (void)
-{
- init_simd();
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
-{
-}
-
-GLOBAL(void)
-jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
-{
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_merged_upsample (void)
-{
- init_simd();
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_merged_upsample (void)
-{
- init_simd();
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr,
- JSAMPARRAY output_buf)
-{
-}
-
-GLOBAL(void)
-jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr,
- JSAMPARRAY output_buf)
-{
-}
-
-GLOBAL(int)
-jsimd_can_convsamp (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(DCTELEM) != 2)
- return 0;
-
- if (simd_support & JSIMD_ARM_NEON)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_convsamp_float (void)
-{
- init_simd();
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
- DCTELEM *workspace)
-{
- jsimd_convsamp_neon(sample_data, start_col, workspace);
-}
-
-GLOBAL(void)
-jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
- FAST_FLOAT *workspace)
-{
-}
-
-GLOBAL(int)
-jsimd_can_fdct_islow (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(DCTELEM) != 2)
- return 0;
-
- if (simd_support & JSIMD_ARM_NEON)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_fdct_ifast (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(DCTELEM) != 2)
- return 0;
-
- if (simd_support & JSIMD_ARM_NEON)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_fdct_float (void)
-{
- init_simd();
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_fdct_islow (DCTELEM *data)
-{
- jsimd_fdct_islow_neon(data);
-}
-
-GLOBAL(void)
-jsimd_fdct_ifast (DCTELEM *data)
-{
- jsimd_fdct_ifast_neon(data);
-}
-
-GLOBAL(void)
-jsimd_fdct_float (FAST_FLOAT *data)
-{
-}
-
-GLOBAL(int)
-jsimd_can_quantize (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (sizeof(DCTELEM) != 2)
- return 0;
-
- if (simd_support & JSIMD_ARM_NEON)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_quantize_float (void)
-{
- init_simd();
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_quantize (JCOEFPTR coef_block, DCTELEM *divisors,
- DCTELEM *workspace)
-{
- jsimd_quantize_neon(coef_block, divisors, workspace);
-}
-
-GLOBAL(void)
-jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors,
- FAST_FLOAT *workspace)
-{
-}
-
-GLOBAL(int)
-jsimd_can_idct_2x2 (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(ISLOW_MULT_TYPE) != 2)
- return 0;
-
- if (simd_support & JSIMD_ARM_NEON)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_4x4 (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(ISLOW_MULT_TYPE) != 2)
- return 0;
-
- if (simd_support & JSIMD_ARM_NEON)
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
- jsimd_idct_2x2_neon(compptr->dct_table, coef_block, output_buf,
- output_col);
-}
-
-GLOBAL(void)
-jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
- jsimd_idct_4x4_neon(compptr->dct_table, coef_block, output_buf,
- output_col);
-}
-
-GLOBAL(int)
-jsimd_can_idct_islow (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(ISLOW_MULT_TYPE) != 2)
- return 0;
-
- if (simd_support & JSIMD_ARM_NEON)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_ifast (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(IFAST_MULT_TYPE) != 2)
- return 0;
- if (IFAST_SCALE_BITS != 2)
- return 0;
-
- if (simd_support & JSIMD_ARM_NEON)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_float (void)
-{
- init_simd();
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
- jsimd_idct_islow_neon(compptr->dct_table, coef_block, output_buf,
- output_col);
-}
-
-GLOBAL(void)
-jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
- jsimd_idct_ifast_neon(compptr->dct_table, coef_block, output_buf,
- output_col);
-}
-
-GLOBAL(void)
-jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
-}
-
-GLOBAL(int)
-jsimd_can_huff_encode_one_block (void)
-{
- init_simd();
-
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
-
- if (simd_support & JSIMD_ARM_NEON && simd_huffman)
- return 1;
-
- return 0;
-}
-
-GLOBAL(JOCTET*)
-jsimd_huff_encode_one_block (void *state, JOCTET *buffer, JCOEFPTR block,
- int last_dc_val, c_derived_tbl *dctbl,
- c_derived_tbl *actbl)
-{
- if (simd_features & JSIMD_FASTTBL)
- return jsimd_huff_encode_one_block_neon(state, buffer, block, last_dc_val,
- dctbl, actbl);
- else
- return jsimd_huff_encode_one_block_neon_slowtbl(state, buffer, block,
- last_dc_val, dctbl, actbl);
-}
diff --git a/media/libjpeg/simd/jsimd_arm_neon.S b/media/libjpeg/simd/jsimd_arm_neon.S
deleted file mode 100644
index cd2612724a..0000000000
--- a/media/libjpeg/simd/jsimd_arm_neon.S
+++ /dev/null
@@ -1,2878 +0,0 @@
-/*
- * ARMv7 NEON optimizations for libjpeg-turbo
- *
- * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies).
- * All Rights Reserved.
- * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
- * Copyright (C) 2014, Siarhei Siamashka. All Rights Reserved.
- * Copyright (C) 2014, Linaro Limited. All Rights Reserved.
- * Copyright (C) 2015, D. R. Commander. All Rights Reserved.
- * Copyright (C) 2015-2016, Matthieu Darbois. All Rights Reserved.
- *
- * This software is provided 'as-is', without any express or implied
- * warranty. In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- * claim that you wrote the original software. If you use this software
- * in a product, an acknowledgment in the product documentation would be
- * appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- * misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-#if defined(__linux__) && defined(__ELF__)
-.section .note.GNU-stack, "", %progbits /* mark stack as non-executable */
-#endif
-
-.text
-.fpu neon
-.arch armv7a
-.object_arch armv4
-.arm
-.syntax unified
-
-
-#define RESPECT_STRICT_ALIGNMENT 1
-
-
-/*****************************************************************************/
-
-/* Supplementary macro for setting function attributes */
-.macro asm_function fname
-#ifdef __APPLE__
- .globl _\fname
-_\fname:
-#else
- .global \fname
-#ifdef __ELF__
- .hidden \fname
- .type \fname, %function
-#endif
-\fname:
-#endif
-.endm
-
-/* Transpose a block of 4x4 coefficients in four 64-bit registers */
-.macro transpose_4x4 x0, x1, x2, x3
- vtrn.16 \x0, \x1
- vtrn.16 \x2, \x3
- vtrn.32 \x0, \x2
- vtrn.32 \x1, \x3
-.endm
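-
The vtrn.16/vtrn.32 sequence performs an in-register 4x4 transpose of 16-bit elements: the two vtrn.16 swap adjacent 16-bit lanes, and the two vtrn.32 swap 32-bit pairs across registers. A scalar C equivalent, for reference only (not part of the library):

    #include <stdint.h>

    /* Transpose a 4x4 block of 16-bit values in place: m[r][c] -> m[c][r].
       The NEON macro achieves the same with two vtrn.16 + two vtrn.32. */
    static void transpose_4x4_scalar(int16_t m[4][4])
    {
      for (int r = 0; r < 4; r++)
        for (int c = r + 1; c < 4; c++) {
          int16_t t = m[r][c];
          m[r][c] = m[c][r];
          m[c][r] = t;
        }
    }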
-
-
-#define CENTERJSAMPLE 128
-
-/*****************************************************************************/
-
-/*
- * Perform dequantization and inverse DCT on one block of coefficients.
- *
- * GLOBAL(void)
- * jsimd_idct_islow_neon (void *dct_table, JCOEFPTR coef_block,
- * JSAMPARRAY output_buf, JDIMENSION output_col)
- */
-
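-
jsimd_idct_islow_neon dequantizes while loading (the vmul.s16 sequence in the body below) and then runs the two 1-D passes. A scalar sketch of the dequantize step, using plain 16-bit types since the jsimd_can_idct_islow() checks elsewhere in this diff require both JCOEF and ISLOW_MULT_TYPE to be 2 bytes:

    #include <stdint.h>

    /* Scalar equivalent of the vmul.s16 dequantize-on-load step (sketch). */
    static void dequantize_islow(const int16_t coef_block[64],
                                 const int16_t quantptr[64],
                                 int16_t out[64])
    {
      for (int i = 0; i < 64; i++)
        out[i] = (int16_t)(coef_block[i] * quantptr[i]);
    }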
-#define FIX_0_298631336 (2446)
-#define FIX_0_390180644 (3196)
-#define FIX_0_541196100 (4433)
-#define FIX_0_765366865 (6270)
-#define FIX_0_899976223 (7373)
-#define FIX_1_175875602 (9633)
-#define FIX_1_501321110 (12299)
-#define FIX_1_847759065 (15137)
-#define FIX_1_961570560 (16069)
-#define FIX_2_053119869 (16819)
-#define FIX_2_562915447 (20995)
-#define FIX_3_072711026 (25172)
-
-#define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560)
-#define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644)
-#define FIX_0_541196100_MINUS_1_847759065 (FIX_0_541196100 - FIX_1_847759065)
-#define FIX_3_072711026_MINUS_2_562915447 (FIX_3_072711026 - FIX_2_562915447)
-#define FIX_0_298631336_MINUS_0_899976223 (FIX_0_298631336 - FIX_0_899976223)
-#define FIX_1_501321110_MINUS_0_899976223 (FIX_1_501321110 - FIX_0_899976223)
-#define FIX_2_053119869_MINUS_2_562915447 (FIX_2_053119869 - FIX_2_562915447)
-#define FIX_0_541196100_PLUS_0_765366865 (FIX_0_541196100 + FIX_0_765366865)
-
-/*
- * Reference SIMD-friendly 1-D ISLOW iDCT C implementation.
- * Uses some ideas from the comments in 'simd/jiss2int-64.asm'
- */
-#define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7) \
-{ \
- DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \
- JLONG q1, q2, q3, q4, q5, q6, q7; \
- JLONG tmp11_plus_tmp2, tmp11_minus_tmp2; \
- \
- /* 1-D iDCT input data */ \
- row0 = xrow0; \
- row1 = xrow1; \
- row2 = xrow2; \
- row3 = xrow3; \
- row4 = xrow4; \
- row5 = xrow5; \
- row6 = xrow6; \
- row7 = xrow7; \
- \
- q5 = row7 + row3; \
- q4 = row5 + row1; \
- q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \
- MULTIPLY(q4, FIX_1_175875602); \
- q7 = MULTIPLY(q5, FIX_1_175875602) + \
- MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \
- q2 = MULTIPLY(row2, FIX_0_541196100) + \
- MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \
- q4 = q6; \
- q3 = ((JLONG) row0 - (JLONG) row4) << 13; \
- q6 += MULTIPLY(row5, -FIX_2_562915447) + \
- MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \
- /* now we can use q1 (reloadable constants have been used up) */ \
- q1 = q3 + q2; \
- q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \
- MULTIPLY(row1, -FIX_0_899976223); \
- q5 = q7; \
- q1 = q1 + q6; \
- q7 += MULTIPLY(row7, -FIX_0_899976223) + \
- MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \
- \
- /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \
- tmp11_plus_tmp2 = q1; \
- row1 = 0; \
- \
- q1 = q1 - q6; \
- q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \
- MULTIPLY(row3, -FIX_2_562915447); \
- q1 = q1 - q6; \
- q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \
- MULTIPLY(row6, FIX_0_541196100); \
- q3 = q3 - q2; \
- \
- /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \
- tmp11_minus_tmp2 = q1; \
- \
- q1 = ((JLONG) row0 + (JLONG) row4) << 13; \
- q2 = q1 + q6; \
- q1 = q1 - q6; \
- \
- /* pick up the results */ \
- tmp0 = q4; \
- tmp1 = q5; \
- tmp2 = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \
- tmp3 = q7; \
- tmp10 = q2; \
- tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \
- tmp12 = q3; \
- tmp13 = q1; \
-}
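-
The FIX_* constants are the ISLOW rotation factors in 13-bit fixed point, matching CONST_BITS = 13 in jidctint.c: FIX_x = round(x * 2^13), which is also why the DC terms in the macro above are shifted left by 13. The pre-combined MINUS/PLUS constants fold the usual common-subexpression rearrangement (for example a*(x+y) - b*y = a*x + (a-b)*y) into single multiply-accumulates. A quick self-check of two of the values:

    #include <assert.h>
    #include <math.h>

    int main(void)
    {
      /* FIX_x = round(x * 2^13); spot-check against the defines above. */
      assert((long)floor(0.541196100 * 8192.0 + 0.5) == 4433);
      assert((long)floor(1.175875602 * 8192.0 + 0.5) == 9633);
      return 0;
    }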
-
-#define XFIX_0_899976223 d0[0]
-#define XFIX_0_541196100 d0[1]
-#define XFIX_2_562915447 d0[2]
-#define XFIX_0_298631336_MINUS_0_899976223 d0[3]
-#define XFIX_1_501321110_MINUS_0_899976223 d1[0]
-#define XFIX_2_053119869_MINUS_2_562915447 d1[1]
-#define XFIX_0_541196100_PLUS_0_765366865 d1[2]
-#define XFIX_1_175875602 d1[3]
-#define XFIX_1_175875602_MINUS_0_390180644 d2[0]
-#define XFIX_0_541196100_MINUS_1_847759065 d2[1]
-#define XFIX_3_072711026_MINUS_2_562915447 d2[2]
-#define XFIX_1_175875602_MINUS_1_961570560 d2[3]
-
-.balign 16
-jsimd_idct_islow_neon_consts:
- .short FIX_0_899976223 /* d0[0] */
- .short FIX_0_541196100 /* d0[1] */
- .short FIX_2_562915447 /* d0[2] */
- .short FIX_0_298631336_MINUS_0_899976223 /* d0[3] */
- .short FIX_1_501321110_MINUS_0_899976223 /* d1[0] */
- .short FIX_2_053119869_MINUS_2_562915447 /* d1[1] */
- .short FIX_0_541196100_PLUS_0_765366865 /* d1[2] */
- .short FIX_1_175875602 /* d1[3] */
- /* reloadable constants */
- .short FIX_1_175875602_MINUS_0_390180644 /* d2[0] */
- .short FIX_0_541196100_MINUS_1_847759065 /* d2[1] */
- .short FIX_3_072711026_MINUS_2_562915447 /* d2[2] */
- .short FIX_1_175875602_MINUS_1_961570560 /* d2[3] */
-
-asm_function jsimd_idct_islow_neon
-
- DCT_TABLE .req r0
- COEF_BLOCK .req r1
- OUTPUT_BUF .req r2
- OUTPUT_COL .req r3
- TMP1 .req r0
- TMP2 .req r1
- TMP3 .req r2
- TMP4 .req ip
-
- ROW0L .req d16
- ROW0R .req d17
- ROW1L .req d18
- ROW1R .req d19
- ROW2L .req d20
- ROW2R .req d21
- ROW3L .req d22
- ROW3R .req d23
- ROW4L .req d24
- ROW4R .req d25
- ROW5L .req d26
- ROW5R .req d27
- ROW6L .req d28
- ROW6R .req d29
- ROW7L .req d30
- ROW7R .req d31
-
- /* Load and dequantize coefficients into NEON registers
- * with the following allocation:
- * 0 1 2 3 | 4 5 6 7
- * ---------+--------
- * 0 | d16 | d17 ( q8 )
- * 1 | d18 | d19 ( q9 )
- * 2 | d20 | d21 ( q10 )
- * 3 | d22 | d23 ( q11 )
- * 4 | d24 | d25 ( q12 )
- * 5 | d26 | d27 ( q13 )
- * 6 | d28 | d29 ( q14 )
- * 7 | d30 | d31 ( q15 )
- */
- adr ip, jsimd_idct_islow_neon_consts
- vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
- vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
- vld1.16 {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
- vmul.s16 q8, q8, q0
- vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
- vmul.s16 q9, q9, q1
- vld1.16 {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
- vmul.s16 q10, q10, q2
- vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
- vmul.s16 q11, q11, q3
- vld1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]
- vmul.s16 q12, q12, q0
- vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
- vmul.s16 q14, q14, q2
- vmul.s16 q13, q13, q1
- vld1.16 {d0, d1, d2, d3}, [ip, :128] /* load constants */
- add ip, ip, #16
- vmul.s16 q15, q15, q3
- vpush {d8-d15} /* save NEON registers */
- /* 1-D IDCT, pass 1, left 4x8 half */
- vadd.s16 d4, ROW7L, ROW3L
- vadd.s16 d5, ROW5L, ROW1L
- vmull.s16 q6, d4, XFIX_1_175875602_MINUS_1_961570560
- vmlal.s16 q6, d5, XFIX_1_175875602
- vmull.s16 q7, d4, XFIX_1_175875602
-    /* Check whether the right 4x8 half contains only zero coefficients */
- push {r4, r5}
- vmlal.s16 q7, d5, XFIX_1_175875602_MINUS_0_390180644
- vsubl.s16 q3, ROW0L, ROW4L
- ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
- vmull.s16 q2, ROW2L, XFIX_0_541196100
- vmlal.s16 q2, ROW6L, XFIX_0_541196100_MINUS_1_847759065
- orr r0, r4, r5
- vmov q4, q6
- vmlsl.s16 q6, ROW5L, XFIX_2_562915447
- ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
- vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
- vshl.s32 q3, q3, #13
- orr r0, r0, r4
- vmlsl.s16 q4, ROW1L, XFIX_0_899976223
- orr r0, r0, r5
- vadd.s32 q1, q3, q2
- ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
- vmov q5, q7
- vadd.s32 q1, q1, q6
- orr r0, r0, r4
- vmlsl.s16 q7, ROW7L, XFIX_0_899976223
- orr r0, r0, r5
- vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
- vrshrn.s32 ROW1L, q1, #11
- ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
- vsub.s32 q1, q1, q6
- vmlal.s16 q5, ROW5L, XFIX_2_053119869_MINUS_2_562915447
- orr r0, r0, r4
- vmlsl.s16 q5, ROW3L, XFIX_2_562915447
- orr r0, r0, r5
- vsub.s32 q1, q1, q6
- vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
- ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
- vmlal.s16 q6, ROW6L, XFIX_0_541196100
- vsub.s32 q3, q3, q2
- orr r0, r0, r4
- vrshrn.s32 ROW6L, q1, #11
- orr r0, r0, r5
- vadd.s32 q1, q3, q5
- ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
- vsub.s32 q3, q3, q5
- vaddl.s16 q5, ROW0L, ROW4L
- orr r0, r0, r4
- vrshrn.s32 ROW2L, q1, #11
- orr r0, r0, r5
- vrshrn.s32 ROW5L, q3, #11
- ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
- vshl.s32 q5, q5, #13
- vmlal.s16 q4, ROW7L, XFIX_0_298631336_MINUS_0_899976223
- orr r0, r0, r4
- vadd.s32 q2, q5, q6
- orrs r0, r0, r5
- vsub.s32 q1, q5, q6
- vadd.s32 q6, q2, q7
- ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
- vsub.s32 q2, q2, q7
- vadd.s32 q5, q1, q4
- orr r0, r4, r5
- vsub.s32 q3, q1, q4
- pop {r4, r5}
- vrshrn.s32 ROW7L, q2, #11
- vrshrn.s32 ROW3L, q5, #11
- vrshrn.s32 ROW0L, q6, #11
- vrshrn.s32 ROW4L, q3, #11
-
-    beq 3f /* Branch to the special handling for the sparse
-              right 4x8 half */
-
- /* 1-D IDCT, pass 1, right 4x8 half */
- vld1.s16 {d2}, [ip, :64] /* reload constants */
- vadd.s16 d10, ROW7R, ROW3R
- vadd.s16 d8, ROW5R, ROW1R
- /* Transpose left 4x8 half */
- vtrn.16 ROW6L, ROW7L
- vmull.s16 q6, d10, XFIX_1_175875602_MINUS_1_961570560
- vmlal.s16 q6, d8, XFIX_1_175875602
- vtrn.16 ROW2L, ROW3L
- vmull.s16 q7, d10, XFIX_1_175875602
- vmlal.s16 q7, d8, XFIX_1_175875602_MINUS_0_390180644
- vtrn.16 ROW0L, ROW1L
- vsubl.s16 q3, ROW0R, ROW4R
- vmull.s16 q2, ROW2R, XFIX_0_541196100
- vmlal.s16 q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
- vtrn.16 ROW4L, ROW5L
- vmov q4, q6
- vmlsl.s16 q6, ROW5R, XFIX_2_562915447
- vmlal.s16 q6, ROW3R, XFIX_3_072711026_MINUS_2_562915447
- vtrn.32 ROW1L, ROW3L
- vshl.s32 q3, q3, #13
- vmlsl.s16 q4, ROW1R, XFIX_0_899976223
- vtrn.32 ROW4L, ROW6L
- vadd.s32 q1, q3, q2
- vmov q5, q7
- vadd.s32 q1, q1, q6
- vtrn.32 ROW0L, ROW2L
- vmlsl.s16 q7, ROW7R, XFIX_0_899976223
- vmlal.s16 q7, ROW1R, XFIX_1_501321110_MINUS_0_899976223
- vrshrn.s32 ROW1R, q1, #11
- vtrn.32 ROW5L, ROW7L
- vsub.s32 q1, q1, q6
- vmlal.s16 q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
- vmlsl.s16 q5, ROW3R, XFIX_2_562915447
- vsub.s32 q1, q1, q6
- vmull.s16 q6, ROW2R, XFIX_0_541196100_PLUS_0_765366865
- vmlal.s16 q6, ROW6R, XFIX_0_541196100
- vsub.s32 q3, q3, q2
- vrshrn.s32 ROW6R, q1, #11
- vadd.s32 q1, q3, q5
- vsub.s32 q3, q3, q5
- vaddl.s16 q5, ROW0R, ROW4R
- vrshrn.s32 ROW2R, q1, #11
- vrshrn.s32 ROW5R, q3, #11
- vshl.s32 q5, q5, #13
- vmlal.s16 q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
- vadd.s32 q2, q5, q6
- vsub.s32 q1, q5, q6
- vadd.s32 q6, q2, q7
- vsub.s32 q2, q2, q7
- vadd.s32 q5, q1, q4
- vsub.s32 q3, q1, q4
- vrshrn.s32 ROW7R, q2, #11
- vrshrn.s32 ROW3R, q5, #11
- vrshrn.s32 ROW0R, q6, #11
- vrshrn.s32 ROW4R, q3, #11
- /* Transpose right 4x8 half */
- vtrn.16 ROW6R, ROW7R
- vtrn.16 ROW2R, ROW3R
- vtrn.16 ROW0R, ROW1R
- vtrn.16 ROW4R, ROW5R
- vtrn.32 ROW1R, ROW3R
- vtrn.32 ROW4R, ROW6R
- vtrn.32 ROW0R, ROW2R
- vtrn.32 ROW5R, ROW7R
-
-1: /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */
- vld1.s16 {d2}, [ip, :64] /* reload constants */
- vmull.s16 q6, ROW1R, XFIX_1_175875602 /* ROW5L <-> ROW1R */
- vmlal.s16 q6, ROW1L, XFIX_1_175875602
- vmlal.s16 q6, ROW3R, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */
- vmlal.s16 q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
- vmull.s16 q7, ROW3R, XFIX_1_175875602 /* ROW7L <-> ROW3R */
- vmlal.s16 q7, ROW3L, XFIX_1_175875602
- vmlal.s16 q7, ROW1R, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */
- vmlal.s16 q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
- vsubl.s16 q3, ROW0L, ROW0R /* ROW4L <-> ROW0R */
- vmull.s16 q2, ROW2L, XFIX_0_541196100
- vmlal.s16 q2, ROW2R, XFIX_0_541196100_MINUS_1_847759065 /* ROW6L <-> ROW2R */
- vmov q4, q6
- vmlsl.s16 q6, ROW1R, XFIX_2_562915447 /* ROW5L <-> ROW1R */
- vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
- vshl.s32 q3, q3, #13
- vmlsl.s16 q4, ROW1L, XFIX_0_899976223
- vadd.s32 q1, q3, q2
- vmov q5, q7
- vadd.s32 q1, q1, q6
- vmlsl.s16 q7, ROW3R, XFIX_0_899976223 /* ROW7L <-> ROW3R */
- vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
- vshrn.s32 ROW1L, q1, #16
- vsub.s32 q1, q1, q6
- vmlal.s16 q5, ROW1R, XFIX_2_053119869_MINUS_2_562915447 /* ROW5L <-> ROW1R */
- vmlsl.s16 q5, ROW3L, XFIX_2_562915447
- vsub.s32 q1, q1, q6
- vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
- vmlal.s16 q6, ROW2R, XFIX_0_541196100 /* ROW6L <-> ROW2R */
- vsub.s32 q3, q3, q2
- vshrn.s32 ROW2R, q1, #16 /* ROW6L <-> ROW2R */
- vadd.s32 q1, q3, q5
- vsub.s32 q3, q3, q5
- vaddl.s16 q5, ROW0L, ROW0R /* ROW4L <-> ROW0R */
- vshrn.s32 ROW2L, q1, #16
- vshrn.s32 ROW1R, q3, #16 /* ROW5L <-> ROW1R */
- vshl.s32 q5, q5, #13
- vmlal.s16 q4, ROW3R, XFIX_0_298631336_MINUS_0_899976223 /* ROW7L <-> ROW3R */
- vadd.s32 q2, q5, q6
- vsub.s32 q1, q5, q6
- vadd.s32 q6, q2, q7
- vsub.s32 q2, q2, q7
- vadd.s32 q5, q1, q4
- vsub.s32 q3, q1, q4
- vshrn.s32 ROW3R, q2, #16 /* ROW7L <-> ROW3R */
- vshrn.s32 ROW3L, q5, #16
- vshrn.s32 ROW0L, q6, #16
- vshrn.s32 ROW0R, q3, #16 /* ROW4L <-> ROW0R */
- /* 1-D IDCT, pass 2, right 4x8 half */
- vld1.s16 {d2}, [ip, :64] /* reload constants */
- vmull.s16 q6, ROW5R, XFIX_1_175875602
- vmlal.s16 q6, ROW5L, XFIX_1_175875602 /* ROW5L <-> ROW1R */
- vmlal.s16 q6, ROW7R, XFIX_1_175875602_MINUS_1_961570560
- vmlal.s16 q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */
- vmull.s16 q7, ROW7R, XFIX_1_175875602
- vmlal.s16 q7, ROW7L, XFIX_1_175875602 /* ROW7L <-> ROW3R */
- vmlal.s16 q7, ROW5R, XFIX_1_175875602_MINUS_0_390180644
- vmlal.s16 q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */
- vsubl.s16 q3, ROW4L, ROW4R /* ROW4L <-> ROW0R */
- vmull.s16 q2, ROW6L, XFIX_0_541196100 /* ROW6L <-> ROW2R */
- vmlal.s16 q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
- vmov q4, q6
- vmlsl.s16 q6, ROW5R, XFIX_2_562915447
- vmlal.s16 q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447 /* ROW7L <-> ROW3R */
- vshl.s32 q3, q3, #13
- vmlsl.s16 q4, ROW5L, XFIX_0_899976223 /* ROW5L <-> ROW1R */
- vadd.s32 q1, q3, q2
- vmov q5, q7
- vadd.s32 q1, q1, q6
- vmlsl.s16 q7, ROW7R, XFIX_0_899976223
- vmlal.s16 q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223 /* ROW5L <-> ROW1R */
- vshrn.s32 ROW5L, q1, #16 /* ROW5L <-> ROW1R */
- vsub.s32 q1, q1, q6
- vmlal.s16 q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
- vmlsl.s16 q5, ROW7L, XFIX_2_562915447 /* ROW7L <-> ROW3R */
- vsub.s32 q1, q1, q6
- vmull.s16 q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865 /* ROW6L <-> ROW2R */
- vmlal.s16 q6, ROW6R, XFIX_0_541196100
- vsub.s32 q3, q3, q2
- vshrn.s32 ROW6R, q1, #16
- vadd.s32 q1, q3, q5
- vsub.s32 q3, q3, q5
- vaddl.s16 q5, ROW4L, ROW4R /* ROW4L <-> ROW0R */
- vshrn.s32 ROW6L, q1, #16 /* ROW6L <-> ROW2R */
- vshrn.s32 ROW5R, q3, #16
- vshl.s32 q5, q5, #13
- vmlal.s16 q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
- vadd.s32 q2, q5, q6
- vsub.s32 q1, q5, q6
- vadd.s32 q6, q2, q7
- vsub.s32 q2, q2, q7
- vadd.s32 q5, q1, q4
- vsub.s32 q3, q1, q4
- vshrn.s32 ROW7R, q2, #16
- vshrn.s32 ROW7L, q5, #16 /* ROW7L <-> ROW3R */
- vshrn.s32 ROW4L, q6, #16 /* ROW4L <-> ROW0R */
- vshrn.s32 ROW4R, q3, #16
-
-2: /* Descale to 8-bit and range limit */
- vqrshrn.s16 d16, q8, #2
- vqrshrn.s16 d17, q9, #2
- vqrshrn.s16 d18, q10, #2
- vqrshrn.s16 d19, q11, #2
- vpop {d8-d15} /* restore NEON registers */
- vqrshrn.s16 d20, q12, #2
- /* Transpose the final 8-bit samples and do signed->unsigned conversion */
- vtrn.16 q8, q9
- vqrshrn.s16 d21, q13, #2
- vqrshrn.s16 d22, q14, #2
- vmov.u8 q0, #(CENTERJSAMPLE)
- vqrshrn.s16 d23, q15, #2
- vtrn.8 d16, d17
- vtrn.8 d18, d19
- vadd.u8 q8, q8, q0
- vadd.u8 q9, q9, q0
- vtrn.16 q10, q11
- /* Store results to the output buffer */
- ldmia OUTPUT_BUF!, {TMP1, TMP2}
- add TMP1, TMP1, OUTPUT_COL
- add TMP2, TMP2, OUTPUT_COL
- vst1.8 {d16}, [TMP1]
- vtrn.8 d20, d21
- vst1.8 {d17}, [TMP2]
- ldmia OUTPUT_BUF!, {TMP1, TMP2}
- add TMP1, TMP1, OUTPUT_COL
- add TMP2, TMP2, OUTPUT_COL
- vst1.8 {d18}, [TMP1]
- vadd.u8 q10, q10, q0
- vst1.8 {d19}, [TMP2]
- ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
- add TMP1, TMP1, OUTPUT_COL
- add TMP2, TMP2, OUTPUT_COL
- add TMP3, TMP3, OUTPUT_COL
- add TMP4, TMP4, OUTPUT_COL
- vtrn.8 d22, d23
- vst1.8 {d20}, [TMP1]
- vadd.u8 q11, q11, q0
- vst1.8 {d21}, [TMP2]
- vst1.8 {d22}, [TMP3]
- vst1.8 {d23}, [TMP4]
- bx lr
-
-3: /* Left 4x8 half is done, right 4x8 half contains mostly zeros */
-
- /* Transpose left 4x8 half */
- vtrn.16 ROW6L, ROW7L
- vtrn.16 ROW2L, ROW3L
- vtrn.16 ROW0L, ROW1L
- vtrn.16 ROW4L, ROW5L
- vshl.s16 ROW0R, ROW0R, #2 /* PASS1_BITS */
- vtrn.32 ROW1L, ROW3L
- vtrn.32 ROW4L, ROW6L
- vtrn.32 ROW0L, ROW2L
- vtrn.32 ROW5L, ROW7L
-
- cmp r0, #0
- beq 4f /* Right 4x8 half has all zeros, go to 'sparse' second
- pass */
-
- /* Only row 0 is non-zero for the right 4x8 half */
- vdup.s16 ROW1R, ROW0R[1]
- vdup.s16 ROW2R, ROW0R[2]
- vdup.s16 ROW3R, ROW0R[3]
- vdup.s16 ROW4R, ROW0R[0]
- vdup.s16 ROW5R, ROW0R[1]
- vdup.s16 ROW6R, ROW0R[2]
- vdup.s16 ROW7R, ROW0R[3]
- vdup.s16 ROW0R, ROW0R[0]
- b 1b /* Go to 'normal' second pass */
-
-4: /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
- vld1.s16 {d2}, [ip, :64] /* reload constants */
- vmull.s16 q6, ROW1L, XFIX_1_175875602
- vmlal.s16 q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
- vmull.s16 q7, ROW3L, XFIX_1_175875602
- vmlal.s16 q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
- vmull.s16 q2, ROW2L, XFIX_0_541196100
- vshll.s16 q3, ROW0L, #13
- vmov q4, q6
- vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
- vmlsl.s16 q4, ROW1L, XFIX_0_899976223
- vadd.s32 q1, q3, q2
- vmov q5, q7
- vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
- vadd.s32 q1, q1, q6
- vadd.s32 q6, q6, q6
- vmlsl.s16 q5, ROW3L, XFIX_2_562915447
- vshrn.s32 ROW1L, q1, #16
- vsub.s32 q1, q1, q6
- vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
- vsub.s32 q3, q3, q2
- vshrn.s32 ROW2R, q1, #16 /* ROW6L <-> ROW2R */
- vadd.s32 q1, q3, q5
- vsub.s32 q3, q3, q5
- vshll.s16 q5, ROW0L, #13
- vshrn.s32 ROW2L, q1, #16
- vshrn.s32 ROW1R, q3, #16 /* ROW5L <-> ROW1R */
- vadd.s32 q2, q5, q6
- vsub.s32 q1, q5, q6
- vadd.s32 q6, q2, q7
- vsub.s32 q2, q2, q7
- vadd.s32 q5, q1, q4
- vsub.s32 q3, q1, q4
- vshrn.s32 ROW3R, q2, #16 /* ROW7L <-> ROW3R */
- vshrn.s32 ROW3L, q5, #16
- vshrn.s32 ROW0L, q6, #16
- vshrn.s32 ROW0R, q3, #16 /* ROW4L <-> ROW0R */
- /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */
- vld1.s16 {d2}, [ip, :64] /* reload constants */
- vmull.s16 q6, ROW5L, XFIX_1_175875602
- vmlal.s16 q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560
- vmull.s16 q7, ROW7L, XFIX_1_175875602
- vmlal.s16 q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644
- vmull.s16 q2, ROW6L, XFIX_0_541196100
- vshll.s16 q3, ROW4L, #13
- vmov q4, q6
- vmlal.s16 q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447
- vmlsl.s16 q4, ROW5L, XFIX_0_899976223
- vadd.s32 q1, q3, q2
- vmov q5, q7
- vmlal.s16 q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223
- vadd.s32 q1, q1, q6
- vadd.s32 q6, q6, q6
- vmlsl.s16 q5, ROW7L, XFIX_2_562915447
- vshrn.s32 ROW5L, q1, #16 /* ROW5L <-> ROW1R */
- vsub.s32 q1, q1, q6
- vmull.s16 q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865
- vsub.s32 q3, q3, q2
- vshrn.s32 ROW6R, q1, #16
- vadd.s32 q1, q3, q5
- vsub.s32 q3, q3, q5
- vshll.s16 q5, ROW4L, #13
- vshrn.s32 ROW6L, q1, #16 /* ROW6L <-> ROW2R */
- vshrn.s32 ROW5R, q3, #16
- vadd.s32 q2, q5, q6
- vsub.s32 q1, q5, q6
- vadd.s32 q6, q2, q7
- vsub.s32 q2, q2, q7
- vadd.s32 q5, q1, q4
- vsub.s32 q3, q1, q4
- vshrn.s32 ROW7R, q2, #16
- vshrn.s32 ROW7L, q5, #16 /* ROW7L <-> ROW3R */
- vshrn.s32 ROW4L, q6, #16 /* ROW4L <-> ROW0R */
- vshrn.s32 ROW4R, q3, #16
- b 2b /* Go to epilogue */
-
- .unreq DCT_TABLE
- .unreq COEF_BLOCK
- .unreq OUTPUT_BUF
- .unreq OUTPUT_COL
- .unreq TMP1
- .unreq TMP2
- .unreq TMP3
- .unreq TMP4
-
- .unreq ROW0L
- .unreq ROW0R
- .unreq ROW1L
- .unreq ROW1R
- .unreq ROW2L
- .unreq ROW2R
- .unreq ROW3L
- .unreq ROW3R
- .unreq ROW4L
- .unreq ROW4R
- .unreq ROW5L
- .unreq ROW5R
- .unreq ROW6L
- .unreq ROW6R
- .unreq ROW7L
- .unreq ROW7R
-
-
-/*****************************************************************************/
-
-/*
- * jsimd_idct_ifast_neon
- *
- * This function contains a fast, but less accurate, integer implementation
- * of the inverse DCT (Discrete Cosine Transform). It uses the same
- * calculations and produces exactly the same output as IJG's original
- * 'jpeg_idct_ifast' function from jidctfst.c.
- *
- * Normally a 1-D AAN DCT needs 5 multiplications and 29 additions.
- * But in the ARM NEON case some extra additions are required, because the
- * VQDMULH instruction can't handle constants larger than 1. So expressions
- * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x",
- * which introduces an extra addition. Overall, there are 6 extra additions
- * per 1-D IDCT pass, for a total of 5 VQDMULH and 35 VADD/VSUB instructions.
- */
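-
-/* A rough C model of the VQDMULH trick (saturation omitted): VQDMULH
- * computes approximately (a * b) >> 15, so a multiplier c > 1 is split
- * into a fractional part handled by VQDMULH plus an integer part handled
- * by plain additions; the constants below are stored with 8-bit precision
- * scaled into Q15, e.g. (277 * 128 - 256 * 128) for 1.082392200.
- *
- *   static inline int16_t vqdmulh_s16(int16_t a, int16_t b)
- *   {
- *     return (int16_t) (((int32_t) a * b) >> 15);
- *   }
- *
- *   static inline int16_t mul_1_082392200(int16_t x)
- *   {
- *     // x * 1.082392200 == x * 0.082392200 + x
- *     return vqdmulh_s16(x, 277 * 128 - 256 * 128) + x;
- *   }
- */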
-
-#define XFIX_1_082392200 d0[0]
-#define XFIX_1_414213562 d0[1]
-#define XFIX_1_847759065 d0[2]
-#define XFIX_2_613125930 d0[3]
-
-.balign 16
-jsimd_idct_ifast_neon_consts:
- .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */
- .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */
- .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */
- .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */
-
-asm_function jsimd_idct_ifast_neon
-
- DCT_TABLE .req r0
- COEF_BLOCK .req r1
- OUTPUT_BUF .req r2
- OUTPUT_COL .req r3
- TMP1 .req r0
- TMP2 .req r1
- TMP3 .req r2
- TMP4 .req ip
-
- /* Load and dequantize coefficients into NEON registers
- * with the following allocation:
- * 0 1 2 3 | 4 5 6 7
- * ---------+--------
- * 0 | d16 | d17 ( q8 )
- * 1 | d18 | d19 ( q9 )
- * 2 | d20 | d21 ( q10 )
- * 3 | d22 | d23 ( q11 )
- * 4 | d24 | d25 ( q12 )
- * 5 | d26 | d27 ( q13 )
- * 6 | d28 | d29 ( q14 )
- * 7 | d30 | d31 ( q15 )
- */
- adr ip, jsimd_idct_ifast_neon_consts
- vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
- vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
- vld1.16 {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
- vmul.s16 q8, q8, q0
- vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
- vmul.s16 q9, q9, q1
- vld1.16 {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
- vmul.s16 q10, q10, q2
- vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
- vmul.s16 q11, q11, q3
- vld1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]
- vmul.s16 q12, q12, q0
- vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
- vmul.s16 q14, q14, q2
- vmul.s16 q13, q13, q1
- vld1.16 {d0}, [ip, :64] /* load constants */
- vmul.s16 q15, q15, q3
- vpush {d8-d13} /* save NEON registers */
- /* 1-D IDCT, pass 1 */
- vsub.s16 q2, q10, q14
- vadd.s16 q14, q10, q14
- vsub.s16 q1, q11, q13
- vadd.s16 q13, q11, q13
- vsub.s16 q5, q9, q15
- vadd.s16 q15, q9, q15
- vqdmulh.s16 q4, q2, XFIX_1_414213562
- vqdmulh.s16 q6, q1, XFIX_2_613125930
- vadd.s16 q3, q1, q1
- vsub.s16 q1, q5, q1
- vadd.s16 q10, q2, q4
- vqdmulh.s16 q4, q1, XFIX_1_847759065
- vsub.s16 q2, q15, q13
- vadd.s16 q3, q3, q6
- vqdmulh.s16 q6, q2, XFIX_1_414213562
- vadd.s16 q1, q1, q4
- vqdmulh.s16 q4, q5, XFIX_1_082392200
- vsub.s16 q10, q10, q14
- vadd.s16 q2, q2, q6
- vsub.s16 q6, q8, q12
- vadd.s16 q12, q8, q12
- vadd.s16 q9, q5, q4
- vadd.s16 q5, q6, q10
- vsub.s16 q10, q6, q10
- vadd.s16 q6, q15, q13
- vadd.s16 q8, q12, q14
- vsub.s16 q3, q6, q3
- vsub.s16 q12, q12, q14
- vsub.s16 q3, q3, q1
- vsub.s16 q1, q9, q1
- vadd.s16 q2, q3, q2
- vsub.s16 q15, q8, q6
- vadd.s16 q1, q1, q2
- vadd.s16 q8, q8, q6
- vadd.s16 q14, q5, q3
- vsub.s16 q9, q5, q3
- vsub.s16 q13, q10, q2
- vadd.s16 q10, q10, q2
- /* Transpose */
- vtrn.16 q8, q9
- vsub.s16 q11, q12, q1
- vtrn.16 q14, q15
- vadd.s16 q12, q12, q1
- vtrn.16 q10, q11
- vtrn.16 q12, q13
- vtrn.32 q9, q11
- vtrn.32 q12, q14
- vtrn.32 q8, q10
- vtrn.32 q13, q15
- vswp d28, d21
- vswp d26, d19
- /* 1-D IDCT, pass 2 */
- vsub.s16 q2, q10, q14
- vswp d30, d23
- vadd.s16 q14, q10, q14
- vswp d24, d17
- vsub.s16 q1, q11, q13
- vadd.s16 q13, q11, q13
- vsub.s16 q5, q9, q15
- vadd.s16 q15, q9, q15
- vqdmulh.s16 q4, q2, XFIX_1_414213562
- vqdmulh.s16 q6, q1, XFIX_2_613125930
- vadd.s16 q3, q1, q1
- vsub.s16 q1, q5, q1
- vadd.s16 q10, q2, q4
- vqdmulh.s16 q4, q1, XFIX_1_847759065
- vsub.s16 q2, q15, q13
- vadd.s16 q3, q3, q6
- vqdmulh.s16 q6, q2, XFIX_1_414213562
- vadd.s16 q1, q1, q4
- vqdmulh.s16 q4, q5, XFIX_1_082392200
- vsub.s16 q10, q10, q14
- vadd.s16 q2, q2, q6
- vsub.s16 q6, q8, q12
- vadd.s16 q12, q8, q12
- vadd.s16 q9, q5, q4
- vadd.s16 q5, q6, q10
- vsub.s16 q10, q6, q10
- vadd.s16 q6, q15, q13
- vadd.s16 q8, q12, q14
- vsub.s16 q3, q6, q3
- vsub.s16 q12, q12, q14
- vsub.s16 q3, q3, q1
- vsub.s16 q1, q9, q1
- vadd.s16 q2, q3, q2
- vsub.s16 q15, q8, q6
- vadd.s16 q1, q1, q2
- vadd.s16 q8, q8, q6
- vadd.s16 q14, q5, q3
- vsub.s16 q9, q5, q3
- vsub.s16 q13, q10, q2
- vpop {d8-d13} /* restore NEON registers */
- vadd.s16 q10, q10, q2
- vsub.s16 q11, q12, q1
- vadd.s16 q12, q12, q1
- /* Descale to 8-bit and range limit */
- vmov.u8 q0, #0x80
- vqshrn.s16 d16, q8, #5
- vqshrn.s16 d17, q9, #5
- vqshrn.s16 d18, q10, #5
- vqshrn.s16 d19, q11, #5
- vqshrn.s16 d20, q12, #5
- vqshrn.s16 d21, q13, #5
- vqshrn.s16 d22, q14, #5
- vqshrn.s16 d23, q15, #5
- vadd.u8 q8, q8, q0
- vadd.u8 q9, q9, q0
- vadd.u8 q10, q10, q0
- vadd.u8 q11, q11, q0
- /* Transpose the final 8-bit samples */
- vtrn.16 q8, q9
- vtrn.16 q10, q11
- vtrn.32 q8, q10
- vtrn.32 q9, q11
- vtrn.8 d16, d17
- vtrn.8 d18, d19
- /* Store results to the output buffer */
- ldmia OUTPUT_BUF!, {TMP1, TMP2}
- add TMP1, TMP1, OUTPUT_COL
- add TMP2, TMP2, OUTPUT_COL
- vst1.8 {d16}, [TMP1]
- vst1.8 {d17}, [TMP2]
- ldmia OUTPUT_BUF!, {TMP1, TMP2}
- add TMP1, TMP1, OUTPUT_COL
- add TMP2, TMP2, OUTPUT_COL
- vst1.8 {d18}, [TMP1]
- vtrn.8 d20, d21
- vst1.8 {d19}, [TMP2]
- ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
- add TMP1, TMP1, OUTPUT_COL
- add TMP2, TMP2, OUTPUT_COL
- add TMP3, TMP3, OUTPUT_COL
- add TMP4, TMP4, OUTPUT_COL
- vst1.8 {d20}, [TMP1]
- vtrn.8 d22, d23
- vst1.8 {d21}, [TMP2]
- vst1.8 {d22}, [TMP3]
- vst1.8 {d23}, [TMP4]
- bx lr
-
- .unreq DCT_TABLE
- .unreq COEF_BLOCK
- .unreq OUTPUT_BUF
- .unreq OUTPUT_COL
- .unreq TMP1
- .unreq TMP2
- .unreq TMP3
- .unreq TMP4
-
-
-/*****************************************************************************/
-
-/*
- * jsimd_idct_4x4_neon
- *
- * This function contains inverse-DCT code for producing reduced-size
- * 4x4 pixel output from an 8x8 DCT block. It uses the same calculations
- * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
- * function from jpeg-6b (jidctred.c).
- *
- * NOTE: jpeg-8 has an improved implementation of the 4x4 inverse-DCT, which
- * requires far fewer arithmetic operations and hence should be faster.
- * The primary purpose of this particular NEON optimized function is
- * bit-exact compatibility with jpeg-6b.
- *
- * TODO: slightly better instruction scheduling could be achieved by
- * expanding the idct_helper/transpose_4x4 macros and reordering
- * instructions, but readability would suffer somewhat.
- */
-
-#define CONST_BITS 13
-
-#define FIX_0_211164243 (1730) /* FIX(0.211164243) */
-#define FIX_0_509795579 (4176) /* FIX(0.509795579) */
-#define FIX_0_601344887 (4926) /* FIX(0.601344887) */
-#define FIX_0_720959822 (5906) /* FIX(0.720959822) */
-#define FIX_0_765366865 (6270) /* FIX(0.765366865) */
-#define FIX_0_850430095 (6967) /* FIX(0.850430095) */
-#define FIX_0_899976223 (7373) /* FIX(0.899976223) */
-#define FIX_1_061594337 (8697) /* FIX(1.061594337) */
-#define FIX_1_272758580 (10426) /* FIX(1.272758580) */
-#define FIX_1_451774981 (11893) /* FIX(1.451774981) */
-#define FIX_1_847759065 (15137) /* FIX(1.847759065) */
-#define FIX_2_172734803 (17799) /* FIX(2.172734803) */
-#define FIX_2_562915447 (20995) /* FIX(2.562915447) */
-#define FIX_3_624509785 (29692) /* FIX(3.624509785) */
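-
-/* The constants above follow the usual libjpeg fixed-point convention
- * (a sketch mirroring the FIX() macro from the scalar code):
- *
- *   #define FIX(x)  ((int32_t) ((x) * (1 << CONST_BITS) + 0.5))
- *
- * so that, e.g., FIX(1.847759065) == 15137 with CONST_BITS == 13.
- */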
-
-.balign 16
-jsimd_idct_4x4_neon_consts:
- .short FIX_1_847759065 /* d0[0] */
- .short -FIX_0_765366865 /* d0[1] */
- .short -FIX_0_211164243 /* d0[2] */
- .short FIX_1_451774981 /* d0[3] */
- .short -FIX_2_172734803 /* d1[0] */
- .short FIX_1_061594337 /* d1[1] */
- .short -FIX_0_509795579 /* d1[2] */
- .short -FIX_0_601344887 /* d1[3] */
- .short FIX_0_899976223 /* d2[0] */
- .short FIX_2_562915447 /* d2[1] */
- .short 1 << (CONST_BITS+1) /* d2[2] */
- .short 0 /* d2[3] */
-
-.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
- vmull.s16 q14, \x4, d2[2]
- vmlal.s16 q14, \x8, d0[0]
- vmlal.s16 q14, \x14, d0[1]
-
- vmull.s16 q13, \x16, d1[2]
- vmlal.s16 q13, \x12, d1[3]
- vmlal.s16 q13, \x10, d2[0]
- vmlal.s16 q13, \x6, d2[1]
-
- vmull.s16 q15, \x4, d2[2]
- vmlsl.s16 q15, \x8, d0[0]
- vmlsl.s16 q15, \x14, d0[1]
-
- vmull.s16 q12, \x16, d0[2]
- vmlal.s16 q12, \x12, d0[3]
- vmlal.s16 q12, \x10, d1[0]
- vmlal.s16 q12, \x6, d1[1]
-
- vadd.s32 q10, q14, q13
- vsub.s32 q14, q14, q13
-
- .if \shift > 16
- vrshr.s32 q10, q10, #\shift
- vrshr.s32 q14, q14, #\shift
- vmovn.s32 \y26, q10
- vmovn.s32 \y29, q14
- .else
- vrshrn.s32 \y26, q10, #\shift
- vrshrn.s32 \y29, q14, #\shift
- .endif
-
- vadd.s32 q10, q15, q12
- vsub.s32 q15, q15, q12
-
- .if \shift > 16
- vrshr.s32 q10, q10, #\shift
- vrshr.s32 q15, q15, #\shift
- vmovn.s32 \y27, q10
- vmovn.s32 \y28, q15
- .else
- vrshrn.s32 \y27, q10, #\shift
- vrshrn.s32 \y28, q15, #\shift
- .endif
-.endm
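-
-/* The \shift > 16 branch above exists because the narrowing shift in
- * VRSHRN is limited to the range 1..16, so larger descales are split into
- * VRSHR followed by VMOVN. Both paths compute the same rounding descale
- * (a C sketch):
- *
- *   static inline int16_t descale(int32_t x, int shift)
- *   {
- *     return (int16_t) ((x + (1 << (shift - 1))) >> shift);
- *   }
- */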
-
-asm_function jsimd_idct_4x4_neon
-
- DCT_TABLE .req r0
- COEF_BLOCK .req r1
- OUTPUT_BUF .req r2
- OUTPUT_COL .req r3
- TMP1 .req r0
- TMP2 .req r1
- TMP3 .req r2
- TMP4 .req ip
-
- vpush {d8-d15}
-
- /* Load constants (d3 is just used for padding) */
- adr TMP4, jsimd_idct_4x4_neon_consts
- vld1.16 {d0, d1, d2, d3}, [TMP4, :128]
-
- /* Load all COEF_BLOCK into NEON registers with the following allocation:
- * 0 1 2 3 | 4 5 6 7
- * ---------+--------
- * 0 | d4 | d5
- * 1 | d6 | d7
- * 2 | d8 | d9
- * 3 | d10 | d11
- * 4 | - | -
- * 5 | d12 | d13
- * 6 | d14 | d15
- * 7 | d16 | d17
- */
- vld1.16 {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
- vld1.16 {d8, d9, d10, d11}, [COEF_BLOCK, :128]!
- add COEF_BLOCK, COEF_BLOCK, #16
- vld1.16 {d12, d13, d14, d15}, [COEF_BLOCK, :128]!
- vld1.16 {d16, d17}, [COEF_BLOCK, :128]!
- /* dequantize */
- vld1.16 {d18, d19, d20, d21}, [DCT_TABLE, :128]!
- vmul.s16 q2, q2, q9
- vld1.16 {d22, d23, d24, d25}, [DCT_TABLE, :128]!
- vmul.s16 q3, q3, q10
- vmul.s16 q4, q4, q11
- add DCT_TABLE, DCT_TABLE, #16
- vld1.16 {d26, d27, d28, d29}, [DCT_TABLE, :128]!
- vmul.s16 q5, q5, q12
- vmul.s16 q6, q6, q13
- vld1.16 {d30, d31}, [DCT_TABLE, :128]!
- vmul.s16 q7, q7, q14
- vmul.s16 q8, q8, q15
-
- /* Pass 1 */
- idct_helper d4, d6, d8, d10, d12, d14, d16, 12, d4, d6, d8, d10
- transpose_4x4 d4, d6, d8, d10
- idct_helper d5, d7, d9, d11, d13, d15, d17, 12, d5, d7, d9, d11
- transpose_4x4 d5, d7, d9, d11
-
- /* Pass 2 */
- idct_helper d4, d6, d8, d10, d7, d9, d11, 19, d26, d27, d28, d29
- transpose_4x4 d26, d27, d28, d29
-
- /* Range limit */
- vmov.u16 q15, #0x80
- vadd.s16 q13, q13, q15
- vadd.s16 q14, q14, q15
- vqmovun.s16 d26, q13
- vqmovun.s16 d27, q14
-
- /* Store results to the output buffer */
- ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
- add TMP1, TMP1, OUTPUT_COL
- add TMP2, TMP2, OUTPUT_COL
- add TMP3, TMP3, OUTPUT_COL
- add TMP4, TMP4, OUTPUT_COL
-
-#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
-  /* We can use far fewer instructions on little-endian systems if the
-   * OS kernel is not configured to trap unaligned memory accesses.
-   */
- vst1.32 {d26[0]}, [TMP1]!
- vst1.32 {d27[0]}, [TMP3]!
- vst1.32 {d26[1]}, [TMP2]!
- vst1.32 {d27[1]}, [TMP4]!
-#else
- vst1.8 {d26[0]}, [TMP1]!
- vst1.8 {d27[0]}, [TMP3]!
- vst1.8 {d26[1]}, [TMP1]!
- vst1.8 {d27[1]}, [TMP3]!
- vst1.8 {d26[2]}, [TMP1]!
- vst1.8 {d27[2]}, [TMP3]!
- vst1.8 {d26[3]}, [TMP1]!
- vst1.8 {d27[3]}, [TMP3]!
-
- vst1.8 {d26[4]}, [TMP2]!
- vst1.8 {d27[4]}, [TMP4]!
- vst1.8 {d26[5]}, [TMP2]!
- vst1.8 {d27[5]}, [TMP4]!
- vst1.8 {d26[6]}, [TMP2]!
- vst1.8 {d27[6]}, [TMP4]!
- vst1.8 {d26[7]}, [TMP2]!
- vst1.8 {d27[7]}, [TMP4]!
-#endif
-
- vpop {d8-d15}
- bx lr
-
- .unreq DCT_TABLE
- .unreq COEF_BLOCK
- .unreq OUTPUT_BUF
- .unreq OUTPUT_COL
- .unreq TMP1
- .unreq TMP2
- .unreq TMP3
- .unreq TMP4
-
-.purgem idct_helper
-
-
-/*****************************************************************************/
-
-/*
- * jsimd_idct_2x2_neon
- *
- * This function contains inverse-DCT code for producing reduced-size
- * 2x2 pixel output from an 8x8 DCT block. It uses the same calculations
- * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
- * function from jpeg-6b (jidctred.c).
- *
- * NOTE: jpeg-8 has an improved implementation of the 2x2 inverse-DCT, which
- * requires far fewer arithmetic operations and hence should be faster.
- * The primary purpose of this particular NEON optimized function is
- * bit-exact compatibility with jpeg-6b.
- */
-
-.balign 8
-jsimd_idct_2x2_neon_consts:
- .short -FIX_0_720959822 /* d0[0] */
- .short FIX_0_850430095 /* d0[1] */
- .short -FIX_1_272758580 /* d0[2] */
- .short FIX_3_624509785 /* d0[3] */
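-
-/* One butterfly of the 2x2 reduced IDCT as computed by idct_helper below
- * (a C sketch; inputs 2, 4 and 6 do not contribute, and descale() is the
- * rounding right-shift sketched earlier):
- *
- *   int32_t even = in0 << 15;
- *   int32_t odd  = in1 * FIX_3_624509785 - in3 * FIX_1_272758580 +
- *                  in5 * FIX_0_850430095 - in7 * FIX_0_720959822;
- *   out0 = descale(even + odd, shift);
- *   out1 = descale(even - odd, shift);
- */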
-
-.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
- vshll.s16 q14, \x4, #15
- vmull.s16 q13, \x6, d0[3]
- vmlal.s16 q13, \x10, d0[2]
- vmlal.s16 q13, \x12, d0[1]
- vmlal.s16 q13, \x16, d0[0]
-
- vadd.s32 q10, q14, q13
- vsub.s32 q14, q14, q13
-
- .if \shift > 16
- vrshr.s32 q10, q10, #\shift
- vrshr.s32 q14, q14, #\shift
- vmovn.s32 \y26, q10
- vmovn.s32 \y27, q14
- .else
- vrshrn.s32 \y26, q10, #\shift
- vrshrn.s32 \y27, q14, #\shift
- .endif
-.endm
-
-asm_function jsimd_idct_2x2_neon
-
- DCT_TABLE .req r0
- COEF_BLOCK .req r1
- OUTPUT_BUF .req r2
- OUTPUT_COL .req r3
- TMP1 .req r0
- TMP2 .req ip
-
- vpush {d8-d15}
-
- /* Load constants */
- adr TMP2, jsimd_idct_2x2_neon_consts
- vld1.16 {d0}, [TMP2, :64]
-
- /* Load all COEF_BLOCK into NEON registers with the following allocation:
- * 0 1 2 3 | 4 5 6 7
- * ---------+--------
- * 0 | d4 | d5
- * 1 | d6 | d7
- * 2 | - | -
- * 3 | d10 | d11
- * 4 | - | -
- * 5 | d12 | d13
- * 6 | - | -
- * 7 | d16 | d17
- */
- vld1.16 {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
- add COEF_BLOCK, COEF_BLOCK, #16
- vld1.16 {d10, d11}, [COEF_BLOCK, :128]!
- add COEF_BLOCK, COEF_BLOCK, #16
- vld1.16 {d12, d13}, [COEF_BLOCK, :128]!
- add COEF_BLOCK, COEF_BLOCK, #16
- vld1.16 {d16, d17}, [COEF_BLOCK, :128]!
- /* Dequantize */
- vld1.16 {d18, d19, d20, d21}, [DCT_TABLE, :128]!
- vmul.s16 q2, q2, q9
- vmul.s16 q3, q3, q10
- add DCT_TABLE, DCT_TABLE, #16
- vld1.16 {d24, d25}, [DCT_TABLE, :128]!
- vmul.s16 q5, q5, q12
- add DCT_TABLE, DCT_TABLE, #16
- vld1.16 {d26, d27}, [DCT_TABLE, :128]!
- vmul.s16 q6, q6, q13
- add DCT_TABLE, DCT_TABLE, #16
- vld1.16 {d30, d31}, [DCT_TABLE, :128]!
- vmul.s16 q8, q8, q15
-
- /* Pass 1 */
-#if 0
- idct_helper d4, d6, d10, d12, d16, 13, d4, d6
- transpose_4x4 d4, d6, d8, d10
- idct_helper d5, d7, d11, d13, d17, 13, d5, d7
- transpose_4x4 d5, d7, d9, d11
-#else
- vmull.s16 q13, d6, d0[3]
- vmlal.s16 q13, d10, d0[2]
- vmlal.s16 q13, d12, d0[1]
- vmlal.s16 q13, d16, d0[0]
- vmull.s16 q12, d7, d0[3]
- vmlal.s16 q12, d11, d0[2]
- vmlal.s16 q12, d13, d0[1]
- vmlal.s16 q12, d17, d0[0]
- vshll.s16 q14, d4, #15
- vshll.s16 q15, d5, #15
- vadd.s32 q10, q14, q13
- vsub.s32 q14, q14, q13
- vrshrn.s32 d4, q10, #13
- vrshrn.s32 d6, q14, #13
- vadd.s32 q10, q15, q12
- vsub.s32 q14, q15, q12
- vrshrn.s32 d5, q10, #13
- vrshrn.s32 d7, q14, #13
- vtrn.16 q2, q3
- vtrn.32 q3, q5
-#endif
-
- /* Pass 2 */
- idct_helper d4, d6, d10, d7, d11, 20, d26, d27
-
- /* Range limit */
- vmov.u16 q15, #0x80
- vadd.s16 q13, q13, q15
- vqmovun.s16 d26, q13
- vqmovun.s16 d27, q13
-
- /* Store results to the output buffer */
- ldmia OUTPUT_BUF, {TMP1, TMP2}
- add TMP1, TMP1, OUTPUT_COL
- add TMP2, TMP2, OUTPUT_COL
-
- vst1.8 {d26[0]}, [TMP1]!
- vst1.8 {d27[4]}, [TMP1]!
- vst1.8 {d26[1]}, [TMP2]!
- vst1.8 {d27[5]}, [TMP2]!
-
- vpop {d8-d15}
- bx lr
-
- .unreq DCT_TABLE
- .unreq COEF_BLOCK
- .unreq OUTPUT_BUF
- .unreq OUTPUT_COL
- .unreq TMP1
- .unreq TMP2
-
-.purgem idct_helper
-
-
-/*****************************************************************************/
-
-/*
- * jsimd_ycc_extrgb_convert_neon
- * jsimd_ycc_extbgr_convert_neon
- * jsimd_ycc_extrgbx_convert_neon
- * jsimd_ycc_extbgrx_convert_neon
- * jsimd_ycc_extxbgr_convert_neon
- * jsimd_ycc_extxrgb_convert_neon
- *
- * Colorspace conversion YCbCr -> RGB
- */
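-
-/* Per-pixel arithmetic implemented by the stage1/stage2 macros below
- * (a C sketch using the constants from the jsimd_ycc_*_neon_consts
- * tables; descale() is the rounding right-shift sketched earlier, and
- * each result is saturated to [0, 255]):
- *
- *   int cb = u - 128, cr = v - 128;
- *   r = y + descale(22971 * cr, 14);               // y + 1.40200 * Cr
- *   g = y + descale(-11277 * cb - 23401 * cr, 15); // y - 0.34414 * Cb - 0.71414 * Cr
- *   b = y + descale(29033 * cb, 14);               // y + 1.77200 * Cb
- */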
-
-
-.macro do_load size
- .if \size == 8
- vld1.8 {d4}, [U, :64]!
- vld1.8 {d5}, [V, :64]!
- vld1.8 {d0}, [Y, :64]!
- pld [U, #64]
- pld [V, #64]
- pld [Y, #64]
- .elseif \size == 4
- vld1.8 {d4[0]}, [U]!
- vld1.8 {d4[1]}, [U]!
- vld1.8 {d4[2]}, [U]!
- vld1.8 {d4[3]}, [U]!
- vld1.8 {d5[0]}, [V]!
- vld1.8 {d5[1]}, [V]!
- vld1.8 {d5[2]}, [V]!
- vld1.8 {d5[3]}, [V]!
- vld1.8 {d0[0]}, [Y]!
- vld1.8 {d0[1]}, [Y]!
- vld1.8 {d0[2]}, [Y]!
- vld1.8 {d0[3]}, [Y]!
- .elseif \size == 2
- vld1.8 {d4[4]}, [U]!
- vld1.8 {d4[5]}, [U]!
- vld1.8 {d5[4]}, [V]!
- vld1.8 {d5[5]}, [V]!
- vld1.8 {d0[4]}, [Y]!
- vld1.8 {d0[5]}, [Y]!
- .elseif \size == 1
- vld1.8 {d4[6]}, [U]!
- vld1.8 {d5[6]}, [V]!
- vld1.8 {d0[6]}, [Y]!
- .else
- .error unsupported macroblock size
- .endif
-.endm
-
-.macro do_store bpp, size
- .if \bpp == 24
- .if \size == 8
- vst3.8 {d10, d11, d12}, [RGB]!
- .elseif \size == 4
- vst3.8 {d10[0], d11[0], d12[0]}, [RGB]!
- vst3.8 {d10[1], d11[1], d12[1]}, [RGB]!
- vst3.8 {d10[2], d11[2], d12[2]}, [RGB]!
- vst3.8 {d10[3], d11[3], d12[3]}, [RGB]!
- .elseif \size == 2
- vst3.8 {d10[4], d11[4], d12[4]}, [RGB]!
- vst3.8 {d10[5], d11[5], d12[5]}, [RGB]!
- .elseif \size == 1
- vst3.8 {d10[6], d11[6], d12[6]}, [RGB]!
- .else
- .error unsupported macroblock size
- .endif
- .elseif \bpp == 32
- .if \size == 8
- vst4.8 {d10, d11, d12, d13}, [RGB]!
- .elseif \size == 4
- vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
- vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
- vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
- vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
- .elseif \size == 2
- vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
- vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
- .elseif \size == 1
- vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
- .else
- .error unsupported macroblock size
- .endif
- .elseif \bpp == 16
- .if \size == 8
- vst1.16 {q15}, [RGB]!
- .elseif \size == 4
- vst1.16 {d30}, [RGB]!
- .elseif \size == 2
- vst1.16 {d31[0]}, [RGB]!
- vst1.16 {d31[1]}, [RGB]!
- .elseif \size == 1
- vst1.16 {d31[2]}, [RGB]!
- .else
- .error unsupported macroblock size
- .endif
- .else
- .error unsupported bpp
- .endif
-.endm
-
-.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs
-
-/*
- * 2-stage pipelined YCbCr->RGB conversion
- */
-
-.macro do_yuv_to_rgb_stage1
- vaddw.u8 q3, q1, d4 /* q3 = u - 128 */
-    vaddw.u8 q4, q1, d5 /* q4 = v - 128 */
- vmull.s16 q10, d6, d1[1] /* multiply by -11277 */
- vmlal.s16 q10, d8, d1[2] /* multiply by -23401 */
- vmull.s16 q11, d7, d1[1] /* multiply by -11277 */
- vmlal.s16 q11, d9, d1[2] /* multiply by -23401 */
- vmull.s16 q12, d8, d1[0] /* multiply by 22971 */
- vmull.s16 q13, d9, d1[0] /* multiply by 22971 */
- vmull.s16 q14, d6, d1[3] /* multiply by 29033 */
- vmull.s16 q15, d7, d1[3] /* multiply by 29033 */
-.endm
-
-.macro do_yuv_to_rgb_stage2
- vrshrn.s32 d20, q10, #15
- vrshrn.s32 d21, q11, #15
- vrshrn.s32 d24, q12, #14
- vrshrn.s32 d25, q13, #14
- vrshrn.s32 d28, q14, #14
- vrshrn.s32 d29, q15, #14
- vaddw.u8 q11, q10, d0
- vaddw.u8 q12, q12, d0
- vaddw.u8 q14, q14, d0
- .if \bpp != 16
- vqmovun.s16 d1\g_offs, q11
- vqmovun.s16 d1\r_offs, q12
- vqmovun.s16 d1\b_offs, q14
- .else /* rgb565 */
- vqshlu.s16 q13, q11, #8
- vqshlu.s16 q15, q12, #8
- vqshlu.s16 q14, q14, #8
- vsri.u16 q15, q13, #5
- vsri.u16 q15, q14, #11
- .endif
-.endm
-
-.macro do_yuv_to_rgb_stage2_store_load_stage1
- /* "do_yuv_to_rgb_stage2" and "store" */
- vrshrn.s32 d20, q10, #15
- /* "load" and "do_yuv_to_rgb_stage1" */
- pld [U, #64]
- vrshrn.s32 d21, q11, #15
- pld [V, #64]
- vrshrn.s32 d24, q12, #14
- vrshrn.s32 d25, q13, #14
- vld1.8 {d4}, [U, :64]!
- vrshrn.s32 d28, q14, #14
- vld1.8 {d5}, [V, :64]!
- vrshrn.s32 d29, q15, #14
- vaddw.u8 q3, q1, d4 /* q3 = u - 128 */
-    vaddw.u8 q4, q1, d5 /* q4 = v - 128 */
- vaddw.u8 q11, q10, d0
- vmull.s16 q10, d6, d1[1] /* multiply by -11277 */
- vmlal.s16 q10, d8, d1[2] /* multiply by -23401 */
- vaddw.u8 q12, q12, d0
- vaddw.u8 q14, q14, d0
- .if \bpp != 16 /**************** rgb24/rgb32 ******************************/
- vqmovun.s16 d1\g_offs, q11
- pld [Y, #64]
- vqmovun.s16 d1\r_offs, q12
- vld1.8 {d0}, [Y, :64]!
- vqmovun.s16 d1\b_offs, q14
- vmull.s16 q11, d7, d1[1] /* multiply by -11277 */
- vmlal.s16 q11, d9, d1[2] /* multiply by -23401 */
- do_store \bpp, 8
- vmull.s16 q12, d8, d1[0] /* multiply by 22971 */
- vmull.s16 q13, d9, d1[0] /* multiply by 22971 */
- vmull.s16 q14, d6, d1[3] /* multiply by 29033 */
- vmull.s16 q15, d7, d1[3] /* multiply by 29033 */
- .else /**************************** rgb565 ********************************/
- vqshlu.s16 q13, q11, #8
- pld [Y, #64]
- vqshlu.s16 q15, q12, #8
- vqshlu.s16 q14, q14, #8
- vld1.8 {d0}, [Y, :64]!
- vmull.s16 q11, d7, d1[1]
- vmlal.s16 q11, d9, d1[2]
- vsri.u16 q15, q13, #5
- vmull.s16 q12, d8, d1[0]
- vsri.u16 q15, q14, #11
- vmull.s16 q13, d9, d1[0]
- vmull.s16 q14, d6, d1[3]
- do_store \bpp, 8
- vmull.s16 q15, d7, d1[3]
- .endif
-.endm
-
-.macro do_yuv_to_rgb
- do_yuv_to_rgb_stage1
- do_yuv_to_rgb_stage2
-.endm
-
-/* Apple's gas crashes on adrl, so work around that by using adr instead.
- * This requires a copy of these constants for each function.
- */
-
-.balign 16
-jsimd_ycc_\colorid\()_neon_consts:
- .short 0, 0, 0, 0
- .short 22971, -11277, -23401, 29033
- .short -128, -128, -128, -128
- .short -128, -128, -128, -128
-
-asm_function jsimd_ycc_\colorid\()_convert_neon
- OUTPUT_WIDTH .req r0
- INPUT_BUF .req r1
- INPUT_ROW .req r2
- OUTPUT_BUF .req r3
- NUM_ROWS .req r4
-
- INPUT_BUF0 .req r5
- INPUT_BUF1 .req r6
- INPUT_BUF2 .req INPUT_BUF
-
- RGB .req r7
- Y .req r8
- U .req r9
- V .req r10
- N .req ip
-
- /* Load constants to d1, d2, d3 (d0 is just used for padding) */
- adr ip, jsimd_ycc_\colorid\()_neon_consts
- vld1.16 {d0, d1, d2, d3}, [ip, :128]
-
- /* Save ARM registers and handle input arguments */
- push {r4, r5, r6, r7, r8, r9, r10, lr}
- ldr NUM_ROWS, [sp, #(4 * 8)]
- ldr INPUT_BUF0, [INPUT_BUF]
- ldr INPUT_BUF1, [INPUT_BUF, #4]
- ldr INPUT_BUF2, [INPUT_BUF, #8]
- .unreq INPUT_BUF
-
- /* Save NEON registers */
- vpush {d8-d15}
-
- /* Initially set d10, d11, d12, d13 to 0xFF */
- vmov.u8 q5, #255
- vmov.u8 q6, #255
-
- /* Outer loop over scanlines */
- cmp NUM_ROWS, #1
- blt 9f
-0:
- ldr Y, [INPUT_BUF0, INPUT_ROW, lsl #2]
- ldr U, [INPUT_BUF1, INPUT_ROW, lsl #2]
- mov N, OUTPUT_WIDTH
- ldr V, [INPUT_BUF2, INPUT_ROW, lsl #2]
- add INPUT_ROW, INPUT_ROW, #1
- ldr RGB, [OUTPUT_BUF], #4
-
- /* Inner loop over pixels */
- subs N, N, #8
- blt 3f
- do_load 8
- do_yuv_to_rgb_stage1
- subs N, N, #8
- blt 2f
-1:
- do_yuv_to_rgb_stage2_store_load_stage1
- subs N, N, #8
- bge 1b
-2:
- do_yuv_to_rgb_stage2
- do_store \bpp, 8
- tst N, #7
- beq 8f
-3:
- tst N, #4
- beq 3f
- do_load 4
-3:
- tst N, #2
- beq 4f
- do_load 2
-4:
- tst N, #1
- beq 5f
- do_load 1
-5:
- do_yuv_to_rgb
- tst N, #4
- beq 6f
- do_store \bpp, 4
-6:
- tst N, #2
- beq 7f
- do_store \bpp, 2
-7:
- tst N, #1
- beq 8f
- do_store \bpp, 1
-8:
- subs NUM_ROWS, NUM_ROWS, #1
- bgt 0b
-9:
- /* Restore all registers and return */
- vpop {d8-d15}
- pop {r4, r5, r6, r7, r8, r9, r10, pc}
-
- .unreq OUTPUT_WIDTH
- .unreq INPUT_ROW
- .unreq OUTPUT_BUF
- .unreq NUM_ROWS
- .unreq INPUT_BUF0
- .unreq INPUT_BUF1
- .unreq INPUT_BUF2
- .unreq RGB
- .unreq Y
- .unreq U
- .unreq V
- .unreq N
-
-.purgem do_yuv_to_rgb
-.purgem do_yuv_to_rgb_stage1
-.purgem do_yuv_to_rgb_stage2
-.purgem do_yuv_to_rgb_stage2_store_load_stage1
-
-.endm
-
-/*--------------------------------- id ----- bpp R G B */
-generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, 1, 2
-generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, 1, 0
-generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, 1, 2
-generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, 1, 0
-generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, 2, 1
-generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, 2, 3
-generate_jsimd_ycc_rgb_convert_neon rgb565, 16, 0, 0, 0
-
-.purgem do_load
-.purgem do_store
-
-
-/*****************************************************************************/
-
-/*
- * jsimd_extrgb_ycc_convert_neon
- * jsimd_extbgr_ycc_convert_neon
- * jsimd_extrgbx_ycc_convert_neon
- * jsimd_extbgrx_ycc_convert_neon
- * jsimd_extxbgr_ycc_convert_neon
- * jsimd_extxrgb_ycc_convert_neon
- *
- * Colorspace conversion RGB -> YCbCr
- */
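-
-/* Per-pixel arithmetic implemented by the stage macros below (a C sketch
- * using the constants from the jsimd_*_ycc_neon_consts tables; the chroma
- * bias ((128 << 16) + 32767) folds both the +128 offset and the rounding
- * into the initial accumulator value):
- *
- *   y  = (19595 * r + 38470 * g + 7471 * b + 32768) >> 16;
- *   cb = ((128 << 16) + 32767 - 11059 * r - 21709 * g + 32768 * b) >> 16;
- *   cr = ((128 << 16) + 32767 + 32768 * r - 27439 * g - 5329 * b) >> 16;
- */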
-
-.macro do_store size
- .if \size == 8
- vst1.8 {d20}, [Y]!
- vst1.8 {d21}, [U]!
- vst1.8 {d22}, [V]!
- .elseif \size == 4
- vst1.8 {d20[0]}, [Y]!
- vst1.8 {d20[1]}, [Y]!
- vst1.8 {d20[2]}, [Y]!
- vst1.8 {d20[3]}, [Y]!
- vst1.8 {d21[0]}, [U]!
- vst1.8 {d21[1]}, [U]!
- vst1.8 {d21[2]}, [U]!
- vst1.8 {d21[3]}, [U]!
- vst1.8 {d22[0]}, [V]!
- vst1.8 {d22[1]}, [V]!
- vst1.8 {d22[2]}, [V]!
- vst1.8 {d22[3]}, [V]!
- .elseif \size == 2
- vst1.8 {d20[4]}, [Y]!
- vst1.8 {d20[5]}, [Y]!
- vst1.8 {d21[4]}, [U]!
- vst1.8 {d21[5]}, [U]!
- vst1.8 {d22[4]}, [V]!
- vst1.8 {d22[5]}, [V]!
- .elseif \size == 1
- vst1.8 {d20[6]}, [Y]!
- vst1.8 {d21[6]}, [U]!
- vst1.8 {d22[6]}, [V]!
- .else
- .error unsupported macroblock size
- .endif
-.endm
-
-.macro do_load bpp, size
- .if \bpp == 24
- .if \size == 8
- vld3.8 {d10, d11, d12}, [RGB]!
- pld [RGB, #128]
- .elseif \size == 4
- vld3.8 {d10[0], d11[0], d12[0]}, [RGB]!
- vld3.8 {d10[1], d11[1], d12[1]}, [RGB]!
- vld3.8 {d10[2], d11[2], d12[2]}, [RGB]!
- vld3.8 {d10[3], d11[3], d12[3]}, [RGB]!
- .elseif \size == 2
- vld3.8 {d10[4], d11[4], d12[4]}, [RGB]!
- vld3.8 {d10[5], d11[5], d12[5]}, [RGB]!
- .elseif \size == 1
- vld3.8 {d10[6], d11[6], d12[6]}, [RGB]!
- .else
- .error unsupported macroblock size
- .endif
- .elseif \bpp == 32
- .if \size == 8
- vld4.8 {d10, d11, d12, d13}, [RGB]!
- pld [RGB, #128]
- .elseif \size == 4
- vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
- vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
- vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
- vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
- .elseif \size == 2
- vld4.8 {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
- vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
- .elseif \size == 1
- vld4.8 {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
- .else
- .error unsupported macroblock size
- .endif
- .else
- .error unsupported bpp
- .endif
-.endm
-
-.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, b_offs
-
-/*
- * 2-stage pipelined RGB->YCbCr conversion
- */
-
-.macro do_rgb_to_yuv_stage1
- vmovl.u8 q2, d1\r_offs /* r = { d4, d5 } */
- vmovl.u8 q3, d1\g_offs /* g = { d6, d7 } */
- vmovl.u8 q4, d1\b_offs /* b = { d8, d9 } */
- vmull.u16 q7, d4, d0[0]
- vmlal.u16 q7, d6, d0[1]
- vmlal.u16 q7, d8, d0[2]
- vmull.u16 q8, d5, d0[0]
- vmlal.u16 q8, d7, d0[1]
- vmlal.u16 q8, d9, d0[2]
- vrev64.32 q9, q1
- vrev64.32 q13, q1
- vmlsl.u16 q9, d4, d0[3]
- vmlsl.u16 q9, d6, d1[0]
- vmlal.u16 q9, d8, d1[1]
- vmlsl.u16 q13, d5, d0[3]
- vmlsl.u16 q13, d7, d1[0]
- vmlal.u16 q13, d9, d1[1]
- vrev64.32 q14, q1
- vrev64.32 q15, q1
- vmlal.u16 q14, d4, d1[1]
- vmlsl.u16 q14, d6, d1[2]
- vmlsl.u16 q14, d8, d1[3]
- vmlal.u16 q15, d5, d1[1]
- vmlsl.u16 q15, d7, d1[2]
- vmlsl.u16 q15, d9, d1[3]
-.endm
-
-.macro do_rgb_to_yuv_stage2
- vrshrn.u32 d20, q7, #16
- vrshrn.u32 d21, q8, #16
- vshrn.u32 d22, q9, #16
- vshrn.u32 d23, q13, #16
- vshrn.u32 d24, q14, #16
- vshrn.u32 d25, q15, #16
- vmovn.u16 d20, q10 /* d20 = y */
- vmovn.u16 d21, q11 /* d21 = u */
- vmovn.u16 d22, q12 /* d22 = v */
-.endm
-
-.macro do_rgb_to_yuv
- do_rgb_to_yuv_stage1
- do_rgb_to_yuv_stage2
-.endm
-
-.macro do_rgb_to_yuv_stage2_store_load_stage1
- vrshrn.u32 d20, q7, #16
- vrshrn.u32 d21, q8, #16
- vshrn.u32 d22, q9, #16
- vrev64.32 q9, q1
- vshrn.u32 d23, q13, #16
- vrev64.32 q13, q1
- vshrn.u32 d24, q14, #16
- vshrn.u32 d25, q15, #16
- do_load \bpp, 8
- vmovn.u16 d20, q10 /* d20 = y */
- vmovl.u8 q2, d1\r_offs /* r = { d4, d5 } */
- vmovn.u16 d21, q11 /* d21 = u */
- vmovl.u8 q3, d1\g_offs /* g = { d6, d7 } */
- vmovn.u16 d22, q12 /* d22 = v */
- vmovl.u8 q4, d1\b_offs /* b = { d8, d9 } */
- vmull.u16 q7, d4, d0[0]
- vmlal.u16 q7, d6, d0[1]
- vmlal.u16 q7, d8, d0[2]
- vst1.8 {d20}, [Y]!
- vmull.u16 q8, d5, d0[0]
- vmlal.u16 q8, d7, d0[1]
- vmlal.u16 q8, d9, d0[2]
- vmlsl.u16 q9, d4, d0[3]
- vmlsl.u16 q9, d6, d1[0]
- vmlal.u16 q9, d8, d1[1]
- vst1.8 {d21}, [U]!
- vmlsl.u16 q13, d5, d0[3]
- vmlsl.u16 q13, d7, d1[0]
- vmlal.u16 q13, d9, d1[1]
- vrev64.32 q14, q1
- vrev64.32 q15, q1
- vmlal.u16 q14, d4, d1[1]
- vmlsl.u16 q14, d6, d1[2]
- vmlsl.u16 q14, d8, d1[3]
- vst1.8 {d22}, [V]!
- vmlal.u16 q15, d5, d1[1]
- vmlsl.u16 q15, d7, d1[2]
- vmlsl.u16 q15, d9, d1[3]
-.endm
-
-.balign 16
-jsimd_\colorid\()_ycc_neon_consts:
- .short 19595, 38470, 7471, 11059
- .short 21709, 32768, 27439, 5329
- .short 32767, 128, 32767, 128
- .short 32767, 128, 32767, 128
-
-asm_function jsimd_\colorid\()_ycc_convert_neon
- OUTPUT_WIDTH .req r0
- INPUT_BUF .req r1
- OUTPUT_BUF .req r2
- OUTPUT_ROW .req r3
- NUM_ROWS .req r4
-
- OUTPUT_BUF0 .req r5
- OUTPUT_BUF1 .req r6
- OUTPUT_BUF2 .req OUTPUT_BUF
-
- RGB .req r7
- Y .req r8
- U .req r9
- V .req r10
- N .req ip
-
- /* Load constants to d0, d1, d2, d3 */
- adr ip, jsimd_\colorid\()_ycc_neon_consts
- vld1.16 {d0, d1, d2, d3}, [ip, :128]
-
- /* Save ARM registers and handle input arguments */
- push {r4, r5, r6, r7, r8, r9, r10, lr}
- ldr NUM_ROWS, [sp, #(4 * 8)]
- ldr OUTPUT_BUF0, [OUTPUT_BUF]
- ldr OUTPUT_BUF1, [OUTPUT_BUF, #4]
- ldr OUTPUT_BUF2, [OUTPUT_BUF, #8]
- .unreq OUTPUT_BUF
-
- /* Save NEON registers */
- vpush {d8-d15}
-
- /* Outer loop over scanlines */
- cmp NUM_ROWS, #1
- blt 9f
-0:
- ldr Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #2]
- ldr U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #2]
- mov N, OUTPUT_WIDTH
- ldr V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #2]
- add OUTPUT_ROW, OUTPUT_ROW, #1
- ldr RGB, [INPUT_BUF], #4
-
- /* Inner loop over pixels */
- subs N, N, #8
- blt 3f
- do_load \bpp, 8
- do_rgb_to_yuv_stage1
- subs N, N, #8
- blt 2f
-1:
- do_rgb_to_yuv_stage2_store_load_stage1
- subs N, N, #8
- bge 1b
-2:
- do_rgb_to_yuv_stage2
- do_store 8
- tst N, #7
- beq 8f
-3:
- tst N, #4
- beq 3f
- do_load \bpp, 4
-3:
- tst N, #2
- beq 4f
- do_load \bpp, 2
-4:
- tst N, #1
- beq 5f
- do_load \bpp, 1
-5:
- do_rgb_to_yuv
- tst N, #4
- beq 6f
- do_store 4
-6:
- tst N, #2
- beq 7f
- do_store 2
-7:
- tst N, #1
- beq 8f
- do_store 1
-8:
- subs NUM_ROWS, NUM_ROWS, #1
- bgt 0b
-9:
- /* Restore all registers and return */
- vpop {d8-d15}
- pop {r4, r5, r6, r7, r8, r9, r10, pc}
-
- .unreq OUTPUT_WIDTH
- .unreq OUTPUT_ROW
- .unreq INPUT_BUF
- .unreq NUM_ROWS
- .unreq OUTPUT_BUF0
- .unreq OUTPUT_BUF1
- .unreq OUTPUT_BUF2
- .unreq RGB
- .unreq Y
- .unreq U
- .unreq V
- .unreq N
-
-.purgem do_rgb_to_yuv
-.purgem do_rgb_to_yuv_stage1
-.purgem do_rgb_to_yuv_stage2
-.purgem do_rgb_to_yuv_stage2_store_load_stage1
-
-.endm
-
-/*--------------------------------- id ----- bpp R G B */
-generate_jsimd_rgb_ycc_convert_neon extrgb, 24, 0, 1, 2
-generate_jsimd_rgb_ycc_convert_neon extbgr, 24, 2, 1, 0
-generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2
-generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0
-generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1
-generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3
-
-.purgem do_load
-.purgem do_store
-
-
-/*****************************************************************************/
-
-/*
- * Load data into workspace, applying unsigned->signed conversion
- *
- * TODO: this can be combined with 'jsimd_fdct_ifast_neon' to get rid of
- * the VST1.16 instructions.
- */
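-
-/* Scalar equivalent (a C sketch): widen each 8-bit sample to 16 bits and
- * recenter it around zero (128 == CENTERJSAMPLE).
- *
- *   for (row = 0; row < 8; row++)
- *     for (col = 0; col < 8; col++)
- *       workspace[row * 8 + col] =
- *         (int16_t) sample_data[row][start_col + col] - 128;
- */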
-
-asm_function jsimd_convsamp_neon
- SAMPLE_DATA .req r0
- START_COL .req r1
- WORKSPACE .req r2
- TMP1 .req r3
- TMP2 .req r4
- TMP3 .req r5
- TMP4 .req ip
-
- push {r4, r5}
- vmov.u8 d0, #128
-
- ldmia SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
- add TMP1, TMP1, START_COL
- add TMP2, TMP2, START_COL
- add TMP3, TMP3, START_COL
- add TMP4, TMP4, START_COL
- vld1.8 {d16}, [TMP1]
- vsubl.u8 q8, d16, d0
- vld1.8 {d18}, [TMP2]
- vsubl.u8 q9, d18, d0
- vld1.8 {d20}, [TMP3]
- vsubl.u8 q10, d20, d0
- vld1.8 {d22}, [TMP4]
- ldmia SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
- vsubl.u8 q11, d22, d0
- vst1.16 {d16, d17, d18, d19}, [WORKSPACE, :128]!
- add TMP1, TMP1, START_COL
- add TMP2, TMP2, START_COL
- vst1.16 {d20, d21, d22, d23}, [WORKSPACE, :128]!
- add TMP3, TMP3, START_COL
- add TMP4, TMP4, START_COL
- vld1.8 {d24}, [TMP1]
- vsubl.u8 q12, d24, d0
- vld1.8 {d26}, [TMP2]
- vsubl.u8 q13, d26, d0
- vld1.8 {d28}, [TMP3]
- vsubl.u8 q14, d28, d0
- vld1.8 {d30}, [TMP4]
- vsubl.u8 q15, d30, d0
- vst1.16 {d24, d25, d26, d27}, [WORKSPACE, :128]!
- vst1.16 {d28, d29, d30, d31}, [WORKSPACE, :128]!
- pop {r4, r5}
- bx lr
-
- .unreq SAMPLE_DATA
- .unreq START_COL
- .unreq WORKSPACE
- .unreq TMP1
- .unreq TMP2
- .unreq TMP3
- .unreq TMP4
-
-
-/*****************************************************************************/
-
-/*
- * jsimd_fdct_ifast_neon
- *
- * This function contains a fast, but less accurate, integer implementation
- * of the forward DCT (Discrete Cosine Transform). It uses the same
- * calculations and produces exactly the same output as IJG's original
- * 'jpeg_fdct_ifast' function from jfdctfst.c.
- *
- * TODO: this can be combined with 'jsimd_convsamp_neon' to get rid of
- * a bunch of VLD1.16 instructions.
- */
-
-#define XFIX_0_382683433 d0[0]
-#define XFIX_0_541196100 d0[1]
-#define XFIX_0_707106781 d0[2]
-#define XFIX_1_306562965 d0[3]
-
-.balign 16
-jsimd_fdct_ifast_neon_consts:
- .short (98 * 128) /* XFIX_0_382683433 */
- .short (139 * 128) /* XFIX_0_541196100 */
- .short (181 * 128) /* XFIX_0_707106781 */
- .short (334 * 128 - 256 * 128) /* XFIX_1_306562965 */
-
-asm_function jsimd_fdct_ifast_neon
-
- DATA .req r0
- TMP .req ip
-
- vpush {d8-d15}
-
- /* Load constants */
- adr TMP, jsimd_fdct_ifast_neon_consts
- vld1.16 {d0}, [TMP, :64]
-
- /* Load all DATA into NEON registers with the following allocation:
- * 0 1 2 3 | 4 5 6 7
- * ---------+--------
- * 0 | d16 | d17 | q8
- * 1 | d18 | d19 | q9
- * 2 | d20 | d21 | q10
- * 3 | d22 | d23 | q11
- * 4 | d24 | d25 | q12
- * 5 | d26 | d27 | q13
- * 6 | d28 | d29 | q14
- * 7 | d30 | d31 | q15
- */
-
- vld1.16 {d16, d17, d18, d19}, [DATA, :128]!
- vld1.16 {d20, d21, d22, d23}, [DATA, :128]!
- vld1.16 {d24, d25, d26, d27}, [DATA, :128]!
- vld1.16 {d28, d29, d30, d31}, [DATA, :128]
- sub DATA, DATA, #(128 - 32)
-
- mov TMP, #2
-1:
- /* Transpose */
- vtrn.16 q12, q13
- vtrn.16 q10, q11
- vtrn.16 q8, q9
- vtrn.16 q14, q15
- vtrn.32 q9, q11
- vtrn.32 q13, q15
- vtrn.32 q8, q10
- vtrn.32 q12, q14
- vswp d30, d23
- vswp d24, d17
- vswp d26, d19
- /* 1-D FDCT */
- vadd.s16 q2, q11, q12
- vswp d28, d21
- vsub.s16 q12, q11, q12
- vsub.s16 q6, q10, q13
- vadd.s16 q10, q10, q13
- vsub.s16 q7, q9, q14
- vadd.s16 q9, q9, q14
- vsub.s16 q1, q8, q15
- vadd.s16 q8, q8, q15
- vsub.s16 q4, q9, q10
- vsub.s16 q5, q8, q2
- vadd.s16 q3, q9, q10
- vadd.s16 q4, q4, q5
- vadd.s16 q2, q8, q2
- vqdmulh.s16 q4, q4, XFIX_0_707106781
- vadd.s16 q11, q12, q6
- vadd.s16 q8, q2, q3
- vsub.s16 q12, q2, q3
- vadd.s16 q3, q6, q7
- vadd.s16 q7, q7, q1
- vqdmulh.s16 q3, q3, XFIX_0_707106781
- vsub.s16 q6, q11, q7
- vadd.s16 q10, q5, q4
- vqdmulh.s16 q6, q6, XFIX_0_382683433
- vsub.s16 q14, q5, q4
- vqdmulh.s16 q11, q11, XFIX_0_541196100
- vqdmulh.s16 q5, q7, XFIX_1_306562965
- vadd.s16 q4, q1, q3
- vsub.s16 q3, q1, q3
- vadd.s16 q7, q7, q6
- vadd.s16 q11, q11, q6
- vadd.s16 q7, q7, q5
- vadd.s16 q13, q3, q11
- vsub.s16 q11, q3, q11
- vadd.s16 q9, q4, q7
- vsub.s16 q15, q4, q7
- subs TMP, TMP, #1
- bne 1b
-
- /* store results */
- vst1.16 {d16, d17, d18, d19}, [DATA, :128]!
- vst1.16 {d20, d21, d22, d23}, [DATA, :128]!
- vst1.16 {d24, d25, d26, d27}, [DATA, :128]!
- vst1.16 {d28, d29, d30, d31}, [DATA, :128]
-
- vpop {d8-d15}
- bx lr
-
- .unreq DATA
- .unreq TMP
-
-
-/*****************************************************************************/
-
-/*
- * GLOBAL(void)
- * jsimd_quantize_neon (JCOEFPTR coef_block, DCTELEM *divisors,
- * DCTELEM *workspace);
- *
- * Note: the code uses 2-stage pipelining in order to improve instruction
- *       scheduling and eliminate stalls (this provides ~15% better
- *       performance for this function on both ARM Cortex-A8 and
- *       ARM Cortex-A9 when compared to the non-pipelined variant).
- *       The instructions which belong to the second stage use different
- *       indentation for better readability.
- */
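-
-/* Scalar equivalent for one coefficient (a C sketch; the divisors table
- * holds precomputed reciprocals, corrections and shift counts at byte
- * offsets 0, 64 * 2 and 64 * 6 respectively):
- *
- *   sign = workspace[i] >> 15;                        // 0 or -1
- *   temp = (uint16_t) (abs(workspace[i]) + correction[i]);
- *   temp = (uint32_t) (temp * reciprocal[i]) >> 16;
- *   temp >>= shift[i];
- *   coef_block[i] = (int16_t) ((temp ^ sign) - sign); // restore sign
- */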
-asm_function jsimd_quantize_neon
-
- COEF_BLOCK .req r0
- DIVISORS .req r1
- WORKSPACE .req r2
-
- RECIPROCAL .req DIVISORS
- CORRECTION .req r3
- SHIFT .req ip
- LOOP_COUNT .req r4
-
- vld1.16 {d0, d1, d2, d3}, [WORKSPACE, :128]!
- vabs.s16 q12, q0
- add CORRECTION, DIVISORS, #(64 * 2)
- add SHIFT, DIVISORS, #(64 * 6)
- vld1.16 {d20, d21, d22, d23}, [CORRECTION, :128]!
- vabs.s16 q13, q1
- vld1.16 {d16, d17, d18, d19}, [RECIPROCAL, :128]!
- vadd.u16 q12, q12, q10 /* add correction */
- vadd.u16 q13, q13, q11
- vmull.u16 q10, d24, d16 /* multiply by reciprocal */
- vmull.u16 q11, d25, d17
- vmull.u16 q8, d26, d18
- vmull.u16 q9, d27, d19
- vld1.16 {d24, d25, d26, d27}, [SHIFT, :128]!
- vshrn.u32 d20, q10, #16
- vshrn.u32 d21, q11, #16
- vshrn.u32 d22, q8, #16
- vshrn.u32 d23, q9, #16
- vneg.s16 q12, q12
- vneg.s16 q13, q13
- vshr.s16 q2, q0, #15 /* extract sign */
- vshr.s16 q3, q1, #15
- vshl.u16 q14, q10, q12 /* shift */
- vshl.u16 q15, q11, q13
-
- push {r4, r5}
- mov LOOP_COUNT, #3
-1:
- vld1.16 {d0, d1, d2, d3}, [WORKSPACE, :128]!
- veor.u16 q14, q14, q2 /* restore sign */
- vabs.s16 q12, q0
- vld1.16 {d20, d21, d22, d23}, [CORRECTION, :128]!
- vabs.s16 q13, q1
- veor.u16 q15, q15, q3
- vld1.16 {d16, d17, d18, d19}, [RECIPROCAL, :128]!
- vadd.u16 q12, q12, q10 /* add correction */
- vadd.u16 q13, q13, q11
- vmull.u16 q10, d24, d16 /* multiply by reciprocal */
- vmull.u16 q11, d25, d17
- vmull.u16 q8, d26, d18
- vmull.u16 q9, d27, d19
- vsub.u16 q14, q14, q2
- vld1.16 {d24, d25, d26, d27}, [SHIFT, :128]!
- vsub.u16 q15, q15, q3
- vshrn.u32 d20, q10, #16
- vshrn.u32 d21, q11, #16
- vst1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]!
- vshrn.u32 d22, q8, #16
- vshrn.u32 d23, q9, #16
- vneg.s16 q12, q12
- vneg.s16 q13, q13
- vshr.s16 q2, q0, #15 /* extract sign */
- vshr.s16 q3, q1, #15
- vshl.u16 q14, q10, q12 /* shift */
- vshl.u16 q15, q11, q13
- subs LOOP_COUNT, LOOP_COUNT, #1
- bne 1b
- pop {r4, r5}
-
- veor.u16 q14, q14, q2 /* restore sign */
- veor.u16 q15, q15, q3
- vsub.u16 q14, q14, q2
- vsub.u16 q15, q15, q3
- vst1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]!
-
- bx lr /* return */
-
- .unreq COEF_BLOCK
- .unreq DIVISORS
- .unreq WORKSPACE
- .unreq RECIPROCAL
- .unreq CORRECTION
- .unreq SHIFT
- .unreq LOOP_COUNT
-
-
-/*****************************************************************************/
-
-/*
- * GLOBAL(void)
- * jsimd_h2v1_fancy_upsample_neon (int max_v_samp_factor,
- * JDIMENSION downsampled_width,
- * JSAMPARRAY input_data,
- * JSAMPARRAY *output_data_ptr);
- *
- * Note: the use of unaligned writes is the main remaining bottleneck in
- *       this code; addressing it could potentially yield a performance
- *       improvement of up to tens of percent on Cortex-A8/Cortex-A9.
- */
-
-/*
- * Upsample 16 source pixels to 32 destination pixels. The new 16 source
- * pixels are loaded into q0. The previous 16 source pixels are in q1. The
- * shifted-by-one source pixels are constructed in q2 from q0 and q1.
- * Register d28 is used for multiplication by 3, and register q15 for
- * adding the +1 bias.
- */
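-
-/* Per source pixel this computes the usual "fancy" (triangle filter)
- * h2v1 upsampling, matching the scalar code in jdsample.c (a C sketch;
- * prev/next are the horizontal neighbours of cur):
- *
- *   out[2 * i]     = (3 * cur + prev + 1) >> 2;
- *   out[2 * i + 1] = (3 * cur + next + 2) >> 2;
- */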
-.macro upsample16 OUTPTR, INPTR
- vld1.8 {q0}, [\INPTR]!
- vmovl.u8 q8, d0
- vext.8 q2, q1, q0, #15
- vmovl.u8 q9, d1
- vaddw.u8 q10, q15, d4
- vaddw.u8 q11, q15, d5
- vmlal.u8 q8, d4, d28
- vmlal.u8 q9, d5, d28
- vmlal.u8 q10, d0, d28
- vmlal.u8 q11, d1, d28
- vmov q1, q0 /* backup source pixels to q1 */
- vrshrn.u16 d6, q8, #2
- vrshrn.u16 d7, q9, #2
- vshrn.u16 d8, q10, #2
- vshrn.u16 d9, q11, #2
- vst2.8 {d6, d7, d8, d9}, [\OUTPTR]!
-.endm
-
-/*
- * Upsample 32 source pixels to 64 destination pixels. Compared to the
- * 'upsample16' macro, the roles of the q0 and q1 registers are reversed for
- * even and odd groups of 16 pixels, which is why the "vmov q1, q0"
- * instructions are not needed. This unrolling also allows loads and stores
- * to be reordered to compensate for multiplication latency and reduce stalls.
- */
-.macro upsample32 OUTPTR, INPTR
- /* even 16 pixels group */
- vld1.8 {q0}, [\INPTR]!
- vmovl.u8 q8, d0
- vext.8 q2, q1, q0, #15
- vmovl.u8 q9, d1
- vaddw.u8 q10, q15, d4
- vaddw.u8 q11, q15, d5
- vmlal.u8 q8, d4, d28
- vmlal.u8 q9, d5, d28
- vmlal.u8 q10, d0, d28
- vmlal.u8 q11, d1, d28
- /* odd 16 pixels group */
- vld1.8 {q1}, [\INPTR]!
- vrshrn.u16 d6, q8, #2
- vrshrn.u16 d7, q9, #2
- vshrn.u16 d8, q10, #2
- vshrn.u16 d9, q11, #2
- vmovl.u8 q8, d2
- vext.8 q2, q0, q1, #15
- vmovl.u8 q9, d3
- vaddw.u8 q10, q15, d4
- vaddw.u8 q11, q15, d5
- vmlal.u8 q8, d4, d28
- vmlal.u8 q9, d5, d28
- vmlal.u8 q10, d2, d28
- vmlal.u8 q11, d3, d28
- vst2.8 {d6, d7, d8, d9}, [\OUTPTR]!
- vrshrn.u16 d6, q8, #2
- vrshrn.u16 d7, q9, #2
- vshrn.u16 d8, q10, #2
- vshrn.u16 d9, q11, #2
- vst2.8 {d6, d7, d8, d9}, [\OUTPTR]!
-.endm
-
-/*
- * Upsample a row of WIDTH pixels from INPTR to OUTPTR.
- */
-.macro upsample_row OUTPTR, INPTR, WIDTH, TMP1
- /* special case for the first and last pixels */
- sub \WIDTH, \WIDTH, #1
- add \OUTPTR, \OUTPTR, #1
- ldrb \TMP1, [\INPTR, \WIDTH]
- strb \TMP1, [\OUTPTR, \WIDTH, asl #1]
- ldrb \TMP1, [\INPTR], #1
- strb \TMP1, [\OUTPTR, #-1]
- vmov.8 d3[7], \TMP1
-
- subs \WIDTH, \WIDTH, #32
- blt 5f
-0: /* process 32 pixels per iteration */
- upsample32 \OUTPTR, \INPTR
- subs \WIDTH, \WIDTH, #32
- bge 0b
-5:
- adds \WIDTH, \WIDTH, #16
- blt 1f
-0: /* process 16 pixels if needed */
- upsample16 \OUTPTR, \INPTR
- subs \WIDTH, \WIDTH, #16
-1:
- adds \WIDTH, \WIDTH, #16
- beq 9f
-
- /* load the remaining 1-15 pixels */
- add \INPTR, \INPTR, \WIDTH
- tst \WIDTH, #1
- beq 2f
- sub \INPTR, \INPTR, #1
- vld1.8 {d0[0]}, [\INPTR]
-2:
- tst \WIDTH, #2
- beq 2f
- vext.8 d0, d0, d0, #6
- sub \INPTR, \INPTR, #1
- vld1.8 {d0[1]}, [\INPTR]
- sub \INPTR, \INPTR, #1
- vld1.8 {d0[0]}, [\INPTR]
-2:
- tst \WIDTH, #4
- beq 2f
- vrev64.32 d0, d0
- sub \INPTR, \INPTR, #1
- vld1.8 {d0[3]}, [\INPTR]
- sub \INPTR, \INPTR, #1
- vld1.8 {d0[2]}, [\INPTR]
- sub \INPTR, \INPTR, #1
- vld1.8 {d0[1]}, [\INPTR]
- sub \INPTR, \INPTR, #1
- vld1.8 {d0[0]}, [\INPTR]
-2:
- tst \WIDTH, #8
- beq 2f
- vmov d1, d0
- sub \INPTR, \INPTR, #8
- vld1.8 {d0}, [\INPTR]
-2: /* upsample the remaining pixels */
- vmovl.u8 q8, d0
- vext.8 q2, q1, q0, #15
- vmovl.u8 q9, d1
- vaddw.u8 q10, q15, d4
- vaddw.u8 q11, q15, d5
- vmlal.u8 q8, d4, d28
- vmlal.u8 q9, d5, d28
- vmlal.u8 q10, d0, d28
- vmlal.u8 q11, d1, d28
- vrshrn.u16 d10, q8, #2
- vrshrn.u16 d12, q9, #2
- vshrn.u16 d11, q10, #2
- vshrn.u16 d13, q11, #2
- vzip.8 d10, d11
- vzip.8 d12, d13
- /* store the remaining pixels */
- tst \WIDTH, #8
- beq 2f
- vst1.8 {d10, d11}, [\OUTPTR]!
- vmov q5, q6
-2:
- tst \WIDTH, #4
- beq 2f
- vst1.8 {d10}, [\OUTPTR]!
- vmov d10, d11
-2:
- tst \WIDTH, #2
- beq 2f
- vst1.8 {d10[0]}, [\OUTPTR]!
- vst1.8 {d10[1]}, [\OUTPTR]!
- vst1.8 {d10[2]}, [\OUTPTR]!
- vst1.8 {d10[3]}, [\OUTPTR]!
- vext.8 d10, d10, d10, #4
-2:
- tst \WIDTH, #1
- beq 2f
- vst1.8 {d10[0]}, [\OUTPTR]!
- vst1.8 {d10[1]}, [\OUTPTR]!
-2:
-9:
-.endm
-
-asm_function jsimd_h2v1_fancy_upsample_neon
-
- MAX_V_SAMP_FACTOR .req r0
- DOWNSAMPLED_WIDTH .req r1
- INPUT_DATA .req r2
- OUTPUT_DATA_PTR .req r3
- OUTPUT_DATA .req OUTPUT_DATA_PTR
-
- OUTPTR .req r4
- INPTR .req r5
- WIDTH .req ip
- TMP .req lr
-
- push {r4, r5, r6, lr}
- vpush {d8-d15}
-
- ldr OUTPUT_DATA, [OUTPUT_DATA_PTR]
- cmp MAX_V_SAMP_FACTOR, #0
- ble 99f
-
- /* initialize constants */
- vmov.u8 d28, #3
- vmov.u16 q15, #1
-11:
- ldr INPTR, [INPUT_DATA], #4
- ldr OUTPTR, [OUTPUT_DATA], #4
- mov WIDTH, DOWNSAMPLED_WIDTH
- upsample_row OUTPTR, INPTR, WIDTH, TMP
- subs MAX_V_SAMP_FACTOR, MAX_V_SAMP_FACTOR, #1
- bgt 11b
-
-99:
- vpop {d8-d15}
- pop {r4, r5, r6, pc}
-
- .unreq MAX_V_SAMP_FACTOR
- .unreq DOWNSAMPLED_WIDTH
- .unreq INPUT_DATA
- .unreq OUTPUT_DATA_PTR
- .unreq OUTPUT_DATA
-
- .unreq OUTPTR
- .unreq INPTR
- .unreq WIDTH
- .unreq TMP
-
-.purgem upsample16
-.purgem upsample32
-.purgem upsample_row
-
-
-/*****************************************************************************/
-
-/*
- * GLOBAL(JOCTET*)
- * jsimd_huff_encode_one_block (working_state *state, JOCTET *buffer,
- * JCOEFPTR block, int last_dc_val,
- * c_derived_tbl *dctbl, c_derived_tbl *actbl)
- *
- */
-
-.macro emit_byte BUFFER, PUT_BUFFER, PUT_BITS, ZERO, TMP
- sub \PUT_BITS, \PUT_BITS, #0x8
- lsr \TMP, \PUT_BUFFER, \PUT_BITS
- uxtb \TMP, \TMP
- strb \TMP, [\BUFFER, #1]!
- cmp \TMP, #0xff
- /*it eq*/
- strbeq \ZERO, [\BUFFER, #1]!
-.endm
-
-.macro put_bits PUT_BUFFER, PUT_BITS, CODE, SIZE
- /*lsl \PUT_BUFFER, \PUT_BUFFER, \SIZE*/
- add \PUT_BITS, \SIZE
- /*orr \PUT_BUFFER, \PUT_BUFFER, \CODE*/
- orr \PUT_BUFFER, \CODE, \PUT_BUFFER, lsl \SIZE
-.endm
-
-.macro checkbuf15 BUFFER, PUT_BUFFER, PUT_BITS, ZERO, TMP
- cmp \PUT_BITS, #0x10
- blt 15f
- eor \ZERO, \ZERO, \ZERO
- emit_byte \BUFFER, \PUT_BUFFER, \PUT_BITS, \ZERO, \TMP
- emit_byte \BUFFER, \PUT_BUFFER, \PUT_BITS, \ZERO, \TMP
-15:
-.endm
-
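
Taken together, these three macros are the standard JPEG entropy-coder bit sink:
put_bits appends SIZE bits of CODE to a left-justified accumulator, checkbuf15
flushes two bytes once 16 or more bits are pending, and emit_byte performs the
mandatory 0xFF 0x00 byte stuffing. A hedged C model of the combined logic (names
are illustrative; the asm pre-decrements BUFFER once up front, to which the
post-increment below is equivalent):

    /* Append `size` bits of `code`, flushing complete bytes and stuffing
     * a 0x00 after every emitted 0xFF, as the JPEG bitstream requires. */
    static unsigned char *
    put_bits_model (unsigned char *buf, unsigned int *accum, int *nbits,
                    unsigned int code, int size)
    {
      *accum = (*accum << size) | code;      /* put_bits */
      *nbits += size;
      if (*nbits >= 16) {                    /* checkbuf15 */
        int i;
        for (i = 0; i < 2; i++) {            /* emit_byte, twice */
          unsigned char b;
          *nbits -= 8;
          b = (unsigned char) (*accum >> *nbits);
          *buf++ = b;
          if (b == 0xFF)
            *buf++ = 0;                      /* byte stuffing */
        }
      }
      return buf;
    }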
-.balign 16
-jsimd_huff_encode_one_block_neon_consts:
- .byte 0x01
- .byte 0x02
- .byte 0x04
- .byte 0x08
- .byte 0x10
- .byte 0x20
- .byte 0x40
- .byte 0x80
-
-asm_function jsimd_huff_encode_one_block_neon
- push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
- add r7, sp, #0x1c
- sub r4, sp, #0x40
- bfc r4, #0, #5
- mov sp, r4 /* align sp on 32 bytes */
- vst1.64 {d8, d9, d10, d11}, [r4, :128]!
- vst1.64 {d12, d13, d14, d15}, [r4, :128]
- sub sp, #0x140 /* reserve 320 bytes */
- str r0, [sp, #0x18] /* working state > sp + 0x18 */
- add r4, sp, #0x20 /* r4 = t1 */
- ldr lr, [r7, #0x8] /* lr = dctbl */
- sub r10, r1, #0x1 /* r10=buffer-- */
- ldrsh r1, [r2]
- mov r9, #0x10
- mov r8, #0x1
- adr r5, jsimd_huff_encode_one_block_neon_consts
- /* prepare data */
- vld1.8 {d26}, [r5, :64]
- veor q8, q8, q8
- veor q9, q9, q9
- vdup.16 q14, r9
- vdup.16 q15, r8
- veor q10, q10, q10
- veor q11, q11, q11
- sub r1, r1, r3
- add r9, r2, #0x22
- add r8, r2, #0x18
- add r3, r2, #0x36
- vmov.16 d0[0], r1
- vld1.16 {d2[0]}, [r9, :16]
- vld1.16 {d4[0]}, [r8, :16]
- vld1.16 {d6[0]}, [r3, :16]
- add r1, r2, #0x2
- add r9, r2, #0x30
- add r8, r2, #0x26
- add r3, r2, #0x28
- vld1.16 {d0[1]}, [r1, :16]
- vld1.16 {d2[1]}, [r9, :16]
- vld1.16 {d4[1]}, [r8, :16]
- vld1.16 {d6[1]}, [r3, :16]
- add r1, r2, #0x10
- add r9, r2, #0x40
- add r8, r2, #0x34
- add r3, r2, #0x1a
- vld1.16 {d0[2]}, [r1, :16]
- vld1.16 {d2[2]}, [r9, :16]
- vld1.16 {d4[2]}, [r8, :16]
- vld1.16 {d6[2]}, [r3, :16]
- add r1, r2, #0x20
- add r9, r2, #0x32
- add r8, r2, #0x42
- add r3, r2, #0xc
- vld1.16 {d0[3]}, [r1, :16]
- vld1.16 {d2[3]}, [r9, :16]
- vld1.16 {d4[3]}, [r8, :16]
- vld1.16 {d6[3]}, [r3, :16]
- add r1, r2, #0x12
- add r9, r2, #0x24
- add r8, r2, #0x50
- add r3, r2, #0xe
- vld1.16 {d1[0]}, [r1, :16]
- vld1.16 {d3[0]}, [r9, :16]
- vld1.16 {d5[0]}, [r8, :16]
- vld1.16 {d7[0]}, [r3, :16]
- add r1, r2, #0x4
- add r9, r2, #0x16
- add r8, r2, #0x60
- add r3, r2, #0x1c
- vld1.16 {d1[1]}, [r1, :16]
- vld1.16 {d3[1]}, [r9, :16]
- vld1.16 {d5[1]}, [r8, :16]
- vld1.16 {d7[1]}, [r3, :16]
- add r1, r2, #0x6
- add r9, r2, #0x8
- add r8, r2, #0x52
- add r3, r2, #0x2a
- vld1.16 {d1[2]}, [r1, :16]
- vld1.16 {d3[2]}, [r9, :16]
- vld1.16 {d5[2]}, [r8, :16]
- vld1.16 {d7[2]}, [r3, :16]
- add r1, r2, #0x14
- add r9, r2, #0xa
- add r8, r2, #0x44
- add r3, r2, #0x38
- vld1.16 {d1[3]}, [r1, :16]
- vld1.16 {d3[3]}, [r9, :16]
- vld1.16 {d5[3]}, [r8, :16]
- vld1.16 {d7[3]}, [r3, :16]
- vcgt.s16 q8, q8, q0
- vcgt.s16 q9, q9, q1
- vcgt.s16 q10, q10, q2
- vcgt.s16 q11, q11, q3
- vabs.s16 q0, q0
- vabs.s16 q1, q1
- vabs.s16 q2, q2
- vabs.s16 q3, q3
- veor q8, q8, q0
- veor q9, q9, q1
- veor q10, q10, q2
- veor q11, q11, q3
- add r9, r4, #0x20
- add r8, r4, #0x80
- add r3, r4, #0xa0
- vclz.i16 q0, q0
- vclz.i16 q1, q1
- vclz.i16 q2, q2
- vclz.i16 q3, q3
- vsub.i16 q0, q14, q0
- vsub.i16 q1, q14, q1
- vsub.i16 q2, q14, q2
- vsub.i16 q3, q14, q3
- vst1.16 {d0, d1, d2, d3}, [r4, :256]
- vst1.16 {d4, d5, d6, d7}, [r9, :256]
- vshl.s16 q0, q15, q0
- vshl.s16 q1, q15, q1
- vshl.s16 q2, q15, q2
- vshl.s16 q3, q15, q3
- vsub.i16 q0, q0, q15
- vsub.i16 q1, q1, q15
- vsub.i16 q2, q2, q15
- vsub.i16 q3, q3, q15
- vand q8, q8, q0
- vand q9, q9, q1
- vand q10, q10, q2
- vand q11, q11, q3
- vst1.16 {d16, d17, d18, d19}, [r8, :256]
- vst1.16 {d20, d21, d22, d23}, [r3, :256]
- add r1, r2, #0x46
- add r9, r2, #0x3a
- add r8, r2, #0x74
- add r3, r2, #0x6a
- vld1.16 {d8[0]}, [r1, :16]
- vld1.16 {d10[0]}, [r9, :16]
- vld1.16 {d12[0]}, [r8, :16]
- vld1.16 {d14[0]}, [r3, :16]
- veor q8, q8, q8
- veor q9, q9, q9
- veor q10, q10, q10
- veor q11, q11, q11
- add r1, r2, #0x54
- add r9, r2, #0x2c
- add r8, r2, #0x76
- add r3, r2, #0x78
- vld1.16 {d8[1]}, [r1, :16]
- vld1.16 {d10[1]}, [r9, :16]
- vld1.16 {d12[1]}, [r8, :16]
- vld1.16 {d14[1]}, [r3, :16]
- add r1, r2, #0x62
- add r9, r2, #0x1e
- add r8, r2, #0x68
- add r3, r2, #0x7a
- vld1.16 {d8[2]}, [r1, :16]
- vld1.16 {d10[2]}, [r9, :16]
- vld1.16 {d12[2]}, [r8, :16]
- vld1.16 {d14[2]}, [r3, :16]
- add r1, r2, #0x70
- add r9, r2, #0x2e
- add r8, r2, #0x5a
- add r3, r2, #0x6c
- vld1.16 {d8[3]}, [r1, :16]
- vld1.16 {d10[3]}, [r9, :16]
- vld1.16 {d12[3]}, [r8, :16]
- vld1.16 {d14[3]}, [r3, :16]
- add r1, r2, #0x72
- add r9, r2, #0x3c
- add r8, r2, #0x4c
- add r3, r2, #0x5e
- vld1.16 {d9[0]}, [r1, :16]
- vld1.16 {d11[0]}, [r9, :16]
- vld1.16 {d13[0]}, [r8, :16]
- vld1.16 {d15[0]}, [r3, :16]
- add r1, r2, #0x64
- add r9, r2, #0x4a
- add r8, r2, #0x3e
- add r3, r2, #0x6e
- vld1.16 {d9[1]}, [r1, :16]
- vld1.16 {d11[1]}, [r9, :16]
- vld1.16 {d13[1]}, [r8, :16]
- vld1.16 {d15[1]}, [r3, :16]
- add r1, r2, #0x56
- add r9, r2, #0x58
- add r8, r2, #0x4e
- add r3, r2, #0x7c
- vld1.16 {d9[2]}, [r1, :16]
- vld1.16 {d11[2]}, [r9, :16]
- vld1.16 {d13[2]}, [r8, :16]
- vld1.16 {d15[2]}, [r3, :16]
- add r1, r2, #0x48
- add r9, r2, #0x66
- add r8, r2, #0x5c
- add r3, r2, #0x7e
- vld1.16 {d9[3]}, [r1, :16]
- vld1.16 {d11[3]}, [r9, :16]
- vld1.16 {d13[3]}, [r8, :16]
- vld1.16 {d15[3]}, [r3, :16]
- vcgt.s16 q8, q8, q4
- vcgt.s16 q9, q9, q5
- vcgt.s16 q10, q10, q6
- vcgt.s16 q11, q11, q7
- vabs.s16 q4, q4
- vabs.s16 q5, q5
- vabs.s16 q6, q6
- vabs.s16 q7, q7
- veor q8, q8, q4
- veor q9, q9, q5
- veor q10, q10, q6
- veor q11, q11, q7
- add r1, r4, #0x40
- add r9, r4, #0x60
- add r8, r4, #0xc0
- add r3, r4, #0xe0
- vclz.i16 q4, q4
- vclz.i16 q5, q5
- vclz.i16 q6, q6
- vclz.i16 q7, q7
- vsub.i16 q4, q14, q4
- vsub.i16 q5, q14, q5
- vsub.i16 q6, q14, q6
- vsub.i16 q7, q14, q7
- vst1.16 {d8, d9, d10, d11}, [r1, :256]
- vst1.16 {d12, d13, d14, d15}, [r9, :256]
- vshl.s16 q4, q15, q4
- vshl.s16 q5, q15, q5
- vshl.s16 q6, q15, q6
- vshl.s16 q7, q15, q7
- vsub.i16 q4, q4, q15
- vsub.i16 q5, q5, q15
- vsub.i16 q6, q6, q15
- vsub.i16 q7, q7, q15
- vand q8, q8, q4
- vand q9, q9, q5
- vand q10, q10, q6
- vand q11, q11, q7
- vst1.16 {d16, d17, d18, d19}, [r8, :256]
- vst1.16 {d20, d21, d22, d23}, [r3, :256]
- ldr r12, [r7, #0xc] /* r12 = actbl */
- add r1, lr, #0x400 /* r1 = dctbl->ehufsi */
- mov r9, r12 /* r9 = actbl */
- add r6, r4, #0x80 /* r6 = t2 */
- ldr r11, [r0, #0x8] /* r11 = put_buffer */
- ldr r4, [r0, #0xc] /* r4 = put_bits */
- ldrh r2, [r6, #-128] /* r2 = nbits */
- ldrh r3, [r6] /* r3 = temp2 & (((JLONG) 1)<<nbits) - 1; */
- ldr r0, [lr, r2, lsl #2]
- ldrb r5, [r1, r2]
- put_bits r11, r4, r0, r5
- checkbuf15 r10, r11, r4, r5, r0
- put_bits r11, r4, r3, r2
- checkbuf15 r10, r11, r4, r5, r0
- mov lr, r6 /* lr = t2 */
- add r5, r9, #0x400 /* r5 = actbl->ehufsi */
- ldrsb r6, [r5, #0xf0] /* r6 = actbl->ehufsi[0xf0] */
- veor q8, q8, q8
- vceq.i16 q0, q0, q8
- vceq.i16 q1, q1, q8
- vceq.i16 q2, q2, q8
- vceq.i16 q3, q3, q8
- vceq.i16 q4, q4, q8
- vceq.i16 q5, q5, q8
- vceq.i16 q6, q6, q8
- vceq.i16 q7, q7, q8
- vmovn.i16 d0, q0
- vmovn.i16 d2, q1
- vmovn.i16 d4, q2
- vmovn.i16 d6, q3
- vmovn.i16 d8, q4
- vmovn.i16 d10, q5
- vmovn.i16 d12, q6
- vmovn.i16 d14, q7
- vand d0, d0, d26
- vand d2, d2, d26
- vand d4, d4, d26
- vand d6, d6, d26
- vand d8, d8, d26
- vand d10, d10, d26
- vand d12, d12, d26
- vand d14, d14, d26
- vpadd.i8 d0, d0, d2
- vpadd.i8 d4, d4, d6
- vpadd.i8 d8, d8, d10
- vpadd.i8 d12, d12, d14
- vpadd.i8 d0, d0, d4
- vpadd.i8 d8, d8, d12
- vpadd.i8 d0, d0, d8
- vmov.32 r1, d0[1]
- vmov.32 r8, d0[0]
- mvn r1, r1
- mvn r8, r8
- lsrs r1, r1, #0x1
- rrx r8, r8 /* shift in last r1 bit while shifting out DC bit */
- rbit r1, r1 /* r1 = index1 */
- rbit r8, r8 /* r8 = index0 */
- ldr r0, [r9, #0x3c0] /* r0 = actbl->ehufco[0xf0] */
- str r1, [sp, #0x14] /* index1 > sp + 0x14 */
- cmp r8, #0x0
- beq 6f
-1:
- clz r2, r8
- add lr, lr, r2, lsl #1
- lsl r8, r8, r2
- ldrh r1, [lr, #-126]
-2:
- cmp r2, #0x10
- blt 3f
- sub r2, r2, #0x10
- put_bits r11, r4, r0, r6
- cmp r4, #0x10
- blt 2b
- eor r3, r3, r3
- emit_byte r10, r11, r4, r3, r12
- emit_byte r10, r11, r4, r3, r12
- b 2b
-3:
- add r2, r1, r2, lsl #4
- ldrh r3, [lr, #2]!
- ldr r12, [r9, r2, lsl #2]
- ldrb r2, [r5, r2]
- put_bits r11, r4, r12, r2
- checkbuf15 r10, r11, r4, r2, r12
- put_bits r11, r4, r3, r1
- checkbuf15 r10, r11, r4, r2, r12
- lsls r8, r8, #0x1
- bne 1b
-6:
- add r12, sp, #0x20 /* r12 = t1 */
- ldr r8, [sp, #0x14] /* r8 = index1 */
- adds r12, #0xc0 /* r12 = t2 + (DCTSIZE2/2) */
- cmp r8, #0x0
- beq 6f
- clz r2, r8
- sub r12, r12, lr
- lsl r8, r8, r2
- add r2, r2, r12, lsr #1
- add lr, lr, r2, lsl #1
- b 7f
-1:
- clz r2, r8
- add lr, lr, r2, lsl #1
- lsl r8, r8, r2
-7:
- ldrh r1, [lr, #-126]
-2:
- cmp r2, #0x10
- blt 3f
- sub r2, r2, #0x10
- put_bits r11, r4, r0, r6
- cmp r4, #0x10
- blt 2b
- eor r3, r3, r3
- emit_byte r10, r11, r4, r3, r12
- emit_byte r10, r11, r4, r3, r12
- b 2b
-3:
- add r2, r1, r2, lsl #4
- ldrh r3, [lr, #2]!
- ldr r12, [r9, r2, lsl #2]
- ldrb r2, [r5, r2]
- put_bits r11, r4, r12, r2
- checkbuf15 r10, r11, r4, r2, r12
- put_bits r11, r4, r3, r1
- checkbuf15 r10, r11, r4, r2, r12
- lsls r8, r8, #0x1
- bne 1b
-6:
- add r0, sp, #0x20
- add r0, #0xfe
- cmp lr, r0
- bhs 1f
- ldr r1, [r9]
- ldrb r0, [r5]
- put_bits r11, r4, r1, r0
- checkbuf15 r10, r11, r4, r0, r1
-1:
- ldr r12, [sp, #0x18]
- str r11, [r12, #0x8]
- str r4, [r12, #0xc]
- add r0, r10, #0x1
- add r4, sp, #0x140
- vld1.64 {d8, d9, d10, d11}, [r4, :128]!
- vld1.64 {d12, d13, d14, d15}, [r4, :128]
- sub r4, r7, #0x1c
- mov sp, r4
- pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
-
-.purgem emit_byte
-.purgem put_bits
-.purgem checkbuf15
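
For readers following the rbit/clz loops above: the vceq/vand/vpadd chain condenses
the 64 coefficients into a 64-bit nonzero bitmap split across r8 and r1, mvn flips
it so a set bit means "nonzero" (lsrs/rrx then shift out the DC bit), and rbit
reverses each word so that clz returns the zero-run length before the next nonzero
coefficient. A compact C sketch of that scanning idea, assuming GCC/Clang's
__builtin_clz (illustrative, one 32-coefficient word):

    /* Walk `bits` (bit 31 = first coefficient, set bit = nonzero). */
    static void
    scan_nonzero_runs (unsigned int bits)
    {
      while (bits != 0) {
        int run = __builtin_clz(bits);  /* zeros before next nonzero */
        bits <<= run;                   /* drop the zero run */
        /* ... emit the (run, size) Huffman symbol here, splitting
         * runs of 16+ into ZRL (0xF0) symbols as the asm does ... */
        bits <<= 1;                     /* consume the nonzero coeff */
      }
    }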
diff --git a/media/libjpeg/simd/jsimd_i386.c b/media/libjpeg/simd/jsimd_i386.c
deleted file mode 100644
index 6da8bd8913..0000000000
--- a/media/libjpeg/simd/jsimd_i386.c
+++ /dev/null
@@ -1,1091 +0,0 @@
-/*
- * jsimd_i386.c
- *
- * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2009-2011, 2013-2014, 2016, D. R. Commander.
- * Copyright (C) 2015, Matthieu Darbois.
- *
- * Based on the x86 SIMD extension for IJG JPEG library,
- * Copyright (C) 1999-2006, MIYASAKA Masaru.
- * For conditions of distribution and use, see copyright notice in jsimdext.inc
- *
- * This file contains the interface between the "normal" portions
- * of the library and the SIMD implementations when running on a
- * 32-bit x86 architecture.
- */
-
-#define JPEG_INTERNALS
-#include "../jinclude.h"
-#include "../jpeglib.h"
-#include "../jsimd.h"
-#include "../jdct.h"
-#include "../jsimddct.h"
-#include "jsimd.h"
-
-/*
- * In the PIC cases, we have no guarantee that constants will keep
- * their alignment. This macro allows us to verify it at runtime.
- */
-#define IS_ALIGNED(ptr, order) (((unsigned)ptr & ((1 << order) - 1)) == 0)
-
-#define IS_ALIGNED_SSE(ptr) (IS_ALIGNED(ptr, 4)) /* 16 byte alignment */
-
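
The check is the usual power-of-two mask test: a pointer is (1 << order)-byte
aligned exactly when its low `order` bits are clear. Two illustrative uses with
hypothetical addresses:

    #include <assert.h>

    static void
    alignment_examples (void)
    {
      /* 1 << 4 == 16, so order 4 tests SSE's 16-byte alignment. */
      assert(IS_ALIGNED((void *)0x1000, 4));   /* 0x1000 & 0xF == 0 */
      assert(!IS_ALIGNED((void *)0x1004, 4));  /* 0x1004 & 0xF == 4 */
    }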
-static unsigned int simd_support = ~0;
-static unsigned int simd_huffman = 1;
-
-/*
- * Check what SIMD accelerations are supported.
- *
- * FIXME: This code is racy under a multi-threaded environment.
- */
-LOCAL(void)
-init_simd (void)
-{
- char *env = NULL;
-
- if (simd_support != ~0U)
- return;
-
- simd_support = jpeg_simd_cpu_support();
-
- /* Force different settings through environment variables */
- env = getenv("JSIMD_FORCEMMX");
- if ((env != NULL) && (strcmp(env, "1") == 0))
- simd_support &= JSIMD_MMX;
- env = getenv("JSIMD_FORCE3DNOW");
- if ((env != NULL) && (strcmp(env, "1") == 0))
- simd_support &= JSIMD_3DNOW|JSIMD_MMX;
- env = getenv("JSIMD_FORCESSE");
- if ((env != NULL) && (strcmp(env, "1") == 0))
- simd_support &= JSIMD_SSE|JSIMD_MMX;
- env = getenv("JSIMD_FORCESSE2");
- if ((env != NULL) && (strcmp(env, "1") == 0))
- simd_support &= JSIMD_SSE2;
- env = getenv("JSIMD_FORCENONE");
- if ((env != NULL) && (strcmp(env, "1") == 0))
- simd_support = 0;
- env = getenv("JSIMD_NOHUFFENC");
- if ((env != NULL) && (strcmp(env, "1") == 0))
- simd_huffman = 0;
-}
-
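
One conventional way to address the FIXME above, where POSIX threads are
available, is to funnel detection through pthread_once so the CPU probe and the
environment overrides run exactly once. A sketch only, assuming <pthread.h>;
this is not what the library does:

    #include <pthread.h>

    static pthread_once_t simd_once = PTHREAD_ONCE_INIT;

    static void
    init_simd_body (void)
    {
      /* the body of init_simd() above, minus the early-return guard */
    }

    LOCAL(void)
    init_simd (void)
    {
      pthread_once(&simd_once, init_simd_body);
    }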
-GLOBAL(int)
-jsimd_can_rgb_ycc (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
- return 0;
-
- if ((simd_support & JSIMD_SSE2) &&
- IS_ALIGNED_SSE(jconst_rgb_ycc_convert_sse2))
- return 1;
- if (simd_support & JSIMD_MMX)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_rgb_gray (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
- return 0;
-
- if ((simd_support & JSIMD_SSE2) &&
- IS_ALIGNED_SSE(jconst_rgb_gray_convert_sse2))
- return 1;
- if (simd_support & JSIMD_MMX)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_ycc_rgb (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
- return 0;
-
- if ((simd_support & JSIMD_SSE2) &&
- IS_ALIGNED_SSE(jconst_ycc_rgb_convert_sse2))
- return 1;
- if (simd_support & JSIMD_MMX)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_ycc_rgb565 (void)
-{
- return 0;
-}
-
-GLOBAL(void)
-jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
- JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows)
-{
- void (*sse2fct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
- void (*mmxfct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
-
- switch(cinfo->in_color_space) {
- case JCS_EXT_RGB:
- sse2fct=jsimd_extrgb_ycc_convert_sse2;
- mmxfct=jsimd_extrgb_ycc_convert_mmx;
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- sse2fct=jsimd_extrgbx_ycc_convert_sse2;
- mmxfct=jsimd_extrgbx_ycc_convert_mmx;
- break;
- case JCS_EXT_BGR:
- sse2fct=jsimd_extbgr_ycc_convert_sse2;
- mmxfct=jsimd_extbgr_ycc_convert_mmx;
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- sse2fct=jsimd_extbgrx_ycc_convert_sse2;
- mmxfct=jsimd_extbgrx_ycc_convert_mmx;
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- sse2fct=jsimd_extxbgr_ycc_convert_sse2;
- mmxfct=jsimd_extxbgr_ycc_convert_mmx;
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- sse2fct=jsimd_extxrgb_ycc_convert_sse2;
- mmxfct=jsimd_extxrgb_ycc_convert_mmx;
- break;
- default:
- sse2fct=jsimd_rgb_ycc_convert_sse2;
- mmxfct=jsimd_rgb_ycc_convert_mmx;
- break;
- }
-
- if ((simd_support & JSIMD_SSE2) &&
- IS_ALIGNED_SSE(jconst_rgb_ycc_convert_sse2))
- sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
- else if (simd_support & JSIMD_MMX)
- mmxfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
-}
-
-GLOBAL(void)
-jsimd_rgb_gray_convert (j_compress_ptr cinfo,
- JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows)
-{
- void (*sse2fct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
- void (*mmxfct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
-
- switch(cinfo->in_color_space) {
- case JCS_EXT_RGB:
- sse2fct=jsimd_extrgb_gray_convert_sse2;
- mmxfct=jsimd_extrgb_gray_convert_mmx;
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- sse2fct=jsimd_extrgbx_gray_convert_sse2;
- mmxfct=jsimd_extrgbx_gray_convert_mmx;
- break;
- case JCS_EXT_BGR:
- sse2fct=jsimd_extbgr_gray_convert_sse2;
- mmxfct=jsimd_extbgr_gray_convert_mmx;
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- sse2fct=jsimd_extbgrx_gray_convert_sse2;
- mmxfct=jsimd_extbgrx_gray_convert_mmx;
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- sse2fct=jsimd_extxbgr_gray_convert_sse2;
- mmxfct=jsimd_extxbgr_gray_convert_mmx;
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- sse2fct=jsimd_extxrgb_gray_convert_sse2;
- mmxfct=jsimd_extxrgb_gray_convert_mmx;
- break;
- default:
- sse2fct=jsimd_rgb_gray_convert_sse2;
- mmxfct=jsimd_rgb_gray_convert_mmx;
- break;
- }
-
- if ((simd_support & JSIMD_SSE2) &&
- IS_ALIGNED_SSE(jconst_rgb_gray_convert_sse2))
- sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
- else if (simd_support & JSIMD_MMX)
- mmxfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
-}
-
-GLOBAL(void)
-jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
-{
- void (*sse2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
- void (*mmxfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
-
- switch(cinfo->out_color_space) {
- case JCS_EXT_RGB:
- sse2fct=jsimd_ycc_extrgb_convert_sse2;
- mmxfct=jsimd_ycc_extrgb_convert_mmx;
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- sse2fct=jsimd_ycc_extrgbx_convert_sse2;
- mmxfct=jsimd_ycc_extrgbx_convert_mmx;
- break;
- case JCS_EXT_BGR:
- sse2fct=jsimd_ycc_extbgr_convert_sse2;
- mmxfct=jsimd_ycc_extbgr_convert_mmx;
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- sse2fct=jsimd_ycc_extbgrx_convert_sse2;
- mmxfct=jsimd_ycc_extbgrx_convert_mmx;
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- sse2fct=jsimd_ycc_extxbgr_convert_sse2;
- mmxfct=jsimd_ycc_extxbgr_convert_mmx;
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- sse2fct=jsimd_ycc_extxrgb_convert_sse2;
- mmxfct=jsimd_ycc_extxrgb_convert_mmx;
- break;
- default:
- sse2fct=jsimd_ycc_rgb_convert_sse2;
- mmxfct=jsimd_ycc_rgb_convert_mmx;
- break;
- }
-
- if ((simd_support & JSIMD_SSE2) &&
- IS_ALIGNED_SSE(jconst_ycc_rgb_convert_sse2))
- sse2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
- else if (simd_support & JSIMD_MMX)
- mmxfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
-}
-
-GLOBAL(void)
-jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
-{
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_downsample (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_SSE2)
- return 1;
- if (simd_support & JSIMD_MMX)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_downsample (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_SSE2)
- return 1;
- if (simd_support & JSIMD_MMX)
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY output_data)
-{
- if (simd_support & JSIMD_SSE2)
- jsimd_h2v2_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor,
- compptr->v_samp_factor,
- compptr->width_in_blocks, input_data,
- output_data);
- else if (simd_support & JSIMD_MMX)
- jsimd_h2v2_downsample_mmx(cinfo->image_width, cinfo->max_v_samp_factor,
- compptr->v_samp_factor, compptr->width_in_blocks,
- input_data, output_data);
-}
-
-GLOBAL(void)
-jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY output_data)
-{
- if (simd_support & JSIMD_SSE2)
- jsimd_h2v1_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor,
- compptr->v_samp_factor,
- compptr->width_in_blocks, input_data,
- output_data);
- else if (simd_support & JSIMD_MMX)
- jsimd_h2v1_downsample_mmx(cinfo->image_width, cinfo->max_v_samp_factor,
- compptr->v_samp_factor, compptr->width_in_blocks,
- input_data, output_data);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_upsample (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_SSE2)
- return 1;
- if (simd_support & JSIMD_MMX)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_upsample (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_SSE2)
- return 1;
- if (simd_support & JSIMD_MMX)
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_upsample (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
-{
- if (simd_support & JSIMD_SSE2)
- jsimd_h2v2_upsample_sse2(cinfo->max_v_samp_factor, cinfo->output_width,
- input_data, output_data_ptr);
- else if (simd_support & JSIMD_MMX)
- jsimd_h2v2_upsample_mmx(cinfo->max_v_samp_factor, cinfo->output_width,
- input_data, output_data_ptr);
-}
-
-GLOBAL(void)
-jsimd_h2v1_upsample (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
-{
- if (simd_support & JSIMD_SSE2)
- jsimd_h2v1_upsample_sse2(cinfo->max_v_samp_factor, cinfo->output_width,
- input_data, output_data_ptr);
- else if (simd_support & JSIMD_MMX)
- jsimd_h2v1_upsample_mmx(cinfo->max_v_samp_factor, cinfo->output_width,
- input_data, output_data_ptr);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_fancy_upsample (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if ((simd_support & JSIMD_SSE2) &&
- IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
- return 1;
- if (simd_support & JSIMD_MMX)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_fancy_upsample (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if ((simd_support & JSIMD_SSE2) &&
- IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
- return 1;
- if (simd_support & JSIMD_MMX)
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
-{
- if ((simd_support & JSIMD_SSE2) &&
- IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
- jsimd_h2v2_fancy_upsample_sse2(cinfo->max_v_samp_factor,
- compptr->downsampled_width, input_data,
- output_data_ptr);
- else if (simd_support & JSIMD_MMX)
- jsimd_h2v2_fancy_upsample_mmx(cinfo->max_v_samp_factor,
- compptr->downsampled_width, input_data,
- output_data_ptr);
-}
-
-GLOBAL(void)
-jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
-{
- if ((simd_support & JSIMD_SSE2) &&
- IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
- jsimd_h2v1_fancy_upsample_sse2(cinfo->max_v_samp_factor,
- compptr->downsampled_width, input_data,
- output_data_ptr);
- else if (simd_support & JSIMD_MMX)
- jsimd_h2v1_fancy_upsample_mmx(cinfo->max_v_samp_factor,
- compptr->downsampled_width, input_data,
- output_data_ptr);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_merged_upsample (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if ((simd_support & JSIMD_SSE2) &&
- IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
- return 1;
- if (simd_support & JSIMD_MMX)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_merged_upsample (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if ((simd_support & JSIMD_SSE2) &&
- IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
- return 1;
- if (simd_support & JSIMD_MMX)
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr,
- JSAMPARRAY output_buf)
-{
- void (*sse2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
- void (*mmxfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
-
- switch(cinfo->out_color_space) {
- case JCS_EXT_RGB:
- sse2fct=jsimd_h2v2_extrgb_merged_upsample_sse2;
- mmxfct=jsimd_h2v2_extrgb_merged_upsample_mmx;
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- sse2fct=jsimd_h2v2_extrgbx_merged_upsample_sse2;
- mmxfct=jsimd_h2v2_extrgbx_merged_upsample_mmx;
- break;
- case JCS_EXT_BGR:
- sse2fct=jsimd_h2v2_extbgr_merged_upsample_sse2;
- mmxfct=jsimd_h2v2_extbgr_merged_upsample_mmx;
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- sse2fct=jsimd_h2v2_extbgrx_merged_upsample_sse2;
- mmxfct=jsimd_h2v2_extbgrx_merged_upsample_mmx;
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- sse2fct=jsimd_h2v2_extxbgr_merged_upsample_sse2;
- mmxfct=jsimd_h2v2_extxbgr_merged_upsample_mmx;
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- sse2fct=jsimd_h2v2_extxrgb_merged_upsample_sse2;
- mmxfct=jsimd_h2v2_extxrgb_merged_upsample_mmx;
- break;
- default:
- sse2fct=jsimd_h2v2_merged_upsample_sse2;
- mmxfct=jsimd_h2v2_merged_upsample_mmx;
- break;
- }
-
- if ((simd_support & JSIMD_SSE2) &&
- IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
- sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
- else if (simd_support & JSIMD_MMX)
- mmxfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
-}
-
-GLOBAL(void)
-jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr,
- JSAMPARRAY output_buf)
-{
- void (*sse2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
- void (*mmxfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
-
- switch(cinfo->out_color_space) {
- case JCS_EXT_RGB:
- sse2fct=jsimd_h2v1_extrgb_merged_upsample_sse2;
- mmxfct=jsimd_h2v1_extrgb_merged_upsample_mmx;
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- sse2fct=jsimd_h2v1_extrgbx_merged_upsample_sse2;
- mmxfct=jsimd_h2v1_extrgbx_merged_upsample_mmx;
- break;
- case JCS_EXT_BGR:
- sse2fct=jsimd_h2v1_extbgr_merged_upsample_sse2;
- mmxfct=jsimd_h2v1_extbgr_merged_upsample_mmx;
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- sse2fct=jsimd_h2v1_extbgrx_merged_upsample_sse2;
- mmxfct=jsimd_h2v1_extbgrx_merged_upsample_mmx;
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- sse2fct=jsimd_h2v1_extxbgr_merged_upsample_sse2;
- mmxfct=jsimd_h2v1_extxbgr_merged_upsample_mmx;
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- sse2fct=jsimd_h2v1_extxrgb_merged_upsample_sse2;
- mmxfct=jsimd_h2v1_extxrgb_merged_upsample_mmx;
- break;
- default:
- sse2fct=jsimd_h2v1_merged_upsample_sse2;
- mmxfct=jsimd_h2v1_merged_upsample_mmx;
- break;
- }
-
- if ((simd_support & JSIMD_SSE2) &&
- IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
- sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
- else if (simd_support & JSIMD_MMX)
- mmxfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
-}
-
-GLOBAL(int)
-jsimd_can_convsamp (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(DCTELEM) != 2)
- return 0;
-
- if (simd_support & JSIMD_SSE2)
- return 1;
- if (simd_support & JSIMD_MMX)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_convsamp_float (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(FAST_FLOAT) != 4)
- return 0;
-
- if (simd_support & JSIMD_SSE2)
- return 1;
- if (simd_support & JSIMD_SSE)
- return 1;
- if (simd_support & JSIMD_3DNOW)
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
- DCTELEM *workspace)
-{
- if (simd_support & JSIMD_SSE2)
- jsimd_convsamp_sse2(sample_data, start_col, workspace);
- else if (simd_support & JSIMD_MMX)
- jsimd_convsamp_mmx(sample_data, start_col, workspace);
-}
-
-GLOBAL(void)
-jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
- FAST_FLOAT *workspace)
-{
- if (simd_support & JSIMD_SSE2)
- jsimd_convsamp_float_sse2(sample_data, start_col, workspace);
- else if (simd_support & JSIMD_SSE)
- jsimd_convsamp_float_sse(sample_data, start_col, workspace);
- else if (simd_support & JSIMD_3DNOW)
- jsimd_convsamp_float_3dnow(sample_data, start_col, workspace);
-}
-
-GLOBAL(int)
-jsimd_can_fdct_islow (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(DCTELEM) != 2)
- return 0;
-
- if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_islow_sse2))
- return 1;
- if (simd_support & JSIMD_MMX)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_fdct_ifast (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(DCTELEM) != 2)
- return 0;
-
- if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_ifast_sse2))
- return 1;
- if (simd_support & JSIMD_MMX)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_fdct_float (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(FAST_FLOAT) != 4)
- return 0;
-
- if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_fdct_float_sse))
- return 1;
- if (simd_support & JSIMD_3DNOW)
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_fdct_islow (DCTELEM *data)
-{
- if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_islow_sse2))
- jsimd_fdct_islow_sse2(data);
- else if (simd_support & JSIMD_MMX)
- jsimd_fdct_islow_mmx(data);
-}
-
-GLOBAL(void)
-jsimd_fdct_ifast (DCTELEM *data)
-{
- if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_ifast_sse2))
- jsimd_fdct_ifast_sse2(data);
- else if (simd_support & JSIMD_MMX)
- jsimd_fdct_ifast_mmx(data);
-}
-
-GLOBAL(void)
-jsimd_fdct_float (FAST_FLOAT *data)
-{
- if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_fdct_float_sse))
- jsimd_fdct_float_sse(data);
- else if (simd_support & JSIMD_3DNOW)
- jsimd_fdct_float_3dnow(data);
-}
-
-GLOBAL(int)
-jsimd_can_quantize (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (sizeof(DCTELEM) != 2)
- return 0;
-
- if (simd_support & JSIMD_SSE2)
- return 1;
- if (simd_support & JSIMD_MMX)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_quantize_float (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (sizeof(FAST_FLOAT) != 4)
- return 0;
-
- if (simd_support & JSIMD_SSE2)
- return 1;
- if (simd_support & JSIMD_SSE)
- return 1;
- if (simd_support & JSIMD_3DNOW)
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_quantize (JCOEFPTR coef_block, DCTELEM *divisors,
- DCTELEM *workspace)
-{
- if (simd_support & JSIMD_SSE2)
- jsimd_quantize_sse2(coef_block, divisors, workspace);
- else if (simd_support & JSIMD_MMX)
- jsimd_quantize_mmx(coef_block, divisors, workspace);
-}
-
-GLOBAL(void)
-jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors,
- FAST_FLOAT *workspace)
-{
- if (simd_support & JSIMD_SSE2)
- jsimd_quantize_float_sse2(coef_block, divisors, workspace);
- else if (simd_support & JSIMD_SSE)
- jsimd_quantize_float_sse(coef_block, divisors, workspace);
- else if (simd_support & JSIMD_3DNOW)
- jsimd_quantize_float_3dnow(coef_block, divisors, workspace);
-}
-
-GLOBAL(int)
-jsimd_can_idct_2x2 (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(ISLOW_MULT_TYPE) != 2)
- return 0;
-
- if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
- return 1;
- if (simd_support & JSIMD_MMX)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_4x4 (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(ISLOW_MULT_TYPE) != 2)
- return 0;
-
- if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
- return 1;
- if (simd_support & JSIMD_MMX)
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
- if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
- jsimd_idct_2x2_sse2(compptr->dct_table, coef_block, output_buf,
- output_col);
- else if (simd_support & JSIMD_MMX)
- jsimd_idct_2x2_mmx(compptr->dct_table, coef_block, output_buf, output_col);
-}
-
-GLOBAL(void)
-jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
- if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
- jsimd_idct_4x4_sse2(compptr->dct_table, coef_block, output_buf,
- output_col);
- else if (simd_support & JSIMD_MMX)
- jsimd_idct_4x4_mmx(compptr->dct_table, coef_block, output_buf, output_col);
-}
-
-GLOBAL(int)
-jsimd_can_idct_islow (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(ISLOW_MULT_TYPE) != 2)
- return 0;
-
- if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_islow_sse2))
- return 1;
- if (simd_support & JSIMD_MMX)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_ifast (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(IFAST_MULT_TYPE) != 2)
- return 0;
- if (IFAST_SCALE_BITS != 2)
- return 0;
-
- if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_ifast_sse2))
- return 1;
- if (simd_support & JSIMD_MMX)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_float (void)
-{
- init_simd();
-
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(FAST_FLOAT) != 4)
- return 0;
- if (sizeof(FLOAT_MULT_TYPE) != 4)
- return 0;
-
- if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_float_sse2))
- return 1;
- if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_idct_float_sse))
- return 1;
- if (simd_support & JSIMD_3DNOW)
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
- if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_islow_sse2))
- jsimd_idct_islow_sse2(compptr->dct_table, coef_block, output_buf,
- output_col);
- else if (simd_support & JSIMD_MMX)
- jsimd_idct_islow_mmx(compptr->dct_table, coef_block, output_buf,
- output_col);
-}
-
-GLOBAL(void)
-jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
- if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_ifast_sse2))
- jsimd_idct_ifast_sse2(compptr->dct_table, coef_block, output_buf,
- output_col);
- else if (simd_support & JSIMD_MMX)
- jsimd_idct_ifast_mmx(compptr->dct_table, coef_block, output_buf,
- output_col);
-}
-
-GLOBAL(void)
-jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
- if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_float_sse2))
- jsimd_idct_float_sse2(compptr->dct_table, coef_block, output_buf,
- output_col);
- else if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_idct_float_sse))
- jsimd_idct_float_sse(compptr->dct_table, coef_block, output_buf,
- output_col);
- else if (simd_support & JSIMD_3DNOW)
- jsimd_idct_float_3dnow(compptr->dct_table, coef_block, output_buf,
- output_col);
-}
-
-GLOBAL(int)
-jsimd_can_huff_encode_one_block (void)
-{
- init_simd();
-
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
-
- if ((simd_support & JSIMD_SSE2) && simd_huffman &&
- IS_ALIGNED_SSE(jconst_huff_encode_one_block))
- return 1;
-
- return 0;
-}
-
-GLOBAL(JOCTET*)
-jsimd_huff_encode_one_block (void *state, JOCTET *buffer, JCOEFPTR block,
- int last_dc_val, c_derived_tbl *dctbl,
- c_derived_tbl *actbl)
-{
- return jsimd_huff_encode_one_block_sse2(state, buffer, block, last_dc_val,
- dctbl, actbl);
-}
diff --git a/media/libjpeg/simd/jsimd_mips.c b/media/libjpeg/simd/jsimd_mips.c
deleted file mode 100644
index 63b8115d16..0000000000
--- a/media/libjpeg/simd/jsimd_mips.c
+++ /dev/null
@@ -1,1138 +0,0 @@
-/*
- * jsimd_mips.c
- *
- * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2009-2011, 2014, 2016, D. R. Commander.
- * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
- * Copyright (C) 2015, Matthieu Darbois.
- *
- * Based on the x86 SIMD extension for IJG JPEG library,
- * Copyright (C) 1999-2006, MIYASAKA Masaru.
- * For conditions of distribution and use, see copyright notice in jsimdext.inc
- *
- * This file contains the interface between the "normal" portions
- * of the library and the SIMD implementations when running on a
- * MIPS architecture.
- */
-
-#define JPEG_INTERNALS
-#include "../jinclude.h"
-#include "../jpeglib.h"
-#include "../jsimd.h"
-#include "../jdct.h"
-#include "../jsimddct.h"
-#include "jsimd.h"
-
-#include <stdio.h>
-#include <string.h>
-#include <ctype.h>
-
-static unsigned int simd_support = ~0;
-
-#if defined(__linux__)
-
-LOCAL(int)
-parse_proc_cpuinfo(const char* search_string)
-{
- const char* file_name = "/proc/cpuinfo";
- char cpuinfo_line[256];
- FILE* f = NULL;
- simd_support = 0;
-
- if ((f = fopen(file_name, "r")) != NULL) {
- while (fgets(cpuinfo_line, sizeof(cpuinfo_line), f) != NULL) {
- if (strstr(cpuinfo_line, search_string) != NULL) {
- fclose(f);
- simd_support |= JSIMD_MIPS_DSPR2;
- return 1;
- }
- }
- fclose(f);
- }
- /* Did not find string in the proc file, or not Linux ELF. */
- return 0;
-}
-
-#endif
-
-/*
- * Check what SIMD accelerations are supported.
- *
- * FIXME: This code is racy under a multi-threaded environment.
- */
-LOCAL(void)
-init_simd (void)
-{
- char *env = NULL;
-
- if (simd_support != ~0U)
- return;
-
- simd_support = 0;
-
-#if defined(__MIPSEL__) && defined(__mips_dsp) && (__mips_dsp_rev >= 2)
- simd_support |= JSIMD_MIPS_DSPR2;
-#elif defined(__linux__)
- /* We may still be able to use MIPS DSPR2, even without -mdspr2 passed
- * to gcc globally, by performing runtime detection via /proc/cpuinfo
- * parsing on Linux. */
- if (!parse_proc_cpuinfo("MIPS 74K"))
- return;
-#endif
-
- /* Force different settings through environment variables */
- env = getenv("JSIMD_FORCEDSPR2");
- if ((env != NULL) && (strcmp(env, "1") == 0))
- simd_support = JSIMD_MIPS_DSPR2;
- env = getenv("JSIMD_FORCENONE");
- if ((env != NULL) && (strcmp(env, "1") == 0))
- simd_support = 0;
-}
-
-static const int mips_idct_ifast_coefs[4] = {
- 0x45404540, // FIX( 1.082392200 / 2) = 17734 = 0x4546
- 0x5A805A80, // FIX( 1.414213562 / 2) = 23170 = 0x5A82
- 0x76407640, // FIX( 1.847759065 / 2) = 30274 = 0x7642
- 0xAC60AC60 // FIX(-2.613125930 / 4) = -21407 = 0xAC61
-};
-
-/* The following struct is borrowed from jdsample.c */
-typedef void (*upsample1_ptr) (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr);
-
-typedef struct {
- struct jpeg_upsampler pub;
- JSAMPARRAY color_buf[MAX_COMPONENTS];
- upsample1_ptr methods[MAX_COMPONENTS];
- int next_row_out;
- JDIMENSION rows_to_go;
- int rowgroup_height[MAX_COMPONENTS];
- UINT8 h_expand[MAX_COMPONENTS];
- UINT8 v_expand[MAX_COMPONENTS];
-} my_upsampler;
-
-typedef my_upsampler *my_upsample_ptr;
-
-GLOBAL(int)
-jsimd_can_rgb_ycc (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
- return 0;
-
- if (simd_support & JSIMD_MIPS_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_rgb_gray (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
- return 0;
-
- if (simd_support & JSIMD_MIPS_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_ycc_rgb (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
- return 0;
-
- if (simd_support & JSIMD_MIPS_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_ycc_rgb565 (void)
-{
- return 0;
-}
-
-GLOBAL(int)
-jsimd_c_can_null_convert (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_MIPS_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
- JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows)
-{
- void (*mipsdspr2fct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
-
- switch(cinfo->in_color_space) {
- case JCS_EXT_RGB:
- mipsdspr2fct=jsimd_extrgb_ycc_convert_mips_dspr2;
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- mipsdspr2fct=jsimd_extrgbx_ycc_convert_mips_dspr2;
- break;
- case JCS_EXT_BGR:
- mipsdspr2fct=jsimd_extbgr_ycc_convert_mips_dspr2;
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- mipsdspr2fct=jsimd_extbgrx_ycc_convert_mips_dspr2;
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- mipsdspr2fct=jsimd_extxbgr_ycc_convert_mips_dspr2;
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- mipsdspr2fct=jsimd_extxrgb_ycc_convert_mips_dspr2;
- break;
- default:
- mipsdspr2fct=jsimd_extrgb_ycc_convert_mips_dspr2;
- break;
- }
-
- if (simd_support & JSIMD_MIPS_DSPR2)
- mipsdspr2fct(cinfo->image_width, input_buf, output_buf, output_row,
- num_rows);
-}
-
-GLOBAL(void)
-jsimd_rgb_gray_convert (j_compress_ptr cinfo,
- JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows)
-{
- void (*mipsdspr2fct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
-
- switch(cinfo->in_color_space) {
- case JCS_EXT_RGB:
- mipsdspr2fct=jsimd_extrgb_gray_convert_mips_dspr2;
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- mipsdspr2fct=jsimd_extrgbx_gray_convert_mips_dspr2;
- break;
- case JCS_EXT_BGR:
- mipsdspr2fct=jsimd_extbgr_gray_convert_mips_dspr2;
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- mipsdspr2fct=jsimd_extbgrx_gray_convert_mips_dspr2;
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- mipsdspr2fct=jsimd_extxbgr_gray_convert_mips_dspr2;
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- mipsdspr2fct=jsimd_extxrgb_gray_convert_mips_dspr2;
- break;
- default:
- mipsdspr2fct=jsimd_extrgb_gray_convert_mips_dspr2;
- break;
- }
-
- if (simd_support & JSIMD_MIPS_DSPR2)
- mipsdspr2fct(cinfo->image_width, input_buf, output_buf, output_row,
- num_rows);
-}
-
-GLOBAL(void)
-jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
-{
- void (*mipsdspr2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
-
- switch(cinfo->out_color_space) {
- case JCS_EXT_RGB:
- mipsdspr2fct=jsimd_ycc_extrgb_convert_mips_dspr2;
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- mipsdspr2fct=jsimd_ycc_extrgbx_convert_mips_dspr2;
- break;
- case JCS_EXT_BGR:
- mipsdspr2fct=jsimd_ycc_extbgr_convert_mips_dspr2;
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- mipsdspr2fct=jsimd_ycc_extbgrx_convert_mips_dspr2;
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- mipsdspr2fct=jsimd_ycc_extxbgr_convert_mips_dspr2;
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- mipsdspr2fct=jsimd_ycc_extxrgb_convert_mips_dspr2;
- break;
- default:
- mipsdspr2fct=jsimd_ycc_extrgb_convert_mips_dspr2;
- break;
- }
-
- if (simd_support & JSIMD_MIPS_DSPR2)
- mipsdspr2fct(cinfo->output_width, input_buf, input_row, output_buf,
- num_rows);
-}
-
-GLOBAL(void)
-jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
-{
-}
-
-GLOBAL(void)
-jsimd_c_null_convert (j_compress_ptr cinfo,
- JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows)
-{
- if (simd_support & JSIMD_MIPS_DSPR2)
- jsimd_c_null_convert_mips_dspr2(cinfo->image_width, input_buf,
- output_buf, output_row, num_rows,
- cinfo->num_components);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_downsample (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_MIPS_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_smooth_downsample (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (DCTSIZE != 8)
- return 0;
-
- if (simd_support & JSIMD_MIPS_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_downsample (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_MIPS_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY output_data)
-{
- if (simd_support & JSIMD_MIPS_DSPR2)
- jsimd_h2v2_downsample_mips_dspr2(cinfo->image_width,
- cinfo->max_v_samp_factor,
- compptr->v_samp_factor,
- compptr->width_in_blocks, input_data,
- output_data);
-}
-
-GLOBAL(void)
-jsimd_h2v2_smooth_downsample (j_compress_ptr cinfo,
- jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY output_data)
-{
- jsimd_h2v2_smooth_downsample_mips_dspr2(input_data, output_data,
- compptr->v_samp_factor,
- cinfo->max_v_samp_factor,
- cinfo->smoothing_factor,
- compptr->width_in_blocks,
- cinfo->image_width);
-}
-
-GLOBAL(void)
-jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY output_data)
-{
- if (simd_support & JSIMD_MIPS_DSPR2)
- jsimd_h2v1_downsample_mips_dspr2(cinfo->image_width,
- cinfo->max_v_samp_factor,
- compptr->v_samp_factor,
- compptr->width_in_blocks,
- input_data, output_data);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_upsample (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_MIPS_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_upsample (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_MIPS_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_int_upsample (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_MIPS_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_upsample (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
-{
- if (simd_support & JSIMD_MIPS_DSPR2)
- jsimd_h2v2_upsample_mips_dspr2(cinfo->max_v_samp_factor,
- cinfo->output_width, input_data,
- output_data_ptr);
-}
-
-GLOBAL(void)
-jsimd_h2v1_upsample (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
-{
- if (simd_support & JSIMD_MIPS_DSPR2)
- jsimd_h2v1_upsample_mips_dspr2(cinfo->max_v_samp_factor,
- cinfo->output_width, input_data,
- output_data_ptr);
-}
-
-GLOBAL(void)
-jsimd_int_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
-{
- my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
-
- jsimd_int_upsample_mips_dspr2(upsample->h_expand[compptr->component_index],
- upsample->v_expand[compptr->component_index],
- input_data, output_data_ptr,
- cinfo->output_width,
- cinfo->max_v_samp_factor);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_fancy_upsample (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_MIPS_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_fancy_upsample (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_MIPS_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
-{
- if (simd_support & JSIMD_MIPS_DSPR2)
- jsimd_h2v2_fancy_upsample_mips_dspr2(cinfo->max_v_samp_factor,
- compptr->downsampled_width,
- input_data, output_data_ptr);
-}
-
-GLOBAL(void)
-jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
-{
- if (simd_support & JSIMD_MIPS_DSPR2)
- jsimd_h2v1_fancy_upsample_mips_dspr2(cinfo->max_v_samp_factor,
- compptr->downsampled_width,
- input_data, output_data_ptr);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_merged_upsample (void)
-{
- init_simd();
-
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_MIPS_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_merged_upsample (void)
-{
- init_simd();
-
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_MIPS_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr,
- JSAMPARRAY output_buf)
-{
- void (*mipsdspr2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY,
- JSAMPLE *);
-
- switch(cinfo->out_color_space) {
- case JCS_EXT_RGB:
- mipsdspr2fct=jsimd_h2v2_extrgb_merged_upsample_mips_dspr2;
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- mipsdspr2fct=jsimd_h2v2_extrgbx_merged_upsample_mips_dspr2;
- break;
- case JCS_EXT_BGR:
- mipsdspr2fct=jsimd_h2v2_extbgr_merged_upsample_mips_dspr2;
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- mipsdspr2fct=jsimd_h2v2_extbgrx_merged_upsample_mips_dspr2;
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- mipsdspr2fct=jsimd_h2v2_extxbgr_merged_upsample_mips_dspr2;
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- mipsdspr2fct=jsimd_h2v2_extxrgb_merged_upsample_mips_dspr2;
- break;
- default:
- mipsdspr2fct=jsimd_h2v2_extrgb_merged_upsample_mips_dspr2;
- break;
- }
-
- mipsdspr2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf,
- cinfo->sample_range_limit);
-}
-
-GLOBAL(void)
-jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr,
- JSAMPARRAY output_buf)
-{
- void (*mipsdspr2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY,
- JSAMPLE *);
-
- switch(cinfo->out_color_space) {
- case JCS_EXT_RGB:
- mipsdspr2fct=jsimd_h2v1_extrgb_merged_upsample_mips_dspr2;
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- mipsdspr2fct=jsimd_h2v1_extrgbx_merged_upsample_mips_dspr2;
- break;
- case JCS_EXT_BGR:
- mipsdspr2fct=jsimd_h2v1_extbgr_merged_upsample_mips_dspr2;
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- mipsdspr2fct=jsimd_h2v1_extbgrx_merged_upsample_mips_dspr2;
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- mipsdspr2fct=jsimd_h2v1_extxbgr_merged_upsample_mips_dspr2;
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- mipsdspr2fct=jsimd_h2v1_extxrgb_merged_upsample_mips_dspr2;
- break;
- default:
- mipsdspr2fct=jsimd_h2v1_extrgb_merged_upsample_mips_dspr2;
- break;
- }
-
- mipsdspr2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf,
- cinfo->sample_range_limit);
-}
-
-GLOBAL(int)
-jsimd_can_convsamp (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(DCTELEM) != 2)
- return 0;
-
- if (simd_support & JSIMD_MIPS_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_convsamp_float (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(ISLOW_MULT_TYPE) != 2)
- return 0;
-
- if (simd_support & JSIMD_MIPS_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
- DCTELEM *workspace)
-{
- if (simd_support & JSIMD_MIPS_DSPR2)
- jsimd_convsamp_mips_dspr2(sample_data, start_col, workspace);
-}
-
-GLOBAL(void)
-jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
- FAST_FLOAT *workspace)
-{
- if (simd_support & JSIMD_MIPS_DSPR2)
- jsimd_convsamp_float_mips_dspr2(sample_data, start_col, workspace);
-}
-
-GLOBAL(int)
-jsimd_can_fdct_islow (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(DCTELEM) != 2)
- return 0;
-
- if (simd_support & JSIMD_MIPS_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_fdct_ifast (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(DCTELEM) != 2)
- return 0;
-
- if (simd_support & JSIMD_MIPS_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_fdct_float (void)
-{
- init_simd();
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_fdct_islow (DCTELEM *data)
-{
- if (simd_support & JSIMD_MIPS_DSPR2)
- jsimd_fdct_islow_mips_dspr2(data);
-}
-
-GLOBAL(void)
-jsimd_fdct_ifast (DCTELEM *data)
-{
- if (simd_support & JSIMD_MIPS_DSPR2)
- jsimd_fdct_ifast_mips_dspr2(data);
-}
-
-GLOBAL(void)
-jsimd_fdct_float (FAST_FLOAT *data)
-{
-}
-
-GLOBAL(int)
-jsimd_can_quantize (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (sizeof(DCTELEM) != 2)
- return 0;
-
- if (simd_support & JSIMD_MIPS_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_quantize_float (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(ISLOW_MULT_TYPE) != 2)
- return 0;
-
- if (simd_support & JSIMD_MIPS_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_quantize (JCOEFPTR coef_block, DCTELEM *divisors,
- DCTELEM *workspace)
-{
- if (simd_support & JSIMD_MIPS_DSPR2)
- jsimd_quantize_mips_dspr2(coef_block, divisors, workspace);
-}
-
-GLOBAL(void)
-jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors,
- FAST_FLOAT *workspace)
-{
- if (simd_support & JSIMD_MIPS_DSPR2)
- jsimd_quantize_float_mips_dspr2(coef_block, divisors, workspace);
-}
-
-GLOBAL(int)
-jsimd_can_idct_2x2 (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(ISLOW_MULT_TYPE) != 2)
- return 0;
-
- if (simd_support & JSIMD_MIPS_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_4x4 (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(ISLOW_MULT_TYPE) != 2)
- return 0;
-
- if (simd_support & JSIMD_MIPS_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_6x6 (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(ISLOW_MULT_TYPE) != 2)
- return 0;
-
- if (simd_support & JSIMD_MIPS_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_12x12 (void)
-{
- init_simd();
-
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(ISLOW_MULT_TYPE) != 2)
- return 0;
-
- if (simd_support & JSIMD_MIPS_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
- if (simd_support & JSIMD_MIPS_DSPR2)
- jsimd_idct_2x2_mips_dspr2(compptr->dct_table, coef_block, output_buf,
- output_col);
-}
-
-GLOBAL(void)
-jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
- if (simd_support & JSIMD_MIPS_DSPR2) {
- int workspace[DCTSIZE*4]; /* buffers data between passes */
- jsimd_idct_4x4_mips_dspr2(compptr->dct_table, coef_block, output_buf,
- output_col, workspace);
- }
-}
-
-GLOBAL(void)
-jsimd_idct_6x6 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
- if (simd_support & JSIMD_MIPS_DSPR2)
- jsimd_idct_6x6_mips_dspr2(compptr->dct_table, coef_block, output_buf,
- output_col);
-}
-
-GLOBAL(void)
-jsimd_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block,
- JSAMPARRAY output_buf, JDIMENSION output_col)
-{
- if (simd_support & JSIMD_MIPS_DSPR2) {
- int workspace[96];
- int output[12] = {
- (int)(output_buf[0] + output_col),
- (int)(output_buf[1] + output_col),
- (int)(output_buf[2] + output_col),
- (int)(output_buf[3] + output_col),
- (int)(output_buf[4] + output_col),
- (int)(output_buf[5] + output_col),
- (int)(output_buf[6] + output_col),
- (int)(output_buf[7] + output_col),
- (int)(output_buf[8] + output_col),
- (int)(output_buf[9] + output_col),
- (int)(output_buf[10] + output_col),
- (int)(output_buf[11] + output_col),
- };
- jsimd_idct_12x12_pass1_mips_dspr2(coef_block, compptr->dct_table,
- workspace);
- jsimd_idct_12x12_pass2_mips_dspr2(workspace, output);
- }
-}
-
-GLOBAL(int)
-jsimd_can_idct_islow (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(ISLOW_MULT_TYPE) != 2)
- return 0;
-
- if (simd_support & JSIMD_MIPS_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_ifast (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(IFAST_MULT_TYPE) != 2)
- return 0;
- if (IFAST_SCALE_BITS != 2)
- return 0;
-
- if (simd_support & JSIMD_MIPS_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_float (void)
-{
- init_simd();
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
- if (simd_support & JSIMD_MIPS_DSPR2) {
- int output[8] = {
- (int)(output_buf[0] + output_col),
- (int)(output_buf[1] + output_col),
- (int)(output_buf[2] + output_col),
- (int)(output_buf[3] + output_col),
- (int)(output_buf[4] + output_col),
- (int)(output_buf[5] + output_col),
- (int)(output_buf[6] + output_col),
- (int)(output_buf[7] + output_col),
- };
-
- jsimd_idct_islow_mips_dspr2(coef_block, compptr->dct_table,
- output, IDCT_range_limit(cinfo));
- }
-}
-
-GLOBAL(void)
-jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
- if (simd_support & JSIMD_MIPS_DSPR2) {
- JCOEFPTR inptr;
- IFAST_MULT_TYPE *quantptr;
- DCTELEM workspace[DCTSIZE2]; /* buffers data between passes */
-
- /* Pass 1: process columns from input, store into work array. */
-
- inptr = coef_block;
- quantptr = (IFAST_MULT_TYPE *) compptr->dct_table;
-
- jsimd_idct_ifast_cols_mips_dspr2(inptr, quantptr,
- workspace, mips_idct_ifast_coefs);
-
- /* Pass 2: process rows from work array, store into output array. */
- /* Note that we must descale the results by a factor of 8 == 2**3, */
- /* and also undo the PASS1_BITS scaling. */
-
- jsimd_idct_ifast_rows_mips_dspr2(workspace, output_buf,
- output_col, mips_idct_ifast_coefs);
- }
-}
-
-GLOBAL(void)
-jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
-}
-
-GLOBAL(int)
-jsimd_can_huff_encode_one_block (void)
-{
- return 0;
-}
-
-GLOBAL(JOCTET*)
-jsimd_huff_encode_one_block (void *state, JOCTET *buffer, JCOEFPTR block,
- int last_dc_val, c_derived_tbl *dctbl,
- c_derived_tbl *actbl)
-{
- return NULL;
-}
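
The dispatch shims deleted above all follow the same libjpeg-turbo convention: a jsimd_can_*() predicate re-checks the compile-time layout assumptions the hand-written kernel depends on (DCTSIZE, BITS_IN_JSAMPLE, the sizes of JCOEF/DCTELEM/JDIMENSION) plus the runtime CPU-feature mask filled in by init_simd(), and the matching jsimd_*() wrapper forwards to the kernel only when the feature bit is set. A minimal sketch of that pattern in plain C; every name here (HAVE_FAKE_SIMD, fake_fdct_kernel, and so on) is a hypothetical stand-in, not part of the library:

    #define SUPPORT_NONE 0u
    #define SUPPORT_FAKE 1u   /* hypothetical stand-in for JSIMD_MIPS_DSPR2 */

    static unsigned int simd_support = ~0u;

    static void init_simd(void)
    {
      if (simd_support != ~0u)
        return;                       /* probe CPU features only once */
    #ifdef HAVE_FAKE_SIMD             /* hypothetical build flag */
      simd_support = SUPPORT_FAKE;
    #else
      simd_support = SUPPORT_NONE;
    #endif
    }

    static void fake_fdct_kernel(short *data) { (void)data; }  /* placeholder */

    /* "Can" helper: reject any configuration the kernel was not built for. */
    int jsimd_can_fake_fdct(void)
    {
      init_simd();
      if (sizeof(short) != 2)         /* mirrors the sizeof(DCTELEM) checks */
        return 0;
      return (simd_support & SUPPORT_FAKE) ? 1 : 0;
    }

    /* Wrapper: callers are expected to consult jsimd_can_fake_fdct() first. */
    void jsimd_fake_fdct(short *data)
    {
      if (simd_support & SUPPORT_FAKE)
        fake_fdct_kernel(data);
    }
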
diff --git a/media/libjpeg/simd/jsimd_mips_dspr2.S b/media/libjpeg/simd/jsimd_mips_dspr2.S
deleted file mode 100644
index c8c286cb3e..0000000000
--- a/media/libjpeg/simd/jsimd_mips_dspr2.S
+++ /dev/null
@@ -1,4487 +0,0 @@
-/*
- * MIPS DSPr2 optimizations for libjpeg-turbo
- *
- * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
- * All Rights Reserved.
- * Authors: Teodora Novkovic (teodora.novkovic@imgtec.com)
- * Darko Laus (darko.laus@imgtec.com)
- * Copyright (C) 2015, D. R. Commander. All Rights Reserved.
- * This software is provided 'as-is', without any express or implied
- * warranty. In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- * claim that you wrote the original software. If you use this software
- * in a product, an acknowledgment in the product documentation would be
- * appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- * misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-#include "jsimd_mips_dspr2_asm.h"
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_c_null_convert_mips_dspr2)
-/*
- * a0 - cinfo->image_width
- * a1 - input_buf
- * a2 - output_buf
- * a3 - output_row
- * 16(sp) - num_rows
- * 20(sp) - cinfo->num_components
- *
- * Null conversion for compression
- */
-
- SAVE_REGS_ON_STACK 8, s0, s1
-
- lw t9, 24(sp) // t9 = num_rows
- lw s0, 28(sp) // s0 = cinfo->num_components
- andi t0, a0, 3 // t0 = cinfo->image_width & 3
- beqz t0, 4f // no residual
- nop
-0:
- addiu t9, t9, -1
- bltz t9, 7f
- li t1, 0
-1:
- sll t3, t1, 2
- lwx t5, t3(a2) // t5 = outptr = output_buf[ci]
- lw t2, 0(a1) // t2 = inptr = *input_buf
- sll t4, a3, 2
- lwx t5, t4(t5) // t5 = outptr = output_buf[ci][output_row]
- addu t2, t2, t1
- addu s1, t5, a0
- addu t6, t5, t0
-2:
- lbu t3, 0(t2)
- addiu t5, t5, 1
- sb t3, -1(t5)
- bne t6, t5, 2b
- addu t2, t2, s0
-3:
- lbu t3, 0(t2)
- addu t4, t2, s0
- addu t7, t4, s0
- addu t8, t7, s0
- addu t2, t8, s0
- lbu t4, 0(t4)
- lbu t7, 0(t7)
- lbu t8, 0(t8)
- addiu t5, t5, 4
- sb t3, -4(t5)
- sb t4, -3(t5)
- sb t7, -2(t5)
- bne s1, t5, 3b
- sb t8, -1(t5)
- addiu t1, t1, 1
- bne t1, s0, 1b
- nop
- addiu a1, a1, 4
- bgez t9, 0b
- addiu a3, a3, 1
- b 7f
- nop
-4:
- addiu t9, t9, -1
- bltz t9, 7f
- li t1, 0
-5:
- sll t3, t1, 2
- lwx t5, t3(a2) // t5 = outptr = output_buf[ci]
- lw t2, 0(a1) // t2 = inptr = *input_buf
- sll t4, a3, 2
- lwx t5, t4(t5) // t5 = outptr = output_buf[ci][output_row]
- addu t2, t2, t1
- addu s1, t5, a0
- addu t6, t5, t0
-6:
- lbu t3, 0(t2)
- addu t4, t2, s0
- addu t7, t4, s0
- addu t8, t7, s0
- addu t2, t8, s0
- lbu t4, 0(t4)
- lbu t7, 0(t7)
- lbu t8, 0(t8)
- addiu t5, t5, 4
- sb t3, -4(t5)
- sb t4, -3(t5)
- sb t7, -2(t5)
- bne s1, t5, 6b
- sb t8, -1(t5)
- addiu t1, t1, 1
- bne t1, s0, 5b
- nop
- addiu a1, a1, 4
- bgez t9, 4b
- addiu a3, a3, 1
-7:
- RESTORE_REGS_FROM_STACK 8, s0, s1
-
- j ra
- nop
-
-END(jsimd_c_null_convert_mips_dspr2)
-
-/*****************************************************************************/
-/*
- * jsimd_extrgb_ycc_convert_mips_dspr2
- * jsimd_extbgr_ycc_convert_mips_dspr2
- * jsimd_extrgbx_ycc_convert_mips_dspr2
- * jsimd_extbgrx_ycc_convert_mips_dspr2
- * jsimd_extxbgr_ycc_convert_mips_dspr2
- * jsimd_extxrgb_ycc_convert_mips_dspr2
- *
- * Colorspace conversion RGB -> YCbCr
- */
-
-.macro GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 colorid, pixel_size, r_offs, g_offs, b_offs
-
-.macro DO_RGB_TO_YCC r, \
- g, \
- b, \
- inptr
- lbu \r, \r_offs(\inptr)
- lbu \g, \g_offs(\inptr)
- lbu \b, \b_offs(\inptr)
- addiu \inptr, \pixel_size
-.endm
-
-LEAF_MIPS_DSPR2(jsimd_\colorid\()_ycc_convert_mips_dspr2)
-/*
- * a0 - cinfo->image_width
- * a1 - input_buf
- * a2 - output_buf
- * a3 - output_row
- * 16(sp) - num_rows
- */
-
- SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
- lw t7, 48(sp) // t7 = num_rows
- li s0, 0x4c8b // FIX(0.29900)
- li s1, 0x9646 // FIX(0.58700)
- li s2, 0x1d2f // FIX(0.11400)
- li s3, 0xffffd4cd // -FIX(0.16874)
- li s4, 0xffffab33 // -FIX(0.33126)
- li s5, 0x8000 // FIX(0.50000)
- li s6, 0xffff94d1 // -FIX(0.41869)
- li s7, 0xffffeb2f // -FIX(0.08131)
- li t8, 0x807fff // CBCR_OFFSET + ONE_HALF-1
-
-0:
- addiu t7, -1 // --num_rows
- lw t6, 0(a1) // t6 = input_buf[0]
- lw t0, 0(a2)
- lw t1, 4(a2)
- lw t2, 8(a2)
- sll t3, a3, 2
- lwx t0, t3(t0) // t0 = output_buf[0][output_row]
- lwx t1, t3(t1) // t1 = output_buf[1][output_row]
- lwx t2, t3(t2) // t2 = output_buf[2][output_row]
-
- addu t9, t2, a0 // t9 = end address
- addiu a3, 1
-
-1:
- DO_RGB_TO_YCC t3, t4, t5, t6
-
- mtlo s5, $ac0
- mtlo t8, $ac1
- mtlo t8, $ac2
- maddu $ac0, s2, t5
- maddu $ac1, s5, t5
- maddu $ac2, s5, t3
- maddu $ac0, s0, t3
- maddu $ac1, s3, t3
- maddu $ac2, s6, t4
- maddu $ac0, s1, t4
- maddu $ac1, s4, t4
- maddu $ac2, s7, t5
- extr.w t3, $ac0, 16
- extr.w t4, $ac1, 16
- extr.w t5, $ac2, 16
- sb t3, 0(t0)
- sb t4, 0(t1)
- sb t5, 0(t2)
- addiu t0, 1
- addiu t2, 1
- bne t2, t9, 1b
- addiu t1, 1
- bgtz t7, 0b
- addiu a1, 4
-
- RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
- j ra
- nop
-END(jsimd_\colorid\()_ycc_convert_mips_dspr2)
-
-.purgem DO_RGB_TO_YCC
-
-.endm
-
-/*------------------------------------------id -- pix R G B */
-GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extrgb, 3, 0, 1, 2
-GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extbgr, 3, 2, 1, 0
-GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extrgbx, 4, 0, 1, 2
-GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extbgrx, 4, 2, 1, 0
-GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extxbgr, 4, 3, 2, 1
-GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extxrgb, 4, 1, 2, 3
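
The li constants at the top of the RGB->YCbCr macro body are the BT.601 weights in 16.16 fixed point (0x4c8b = FIX(0.29900), 0x9646 = FIX(0.58700), 0x1d2f = FIX(0.11400), and the sign-extended values are the negated weights), and 0x807fff is CBCR_OFFSET + ONE_HALF - 1, which recenters the chroma channels on 128 and rounds in a single addition. A scalar sketch of the per-pixel arithmetic, assuming 8-bit samples (this mirrors libjpeg's jccolor.c, but is a sketch, not the shipped code):

    #define SCALEBITS  16
    #define ONE_HALF   (1L << (SCALEBITS - 1))
    #define FIX(x)     ((long)((x) * (1L << SCALEBITS) + 0.5))

    static void rgb_to_ycc_pixel(int r, int g, int b, unsigned char out[3])
    {
      long y  =  FIX(0.29900) * r + FIX(0.58700) * g + FIX(0.11400) * b;
      long cb = -FIX(0.16874) * r - FIX(0.33126) * g + FIX(0.50000) * b;
      long cr =  FIX(0.50000) * r - FIX(0.41869) * g - FIX(0.08131) * b;

      out[0] = (unsigned char)((y + ONE_HALF) >> SCALEBITS);
      /* (128 << SCALEBITS) + ONE_HALF - 1 is the 0x807fff constant above:
       * it recenters chroma on 128 and rounds in one addition */
      out[1] = (unsigned char)((cb + (128L << SCALEBITS) + ONE_HALF - 1) >> SCALEBITS);
      out[2] = (unsigned char)((cr + (128L << SCALEBITS) + ONE_HALF - 1) >> SCALEBITS);
    }
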
-
-/*****************************************************************************/
-/*
- * jsimd_ycc_extrgb_convert_mips_dspr2
- * jsimd_ycc_extbgr_convert_mips_dspr2
- * jsimd_ycc_extrgbx_convert_mips_dspr2
- * jsimd_ycc_extbgrx_convert_mips_dspr2
- * jsimd_ycc_extxbgr_convert_mips_dspr2
- * jsimd_ycc_extxrgb_convert_mips_dspr2
- *
- * Colorspace conversion YCbCr -> RGB
- */
-
-.macro GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 colorid, pixel_size, r_offs, g_offs, b_offs, a_offs
-
-.macro STORE_YCC_TO_RGB scratch0 \
- scratch1 \
- scratch2 \
- outptr
- sb \scratch0, \r_offs(\outptr)
- sb \scratch1, \g_offs(\outptr)
- sb \scratch2, \b_offs(\outptr)
-.if (\pixel_size == 4)
- li t0, 0xFF
- sb t0, \a_offs(\outptr)
-.endif
- addiu \outptr, \pixel_size
-.endm
-
-LEAF_MIPS_DSPR2(jsimd_ycc_\colorid\()_convert_mips_dspr2)
-/*
- * a0 - cinfo->image_width
- * a1 - input_buf
- * a2 - input_row
- * a3 - output_buf
- * 16(sp) - num_rows
- */
-
- SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
- lw s1, 48(sp)
- li t3, 0x8000
- li t4, 0x166e9 // FIX(1.40200)
- li t5, 0x1c5a2 // FIX(1.77200)
- li t6, 0xffff492e // -FIX(0.71414)
- li t7, 0xffffa7e6 // -FIX(0.34414)
- repl.ph t8, 128
-
-0:
- lw s0, 0(a3)
- lw t0, 0(a1)
- lw t1, 4(a1)
- lw t2, 8(a1)
- sll s5, a2, 2
- addiu s1, -1
- lwx s2, s5(t0)
- lwx s3, s5(t1)
- lwx s4, s5(t2)
- addu t9, s2, a0
- addiu a2, 1
-
-1:
- lbu s7, 0(s4) // cr
- lbu s6, 0(s3) // cb
- lbu s5, 0(s2) // y
- addiu s2, 1
- addiu s4, 1
- addiu s7, -128
- addiu s6, -128
- mul t2, t7, s6
- mul t0, t6, s7 // Crgtab[cr]
- sll s7, 15
- mulq_rs.w t1, t4, s7 // Crrtab[cr]
- sll s6, 15
- addu t2, t3 // Cbgtab[cb]
- addu t2, t0
-
- mulq_rs.w t0, t5, s6 // Cbbtab[cb]
- sra t2, 16
- addu t1, s5
- addu t2, s5 // add y
- ins t2, t1, 16, 16
- subu.ph t2, t2, t8
- addu t0, s5
- shll_s.ph t2, t2, 8
- subu t0, 128
- shra.ph t2, t2, 8
- shll_s.w t0, t0, 24
- addu.ph t2, t2, t8 // clip & store
- sra t0, t0, 24
- sra t1, t2, 16
- addiu t0, 128
-
- STORE_YCC_TO_RGB t1, t2, t0, s0
-
- bne s2, t9, 1b
- addiu s3, 1
- bgtz s1, 0b
- addiu a3, 4
-
- RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
- j ra
- nop
-END(jsimd_ycc_\colorid\()_convert_mips_dspr2)
-
-.purgem STORE_YCC_TO_RGB
-
-.endm
-
-/*------------------------------------------id -- pix R G B A */
-GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extrgb, 3, 0, 1, 2, 3
-GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extbgr, 3, 2, 1, 0, 3
-GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extrgbx, 4, 0, 1, 2, 3
-GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extbgrx, 4, 2, 1, 0, 3
-GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extxbgr, 4, 3, 2, 1, 0
-GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extxrgb, 4, 1, 2, 3, 0
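
The inverse transform uses the matching constants: 0x166e9 = FIX(1.40200), 0x1c5a2 = FIX(1.77200), and the sign-extended 0xffffa7e6/0xffff492e are -FIX(0.34414) and -FIX(0.71414). The assembly clamps results through the cinfo->sample_range_limit table; the clamp255() helper below is a plain stand-in for that lookup. A scalar sketch of one pixel:

    #define SCALEBITS  16
    #define ONE_HALF   (1L << (SCALEBITS - 1))
    #define FIX(x)     ((long)((x) * (1L << SCALEBITS) + 0.5))

    /* stand-in for the sample_range_limit table lookup */
    static unsigned char clamp255(long v)
    {
      return (unsigned char)(v < 0 ? 0 : v > 255 ? 255 : v);
    }

    static void ycc_to_rgb_pixel(int y, int cb, int cr, unsigned char rgb[3])
    {
      cb -= 128;  cr -= 128;
      rgb[0] = clamp255(y + ((FIX(1.40200) * cr + ONE_HALF) >> SCALEBITS));
      rgb[1] = clamp255(y + ((-FIX(0.34414) * cb - FIX(0.71414) * cr + ONE_HALF)
                             >> SCALEBITS));
      rgb[2] = clamp255(y + ((FIX(1.77200) * cb + ONE_HALF) >> SCALEBITS));
    }
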
-
-/*****************************************************************************/
-/*
- * jsimd_extrgb_gray_convert_mips_dspr2
- * jsimd_extbgr_gray_convert_mips_dspr2
- * jsimd_extrgbx_gray_convert_mips_dspr2
- * jsimd_extbgrx_gray_convert_mips_dspr2
- * jsimd_extxbgr_gray_convert_mips_dspr2
- * jsimd_extxrgb_gray_convert_mips_dspr2
- *
- * Colorspace conversion RGB -> GRAY
- */
-
-.macro GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 colorid, pixel_size, r_offs, g_offs, b_offs
-
-.macro DO_RGB_TO_GRAY r, \
- g, \
- b, \
- inptr
- lbu \r, \r_offs(\inptr)
- lbu \g, \g_offs(\inptr)
- lbu \b, \b_offs(\inptr)
- addiu \inptr, \pixel_size
-.endm
-
-LEAF_MIPS_DSPR2(jsimd_\colorid\()_gray_convert_mips_dspr2)
-/*
- * a0 - cinfo->image_width
- * a1 - input_buf
- * a2 - output_buf
- * a3 - output_row
- * 16(sp) - num_rows
- */
-
- SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
- li s0, 0x4c8b // s0 = FIX(0.29900)
- li s1, 0x9646 // s1 = FIX(0.58700)
- li s2, 0x1d2f // s2 = FIX(0.11400)
- li s7, 0x8000 // s7 = FIX(0.50000)
- lw s6, 48(sp)
- andi t7, a0, 3
-
-0:
- addiu s6, -1 // s6 = num_rows
- lw t0, 0(a1)
- lw t1, 0(a2)
- sll t3, a3, 2
- lwx t1, t3(t1)
- addiu a3, 1
- addu t9, t1, a0
- subu t8, t9, t7
- beq t1, t8, 2f
- nop
-
-1:
- DO_RGB_TO_GRAY t3, t4, t5, t0
- DO_RGB_TO_GRAY s3, s4, s5, t0
-
- mtlo s7, $ac0
- maddu $ac0, s2, t5
- maddu $ac0, s1, t4
- maddu $ac0, s0, t3
- mtlo s7, $ac1
- maddu $ac1, s2, s5
- maddu $ac1, s1, s4
- maddu $ac1, s0, s3
- extr.w t6, $ac0, 16
-
- DO_RGB_TO_GRAY t3, t4, t5, t0
- DO_RGB_TO_GRAY s3, s4, s5, t0
-
- mtlo s7, $ac0
- maddu $ac0, s2, t5
- maddu $ac0, s1, t4
- extr.w t2, $ac1, 16
- maddu $ac0, s0, t3
- mtlo s7, $ac1
- maddu $ac1, s2, s5
- maddu $ac1, s1, s4
- maddu $ac1, s0, s3
- extr.w t5, $ac0, 16
- sb t6, 0(t1)
- sb t2, 1(t1)
- extr.w t3, $ac1, 16
- addiu t1, 4
- sb t5, -2(t1)
- sb t3, -1(t1)
- bne t1, t8, 1b
- nop
-
-2:
- beqz t7, 4f
- nop
-
-3:
- DO_RGB_TO_GRAY t3, t4, t5, t0
-
- mtlo s7, $ac0
- maddu $ac0, s2, t5
- maddu $ac0, s1, t4
- maddu $ac0, s0, t3
- extr.w t6, $ac0, 16
- sb t6, 0(t1)
- addiu t1, 1
- bne t1, t9, 3b
- nop
-
-4:
- bgtz s6, 0b
- addiu a1, 4
-
- RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
- j ra
- nop
-END(jsimd_\colorid\()_gray_convert_mips_dspr2)
-
-.purgem DO_RGB_TO_GRAY
-
-.endm
-
-/*------------------------------------------id -- pix R G B */
-GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extrgb, 3, 0, 1, 2
-GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extbgr, 3, 2, 1, 0
-GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extrgbx, 4, 0, 1, 2
-GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extbgrx, 4, 2, 1, 0
-GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extxbgr, 4, 3, 2, 1
-GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extxrgb, 4, 1, 2, 3
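
The gray-convert routines reuse only the luma weights from the RGB->YCbCr conversion above; what is distinctive is the loop shape: a 4x-unrolled main loop feeding two accumulators, then a scalar tail for the width % 4 leftover pixels (label 3:). A sketch of that split, with luma() as a hypothetical helper built from the same 0x4c8b/0x9646/0x1d2f weights:

    static unsigned char luma(const unsigned char *p)   /* p = one RGB pixel */
    {
      /* 19595/38470/7471 are FIX(0.299)/FIX(0.587)/FIX(0.114), i.e. the
       * 0x4c8b/0x9646/0x1d2f constants loaded above */
      return (unsigned char)((19595L * p[0] + 38470L * p[1] + 7471L * p[2] +
                              32768L) >> 16);
    }

    static void rgb_to_gray_row(const unsigned char *rgb, unsigned char *gray,
                                int width)
    {
      int i, body = width & ~3;          /* 4x-unrolled main loop, as above */
      for (i = 0; i < body; i += 4) {
        gray[i]     = luma(rgb + 3 * i);
        gray[i + 1] = luma(rgb + 3 * (i + 1));
        gray[i + 2] = luma(rgb + 3 * (i + 2));
        gray[i + 3] = luma(rgb + 3 * (i + 3));
      }
      for (; i < width; i++)             /* residual tail, cf. label 3: */
        gray[i] = luma(rgb + 3 * i);
    }
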
-/*****************************************************************************/
-/*
- * jsimd_h2v2_merged_upsample_mips_dspr2
- * jsimd_h2v2_extrgb_merged_upsample_mips_dspr2
- * jsimd_h2v2_extrgbx_merged_upsample_mips_dspr2
- * jsimd_h2v2_extbgr_merged_upsample_mips_dspr2
- * jsimd_h2v2_extbgrx_merged_upsample_mips_dspr2
- * jsimd_h2v2_extxbgr_merged_upsample_mips_dspr2
- * jsimd_h2v2_extxrgb_merged_upsample_mips_dspr2
- *
- * Merged h2v2 upsample routines
- */
-.macro GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 colorid, \
- pixel_size, \
- r1_offs, \
- g1_offs, \
- b1_offs, \
- a1_offs, \
- r2_offs, \
- g2_offs, \
- b2_offs, \
- a2_offs
-
-.macro STORE_H2V2_2_PIXELS scratch0 \
- scratch1 \
- scratch2 \
- scratch3 \
- scratch4 \
- scratch5 \
- outptr
- sb \scratch0, \r1_offs(\outptr)
- sb \scratch1, \g1_offs(\outptr)
- sb \scratch2, \b1_offs(\outptr)
- sb \scratch3, \r2_offs(\outptr)
- sb \scratch4, \g2_offs(\outptr)
- sb \scratch5, \b2_offs(\outptr)
-.if (\pixel_size == 8)
- li \scratch0, 0xFF
- sb \scratch0, \a1_offs(\outptr)
- sb \scratch0, \a2_offs(\outptr)
-.endif
- addiu \outptr, \pixel_size
-.endm
-
-.macro STORE_H2V2_1_PIXEL scratch0 \
- scratch1 \
- scratch2 \
- outptr
- sb \scratch0, \r1_offs(\outptr)
- sb \scratch1, \g1_offs(\outptr)
- sb \scratch2, \b1_offs(\outptr)
-
-.if (\pixel_size == 8)
- li t0, 0xFF
- sb t0, \a1_offs(\outptr)
-.endif
-.endm
-
-LEAF_MIPS_DSPR2(jsimd_h2v2_\colorid\()_merged_upsample_mips_dspr2)
-/*
- * a0 - cinfo->output_width
- * a1 - input_buf
- * a2 - in_row_group_ctr
- * a3 - output_buf
- * 16(sp) - cinfo->sample_range_limit
- */
-
- SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
-
- lw t9, 56(sp) // cinfo->sample_range_limit
- lw v0, 0(a1)
- lw v1, 4(a1)
- lw t0, 8(a1)
- sll t1, a2, 3
- addiu t2, t1, 4
- sll t3, a2, 2
- lw t4, 0(a3) // t4 = output_buf[0]
- lwx t1, t1(v0) // t1 = input_buf[0][in_row_group_ctr*2]
- lwx t2, t2(v0) // t2 = input_buf[0][in_row_group_ctr*2 + 1]
- lwx t5, t3(v1) // t5 = input_buf[1][in_row_group_ctr]
- lwx t6, t3(t0) // t6 = input_buf[2][in_row_group_ctr]
- lw t7, 4(a3) // t7 = output_buf[1]
- li s1, 0xe6ea
- addiu t8, s1, 0x7fff // t8 = 0x166e9 [FIX(1.40200)]
- addiu s0, t8, 0x5eb9 // s0 = 0x1c5a2 [FIX(1.77200)]
- addiu s1, zero, 0xa7e6 // s1 = 0xffffa7e6 [-FIX(0.34414)]
- xori s2, s1, 0xeec8 // s2 = 0xffff492e [-FIX(0.71414)]
- srl t3, a0, 1
- blez t3, 2f
- addu t0, t5, t3 // t0 = end address
-1:
- lbu t3, 0(t5)
- lbu s3, 0(t6)
- addiu t5, t5, 1
- addiu t3, t3, -128 // (cb - 128)
- addiu s3, s3, -128 // (cr - 128)
- mult $ac1, s1, t3
- madd $ac1, s2, s3
- sll s3, s3, 15
- sll t3, t3, 15
- mulq_rs.w s4, t8, s3 // s4 = (C1 * cr + ONE_HALF)>> SCALEBITS
- extr_r.w s5, $ac1, 16
- mulq_rs.w s6, s0, t3 // s6 = (C2 * cb + ONE_HALF)>> SCALEBITS
- lbu v0, 0(t1)
- addiu t6, t6, 1
- addiu t1, t1, 2
- addu t3, v0, s4 // y+cred
- addu s3, v0, s5 // y+cgreen
- addu v1, v0, s6 // y+cblue
- addu t3, t9, t3 // y+cred
- addu s3, t9, s3 // y+cgreen
- addu v1, t9, v1 // y+cblue
- lbu AT, 0(t3)
- lbu s7, 0(s3)
- lbu ra, 0(v1)
- lbu v0, -1(t1)
- addu t3, v0, s4 // y+cred
- addu s3, v0, s5 // y+cgreen
- addu v1, v0, s6 // y+cblue
- addu t3, t9, t3 // y+cred
- addu s3, t9, s3 // y+cgreen
- addu v1, t9, v1 // y+cblue
- lbu t3, 0(t3)
- lbu s3, 0(s3)
- lbu v1, 0(v1)
- lbu v0, 0(t2)
-
- STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t4
-
- addu t3, v0, s4 // y+cred
- addu s3, v0, s5 // y+cgreen
- addu v1, v0, s6 // y+cblue
- addu t3, t9, t3 // y+cred
- addu s3, t9, s3 // y+cgreen
- addu v1, t9, v1 // y+cblue
- lbu AT, 0(t3)
- lbu s7, 0(s3)
- lbu ra, 0(v1)
- lbu v0, 1(t2)
- addiu t2, t2, 2
- addu t3, v0, s4 // y+cred
- addu s3, v0, s5 // y+cgreen
- addu v1, v0, s6 // y+cblue
- addu t3, t9, t3 // y+cred
- addu s3, t9, s3 // y+cgreen
- addu v1, t9, v1 // y+cblue
- lbu t3, 0(t3)
- lbu s3, 0(s3)
- lbu v1, 0(v1)
-
- STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t7
-
- bne t0, t5, 1b
- nop
-2:
- andi t0, a0, 1
- beqz t0, 4f
- lbu t3, 0(t5)
- lbu s3, 0(t6)
- addiu t3, t3, -128 // (cb - 128)
- addiu s3, s3, -128 // (cr - 128)
- mult $ac1, s1, t3
- madd $ac1, s2, s3
- sll s3, s3, 15
- sll t3, t3, 15
- lbu v0, 0(t1)
- extr_r.w s5, $ac1, 16
- mulq_rs.w s4, t8, s3 // s4 = (C1 * cr + ONE_HALF)>> SCALEBITS
- mulq_rs.w s6, s0, t3 // s6 = (C2 * cb + ONE_HALF)>> SCALEBITS
- addu t3, v0, s4 // y+cred
- addu s3, v0, s5 // y+cgreen
- addu v1, v0, s6 // y+cblue
- addu t3, t9, t3 // y+cred
- addu s3, t9, s3 // y+cgreen
- addu v1, t9, v1 // y+cblue
- lbu t3, 0(t3)
- lbu s3, 0(s3)
- lbu v1, 0(v1)
- lbu v0, 0(t2)
-
- STORE_H2V2_1_PIXEL t3, s3, v1, t4
-
- addu t3, v0, s4 // y+cred
- addu s3, v0, s5 // y+cgreen
- addu v1, v0, s6 // y+cblue
- addu t3, t9, t3 // y+cred
- addu s3, t9, s3 // y+cgreen
- addu v1, t9, v1 // y+cblue
- lbu t3, 0(t3)
- lbu s3, 0(s3)
- lbu v1, 0(v1)
-
- STORE_H2V2_1_PIXEL t3, s3, v1, t7
-4:
- RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
-
- j ra
- nop
-
-END(jsimd_h2v2_\colorid\()_merged_upsample_mips_dspr2)
-
-.purgem STORE_H2V2_1_PIXEL
-.purgem STORE_H2V2_2_PIXELS
-.endm
-
-/*-----------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */
-GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extrgb, 6, 0, 1, 2, 6, 3, 4, 5, 6
-GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extbgr, 6, 2, 1, 0, 3, 5, 4, 3, 6
-GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7
-GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7
-GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4
-GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4
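
Merged upsampling fuses h2v2 (or, further down, h2v1) chroma upsampling with the YCbCr->RGB conversion: the chroma terms are computed once per (Cb, Cr) sample and then combined with two luma samples per row, for two output rows in the h2v2 case and one in the h2v1 case. A scalar sketch of one 2x2 block, with clamp255() standing in for the range-limit table and the decimal constants equal to the hex FIX() values noted above:

    static unsigned char clamp255(long v)
    {
      return (unsigned char)(v < 0 ? 0 : v > 255 ? 255 : v);
    }

    /* 91881/22554/46802/116130 are FIX(1.40200), FIX(0.34414), FIX(0.71414)
     * and FIX(1.77200) -- the 0x166e9/0xffffa7e6/0xffff492e/0x1c5a2
     * constants in the assembly above. */
    static void merged_h2v2_block(const unsigned char *y0,  /* 2 luma rows */
                                  const unsigned char *y1,
                                  int cb, int cr,
                                  unsigned char *out0, unsigned char *out1)
    {
      long cred, cgreen, cblue;
      int i;

      cb -= 128;  cr -= 128;
      cred   = (91881L * cr + 32768L) >> 16;             /* once per block */
      cgreen = (-22554L * cb - 46802L * cr + 32768L) >> 16;
      cblue  = (116130L * cb + 32768L) >> 16;

      for (i = 0; i < 2; i++) {                          /* 2 pixels/row */
        out0[3 * i]     = clamp255(y0[i] + cred);
        out0[3 * i + 1] = clamp255(y0[i] + cgreen);
        out0[3 * i + 2] = clamp255(y0[i] + cblue);
        out1[3 * i]     = clamp255(y1[i] + cred);
        out1[3 * i + 1] = clamp255(y1[i] + cgreen);
        out1[3 * i + 2] = clamp255(y1[i] + cblue);
      }
    }
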
-/*****************************************************************************/
-/*
- * jsimd_h2v1_merged_upsample_mips_dspr2
- * jsimd_h2v1_extrgb_merged_upsample_mips_dspr2
- * jsimd_h2v1_extrgbx_merged_upsample_mips_dspr2
- * jsimd_h2v1_extbgr_merged_upsample_mips_dspr2
- * jsimd_h2v1_extbgrx_merged_upsample_mips_dspr2
- * jsimd_h2v1_extxbgr_merged_upsample_mips_dspr2
- * jsimd_h2v1_extxrgb_merged_upsample_mips_dspr2
- *
- * Merged h2v1 upsample routines
- */
-
-.macro GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 colorid, \
- pixel_size, \
- r1_offs, \
- g1_offs, \
- b1_offs, \
- a1_offs, \
- r2_offs, \
- g2_offs, \
- b2_offs, \
- a2_offs
-
-.macro STORE_H2V1_2_PIXELS scratch0 \
- scratch1 \
- scratch2 \
- scratch3 \
- scratch4 \
- scratch5 \
- outptr
- sb \scratch0, \r1_offs(\outptr)
- sb \scratch1, \g1_offs(\outptr)
- sb \scratch2, \b1_offs(\outptr)
- sb \scratch3, \r2_offs(\outptr)
- sb \scratch4, \g2_offs(\outptr)
- sb \scratch5, \b2_offs(\outptr)
-.if (\pixel_size == 8)
- li t0, 0xFF
- sb t0, \a1_offs(\outptr)
- sb t0, \a2_offs(\outptr)
-.endif
- addiu \outptr, \pixel_size
-.endm
-
-.macro STORE_H2V1_1_PIXEL scratch0 \
- scratch1 \
- scratch2 \
- outptr
- sb \scratch0, \r1_offs(\outptr)
- sb \scratch1, \g1_offs(\outptr)
- sb \scratch2, \b1_offs(\outptr)
-.if (\pixel_size == 8)
- li t0, 0xFF
- sb t0, \a1_offs(\outptr)
-.endif
-.endm
-
-LEAF_MIPS_DSPR2(jsimd_h2v1_\colorid\()_merged_upsample_mips_dspr2)
-/*
- * a0 - cinfo->output_width
- * a1 - input_buf
- * a2 - in_row_group_ctr
- * a3 - output_buf
- * 16(sp) - range_limit
- */
-
- SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
-
- li t0, 0xe6ea
- lw t1, 0(a1) // t1 = input_buf[0]
- lw t2, 4(a1) // t2 = input_buf[1]
- lw t3, 8(a1) // t3 = input_buf[2]
- lw t8, 56(sp) // t8 = range_limit
- addiu s1, t0, 0x7fff // s1 = 0x166e9 [FIX(1.40200)]
- addiu s2, s1, 0x5eb9 // s2 = 0x1c5a2 [FIX(1.77200)]
- addiu s0, t0, 0x9916 // s0 = 0x8000
- addiu s4, zero, 0xa7e6 // s4 = 0xffffa7e6 [-FIX(0.34414)]
- xori s3, s4, 0xeec8 // s3 = 0xffff492e [-FIX(0.71414)]
- srl t0, a0, 1
- sll t4, a2, 2
- lwx s5, t4(t1) // s5 = inptr0
- lwx s6, t4(t2) // s6 = inptr1
- lwx s7, t4(t3) // s7 = inptr2
- lw t7, 0(a3) // t7 = outptr
- blez t0, 2f
- addu t9, s6, t0 // t9 = end address
-1:
- lbu t2, 0(s6) // t2 = cb
- lbu t0, 0(s7) // t0 = cr
- lbu t1, 0(s5) // t1 = y
- addiu t2, t2, -128 // t2 = cb - 128
- addiu t0, t0, -128 // t0 = cr - 128
- mult $ac1, s4, t2
- madd $ac1, s3, t0
- sll t0, t0, 15
- sll t2, t2, 15
- mulq_rs.w t0, s1, t0 // t0 = (C1*cr + ONE_HALF)>> SCALEBITS
- extr_r.w t5, $ac1, 16
- mulq_rs.w t6, s2, t2 // t6 = (C2*cb + ONE_HALF)>> SCALEBITS
- addiu s7, s7, 1
- addiu s6, s6, 1
- addu t2, t1, t0 // t2 = y + cred
- addu t3, t1, t5 // t3 = y + cgreen
- addu t4, t1, t6 // t4 = y + cblue
- addu t2, t8, t2
- addu t3, t8, t3
- addu t4, t8, t4
- lbu t1, 1(s5)
- lbu v0, 0(t2)
- lbu v1, 0(t3)
- lbu ra, 0(t4)
- addu t2, t1, t0
- addu t3, t1, t5
- addu t4, t1, t6
- addu t2, t8, t2
- addu t3, t8, t3
- addu t4, t8, t4
- lbu t2, 0(t2)
- lbu t3, 0(t3)
- lbu t4, 0(t4)
-
- STORE_H2V1_2_PIXELS v0, v1, ra, t2, t3, t4, t7
-
- bne t9, s6, 1b
- addiu s5, s5, 2
-2:
- andi t0, a0, 1
- beqz t0, 4f
- nop
-3:
- lbu t2, 0(s6)
- lbu t0, 0(s7)
- lbu t1, 0(s5)
- addiu t2, t2, -128 //(cb - 128)
- addiu t0, t0, -128 //(cr - 128)
- mul t3, s4, t2
- mul t4, s3, t0
- sll t0, t0, 15
- sll t2, t2, 15
- mulq_rs.w t0, s1, t0 // (C1*cr + ONE_HALF)>> SCALEBITS
- mulq_rs.w t6, s2, t2 // (C2*cb + ONE_HALF)>> SCALEBITS
- addu t3, t3, s0
- addu t3, t4, t3
- sra t5, t3, 16 // (C4*cb + ONE_HALF + C3*cr)>> SCALEBITS
- addu t2, t1, t0 // y + cred
- addu t3, t1, t5 // y + cgreen
- addu t4, t1, t6 // y + cblue
- addu t2, t8, t2
- addu t3, t8, t3
- addu t4, t8, t4
- lbu t2, 0(t2)
- lbu t3, 0(t3)
- lbu t4, 0(t4)
-
- STORE_H2V1_1_PIXEL t2, t3, t4, t7
-4:
- RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
-
- j ra
- nop
-
-END(jsimd_h2v1_\colorid\()_merged_upsample_mips_dspr2)
-
-.purgem STORE_H2V1_1_PIXEL
-.purgem STORE_H2V1_2_PIXELS
-.endm
-
-/*-----------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */
-GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extrgb, 6, 0, 1, 2, 6, 3, 4, 5, 6
-GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extbgr, 6, 2, 1, 0, 3, 5, 4, 3, 6
-GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7
-GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7
-GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4
-GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4
-/*****************************************************************************/
-/*
- * jsimd_h2v2_fancy_upsample_mips_dspr2
- *
- * Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
- */
-LEAF_MIPS_DSPR2(jsimd_h2v2_fancy_upsample_mips_dspr2)
-/*
- * a0 - cinfo->max_v_samp_factor
- * a1 - downsampled_width
- * a2 - input_data
- * a3 - output_data_ptr
- */
-
- SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5
-
- li s4, 0
- lw s2, 0(a3) // s2 = *output_data_ptr
-0:
- li t9, 2
- lw s1, -4(a2) // s1 = inptr1
-
-1:
- lw s0, 0(a2) // s0 = inptr0
- lwx s3, s4(s2)
- addiu s5, a1, -2 // s5 = downsampled_width - 2
- srl t4, s5, 1
- sll t4, t4, 1
- lbu t0, 0(s0)
- lbu t1, 1(s0)
- lbu t2, 0(s1)
- lbu t3, 1(s1)
- addiu s0, 2
- addiu s1, 2
- addu t8, s0, t4 // t8 = end address
- andi s5, s5, 1 // s5 = residual
- sll t4, t0, 1
- sll t6, t1, 1
- addu t0, t0, t4 // t0 = (*inptr0++) * 3
- addu t1, t1, t6 // t1 = (*inptr0++) * 3
- addu t7, t0, t2 // t7 = thiscolsum
- addu t6, t1, t3 // t6 = nextcolsum
- sll t0, t7, 2 // t0 = thiscolsum * 4
- subu t1, t0, t7 // t1 = thiscolsum * 3
- shra_r.w t0, t0, 4
- addiu t1, 7
- addu t1, t1, t6
- srl t1, t1, 4
- sb t0, 0(s3)
- sb t1, 1(s3)
- beq t8, s0, 22f // skip to final iteration if width == 3
- addiu s3, 2
-2:
- lh t0, 0(s0) // t0 = A3|A2
- lh t2, 0(s1) // t2 = B3|B2
- addiu s0, 2
- addiu s1, 2
- preceu.ph.qbr t0, t0 // t0 = 0|A3|0|A2
- preceu.ph.qbr t2, t2 // t2 = 0|B3|0|B2
- shll.ph t1, t0, 1
- sll t3, t6, 1
- addu.ph t0, t1, t0 // t0 = A3*3|A2*3
- addu t3, t3, t6 // t3 = this * 3
- addu.ph t0, t0, t2 // t0 = next2|next1
- addu t1, t3, t7
- andi t7, t0, 0xFFFF // t7 = next1
- sll t2, t7, 1
- addu t2, t7, t2 // t2 = next1*3
- addu t4, t2, t6
- srl t6, t0, 16 // t6 = next2
- shra_r.w t1, t1, 4 // t1 = (this*3 + last + 8) >> 4
- addu t0, t3, t7
- addiu t0, 7
- srl t0, t0, 4 // t0 = (this*3 + next1 + 7) >> 4
- shra_r.w t4, t4, 4 // t4 = (next1*3 + this + 8) >> 4
- addu t2, t2, t6
- addiu t2, 7
- srl t2, t2, 4 // t2 = (next1*3 + next2 + 7) >> 4
- sb t1, 0(s3)
- sb t0, 1(s3)
- sb t4, 2(s3)
- sb t2, 3(s3)
- bne t8, s0, 2b
- addiu s3, 4
-22:
- beqz s5, 4f
- addu t8, s0, s5
-3:
- lbu t0, 0(s0)
- lbu t2, 0(s1)
- addiu s0, 1
- addiu s1, 1
- sll t3, t6, 1
- sll t1, t0, 1
- addu t1, t0, t1 // t1 = inptr0 * 3
- addu t3, t3, t6 // t3 = thiscolsum * 3
- addu t5, t1, t2
- addu t1, t3, t7
- shra_r.w t1, t1, 4
- addu t0, t3, t5
- addiu t0, 7
- srl t0, t0, 4
- sb t1, 0(s3)
- sb t0, 1(s3)
- addiu s3, 2
- move t7, t6
- bne t8, s0, 3b
- move t6, t5
-4:
- sll t0, t6, 2 // t0 = thiscolsum * 4
- subu t1, t0, t6 // t1 = thiscolsum * 3
- addu t1, t1, t7
- addiu s4, 4
- shra_r.w t1, t1, 4
- addiu t0, 7
- srl t0, t0, 4
- sb t1, 0(s3)
- sb t0, 1(s3)
- addiu t9, -1
- addiu s3, 2
- bnez t9, 1b
- lw s1, 4(a2)
- srl t0, s4, 2
- subu t0, a0, t0
- bgtz t0, 0b
- addiu a2, 4
-
- RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5
-
- j ra
- nop
-END(jsimd_h2v2_fancy_upsample_mips_dspr2)
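
The "fancy" h2v2 upsampler is a triangle filter: each column is first blended 3:1 with the vertically nearer input row (thiscolsum), then each output pair is blended 3:1 horizontally. The asymmetric rounding constants visible above ((x + 8) >> 4 next to (x + 7) >> 4) keep adjacent outputs from always rounding the same way. The h2v1 routine that follows is the one-dimensional case with weights 3/4 and 1/4 and rounding terms +1/+2 over >> 2. A scalar sketch of one h2v2 output row, assuming downsampled_width >= 2 and edge-replicated input:

    static void h2v2_fancy_row(const unsigned char *near_row,
                               const unsigned char *far_row,
                               unsigned char *out, int downsampled_width)
    {
      int col;
      long prev, curr;

      /* colsum = 3 * nearer input row + farther input row, per column */
      curr = 3L * near_row[0] + far_row[0];
      out[0] = (unsigned char)((curr * 4 + 8) >> 4);      /* left edge */
      for (col = 1; col < downsampled_width; col++) {
        prev = curr;
        curr = 3L * near_row[col] + far_row[col];
        out[2 * col - 1] = (unsigned char)((3 * prev + curr + 7) >> 4);
        out[2 * col]     = (unsigned char)((3 * curr + prev + 8) >> 4);
      }
      out[2 * downsampled_width - 1] = (unsigned char)((curr * 4 + 7) >> 4);
    }
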
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_h2v1_fancy_upsample_mips_dspr2)
-/*
- * a0 - cinfo->max_v_samp_factor
- * a1 - downsampled_width
- * a2 - input_data
- * a3 - output_data_ptr
- */
-
- SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
-
- .set at
-
- beqz a0, 3f
- sll t0, a0, 2
- lw s1, 0(a3)
- li s3, 0x10001
- addu s0, s1, t0
-0:
- addiu t8, a1, -2
- srl t9, t8, 2
- lw t7, 0(a2)
- lw s2, 0(s1)
- lbu t0, 0(t7)
- lbu t1, 1(t7) // t1 = inptr[1]
- sll t2, t0, 1
- addu t2, t2, t0 // t2 = invalue*3
- addu t2, t2, t1
- shra_r.w t2, t2, 2
- sb t0, 0(s2)
- sb t2, 1(s2)
- beqz t9, 11f
- addiu s2, 2
-1:
- ulw t0, 0(t7) // t0 = |P3|P2|P1|P0|
- ulw t1, 1(t7)
- ulh t2, 4(t7) // t2 = |0|0|P5|P4|
- preceu.ph.qbl t3, t0 // t3 = |0|P3|0|P2|
- preceu.ph.qbr t0, t0 // t0 = |0|P1|0|P0|
- preceu.ph.qbr t2, t2 // t2 = |0|P5|0|P4|
- preceu.ph.qbl t4, t1 // t4 = |0|P4|0|P3|
- preceu.ph.qbr t1, t1 // t1 = |0|P2|0|P1|
- shll.ph t5, t4, 1
- shll.ph t6, t1, 1
- addu.ph t5, t5, t4 // t5 = |P4*3|P3*3|
- addu.ph t6, t6, t1 // t6 = |P2*3|P1*3|
- addu.ph t4, t3, s3
- addu.ph t0, t0, s3
- addu.ph t4, t4, t5
- addu.ph t0, t0, t6
- shrl.ph t4, t4, 2 // t4 = |0|P3|0|P2|
- shrl.ph t0, t0, 2 // t0 = |0|P1|0|P0|
- addu.ph t2, t2, t5
- addu.ph t3, t3, t6
- shra_r.ph t2, t2, 2 // t2 = |0|P5|0|P4|
- shra_r.ph t3, t3, 2 // t3 = |0|P3|0|P2|
- shll.ph t2, t2, 8
- shll.ph t3, t3, 8
- or t2, t4, t2
- or t3, t3, t0
- addiu t9, -1
- usw t3, 0(s2)
- usw t2, 4(s2)
- addiu s2, 8
- bgtz t9, 1b
- addiu t7, 4
-11:
- andi t8, 3
- beqz t8, 22f
- addiu t7, 1
-
-2:
- lbu t0, 0(t7)
- addiu t7, 1
- sll t1, t0, 1
- addu t2, t0, t1 // t2 = invalue * 3
- lbu t3, -2(t7)
- lbu t4, 0(t7)
- addiu t3, 1
- addiu t4, 2
- addu t3, t3, t2
- addu t4, t4, t2
- srl t3, 2
- srl t4, 2
- sb t3, 0(s2)
- sb t4, 1(s2)
- addiu t8, -1
- bgtz t8, 2b
- addiu s2, 2
-
-22:
- lbu t0, 0(t7)
- lbu t2, -1(t7)
- sll t1, t0, 1
- addu t1, t1, t0 // t1 = invalue * 3
- addu t1, t1, t2
- addiu t1, 1
- srl t1, t1, 2
- sb t1, 0(s2)
- sb t0, 1(s2)
- addiu s1, 4
- bne s1, s0, 0b
- addiu a2, 4
-3:
- RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
-
- j ra
- nop
-END(jsimd_h2v1_fancy_upsample_mips_dspr2)
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_h2v1_downsample_mips_dspr2)
-/*
- * a0 - cinfo->image_width
- * a1 - cinfo->max_v_samp_factor
- * a2 - compptr->v_samp_factor
- * a3 - compptr->width_in_blocks
- * 16(sp) - input_data
- * 20(sp) - output_data
- */
- .set at
-
- SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4
-
- beqz a2, 7f
- lw s1, 44(sp) // s1 = output_data
- lw s0, 40(sp) // s0 = input_data
- srl s2, a0, 2
- andi t9, a0, 2
- srl t7, t9, 1
- addu s2, t7, s2
- sll t0, a3, 3 // t0 = width_in_blocks*DCTSIZE
- srl t7, t0, 1
- subu s2, t7, s2
-0:
- andi t6, a0, 1 // t6 = temp_index
- addiu t6, -1
- lw t4, 0(s1) // t4 = outptr
- lw t5, 0(s0) // t5 = inptr0
- li s3, 0 // s3 = bias
- srl t7, a0, 1 // t7 = image_width / 2
- srl s4, t7, 2
- andi t8, t7, 3
-1:
- ulhu t0, 0(t5)
- ulhu t1, 2(t5)
- ulhu t2, 4(t5)
- ulhu t3, 6(t5)
- raddu.w.qb t0, t0
- raddu.w.qb t1, t1
- raddu.w.qb t2, t2
- raddu.w.qb t3, t3
- shra.ph t0, t0, 1
- shra_r.ph t1, t1, 1
- shra.ph t2, t2, 1
- shra_r.ph t3, t3, 1
- sb t0, 0(t4)
- sb t1, 1(t4)
- sb t2, 2(t4)
- sb t3, 3(t4)
- addiu s4, -1
- addiu t4, 4
- bgtz s4, 1b
- addiu t5, 8
- beqz t8, 3f
- addu s4, t4, t8
-2:
- ulhu t0, 0(t5)
- raddu.w.qb t0, t0
- addqh.w t0, t0, s3
- xori s3, s3, 1
- sb t0, 0(t4)
- addiu t4, 1
- bne t4, s4, 2b
- addiu t5, 2
-3:
- lbux t1, t6(t5)
- sll t1, 1
- addqh.w t2, t1, s3 // t2 = pixval1
- xori s3, s3, 1
- addqh.w t3, t1, s3 // t3 = pixval2
- blez s2, 5f
- append t3, t2, 8
- addu t5, t4, s2 // t5 = loop_end2
-4:
- ush t3, 0(t4)
- addiu s2, -1
- bgtz s2, 4b
- addiu t4, 2
-5:
- beqz t9, 6f
- nop
- sb t2, 0(t4)
-6:
- addiu s1, 4
- addiu a2, -1
- bnez a2, 0b
- addiu s0, 4
-7:
- RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4
-
- j ra
- nop
-END(jsimd_h2v1_downsample_mips_dspr2)
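
h2v1 downsampling averages horizontal pixel pairs, adding a bias that alternates 0, 1, 0, 1 between columns so rounding never drifts in one direction (the xori above toggles it). The h2v2 routine that follows does the same over 2x2 blocks, with a 1/2 alternating bias and a >> 2. A scalar sketch of one h2v1 row, assuming the input row has already been edge-expanded to an even width:

    static void h2v1_downsample_row(const unsigned char *in, unsigned char *out,
                                    int output_cols)
    {
      int i, bias = 0;                  /* alternates 0, 1, 0, 1, ... */
      for (i = 0; i < output_cols; i++) {
        out[i] = (unsigned char)((in[2 * i] + in[2 * i + 1] + bias) >> 1);
        bias ^= 1;
      }
    }
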
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_h2v2_downsample_mips_dspr2)
-
-/*
- * a0 - cinfo->image_width
- * a1 - cinfo->max_v_samp_factor
- * a2 - compptr->v_samp_factor
- * a3 - compptr->width_in_blocks
- * 16(sp) - input_data
- * 20(sp) - output_data
- */
- .set at
- SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
- beqz a2, 8f
- lw s1, 52(sp) // s1 = output_data
- lw s0, 48(sp) // s0 = input_data
-
- andi t6, a0, 1 // t6 = temp_index
- addiu t6, -1
- srl t7, a0, 1 // t7 = image_width / 2
- srl s4, t7, 2
- andi t8, t7, 3
- andi t9, a0, 2
- srl s2, a0, 2
- srl t7, t9, 1
- addu s2, t7, s2
- sll t0, a3, 3 // t0 = width_in_blocks*DCTSIZE
- srl t7, t0, 1
- subu s2, t7, s2
-0:
- lw t4, 0(s1) // t4 = outptr
- lw t5, 0(s0) // t5 = inptr0
- lw s7, 4(s0) // s7 = inptr1
- li s6, 1 // s6 = bias
-2:
- ulw t0, 0(t5) // t0 = |P3|P2|P1|P0|
- ulw t1, 0(s7) // t1 = |Q3|Q2|Q1|Q0|
- ulw t2, 4(t5)
- ulw t3, 4(s7)
- precrq.ph.w t7, t0, t1 // t7 = |P3|P2|Q3|Q2|
- ins t0, t1, 16, 16 // t0 = |Q1|Q0|P1|P0|
- raddu.w.qb t1, t7
- raddu.w.qb t0, t0
- shra_r.w t1, t1, 2
- addiu t0, 1
- srl t0, 2
- precrq.ph.w t7, t2, t3
- ins t2, t3, 16, 16
- raddu.w.qb t7, t7
- raddu.w.qb t2, t2
- shra_r.w t7, t7, 2
- addiu t2, 1
- srl t2, 2
- sb t0, 0(t4)
- sb t1, 1(t4)
- sb t2, 2(t4)
- sb t7, 3(t4)
- addiu t4, 4
- addiu t5, 8
- addiu s4, s4, -1
- bgtz s4, 2b
- addiu s7, 8
- beqz t8, 4f
- addu t8, t4, t8
-3:
- ulhu t0, 0(t5)
- ulhu t1, 0(s7)
- ins t0, t1, 16, 16
- raddu.w.qb t0, t0
- addu t0, t0, s6
- srl t0, 2
- xori s6, s6, 3
- sb t0, 0(t4)
- addiu t5, 2
- addiu t4, 1
- bne t8, t4, 3b
- addiu s7, 2
-4:
- lbux t1, t6(t5)
- sll t1, 1
- lbux t0, t6(s7)
- sll t0, 1
- addu t1, t1, t0
- addu t3, t1, s6
- srl t0, t3, 2 // t0 = pixval1
- xori s6, s6, 3
- addu t2, t1, s6
- srl t1, t2, 2 // t1 = pixval2
- blez s2, 6f
- append t1, t0, 8
-5:
- ush t1, 0(t4)
- addiu s2, -1
- bgtz s2, 5b
- addiu t4, 2
-6:
- beqz t9, 7f
- nop
- sb t0, 0(t4)
-7:
- addiu s1, 4
- addiu a2, -1
- bnez a2, 0b
- addiu s0, 8
-8:
- RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
- j ra
- nop
-END(jsimd_h2v2_downsample_mips_dspr2)
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_h2v2_smooth_downsample_mips_dspr2)
-/*
- * a0 - input_data
- * a1 - output_data
- * a2 - compptr->v_samp_factor
- * a3 - cinfo->max_v_samp_factor
- * 16(sp) - cinfo->smoothing_factor
- * 20(sp) - compptr->width_in_blocks
- * 24(sp) - cinfo->image_width
- */
-
- .set at
-
- SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
- lw s7, 52(sp) // compptr->width_in_blocks
- lw s0, 56(sp) // cinfo->image_width
- lw s6, 48(sp) // cinfo->smoothing_factor
- sll s7, 3 // output_cols = width_in_blocks * DCTSIZE
- sll v0, s7, 1
- subu v0, v0, s0
- blez v0, 2f
- move v1, zero
- addiu t0, a3, 2 // t0 = cinfo->max_v_samp_factor + 2
-0:
- addiu t1, a0, -4
- sll t2, v1, 2
- lwx t1, t2(t1)
- move t3, v0
- addu t1, t1, s0
- lbu t2, -1(t1)
-1:
- addiu t3, t3, -1
- sb t2, 0(t1)
- bgtz t3, 1b
- addiu t1, t1, 1
- addiu v1, v1, 1
- bne v1, t0, 0b
- nop
-2:
- li v0, 80
- mul v0, s6, v0
- li v1, 16384
- move t4, zero
- move t5, zero
- subu t6, v1, v0 // t6 = 16384 - smoothing_factor * 80
- sll t7, s6, 4 // t7 = smoothing_factor * 16
-3:
-/* Special case for first column: pretend column -1 is same as column 0 */
- sll v0, t4, 2
- lwx t8, v0(a1) // outptr = output_data[outrow]
- sll v1, t5, 2
- addiu t9, v1, 4
- addiu s0, v1, -4
- addiu s1, v1, 8
- lwx s2, v1(a0) // inptr0 = input_data[inrow]
- lwx t9, t9(a0) // inptr1 = input_data[inrow+1]
- lwx s0, s0(a0) // above_ptr = input_data[inrow-1]
- lwx s1, s1(a0) // below_ptr = input_data[inrow+2]
- lh v0, 0(s2)
- lh v1, 0(t9)
- lh t0, 0(s0)
- lh t1, 0(s1)
- ins v0, v1, 16, 16
- ins t0, t1, 16, 16
- raddu.w.qb t2, v0
- raddu.w.qb s3, t0
- lbu v0, 0(s2)
- lbu v1, 2(s2)
- lbu t0, 0(t9)
- lbu t1, 2(t9)
- addu v0, v0, v1
- mult $ac1, t2, t6
- addu t0, t0, t1
- lbu t2, 2(s0)
- addu t0, t0, v0
- lbu t3, 2(s1)
- addu s3, t0, s3
- lbu v0, 0(s0)
- lbu t0, 0(s1)
- sll s3, s3, 1
- addu v0, v0, t2
- addu t0, t0, t3
- addu t0, t0, v0
- addu s3, t0, s3
- madd $ac1, s3, t7
- extr_r.w v0, $ac1, 16
- addiu t8, t8, 1
- addiu s2, s2, 2
- addiu t9, t9, 2
- addiu s0, s0, 2
- addiu s1, s1, 2
- sb v0, -1(t8)
- addiu s4, s7, -2
- and s4, s4, 3
- addu s5, s4, t8 // end address
-4:
- lh v0, 0(s2)
- lh v1, 0(t9)
- lh t0, 0(s0)
- lh t1, 0(s1)
- ins v0, v1, 16, 16
- ins t0, t1, 16, 16
- raddu.w.qb t2, v0
- raddu.w.qb s3, t0
- lbu v0, -1(s2)
- lbu v1, 2(s2)
- lbu t0, -1(t9)
- lbu t1, 2(t9)
- addu v0, v0, v1
- mult $ac1, t2, t6
- addu t0, t0, t1
- lbu t2, 2(s0)
- addu t0, t0, v0
- lbu t3, 2(s1)
- addu s3, t0, s3
- lbu v0, -1(s0)
- lbu t0, -1(s1)
- sll s3, s3, 1
- addu v0, v0, t2
- addu t0, t0, t3
- addu t0, t0, v0
- addu s3, t0, s3
- madd $ac1, s3, t7
- extr_r.w t2, $ac1, 16
- addiu t8, t8, 1
- addiu s2, s2, 2
- addiu t9, t9, 2
- addiu s0, s0, 2
- sb t2, -1(t8)
- bne s5, t8, 4b
- addiu s1, s1, 2
- addiu s5, s7, -2
- subu s5, s5, s4
- addu s5, s5, t8 // end address
-5:
- lh v0, 0(s2)
- lh v1, 0(t9)
- lh t0, 0(s0)
- lh t1, 0(s1)
- ins v0, v1, 16, 16
- ins t0, t1, 16, 16
- raddu.w.qb t2, v0
- raddu.w.qb s3, t0
- lbu v0, -1(s2)
- lbu v1, 2(s2)
- lbu t0, -1(t9)
- lbu t1, 2(t9)
- addu v0, v0, v1
- mult $ac1, t2, t6
- addu t0, t0, t1
- lbu t2, 2(s0)
- addu t0, t0, v0
- lbu t3, 2(s1)
- addu s3, t0, s3
- lbu v0, -1(s0)
- lbu t0, -1(s1)
- sll s3, s3, 1
- addu v0, v0, t2
- addu t0, t0, t3
- lh v1, 2(t9)
- addu t0, t0, v0
- lh v0, 2(s2)
- addu s3, t0, s3
- lh t0, 2(s0)
- lh t1, 2(s1)
- madd $ac1, s3, t7
- extr_r.w t2, $ac1, 16
- ins t0, t1, 16, 16
- ins v0, v1, 16, 16
- raddu.w.qb s3, t0
- lbu v1, 4(s2)
- lbu t0, 1(t9)
- lbu t1, 4(t9)
- sb t2, 0(t8)
- raddu.w.qb t3, v0
- lbu v0, 1(s2)
- addu t0, t0, t1
- mult $ac1, t3, t6
- addu v0, v0, v1
- lbu t2, 4(s0)
- addu t0, t0, v0
- lbu v0, 1(s0)
- addu s3, t0, s3
- lbu t0, 1(s1)
- lbu t3, 4(s1)
- addu v0, v0, t2
- sll s3, s3, 1
- addu t0, t0, t3
- lh v1, 4(t9)
- addu t0, t0, v0
- lh v0, 4(s2)
- addu s3, t0, s3
- lh t0, 4(s0)
- lh t1, 4(s1)
- madd $ac1, s3, t7
- extr_r.w t2, $ac1, 16
- ins t0, t1, 16, 16
- ins v0, v1, 16, 16
- raddu.w.qb s3, t0
- lbu v1, 6(s2)
- lbu t0, 3(t9)
- lbu t1, 6(t9)
- sb t2, 1(t8)
- raddu.w.qb t3, v0
- lbu v0, 3(s2)
- addu t0, t0, t1
- mult $ac1, t3, t6
- addu v0, v0, v1
- lbu t2, 6(s0)
- addu t0, t0, v0
- lbu v0, 3(s0)
- addu s3, t0, s3
- lbu t0, 3(s1)
- lbu t3, 6(s1)
- addu v0, v0, t2
- sll s3, s3, 1
- addu t0, t0, t3
- lh v1, 6(t9)
- addu t0, t0, v0
- lh v0, 6(s2)
- addu s3, t0, s3
- lh t0, 6(s0)
- lh t1, 6(s1)
- madd $ac1, s3, t7
- extr_r.w t3, $ac1, 16
- ins t0, t1, 16, 16
- ins v0, v1, 16, 16
- raddu.w.qb s3, t0
- lbu v1, 8(s2)
- lbu t0, 5(t9)
- lbu t1, 8(t9)
- sb t3, 2(t8)
- raddu.w.qb t2, v0
- lbu v0, 5(s2)
- addu t0, t0, t1
- mult $ac1, t2, t6
- addu v0, v0, v1
- lbu t2, 8(s0)
- addu t0, t0, v0
- lbu v0, 5(s0)
- addu s3, t0, s3
- lbu t0, 5(s1)
- lbu t3, 8(s1)
- addu v0, v0, t2
- sll s3, s3, 1
- addu t0, t0, t3
- addiu t8, t8, 4
- addu t0, t0, v0
- addiu s2, s2, 8
- addu s3, t0, s3
- addiu t9, t9, 8
- madd $ac1, s3, t7
- extr_r.w t1, $ac1, 16
- addiu s0, s0, 8
- addiu s1, s1, 8
- bne s5, t8, 5b
- sb t1, -1(t8)
-/* Special case for last column */
- lh v0, 0(s2)
- lh v1, 0(t9)
- lh t0, 0(s0)
- lh t1, 0(s1)
- ins v0, v1, 16, 16
- ins t0, t1, 16, 16
- raddu.w.qb t2, v0
- raddu.w.qb s3, t0
- lbu v0, -1(s2)
- lbu v1, 1(s2)
- lbu t0, -1(t9)
- lbu t1, 1(t9)
- addu v0, v0, v1
- mult $ac1, t2, t6
- addu t0, t0, t1
- lbu t2, 1(s0)
- addu t0, t0, v0
- lbu t3, 1(s1)
- addu s3, t0, s3
- lbu v0, -1(s0)
- lbu t0, -1(s1)
- sll s3, s3, 1
- addu v0, v0, t2
- addu t0, t0, t3
- addu t0, t0, v0
- addu s3, t0, s3
- madd $ac1, s3, t7
- extr_r.w t0, $ac1, 16
- addiu t5, t5, 2
- sb t0, 0(t8)
- addiu t4, t4, 1
- bne t4, a2, 3b
- addiu t5, t5, 2
-
- RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
- j ra
- nop
-
-END(jsimd_h2v2_smooth_downsample_mips_dspr2)
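
Smooth downsampling blends each 2x2 block with its ring of neighbors: edge neighbors count twice and corner neighbors once (the sll s3, s3, 1 above doubles the edge sum), with the weights 16384 - smoothing_factor * 80 and smoothing_factor * 16 held in t6/t7. A scalar sketch of one output sample; the row pointers are assumed to permit [-1] and [2] accesses, i.e. edge-expanded rows as libjpeg provides:

    static unsigned char smooth_2x2(const unsigned char *above,
                                    const unsigned char *row0,
                                    const unsigned char *row1,
                                    const unsigned char *below,
                                    int smoothing_factor)
    {
      long memberscale = 16384L - smoothing_factor * 80L;  /* t6 above */
      long neighscale  = smoothing_factor * 16L;           /* t7 above */
      long membersum, neighsum;

      /* the four pixels mapped to this output sample (columns 0 and 1) */
      membersum = row0[0] + row0[1] + row1[0] + row1[1];
      /* edge neighbors of the block */
      neighsum = above[0] + above[1] + below[0] + below[1] +
                 row0[-1] + row0[2] + row1[-1] + row1[2];
      neighsum += neighsum;                                /* edges x2 */
      neighsum += above[-1] + above[2] + below[-1] + below[2]; /* corners */

      return (unsigned char)((membersum * memberscale +
                              neighsum * neighscale + 32768L) >> 16);
    }
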
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_int_upsample_mips_dspr2)
-/*
- * a0 - upsample->h_expand[compptr->component_index]
- * a1 - upsample->v_expand[compptr->component_index]
- * a2 - input_data
- * a3 - output_data_ptr
- * 16(sp) - cinfo->output_width
- * 20(sp) - cinfo->max_v_samp_factor
- */
- .set at
-
- SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
-
- lw s0, 0(a3) // s0 = output_data
- lw s1, 32(sp) // s1 = cinfo->output_width
- lw s2, 36(sp) // s2 = cinfo->max_v_samp_factor
- li t6, 0 // t6 = inrow
- beqz s2, 10f
- li s3, 0 // s3 = outrow
-0:
- addu t0, a2, t6
- addu t7, s0, s3
- lw t3, 0(t0) // t3 = inptr
- lw t8, 0(t7) // t8 = outptr
- beqz s1, 4f
- addu t5, t8, s1 // t5 = outend
-1:
- lb t2, 0(t3) // t2 = invalue = *inptr++
- addiu t3, 1
- beqz a0, 3f
- move t0, a0 // t0 = h_expand
-2:
- sb t2, 0(t8)
- addiu t0, -1
- bgtz t0, 2b
- addiu t8, 1
-3:
- bgt t5, t8, 1b
- nop
-4:
- addiu t9, a1, -1 // t9 = v_expand - 1
- blez t9, 9f
- nop
-5:
- lw t3, 0(s0)
- lw t4, 4(s0)
- subu t0, s1, 0xF
- blez t0, 7f
- addu t5, t3, s1 // t5 = end address
- andi t7, s1, 0xF // t7 = residual
- subu t8, t5, t7
-6:
- ulw t0, 0(t3)
- ulw t1, 4(t3)
- ulw t2, 8(t3)
- usw t0, 0(t4)
- ulw t0, 12(t3)
- usw t1, 4(t4)
- usw t2, 8(t4)
- usw t0, 12(t4)
- addiu t3, 16
- bne t3, t8, 6b
- addiu t4, 16
- beqz t7, 8f
- nop
-7:
- lbu t0, 0(t3)
- sb t0, 0(t4)
- addiu t3, 1
- bne t3, t5, 7b
- addiu t4, 1
-8:
- addiu t9, -1
- bgtz t9, 5b
- addiu s0, 8
-9:
- addu s3, s3, a1
- bne s3, s2, 0b
- addiu t6, 1
-10:
- RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
-
- j ra
- nop
-END(jsimd_int_upsample_mips_dspr2)
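
Integer upsampling is plain pixel replication: each input pixel is repeated h_expand times horizontally, and each finished row is copied v_expand - 1 more times (the word-copy loop at label 6: above is that row duplication). A simplified sketch, assuming a flat output buffer with out_stride bytes per row; the real routine works through JSAMPARRAY row pointers instead:

    #include <string.h>

    static void int_upsample_rows(const unsigned char *in, unsigned char *out,
                                  int in_width, int h_expand, int v_expand,
                                  int out_stride)
    {
      int col, h, v;
      for (col = 0; col < in_width; col++)         /* horizontal replication */
        for (h = 0; h < h_expand; h++)
          out[col * h_expand + h] = in[col];
      for (v = 1; v < v_expand; v++)               /* duplicate finished row */
        memcpy(out + v * out_stride, out, (size_t)(in_width * h_expand));
    }
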
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_h2v1_upsample_mips_dspr2)
-/*
- * a0 - cinfo->max_v_samp_factor
- * a1 - cinfo->output_width
- * a2 - input_data
- * a3 - output_data_ptr
- */
- lw t7, 0(a3) // t7 = output_data
- andi t8, a1, 0xf // t8 = residual
- sll t0, a0, 2
- blez a0, 4f
- addu t9, t7, t0 // t9 = output_data end address
-0:
- lw t5, 0(t7) // t5 = outptr
- lw t6, 0(a2) // t6 = inptr
- addu t3, t5, a1 // t3 = outptr + output_width (end address)
- subu t3, t8 // t3 = end address - residual
- beq t5, t3, 2f
- move t4, t8
-1:
- ulw t0, 0(t6) // t0 = |P3|P2|P1|P0|
- ulw t2, 4(t6) // t2 = |P7|P6|P5|P4|
- srl t1, t0, 16 // t1 = |X|X|P3|P2|
- ins t0, t0, 16, 16 // t0 = |P1|P0|P1|P0|
- ins t1, t1, 16, 16 // t1 = |P3|P2|P3|P2|
- ins t0, t0, 8, 16 // t0 = |P1|P1|P0|P0|
- ins t1, t1, 8, 16 // t1 = |P3|P3|P2|P2|
- usw t0, 0(t5)
- usw t1, 4(t5)
- srl t0, t2, 16 // t0 = |X|X|P7|P6|
- ins t2, t2, 16, 16 // t2 = |P5|P4|P5|P4|
- ins t0, t0, 16, 16 // t0 = |P7|P6|P7|P6|
- ins t2, t2, 8, 16 // t2 = |P5|P5|P4|P4|
- ins t0, t0, 8, 16 // t0 = |P7|P7|P6|P6|
- usw t2, 8(t5)
- usw t0, 12(t5)
- addiu t5, 16
- bne t5, t3, 1b
- addiu t6, 8
- beqz t8, 3f
- move t4, t8
-2:
- lbu t1, 0(t6)
- sb t1, 0(t5)
- sb t1, 1(t5)
- addiu t4, -2
- addiu t6, 1
- bgtz t4, 2b
- addiu t5, 2
-3:
- addiu t7, 4
- bne t9, t7, 0b
- addiu a2, 4
-4:
- j ra
- nop
-END(jsimd_h2v1_upsample_mips_dspr2)
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_h2v2_upsample_mips_dspr2)
-/*
- * a0 - cinfo->max_v_samp_factor
- * a1 - cinfo->output_width
- * a2 - input_data
- * a3 - output_data_ptr
- */
- lw t7, 0(a3)
- blez a0, 7f
- andi t9, a1, 0xf // t9 = residual
-0:
- lw t6, 0(a2) // t6 = inptr
- lw t5, 0(t7) // t5 = outptr
- addu t8, t5, a1 // t8 = outptr end address
- subu t8, t9 // t8 = end address - residual
- beq t5, t8, 2f
- move t4, t9
-1:
- ulw t0, 0(t6)
- srl t1, t0, 16
- ins t0, t0, 16, 16
- ins t0, t0, 8, 16
- ins t1, t1, 16, 16
- ins t1, t1, 8, 16
- ulw t2, 4(t6)
- usw t0, 0(t5)
- usw t1, 4(t5)
- srl t3, t2, 16
- ins t2, t2, 16, 16
- ins t2, t2, 8, 16
- ins t3, t3, 16, 16
- ins t3, t3, 8, 16
- usw t2, 8(t5)
- usw t3, 12(t5)
- addiu t5, 16
- bne t5, t8, 1b
- addiu t6, 8
- beqz t9, 3f
- move t4, t9
-2:
- lbu t0, 0(t6)
- sb t0, 0(t5)
- sb t0, 1(t5)
- addiu t4, -2
- addiu t6, 1
- bgtz t4, 2b
- addiu t5, 2
-3:
- lw t6, 0(t7) // t6 = outptr[0]
- lw t5, 4(t7) // t5 = outptr[1]
- addu t4, t6, a1 // t4 = new end address
- beq a1, t9, 5f
- subu t8, t4, t9
-4:
- ulw t0, 0(t6)
- ulw t1, 4(t6)
- ulw t2, 8(t6)
- usw t0, 0(t5)
- ulw t0, 12(t6)
- usw t1, 4(t5)
- usw t2, 8(t5)
- usw t0, 12(t5)
- addiu t6, 16
- bne t6, t8, 4b
- addiu t5, 16
- beqz t9, 6f
- nop
-5:
- lbu t0, 0(t6)
- sb t0, 0(t5)
- addiu t6, 1
- bne t6, t4, 5b
- addiu t5, 1
-6:
- addiu t7, 8
- addiu a0, -2
- bgtz a0, 0b
- addiu a2, 4
-7:
- j ra
- nop
-END(jsimd_h2v2_upsample_mips_dspr2)
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_idct_islow_mips_dspr2)
-/*
- * a0 - coef_block
- * a1 - compptr->dcttable
- * a2 - output
- * a3 - range_limit
- */
-
- SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
- addiu sp, sp, -256
- move v0, sp
- addiu v1, zero, 8 // v1 = DCTSIZE = 8
-1:
- lh s4, 32(a0) // s4 = inptr[16]
- lh s5, 64(a0) // s5 = inptr[32]
- lh s6, 96(a0) // s6 = inptr[48]
- lh t1, 112(a0) // t1 = inptr[56]
- lh t7, 16(a0) // t7 = inptr[8]
- lh t5, 80(a0) // t5 = inptr[40]
- lh t3, 48(a0) // t3 = inptr[24]
- or s4, s4, t1
- or s4, s4, t3
- or s4, s4, t5
- or s4, s4, t7
- or s4, s4, s5
- or s4, s4, s6
- bnez s4, 2f
- addiu v1, v1, -1
- lh s5, 0(a1) // quantptr[DCTSIZE*0]
- lh s6, 0(a0) // inptr[DCTSIZE*0]
- mul s5, s5, s6 // DEQUANTIZE(inptr[0], quantptr[0])
- sll s5, s5, 2
- sw s5, 0(v0)
- sw s5, 32(v0)
- sw s5, 64(v0)
- sw s5, 96(v0)
- sw s5, 128(v0)
- sw s5, 160(v0)
- sw s5, 192(v0)
- b 3f
- sw s5, 224(v0)
-2:
- lh t0, 112(a1)
- lh t2, 48(a1)
- lh t4, 80(a1)
- lh t6, 16(a1)
- mul t0, t0, t1 // DEQUANTIZE(inptr[DCTSIZE*7],quant[DCTSIZE*7])
- mul t1, t2, t3 // DEQUANTIZE(inptr[DCTSIZE*3],quant[DCTSIZE*3])
- mul t2, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*5],quant[DCTSIZE*5])
- mul t3, t6, t7 // DEQUANTIZE(inptr[DCTSIZE*1],quant[DCTSIZE*1])
- lh t4, 32(a1)
- lh t5, 32(a0)
- lh t6, 96(a1)
- lh t7, 96(a0)
- addu s0, t0, t1 // z3 = tmp0 + tmp2
- addu s1, t1, t2 // z2 = tmp1 + tmp2
- addu s2, t2, t3 // z4 = tmp1 + tmp3
- addu s3, s0, s2 // z3 + z4
- addiu t9, zero, 9633 // FIX_1_175875602
- mul s3, s3, t9 // z5 = MULTIPLY(z3 + z4, FIX_1_175875602)
- addu t8, t0, t3 // z1 = tmp0 + tmp3
- addiu t9, zero, 2446 // FIX_0_298631336
- mul t0, t0, t9 // tmp0 = MULTIPLY(tmp0, FIX_0_298631336)
- addiu t9, zero, 16819 // FIX_2_053119869
- mul t2, t2, t9 // tmp1 = MULTIPLY(tmp1, FIX_2_053119869)
- addiu t9, zero, 25172 // FIX_3_072711026
- mul t1, t1, t9 // tmp2 = MULTIPLY(tmp2, FIX_3_072711026)
- addiu t9, zero, 12299 // FIX_1_501321110
- mul t3, t3, t9 // tmp3 = MULTIPLY(tmp3, FIX_1_501321110)
- addiu t9, zero, 16069 // FIX_1_961570560
- mul s0, s0, t9 // -z3 = MULTIPLY(z3, FIX_1_961570560)
- addiu t9, zero, 3196 // FIX_0_390180644
- mul s2, s2, t9 // -z4 = MULTIPLY(z4, FIX_0_390180644)
- addiu t9, zero, 7373 // FIX_0_899976223
- mul t8, t8, t9 // -z1 = MULTIPLY(z1, FIX_0_899976223)
- addiu t9, zero, 20995 // FIX_2_562915447
- mul s1, s1, t9 // -z2 = MULTIPLY(z2, FIX_2_562915447)
- subu s0, s3, s0 // z3 += z5
- addu t0, t0, s0 // tmp0 += z3
- addu t1, t1, s0 // tmp2 += z3
- subu s2, s3, s2 // z4 += z5
- addu t2, t2, s2 // tmp1 += z4
- addu t3, t3, s2 // tmp3 += z4
- subu t0, t0, t8 // tmp0 += z1
- subu t1, t1, s1 // tmp2 += z2
- subu t2, t2, s1 // tmp1 += z2
- subu t3, t3, t8 // tmp3 += z1
- mul s0, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*2],quant[DCTSIZE*2])
- addiu t9, zero, 6270 // FIX_0_765366865
- mul s1, t6, t7 // DEQUANTIZE(inptr[DCTSIZE*6],quant[DCTSIZE*6])
- lh t4, 0(a1)
- lh t5, 0(a0)
- lh t6, 64(a1)
- lh t7, 64(a0)
- mul s2, t9, s0 // MULTIPLY(z2, FIX_0_765366865)
- mul t5, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*0],quant[DCTSIZE*0])
- mul t6, t6, t7 // DEQUANTIZE(inptr[DCTSIZE*4],quant[DCTSIZE*4])
- addiu t9, zero, 4433 // FIX_0_541196100
- addu s3, s0, s1 // z2 + z3
- mul s3, s3, t9 // z1 = MULTIPLY(z2 + z3, FIX_0_541196100)
- addiu t9, zero, 15137 // FIX_1_847759065
- mul t8, s1, t9 // MULTIPLY(z3, FIX_1_847759065)
- addu t4, t5, t6
- subu t5, t5, t6
- sll t4, t4, 13 // tmp0 = (z2 + z3) << CONST_BITS
- sll t5, t5, 13 // tmp1 = (z2 - z3) << CONST_BITS
- addu t7, s3, s2 // tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865)
- subu t6, s3, t8 // tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065)
- addu s0, t4, t7
- subu s1, t4, t7
- addu s2, t5, t6
- subu s3, t5, t6
- addu t4, s0, t3
- subu s0, s0, t3
- addu t3, s2, t1
- subu s2, s2, t1
- addu t1, s3, t2
- subu s3, s3, t2
- addu t2, s1, t0
- subu s1, s1, t0
- shra_r.w t4, t4, 11
- shra_r.w t3, t3, 11
- shra_r.w t1, t1, 11
- shra_r.w t2, t2, 11
- shra_r.w s1, s1, 11
- shra_r.w s3, s3, 11
- shra_r.w s2, s2, 11
- shra_r.w s0, s0, 11
- sw t4, 0(v0)
- sw t3, 32(v0)
- sw t1, 64(v0)
- sw t2, 96(v0)
- sw s1, 128(v0)
- sw s3, 160(v0)
- sw s2, 192(v0)
- sw s0, 224(v0)
-3:
- addiu a1, a1, 2
- addiu a0, a0, 2
- bgtz v1, 1b
- addiu v0, v0, 4
- move v0, sp
- addiu v1, zero, 8
-4:
- lw t0, 8(v0) // z2 = (JLONG) wsptr[2]
- lw t1, 24(v0) // z3 = (JLONG) wsptr[6]
- lw t2, 0(v0) // (JLONG) wsptr[0]
- lw t3, 16(v0) // (JLONG) wsptr[4]
- lw s4, 4(v0) // (JLONG) wsptr[1]
- lw s5, 12(v0) // (JLONG) wsptr[3]
- lw s6, 20(v0) // (JLONG) wsptr[5]
- lw s7, 28(v0) // (JLONG) wsptr[7]
- or s4, s4, t0
- or s4, s4, t1
- or s4, s4, t3
- or s4, s4, s7
- or s4, s4, s5
- or s4, s4, s6
- bnez s4, 5f
- addiu v1, v1, -1
- shra_r.w s5, t2, 5
- andi s5, s5, 0x3ff
- lbux s5, s5(a3)
- lw s1, 0(a2)
- replv.qb s5, s5
- usw s5, 0(s1)
- usw s5, 4(s1)
- b 6f
- nop
-5:
- addu t4, t0, t1 // z2 + z3
- addiu t8, zero, 4433 // FIX_0_541196100
- mul t5, t4, t8 // z1 = MULTIPLY(z2 + z3, FIX_0_541196100)
- addiu t8, zero, 15137 // FIX_1_847759065
- mul t1, t1, t8 // MULTIPLY(z3, FIX_1_847759065)
- addiu t8, zero, 6270 // FIX_0_765366865
- mul t0, t0, t8 // MULTIPLY(z2, FIX_0_765366865)
- addu t4, t2, t3 // (JLONG) wsptr[0] + (JLONG) wsptr[4]
- subu t2, t2, t3 // (JLONG) wsptr[0] - (JLONG) wsptr[4]
- sll t4, t4, 13 // tmp0 = ((wsptr[0] + wsptr[4]) << CONST_BITS
- sll t2, t2, 13 // tmp1 = ((wsptr[0] - wsptr[4]) << CONST_BITS
- subu t1, t5, t1 // tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065)
- subu t3, t2, t1 // tmp12 = tmp1 - tmp2
- addu t2, t2, t1 // tmp11 = tmp1 + tmp2
- addu t5, t5, t0 // tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865)
- subu t1, t4, t5 // tmp13 = tmp0 - tmp3
- addu t0, t4, t5 // tmp10 = tmp0 + tmp3
- lw t4, 28(v0) // tmp0 = (JLONG) wsptr[7]
- lw t6, 12(v0) // tmp2 = (JLONG) wsptr[3]
- lw t5, 20(v0) // tmp1 = (JLONG) wsptr[5]
- lw t7, 4(v0) // tmp3 = (JLONG) wsptr[1]
- addu s0, t4, t6 // z3 = tmp0 + tmp2
- addiu t8, zero, 9633 // FIX_1_175875602
- addu s1, t5, t7 // z4 = tmp1 + tmp3
- addu s2, s0, s1 // z3 + z4
- mul s2, s2, t8 // z5 = MULTIPLY(z3 + z4, FIX_1_175875602)
- addu s3, t4, t7 // z1 = tmp0 + tmp3
- addu t9, t5, t6 // z2 = tmp1 + tmp2
- addiu t8, zero, 16069 // FIX_1_961570560
- mul s0, s0, t8 // -z3 = MULTIPLY(z3, FIX_1_961570560)
- addiu t8, zero, 3196 // FIX_0_390180644
- mul s1, s1, t8 // -z4 = MULTIPLY(z4, FIX_0_390180644)
- addiu t8, zero, 2446 // FIX_0_298631336
- mul t4, t4, t8 // tmp0 = MULTIPLY(tmp0, FIX_0_298631336)
- addiu t8, zero, 7373 // FIX_0_899976223
- mul s3, s3, t8 // -z1 = MULTIPLY(z1, FIX_0_899976223)
- addiu t8, zero, 16819 // FIX_2_053119869
- mul t5, t5, t8 // tmp1 = MULTIPLY(tmp1, FIX_2_053119869)
- addiu t8, zero, 20995 // FIX_2_562915447
- mul t9, t9, t8 // -z2 = MULTIPLY(z2, FIX_2_562915447)
- addiu t8, zero, 25172 // FIX_3_072711026
- mul t6, t6, t8 // tmp2 = MULTIPLY(tmp2, FIX_3_072711026)
- addiu t8, zero, 12299 // FIX_1_501321110
- mul t7, t7, t8 // tmp3 = MULTIPLY(tmp3, FIX_1_501321110)
- subu s0, s2, s0 // z3 += z5
- subu s1, s2, s1 // z4 += z5
- addu t4, t4, s0
- subu t4, t4, s3 // tmp0
- addu t5, t5, s1
- subu t5, t5, t9 // tmp1
- addu t6, t6, s0
- subu t6, t6, t9 // tmp2
- addu t7, t7, s1
- subu t7, t7, s3 // tmp3
- addu s0, t0, t7
- subu t0, t0, t7
- addu t7, t2, t6
- subu t2, t2, t6
- addu t6, t3, t5
- subu t3, t3, t5
- addu t5, t1, t4
- subu t1, t1, t4
- shra_r.w s0, s0, 18
- shra_r.w t7, t7, 18
- shra_r.w t6, t6, 18
- shra_r.w t5, t5, 18
- shra_r.w t1, t1, 18
- shra_r.w t3, t3, 18
- shra_r.w t2, t2, 18
- shra_r.w t0, t0, 18
- andi s0, s0, 0x3ff
- andi t7, t7, 0x3ff
- andi t6, t6, 0x3ff
- andi t5, t5, 0x3ff
- andi t1, t1, 0x3ff
- andi t3, t3, 0x3ff
- andi t2, t2, 0x3ff
- andi t0, t0, 0x3ff
- lw s1, 0(a2)
- lbux s0, s0(a3)
- lbux t7, t7(a3)
- lbux t6, t6(a3)
- lbux t5, t5(a3)
- lbux t1, t1(a3)
- lbux t3, t3(a3)
- lbux t2, t2(a3)
- lbux t0, t0(a3)
- sb s0, 0(s1)
- sb t7, 1(s1)
- sb t6, 2(s1)
- sb t5, 3(s1)
- sb t1, 4(s1)
- sb t3, 5(s1)
- sb t2, 6(s1)
- sb t0, 7(s1)
-6:
- addiu v0, v0, 32
- bgtz v1, 4b
- addiu a2, a2, 4
- addiu sp, sp, 256
-
- RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
- j ra
- nop
-
-END(jsimd_idct_islow_mips_dspr2)
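
The comments in the routine above mirror libjpeg's MULTIPLY/DESCALE idiom from jidctint.c, with CONST_BITS = 13 fixed-point constants (4433 = FIX_0_541196100, 6270 = FIX_0_765366865, 15137 = FIX_1_847759065). As a rough scalar reference, a minimal C sketch of the even-part butterfly those comments describe (names follow the assembly comments; this is an illustration, not a drop-in replacement for the vector code):

  #include <stdint.h>

  #define CONST_BITS 13
  #define FIX_0_541196100  4433   /* 0.541196100 * 2^13, rounded */
  #define FIX_0_765366865  6270
  #define FIX_1_847759065 15137

  /* Round away the low n fraction bits, as shra_r.w does above. */
  static int32_t descale(int32_t x, int n) {
    return (x + (1 << (n - 1))) >> n;
  }

  /* Even part: z1/tmp2/tmp3 as in the assembly comments. */
  static void islow_even(int32_t z2, int32_t z3,
                         int32_t *tmp2, int32_t *tmp3) {
    int32_t z1 = (z2 + z3) * FIX_0_541196100;
    *tmp2 = z1 - z3 * FIX_1_847759065;
    *tmp3 = z1 + z2 * FIX_0_765366865;
  }

  /* Example: one output, descaled as in the "shra_r.w ..., 11"
   * instructions above (CONST_BITS - PASS1_BITS = 11). */
  static int32_t islow_out0(int32_t tmp0, int32_t tmp3) {
    return descale(tmp0 + tmp3, 11);
  }
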
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_idct_ifast_cols_mips_dspr2)
-/*
- * a0 - inptr
- * a1 - quantptr
- * a2 - wsptr
- * a3 - mips_idct_ifast_coefs
- */
-
- SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
- addiu t9, a0, 16 // end address
- or AT, a3, zero
-
-0:
- lw s0, 0(a1) // quantptr[DCTSIZE*0]
- lw t0, 0(a0) // inptr[DCTSIZE*0]
- lw t1, 16(a0) // inptr[DCTSIZE*1]
- muleq_s.w.phl v0, t0, s0 // tmp0 ...
- lw t2, 32(a0) // inptr[DCTSIZE*2]
- lw t3, 48(a0) // inptr[DCTSIZE*3]
- lw t4, 64(a0) // inptr[DCTSIZE*4]
- lw t5, 80(a0) // inptr[DCTSIZE*5]
- muleq_s.w.phr t0, t0, s0 // ... tmp0 ...
- lw t6, 96(a0) // inptr[DCTSIZE*6]
- lw t7, 112(a0) // inptr[DCTSIZE*7]
- or s4, t1, t2
- or s5, t3, t4
- bnez s4, 1f
- ins t0, v0, 16, 16 // ... tmp0
- bnez s5, 1f
- or s6, t5, t6
- or s6, s6, t7
- bnez s6, 1f
- sw t0, 0(a2) // wsptr[DCTSIZE*0]
- sw t0, 16(a2) // wsptr[DCTSIZE*1]
- sw t0, 32(a2) // wsptr[DCTSIZE*2]
- sw t0, 48(a2) // wsptr[DCTSIZE*3]
- sw t0, 64(a2) // wsptr[DCTSIZE*4]
- sw t0, 80(a2) // wsptr[DCTSIZE*5]
- sw t0, 96(a2) // wsptr[DCTSIZE*6]
- sw t0, 112(a2) // wsptr[DCTSIZE*7]
- addiu a0, a0, 4
- b 2f
- addiu a1, a1, 4
-
-1:
- lw s1, 32(a1) // quantptr[DCTSIZE*2]
- lw s2, 64(a1) // quantptr[DCTSIZE*4]
- muleq_s.w.phl v0, t2, s1 // tmp1 ...
- muleq_s.w.phr t2, t2, s1 // ... tmp1 ...
- lw s0, 16(a1) // quantptr[DCTSIZE*1]
- lw s1, 48(a1) // quantptr[DCTSIZE*3]
- lw s3, 96(a1) // quantptr[DCTSIZE*6]
- muleq_s.w.phl v1, t4, s2 // tmp2 ...
- muleq_s.w.phr t4, t4, s2 // ... tmp2 ...
- lw s2, 80(a1) // quantptr[DCTSIZE*5]
- lw t8, 4(AT) // FIX(1.414213562)
- ins t2, v0, 16, 16 // ... tmp1
- muleq_s.w.phl v0, t6, s3 // tmp3 ...
- muleq_s.w.phr t6, t6, s3 // ... tmp3 ...
- ins t4, v1, 16, 16 // ... tmp2
- addq.ph s4, t0, t4 // tmp10
- subq.ph s5, t0, t4 // tmp11
- ins t6, v0, 16, 16 // ... tmp3
- subq.ph s6, t2, t6 // tmp12 ...
- addq.ph s7, t2, t6 // tmp13
- mulq_s.ph s6, s6, t8 // ... tmp12 ...
- addq.ph t0, s4, s7 // tmp0
- subq.ph t6, s4, s7 // tmp3
- muleq_s.w.phl v0, t1, s0 // tmp4 ...
- muleq_s.w.phr t1, t1, s0 // ... tmp4 ...
- shll_s.ph s6, s6, 1 // x2
- lw s3, 112(a1) // quantptr[DCTSIZE*7]
- subq.ph s6, s6, s7 // ... tmp12
- muleq_s.w.phl v1, t7, s3 // tmp7 ...
- muleq_s.w.phr t7, t7, s3 // ... tmp7 ...
- ins t1, v0, 16, 16 // ... tmp4
- addq.ph t2, s5, s6 // tmp1
- subq.ph t4, s5, s6 // tmp2
- muleq_s.w.phl v0, t5, s2 // tmp6 ...
- muleq_s.w.phr t5, t5, s2 // ... tmp6 ...
- ins t7, v1, 16, 16 // ... tmp7
- addq.ph s5, t1, t7 // z11
- subq.ph s6, t1, t7 // z12
- muleq_s.w.phl v1, t3, s1 // tmp5 ...
- muleq_s.w.phr t3, t3, s1 // ... tmp5 ...
- ins t5, v0, 16, 16 // ... tmp6
- ins t3, v1, 16, 16 // ... tmp5
- addq.ph s7, t5, t3 // z13
- subq.ph v0, t5, t3 // z10
- addq.ph t7, s5, s7 // tmp7
- subq.ph s5, s5, s7 // tmp11 ...
- addq.ph v1, v0, s6 // z5 ...
- mulq_s.ph s5, s5, t8 // ... tmp11
- lw t8, 8(AT) // FIX(1.847759065)
- lw s4, 0(AT) // FIX(1.082392200)
- addq.ph s0, t0, t7
- subq.ph s1, t0, t7
- mulq_s.ph v1, v1, t8 // ... z5
- shll_s.ph s5, s5, 1 // x2
- lw t8, 12(AT) // FIX(-2.613125930)
- sw s0, 0(a2) // wsptr[DCTSIZE*0]
- shll_s.ph v0, v0, 1 // x4
- mulq_s.ph v0, v0, t8 // tmp12 ...
- mulq_s.ph s4, s6, s4 // tmp10 ...
- shll_s.ph v1, v1, 1 // x2
- addiu a0, a0, 4
- addiu a1, a1, 4
- sw s1, 112(a2) // wsptr[DCTSIZE*7]
- shll_s.ph s6, v0, 1 // x4
- shll_s.ph s4, s4, 1 // x2
- addq.ph s6, s6, v1 // ... tmp12
- subq.ph t5, s6, t7 // tmp6
- subq.ph s4, s4, v1 // ... tmp10
- subq.ph t3, s5, t5 // tmp5
- addq.ph s2, t2, t5
- addq.ph t1, s4, t3 // tmp4
- subq.ph s3, t2, t5
- sw s2, 16(a2) // wsptr[DCTSIZE*1]
- sw s3, 96(a2) // wsptr[DCTSIZE*6]
- addq.ph v0, t4, t3
- subq.ph v1, t4, t3
- sw v0, 32(a2) // wsptr[DCTSIZE*2]
- sw v1, 80(a2) // wsptr[DCTSIZE*5]
- addq.ph v0, t6, t1
- subq.ph v1, t6, t1
- sw v0, 64(a2) // wsptr[DCTSIZE*4]
- sw v1, 48(a2) // wsptr[DCTSIZE*3]
-
-2:
- bne a0, t9, 0b
- addiu a2, a2, 4
-
- RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
- j ra
- nop
-
-END(jsimd_idct_ifast_cols_mips_dspr2)
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_idct_ifast_rows_mips_dspr2)
-/*
- * a0 - wsptr
- * a1 - output_buf
- * a2 - output_col
- * a3 - mips_idct_ifast_coefs
- */
-
- SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3
-
- addiu t9, a0, 128 // end address
- lui s8, 0x8080
- ori s8, s8, 0x8080
-
-0:
- lw AT, 36(sp) // restore $a3 (mips_idct_ifast_coefs)
- lw t0, 0(a0) // wsptr[DCTSIZE*0+0/1] b a
- lw s0, 16(a0) // wsptr[DCTSIZE*1+0/1] B A
- lw t2, 4(a0) // wsptr[DCTSIZE*0+2/3] d c
- lw s2, 20(a0) // wsptr[DCTSIZE*1+2/3] D C
- lw t4, 8(a0) // wsptr[DCTSIZE*0+4/5] f e
- lw s4, 24(a0) // wsptr[DCTSIZE*1+4/5] F E
- lw t6, 12(a0) // wsptr[DCTSIZE*0+6/7] h g
- lw s6, 28(a0) // wsptr[DCTSIZE*1+6/7] H G
- precrq.ph.w t1, s0, t0 // B b
- ins t0, s0, 16, 16 // A a
- bnez t1, 1f
- or s0, t2, s2
- bnez s0, 1f
- or s0, t4, s4
- bnez s0, 1f
- or s0, t6, s6
- bnez s0, 1f
- shll_s.ph s0, t0, 2 // A a
- lw a3, 0(a1)
- lw AT, 4(a1)
- precrq.ph.w t0, s0, s0 // A A
- ins s0, s0, 16, 16 // a a
- addu a3, a3, a2
- addu AT, AT, a2
- precrq.qb.ph t0, t0, t0 // A A A A
- precrq.qb.ph s0, s0, s0 // a a a a
- addu.qb s0, s0, s8
- addu.qb t0, t0, s8
- sw s0, 0(a3)
- sw s0, 4(a3)
- sw t0, 0(AT)
- sw t0, 4(AT)
- addiu a0, a0, 32
- bne a0, t9, 0b
- addiu a1, a1, 8
- b 2f
- nop
-
-1:
- precrq.ph.w t3, s2, t2
- ins t2, s2, 16, 16
- precrq.ph.w t5, s4, t4
- ins t4, s4, 16, 16
- precrq.ph.w t7, s6, t6
- ins t6, s6, 16, 16
- lw t8, 4(AT) // FIX(1.414213562)
- addq.ph s4, t0, t4 // tmp10
- subq.ph s5, t0, t4 // tmp11
- subq.ph s6, t2, t6 // tmp12 ...
- addq.ph s7, t2, t6 // tmp13
- mulq_s.ph s6, s6, t8 // ... tmp12 ...
- addq.ph t0, s4, s7 // tmp0
- subq.ph t6, s4, s7 // tmp3
- shll_s.ph s6, s6, 1 // x2
- subq.ph s6, s6, s7 // ... tmp12
- addq.ph t2, s5, s6 // tmp1
- subq.ph t4, s5, s6 // tmp2
- addq.ph s5, t1, t7 // z11
- subq.ph s6, t1, t7 // z12
- addq.ph s7, t5, t3 // z13
- subq.ph v0, t5, t3 // z10
- addq.ph t7, s5, s7 // tmp7
- subq.ph s5, s5, s7 // tmp11 ...
- addq.ph v1, v0, s6 // z5 ...
- mulq_s.ph s5, s5, t8 // ... tmp11
- lw t8, 8(AT) // FIX(1.847759065)
- lw s4, 0(AT) // FIX(1.082392200)
- addq.ph s0, t0, t7 // tmp0 + tmp7
- subq.ph s7, t0, t7 // tmp0 - tmp7
- mulq_s.ph v1, v1, t8 // ... z5
- lw a3, 0(a1)
- lw t8, 12(AT) // FIX(-2.613125930)
- shll_s.ph s5, s5, 1 // x2
- addu a3, a3, a2
- shll_s.ph v0, v0, 1 // x4
- mulq_s.ph v0, v0, t8 // tmp12 ...
- mulq_s.ph s4, s6, s4 // tmp10 ...
- shll_s.ph v1, v1, 1 // x2
- addiu a0, a0, 32
- addiu a1, a1, 8
- shll_s.ph s6, v0, 1 // x4
- shll_s.ph s4, s4, 1 // x2
- addq.ph s6, s6, v1 // ... tmp12
- shll_s.ph s0, s0, 2
- subq.ph t5, s6, t7 // tmp6
- subq.ph s4, s4, v1 // ... tmp10
- subq.ph t3, s5, t5 // tmp5
- shll_s.ph s7, s7, 2
- addq.ph t1, s4, t3 // tmp4
- addq.ph s1, t2, t5 // tmp1 + tmp6
- subq.ph s6, t2, t5 // tmp1 - tmp6
- addq.ph s2, t4, t3 // tmp2 + tmp5
- subq.ph s5, t4, t3 // tmp2 - tmp5
- addq.ph s4, t6, t1 // tmp3 + tmp4
- subq.ph s3, t6, t1 // tmp3 - tmp4
- shll_s.ph s1, s1, 2
- shll_s.ph s2, s2, 2
- shll_s.ph s3, s3, 2
- shll_s.ph s4, s4, 2
- shll_s.ph s5, s5, 2
- shll_s.ph s6, s6, 2
- precrq.ph.w t0, s1, s0 // B A
- ins s0, s1, 16, 16 // b a
- precrq.ph.w t2, s3, s2 // D C
- ins s2, s3, 16, 16 // d c
- precrq.ph.w t4, s5, s4 // F E
- ins s4, s5, 16, 16 // f e
- precrq.ph.w t6, s7, s6 // H G
- ins s6, s7, 16, 16 // h g
- precrq.qb.ph t0, t2, t0 // D C B A
- precrq.qb.ph s0, s2, s0 // d c b a
- precrq.qb.ph t4, t6, t4 // H G F E
- precrq.qb.ph s4, s6, s4 // h g f e
- addu.qb s0, s0, s8
- addu.qb s4, s4, s8
- sw s0, 0(a3) // outptr[0/1/2/3] d c b a
- sw s4, 4(a3) // outptr[4/5/6/7] h g f e
- lw a3, -4(a1)
- addu.qb t0, t0, s8
- addu a3, a3, a2
- addu.qb t4, t4, s8
- sw t0, 0(a3) // outptr[0/1/2/3] D C B A
- bne a0, t9, 0b
- sw t4, 4(a3) // outptr[4/5/6/7] H G F E
-
-2:
-
- RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3
-
- j ra
- nop
-
-END(jsimd_idct_ifast_rows_mips_dspr2)
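
For orientation, the dequantize-and-butterfly sequences above follow libjpeg's AAN "ifast" IDCT (jidctfst.c), where constants such as FIX(1.414213562) act as 8-bit fixed-point multipliers. A hedged scalar sketch of the even-part butterfly that the tmp10..tmp13 comments refer to, assuming the Q8 scaling of the scalar ifast code (the assembly uses Q15 mulq_s.ph plus doubling shifts to the same effect):

  #include <stdint.h>

  #define IFAST_SCALE_BITS 8
  #define FIX_1_414213562 362          /* sqrt(2) in Q8 */

  static int16_t mul_q8(int16_t a, int32_t c) {
    return (int16_t)((a * c + 128) >> IFAST_SCALE_BITS);
  }

  /* in[0..3] = column entries 0, 4, 2, 6; out[0..3] = tmp0..tmp3. */
  static void ifast_even(const int16_t in[4], int16_t out[4]) {
    int16_t tmp10 = in[0] + in[1];
    int16_t tmp11 = in[0] - in[1];
    int16_t tmp13 = in[2] + in[3];
    int16_t tmp12 = mul_q8(in[2] - in[3], FIX_1_414213562) - tmp13;
    out[0] = tmp10 + tmp13;
    out[3] = tmp10 - tmp13;
    out[1] = tmp11 + tmp12;
    out[2] = tmp11 - tmp12;
  }
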
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_fdct_islow_mips_dspr2)
-/*
- * a0 - data
- */
-
- SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8
-
- lui t0, 6437
- ori t0, 2260
- lui t1, 9633
- ori t1, 11363
- lui t2, 0xd39e
- ori t2, 0xe6dc
- lui t3, 0xf72d
- ori t3, 9633
- lui t4, 2261
- ori t4, 9633
- lui t5, 0xd39e
- ori t5, 6437
- lui t6, 9633
- ori t6, 0xd39d
- lui t7, 0xe6dc
- ori t7, 2260
- lui t8, 4433
- ori t8, 10703
- lui t9, 0xd630
- ori t9, 4433
- li s8, 8
- move a1, a0
-1:
- lw s0, 0(a1) // tmp0 = 1|0
- lw s1, 4(a1) // tmp1 = 3|2
- lw s2, 8(a1) // tmp2 = 5|4
- lw s3, 12(a1) // tmp3 = 7|6
- packrl.ph s1, s1, s1 // tmp1 = 2|3
- packrl.ph s3, s3, s3 // tmp3 = 6|7
- subq.ph s7, s1, s2 // tmp7 = 2-5|3-4 = t5|t4
- subq.ph s5, s0, s3 // tmp5 = 1-6|0-7 = t6|t7
- mult $0, $0 // ac0 = 0
- dpa.w.ph $ac0, s7, t0 // ac0 += t5* 6437 + t4* 2260
- dpa.w.ph $ac0, s5, t1 // ac0 += t6* 9633 + t7* 11363
- mult $ac1, $0, $0 // ac1 = 0
- dpa.w.ph $ac1, s7, t2 // ac1 += t5*-11362 + t4* -6436
- dpa.w.ph $ac1, s5, t3 // ac1 += t6* -2259 + t7* 9633
- mult $ac2, $0, $0 // ac2 = 0
- dpa.w.ph $ac2, s7, t4 // ac2 += t5* 2261 + t4* 9633
- dpa.w.ph $ac2, s5, t5 // ac2 += t6*-11362 + t7* 6437
- mult $ac3, $0, $0 // ac3 = 0
- dpa.w.ph $ac3, s7, t6 // ac3 += t5* 9633 + t4*-11363
- dpa.w.ph $ac3, s5, t7 // ac3 += t6* -6436 + t7* 2260
- addq.ph s6, s1, s2 // tmp6 = 2+5|3+4 = t2|t3
- addq.ph s4, s0, s3 // tmp4 = 1+6|0+7 = t1|t0
- extr_r.w s0, $ac0, 11 // tmp0 = (ac0 + 1024) >> 11
- extr_r.w s1, $ac1, 11 // tmp1 = (ac1 + 1024) >> 11
- extr_r.w s2, $ac2, 11 // tmp2 = (ac2 + 1024) >> 11
- extr_r.w s3, $ac3, 11 // tmp3 = (ac3 + 1024) >> 11
- addq.ph s5, s4, s6 // tmp5 = t1+t2|t0+t3 = t11|t10
- subq.ph s7, s4, s6 // tmp7 = t1-t2|t0-t3 = t12|t13
- sh s0, 2(a1)
- sh s1, 6(a1)
- sh s2, 10(a1)
- sh s3, 14(a1)
- mult $0, $0 // ac0 = 0
- dpa.w.ph $ac0, s7, t8 // ac0 += t12* 4433 + t13* 10703
- mult $ac1, $0, $0 // ac1 = 0
- dpa.w.ph $ac1, s7, t9 // ac1 += t12*-10704 + t13* 4433
- sra s4, s5, 16 // tmp4 = t11
- addiu a1, a1, 16
- addiu s8, s8, -1
- extr_r.w s0, $ac0, 11 // tmp0 = (ac0 + 1024) >> 11
- extr_r.w s1, $ac1, 11 // tmp1 = (ac1 + 1024) >> 11
- addu s2, s5, s4 // tmp2 = t10 + t11
- subu s3, s5, s4 // tmp3 = t10 - t11
- sll s2, s2, 2 // tmp2 = (t10 + t11) << 2
- sll s3, s3, 2 // tmp3 = (t10 - t11) << 2
- sh s2, -16(a1)
- sh s3, -8(a1)
- sh s0, -12(a1)
- bgtz s8, 1b
- sh s1, -4(a1)
- li t0, 2260
- li t1, 11363
- li t2, 9633
- li t3, 6436
- li t4, 6437
- li t5, 2261
- li t6, 11362
- li t7, 2259
- li t8, 4433
- li t9, 10703
- li a1, 10704
- li s8, 8
-
-2:
- lh a2, 0(a0) // 0
- lh a3, 16(a0) // 8
- lh v0, 32(a0) // 16
- lh v1, 48(a0) // 24
- lh s4, 64(a0) // 32
- lh s5, 80(a0) // 40
- lh s6, 96(a0) // 48
- lh s7, 112(a0) // 56
- addu s2, v0, s5 // tmp2 = 16 + 40
- subu s5, v0, s5 // tmp5 = 16 - 40
- addu s3, v1, s4 // tmp3 = 24 + 32
- subu s4, v1, s4 // tmp4 = 24 - 32
- addu s0, a2, s7 // tmp0 = 0 + 56
- subu s7, a2, s7 // tmp7 = 0 - 56
- addu s1, a3, s6 // tmp1 = 8 + 48
- subu s6, a3, s6 // tmp6 = 8 - 48
- addu a2, s0, s3 // tmp10 = tmp0 + tmp3
- subu v1, s0, s3 // tmp13 = tmp0 - tmp3
- addu a3, s1, s2 // tmp11 = tmp1 + tmp2
- subu v0, s1, s2 // tmp12 = tmp1 - tmp2
- mult s7, t1 // ac0 = tmp7 * c1
- madd s4, t0 // ac0 += tmp4 * c0
- madd s5, t4 // ac0 += tmp5 * c4
- madd s6, t2 // ac0 += tmp6 * c2
- mult $ac1, s7, t2 // ac1 = tmp7 * c2
- msub $ac1, s4, t3 // ac1 -= tmp4 * c3
- msub $ac1, s5, t6 // ac1 -= tmp5 * c6
- msub $ac1, s6, t7 // ac1 -= tmp6 * c7
- mult $ac2, s7, t4 // ac2 = tmp7 * c4
- madd $ac2, s4, t2 // ac2 += tmp4 * c2
- madd $ac2, s5, t5 // ac2 += tmp5 * c5
- msub $ac2, s6, t6 // ac2 -= tmp6 * c6
- mult $ac3, s7, t0 // ac3 = tmp7 * c0
- msub $ac3, s4, t1 // ac3 -= tmp4 * c1
- madd $ac3, s5, t2 // ac3 += tmp5 * c2
- msub $ac3, s6, t3 // ac3 -= tmp6 * c3
- extr_r.w s0, $ac0, 15 // tmp0 = (ac0 + 16384) >> 15
- extr_r.w s1, $ac1, 15 // tmp1 = (ac1 + 16384) >> 15
- extr_r.w s2, $ac2, 15 // tmp2 = (ac2 + 16384) >> 15
- extr_r.w s3, $ac3, 15 // tmp3 = (ac3 + 16384) >> 15
- addiu s8, s8, -1
- addu s4, a2, a3 // tmp4 = tmp10 + tmp11
- subu s5, a2, a3 // tmp5 = tmp10 - tmp11
- sh s0, 16(a0)
- sh s1, 48(a0)
- sh s2, 80(a0)
- sh s3, 112(a0)
- mult v0, t8 // ac0 = tmp12 * c8
- madd v1, t9 // ac0 += tmp13 * c9
- mult $ac1, v1, t8 // ac1 = tmp13 * c8
- msub $ac1, v0, a1 // ac1 -= tmp12 * c10
- addiu a0, a0, 2
- extr_r.w s6, $ac0, 15 // tmp6 = (ac0 + 16384) >> 15
- extr_r.w s7, $ac1, 15 // tmp7 = (ac1 + 16384) >> 15
- shra_r.w s4, s4, 2 // tmp4 = (tmp4 + 2) >> 2
- shra_r.w s5, s5, 2 // tmp5 = (tmp5 + 2) >> 2
- sh s4, -2(a0)
- sh s5, 62(a0)
- sh s6, 30(a0)
- bgtz s8, 2b
- sh s7, 94(a0)
-
- RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8
-
- jr ra
- nop
-
-END(jsimd_fdct_islow_mips_dspr2)
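
The FDCT above leans on the DSPr2 dpa.w.ph instruction, which the inline comments expand into pairs of products. Per the MIPS DSP ASE semantics, it is a dual multiply-accumulate over the packed 16-bit halves of its operands; modeled roughly in C (the accumulator is returned here instead of living in ac0..ac3):

  #include <stdint.h>

  /* Model of "dpa.w.ph acc, rs, rt": both halfword products are
   * added into the 64-bit accumulator. */
  static int64_t dpa_w_ph(int64_t acc, uint32_t rs, uint32_t rt) {
    int32_t hi = (int16_t)(rs >> 16) * (int16_t)(rt >> 16);
    int32_t lo = (int16_t)(rs & 0xffff) * (int16_t)(rt & 0xffff);
    return acc + hi + lo;
  }
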
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_fdct_ifast_mips_dspr2)
-/*
- * a0 - data
- */
- .set at
- SAVE_REGS_ON_STACK 8, s0, s1
- li a1, 0x014e014e // FIX_1_306562965 (334 << 16)|(334 & 0xffff)
- li a2, 0x008b008b // FIX_0_541196100 (139 << 16)|(139 & 0xffff)
- li a3, 0x00620062 // FIX_0_382683433 (98 << 16) |(98 & 0xffff)
- li s1, 0x00b500b5 // FIX_0_707106781 (181 << 16)|(181 & 0xffff)
-
- move v0, a0
- addiu v1, v0, 128 // end address
-
-0:
- lw t0, 0(v0) // tmp0 = 1|0
- lw t1, 4(v0) // tmp1 = 3|2
- lw t2, 8(v0) // tmp2 = 5|4
- lw t3, 12(v0) // tmp3 = 7|6
- packrl.ph t1, t1, t1 // tmp1 = 2|3
- packrl.ph t3, t3, t3 // tmp3 = 6|7
- subq.ph t7, t1, t2 // tmp7 = 2-5|3-4 = t5|t4
- subq.ph t5, t0, t3 // tmp5 = 1-6|0-7 = t6|t7
- addq.ph t6, t1, t2 // tmp6 = 2+5|3+4 = t2|t3
- addq.ph t4, t0, t3 // tmp4 = 1+6|0+7 = t1|t0
- addq.ph t8, t4, t6 // tmp5 = t1+t2|t0+t3 = t11|t10
- subq.ph t9, t4, t6 // tmp7 = t1-t2|t0-t3 = t12|t13
- sra t4, t8, 16 // tmp4 = t11
- mult $0, $0 // ac0 = 0
- dpa.w.ph $ac0, t9, s1 // ac0 += t12*181 + t13*181
- mult $ac1, $0, $0 // ac1 = 0
- dpa.w.ph $ac1, t7, a3 // ac1 += t4*98 + t5*98
- dpsx.w.ph $ac1, t5, a3 // ac1 += t6*98 + t7*98
- mult $ac2, $0, $0 // ac2 = 0
- dpa.w.ph $ac2, t7, a2 // ac2 += t4*139 + t5*139
- mult $ac3, $0, $0 // ac3 = 0
- dpa.w.ph $ac3, t5, a1 // ac3 += t6*334 + t7*334
- precrq.ph.w t0, t5, t7 // t0 = t5|t6
- addq.ph t2, t8, t4 // tmp2 = t10 + t11
- subq.ph t3, t8, t4 // tmp3 = t10 - t11
- extr.w t4, $ac0, 8
- mult $0, $0 // ac0 = 0
- dpa.w.ph $ac0, t0, s1 // ac0 += t5*181 + t6*181
- extr.w t0, $ac1, 8 // t0 = z5
- extr.w t1, $ac2, 8 // t1 = MULTIPLY(tmp10, 139)
- extr.w t7, $ac3, 8 // t2 = MULTIPLY(tmp12, 334)
- extr.w t8, $ac0, 8 // t8 = z3 = MULTIPLY(tmp11, 181)
- add t6, t1, t0 // t6 = z2
- add t7, t7, t0 // t7 = z4
- subq.ph t0, t5, t8 // t0 = z13 = tmp7 - z3
- addq.ph t8, t5, t8 // t8 = z11 = tmp7 + z3
- addq.ph t1, t0, t6 // t1 = z13 + z2
- subq.ph t6, t0, t6 // t6 = z13 - z2
- addq.ph t0, t8, t7 // t0 = z11 + z4
- subq.ph t7, t8, t7 // t7 = z11 - z4
- addq.ph t5, t4, t9
- subq.ph t4, t9, t4
- sh t2, 0(v0)
- sh t5, 4(v0)
- sh t3, 8(v0)
- sh t4, 12(v0)
- sh t1, 10(v0)
- sh t6, 6(v0)
- sh t0, 2(v0)
- sh t7, 14(v0)
- addiu v0, 16
- bne v1, v0, 0b
- nop
- move v0, a0
- addiu v1, v0, 16
-
-1:
- lh t0, 0(v0) // 0
- lh t1, 16(v0) // 8
- lh t2, 32(v0) // 16
- lh t3, 48(v0) // 24
- lh t4, 64(v0) // 32
- lh t5, 80(v0) // 40
- lh t6, 96(v0) // 48
- lh t7, 112(v0) // 56
- add t8, t0, t7 // t8 = tmp0
- sub t7, t0, t7 // t7 = tmp7
- add t0, t1, t6 // t0 = tmp1
- sub t1, t1, t6 // t1 = tmp6
- add t6, t2, t5 // t6 = tmp2
- sub t5, t2, t5 // t5 = tmp5
- add t2, t3, t4 // t2 = tmp3
- sub t3, t3, t4 // t3 = tmp4
- add t4, t8, t2 // t4 = tmp10 = tmp0 + tmp3
- sub t8, t8, t2 // t8 = tmp13 = tmp0 - tmp3
- sub s0, t0, t6 // s0 = tmp12 = tmp1 - tmp2
- ins t8, s0, 16, 16 // t8 = tmp12|tmp13
- add t2, t0, t6 // t2 = tmp11 = tmp1 + tmp2
- mult $0, $0 // ac0 = 0
- dpa.w.ph $ac0, t8, s1 // ac0 += t12*181 + t13*181
- add s0, t4, t2 // s0 = tmp10 + tmp11
- sub t4, t4, t2 // t4 = tmp10-tmp11
- sh s0, 0(v0)
- sh t4, 64(v0)
- extr.w t2, $ac0, 8 // z1 = MULTIPLY(tmp12+tmp13,FIX_0_707106781)
- addq.ph t4, t8, t2 // t4 = tmp13 + z1
- subq.ph t8, t8, t2 // t8 = tmp13 - z1
- sh t4, 32(v0)
- sh t8, 96(v0)
- add t3, t3, t5 // t3 = tmp10 = tmp4 + tmp5
- add t0, t5, t1 // t0 = tmp11 = tmp5 + tmp6
- add t1, t1, t7 // t1 = tmp12 = tmp6 + tmp7
- andi t4, a1, 0xffff
- mul s0, t1, t4
- sra s0, s0, 8 // s0 = z4 = MULTIPLY(tmp12, FIX_1_306562965)
- ins t1, t3, 16, 16 // t1 = tmp10|tmp12
- mult $0, $0 // ac0 = 0
- mulsa.w.ph $ac0, t1, a3 // ac0 += t10*98 - t12*98
- extr.w t8, $ac0, 8 // z5 = MULTIPLY(tmp10-tmp12,FIX_0_382683433)
- add t2, t7, t8 // t2 = tmp7 + z5
- sub t7, t7, t8 // t7 = tmp7 - z5
- andi t4, a2, 0xffff
- mul t8, t3, t4
- sra t8, t8, 8 // t8 = z2 = MULTIPLY(tmp10, FIX_0_541196100)
- andi t4, s1, 0xffff
- mul t6, t0, t4
- sra t6, t6, 8 // t6 = z3 = MULTIPLY(tmp11, FIX_0_707106781)
- add t0, t6, t8 // t0 = z3 + z2
- sub t1, t6, t8 // t1 = z3 - z2
- add t3, t6, s0 // t3 = z3 + z4
- sub t4, t6, s0 // t4 = z3 - z4
- sub t5, t2, t1 // t5 = dataptr[5]
- sub t6, t7, t0 // t6 = dataptr[3]
- add t3, t2, t3 // t3 = dataptr[1]
- add t4, t7, t4 // t4 = dataptr[7]
- sh t5, 80(v0)
- sh t6, 48(v0)
- sh t3, 16(v0)
- sh t4, 112(v0)
- addiu v0, 2
- bne v0, v1, 1b
- nop
-
- RESTORE_REGS_FROM_STACK 8, s0, s1
-
- j ra
- nop
-END(jsimd_fdct_ifast_mips_dspr2)
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_quantize_mips_dspr2)
-/*
- * a0 - coef_block
- * a1 - divisors
- * a2 - workspace
- */
-
- .set at
-
- SAVE_REGS_ON_STACK 16, s0, s1, s2
-
- addiu v0, a2, 124 // v0 = workspace_end
- lh t0, 0(a2)
- lh t1, 0(a1)
- lh t2, 128(a1)
- sra t3, t0, 15
- sll t3, t3, 1
- addiu t3, t3, 1
- mul t0, t0, t3
- lh t4, 384(a1)
- lh t5, 130(a1)
- lh t6, 2(a2)
- lh t7, 2(a1)
- lh t8, 386(a1)
-
-1:
- andi t1, 0xffff
- add t9, t0, t2
- andi t9, 0xffff
- mul v1, t9, t1
- sra s0, t6, 15
- sll s0, s0, 1
- addiu s0, s0, 1
- addiu t9, t4, 16
- srav v1, v1, t9
- mul v1, v1, t3
- mul t6, t6, s0
- andi t7, 0xffff
- addiu a2, a2, 4
- addiu a1, a1, 4
- add s1, t6, t5
- andi s1, 0xffff
- sh v1, 0(a0)
-
- mul s2, s1, t7
- addiu s1, t8, 16
- srav s2, s2, s1
- mul s2, s2, s0
- lh t0, 0(a2)
- lh t1, 0(a1)
- sra t3, t0, 15
- sll t3, t3, 1
- addiu t3, t3, 1
- mul t0, t0, t3
- lh t2, 128(a1)
- lh t4, 384(a1)
- lh t5, 130(a1)
- lh t8, 386(a1)
- lh t6, 2(a2)
- lh t7, 2(a1)
- sh s2, 2(a0)
- lh t0, 0(a2)
- sra t3, t0, 15
- sll t3, t3, 1
- addiu t3, t3, 1
- mul t0, t0, t3
- bne a2, v0, 1b
- addiu a0, a0, 4
-
- andi t1, 0xffff
- add t9, t0, t2
- andi t9, 0xffff
- mul v1, t9, t1
- sra s0, t6, 15
- sll s0, s0, 1
- addiu s0, s0, 1
- addiu t9, t4, 16
- srav v1, v1, t9
- mul v1, v1, t3
- mul t6, t6, s0
- andi t7, 0xffff
- sh v1, 0(a0)
- add s1, t6, t5
- andi s1, 0xffff
- mul s2, s1, t7
- addiu s1, t8, 16
- addiu a2, a2, 4
- addiu a1, a1, 4
- srav s2, s2, s1
- mul s2, s2, s0
- sh s2, 2(a0)
-
- RESTORE_REGS_FROM_STACK 16, s0, s1, s2
-
- j ra
- nop
-
-END(jsimd_quantize_mips_dspr2)
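
The loop above divides each coefficient by multiplying with a precomputed reciprocal rather than using a hardware divide. Reading the byte offsets from the divisors pointer (0 = reciprocal, 128 = correction, 384 = shift; an assumption inferred from the loads, consistent with libjpeg-turbo's divisor-table layout), one coefficient is processed roughly like this:

  #include <stdint.h>

  static int16_t quantize_one(int16_t coef, uint16_t recip,
                              uint16_t corr, uint16_t shift) {
    /* the sra/sll/addiu sequence above builds sign = -1 or +1 */
    int sign = (coef < 0) ? -1 : 1;
    uint32_t mag = (uint32_t)(coef * sign);           /* |coef| */
    /* (|coef| + corr) * recip, then drop (shift + 16) bits */
    uint32_t q = ((mag + corr) * (uint32_t)recip) >> (shift + 16);
    return (int16_t)((int32_t)q * sign);
  }
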
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_quantize_float_mips_dspr2)
-/*
- * a0 - coef_block
- * a1 - divisors
- * a2 - workspace
- */
-
- .set at
-
- li t1, 0x46800100 // integer representation of 16384.5f
- mtc1 t1, f0
- li t0, 63
-0:
- lwc1 f2, 0(a2)
- lwc1 f10, 0(a1)
- lwc1 f4, 4(a2)
- lwc1 f12, 4(a1)
- lwc1 f6, 8(a2)
- lwc1 f14, 8(a1)
- lwc1 f8, 12(a2)
- lwc1 f16, 12(a1)
- madd.s f2, f0, f2, f10
- madd.s f4, f0, f4, f12
- madd.s f6, f0, f6, f14
- madd.s f8, f0, f8, f16
- lwc1 f10, 16(a1)
- lwc1 f12, 20(a1)
- trunc.w.s f2, f2
- trunc.w.s f4, f4
- trunc.w.s f6, f6
- trunc.w.s f8, f8
- lwc1 f14, 24(a1)
- lwc1 f16, 28(a1)
- mfc1 t1, f2
- mfc1 t2, f4
- mfc1 t3, f6
- mfc1 t4, f8
- lwc1 f2, 16(a2)
- lwc1 f4, 20(a2)
- lwc1 f6, 24(a2)
- lwc1 f8, 28(a2)
- madd.s f2, f0, f2, f10
- madd.s f4, f0, f4, f12
- madd.s f6, f0, f6, f14
- madd.s f8, f0, f8, f16
- addiu t1, t1, -16384
- addiu t2, t2, -16384
- addiu t3, t3, -16384
- addiu t4, t4, -16384
- trunc.w.s f2, f2
- trunc.w.s f4, f4
- trunc.w.s f6, f6
- trunc.w.s f8, f8
- sh t1, 0(a0)
- sh t2, 2(a0)
- sh t3, 4(a0)
- sh t4, 6(a0)
- mfc1 t1, f2
- mfc1 t2, f4
- mfc1 t3, f6
- mfc1 t4, f8
- addiu t0, t0, -8
- addiu a2, a2, 32
- addiu a1, a1, 32
- addiu t1, t1, -16384
- addiu t2, t2, -16384
- addiu t3, t3, -16384
- addiu t4, t4, -16384
- sh t1, 8(a0)
- sh t2, 10(a0)
- sh t3, 12(a0)
- sh t4, 14(a0)
- bgez t0, 0b
- addiu a0, a0, 16
-
- j ra
- nop
-
-END(jsimd_quantize_float_mips_dspr2)
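
The float variant above avoids an explicit rounding step: 0x46800100 is the IEEE-754 encoding of 16384.5f, so madd.s folds the multiply and a +16384.5 bias into one instruction; truncation then behaves like round-half-up for in-range values, and the -16384 fixup recovers the rounded quotient. A hedged scalar equivalent:

  #include <stdint.h>

  static int16_t quantize_float_one(float coef, float recip) {
    float biased = coef * recip + 16384.5f;  /* madd.s with f0 above */
    int32_t t = (int32_t)biased;             /* trunc.w.s */
    return (int16_t)(t - 16384);             /* addiu -16384 */
  }
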
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_idct_2x2_mips_dspr2)
-/*
- * a0 - compptr->dct_table
- * a1 - coef_block
- * a2 - output_buf
- * a3 - output_col
- */
- .set at
-
- SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5
-
- addiu sp, sp, -40
- move v0, sp
- addiu s2, zero, 29692
- addiu s3, zero, -10426
- addiu s4, zero, 6967
- addiu s5, zero, -5906
- lh t0, 0(a1) // t0 = inptr[DCTSIZE*0]
- lh t5, 0(a0) // t5 = quantptr[DCTSIZE*0]
- lh t1, 48(a1) // t1 = inptr[DCTSIZE*3]
- lh t6, 48(a0) // t6 = quantptr[DCTSIZE*3]
- mul t4, t5, t0
- lh t0, 16(a1) // t0 = inptr[DCTSIZE*1]
- lh t5, 16(a0) // t5 = quantptr[DCTSIZE*1]
- mul t6, t6, t1
- mul t5, t5, t0
- lh t2, 80(a1) // t2 = inptr[DCTSIZE*5]
- lh t7, 80(a0) // t7 = quantptr[DCTSIZE*5]
- lh t3, 112(a1) // t3 = inptr[DCTSIZE*7]
- lh t8, 112(a0) // t8 = quantptr[DCTSIZE*7]
- mul t7, t7, t2
- mult zero, zero
- mul t8, t8, t3
- li s0, 0x73FCD746 // s0 = (29692 << 16) | (-10426 & 0xffff)
- li s1, 0x1B37E8EE // s1 = (6967 << 16) | (-5906 & 0xffff)
- ins t6, t5, 16, 16 // t6 = t5|t6
- sll t4, t4, 15
- dpa.w.ph $ac0, t6, s0
- lh t1, 2(a1)
- lh t6, 2(a0)
- ins t8, t7, 16, 16 // t8 = t7|t8
- dpa.w.ph $ac0, t8, s1
- mflo t0, $ac0
- mul t5, t6, t1
- lh t1, 18(a1)
- lh t6, 18(a0)
- lh t2, 50(a1)
- lh t7, 50(a0)
- mul t6, t6, t1
- subu t8, t4, t0
- mul t7, t7, t2
- addu t0, t4, t0
- shra_r.w t0, t0, 13
- lh t1, 82(a1)
- lh t2, 82(a0)
- lh t3, 114(a1)
- lh t4, 114(a0)
- shra_r.w t8, t8, 13
- mul t1, t1, t2
- mul t3, t3, t4
- sw t0, 0(v0)
- sw t8, 20(v0)
- sll t4, t5, 15
- ins t7, t6, 16, 16
- mult zero, zero
- dpa.w.ph $ac0, t7, s0
- ins t3, t1, 16, 16
- lh t1, 6(a1)
- lh t6, 6(a0)
- dpa.w.ph $ac0, t3, s1
- mflo t0, $ac0
- mul t5, t6, t1
- lh t1, 22(a1)
- lh t6, 22(a0)
- lh t2, 54(a1)
- lh t7, 54(a0)
- mul t6, t6, t1
- subu t8, t4, t0
- mul t7, t7, t2
- addu t0, t4, t0
- shra_r.w t0, t0, 13
- lh t1, 86(a1)
- lh t2, 86(a0)
- lh t3, 118(a1)
- lh t4, 118(a0)
- shra_r.w t8, t8, 13
- mul t1, t1, t2
- mul t3, t3, t4
- sw t0, 4(v0)
- sw t8, 24(v0)
- sll t4, t5, 15
- ins t7, t6, 16, 16
- mult zero, zero
- dpa.w.ph $ac0, t7, s0
- ins t3, t1, 16, 16
- lh t1, 10(a1)
- lh t6, 10(a0)
- dpa.w.ph $ac0, t3, s1
- mflo t0, $ac0
- mul t5, t6, t1
- lh t1, 26(a1)
- lh t6, 26(a0)
- lh t2, 58(a1)
- lh t7, 58(a0)
- mul t6, t6, t1
- subu t8, t4, t0
- mul t7, t7, t2
- addu t0, t4, t0
- shra_r.w t0, t0, 13
- lh t1, 90(a1)
- lh t2, 90(a0)
- lh t3, 122(a1)
- lh t4, 122(a0)
- shra_r.w t8, t8, 13
- mul t1, t1, t2
- mul t3, t3, t4
- sw t0, 8(v0)
- sw t8, 28(v0)
- sll t4, t5, 15
- ins t7, t6, 16, 16
- mult zero, zero
- dpa.w.ph $ac0, t7, s0
- ins t3, t1, 16, 16
- lh t1, 14(a1)
- lh t6, 14(a0)
- dpa.w.ph $ac0, t3, s1
- mflo t0, $ac0
- mul t5, t6, t1
- lh t1, 30(a1)
- lh t6, 30(a0)
- lh t2, 62(a1)
- lh t7, 62(a0)
- mul t6, t6, t1
- subu t8, t4, t0
- mul t7, t7, t2
- addu t0, t4, t0
- shra_r.w t0, t0, 13
- lh t1, 94(a1)
- lh t2, 94(a0)
- lh t3, 126(a1)
- lh t4, 126(a0)
- shra_r.w t8, t8, 13
- mul t1, t1, t2
- mul t3, t3, t4
- sw t0, 12(v0)
- sw t8, 32(v0)
- sll t4, t5, 15
- ins t7, t6, 16, 16
- mult zero, zero
- dpa.w.ph $ac0, t7, s0
- ins t3, t1, 16, 16
- dpa.w.ph $ac0, t3, s1
- mflo t0, $ac0
- lw t9, 0(a2)
- lw t3, 0(v0)
- lw t7, 4(v0)
- lw t1, 8(v0)
- addu t9, t9, a3
- sll t3, t3, 15
- subu t8, t4, t0
- addu t0, t4, t0
- shra_r.w t0, t0, 13
- shra_r.w t8, t8, 13
- sw t0, 16(v0)
- sw t8, 36(v0)
- lw t5, 12(v0)
- lw t6, 16(v0)
- mult t7, s2
- madd t1, s3
- madd t5, s4
- madd t6, s5
- lw t5, 24(v0)
- lw t7, 28(v0)
- mflo t0, $ac0
- lw t8, 32(v0)
- lw t2, 36(v0)
- mult $ac1, t5, s2
- madd $ac1, t7, s3
- madd $ac1, t8, s4
- madd $ac1, t2, s5
- addu t1, t3, t0
- subu t6, t3, t0
- shra_r.w t1, t1, 20
- shra_r.w t6, t6, 20
- mflo t4, $ac1
- shll_s.w t1, t1, 24
- shll_s.w t6, t6, 24
- sra t1, t1, 24
- sra t6, t6, 24
- addiu t1, t1, 128
- addiu t6, t6, 128
- lw t0, 20(v0)
- sb t1, 0(t9)
- sb t6, 1(t9)
- sll t0, t0, 15
- lw t9, 4(a2)
- addu t1, t0, t4
- subu t6, t0, t4
- addu t9, t9, a3
- shra_r.w t1, t1, 20
- shra_r.w t6, t6, 20
- shll_s.w t1, t1, 24
- shll_s.w t6, t6, 24
- sra t1, t1, 24
- sra t6, t6, 24
- addiu t1, t1, 128
- addiu t6, t6, 128
- sb t1, 0(t9)
- sb t6, 1(t9)
- addiu sp, sp, 40
-
- RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5
-
- j ra
- nop
-
-END(jsimd_idct_2x2_mips_dspr2)
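
The output stage above (shll_s.w by 24, sra by 24, addiu 128) is a saturating clamp: the saturating left shift pins the descaled value to a signed 8-bit range in the top byte, the arithmetic right shift brings it back down, and the +128 recenters it to the unsigned sample range. In approximate scalar C terms:

  #include <stdint.h>

  static uint8_t range_limit(int32_t x) {
    if (x >  127) x =  127;    /* shll_s.w saturation */
    if (x < -128) x = -128;
    return (uint8_t)(x + 128); /* recenter to [0, 255] */
  }
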
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_idct_4x4_mips_dspr2)
-/*
- * a0 - compptr->dct_table
- * a1 - coef_block
- * a2 - output_buf
- * a3 - output_col
- * 16(sp) - workspace[DCTSIZE*4]; // buffers data between passes
- */
-
- .set at
- SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
- lw v1, 48(sp)
- move t0, a1
- move t1, v1
- li t9, 4
- li s0, 0x2e75f93e
- li s1, 0x21f9ba79
- li s2, 0xecc2efb0
- li s3, 0x52031ccd
-
-0:
- lh s6, 32(t0) // inptr[DCTSIZE*2]
- lh t6, 32(a0) // quantptr[DCTSIZE*2]
- lh s7, 96(t0) // inptr[DCTSIZE*6]
- lh t7, 96(a0) // quantptr[DCTSIZE*6]
- mul t6, s6, t6 // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2])
- lh s4, 0(t0) // inptr[DCTSIZE*0]
- mul t7, s7, t7 // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6])
- lh s5, 0(a0) // quantptr[0]
- li s6, 15137
- li s7, 6270
- mul t2, s4, s5 // tmp0 = (inptr[0] * quantptr[0])
- mul t6, s6, t6 // MULTIPLY(z2, FIX_1_847759065)
- lh t5, 112(t0) // inptr[DCTSIZE*7]
- mul t7, s7, t7 // MULTIPLY(z3, FIX_0_765366865)
- lh s4, 112(a0) // quantptr[DCTSIZE*7]
- lh v0, 80(t0) // inptr[DCTSIZE*5]
- lh s5, 80(a0) // quantptr[DCTSIZE*5]
- lh s6, 48(a0) // quantptr[DCTSIZE*3]
- sll t2, t2, 14 // tmp0 <<= (CONST_BITS+1)
- lh s7, 16(a0) // quantptr[DCTSIZE*1]
- lh t8, 16(t0) // inptr[DCTSIZE*1]
- subu t6, t6, t7 // tmp2 = MULTIPLY(z2, FIX_1_847759065) - MULTIPLY(z3, FIX_0_765366865)
- lh t7, 48(t0) // inptr[DCTSIZE*3]
- mul t5, s4, t5 // z1 = (inptr[DCTSIZE*7] * quantptr[DCTSIZE*7])
- mul v0, s5, v0 // z2 = (inptr[DCTSIZE*5] * quantptr[DCTSIZE*5])
- mul t7, s6, t7 // z3 = (inptr[DCTSIZE*3] * quantptr[DCTSIZE*3])
- mul t8, s7, t8 // z4 = (inptr[DCTSIZE*1] * quantptr[DCTSIZE*1])
- addu t3, t2, t6 // tmp10 = tmp0 + z2
- subu t4, t2, t6 // tmp12 = tmp0 - z2
- mult $ac0, zero, zero
- mult $ac1, zero, zero
- ins t5, v0, 16, 16
- ins t7, t8, 16, 16
- addiu t9, t9, -1
- dpa.w.ph $ac0, t5, s0
- dpa.w.ph $ac0, t7, s1
- dpa.w.ph $ac1, t5, s2
- dpa.w.ph $ac1, t7, s3
- mflo s4, $ac0
- mflo s5, $ac1
- addiu a0, a0, 2
- addiu t1, t1, 4
- addiu t0, t0, 2
- addu t6, t4, s4
- subu t5, t4, s4
- addu s6, t3, s5
- subu s7, t3, s5
- shra_r.w t6, t6, 12 // DESCALE(tmp12 + temp1, 12)
- shra_r.w t5, t5, 12 // DESCALE(tmp12 - temp1, 12)
- shra_r.w s6, s6, 12 // DESCALE(tmp10 + temp2, 12)
- shra_r.w s7, s7, 12 // DESCALE(tmp10 - temp2, 12)
- sw t6, 28(t1)
- sw t5, 60(t1)
- sw s6, -4(t1)
- bgtz t9, 0b
- sw s7, 92(t1)
- // second loop: three iterations
- li t9, 3
-1:
- lh s6, 34(t0) // inptr[DCTSIZE*2]
- lh t6, 34(a0) // quantptr[DCTSIZE*2]
- lh s7, 98(t0) // inptr[DCTSIZE*6]
- lh t7, 98(a0) // quantptr[DCTSIZE*6]
- mul t6, s6, t6 // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2])
- lh s4, 2(t0) // inptr[DCTSIZE*0]
- mul t7, s7, t7 // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6])
- lh s5, 2(a0) // quantptr[DCTSIZE*0]
- li s6, 15137
- li s7, 6270
- mul t2, s4, s5 // tmp0 = (inptr[0] * quantptr[0])
- mul v0, s6, t6 // MULTIPLY(z2, FIX_1_847759065)
- lh t5, 114(t0) // inptr[DCTSIZE*7]
- mul t7, s7, t7 // MULTIPLY(z3, FIX_0_765366865)
- lh s4, 114(a0) // quantptr[DCTSIZE*7]
- lh s5, 82(a0) // quantptr[DCTSIZE*5]
- lh t6, 82(t0) // inptr[DCTSIZE*5]
- sll t2, t2, 14 // tmp0 <<= (CONST_BITS+1)
- lh s6, 50(a0) // quantptr[DCTSIZE*3]
- lh t8, 18(t0) // inptr[DCTSIZE*1]
- subu v0, v0, t7 // tmp2 = MULTIPLY(z2, FIX_1_847759065) - MULTIPLY(z3, FIX_0_765366865)
- lh t7, 50(t0) // inptr[DCTSIZE*3]
- lh s7, 18(a0) // quantptr[DCTSIZE*1]
- mul t5, s4, t5 // z1 = (inptr[DCTSIZE*7] * quantptr[DCTSIZE*7])
- mul t6, s5, t6 // z2 = (inptr[DCTSIZE*5] * quantptr[DCTSIZE*5])
- mul t7, s6, t7 // z3 = (inptr[DCTSIZE*3] * quantptr[DCTSIZE*3])
- mul t8, s7, t8 // z4 = (inptr[DCTSIZE*1] * quantptr[DCTSIZE*1])
- addu t3, t2, v0 // tmp10 = tmp0 + z2
- subu t4, t2, v0 // tmp12 = tmp0 - z2
- mult $ac0, zero, zero
- mult $ac1, zero, zero
- ins t5, t6, 16, 16
- ins t7, t8, 16, 16
- dpa.w.ph $ac0, t5, s0
- dpa.w.ph $ac0, t7, s1
- dpa.w.ph $ac1, t5, s2
- dpa.w.ph $ac1, t7, s3
- mflo t5, $ac0
- mflo t6, $ac1
- addiu t9, t9, -1
- addiu t0, t0, 2
- addiu a0, a0, 2
- addiu t1, t1, 4
- addu s5, t4, t5
- subu s4, t4, t5
- addu s6, t3, t6
- subu s7, t3, t6
- shra_r.w s5, s5, 12 // DESCALE(tmp12 + temp1, 12)
- shra_r.w s4, s4, 12 // DESCALE(tmp12 - temp1, 12)
- shra_r.w s6, s6, 12 // DESCALE(tmp10 + temp2, 12)
- shra_r.w s7, s7, 12 // DESCALE(tmp10 - temp2, 12)
- sw s5, 32(t1)
- sw s4, 64(t1)
- sw s6, 0(t1)
- bgtz t9, 1b
- sw s7, 96(t1)
- move t1, v1
- li s4, 15137
- lw s6, 8(t1) // wsptr[2]
- li s5, 6270
- lw s7, 24(t1) // wsptr[6]
- mul s4, s4, s6 // MULTIPLY((JLONG) wsptr[2], FIX_1_847759065)
- lw t2, 0(t1) // wsptr[0]
- mul s5, s5, s7 // MULTIPLY((JLONG) wsptr[6], - FIX_0_765366865)
- lh t5, 28(t1) // wsptr[7]
- lh t6, 20(t1) // wsptr[5]
- lh t7, 12(t1) // wsptr[3]
- lh t8, 4(t1) // wsptr[1]
- ins t5, t6, 16, 16
- ins t7, t8, 16, 16
- mult $ac0, zero, zero
- dpa.w.ph $ac0, t5, s0
- dpa.w.ph $ac0, t7, s1
- mult $ac1, zero, zero
- dpa.w.ph $ac1, t5, s2
- dpa.w.ph $ac1, t7, s3
- sll t2, t2, 14 // tmp0 = ((JLONG) wsptr[0]) << (CONST_BITS+1)
- mflo s6, $ac0
- // MULTIPLY(wsptr[2], FIX_1_847759065) + MULTIPLY(wsptr[6], -FIX_0_765366865)
- subu s4, s4, s5
- addu t3, t2, s4 // tmp10 = tmp0 + z2
- mflo s7, $ac1
- subu t4, t2, s4 // tmp12 = tmp0 - z2
- addu t7, t4, s6
- subu t8, t4, s6
- addu t5, t3, s7
- subu t6, t3, s7
- shra_r.w t5, t5, 19 // DESCALE(tmp10 + temp2, 19)
- shra_r.w t6, t6, 19 // DESCALE(tmp10 - temp2, 19)
- shra_r.w t7, t7, 19 // DESCALE(tmp12 + temp1, 19)
- shra_r.w t8, t8, 19 // DESCALE(tmp12 - temp1, 19)
- sll s4, t9, 2
- lw v0, 0(a2) // output_buf[ctr]
- shll_s.w t5, t5, 24
- shll_s.w t6, t6, 24
- shll_s.w t7, t7, 24
- shll_s.w t8, t8, 24
- sra t5, t5, 24
- sra t6, t6, 24
- sra t7, t7, 24
- sra t8, t8, 24
- addu v0, v0, a3 // outptr = output_buf[ctr] + output_col
- addiu t5, t5, 128
- addiu t6, t6, 128
- addiu t7, t7, 128
- addiu t8, t8, 128
- sb t5, 0(v0)
- sb t7, 1(v0)
- sb t8, 2(v0)
- sb t6, 3(v0)
- // 2
- li s4, 15137
- lw s6, 40(t1) // wsptr[2]
- li s5, 6270
- lw s7, 56(t1) // wsptr[6]
- mul s4, s4, s6 // MULTIPLY((JLONG) wsptr[2], FIX_1_847759065)
- lw t2, 32(t1) // wsptr[0]
- mul s5, s5, s7 // MULTIPLY((JLONG) wsptr[6], - FIX_0_765366865)
- lh t5, 60(t1) // wsptr[7]
- lh t6, 52(t1) // wsptr[5]
- lh t7, 44(t1) // wsptr[3]
- lh t8, 36(t1) // wsptr[1]
- ins t5, t6, 16, 16
- ins t7, t8, 16, 16
- mult $ac0, zero, zero
- dpa.w.ph $ac0, t5, s0
- dpa.w.ph $ac0, t7, s1
- mult $ac1, zero, zero
- dpa.w.ph $ac1, t5, s2
- dpa.w.ph $ac1, t7, s3
- sll t2, t2, 14 // tmp0 = ((JLONG) wsptr[0]) << (CONST_BITS+1)
- mflo s6, $ac0
- // MULTIPLY(wsptr[2], FIX_1_847759065) + MULTIPLY(wsptr[6], -FIX_0_765366865)
- subu s4, s4, s5
- addu t3, t2, s4 // tmp10 = tmp0 + z2
- mflo s7, $ac1
- subu t4, t2, s4 // tmp12 = tmp0 - z2
- addu t7, t4, s6
- subu t8, t4, s6
- addu t5, t3, s7
- subu t6, t3, s7
- shra_r.w t5, t5, 19 // DESCALE(tmp10 + temp2, CONST_BITS-PASS1_BITS+1)
- shra_r.w t6, t6, 19 // DESCALE(tmp10 - temp2, CONST_BITS-PASS1_BITS+1)
- shra_r.w t7, t7, 19 // DESCALE(tmp12 + temp1, CONST_BITS-PASS1_BITS+1)
- shra_r.w t8, t8, 19 // DESCALE(tmp12 - temp1, CONST_BITS-PASS1_BITS+1)
- sll s4, t9, 2
- lw v0, 4(a2) // output_buf[ctr]
- shll_s.w t5, t5, 24
- shll_s.w t6, t6, 24
- shll_s.w t7, t7, 24
- shll_s.w t8, t8, 24
- sra t5, t5, 24
- sra t6, t6, 24
- sra t7, t7, 24
- sra t8, t8, 24
- addu v0, v0, a3 // outptr = output_buf[ctr] + output_col
- addiu t5, t5, 128
- addiu t6, t6, 128
- addiu t7, t7, 128
- addiu t8, t8, 128
- sb t5, 0(v0)
- sb t7, 1(v0)
- sb t8, 2(v0)
- sb t6, 3(v0)
- // 3
- li s4, 15137
- lw s6, 72(t1) // wsptr[2]
- li s5, 6270
- lw s7, 88(t1) // wsptr[6]
- mul s4, s4, s6 // MULTIPLY((JLONG) wsptr[2], FIX_1_847759065)
- lw t2, 64(t1) // wsptr[0]
- mul s5, s5, s7 // MULTIPLY((JLONG) wsptr[6], - FIX_0_765366865)
- lh t5, 92(t1) // wsptr[7]
- lh t6, 84(t1) // wsptr[5]
- lh t7, 76(t1) // wsptr[3]
- lh t8, 68(t1) // wsptr[1]
- ins t5, t6, 16, 16
- ins t7, t8, 16, 16
- mult $ac0, zero, zero
- dpa.w.ph $ac0, t5, s0
- dpa.w.ph $ac0, t7, s1
- mult $ac1, zero, zero
- dpa.w.ph $ac1, t5, s2
- dpa.w.ph $ac1, t7, s3
- sll t2, t2, 14 // tmp0 = ((JLONG) wsptr[0]) << (CONST_BITS+1)
- mflo s6, $ac0
- // MULTIPLY(wsptr[2], FIX_1_847759065) + MULTIPLY(wsptr[6], -FIX_0_765366865)
- subu s4, s4, s5
- addu t3, t2, s4 // tmp10 = tmp0 + z2
- mflo s7, $ac1
- subu t4, t2, s4 // tmp12 = tmp0 - z2
- addu t7, t4, s6
- subu t8, t4, s6
- addu t5, t3, s7
- subu t6, t3, s7
- shra_r.w t5, t5, 19 // DESCALE(tmp10 + temp2, 19)
- shra_r.w t6, t6, 19 // DESCALE(tmp10 - temp2, 19)
- shra_r.w t7, t7, 19 // DESCALE(tmp12 + temp1, 19)
- shra_r.w t8, t8, 19 // DESCALE(tmp12 - temp1, 19)
- sll s4, t9, 2
- lw v0, 8(a2) // output_buf[ctr]
- shll_s.w t5, t5, 24
- shll_s.w t6, t6, 24
- shll_s.w t7, t7, 24
- shll_s.w t8, t8, 24
- sra t5, t5, 24
- sra t6, t6, 24
- sra t7, t7, 24
- sra t8, t8, 24
- addu v0, v0, a3 // outptr = output_buf[ctr] + output_col
- addiu t5, t5, 128
- addiu t6, t6, 128
- addiu t7, t7, 128
- addiu t8, t8, 128
- sb t5, 0(v0)
- sb t7, 1(v0)
- sb t8, 2(v0)
- sb t6, 3(v0)
- li s4, 15137
- lw s6, 104(t1) // wsptr[2]
- li s5, 6270
- lw s7, 120(t1) // wsptr[6]
- mul s4, s4, s6 // MULTIPLY((JLONG) wsptr[2], FIX_1_847759065)
- lw t2, 96(t1) // wsptr[0]
- mul s5, s5, s7 // MULTIPLY((JLONG) wsptr[6], -FIX_0_765366865)
- lh t5, 124(t1) // wsptr[7]
- lh t6, 116(t1) // wsptr[5]
- lh t7, 108(t1) // wsptr[3]
- lh t8, 100(t1) // wsptr[1]
- ins t5, t6, 16, 16
- ins t7, t8, 16, 16
- mult $ac0, zero, zero
- dpa.w.ph $ac0, t5, s0
- dpa.w.ph $ac0, t7, s1
- mult $ac1, zero, zero
- dpa.w.ph $ac1, t5, s2
- dpa.w.ph $ac1, t7, s3
- sll t2, t2, 14 // tmp0 = ((JLONG) wsptr[0]) << (CONST_BITS+1)
- mflo s6, $ac0
- // MULTIPLY(wsptr[2], FIX_1_847759065) + MULTIPLY(wsptr[6], -FIX_0_765366865)
- subu s4, s4, s5
- addu t3, t2, s4 // tmp10 = tmp0 + z2;
- mflo s7, $ac1
- subu t4, t2, s4 // tmp12 = tmp0 - z2;
- addu t7, t4, s6
- subu t8, t4, s6
- addu t5, t3, s7
- subu t6, t3, s7
- shra_r.w t5, t5, 19 // DESCALE(tmp10 + temp2, 19)
- shra_r.w t6, t6, 19 // DESCALE(tmp10 - temp2, 19)
- shra_r.w t7, t7, 19 // DESCALE(tmp12 + temp1, 19)
- shra_r.w t8, t8, 19 // DESCALE(tmp12 - temp1, 19)
- sll s4, t9, 2
- lw v0, 12(a2) // output_buf[ctr]
- shll_s.w t5, t5, 24
- shll_s.w t6, t6, 24
- shll_s.w t7, t7, 24
- shll_s.w t8, t8, 24
- sra t5, t5, 24
- sra t6, t6, 24
- sra t7, t7, 24
- sra t8, t8, 24
- addu v0, v0, a3 // outptr = output_buf[ctr] + output_col
- addiu t5, t5, 128
- addiu t6, t6, 128
- addiu t7, t7, 128
- addiu t8, t8, 128
- sb t5, 0(v0)
- sb t7, 1(v0)
- sb t8, 2(v0)
- sb t6, 3(v0)
-
- RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
- j ra
- nop
-END(jsimd_idct_4x4_mips_dspr2)
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_idct_6x6_mips_dspr2)
-/*
- * a0 - compptr->dct_table
- * a1 - coef_block
- * a2 - output_buf
- * a3 - output_col
- */
- .set at
-
- SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
- addiu sp, sp, -144
- move v0, sp
- addiu v1, v0, 24
- addiu t9, zero, 5793
- addiu s0, zero, 10033
- addiu s1, zero, 2998
-
-1:
- lh s2, 0(a0) // q0 = quantptr[ 0]
- lh s3, 32(a0) // q1 = quantptr[16]
- lh s4, 64(a0) // q2 = quantptr[32]
- lh t2, 64(a1) // tmp2 = inptr[32]
- lh t1, 32(a1) // tmp1 = inptr[16]
- lh t0, 0(a1) // tmp0 = inptr[ 0]
- mul t2, t2, s4 // tmp2 = tmp2 * q2
- mul t1, t1, s3 // tmp1 = tmp1 * q1
- mul t0, t0, s2 // tmp0 = tmp0 * q0
- lh t6, 16(a1) // z1 = inptr[ 8]
- lh t8, 80(a1) // z3 = inptr[40]
- lh t7, 48(a1) // z2 = inptr[24]
- lh s2, 16(a0) // q0 = quantptr[ 8]
- lh s4, 80(a0) // q2 = quantptr[40]
- lh s3, 48(a0) // q1 = quantptr[24]
- mul t2, t2, t9 // tmp2 = tmp2 * 5793
- mul t1, t1, s0 // tmp1 = tmp1 * 10033
- sll t0, t0, 13 // tmp0 = tmp0 << 13
- mul t6, t6, s2 // z1 = z1 * q0
- mul t8, t8, s4 // z3 = z3 * q2
- mul t7, t7, s3 // z2 = z2 * q1
- addu t3, t0, t2 // tmp10 = tmp0 + tmp2
- sll t2, t2, 1 // tmp2 = tmp2 << 1
- subu t4, t0, t2 // tmp11 = tmp0 - tmp2;
- subu t5, t3, t1 // tmp12 = tmp10 - tmp1
- addu t3, t3, t1 // tmp10 = tmp10 + tmp1
- addu t1, t6, t8 // tmp1 = z1 + z3
- mul t1, t1, s1 // tmp1 = tmp1 * 2998
- shra_r.w t4, t4, 11 // tmp11 = (tmp11 + 1024) >> 11
- subu t2, t6, t8 // tmp2 = z1 - z3
- subu t2, t2, t7 // tmp2 = tmp2 - z2
- sll t2, t2, 2 // tmp2 = tmp2 << 2
- addu t0, t6, t7 // tmp0 = z1 + z2
- sll t0, t0, 13 // tmp0 = tmp0 << 13
- subu s2, t8, t7 // q0 = z3 - z2
- sll s2, s2, 13 // q0 = q0 << 13
- addu t0, t0, t1 // tmp0 = tmp0 + tmp1
- addu t1, s2, t1 // tmp1 = q0 + tmp1
- addu s2, t4, t2 // q0 = tmp11 + tmp2
- subu s3, t4, t2 // q1 = tmp11 - tmp2
- addu t6, t3, t0 // z1 = tmp10 + tmp0
- subu t7, t3, t0 // z2 = tmp10 - tmp0
- addu t4, t5, t1 // tmp11 = tmp12 + tmp1
- subu t5, t5, t1 // tmp12 = tmp12 - tmp1
- shra_r.w t6, t6, 11 // z1 = (z1 + 1024) >> 11
- shra_r.w t7, t7, 11 // z2 = (z2 + 1024) >> 11
- shra_r.w t4, t4, 11 // tmp11 = (tmp11 + 1024) >> 11
- shra_r.w t5, t5, 11 // tmp12 = (tmp12 + 1024) >> 11
- sw s2, 24(v0)
- sw s3, 96(v0)
- sw t6, 0(v0)
- sw t7, 120(v0)
- sw t4, 48(v0)
- sw t5, 72(v0)
- addiu v0, v0, 4
- addiu a1, a1, 2
- bne v0, v1, 1b
- addiu a0, a0, 2
-
- /* Pass 2: process 6 rows from work array, store into output array. */
- move v0, sp
- addiu v1, v0, 144
-
-2:
- lw t0, 0(v0)
- lw t2, 16(v0)
- lw s5, 0(a2)
- addiu t0, t0, 16
- sll t0, t0, 13
- mul t3, t2, t9
- lw t6, 4(v0)
- lw t8, 20(v0)
- lw t7, 12(v0)
- addu s5, s5, a3
- addu s6, t6, t8
- mul s6, s6, s1
- addu t1, t0, t3
- subu t4, t0, t3
- subu t4, t4, t3
- lw t3, 8(v0)
- mul t0, t3, s0
- addu s7, t6, t7
- sll s7, s7, 13
- addu s7, s6, s7
- subu t2, t8, t7
- sll t2, t2, 13
- addu t2, s6, t2
- subu s6, t6, t7
- subu s6, s6, t8
- sll s6, s6, 13
- addu t3, t1, t0
- subu t5, t1, t0
- addu t6, t3, s7
- subu t3, t3, s7
- addu t7, t4, s6
- subu t4, t4, s6
- addu t8, t5, t2
- subu t5, t5, t2
- shll_s.w t6, t6, 6
- shll_s.w t3, t3, 6
- shll_s.w t7, t7, 6
- shll_s.w t4, t4, 6
- shll_s.w t8, t8, 6
- shll_s.w t5, t5, 6
- sra t6, t6, 24
- addiu t6, t6, 128
- sra t3, t3, 24
- addiu t3, t3, 128
- sb t6, 0(s5)
- sra t7, t7, 24
- addiu t7, t7, 128
- sb t3, 5(s5)
- sra t4, t4, 24
- addiu t4, t4, 128
- sb t7, 1(s5)
- sra t8, t8, 24
- addiu t8, t8, 128
- sb t4, 4(s5)
- addiu v0, v0, 24
- sra t5, t5, 24
- addiu t5, t5, 128
- sb t8, 2(s5)
- addiu a2, a2, 4
- bne v0, v1, 2b
- sb t5, 3(s5)
-
- addiu sp, sp, 144
-
- RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
- j ra
- nop
-
-END(jsimd_idct_6x6_mips_dspr2)
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_idct_12x12_pass1_mips_dspr2)
-/*
- * a0 - compptr->dct_table
- * a1 - coef_block
- * a2 - workspace
- */
-
- SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
-
- li a3, 8
-
-1:
- // odd part
- lh t0, 48(a1)
- lh t1, 48(a0)
- lh t2, 16(a1)
- lh t3, 16(a0)
- lh t4, 80(a1)
- lh t5, 80(a0)
- lh t6, 112(a1)
- lh t7, 112(a0)
- mul t0, t0, t1 // z2
- mul t1, t2, t3 // z1
- mul t2, t4, t5 // z3
- mul t3, t6, t7 // z4
- li t4, 10703 // FIX(1.306562965)
- li t5, 4433 // FIX_0_541196100
- li t6, 7053 // FIX(0.860918669)
- mul t4, t0, t4 // tmp11
- mul t5, t0, t5 // -tmp14
- addu t7, t1, t2 // tmp10
- addu t8, t7, t3 // tmp10 + z4
- mul t6, t6, t8 // tmp15
- li t8, 2139 // FIX(0.261052384)
- mul t8, t7, t8 // MULTIPLY(tmp10, FIX(0.261052384))
- li t7, 2295 // FIX(0.280143716)
- mul t7, t1, t7 // MULTIPLY(z1, FIX(0.280143716))
- addu t9, t2, t3 // z3 + z4
- li s0, 8565 // FIX(1.045510580)
- mul t9, t9, s0 // -tmp13
- li s0, 12112 // FIX(1.478575242)
- mul s0, t2, s0 // MULTIPLY(z3, FIX(1.478575242))
- li s1, 12998 // FIX(1.586706681)
- mul s1, t3, s1 // MULTIPLY(z4, FIX(1.586706681))
- li s2, 5540 // FIX(0.676326758)
- mul s2, t1, s2 // MULTIPLY(z1, FIX(0.676326758))
- li s3, 16244 // FIX(1.982889723)
- mul s3, t3, s3 // MULTIPLY(z4, FIX(1.982889723))
- subu t1, t1, t3 // z1-=z4
- subu t0, t0, t2 // z2-=z3
- addu t2, t0, t1 // z1+z2
- li t3, 4433 // FIX_0_541196100
- mul t2, t2, t3 // z3
- li t3, 6270 // FIX_0_765366865
- mul t1, t1, t3 // MULTIPLY(z1, FIX_0_765366865)
- li t3, 15137 // FIX_1_847759065
- mul t0, t0, t3 // MULTIPLY(z2, FIX_1_847759065)
- addu t8, t6, t8 // tmp12
- addu t3, t8, t4 // tmp12 + tmp11
- addu t3, t3, t7 // tmp10
- subu t8, t8, t9 // tmp12 + tmp13
- addu s0, t5, s0
- subu t8, t8, s0 // tmp12
- subu t9, t6, t9
- subu s1, s1, t4
- addu t9, t9, s1 // tmp13
- subu t6, t6, t5
- subu t6, t6, s2
- subu t6, t6, s3 // tmp15
- // even part start
- lh t4, 64(a1)
- lh t5, 64(a0)
- lh t7, 32(a1)
- lh s0, 32(a0)
- lh s1, 0(a1)
- lh s2, 0(a0)
- lh s3, 96(a1)
- lh v0, 96(a0)
- mul t4, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*4],quantptr[DCTSIZE*4])
- mul t5, t7, s0 // DEQUANTIZE(inptr[DCTSIZE*2],quantptr[DCTSIZE*2])
- mul t7, s1, s2 // DEQUANTIZE(inptr[DCTSIZE*0],quantptr[DCTSIZE*0])
- mul s0, s3, v0 // DEQUANTIZE(inptr[DCTSIZE*6],quantptr[DCTSIZE*6])
- // odd part end
- addu t1, t2, t1 // tmp11
- subu t0, t2, t0 // tmp14
- // update counter and pointers
- addiu a3, a3, -1
- addiu a0, a0, 2
- addiu a1, a1, 2
- // even part rest
- li s1, 10033
- li s2, 11190
- mul t4, t4, s1 // z4
- mul s1, t5, s2 // z4
- sll t5, t5, 13 // z1
- sll t7, t7, 13
- addiu t7, t7, 1024 // z3
- sll s0, s0, 13 // z2
- addu s2, t7, t4 // tmp10
- subu t4, t7, t4 // tmp11
- subu s3, t5, s0 // tmp12
- addu t2, t7, s3 // tmp21
- subu s3, t7, s3 // tmp24
- addu t7, s1, s0 // tmp12
- addu v0, s2, t7 // tmp20
- subu s2, s2, t7 // tmp25
- subu s1, s1, t5 // z4 - z1
- subu s1, s1, s0 // tmp12
- addu s0, t4, s1 // tmp22
- subu t4, t4, s1 // tmp23
- // final output stage
- addu t5, v0, t3
- subu v0, v0, t3
- addu t3, t2, t1
- subu t2, t2, t1
- addu t1, s0, t8
- subu s0, s0, t8
- addu t8, t4, t9
- subu t4, t4, t9
- addu t9, s3, t0
- subu s3, s3, t0
- addu t0, s2, t6
- subu s2, s2, t6
- sra t5, t5, 11
- sra t3, t3, 11
- sra t1, t1, 11
- sra t8, t8, 11
- sra t9, t9, 11
- sra t0, t0, 11
- sra s2, s2, 11
- sra s3, s3, 11
- sra t4, t4, 11
- sra s0, s0, 11
- sra t2, t2, 11
- sra v0, v0, 11
- sw t5, 0(a2)
- sw t3, 32(a2)
- sw t1, 64(a2)
- sw t8, 96(a2)
- sw t9, 128(a2)
- sw t0, 160(a2)
- sw s2, 192(a2)
- sw s3, 224(a2)
- sw t4, 256(a2)
- sw s0, 288(a2)
- sw t2, 320(a2)
- sw v0, 352(a2)
- bgtz a3, 1b
- addiu a2, a2, 4
-
- RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
-
- j ra
- nop
-
-END(jsimd_idct_12x12_pass1_mips_dspr2)
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_idct_12x12_pass2_mips_dspr2)
-/*
- * a0 - workspace
- * a1 - output
- */
-
- SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
-
- li a3, 12
-
-1:
- // Odd part
- lw t0, 12(a0)
- lw t1, 4(a0)
- lw t2, 20(a0)
- lw t3, 28(a0)
- li t4, 10703 // FIX(1.306562965)
- li t5, 4433 // FIX_0_541196100
- mul t4, t0, t4 // tmp11
- mul t5, t0, t5 // -tmp14
- addu t6, t1, t2 // tmp10
- li t7, 2139 // FIX(0.261052384)
- mul t7, t6, t7 // MULTIPLY(tmp10, FIX(0.261052384))
- addu t6, t6, t3 // tmp10 + z4
- li t8, 7053 // FIX(0.860918669)
- mul t6, t6, t8 // tmp15
- li t8, 2295 // FIX(0.280143716)
- mul t8, t1, t8 // MULTIPLY(z1, FIX(0.280143716))
- addu t9, t2, t3 // z3 + z4
- li s0, 8565 // FIX(1.045510580)
- mul t9, t9, s0 // -tmp13
- li s0, 12112 // FIX(1.478575242)
- mul s0, t2, s0 // MULTIPLY(z3, FIX(1.478575242))
- li s1, 12998 // FIX(1.586706681)
- mul s1, t3, s1 // MULTIPLY(z4, FIX(1.586706681))
- li s2, 5540 // FIX(0.676326758)
- mul s2, t1, s2 // MULTIPLY(z1, FIX(0.676326758))
- li s3, 16244 // FIX(1.982889723)
- mul s3, t3, s3 // MULTIPLY(z4, FIX(1.982889723))
- subu t1, t1, t3 // z1 -= z4
- subu t0, t0, t2 // z2 -= z3
- addu t2, t1, t0 // z1 + z2
- li t3, 4433 // FIX_0_541196100
- mul t2, t2, t3 // z3
- li t3, 6270 // FIX_0_765366865
- mul t1, t1, t3 // MULTIPLY(z1, FIX_0_765366865)
- li t3, 15137 // FIX_1_847759065
- mul t0, t0, t3 // MULTIPLY(z2, FIX_1_847759065)
- addu t3, t6, t7 // tmp12
- addu t7, t3, t4
- addu t7, t7, t8 // tmp10
- subu t3, t3, t9
- subu t3, t3, t5
- subu t3, t3, s0 // tmp12
- subu t9, t6, t9
- subu t9, t9, t4
- addu t9, t9, s1 // tmp13
- subu t6, t6, t5
- subu t6, t6, s2
- subu t6, t6, s3 // tmp15
- addu t1, t2, t1 // tmp11
- subu t0, t2, t0 // tmp14
- // even part
- lw t2, 16(a0) // z4
- lw t4, 8(a0) // z1
- lw t5, 0(a0) // z3
- lw t8, 24(a0) // z2
- li s0, 10033 // FIX(1.224744871)
- li s1, 11190 // FIX(1.366025404)
- mul t2, t2, s0 // z4
- mul s0, t4, s1 // z4
- addiu t5, t5, 0x10
- sll t5, t5, 13 // z3
- sll t4, t4, 13 // z1
- sll t8, t8, 13 // z2
- subu s1, t4, t8 // tmp12
- addu s2, t5, t2 // tmp10
- subu t2, t5, t2 // tmp11
- addu s3, t5, s1 // tmp21
- subu s1, t5, s1 // tmp24
- addu t5, s0, t8 // tmp12
- addu v0, s2, t5 // tmp20
- subu t5, s2, t5 // tmp25
- subu t4, s0, t4
- subu t4, t4, t8 // tmp12
- addu t8, t2, t4 // tmp22
- subu t2, t2, t4 // tmp23
- // increment counter and pointers
- addiu a3, a3, -1
- addiu a0, a0, 32
- // Final stage
- addu t4, v0, t7
- subu v0, v0, t7
- addu t7, s3, t1
- subu s3, s3, t1
- addu t1, t8, t3
- subu t8, t8, t3
- addu t3, t2, t9
- subu t2, t2, t9
- addu t9, s1, t0
- subu s1, s1, t0
- addu t0, t5, t6
- subu t5, t5, t6
- sll t4, t4, 4
- sll t7, t7, 4
- sll t1, t1, 4
- sll t3, t3, 4
- sll t9, t9, 4
- sll t0, t0, 4
- sll t5, t5, 4
- sll s1, s1, 4
- sll t2, t2, 4
- sll t8, t8, 4
- sll s3, s3, 4
- sll v0, v0, 4
- shll_s.w t4, t4, 2
- shll_s.w t7, t7, 2
- shll_s.w t1, t1, 2
- shll_s.w t3, t3, 2
- shll_s.w t9, t9, 2
- shll_s.w t0, t0, 2
- shll_s.w t5, t5, 2
- shll_s.w s1, s1, 2
- shll_s.w t2, t2, 2
- shll_s.w t8, t8, 2
- shll_s.w s3, s3, 2
- shll_s.w v0, v0, 2
- srl t4, t4, 24
- srl t7, t7, 24
- srl t1, t1, 24
- srl t3, t3, 24
- srl t9, t9, 24
- srl t0, t0, 24
- srl t5, t5, 24
- srl s1, s1, 24
- srl t2, t2, 24
- srl t8, t8, 24
- srl s3, s3, 24
- srl v0, v0, 24
- lw t6, 0(a1)
- addiu t4, t4, 0x80
- addiu t7, t7, 0x80
- addiu t1, t1, 0x80
- addiu t3, t3, 0x80
- addiu t9, t9, 0x80
- addiu t0, t0, 0x80
- addiu t5, t5, 0x80
- addiu s1, s1, 0x80
- addiu t2, t2, 0x80
- addiu t8, t8, 0x80
- addiu s3, s3, 0x80
- addiu v0, v0, 0x80
- sb t4, 0(t6)
- sb t7, 1(t6)
- sb t1, 2(t6)
- sb t3, 3(t6)
- sb t9, 4(t6)
- sb t0, 5(t6)
- sb t5, 6(t6)
- sb s1, 7(t6)
- sb t2, 8(t6)
- sb t8, 9(t6)
- sb s3, 10(t6)
- sb v0, 11(t6)
- bgtz a3, 1b
- addiu a1, a1, 4
-
- RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
-
- jr ra
- nop
-
-END(jsimd_idct_12x12_pass2_mips_dspr2)
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_convsamp_mips_dspr2)
-/*
- * a0 - sample_data
- * a1 - start_col
- * a2 - workspace
- */
-
- lw t0, 0(a0)
- li t7, 0xff80ff80
- addu t0, t0, a1
- ulw t1, 0(t0)
- ulw t2, 4(t0)
- preceu.ph.qbr t3, t1
- preceu.ph.qbl t4, t1
- lw t0, 4(a0)
- preceu.ph.qbr t5, t2
- preceu.ph.qbl t6, t2
- addu t0, t0, a1
- addu.ph t3, t3, t7
- addu.ph t4, t4, t7
- ulw t1, 0(t0)
- ulw t2, 4(t0)
- addu.ph t5, t5, t7
- addu.ph t6, t6, t7
- usw t3, 0(a2)
- usw t4, 4(a2)
- preceu.ph.qbr t3, t1
- preceu.ph.qbl t4, t1
- usw t5, 8(a2)
- usw t6, 12(a2)
-
- lw t0, 8(a0)
- preceu.ph.qbr t5, t2
- preceu.ph.qbl t6, t2
- addu t0, t0, a1
- addu.ph t3, t3, t7
- addu.ph t4, t4, t7
- ulw t1, 0(t0)
- ulw t2, 4(t0)
- addu.ph t5, t5, t7
- addu.ph t6, t6, t7
- usw t3, 16(a2)
- usw t4, 20(a2)
- preceu.ph.qbr t3, t1
- preceu.ph.qbl t4, t1
- usw t5, 24(a2)
- usw t6, 28(a2)
-
- lw t0, 12(a0)
- preceu.ph.qbr t5, t2
- preceu.ph.qbl t6, t2
- addu t0, t0, a1
- addu.ph t3, t3, t7
- addu.ph t4, t4, t7
- ulw t1, 0(t0)
- ulw t2, 4(t0)
- addu.ph t5, t5, t7
- addu.ph t6, t6, t7
- usw t3, 32(a2)
- usw t4, 36(a2)
- preceu.ph.qbr t3, t1
- preceu.ph.qbl t4, t1
- usw t5, 40(a2)
- usw t6, 44(a2)
-
- lw t0, 16(a0)
- preceu.ph.qbr t5, t2
- preceu.ph.qbl t6, t2
- addu t0, t0, a1
- addu.ph t3, t3, t7
- addu.ph t4, t4, t7
- ulw t1, 0(t0)
- ulw t2, 4(t0)
- addu.ph t5, t5, t7
- addu.ph t6, t6, t7
- usw t3, 48(a2)
- usw t4, 52(a2)
- preceu.ph.qbr t3, t1
- preceu.ph.qbl t4, t1
- usw t5, 56(a2)
- usw t6, 60(a2)
-
- lw t0, 20(a0)
- preceu.ph.qbr t5, t2
- preceu.ph.qbl t6, t2
- addu t0, t0, a1
- addu.ph t3, t3, t7
- addu.ph t4, t4, t7
- ulw t1, 0(t0)
- ulw t2, 4(t0)
- addu.ph t5, t5, t7
- addu.ph t6, t6, t7
- usw t3, 64(a2)
- usw t4, 68(a2)
- preceu.ph.qbr t3, t1
- preceu.ph.qbl t4, t1
- usw t5, 72(a2)
- usw t6, 76(a2)
-
- lw t0, 24(a0)
- preceu.ph.qbr t5, t2
- preceu.ph.qbl t6, t2
- addu t0, t0, a1
- addu.ph t3, t3, t7
- addu.ph t4, t4, t7
- ulw t1, 0(t0)
- ulw t2, 4(t0)
- addu.ph t5, t5, t7
- addu.ph t6, t6, t7
- usw t3, 80(a2)
- usw t4, 84(a2)
- preceu.ph.qbr t3, t1
- preceu.ph.qbl t4, t1
- usw t5, 88(a2)
- usw t6, 92(a2)
-
- lw t0, 28(a0)
- preceu.ph.qbr t5, t2
- preceu.ph.qbl t6, t2
- addu t0, t0, a1
- addu.ph t3, t3, t7
- addu.ph t4, t4, t7
- ulw t1, 0(t0)
- ulw t2, 4(t0)
- addu.ph t5, t5, t7
- addu.ph t6, t6, t7
- usw t3, 96(a2)
- usw t4, 100(a2)
- preceu.ph.qbr t3, t1
- preceu.ph.qbl t4, t1
- usw t5, 104(a2)
- usw t6, 108(a2)
- preceu.ph.qbr t5, t2
- preceu.ph.qbl t6, t2
- addu.ph t3, t3, t7
- addu.ph t4, t4, t7
- addu.ph t5, t5, t7
- addu.ph t6, t6, t7
- usw t3, 112(a2)
- usw t4, 116(a2)
- usw t5, 120(a2)
- usw t6, 124(a2)
-
- j ra
- nop
-
-END(jsimd_convsamp_mips_dspr2)
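
jsimd_convsamp_mips_dspr2 above widens 8-bit samples to 16 bits and recenters them around zero; the 0xff80ff80 constant is -128 in each 16-bit lane, so every addu.ph subtracts CENTERJSAMPLE from a pair of samples at once. A plain scalar version for reference (type names as in libjpeg):

  #include <stdint.h>

  typedef uint8_t JSAMPLE;
  typedef int16_t DCTELEM;

  static void convsamp(JSAMPLE *sample_data[8], int start_col,
                       DCTELEM workspace[64]) {
    for (int row = 0; row < 8; row++) {
      const JSAMPLE *p = sample_data[row] + start_col;
      for (int col = 0; col < 8; col++)
        workspace[row * 8 + col] = (DCTELEM)p[col] - 128; /* CENTERJSAMPLE */
    }
  }
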
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_convsamp_float_mips_dspr2)
-/*
- * a0 - sample_data
- * a1 - start_col
- * a2 - workspace
- */
-
- .set at
-
- lw t0, 0(a0)
- addu t0, t0, a1
- lbu t1, 0(t0)
- lbu t2, 1(t0)
- lbu t3, 2(t0)
- lbu t4, 3(t0)
- lbu t5, 4(t0)
- lbu t6, 5(t0)
- lbu t7, 6(t0)
- lbu t8, 7(t0)
- addiu t1, t1, -128
- addiu t2, t2, -128
- addiu t3, t3, -128
- addiu t4, t4, -128
- addiu t5, t5, -128
- addiu t6, t6, -128
- addiu t7, t7, -128
- addiu t8, t8, -128
- mtc1 t1, f2
- mtc1 t2, f4
- mtc1 t3, f6
- mtc1 t4, f8
- mtc1 t5, f10
- mtc1 t6, f12
- mtc1 t7, f14
- mtc1 t8, f16
- cvt.s.w f2, f2
- cvt.s.w f4, f4
- cvt.s.w f6, f6
- cvt.s.w f8, f8
- cvt.s.w f10, f10
- cvt.s.w f12, f12
- cvt.s.w f14, f14
- cvt.s.w f16, f16
- lw t0, 4(a0)
- swc1 f2, 0(a2)
- swc1 f4, 4(a2)
- swc1 f6, 8(a2)
- addu t0, t0, a1
- swc1 f8, 12(a2)
- swc1 f10, 16(a2)
- swc1 f12, 20(a2)
- swc1 f14, 24(a2)
- swc1 f16, 28(a2)
- // element row 1
- lbu t1, 0(t0)
- lbu t2, 1(t0)
- lbu t3, 2(t0)
- lbu t4, 3(t0)
- lbu t5, 4(t0)
- lbu t6, 5(t0)
- lbu t7, 6(t0)
- lbu t8, 7(t0)
- addiu t1, t1, -128
- addiu t2, t2, -128
- addiu t3, t3, -128
- addiu t4, t4, -128
- addiu t5, t5, -128
- addiu t6, t6, -128
- addiu t7, t7, -128
- addiu t8, t8, -128
- mtc1 t1, f2
- mtc1 t2, f4
- mtc1 t3, f6
- mtc1 t4, f8
- mtc1 t5, f10
- mtc1 t6, f12
- mtc1 t7, f14
- mtc1 t8, f16
- cvt.s.w f2, f2
- cvt.s.w f4, f4
- cvt.s.w f6, f6
- cvt.s.w f8, f8
- cvt.s.w f10, f10
- cvt.s.w f12, f12
- cvt.s.w f14, f14
- cvt.s.w f16, f16
- lw t0, 8(a0)
- swc1 f2, 32(a2)
- swc1 f4, 36(a2)
- swc1 f6, 40(a2)
- addu t0, t0, a1
- swc1 f8, 44(a2)
- swc1 f10, 48(a2)
- swc1 f12, 52(a2)
- swc1 f14, 56(a2)
- swc1 f16, 60(a2)
- // element row 2
- lbu t1, 0(t0)
- lbu t2, 1(t0)
- lbu t3, 2(t0)
- lbu t4, 3(t0)
- lbu t5, 4(t0)
- lbu t6, 5(t0)
- lbu t7, 6(t0)
- lbu t8, 7(t0)
- addiu t1, t1, -128
- addiu t2, t2, -128
- addiu t3, t3, -128
- addiu t4, t4, -128
- addiu t5, t5, -128
- addiu t6, t6, -128
- addiu t7, t7, -128
- addiu t8, t8, -128
- mtc1 t1, f2
- mtc1 t2, f4
- mtc1 t3, f6
- mtc1 t4, f8
- mtc1 t5, f10
- mtc1 t6, f12
- mtc1 t7, f14
- mtc1 t8, f16
- cvt.s.w f2, f2
- cvt.s.w f4, f4
- cvt.s.w f6, f6
- cvt.s.w f8, f8
- cvt.s.w f10, f10
- cvt.s.w f12, f12
- cvt.s.w f14, f14
- cvt.s.w f16, f16
- lw t0, 12(a0)
- swc1 f2, 64(a2)
- swc1 f4, 68(a2)
- swc1 f6, 72(a2)
- addu t0, t0, a1
- swc1 f8, 76(a2)
- swc1 f10, 80(a2)
- swc1 f12, 84(a2)
- swc1 f14, 88(a2)
- swc1 f16, 92(a2)
- // element row 3
- lbu t1, 0(t0)
- lbu t2, 1(t0)
- lbu t3, 2(t0)
- lbu t4, 3(t0)
- lbu t5, 4(t0)
- lbu t6, 5(t0)
- lbu t7, 6(t0)
- lbu t8, 7(t0)
- addiu t1, t1, -128
- addiu t2, t2, -128
- addiu t3, t3, -128
- addiu t4, t4, -128
- addiu t5, t5, -128
- addiu t6, t6, -128
- addiu t7, t7, -128
- addiu t8, t8, -128
- mtc1 t1, f2
- mtc1 t2, f4
- mtc1 t3, f6
- mtc1 t4, f8
- mtc1 t5, f10
- mtc1 t6, f12
- mtc1 t7, f14
- mtc1 t8, f16
- cvt.s.w f2, f2
- cvt.s.w f4, f4
- cvt.s.w f6, f6
- cvt.s.w f8, f8
- cvt.s.w f10, f10
- cvt.s.w f12, f12
- cvt.s.w f14, f14
- cvt.s.w f16, f16
- lw t0, 16(a0)
- swc1 f2, 96(a2)
- swc1 f4, 100(a2)
- swc1 f6, 104(a2)
- addu t0, t0, a1
- swc1 f8, 108(a2)
- swc1 f10, 112(a2)
- swc1 f12, 116(a2)
- swc1 f14, 120(a2)
- swc1 f16, 124(a2)
- // element row 4
- lbu t1, 0(t0)
- lbu t2, 1(t0)
- lbu t3, 2(t0)
- lbu t4, 3(t0)
- lbu t5, 4(t0)
- lbu t6, 5(t0)
- lbu t7, 6(t0)
- lbu t8, 7(t0)
- addiu t1, t1, -128
- addiu t2, t2, -128
- addiu t3, t3, -128
- addiu t4, t4, -128
- addiu t5, t5, -128
- addiu t6, t6, -128
- addiu t7, t7, -128
- addiu t8, t8, -128
- mtc1 t1, f2
- mtc1 t2, f4
- mtc1 t3, f6
- mtc1 t4, f8
- mtc1 t5, f10
- mtc1 t6, f12
- mtc1 t7, f14
- mtc1 t8, f16
- cvt.s.w f2, f2
- cvt.s.w f4, f4
- cvt.s.w f6, f6
- cvt.s.w f8, f8
- cvt.s.w f10, f10
- cvt.s.w f12, f12
- cvt.s.w f14, f14
- cvt.s.w f16, f16
- lw t0, 20(a0)
- swc1 f2, 128(a2)
- swc1 f4, 132(a2)
- swc1 f6, 136(a2)
- addu t0, t0, a1
- swc1 f8, 140(a2)
- swc1 f10, 144(a2)
- swc1 f12, 148(a2)
- swc1 f14, 152(a2)
- swc1 f16, 156(a2)
- // element row 5
- lbu t1, 0(t0)
- lbu t2, 1(t0)
- lbu t3, 2(t0)
- lbu t4, 3(t0)
- lbu t5, 4(t0)
- lbu t6, 5(t0)
- lbu t7, 6(t0)
- lbu t8, 7(t0)
- addiu t1, t1, -128
- addiu t2, t2, -128
- addiu t3, t3, -128
- addiu t4, t4, -128
- addiu t5, t5, -128
- addiu t6, t6, -128
- addiu t7, t7, -128
- addiu t8, t8, -128
- mtc1 t1, f2
- mtc1 t2, f4
- mtc1 t3, f6
- mtc1 t4, f8
- mtc1 t5, f10
- mtc1 t6, f12
- mtc1 t7, f14
- mtc1 t8, f16
- cvt.s.w f2, f2
- cvt.s.w f4, f4
- cvt.s.w f6, f6
- cvt.s.w f8, f8
- cvt.s.w f10, f10
- cvt.s.w f12, f12
- cvt.s.w f14, f14
- cvt.s.w f16, f16
- lw t0, 24(a0)
- swc1 f2, 160(a2)
- swc1 f4, 164(a2)
- swc1 f6, 168(a2)
- addu t0, t0, a1
- swc1 f8, 172(a2)
- swc1 f10, 176(a2)
- swc1 f12, 180(a2)
- swc1 f14, 184(a2)
- swc1 f16, 188(a2)
- // element row 6
- lbu t1, 0(t0)
- lbu t2, 1(t0)
- lbu t3, 2(t0)
- lbu t4, 3(t0)
- lbu t5, 4(t0)
- lbu t6, 5(t0)
- lbu t7, 6(t0)
- lbu t8, 7(t0)
- addiu t1, t1, -128
- addiu t2, t2, -128
- addiu t3, t3, -128
- addiu t4, t4, -128
- addiu t5, t5, -128
- addiu t6, t6, -128
- addiu t7, t7, -128
- addiu t8, t8, -128
- mtc1 t1, f2
- mtc1 t2, f4
- mtc1 t3, f6
- mtc1 t4, f8
- mtc1 t5, f10
- mtc1 t6, f12
- mtc1 t7, f14
- mtc1 t8, f16
- cvt.s.w f2, f2
- cvt.s.w f4, f4
- cvt.s.w f6, f6
- cvt.s.w f8, f8
- cvt.s.w f10, f10
- cvt.s.w f12, f12
- cvt.s.w f14, f14
- cvt.s.w f16, f16
- lw t0, 28(a0)
- swc1 f2, 192(a2)
- swc1 f4, 196(a2)
- swc1 f6, 200(a2)
- addu t0, t0, a1
- swc1 f8, 204(a2)
- swc1 f10, 208(a2)
- swc1 f12, 212(a2)
- swc1 f14, 216(a2)
- swc1 f16, 220(a2)
- // element row 7
- lbu t1, 0(t0)
- lbu t2, 1(t0)
- lbu t3, 2(t0)
- lbu t4, 3(t0)
- lbu t5, 4(t0)
- lbu t6, 5(t0)
- lbu t7, 6(t0)
- lbu t8, 7(t0)
- addiu t1, t1, -128
- addiu t2, t2, -128
- addiu t3, t3, -128
- addiu t4, t4, -128
- addiu t5, t5, -128
- addiu t6, t6, -128
- addiu t7, t7, -128
- addiu t8, t8, -128
- mtc1 t1, f2
- mtc1 t2, f4
- mtc1 t3, f6
- mtc1 t4, f8
- mtc1 t5, f10
- mtc1 t6, f12
- mtc1 t7, f14
- mtc1 t8, f16
- cvt.s.w f2, f2
- cvt.s.w f4, f4
- cvt.s.w f6, f6
- cvt.s.w f8, f8
- cvt.s.w f10, f10
- cvt.s.w f12, f12
- cvt.s.w f14, f14
- cvt.s.w f16, f16
- swc1 f2, 224(a2)
- swc1 f4, 228(a2)
- swc1 f6, 232(a2)
- swc1 f8, 236(a2)
- swc1 f10, 240(a2)
- swc1 f12, 244(a2)
- swc1 f14, 248(a2)
- swc1 f16, 252(a2)
-
- j ra
- nop // branch delay slot
-
-END(jsimd_convsamp_float_mips_dspr2)
-
-/*****************************************************************************/
-
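The DSPr2 routine removed above is the float sample-conversion step: for each of the eight block rows it fetches a row pointer, loads eight unsigned samples, re-centers them around zero (lbu / addiu -128), widens them to single precision (mtc1 / cvt.s.w), and stores them into the 8x8 float workspace. A minimal scalar C sketch of the same transform follows; the function name and row-pointer type are illustrative, not libjpeg-turbo API:

    /* Center 8-bit samples around zero and widen to float, one 8x8 block. */
    void convsamp_float_scalar(const unsigned char *rows[8],
                               unsigned int start_col, float *workspace)
    {
      for (int r = 0; r < 8; r++) {
        const unsigned char *p = rows[r] + start_col;  /* lw + addu in the asm */
        for (int c = 0; c < 8; c++)
          *workspace++ = (float)(p[c] - 128);          /* lbu, addiu, cvt.s.w */
      }
    }
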
diff --git a/media/libjpeg/simd/jsimd_mips_dspr2_asm.h b/media/libjpeg/simd/jsimd_mips_dspr2_asm.h
deleted file mode 100644
index 64f9880482..0000000000
--- a/media/libjpeg/simd/jsimd_mips_dspr2_asm.h
+++ /dev/null
@@ -1,285 +0,0 @@
-/*
- * MIPS DSPr2 optimizations for libjpeg-turbo
- *
- * Copyright (C) 2013, MIPS Technologies, Inc., California.
- * All Rights Reserved.
- * Authors: Teodora Novkovic (teodora.novkovic@imgtec.com)
- * Darko Laus (darko.laus@imgtec.com)
- * This software is provided 'as-is', without any express or implied
- * warranty. In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- * claim that you wrote the original software. If you use this software
- * in a product, an acknowledgment in the product documentation would be
- * appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- * misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-#define zero $0
-#define AT $1
-#define v0 $2
-#define v1 $3
-#define a0 $4
-#define a1 $5
-#define a2 $6
-#define a3 $7
-#define t0 $8
-#define t1 $9
-#define t2 $10
-#define t3 $11
-#define t4 $12
-#define t5 $13
-#define t6 $14
-#define t7 $15
-#define s0 $16
-#define s1 $17
-#define s2 $18
-#define s3 $19
-#define s4 $20
-#define s5 $21
-#define s6 $22
-#define s7 $23
-#define t8 $24
-#define t9 $25
-#define k0 $26
-#define k1 $27
-#define gp $28
-#define sp $29
-#define fp $30
-#define s8 $30
-#define ra $31
-
-#define f0 $f0
-#define f1 $f1
-#define f2 $f2
-#define f3 $f3
-#define f4 $f4
-#define f5 $f5
-#define f6 $f6
-#define f7 $f7
-#define f8 $f8
-#define f9 $f9
-#define f10 $f10
-#define f11 $f11
-#define f12 $f12
-#define f13 $f13
-#define f14 $f14
-#define f15 $f15
-#define f16 $f16
-#define f17 $f17
-#define f18 $f18
-#define f19 $f19
-#define f20 $f20
-#define f21 $f21
-#define f22 $f22
-#define f23 $f23
-#define f24 $f24
-#define f25 $f25
-#define f26 $f26
-#define f27 $f27
-#define f28 $f28
-#define f29 $f29
-#define f30 $f30
-#define f31 $f31
-
-/*
- * LEAF_MIPS32R2 - declare leaf routine for MIPS32r2
- */
-#define LEAF_MIPS32R2(symbol) \
- .globl symbol; \
- .align 2; \
- .type symbol, @function; \
- .ent symbol, 0; \
-symbol: .frame sp, 0, ra; \
- .set push; \
- .set arch=mips32r2; \
- .set noreorder; \
- .set noat;
-
-/*
- * LEAF_MIPS_DSPR2 - declare leaf routine for MIPS DSPr2
- */
-#define LEAF_MIPS_DSPR2(symbol) \
-LEAF_MIPS32R2(symbol) \
- .set dspr2;
-
-/*
- * END - mark end of function
- */
-#define END(function) \
- .set pop; \
- .end function; \
- .size function,.-function
-
-/*
- * Checks whether the stack offset is big enough for storing/restoring
- * regs_num registers to/from the stack. The stack offset must be greater
- * than or equal to the number of bytes needed to store the registers
- * (regs_num * 4). Since the MIPS ABI allows use of the first 16 bytes of
- * the stack frame (reserved for the function's input arguments, which are
- * already in a0-a3), the stack size can be reduced by reusing this space.
- */
-.macro CHECK_STACK_OFFSET regs_num, stack_offset
-.if \stack_offset < \regs_num * 4 - 16
-.error "Stack offset too small."
-.endif
-.endm
-
-/*
- * Saves a set of registers on the stack. The maximum number of registers
- * that can be saved is limited to 14 (a0-a3, v0-v1 and s0-s7). The stack
- * offset is the number of bytes subtracted from the stack pointer (sp)
- * before the registers are pushed, to provide enough space on the stack
- * (the offset must be a multiple of 4 and big enough, as described by the
- * CHECK_STACK_OFFSET macro). This macro is intended to be used in
- * combination with RESTORE_REGS_FROM_STACK macro. Example:
- * SAVE_REGS_ON_STACK 4, v0, v1, s0, s1
- * RESTORE_REGS_FROM_STACK 4, v0, v1, s0, s1
- */
-.macro SAVE_REGS_ON_STACK stack_offset = 0, r1, \
- r2 = 0, r3 = 0, r4 = 0, \
- r5 = 0, r6 = 0, r7 = 0, \
- r8 = 0, r9 = 0, r10 = 0, \
- r11 = 0, r12 = 0, r13 = 0, \
- r14 = 0
- .if (\stack_offset < 0) || (\stack_offset - (\stack_offset / 4) * 4)
- .error "Stack offset must be positive and a multiple of 4."
- .endif
- .if \stack_offset != 0
- addiu sp, sp, -\stack_offset
- .endif
- sw \r1, 0(sp)
- .if \r2 != 0
- sw \r2, 4(sp)
- .endif
- .if \r3 != 0
- sw \r3, 8(sp)
- .endif
- .if \r4 != 0
- sw \r4, 12(sp)
- .endif
- .if \r5 != 0
- CHECK_STACK_OFFSET 5, \stack_offset
- sw \r5, 16(sp)
- .endif
- .if \r6 != 0
- CHECK_STACK_OFFSET 6, \stack_offset
- sw \r6, 20(sp)
- .endif
- .if \r7 != 0
- CHECK_STACK_OFFSET 7, \stack_offset
- sw \r7, 24(sp)
- .endif
- .if \r8 != 0
- CHECK_STACK_OFFSET 8, \stack_offset
- sw \r8, 28(sp)
- .endif
- .if \r9 != 0
- CHECK_STACK_OFFSET 9, \stack_offset
- sw \r9, 32(sp)
- .endif
- .if \r10 != 0
- CHECK_STACK_OFFSET 10, \stack_offset
- sw \r10, 36(sp)
- .endif
- .if \r11 != 0
- CHECK_STACK_OFFSET 11, \stack_offset
- sw \r11, 40(sp)
- .endif
- .if \r12 != 0
- CHECK_STACK_OFFSET 12, \stack_offset
- sw \r12, 44(sp)
- .endif
- .if \r13 != 0
- CHECK_STACK_OFFSET 13, \stack_offset
- sw \r13, 48(sp)
- .endif
- .if \r14 != 0
- CHECK_STACK_OFFSET 14, \stack_offset
- sw \r14, 52(sp)
- .endif
-.endm
-
-/*
- * Restores a set of registers from the stack. The maximum number of
- * registers that can be restored is limited to 14 (a0-a3, v0-v1 and s0-s7).
- * The stack offset is the number of bytes added back to the stack pointer
- * (sp) after the registers are restored (the offset must be a multiple of 4
- * and big enough, as described by the CHECK_STACK_OFFSET macro). This macro
- * is intended to be used in combination with the SAVE_REGS_ON_STACK macro.
- * Example:
- * SAVE_REGS_ON_STACK 4, v0, v1, s0, s1
- * RESTORE_REGS_FROM_STACK 4, v0, v1, s0, s1
- */
-.macro RESTORE_REGS_FROM_STACK stack_offset = 0, r1, \
- r2 = 0, r3 = 0, r4 = 0, \
- r5 = 0, r6 = 0, r7 = 0, \
- r8 = 0, r9 = 0, r10 = 0, \
- r11 = 0, r12 = 0, r13 = 0, \
- r14 = 0
- .if (\stack_offset < 0) || (\stack_offset - (\stack_offset/4)*4)
- .error "Stack offset must be positive and a multiple of 4."
- .endif
- lw \r1, 0(sp)
- .if \r2 != 0
- lw \r2, 4(sp)
- .endif
- .if \r3 != 0
- lw \r3, 8(sp)
- .endif
- .if \r4 != 0
- lw \r4, 12(sp)
- .endif
- .if \r5 != 0
- CHECK_STACK_OFFSET 5, \stack_offset
- lw \r5, 16(sp)
- .endif
- .if \r6 != 0
- CHECK_STACK_OFFSET 6, \stack_offset
- lw \r6, 20(sp)
- .endif
- .if \r7 != 0
- CHECK_STACK_OFFSET 7, \stack_offset
- lw \r7, 24(sp)
- .endif
- .if \r8 != 0
- CHECK_STACK_OFFSET 8, \stack_offset
- lw \r8, 28(sp)
- .endif
- .if \r9 != 0
- CHECK_STACK_OFFSET 9, \stack_offset
- lw \r9, 32(sp)
- .endif
- .if \r10 != 0
- CHECK_STACK_OFFSET 10, \stack_offset
- lw \r10, 36(sp)
- .endif
- .if \r11 != 0
- CHECK_STACK_OFFSET 11, \stack_offset
- lw \r11, 40(sp)
- .endif
- .if \r12 != 0
- CHECK_STACK_OFFSET 12, \stack_offset
- lw \r12, 44(sp)
- .endif
- .if \r13 != 0
- CHECK_STACK_OFFSET 13, \stack_offset
- lw \r13, 48(sp)
- .endif
- .if \r14 != 0
- CHECK_STACK_OFFSET 14, \stack_offset
- lw \r14, 52(sp)
- .endif
- .if \stack_offset != 0
- addiu sp, sp, \stack_offset
- .endif
-.endm
-
-
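The save/restore pair above leans on one o32 ABI detail: the caller always reserves a 16-byte argument save area at its stack pointer, so a routine may spill its first four registers there without growing its own frame, and only needs new stack space for the rest. That is exactly the inequality CHECK_STACK_OFFSET enforces; here is a C sketch of the check, with a worked example in the comment:

    /* stack_offset bytes of new frame plus the 16-byte o32 argument area
     * must cover regs_num 4-byte slots.  E.g. saving 6 registers needs
     * 24 bytes, 16 of which the argument area covers, so stack_offset = 8
     * is the smallest value that passes. */
    int stack_offset_ok(int regs_num, int stack_offset)
    {
      return stack_offset >= regs_num * 4 - 16;
    }
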
diff --git a/media/libjpeg/simd/jsimd_powerpc.c b/media/libjpeg/simd/jsimd_powerpc.c
deleted file mode 100644
index 42dc1e0868..0000000000
--- a/media/libjpeg/simd/jsimd_powerpc.c
+++ /dev/null
@@ -1,828 +0,0 @@
-/*
- * jsimd_powerpc.c
- *
- * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2009-2011, 2014-2016, D. R. Commander.
- * Copyright (C) 2015, Matthieu Darbois.
- *
- * Based on the x86 SIMD extension for IJG JPEG library,
- * Copyright (C) 1999-2006, MIYASAKA Masaru.
- * For conditions of distribution and use, see copyright notice in jsimdext.inc
- *
- * This file contains the interface between the "normal" portions
- * of the library and the SIMD implementations when running on a
- * PowerPC architecture.
- */
-
-#define JPEG_INTERNALS
-#include "../jinclude.h"
-#include "../jpeglib.h"
-#include "../jsimd.h"
-#include "../jdct.h"
-#include "../jsimddct.h"
-#include "jsimd.h"
-
-#include <stdio.h>
-#include <string.h>
-#include <ctype.h>
-
-static unsigned int simd_support = ~0;
-
-#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
-
-#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT (1024 * 1024)
-
-LOCAL(int)
-check_feature (char *buffer, char *feature)
-{
- char *p;
- if (*feature == 0)
- return 0;
- if (strncmp(buffer, "cpu", 3) != 0)
- return 0;
- buffer += 3;
- while (isspace(*buffer))
- buffer++;
-
- /* Check if 'feature' is present in the buffer as a separate word */
- while ((p = strstr(buffer, feature))) {
- if (p > buffer && !isspace(*(p - 1))) {
- buffer++;
- continue;
- }
- p += strlen(feature);
- if (*p != 0 && !isspace(*p)) {
- buffer++;
- continue;
- }
- return 1;
- }
- return 0;
-}
-
-LOCAL(int)
-parse_proc_cpuinfo (int bufsize)
-{
- char *buffer = (char *)malloc(bufsize);
- FILE *fd;
- simd_support = 0;
-
- if (!buffer)
- return 0;
-
- fd = fopen("/proc/cpuinfo", "r");
- if (fd) {
- while (fgets(buffer, bufsize, fd)) {
- if (!strchr(buffer, '\n') && !feof(fd)) {
- /* "impossible" happened - insufficient size of the buffer! */
- fclose(fd);
- free(buffer);
- return 0;
- }
- if (check_feature(buffer, "altivec"))
- simd_support |= JSIMD_ALTIVEC;
- }
- fclose(fd);
- }
- free(buffer);
- return 1;
-}
-
-#endif
-
-/*
- * Check what SIMD accelerations are supported.
- *
- * FIXME: This code is racy in a multi-threaded environment.
- */
-LOCAL(void)
-init_simd (void)
-{
- char *env = NULL;
-#if !defined(__ALTIVEC__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__))
- int bufsize = 1024; /* an initial guess for the line buffer size limit */
-#endif
-
- if (simd_support != ~0U)
- return;
-
- simd_support = 0;
-
-#if defined(__ALTIVEC__) || defined(__APPLE__)
- simd_support |= JSIMD_ALTIVEC;
-#elif defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
- while (!parse_proc_cpuinfo(bufsize)) {
- bufsize *= 2;
- if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT)
- break;
- }
-#endif
-
- /* Force different settings through environment variables */
- env = getenv("JSIMD_FORCEALTIVEC");
- if ((env != NULL) && (strcmp(env, "1") == 0))
- simd_support = JSIMD_ALTIVEC;
- env = getenv("JSIMD_FORCENONE");
- if ((env != NULL) && (strcmp(env, "1") == 0))
- simd_support = 0;
-}
-
-GLOBAL(int)
-jsimd_can_rgb_ycc (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
- return 0;
-
- if (simd_support & JSIMD_ALTIVEC)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_rgb_gray (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
- return 0;
-
- if (simd_support & JSIMD_ALTIVEC)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_ycc_rgb (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
- return 0;
-
- if (simd_support & JSIMD_ALTIVEC)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_ycc_rgb565 (void)
-{
- return 0;
-}
-
-GLOBAL(void)
-jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
- JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows)
-{
- void (*altivecfct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
-
- switch(cinfo->in_color_space) {
- case JCS_EXT_RGB:
- altivecfct=jsimd_extrgb_ycc_convert_altivec;
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- altivecfct=jsimd_extrgbx_ycc_convert_altivec;
- break;
- case JCS_EXT_BGR:
- altivecfct=jsimd_extbgr_ycc_convert_altivec;
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- altivecfct=jsimd_extbgrx_ycc_convert_altivec;
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- altivecfct=jsimd_extxbgr_ycc_convert_altivec;
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- altivecfct=jsimd_extxrgb_ycc_convert_altivec;
- break;
- default:
- altivecfct=jsimd_rgb_ycc_convert_altivec;
- break;
- }
-
- altivecfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
-}
-
-GLOBAL(void)
-jsimd_rgb_gray_convert (j_compress_ptr cinfo,
- JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows)
-{
- void (*altivecfct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
-
- switch(cinfo->in_color_space) {
- case JCS_EXT_RGB:
- altivecfct=jsimd_extrgb_gray_convert_altivec;
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- altivecfct=jsimd_extrgbx_gray_convert_altivec;
- break;
- case JCS_EXT_BGR:
- altivecfct=jsimd_extbgr_gray_convert_altivec;
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- altivecfct=jsimd_extbgrx_gray_convert_altivec;
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- altivecfct=jsimd_extxbgr_gray_convert_altivec;
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- altivecfct=jsimd_extxrgb_gray_convert_altivec;
- break;
- default:
- altivecfct=jsimd_rgb_gray_convert_altivec;
- break;
- }
-
- altivecfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
-}
-
-GLOBAL(void)
-jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
-{
- void (*altivecfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
-
- switch(cinfo->out_color_space) {
- case JCS_EXT_RGB:
- altivecfct=jsimd_ycc_extrgb_convert_altivec;
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- altivecfct=jsimd_ycc_extrgbx_convert_altivec;
- break;
- case JCS_EXT_BGR:
- altivecfct=jsimd_ycc_extbgr_convert_altivec;
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- altivecfct=jsimd_ycc_extbgrx_convert_altivec;
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- altivecfct=jsimd_ycc_extxbgr_convert_altivec;
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- altivecfct=jsimd_ycc_extxrgb_convert_altivec;
- break;
- default:
- altivecfct=jsimd_ycc_rgb_convert_altivec;
- break;
- }
-
- altivecfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
-}
-
-GLOBAL(void)
-jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
-{
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_downsample (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_ALTIVEC)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_downsample (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_ALTIVEC)
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY output_data)
-{
- jsimd_h2v2_downsample_altivec(cinfo->image_width, cinfo->max_v_samp_factor,
- compptr->v_samp_factor,
- compptr->width_in_blocks,
- input_data, output_data);
-}
-
-GLOBAL(void)
-jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY output_data)
-{
- jsimd_h2v1_downsample_altivec(cinfo->image_width, cinfo->max_v_samp_factor,
- compptr->v_samp_factor,
- compptr->width_in_blocks,
- input_data, output_data);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_upsample (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_ALTIVEC)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_upsample (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_ALTIVEC)
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_upsample (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
-{
- jsimd_h2v2_upsample_altivec(cinfo->max_v_samp_factor, cinfo->output_width,
- input_data, output_data_ptr);
-}
-
-GLOBAL(void)
-jsimd_h2v1_upsample (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
-{
- jsimd_h2v1_upsample_altivec(cinfo->max_v_samp_factor, cinfo->output_width,
- input_data, output_data_ptr);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_fancy_upsample (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_ALTIVEC)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_fancy_upsample (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_ALTIVEC)
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
-{
- jsimd_h2v2_fancy_upsample_altivec(cinfo->max_v_samp_factor,
- compptr->downsampled_width, input_data,
- output_data_ptr);
-}
-
-GLOBAL(void)
-jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
-{
- jsimd_h2v1_fancy_upsample_altivec(cinfo->max_v_samp_factor,
- compptr->downsampled_width, input_data,
- output_data_ptr);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_merged_upsample (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_ALTIVEC)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_merged_upsample (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_ALTIVEC)
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr,
- JSAMPARRAY output_buf)
-{
- void (*altivecfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
-
- switch(cinfo->out_color_space) {
- case JCS_EXT_RGB:
- altivecfct=jsimd_h2v2_extrgb_merged_upsample_altivec;
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- altivecfct=jsimd_h2v2_extrgbx_merged_upsample_altivec;
- break;
- case JCS_EXT_BGR:
- altivecfct=jsimd_h2v2_extbgr_merged_upsample_altivec;
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- altivecfct=jsimd_h2v2_extbgrx_merged_upsample_altivec;
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- altivecfct=jsimd_h2v2_extxbgr_merged_upsample_altivec;
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- altivecfct=jsimd_h2v2_extxrgb_merged_upsample_altivec;
- break;
- default:
- altivecfct=jsimd_h2v2_merged_upsample_altivec;
- break;
- }
-
- altivecfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
-}
-
-GLOBAL(void)
-jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr,
- JSAMPARRAY output_buf)
-{
- void (*altivecfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
-
- switch(cinfo->out_color_space) {
- case JCS_EXT_RGB:
- altivecfct=jsimd_h2v1_extrgb_merged_upsample_altivec;
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- altivecfct=jsimd_h2v1_extrgbx_merged_upsample_altivec;
- break;
- case JCS_EXT_BGR:
- altivecfct=jsimd_h2v1_extbgr_merged_upsample_altivec;
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- altivecfct=jsimd_h2v1_extbgrx_merged_upsample_altivec;
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- altivecfct=jsimd_h2v1_extxbgr_merged_upsample_altivec;
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- altivecfct=jsimd_h2v1_extxrgb_merged_upsample_altivec;
- break;
- default:
- altivecfct=jsimd_h2v1_merged_upsample_altivec;
- break;
- }
-
- altivecfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
-}
-
-GLOBAL(int)
-jsimd_can_convsamp (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(DCTELEM) != 2)
- return 0;
-
- if (simd_support & JSIMD_ALTIVEC)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_convsamp_float (void)
-{
- return 0;
-}
-
-GLOBAL(void)
-jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
- DCTELEM *workspace)
-{
- jsimd_convsamp_altivec(sample_data, start_col, workspace);
-}
-
-GLOBAL(void)
-jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
- FAST_FLOAT *workspace)
-{
-}
-
-GLOBAL(int)
-jsimd_can_fdct_islow (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(DCTELEM) != 2)
- return 0;
-
- if (simd_support & JSIMD_ALTIVEC)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_fdct_ifast (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(DCTELEM) != 2)
- return 0;
-
- if (simd_support & JSIMD_ALTIVEC)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_fdct_float (void)
-{
- return 0;
-}
-
-GLOBAL(void)
-jsimd_fdct_islow (DCTELEM *data)
-{
- jsimd_fdct_islow_altivec(data);
-}
-
-GLOBAL(void)
-jsimd_fdct_ifast (DCTELEM *data)
-{
- jsimd_fdct_ifast_altivec(data);
-}
-
-GLOBAL(void)
-jsimd_fdct_float (FAST_FLOAT *data)
-{
-}
-
-GLOBAL(int)
-jsimd_can_quantize (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (sizeof(DCTELEM) != 2)
- return 0;
-
- if (simd_support & JSIMD_ALTIVEC)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_quantize_float (void)
-{
- return 0;
-}
-
-GLOBAL(void)
-jsimd_quantize (JCOEFPTR coef_block, DCTELEM *divisors,
- DCTELEM *workspace)
-{
- jsimd_quantize_altivec(coef_block, divisors, workspace);
-}
-
-GLOBAL(void)
-jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors,
- FAST_FLOAT *workspace)
-{
-}
-
-GLOBAL(int)
-jsimd_can_idct_2x2 (void)
-{
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_4x4 (void)
-{
- return 0;
-}
-
-GLOBAL(void)
-jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
-}
-
-GLOBAL(void)
-jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
-}
-
-GLOBAL(int)
-jsimd_can_idct_islow (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
-
- if (simd_support & JSIMD_ALTIVEC)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_ifast (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
-
- if (simd_support & JSIMD_ALTIVEC)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_float (void)
-{
- return 0;
-}
-
-GLOBAL(void)
-jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
- jsimd_idct_islow_altivec(compptr->dct_table, coef_block, output_buf,
- output_col);
-}
-
-GLOBAL(void)
-jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
- jsimd_idct_ifast_altivec(compptr->dct_table, coef_block, output_buf,
- output_col);
-}
-
-GLOBAL(void)
-jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
-}
-
-GLOBAL(int)
-jsimd_can_huff_encode_one_block (void)
-{
- return 0;
-}
-
-GLOBAL(JOCTET*)
-jsimd_huff_encode_one_block (void *state, JOCTET *buffer, JCOEFPTR block,
- int last_dc_val, c_derived_tbl *dctbl,
- c_derived_tbl *actbl)
-{
- return NULL;
-}
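Each of these glue files uses the same lazy, probe-once detection scheme: simd_support starts at ~0 as an "unprobed" sentinel, init_simd() fills in real feature bits on first use (hence the racy-but-idempotent FIXME), and the JSIMD_FORCE*/JSIMD_FORCENONE environment variables override the probe. A condensed C sketch of the pattern, with the probing step elided:

    #include <stdlib.h>
    #include <string.h>

    static unsigned int simd_support = ~0u;    /* ~0 means "not probed yet" */

    static void init_simd_sketch(void)
    {
      char *env;

      if (simd_support != ~0u)
        return;                                /* already probed */
      simd_support = 0;
      /* ...query CPU features and set bits here... */
      env = getenv("JSIMD_FORCENONE");         /* environment override wins */
      if (env != NULL && strcmp(env, "1") == 0)
        simd_support = 0;
    }
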
diff --git a/media/libjpeg/simd/jsimd_x86_64.c b/media/libjpeg/simd/jsimd_x86_64.c
deleted file mode 100644
index a62bcdb0ee..0000000000
--- a/media/libjpeg/simd/jsimd_x86_64.c
+++ /dev/null
@@ -1,887 +0,0 @@
-/*
- * jsimd_x86_64.c
- *
- * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2009-2011, 2014, 2016, D. R. Commander.
- * Copyright (C) 2015, Matthieu Darbois.
- *
- * Based on the x86 SIMD extension for IJG JPEG library,
- * Copyright (C) 1999-2006, MIYASAKA Masaru.
- * For conditions of distribution and use, see copyright notice in jsimdext.inc
- *
- * This file contains the interface between the "normal" portions
- * of the library and the SIMD implementations when running on a
- * 64-bit x86 architecture.
- */
-
-#define JPEG_INTERNALS
-#include "../jinclude.h"
-#include "../jpeglib.h"
-#include "../jsimd.h"
-#include "../jdct.h"
-#include "../jsimddct.h"
-#include "jsimd.h"
-
-/*
- * In the PIC cases, we have no guarantee that constants will keep
- * their alignment. This macro allows us to verify it at runtime.
- */
-#define IS_ALIGNED(ptr, order) (((size_t)ptr & ((1 << order) - 1)) == 0)
-
-#define IS_ALIGNED_SSE(ptr) (IS_ALIGNED(ptr, 4)) /* 16 byte alignment */
-
-static unsigned int simd_support = ~0;
-static unsigned int simd_huffman = 1;
-
-/*
- * Check what SIMD accelerations are supported.
- *
- * FIXME: This code is racy in a multi-threaded environment.
- */
-LOCAL(void)
-init_simd (void)
-{
- char *env = NULL;
-
- if (simd_support != ~0U)
- return;
-
- simd_support = JSIMD_SSE2 | JSIMD_SSE;
-
- /* Force different settings through environment variables */
- env = getenv("JSIMD_FORCENONE");
- if ((env != NULL) && (strcmp(env, "1") == 0))
- simd_support = 0;
- env = getenv("JSIMD_NOHUFFENC");
- if ((env != NULL) && (strcmp(env, "1") == 0))
- simd_huffman = 0;
-}
-
-GLOBAL(int)
-jsimd_can_rgb_ycc (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
- return 0;
-
- if ((simd_support & JSIMD_SSE2) &&
- IS_ALIGNED_SSE(jconst_rgb_ycc_convert_sse2))
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_rgb_gray (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
- return 0;
-
- if ((simd_support & JSIMD_SSE2) &&
- IS_ALIGNED_SSE(jconst_rgb_gray_convert_sse2))
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_ycc_rgb (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
- return 0;
-
- if ((simd_support & JSIMD_SSE2) &&
- IS_ALIGNED_SSE(jconst_ycc_rgb_convert_sse2))
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_ycc_rgb565 (void)
-{
- return 0;
-}
-
-GLOBAL(void)
-jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
- JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows)
-{
- void (*sse2fct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
-
- switch(cinfo->in_color_space) {
- case JCS_EXT_RGB:
- sse2fct=jsimd_extrgb_ycc_convert_sse2;
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- sse2fct=jsimd_extrgbx_ycc_convert_sse2;
- break;
- case JCS_EXT_BGR:
- sse2fct=jsimd_extbgr_ycc_convert_sse2;
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- sse2fct=jsimd_extbgrx_ycc_convert_sse2;
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- sse2fct=jsimd_extxbgr_ycc_convert_sse2;
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- sse2fct=jsimd_extxrgb_ycc_convert_sse2;
- break;
- default:
- sse2fct=jsimd_rgb_ycc_convert_sse2;
- break;
- }
-
- sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
-}
-
-GLOBAL(void)
-jsimd_rgb_gray_convert (j_compress_ptr cinfo,
- JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows)
-{
- void (*sse2fct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
-
- switch(cinfo->in_color_space) {
- case JCS_EXT_RGB:
- sse2fct=jsimd_extrgb_gray_convert_sse2;
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- sse2fct=jsimd_extrgbx_gray_convert_sse2;
- break;
- case JCS_EXT_BGR:
- sse2fct=jsimd_extbgr_gray_convert_sse2;
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- sse2fct=jsimd_extbgrx_gray_convert_sse2;
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- sse2fct=jsimd_extxbgr_gray_convert_sse2;
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- sse2fct=jsimd_extxrgb_gray_convert_sse2;
- break;
- default:
- sse2fct=jsimd_rgb_gray_convert_sse2;
- break;
- }
-
- sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
-}
-
-GLOBAL(void)
-jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
-{
- void (*sse2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
-
- switch(cinfo->out_color_space) {
- case JCS_EXT_RGB:
- sse2fct=jsimd_ycc_extrgb_convert_sse2;
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- sse2fct=jsimd_ycc_extrgbx_convert_sse2;
- break;
- case JCS_EXT_BGR:
- sse2fct=jsimd_ycc_extbgr_convert_sse2;
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- sse2fct=jsimd_ycc_extbgrx_convert_sse2;
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- sse2fct=jsimd_ycc_extxbgr_convert_sse2;
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- sse2fct=jsimd_ycc_extxrgb_convert_sse2;
- break;
- default:
- sse2fct=jsimd_ycc_rgb_convert_sse2;
- break;
- }
-
- sse2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
-}
-
-GLOBAL(void)
-jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
-{
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_downsample (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_SSE2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_downsample (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_SSE2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY output_data)
-{
- jsimd_h2v2_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor,
- compptr->v_samp_factor, compptr->width_in_blocks,
- input_data, output_data);
-}
-
-GLOBAL(void)
-jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY output_data)
-{
- jsimd_h2v1_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor,
- compptr->v_samp_factor, compptr->width_in_blocks,
- input_data, output_data);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_upsample (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_SSE2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_upsample (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_SSE2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_upsample (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
-{
- jsimd_h2v2_upsample_sse2(cinfo->max_v_samp_factor, cinfo->output_width,
- input_data, output_data_ptr);
-}
-
-GLOBAL(void)
-jsimd_h2v1_upsample (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
-{
- jsimd_h2v1_upsample_sse2(cinfo->max_v_samp_factor, cinfo->output_width,
- input_data, output_data_ptr);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_fancy_upsample (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if ((simd_support & JSIMD_SSE2) &&
- IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_fancy_upsample (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if ((simd_support & JSIMD_SSE2) &&
- IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
-{
- jsimd_h2v2_fancy_upsample_sse2(cinfo->max_v_samp_factor,
- compptr->downsampled_width, input_data,
- output_data_ptr);
-}
-
-GLOBAL(void)
-jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
-{
- jsimd_h2v1_fancy_upsample_sse2(cinfo->max_v_samp_factor,
- compptr->downsampled_width, input_data,
- output_data_ptr);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_merged_upsample (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if ((simd_support & JSIMD_SSE2) &&
- IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_merged_upsample (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if ((simd_support & JSIMD_SSE2) &&
- IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr,
- JSAMPARRAY output_buf)
-{
- void (*sse2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
-
- switch(cinfo->out_color_space) {
- case JCS_EXT_RGB:
- sse2fct=jsimd_h2v2_extrgb_merged_upsample_sse2;
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- sse2fct=jsimd_h2v2_extrgbx_merged_upsample_sse2;
- break;
- case JCS_EXT_BGR:
- sse2fct=jsimd_h2v2_extbgr_merged_upsample_sse2;
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- sse2fct=jsimd_h2v2_extbgrx_merged_upsample_sse2;
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- sse2fct=jsimd_h2v2_extxbgr_merged_upsample_sse2;
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- sse2fct=jsimd_h2v2_extxrgb_merged_upsample_sse2;
- break;
- default:
- sse2fct=jsimd_h2v2_merged_upsample_sse2;
- break;
- }
-
- sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
-}
-
-GLOBAL(void)
-jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo,
- JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr,
- JSAMPARRAY output_buf)
-{
- void (*sse2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
-
- switch(cinfo->out_color_space) {
- case JCS_EXT_RGB:
- sse2fct=jsimd_h2v1_extrgb_merged_upsample_sse2;
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- sse2fct=jsimd_h2v1_extrgbx_merged_upsample_sse2;
- break;
- case JCS_EXT_BGR:
- sse2fct=jsimd_h2v1_extbgr_merged_upsample_sse2;
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- sse2fct=jsimd_h2v1_extbgrx_merged_upsample_sse2;
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- sse2fct=jsimd_h2v1_extxbgr_merged_upsample_sse2;
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- sse2fct=jsimd_h2v1_extxrgb_merged_upsample_sse2;
- break;
- default:
- sse2fct=jsimd_h2v1_merged_upsample_sse2;
- break;
- }
-
- sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
-}
-
-GLOBAL(int)
-jsimd_can_convsamp (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(DCTELEM) != 2)
- return 0;
-
- if (simd_support & JSIMD_SSE2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_convsamp_float (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(FAST_FLOAT) != 4)
- return 0;
-
- if (simd_support & JSIMD_SSE2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
- DCTELEM *workspace)
-{
- jsimd_convsamp_sse2(sample_data, start_col, workspace);
-}
-
-GLOBAL(void)
-jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
- FAST_FLOAT *workspace)
-{
- jsimd_convsamp_float_sse2(sample_data, start_col, workspace);
-}
-
-GLOBAL(int)
-jsimd_can_fdct_islow (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(DCTELEM) != 2)
- return 0;
-
- if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_islow_sse2))
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_fdct_ifast (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(DCTELEM) != 2)
- return 0;
-
- if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_ifast_sse2))
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_fdct_float (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(FAST_FLOAT) != 4)
- return 0;
-
- if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_fdct_float_sse))
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_fdct_islow (DCTELEM *data)
-{
- jsimd_fdct_islow_sse2(data);
-}
-
-GLOBAL(void)
-jsimd_fdct_ifast (DCTELEM *data)
-{
- jsimd_fdct_ifast_sse2(data);
-}
-
-GLOBAL(void)
-jsimd_fdct_float (FAST_FLOAT *data)
-{
- jsimd_fdct_float_sse(data);
-}
-
-GLOBAL(int)
-jsimd_can_quantize (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (sizeof(DCTELEM) != 2)
- return 0;
-
- if (simd_support & JSIMD_SSE2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_quantize_float (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (sizeof(FAST_FLOAT) != 4)
- return 0;
-
- if (simd_support & JSIMD_SSE2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_quantize (JCOEFPTR coef_block, DCTELEM *divisors,
- DCTELEM *workspace)
-{
- jsimd_quantize_sse2(coef_block, divisors, workspace);
-}
-
-GLOBAL(void)
-jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors,
- FAST_FLOAT *workspace)
-{
- jsimd_quantize_float_sse2(coef_block, divisors, workspace);
-}
-
-GLOBAL(int)
-jsimd_can_idct_2x2 (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(ISLOW_MULT_TYPE) != 2)
- return 0;
-
- if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_4x4 (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(ISLOW_MULT_TYPE) != 2)
- return 0;
-
- if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
- jsimd_idct_2x2_sse2(compptr->dct_table, coef_block, output_buf, output_col);
-}
-
-GLOBAL(void)
-jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
- jsimd_idct_4x4_sse2(compptr->dct_table, coef_block, output_buf, output_col);
-}
-
-GLOBAL(int)
-jsimd_can_idct_islow (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(ISLOW_MULT_TYPE) != 2)
- return 0;
-
- if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_islow_sse2))
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_ifast (void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(IFAST_MULT_TYPE) != 2)
- return 0;
- if (IFAST_SCALE_BITS != 2)
- return 0;
-
- if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_ifast_sse2))
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_float (void)
-{
- init_simd();
-
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(FAST_FLOAT) != 4)
- return 0;
- if (sizeof(FLOAT_MULT_TYPE) != 4)
- return 0;
-
- if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_float_sse2))
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
- jsimd_idct_islow_sse2(compptr->dct_table, coef_block, output_buf,
- output_col);
-}
-
-GLOBAL(void)
-jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
- jsimd_idct_ifast_sse2(compptr->dct_table, coef_block, output_buf,
- output_col);
-}
-
-GLOBAL(void)
-jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
- jsimd_idct_float_sse2(compptr->dct_table, coef_block, output_buf,
- output_col);
-}
-
-GLOBAL(int)
-jsimd_can_huff_encode_one_block (void)
-{
- init_simd();
-
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
-
- if ((simd_support & JSIMD_SSE2) && simd_huffman &&
- IS_ALIGNED_SSE(jconst_huff_encode_one_block))
- return 1;
-
- return 0;
-}
-
-GLOBAL(JOCTET*)
-jsimd_huff_encode_one_block (void *state, JOCTET *buffer, JCOEFPTR block,
- int last_dc_val, c_derived_tbl *dctbl,
- c_derived_tbl *actbl)
-{
- return jsimd_huff_encode_one_block_sse2(state, buffer, block, last_dc_val,
- dctbl, actbl);
-}
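IS_ALIGNED near the top of the deleted x86-64 file is the runtime guard for PIC builds, where constants are not guaranteed to keep their link-time alignment. The mask arithmetic is simple once spelled out: order 4 yields (1 << 4) - 1 = 0xF, and a pointer is 16-byte aligned exactly when its low four bits are clear. A small worked example:

    #include <assert.h>
    #include <stddef.h>

    #define IS_ALIGNED(ptr, order)  (((size_t)(ptr) & ((1 << (order)) - 1)) == 0)

    int main(void)
    {
      assert(IS_ALIGNED((void *)0x7f0010, 4));   /* low nibble 0x0: aligned */
      assert(!IS_ALIGNED((void *)0x7f0018, 4));  /* low nibble 0x8: not aligned */
      return 0;
    }
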
diff --git a/media/libjpeg/simd/jsimdcpu.asm b/media/libjpeg/simd/jsimdcpu.asm
deleted file mode 100644
index 599083b182..0000000000
--- a/media/libjpeg/simd/jsimdcpu.asm
+++ /dev/null
@@ -1,104 +0,0 @@
-;
-; jsimdcpu.asm - SIMD instruction support check
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 32
-;
-; Check if the CPU supports SIMD instructions
-;
-; GLOBAL(unsigned int)
-; jpeg_simd_cpu_support (void)
-;
-
- align 16
- global EXTN(jpeg_simd_cpu_support)
-
-EXTN(jpeg_simd_cpu_support):
- push ebx
-; push ecx ; need not be preserved
-; push edx ; need not be preserved
-; push esi ; unused
- push edi
-
- xor edi,edi ; simd support flag
-
- pushfd
- pop eax
- mov edx,eax
- xor eax, 1<<21 ; flip ID bit in EFLAGS
- push eax
- popfd
- pushfd
- pop eax
- xor eax,edx
- jz short .return ; CPUID is not supported
-
- ; Check for MMX instruction support
- xor eax,eax
- cpuid
- test eax,eax
- jz short .return
-
- xor eax,eax
- inc eax
- cpuid
- mov eax,edx ; eax = Standard feature flags
-
- test eax, 1<<23 ; bit23:MMX
- jz short .no_mmx
- or edi, byte JSIMD_MMX
-.no_mmx:
- test eax, 1<<25 ; bit25:SSE
- jz short .no_sse
- or edi, byte JSIMD_SSE
-.no_sse:
- test eax, 1<<26 ; bit26:SSE2
- jz short .no_sse2
- or edi, byte JSIMD_SSE2
-.no_sse2:
-
- ; Check for 3DNow! instruction support
- mov eax, 0x80000000
- cpuid
- cmp eax, 0x80000000
- jbe short .return
-
- mov eax, 0x80000001
- cpuid
- mov eax,edx ; eax = Extended feature flags
-
- test eax, 1<<31 ; bit31:3DNow!(vendor independent)
- jz short .no_3dnow
- or edi, byte JSIMD_3DNOW
-.no_3dnow:
-
-.return:
- mov eax,edi
-
- pop edi
-; pop esi ; unused
-; pop edx ; need not be preserved
-; pop ecx ; need not be preserved
- pop ebx
- ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
- align 16
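jpeg_simd_cpu_support first proves that CPUID exists at all by flipping bit 21 (the ID bit) of EFLAGS and checking that the flip sticks, then reads the standard feature flags from leaf 1: EDX bit 23 is MMX, bit 25 SSE, bit 26 SSE2, and extended leaf 0x80000001 reports 3DNow! in EDX bit 31. The same probe expressed in C with the GCC/Clang helper; the SK_* flag values are illustrative stand-ins for the JSIMD_* constants defined in jsimd.h:

    #include <cpuid.h>

    #define SK_MMX   0x01   /* illustrative flag values */
    #define SK_SSE   0x02
    #define SK_SSE2  0x04

    unsigned int cpu_support_sketch(void)
    {
      unsigned int a, b, c, d, flags = 0;

      if (!__get_cpuid(1, &a, &b, &c, &d))
        return 0;                    /* CPUID or leaf 1 unavailable */
      if (d & (1u << 23)) flags |= SK_MMX;
      if (d & (1u << 25)) flags |= SK_SSE;
      if (d & (1u << 26)) flags |= SK_SSE2;
      return flags;
    }
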
diff --git a/media/libjpeg/simd/jsimdext.inc b/media/libjpeg/simd/jsimdext.inc
deleted file mode 100644
index f28db60b57..0000000000
--- a/media/libjpeg/simd/jsimdext.inc
+++ /dev/null
@@ -1,375 +0,0 @@
-;
-; jsimdext.inc - common declarations
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2010, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library - version 1.02
-;
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-;
-; This software is provided 'as-is', without any express or implied
-; warranty. In no event will the authors be held liable for any damages
-; arising from the use of this software.
-;
-; Permission is granted to anyone to use this software for any purpose,
-; including commercial applications, and to alter it and redistribute it
-; freely, subject to the following restrictions:
-;
-; 1. The origin of this software must not be misrepresented; you must not
-; claim that you wrote the original software. If you use this software
-; in a product, an acknowledgment in the product documentation would be
-; appreciated but is not required.
-; 2. Altered source versions must be plainly marked as such, and must not be
-; misrepresented as being the original software.
-; 3. This notice may not be removed or altered from any source distribution.
-;
-; [TAB8]
-
-; ==========================================================================
-; System-dependent configurations
-
-%ifdef WIN32 ; ----(nasm -fwin32 -DWIN32 ...)--------
-; * Microsoft Visual C++
-; * MinGW (Minimalist GNU for Windows)
-; * CygWin
-; * LCC-Win32
-
-; -- segment definition --
-;
-%ifdef __YASM_VER__
-%define SEG_TEXT .text align=16
-%define SEG_CONST .rdata align=16
-%else
-%define SEG_TEXT .text align=16 public use32 class=CODE
-%define SEG_CONST .rdata align=16 public use32 class=CONST
-%endif
-
-%elifdef WIN64 ; ----(nasm -fwin64 -DWIN64 ...)--------
-; * Microsoft Visual C++
-
-; -- segment definition --
-;
-%ifdef __YASM_VER__
-%define SEG_TEXT .text align=16
-%define SEG_CONST .rdata align=16
-%else
-%define SEG_TEXT .text align=16 public use64 class=CODE
-%define SEG_CONST .rdata align=16 public use64 class=CONST
-%endif
-%define EXTN(name) name ; foo() -> foo
-
-%elifdef OBJ32 ; ----(nasm -fobj -DOBJ32 ...)----------
-; * Borland C++ (Win32)
-
-; -- segment definition --
-;
-%define SEG_TEXT _text align=16 public use32 class=CODE
-%define SEG_CONST _data align=16 public use32 class=DATA
-
-%elifdef ELF ; ----(nasm -felf[64] -DELF ...)------------
-; * Linux
-; * *BSD family Unix using elf format
-; * Unix System V, including Solaris x86, UnixWare and SCO Unix
-
-; mark stack as non-executable
-section .note.GNU-stack noalloc noexec nowrite progbits
-
-; -- segment definition --
-;
-%ifdef __x86_64__
-%define SEG_TEXT .text progbits align=16
-%define SEG_CONST .rodata progbits align=16
-%else
-%define SEG_TEXT .text progbits alloc exec nowrite align=16
-%define SEG_CONST .rodata progbits alloc noexec nowrite align=16
-%endif
-
-; To make the code position-independent, append -DPIC to the commandline
-;
-%define GOT_SYMBOL _GLOBAL_OFFSET_TABLE_ ; ELF supports PIC
-%define EXTN(name) name ; foo() -> foo
-
-%elifdef AOUT ; ----(nasm -faoutb/aout -DAOUT ...)----
-; * Older Linux using a.out format (nasm -f aout -DAOUT ...)
-; * *BSD family Unix using a.out format (nasm -f aoutb -DAOUT ...)
-
-; -- segment definition --
-;
-%define SEG_TEXT .text
-%define SEG_CONST .data
-
-; To make the code position-independent, append -DPIC to the commandline
-;
-%define GOT_SYMBOL __GLOBAL_OFFSET_TABLE_ ; BSD-style a.out supports PIC
-
-%elifdef MACHO ; ----(nasm -fmacho -DMACHO ...)--------
-; * NeXTstep/OpenStep/Rhapsody/Darwin/MacOS X (Mach-O format)
-
-; -- segment definition --
-;
-%define SEG_TEXT .text ;align=16 ; nasm doesn't accept align=16. why?
-%define SEG_CONST .rodata align=16
-
-; The generation of position-independent code (PIC) is the default on Darwin.
-;
-%define PIC
-%define GOT_SYMBOL _MACHO_PIC_ ; Mach-O style code-relative addressing
-
-%else ; ----(Other case)----------------------
-
-; -- segment definition --
-;
-%define SEG_TEXT .text
-%define SEG_CONST .data
-
-%endif ; ----------------------------------------------
-
-; ==========================================================================
-
-; --------------------------------------------------------------------------
-; Common types
-;
-%ifdef __x86_64__
-%define POINTER qword ; general pointer type
-%define SIZEOF_POINTER SIZEOF_QWORD ; sizeof(POINTER)
-%define POINTER_BIT QWORD_BIT ; sizeof(POINTER)*BYTE_BIT
-%else
-%define POINTER dword ; general pointer type
-%define SIZEOF_POINTER SIZEOF_DWORD ; sizeof(POINTER)
-%define POINTER_BIT DWORD_BIT ; sizeof(POINTER)*BYTE_BIT
-%endif
-
-%define INT dword ; signed integer type
-%define SIZEOF_INT SIZEOF_DWORD ; sizeof(INT)
-%define INT_BIT DWORD_BIT ; sizeof(INT)*BYTE_BIT
-
-%define FP32 dword ; IEEE754 single
-%define SIZEOF_FP32 SIZEOF_DWORD ; sizeof(FP32)
-%define FP32_BIT DWORD_BIT ; sizeof(FP32)*BYTE_BIT
-
-%define MMWORD qword ; int64 (MMX register)
-%define SIZEOF_MMWORD SIZEOF_QWORD ; sizeof(MMWORD)
-%define MMWORD_BIT QWORD_BIT ; sizeof(MMWORD)*BYTE_BIT
-
-; NASM is buggy and doesn't properly handle operand sizes for SSE
-; instructions, so for now we have to define XMMWORD as blank.
-%define XMMWORD ; int128 (SSE register)
-%define SIZEOF_XMMWORD SIZEOF_OWORD ; sizeof(XMMWORD)
-%define XMMWORD_BIT OWORD_BIT ; sizeof(XMMWORD)*BYTE_BIT
-
-; Similar hacks for when we load a dword or MMWORD into an xmm# register
-%define XMM_DWORD
-%define XMM_MMWORD
-
-%define SIZEOF_BYTE 1 ; sizeof(BYTE)
-%define SIZEOF_WORD 2 ; sizeof(WORD)
-%define SIZEOF_DWORD 4 ; sizeof(DWORD)
-%define SIZEOF_QWORD 8 ; sizeof(QWORD)
-%define SIZEOF_OWORD 16 ; sizeof(OWORD)
-
-%define BYTE_BIT 8 ; CHAR_BIT in C
-%define WORD_BIT 16 ; sizeof(WORD)*BYTE_BIT
-%define DWORD_BIT 32 ; sizeof(DWORD)*BYTE_BIT
-%define QWORD_BIT 64 ; sizeof(QWORD)*BYTE_BIT
-%define OWORD_BIT 128 ; sizeof(OWORD)*BYTE_BIT
-
-; --------------------------------------------------------------------------
-; External Symbol Name
-;
-%ifndef EXTN
-%define EXTN(name) _ %+ name ; foo() -> _foo
-%endif
-
-; --------------------------------------------------------------------------
-; Macros for position-independent code (PIC) support
-;
-%ifndef GOT_SYMBOL
-%undef PIC
-%endif
-
-%ifdef PIC ; -------------------------------------------
-
-%ifidn GOT_SYMBOL,_MACHO_PIC_ ; --------------------
-
-; At present, nasm doesn't seem to support PIC generation for Mach-O.
-; The PIC support code below is a little tricky.
-
- SECTION SEG_CONST
-const_base:
-
-%define GOTOFF(got,sym) (got) + (sym) - const_base
-
-%imacro get_GOT 1
- ; NOTE: this macro destroys the ecx register.
- call %%geteip
- add ecx, byte (%%ref - $)
- jmp short %%adjust
-%%geteip:
- mov ecx, POINTER [esp]
- ret
-%%adjust:
- push ebp
- xor ebp,ebp ; ebp = 0
-%ifidni %1,ebx ; (%1 == ebx)
- ; db 0x8D,0x9C + jmp near const_base =
- ; lea ebx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,9C,E9,(offset32)
- db 0x8D,0x9C ; 8D,9C
- jmp near const_base ; E9,(const_base-%%ref)
-%%ref:
-%else ; (%1 != ebx)
- ; db 0x8D,0x8C + jmp near const_base =
- ; lea ecx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,8C,E9,(offset32)
- db 0x8D,0x8C ; 8D,8C
- jmp near const_base ; E9,(const_base-%%ref)
-%%ref: mov %1, ecx
-%endif ; (%1 == ebx)
- pop ebp
-%endmacro
-
-%else ; GOT_SYMBOL != _MACHO_PIC_ ----------------
-
-%define GOTOFF(got,sym) (got) + (sym) wrt ..gotoff
-
-%imacro get_GOT 1
- extern GOT_SYMBOL
- call %%geteip
- add %1, GOT_SYMBOL + $$ - $ wrt ..gotpc
- jmp short %%done
-%%geteip:
- mov %1, POINTER [esp]
- ret
-%%done:
-%endmacro
-
-%endif ; GOT_SYMBOL == _MACHO_PIC_ ----------------
-
-%imacro pushpic 1.nolist
- push %1
-%endmacro
-%imacro poppic 1.nolist
- pop %1
-%endmacro
-%imacro movpic 2.nolist
- mov %1,%2
-%endmacro
-
-%else ; !PIC -----------------------------------------
-
-%define GOTOFF(got,sym) (sym)
-
-%imacro get_GOT 1.nolist
-%endmacro
-%imacro pushpic 1.nolist
-%endmacro
-%imacro poppic 1.nolist
-%endmacro
-%imacro movpic 2.nolist
-%endmacro
-
-%endif ; PIC -----------------------------------------
-
-; --------------------------------------------------------------------------
-; Align the next instruction on {2,4,8,16,..}-byte boundary.
-; ".balign n,,m" in GNU as
-;
-%define MSKLE(x,y) (~(((y) & 0xFFFF) - ((x) & 0xFFFF)) >> 16)
-%define FILLB(b,n) (($$-(b)) & ((n)-1))
-
-%imacro alignx 1-2.nolist 0xFFFF
-%%bs: times MSKLE(FILLB(%%bs,%1),%2) & MSKLE(16,FILLB($,%1)) & FILLB($,%1) \
- db 0x90 ; nop
- times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/9 \
- db 0x8D,0x9C,0x23,0x00,0x00,0x00,0x00 ; lea ebx,[ebx+0x00000000]
- times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/7 \
- db 0x8D,0xAC,0x25,0x00,0x00,0x00,0x00 ; lea ebp,[ebp+0x00000000]
- times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/6 \
- db 0x8D,0xAD,0x00,0x00,0x00,0x00 ; lea ebp,[ebp+0x00000000]
- times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/4 \
- db 0x8D,0x6C,0x25,0x00 ; lea ebp,[ebp+0x00]
- times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/3 \
- db 0x8D,0x6D,0x00 ; lea ebp,[ebp+0x00]
- times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/2 \
- db 0x8B,0xED ; mov ebp,ebp
- times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/1 \
- db 0x90 ; nop
-%endmacro
-
-; Align the next data on {2,4,8,16,..}-byte boundary.
-;
-%imacro alignz 1.nolist
- align %1, db 0 ; filling zeros
-%endmacro
-
-%ifdef __x86_64__
-
-%ifdef WIN64
-
-%imacro collect_args 0
- push r12
- push r13
- push r14
- push r15
- mov r10, rcx
- mov r11, rdx
- mov r12, r8
- mov r13, r9
- mov r14, [rax+48]
- mov r15, [rax+56]
- push rsi
- push rdi
- sub rsp, SIZEOF_XMMWORD
- movaps XMMWORD [rsp], xmm6
- sub rsp, SIZEOF_XMMWORD
- movaps XMMWORD [rsp], xmm7
-%endmacro
-
-%imacro uncollect_args 0
- movaps xmm7, XMMWORD [rsp]
- add rsp, SIZEOF_XMMWORD
- movaps xmm6, XMMWORD [rsp]
- add rsp, SIZEOF_XMMWORD
- pop rdi
- pop rsi
- pop r15
- pop r14
- pop r13
- pop r12
-%endmacro
-
-%else
-
-%imacro collect_args 0
- push r10
- push r11
- push r12
- push r13
- push r14
- push r15
- mov r10, rdi
- mov r11, rsi
- mov r12, rdx
- mov r13, rcx
- mov r14, r8
- mov r15, r9
-%endmacro
-
-%imacro uncollect_args 0
- pop r15
- pop r14
- pop r13
- pop r12
- pop r11
- pop r10
-%endmacro
-
-%endif
-
-%endif
-
-; --------------------------------------------------------------------------
-; Defines picked up from the C headers
-;
-%include "jsimdcfg.inc"
-
-; --------------------------------------------------------------------------
diff --git a/media/libjpeg/simd/mips/jsimd.c b/media/libjpeg/simd/mips/jsimd.c
new file mode 100644
index 0000000000..d2546eed32
--- /dev/null
+++ b/media/libjpeg/simd/mips/jsimd.c
@@ -0,0 +1,1147 @@
+/*
+ * jsimd_mips.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2009-2011, 2014, 2016, 2018, 2020, D. R. Commander.
+ * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on a
+ * MIPS architecture.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+
+static unsigned int simd_support = ~0;
+
+#if !(defined(__mips_dsp) && (__mips_dsp_rev >= 2)) && defined(__linux__)
+
+LOCAL(void)
+parse_proc_cpuinfo(const char *search_string)
+{
+ const char *file_name = "/proc/cpuinfo";
+ char cpuinfo_line[256];
+ FILE *f = NULL;
+
+ simd_support = 0;
+
+ if ((f = fopen(file_name, "r")) != NULL) {
+ while (fgets(cpuinfo_line, sizeof(cpuinfo_line), f) != NULL) {
+ if (strstr(cpuinfo_line, search_string) != NULL) {
+ fclose(f);
+ simd_support |= JSIMD_DSPR2;
+ return;
+ }
+ }
+ fclose(f);
+ }
+ /* Did not find string in the proc file, or not Linux ELF. */
+}
+
+#endif
+
+/*
+ * Check what SIMD accelerations are supported.
+ *
+ * FIXME: This code is racy under a multi-threaded environment.
+ */
+LOCAL(void)
+init_simd(void)
+{
+#ifndef NO_GETENV
+ char *env = NULL;
+#endif
+
+ if (simd_support != ~0U)
+ return;
+
+ simd_support = 0;
+
+#if defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+ simd_support |= JSIMD_DSPR2;
+#elif defined(__linux__)
+  /* We can still use MIPS DSPr2 even if -mdspr2 was not passed to gcc
+   * globally, by performing runtime detection via /proc/cpuinfo parsing
+   * on Linux. */
+ parse_proc_cpuinfo("MIPS 74K");
+#endif
+
+#ifndef NO_GETENV
+ /* Force different settings through environment variables */
+ env = getenv("JSIMD_FORCEDSPR2");
+ if ((env != NULL) && (strcmp(env, "1") == 0))
+ simd_support = JSIMD_DSPR2;
+ env = getenv("JSIMD_FORCENONE");
+ if ((env != NULL) && (strcmp(env, "1") == 0))
+ simd_support = 0;
+#endif
+}
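+
+#if 0
+/* A minimal sketch (kept out of the build) of how the environment
+ * overrides above could be exercised from a test harness.  It assumes a
+ * POSIX setenv() from <stdlib.h>; force_simd_for_testing() is a
+ * hypothetical helper, not part of the library API. */
+static void force_simd_for_testing(int want_dspr2)
+{
+  if (want_dspr2)
+    setenv("JSIMD_FORCEDSPR2", "1", 1);  /* claim DSPr2 support */
+  else
+    setenv("JSIMD_FORCENONE", "1", 1);   /* fall back to the C code paths */
+  simd_support = ~0U;                    /* make init_simd() re-detect */
+}
+#endif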
+
+static const int mips_idct_ifast_coefs[4] = {
+ 0x45404540, /* FIX( 1.082392200 / 2) = 17734 = 0x4546 */
+ 0x5A805A80, /* FIX( 1.414213562 / 2) = 23170 = 0x5A82 */
+ 0x76407640, /* FIX( 1.847759065 / 2) = 30274 = 0x7642 */
+ 0xAC60AC60 /* FIX(-2.613125930 / 4) = -21407 = 0xAC61 */
+};
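+
+/* Each entry above packs one Q15 constant into both halfwords of a 32-bit
+ * word so it can feed paired 16-bit DSPr2 multiplies.  Worked example:
+ * FIX(1.414213562 / 2) = round(0.707106781 * 32768) = 23170 = 0x5A82,
+ * stored (with the low bits truncated, as the comments note) as
+ * 0x5A805A80. */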
+
+/* The following struct is borrowed from jdsample.c */
+typedef void (*upsample1_ptr) (j_decompress_ptr cinfo,
+ jpeg_component_info *compptr,
+ JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+typedef struct {
+ struct jpeg_upsampler pub;
+ JSAMPARRAY color_buf[MAX_COMPONENTS];
+ upsample1_ptr methods[MAX_COMPONENTS];
+ int next_row_out;
+ JDIMENSION rows_to_go;
+ int rowgroup_height[MAX_COMPONENTS];
+ UINT8 h_expand[MAX_COMPONENTS];
+ UINT8 v_expand[MAX_COMPONENTS];
+} my_upsampler;
+
+typedef my_upsampler *my_upsample_ptr;
+
+GLOBAL(int)
+jsimd_can_rgb_ycc(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+ return 0;
+
+ if (simd_support & JSIMD_DSPR2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_rgb_gray(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+ return 0;
+
+ if (simd_support & JSIMD_DSPR2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+ return 0;
+
+ if (simd_support & JSIMD_DSPR2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb565(void)
+{
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_c_can_null_convert(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_DSPR2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
+{
+ void (*dspr2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+ switch (cinfo->in_color_space) {
+ case JCS_EXT_RGB:
+ dspr2fct = jsimd_extrgb_ycc_convert_dspr2;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ dspr2fct = jsimd_extrgbx_ycc_convert_dspr2;
+ break;
+ case JCS_EXT_BGR:
+ dspr2fct = jsimd_extbgr_ycc_convert_dspr2;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ dspr2fct = jsimd_extbgrx_ycc_convert_dspr2;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ dspr2fct = jsimd_extxbgr_ycc_convert_dspr2;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ dspr2fct = jsimd_extxrgb_ycc_convert_dspr2;
+ break;
+ default:
+ dspr2fct = jsimd_extrgb_ycc_convert_dspr2;
+ break;
+ }
+
+ dspr2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
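+
+/* (The default arm of the switch above also covers classic JCS_RGB input,
+ * which, with the default RGB_RED/RGB_GREEN/RGB_BLUE offsets, is
+ * byte-identical to JCS_EXT_RGB.) */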
+
+GLOBAL(void)
+jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
+{
+ void (*dspr2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+ switch (cinfo->in_color_space) {
+ case JCS_EXT_RGB:
+ dspr2fct = jsimd_extrgb_gray_convert_dspr2;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ dspr2fct = jsimd_extrgbx_gray_convert_dspr2;
+ break;
+ case JCS_EXT_BGR:
+ dspr2fct = jsimd_extbgr_gray_convert_dspr2;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ dspr2fct = jsimd_extbgrx_gray_convert_dspr2;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ dspr2fct = jsimd_extxbgr_gray_convert_dspr2;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ dspr2fct = jsimd_extxrgb_gray_convert_dspr2;
+ break;
+ default:
+ dspr2fct = jsimd_extrgb_gray_convert_dspr2;
+ break;
+ }
+
+ dspr2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
+{
+ void (*dspr2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+
+ switch (cinfo->out_color_space) {
+ case JCS_EXT_RGB:
+ dspr2fct = jsimd_ycc_extrgb_convert_dspr2;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ dspr2fct = jsimd_ycc_extrgbx_convert_dspr2;
+ break;
+ case JCS_EXT_BGR:
+ dspr2fct = jsimd_ycc_extbgr_convert_dspr2;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ dspr2fct = jsimd_ycc_extbgrx_convert_dspr2;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ dspr2fct = jsimd_ycc_extxbgr_convert_dspr2;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ dspr2fct = jsimd_ycc_extxrgb_convert_dspr2;
+ break;
+ default:
+ dspr2fct = jsimd_ycc_extrgb_convert_dspr2;
+ break;
+ }
+
+ dspr2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
+{
+}
+
+GLOBAL(void)
+jsimd_c_null_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
+{
+ jsimd_c_null_convert_dspr2(cinfo->image_width, input_buf, output_buf,
+ output_row, num_rows, cinfo->num_components);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_downsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ /* FIXME: jsimd_h2v2_downsample_dspr2() fails some of the TJBench tiling
+ * regression tests, probably because the DSPr2 SIMD implementation predates
+ * those tests. */
+#if 0
+ if (simd_support & JSIMD_DSPR2)
+ return 1;
+#endif
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_smooth_downsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (DCTSIZE != 8)
+ return 0;
+
+ if (simd_support & JSIMD_DSPR2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_downsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ /* FIXME: jsimd_h2v1_downsample_dspr2() fails some of the TJBench tiling
+ * regression tests, probably because the DSPr2 SIMD implementation predates
+ * those tests. */
+#if 0
+ if (simd_support & JSIMD_DSPR2)
+ return 1;
+#endif
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+ jsimd_h2v2_downsample_dspr2(cinfo->image_width, cinfo->max_v_samp_factor,
+ compptr->v_samp_factor, compptr->width_in_blocks,
+ input_data, output_data);
+}
+
+GLOBAL(void)
+jsimd_h2v2_smooth_downsample(j_compress_ptr cinfo,
+ jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+ jsimd_h2v2_smooth_downsample_dspr2(input_data, output_data,
+ compptr->v_samp_factor,
+ cinfo->max_v_samp_factor,
+ cinfo->smoothing_factor,
+ compptr->width_in_blocks,
+ cinfo->image_width);
+}
+
+GLOBAL(void)
+jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+ jsimd_h2v1_downsample_dspr2(cinfo->image_width, cinfo->max_v_samp_factor,
+ compptr->v_samp_factor, compptr->width_in_blocks,
+ input_data, output_data);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_DSPR2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+#if defined(__MIPSEL__)
+ if (simd_support & JSIMD_DSPR2)
+ return 1;
+#endif
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_int_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_DSPR2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h2v2_upsample_dspr2(cinfo->max_v_samp_factor, cinfo->output_width,
+ input_data, output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h2v1_upsample_dspr2(cinfo->max_v_samp_factor, cinfo->output_width,
+ input_data, output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_int_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ my_upsample_ptr upsample = (my_upsample_ptr)cinfo->upsample;
+
+ jsimd_int_upsample_dspr2(upsample->h_expand[compptr->component_index],
+ upsample->v_expand[compptr->component_index],
+ input_data, output_data_ptr, cinfo->output_width,
+ cinfo->max_v_samp_factor);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+#if defined(__MIPSEL__)
+ if (simd_support & JSIMD_DSPR2)
+ return 1;
+#endif
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+#if defined(__MIPSEL__)
+ if (simd_support & JSIMD_DSPR2)
+ return 1;
+#endif
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h2v2_fancy_upsample_dspr2(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data,
+ output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h2v1_fancy_upsample_dspr2(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data,
+ output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_DSPR2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_DSPR2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+ void (*dspr2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, JSAMPLE *);
+
+ switch (cinfo->out_color_space) {
+ case JCS_EXT_RGB:
+ dspr2fct = jsimd_h2v2_extrgb_merged_upsample_dspr2;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ dspr2fct = jsimd_h2v2_extrgbx_merged_upsample_dspr2;
+ break;
+ case JCS_EXT_BGR:
+ dspr2fct = jsimd_h2v2_extbgr_merged_upsample_dspr2;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ dspr2fct = jsimd_h2v2_extbgrx_merged_upsample_dspr2;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ dspr2fct = jsimd_h2v2_extxbgr_merged_upsample_dspr2;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ dspr2fct = jsimd_h2v2_extxrgb_merged_upsample_dspr2;
+ break;
+ default:
+ dspr2fct = jsimd_h2v2_extrgb_merged_upsample_dspr2;
+ break;
+ }
+
+ dspr2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf,
+ cinfo->sample_range_limit);
+}
+
+GLOBAL(void)
+jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+ void (*dspr2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, JSAMPLE *);
+
+ switch (cinfo->out_color_space) {
+ case JCS_EXT_RGB:
+ dspr2fct = jsimd_h2v1_extrgb_merged_upsample_dspr2;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ dspr2fct = jsimd_h2v1_extrgbx_merged_upsample_dspr2;
+ break;
+ case JCS_EXT_BGR:
+ dspr2fct = jsimd_h2v1_extbgr_merged_upsample_dspr2;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ dspr2fct = jsimd_h2v1_extbgrx_merged_upsample_dspr2;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ dspr2fct = jsimd_h2v1_extxbgr_merged_upsample_dspr2;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ dspr2fct = jsimd_h2v1_extxrgb_merged_upsample_dspr2;
+ break;
+ default:
+ dspr2fct = jsimd_h2v1_extrgb_merged_upsample_dspr2;
+ break;
+ }
+
+ dspr2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf,
+ cinfo->sample_range_limit);
+}
+
+GLOBAL(int)
+jsimd_can_convsamp(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
+#if defined(__MIPSEL__)
+ if (simd_support & JSIMD_DSPR2)
+ return 1;
+#endif
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_convsamp_float(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(ISLOW_MULT_TYPE) != 2)
+ return 0;
+
+#ifndef __mips_soft_float
+ if (simd_support & JSIMD_DSPR2)
+ return 1;
+#endif
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
+ DCTELEM *workspace)
+{
+ jsimd_convsamp_dspr2(sample_data, start_col, workspace);
+}
+
+GLOBAL(void)
+jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
+ FAST_FLOAT *workspace)
+{
+#ifndef __mips_soft_float
+ jsimd_convsamp_float_dspr2(sample_data, start_col, workspace);
+#endif
+}
+
+GLOBAL(int)
+jsimd_can_fdct_islow(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
+#if defined(__MIPSEL__)
+ if (simd_support & JSIMD_DSPR2)
+ return 1;
+#endif
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_ifast(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
+#if defined(__MIPSEL__)
+ if (simd_support & JSIMD_DSPR2)
+ return 1;
+#endif
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_float(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_fdct_islow(DCTELEM *data)
+{
+ jsimd_fdct_islow_dspr2(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_ifast(DCTELEM *data)
+{
+ jsimd_fdct_ifast_dspr2(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_float(FAST_FLOAT *data)
+{
+}
+
+GLOBAL(int)
+jsimd_can_quantize(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_DSPR2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_quantize_float(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(ISLOW_MULT_TYPE) != 2)
+ return 0;
+
+#ifndef __mips_soft_float
+ if (simd_support & JSIMD_DSPR2)
+ return 1;
+#endif
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
+{
+ jsimd_quantize_dspr2(coef_block, divisors, workspace);
+}
+
+GLOBAL(void)
+jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+ FAST_FLOAT *workspace)
+{
+#ifndef __mips_soft_float
+ jsimd_quantize_float_dspr2(coef_block, divisors, workspace);
+#endif
+}
+
+GLOBAL(int)
+jsimd_can_idct_2x2(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(ISLOW_MULT_TYPE) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_DSPR2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_4x4(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(ISLOW_MULT_TYPE) != 2)
+ return 0;
+
+#if defined(__MIPSEL__)
+ if (simd_support & JSIMD_DSPR2)
+ return 1;
+#endif
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_6x6(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(ISLOW_MULT_TYPE) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_DSPR2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_12x12(void)
+{
+ init_simd();
+
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(ISLOW_MULT_TYPE) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_DSPR2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ jsimd_idct_2x2_dspr2(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ int workspace[DCTSIZE * 4]; /* buffers data between passes */
+
+ jsimd_idct_4x4_dspr2(compptr->dct_table, coef_block, output_buf, output_col,
+ workspace);
+}
+
+GLOBAL(void)
+jsimd_idct_6x6(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ jsimd_idct_6x6_dspr2(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_12x12(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ int workspace[96];
+ int output[12] = {
+ (int)(output_buf[0] + output_col),
+ (int)(output_buf[1] + output_col),
+ (int)(output_buf[2] + output_col),
+ (int)(output_buf[3] + output_col),
+ (int)(output_buf[4] + output_col),
+ (int)(output_buf[5] + output_col),
+ (int)(output_buf[6] + output_col),
+ (int)(output_buf[7] + output_col),
+ (int)(output_buf[8] + output_col),
+ (int)(output_buf[9] + output_col),
+ (int)(output_buf[10] + output_col),
+ (int)(output_buf[11] + output_col)
+ };
+
+ jsimd_idct_12x12_pass1_dspr2(coef_block, compptr->dct_table, workspace);
+ jsimd_idct_12x12_pass2_dspr2(workspace, output);
+}
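+
+/* The pass1/pass2 DSPr2 IDCTs above take a flat array of row addresses
+ * rather than the usual (output_buf, output_col) pair; casting JSAMPROW
+ * pointers to int is safe here only because sizeof(int) ==
+ * sizeof(JSAMPROW) on the 32-bit MIPS targets this code supports. */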
+
+GLOBAL(int)
+jsimd_can_idct_islow(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(ISLOW_MULT_TYPE) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_DSPR2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_ifast(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(IFAST_MULT_TYPE) != 2)
+ return 0;
+ if (IFAST_SCALE_BITS != 2)
+ return 0;
+
+#if defined(__MIPSEL__)
+ if (simd_support & JSIMD_DSPR2)
+ return 1;
+#endif
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_float(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ int output[8] = {
+ (int)(output_buf[0] + output_col),
+ (int)(output_buf[1] + output_col),
+ (int)(output_buf[2] + output_col),
+ (int)(output_buf[3] + output_col),
+ (int)(output_buf[4] + output_col),
+ (int)(output_buf[5] + output_col),
+ (int)(output_buf[6] + output_col),
+ (int)(output_buf[7] + output_col)
+ };
+
+ jsimd_idct_islow_dspr2(coef_block, compptr->dct_table, output,
+ IDCT_range_limit(cinfo));
+}
+
+GLOBAL(void)
+jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ JCOEFPTR inptr;
+ IFAST_MULT_TYPE *quantptr;
+ DCTELEM workspace[DCTSIZE2]; /* buffers data between passes */
+
+ /* Pass 1: process columns from input, store into work array. */
+
+ inptr = coef_block;
+ quantptr = (IFAST_MULT_TYPE *)compptr->dct_table;
+
+ jsimd_idct_ifast_cols_dspr2(inptr, quantptr, workspace,
+ mips_idct_ifast_coefs);
+
+ /* Pass 2: process rows from work array, store into output array. */
+ /* Note that we must descale the results by a factor of 8 == 2**3, */
+ /* and also undo the PASS1_BITS scaling. */
+
+ jsimd_idct_ifast_rows_dspr2(workspace, output_buf, output_col,
+ mips_idct_ifast_coefs);
+}
+
+GLOBAL(void)
+jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+}
+
+GLOBAL(int)
+jsimd_can_huff_encode_one_block(void)
+{
+ return 0;
+}
+
+GLOBAL(JOCTET *)
+jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
+ int last_dc_val, c_derived_tbl *dctbl,
+ c_derived_tbl *actbl)
+{
+ return NULL;
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_first_prepare(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, JCOEF *values, size_t *zerobits)
+{
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_refine_prepare(void)
+{
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, JCOEF *absvalues, size_t *bits)
+{
+ return 0;
+}
diff --git a/media/libjpeg/simd/mips/jsimd_dspr2.S b/media/libjpeg/simd/mips/jsimd_dspr2.S
new file mode 100644
index 0000000000..c99288a8d1
--- /dev/null
+++ b/media/libjpeg/simd/mips/jsimd_dspr2.S
@@ -0,0 +1,4543 @@
+/*
+ * MIPS DSPr2 optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
+ * All Rights Reserved.
+ * Authors: Teodora Novkovic <teodora.novkovic@imgtec.com>
+ * Darko Laus <darko.laus@imgtec.com>
+ * Copyright (C) 2015, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#include "jsimd_dspr2_asm.h"
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_c_null_convert_dspr2)
+/*
+ * a0 = cinfo->image_width
+ * a1 = input_buf
+ * a2 = output_buf
+ * a3 = output_row
+ * 16(sp) = num_rows
+ * 20(sp) = cinfo->num_components
+ *
+ * Null conversion for compression
+ */
+ SAVE_REGS_ON_STACK 8, s0, s1
+
+ lw t9, 24(sp) /* t9 = num_rows */
+ lw s0, 28(sp) /* s0 = cinfo->num_components */
+ andi t0, a0, 3 /* t0 = cinfo->image_width & 3 */
+ beqz t0, 4f /* no residual */
+ nop
+0:
+ addiu t9, t9, -1
+ bltz t9, 7f
+ li t1, 0
+1:
+ sll t3, t1, 2
+ lwx t5, t3(a2) /* t5 = outptr = output_buf[ci] */
+ lw t2, 0(a1) /* t2 = inptr = *input_buf */
+ sll t4, a3, 2
+ lwx t5, t4(t5) /* t5 = outptr = output_buf[ci][output_row] */
+ addu t2, t2, t1
+ addu s1, t5, a0
+ addu t6, t5, t0
+2:
+ lbu t3, 0(t2)
+ addiu t5, t5, 1
+ sb t3, -1(t5)
+ bne t6, t5, 2b
+ addu t2, t2, s0
+3:
+ lbu t3, 0(t2)
+ addu t4, t2, s0
+ addu t7, t4, s0
+ addu t8, t7, s0
+ addu t2, t8, s0
+ lbu t4, 0(t4)
+ lbu t7, 0(t7)
+ lbu t8, 0(t8)
+ addiu t5, t5, 4
+ sb t3, -4(t5)
+ sb t4, -3(t5)
+ sb t7, -2(t5)
+ bne s1, t5, 3b
+ sb t8, -1(t5)
+ addiu t1, t1, 1
+ bne t1, s0, 1b
+ nop
+ addiu a1, a1, 4
+ bgez t9, 0b
+ addiu a3, a3, 1
+ b 7f
+ nop
+4:
+ addiu t9, t9, -1
+ bltz t9, 7f
+ li t1, 0
+5:
+ sll t3, t1, 2
+ lwx t5, t3(a2) /* t5 = outptr = output_buf[ci] */
+ lw t2, 0(a1) /* t2 = inptr = *input_buf */
+ sll t4, a3, 2
+ lwx t5, t4(t5) /* t5 = outptr = output_buf[ci][output_row] */
+ addu t2, t2, t1
+ addu s1, t5, a0
+ addu t6, t5, t0
+6:
+ lbu t3, 0(t2)
+ addu t4, t2, s0
+ addu t7, t4, s0
+ addu t8, t7, s0
+ addu t2, t8, s0
+ lbu t4, 0(t4)
+ lbu t7, 0(t7)
+ lbu t8, 0(t8)
+ addiu t5, t5, 4
+ sb t3, -4(t5)
+ sb t4, -3(t5)
+ sb t7, -2(t5)
+ bne s1, t5, 6b
+ sb t8, -1(t5)
+ addiu t1, t1, 1
+ bne t1, s0, 5b
+ nop
+ addiu a1, a1, 4
+ bgez t9, 4b
+ addiu a3, a3, 1
+7:
+ RESTORE_REGS_FROM_STACK 8, s0, s1
+
+ j ra
+ nop
+
+END(jsimd_c_null_convert_dspr2)
+
+
+/*****************************************************************************/
+/*
+ * jsimd_extrgb_ycc_convert_dspr2
+ * jsimd_extbgr_ycc_convert_dspr2
+ * jsimd_extrgbx_ycc_convert_dspr2
+ * jsimd_extbgrx_ycc_convert_dspr2
+ * jsimd_extxbgr_ycc_convert_dspr2
+ * jsimd_extxrgb_ycc_convert_dspr2
+ *
+ * Colorspace conversion RGB -> YCbCr
+ */
+
+.macro GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 colorid, pixel_size, \
+ r_offs, g_offs, b_offs
+
+.macro DO_RGB_TO_YCC r, g, b, inptr
+ lbu \r, \r_offs(\inptr)
+ lbu \g, \g_offs(\inptr)
+ lbu \b, \b_offs(\inptr)
+ addiu \inptr, \pixel_size
+.endm
+
+LEAF_DSPR2(jsimd_\colorid\()_ycc_convert_dspr2)
+/*
+ * a0 = cinfo->image_width
+ * a1 = input_buf
+ * a2 = output_buf
+ * a3 = output_row
+ * 16(sp) = num_rows
+ */
+ SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ lw t7, 48(sp) /* t7 = num_rows */
+ li s0, 0x4c8b /* FIX(0.29900) */
+ li s1, 0x9646 /* FIX(0.58700) */
+ li s2, 0x1d2f /* FIX(0.11400) */
+ li s3, 0xffffd4cd /* -FIX(0.16874) */
+ li s4, 0xffffab33 /* -FIX(0.33126) */
+ li s5, 0x8000 /* FIX(0.50000) */
+ li s6, 0xffff94d1 /* -FIX(0.41869) */
+ li s7, 0xffffeb2f /* -FIX(0.08131) */
+ li t8, 0x807fff /* CBCR_OFFSET + ONE_HALF-1 */
+
+0:
+ addiu t7, -1 /* --num_rows */
+ lw t6, 0(a1) /* t6 = input_buf[0] */
+ lw t0, 0(a2)
+ lw t1, 4(a2)
+ lw t2, 8(a2)
+ sll t3, a3, 2
+ lwx t0, t3(t0) /* t0 = output_buf[0][output_row] */
+ lwx t1, t3(t1) /* t1 = output_buf[1][output_row] */
+ lwx t2, t3(t2) /* t2 = output_buf[2][output_row] */
+
+ addu t9, t2, a0 /* t9 = end address */
+ addiu a3, 1
+
+1:
+ DO_RGB_TO_YCC t3, t4, t5, t6
+
+ mtlo s5, $ac0
+ mtlo t8, $ac1
+ mtlo t8, $ac2
+ maddu $ac0, s2, t5
+ maddu $ac1, s5, t5
+ maddu $ac2, s5, t3
+ maddu $ac0, s0, t3
+ maddu $ac1, s3, t3
+ maddu $ac2, s6, t4
+ maddu $ac0, s1, t4
+ maddu $ac1, s4, t4
+ maddu $ac2, s7, t5
+ extr.w t3, $ac0, 16
+ extr.w t4, $ac1, 16
+ extr.w t5, $ac2, 16
+ sb t3, 0(t0)
+ sb t4, 0(t1)
+ sb t5, 0(t2)
+ addiu t0, 1
+ addiu t2, 1
+ bne t2, t9, 1b
+ addiu t1, 1
+ bgtz t7, 0b
+ addiu a1, 4
+
+ RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ j ra
+ nop
+END(jsimd_\colorid\()_ycc_convert_dspr2)
+
+.purgem DO_RGB_TO_YCC
+
+.endm
+
+/*-------------------------------------id -- pix R G B */
+GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extrgb, 3, 0, 1, 2
+GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extbgr, 3, 2, 1, 0
+GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extrgbx, 4, 0, 1, 2
+GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extbgrx, 4, 2, 1, 0
+GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extxbgr, 4, 3, 2, 1
+GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extxrgb, 4, 1, 2, 3
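+
+/* Worked example for the Q16 constants loaded by the macro above
+ * (SCALEBITS = 16, so 0x4c8b = round(0.29900 * 65536), etc.):
+ *
+ *   Y = (0x8000 + 0x4c8b*R + 0x9646*G + 0x1d2f*B) >> 16
+ *
+ * which is exactly what the mtlo/maddu/extr.w 16 sequence on $ac0
+ * computes per pixel; $ac1/$ac2 do the same for Cb and Cr, with the
+ * 0x807fff rounding constant folding in the +128 chroma offset. */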
+
+
+/*****************************************************************************/
+/*
+ * jsimd_ycc_extrgb_convert_dspr2
+ * jsimd_ycc_extbgr_convert_dspr2
+ * jsimd_ycc_extrgbx_convert_dspr2
+ * jsimd_ycc_extbgrx_convert_dspr2
+ * jsimd_ycc_extxbgr_convert_dspr2
+ * jsimd_ycc_extxrgb_convert_dspr2
+ *
+ * Colorspace conversion YCbCr -> RGB
+ */
+
+.macro GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 colorid, pixel_size, \
+ r_offs, g_offs, b_offs, a_offs
+
+.macro STORE_YCC_TO_RGB scratch0 scratch1 scratch2 outptr
+ sb \scratch0, \r_offs(\outptr)
+ sb \scratch1, \g_offs(\outptr)
+ sb \scratch2, \b_offs(\outptr)
+.if (\pixel_size == 4)
+ li t0, 0xFF
+ sb t0, \a_offs(\outptr)
+.endif
+ addiu \outptr, \pixel_size
+.endm
+
+LEAF_DSPR2(jsimd_ycc_\colorid\()_convert_dspr2)
+/*
+ * a0 = cinfo->image_width
+ * a1 = input_buf
+ * a2 = input_row
+ * a3 = output_buf
+ * 16(sp) = num_rows
+ */
+ SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ lw s1, 48(sp)
+ li t3, 0x8000
+ li t4, 0x166e9 /* FIX(1.40200) */
+ li t5, 0x1c5a2 /* FIX(1.77200) */
+ li t6, 0xffff492e /* -FIX(0.71414) */
+ li t7, 0xffffa7e6 /* -FIX(0.34414) */
+ repl.ph t8, 128
+
+0:
+ lw s0, 0(a3)
+ lw t0, 0(a1)
+ lw t1, 4(a1)
+ lw t2, 8(a1)
+ sll s5, a2, 2
+ addiu s1, -1
+ lwx s2, s5(t0)
+ lwx s3, s5(t1)
+ lwx s4, s5(t2)
+ addu t9, s2, a0
+ addiu a2, 1
+
+1:
+ lbu s7, 0(s4) /* cr */
+ lbu s6, 0(s3) /* cb */
+ lbu s5, 0(s2) /* y */
+ addiu s2, 1
+ addiu s4, 1
+ addiu s7, -128
+ addiu s6, -128
+ mul t2, t7, s6
+ mul t0, t6, s7 /* Crgtab[cr] */
+ sll s7, 15
+ mulq_rs.w t1, t4, s7 /* Crrtab[cr] */
+ sll s6, 15
+ addu t2, t3 /* Cbgtab[cb] */
+ addu t2, t0
+
+ mulq_rs.w t0, t5, s6 /* Cbbtab[cb] */
+ sra t2, 16
+ addu t1, s5
+ addu t2, s5 /* add y */
+ ins t2, t1, 16, 16
+ subu.ph t2, t2, t8
+ addu t0, s5
+ shll_s.ph t2, t2, 8
+ subu t0, 128
+ shra.ph t2, t2, 8
+ shll_s.w t0, t0, 24
+ addu.ph t2, t2, t8 /* clip & store */
+ sra t0, t0, 24
+ sra t1, t2, 16
+ addiu t0, 128
+
+ STORE_YCC_TO_RGB t1, t2, t0, s0
+
+ bne s2, t9, 1b
+ addiu s3, 1
+ bgtz s1, 0b
+ addiu a3, 4
+
+ RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ j ra
+ nop
+END(jsimd_ycc_\colorid\()_convert_dspr2)
+
+.purgem STORE_YCC_TO_RGB
+
+.endm
+
+/*-------------------------------------id -- pix R G B A */
+GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extrgb, 3, 0, 1, 2, 3
+GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extbgr, 3, 2, 1, 0, 3
+GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extrgbx, 4, 0, 1, 2, 3
+GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extbgrx, 4, 2, 1, 0, 3
+GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extxbgr, 4, 3, 2, 1, 0
+GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extxrgb, 4, 1, 2, 3, 0
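+
+/* Worked example for the constants above: 0x166e9 = round(1.40200 * 65536)
+ * = FIX(1.40200).  The chroma sample is pre-shifted left by 15, so
+ *
+ *   mulq_rs.w t1, t4, s7  ~=  (0x166e9 * (cr << 15)) >> 31  =  1.402 * cr
+ *
+ * (up to the instruction's rounding/saturation), i.e. the Crrtab[cr] value
+ * from the scalar jdcolor.c code, computed on the fly instead of via a
+ * lookup table. */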
+
+
+/*****************************************************************************/
+/*
+ * jsimd_extrgb_gray_convert_dspr2
+ * jsimd_extbgr_gray_convert_dspr2
+ * jsimd_extrgbx_gray_convert_dspr2
+ * jsimd_extbgrx_gray_convert_dspr2
+ * jsimd_extxbgr_gray_convert_dspr2
+ * jsimd_extxrgb_gray_convert_dspr2
+ *
+ * Colorspace conversion RGB -> GRAY
+ */
+
+.macro GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 colorid, pixel_size, \
+ r_offs, g_offs, b_offs
+
+.macro DO_RGB_TO_GRAY r, g, b, inptr
+ lbu \r, \r_offs(\inptr)
+ lbu \g, \g_offs(\inptr)
+ lbu \b, \b_offs(\inptr)
+ addiu \inptr, \pixel_size
+.endm
+
+LEAF_DSPR2(jsimd_\colorid\()_gray_convert_dspr2)
+/*
+ * a0 = cinfo->image_width
+ * a1 = input_buf
+ * a2 = output_buf
+ * a3 = output_row
+ * 16(sp) = num_rows
+ */
+ SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ li s0, 0x4c8b /* s0 = FIX(0.29900) */
+ li s1, 0x9646 /* s1 = FIX(0.58700) */
+ li s2, 0x1d2f /* s2 = FIX(0.11400) */
+ li s7, 0x8000 /* s7 = FIX(0.50000) */
+ lw s6, 48(sp)
+ andi t7, a0, 3
+
+0:
+ addiu s6, -1 /* s6 = num_rows */
+ lw t0, 0(a1)
+ lw t1, 0(a2)
+ sll t3, a3, 2
+ lwx t1, t3(t1)
+ addiu a3, 1
+ addu t9, t1, a0
+ subu t8, t9, t7
+ beq t1, t8, 2f
+ nop
+
+1:
+ DO_RGB_TO_GRAY t3, t4, t5, t0
+ DO_RGB_TO_GRAY s3, s4, s5, t0
+
+ mtlo s7, $ac0
+ maddu $ac0, s2, t5
+ maddu $ac0, s1, t4
+ maddu $ac0, s0, t3
+ mtlo s7, $ac1
+ maddu $ac1, s2, s5
+ maddu $ac1, s1, s4
+ maddu $ac1, s0, s3
+ extr.w t6, $ac0, 16
+
+ DO_RGB_TO_GRAY t3, t4, t5, t0
+ DO_RGB_TO_GRAY s3, s4, s5, t0
+
+ mtlo s7, $ac0
+ maddu $ac0, s2, t5
+ maddu $ac0, s1, t4
+ extr.w t2, $ac1, 16
+ maddu $ac0, s0, t3
+ mtlo s7, $ac1
+ maddu $ac1, s2, s5
+ maddu $ac1, s1, s4
+ maddu $ac1, s0, s3
+ extr.w t5, $ac0, 16
+ sb t6, 0(t1)
+ sb t2, 1(t1)
+ extr.w t3, $ac1, 16
+ addiu t1, 4
+ sb t5, -2(t1)
+ sb t3, -1(t1)
+ bne t1, t8, 1b
+ nop
+
+2:
+ beqz t7, 4f
+ nop
+
+3:
+ DO_RGB_TO_GRAY t3, t4, t5, t0
+
+ mtlo s7, $ac0
+ maddu $ac0, s2, t5
+ maddu $ac0, s1, t4
+ maddu $ac0, s0, t3
+ extr.w t6, $ac0, 16
+ sb t6, 0(t1)
+ addiu t1, 1
+ bne t1, t9, 3b
+ nop
+
+4:
+ bgtz s6, 0b
+ addiu a1, 4
+
+ RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ j ra
+ nop
+END(jsimd_\colorid\()_gray_convert_dspr2)
+
+.purgem DO_RGB_TO_GRAY
+
+.endm
+
+/*-------------------------------------id -- pix R G B */
+GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extrgb, 3, 0, 1, 2
+GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extbgr, 3, 2, 1, 0
+GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extrgbx, 4, 0, 1, 2
+GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extbgrx, 4, 2, 1, 0
+GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extxbgr, 4, 3, 2, 1
+GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extxrgb, 4, 1, 2, 3
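+
+/* The gray path reuses the Y constants from the RGB -> YCC transform and
+ * unrolls four pixels per iteration, interleaving the $ac0 and $ac1
+ * accumulators (presumably to hide the multiply-accumulate latency). */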
+
+
+/*****************************************************************************/
+/*
+ * jsimd_h2v2_merged_upsample_dspr2
+ * jsimd_h2v2_extrgb_merged_upsample_dspr2
+ * jsimd_h2v2_extrgbx_merged_upsample_dspr2
+ * jsimd_h2v2_extbgr_merged_upsample_dspr2
+ * jsimd_h2v2_extbgrx_merged_upsample_dspr2
+ * jsimd_h2v2_extxbgr_merged_upsample_dspr2
+ * jsimd_h2v2_extxrgb_merged_upsample_dspr2
+ *
+ * Merged h2v2 upsample routines
+ */
+.macro GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 colorid, pixel_size, \
+ r1_offs, g1_offs, \
+ b1_offs, a1_offs, \
+ r2_offs, g2_offs, \
+ b2_offs, a2_offs
+
+.macro STORE_H2V2_2_PIXELS scratch0 scratch1 scratch2 scratch3 scratch4 \
+ scratch5 outptr
+ sb \scratch0, \r1_offs(\outptr)
+ sb \scratch1, \g1_offs(\outptr)
+ sb \scratch2, \b1_offs(\outptr)
+ sb \scratch3, \r2_offs(\outptr)
+ sb \scratch4, \g2_offs(\outptr)
+ sb \scratch5, \b2_offs(\outptr)
+.if (\pixel_size == 8)
+ li \scratch0, 0xFF
+ sb \scratch0, \a1_offs(\outptr)
+ sb \scratch0, \a2_offs(\outptr)
+.endif
+ addiu \outptr, \pixel_size
+.endm
+
+.macro STORE_H2V2_1_PIXEL scratch0 scratch1 scratch2 outptr
+ sb \scratch0, \r1_offs(\outptr)
+ sb \scratch1, \g1_offs(\outptr)
+ sb \scratch2, \b1_offs(\outptr)
+
+.if (\pixel_size == 8)
+ li t0, 0xFF
+ sb t0, \a1_offs(\outptr)
+.endif
+.endm
+
+LEAF_DSPR2(jsimd_h2v2_\colorid\()_merged_upsample_dspr2)
+/*
+ * a0 = cinfo->output_width
+ * a1 = input_buf
+ * a2 = in_row_group_ctr
+ * a3 = output_buf
+ * 16(sp) = cinfo->sample_range_limit
+ */
+ SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
+
+ lw t9, 56(sp) /* cinfo->sample_range_limit */
+ lw v0, 0(a1)
+ lw v1, 4(a1)
+ lw t0, 8(a1)
+ sll t1, a2, 3
+ addiu t2, t1, 4
+ sll t3, a2, 2
+ lw t4, 0(a3) /* t4 = output_buf[0] */
+ lwx t1, t1(v0) /* t1 = input_buf[0][in_row_group_ctr*2] */
+ lwx t2, t2(v0) /* t2 = input_buf[0][in_row_group_ctr*2 + 1] */
+ lwx t5, t3(v1) /* t5 = input_buf[1][in_row_group_ctr] */
+ lwx t6, t3(t0) /* t6 = input_buf[2][in_row_group_ctr] */
+ lw t7, 4(a3) /* t7 = output_buf[1] */
+ li s1, 0xe6ea
+ addiu t8, s1, 0x7fff /* t8 = 0x166e9 [FIX(1.40200)] */
+ addiu s0, t8, 0x5eb9 /* s0 = 0x1c5a2 [FIX(1.77200)] */
+ addiu s1, zero, 0xa7e6 /* s1 = 0xffffa7e6 [-FIX(0.34414)] */
+ xori s2, s1, 0xeec8 /* s2 = 0xffff492e [-FIX(0.71414)] */
+ srl t3, a0, 1
+ blez t3, 2f
+ addu t0, t5, t3 /* t0 = end address */
+1:
+ lbu t3, 0(t5)
+ lbu s3, 0(t6)
+ addiu t5, t5, 1
+ addiu t3, t3, -128 /* (cb - 128) */
+ addiu s3, s3, -128 /* (cr - 128) */
+ mult $ac1, s1, t3
+ madd $ac1, s2, s3
+ sll s3, s3, 15
+ sll t3, t3, 15
+ mulq_rs.w s4, t8, s3 /* s4 = (C1 * cr + ONE_HALF)>> SCALEBITS */
+ extr_r.w s5, $ac1, 16
+ mulq_rs.w s6, s0, t3 /* s6 = (C2 * cb + ONE_HALF)>> SCALEBITS */
+ lbu v0, 0(t1)
+ addiu t6, t6, 1
+ addiu t1, t1, 2
+ addu t3, v0, s4 /* y+cred */
+ addu s3, v0, s5 /* y+cgreen */
+ addu v1, v0, s6 /* y+cblue */
+ addu t3, t9, t3 /* y+cred */
+ addu s3, t9, s3 /* y+cgreen */
+ addu v1, t9, v1 /* y+cblue */
+ lbu AT, 0(t3)
+ lbu s7, 0(s3)
+ lbu ra, 0(v1)
+ lbu v0, -1(t1)
+ addu t3, v0, s4 /* y+cred */
+ addu s3, v0, s5 /* y+cgreen */
+ addu v1, v0, s6 /* y+cblue */
+ addu t3, t9, t3 /* y+cred */
+ addu s3, t9, s3 /* y+cgreen */
+ addu v1, t9, v1 /* y+cblue */
+ lbu t3, 0(t3)
+ lbu s3, 0(s3)
+ lbu v1, 0(v1)
+ lbu v0, 0(t2)
+
+ STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t4
+
+ addu t3, v0, s4 /* y+cred */
+ addu s3, v0, s5 /* y+cgreen */
+ addu v1, v0, s6 /* y+cblue */
+ addu t3, t9, t3 /* y+cred */
+ addu s3, t9, s3 /* y+cgreen */
+ addu v1, t9, v1 /* y+cblue */
+ lbu AT, 0(t3)
+ lbu s7, 0(s3)
+ lbu ra, 0(v1)
+ lbu v0, 1(t2)
+ addiu t2, t2, 2
+ addu t3, v0, s4 /* y+cred */
+ addu s3, v0, s5 /* y+cgreen */
+ addu v1, v0, s6 /* y+cblue */
+ addu t3, t9, t3 /* y+cred */
+ addu s3, t9, s3 /* y+cgreen */
+ addu v1, t9, v1 /* y+cblue */
+ lbu t3, 0(t3)
+ lbu s3, 0(s3)
+ lbu v1, 0(v1)
+
+ STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t7
+
+ bne t0, t5, 1b
+ nop
+2:
+ andi t0, a0, 1
+ beqz t0, 4f
+ lbu t3, 0(t5)
+ lbu s3, 0(t6)
+ addiu t3, t3, -128 /* (cb - 128) */
+ addiu s3, s3, -128 /* (cr - 128) */
+ mult $ac1, s1, t3
+ madd $ac1, s2, s3
+ sll s3, s3, 15
+ sll t3, t3, 15
+ lbu v0, 0(t1)
+ extr_r.w s5, $ac1, 16
+ mulq_rs.w s4, t8, s3 /* s4 = (C1 * cr + ONE_HALF)>> SCALEBITS */
+ mulq_rs.w s6, s0, t3 /* s6 = (C2 * cb + ONE_HALF)>> SCALEBITS */
+ addu t3, v0, s4 /* y+cred */
+ addu s3, v0, s5 /* y+cgreen */
+ addu v1, v0, s6 /* y+cblue */
+ addu t3, t9, t3 /* y+cred */
+ addu s3, t9, s3 /* y+cgreen */
+ addu v1, t9, v1 /* y+cblue */
+ lbu t3, 0(t3)
+ lbu s3, 0(s3)
+ lbu v1, 0(v1)
+ lbu v0, 0(t2)
+
+ STORE_H2V2_1_PIXEL t3, s3, v1, t4
+
+ addu t3, v0, s4 /* y+cred */
+ addu s3, v0, s5 /* y+cgreen */
+ addu v1, v0, s6 /* y+cblue */
+ addu t3, t9, t3 /* y+cred */
+ addu s3, t9, s3 /* y+cgreen */
+ addu v1, t9, v1 /* y+cblue */
+ lbu t3, 0(t3)
+ lbu s3, 0(s3)
+ lbu v1, 0(v1)
+
+ STORE_H2V2_1_PIXEL t3, s3, v1, t7
+4:
+ RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
+
+ j ra
+ nop
+
+END(jsimd_h2v2_\colorid\()_merged_upsample_dspr2)
+
+.purgem STORE_H2V2_1_PIXEL
+.purgem STORE_H2V2_2_PIXELS
+.endm
+
+/*------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */
+GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extrgb, 6, 0, 1, 2, 6, 3, 4, 5, 6
+GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extbgr, 6, 2, 1, 0, 3, 5, 4, 3, 6
+GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7
+GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7
+GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4
+GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4
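+
+/* The constant setup in the macro above builds 32-bit values out of 16-bit
+ * immediates: s1 = 0xe6ea, then t8 = 0xe6ea + 0x7fff = 0x166e9
+ * [FIX(1.40200)] and s0 = 0x166e9 + 0x5eb9 = 0x1c5a2 [FIX(1.77200)],
+ * one addiu per constant instead of a lui/ori pair. */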
+
+
+/*****************************************************************************/
+/*
+ * jsimd_h2v1_merged_upsample_dspr2
+ * jsimd_h2v1_extrgb_merged_upsample_dspr2
+ * jsimd_h2v1_extrgbx_merged_upsample_dspr2
+ * jsimd_h2v1_extbgr_merged_upsample_dspr2
+ * jsimd_h2v1_extbgrx_merged_upsample_dspr2
+ * jsimd_h2v1_extxbgr_merged_upsample_dspr2
+ * jsimd_h2v1_extxrgb_merged_upsample_dspr2
+ *
+ * Merged h2v1 upsample routines
+ */
+
+.macro GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 colorid, pixel_size, \
+ r1_offs, g1_offs, \
+ b1_offs, a1_offs, \
+ r2_offs, g2_offs, \
+ b2_offs, a2_offs
+
+.macro STORE_H2V1_2_PIXELS scratch0 scratch1 scratch2 scratch3 scratch4 \
+ scratch5 outptr
+ sb \scratch0, \r1_offs(\outptr)
+ sb \scratch1, \g1_offs(\outptr)
+ sb \scratch2, \b1_offs(\outptr)
+ sb \scratch3, \r2_offs(\outptr)
+ sb \scratch4, \g2_offs(\outptr)
+ sb \scratch5, \b2_offs(\outptr)
+.if (\pixel_size == 8)
+ li t0, 0xFF
+ sb t0, \a1_offs(\outptr)
+ sb t0, \a2_offs(\outptr)
+.endif
+ addiu \outptr, \pixel_size
+.endm
+
+.macro STORE_H2V1_1_PIXEL scratch0 scratch1 scratch2 outptr
+ sb \scratch0, \r1_offs(\outptr)
+ sb \scratch1, \g1_offs(\outptr)
+ sb \scratch2, \b1_offs(\outptr)
+.if (\pixel_size == 8)
+ li t0, 0xFF
+ sb t0, \a1_offs(\outptr)
+.endif
+.endm
+
+LEAF_DSPR2(jsimd_h2v1_\colorid\()_merged_upsample_dspr2)
+/*
+ * a0 = cinfo->output_width
+ * a1 = input_buf
+ * a2 = in_row_group_ctr
+ * a3 = output_buf
+ * 16(sp) = range_limit
+ */
+ SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
+
+ li t0, 0xe6ea
+ lw t1, 0(a1) /* t1 = input_buf[0] */
+ lw t2, 4(a1) /* t2 = input_buf[1] */
+ lw t3, 8(a1) /* t3 = input_buf[2] */
+ lw t8, 56(sp) /* t8 = range_limit */
+ addiu s1, t0, 0x7fff /* s1 = 0x166e9 [FIX(1.40200)] */
+ addiu s2, s1, 0x5eb9 /* s2 = 0x1c5a2 [FIX(1.77200)] */
+ addiu s0, t0, 0x9916 /* s0 = 0x8000 */
+ addiu s4, zero, 0xa7e6 /* s4 = 0xffffa7e6 [-FIX(0.34414)] */
+ xori s3, s4, 0xeec8 /* s3 = 0xffff492e [-FIX(0.71414)] */
+ srl t0, a0, 1
+ sll t4, a2, 2
+ lwx s5, t4(t1) /* s5 = inptr0 */
+ lwx s6, t4(t2) /* s6 = inptr1 */
+ lwx s7, t4(t3) /* s7 = inptr2 */
+ lw t7, 0(a3) /* t7 = outptr */
+ blez t0, 2f
+ addu t9, s6, t0 /* t9 = end address */
+1:
+ lbu t2, 0(s6) /* t2 = cb */
+ lbu t0, 0(s7) /* t0 = cr */
+ lbu t1, 0(s5) /* t1 = y */
+ addiu t2, t2, -128 /* t2 = cb - 128 */
+ addiu t0, t0, -128 /* t0 = cr - 128 */
+ mult $ac1, s4, t2
+ madd $ac1, s3, t0
+ sll t0, t0, 15
+ sll t2, t2, 15
+ mulq_rs.w t0, s1, t0 /* t0 = (C1*cr + ONE_HALF)>> SCALEBITS */
+ extr_r.w t5, $ac1, 16
+ mulq_rs.w t6, s2, t2 /* t6 = (C2*cb + ONE_HALF)>> SCALEBITS */
+ addiu s7, s7, 1
+ addiu s6, s6, 1
+ addu t2, t1, t0 /* t2 = y + cred */
+ addu t3, t1, t5 /* t3 = y + cgreen */
+ addu t4, t1, t6 /* t4 = y + cblue */
+ addu t2, t8, t2
+ addu t3, t8, t3
+ addu t4, t8, t4
+ lbu t1, 1(s5)
+ lbu v0, 0(t2)
+ lbu v1, 0(t3)
+ lbu ra, 0(t4)
+ addu t2, t1, t0
+ addu t3, t1, t5
+ addu t4, t1, t6
+ addu t2, t8, t2
+ addu t3, t8, t3
+ addu t4, t8, t4
+ lbu t2, 0(t2)
+ lbu t3, 0(t3)
+ lbu t4, 0(t4)
+
+ STORE_H2V1_2_PIXELS v0, v1, ra, t2, t3, t4, t7
+
+ bne t9, s6, 1b
+ addiu s5, s5, 2
+2:
+ andi t0, a0, 1
+ beqz t0, 4f
+ nop
+3:
+ lbu t2, 0(s6)
+ lbu t0, 0(s7)
+ lbu t1, 0(s5)
+ addiu t2, t2, -128 /* (cb - 128) */
+ addiu t0, t0, -128 /* (cr - 128) */
+ mul t3, s4, t2
+ mul t4, s3, t0
+ sll t0, t0, 15
+ sll t2, t2, 15
+ mulq_rs.w t0, s1, t0 /* (C1*cr + ONE_HALF)>> SCALEBITS */
+ mulq_rs.w t6, s2, t2 /* (C2*cb + ONE_HALF)>> SCALEBITS */
+ addu t3, t3, s0
+ addu t3, t4, t3
+ sra t5, t3, 16 /* (C4*cb + ONE_HALF + C3*cr)>> SCALEBITS */
+ addu t2, t1, t0 /* y + cred */
+ addu t3, t1, t5 /* y + cgreen */
+ addu t4, t1, t6 /* y + cblue */
+ addu t2, t8, t2
+ addu t3, t8, t3
+ addu t4, t8, t4
+ lbu t2, 0(t2)
+ lbu t3, 0(t3)
+ lbu t4, 0(t4)
+
+ STORE_H2V1_1_PIXEL t2, t3, t4, t7
+4:
+ RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
+
+ j ra
+ nop
+
+END(jsimd_h2v1_\colorid\()_merged_upsample_dspr2)
+
+.purgem STORE_H2V1_1_PIXEL
+.purgem STORE_H2V1_2_PIXELS
+.endm
+
+/*------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */
+GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extrgb, 6, 0, 1, 2, 6, 3, 4, 5, 6
+GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extbgr, 6, 2, 1, 0, 3, 5, 4, 3, 6
+GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7
+GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7
+GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4
+GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4
+
+
+/*****************************************************************************/
+/*
+ * jsimd_h2v2_fancy_upsample_dspr2
+ *
+ * Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
+ */
+LEAF_DSPR2(jsimd_h2v2_fancy_upsample_dspr2)
+/*
+ * a0 = cinfo->max_v_samp_factor
+ * a1 = downsampled_width
+ * a2 = input_data
+ * a3 = output_data_ptr
+ */
+ SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5
+
+ li s4, 0
+ lw s2, 0(a3) /* s2 = *output_data_ptr */
+0:
+ li t9, 2
+ lw s1, -4(a2) /* s1 = inptr1 */
+
+1:
+ lw s0, 0(a2) /* s0 = inptr0 */
+ lwx s3, s4(s2)
+ addiu s5, a1, -2 /* s5 = downsampled_width - 2 */
+ srl t4, s5, 1
+ sll t4, t4, 1
+ lbu t0, 0(s0)
+ lbu t1, 1(s0)
+ lbu t2, 0(s1)
+ lbu t3, 1(s1)
+ addiu s0, 2
+ addiu s1, 2
+ addu t8, s0, t4 /* t8 = end address */
+ andi s5, s5, 1 /* s5 = residual */
+ sll t4, t0, 1
+ sll t6, t1, 1
+ addu t0, t0, t4 /* t0 = (*inptr0++) * 3 */
+ addu t1, t1, t6 /* t1 = (*inptr0++) * 3 */
+ addu t7, t0, t2 /* t7 = thiscolsum */
+ addu t6, t1, t3 /* t6 = nextcolsum */
+ sll t0, t7, 2 /* t0 = thiscolsum * 4 */
+ subu t1, t0, t7 /* t1 = thiscolsum * 3 */
+ shra_r.w t0, t0, 4
+ addiu t1, 7
+ addu t1, t1, t6
+ srl t1, t1, 4
+ sb t0, 0(s3)
+ sb t1, 1(s3)
+ beq t8, s0, 22f /* skip to final iteration if width == 3 */
+ addiu s3, 2
+2:
+ lh t0, 0(s0) /* t0 = A3|A2 */
+ lh t2, 0(s1) /* t2 = B3|B2 */
+ addiu s0, 2
+ addiu s1, 2
+ preceu.ph.qbr t0, t0 /* t0 = 0|A3|0|A2 */
+ preceu.ph.qbr t2, t2 /* t2 = 0|B3|0|B2 */
+ shll.ph t1, t0, 1
+ sll t3, t6, 1
+ addu.ph t0, t1, t0 /* t0 = A3*3|A2*3 */
+ addu t3, t3, t6 /* t3 = this * 3 */
+ addu.ph t0, t0, t2 /* t0 = next2|next1 */
+ addu t1, t3, t7
+ andi t7, t0, 0xFFFF /* t7 = next1 */
+ sll t2, t7, 1
+ addu t2, t7, t2 /* t2 = next1*3 */
+ addu t4, t2, t6
+ srl t6, t0, 16 /* t6 = next2 */
+ shra_r.w t1, t1, 4 /* t1 = (this*3 + last + 8) >> 4 */
+ addu t0, t3, t7
+ addiu t0, 7
+ srl t0, t0, 4 /* t0 = (this*3 + next1 + 7) >> 4 */
+ shra_r.w t4, t4, 4 /* t4 = (next1*3 + this + 8) >> 4 */
+ addu t2, t2, t6
+ addiu t2, 7
+ srl t2, t2, 4 /* t2 = (next1*3 + next2 + 7) >> 4 */
+ sb t1, 0(s3)
+ sb t0, 1(s3)
+ sb t4, 2(s3)
+ sb t2, 3(s3)
+ bne t8, s0, 2b
+ addiu s3, 4
+22:
+ beqz s5, 4f
+ addu t8, s0, s5
+3:
+ lbu t0, 0(s0)
+ lbu t2, 0(s1)
+ addiu s0, 1
+ addiu s1, 1
+ sll t3, t6, 1
+ sll t1, t0, 1
+ addu t1, t0, t1 /* t1 = inptr0 * 3 */
+ addu t3, t3, t6 /* t3 = thiscolsum * 3 */
+ addu t5, t1, t2
+ addu t1, t3, t7
+ shra_r.w t1, t1, 4
+ addu t0, t3, t5
+ addiu t0, 7
+ srl t0, t0, 4
+ sb t1, 0(s3)
+ sb t0, 1(s3)
+ addiu s3, 2
+ move t7, t6
+ bne t8, s0, 3b
+ move t6, t5
+4:
+ sll t0, t6, 2 /* t0 = thiscolsum * 4 */
+ subu t1, t0, t6 /* t1 = thiscolsum * 3 */
+ addu t1, t1, t7
+ addiu s4, 4
+ shra_r.w t1, t1, 4
+ addiu t0, 7
+ srl t0, t0, 4
+ sb t1, 0(s3)
+ sb t0, 1(s3)
+ addiu t9, -1
+ addiu s3, 2
+ bnez t9, 1b
+ lw s1, 4(a2)
+ srl t0, s4, 2
+ subu t0, a0, t0
+ bgtz t0, 0b
+ addiu a2, 4
+
+ RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5
+
+ j ra
+ nop
+END(jsimd_h2v2_fancy_upsample_dspr2)
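+
+/* Reference arithmetic for the kernel above (cf. h2v2_fancy_upsample in
+ * jdsample.c): with thiscolsum = 3*inptr0[i] + inptr1[i] per column,
+ *
+ *   out_even = (thiscolsum*3 + lastcolsum + 8) >> 4
+ *   out_odd  = (thiscolsum*3 + nextcolsum + 7) >> 4
+ *
+ * matching the shra_r.w (round by 8) and "addiu 7; srl 4" sequences. */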
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_h2v1_fancy_upsample_dspr2)
+/*
+ * a0 = cinfo->max_v_samp_factor
+ * a1 = downsampled_width
+ * a2 = input_data
+ * a3 = output_data_ptr
+ */
+ SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
+
+ .set at
+
+ beqz a0, 3f
+ sll t0, a0, 2
+ lw s1, 0(a3)
+ li s3, 0x10001
+ addu s0, s1, t0
+0:
+ addiu t8, a1, -2
+ srl t9, t8, 2
+ lw t7, 0(a2)
+ lw s2, 0(s1)
+ lbu t0, 0(t7)
+ lbu t1, 1(t7) /* t1 = inptr[1] */
+ sll t2, t0, 1
+ addu t2, t2, t0 /* t2 = invalue*3 */
+ addu t2, t2, t1
+ shra_r.w t2, t2, 2
+ sb t0, 0(s2)
+ sb t2, 1(s2)
+ beqz t9, 11f
+ addiu s2, 2
+1:
+ ulw t0, 0(t7) /* t0 = |P3|P2|P1|P0| */
+ ulw t1, 1(t7)
+ ulh t2, 4(t7) /* t2 = |0|0|P5|P4| */
+ preceu.ph.qbl t3, t0 /* t3 = |0|P3|0|P2| */
+ preceu.ph.qbr t0, t0 /* t0 = |0|P1|0|P0| */
+ preceu.ph.qbr t2, t2 /* t2 = |0|P5|0|P4| */
+ preceu.ph.qbl t4, t1 /* t4 = |0|P4|0|P3| */
+ preceu.ph.qbr t1, t1 /* t1 = |0|P2|0|P1| */
+ shll.ph t5, t4, 1
+ shll.ph t6, t1, 1
+ addu.ph t5, t5, t4 /* t5 = |P4*3|P3*3| */
+ addu.ph t6, t6, t1 /* t6 = |P2*3|P1*3| */
+ addu.ph t4, t3, s3
+ addu.ph t0, t0, s3
+ addu.ph t4, t4, t5
+ addu.ph t0, t0, t6
+ shrl.ph t4, t4, 2 /* t4 = |0|P3|0|P2| */
+ shrl.ph t0, t0, 2 /* t0 = |0|P1|0|P0| */
+ addu.ph t2, t2, t5
+ addu.ph t3, t3, t6
+ shra_r.ph t2, t2, 2 /* t2 = |0|P5|0|P4| */
+ shra_r.ph t3, t3, 2 /* t3 = |0|P3|0|P2| */
+ shll.ph t2, t2, 8
+ shll.ph t3, t3, 8
+ or t2, t4, t2
+ or t3, t3, t0
+ addiu t9, -1
+ usw t3, 0(s2)
+ usw t2, 4(s2)
+ addiu s2, 8
+ bgtz t9, 1b
+ addiu t7, 4
+11:
+ andi t8, 3
+ beqz t8, 22f
+ addiu t7, 1
+
+2:
+ lbu t0, 0(t7)
+ addiu t7, 1
+ sll t1, t0, 1
+ addu t2, t0, t1 /* t2 = invalue * 3 */
+ lbu t3, -2(t7)
+ lbu t4, 0(t7)
+ addiu t3, 1
+ addiu t4, 2
+ addu t3, t3, t2
+ addu t4, t4, t2
+ srl t3, 2
+ srl t4, 2
+ sb t3, 0(s2)
+ sb t4, 1(s2)
+ addiu t8, -1
+ bgtz t8, 2b
+ addiu s2, 2
+
+22:
+ lbu t0, 0(t7)
+ lbu t2, -1(t7)
+ sll t1, t0, 1
+ addu t1, t1, t0 /* t1 = invalue * 3 */
+ addu t1, t1, t2
+ addiu t1, 1
+ srl t1, t1, 2
+ sb t1, 0(s2)
+ sb t0, 1(s2)
+ addiu s1, 4
+ bne s1, s0, 0b
+ addiu a2, 4
+3:
+ RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
+
+ j ra
+ nop
+END(jsimd_h2v1_fancy_upsample_dspr2)
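+
+/* Reference arithmetic (cf. h2v1_fancy_upsample in jdsample.c): each input
+ * pixel expands to
+ *
+ *   out_left  = (invalue*3 + inptr[-1] + 1) >> 2
+ *   out_right = (invalue*3 + inptr[1]  + 2) >> 2
+ *
+ * with the first and last columns replicated at the image edges. */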
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_h2v1_downsample_dspr2)
+/*
+ * a0 = cinfo->image_width
+ * a1 = cinfo->max_v_samp_factor
+ * a2 = compptr->v_samp_factor
+ * a3 = compptr->width_in_blocks
+ * 16(sp) = input_data
+ * 20(sp) = output_data
+ */
+ .set at
+
+ SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4
+
+ beqz a2, 7f
+ lw s1, 44(sp) /* s1 = output_data */
+ lw s0, 40(sp) /* s0 = input_data */
+ srl s2, a0, 2
+ andi t9, a0, 2
+ srl t7, t9, 1
+ addu s2, t7, s2
+ sll t0, a3, 3 /* t0 = width_in_blocks*DCT */
+ srl t7, t0, 1
+ subu s2, t7, s2
+0:
+ andi t6, a0, 1 /* t6 = temp_index */
+ addiu t6, -1
+ lw t4, 0(s1) /* t4 = outptr */
+ lw t5, 0(s0) /* t5 = inptr0 */
+ li s3, 0 /* s3 = bias */
+ srl t7, a0, 1 /* t7 = image_width1 */
+ srl s4, t7, 2
+ andi t8, t7, 3
+1:
+ ulhu t0, 0(t5)
+ ulhu t1, 2(t5)
+ ulhu t2, 4(t5)
+ ulhu t3, 6(t5)
+ raddu.w.qb t0, t0
+ raddu.w.qb t1, t1
+ raddu.w.qb t2, t2
+ raddu.w.qb t3, t3
+ shra.ph t0, t0, 1
+ shra_r.ph t1, t1, 1
+ shra.ph t2, t2, 1
+ shra_r.ph t3, t3, 1
+ sb t0, 0(t4)
+ sb t1, 1(t4)
+ sb t2, 2(t4)
+ sb t3, 3(t4)
+ addiu s4, -1
+ addiu t4, 4
+ bgtz s4, 1b
+ addiu t5, 8
+ beqz t8, 3f
+ addu s4, t4, t8
+2:
+ ulhu t0, 0(t5)
+ raddu.w.qb t0, t0
+ addqh.w t0, t0, s3
+ xori s3, s3, 1
+ sb t0, 0(t4)
+ addiu t4, 1
+ bne t4, s4, 2b
+ addiu t5, 2
+3:
+ lbux t1, t6(t5)
+ sll t1, 1
+ addqh.w t2, t1, s3 /* t2 = pixval1 */
+ xori s3, s3, 1
+ addqh.w t3, t1, s3 /* t3 = pixval2 */
+ blez s2, 5f
+ append t3, t2, 8
+ addu t5, t4, s2 /* t5 = loop_end2 */
+4:
+ ush t3, 0(t4)
+ addiu s2, -1
+ bgtz s2, 4b
+ addiu t4, 2
+5:
+ beqz t9, 6f
+ nop
+ sb t2, 0(t4)
+6:
+ addiu s1, 4
+ addiu a2, -1
+ bnez a2, 0b
+ addiu s0, 4
+7:
+ RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4
+
+ j ra
+ nop
+END(jsimd_h2v1_downsample_dspr2)
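+
+/* The kernel above implements jcsample.c's h2v1 rule
+ *
+ *   out = (in0 + in1 + bias) >> 1,  bias alternating 0,1
+ *
+ * by alternating plain (shra.ph) and rounding (shra_r.ph) shifts in the
+ * unrolled body and by xori-toggling the s3 bias in the residual loop. */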
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_h2v2_downsample_dspr2)
+/*
+ * a0 = cinfo->image_width
+ * a1 = cinfo->max_v_samp_factor
+ * a2 = compptr->v_samp_factor
+ * a3 = compptr->width_in_blocks
+ * 16(sp) = input_data
+ * 20(sp) = output_data
+ */
+ .set at
+
+ SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ beqz a2, 8f
+ lw s1, 52(sp) /* s1 = output_data */
+ lw s0, 48(sp) /* s0 = input_data */
+
+ andi t6, a0, 1 /* t6 = temp_index */
+ addiu t6, -1
+ srl t7, a0, 1 /* t7 = image_width1 */
+ srl s4, t7, 2
+ andi t8, t7, 3
+ andi t9, a0, 2
+ srl s2, a0, 2
+ srl t7, t9, 1
+ addu s2, t7, s2
+ sll t0, a3, 3 /* t0 = width_in_blocks*DCT */
+ srl t7, t0, 1
+ subu s2, t7, s2
+0:
+ lw t4, 0(s1) /* t4 = outptr */
+ lw t5, 0(s0) /* t5 = inptr0 */
+ lw s7, 4(s0) /* s7 = inptr1 */
+ li s6, 1 /* s6 = bias */
+2:
+ ulw t0, 0(t5) /* t0 = |P3|P2|P1|P0| */
+ ulw t1, 0(s7) /* t1 = |Q3|Q2|Q1|Q0| */
+ ulw t2, 4(t5)
+ ulw t3, 4(s7)
+ precrq.ph.w t7, t0, t1 /* t7 = |P3|P2|Q3|Q2| */
+ ins t0, t1, 16, 16 /* t0 = |Q1|Q0|P1|P0| */
+ raddu.w.qb t1, t7
+ raddu.w.qb t0, t0
+ shra_r.w t1, t1, 2
+ addiu t0, 1
+ srl t0, 2
+ precrq.ph.w t7, t2, t3
+ ins t2, t3, 16, 16
+ raddu.w.qb t7, t7
+ raddu.w.qb t2, t2
+ shra_r.w t7, t7, 2
+ addiu t2, 1
+ srl t2, 2
+ sb t0, 0(t4)
+ sb t1, 1(t4)
+ sb t2, 2(t4)
+ sb t7, 3(t4)
+ addiu t4, 4
+ addiu t5, 8
+ addiu s4, s4, -1
+ bgtz s4, 2b
+ addiu s7, 8
+ beqz t8, 4f
+ addu t8, t4, t8
+3:
+ ulhu t0, 0(t5)
+ ulhu t1, 0(s7)
+ ins t0, t1, 16, 16
+ raddu.w.qb t0, t0
+ addu t0, t0, s6
+ srl t0, 2
+ xori s6, s6, 3
+ sb t0, 0(t4)
+ addiu t5, 2
+ addiu t4, 1
+ bne t8, t4, 3b
+ addiu s7, 2
+4:
+ lbux t1, t6(t5)
+ sll t1, 1
+ lbux t0, t6(s7)
+ sll t0, 1
+ addu t1, t1, t0
+ addu t3, t1, s6
+ srl t0, t3, 2 /* t0 = pixval1 */
+ xori s6, s6, 3
+ addu t2, t1, s6
+ srl t1, t2, 2 /* t1 = pixval2 */
+ blez s2, 6f
+ append t1, t0, 8
+5:
+ ush t1, 0(t4)
+ addiu s2, -1
+ bgtz s2, 5b
+ addiu t4, 2
+6:
+ beqz t9, 7f
+ nop
+ sb t0, 0(t4)
+7:
+ addiu s1, 4
+ addiu a2, -1
+ bnez a2, 0b
+ addiu s0, 8
+8:
+ RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ j ra
+ nop
+END(jsimd_h2v2_downsample_dspr2)
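+
+/* Likewise, the h2v2 kernel implements jcsample.c's
+ *
+ *   out = (in00 + in01 + in10 + in11 + bias) >> 2,  bias alternating 1,2
+ *
+ * pairing "addiu 1; srl 2" (bias 1) with shra_r.w (bias 2) in the unrolled
+ * body and xori-toggling s6 between 1 and 2 in the residual loop. */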
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_h2v2_smooth_downsample_dspr2)
+/*
+ * a0 = input_data
+ * a1 = output_data
+ * a2 = compptr->v_samp_factor
+ * a3 = cinfo->max_v_samp_factor
+ * 16(sp) = cinfo->smoothing_factor
+ * 20(sp) = compptr->width_in_blocks
+ * 24(sp) = cinfo->image_width
+ */
+ .set at
+
+ SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ lw s7, 52(sp) /* compptr->width_in_blocks */
+ lw s0, 56(sp) /* cinfo->image_width */
+ lw s6, 48(sp) /* cinfo->smoothing_factor */
+ sll s7, 3 /* output_cols = width_in_blocks * DCTSIZE */
+ sll v0, s7, 1
+ subu v0, v0, s0
+ blez v0, 2f
+ move v1, zero
+ addiu t0, a3, 2 /* t0 = cinfo->max_v_samp_factor + 2 */
+0:
+ addiu t1, a0, -4
+ sll t2, v1, 2
+ lwx t1, t2(t1)
+ move t3, v0
+ addu t1, t1, s0
+ lbu t2, -1(t1)
+1:
+ addiu t3, t3, -1
+ sb t2, 0(t1)
+ bgtz t3, 1b
+ addiu t1, t1, 1
+ addiu v1, v1, 1
+ bne v1, t0, 0b
+ nop
+2:
+ li v0, 80
+ mul v0, s6, v0
+ li v1, 16384
+ move t4, zero
+ move t5, zero
+ subu t6, v1, v0 /* t6 = 16384 - smoothing_factor * 80 */
+ sll t7, s6, 4 /* t7 = smoothing_factor * 16 */
+3:
+/* Special case for first column: pretend column -1 is same as column 0 */
+ sll v0, t4, 2
+ lwx t8, v0(a1) /* outptr = output_data[outrow] */
+ sll v1, t5, 2
+ addiu t9, v1, 4
+ addiu s0, v1, -4
+ addiu s1, v1, 8
+ lwx s2, v1(a0) /* inptr0 = input_data[inrow] */
+ lwx t9, t9(a0) /* inptr1 = input_data[inrow+1] */
+ lwx s0, s0(a0) /* above_ptr = input_data[inrow-1] */
+ lwx s1, s1(a0) /* below_ptr = input_data[inrow+2] */
+ lh v0, 0(s2)
+ lh v1, 0(t9)
+ lh t0, 0(s0)
+ lh t1, 0(s1)
+ ins v0, v1, 16, 16
+ ins t0, t1, 16, 16
+ raddu.w.qb t2, v0
+ raddu.w.qb s3, t0
+ lbu v0, 0(s2)
+ lbu v1, 2(s2)
+ lbu t0, 0(t9)
+ lbu t1, 2(t9)
+ addu v0, v0, v1
+ mult $ac1, t2, t6
+ addu t0, t0, t1
+ lbu t2, 2(s0)
+ addu t0, t0, v0
+ lbu t3, 2(s1)
+ addu s3, t0, s3
+ lbu v0, 0(s0)
+ lbu t0, 0(s1)
+ sll s3, s3, 1
+ addu v0, v0, t2
+ addu t0, t0, t3
+ addu t0, t0, v0
+ addu s3, t0, s3
+ madd $ac1, s3, t7
+ extr_r.w v0, $ac1, 16
+ addiu t8, t8, 1
+ addiu s2, s2, 2
+ addiu t9, t9, 2
+ addiu s0, s0, 2
+ addiu s1, s1, 2
+ sb v0, -1(t8)
+ addiu s4, s7, -2
+ and s4, s4, 3
+ addu s5, s4, t8 /* end address */
+4:
+ lh v0, 0(s2)
+ lh v1, 0(t9)
+ lh t0, 0(s0)
+ lh t1, 0(s1)
+ ins v0, v1, 16, 16
+ ins t0, t1, 16, 16
+ raddu.w.qb t2, v0
+ raddu.w.qb s3, t0
+ lbu v0, -1(s2)
+ lbu v1, 2(s2)
+ lbu t0, -1(t9)
+ lbu t1, 2(t9)
+ addu v0, v0, v1
+ mult $ac1, t2, t6
+ addu t0, t0, t1
+ lbu t2, 2(s0)
+ addu t0, t0, v0
+ lbu t3, 2(s1)
+ addu s3, t0, s3
+ lbu v0, -1(s0)
+ lbu t0, -1(s1)
+ sll s3, s3, 1
+ addu v0, v0, t2
+ addu t0, t0, t3
+ addu t0, t0, v0
+ addu s3, t0, s3
+ madd $ac1, s3, t7
+ extr_r.w t2, $ac1, 16
+ addiu t8, t8, 1
+ addiu s2, s2, 2
+ addiu t9, t9, 2
+ addiu s0, s0, 2
+ sb t2, -1(t8)
+ bne s5, t8, 4b
+ addiu s1, s1, 2
+ addiu s5, s7, -2
+ subu s5, s5, s4
+ addu s5, s5, t8 /* end address */
+5:
+ lh v0, 0(s2)
+ lh v1, 0(t9)
+ lh t0, 0(s0)
+ lh t1, 0(s1)
+ ins v0, v1, 16, 16
+ ins t0, t1, 16, 16
+ raddu.w.qb t2, v0
+ raddu.w.qb s3, t0
+ lbu v0, -1(s2)
+ lbu v1, 2(s2)
+ lbu t0, -1(t9)
+ lbu t1, 2(t9)
+ addu v0, v0, v1
+ mult $ac1, t2, t6
+ addu t0, t0, t1
+ lbu t2, 2(s0)
+ addu t0, t0, v0
+ lbu t3, 2(s1)
+ addu s3, t0, s3
+ lbu v0, -1(s0)
+ lbu t0, -1(s1)
+ sll s3, s3, 1
+ addu v0, v0, t2
+ addu t0, t0, t3
+ lh v1, 2(t9)
+ addu t0, t0, v0
+ lh v0, 2(s2)
+ addu s3, t0, s3
+ lh t0, 2(s0)
+ lh t1, 2(s1)
+ madd $ac1, s3, t7
+ extr_r.w t2, $ac1, 16
+ ins t0, t1, 16, 16
+ ins v0, v1, 16, 16
+ raddu.w.qb s3, t0
+ lbu v1, 4(s2)
+ lbu t0, 1(t9)
+ lbu t1, 4(t9)
+ sb t2, 0(t8)
+ raddu.w.qb t3, v0
+ lbu v0, 1(s2)
+ addu t0, t0, t1
+ mult $ac1, t3, t6
+ addu v0, v0, v1
+ lbu t2, 4(s0)
+ addu t0, t0, v0
+ lbu v0, 1(s0)
+ addu s3, t0, s3
+ lbu t0, 1(s1)
+ lbu t3, 4(s1)
+ addu v0, v0, t2
+ sll s3, s3, 1
+ addu t0, t0, t3
+ lh v1, 4(t9)
+ addu t0, t0, v0
+ lh v0, 4(s2)
+ addu s3, t0, s3
+ lh t0, 4(s0)
+ lh t1, 4(s1)
+ madd $ac1, s3, t7
+ extr_r.w t2, $ac1, 16
+ ins t0, t1, 16, 16
+ ins v0, v1, 16, 16
+ raddu.w.qb s3, t0
+ lbu v1, 6(s2)
+ lbu t0, 3(t9)
+ lbu t1, 6(t9)
+ sb t2, 1(t8)
+ raddu.w.qb t3, v0
+ lbu v0, 3(s2)
+ addu t0, t0, t1
+ mult $ac1, t3, t6
+ addu v0, v0, v1
+ lbu t2, 6(s0)
+ addu t0, t0, v0
+ lbu v0, 3(s0)
+ addu s3, t0, s3
+ lbu t0, 3(s1)
+ lbu t3, 6(s1)
+ addu v0, v0, t2
+ sll s3, s3, 1
+ addu t0, t0, t3
+ lh v1, 6(t9)
+ addu t0, t0, v0
+ lh v0, 6(s2)
+ addu s3, t0, s3
+ lh t0, 6(s0)
+ lh t1, 6(s1)
+ madd $ac1, s3, t7
+ extr_r.w t3, $ac1, 16
+ ins t0, t1, 16, 16
+ ins v0, v1, 16, 16
+ raddu.w.qb s3, t0
+ lbu v1, 8(s2)
+ lbu t0, 5(t9)
+ lbu t1, 8(t9)
+ sb t3, 2(t8)
+ raddu.w.qb t2, v0
+ lbu v0, 5(s2)
+ addu t0, t0, t1
+ mult $ac1, t2, t6
+ addu v0, v0, v1
+ lbu t2, 8(s0)
+ addu t0, t0, v0
+ lbu v0, 5(s0)
+ addu s3, t0, s3
+ lbu t0, 5(s1)
+ lbu t3, 8(s1)
+ addu v0, v0, t2
+ sll s3, s3, 1
+ addu t0, t0, t3
+ addiu t8, t8, 4
+ addu t0, t0, v0
+ addiu s2, s2, 8
+ addu s3, t0, s3
+ addiu t9, t9, 8
+ madd $ac1, s3, t7
+ extr_r.w t1, $ac1, 16
+ addiu s0, s0, 8
+ addiu s1, s1, 8
+ bne s5, t8, 5b
+ sb t1, -1(t8)
+/* Special case for last column */
+ lh v0, 0(s2)
+ lh v1, 0(t9)
+ lh t0, 0(s0)
+ lh t1, 0(s1)
+ ins v0, v1, 16, 16
+ ins t0, t1, 16, 16
+ raddu.w.qb t2, v0
+ raddu.w.qb s3, t0
+ lbu v0, -1(s2)
+ lbu v1, 1(s2)
+ lbu t0, -1(t9)
+ lbu t1, 1(t9)
+ addu v0, v0, v1
+ mult $ac1, t2, t6
+ addu t0, t0, t1
+ lbu t2, 1(s0)
+ addu t0, t0, v0
+ lbu t3, 1(s1)
+ addu s3, t0, s3
+ lbu v0, -1(s0)
+ lbu t0, -1(s1)
+ sll s3, s3, 1
+ addu v0, v0, t2
+ addu t0, t0, t3
+ addu t0, t0, v0
+ addu s3, t0, s3
+ madd $ac1, s3, t7
+ extr_r.w t0, $ac1, 16
+ addiu t5, t5, 2
+ sb t0, 0(t8)
+ addiu t4, t4, 1
+ bne t4, a2, 3b
+ addiu t5, t5, 2
+
+ RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ j ra
+ nop
+
+END(jsimd_h2v2_smooth_downsample_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_int_upsample_dspr2)
+/*
+ * a0 = upsample->h_expand[compptr->component_index]
+ * a1 = upsample->v_expand[compptr->component_index]
+ * a2 = input_data
+ * a3 = output_data_ptr
+ * 16(sp) = cinfo->output_width
+ * 20(sp) = cinfo->max_v_samp_factor
+ */
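+/*
+ * Generic integer upsampling, cf. int_upsample (jdsample.c): each input
+ * pixel is replicated h_expand times horizontally (loops 1-3), then the
+ * completed row is copied to fill the remaining v_expand - 1 output
+ * rows (loops 5-8).
+ */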
+ .set at
+
+ SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
+
+ lw s0, 0(a3) /* s0 = output_data */
+ lw s1, 32(sp) /* s1 = cinfo->output_width */
+ lw s2, 36(sp) /* s2 = cinfo->max_v_samp_factor */
+ li t6, 0 /* t6 = inrow */
+ beqz s2, 10f
+ li s3, 0 /* s3 = outrow */
+0:
+ addu t0, a2, t6
+ addu t7, s0, s3
+ lw t3, 0(t0) /* t3 = inptr */
+ lw t8, 0(t7) /* t8 = outptr */
+ beqz s1, 4f
+ addu t5, t8, s1 /* t5 = outend */
+1:
+ lb t2, 0(t3) /* t2 = invalue = *inptr++ */
+ addiu t3, 1
+ beqz a0, 3f
+ move t0, a0 /* t0 = h_expand */
+2:
+ sb t2, 0(t8)
+ addiu t0, -1
+ bgtz t0, 2b
+ addiu t8, 1
+3:
+ bgt t5, t8, 1b
+ nop
+4:
+ addiu t9, a1, -1 /* t9 = v_expand - 1 */
+ blez t9, 9f
+ nop
+5:
+ lw t3, 0(s0)
+ lw t4, 4(s0)
+ subu t0, s1, 0xF
+ blez t0, 7f
+ addu t5, t3, s1 /* t5 = end address */
+ andi t7, s1, 0xF /* t7 = residual */
+ subu t8, t5, t7
+6:
+ ulw t0, 0(t3)
+ ulw t1, 4(t3)
+ ulw t2, 8(t3)
+ usw t0, 0(t4)
+ ulw t0, 12(t3)
+ usw t1, 4(t4)
+ usw t2, 8(t4)
+ usw t0, 12(t4)
+ addiu t3, 16
+ bne t3, t8, 6b
+ addiu t4, 16
+ beqz t7, 8f
+ nop
+7:
+ lbu t0, 0(t3)
+ sb t0, 0(t4)
+ addiu t3, 1
+ bne t3, t5, 7b
+ addiu t4, 1
+8:
+ addiu t9, -1
+ bgtz t9, 5b
+ addiu s0, 8
+9:
+ addu s3, s3, a1
+ bne s3, s2, 0b
+ addiu t6, 1
+10:
+ RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
+
+ j ra
+ nop
+END(jsimd_int_upsample_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_h2v1_upsample_dspr2)
+/*
+ * a0 = cinfo->max_v_samp_factor
+ * a1 = cinfo->output_width
+ * a2 = input_data
+ * a3 = output_data_ptr
+ */
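+/*
+ * 2:1 horizontal upsampling by pixel doubling, cf. h2v1_upsample
+ * (jdsample.c): the unrolled loop at 1 turns 8 input bytes into 16
+ * output bytes per iteration; loop 2 doubles any residual pixels
+ * byte by byte.
+ */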
+ lw t7, 0(a3) /* t7 = output_data */
+ andi t8, a1, 0xf /* t8 = residual */
+ sll t0, a0, 2
+ blez a0, 4f
+ addu t9, t7, t0 /* t9 = output_data end address */
+0:
+ lw t5, 0(t7) /* t5 = outptr */
+ lw t6, 0(a2) /* t6 = inptr */
+ addu t3, t5, a1 /* t3 = outptr + output_width (end address) */
+ subu t3, t8 /* t3 = end address - residual */
+ beq t5, t3, 2f
+ move t4, t8
+1:
+ ulw t0, 0(t6) /* t0 = |P3|P2|P1|P0| */
+ ulw t2, 4(t6) /* t2 = |P7|P6|P5|P4| */
+ srl t1, t0, 16 /* t1 = |X|X|P3|P2| */
+ ins t0, t0, 16, 16 /* t0 = |P1|P0|P1|P0| */
+ ins t1, t1, 16, 16 /* t1 = |P3|P2|P3|P2| */
+ ins t0, t0, 8, 16 /* t0 = |P1|P1|P0|P0| */
+ ins t1, t1, 8, 16 /* t1 = |P3|P3|P2|P2| */
+ usw t0, 0(t5)
+ usw t1, 4(t5)
+ srl t0, t2, 16 /* t0 = |X|X|P7|P6| */
+ ins t2, t2, 16, 16 /* t2 = |P5|P4|P5|P4| */
+ ins t0, t0, 16, 16 /* t0 = |P7|P6|P7|P6| */
+ ins t2, t2, 8, 16 /* t2 = |P5|P5|P4|P4| */
+ ins t0, t0, 8, 16 /* t0 = |P7|P7|P6|P6| */
+ usw t2, 8(t5)
+ usw t0, 12(t5)
+ addiu t5, 16
+ bne t5, t3, 1b
+ addiu t6, 8
+ beqz t8, 3f
+ move t4, t8
+2:
+ lbu t1, 0(t6)
+ sb t1, 0(t5)
+ sb t1, 1(t5)
+ addiu t4, -2
+ addiu t6, 1
+ bgtz t4, 2b
+ addiu t5, 2
+3:
+ addiu t7, 4
+ bne t9, t7, 0b
+ addiu a2, 4
+4:
+ j ra
+ nop
+END(jsimd_h2v1_upsample_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_h2v2_upsample_dspr2)
+/*
+ * a0 = cinfo->max_v_samp_factor
+ * a1 = cinfo->output_width
+ * a2 = input_data
+ * a3 = output_data_ptr
+ */
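+/*
+ * 2x2 upsampling by pixel replication, cf. h2v2_upsample (jdsample.c):
+ * loops 1/2 double each pixel horizontally, then loops 4/5 copy the
+ * finished row into the following output row.
+ */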
+ lw t7, 0(a3)
+ blez a0, 7f
+ andi t9, a1, 0xf /* t9 = residual */
+0:
+ lw t6, 0(a2) /* t6 = inptr */
+ lw t5, 0(t7) /* t5 = outptr */
+ addu t8, t5, a1 /* t8 = outptr end address */
+ subu t8, t9 /* t8 = end address - residual */
+ beq t5, t8, 2f
+ move t4, t9
+1:
+ ulw t0, 0(t6)
+ srl t1, t0, 16
+ ins t0, t0, 16, 16
+ ins t0, t0, 8, 16
+ ins t1, t1, 16, 16
+ ins t1, t1, 8, 16
+ ulw t2, 4(t6)
+ usw t0, 0(t5)
+ usw t1, 4(t5)
+ srl t3, t2, 16
+ ins t2, t2, 16, 16
+ ins t2, t2, 8, 16
+ ins t3, t3, 16, 16
+ ins t3, t3, 8, 16
+ usw t2, 8(t5)
+ usw t3, 12(t5)
+ addiu t5, 16
+ bne t5, t8, 1b
+ addiu t6, 8
+ beqz t9, 3f
+ move t4, t9
+2:
+ lbu t0, 0(t6)
+ sb t0, 0(t5)
+ sb t0, 1(t5)
+ addiu t4, -2
+ addiu t6, 1
+ bgtz t4, 2b
+ addiu t5, 2
+3:
+ lw t6, 0(t7) /* t6 = outptr[0] */
+ lw t5, 4(t7) /* t5 = outptr[1] */
+ addu t4, t6, a1 /* t4 = new end address */
+ beq a1, t9, 5f
+ subu t8, t4, t9
+4:
+ ulw t0, 0(t6)
+ ulw t1, 4(t6)
+ ulw t2, 8(t6)
+ usw t0, 0(t5)
+ ulw t0, 12(t6)
+ usw t1, 4(t5)
+ usw t2, 8(t5)
+ usw t0, 12(t5)
+ addiu t6, 16
+ bne t6, t8, 4b
+ addiu t5, 16
+ beqz t9, 6f
+ nop
+5:
+ lbu t0, 0(t6)
+ sb t0, 0(t5)
+ addiu t6, 1
+ bne t6, t4, 5b
+ addiu t5, 1
+6:
+ addiu t7, 8
+ addiu a0, -2
+ bgtz a0, 0b
+ addiu a2, 4
+7:
+ j ra
+ nop
+END(jsimd_h2v2_upsample_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_idct_islow_dspr2)
+/*
+ * a0 = coef_block
+ * a1 = compptr->dcttable
+ * a2 = output
+ * a3 = range_limit
+ */
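+/*
+ * Accurate integer IDCT, cf. jpeg_idct_islow (jidctint.c).  Pass 1
+ * (label 1) transforms one column per iteration into a 256-byte stack
+ * workspace, with a shortcut for all-zero AC columns; pass 2 (label 4)
+ * transforms the rows, descales, and maps each result through the
+ * range_limit table in a3.  The constants are the usual 13-bit FIX()
+ * values (e.g. 9633 = FIX_1_175875602).
+ */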
+ SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ addiu sp, sp, -256
+ move v0, sp
+ addiu v1, zero, 8 /* v1 = DCTSIZE = 8 */
+1:
+ lh s4, 32(a0) /* s4 = inptr[16] */
+ lh s5, 64(a0) /* s5 = inptr[32] */
+ lh s6, 96(a0) /* s6 = inptr[48] */
+ lh t1, 112(a0) /* t1 = inptr[56] */
+ lh t7, 16(a0) /* t7 = inptr[8] */
+ lh t5, 80(a0) /* t5 = inptr[40] */
+ lh t3, 48(a0) /* t3 = inptr[24] */
+ or s4, s4, t1
+ or s4, s4, t3
+ or s4, s4, t5
+ or s4, s4, t7
+ or s4, s4, s5
+ or s4, s4, s6
+ bnez s4, 2f
+ addiu v1, v1, -1
+ lh s5, 0(a1) /* quantptr[DCTSIZE*0] */
+ lh s6, 0(a0) /* inptr[DCTSIZE*0] */
+ mul s5, s5, s6 /* DEQUANTIZE(inptr[0], quantptr[0]) */
+ sll s5, s5, 2
+ sw s5, 0(v0)
+ sw s5, 32(v0)
+ sw s5, 64(v0)
+ sw s5, 96(v0)
+ sw s5, 128(v0)
+ sw s5, 160(v0)
+ sw s5, 192(v0)
+ b 3f
+ sw s5, 224(v0)
+2:
+ lh t0, 112(a1)
+ lh t2, 48(a1)
+ lh t4, 80(a1)
+ lh t6, 16(a1)
+ mul t0, t0, t1 /* DEQUANTIZE(inptr[DCTSIZE*7],
+ quantptr[DCTSIZE*7]) */
+ mul t1, t2, t3 /* DEQUANTIZE(inptr[DCTSIZE*3],
+ quantptr[DCTSIZE*3]) */
+ mul t2, t4, t5 /* DEQUANTIZE(inptr[DCTSIZE*5],
+ quantptr[DCTSIZE*5]) */
+ mul t3, t6, t7 /* DEQUANTIZE(inptr[DCTSIZE*1],
+ quantptr[DCTSIZE*1]) */
+ lh t4, 32(a1)
+ lh t5, 32(a0)
+ lh t6, 96(a1)
+ lh t7, 96(a0)
+ addu s0, t0, t1 /* z3 = tmp0 + tmp2 */
+ addu s1, t1, t2 /* z2 = tmp1 + tmp2 */
+ addu s2, t2, t3 /* z4 = tmp1 + tmp3 */
+ addu s3, s0, s2 /* z3 + z4 */
+ addiu t9, zero, 9633 /* FIX_1_175875602 */
+ mul s3, s3, t9 /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
+ addu t8, t0, t3 /* z1 = tmp0 + tmp3 */
+ addiu t9, zero, 2446 /* FIX_0_298631336 */
+ mul t0, t0, t9 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
+ addiu t9, zero, 16819 /* FIX_2_053119869 */
+ mul t2, t2, t9 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
+ addiu t9, zero, 25172 /* FIX_3_072711026 */
+ mul t1, t1, t9 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
+ addiu t9, zero, 12299 /* FIX_1_501321110 */
+ mul t3, t3, t9 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
+ addiu t9, zero, 16069 /* FIX_1_961570560 */
+ mul s0, s0, t9 /* -z3 = MULTIPLY(z3, FIX_1_961570560) */
+ addiu t9, zero, 3196 /* FIX_0_390180644 */
+ mul s2, s2, t9 /* -z4 = MULTIPLY(z4, FIX_0_390180644) */
+ addiu t9, zero, 7373 /* FIX_0_899976223 */
+ mul t8, t8, t9 /* -z1 = MULTIPLY(z1, FIX_0_899976223) */
+ addiu t9, zero, 20995 /* FIX_2_562915447 */
+ mul s1, s1, t9 /* -z2 = MULTIPLY(z2, FIX_2_562915447) */
+ subu s0, s3, s0 /* z3 += z5 */
+ addu t0, t0, s0 /* tmp0 += z3 */
+ addu t1, t1, s0 /* tmp2 += z3 */
+ subu s2, s3, s2 /* z4 += z5 */
+ addu t2, t2, s2 /* tmp1 += z4 */
+ addu t3, t3, s2 /* tmp3 += z4 */
+ subu t0, t0, t8 /* tmp0 += z1 */
+ subu t1, t1, s1 /* tmp2 += z2 */
+ subu t2, t2, s1 /* tmp1 += z2 */
+ subu t3, t3, t8 /* tmp3 += z1 */
+ mul s0, t4, t5 /* DEQUANTIZE(inptr[DCTSIZE*2],
+ quantptr[DCTSIZE*2]) */
+ addiu t9, zero, 6270 /* FIX_0_765366865 */
+ mul s1, t6, t7 /* DEQUANTIZE(inptr[DCTSIZE*6],
+ quantptr[DCTSIZE*6]) */
+ lh t4, 0(a1)
+ lh t5, 0(a0)
+ lh t6, 64(a1)
+ lh t7, 64(a0)
+ mul s2, t9, s0 /* MULTIPLY(z2, FIX_0_765366865) */
+ mul t5, t4, t5 /* DEQUANTIZE(inptr[DCTSIZE*0],
+ quantptr[DCTSIZE*0]) */
+ mul t6, t6, t7 /* DEQUANTIZE(inptr[DCTSIZE*4],
+ quantptr[DCTSIZE*4]) */
+ addiu t9, zero, 4433 /* FIX_0_541196100 */
+ addu s3, s0, s1 /* z2 + z3 */
+ mul s3, s3, t9 /* z1 = MULTIPLY(z2 + z3, FIX_0_541196100) */
+ addiu t9, zero, 15137 /* FIX_1_847759065 */
+ mul t8, s1, t9 /* MULTIPLY(z3, FIX_1_847759065) */
+ addu t4, t5, t6
+ subu t5, t5, t6
+ sll t4, t4, 13 /* tmp0 = (z2 + z3) << CONST_BITS */
+ sll t5, t5, 13 /* tmp1 = (z2 - z3) << CONST_BITS */
+ addu t7, s3, s2 /* tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865) */
+ subu t6, s3, t8 /* tmp2 =
+ z1 + MULTIPLY(z3, -FIX_1_847759065) */
+ addu s0, t4, t7
+ subu s1, t4, t7
+ addu s2, t5, t6
+ subu s3, t5, t6
+ addu t4, s0, t3
+ subu s0, s0, t3
+ addu t3, s2, t1
+ subu s2, s2, t1
+ addu t1, s3, t2
+ subu s3, s3, t2
+ addu t2, s1, t0
+ subu s1, s1, t0
+ shra_r.w t4, t4, 11
+ shra_r.w t3, t3, 11
+ shra_r.w t1, t1, 11
+ shra_r.w t2, t2, 11
+ shra_r.w s1, s1, 11
+ shra_r.w s3, s3, 11
+ shra_r.w s2, s2, 11
+ shra_r.w s0, s0, 11
+ sw t4, 0(v0)
+ sw t3, 32(v0)
+ sw t1, 64(v0)
+ sw t2, 96(v0)
+ sw s1, 128(v0)
+ sw s3, 160(v0)
+ sw s2, 192(v0)
+ sw s0, 224(v0)
+3:
+ addiu a1, a1, 2
+ addiu a0, a0, 2
+ bgtz v1, 1b
+ addiu v0, v0, 4
+ move v0, sp
+ addiu v1, zero, 8
+4:
+ lw t0, 8(v0) /* z2 = (JLONG)wsptr[2] */
+ lw t1, 24(v0) /* z3 = (JLONG)wsptr[6] */
+ lw t2, 0(v0) /* (JLONG)wsptr[0] */
+ lw t3, 16(v0) /* (JLONG)wsptr[4] */
+ lw s4, 4(v0) /* (JLONG)wsptr[1] */
+ lw s5, 12(v0) /* (JLONG)wsptr[3] */
+ lw s6, 20(v0) /* (JLONG)wsptr[5] */
+ lw s7, 28(v0) /* (JLONG)wsptr[7] */
+ or s4, s4, t0
+ or s4, s4, t1
+ or s4, s4, t3
+ or s4, s4, s7
+ or s4, s4, s5
+ or s4, s4, s6
+ bnez s4, 5f
+ addiu v1, v1, -1
+ shra_r.w s5, t2, 5
+ andi s5, s5, 0x3ff
+ lbux s5, s5(a3)
+ lw s1, 0(a2)
+ replv.qb s5, s5
+ usw s5, 0(s1)
+ usw s5, 4(s1)
+ b 6f
+ nop
+5:
+ addu t4, t0, t1 /* z2 + z3 */
+ addiu t8, zero, 4433 /* FIX_0_541196100 */
+ mul t5, t4, t8 /* z1 = MULTIPLY(z2 + z3, FIX_0_541196100) */
+ addiu t8, zero, 15137 /* FIX_1_847759065 */
+ mul t1, t1, t8 /* MULTIPLY(z3, FIX_1_847759065) */
+ addiu t8, zero, 6270 /* FIX_0_765366865 */
+ mul t0, t0, t8 /* MULTIPLY(z2, FIX_0_765366865) */
+ addu t4, t2, t3 /* (JLONG)wsptr[0] + (JLONG)wsptr[4] */
+ subu t2, t2, t3 /* (JLONG)wsptr[0] - (JLONG)wsptr[4] */
+ sll t4, t4, 13 /* tmp0 =
+ (wsptr[0] + wsptr[4]) << CONST_BITS */
+ sll t2, t2, 13 /* tmp1 =
+ (wsptr[0] - wsptr[4]) << CONST_BITS */
+ subu t1, t5, t1 /* tmp2 =
+ z1 + MULTIPLY(z3, -FIX_1_847759065) */
+ subu t3, t2, t1 /* tmp12 = tmp1 - tmp2 */
+ addu t2, t2, t1 /* tmp11 = tmp1 + tmp2 */
+ addu t5, t5, t0 /* tmp3 =
+ z1 + MULTIPLY(z2, FIX_0_765366865) */
+ subu t1, t4, t5 /* tmp13 = tmp0 - tmp3 */
+ addu t0, t4, t5 /* tmp10 = tmp0 + tmp3 */
+ lw t4, 28(v0) /* tmp0 = (JLONG)wsptr[7] */
+ lw t6, 12(v0) /* tmp2 = (JLONG)wsptr[3] */
+ lw t5, 20(v0) /* tmp1 = (JLONG)wsptr[5] */
+ lw t7, 4(v0) /* tmp3 = (JLONG)wsptr[1] */
+ addu s0, t4, t6 /* z3 = tmp0 + tmp2 */
+ addiu t8, zero, 9633 /* FIX_1_175875602 */
+ addu s1, t5, t7 /* z4 = tmp1 + tmp3 */
+ addu s2, s0, s1 /* z3 + z4 */
+ mul s2, s2, t8 /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
+ addu s3, t4, t7 /* z1 = tmp0 + tmp3 */
+ addu t9, t5, t6 /* z2 = tmp1 + tmp2 */
+ addiu t8, zero, 16069 /* FIX_1_961570560 */
+ mul s0, s0, t8 /* -z3 = MULTIPLY(z3, FIX_1_961570560) */
+ addiu t8, zero, 3196 /* FIX_0_390180644 */
+ mul s1, s1, t8 /* -z4 = MULTIPLY(z4, FIX_0_390180644) */
+ addiu t8, zero, 2446 /* FIX_0_298631336 */
+ mul t4, t4, t8 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
+ addiu t8, zero, 7373 /* FIX_0_899976223 */
+ mul s3, s3, t8 /* -z1 = MULTIPLY(z1, FIX_0_899976223) */
+ addiu t8, zero, 16819 /* FIX_2_053119869 */
+ mul t5, t5, t8 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
+ addiu t8, zero, 20995 /* FIX_2_562915447 */
+ mul t9, t9, t8 /* -z2 = MULTIPLY(z2, FIX_2_562915447) */
+ addiu t8, zero, 25172 /* FIX_3_072711026 */
+ mul t6, t6, t8 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
+ addiu t8, zero, 12299 /* FIX_1_501321110 */
+ mul t7, t7, t8 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
+ subu s0, s2, s0 /* z3 += z5 */
+ subu s1, s2, s1 /* z4 += z5 */
+ addu t4, t4, s0
+ subu t4, t4, s3 /* tmp0 */
+ addu t5, t5, s1
+ subu t5, t5, t9 /* tmp1 */
+ addu t6, t6, s0
+ subu t6, t6, t9 /* tmp2 */
+ addu t7, t7, s1
+ subu t7, t7, s3 /* tmp3 */
+ addu s0, t0, t7
+ subu t0, t0, t7
+ addu t7, t2, t6
+ subu t2, t2, t6
+ addu t6, t3, t5
+ subu t3, t3, t5
+ addu t5, t1, t4
+ subu t1, t1, t4
+ shra_r.w s0, s0, 18
+ shra_r.w t7, t7, 18
+ shra_r.w t6, t6, 18
+ shra_r.w t5, t5, 18
+ shra_r.w t1, t1, 18
+ shra_r.w t3, t3, 18
+ shra_r.w t2, t2, 18
+ shra_r.w t0, t0, 18
+ andi s0, s0, 0x3ff
+ andi t7, t7, 0x3ff
+ andi t6, t6, 0x3ff
+ andi t5, t5, 0x3ff
+ andi t1, t1, 0x3ff
+ andi t3, t3, 0x3ff
+ andi t2, t2, 0x3ff
+ andi t0, t0, 0x3ff
+ lw s1, 0(a2)
+ lbux s0, s0(a3)
+ lbux t7, t7(a3)
+ lbux t6, t6(a3)
+ lbux t5, t5(a3)
+ lbux t1, t1(a3)
+ lbux t3, t3(a3)
+ lbux t2, t2(a3)
+ lbux t0, t0(a3)
+ sb s0, 0(s1)
+ sb t7, 1(s1)
+ sb t6, 2(s1)
+ sb t5, 3(s1)
+ sb t1, 4(s1)
+ sb t3, 5(s1)
+ sb t2, 6(s1)
+ sb t0, 7(s1)
+6:
+ addiu v0, v0, 32
+ bgtz v1, 4b
+ addiu a2, a2, 4
+ addiu sp, sp, 256
+
+ RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ j ra
+ nop
+
+END(jsimd_idct_islow_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_idct_ifast_cols_dspr2)
+/*
+ * a0 = inptr
+ * a1 = quantptr
+ * a2 = wsptr
+ * a3 = mips_idct_ifast_coefs
+ */
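+/*
+ * Column pass of the fast IDCT, cf. jpeg_idct_ifast (jidctfst.c).  Two
+ * columns are processed at a time as packed halfwords: mulq_s.ph does
+ * the Q15 fractional multiplies against the four constants stored in
+ * mips_idct_ifast_coefs, and all-zero AC columns take the early-store
+ * shortcut that fills the eight workspace rows with the DC value.
+ */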
+ SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ addiu t9, a0, 16 /* end address */
+ or AT, a3, zero
+
+0:
+ lw s0, 0(a1) /* quantptr[DCTSIZE*0] */
+ lw t0, 0(a0) /* inptr[DCTSIZE*0] */
+ lw t1, 16(a0) /* inptr[DCTSIZE*1] */
+ muleq_s.w.phl v0, t0, s0 /* tmp0 ... */
+ lw t2, 32(a0) /* inptr[DCTSIZE*2] */
+ lw t3, 48(a0) /* inptr[DCTSIZE*3] */
+ lw t4, 64(a0) /* inptr[DCTSIZE*4] */
+ lw t5, 80(a0) /* inptr[DCTSIZE*5] */
+ muleq_s.w.phr t0, t0, s0 /* ... tmp0 ... */
+ lw t6, 96(a0) /* inptr[DCTSIZE*6] */
+ lw t7, 112(a0) /* inptr[DCTSIZE*7] */
+ or s4, t1, t2
+ or s5, t3, t4
+ bnez s4, 1f
+ ins t0, v0, 16, 16 /* ... tmp0 */
+ bnez s5, 1f
+ or s6, t5, t6
+ or s6, s6, t7
+ bnez s6, 1f
+ sw t0, 0(a2) /* wsptr[DCTSIZE*0] */
+ sw t0, 16(a2) /* wsptr[DCTSIZE*1] */
+ sw t0, 32(a2) /* wsptr[DCTSIZE*2] */
+ sw t0, 48(a2) /* wsptr[DCTSIZE*3] */
+ sw t0, 64(a2) /* wsptr[DCTSIZE*4] */
+ sw t0, 80(a2) /* wsptr[DCTSIZE*5] */
+ sw t0, 96(a2) /* wsptr[DCTSIZE*6] */
+ sw t0, 112(a2) /* wsptr[DCTSIZE*7] */
+ addiu a0, a0, 4
+ b 2f
+ addiu a1, a1, 4
+
+1:
+ lw s1, 32(a1) /* quantptr[DCTSIZE*2] */
+ lw s2, 64(a1) /* quantptr[DCTSIZE*4] */
+ muleq_s.w.phl v0, t2, s1 /* tmp1 ... */
+ muleq_s.w.phr t2, t2, s1 /* ... tmp1 ... */
+ lw s0, 16(a1) /* quantptr[DCTSIZE*1] */
+ lw s1, 48(a1) /* quantptr[DCTSIZE*3] */
+ lw s3, 96(a1) /* quantptr[DCTSIZE*6] */
+ muleq_s.w.phl v1, t4, s2 /* tmp2 ... */
+ muleq_s.w.phr t4, t4, s2 /* ... tmp2 ... */
+ lw s2, 80(a1) /* quantptr[DCTSIZE*5] */
+ lw t8, 4(AT) /* FIX(1.414213562) */
+ ins t2, v0, 16, 16 /* ... tmp1 */
+ muleq_s.w.phl v0, t6, s3 /* tmp3 ... */
+ muleq_s.w.phr t6, t6, s3 /* ... tmp3 ... */
+ ins t4, v1, 16, 16 /* ... tmp2 */
+ addq.ph s4, t0, t4 /* tmp10 */
+ subq.ph s5, t0, t4 /* tmp11 */
+ ins t6, v0, 16, 16 /* ... tmp3 */
+ subq.ph s6, t2, t6 /* tmp12 ... */
+ addq.ph s7, t2, t6 /* tmp13 */
+ mulq_s.ph s6, s6, t8 /* ... tmp12 ... */
+ addq.ph t0, s4, s7 /* tmp0 */
+ subq.ph t6, s4, s7 /* tmp3 */
+ muleq_s.w.phl v0, t1, s0 /* tmp4 ... */
+ muleq_s.w.phr t1, t1, s0 /* ... tmp4 ... */
+ shll_s.ph s6, s6, 1 /* x2 */
+ lw s3, 112(a1) /* quantptr[DCTSIZE*7] */
+ subq.ph s6, s6, s7 /* ... tmp12 */
+ muleq_s.w.phl v1, t7, s3 /* tmp7 ... */
+ muleq_s.w.phr t7, t7, s3 /* ... tmp7 ... */
+ ins t1, v0, 16, 16 /* ... tmp4 */
+ addq.ph t2, s5, s6 /* tmp1 */
+ subq.ph t4, s5, s6 /* tmp2 */
+ muleq_s.w.phl v0, t5, s2 /* tmp6 ... */
+ muleq_s.w.phr t5, t5, s2 /* ... tmp6 ... */
+ ins t7, v1, 16, 16 /* ... tmp7 */
+ addq.ph s5, t1, t7 /* z11 */
+ subq.ph s6, t1, t7 /* z12 */
+ muleq_s.w.phl v1, t3, s1 /* tmp5 ... */
+ muleq_s.w.phr t3, t3, s1 /* ... tmp5 ... */
+ ins t5, v0, 16, 16 /* ... tmp6 */
+ ins t3, v1, 16, 16 /* ... tmp5 */
+ addq.ph s7, t5, t3 /* z13 */
+ subq.ph v0, t5, t3 /* z10 */
+ addq.ph t7, s5, s7 /* tmp7 */
+ subq.ph s5, s5, s7 /* tmp11 ... */
+ addq.ph v1, v0, s6 /* z5 ... */
+ mulq_s.ph s5, s5, t8 /* ... tmp11 */
+ lw t8, 8(AT) /* FIX(1.847759065) */
+ lw s4, 0(AT) /* FIX(1.082392200) */
+ addq.ph s0, t0, t7
+ subq.ph s1, t0, t7
+ mulq_s.ph v1, v1, t8 /* ... z5 */
+ shll_s.ph s5, s5, 1 /* x2 */
+ lw t8, 12(AT) /* FIX(-2.613125930) */
+ sw s0, 0(a2) /* wsptr[DCTSIZE*0] */
+ shll_s.ph v0, v0, 1 /* x4 */
+ mulq_s.ph v0, v0, t8 /* tmp12 ... */
+ mulq_s.ph s4, s6, s4 /* tmp10 ... */
+ shll_s.ph v1, v1, 1 /* x2 */
+ addiu a0, a0, 4
+ addiu a1, a1, 4
+ sw s1, 112(a2) /* wsptr[DCTSIZE*7] */
+ shll_s.ph s6, v0, 1 /* x4 */
+ shll_s.ph s4, s4, 1 /* x2 */
+ addq.ph s6, s6, v1 /* ... tmp12 */
+ subq.ph t5, s6, t7 /* tmp6 */
+ subq.ph s4, s4, v1 /* ... tmp10 */
+ subq.ph t3, s5, t5 /* tmp5 */
+ addq.ph s2, t2, t5
+ addq.ph t1, s4, t3 /* tmp4 */
+ subq.ph s3, t2, t5
+ sw s2, 16(a2) /* wsptr[DCTSIZE*1] */
+ sw s3, 96(a2) /* wsptr[DCTSIZE*6] */
+ addq.ph v0, t4, t3
+ subq.ph v1, t4, t3
+ sw v0, 32(a2) /* wsptr[DCTSIZE*2] */
+ sw v1, 80(a2) /* wsptr[DCTSIZE*5] */
+ addq.ph v0, t6, t1
+ subq.ph v1, t6, t1
+ sw v0, 64(a2) /* wsptr[DCTSIZE*4] */
+ sw v1, 48(a2) /* wsptr[DCTSIZE*3] */
+
+2:
+ bne a0, t9, 0b
+ addiu a2, a2, 4
+
+ RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ j ra
+ nop
+
+END(jsimd_idct_ifast_cols_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_idct_ifast_rows_dspr2)
+/*
+ * a0 = wsptr
+ * a1 = output_buf
+ * a2 = output_col
+ * a3 = mips_idct_ifast_coefs
+ */
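+/*
+ * Row pass of the fast IDCT.  Results are narrowed to bytes with
+ * precrq.qb.ph, and the 0x80808080 constant kept in s8 applies the
+ * +128 level shift to four samples at once before each store to
+ * output_buf[ctr] + output_col.
+ */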
+ SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3
+
+ addiu t9, a0, 128 /* end address */
+ lui s8, 0x8080
+ ori s8, s8, 0x8080
+
+0:
+ lw AT, 36(sp) /* restore $a3 (mips_idct_ifast_coefs) */
+ lw t0, 0(a0) /* wsptr[DCTSIZE*0+0/1] b a */
+ lw s0, 16(a0) /* wsptr[DCTSIZE*1+0/1] B A */
+ lw t2, 4(a0) /* wsptr[DCTSIZE*0+2/3] d c */
+ lw s2, 20(a0) /* wsptr[DCTSIZE*1+2/3] D C */
+ lw t4, 8(a0) /* wsptr[DCTSIZE*0+4/5] f e */
+ lw s4, 24(a0) /* wsptr[DCTSIZE*1+4/5] F E */
+ lw t6, 12(a0) /* wsptr[DCTSIZE*0+6/7] h g */
+ lw s6, 28(a0) /* wsptr[DCTSIZE*1+6/7] H G */
+ precrq.ph.w t1, s0, t0 /* B b */
+ ins t0, s0, 16, 16 /* A a */
+ bnez t1, 1f
+ or s0, t2, s2
+ bnez s0, 1f
+ or s0, t4, s4
+ bnez s0, 1f
+ or s0, t6, s6
+ bnez s0, 1f
+ shll_s.ph s0, t0, 2 /* A a */
+ lw a3, 0(a1)
+ lw AT, 4(a1)
+ precrq.ph.w t0, s0, s0 /* A A */
+ ins s0, s0, 16, 16 /* a a */
+ addu a3, a3, a2
+ addu AT, AT, a2
+ precrq.qb.ph t0, t0, t0 /* A A A A */
+ precrq.qb.ph s0, s0, s0 /* a a a a */
+ addu.qb s0, s0, s8
+ addu.qb t0, t0, s8
+ sw s0, 0(a3)
+ sw s0, 4(a3)
+ sw t0, 0(AT)
+ sw t0, 4(AT)
+ addiu a0, a0, 32
+ bne a0, t9, 0b
+ addiu a1, a1, 8
+ b 2f
+ nop
+
+1:
+ precrq.ph.w t3, s2, t2
+ ins t2, s2, 16, 16
+ precrq.ph.w t5, s4, t4
+ ins t4, s4, 16, 16
+ precrq.ph.w t7, s6, t6
+ ins t6, s6, 16, 16
+ lw t8, 4(AT) /* FIX(1.414213562) */
+ addq.ph s4, t0, t4 /* tmp10 */
+ subq.ph s5, t0, t4 /* tmp11 */
+ subq.ph s6, t2, t6 /* tmp12 ... */
+ addq.ph s7, t2, t6 /* tmp13 */
+ mulq_s.ph s6, s6, t8 /* ... tmp12 ... */
+ addq.ph t0, s4, s7 /* tmp0 */
+ subq.ph t6, s4, s7 /* tmp3 */
+ shll_s.ph s6, s6, 1 /* x2 */
+ subq.ph s6, s6, s7 /* ... tmp12 */
+ addq.ph t2, s5, s6 /* tmp1 */
+ subq.ph t4, s5, s6 /* tmp2 */
+ addq.ph s5, t1, t7 /* z11 */
+ subq.ph s6, t1, t7 /* z12 */
+ addq.ph s7, t5, t3 /* z13 */
+ subq.ph v0, t5, t3 /* z10 */
+ addq.ph t7, s5, s7 /* tmp7 */
+ subq.ph s5, s5, s7 /* tmp11 ... */
+ addq.ph v1, v0, s6 /* z5 ... */
+ mulq_s.ph s5, s5, t8 /* ... tmp11 */
+ lw t8, 8(AT) /* FIX(1.847759065) */
+ lw s4, 0(AT) /* FIX(1.082392200) */
+ addq.ph s0, t0, t7 /* tmp0 + tmp7 */
+ subq.ph s7, t0, t7 /* tmp0 - tmp7 */
+ mulq_s.ph v1, v1, t8 /* ... z5 */
+ lw a3, 0(a1)
+ lw t8, 12(AT) /* FIX(-2.613125930) */
+ shll_s.ph s5, s5, 1 /* x2 */
+ addu a3, a3, a2
+ shll_s.ph v0, v0, 1 /* x4 */
+ mulq_s.ph v0, v0, t8 /* tmp12 ... */
+ mulq_s.ph s4, s6, s4 /* tmp10 ... */
+ shll_s.ph v1, v1, 1 /* x2 */
+ addiu a0, a0, 32
+ addiu a1, a1, 8
+ shll_s.ph s6, v0, 1 /* x4 */
+ shll_s.ph s4, s4, 1 /* x2 */
+ addq.ph s6, s6, v1 /* ... tmp12 */
+ shll_s.ph s0, s0, 2
+ subq.ph t5, s6, t7 /* tmp6 */
+ subq.ph s4, s4, v1 /* ... tmp10 */
+ subq.ph t3, s5, t5 /* tmp5 */
+ shll_s.ph s7, s7, 2
+ addq.ph t1, s4, t3 /* tmp4 */
+ addq.ph s1, t2, t5 /* tmp1 + tmp6 */
+ subq.ph s6, t2, t5 /* tmp1 - tmp6 */
+ addq.ph s2, t4, t3 /* tmp2 + tmp5 */
+ subq.ph s5, t4, t3 /* tmp2 - tmp5 */
+ addq.ph s4, t6, t1 /* tmp3 + tmp4 */
+ subq.ph s3, t6, t1 /* tmp3 - tmp4 */
+ shll_s.ph s1, s1, 2
+ shll_s.ph s2, s2, 2
+ shll_s.ph s3, s3, 2
+ shll_s.ph s4, s4, 2
+ shll_s.ph s5, s5, 2
+ shll_s.ph s6, s6, 2
+ precrq.ph.w t0, s1, s0 /* B A */
+ ins s0, s1, 16, 16 /* b a */
+ precrq.ph.w t2, s3, s2 /* D C */
+ ins s2, s3, 16, 16 /* d c */
+ precrq.ph.w t4, s5, s4 /* F E */
+ ins s4, s5, 16, 16 /* f e */
+ precrq.ph.w t6, s7, s6 /* H G */
+ ins s6, s7, 16, 16 /* h g */
+ precrq.qb.ph t0, t2, t0 /* D C B A */
+ precrq.qb.ph s0, s2, s0 /* d c b a */
+ precrq.qb.ph t4, t6, t4 /* H G F E */
+ precrq.qb.ph s4, s6, s4 /* h g f e */
+ addu.qb s0, s0, s8
+ addu.qb s4, s4, s8
+ sw s0, 0(a3) /* outptr[0/1/2/3] d c b a */
+ sw s4, 4(a3) /* outptr[4/5/6/7] h g f e */
+ lw a3, -4(a1)
+ addu.qb t0, t0, s8
+ addu a3, a3, a2
+ addu.qb t4, t4, s8
+ sw t0, 0(a3) /* outptr[0/1/2/3] D C B A */
+ bne a0, t9, 0b
+ sw t4, 4(a3) /* outptr[4/5/6/7] H G F E */
+
+2:
+
+ RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3
+
+ j ra
+ nop
+
+END(jsimd_idct_ifast_rows_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_fdct_islow_dspr2)
+/*
+ * a0 = data
+ */
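+/*
+ * Accurate forward DCT, cf. jpeg_fdct_islow (jfdctint.c).  Loop 1
+ * transforms the rows, with the FIX() constants packed two per register
+ * (e.g. t0 = 6437 << 16 | 2260) so each dpa.w.ph performs two
+ * multiply-accumulates; loop 2 then transforms the columns using plain
+ * madd/msub chains.
+ */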
+ SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8
+
+ lui t0, 6437
+ ori t0, 2260
+ lui t1, 9633
+ ori t1, 11363
+ lui t2, 0xd39e
+ ori t2, 0xe6dc
+ lui t3, 0xf72d
+ ori t3, 9633
+ lui t4, 2261
+ ori t4, 9633
+ lui t5, 0xd39e
+ ori t5, 6437
+ lui t6, 9633
+ ori t6, 0xd39d
+ lui t7, 0xe6dc
+ ori t7, 2260
+ lui t8, 4433
+ ori t8, 10703
+ lui t9, 0xd630
+ ori t9, 4433
+ li s8, 8
+ move a1, a0
+1:
+ lw s0, 0(a1) /* tmp0 = 1|0 */
+ lw s1, 4(a1) /* tmp1 = 3|2 */
+ lw s2, 8(a1) /* tmp2 = 5|4 */
+ lw s3, 12(a1) /* tmp3 = 7|6 */
+ packrl.ph s1, s1, s1 /* tmp1 = 2|3 */
+ packrl.ph s3, s3, s3 /* tmp3 = 6|7 */
+ subq.ph s7, s1, s2 /* tmp7 = 2-5|3-4 = t5|t4 */
+ subq.ph s5, s0, s3 /* tmp5 = 1-6|0-7 = t6|t7 */
+ mult $0, $0 /* ac0 = 0 */
+ dpa.w.ph $ac0, s7, t0 /* ac0 += t5* 6437 + t4* 2260 */
+ dpa.w.ph $ac0, s5, t1 /* ac0 += t6* 9633 + t7* 11363 */
+ mult $ac1, $0, $0 /* ac1 = 0 */
+ dpa.w.ph $ac1, s7, t2 /* ac1 += t5*-11362 + t4* -6436 */
+ dpa.w.ph $ac1, s5, t3 /* ac1 += t6* -2259 + t7* 9633 */
+ mult $ac2, $0, $0 /* ac2 = 0 */
+ dpa.w.ph $ac2, s7, t4 /* ac2 += t5* 2261 + t4* 9633 */
+ dpa.w.ph $ac2, s5, t5 /* ac2 += t6*-11362 + t7* 6437 */
+ mult $ac3, $0, $0 /* ac3 = 0 */
+ dpa.w.ph $ac3, s7, t6 /* ac3 += t5* 9633 + t4*-11363 */
+ dpa.w.ph $ac3, s5, t7 /* ac3 += t6* -6436 + t7* 2260 */
+ addq.ph s6, s1, s2 /* tmp6 = 2+5|3+4 = t2|t3 */
+ addq.ph s4, s0, s3 /* tmp4 = 1+6|0+7 = t1|t0 */
+ extr_r.w s0, $ac0, 11 /* tmp0 = (ac0 + 1024) >> 11 */
+ extr_r.w s1, $ac1, 11 /* tmp1 = (ac1 + 1024) >> 11 */
+ extr_r.w s2, $ac2, 11 /* tmp2 = (ac2 + 1024) >> 11 */
+ extr_r.w s3, $ac3, 11 /* tmp3 = (ac3 + 1024) >> 11 */
+ addq.ph s5, s4, s6 /* tmp5 = t1+t2|t0+t3 = t11|t10 */
+ subq.ph s7, s4, s6 /* tmp7 = t1-t2|t0-t3 = t12|t13 */
+ sh s0, 2(a1)
+ sh s1, 6(a1)
+ sh s2, 10(a1)
+ sh s3, 14(a1)
+ mult $0, $0 /* ac0 = 0 */
+ dpa.w.ph $ac0, s7, t8 /* ac0 += t12* 4433 + t13* 10703 */
+ mult $ac1, $0, $0 /* ac1 = 0 */
+ dpa.w.ph $ac1, s7, t9 /* ac1 += t12*-10704 + t13* 4433 */
+ sra s4, s5, 16 /* tmp4 = t11 */
+ addiu a1, a1, 16
+ addiu s8, s8, -1
+ extr_r.w s0, $ac0, 11 /* tmp0 = (ac0 + 1024) >> 11 */
+ extr_r.w s1, $ac1, 11 /* tmp1 = (ac1 + 1024) >> 11 */
+ addu s2, s5, s4 /* tmp2 = t10 + t11 */
+ subu s3, s5, s4 /* tmp3 = t10 - t11 */
+ sll s2, s2, 2 /* tmp2 = (t10 + t11) << 2 */
+ sll s3, s3, 2 /* tmp3 = (t10 - t11) << 2 */
+ sh s2, -16(a1)
+ sh s3, -8(a1)
+ sh s0, -12(a1)
+ bgtz s8, 1b
+ sh s1, -4(a1)
+ li t0, 2260
+ li t1, 11363
+ li t2, 9633
+ li t3, 6436
+ li t4, 6437
+ li t5, 2261
+ li t6, 11362
+ li t7, 2259
+ li t8, 4433
+ li t9, 10703
+ li a1, 10704
+ li s8, 8
+
+2:
+ lh a2, 0(a0) /* 0 */
+ lh a3, 16(a0) /* 8 */
+ lh v0, 32(a0) /* 16 */
+ lh v1, 48(a0) /* 24 */
+ lh s4, 64(a0) /* 32 */
+ lh s5, 80(a0) /* 40 */
+ lh s6, 96(a0) /* 48 */
+ lh s7, 112(a0) /* 56 */
+ addu s2, v0, s5 /* tmp2 = 16 + 40 */
+ subu s5, v0, s5 /* tmp5 = 16 - 40 */
+ addu s3, v1, s4 /* tmp3 = 24 + 32 */
+ subu s4, v1, s4 /* tmp4 = 24 - 32 */
+ addu s0, a2, s7 /* tmp0 = 0 + 56 */
+ subu s7, a2, s7 /* tmp7 = 0 - 56 */
+ addu s1, a3, s6 /* tmp1 = 8 + 48 */
+ subu s6, a3, s6 /* tmp6 = 8 - 48 */
+ addu a2, s0, s3 /* tmp10 = tmp0 + tmp3 */
+ subu v1, s0, s3 /* tmp13 = tmp0 - tmp3 */
+ addu a3, s1, s2 /* tmp11 = tmp1 + tmp2 */
+ subu v0, s1, s2 /* tmp12 = tmp1 - tmp2 */
+ mult s7, t1 /* ac0 = tmp7 * c1 */
+ madd s4, t0 /* ac0 += tmp4 * c0 */
+ madd s5, t4 /* ac0 += tmp5 * c4 */
+ madd s6, t2 /* ac0 += tmp6 * c2 */
+ mult $ac1, s7, t2 /* ac1 = tmp7 * c2 */
+ msub $ac1, s4, t3 /* ac1 -= tmp4 * c3 */
+ msub $ac1, s5, t6 /* ac1 -= tmp5 * c6 */
+ msub $ac1, s6, t7 /* ac1 -= tmp6 * c7 */
+ mult $ac2, s7, t4 /* ac2 = tmp7 * c4 */
+ madd $ac2, s4, t2 /* ac2 += tmp4 * c2 */
+ madd $ac2, s5, t5 /* ac2 += tmp5 * c5 */
+ msub $ac2, s6, t6 /* ac2 -= tmp6 * c6 */
+ mult $ac3, s7, t0 /* ac3 = tmp7 * c0 */
+ msub $ac3, s4, t1 /* ac3 -= tmp4 * c1 */
+ madd $ac3, s5, t2 /* ac3 += tmp5 * c2 */
+ msub $ac3, s6, t3 /* ac3 -= tmp6 * c3 */
+ extr_r.w s0, $ac0, 15 /* tmp0 = (ac0 + 16384) >> 15 */
+ extr_r.w s1, $ac1, 15 /* tmp1 = (ac1 + 16384) >> 15 */
+ extr_r.w s2, $ac2, 15 /* tmp2 = (ac2 + 16384) >> 15 */
+ extr_r.w s3, $ac3, 15 /* tmp3 = (ac3 + 16384) >> 15 */
+ addiu s8, s8, -1
+ addu s4, a2, a3 /* tmp4 = tmp10 + tmp11 */
+ subu s5, a2, a3 /* tmp5 = tmp10 - tmp11 */
+ sh s0, 16(a0)
+ sh s1, 48(a0)
+ sh s2, 80(a0)
+ sh s3, 112(a0)
+ mult v0, t8 /* ac0 = tmp12 * c8 */
+ madd v1, t9 /* ac0 += tmp13 * c9 */
+ mult $ac1, v1, t8 /* ac1 = tmp13 * c8 */
+ msub $ac1, v0, a1 /* ac1 -= tmp12 * c10 */
+ addiu a0, a0, 2
+ extr_r.w s6, $ac0, 15 /* tmp6 = (ac0 + 16384) >> 15 */
+ extr_r.w s7, $ac1, 15 /* tmp7 = (ac1 + 16384) >> 15 */
+ shra_r.w s4, s4, 2 /* tmp4 = (tmp4 + 2) >> 2 */
+ shra_r.w s5, s5, 2 /* tmp5 = (tmp5 + 2) >> 2 */
+ sh s4, -2(a0)
+ sh s5, 62(a0)
+ sh s6, 30(a0)
+ bgtz s8, 2b
+ sh s7, 94(a0)
+
+ RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8
+
+ jr ra
+ nop
+
+END(jsimd_fdct_islow_dspr2)
+
+
+/**************************************************************************/
+LEAF_DSPR2(jsimd_fdct_ifast_dspr2)
+/*
+ * a0 = data
+ */
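+/*
+ * Fast forward DCT, cf. jpeg_fdct_ifast (jfdctfst.c).  The fractional
+ * constants are scaled by 256 (e.g. 181/256 ~ 0.707106781), so the
+ * accumulated products are descaled with extr.w ..., 8.
+ */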
+ .set at
+
+ SAVE_REGS_ON_STACK 8, s0, s1
+
+ li a1, 0x014e014e /* FIX_1_306562965 (334 << 16) |
+ (334 & 0xffff) */
+ li a2, 0x008b008b /* FIX_0_541196100 (139 << 16) |
+ (139 & 0xffff) */
+ li a3, 0x00620062 /* FIX_0_382683433 (98 << 16) |
+ (98 & 0xffff) */
+ li s1, 0x00b500b5 /* FIX_0_707106781 (181 << 16) |
+ (181 & 0xffff) */
+
+ move v0, a0
+ addiu v1, v0, 128 /* end address */
+
+0:
+ lw t0, 0(v0) /* tmp0 = 1|0 */
+ lw t1, 4(v0) /* tmp1 = 3|2 */
+ lw t2, 8(v0) /* tmp2 = 5|4 */
+ lw t3, 12(v0) /* tmp3 = 7|6 */
+ packrl.ph t1, t1, t1 /* tmp1 = 2|3 */
+ packrl.ph t3, t3, t3 /* tmp3 = 6|7 */
+ subq.ph t7, t1, t2 /* tmp7 = 2-5|3-4 = t5|t4 */
+ subq.ph t5, t0, t3 /* tmp5 = 1-6|0-7 = t6|t7 */
+ addq.ph t6, t1, t2 /* tmp6 = 2+5|3+4 = t2|t3 */
+ addq.ph t4, t0, t3 /* tmp4 = 1+6|0+7 = t1|t0 */
+ addq.ph t8, t4, t6 /* tmp5 = t1+t2|t0+t3 = t11|t10 */
+ subq.ph t9, t4, t6 /* tmp7 = t1-t2|t0-t3 = t12|t13 */
+ sra t4, t8, 16 /* tmp4 = t11 */
+ mult $0, $0 /* ac0 = 0 */
+ dpa.w.ph $ac0, t9, s1
+ mult $ac1, $0, $0 /* ac1 = 0 */
+ dpa.w.ph $ac1, t7, a3 /* ac1 += t4*98 + t5*98 */
+ dpsx.w.ph $ac1, t5, a3 /* ac1 -= t6*98 + t7*98 */
+ mult $ac2, $0, $0 /* ac2 = 0 */
+ dpa.w.ph $ac2, t7, a2 /* ac2 += t4*139 + t5*139 */
+ mult $ac3, $0, $0 /* ac3 = 0 */
+ dpa.w.ph $ac3, t5, a1 /* ac3 += t6*334 + t7*334 */
+ precrq.ph.w t0, t5, t7 /* t0 = t5|t6 */
+ addq.ph t2, t8, t4 /* tmp2 = t10 + t11 */
+ subq.ph t3, t8, t4 /* tmp3 = t10 - t11 */
+ extr.w t4, $ac0, 8 /* t4 = MULTIPLY(t12 + t13, 181) */
+ mult $0, $0 /* ac0 = 0 */
+ dpa.w.ph $ac0, t0, s1 /* ac0 += t5*181 + t6*181 */
+ extr.w t0, $ac1, 8 /* t0 = z5 */
+ extr.w t1, $ac2, 8 /* t1 = MULTIPLY(tmp10, 139) */
+ extr.w t7, $ac3, 8 /* t7 = MULTIPLY(tmp12, 334) */
+ extr.w t8, $ac0, 8 /* t8 = z3 = MULTIPLY(tmp11, 181) */
+ add t6, t1, t0 /* t6 = z2 */
+ add t7, t7, t0 /* t7 = z4 */
+ subq.ph t0, t5, t8 /* t0 = z13 = tmp7 - z3 */
+ addq.ph t8, t5, t8 /* t8 = z11 = tmp7 + z3 */
+ addq.ph t1, t0, t6 /* t1 = z13 + z2 */
+ subq.ph t6, t0, t6 /* t6 = z13 - z2 */
+ addq.ph t0, t8, t7 /* t0 = z11 + z4 */
+ subq.ph t7, t8, t7 /* t7 = z11 - z4 */
+ addq.ph t5, t4, t9
+ subq.ph t4, t9, t4
+ sh t2, 0(v0)
+ sh t5, 4(v0)
+ sh t3, 8(v0)
+ sh t4, 12(v0)
+ sh t1, 10(v0)
+ sh t6, 6(v0)
+ sh t0, 2(v0)
+ sh t7, 14(v0)
+ addiu v0, 16
+ bne v1, v0, 0b
+ nop
+ move v0, a0
+ addiu v1, v0, 16
+
+1:
+ lh t0, 0(v0) /* 0 */
+ lh t1, 16(v0) /* 8 */
+ lh t2, 32(v0) /* 16 */
+ lh t3, 48(v0) /* 24 */
+ lh t4, 64(v0) /* 32 */
+ lh t5, 80(v0) /* 40 */
+ lh t6, 96(v0) /* 48 */
+ lh t7, 112(v0) /* 56 */
+ add t8, t0, t7 /* t8 = tmp0 */
+ sub t7, t0, t7 /* t7 = tmp7 */
+ add t0, t1, t6 /* t0 = tmp1 */
+ sub t1, t1, t6 /* t1 = tmp6 */
+ add t6, t2, t5 /* t6 = tmp2 */
+ sub t5, t2, t5 /* t5 = tmp5 */
+ add t2, t3, t4 /* t2 = tmp3 */
+ sub t3, t3, t4 /* t3 = tmp4 */
+ add t4, t8, t2 /* t4 = tmp10 = tmp0 + tmp3 */
+ sub t8, t8, t2 /* t8 = tmp13 = tmp0 - tmp3 */
+ sub s0, t0, t6 /* s0 = tmp12 = tmp1 - tmp2 */
+ ins t8, s0, 16, 16 /* t8 = tmp12|tmp13 */
+ add t2, t0, t6 /* t2 = tmp11 = tmp1 + tmp2 */
+ mult $0, $0 /* ac0 = 0 */
+ dpa.w.ph $ac0, t8, s1 /* ac0 += t12*181 + t13*181 */
+ add s0, t4, t2 /* s0 = tmp10 + tmp11 */
+ sub t4, t4, t2 /* t4 = tmp10-tmp11 */
+ sh s0, 0(v0)
+ sh t4, 64(v0)
+ extr.w t2, $ac0, 8 /* z1 = MULTIPLY(tmp12+tmp13,
+ FIX_0_707106781) */
+ addq.ph t4, t8, t2 /* t4 = tmp13 + z1 */
+ subq.ph t8, t8, t2 /* t8 = tmp13 - z1 */
+ sh t4, 32(v0)
+ sh t8, 96(v0)
+ add t3, t3, t5 /* t3 = tmp10 = tmp4 + tmp5 */
+ add t0, t5, t1 /* t0 = tmp11 = tmp5 + tmp6 */
+ add t1, t1, t7 /* t1 = tmp12 = tmp6 + tmp7 */
+ andi t4, a1, 0xffff
+ mul s0, t1, t4
+ sra s0, s0, 8 /* s0 = z4 =
+ MULTIPLY(tmp12, FIX_1_306562965) */
+ ins t1, t3, 16, 16 /* t1 = tmp10|tmp12 */
+ mult $0, $0 /* ac0 = 0 */
+ mulsa.w.ph $ac0, t1, a3 /* ac0 += t10*98 - t12*98 */
+ extr.w t8, $ac0, 8 /* z5 = MULTIPLY(tmp10-tmp12,
+ FIX_0_382683433) */
+ add t2, t7, t8 /* t2 = tmp7 + z5 */
+ sub t7, t7, t8 /* t7 = tmp7 - z5 */
+ andi t4, a2, 0xffff
+ mul t8, t3, t4
+ sra t8, t8, 8 /* t8 = z2 =
+ MULTIPLY(tmp10, FIX_0_541196100) */
+ andi t4, s1, 0xffff
+ mul t6, t0, t4
+ sra t6, t6, 8 /* t6 = z3 =
+ MULTIPLY(tmp11, FIX_0_707106781) */
+ add t0, t6, t8 /* t0 = z3 + z2 */
+ sub t1, t6, t8 /* t1 = z3 - z2 */
+ add t3, t6, s0 /* t3 = z3 + z4 */
+ sub t4, t6, s0 /* t4 = z3 - z4 */
+ sub t5, t2, t1 /* t5 = dataptr[5] */
+ sub t6, t7, t0 /* t6 = dataptr[3] */
+ add t3, t2, t3 /* t3 = dataptr[1] */
+ add t4, t7, t4 /* t4 = dataptr[7] */
+ sh t5, 80(v0)
+ sh t6, 48(v0)
+ sh t3, 16(v0)
+ sh t4, 112(v0)
+ addiu v0, 2
+ bne v0, v1, 1b
+ nop
+
+ RESTORE_REGS_FROM_STACK 8, s0, s1
+
+ j ra
+ nop
+END(jsimd_fdct_ifast_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_quantize_dspr2)
+/*
+ * a0 = coef_block
+ * a1 = divisors
+ * a2 = workspace
+ */
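+/*
+ * Integer quantization using the divisor table layout built by
+ * jcdctmgr.c: reciprocals at +0, corrections at +128, and shift counts
+ * at +384 bytes.  Each coefficient is reduced to its absolute value via
+ * a computed +/-1 factor, biased by the correction, multiplied by the
+ * reciprocal, shifted right by (shift + 16), and the sign re-applied.
+ */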
+ .set at
+
+ SAVE_REGS_ON_STACK 16, s0, s1, s2
+
+ addiu v0, a2, 124 /* v0 = workspace_end */
+ lh t0, 0(a2)
+ lh t1, 0(a1)
+ lh t2, 128(a1)
+ sra t3, t0, 15
+ sll t3, t3, 1
+ addiu t3, t3, 1
+ mul t0, t0, t3
+ lh t4, 384(a1)
+ lh t5, 130(a1)
+ lh t6, 2(a2)
+ lh t7, 2(a1)
+ lh t8, 386(a1)
+
+1:
+ andi t1, 0xffff
+ add t9, t0, t2
+ andi t9, 0xffff
+ mul v1, t9, t1
+ sra s0, t6, 15
+ sll s0, s0, 1
+ addiu s0, s0, 1
+ addiu t9, t4, 16
+ srav v1, v1, t9
+ mul v1, v1, t3
+ mul t6, t6, s0
+ andi t7, 0xffff
+ addiu a2, a2, 4
+ addiu a1, a1, 4
+ add s1, t6, t5
+ andi s1, 0xffff
+ sh v1, 0(a0)
+
+ mul s2, s1, t7
+ addiu s1, t8, 16
+ srav s2, s2, s1
+ mul s2, s2, s0
+ lh t0, 0(a2)
+ lh t1, 0(a1)
+ sra t3, t0, 15
+ sll t3, t3, 1
+ addiu t3, t3, 1
+ mul t0, t0, t3
+ lh t2, 128(a1)
+ lh t4, 384(a1)
+ lh t5, 130(a1)
+ lh t8, 386(a1)
+ lh t6, 2(a2)
+ lh t7, 2(a1)
+ sh s2, 2(a0)
+ lh t0, 0(a2)
+ sra t3, t0, 15
+ sll t3, t3, 1
+ addiu t3, t3, 1
+ mul t0, t0, t3
+ bne a2, v0, 1b
+ addiu a0, a0, 4
+
+ andi t1, 0xffff
+ add t9, t0, t2
+ andi t9, 0xffff
+ mul v1, t9, t1
+ sra s0, t6, 15
+ sll s0, s0, 1
+ addiu s0, s0, 1
+ addiu t9, t4, 16
+ srav v1, v1, t9
+ mul v1, v1, t3
+ mul t6, t6, s0
+ andi t7, 0xffff
+ sh v1, 0(a0)
+ add s1, t6, t5
+ andi s1, 0xffff
+ mul s2, s1, t7
+ addiu s1, t8, 16
+ addiu a2, a2, 4
+ addiu a1, a1, 4
+ srav s2, s2, s1
+ mul s2, s2, s0
+ sh s2, 2(a0)
+
+ RESTORE_REGS_FROM_STACK 16, s0, s1, s2
+
+ j ra
+ nop
+
+END(jsimd_quantize_dspr2)
+
+
+#ifndef __mips_soft_float
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_quantize_float_dspr2)
+/*
+ * a0 = coef_block
+ * a1 = divisors
+ * a2 = workspace
+ */
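+/*
+ * Float quantization, cf. the float path in jcdctmgr.c:
+ *   output[i] = (int)(workspace[i] * divisors[i] + 16384.5) - 16384
+ * The magic constant 0x46800100 is 16384.5f; madd.s forms the product
+ * plus bias, trunc.w.s truncates, and the -16384 restores the rounded
+ * signed result.
+ */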
+ .set at
+
+ li t1, 0x46800100 /* IEEE-754 bit pattern of 16384.5f */
+ mtc1 t1, f0
+ li t0, 63
+0:
+ lwc1 f2, 0(a2)
+ lwc1 f10, 0(a1)
+ lwc1 f4, 4(a2)
+ lwc1 f12, 4(a1)
+ lwc1 f6, 8(a2)
+ lwc1 f14, 8(a1)
+ lwc1 f8, 12(a2)
+ lwc1 f16, 12(a1)
+ madd.s f2, f0, f2, f10
+ madd.s f4, f0, f4, f12
+ madd.s f6, f0, f6, f14
+ madd.s f8, f0, f8, f16
+ lwc1 f10, 16(a1)
+ lwc1 f12, 20(a1)
+ trunc.w.s f2, f2
+ trunc.w.s f4, f4
+ trunc.w.s f6, f6
+ trunc.w.s f8, f8
+ lwc1 f14, 24(a1)
+ lwc1 f16, 28(a1)
+ mfc1 t1, f2
+ mfc1 t2, f4
+ mfc1 t3, f6
+ mfc1 t4, f8
+ lwc1 f2, 16(a2)
+ lwc1 f4, 20(a2)
+ lwc1 f6, 24(a2)
+ lwc1 f8, 28(a2)
+ madd.s f2, f0, f2, f10
+ madd.s f4, f0, f4, f12
+ madd.s f6, f0, f6, f14
+ madd.s f8, f0, f8, f16
+ addiu t1, t1, -16384
+ addiu t2, t2, -16384
+ addiu t3, t3, -16384
+ addiu t4, t4, -16384
+ trunc.w.s f2, f2
+ trunc.w.s f4, f4
+ trunc.w.s f6, f6
+ trunc.w.s f8, f8
+ sh t1, 0(a0)
+ sh t2, 2(a0)
+ sh t3, 4(a0)
+ sh t4, 6(a0)
+ mfc1 t1, f2
+ mfc1 t2, f4
+ mfc1 t3, f6
+ mfc1 t4, f8
+ addiu t0, t0, -8
+ addiu a2, a2, 32
+ addiu a1, a1, 32
+ addiu t1, t1, -16384
+ addiu t2, t2, -16384
+ addiu t3, t3, -16384
+ addiu t4, t4, -16384
+ sh t1, 8(a0)
+ sh t2, 10(a0)
+ sh t3, 12(a0)
+ sh t4, 14(a0)
+ bgez t0, 0b
+ addiu a0, a0, 16
+
+ j ra
+ nop
+
+END(jsimd_quantize_float_dspr2)
+
+#endif
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_idct_2x2_dspr2)
+/*
+ * a0 = compptr->dct_table
+ * a1 = coef_block
+ * a2 = output_buf
+ * a3 = output_col
+ */
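+/*
+ * Reduced 2x2 IDCT, cf. jpeg_idct_2x2 (jidctred.c).  Only coefficient
+ * rows/columns 0, 1, 3, 5 and 7 contribute.  s0/s1 pack the odd-term
+ * constants for dpa.w.ph: 29692 = FIX(3.624509785), -10426 =
+ * -FIX(1.272758580), 6967 = FIX(0.850430095), -5906 = -FIX(0.720959822).
+ */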
+ .set at
+
+ SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5
+
+ addiu sp, sp, -40
+ move v0, sp
+ addiu s2, zero, 29692
+ addiu s3, zero, -10426
+ addiu s4, zero, 6967
+ addiu s5, zero, -5906
+ lh t0, 0(a1) /* t0 = inptr[DCTSIZE*0] */
+ lh t5, 0(a0) /* t5 = quantptr[DCTSIZE*0] */
+ lh t1, 48(a1) /* t1 = inptr[DCTSIZE*3] */
+ lh t6, 48(a0) /* t6 = quantptr[DCTSIZE*3] */
+ mul t4, t5, t0
+ lh t0, 16(a1) /* t0 = inptr[DCTSIZE*1] */
+ lh t5, 16(a0) /* t5 = quantptr[DCTSIZE*1] */
+ mul t6, t6, t1
+ mul t5, t5, t0
+ lh t2, 80(a1) /* t2 = inptr[DCTSIZE*5] */
+ lh t7, 80(a0) /* t7 = quantptr[DCTSIZE*5] */
+ lh t3, 112(a1) /* t3 = inptr[DCTSIZE*7] */
+ lh t8, 112(a0) /* t8 = quantptr[DCTSIZE*7] */
+ mul t7, t7, t2
+ mult zero, zero
+ mul t8, t8, t3
+ li s0, 0x73FCD746 /* s0 = (29692 << 16) | (-10426 & 0xffff) */
+ li s1, 0x1B37E8EE /* s1 = (6967 << 16) | (-5906 & 0xffff) */
+ ins t6, t5, 16, 16 /* t6 = t5|t6 */
+ sll t4, t4, 15
+ dpa.w.ph $ac0, t6, s0
+ lh t1, 2(a1)
+ lh t6, 2(a0)
+ ins t8, t7, 16, 16 /* t8 = t7|t8 */
+ dpa.w.ph $ac0, t8, s1
+ mflo t0, $ac0
+ mul t5, t6, t1
+ lh t1, 18(a1)
+ lh t6, 18(a0)
+ lh t2, 50(a1)
+ lh t7, 50(a0)
+ mul t6, t6, t1
+ subu t8, t4, t0
+ mul t7, t7, t2
+ addu t0, t4, t0
+ shra_r.w t0, t0, 13
+ lh t1, 82(a1)
+ lh t2, 82(a0)
+ lh t3, 114(a1)
+ lh t4, 114(a0)
+ shra_r.w t8, t8, 13
+ mul t1, t1, t2
+ mul t3, t3, t4
+ sw t0, 0(v0)
+ sw t8, 20(v0)
+ sll t4, t5, 15
+ ins t7, t6, 16, 16
+ mult zero, zero
+ dpa.w.ph $ac0, t7, s0
+ ins t3, t1, 16, 16
+ lh t1, 6(a1)
+ lh t6, 6(a0)
+ dpa.w.ph $ac0, t3, s1
+ mflo t0, $ac0
+ mul t5, t6, t1
+ lh t1, 22(a1)
+ lh t6, 22(a0)
+ lh t2, 54(a1)
+ lh t7, 54(a0)
+ mul t6, t6, t1
+ subu t8, t4, t0
+ mul t7, t7, t2
+ addu t0, t4, t0
+ shra_r.w t0, t0, 13
+ lh t1, 86(a1)
+ lh t2, 86(a0)
+ lh t3, 118(a1)
+ lh t4, 118(a0)
+ shra_r.w t8, t8, 13
+ mul t1, t1, t2
+ mul t3, t3, t4
+ sw t0, 4(v0)
+ sw t8, 24(v0)
+ sll t4, t5, 15
+ ins t7, t6, 16, 16
+ mult zero, zero
+ dpa.w.ph $ac0, t7, s0
+ ins t3, t1, 16, 16
+ lh t1, 10(a1)
+ lh t6, 10(a0)
+ dpa.w.ph $ac0, t3, s1
+ mflo t0, $ac0
+ mul t5, t6, t1
+ lh t1, 26(a1)
+ lh t6, 26(a0)
+ lh t2, 58(a1)
+ lh t7, 58(a0)
+ mul t6, t6, t1
+ subu t8, t4, t0
+ mul t7, t7, t2
+ addu t0, t4, t0
+ shra_r.w t0, t0, 13
+ lh t1, 90(a1)
+ lh t2, 90(a0)
+ lh t3, 122(a1)
+ lh t4, 122(a0)
+ shra_r.w t8, t8, 13
+ mul t1, t1, t2
+ mul t3, t3, t4
+ sw t0, 8(v0)
+ sw t8, 28(v0)
+ sll t4, t5, 15
+ ins t7, t6, 16, 16
+ mult zero, zero
+ dpa.w.ph $ac0, t7, s0
+ ins t3, t1, 16, 16
+ lh t1, 14(a1)
+ lh t6, 14(a0)
+ dpa.w.ph $ac0, t3, s1
+ mflo t0, $ac0
+ mul t5, t6, t1
+ lh t1, 30(a1)
+ lh t6, 30(a0)
+ lh t2, 62(a1)
+ lh t7, 62(a0)
+ mul t6, t6, t1
+ subu t8, t4, t0
+ mul t7, t7, t2
+ addu t0, t4, t0
+ shra_r.w t0, t0, 13
+ lh t1, 94(a1)
+ lh t2, 94(a0)
+ lh t3, 126(a1)
+ lh t4, 126(a0)
+ shra_r.w t8, t8, 13
+ mul t1, t1, t2
+ mul t3, t3, t4
+ sw t0, 12(v0)
+ sw t8, 32(v0)
+ sll t4, t5, 15
+ ins t7, t6, 16, 16
+ mult zero, zero
+ dpa.w.ph $ac0, t7, s0
+ ins t3, t1, 16, 16
+ dpa.w.ph $ac0, t3, s1
+ mflo t0, $ac0
+ lw t9, 0(a2)
+ lw t3, 0(v0)
+ lw t7, 4(v0)
+ lw t1, 8(v0)
+ addu t9, t9, a3
+ sll t3, t3, 15
+ subu t8, t4, t0
+ addu t0, t4, t0
+ shra_r.w t0, t0, 13
+ shra_r.w t8, t8, 13
+ sw t0, 16(v0)
+ sw t8, 36(v0)
+ lw t5, 12(v0)
+ lw t6, 16(v0)
+ mult t7, s2
+ madd t1, s3
+ madd t5, s4
+ madd t6, s5
+ lw t5, 24(v0)
+ lw t7, 28(v0)
+ mflo t0, $ac0
+ lw t8, 32(v0)
+ lw t2, 36(v0)
+ mult $ac1, t5, s2
+ madd $ac1, t7, s3
+ madd $ac1, t8, s4
+ madd $ac1, t2, s5
+ addu t1, t3, t0
+ subu t6, t3, t0
+ shra_r.w t1, t1, 20
+ shra_r.w t6, t6, 20
+ mflo t4, $ac1
+ shll_s.w t1, t1, 24
+ shll_s.w t6, t6, 24
+ sra t1, t1, 24
+ sra t6, t6, 24
+ addiu t1, t1, 128
+ addiu t6, t6, 128
+ lw t0, 20(v0)
+ sb t1, 0(t9)
+ sb t6, 1(t9)
+ sll t0, t0, 15
+ lw t9, 4(a2)
+ addu t1, t0, t4
+ subu t6, t0, t4
+ addu t9, t9, a3
+ shra_r.w t1, t1, 20
+ shra_r.w t6, t6, 20
+ shll_s.w t1, t1, 24
+ shll_s.w t6, t6, 24
+ sra t1, t1, 24
+ sra t6, t6, 24
+ addiu t1, t1, 128
+ addiu t6, t6, 128
+ sb t1, 0(t9)
+ sb t6, 1(t9)
+ addiu sp, sp, 40
+
+ RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5
+
+ j ra
+ nop
+
+END(jsimd_idct_2x2_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_idct_4x4_dspr2)
+/*
+ * a0 = compptr->dct_table
+ * a1 = coef_block
+ * a2 = output_buf
+ * a3 = output_col
+ * 16(sp) = workspace[DCTSIZE*4] (buffers data between passes)
+ */
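+/*
+ * Reduced 4x4 IDCT, cf. jpeg_idct_4x4 (jidctred.c).  The two column
+ * loops transform 4 + 3 input columns (one column is skipped, as in the
+ * C code) into the caller-supplied workspace; the four unrolled blocks
+ * that follow transform the workspace rows, emitting four pixels per
+ * output row.  15137 and 6270 are FIX_1_847759065 and FIX_0_765366865.
+ */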
+ .set at
+
+ SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ lw v1, 48(sp)
+ move t0, a1
+ move t1, v1
+ li t9, 4
+ li s0, 0x2e75f93e
+ li s1, 0x21f9ba79
+ li s2, 0xecc2efb0
+ li s3, 0x52031ccd
+
+0:
+ lh s6, 32(t0) /* inptr[DCTSIZE*2] */
+ lh t6, 32(a0) /* quantptr[DCTSIZE*2] */
+ lh s7, 96(t0) /* inptr[DCTSIZE*6] */
+ lh t7, 96(a0) /* quantptr[DCTSIZE*6] */
+ mul t6, s6, t6 /* z2 = (inptr[DCTSIZE*2] *
+ quantptr[DCTSIZE*2]) */
+ lh s4, 0(t0) /* inptr[DCTSIZE*0] */
+ mul t7, s7, t7 /* z3 = (inptr[DCTSIZE*6] *
+ quantptr[DCTSIZE*6]) */
+ lh s5, 0(a0) /* quantptr[0] */
+ li s6, 15137
+ li s7, 6270
+ mul t2, s4, s5 /* tmp0 = (inptr[0] * quantptr[0]) */
+ mul t6, s6, t6 /* z2 = (inptr[DCTSIZE*2] *
+ quantptr[DCTSIZE*2]) */
+ lh t5, 112(t0) /* inptr[DCTSIZE*7] */
+ mul t7, s7, t7 /* z3 = (inptr[DCTSIZE*6] *
+ quantptr[DCTSIZE*6]) */
+ lh s4, 112(a0) /* quantptr[DCTSIZE*7] */
+ lh v0, 80(t0) /* inptr[DCTSIZE*5] */
+ lh s5, 80(a0) /* quantptr[DCTSIZE*5] */
+ lh s6, 48(a0) /* quantptr[DCTSIZE*3] */
+ sll t2, t2, 14 /* tmp0 <<= (CONST_BITS+1) */
+ lh s7, 16(a0) /* quantptr[DCTSIZE*1] */
+ lh t8, 16(t0) /* inptr[DCTSIZE*1] */
+ subu t6, t6, t7 /* tmp2 = MULTIPLY(z2, FIX_1_847759065) -
+ MULTIPLY(z3, FIX_0_765366865) */
+ lh t7, 48(t0) /* inptr[DCTSIZE*3] */
+ mul t5, s4, t5 /* z1 = (inptr[DCTSIZE*7] *
+ quantptr[DCTSIZE*7]) */
+ mul v0, s5, v0 /* z2 = (inptr[DCTSIZE*5] *
+ quantptr[DCTSIZE*5]) */
+ mul t7, s6, t7 /* z3 = (inptr[DCTSIZE*3] *
+ quantptr[DCTSIZE*3]) */
+ mul t8, s7, t8 /* z4 = (inptr[DCTSIZE*1] *
+ quantptr[DCTSIZE*1]) */
+ addu t3, t2, t6 /* tmp10 = tmp0 + z2 */
+ subu t4, t2, t6 /* tmp12 = tmp0 - z2 */
+ mult $ac0, zero, zero
+ mult $ac1, zero, zero
+ ins t5, v0, 16, 16
+ ins t7, t8, 16, 16
+ addiu t9, t9, -1
+ dpa.w.ph $ac0, t5, s0
+ dpa.w.ph $ac0, t7, s1
+ dpa.w.ph $ac1, t5, s2
+ dpa.w.ph $ac1, t7, s3
+ mflo s4, $ac0
+ mflo s5, $ac1
+ addiu a0, a0, 2
+ addiu t1, t1, 4
+ addiu t0, t0, 2
+ addu t6, t4, s4
+ subu t5, t4, s4
+ addu s6, t3, s5
+ subu s7, t3, s5
+ shra_r.w t6, t6, 12 /* DESCALE(tmp12 + temp1, 12) */
+ shra_r.w t5, t5, 12 /* DESCALE(tmp12 - temp1, 12) */
+ shra_r.w s6, s6, 12 /* DESCALE(tmp10 + temp2, 12) */
+ shra_r.w s7, s7, 12 /* DESCALE(tmp10 - temp2, 12) */
+ sw t6, 28(t1)
+ sw t5, 60(t1)
+ sw s6, -4(t1)
+ bgtz t9, 0b
+ sw s7, 92(t1)
+ /* second loop: three more columns (column 4 is skipped) */
+ li t9, 3
+1:
+ lh s6, 34(t0) /* inptr[DCTSIZE*2] */
+ lh t6, 34(a0) /* quantptr[DCTSIZE*2] */
+ lh s7, 98(t0) /* inptr[DCTSIZE*6] */
+ lh t7, 98(a0) /* quantptr[DCTSIZE*6] */
+ mul t6, s6, t6 /* z2 = (inptr[DCTSIZE*2] *
+ quantptr[DCTSIZE*2]) */
+ lh s4, 2(t0) /* inptr[DCTSIZE*0] */
+ mul t7, s7, t7 /* z3 = (inptr[DCTSIZE*6] *
+ quantptr[DCTSIZE*6]) */
+ lh s5, 2(a0) /* quantptr[DCTSIZE*0] */
+ li s6, 15137
+ li s7, 6270
+ mul t2, s4, s5 /* tmp0 = (inptr[0] * quantptr[0]) */
+ mul v0, s6, t6 /* z2 = (inptr[DCTSIZE*2] *
+ quantptr[DCTSIZE*2]) */
+ lh t5, 114(t0) /* inptr[DCTSIZE*7] */
+ mul t7, s7, t7 /* z3 = (inptr[DCTSIZE*6] *
+ quantptr[DCTSIZE*6]) */
+ lh s4, 114(a0) /* quantptr[DCTSIZE*7] */
+ lh s5, 82(a0) /* quantptr[DCTSIZE*5] */
+ lh t6, 82(t0) /* inptr[DCTSIZE*5] */
+ sll t2, t2, 14 /* tmp0 <<= (CONST_BITS+1) */
+ lh s6, 50(a0) /* quantptr[DCTSIZE*3] */
+ lh t8, 18(t0) /* inptr[DCTSIZE*1] */
+ subu v0, v0, t7 /* tmp2 = MULTIPLY(z2, FIX_1_847759065) -
+ MULTIPLY(z3, FIX_0_765366865) */
+ lh t7, 50(t0) /* inptr[DCTSIZE*3] */
+ lh s7, 18(a0) /* quantptr[DCTSIZE*1] */
+ mul t5, s4, t5 /* z1 = (inptr[DCTSIZE*7] *
+ quantptr[DCTSIZE*7]) */
+ mul t6, s5, t6 /* z2 = (inptr[DCTSIZE*5] *
+ quantptr[DCTSIZE*5]) */
+ mul t7, s6, t7 /* z3 = (inptr[DCTSIZE*3] *
+ quantptr[DCTSIZE*3]) */
+ mul t8, s7, t8 /* z4 = (inptr[DCTSIZE*1] *
+ quantptr[DCTSIZE*1]) */
+ addu t3, t2, v0 /* tmp10 = tmp0 + z2 */
+ subu t4, t2, v0 /* tmp12 = tmp0 - z2 */
+ mult $ac0, zero, zero
+ mult $ac1, zero, zero
+ ins t5, t6, 16, 16
+ ins t7, t8, 16, 16
+ dpa.w.ph $ac0, t5, s0
+ dpa.w.ph $ac0, t7, s1
+ dpa.w.ph $ac1, t5, s2
+ dpa.w.ph $ac1, t7, s3
+ mflo t5, $ac0
+ mflo t6, $ac1
+ addiu t9, t9, -1
+ addiu t0, t0, 2
+ addiu a0, a0, 2
+ addiu t1, t1, 4
+ addu s5, t4, t5
+ subu s4, t4, t5
+ addu s6, t3, t6
+ subu s7, t3, t6
+ shra_r.w s5, s5, 12 /* DESCALE(tmp12 + temp1, 12) */
+ shra_r.w s4, s4, 12 /* DESCALE(tmp12 - temp1, 12) */
+ shra_r.w s6, s6, 12 /* DESCALE(tmp10 + temp2, 12) */
+ shra_r.w s7, s7, 12 /* DESCALE(tmp10 - temp2, 12) */
+ sw s5, 32(t1)
+ sw s4, 64(t1)
+ sw s6, 0(t1)
+ bgtz t9, 1b
+ sw s7, 96(t1)
+ move t1, v1
+ li s4, 15137
+ lw s6, 8(t1) /* wsptr[2] */
+ li s5, 6270
+ lw s7, 24(t1) /* wsptr[6] */
+ mul s4, s4, s6 /* MULTIPLY((JLONG)wsptr[2],
+ FIX_1_847759065) */
+ lw t2, 0(t1) /* wsptr[0] */
+ mul s5, s5, s7 /* MULTIPLY((JLONG)wsptr[6],
+ -FIX_0_765366865) */
+ lh t5, 28(t1) /* wsptr[7] */
+ lh t6, 20(t1) /* wsptr[5] */
+ lh t7, 12(t1) /* wsptr[3] */
+ lh t8, 4(t1) /* wsptr[1] */
+ ins t5, t6, 16, 16
+ ins t7, t8, 16, 16
+ mult $ac0, zero, zero
+ dpa.w.ph $ac0, t5, s0
+ dpa.w.ph $ac0, t7, s1
+ mult $ac1, zero, zero
+ dpa.w.ph $ac1, t5, s2
+ dpa.w.ph $ac1, t7, s3
+ sll t2, t2, 14 /* tmp0 =
+ ((JLONG)wsptr[0]) << (CONST_BITS+1) */
+ mflo s6, $ac0
+ /* MULTIPLY(wsptr[2], FIX_1_847759065) +
+ MULTIPLY(wsptr[6], -FIX_0_765366865) */
+ subu s4, s4, s5
+ addu t3, t2, s4 /* tmp10 = tmp0 + z2 */
+ mflo s7, $ac1
+ subu t4, t2, s4 /* tmp12 = tmp0 - z2 */
+ addu t7, t4, s6
+ subu t8, t4, s6
+ addu t5, t3, s7
+ subu t6, t3, s7
+ shra_r.w t5, t5, 19 /* DESCALE(tmp10 + temp2, 19) */
+ shra_r.w t6, t6, 19 /* DESCALE(tmp10 - temp2, 19) */
+ shra_r.w t7, t7, 19 /* DESCALE(tmp12 + temp1, 19) */
+ shra_r.w t8, t8, 19 /* DESCALE(tmp12 - temp1, 19) */
+ sll s4, t9, 2
+ lw v0, 0(a2) /* output_buf[ctr] */
+ shll_s.w t5, t5, 24
+ shll_s.w t6, t6, 24
+ shll_s.w t7, t7, 24
+ shll_s.w t8, t8, 24
+ sra t5, t5, 24
+ sra t6, t6, 24
+ sra t7, t7, 24
+ sra t8, t8, 24
+ addu v0, v0, a3 /* outptr = output_buf[ctr] + output_col */
+ addiu t5, t5, 128
+ addiu t6, t6, 128
+ addiu t7, t7, 128
+ addiu t8, t8, 128
+ sb t5, 0(v0)
+ sb t7, 1(v0)
+ sb t8, 2(v0)
+ sb t6, 3(v0)
+ /* 2 */
+ li s4, 15137
+ lw s6, 40(t1) /* wsptr[2] */
+ li s5, 6270
+ lw s7, 56(t1) /* wsptr[6] */
+ mul s4, s4, s6 /* MULTIPLY((JLONG)wsptr[2],
+ FIX_1_847759065) */
+ lw t2, 32(t1) /* wsptr[0] */
+ mul s5, s5, s7 /* MULTIPLY((JLONG)wsptr[6],
+ -FIX_0_765366865) */
+ lh t5, 60(t1) /* wsptr[7] */
+ lh t6, 52(t1) /* wsptr[5] */
+ lh t7, 44(t1) /* wsptr[3] */
+ lh t8, 36(t1) /* wsptr[1] */
+ ins t5, t6, 16, 16
+ ins t7, t8, 16, 16
+ mult $ac0, zero, zero
+ dpa.w.ph $ac0, t5, s0
+ dpa.w.ph $ac0, t7, s1
+ mult $ac1, zero, zero
+ dpa.w.ph $ac1, t5, s2
+ dpa.w.ph $ac1, t7, s3
+ sll t2, t2, 14 /* tmp0 =
+ ((JLONG)wsptr[0]) << (CONST_BITS+1) */
+ mflo s6, $ac0
+ /* MULTIPLY(wsptr[2], FIX_1_847759065) +
+ MULTIPLY(wsptr[6], -FIX_0_765366865) */
+ subu s4, s4, s5
+ addu t3, t2, s4 /* tmp10 = tmp0 + z2 */
+ mflo s7, $ac1
+ subu t4, t2, s4 /* tmp12 = tmp0 - z2 */
+ addu t7, t4, s6
+ subu t8, t4, s6
+ addu t5, t3, s7
+ subu t6, t3, s7
+ shra_r.w t5, t5, 19 /* DESCALE(tmp10 + temp2,
+ CONST_BITS-PASS1_BITS+1) */
+ shra_r.w t6, t6, 19 /* DESCALE(tmp10 - temp2,
+ CONST_BITS-PASS1_BITS+1) */
+ shra_r.w t7, t7, 19 /* DESCALE(tmp12 + temp1,
+ CONST_BITS-PASS1_BITS+1) */
+ shra_r.w t8, t8, 19 /* DESCALE(tmp12 - temp1,
+ CONST_BITS-PASS1_BITS+1) */
+ sll s4, t9, 2
+ lw v0, 4(a2) /* output_buf[ctr] */
+ shll_s.w t5, t5, 24
+ shll_s.w t6, t6, 24
+ shll_s.w t7, t7, 24
+ shll_s.w t8, t8, 24
+ sra t5, t5, 24
+ sra t6, t6, 24
+ sra t7, t7, 24
+ sra t8, t8, 24
+ addu v0, v0, a3 /* outptr = output_buf[ctr] + output_col */
+ addiu t5, t5, 128
+ addiu t6, t6, 128
+ addiu t7, t7, 128
+ addiu t8, t8, 128
+ sb t5, 0(v0)
+ sb t7, 1(v0)
+ sb t8, 2(v0)
+ sb t6, 3(v0)
+ /* 3 */
+ li s4, 15137
+ lw s6, 72(t1) /* wsptr[2] */
+ li s5, 6270
+ lw s7, 88(t1) /* wsptr[6] */
+ mul s4, s4, s6 /* MULTIPLY((JLONG)wsptr[2],
+ FIX_1_847759065) */
+ lw t2, 64(t1) /* wsptr[0] */
+ mul s5, s5, s7 /* MULTIPLY((JLONG)wsptr[6],
+ -FIX_0_765366865) */
+ lh t5, 92(t1) /* wsptr[7] */
+ lh t6, 84(t1) /* wsptr[5] */
+ lh t7, 76(t1) /* wsptr[3] */
+ lh t8, 68(t1) /* wsptr[1] */
+ ins t5, t6, 16, 16
+ ins t7, t8, 16, 16
+ mult $ac0, zero, zero
+ dpa.w.ph $ac0, t5, s0
+ dpa.w.ph $ac0, t7, s1
+ mult $ac1, zero, zero
+ dpa.w.ph $ac1, t5, s2
+ dpa.w.ph $ac1, t7, s3
+ sll t2, t2, 14 /* tmp0 =
+ ((JLONG)wsptr[0]) << (CONST_BITS+1) */
+ mflo s6, $ac0
+ /* MULTIPLY(wsptr[2], FIX_1_847759065) +
+ MULTIPLY(wsptr[6], -FIX_0_765366865) */
+ subu s4, s4, s5
+ addu t3, t2, s4 /* tmp10 = tmp0 + z2 */
+ mflo s7, $ac1
+ subu t4, t2, s4 /* tmp12 = tmp0 - z2 */
+ addu t7, t4, s6
+ subu t8, t4, s6
+ addu t5, t3, s7
+ subu t6, t3, s7
+ shra_r.w t5, t5, 19 /* DESCALE(tmp10 + temp2, 19) */
+ shra_r.w t6, t6, 19 /* DESCALE(tmp10 - temp2, 19) */
+ shra_r.w t7, t7, 19 /* DESCALE(tmp12 + temp1, 19) */
+ shra_r.w t8, t8, 19 /* DESCALE(tmp12 - temp1, 19) */
+ sll s4, t9, 2
+ lw v0, 8(a2) /* output_buf[ctr] */
+ shll_s.w t5, t5, 24
+ shll_s.w t6, t6, 24
+ shll_s.w t7, t7, 24
+ shll_s.w t8, t8, 24
+ sra t5, t5, 24
+ sra t6, t6, 24
+ sra t7, t7, 24
+ sra t8, t8, 24
+ addu v0, v0, a3 /* outptr = output_buf[ctr] + output_col */
+ addiu t5, t5, 128
+ addiu t6, t6, 128
+ addiu t7, t7, 128
+ addiu t8, t8, 128
+ sb t5, 0(v0)
+ sb t7, 1(v0)
+ sb t8, 2(v0)
+ sb t6, 3(v0)
+ li s4, 15137
+ lw s6, 104(t1) /* wsptr[2] */
+ li s5, 6270
+ lw s7, 120(t1) /* wsptr[6] */
+ mul s4, s4, s6 /* MULTIPLY((JLONG)wsptr[2],
+ FIX_1_847759065) */
+ lw t2, 96(t1) /* wsptr[0] */
+ mul s5, s5, s7 /* MULTIPLY((JLONG)wsptr[6],
+ -FIX_0_765366865) */
+ lh t5, 124(t1) /* wsptr[7] */
+ lh t6, 116(t1) /* wsptr[5] */
+ lh t7, 108(t1) /* wsptr[3] */
+ lh t8, 100(t1) /* wsptr[1] */
+ ins t5, t6, 16, 16
+ ins t7, t8, 16, 16
+ mult $ac0, zero, zero
+ dpa.w.ph $ac0, t5, s0
+ dpa.w.ph $ac0, t7, s1
+ mult $ac1, zero, zero
+ dpa.w.ph $ac1, t5, s2
+ dpa.w.ph $ac1, t7, s3
+ sll t2, t2, 14 /* tmp0 =
+ ((JLONG)wsptr[0]) << (CONST_BITS+1) */
+ mflo s6, $ac0
+ /* MULTIPLY(wsptr[2], FIX_1_847759065) +
+ MULTIPLY(wsptr[6], -FIX_0_765366865) */
+ subu s4, s4, s5
+ addu t3, t2, s4 /* tmp10 = tmp0 + z2; */
+ mflo s7, $ac1
+ subu t4, t2, s4 /* tmp12 = tmp0 - z2 */
+ addu t7, t4, s6
+ subu t8, t4, s6
+ addu t5, t3, s7
+ subu t6, t3, s7
+ shra_r.w t5, t5, 19 /* DESCALE(tmp10 + temp2, 19) */
+ shra_r.w t6, t6, 19 /* DESCALE(tmp10 - temp2, 19) */
+ shra_r.w t7, t7, 19 /* DESCALE(tmp12 + temp1, 19) */
+ shra_r.w t8, t8, 19 /* DESCALE(tmp12 - temp1, 19) */
+ sll s4, t9, 2
+ lw v0, 12(a2) /* output_buf[ctr] */
+ shll_s.w t5, t5, 24
+ shll_s.w t6, t6, 24
+ shll_s.w t7, t7, 24
+ shll_s.w t8, t8, 24
+ sra t5, t5, 24
+ sra t6, t6, 24
+ sra t7, t7, 24
+ sra t8, t8, 24
+ addu v0, v0, a3 /* outptr = output_buf[ctr] + output_col */
+ addiu t5, t5, 128
+ addiu t6, t6, 128
+ addiu t7, t7, 128
+ addiu t8, t8, 128
+ sb t5, 0(v0)
+ sb t7, 1(v0)
+ sb t8, 2(v0)
+ sb t6, 3(v0)
+
+ RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ j ra
+ nop
+END(jsimd_idct_4x4_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_idct_6x6_dspr2)
+/*
+ * a0 = compptr->dct_table
+ * a1 = coef_block
+ * a2 = output_buf
+ * a3 = output_col
+ */
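+/*
+ * Reduced 6x6 IDCT, cf. jpeg_idct_6x6 (jidctint.c).  The 13-bit
+ * constants are 5793 = FIX(0.707106781), 10033 = FIX(1.224744871) and
+ * 2998 = FIX(0.366025404).  Pass 1 builds a 6x6 word workspace on the
+ * stack; pass 2 (label 2) transforms its rows and stores clamped,
+ * level-shifted bytes.
+ */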
+ .set at
+
+ SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ addiu sp, sp, -144
+ move v0, sp
+ addiu v1, v0, 24
+ addiu t9, zero, 5793
+ addiu s0, zero, 10033
+ addiu s1, zero, 2998
+
+1:
+ lh s2, 0(a0) /* q0 = quantptr[ 0] */
+ lh s3, 32(a0) /* q1 = quantptr[16] */
+ lh s4, 64(a0) /* q2 = quantptr[32] */
+ lh t2, 64(a1) /* tmp2 = inptr[32] */
+ lh t1, 32(a1) /* tmp1 = inptr[16] */
+ lh t0, 0(a1) /* tmp0 = inptr[ 0] */
+ mul t2, t2, s4 /* tmp2 = tmp2 * q2 */
+ mul t1, t1, s3 /* tmp1 = tmp1 * q1 */
+ mul t0, t0, s2 /* tmp0 = tmp0 * q0 */
+ lh t6, 16(a1) /* z1 = inptr[ 8] */
+ lh t8, 80(a1) /* z3 = inptr[40] */
+ lh t7, 48(a1) /* z2 = inptr[24] */
+ lh s2, 16(a0) /* q0 = quantptr[ 8] */
+ lh s4, 80(a0) /* q2 = quantptr[40] */
+ lh s3, 48(a0) /* q1 = quantptr[24] */
+ mul t2, t2, t9 /* tmp2 = tmp2 * 5793 */
+ mul t1, t1, s0 /* tmp1 = tmp1 * 10033 */
+ sll t0, t0, 13 /* tmp0 = tmp0 << 13 */
+ mul t6, t6, s2 /* z1 = z1 * q0 */
+ mul t8, t8, s4 /* z3 = z3 * q2 */
+ mul t7, t7, s3 /* z2 = z2 * q1 */
+ addu t3, t0, t2 /* tmp10 = tmp0 + tmp2 */
+ sll t2, t2, 1 /* tmp2 = tmp2 << 1 */
+ subu t4, t0, t2 /* tmp11 = tmp0 - tmp2*2 */
+ subu t5, t3, t1 /* tmp12 = tmp10 - tmp1 */
+ addu t3, t3, t1 /* tmp10 = tmp10 + tmp1 */
+ addu t1, t6, t8 /* tmp1 = z1 + z3 */
+ mul t1, t1, s1 /* tmp1 = tmp1 * 2998 */
+ shra_r.w t4, t4, 11 /* tmp11 = (tmp11 + 1024) >> 11 */
+ subu t2, t6, t8 /* tmp2 = z1 - z3 */
+ subu t2, t2, t7 /* tmp2 = tmp2 - z2 */
+ sll t2, t2, 2 /* tmp2 = tmp2 << 2 */
+ addu t0, t6, t7 /* tmp0 = z1 + z2 */
+ sll t0, t0, 13 /* tmp0 = tmp0 << 13 */
+ subu s2, t8, t7 /* q0 = z3 - z2 */
+ sll s2, s2, 13 /* q0 = q0 << 13 */
+ addu t0, t0, t1 /* tmp0 = tmp0 + tmp1 */
+ addu t1, s2, t1 /* tmp1 = q0 + tmp1 */
+ addu s2, t4, t2 /* q0 = tmp11 + tmp2 */
+ subu s3, t4, t2 /* q1 = tmp11 - tmp2 */
+ addu t6, t3, t0 /* z1 = tmp10 + tmp0 */
+ subu t7, t3, t0 /* z2 = tmp10 - tmp0 */
+ addu t4, t5, t1 /* tmp11 = tmp12 + tmp1 */
+ subu t5, t5, t1 /* tmp12 = tmp12 - tmp1 */
+ shra_r.w t6, t6, 11 /* z1 = (z1 + 1024) >> 11 */
+ shra_r.w t7, t7, 11 /* z2 = (z2 + 1024) >> 11 */
+ shra_r.w t4, t4, 11 /* tmp11 = (tmp11 + 1024) >> 11 */
+ shra_r.w t5, t5, 11 /* tmp12 = (tmp12 + 1024) >> 11 */
+ sw s2, 24(v0)
+ sw s3, 96(v0)
+ sw t6, 0(v0)
+ sw t7, 120(v0)
+ sw t4, 48(v0)
+ sw t5, 72(v0)
+ addiu v0, v0, 4
+ addiu a1, a1, 2
+ bne v0, v1, 1b
+ addiu a0, a0, 2
+
+ /* Pass 2: process 6 rows from work array, store into output array. */
+ move v0, sp
+ addiu v1, v0, 144
+
+2:
+ lw t0, 0(v0)
+ lw t2, 16(v0)
+ lw s5, 0(a2)
+ addiu t0, t0, 16
+ sll t0, t0, 13
+ mul t3, t2, t9
+ lw t6, 4(v0)
+ lw t8, 20(v0)
+ lw t7, 12(v0)
+ addu s5, s5, a3
+ addu s6, t6, t8
+ mul s6, s6, s1
+ addu t1, t0, t3
+ subu t4, t0, t3
+ subu t4, t4, t3
+ lw t3, 8(v0)
+ mul t0, t3, s0
+ addu s7, t6, t7
+ sll s7, s7, 13
+ addu s7, s6, s7
+ subu t2, t8, t7
+ sll t2, t2, 13
+ addu t2, s6, t2
+ subu s6, t6, t7
+ subu s6, s6, t8
+ sll s6, s6, 13
+ addu t3, t1, t0
+ subu t5, t1, t0
+ addu t6, t3, s7
+ subu t3, t3, s7
+ addu t7, t4, s6
+ subu t4, t4, s6
+ addu t8, t5, t2
+ subu t5, t5, t2
+ shll_s.w t6, t6, 6
+ shll_s.w t3, t3, 6
+ shll_s.w t7, t7, 6
+ shll_s.w t4, t4, 6
+ shll_s.w t8, t8, 6
+ shll_s.w t5, t5, 6
+ sra t6, t6, 24
+ addiu t6, t6, 128
+ sra t3, t3, 24
+ addiu t3, t3, 128
+ sb t6, 0(s5)
+ sra t7, t7, 24
+ addiu t7, t7, 128
+ sb t3, 5(s5)
+ sra t4, t4, 24
+ addiu t4, t4, 128
+ sb t7, 1(s5)
+ sra t8, t8, 24
+ addiu t8, t8, 128
+ sb t4, 4(s5)
+ addiu v0, v0, 24
+ sra t5, t5, 24
+ addiu t5, t5, 128
+ sb t8, 2(s5)
+ addiu a2, a2, 4
+ bne v0, v1, 2b
+ sb t5, 3(s5)
+
+ addiu sp, sp, 144
+
+ RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+ j ra
+ nop
+
+END(jsimd_idct_6x6_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_idct_12x12_pass1_dspr2)
+/*
+ * a0 = compptr->dct_table
+ * a1 = coef_block
+ * a2 = workspace
+ */
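+ /* Note: this pass transforms 8 columns per call (a3 counts them down)
+    and writes 12 rows of 32-bit intermediate values, spaced 32 bytes
+    apart, into the workspace. */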
+ SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
+
+ li a3, 8
+
+1:
+ /* odd part */
+ lh t0, 48(a1)
+ lh t1, 48(a0)
+ lh t2, 16(a1)
+ lh t3, 16(a0)
+ lh t4, 80(a1)
+ lh t5, 80(a0)
+ lh t6, 112(a1)
+ lh t7, 112(a0)
+ mul t0, t0, t1 /* z2 */
+ mul t1, t2, t3 /* z1 */
+ mul t2, t4, t5 /* z3 */
+ mul t3, t6, t7 /* z4 */
+ li t4, 10703 /* FIX(1.306562965) */
+ li t5, 4433 /* FIX_0_541196100 */
+ li t6, 7053 /* FIX(0.860918669) */
+ mul t4, t0, t4 /* tmp11 */
+ mul t5, t0, t5 /* -tmp14 */
+ addu t7, t1, t2 /* tmp10 */
+ addu t8, t7, t3 /* tmp10 + z4 */
+ mul t6, t6, t8 /* tmp15 */
+ li t8, 2139 /* FIX(0.261052384) */
+ mul t8, t7, t8 /* MULTIPLY(tmp10, FIX(0.261052384)) */
+ li t7, 2295 /* FIX(0.280143716) */
+ mul t7, t1, t7 /* MULTIPLY(z1, FIX(0.280143716)) */
+ addu t9, t2, t3 /* z3 + z4 */
+ li s0, 8565 /* FIX(1.045510580) */
+ mul t9, t9, s0 /* -tmp13 */
+ li s0, 12112 /* FIX(1.478575242) */
+ mul s0, t2, s0 /* MULTIPLY(z3, FIX(1.478575242)) */
+ li s1, 12998 /* FIX(1.586706681) */
+ mul s1, t3, s1 /* MULTIPLY(z4, FIX(1.586706681)) */
+ li s2, 5540 /* FIX(0.676326758) */
+ mul s2, t1, s2 /* MULTIPLY(z1, FIX(0.676326758)) */
+ li s3, 16244 /* FIX(1.982889723) */
+ mul s3, t3, s3 /* MULTIPLY(z4, FIX(1.982889723)) */
+ subu t1, t1, t3 /* z1-=z4 */
+ subu t0, t0, t2 /* z2-=z3 */
+ addu t2, t0, t1 /* z1+z2 */
+ li t3, 4433 /* FIX_0_541196100 */
+ mul t2, t2, t3 /* z3 */
+ li t3, 6270 /* FIX_0_765366865 */
+ mul t1, t1, t3 /* MULTIPLY(z1, FIX_0_765366865) */
+ li t3, 15137 /* FIX_1_847759065 */
+ mul t0, t0, t3 /* MULTIPLY(z2, FIX_1_847759065) */
+ addu t8, t6, t8 /* tmp12 */
+ addu t3, t8, t4 /* tmp12 + tmp11 */
+ addu t3, t3, t7 /* tmp10 */
+ subu t8, t8, t9 /* tmp12 + tmp13 */
+ addu s0, t5, s0
+ subu t8, t8, s0 /* tmp12 */
+ subu t9, t6, t9
+ subu s1, s1, t4
+ addu t9, t9, s1 /* tmp13 */
+ subu t6, t6, t5
+ subu t6, t6, s2
+ subu t6, t6, s3 /* tmp15 */
+ /* even part start */
+ lh t4, 64(a1)
+ lh t5, 64(a0)
+ lh t7, 32(a1)
+ lh s0, 32(a0)
+ lh s1, 0(a1)
+ lh s2, 0(a0)
+ lh s3, 96(a1)
+ lh v0, 96(a0)
+ mul t4, t4, t5 /* DEQUANTIZE(inptr[DCTSIZE*4],
+ quantptr[DCTSIZE*4]) */
+ mul t5, t7, s0 /* DEQUANTIZE(inptr[DCTSIZE*2],
+ quantptr[DCTSIZE*2]) */
+ mul t7, s1, s2 /* DEQUANTIZE(inptr[DCTSIZE*0],
+ quantptr[DCTSIZE*0]) */
+ mul s0, s3, v0 /* DEQUANTIZE(inptr[DCTSIZE*6],
+ quantptr[DCTSIZE*6]) */
+ /* odd part end */
+ addu t1, t2, t1 /* tmp11 */
+ subu t0, t2, t0 /* tmp14 */
+ /* update counter and pointers */
+ addiu a3, a3, -1
+ addiu a0, a0, 2
+ addiu a1, a1, 2
+ /* even part rest */
+ li s1, 10033
+ li s2, 11190
+ mul t4, t4, s1 /* z4 */
+ mul s1, t5, s2 /* z4 */
+ sll t5, t5, 13 /* z1 */
+ sll t7, t7, 13
+ addiu t7, t7, 1024 /* z3 */
+ sll s0, s0, 13 /* z2 */
+ addu s2, t7, t4 /* tmp10 */
+ subu t4, t7, t4 /* tmp11 */
+ subu s3, t5, s0 /* tmp12 */
+ addu t2, t7, s3 /* tmp21 */
+ subu s3, t7, s3 /* tmp24 */
+ addu t7, s1, s0 /* tmp12 */
+ addu v0, s2, t7 /* tmp20 */
+ subu s2, s2, t7 /* tmp25 */
+ subu s1, s1, t5 /* z4 - z1 */
+ subu s1, s1, s0 /* tmp12 */
+ addu s0, t4, s1 /* tmp22 */
+ subu t4, t4, s1 /* tmp23 */
+ /* final output stage */
+ addu t5, v0, t3
+ subu v0, v0, t3
+ addu t3, t2, t1
+ subu t2, t2, t1
+ addu t1, s0, t8
+ subu s0, s0, t8
+ addu t8, t4, t9
+ subu t4, t4, t9
+ addu t9, s3, t0
+ subu s3, s3, t0
+ addu t0, s2, t6
+ subu s2, s2, t6
+ sra t5, t5, 11
+ sra t3, t3, 11
+ sra t1, t1, 11
+ sra t8, t8, 11
+ sra t9, t9, 11
+ sra t0, t0, 11
+ sra s2, s2, 11
+ sra s3, s3, 11
+ sra t4, t4, 11
+ sra s0, s0, 11
+ sra t2, t2, 11
+ sra v0, v0, 11
+ sw t5, 0(a2)
+ sw t3, 32(a2)
+ sw t1, 64(a2)
+ sw t8, 96(a2)
+ sw t9, 128(a2)
+ sw t0, 160(a2)
+ sw s2, 192(a2)
+ sw s3, 224(a2)
+ sw t4, 256(a2)
+ sw s0, 288(a2)
+ sw t2, 320(a2)
+ sw v0, 352(a2)
+ bgtz a3, 1b
+ addiu a2, a2, 4
+
+ RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
+
+ j ra
+ nop
+
+END(jsimd_idct_12x12_pass1_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_idct_12x12_pass2_dspr2)
+/*
+ * a0 = workspace
+ * a1 = output
+ */
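+ /* Note: this pass transforms the 12 workspace rows (a3 counts them
+    down) and stores 12 clamped, level-shifted samples per row through
+    the output[] row pointers. */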
+ SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
+
+ li a3, 12
+
+1:
+ /* Odd part */
+ lw t0, 12(a0)
+ lw t1, 4(a0)
+ lw t2, 20(a0)
+ lw t3, 28(a0)
+ li t4, 10703 /* FIX(1.306562965) */
+ li t5, 4433 /* FIX_0_541196100 */
+ mul t4, t0, t4 /* tmp11 */
+ mul t5, t0, t5 /* -tmp14 */
+ addu t6, t1, t2 /* tmp10 */
+ li t7, 2139 /* FIX(0.261052384) */
+ mul t7, t6, t7 /* MULTIPLY(tmp10, FIX(0.261052384)) */
+ addu t6, t6, t3 /* tmp10 + z4 */
+ li t8, 7053 /* FIX(0.860918669) */
+ mul t6, t6, t8 /* tmp15 */
+ li t8, 2295 /* FIX(0.280143716) */
+ mul t8, t1, t8 /* MULTIPLY(z1, FIX(0.280143716)) */
+ addu t9, t2, t3 /* z3 + z4 */
+ li s0, 8565 /* FIX(1.045510580) */
+ mul t9, t9, s0 /* -tmp13 */
+ li s0, 12112 /* FIX(1.478575242) */
+ mul s0, t2, s0 /* MULTIPLY(z3, FIX(1.478575242)) */
+ li s1, 12998 /* FIX(1.586706681) */
+ mul s1, t3, s1 /* MULTIPLY(z4, FIX(1.586706681)) */
+ li s2, 5540 /* FIX(0.676326758) */
+ mul s2, t1, s2 /* MULTIPLY(z1, FIX(0.676326758)) */
+ li s3, 16244 /* FIX(1.982889723) */
+ mul s3, t3, s3 /* MULTIPLY(z4, FIX(1.982889723)) */
+ subu t1, t1, t3 /* z1 -= z4 */
+ subu t0, t0, t2 /* z2 -= z3 */
+ addu t2, t1, t0 /* z1 + z2 */
+ li t3, 4433 /* FIX_0_541196100 */
+ mul t2, t2, t3 /* z3 */
+ li t3, 6270 /* FIX_0_765366865 */
+ mul t1, t1, t3 /* MULTIPLY(z1, FIX_0_765366865) */
+ li t3, 15137 /* FIX_1_847759065 */
+ mul t0, t0, t3 /* MULTIPLY(z2, FIX_1_847759065) */
+ addu t3, t6, t7 /* tmp12 */
+ addu t7, t3, t4
+ addu t7, t7, t8 /* tmp10 */
+ subu t3, t3, t9
+ subu t3, t3, t5
+ subu t3, t3, s0 /* tmp12 */
+ subu t9, t6, t9
+ subu t9, t9, t4
+ addu t9, t9, s1 /* tmp13 */
+ subu t6, t6, t5
+ subu t6, t6, s2
+ subu t6, t6, s3 /* tmp15 */
+ addu t1, t2, t1 /* tmp11 */
+ subu t0, t2, t0 /* tmp14 */
+ /* even part */
+ lw t2, 16(a0) /* z4 */
+ lw t4, 8(a0) /* z1 */
+ lw t5, 0(a0) /* z3 */
+ lw t8, 24(a0) /* z2 */
+ li s0, 10033 /* FIX(1.224744871) */
+ li s1, 11190 /* FIX(1.366025404) */
+ mul t2, t2, s0 /* z4 */
+ mul s0, t4, s1 /* z4 */
+ addiu t5, t5, 0x10
+ sll t5, t5, 13 /* z3 */
+ sll t4, t4, 13 /* z1 */
+ sll t8, t8, 13 /* z2 */
+ subu s1, t4, t8 /* tmp12 */
+ addu s2, t5, t2 /* tmp10 */
+ subu t2, t5, t2 /* tmp11 */
+ addu s3, t5, s1 /* tmp21 */
+ subu s1, t5, s1 /* tmp24 */
+ addu t5, s0, t8 /* tmp12 */
+ addu v0, s2, t5 /* tmp20 */
+ subu t5, s2, t5 /* tmp25 */
+ subu t4, s0, t4
+ subu t4, t4, t8 /* tmp12 */
+ addu t8, t2, t4 /* tmp22 */
+ subu t2, t2, t4 /* tmp23 */
+ /* increment counter and pointers */
+ addiu a3, a3, -1
+ addiu a0, a0, 32
+ /* Final stage */
+ addu t4, v0, t7
+ subu v0, v0, t7
+ addu t7, s3, t1
+ subu s3, s3, t1
+ addu t1, t8, t3
+ subu t8, t8, t3
+ addu t3, t2, t9
+ subu t2, t2, t9
+ addu t9, s1, t0
+ subu s1, s1, t0
+ addu t0, t5, t6
+ subu t5, t5, t6
+ sll t4, t4, 4
+ sll t7, t7, 4
+ sll t1, t1, 4
+ sll t3, t3, 4
+ sll t9, t9, 4
+ sll t0, t0, 4
+ sll t5, t5, 4
+ sll s1, s1, 4
+ sll t2, t2, 4
+ sll t8, t8, 4
+ sll s3, s3, 4
+ sll v0, v0, 4
+ shll_s.w t4, t4, 2
+ shll_s.w t7, t7, 2
+ shll_s.w t1, t1, 2
+ shll_s.w t3, t3, 2
+ shll_s.w t9, t9, 2
+ shll_s.w t0, t0, 2
+ shll_s.w t5, t5, 2
+ shll_s.w s1, s1, 2
+ shll_s.w t2, t2, 2
+ shll_s.w t8, t8, 2
+ shll_s.w s3, s3, 2
+ shll_s.w v0, v0, 2
+ srl t4, t4, 24
+ srl t7, t7, 24
+ srl t1, t1, 24
+ srl t3, t3, 24
+ srl t9, t9, 24
+ srl t0, t0, 24
+ srl t5, t5, 24
+ srl s1, s1, 24
+ srl t2, t2, 24
+ srl t8, t8, 24
+ srl s3, s3, 24
+ srl v0, v0, 24
+ lw t6, 0(a1)
+ addiu t4, t4, 0x80
+ addiu t7, t7, 0x80
+ addiu t1, t1, 0x80
+ addiu t3, t3, 0x80
+ addiu t9, t9, 0x80
+ addiu t0, t0, 0x80
+ addiu t5, t5, 0x80
+ addiu s1, s1, 0x80
+ addiu t2, t2, 0x80
+ addiu t8, t8, 0x80
+ addiu s3, s3, 0x80
+ addiu v0, v0, 0x80
+ sb t4, 0(t6)
+ sb t7, 1(t6)
+ sb t1, 2(t6)
+ sb t3, 3(t6)
+ sb t9, 4(t6)
+ sb t0, 5(t6)
+ sb t5, 6(t6)
+ sb s1, 7(t6)
+ sb t2, 8(t6)
+ sb t8, 9(t6)
+ sb s3, 10(t6)
+ sb v0, 11(t6)
+ bgtz a3, 1b
+ addiu a1, a1, 4
+
+ RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
+
+ jr ra
+ nop
+
+END(jsimd_idct_12x12_pass2_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_convsamp_dspr2)
+/*
+ * a0 = sample_data
+ * a1 = start_col
+ * a2 = workspace
+ */
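+ /* Note: each block below loads one 8-sample row (ulw tolerates
+    unaligned row pointers), widens the bytes to halfwords with
+    preceu.ph.qbr/qbl, and adds t7 = 0xff80ff80 (two packed -128
+    halfwords) to subtract CENTERJSAMPLE before storing the results to
+    the workspace. */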
+ lw t0, 0(a0)
+ li t7, 0xff80ff80
+ addu t0, t0, a1
+ ulw t1, 0(t0)
+ ulw t2, 4(t0)
+ preceu.ph.qbr t3, t1
+ preceu.ph.qbl t4, t1
+ lw t0, 4(a0)
+ preceu.ph.qbr t5, t2
+ preceu.ph.qbl t6, t2
+ addu t0, t0, a1
+ addu.ph t3, t3, t7
+ addu.ph t4, t4, t7
+ ulw t1, 0(t0)
+ ulw t2, 4(t0)
+ addu.ph t5, t5, t7
+ addu.ph t6, t6, t7
+ usw t3, 0(a2)
+ usw t4, 4(a2)
+ preceu.ph.qbr t3, t1
+ preceu.ph.qbl t4, t1
+ usw t5, 8(a2)
+ usw t6, 12(a2)
+
+ lw t0, 8(a0)
+ preceu.ph.qbr t5, t2
+ preceu.ph.qbl t6, t2
+ addu t0, t0, a1
+ addu.ph t3, t3, t7
+ addu.ph t4, t4, t7
+ ulw t1, 0(t0)
+ ulw t2, 4(t0)
+ addu.ph t5, t5, t7
+ addu.ph t6, t6, t7
+ usw t3, 16(a2)
+ usw t4, 20(a2)
+ preceu.ph.qbr t3, t1
+ preceu.ph.qbl t4, t1
+ usw t5, 24(a2)
+ usw t6, 28(a2)
+
+ lw t0, 12(a0)
+ preceu.ph.qbr t5, t2
+ preceu.ph.qbl t6, t2
+ addu t0, t0, a1
+ addu.ph t3, t3, t7
+ addu.ph t4, t4, t7
+ ulw t1, 0(t0)
+ ulw t2, 4(t0)
+ addu.ph t5, t5, t7
+ addu.ph t6, t6, t7
+ usw t3, 32(a2)
+ usw t4, 36(a2)
+ preceu.ph.qbr t3, t1
+ preceu.ph.qbl t4, t1
+ usw t5, 40(a2)
+ usw t6, 44(a2)
+
+ lw t0, 16(a0)
+ preceu.ph.qbr t5, t2
+ preceu.ph.qbl t6, t2
+ addu t0, t0, a1
+ addu.ph t3, t3, t7
+ addu.ph t4, t4, t7
+ ulw t1, 0(t0)
+ ulw t2, 4(t0)
+ addu.ph t5, t5, t7
+ addu.ph t6, t6, t7
+ usw t3, 48(a2)
+ usw t4, 52(a2)
+ preceu.ph.qbr t3, t1
+ preceu.ph.qbl t4, t1
+ usw t5, 56(a2)
+ usw t6, 60(a2)
+
+ lw t0, 20(a0)
+ preceu.ph.qbr t5, t2
+ preceu.ph.qbl t6, t2
+ addu t0, t0, a1
+ addu.ph t3, t3, t7
+ addu.ph t4, t4, t7
+ ulw t1, 0(t0)
+ ulw t2, 4(t0)
+ addu.ph t5, t5, t7
+ addu.ph t6, t6, t7
+ usw t3, 64(a2)
+ usw t4, 68(a2)
+ preceu.ph.qbr t3, t1
+ preceu.ph.qbl t4, t1
+ usw t5, 72(a2)
+ usw t6, 76(a2)
+
+ lw t0, 24(a0)
+ preceu.ph.qbr t5, t2
+ preceu.ph.qbl t6, t2
+ addu t0, t0, a1
+ addu.ph t3, t3, t7
+ addu.ph t4, t4, t7
+ ulw t1, 0(t0)
+ ulw t2, 4(t0)
+ addu.ph t5, t5, t7
+ addu.ph t6, t6, t7
+ usw t3, 80(a2)
+ usw t4, 84(a2)
+ preceu.ph.qbr t3, t1
+ preceu.ph.qbl t4, t1
+ usw t5, 88(a2)
+ usw t6, 92(a2)
+
+ lw t0, 28(a0)
+ preceu.ph.qbr t5, t2
+ preceu.ph.qbl t6, t2
+ addu t0, t0, a1
+ addu.ph t3, t3, t7
+ addu.ph t4, t4, t7
+ ulw t1, 0(t0)
+ ulw t2, 4(t0)
+ addu.ph t5, t5, t7
+ addu.ph t6, t6, t7
+ usw t3, 96(a2)
+ usw t4, 100(a2)
+ preceu.ph.qbr t3, t1
+ preceu.ph.qbl t4, t1
+ usw t5, 104(a2)
+ usw t6, 108(a2)
+ preceu.ph.qbr t5, t2
+ preceu.ph.qbl t6, t2
+ addu.ph t3, t3, t7
+ addu.ph t4, t4, t7
+ addu.ph t5, t5, t7
+ addu.ph t6, t6, t7
+ usw t3, 112(a2)
+ usw t4, 116(a2)
+ usw t5, 120(a2)
+ usw t6, 124(a2)
+
+ j ra
+ nop
+
+END(jsimd_convsamp_dspr2)
+
+
+#ifndef __mips_soft_float
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_convsamp_float_dspr2)
+/*
+ * a0 = sample_data
+ * a1 = start_col
+ * a2 = workspace
+ */
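+ /* Note: for each of the 8 rows, the samples are loaded as bytes,
+    level-shifted by -128 in integer registers, moved to the FPU with
+    mtc1, converted to single precision with cvt.s.w, and stored to the
+    float workspace. */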
+ .set at
+
+ lw t0, 0(a0)
+ addu t0, t0, a1
+ lbu t1, 0(t0)
+ lbu t2, 1(t0)
+ lbu t3, 2(t0)
+ lbu t4, 3(t0)
+ lbu t5, 4(t0)
+ lbu t6, 5(t0)
+ lbu t7, 6(t0)
+ lbu t8, 7(t0)
+ addiu t1, t1, -128
+ addiu t2, t2, -128
+ addiu t3, t3, -128
+ addiu t4, t4, -128
+ addiu t5, t5, -128
+ addiu t6, t6, -128
+ addiu t7, t7, -128
+ addiu t8, t8, -128
+ mtc1 t1, f2
+ mtc1 t2, f4
+ mtc1 t3, f6
+ mtc1 t4, f8
+ mtc1 t5, f10
+ mtc1 t6, f12
+ mtc1 t7, f14
+ mtc1 t8, f16
+ cvt.s.w f2, f2
+ cvt.s.w f4, f4
+ cvt.s.w f6, f6
+ cvt.s.w f8, f8
+ cvt.s.w f10, f10
+ cvt.s.w f12, f12
+ cvt.s.w f14, f14
+ cvt.s.w f16, f16
+ lw t0, 4(a0)
+ swc1 f2, 0(a2)
+ swc1 f4, 4(a2)
+ swc1 f6, 8(a2)
+ addu t0, t0, a1
+ swc1 f8, 12(a2)
+ swc1 f10, 16(a2)
+ swc1 f12, 20(a2)
+ swc1 f14, 24(a2)
+ swc1 f16, 28(a2)
+ /* element row 1 */
+ lbu t1, 0(t0)
+ lbu t2, 1(t0)
+ lbu t3, 2(t0)
+ lbu t4, 3(t0)
+ lbu t5, 4(t0)
+ lbu t6, 5(t0)
+ lbu t7, 6(t0)
+ lbu t8, 7(t0)
+ addiu t1, t1, -128
+ addiu t2, t2, -128
+ addiu t3, t3, -128
+ addiu t4, t4, -128
+ addiu t5, t5, -128
+ addiu t6, t6, -128
+ addiu t7, t7, -128
+ addiu t8, t8, -128
+ mtc1 t1, f2
+ mtc1 t2, f4
+ mtc1 t3, f6
+ mtc1 t4, f8
+ mtc1 t5, f10
+ mtc1 t6, f12
+ mtc1 t7, f14
+ mtc1 t8, f16
+ cvt.s.w f2, f2
+ cvt.s.w f4, f4
+ cvt.s.w f6, f6
+ cvt.s.w f8, f8
+ cvt.s.w f10, f10
+ cvt.s.w f12, f12
+ cvt.s.w f14, f14
+ cvt.s.w f16, f16
+ lw t0, 8(a0)
+ swc1 f2, 32(a2)
+ swc1 f4, 36(a2)
+ swc1 f6, 40(a2)
+ addu t0, t0, a1
+ swc1 f8, 44(a2)
+ swc1 f10, 48(a2)
+ swc1 f12, 52(a2)
+ swc1 f14, 56(a2)
+ swc1 f16, 60(a2)
+ /* element row 2 */
+ lbu t1, 0(t0)
+ lbu t2, 1(t0)
+ lbu t3, 2(t0)
+ lbu t4, 3(t0)
+ lbu t5, 4(t0)
+ lbu t6, 5(t0)
+ lbu t7, 6(t0)
+ lbu t8, 7(t0)
+ addiu t1, t1, -128
+ addiu t2, t2, -128
+ addiu t3, t3, -128
+ addiu t4, t4, -128
+ addiu t5, t5, -128
+ addiu t6, t6, -128
+ addiu t7, t7, -128
+ addiu t8, t8, -128
+ mtc1 t1, f2
+ mtc1 t2, f4
+ mtc1 t3, f6
+ mtc1 t4, f8
+ mtc1 t5, f10
+ mtc1 t6, f12
+ mtc1 t7, f14
+ mtc1 t8, f16
+ cvt.s.w f2, f2
+ cvt.s.w f4, f4
+ cvt.s.w f6, f6
+ cvt.s.w f8, f8
+ cvt.s.w f10, f10
+ cvt.s.w f12, f12
+ cvt.s.w f14, f14
+ cvt.s.w f16, f16
+ lw t0, 12(a0)
+ swc1 f2, 64(a2)
+ swc1 f4, 68(a2)
+ swc1 f6, 72(a2)
+ addu t0, t0, a1
+ swc1 f8, 76(a2)
+ swc1 f10, 80(a2)
+ swc1 f12, 84(a2)
+ swc1 f14, 88(a2)
+ swc1 f16, 92(a2)
+ /* element row 3 */
+ lbu t1, 0(t0)
+ lbu t2, 1(t0)
+ lbu t3, 2(t0)
+ lbu t4, 3(t0)
+ lbu t5, 4(t0)
+ lbu t6, 5(t0)
+ lbu t7, 6(t0)
+ lbu t8, 7(t0)
+ addiu t1, t1, -128
+ addiu t2, t2, -128
+ addiu t3, t3, -128
+ addiu t4, t4, -128
+ addiu t5, t5, -128
+ addiu t6, t6, -128
+ addiu t7, t7, -128
+ addiu t8, t8, -128
+ mtc1 t1, f2
+ mtc1 t2, f4
+ mtc1 t3, f6
+ mtc1 t4, f8
+ mtc1 t5, f10
+ mtc1 t6, f12
+ mtc1 t7, f14
+ mtc1 t8, f16
+ cvt.s.w f2, f2
+ cvt.s.w f4, f4
+ cvt.s.w f6, f6
+ cvt.s.w f8, f8
+ cvt.s.w f10, f10
+ cvt.s.w f12, f12
+ cvt.s.w f14, f14
+ cvt.s.w f16, f16
+ lw t0, 16(a0)
+ swc1 f2, 96(a2)
+ swc1 f4, 100(a2)
+ swc1 f6, 104(a2)
+ addu t0, t0, a1
+ swc1 f8, 108(a2)
+ swc1 f10, 112(a2)
+ swc1 f12, 116(a2)
+ swc1 f14, 120(a2)
+ swc1 f16, 124(a2)
+ /* element row 4 */
+ lbu t1, 0(t0)
+ lbu t2, 1(t0)
+ lbu t3, 2(t0)
+ lbu t4, 3(t0)
+ lbu t5, 4(t0)
+ lbu t6, 5(t0)
+ lbu t7, 6(t0)
+ lbu t8, 7(t0)
+ addiu t1, t1, -128
+ addiu t2, t2, -128
+ addiu t3, t3, -128
+ addiu t4, t4, -128
+ addiu t5, t5, -128
+ addiu t6, t6, -128
+ addiu t7, t7, -128
+ addiu t8, t8, -128
+ mtc1 t1, f2
+ mtc1 t2, f4
+ mtc1 t3, f6
+ mtc1 t4, f8
+ mtc1 t5, f10
+ mtc1 t6, f12
+ mtc1 t7, f14
+ mtc1 t8, f16
+ cvt.s.w f2, f2
+ cvt.s.w f4, f4
+ cvt.s.w f6, f6
+ cvt.s.w f8, f8
+ cvt.s.w f10, f10
+ cvt.s.w f12, f12
+ cvt.s.w f14, f14
+ cvt.s.w f16, f16
+ lw t0, 20(a0)
+ swc1 f2, 128(a2)
+ swc1 f4, 132(a2)
+ swc1 f6, 136(a2)
+ addu t0, t0, a1
+ swc1 f8, 140(a2)
+ swc1 f10, 144(a2)
+ swc1 f12, 148(a2)
+ swc1 f14, 152(a2)
+ swc1 f16, 156(a2)
+ /* element row 5 */
+ lbu t1, 0(t0)
+ lbu t2, 1(t0)
+ lbu t3, 2(t0)
+ lbu t4, 3(t0)
+ lbu t5, 4(t0)
+ lbu t6, 5(t0)
+ lbu t7, 6(t0)
+ lbu t8, 7(t0)
+ addiu t1, t1, -128
+ addiu t2, t2, -128
+ addiu t3, t3, -128
+ addiu t4, t4, -128
+ addiu t5, t5, -128
+ addiu t6, t6, -128
+ addiu t7, t7, -128
+ addiu t8, t8, -128
+ mtc1 t1, f2
+ mtc1 t2, f4
+ mtc1 t3, f6
+ mtc1 t4, f8
+ mtc1 t5, f10
+ mtc1 t6, f12
+ mtc1 t7, f14
+ mtc1 t8, f16
+ cvt.s.w f2, f2
+ cvt.s.w f4, f4
+ cvt.s.w f6, f6
+ cvt.s.w f8, f8
+ cvt.s.w f10, f10
+ cvt.s.w f12, f12
+ cvt.s.w f14, f14
+ cvt.s.w f16, f16
+ lw t0, 24(a0)
+ swc1 f2, 160(a2)
+ swc1 f4, 164(a2)
+ swc1 f6, 168(a2)
+ addu t0, t0, a1
+ swc1 f8, 172(a2)
+ swc1 f10, 176(a2)
+ swc1 f12, 180(a2)
+ swc1 f14, 184(a2)
+ swc1 f16, 188(a2)
+ /* element row 6 */
+ lbu t1, 0(t0)
+ lbu t2, 1(t0)
+ lbu t3, 2(t0)
+ lbu t4, 3(t0)
+ lbu t5, 4(t0)
+ lbu t6, 5(t0)
+ lbu t7, 6(t0)
+ lbu t8, 7(t0)
+ addiu t1, t1, -128
+ addiu t2, t2, -128
+ addiu t3, t3, -128
+ addiu t4, t4, -128
+ addiu t5, t5, -128
+ addiu t6, t6, -128
+ addiu t7, t7, -128
+ addiu t8, t8, -128
+ mtc1 t1, f2
+ mtc1 t2, f4
+ mtc1 t3, f6
+ mtc1 t4, f8
+ mtc1 t5, f10
+ mtc1 t6, f12
+ mtc1 t7, f14
+ mtc1 t8, f16
+ cvt.s.w f2, f2
+ cvt.s.w f4, f4
+ cvt.s.w f6, f6
+ cvt.s.w f8, f8
+ cvt.s.w f10, f10
+ cvt.s.w f12, f12
+ cvt.s.w f14, f14
+ cvt.s.w f16, f16
+ lw t0, 28(a0)
+ swc1 f2, 192(a2)
+ swc1 f4, 196(a2)
+ swc1 f6, 200(a2)
+ addu t0, t0, a1
+ swc1 f8, 204(a2)
+ swc1 f10, 208(a2)
+ swc1 f12, 212(a2)
+ swc1 f14, 216(a2)
+ swc1 f16, 220(a2)
+ /* element row 7 */
+ lbu t1, 0(t0)
+ lbu t2, 1(t0)
+ lbu t3, 2(t0)
+ lbu t4, 3(t0)
+ lbu t5, 4(t0)
+ lbu t6, 5(t0)
+ lbu t7, 6(t0)
+ lbu t8, 7(t0)
+ addiu t1, t1, -128
+ addiu t2, t2, -128
+ addiu t3, t3, -128
+ addiu t4, t4, -128
+ addiu t5, t5, -128
+ addiu t6, t6, -128
+ addiu t7, t7, -128
+ addiu t8, t8, -128
+ mtc1 t1, f2
+ mtc1 t2, f4
+ mtc1 t3, f6
+ mtc1 t4, f8
+ mtc1 t5, f10
+ mtc1 t6, f12
+ mtc1 t7, f14
+ mtc1 t8, f16
+ cvt.s.w f2, f2
+ cvt.s.w f4, f4
+ cvt.s.w f6, f6
+ cvt.s.w f8, f8
+ cvt.s.w f10, f10
+ cvt.s.w f12, f12
+ cvt.s.w f14, f14
+ cvt.s.w f16, f16
+ swc1 f2, 224(a2)
+ swc1 f4, 228(a2)
+ swc1 f6, 232(a2)
+ swc1 f8, 236(a2)
+ swc1 f10, 240(a2)
+ swc1 f12, 244(a2)
+ swc1 f14, 248(a2)
+ swc1 f16, 252(a2)
+
+ j ra
+ nop
+
+END(jsimd_convsamp_float_dspr2)
+
+#endif
+
+/*****************************************************************************/
diff --git a/media/libjpeg/simd/mips/jsimd_dspr2_asm.h b/media/libjpeg/simd/mips/jsimd_dspr2_asm.h
new file mode 100644
index 0000000000..12cfda486c
--- /dev/null
+++ b/media/libjpeg/simd/mips/jsimd_dspr2_asm.h
@@ -0,0 +1,292 @@
+/*
+ * MIPS DSPr2 optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2013, MIPS Technologies, Inc., California.
+ * Copyright (C) 2018, Matthieu Darbois.
+ * All Rights Reserved.
+ * Authors: Teodora Novkovic (teodora.novkovic@imgtec.com)
+ * Darko Laus (darko.laus@imgtec.com)
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define zero $0
+#define AT $1
+#define v0 $2
+#define v1 $3
+#define a0 $4
+#define a1 $5
+#define a2 $6
+#define a3 $7
+#define t0 $8
+#define t1 $9
+#define t2 $10
+#define t3 $11
+#define t4 $12
+#define t5 $13
+#define t6 $14
+#define t7 $15
+#define s0 $16
+#define s1 $17
+#define s2 $18
+#define s3 $19
+#define s4 $20
+#define s5 $21
+#define s6 $22
+#define s7 $23
+#define t8 $24
+#define t9 $25
+#define k0 $26
+#define k1 $27
+#define gp $28
+#define sp $29
+#define fp $30
+#define s8 $30
+#define ra $31
+
+#define f0 $f0
+#define f1 $f1
+#define f2 $f2
+#define f3 $f3
+#define f4 $f4
+#define f5 $f5
+#define f6 $f6
+#define f7 $f7
+#define f8 $f8
+#define f9 $f9
+#define f10 $f10
+#define f11 $f11
+#define f12 $f12
+#define f13 $f13
+#define f14 $f14
+#define f15 $f15
+#define f16 $f16
+#define f17 $f17
+#define f18 $f18
+#define f19 $f19
+#define f20 $f20
+#define f21 $f21
+#define f22 $f22
+#define f23 $f23
+#define f24 $f24
+#define f25 $f25
+#define f26 $f26
+#define f27 $f27
+#define f28 $f28
+#define f29 $f29
+#define f30 $f30
+#define f31 $f31
+
+#ifdef __ELF__
+#define HIDDEN_SYMBOL(symbol) .hidden symbol;
+#else
+#define HIDDEN_SYMBOL(symbol)
+#endif
+
+/*
+ * LEAF_MIPS32R2 - declare leaf routine for MIPS32r2
+ */
+#define LEAF_MIPS32R2(symbol) \
+ .globl symbol; \
+ HIDDEN_SYMBOL(symbol) \
+ .align 2; \
+ .type symbol, @function; \
+ .ent symbol, 0; \
+symbol: \
+ .frame sp, 0, ra; \
+ .set push; \
+ .set arch=mips32r2; \
+ .set noreorder; \
+ .set noat;
+
+/*
+ * LEAF_DSPR2 - declare leaf routine for MIPS DSPr2
+ */
+#define LEAF_DSPR2(symbol) \
+LEAF_MIPS32R2(symbol) \
+ .set dspr2;
+
+/*
+ * END - mark end of function
+ */
+#define END(function) \
+ .set pop; \
+ .end function; \
+ .size function, .-function
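+
+/*
+ * Usage sketch (illustrative only, not part of the original header):
+ * LEAF_MIPS32R2/LEAF_DSPR2 set noreorder, so a routine declared with
+ * them must fill its own branch/jump delay slots, e.g.:
+ *
+ * LEAF_DSPR2(my_routine)
+ *         ...
+ *         j     ra
+ *          nop                 (delay slot)
+ * END(my_routine)
+ */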
+
+/*
+ * Checks whether the stack offset is big enough for storing/restoring
+ * regs_num registers to/from the stack. The stack offset must be greater
+ * than or equal to the number of bytes needed to store the registers
+ * (regs_num * 4). Since the MIPS ABI lets a function use the first 16
+ * bytes of the caller's stack frame (the area reserved for the input
+ * arguments, which are already in a0-a3), the stack size can be reduced
+ * by reusing this space.
+ */
+.macro CHECK_STACK_OFFSET regs_num, stack_offset
+.if \stack_offset < \regs_num * 4 - 16
+.error "Stack offset too small."
+.endif
+.endm
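+
+/*
+ * Example (added for clarity): pushing 8 registers requires
+ * 8 * 4 - 16 = 16 bytes of new stack space, so a stack_offset of 16
+ * passes the check while 12 would trigger the .error above; the other
+ * 16 bytes come from the caller's argument-save area.
+ */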
+
+/*
+ * Saves a set of registers on the stack. The maximum number of registers
+ * that can be saved is limited to 14 (a0-a3, v0-v1 and s0-s7). The stack
+ * offset is the number of bytes subtracted from the stack pointer (sp)
+ * before the registers are pushed, to provide enough space on the stack
+ * (the offset must be a multiple of 4 and big enough, as checked by the
+ * CHECK_STACK_OFFSET macro). This macro is intended to be used in
+ * combination with the RESTORE_REGS_FROM_STACK macro. Example:
+ * SAVE_REGS_ON_STACK 4, v0, v1, s0, s1
+ * RESTORE_REGS_FROM_STACK 4, v0, v1, s0, s1
+ */
+.macro SAVE_REGS_ON_STACK stack_offset = 0, r1, \
+ r2 = 0, r3 = 0, r4 = 0, \
+ r5 = 0, r6 = 0, r7 = 0, \
+ r8 = 0, r9 = 0, r10 = 0, \
+ r11 = 0, r12 = 0, r13 = 0, \
+ r14 = 0
+.if (\stack_offset < 0) || (\stack_offset - (\stack_offset / 4) * 4)
+ .error "Stack offset must be pozitive and multiple of 4."
+.endif
+.if \stack_offset != 0
+ addiu sp, sp, -\stack_offset
+.endif
+ sw \r1, 0(sp)
+.if \r2 != 0
+ sw \r2, 4(sp)
+.endif
+.if \r3 != 0
+ sw \r3, 8(sp)
+.endif
+.if \r4 != 0
+ sw \r4, 12(sp)
+.endif
+.if \r5 != 0
+ CHECK_STACK_OFFSET 5, \stack_offset
+ sw \r5, 16(sp)
+.endif
+.if \r6 != 0
+ CHECK_STACK_OFFSET 6, \stack_offset
+ sw \r6, 20(sp)
+.endif
+.if \r7 != 0
+ CHECK_STACK_OFFSET 7, \stack_offset
+ sw \r7, 24(sp)
+.endif
+.if \r8 != 0
+ CHECK_STACK_OFFSET 8, \stack_offset
+ sw \r8, 28(sp)
+.endif
+.if \r9 != 0
+ CHECK_STACK_OFFSET 9, \stack_offset
+ sw \r9, 32(sp)
+.endif
+.if \r10 != 0
+ CHECK_STACK_OFFSET 10, \stack_offset
+ sw \r10, 36(sp)
+.endif
+.if \r11 != 0
+ CHECK_STACK_OFFSET 11, \stack_offset
+ sw \r11, 40(sp)
+.endif
+.if \r12 != 0
+ CHECK_STACK_OFFSET 12, \stack_offset
+ sw \r12, 44(sp)
+.endif
+.if \r13 != 0
+ CHECK_STACK_OFFSET 13, \stack_offset
+ sw \r13, 48(sp)
+.endif
+.if \r14 != 0
+ CHECK_STACK_OFFSET 14, \stack_offset
+ sw \r14, 52(sp)
+.endif
+.endm
+
+/*
+ * Restores a set of registers from the stack. The maximum number of
+ * registers that can be restored is limited to 14 (a0-a3, v0-v1 and
+ * s0-s7). The stack offset is the number of bytes added to the stack
+ * pointer (sp) after the registers are restored (the offset must be a
+ * multiple of 4 and big enough, as checked by the CHECK_STACK_OFFSET
+ * macro). This macro is intended to be used in combination with the
+ * SAVE_REGS_ON_STACK macro.
+ * Example:
+ * SAVE_REGS_ON_STACK 4, v0, v1, s0, s1
+ * RESTORE_REGS_FROM_STACK 4, v0, v1, s0, s1
+ */
+.macro RESTORE_REGS_FROM_STACK stack_offset = 0, r1, \
+ r2 = 0, r3 = 0, r4 = 0, \
+ r5 = 0, r6 = 0, r7 = 0, \
+ r8 = 0, r9 = 0, r10 = 0, \
+ r11 = 0, r12 = 0, r13 = 0, \
+ r14 = 0
+.if (\stack_offset < 0) || (\stack_offset - (\stack_offset / 4) * 4)
+ .error "Stack offset must be pozitive and multiple of 4."
+.endif
+ lw \r1, 0(sp)
+.if \r2 != 0
+ lw \r2, 4(sp)
+.endif
+.if \r3 != 0
+ lw \r3, 8(sp)
+.endif
+.if \r4 != 0
+ lw \r4, 12(sp)
+.endif
+.if \r5 != 0
+ CHECK_STACK_OFFSET 5, \stack_offset
+ lw \r5, 16(sp)
+.endif
+.if \r6 != 0
+ CHECK_STACK_OFFSET 6, \stack_offset
+ lw \r6, 20(sp)
+.endif
+.if \r7 != 0
+ CHECK_STACK_OFFSET 7, \stack_offset
+ lw \r7, 24(sp)
+.endif
+.if \r8 != 0
+ CHECK_STACK_OFFSET 8, \stack_offset
+ lw \r8, 28(sp)
+.endif
+.if \r9 != 0
+ CHECK_STACK_OFFSET 9, \stack_offset
+ lw \r9, 32(sp)
+.endif
+.if \r10 != 0
+ CHECK_STACK_OFFSET 10, \stack_offset
+ lw \r10, 36(sp)
+.endif
+.if \r11 != 0
+ CHECK_STACK_OFFSET 11, \stack_offset
+ lw \r11, 40(sp)
+.endif
+.if \r12 != 0
+ CHECK_STACK_OFFSET 12, \stack_offset
+ lw \r12, 44(sp)
+.endif
+.if \r13 != 0
+ CHECK_STACK_OFFSET 13, \stack_offset
+ lw \r13, 48(sp)
+.endif
+.if \r14 != 0
+ CHECK_STACK_OFFSET 14, \stack_offset
+ lw \r14, 52(sp)
+.endif
+.if \stack_offset != 0
+ addiu sp, sp, \stack_offset
+.endif
+.endm
diff --git a/media/libjpeg/simd/mips64/jccolext-mmi.c b/media/libjpeg/simd/mips64/jccolext-mmi.c
new file mode 100644
index 0000000000..558eb2ab10
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jccolext-mmi.c
@@ -0,0 +1,455 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2014-2015, 2019, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ * All Rights Reserved.
+ * Authors: ZhuChen <zhuchen@loongson.cn>
+ * SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ * CaiWanwei <caiwanwei@loongson.cn>
+ * ZhangLixia <zhanglixia-hf@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jccolor-mmi.c */
+
+
+#if RGB_RED == 0
+#define mmA re
+#define mmB ro
+#elif RGB_GREEN == 0
+#define mmA ge
+#define mmB go
+#elif RGB_BLUE == 0
+#define mmA be
+#define mmB bo
+#else
+#define mmA xe
+#define mmB xo
+#endif
+
+#if RGB_RED == 1
+#define mmC re
+#define mmD ro
+#elif RGB_GREEN == 1
+#define mmC ge
+#define mmD go
+#elif RGB_BLUE == 1
+#define mmC be
+#define mmD bo
+#else
+#define mmC xe
+#define mmD xo
+#endif
+
+#if RGB_RED == 2
+#define mmE re
+#define mmF ro
+#elif RGB_GREEN == 2
+#define mmE ge
+#define mmF go
+#elif RGB_BLUE == 2
+#define mmE be
+#define mmF bo
+#else
+#define mmE xe
+#define mmF xo
+#endif
+
+#if RGB_RED == 3
+#define mmG re
+#define mmH ro
+#elif RGB_GREEN == 3
+#define mmG ge
+#define mmH go
+#elif RGB_BLUE == 3
+#define mmG be
+#define mmH bo
+#else
+#define mmG xe
+#define mmH xo
+#endif
+
+
+void jsimd_rgb_ycc_convert_mmi(JDIMENSION image_width, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
+{
+ JSAMPROW inptr, outptr0, outptr1, outptr2;
+ int num_cols, col;
+ __m64 re, ro, ge, go, be, bo, xe;
+#if RGB_PIXELSIZE == 4
+ __m64 xo;
+#endif
+ __m64 rgle, rghe, rglo, rgho, bgle, bghe, bglo, bgho;
+ __m64 ble, halfble, bhe, halfbhe, blo, halfblo, bho, halfbho;
+ __m64 rle, halfrle, rhe, halfrhe, rlo, halfrlo, rho, halfrho;
+ __m64 yle_rg, yhe_rg, yle_bg, yhe_bg, yle, yhe, ye;
+ __m64 ylo_rg, yho_rg, ylo_bg, yho_bg, ylo, yho, yo, y;
+ __m64 cble, cbhe, cbe, cblo, cbho, cbo, cb;
+ __m64 crle, crhe, cre, crlo, crho, cro, cr;
+
+ while (--num_rows >= 0) {
+ inptr = *input_buf++;
+ outptr0 = output_buf[0][output_row];
+ outptr1 = output_buf[1][output_row];
+ outptr2 = output_buf[2][output_row];
+ output_row++;
+
+ for (num_cols = image_width; num_cols > 0; num_cols -= 8,
+ outptr0 += 8, outptr1 += 8, outptr2 += 8) {
+
+#if RGB_PIXELSIZE == 3
+
+ if (num_cols < 8) {
+ col = num_cols * 3;
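+ /* Note: the inline asm below tests the bits of the remaining byte
+    count (1, 2, 4, 8, 16) and assembles the partial pixel group into
+    the mm registers piecewise, so it never reads past the end of the
+    input row. */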
+ asm(".set noreorder\r\n"
+
+ "li $8, 1\r\n"
+ "move $9, %3\r\n"
+ "and $10, $9, $8\r\n"
+ "beqz $10, 1f\r\n"
+ "nop \r\n"
+ "subu $9, $9, 1\r\n"
+ "xor $12, $12, $12\r\n"
+ "move $13, %5\r\n"
+ PTR_ADDU "$13, $13, $9\r\n"
+ "lbu $12, 0($13)\r\n"
+
+ "1: \r\n"
+ "li $8, 2\r\n"
+ "and $10, $9, $8\r\n"
+ "beqz $10, 2f\r\n"
+ "nop \r\n"
+ "subu $9, $9, 2\r\n"
+ "xor $11, $11, $11\r\n"
+ "move $13, %5\r\n"
+ PTR_ADDU "$13, $13, $9\r\n"
+ "lhu $11, 0($13)\r\n"
+ "sll $12, $12, 16\r\n"
+ "or $12, $12, $11\r\n"
+
+ "2: \r\n"
+ "dmtc1 $12, %0\r\n"
+ "li $8, 4\r\n"
+ "and $10, $9, $8\r\n"
+ "beqz $10, 3f\r\n"
+ "nop \r\n"
+ "subu $9, $9, 4\r\n"
+ "move $13, %5\r\n"
+ PTR_ADDU "$13, $13, $9\r\n"
+ "lwu $14, 0($13)\r\n"
+ "dmtc1 $14, %1\r\n"
+ "dsll32 $12, $12, 0\r\n"
+ "or $12, $12, $14\r\n"
+ "dmtc1 $12, %0\r\n"
+
+ "3: \r\n"
+ "li $8, 8\r\n"
+ "and $10, $9, $8\r\n"
+ "beqz $10, 4f\r\n"
+ "nop \r\n"
+ "mov.s %1, %0\r\n"
+ "ldc1 %0, 0(%5)\r\n"
+ "li $9, 8\r\n"
+ "j 5f\r\n"
+ "nop \r\n"
+
+ "4: \r\n"
+ "li $8, 16\r\n"
+ "and $10, $9, $8\r\n"
+ "beqz $10, 5f\r\n"
+ "nop \r\n"
+ "mov.s %2, %0\r\n"
+ "ldc1 %0, 0(%5)\r\n"
+ "ldc1 %1, 8(%5)\r\n"
+
+ "5: \r\n"
+ "nop \r\n"
+ ".set reorder\r\n"
+
+ : "=f" (mmA), "=f" (mmG), "=f" (mmF)
+ : "r" (col), "r" (num_rows), "r" (inptr)
+ : "$f0", "$f2", "$f4", "$8", "$9", "$10", "$11", "$12", "$13",
+ "$14", "memory"
+ );
+ } else {
+ if (!(((long)inptr) & 7)) {
+ mmA = _mm_load_si64((__m64 *)&inptr[0]);
+ mmG = _mm_load_si64((__m64 *)&inptr[8]);
+ mmF = _mm_load_si64((__m64 *)&inptr[16]);
+ } else {
+ mmA = _mm_loadu_si64((__m64 *)&inptr[0]);
+ mmG = _mm_loadu_si64((__m64 *)&inptr[8]);
+ mmF = _mm_loadu_si64((__m64 *)&inptr[16]);
+ }
+ inptr += RGB_PIXELSIZE * 8;
+ }
+ mmD = _mm_srli_si64(mmA, 4 * BYTE_BIT);
+ mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT);
+
+ mmA = _mm_unpackhi_pi8(mmA, mmG);
+ mmG = _mm_slli_si64(mmG, 4 * BYTE_BIT);
+
+ mmD = _mm_unpacklo_pi8(mmD, mmF);
+ mmG = _mm_unpackhi_pi8(mmG, mmF);
+
+ mmE = _mm_srli_si64(mmA, 4 * BYTE_BIT);
+ mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT);
+
+ mmA = _mm_unpackhi_pi8(mmA, mmD);
+ mmD = _mm_slli_si64(mmD, 4 * BYTE_BIT);
+
+ mmE = _mm_unpacklo_pi8(mmE, mmG);
+ mmD = _mm_unpackhi_pi8(mmD, mmG);
+ mmC = _mm_loadhi_pi8_f(mmA);
+ mmA = _mm_loadlo_pi8_f(mmA);
+
+ mmB = _mm_loadhi_pi8_f(mmE);
+ mmE = _mm_loadlo_pi8_f(mmE);
+
+ mmF = _mm_loadhi_pi8_f(mmD);
+ mmD = _mm_loadlo_pi8_f(mmD);
+
+#else /* RGB_PIXELSIZE == 4 */
+
+ if (num_cols < 8) {
+ col = num_cols;
+ asm(".set noreorder\r\n"
+
+ "li $8, 1\r\n"
+ "move $9, %4\r\n"
+ "and $10, $9, $8\r\n"
+ "beqz $10, 1f\r\n"
+ "nop \r\n"
+ "subu $9, $9, 1\r\n"
+ PTR_SLL "$11, $9, 2\r\n"
+ "move $13, %5\r\n"
+ PTR_ADDU "$13, $13, $11\r\n"
+ "lwc1 %0, 0($13)\r\n"
+
+ "1: \r\n"
+ "li $8, 2\r\n"
+ "and $10, $9, $8\r\n"
+ "beqz $10, 2f\r\n"
+ "nop \r\n"
+ "subu $9, $9, 2\r\n"
+ PTR_SLL "$11, $9, 2\r\n"
+ "move $13, %5\r\n"
+ PTR_ADDU "$13, $13, $11\r\n"
+ "mov.s %1, %0\r\n"
+ "ldc1 %0, 0($13)\r\n"
+
+ "2: \r\n"
+ "li $8, 4\r\n"
+ "and $10, $9, $8\r\n"
+ "beqz $10, 3f\r\n"
+ "nop \r\n"
+ "mov.s %2, %0\r\n"
+ "mov.s %3, %1\r\n"
+ "ldc1 %0, 0(%5)\r\n"
+ "ldc1 %1, 8(%5)\r\n"
+
+ "3: \r\n"
+ "nop \r\n"
+ ".set reorder\r\n"
+
+ : "=f" (mmA), "=f" (mmF), "=f" (mmD), "=f" (mmC)
+ : "r" (col), "r" (inptr)
+ : "$f0", "$f2", "$8", "$9", "$10", "$11", "$13", "memory"
+ );
+ } else {
+ if (!(((long)inptr) & 7)) {
+ mmA = _mm_load_si64((__m64 *)&inptr[0]);
+ mmF = _mm_load_si64((__m64 *)&inptr[8]);
+ mmD = _mm_load_si64((__m64 *)&inptr[16]);
+ mmC = _mm_load_si64((__m64 *)&inptr[24]);
+ } else {
+ mmA = _mm_loadu_si64((__m64 *)&inptr[0]);
+ mmF = _mm_loadu_si64((__m64 *)&inptr[8]);
+ mmD = _mm_loadu_si64((__m64 *)&inptr[16]);
+ mmC = _mm_loadu_si64((__m64 *)&inptr[24]);
+ }
+ inptr += RGB_PIXELSIZE * 8;
+ }
+ mmB = _mm_unpackhi_pi8(mmA, mmF);
+ mmA = _mm_unpacklo_pi8(mmA, mmF);
+
+ mmG = _mm_unpackhi_pi8(mmD, mmC);
+ mmD = _mm_unpacklo_pi8(mmD, mmC);
+
+ mmE = _mm_unpackhi_pi16(mmA, mmD);
+ mmA = _mm_unpacklo_pi16(mmA, mmD);
+
+ mmH = _mm_unpackhi_pi16(mmB, mmG);
+ mmB = _mm_unpacklo_pi16(mmB, mmG);
+
+ mmC = _mm_loadhi_pi8_f(mmA);
+ mmA = _mm_loadlo_pi8_f(mmA);
+
+ mmD = _mm_loadhi_pi8_f(mmB);
+ mmB = _mm_loadlo_pi8_f(mmB);
+
+ mmG = _mm_loadhi_pi8_f(mmE);
+ mmE = _mm_loadlo_pi8_f(mmE);
+
+ mmF = _mm_unpacklo_pi8(mmH, mmH);
+ mmH = _mm_unpackhi_pi8(mmH, mmH);
+ mmF = _mm_srli_pi16(mmF, BYTE_BIT);
+ mmH = _mm_srli_pi16(mmH, BYTE_BIT);
+
+#endif
+
+ /* re=(R0 R2 R4 R6), ge=(G0 G2 G4 G6), be=(B0 B2 B4 B6)
+ * ro=(R1 R3 R5 R7), go=(G1 G3 G5 G7), bo=(B1 B3 B5 B7)
+ *
+ * (Original)
+ * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+ * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+ * Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+ *
+ * (This implementation)
+ * Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+ * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+ * Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+ */
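+ /* Note: the 0.58700 * G term is split because FIX(0.58700) = 38470
+  * does not fit in the signed 16-bit words consumed by _mm_madd_pi16();
+  * it is computed as 0.33700 * G (paired with R) plus 0.25000 * G
+  * (paired with B). Spot check: R = G = B = 255 gives
+  * Y = (0.299 + 0.337 + 0.114 + 0.250) * 255 = 255.
+  */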
+
+ rglo = _mm_unpacklo_pi16(ro, go);
+ rgho = _mm_unpackhi_pi16(ro, go);
+ ylo_rg = _mm_madd_pi16(rglo, PW_F0299_F0337);
+ yho_rg = _mm_madd_pi16(rgho, PW_F0299_F0337);
+ cblo = _mm_madd_pi16(rglo, PW_MF016_MF033);
+ cbho = _mm_madd_pi16(rgho, PW_MF016_MF033);
+
+ blo = _mm_loadlo_pi16_f(bo);
+ bho = _mm_loadhi_pi16_f(bo);
+ halfblo = _mm_srli_pi32(blo, 1);
+ halfbho = _mm_srli_pi32(bho, 1);
+
+ cblo = _mm_add_pi32(cblo, halfblo);
+ cbho = _mm_add_pi32(cbho, halfbho);
+ cblo = _mm_add_pi32(cblo, PD_ONEHALFM1_CJ);
+ cbho = _mm_add_pi32(cbho, PD_ONEHALFM1_CJ);
+ cblo = _mm_srli_pi32(cblo, SCALEBITS);
+ cbho = _mm_srli_pi32(cbho, SCALEBITS);
+ cbo = _mm_packs_pi32(cblo, cbho);
+
+ rgle = _mm_unpacklo_pi16(re, ge);
+ rghe = _mm_unpackhi_pi16(re, ge);
+ yle_rg = _mm_madd_pi16(rgle, PW_F0299_F0337);
+ yhe_rg = _mm_madd_pi16(rghe, PW_F0299_F0337);
+ cble = _mm_madd_pi16(rgle, PW_MF016_MF033);
+ cbhe = _mm_madd_pi16(rghe, PW_MF016_MF033);
+
+ ble = _mm_loadlo_pi16_f(be);
+ bhe = _mm_loadhi_pi16_f(be);
+ halfble = _mm_srli_pi32(ble, 1);
+ halfbhe = _mm_srli_pi32(bhe, 1);
+
+ cble = _mm_add_pi32(cble, halfble);
+ cbhe = _mm_add_pi32(cbhe, halfbhe);
+ cble = _mm_add_pi32(cble, PD_ONEHALFM1_CJ);
+ cbhe = _mm_add_pi32(cbhe, PD_ONEHALFM1_CJ);
+ cble = _mm_srli_pi32(cble, SCALEBITS);
+ cbhe = _mm_srli_pi32(cbhe, SCALEBITS);
+ cbe = _mm_packs_pi32(cble, cbhe);
+
+ cbo = _mm_slli_pi16(cbo, BYTE_BIT);
+ cb = _mm_or_si64(cbe, cbo);
+
+ bglo = _mm_unpacklo_pi16(bo, go);
+ bgho = _mm_unpackhi_pi16(bo, go);
+ ylo_bg = _mm_madd_pi16(bglo, PW_F0114_F0250);
+ yho_bg = _mm_madd_pi16(bgho, PW_F0114_F0250);
+ crlo = _mm_madd_pi16(bglo, PW_MF008_MF041);
+ crho = _mm_madd_pi16(bgho, PW_MF008_MF041);
+
+ ylo = _mm_add_pi32(ylo_bg, ylo_rg);
+ yho = _mm_add_pi32(yho_bg, yho_rg);
+ ylo = _mm_add_pi32(ylo, PD_ONEHALF);
+ yho = _mm_add_pi32(yho, PD_ONEHALF);
+ ylo = _mm_srli_pi32(ylo, SCALEBITS);
+ yho = _mm_srli_pi32(yho, SCALEBITS);
+ yo = _mm_packs_pi32(ylo, yho);
+
+ rlo = _mm_loadlo_pi16_f(ro);
+ rho = _mm_loadhi_pi16_f(ro);
+ halfrlo = _mm_srli_pi32(rlo, 1);
+ halfrho = _mm_srli_pi32(rho, 1);
+
+ crlo = _mm_add_pi32(crlo, halfrlo);
+ crho = _mm_add_pi32(crho, halfrho);
+ crlo = _mm_add_pi32(crlo, PD_ONEHALFM1_CJ);
+ crho = _mm_add_pi32(crho, PD_ONEHALFM1_CJ);
+ crlo = _mm_srli_pi32(crlo, SCALEBITS);
+ crho = _mm_srli_pi32(crho, SCALEBITS);
+ cro = _mm_packs_pi32(crlo, crho);
+
+ bgle = _mm_unpacklo_pi16(be, ge);
+ bghe = _mm_unpackhi_pi16(be, ge);
+ yle_bg = _mm_madd_pi16(bgle, PW_F0114_F0250);
+ yhe_bg = _mm_madd_pi16(bghe, PW_F0114_F0250);
+ crle = _mm_madd_pi16(bgle, PW_MF008_MF041);
+ crhe = _mm_madd_pi16(bghe, PW_MF008_MF041);
+
+ yle = _mm_add_pi32(yle_bg, yle_rg);
+ yhe = _mm_add_pi32(yhe_bg, yhe_rg);
+ yle = _mm_add_pi32(yle, PD_ONEHALF);
+ yhe = _mm_add_pi32(yhe, PD_ONEHALF);
+ yle = _mm_srli_pi32(yle, SCALEBITS);
+ yhe = _mm_srli_pi32(yhe, SCALEBITS);
+ ye = _mm_packs_pi32(yle, yhe);
+
+ yo = _mm_slli_pi16(yo, BYTE_BIT);
+ y = _mm_or_si64(ye, yo);
+
+ rle = _mm_loadlo_pi16_f(re);
+ rhe = _mm_loadhi_pi16_f(re);
+ halfrle = _mm_srli_pi32(rle, 1);
+ halfrhe = _mm_srli_pi32(rhe, 1);
+
+ crle = _mm_add_pi32(crle, halfrle);
+ crhe = _mm_add_pi32(crhe, halfrhe);
+ crle = _mm_add_pi32(crle, PD_ONEHALFM1_CJ);
+ crhe = _mm_add_pi32(crhe, PD_ONEHALFM1_CJ);
+ crle = _mm_srli_pi32(crle, SCALEBITS);
+ crhe = _mm_srli_pi32(crhe, SCALEBITS);
+ cre = _mm_packs_pi32(crle, crhe);
+
+ cro = _mm_slli_pi16(cro, BYTE_BIT);
+ cr = _mm_or_si64(cre, cro);
+
+ _mm_store_si64((__m64 *)&outptr0[0], y);
+ _mm_store_si64((__m64 *)&outptr1[0], cb);
+ _mm_store_si64((__m64 *)&outptr2[0], cr);
+ }
+ }
+}
+
+#undef mmA
+#undef mmB
+#undef mmC
+#undef mmD
+#undef mmE
+#undef mmF
+#undef mmG
+#undef mmH
diff --git a/media/libjpeg/simd/mips64/jccolor-mmi.c b/media/libjpeg/simd/mips64/jccolor-mmi.c
new file mode 100644
index 0000000000..93ef5c79f7
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jccolor-mmi.c
@@ -0,0 +1,148 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2011, 2014, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
+ * All Rights Reserved.
+ * Authors: ZhuChen <zhuchen@loongson.cn>
+ * CaiWanwei <caiwanwei@loongson.cn>
+ * SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* RGB --> YCC CONVERSION */
+
+#include "jsimd_mmi.h"
+
+
+#define F_0_081 ((short)5329) /* FIX(0.08131) */
+#define F_0_114 ((short)7471) /* FIX(0.11400) */
+#define F_0_168 ((short)11059) /* FIX(0.16874) */
+#define F_0_250 ((short)16384) /* FIX(0.25000) */
+#define F_0_299 ((short)19595) /* FIX(0.29900) */
+#define F_0_331 ((short)21709) /* FIX(0.33126) */
+#define F_0_418 ((short)27439) /* FIX(0.41869) */
+#define F_0_587 ((short)38470) /* FIX(0.58700) */
+#define F_0_337 ((short)(F_0_587 - F_0_250)) /* FIX(0.58700) - FIX(0.25000) */
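+
+/*
+ * Note: FIX(x) denotes x scaled by 2^16 (e.g. 19595 / 65536 ~= 0.29900);
+ * products are descaled by SCALEBITS after adding the one-half rounding
+ * constant set up in const_value[] below.
+ */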
+
+enum const_index {
+ index_PD_ONEHALF,
+ index_PW_F0299_F0337,
+ index_PW_F0114_F0250,
+ index_PW_MF016_MF033,
+ index_PW_MF008_MF041,
+ index_PD_ONEHALFM1_CJ
+};
+
+static uint64_t const_value[] = {
+ _uint64_set_pi32((int)(1 << (SCALEBITS - 1)), (int)(1 << (SCALEBITS - 1))),
+ _uint64_set_pi16(F_0_337, F_0_299, F_0_337, F_0_299),
+ _uint64_set_pi16(F_0_250, F_0_114, F_0_250, F_0_114),
+ _uint64_set_pi16(-F_0_331, -F_0_168, -F_0_331, -F_0_168),
+ _uint64_set_pi16(-F_0_418, -F_0_081, -F_0_418, -F_0_081),
+ _uint64_set_pi32(((1 << (SCALEBITS - 1)) - 1 + (CENTERJSAMPLE << SCALEBITS)),
+ ((1 << (SCALEBITS - 1)) - 1 + (CENTERJSAMPLE << SCALEBITS)))
+};
+
+#define get_const_value(index) (*(__m64 *)&const_value[index])
+
+#define PD_ONEHALF get_const_value(index_PD_ONEHALF)
+#define PW_F0299_F0337 get_const_value(index_PW_F0299_F0337)
+#define PW_F0114_F0250 get_const_value(index_PW_F0114_F0250)
+#define PW_MF016_MF033 get_const_value(index_PW_MF016_MF033)
+#define PW_MF008_MF041 get_const_value(index_PW_MF008_MF041)
+#define PD_ONEHALFM1_CJ get_const_value(index_PD_ONEHALFM1_CJ)
+
+
+#include "jccolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED EXT_RGB_RED
+#define RGB_GREEN EXT_RGB_GREEN
+#define RGB_BLUE EXT_RGB_BLUE
+#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+#define jsimd_rgb_ycc_convert_mmi jsimd_extrgb_ycc_convert_mmi
+#include "jccolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_mmi
+
+#define RGB_RED EXT_RGBX_RED
+#define RGB_GREEN EXT_RGBX_GREEN
+#define RGB_BLUE EXT_RGBX_BLUE
+#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+#define jsimd_rgb_ycc_convert_mmi jsimd_extrgbx_ycc_convert_mmi
+#include "jccolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_mmi
+
+#define RGB_RED EXT_BGR_RED
+#define RGB_GREEN EXT_BGR_GREEN
+#define RGB_BLUE EXT_BGR_BLUE
+#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+#define jsimd_rgb_ycc_convert_mmi jsimd_extbgr_ycc_convert_mmi
+#include "jccolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_mmi
+
+#define RGB_RED EXT_BGRX_RED
+#define RGB_GREEN EXT_BGRX_GREEN
+#define RGB_BLUE EXT_BGRX_BLUE
+#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+#define jsimd_rgb_ycc_convert_mmi jsimd_extbgrx_ycc_convert_mmi
+#include "jccolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_mmi
+
+#define RGB_RED EXT_XBGR_RED
+#define RGB_GREEN EXT_XBGR_GREEN
+#define RGB_BLUE EXT_XBGR_BLUE
+#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+#define jsimd_rgb_ycc_convert_mmi jsimd_extxbgr_ycc_convert_mmi
+#include "jccolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_mmi
+
+#define RGB_RED EXT_XRGB_RED
+#define RGB_GREEN EXT_XRGB_GREEN
+#define RGB_BLUE EXT_XRGB_BLUE
+#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+#define jsimd_rgb_ycc_convert_mmi jsimd_extxrgb_ycc_convert_mmi
+#include "jccolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_mmi
diff --git a/media/libjpeg/simd/mips64/jcgray-mmi.c b/media/libjpeg/simd/mips64/jcgray-mmi.c
new file mode 100644
index 0000000000..9c7b833f2e
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jcgray-mmi.c
@@ -0,0 +1,132 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2011, 2014, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ * All Rights Reserved.
+ * Authors: ZhangLixia <zhanglixia-hf@loongson.cn>
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* RGB --> GRAYSCALE CONVERSION */
+
+#include "jsimd_mmi.h"
+
+
+#define F_0_114 ((short)7471) /* FIX(0.11400) */
+#define F_0_250 ((short)16384) /* FIX(0.25000) */
+#define F_0_299 ((short)19595) /* FIX(0.29900) */
+#define F_0_587 ((short)38470) /* FIX(0.58700) */
+#define F_0_337 ((short)(F_0_587 - F_0_250)) /* FIX(0.58700) - FIX(0.25000) */
+
+enum const_index {
+ index_PD_ONEHALF,
+ index_PW_F0299_F0337,
+ index_PW_F0114_F0250
+};
+
+static uint64_t const_value[] = {
+ _uint64_set_pi32((int)(1 << (SCALEBITS - 1)), (int)(1 << (SCALEBITS - 1))),
+ _uint64_set_pi16(F_0_337, F_0_299, F_0_337, F_0_299),
+ _uint64_set_pi16(F_0_250, F_0_114, F_0_250, F_0_114)
+};
+
+#define get_const_value(index) (*(__m64 *)&const_value[index])
+
+#define PD_ONEHALF get_const_value(index_PD_ONEHALF)
+#define PW_F0299_F0337 get_const_value(index_PW_F0299_F0337)
+#define PW_F0114_F0250 get_const_value(index_PW_F0114_F0250)
+
+
+#include "jcgryext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED EXT_RGB_RED
+#define RGB_GREEN EXT_RGB_GREEN
+#define RGB_BLUE EXT_RGB_BLUE
+#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+#define jsimd_rgb_gray_convert_mmi jsimd_extrgb_gray_convert_mmi
+#include "jcgryext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_mmi
+
+#define RGB_RED EXT_RGBX_RED
+#define RGB_GREEN EXT_RGBX_GREEN
+#define RGB_BLUE EXT_RGBX_BLUE
+#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+#define jsimd_rgb_gray_convert_mmi jsimd_extrgbx_gray_convert_mmi
+#include "jcgryext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_mmi
+
+#define RGB_RED EXT_BGR_RED
+#define RGB_GREEN EXT_BGR_GREEN
+#define RGB_BLUE EXT_BGR_BLUE
+#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+#define jsimd_rgb_gray_convert_mmi jsimd_extbgr_gray_convert_mmi
+#include "jcgryext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_mmi
+
+#define RGB_RED EXT_BGRX_RED
+#define RGB_GREEN EXT_BGRX_GREEN
+#define RGB_BLUE EXT_BGRX_BLUE
+#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+#define jsimd_rgb_gray_convert_mmi jsimd_extbgrx_gray_convert_mmi
+#include "jcgryext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_mmi
+
+#define RGB_RED EXT_XBGR_RED
+#define RGB_GREEN EXT_XBGR_GREEN
+#define RGB_BLUE EXT_XBGR_BLUE
+#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+#define jsimd_rgb_gray_convert_mmi jsimd_extxbgr_gray_convert_mmi
+#include "jcgryext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_mmi
+
+#define RGB_RED EXT_XRGB_RED
+#define RGB_GREEN EXT_XRGB_GREEN
+#define RGB_BLUE EXT_XRGB_BLUE
+#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+#define jsimd_rgb_gray_convert_mmi jsimd_extxrgb_gray_convert_mmi
+#include "jcgryext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_mmi
diff --git a/media/libjpeg/simd/mips64/jcgryext-mmi.c b/media/libjpeg/simd/mips64/jcgryext-mmi.c
new file mode 100644
index 0000000000..08a83d6699
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jcgryext-mmi.c
@@ -0,0 +1,374 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2014-2015, 2019, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ * All Rights Reserved.
+ * Authors: ZhangLixia <zhanglixia-hf@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jcgray-mmi.c */
+
+
+#if RGB_RED == 0
+#define mmA re
+#define mmB ro
+#elif RGB_GREEN == 0
+#define mmA ge
+#define mmB go
+#elif RGB_BLUE == 0
+#define mmA be
+#define mmB bo
+#else
+#define mmA xe
+#define mmB xo
+#endif
+
+#if RGB_RED == 1
+#define mmC re
+#define mmD ro
+#elif RGB_GREEN == 1
+#define mmC ge
+#define mmD go
+#elif RGB_BLUE == 1
+#define mmC be
+#define mmD bo
+#else
+#define mmC xe
+#define mmD xo
+#endif
+
+#if RGB_RED == 2
+#define mmE re
+#define mmF ro
+#elif RGB_GREEN == 2
+#define mmE ge
+#define mmF go
+#elif RGB_BLUE == 2
+#define mmE be
+#define mmF bo
+#else
+#define mmE xe
+#define mmF xo
+#endif
+
+#if RGB_RED == 3
+#define mmG re
+#define mmH ro
+#elif RGB_GREEN == 3
+#define mmG ge
+#define mmH go
+#elif RGB_BLUE == 3
+#define mmG be
+#define mmH bo
+#else
+#define mmG xe
+#define mmH xo
+#endif
+
+
+void jsimd_rgb_gray_convert_mmi(JDIMENSION image_width, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
+{
+ JSAMPROW inptr, outptr;
+ int num_cols, col;
+ __m64 re, ro, ge, go, be, bo, xe;
+#if RGB_PIXELSIZE == 4
+ __m64 xo;
+#endif
+ __m64 rgle, rghe, rglo, rgho, bgle, bghe, bglo, bgho;
+ __m64 yle_rg, yhe_rg, yle_bg, yhe_bg, yle, yhe, ye;
+ __m64 ylo_rg, yho_rg, ylo_bg, yho_bg, ylo, yho, yo, y;
+
+ while (--num_rows >= 0) {
+ inptr = *input_buf++;
+ outptr = output_buf[0][output_row];
+ output_row++;
+
+ for (num_cols = image_width; num_cols > 0; num_cols -= 8,
+ outptr += 8) {
+
+#if RGB_PIXELSIZE == 3
+
+ if (num_cols < 8) {
+ col = num_cols * 3;
+ asm(".set noreorder\r\n"
+
+ "li $8, 1\r\n"
+ "move $9, %3\r\n"
+ "and $10, $9, $8\r\n"
+ "beqz $10, 1f\r\n"
+ "nop \r\n"
+ "subu $9, $9, 1\r\n"
+ "xor $12, $12, $12\r\n"
+ "move $13, %5\r\n"
+ PTR_ADDU "$13, $13, $9\r\n"
+ "lbu $12, 0($13)\r\n"
+
+ "1: \r\n"
+ "li $8, 2\r\n"
+ "and $10, $9, $8\r\n"
+ "beqz $10, 2f\r\n"
+ "nop \r\n"
+ "subu $9, $9, 2\r\n"
+ "xor $11, $11, $11\r\n"
+ "move $13, %5\r\n"
+ PTR_ADDU "$13, $13, $9\r\n"
+ "lhu $11, 0($13)\r\n"
+ "sll $12, $12, 16\r\n"
+ "or $12, $12, $11\r\n"
+
+ "2: \r\n"
+ "dmtc1 $12, %0\r\n"
+ "li $8, 4\r\n"
+ "and $10, $9, $8\r\n"
+ "beqz $10, 3f\r\n"
+ "nop \r\n"
+ "subu $9, $9, 4\r\n"
+ "move $13, %5\r\n"
+ PTR_ADDU "$13, $13, $9\r\n"
+ "lwu $14, 0($13)\r\n"
+ "dmtc1 $14, %1\r\n"
+ "dsll32 $12, $12, 0\r\n"
+ "or $12, $12, $14\r\n"
+ "dmtc1 $12, %0\r\n"
+
+ "3: \r\n"
+ "li $8, 8\r\n"
+ "and $10, $9, $8\r\n"
+ "beqz $10, 4f\r\n"
+ "nop \r\n"
+ "mov.s %1, %0\r\n"
+ "ldc1 %0, 0(%5)\r\n"
+ "li $9, 8\r\n"
+ "j 5f\r\n"
+ "nop \r\n"
+
+ "4: \r\n"
+ "li $8, 16\r\n"
+ "and $10, $9, $8\r\n"
+ "beqz $10, 5f\r\n"
+ "nop \r\n"
+ "mov.s %2, %0\r\n"
+ "ldc1 %0, 0(%5)\r\n"
+ "ldc1 %1, 8(%5)\r\n"
+
+ "5: \r\n"
+ "nop \r\n"
+ ".set reorder\r\n"
+
+ : "=f" (mmA), "=f" (mmG), "=f" (mmF)
+ : "r" (col), "r" (num_rows), "r" (inptr)
+ : "$f0", "$f2", "$f4", "$8", "$9", "$10", "$11", "$12", "$13",
+ "$14", "memory"
+ );
+ } else {
+ if (!(((long)inptr) & 7)) {
+ mmA = _mm_load_si64((__m64 *)&inptr[0]);
+ mmG = _mm_load_si64((__m64 *)&inptr[8]);
+ mmF = _mm_load_si64((__m64 *)&inptr[16]);
+ } else {
+ mmA = _mm_loadu_si64((__m64 *)&inptr[0]);
+ mmG = _mm_loadu_si64((__m64 *)&inptr[8]);
+ mmF = _mm_loadu_si64((__m64 *)&inptr[16]);
+ }
+ inptr += RGB_PIXELSIZE * 8;
+ }
+ mmD = _mm_srli_si64(mmA, 4 * BYTE_BIT);
+ mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT);
+
+ mmA = _mm_unpackhi_pi8(mmA, mmG);
+ mmG = _mm_slli_si64(mmG, 4 * BYTE_BIT);
+
+ mmD = _mm_unpacklo_pi8(mmD, mmF);
+ mmG = _mm_unpackhi_pi8(mmG, mmF);
+
+ mmE = _mm_srli_si64(mmA, 4 * BYTE_BIT);
+ mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT);
+
+ mmA = _mm_unpackhi_pi8(mmA, mmD);
+ mmD = _mm_slli_si64(mmD, 4 * BYTE_BIT);
+
+ mmE = _mm_unpacklo_pi8(mmE, mmG);
+ mmD = _mm_unpackhi_pi8(mmD, mmG);
+ mmC = _mm_loadhi_pi8_f(mmA);
+ mmA = _mm_loadlo_pi8_f(mmA);
+
+ mmB = _mm_loadhi_pi8_f(mmE);
+ mmE = _mm_loadlo_pi8_f(mmE);
+
+ mmF = _mm_loadhi_pi8_f(mmD);
+ mmD = _mm_loadlo_pi8_f(mmD);
+
+#else /* RGB_PIXELSIZE == 4 */
+
+ if (num_cols < 8) {
+ col = num_cols;
+ asm(".set noreorder\r\n"
+
+ "li $8, 1\r\n"
+ "move $9, %4\r\n"
+ "and $10, $9, $8\r\n"
+ "beqz $10, 1f\r\n"
+ "nop \r\n"
+ "subu $9, $9, 1\r\n"
+ PTR_SLL "$11, $9, 2\r\n"
+ "move $13, %5\r\n"
+ PTR_ADDU "$13, $13, $11\r\n"
+ "lwc1 %0, 0($13)\r\n"
+
+ "1: \r\n"
+ "li $8, 2\r\n"
+ "and $10, $9, $8\r\n"
+ "beqz $10, 2f\r\n"
+ "nop \r\n"
+ "subu $9, $9, 2\r\n"
+ PTR_SLL "$11, $9, 2\r\n"
+ "move $13, %5\r\n"
+ PTR_ADDU "$13, $13, $11\r\n"
+ "mov.s %1, %0\r\n"
+ "ldc1 %0, 0($13)\r\n"
+
+ "2: \r\n"
+ "li $8, 4\r\n"
+ "and $10, $9, $8\r\n"
+ "beqz $10, 3f\r\n"
+ "nop \r\n"
+ "mov.s %2, %0\r\n"
+ "mov.s %3, %1\r\n"
+ "ldc1 %0, 0(%5)\r\n"
+ "ldc1 %1, 8(%5)\r\n"
+
+ "3: \r\n"
+ "nop \r\n"
+ ".set reorder\r\n"
+
+ : "=f" (mmA), "=f" (mmF), "=f" (mmD), "=f" (mmC)
+ : "r" (col), "r" (inptr)
+ : "$f0", "$f2", "$8", "$9", "$10", "$11", "$13", "memory"
+ );
+ } else {
+ if (!(((long)inptr) & 7)) {
+ mmA = _mm_load_si64((__m64 *)&inptr[0]);
+ mmF = _mm_load_si64((__m64 *)&inptr[8]);
+ mmD = _mm_load_si64((__m64 *)&inptr[16]);
+ mmC = _mm_load_si64((__m64 *)&inptr[24]);
+ } else {
+ mmA = _mm_loadu_si64((__m64 *)&inptr[0]);
+ mmF = _mm_loadu_si64((__m64 *)&inptr[8]);
+ mmD = _mm_loadu_si64((__m64 *)&inptr[16]);
+ mmC = _mm_loadu_si64((__m64 *)&inptr[24]);
+ }
+ inptr += RGB_PIXELSIZE * 8;
+ }
+ mmB = _mm_unpackhi_pi8(mmA, mmF);
+ mmA = _mm_unpacklo_pi8(mmA, mmF);
+
+ mmG = _mm_unpackhi_pi8(mmD, mmC);
+ mmD = _mm_unpacklo_pi8(mmD, mmC);
+
+ mmE = _mm_unpackhi_pi16(mmA, mmD);
+ mmA = _mm_unpacklo_pi16(mmA, mmD);
+
+ mmH = _mm_unpackhi_pi16(mmB, mmG);
+ mmB = _mm_unpacklo_pi16(mmB, mmG);
+
+ mmC = _mm_loadhi_pi8_f(mmA);
+ mmA = _mm_loadlo_pi8_f(mmA);
+
+ mmD = _mm_loadhi_pi8_f(mmB);
+ mmB = _mm_loadlo_pi8_f(mmB);
+
+ mmG = _mm_loadhi_pi8_f(mmE);
+ mmE = _mm_loadlo_pi8_f(mmE);
+
+ mmF = _mm_unpacklo_pi8(mmH, mmH);
+ mmH = _mm_unpackhi_pi8(mmH, mmH);
+ mmF = _mm_srli_pi16(mmF, BYTE_BIT);
+ mmH = _mm_srli_pi16(mmH, BYTE_BIT);
+
+#endif
+
+ /* re=(R0 R2 R4 R6), ge=(G0 G2 G4 G6), be=(B0 B2 B4 B6)
+ * ro=(R1 R3 R5 R7), go=(G1 G3 G5 G7), bo=(B1 B3 B5 B7)
+ *
+ * (Original)
+ * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+ *
+ * (This implementation)
+ * Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+ */
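+ /* Note: as in the YCC path, 0.58700 * G is split into
+  * 0.33700 * G + 0.25000 * G so that every coefficient fits in the
+  * signed 16-bit words consumed by _mm_madd_pi16(). */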
+
+ rglo = _mm_unpacklo_pi16(ro, go);
+ rgho = _mm_unpackhi_pi16(ro, go);
+ ylo_rg = _mm_madd_pi16(rglo, PW_F0299_F0337);
+ yho_rg = _mm_madd_pi16(rgho, PW_F0299_F0337);
+
+ rgle = _mm_unpacklo_pi16(re, ge);
+ rghe = _mm_unpackhi_pi16(re, ge);
+ yle_rg = _mm_madd_pi16(rgle, PW_F0299_F0337);
+ yhe_rg = _mm_madd_pi16(rghe, PW_F0299_F0337);
+
+ bglo = _mm_unpacklo_pi16(bo, go);
+ bgho = _mm_unpackhi_pi16(bo, go);
+ ylo_bg = _mm_madd_pi16(bglo, PW_F0114_F0250);
+ yho_bg = _mm_madd_pi16(bgho, PW_F0114_F0250);
+
+ ylo = _mm_add_pi32(ylo_bg, ylo_rg);
+ yho = _mm_add_pi32(yho_bg, yho_rg);
+ ylo = _mm_add_pi32(ylo, PD_ONEHALF);
+ yho = _mm_add_pi32(yho, PD_ONEHALF);
+ ylo = _mm_srli_pi32(ylo, SCALEBITS);
+ yho = _mm_srli_pi32(yho, SCALEBITS);
+ yo = _mm_packs_pi32(ylo, yho);
+
+ bgle = _mm_unpacklo_pi16(be, ge);
+ bghe = _mm_unpackhi_pi16(be, ge);
+ yle_bg = _mm_madd_pi16(bgle, PW_F0114_F0250);
+ yhe_bg = _mm_madd_pi16(bghe, PW_F0114_F0250);
+
+ yle = _mm_add_pi32(yle_bg, yle_rg);
+ yhe = _mm_add_pi32(yhe_bg, yhe_rg);
+ yle = _mm_add_pi32(yle, PD_ONEHALF);
+ yhe = _mm_add_pi32(yhe, PD_ONEHALF);
+ yle = _mm_srli_pi32(yle, SCALEBITS);
+ yhe = _mm_srli_pi32(yhe, SCALEBITS);
+ ye = _mm_packs_pi32(yle, yhe);
+
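+ /* ye holds the even-pixel Y values and yo the odd-pixel Y values, each
+  * in the low byte of a 16-bit lane.  Shifting yo left by one byte and
+  * OR-ing re-interleaves them into packed bytes (Y0 Y1 ... Y7).
+  */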
+ yo = _mm_slli_pi16(yo, BYTE_BIT);
+ y = _mm_or_si64(ye, yo);
+
+ _mm_store_si64((__m64 *)&outptr[0], y);
+ }
+ }
+}
+
+#undef mmA
+#undef mmB
+#undef mmC
+#undef mmD
+#undef mmE
+#undef mmF
+#undef mmG
+#undef mmH
diff --git a/media/libjpeg/simd/mips64/jcsample-mmi.c b/media/libjpeg/simd/mips64/jcsample-mmi.c
new file mode 100644
index 0000000000..0354dac087
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jcsample-mmi.c
@@ -0,0 +1,98 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, 2018-2019, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
+ * All Rights Reserved.
+ * Authors: ZhuChen <zhuchen@loongson.cn>
+ * CaiWanwei <caiwanwei@loongson.cn>
+ * SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* CHROMA DOWNSAMPLING */
+
+#include "jsimd_mmi.h"
+#include "jcsample.h"
+
+
+void jsimd_h2v2_downsample_mmi(JDIMENSION image_width, int max_v_samp_factor,
+ JDIMENSION v_samp_factor,
+ JDIMENSION width_in_blocks,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+ int inrow, outrow, outcol;
+ JDIMENSION output_cols = width_in_blocks * DCTSIZE;
+ JSAMPROW inptr0, inptr1, outptr;
+ __m64 bias, mask = 0.0, thisavg, nextavg, avg;
+ __m64 this0o, this0e, this0, this0sum, next0o, next0e, next0, next0sum;
+ __m64 this1o, this1e, this1, this1sum, next1o, next1e, next1, next1sum;
+
+ expand_right_edge(input_data, max_v_samp_factor, image_width,
+ output_cols * 2);
+
+ bias = _mm_set1_pi32((1 << 17) + 1); /* 0x00020001 (32-bit bias pattern) */
+ /* bias={1, 2, 1, 2} (16-bit) */
+ mask = _mm_cmpeq_pi16(mask, mask);
+ mask = _mm_srli_pi16(mask, BYTE_BIT); /* {0xFF 0x00 0xFF 0x00 ..} */
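+ /* Each output byte is the rounded average of a 2x2 input block:
+  * (s(2x,2y) + s(2x+1,2y) + s(2x,2y+1) + s(2x+1,2y+1) + bias) >> 2,
+  * with the bias alternating 1, 2, 1, 2 across output columns so that the
+  * truncation error does not drift in one direction (the same trick the
+  * C code in jcsample.c uses).
+  */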
+
+ for (inrow = 0, outrow = 0; outrow < v_samp_factor;
+ inrow += 2, outrow++) {
+
+ inptr0 = input_data[inrow];
+ inptr1 = input_data[inrow + 1];
+ outptr = output_data[outrow];
+
+ for (outcol = output_cols; outcol > 0;
+ outcol -= 8, inptr0 += 16, inptr1 += 16, outptr += 8) {
+
+ this0 = _mm_load_si64((__m64 *)&inptr0[0]);
+ this1 = _mm_load_si64((__m64 *)&inptr1[0]);
+ next0 = _mm_load_si64((__m64 *)&inptr0[8]);
+ next1 = _mm_load_si64((__m64 *)&inptr1[8]);
+
+ this0o = _mm_and_si64(this0, mask);
+ this0e = _mm_srli_pi16(this0, BYTE_BIT);
+ this1o = _mm_and_si64(this1, mask);
+ this1e = _mm_srli_pi16(this1, BYTE_BIT);
+ this0sum = _mm_add_pi16(this0o, this0e);
+ this1sum = _mm_add_pi16(this1o, this1e);
+
+ next0o = _mm_and_si64(next0, mask);
+ next0e = _mm_srli_pi16(next0, BYTE_BIT);
+ next1o = _mm_and_si64(next1, mask);
+ next1e = _mm_srli_pi16(next1, BYTE_BIT);
+ next0sum = _mm_add_pi16(next0o, next0e);
+ next1sum = _mm_add_pi16(next1o, next1e);
+
+ thisavg = _mm_add_pi16(this0sum, this1sum);
+ nextavg = _mm_add_pi16(next0sum, next1sum);
+ thisavg = _mm_add_pi16(thisavg, bias);
+ nextavg = _mm_add_pi16(nextavg, bias);
+ thisavg = _mm_srli_pi16(thisavg, 2);
+ nextavg = _mm_srli_pi16(nextavg, 2);
+
+ avg = _mm_packs_pu16(thisavg, nextavg);
+
+ _mm_store_si64((__m64 *)&outptr[0], avg);
+ }
+ }
+}
diff --git a/media/libjpeg/simd/jcsample.h b/media/libjpeg/simd/mips64/jcsample.h
index 2a50544e97..bd07fcc4ed 100644
--- a/media/libjpeg/simd/jcsample.h
+++ b/media/libjpeg/simd/mips64/jcsample.h
@@ -8,19 +8,19 @@
*/
LOCAL(void)
-expand_right_edge (JSAMPARRAY image_data, int num_rows,
- JDIMENSION input_cols, JDIMENSION output_cols)
+expand_right_edge(JSAMPARRAY image_data, int num_rows, JDIMENSION input_cols,
+ JDIMENSION output_cols)
{
register JSAMPROW ptr;
register JSAMPLE pixval;
register int count;
int row;
- int numcols = (int) (output_cols - input_cols);
+ int numcols = (int)(output_cols - input_cols);
if (numcols > 0) {
for (row = 0; row < num_rows; row++) {
ptr = image_data[row] + input_cols;
- pixval = ptr[-1]; /* don't need GETJSAMPLE() here */
+ pixval = ptr[-1];
for (count = numcols; count > 0; count--)
*ptr++ = pixval;
}
diff --git a/media/libjpeg/simd/mips64/jdcolext-mmi.c b/media/libjpeg/simd/mips64/jdcolext-mmi.c
new file mode 100644
index 0000000000..3b5b2f2030
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jdcolext-mmi.c
@@ -0,0 +1,415 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2015, 2019, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ * All Rights Reserved.
+ * Authors: ZhuChen <zhuchen@loongson.cn>
+ * SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ * CaiWanwei <caiwanwei@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jdcolor-mmi.c */
+
+
+#if RGB_RED == 0
+#define mmA re
+#define mmB ro
+#elif RGB_GREEN == 0
+#define mmA ge
+#define mmB go
+#elif RGB_BLUE == 0
+#define mmA be
+#define mmB bo
+#else
+#define mmA xe
+#define mmB xo
+#endif
+
+#if RGB_RED == 1
+#define mmC re
+#define mmD ro
+#elif RGB_GREEN == 1
+#define mmC ge
+#define mmD go
+#elif RGB_BLUE == 1
+#define mmC be
+#define mmD bo
+#else
+#define mmC xe
+#define mmD xo
+#endif
+
+#if RGB_RED == 2
+#define mmE re
+#define mmF ro
+#elif RGB_GREEN == 2
+#define mmE ge
+#define mmF go
+#elif RGB_BLUE == 2
+#define mmE be
+#define mmF bo
+#else
+#define mmE xe
+#define mmF xo
+#endif
+
+#if RGB_RED == 3
+#define mmG re
+#define mmH ro
+#elif RGB_GREEN == 3
+#define mmG ge
+#define mmH go
+#elif RGB_BLUE == 3
+#define mmG be
+#define mmH bo
+#else
+#define mmG xe
+#define mmH xo
+#endif
+
+
+void jsimd_ycc_rgb_convert_mmi(JDIMENSION out_width, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
+{
+ JSAMPROW outptr, inptr0, inptr1, inptr2;
+ int num_cols, col;
+ __m64 ye, yo, y, cbe, cbe2, cbo, cbo2, cb, cre, cre2, cro, cro2, cr;
+ __m64 re, ro, gle, ghe, ge, glo, gho, go, be, bo, xe = 0.0, xo = 0.0;
+ __m64 decenter, mask;
+
+ while (--num_rows >= 0) {
+ inptr0 = input_buf[0][input_row];
+ inptr1 = input_buf[1][input_row];
+ inptr2 = input_buf[2][input_row];
+ input_row++;
+ outptr = *output_buf++;
+
+ for (num_cols = out_width; num_cols > 0; num_cols -= 8,
+ inptr0 += 8, inptr1 += 8, inptr2 += 8) {
+
+ cb = _mm_load_si64((__m64 *)inptr1);
+ cr = _mm_load_si64((__m64 *)inptr2);
+ y = _mm_load_si64((__m64 *)inptr0);
+
+ mask = decenter = 0.0;
+ mask = _mm_cmpeq_pi16(mask, mask);
+ decenter = _mm_cmpeq_pi16(decenter, decenter);
+ mask = _mm_srli_pi16(mask, BYTE_BIT); /* {0xFF 0x00 0xFF 0x00 ..} */
+ decenter = _mm_slli_pi16(decenter, 7); /* {0xFF80 0xFF80 0xFF80 0xFF80} */
+
+ cbe = _mm_and_si64(mask, cb); /* Cb(0246) */
+ cbo = _mm_srli_pi16(cb, BYTE_BIT); /* Cb(1357) */
+ cre = _mm_and_si64(mask, cr); /* Cr(0246) */
+ cro = _mm_srli_pi16(cr, BYTE_BIT); /* Cr(1357) */
+ cbe = _mm_add_pi16(cbe, decenter);
+ cbo = _mm_add_pi16(cbo, decenter);
+ cre = _mm_add_pi16(cre, decenter);
+ cro = _mm_add_pi16(cro, decenter);
+
+ /* (Original)
+ * R = Y + 1.40200 * Cr
+ * G = Y - 0.34414 * Cb - 0.71414 * Cr
+ * B = Y + 1.77200 * Cb
+ *
+ * (This implementation)
+ * R = Y + 0.40200 * Cr + Cr
+ * G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+ * B = Y - 0.22800 * Cb + Cb + Cb
+ */
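+ /* A sketch of the rationale for the folding above: FIX(1.40200) = 91881
+  * does not fit in a signed 16-bit word, so whole multiples of Cb/Cr are
+  * peeled off and applied as plain additions/subtractions:
+  *   1.40200 =  0.40200 + 1   (PW_F0402, then add Cr)
+  *  -0.71414 =  0.28586 - 1   (folded into PW_MF0344_F0285, then subtract Cr)
+  *   1.77200 = -0.22800 + 2   (PW_MF0228, then add Cb twice)
+  */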
+
+ cbe2 = _mm_add_pi16(cbe, cbe); /* 2*CbE */
+ cbo2 = _mm_add_pi16(cbo, cbo); /* 2*CbO */
+ cre2 = _mm_add_pi16(cre, cre); /* 2*CrE */
+ cro2 = _mm_add_pi16(cro, cro); /* 2*CrO */
+
+ be = _mm_mulhi_pi16(cbe2, PW_MF0228); /* (2*CbE * -FIX(0.22800)) */
+ bo = _mm_mulhi_pi16(cbo2, PW_MF0228); /* (2*CbO * -FIX(0.22800)) */
+ re = _mm_mulhi_pi16(cre2, PW_F0402); /* (2*CrE * FIX(0.40200)) */
+ ro = _mm_mulhi_pi16(cro2, PW_F0402); /* (2*CrO * FIX(0.40200)) */
+
+ be = _mm_add_pi16(be, PW_ONE);
+ bo = _mm_add_pi16(bo, PW_ONE);
+ be = _mm_srai_pi16(be, 1); /* (CbE * -FIX(0.22800)) */
+ bo = _mm_srai_pi16(bo, 1); /* (CbO * -FIX(0.22800)) */
+ re = _mm_add_pi16(re, PW_ONE);
+ ro = _mm_add_pi16(ro, PW_ONE);
+ re = _mm_srai_pi16(re, 1); /* (CrE * FIX(0.40200)) */
+ ro = _mm_srai_pi16(ro, 1); /* (CrO * FIX(0.40200)) */
+
+ be = _mm_add_pi16(be, cbe);
+ bo = _mm_add_pi16(bo, cbo);
+ be = _mm_add_pi16(be, cbe); /* (CbE * FIX(1.77200))=(B-Y)E */
+ bo = _mm_add_pi16(bo, cbo); /* (CbO * FIX(1.77200))=(B-Y)O */
+ re = _mm_add_pi16(re, cre); /* (CrE * FIX(1.40200))=(R-Y)E */
+ ro = _mm_add_pi16(ro, cro); /* (CrO * FIX(1.40200))=(R-Y)O */
+
+ gle = _mm_unpacklo_pi16(cbe, cre);
+ ghe = _mm_unpackhi_pi16(cbe, cre);
+ gle = _mm_madd_pi16(gle, PW_MF0344_F0285);
+ ghe = _mm_madd_pi16(ghe, PW_MF0344_F0285);
+ glo = _mm_unpacklo_pi16(cbo, cro);
+ gho = _mm_unpackhi_pi16(cbo, cro);
+ glo = _mm_madd_pi16(glo, PW_MF0344_F0285);
+ gho = _mm_madd_pi16(gho, PW_MF0344_F0285);
+
+ gle = _mm_add_pi32(gle, PD_ONEHALF);
+ ghe = _mm_add_pi32(ghe, PD_ONEHALF);
+ gle = _mm_srai_pi32(gle, SCALEBITS);
+ ghe = _mm_srai_pi32(ghe, SCALEBITS);
+ glo = _mm_add_pi32(glo, PD_ONEHALF);
+ gho = _mm_add_pi32(gho, PD_ONEHALF);
+ glo = _mm_srai_pi32(glo, SCALEBITS);
+ gho = _mm_srai_pi32(gho, SCALEBITS);
+
+ ge = _mm_packs_pi32(gle, ghe); /* CbE*-FIX(0.344)+CrE*FIX(0.285) */
+ go = _mm_packs_pi32(glo, gho); /* CbO*-FIX(0.344)+CrO*FIX(0.285) */
+ ge = _mm_sub_pi16(ge, cre); /* CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E */
+ go = _mm_sub_pi16(go, cro); /* CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O */
+
+ ye = _mm_and_si64(mask, y); /* Y(0246) */
+ yo = _mm_srli_pi16(y, BYTE_BIT); /* Y(1357) */
+
+ re = _mm_add_pi16(re, ye); /* ((R-Y)E+YE)=(R0 R2 R4 R6) */
+ ro = _mm_add_pi16(ro, yo); /* ((R-Y)O+YO)=(R1 R3 R5 R7) */
+ re = _mm_packs_pu16(re, re); /* (R0 R2 R4 R6 ** ** ** **) */
+ ro = _mm_packs_pu16(ro, ro); /* (R1 R3 R5 R7 ** ** ** **) */
+
+ ge = _mm_add_pi16(ge, ye); /* ((G-Y)E+YE)=(G0 G2 G4 G6) */
+ go = _mm_add_pi16(go, yo); /* ((G-Y)O+YO)=(G1 G3 G5 G7) */
+ ge = _mm_packs_pu16(ge, ge); /* (G0 G2 G4 G6 ** ** ** **) */
+ go = _mm_packs_pu16(go, go); /* (G1 G3 G5 G7 ** ** ** **) */
+
+ be = _mm_add_pi16(be, ye); /* (YE+(B-Y)E)=(B0 B2 B4 B6) */
+ bo = _mm_add_pi16(bo, yo); /* (YO+(B-Y)O)=(B1 B3 B5 B7) */
+ be = _mm_packs_pu16(be, be); /* (B0 B2 B4 B6 ** ** ** **) */
+ bo = _mm_packs_pu16(bo, bo); /* (B1 B3 B5 B7 ** ** ** **) */
+
+#if RGB_PIXELSIZE == 3
+
+ /* mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) */
+ /* mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) */
+ mmA = _mm_unpacklo_pi8(mmA, mmC); /* (00 10 02 12 04 14 06 16) */
+ mmE = _mm_unpacklo_pi8(mmE, mmB); /* (20 01 22 03 24 05 26 07) */
+ mmD = _mm_unpacklo_pi8(mmD, mmF); /* (11 21 13 23 15 25 17 27) */
+
+ mmH = _mm_srli_si64(mmA, 2 * BYTE_BIT);
+
+ mmG = _mm_unpackhi_pi16(mmA, mmE); /* (04 14 24 05 06 16 26 07) */
+ mmA = _mm_unpacklo_pi16(mmA, mmE); /* (00 10 20 01 02 12 22 03) */
+
+ mmE = _mm_srli_si64(mmE, 2 * BYTE_BIT);
+ mmB = _mm_srli_si64(mmD, 2 * BYTE_BIT); /* (13 23 15 25 17 27 -- --) */
+
+ mmC = _mm_unpackhi_pi16(mmD, mmH); /* (15 25 06 16 17 27 -- --) */
+ mmD = _mm_unpacklo_pi16(mmD, mmH); /* (11 21 02 12 13 23 04 14) */
+
+ mmF = _mm_unpackhi_pi16(mmE, mmB); /* (26 07 17 27 -- -- -- --) */
+ mmE = _mm_unpacklo_pi16(mmE, mmB); /* (22 03 13 23 24 05 15 25) */
+
+ mmA = _mm_unpacklo_pi32(mmA, mmD); /* (00 10 20 01 11 21 02 12) */
+ mmE = _mm_unpacklo_pi32(mmE, mmG); /* (22 03 13 23 04 14 24 05) */
+ mmC = _mm_unpacklo_pi32(mmC, mmF); /* (15 25 06 16 26 07 17 27) */
+
+ if (num_cols >= 8) {
+ if (!(((long)outptr) & 7)) {
+ _mm_store_si64((__m64 *)outptr, mmA);
+ _mm_store_si64((__m64 *)(outptr + 8), mmE);
+ _mm_store_si64((__m64 *)(outptr + 16), mmC);
+ } else {
+ _mm_storeu_si64((__m64 *)outptr, mmA);
+ _mm_storeu_si64((__m64 *)(outptr + 8), mmE);
+ _mm_storeu_si64((__m64 *)(outptr + 16), mmC);
+ }
+ outptr += RGB_PIXELSIZE * 8;
+ } else {
+ col = num_cols * 3;
+ asm(".set noreorder\r\n"
+
+ "li $8, 16\r\n"
+ "move $9, %4\r\n"
+ "mov.s $f4, %1\r\n"
+ "mov.s $f6, %3\r\n"
+ "move $10, %5\r\n"
+ "bltu $9, $8, 1f\r\n"
+ "nop \r\n"
+ "gssdlc1 $f4, 7($10)\r\n"
+ "gssdrc1 $f4, 0($10)\r\n"
+ "gssdlc1 $f6, 7+8($10)\r\n"
+ "gssdrc1 $f6, 8($10)\r\n"
+ "mov.s $f4, %2\r\n"
+ "subu $9, $9, 16\r\n"
+ PTR_ADDU "$10, $10, 16\r\n"
+ "b 2f\r\n"
+ "nop \r\n"
+
+ "1: \r\n"
+ "li $8, 8\r\n" /* st8 */
+ "bltu $9, $8, 2f\r\n"
+ "nop \r\n"
+ "gssdlc1 $f4, 7($10)\r\n"
+ "gssdrc1 $f4, 0($10)\r\n"
+ "mov.s $f4, %3\r\n"
+ "subu $9, $9, 8\r\n"
+ PTR_ADDU "$10, $10, 8\r\n"
+
+ "2: \r\n"
+ "li $8, 4\r\n" /* st4 */
+ "mfc1 $11, $f4\r\n"
+ "bltu $9, $8, 3f\r\n"
+ "nop \r\n"
+ "swl $11, 3($10)\r\n"
+ "swr $11, 0($10)\r\n"
+ "li $8, 32\r\n"
+ "mtc1 $8, $f6\r\n"
+ "dsrl $f4, $f4, $f6\r\n"
+ "mfc1 $11, $f4\r\n"
+ "subu $9, $9, 4\r\n"
+ PTR_ADDU "$10, $10, 4\r\n"
+
+ "3: \r\n"
+ "li $8, 2\r\n" /* st2 */
+ "bltu $9, $8, 4f\r\n"
+ "nop \r\n"
+ "ush $11, 0($10)\r\n"
+ "srl $11, 16\r\n"
+ "subu $9, $9, 2\r\n"
+ PTR_ADDU "$10, $10, 2\r\n"
+
+ "4: \r\n"
+ "li $8, 1\r\n" /* st1 */
+ "bltu $9, $8, 5f\r\n"
+ "nop \r\n"
+ "sb $11, 0($10)\r\n"
+
+ "5: \r\n"
+ "nop \r\n" /* end */
+ : "=m" (*outptr)
+ : "f" (mmA), "f" (mmC), "f" (mmE), "r" (col), "r" (outptr)
+ : "$f4", "$f6", "$8", "$9", "$10", "$11", "memory"
+ );
+ }
+
+#else /* RGB_PIXELSIZE == 4 */
+
+#ifdef RGBX_FILLER_0XFF
+ xe = _mm_cmpeq_pi8(xe, xe);
+ xo = _mm_cmpeq_pi8(xo, xo);
+#else
+ xe = _mm_xor_si64(xe, xe);
+ xo = _mm_xor_si64(xo, xo);
+#endif
+ /* mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) */
+ /* mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) */
+ /* mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **) */
+ /* mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **) */
+
+ mmA = _mm_unpacklo_pi8(mmA, mmC); /* (00 10 02 12 04 14 06 16) */
+ mmE = _mm_unpacklo_pi8(mmE, mmG); /* (20 30 22 32 24 34 26 36) */
+ mmB = _mm_unpacklo_pi8(mmB, mmD); /* (01 11 03 13 05 15 07 17) */
+ mmF = _mm_unpacklo_pi8(mmF, mmH); /* (21 31 23 33 25 35 27 37) */
+
+ mmC = _mm_unpackhi_pi16(mmA, mmE); /* (04 14 24 34 06 16 26 36) */
+ mmA = _mm_unpacklo_pi16(mmA, mmE); /* (00 10 20 30 02 12 22 32) */
+ mmG = _mm_unpackhi_pi16(mmB, mmF); /* (05 15 25 35 07 17 27 37) */
+ mmB = _mm_unpacklo_pi16(mmB, mmF); /* (01 11 21 31 03 13 23 33) */
+
+ mmD = _mm_unpackhi_pi32(mmA, mmB); /* (02 12 22 32 03 13 23 33) */
+ mmA = _mm_unpacklo_pi32(mmA, mmB); /* (00 10 20 30 01 11 21 31) */
+ mmH = _mm_unpackhi_pi32(mmC, mmG); /* (06 16 26 36 07 17 27 37) */
+ mmC = _mm_unpacklo_pi32(mmC, mmG); /* (04 14 24 34 05 15 25 35) */
+
+ if (num_cols >= 8) {
+ if (!(((long)outptr) & 7)) {
+ _mm_store_si64((__m64 *)outptr, mmA);
+ _mm_store_si64((__m64 *)(outptr + 8), mmD);
+ _mm_store_si64((__m64 *)(outptr + 16), mmC);
+ _mm_store_si64((__m64 *)(outptr + 24), mmH);
+ } else {
+ _mm_storeu_si64((__m64 *)outptr, mmA);
+ _mm_storeu_si64((__m64 *)(outptr + 8), mmD);
+ _mm_storeu_si64((__m64 *)(outptr + 16), mmC);
+ _mm_storeu_si64((__m64 *)(outptr + 24), mmH);
+ }
+ outptr += RGB_PIXELSIZE * 8;
+ } else {
+ col = num_cols;
+ asm(".set noreorder\r\n" /* st16 */
+
+ "li $8, 4\r\n"
+ "move $9, %6\r\n"
+ "move $10, %7\r\n"
+ "mov.s $f4, %2\r\n"
+ "mov.s $f6, %4\r\n"
+ "bltu $9, $8, 1f\r\n"
+ "nop \r\n"
+ "gssdlc1 $f4, 7($10)\r\n"
+ "gssdrc1 $f4, 0($10)\r\n"
+ "gssdlc1 $f6, 7+8($10)\r\n"
+ "gssdrc1 $f6, 8($10)\r\n"
+ "mov.s $f4, %3\r\n"
+ "mov.s $f6, %5\r\n"
+ "subu $9, $9, 4\r\n"
+ PTR_ADDU "$10, $10, 16\r\n"
+
+ "1: \r\n"
+ "li $8, 2\r\n" /* st8 */
+ "bltu $9, $8, 2f\r\n"
+ "nop \r\n"
+ "gssdlc1 $f4, 7($10)\r\n"
+ "gssdrc1 $f4, 0($10)\r\n"
+ "mov.s $f4, $f6\r\n"
+ "subu $9, $9, 2\r\n"
+ PTR_ADDU "$10, $10, 8\r\n"
+
+ "2: \r\n"
+ "li $8, 1\r\n" /* st4 */
+ "bltu $9, $8, 3f\r\n"
+ "nop \r\n"
+ "gsswlc1 $f4, 3($10)\r\n"
+ "gsswrc1 $f4, 0($10)\r\n"
+
+ "3: \r\n"
+ "li %1, 0\r\n" /* end */
+ : "=m" (*outptr), "=r" (col)
+ : "f" (mmA), "f" (mmC), "f" (mmD), "f" (mmH), "r" (col),
+ "r" (outptr)
+ : "$f4", "$f6", "$8", "$9", "$10", "memory"
+ );
+ }
+
+#endif
+
+ }
+ }
+}
+
+#undef mmA
+#undef mmB
+#undef mmC
+#undef mmD
+#undef mmE
+#undef mmF
+#undef mmG
+#undef mmH
diff --git a/media/libjpeg/simd/mips64/jdcolor-mmi.c b/media/libjpeg/simd/mips64/jdcolor-mmi.c
new file mode 100644
index 0000000000..2c58263dbd
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jdcolor-mmi.c
@@ -0,0 +1,139 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2011, 2015, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
+ * All Rights Reserved.
+ * Authors: ZhuChen <zhuchen@loongson.cn>
+ * CaiWanwei <caiwanwei@loongson.cn>
+ * SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* YCC --> RGB CONVERSION */
+
+#include "jsimd_mmi.h"
+
+
+#define F_0_344 ((short)22554) /* FIX(0.34414) */
+#define F_0_402 ((short)26345) /* FIX(1.40200) - FIX(1) */
+#define F_0_285 ((short)18734) /* FIX(1) - FIX(0.71414) */
+#define F_0_228 ((short)14942) /* FIX(2) - FIX(1.77200) */
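+
+/* Assuming SCALEBITS == 16, FIX(x) is x * 65536 rounded to the nearest
+ * integer: 0.34414 * 65536 ~= 22554, 0.40200 * 65536 ~= 26345,
+ * 0.28586 * 65536 ~= 18734, 0.22800 * 65536 ~= 14942.  Every constant
+ * therefore fits in the signed 16-bit words that _mm_mulhi_pi16 and
+ * _mm_madd_pi16 operate on.
+ */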
+
+enum const_index {
+ index_PW_ONE,
+ index_PW_F0402,
+ index_PW_MF0228,
+ index_PW_MF0344_F0285,
+ index_PD_ONEHALF
+};
+
+static uint64_t const_value[] = {
+ _uint64_set_pi16(1, 1, 1, 1),
+ _uint64_set_pi16(F_0_402, F_0_402, F_0_402, F_0_402),
+ _uint64_set_pi16(-F_0_228, -F_0_228, -F_0_228, -F_0_228),
+ _uint64_set_pi16(F_0_285, -F_0_344, F_0_285, -F_0_344),
+ _uint64_set_pi32((int)(1 << (SCALEBITS - 1)), (int)(1 << (SCALEBITS - 1)))
+};
+
+#define PW_ONE get_const_value(index_PW_ONE)
+#define PW_F0402 get_const_value(index_PW_F0402)
+#define PW_MF0228 get_const_value(index_PW_MF0228)
+#define PW_MF0344_F0285 get_const_value(index_PW_MF0344_F0285)
+#define PD_ONEHALF get_const_value(index_PD_ONEHALF)
+
+#define RGBX_FILLER_0XFF 1
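+
+/* With RGBX_FILLER_0XFF defined, the included jdcolext-mmi.c fills the X
+ * byte of 4-byte pixel formats with 0xFF (via _mm_cmpeq_pi8 of a register
+ * with itself, which yields all ones); without it, the X byte would be
+ * zeroed with _mm_xor_si64.
+ */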
+
+
+#include "jdcolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED EXT_RGB_RED
+#define RGB_GREEN EXT_RGB_GREEN
+#define RGB_BLUE EXT_RGB_BLUE
+#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+#define jsimd_ycc_rgb_convert_mmi jsimd_ycc_extrgb_convert_mmi
+#include "jdcolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_mmi
+
+#define RGB_RED EXT_RGBX_RED
+#define RGB_GREEN EXT_RGBX_GREEN
+#define RGB_BLUE EXT_RGBX_BLUE
+#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+#define jsimd_ycc_rgb_convert_mmi jsimd_ycc_extrgbx_convert_mmi
+#include "jdcolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_mmi
+
+#define RGB_RED EXT_BGR_RED
+#define RGB_GREEN EXT_BGR_GREEN
+#define RGB_BLUE EXT_BGR_BLUE
+#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+#define jsimd_ycc_rgb_convert_mmi jsimd_ycc_extbgr_convert_mmi
+#include "jdcolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_mmi
+
+#define RGB_RED EXT_BGRX_RED
+#define RGB_GREEN EXT_BGRX_GREEN
+#define RGB_BLUE EXT_BGRX_BLUE
+#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+#define jsimd_ycc_rgb_convert_mmi jsimd_ycc_extbgrx_convert_mmi
+#include "jdcolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_mmi
+
+#define RGB_RED EXT_XBGR_RED
+#define RGB_GREEN EXT_XBGR_GREEN
+#define RGB_BLUE EXT_XBGR_BLUE
+#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+#define jsimd_ycc_rgb_convert_mmi jsimd_ycc_extxbgr_convert_mmi
+#include "jdcolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_mmi
+
+#define RGB_RED EXT_XRGB_RED
+#define RGB_GREEN EXT_XRGB_GREEN
+#define RGB_BLUE EXT_XRGB_BLUE
+#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+#define jsimd_ycc_rgb_convert_mmi jsimd_ycc_extxrgb_convert_mmi
+#include "jdcolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_mmi
diff --git a/media/libjpeg/simd/mips64/jdmerge-mmi.c b/media/libjpeg/simd/mips64/jdmerge-mmi.c
new file mode 100644
index 0000000000..0a39bd5680
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jdmerge-mmi.c
@@ -0,0 +1,149 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2011, 2015, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ * All Rights Reserved.
+ * Authors: ZhangLixia <zhanglixia-hf@loongson.cn>
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* YCC --> RGB CONVERSION */
+
+#include "jsimd_mmi.h"
+
+
+#define F_0_344 ((short)22554) /* FIX(0.34414) */
+#define F_0_402 ((short)26345) /* FIX(1.40200) - FIX(1) */
+#define F_0_285 ((short)18734) /* FIX(1) - FIX(0.71414) */
+#define F_0_228 ((short)14942) /* FIX(2) - FIX(1.77200) */
+
+enum const_index {
+ index_PW_ONE,
+ index_PW_F0402,
+ index_PW_MF0228,
+ index_PW_MF0344_F0285,
+ index_PD_ONEHALF
+};
+
+static uint64_t const_value[] = {
+ _uint64_set_pi16(1, 1, 1, 1),
+ _uint64_set_pi16(F_0_402, F_0_402, F_0_402, F_0_402),
+ _uint64_set_pi16(-F_0_228, -F_0_228, -F_0_228, -F_0_228),
+ _uint64_set_pi16(F_0_285, -F_0_344, F_0_285, -F_0_344),
+ _uint64_set_pi32((int)(1 << (SCALEBITS - 1)), (int)(1 << (SCALEBITS - 1)))
+};
+
+#define PW_ONE get_const_value(index_PW_ONE)
+#define PW_F0402 get_const_value(index_PW_F0402)
+#define PW_MF0228 get_const_value(index_PW_MF0228)
+#define PW_MF0344_F0285 get_const_value(index_PW_MF0344_F0285)
+#define PD_ONEHALF get_const_value(index_PD_ONEHALF)
+
+#define RGBX_FILLER_0XFF 1
+
+
+#include "jdmrgext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED EXT_RGB_RED
+#define RGB_GREEN EXT_RGB_GREEN
+#define RGB_BLUE EXT_RGB_BLUE
+#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_mmi jsimd_h2v1_extrgb_merged_upsample_mmi
+#define jsimd_h2v2_merged_upsample_mmi jsimd_h2v2_extrgb_merged_upsample_mmi
+#include "jdmrgext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_mmi
+#undef jsimd_h2v2_merged_upsample_mmi
+
+#define RGB_RED EXT_RGBX_RED
+#define RGB_GREEN EXT_RGBX_GREEN
+#define RGB_BLUE EXT_RGBX_BLUE
+#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_mmi jsimd_h2v1_extrgbx_merged_upsample_mmi
+#define jsimd_h2v2_merged_upsample_mmi jsimd_h2v2_extrgbx_merged_upsample_mmi
+#include "jdmrgext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_mmi
+#undef jsimd_h2v2_merged_upsample_mmi
+
+#define RGB_RED EXT_BGR_RED
+#define RGB_GREEN EXT_BGR_GREEN
+#define RGB_BLUE EXT_BGR_BLUE
+#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_mmi jsimd_h2v1_extbgr_merged_upsample_mmi
+#define jsimd_h2v2_merged_upsample_mmi jsimd_h2v2_extbgr_merged_upsample_mmi
+#include "jdmrgext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_mmi
+#undef jsimd_h2v2_merged_upsample_mmi
+
+#define RGB_RED EXT_BGRX_RED
+#define RGB_GREEN EXT_BGRX_GREEN
+#define RGB_BLUE EXT_BGRX_BLUE
+#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_mmi jsimd_h2v1_extbgrx_merged_upsample_mmi
+#define jsimd_h2v2_merged_upsample_mmi jsimd_h2v2_extbgrx_merged_upsample_mmi
+#include "jdmrgext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_mmi
+#undef jsimd_h2v2_merged_upsample_mmi
+
+#define RGB_RED EXT_XBGR_RED
+#define RGB_GREEN EXT_XBGR_GREEN
+#define RGB_BLUE EXT_XBGR_BLUE
+#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_mmi jsimd_h2v1_extxbgr_merged_upsample_mmi
+#define jsimd_h2v2_merged_upsample_mmi jsimd_h2v2_extxbgr_merged_upsample_mmi
+#include "jdmrgext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_mmi
+#undef jsimd_h2v2_merged_upsample_mmi
+
+#define RGB_RED EXT_XRGB_RED
+#define RGB_GREEN EXT_XRGB_GREEN
+#define RGB_BLUE EXT_XRGB_BLUE
+#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_mmi jsimd_h2v1_extxrgb_merged_upsample_mmi
+#define jsimd_h2v2_merged_upsample_mmi jsimd_h2v2_extxrgb_merged_upsample_mmi
+#include "jdmrgext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_mmi
+#undef jsimd_h2v2_merged_upsample_mmi
diff --git a/media/libjpeg/simd/mips64/jdmrgext-mmi.c b/media/libjpeg/simd/mips64/jdmrgext-mmi.c
new file mode 100644
index 0000000000..be09ff2a65
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jdmrgext-mmi.c
@@ -0,0 +1,615 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2015, 2019, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ * All Rights Reserved.
+ * Authors: ZhangLixia <zhanglixia-hf@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jdmerge-mmi.c */
+
+
+#if RGB_RED == 0
+#define mmA re
+#define mmB ro
+#elif RGB_GREEN == 0
+#define mmA ge
+#define mmB go
+#elif RGB_BLUE == 0
+#define mmA be
+#define mmB bo
+#else
+#define mmA xe
+#define mmB xo
+#endif
+
+#if RGB_RED == 1
+#define mmC re
+#define mmD ro
+#elif RGB_GREEN == 1
+#define mmC ge
+#define mmD go
+#elif RGB_BLUE == 1
+#define mmC be
+#define mmD bo
+#else
+#define mmC xe
+#define mmD xo
+#endif
+
+#if RGB_RED == 2
+#define mmE re
+#define mmF ro
+#elif RGB_GREEN == 2
+#define mmE ge
+#define mmF go
+#elif RGB_BLUE == 2
+#define mmE be
+#define mmF bo
+#else
+#define mmE xe
+#define mmF xo
+#endif
+
+#if RGB_RED == 3
+#define mmG re
+#define mmH ro
+#elif RGB_GREEN == 3
+#define mmG ge
+#define mmH go
+#elif RGB_BLUE == 3
+#define mmG be
+#define mmH bo
+#else
+#define mmG xe
+#define mmH xo
+#endif
+
+
+void jsimd_h2v1_merged_upsample_mmi(JDIMENSION output_width,
+ JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf)
+{
+ JSAMPROW outptr, inptr0, inptr1, inptr2;
+ int num_cols, col;
+ __m64 ythise, ythiso, ythis, ynexte, ynexto, ynext, yl, y;
+ __m64 cbl, cbl2, cbh, cbh2, cb, crl, crl2, crh, crh2, cr;
+ __m64 rle, rlo, rl, rhe, rho, rh, re, ro;
+ __m64 ga, gb, gle, glo, gl, gc, gd, ghe, gho, gh, ge, go;
+ __m64 ble, blo, bl, bhe, bho, bh, be, bo, xe = 0.0, xo = 0.0;
+ __m64 decenter, mask, zero = 0.0;
+#if RGB_PIXELSIZE == 4
+ __m64 mm8, mm9;
+#endif
+
+ inptr0 = input_buf[0][in_row_group_ctr];
+ inptr1 = input_buf[1][in_row_group_ctr];
+ inptr2 = input_buf[2][in_row_group_ctr];
+ outptr = output_buf[0];
+
+ for (num_cols = output_width >> 1; num_cols > 0; num_cols -= 8,
+ inptr0 += 16, inptr1 += 8, inptr2 += 8) {
+
+ cb = _mm_load_si64((__m64 *)inptr1);
+ cr = _mm_load_si64((__m64 *)inptr2);
+ ythis = _mm_load_si64((__m64 *)inptr0);
+ ynext = _mm_load_si64((__m64 *)inptr0 + 1);
+
+ mask = decenter = 0.0;
+ mask = _mm_cmpeq_pi16(mask, mask);
+ decenter = _mm_cmpeq_pi16(decenter, decenter);
+ mask = _mm_srli_pi16(mask, BYTE_BIT); /* {0xFF 0x00 0xFF 0x00 ..} */
+ decenter = _mm_slli_pi16(decenter, 7); /* {0xFF80 0xFF80 0xFF80 0xFF80} */
+
+ cbl = _mm_unpacklo_pi8(cb, zero); /* Cb(0123) */
+ cbh = _mm_unpackhi_pi8(cb, zero); /* Cb(4567) */
+ crl = _mm_unpacklo_pi8(cr, zero); /* Cr(0123) */
+ crh = _mm_unpackhi_pi8(cr, zero); /* Cr(4567) */
+ cbl = _mm_add_pi16(cbl, decenter);
+ cbh = _mm_add_pi16(cbh, decenter);
+ crl = _mm_add_pi16(crl, decenter);
+ crh = _mm_add_pi16(crh, decenter);
+
+ /* (Original)
+ * R = Y + 1.40200 * Cr
+ * G = Y - 0.34414 * Cb - 0.71414 * Cr
+ * B = Y + 1.77200 * Cb
+ *
+ * (This implementation)
+ * R = Y + 0.40200 * Cr + Cr
+ * G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+ * B = Y - 0.22800 * Cb + Cb + Cb
+ */
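+ /* For G, the Cb and Cr words are interleaved and fed to _mm_madd_pi16
+  * with the paired constants (-FIX(0.34414), FIX(0.28586)), producing
+  * Cb*-0.34414 + Cr*0.28586 in each 32-bit lane; subtracting Cr after
+  * scaling gives Cb*-0.34414 + Cr*-0.71414 = (G-Y).
+  */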
+
+ cbl2 = _mm_add_pi16(cbl, cbl); /* 2*CbL */
+ cbh2 = _mm_add_pi16(cbh, cbh); /* 2*CbH */
+ crl2 = _mm_add_pi16(crl, crl); /* 2*CrL */
+ crh2 = _mm_add_pi16(crh, crh); /* 2*CrH */
+
+ bl = _mm_mulhi_pi16(cbl2, PW_MF0228); /* (2*CbL * -FIX(0.22800)) */
+ bh = _mm_mulhi_pi16(cbh2, PW_MF0228); /* (2*CbH * -FIX(0.22800)) */
+ rl = _mm_mulhi_pi16(crl2, PW_F0402); /* (2*CrL * FIX(0.40200)) */
+ rh = _mm_mulhi_pi16(crh2, PW_F0402); /* (2*CrH * FIX(0.40200)) */
+
+ bl = _mm_add_pi16(bl, PW_ONE);
+ bh = _mm_add_pi16(bh, PW_ONE);
+ bl = _mm_srai_pi16(bl, 1); /* (CbL * -FIX(0.22800)) */
+ bh = _mm_srai_pi16(bh, 1); /* (CbH * -FIX(0.22800)) */
+ rl = _mm_add_pi16(rl, PW_ONE);
+ rh = _mm_add_pi16(rh, PW_ONE);
+ rl = _mm_srai_pi16(rl, 1); /* (CrL * FIX(0.40200)) */
+ rh = _mm_srai_pi16(rh, 1); /* (CrH * FIX(0.40200)) */
+
+ bl = _mm_add_pi16(bl, cbl);
+ bh = _mm_add_pi16(bh, cbh);
+ bl = _mm_add_pi16(bl, cbl); /* (CbL * FIX(1.77200))=(B-Y)L */
+ bh = _mm_add_pi16(bh, cbh); /* (CbH * FIX(1.77200))=(B-Y)H */
+ rl = _mm_add_pi16(rl, crl); /* (CrL * FIX(1.40200))=(R-Y)L */
+ rh = _mm_add_pi16(rh, crh); /* (CrH * FIX(1.40200))=(R-Y)H */
+
+ ga = _mm_unpacklo_pi16(cbl, crl);
+ gb = _mm_unpackhi_pi16(cbl, crl);
+ ga = _mm_madd_pi16(ga, PW_MF0344_F0285);
+ gb = _mm_madd_pi16(gb, PW_MF0344_F0285);
+ gc = _mm_unpacklo_pi16(cbh, crh);
+ gd = _mm_unpackhi_pi16(cbh, crh);
+ gc = _mm_madd_pi16(gc, PW_MF0344_F0285);
+ gd = _mm_madd_pi16(gd, PW_MF0344_F0285);
+
+ ga = _mm_add_pi32(ga, PD_ONEHALF);
+ gb = _mm_add_pi32(gb, PD_ONEHALF);
+ ga = _mm_srai_pi32(ga, SCALEBITS);
+ gb = _mm_srai_pi32(gb, SCALEBITS);
+ gc = _mm_add_pi32(gc, PD_ONEHALF);
+ gd = _mm_add_pi32(gd, PD_ONEHALF);
+ gc = _mm_srai_pi32(gc, SCALEBITS);
+ gd = _mm_srai_pi32(gd, SCALEBITS);
+
+ gl = _mm_packs_pi32(ga, gb); /* CbL*-FIX(0.344)+CrL*FIX(0.285) */
+ gh = _mm_packs_pi32(gc, gd); /* CbH*-FIX(0.344)+CrH*FIX(0.285) */
+ gl = _mm_sub_pi16(gl, crl); /* CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L */
+ gh = _mm_sub_pi16(gh, crh); /* CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H */
+
+ ythise = _mm_and_si64(mask, ythis); /* Y(0246) */
+ ythiso = _mm_srli_pi16(ythis, BYTE_BIT); /* Y(1357) */
+ ynexte = _mm_and_si64(mask, ynext); /* Y(8ACE) */
+ ynexto = _mm_srli_pi16(ynext, BYTE_BIT); /* Y(9BDF) */
+
+ rle = _mm_add_pi16(rl, ythise); /* (R0 R2 R4 R6) */
+ rlo = _mm_add_pi16(rl, ythiso); /* (R1 R3 R5 R7) */
+ rhe = _mm_add_pi16(rh, ynexte); /* (R8 RA RC RE) */
+ rho = _mm_add_pi16(rh, ynexto); /* (R9 RB RD RF) */
+ re = _mm_packs_pu16(rle, rhe); /* (R0 R2 R4 R6 R8 RA RC RE) */
+ ro = _mm_packs_pu16(rlo, rho); /* (R1 R3 R5 R7 R9 RB RD RF) */
+
+ gle = _mm_add_pi16(gl, ythise); /* (G0 G2 G4 G6) */
+ glo = _mm_add_pi16(gl, ythiso); /* (G1 G3 G5 G7) */
+ ghe = _mm_add_pi16(gh, ynexte); /* (G8 GA GC GE) */
+ gho = _mm_add_pi16(gh, ynexto); /* (G9 GB GD GF) */
+ ge = _mm_packs_pu16(gle, ghe); /* (G0 G2 G4 G6 G8 GA GC GE) */
+ go = _mm_packs_pu16(glo, gho); /* (G1 G3 G5 G7 G9 GB GD GF) */
+
+ ble = _mm_add_pi16(bl, ythise); /* (B0 B2 B4 B6) */
+ blo = _mm_add_pi16(bl, ythiso); /* (B1 B3 B5 B7) */
+ bhe = _mm_add_pi16(bh, ynexte); /* (B8 BA BC BE) */
+ bho = _mm_add_pi16(bh, ynexto); /* (B9 BB BD BF) */
+ be = _mm_packs_pu16(ble, bhe); /* (B0 B2 B4 B6 B8 BA BC BE) */
+ bo = _mm_packs_pu16(blo, bho); /* (B1 B3 B5 B7 B9 BB BD BF) */
+
+#if RGB_PIXELSIZE == 3
+
+ /* mmA=(00 02 04 06 08 0A 0C 0E), mmB=(01 03 05 07 09 0B 0D 0F) */
+ /* mmC=(10 12 14 16 18 1A 1C 1E), mmD=(11 13 15 17 19 1B 1D 1F) */
+ /* mmE=(20 22 24 26 28 2A 2C 2E), mmF=(21 23 25 27 29 2B 2D 2F) */
+ mmG = _mm_unpacklo_pi8(mmA, mmC); /* (00 10 02 12 04 14 06 16) */
+ mmA = _mm_unpackhi_pi8(mmA, mmC); /* (08 18 0A 1A 0C 1C 0E 1E) */
+ mmH = _mm_unpacklo_pi8(mmE, mmB); /* (20 01 22 03 24 05 26 07) */
+ mmE = _mm_unpackhi_pi8(mmE, mmB); /* (28 09 2A 0B 2C 0D 2E 0F) */
+ mmC = _mm_unpacklo_pi8(mmD, mmF); /* (11 21 13 23 15 25 17 27) */
+ mmD = _mm_unpackhi_pi8(mmD, mmF); /* (19 29 1B 2B 1D 2D 1F 2F) */
+
+ mmB = _mm_unpacklo_pi16(mmG, mmA); /* (00 10 08 18 02 12 0A 1A) */
+ mmA = _mm_unpackhi_pi16(mmG, mmA); /* (04 14 0C 1C 06 16 0E 1E) */
+ mmF = _mm_unpacklo_pi16(mmH, mmE); /* (20 01 28 09 22 03 2A 0B) */
+ mmE = _mm_unpackhi_pi16(mmH, mmE); /* (24 05 2C 0D 26 07 2E 0F) */
+ mmH = _mm_unpacklo_pi16(mmC, mmD); /* (11 21 19 29 13 23 1B 2B) */
+ mmG = _mm_unpackhi_pi16(mmC, mmD); /* (15 25 1D 2D 17 27 1F 2F) */
+
+ mmC = _mm_unpacklo_pi16(mmB, mmF); /* (00 10 20 01 08 18 28 09) */
+ mmB = _mm_srli_si64(mmB, 4 * BYTE_BIT);
+ mmB = _mm_unpacklo_pi16(mmH, mmB); /* (11 21 02 12 19 29 0A 1A) */
+ mmD = _mm_unpackhi_pi16(mmF, mmH); /* (22 03 13 23 2A 0B 1B 2B) */
+ mmF = _mm_unpacklo_pi16(mmA, mmE); /* (04 14 24 05 0C 1C 2C 0D) */
+ mmA = _mm_srli_si64(mmA, 4 * BYTE_BIT);
+ mmH = _mm_unpacklo_pi16(mmG, mmA); /* (15 25 06 16 1D 2D 0E 1E) */
+ mmG = _mm_unpackhi_pi16(mmE, mmG); /* (26 07 17 27 2E 0F 1F 2F) */
+
+ mmA = _mm_unpacklo_pi32(mmC, mmB); /* (00 10 20 01 11 21 02 12) */
+ mmE = _mm_unpackhi_pi32(mmC, mmB); /* (08 18 28 09 19 29 0A 1A) */
+ mmB = _mm_unpacklo_pi32(mmD, mmF); /* (22 03 13 23 04 14 24 05) */
+ mmF = _mm_unpackhi_pi32(mmD, mmF); /* (2A 0B 1B 2B 0C 1C 2C 0D) */
+ mmC = _mm_unpacklo_pi32(mmH, mmG); /* (15 25 06 16 26 07 17 27) */
+ mmG = _mm_unpackhi_pi32(mmH, mmG); /* (1D 2D 0E 1E 2E 0F 1F 2F) */
+
+ if (num_cols >= 8) {
+ if (!(((long)outptr) & 7)) {
+ _mm_store_si64((__m64 *)outptr, mmA);
+ _mm_store_si64((__m64 *)(outptr + 8), mmB);
+ _mm_store_si64((__m64 *)(outptr + 16), mmC);
+ _mm_store_si64((__m64 *)(outptr + 24), mmE);
+ _mm_store_si64((__m64 *)(outptr + 32), mmF);
+ _mm_store_si64((__m64 *)(outptr + 40), mmG);
+ } else {
+ _mm_storeu_si64((__m64 *)outptr, mmA);
+ _mm_storeu_si64((__m64 *)(outptr + 8), mmB);
+ _mm_storeu_si64((__m64 *)(outptr + 16), mmC);
+ _mm_storeu_si64((__m64 *)(outptr + 24), mmE);
+ _mm_storeu_si64((__m64 *)(outptr + 32), mmF);
+ _mm_storeu_si64((__m64 *)(outptr + 40), mmG);
+ }
+ outptr += RGB_PIXELSIZE * 16;
+ } else {
+ if (output_width & 1)
+ col = num_cols * 6 + 3;
+ else
+ col = num_cols * 6;
+
+ asm(".set noreorder\r\n" /* st24 */
+
+ "li $8, 24\r\n"
+ "move $9, %7\r\n"
+ "mov.s $f4, %1\r\n"
+ "mov.s $f6, %2\r\n"
+ "mov.s $f8, %3\r\n"
+ "move $10, %8\r\n"
+ "bltu $9, $8, 1f\r\n"
+ "nop \r\n"
+ "gssdlc1 $f4, 7($10)\r\n"
+ "gssdrc1 $f4, 0($10)\r\n"
+ "gssdlc1 $f6, 7+8($10)\r\n"
+ "gssdrc1 $f6, 8($10)\r\n"
+ "gssdlc1 $f8, 7+16($10)\r\n"
+ "gssdrc1 $f8, 16($10)\r\n"
+ "mov.s $f4, %4\r\n"
+ "mov.s $f6, %5\r\n"
+ "mov.s $f8, %6\r\n"
+ "subu $9, $9, 24\r\n"
+ PTR_ADDU "$10, $10, 24\r\n"
+
+ "1: \r\n"
+ "li $8, 16\r\n" /* st16 */
+ "bltu $9, $8, 2f\r\n"
+ "nop \r\n"
+ "gssdlc1 $f4, 7($10)\r\n"
+ "gssdrc1 $f4, 0($10)\r\n"
+ "gssdlc1 $f6, 7+8($10)\r\n"
+ "gssdrc1 $f6, 8($10)\r\n"
+ "mov.s $f4, $f8\r\n"
+ "subu $9, $9, 16\r\n"
+ PTR_ADDU "$10, $10, 16\r\n"
+
+ "2: \r\n"
+ "li $8, 8\r\n" /* st8 */
+ "bltu $9, $8, 3f\r\n"
+ "nop \r\n"
+ "gssdlc1 $f4, 7($10)\r\n"
+ "gssdrc1 $f4, 0($10)\r\n"
+ "mov.s $f4, $f6\r\n"
+ "subu $9, $9, 8\r\n"
+ PTR_ADDU "$10, $10, 8\r\n"
+
+ "3: \r\n"
+ "li $8, 4\r\n" /* st4 */
+ "mfc1 $11, $f4\r\n"
+ "bltu $9, $8, 4f\r\n"
+ "nop \r\n"
+ "swl $11, 3($10)\r\n"
+ "swr $11, 0($10)\r\n"
+ "li $8, 32\r\n"
+ "mtc1 $8, $f6\r\n"
+ "dsrl $f4, $f4, $f6\r\n"
+ "mfc1 $11, $f4\r\n"
+ "subu $9, $9, 4\r\n"
+ PTR_ADDU "$10, $10, 4\r\n"
+
+ "4: \r\n"
+ "li $8, 2\r\n" /* st2 */
+ "bltu $9, $8, 5f\r\n"
+ "nop \r\n"
+ "ush $11, 0($10)\r\n"
+ "srl $11, 16\r\n"
+ "subu $9, $9, 2\r\n"
+ PTR_ADDU "$10, $10, 2\r\n"
+
+ "5: \r\n"
+ "li $8, 1\r\n" /* st1 */
+ "bltu $9, $8, 6f\r\n"
+ "nop \r\n"
+ "sb $11, 0($10)\r\n"
+
+ "6: \r\n"
+ "nop \r\n" /* end */
+ : "=m" (*outptr)
+ : "f" (mmA), "f" (mmB), "f" (mmC), "f" (mmE), "f" (mmF),
+ "f" (mmG), "r" (col), "r" (outptr)
+ : "$f4", "$f6", "$f8", "$8", "$9", "$10", "$11", "memory"
+ );
+ }
+
+#else /* RGB_PIXELSIZE == 4 */
+
+#ifdef RGBX_FILLER_0XFF
+ xe = _mm_cmpeq_pi8(xe, xe);
+ xo = _mm_cmpeq_pi8(xo, xo);
+#else
+ xe = _mm_xor_si64(xe, xe);
+ xo = _mm_xor_si64(xo, xo);
+#endif
+ /* mmA=(00 02 04 06 08 0A 0C 0E), mmB=(01 03 05 07 09 0B 0D 0F) */
+ /* mmC=(10 12 14 16 18 1A 1C 1E), mmD=(11 13 15 17 19 1B 1D 1F) */
+ /* mmE=(20 22 24 26 28 2A 2C 2E), mmF=(21 23 25 27 29 2B 2D 2F) */
+ /* mmG=(30 32 34 36 38 3A 3C 3E), mmH=(31 33 35 37 39 3B 3D 3F) */
+
+ mm8 = _mm_unpacklo_pi8(mmA, mmC); /* (00 10 02 12 04 14 06 16) */
+ mm9 = _mm_unpackhi_pi8(mmA, mmC); /* (08 18 0A 1A 0C 1C 0E 1E) */
+ mmA = _mm_unpacklo_pi8(mmE, mmG); /* (20 30 22 32 24 34 26 36) */
+ mmE = _mm_unpackhi_pi8(mmE, mmG); /* (28 38 2A 3A 2C 3C 2E 3E) */
+
+ mmG = _mm_unpacklo_pi8(mmB, mmD); /* (01 11 03 13 05 15 07 17) */
+ mmB = _mm_unpackhi_pi8(mmB, mmD); /* (09 19 0B 1B 0D 1D 0F 1F) */
+ mmD = _mm_unpacklo_pi8(mmF, mmH); /* (21 31 23 33 25 35 27 37) */
+ mmF = _mm_unpackhi_pi8(mmF, mmH); /* (29 39 2B 3B 2D 3D 2F 3F) */
+
+ mmH = _mm_unpacklo_pi16(mm8, mmA); /* (00 10 20 30 02 12 22 32) */
+ mm8 = _mm_unpackhi_pi16(mm8, mmA); /* (04 14 24 34 06 16 26 36) */
+ mmA = _mm_unpacklo_pi16(mmG, mmD); /* (01 11 21 31 03 13 23 33) */
+ mmD = _mm_unpackhi_pi16(mmG, mmD); /* (05 15 25 35 07 17 27 37) */
+
+ mmG = _mm_unpackhi_pi16(mm9, mmE); /* (0C 1C 2C 3C 0E 1E 2E 3E) */
+ mm9 = _mm_unpacklo_pi16(mm9, mmE); /* (08 18 28 38 0A 1A 2A 3A) */
+ mmE = _mm_unpacklo_pi16(mmB, mmF); /* (09 19 29 39 0B 1B 2B 3B) */
+ mmF = _mm_unpackhi_pi16(mmB, mmF); /* (0D 1D 2D 3D 0F 1F 2F 3F) */
+
+ mmB = _mm_unpackhi_pi32(mmH, mmA); /* (02 12 22 32 03 13 23 33) */
+ mmA = _mm_unpacklo_pi32(mmH, mmA); /* (00 10 20 30 01 11 21 31) */
+ mmC = _mm_unpacklo_pi32(mm8, mmD); /* (04 14 24 34 05 15 25 35) */
+ mmD = _mm_unpackhi_pi32(mm8, mmD); /* (06 16 26 36 07 17 27 37) */
+
+ mmH = _mm_unpackhi_pi32(mmG, mmF); /* (0E 1E 2E 3E 0F 1F 2F 3F) */
+ mmG = _mm_unpacklo_pi32(mmG, mmF); /* (0C 1C 2C 3C 0D 1D 2D 3D) */
+ mmF = _mm_unpackhi_pi32(mm9, mmE); /* (0A 1A 2A 3A 0B 1B 2B 3B) */
+ mmE = _mm_unpacklo_pi32(mm9, mmE); /* (08 18 28 38 09 19 29 39) */
+
+ if (num_cols >= 8) {
+ if (!(((long)outptr) & 7)) {
+ _mm_store_si64((__m64 *)outptr, mmA);
+ _mm_store_si64((__m64 *)(outptr + 8), mmB);
+ _mm_store_si64((__m64 *)(outptr + 16), mmC);
+ _mm_store_si64((__m64 *)(outptr + 24), mmD);
+ _mm_store_si64((__m64 *)(outptr + 32), mmE);
+ _mm_store_si64((__m64 *)(outptr + 40), mmF);
+ _mm_store_si64((__m64 *)(outptr + 48), mmG);
+ _mm_store_si64((__m64 *)(outptr + 56), mmH);
+ } else {
+ _mm_storeu_si64((__m64 *)outptr, mmA);
+ _mm_storeu_si64((__m64 *)(outptr + 8), mmB);
+ _mm_storeu_si64((__m64 *)(outptr + 16), mmC);
+ _mm_storeu_si64((__m64 *)(outptr + 24), mmD);
+ _mm_storeu_si64((__m64 *)(outptr + 32), mmE);
+ _mm_storeu_si64((__m64 *)(outptr + 40), mmF);
+ _mm_storeu_si64((__m64 *)(outptr + 48), mmG);
+ _mm_storeu_si64((__m64 *)(outptr + 56), mmH);
+ }
+ outptr += RGB_PIXELSIZE * 16;
+ } else {
+ if (output_width & 1)
+ col = num_cols * 2 + 1;
+ else
+ col = num_cols * 2;
+ asm(".set noreorder\r\n" /* st32 */
+
+ "li $8, 8\r\n"
+ "move $9, %10\r\n"
+ "move $10, %11\r\n"
+ "mov.s $f4, %2\r\n"
+ "mov.s $f6, %3\r\n"
+ "mov.s $f8, %4\r\n"
+ "mov.s $f10, %5\r\n"
+ "bltu $9, $8, 1f\r\n"
+ "nop \r\n"
+ "gssdlc1 $f4, 7($10)\r\n"
+ "gssdrc1 $f4, 0($10)\r\n"
+ "gssdlc1 $f6, 7+8($10)\r\n"
+ "gssdrc1 $f6, 8($10)\r\n"
+ "gssdlc1 $f8, 7+16($10)\r\n"
+ "gssdrc1 $f8, 16($10)\r\n"
+ "gssdlc1 $f10, 7+24($10)\r\n"
+ "gssdrc1 $f10, 24($10)\r\n"
+ "mov.s $f4, %6\r\n"
+ "mov.s $f6, %7\r\n"
+ "mov.s $f8, %8\r\n"
+ "mov.s $f10, %9\r\n"
+ "subu $9, $9, 8\r\n"
+ PTR_ADDU "$10, $10, 32\r\n"
+
+ "1: \r\n"
+ "li $8, 4\r\n" /* st16 */
+ "bltu $9, $8, 2f\r\n"
+ "nop \r\n"
+ "gssdlc1 $f4, 7($10)\r\n"
+ "gssdrc1 $f4, 0($10)\r\n"
+ "gssdlc1 $f6, 7+8($10)\r\n"
+ "gssdrc1 $f6, 8($10)\r\n"
+ "mov.s $f4, $f8\r\n"
+ "mov.s $f6, $f10\r\n"
+ "subu $9, $9, 4\r\n"
+ PTR_ADDU "$10, $10, 16\r\n"
+
+ "2: \r\n"
+ "li $8, 2\r\n" /* st8 */
+ "bltu $9, $8, 3f\r\n"
+ "nop \r\n"
+ "gssdlc1 $f4, 7($10)\r\n"
+ "gssdrc1 $f4, 0($10)\r\n"
+ "mov.s $f4, $f6\r\n"
+ "subu $9, $9, 2\r\n"
+ PTR_ADDU "$10, $10, 8\r\n"
+
+ "3: \r\n"
+ "li $8, 1\r\n" /* st4 */
+ "bltu $9, $8, 4f\r\n"
+ "nop \r\n"
+ "gsswlc1 $f4, 3($10)\r\n"
+ "gsswrc1 $f4, 0($10)\r\n"
+
+ "4: \r\n"
+ "li %1, 0\r\n" /* end */
+ : "=m" (*outptr), "=r" (col)
+ : "f" (mmA), "f" (mmB), "f" (mmC), "f" (mmD), "f" (mmE), "f" (mmF),
+ "f" (mmG), "f" (mmH), "r" (col), "r" (outptr)
+ : "$f4", "$f6", "$f8", "$f10", "$8", "$9", "$10", "memory"
+ );
+ }
+
+#endif
+
+ }
+
+ if (!((output_width >> 1) & 7)) {
+ if (output_width & 1) {
+ cb = _mm_load_si64((__m64 *)inptr1);
+ cr = _mm_load_si64((__m64 *)inptr2);
+ y = _mm_load_si64((__m64 *)inptr0);
+
+ decenter = 0.0;
+ decenter = _mm_cmpeq_pi16(decenter, decenter);
+ decenter = _mm_slli_pi16(decenter, 7); /* {0xFF80 0xFF80 0xFF80 0xFF80} */
+
+ cbl = _mm_unpacklo_pi8(cb, zero); /* Cb(0123) */
+ crl = _mm_unpacklo_pi8(cr, zero); /* Cr(0123) */
+ cbl = _mm_add_pi16(cbl, decenter);
+ crl = _mm_add_pi16(crl, decenter);
+
+ cbl2 = _mm_add_pi16(cbl, cbl); /* 2*CbL */
+ crl2 = _mm_add_pi16(crl, crl); /* 2*CrL */
+ bl = _mm_mulhi_pi16(cbl2, PW_MF0228); /* (2*CbL * -FIX(0.22800)) */
+ rl = _mm_mulhi_pi16(crl2, PW_F0402); /* (2*CrL * FIX(0.40200)) */
+
+ bl = _mm_add_pi16(bl, PW_ONE);
+ bl = _mm_srai_pi16(bl, 1); /* (CbL * -FIX(0.22800)) */
+ rl = _mm_add_pi16(rl, PW_ONE);
+ rl = _mm_srai_pi16(rl, 1); /* (CrL * FIX(0.40200)) */
+
+ bl = _mm_add_pi16(bl, cbl);
+ bl = _mm_add_pi16(bl, cbl); /* (CbL * FIX(1.77200))=(B-Y)L */
+ rl = _mm_add_pi16(rl, crl); /* (CrL * FIX(1.40200))=(R-Y)L */
+
+ gl = _mm_unpacklo_pi16(cbl, crl);
+ gl = _mm_madd_pi16(gl, PW_MF0344_F0285);
+ gl = _mm_add_pi32(gl, PD_ONEHALF);
+ gl = _mm_srai_pi32(gl, SCALEBITS);
+ gl = _mm_packs_pi32(gl, zero); /* CbL*-FIX(0.344)+CrL*FIX(0.285) */
+ gl = _mm_sub_pi16(gl, crl); /* CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L */
+
+ yl = _mm_unpacklo_pi8(y, zero); /* Y(0123) */
+ rl = _mm_add_pi16(rl, yl); /* (R0 R1 R2 R3) */
+ gl = _mm_add_pi16(gl, yl); /* (G0 G1 G2 G3) */
+ bl = _mm_add_pi16(bl, yl); /* (B0 B1 B2 B3) */
+ re = _mm_packs_pu16(rl, rl);
+ ge = _mm_packs_pu16(gl, gl);
+ be = _mm_packs_pu16(bl, bl);
+#if RGB_PIXELSIZE == 3
+ mmA = _mm_unpacklo_pi8(mmA, mmC);
+ mmA = _mm_unpacklo_pi16(mmA, mmE);
+ asm(".set noreorder\r\n"
+
+ "move $8, %2\r\n"
+ "mov.s $f4, %1\r\n"
+ "mfc1 $9, $f4\r\n"
+ "ush $9, 0($8)\r\n"
+ "srl $9, 16\r\n"
+ "sb $9, 2($8)\r\n"
+ : "=m" (*outptr)
+ : "f" (mmA), "r" (outptr)
+ : "$f4", "$8", "$9", "memory"
+ );
+#else /* RGB_PIXELSIZE == 4 */
+
+#ifdef RGBX_FILLER_0XFF
+ xe = _mm_cmpeq_pi8(xe, xe);
+#else
+ xe = _mm_xor_si64(xe, xe);
+#endif
+ mmA = _mm_unpacklo_pi8(mmA, mmC);
+ mmE = _mm_unpacklo_pi8(mmE, mmG);
+ mmA = _mm_unpacklo_pi16(mmA, mmE);
+ asm(".set noreorder\r\n"
+
+ "move $8, %2\r\n"
+ "mov.s $f4, %1\r\n"
+ "gsswlc1 $f4, 3($8)\r\n"
+ "gsswrc1 $f4, 0($8)\r\n"
+ : "=m" (*outptr)
+ : "f" (mmA), "r" (outptr)
+ : "$f4", "$8", "memory"
+ );
+#endif
+ }
+ }
+}
+
+
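+/* A row group covers two luma rows that share one chroma row.  The h2v2
+ * routine below reuses the h2v1 kernel: it temporarily aliases the Y row
+ * pointer to each of the two rows in the group (2 * in_row_group_ctr and
+ * 2 * in_row_group_ctr + 1), calls the h2v1 kernel once per output row
+ * with the same Cb/Cr rows, and then restores the saved pointers.
+ */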
+void jsimd_h2v2_merged_upsample_mmi(JDIMENSION output_width,
+ JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf)
+{
+ JSAMPROW inptr, outptr;
+
+ inptr = input_buf[0][in_row_group_ctr];
+ outptr = output_buf[0];
+
+ input_buf[0][in_row_group_ctr] = input_buf[0][in_row_group_ctr * 2];
+ jsimd_h2v1_merged_upsample_mmi(output_width, input_buf, in_row_group_ctr,
+ output_buf);
+
+ input_buf[0][in_row_group_ctr] = input_buf[0][in_row_group_ctr * 2 + 1];
+ output_buf[0] = output_buf[1];
+ jsimd_h2v1_merged_upsample_mmi(output_width, input_buf, in_row_group_ctr,
+ output_buf);
+
+ input_buf[0][in_row_group_ctr] = inptr;
+ output_buf[0] = outptr;
+}
+
+
+#undef mmA
+#undef mmB
+#undef mmC
+#undef mmD
+#undef mmE
+#undef mmF
+#undef mmG
+#undef mmH
diff --git a/media/libjpeg/simd/mips64/jdsample-mmi.c b/media/libjpeg/simd/mips64/jdsample-mmi.c
new file mode 100644
index 0000000000..8ae94e7dcf
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jdsample-mmi.c
@@ -0,0 +1,304 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, 2018-2019, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ * All Rights Reserved.
+ * Authors: ZhuChen <zhuchen@loongson.cn>
+ * CaiWanwei <caiwanwei@loongson.cn>
+ * SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ * ZhangLixia <zhanglixia-hf@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* CHROMA UPSAMPLING */
+
+#include "jsimd_mmi.h"
+
+
+enum const_index {
+ index_PW_ONE,
+ index_PW_TWO,
+ index_PW_THREE,
+ index_PW_SEVEN,
+ index_PW_EIGHT,
+};
+
+static uint64_t const_value[] = {
+ _uint64_set_pi16(1, 1, 1, 1),
+ _uint64_set_pi16(2, 2, 2, 2),
+ _uint64_set_pi16(3, 3, 3, 3),
+ _uint64_set_pi16(7, 7, 7, 7),
+ _uint64_set_pi16(8, 8, 8, 8),
+};
+
+#define PW_ONE get_const_value(index_PW_ONE)
+#define PW_TWO get_const_value(index_PW_TWO)
+#define PW_THREE get_const_value(index_PW_THREE)
+#define PW_SEVEN get_const_value(index_PW_SEVEN)
+#define PW_EIGHT get_const_value(index_PW_EIGHT)
+
+
+#define PROCESS_ROW(row, wkoffset, bias1, bias2, shift) { \
+ __m64 samp123X, samp3XXX, samp1234, sampX012, samp_1012; \
+ __m64 sampXXX4, sampX456, samp3456, samp567X, samp7XXX, samp5678; \
+ __m64 outle, outhe, outlo, outho, outl, outh; \
+ \
+ samp123X = _mm_srli_si64(samp0123, 2 * BYTE_BIT); /* ( 1 2 3 -) */ \
+ sampXXX4 = _mm_slli_si64(samp4567, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* ( - - - 4) */ \
+ samp3XXX = _mm_srli_si64(samp0123, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* ( 3 - - -) */ \
+ sampX456 = _mm_slli_si64(samp4567, 2 * BYTE_BIT); /* ( - 4 5 6) */ \
+ \
+ samp1234 = _mm_or_si64(samp123X, sampXXX4); /* ( 1 2 3 4) */ \
+ samp3456 = _mm_or_si64(samp3XXX, sampX456); /* ( 3 4 5 6) */ \
+ \
+ sampX012 = _mm_slli_si64(samp0123, 2 * BYTE_BIT); /* ( - 0 1 2) */ \
+ samp567X = _mm_srli_si64(samp4567, 2 * BYTE_BIT); /* ( 5 6 7 -) */ \
+ samp7XXX = _mm_srli_si64(samp4567, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* ( 7 - - -) */ \
+ \
+ samp_1012 = _mm_or_si64(sampX012, wk[row]); /* (-1 0 1 2) */ \
+ samp5678 = _mm_or_si64(samp567X, wk[row + wkoffset]); /* ( 5 6 7 8) */ \
+ \
+ wk[row] = samp7XXX; \
+ \
+ samp0123 = _mm_mullo_pi16(samp0123, PW_THREE); \
+ samp4567 = _mm_mullo_pi16(samp4567, PW_THREE); \
+ samp_1012 = _mm_add_pi16(samp_1012, bias1); \
+ samp3456 = _mm_add_pi16(samp3456, bias1); \
+ samp1234 = _mm_add_pi16(samp1234, bias2); \
+ samp5678 = _mm_add_pi16(samp5678, bias2); \
+ \
+ outle = _mm_add_pi16(samp_1012, samp0123); \
+ outhe = _mm_add_pi16(samp3456, samp4567); \
+ outle = _mm_srli_pi16(outle, shift); /* ( 0 2 4 6) */ \
+ outhe = _mm_srli_pi16(outhe, shift); /* ( 8 10 12 14) */ \
+ outlo = _mm_add_pi16(samp1234, samp0123); \
+ outho = _mm_add_pi16(samp5678, samp4567); \
+ outlo = _mm_srli_pi16(outlo, shift); /* ( 1 3 5 7) */ \
+ outho = _mm_srli_pi16(outho, shift); /* ( 9 11 13 15) */ \
+ \
+ outlo = _mm_slli_pi16(outlo, BYTE_BIT); \
+ outho = _mm_slli_pi16(outho, BYTE_BIT); \
+ outl = _mm_or_si64(outle, outlo); /* ( 0 1 2 3 4 5 6 7) */ \
+ outh = _mm_or_si64(outhe, outho); /* ( 8 9 10 11 12 13 14 15) */ \
+ \
+ _mm_store_si64((__m64 *)outptr##row, outl); \
+ _mm_store_si64((__m64 *)outptr##row + 1, outh); \
+}
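+
+/* PROCESS_ROW applies the "fancy" (triangular-filter) upsampler to one
+ * 8-sample block held in samp0123/samp4567: each input sample s[i] yields
+ * two outputs weighted 3:1 toward the nearer neighbor,
+ *   even: (3*s[i] + s[i-1] + bias1) >> shift
+ *   odd:  (3*s[i] + s[i+1] + bias2) >> shift
+ * with wk[] carrying the boundary samples between blocks.  h2v1 passes
+ * biases 1/2 and shift 2 (total weight 4); h2v2 runs the macro on 3:1
+ * vertical column sums, hence biases 8/7 and shift 4 (total weight 16).
+ */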
+
+void jsimd_h2v2_fancy_upsample_mmi(int max_v_samp_factor,
+ JDIMENSION downsampled_width,
+ JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr)
+{
+ JSAMPARRAY output_data = *output_data_ptr;
+ JSAMPROW inptr_1, inptr0, inptr1, outptr0, outptr1;
+ int inrow, outrow, incol, tmp, tmp1;
+ __m64 this_1l, this_1h, this_1, thiscolsum_1l, thiscolsum_1h;
+ __m64 this0l, this0h, this0;
+ __m64 this1l, this1h, this1, thiscolsum1l, thiscolsum1h;
+ __m64 next_1l, next_1h, next_1, nextcolsum_1l, nextcolsum_1h;
+ __m64 next0l, next0h, next0;
+ __m64 next1l, next1h, next1, nextcolsum1l, nextcolsum1h;
+ __m64 mask0 = 0.0, masklast, samp0123, samp4567, wk[4], zero = 0.0;
+
+ mask0 = _mm_cmpeq_pi8(mask0, mask0);
+ masklast = _mm_slli_si64(mask0, (SIZEOF_MMWORD - 2) * BYTE_BIT);
+ mask0 = _mm_srli_si64(mask0, (SIZEOF_MMWORD - 2) * BYTE_BIT);
+
+ for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {
+
+ inptr_1 = input_data[inrow - 1];
+ inptr0 = input_data[inrow];
+ inptr1 = input_data[inrow + 1];
+ outptr0 = output_data[outrow++];
+ outptr1 = output_data[outrow++];
+
+ if (downsampled_width & 7) {
+ tmp = (downsampled_width - 1) * sizeof(JSAMPLE);
+ tmp1 = downsampled_width * sizeof(JSAMPLE);
+ asm(PTR_ADDU "$8, %3, %6\r\n"
+ "lb $9, ($8)\r\n"
+ PTR_ADDU "$8, %3, %7\r\n"
+ "sb $9, ($8)\r\n"
+ PTR_ADDU "$8, %4, %6\r\n"
+ "lb $9, ($8)\r\n"
+ PTR_ADDU "$8, %4, %7\r\n"
+ "sb $9, ($8)\r\n"
+ PTR_ADDU "$8, %5, %6\r\n"
+ "lb $9, ($8)\r\n"
+ PTR_ADDU "$8, %5, %7\r\n"
+ "sb $9, ($8)\r\n"
+ : "=m" (*inptr_1), "=m" (*inptr0), "=m" (*inptr1)
+ : "r" (inptr_1), "r" (inptr0), "r" (inptr1), "r" (tmp), "r" (tmp1)
+ : "$8", "$9"
+ );
+ }
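+ /* The asm above copies the last sample of each input row to the element
+  * one past the end (index downsampled_width), so the right-hand neighbor
+  * of the final valid sample is well defined when the width is not a
+  * multiple of 8; any further lanes fall outside the output and are
+  * ignored.  In effect this is a one-sample expand_right_edge in place.
+  */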
+
+ /* process the first column block */
+ this0 = _mm_load_si64((__m64 *)inptr0); /* row[ 0][0] */
+ this_1 = _mm_load_si64((__m64 *)inptr_1); /* row[-1][0] */
+ this1 = _mm_load_si64((__m64 *)inptr1); /* row[ 1][0] */
+
+ this0l = _mm_unpacklo_pi8(this0, zero); /* row[ 0][0]( 0 1 2 3) */
+ this0h = _mm_unpackhi_pi8(this0, zero); /* row[ 0][0]( 4 5 6 7) */
+ this_1l = _mm_unpacklo_pi8(this_1, zero); /* row[-1][0]( 0 1 2 3) */
+ this_1h = _mm_unpackhi_pi8(this_1, zero); /* row[-1][0]( 4 5 6 7) */
+ this1l = _mm_unpacklo_pi8(this1, zero); /* row[+1][0]( 0 1 2 3) */
+ this1h = _mm_unpackhi_pi8(this1, zero); /* row[+1][0]( 4 5 6 7) */
+
+ this0l = _mm_mullo_pi16(this0l, PW_THREE);
+ this0h = _mm_mullo_pi16(this0h, PW_THREE);
+
+ thiscolsum_1l = _mm_add_pi16(this_1l, this0l); /* ( 0 1 2 3) */
+ thiscolsum_1h = _mm_add_pi16(this_1h, this0h); /* ( 4 5 6 7) */
+ thiscolsum1l = _mm_add_pi16(this0l, this1l); /* ( 0 1 2 3) */
+ thiscolsum1h = _mm_add_pi16(this0h, this1h); /* ( 4 5 6 7) */
+
+ /* temporarily save the intermediate data */
+ _mm_store_si64((__m64 *)outptr0, thiscolsum_1l);
+ _mm_store_si64((__m64 *)outptr0 + 1, thiscolsum_1h);
+ _mm_store_si64((__m64 *)outptr1, thiscolsum1l);
+ _mm_store_si64((__m64 *)outptr1 + 1, thiscolsum1h);
+
+ wk[0] = _mm_and_si64(thiscolsum_1l, mask0); /* ( 0 - - -) */
+ wk[1] = _mm_and_si64(thiscolsum1l, mask0); /* ( 0 - - -) */
+
+ for (incol = downsampled_width; incol > 0;
+ incol -= 8, inptr_1 += 8, inptr0 += 8, inptr1 += 8,
+ outptr0 += 16, outptr1 += 16) {
+
+ if (incol > 8) {
+ /* process the next column block */
+ next0 = _mm_load_si64((__m64 *)inptr0 + 1); /* row[ 0][1] */
+ next_1 = _mm_load_si64((__m64 *)inptr_1 + 1); /* row[-1][1] */
+ next1 = _mm_load_si64((__m64 *)inptr1 + 1); /* row[+1][1] */
+
+ next0l = _mm_unpacklo_pi8(next0, zero); /* row[ 0][1]( 0 1 2 3) */
+ next0h = _mm_unpackhi_pi8(next0, zero); /* row[ 0][1]( 4 5 6 7) */
+ next_1l = _mm_unpacklo_pi8(next_1, zero); /* row[-1][1]( 0 1 2 3) */
+ next_1h = _mm_unpackhi_pi8(next_1, zero); /* row[-1][1]( 4 5 6 7) */
+ next1l = _mm_unpacklo_pi8(next1, zero); /* row[+1][1]( 0 1 2 3) */
+ next1h = _mm_unpackhi_pi8(next1, zero); /* row[+1][1]( 4 5 6 7) */
+
+ next0l = _mm_mullo_pi16(next0l, PW_THREE);
+ next0h = _mm_mullo_pi16(next0h, PW_THREE);
+
+ nextcolsum_1l = _mm_add_pi16(next_1l, next0l); /* ( 0 1 2 3) */
+ nextcolsum_1h = _mm_add_pi16(next_1h, next0h); /* ( 4 5 6 7) */
+ nextcolsum1l = _mm_add_pi16(next0l, next1l); /* ( 0 1 2 3) */
+ nextcolsum1h = _mm_add_pi16(next0h, next1h); /* ( 4 5 6 7) */
+
+ /* temporarily save the intermediate data */
+ _mm_store_si64((__m64 *)outptr0 + 2, nextcolsum_1l);
+ _mm_store_si64((__m64 *)outptr0 + 3, nextcolsum_1h);
+ _mm_store_si64((__m64 *)outptr1 + 2, nextcolsum1l);
+ _mm_store_si64((__m64 *)outptr1 + 3, nextcolsum1h);
+
+ wk[2] = _mm_slli_si64(nextcolsum_1l, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* ( - - - 0) */
+ wk[3] = _mm_slli_si64(nextcolsum1l, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* ( - - - 0) */
+ } else {
+ __m64 tmp;
+
+ /* process the last column block */
+ tmp = _mm_load_si64((__m64 *)outptr0 + 1);
+ wk[2] = _mm_and_si64(masklast, tmp); /* ( - - - 7) */
+ tmp = _mm_load_si64((__m64 *)outptr1 + 1);
+ wk[3] = _mm_and_si64(masklast, tmp); /* ( - - - 7) */
+ }
+
+ /* process the upper row */
+ samp0123 = _mm_load_si64((__m64 *)outptr0); /* ( 0 1 2 3) */
+ samp4567 = _mm_load_si64((__m64 *)outptr0 + 1); /* ( 4 5 6 7) */
+ PROCESS_ROW(0, 2, PW_EIGHT, PW_SEVEN, 4)
+
+ /* process the lower row */
+ samp0123 = _mm_load_si64((__m64 *)outptr1); /* ( 0 1 2 3) */
+ samp4567 = _mm_load_si64((__m64 *)outptr1 + 1); /* ( 4 5 6 7) */
+ PROCESS_ROW(1, 2, PW_EIGHT, PW_SEVEN, 4)
+ }
+ }
+}
+
+
+void jsimd_h2v1_fancy_upsample_mmi(int max_v_samp_factor,
+ JDIMENSION downsampled_width,
+ JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr)
+{
+ JSAMPARRAY output_data = *output_data_ptr;
+ JSAMPROW inptr0, outptr0;
+ int inrow, incol, tmp, tmp1;
+ __m64 thisl, this, nextl, next;
+ __m64 mask0 = 0.0, masklast, samp0123, samp4567, wk[2], zero = 0.0;
+
+ mask0 = _mm_cmpeq_pi8(mask0, mask0);
+ masklast = _mm_slli_si64(mask0, (SIZEOF_MMWORD - 2) * BYTE_BIT);
+ mask0 = _mm_srli_si64(mask0, (SIZEOF_MMWORD - 2) * BYTE_BIT);
+
+ for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
+
+ inptr0 = input_data[inrow];
+ outptr0 = output_data[inrow];
+
+ if (downsampled_width & 7) {
+ tmp = (downsampled_width - 1) * sizeof(JSAMPLE);
+ tmp1 = downsampled_width * sizeof(JSAMPLE);
+ asm(PTR_ADDU "$8, %1, %2\r\n"
+ "lb $9, ($8)\r\n"
+ PTR_ADDU "$8, %1, %3\r\n"
+ "sb $9, ($8)\r\n"
+ : "=m" (*inptr0)
+ : "r" (inptr0), "r" (tmp), "r" (tmp1)
+ : "$8", "$9"
+ );
+ }
+
+ /* process the first column block */
+ this = _mm_load_si64((__m64 *)inptr0); /* row[ 0][0] */
+ thisl = _mm_unpacklo_pi8(this, zero); /* row[ 0][0]( 0 1 2 3) */
+ wk[0] = _mm_and_si64(thisl, mask0); /* ( 0 - - -) */
+
+ for (incol = downsampled_width; incol > 0;
+ incol -= 8, inptr0 += 8, outptr0 += 16) {
+
+ if (incol > 8) {
+ /* process the next column block */
+ next = _mm_load_si64((__m64 *)inptr0 + 1); /* row[ 0][1] */
+ nextl = _mm_unpacklo_pi8(next, zero); /* row[ 0][1]( 0 1 2 3) */
+ wk[1] = _mm_slli_si64(nextl, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* ( - - - 0) */
+ } else {
+ __m64 thish;
+
+ /* process the last column block */
+ this = _mm_load_si64((__m64 *)inptr0); /* row[ 0][0] */
+ thish = _mm_unpackhi_pi8(this, zero); /* row[ 0][0]( 4 5 6 7) */
+ wk[1] = _mm_and_si64(masklast, thish); /* ( - - - 7) */
+ }
+
+ /* process the row */
+ this = _mm_load_si64((__m64 *)inptr0); /* row[ 0][0] */
+ samp0123 = _mm_unpacklo_pi8(this, zero); /* ( 0 1 2 3) */
+ samp4567 = _mm_unpackhi_pi8(this, zero); /* ( 4 5 6 7) */
+ PROCESS_ROW(0, 1, PW_ONE, PW_TWO, 2)
+ }
+ }
+}
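The two routines above vectorize libjpeg's "fancy" (triangle-filter) upsampling: each output sample is a 3:1 weighted average of the nearest input sample and one neighbor, and the PW_ONE/PW_TWO (h2v1) and PW_EIGHT/PW_SEVEN (h2v2) constants fed to PROCESS_ROW are the rounding biases of that filter. A minimal scalar sketch of the h2v1 case follows, for illustration only; it is not part of the patch, the names are invented, and it replicates edge samples rather than special-casing the first and last columns the way libjpeg's C upsampler does.

#include <stddef.h>

typedef unsigned char sample_t;                  /* stands in for JSAMPLE */

/* h2v1 fancy upsampling, one row: out receives 2 * width samples */
static void h2v1_fancy_upsample_ref(const sample_t *in, sample_t *out,
                                    size_t width)
{
  size_t i;

  for (i = 0; i < width; i++) {
    int cur   = in[i] * 3;                       /* 3:1 weighting */
    int left  = in[i > 0 ? i - 1 : 0];           /* edge replication */
    int right = in[i + 1 < width ? i + 1 : width - 1];

    out[2 * i]     = (sample_t)((cur + left + 1) >> 2);   /* PW_ONE bias */
    out[2 * i + 1] = (sample_t)((cur + right + 2) >> 2);  /* PW_TWO bias */
  }
}

The h2v2 routine applies the same filter in both dimensions: the thiscolsum/nextcolsum stage above computes the vertical 3:1 sums, and PROCESS_ROW then combines them horizontally, which is why its biases are 8 and 7 with a final shift by 4 instead of 2.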
diff --git a/media/libjpeg/simd/mips64/jfdctfst-mmi.c b/media/libjpeg/simd/mips64/jfdctfst-mmi.c
new file mode 100644
index 0000000000..f7caf09a88
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jfdctfst-mmi.c
@@ -0,0 +1,255 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, 2018-2019, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ * All Rights Reserved.
+ * Authors: LiuQingfa <liuqingfa-hf@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* FAST INTEGER FORWARD DCT */
+
+#include "jsimd_mmi.h"
+
+
+#define CONST_BITS 8
+
+#define F_0_382 ((short)98) /* FIX(0.382683433) */
+#define F_0_541 ((short)139) /* FIX(0.541196100) */
+#define F_0_707 ((short)181) /* FIX(0.707106781) */
+#define F_1_306 ((short)334) /* FIX(1.306562965) */
+
+#define PRE_MULTIPLY_SCALE_BITS 2
+#define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+
+enum const_index {
+ index_PW_F0707,
+ index_PW_F0382,
+ index_PW_F0541,
+ index_PW_F1306
+};
+
+static uint64_t const_value[] = {
+ _uint64_set1_pi16(F_0_707),
+ _uint64_set1_pi16(F_0_382),
+ _uint64_set1_pi16(F_0_541),
+ _uint64_set1_pi16(F_1_306)
+};
+
+#define PW_F0707 get_const_value(index_PW_F0707)
+#define PW_F0382 get_const_value(index_PW_F0382)
+#define PW_F0541 get_const_value(index_PW_F0541)
+#define PW_F1306 get_const_value(index_PW_F1306)
+
+
+#define DO_FDCT_MULTIPLY(out, in, multiplier) { \
+ __m64 mulhi, mullo, mul12, mul34; \
+ \
+ mullo = _mm_mullo_pi16(in, multiplier); \
+ mulhi = _mm_mulhi_pi16(in, multiplier); \
+ mul12 = _mm_unpacklo_pi16(mullo, mulhi); \
+ mul34 = _mm_unpackhi_pi16(mullo, mulhi); \
+ mul12 = _mm_srai_pi32(mul12, CONST_BITS); \
+ mul34 = _mm_srai_pi32(mul34, CONST_BITS); \
+ out = _mm_packs_pi32(mul12, mul34); \
+}
+
+#define DO_FDCT_COMMON() { \
+ \
+ /* Even part */ \
+ \
+ tmp10 = _mm_add_pi16(tmp0, tmp3); \
+ tmp13 = _mm_sub_pi16(tmp0, tmp3); \
+ tmp11 = _mm_add_pi16(tmp1, tmp2); \
+ tmp12 = _mm_sub_pi16(tmp1, tmp2); \
+ \
+ out0 = _mm_add_pi16(tmp10, tmp11); \
+ out4 = _mm_sub_pi16(tmp10, tmp11); \
+ \
+ z1 = _mm_add_pi16(tmp12, tmp13); \
+ DO_FDCT_MULTIPLY(z1, z1, PW_F0707) \
+ \
+ out2 = _mm_add_pi16(tmp13, z1); \
+ out6 = _mm_sub_pi16(tmp13, z1); \
+ \
+ /* Odd part */ \
+ \
+ tmp10 = _mm_add_pi16(tmp4, tmp5); \
+ tmp11 = _mm_add_pi16(tmp5, tmp6); \
+ tmp12 = _mm_add_pi16(tmp6, tmp7); \
+ \
+ z5 = _mm_sub_pi16(tmp10, tmp12); \
+ DO_FDCT_MULTIPLY(z5, z5, PW_F0382) \
+ \
+ DO_FDCT_MULTIPLY(z2, tmp10, PW_F0541) \
+ z2 = _mm_add_pi16(z2, z5); \
+ \
+ DO_FDCT_MULTIPLY(z4, tmp12, PW_F1306) \
+ z4 = _mm_add_pi16(z4, z5); \
+ \
+ DO_FDCT_MULTIPLY(z3, tmp11, PW_F0707) \
+ \
+ z11 = _mm_add_pi16(tmp7, z3); \
+ z13 = _mm_sub_pi16(tmp7, z3); \
+ \
+ out5 = _mm_add_pi16(z13, z2); \
+ out3 = _mm_sub_pi16(z13, z2); \
+ out1 = _mm_add_pi16(z11, z4); \
+ out7 = _mm_sub_pi16(z11, z4); \
+}
+
+#define DO_FDCT_PASS1() { \
+ __m64 row0l, row0h, row1l, row1h, row2l, row2h, row3l, row3h; \
+ __m64 row01a, row01b, row01c, row01d, row23a, row23b, row23c, row23d; \
+ __m64 col0, col1, col2, col3, col4, col5, col6, col7; \
+ \
+ row0l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0]); /* (00 01 02 03) */ \
+ row0h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0 + 4]); /* (04 05 06 07) */ \
+ row1l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1]); /* (10 11 12 13) */ \
+ row1h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1 + 4]); /* (14 15 16 17) */ \
+ row2l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2]); /* (20 21 22 23) */ \
+ row2h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2 + 4]); /* (24 25 26 27) */ \
+ row3l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3]); /* (30 31 32 33) */ \
+ row3h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3 + 4]); /* (34 35 36 37) */ \
+ \
+ /* Transpose coefficients */ \
+ \
+ row23a = _mm_unpacklo_pi16(row2l, row3l); /* row23a=(20 30 21 31) */ \
+ row23b = _mm_unpackhi_pi16(row2l, row3l); /* row23b=(22 32 23 33) */ \
+ row23c = _mm_unpacklo_pi16(row2h, row3h); /* row23c=(24 34 25 35) */ \
+ row23d = _mm_unpackhi_pi16(row2h, row3h); /* row23d=(26 36 27 37) */ \
+ \
+ row01a = _mm_unpacklo_pi16(row0l, row1l); /* row01a=(00 10 01 11) */ \
+ row01b = _mm_unpackhi_pi16(row0l, row1l); /* row01b=(02 12 03 13) */ \
+ row01c = _mm_unpacklo_pi16(row0h, row1h); /* row01c=(04 14 05 15) */ \
+ row01d = _mm_unpackhi_pi16(row0h, row1h); /* row01d=(06 16 07 17) */ \
+ \
+ col0 = _mm_unpacklo_pi32(row01a, row23a); /* col0=(00 10 20 30) */ \
+ col1 = _mm_unpackhi_pi32(row01a, row23a); /* col1=(01 11 21 31) */ \
+ col6 = _mm_unpacklo_pi32(row01d, row23d); /* col6=(06 16 26 36) */ \
+ col7 = _mm_unpackhi_pi32(row01d, row23d); /* col7=(07 17 27 37) */ \
+ \
+ tmp6 = _mm_sub_pi16(col1, col6); /* tmp6=col1-col6 */ \
+ tmp7 = _mm_sub_pi16(col0, col7); /* tmp7=col0-col7 */ \
+ tmp1 = _mm_add_pi16(col1, col6); /* tmp1=col1+col6 */ \
+ tmp0 = _mm_add_pi16(col0, col7); /* tmp0=col0+col7 */ \
+ \
+ col2 = _mm_unpacklo_pi32(row01b, row23b); /* col2=(02 12 22 32) */ \
+ col3 = _mm_unpackhi_pi32(row01b, row23b); /* col3=(03 13 23 33) */ \
+ col4 = _mm_unpacklo_pi32(row01c, row23c); /* col4=(04 14 24 34) */ \
+ col5 = _mm_unpackhi_pi32(row01c, row23c); /* col5=(05 15 25 35) */ \
+ \
+ tmp3 = _mm_add_pi16(col3, col4); /* tmp3=col3+col4 */ \
+ tmp2 = _mm_add_pi16(col2, col5); /* tmp2=col2+col5 */ \
+ tmp4 = _mm_sub_pi16(col3, col4); /* tmp4=col3-col4 */ \
+ tmp5 = _mm_sub_pi16(col2, col5); /* tmp5=col2-col5 */ \
+ \
+ DO_FDCT_COMMON() \
+ \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 0], out0); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 0 + 4], out4); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 1], out1); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 1 + 4], out5); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 2], out2); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 2 + 4], out6); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 3], out3); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 3 + 4], out7); \
+}
+
+#define DO_FDCT_PASS2() { \
+ __m64 col0l, col0h, col1l, col1h, col2l, col2h, col3l, col3h; \
+ __m64 col01a, col01b, col01c, col01d, col23a, col23b, col23c, col23d; \
+ __m64 row0, row1, row2, row3, row4, row5, row6, row7; \
+ \
+ col0l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0]); /* (00 10 20 30) */ \
+ col1l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1]); /* (01 11 21 31) */ \
+ col2l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2]); /* (02 12 22 32) */ \
+ col3l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3]); /* (03 13 23 33) */ \
+ col0h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 4]); /* (40 50 60 70) */ \
+ col1h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 5]); /* (41 51 61 71) */ \
+ col2h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 6]); /* (42 52 62 72) */ \
+ col3h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 7]); /* (43 53 63 73) */ \
+ \
+ /* Transpose coefficients */ \
+ \
+ col23a = _mm_unpacklo_pi16(col2l, col3l); /* col23a=(02 03 12 13) */ \
+ col23b = _mm_unpackhi_pi16(col2l, col3l); /* col23b=(22 23 32 33) */ \
+ col23c = _mm_unpacklo_pi16(col2h, col3h); /* col23c=(42 43 52 53) */ \
+ col23d = _mm_unpackhi_pi16(col2h, col3h); /* col23d=(62 63 72 73) */ \
+ \
+ col01a = _mm_unpacklo_pi16(col0l, col1l); /* col01a=(00 01 10 11) */ \
+ col01b = _mm_unpackhi_pi16(col0l, col1l); /* col01b=(20 21 30 31) */ \
+ col01c = _mm_unpacklo_pi16(col0h, col1h); /* col01c=(40 41 50 51) */ \
+ col01d = _mm_unpackhi_pi16(col0h, col1h); /* col01d=(60 61 70 71) */ \
+ \
+ row0 = _mm_unpacklo_pi32(col01a, col23a); /* row0=(00 01 02 03) */ \
+ row1 = _mm_unpackhi_pi32(col01a, col23a); /* row1=(10 11 12 13) */ \
+ row6 = _mm_unpacklo_pi32(col01d, col23d); /* row6=(60 61 62 63) */ \
+ row7 = _mm_unpackhi_pi32(col01d, col23d); /* row7=(70 71 72 73) */ \
+ \
+ tmp6 = _mm_sub_pi16(row1, row6); /* tmp6=row1-row6 */ \
+ tmp7 = _mm_sub_pi16(row0, row7); /* tmp7=row0-row7 */ \
+ tmp1 = _mm_add_pi16(row1, row6); /* tmp1=row1+row6 */ \
+ tmp0 = _mm_add_pi16(row0, row7); /* tmp0=row0+row7 */ \
+ \
+ row2 = _mm_unpacklo_pi32(col01b, col23b); /* row2=(20 21 22 23) */ \
+ row3 = _mm_unpackhi_pi32(col01b, col23b); /* row3=(30 31 32 33) */ \
+ row4 = _mm_unpacklo_pi32(col01c, col23c); /* row4=(40 41 42 43) */ \
+ row5 = _mm_unpackhi_pi32(col01c, col23c); /* row5=(50 51 52 53) */ \
+ \
+ tmp3 = _mm_add_pi16(row3, row4); /* tmp3=row3+row4 */ \
+ tmp2 = _mm_add_pi16(row2, row5); /* tmp2=row2+row5 */ \
+ tmp4 = _mm_sub_pi16(row3, row4); /* tmp4=row3-row4 */ \
+ tmp5 = _mm_sub_pi16(row2, row5); /* tmp5=row2-row5 */ \
+ \
+ DO_FDCT_COMMON() \
+ \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 0], out0); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 1], out1); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 2], out2); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 3], out3); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 4], out4); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 5], out5); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 6], out6); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 7], out7); \
+}
+
+void jsimd_fdct_ifast_mmi(DCTELEM *data)
+{
+ __m64 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ __m64 out0, out1, out2, out3, out4, out5, out6, out7;
+ __m64 tmp10, tmp11, tmp12, tmp13, z1, z2, z3, z4, z5, z11, z13;
+ DCTELEM *dataptr = data;
+
+ /* Pass 1: process rows. */
+
+ DO_FDCT_PASS1()
+ dataptr += DCTSIZE * 4;
+ DO_FDCT_PASS1()
+
+ /* Pass 2: process columns. */
+
+ dataptr = data;
+ DO_FDCT_PASS2()
+ dataptr += 4;
+ DO_FDCT_PASS2()
+}
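DO_FDCT_MULTIPLY above is the fast DCT's 8-bit fixed-point multiply: the constants are the real-valued factors scaled by 2^CONST_BITS, and the mullo/mulhi pair reconstructs the full 32-bit product before shifting it back down and repacking. A scalar sketch of one 16-bit lane, for illustration only (names are invented, and the plain cast does not model the saturation that _mm_packs_pi32 performs):

#include <stdint.h>

#define REF_CONST_BITS 8   /* matches CONST_BITS above */

/* Fixed-point v * x, where fix_const == round(x * 256);
 * e.g. F_0_707 == 181 encodes 0.707106781. */
static int16_t fdct_multiply_ref(int16_t v, int16_t fix_const)
{
  int32_t product = (int32_t)v * fix_const;      /* mullo + mulhi recombined */
  return (int16_t)(product >> REF_CONST_BITS);   /* srai, then pack */
}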
diff --git a/media/libjpeg/simd/mips64/jfdctint-mmi.c b/media/libjpeg/simd/mips64/jfdctint-mmi.c
new file mode 100644
index 0000000000..7f4dfe9123
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jfdctint-mmi.c
@@ -0,0 +1,398 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, 2018, 2020, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
+ * All Rights Reserved.
+ * Authors: ZhuChen <zhuchen@loongson.cn>
+ * CaiWanwei <caiwanwei@loongson.cn>
+ * SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* ACCURATE INTEGER FORWARD DCT */
+
+#include "jsimd_mmi.h"
+
+
+#define CONST_BITS 13
+#define PASS1_BITS 2
+#define DESCALE_P1 (CONST_BITS - PASS1_BITS)
+#define DESCALE_P2 (CONST_BITS + PASS1_BITS)
+
+#define FIX_0_298 ((short)2446) /* FIX(0.298631336) */
+#define FIX_0_390 ((short)3196) /* FIX(0.390180644) */
+#define FIX_0_541 ((short)4433) /* FIX(0.541196100) */
+#define FIX_0_765 ((short)6270) /* FIX(0.765366865) */
+#define FIX_0_899 ((short)7373) /* FIX(0.899976223) */
+#define FIX_1_175 ((short)9633) /* FIX(1.175875602) */
+#define FIX_1_501 ((short)12299) /* FIX(1.501321110) */
+#define FIX_1_847 ((short)15137) /* FIX(1.847759065) */
+#define FIX_1_961 ((short)16069) /* FIX(1.961570560) */
+#define FIX_2_053 ((short)16819) /* FIX(2.053119869) */
+#define FIX_2_562 ((short)20995) /* FIX(2.562915447) */
+#define FIX_3_072 ((short)25172) /* FIX(3.072711026) */
+
+enum const_index {
+ index_PW_F130_F054,
+ index_PW_F054_MF130,
+ index_PW_MF078_F117,
+ index_PW_F117_F078,
+ index_PW_MF060_MF089,
+ index_PW_MF089_F060,
+ index_PW_MF050_MF256,
+ index_PW_MF256_F050,
+ index_PD_DESCALE_P1,
+ index_PD_DESCALE_P2,
+ index_PW_DESCALE_P2X
+};
+
+static uint64_t const_value[] = {
+ _uint64_set_pi16(FIX_0_541, (FIX_0_541 + FIX_0_765),
+ FIX_0_541, (FIX_0_541 + FIX_0_765)),
+ _uint64_set_pi16((FIX_0_541 - FIX_1_847), FIX_0_541,
+ (FIX_0_541 - FIX_1_847), FIX_0_541),
+ _uint64_set_pi16(FIX_1_175, (FIX_1_175 - FIX_1_961),
+ FIX_1_175, (FIX_1_175 - FIX_1_961)),
+ _uint64_set_pi16((FIX_1_175 - FIX_0_390), FIX_1_175,
+ (FIX_1_175 - FIX_0_390), FIX_1_175),
+ _uint64_set_pi16(-FIX_0_899, (FIX_0_298 - FIX_0_899),
+ -FIX_0_899, (FIX_0_298 - FIX_0_899)),
+ _uint64_set_pi16((FIX_1_501 - FIX_0_899), -FIX_0_899,
+ (FIX_1_501 - FIX_0_899), -FIX_0_899),
+ _uint64_set_pi16(-FIX_2_562, (FIX_2_053 - FIX_2_562),
+ -FIX_2_562, (FIX_2_053 - FIX_2_562)),
+ _uint64_set_pi16((FIX_3_072 - FIX_2_562), -FIX_2_562,
+ (FIX_3_072 - FIX_2_562), -FIX_2_562),
+ _uint64_set_pi32((1 << (DESCALE_P1 - 1)), (1 << (DESCALE_P1 - 1))),
+ _uint64_set_pi32((1 << (DESCALE_P2 - 1)), (1 << (DESCALE_P2 - 1))),
+ _uint64_set_pi16((1 << (PASS1_BITS - 1)), (1 << (PASS1_BITS - 1)),
+ (1 << (PASS1_BITS - 1)), (1 << (PASS1_BITS - 1)))
+};
+
+#define PW_F130_F054 get_const_value(index_PW_F130_F054)
+#define PW_F054_MF130 get_const_value(index_PW_F054_MF130)
+#define PW_MF078_F117 get_const_value(index_PW_MF078_F117)
+#define PW_F117_F078 get_const_value(index_PW_F117_F078)
+#define PW_MF060_MF089 get_const_value(index_PW_MF060_MF089)
+#define PW_MF089_F060 get_const_value(index_PW_MF089_F060)
+#define PW_MF050_MF256 get_const_value(index_PW_MF050_MF256)
+#define PW_MF256_F050 get_const_value(index_PW_MF256_F050)
+#define PD_DESCALE_P1 get_const_value(index_PD_DESCALE_P1)
+#define PD_DESCALE_P2 get_const_value(index_PD_DESCALE_P2)
+#define PW_DESCALE_P2X get_const_value(index_PW_DESCALE_P2X)
+
+
+#define DO_FDCT_COMMON(PASS) { \
+ __m64 tmp1312l, tmp1312h, tmp47l, tmp47h, tmp4l, tmp4h, tmp7l, tmp7h; \
+ __m64 tmp56l, tmp56h, tmp5l, tmp5h, tmp6l, tmp6h; \
+ __m64 out1l, out1h, out2l, out2h, out3l, out3h; \
+ __m64 out5l, out5h, out6l, out6h, out7l, out7h; \
+ __m64 z34l, z34h, z3l, z3h, z4l, z4h, z3, z4; \
+ \
+ /* (Original) \
+ * z1 = (tmp12 + tmp13) * 0.541196100; \
+ * out2 = z1 + tmp13 * 0.765366865; \
+ * out6 = z1 + tmp12 * -1.847759065; \
+ * \
+ * (This implementation) \
+ * out2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; \
+ * out6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); \
+ */ \
+ \
+ tmp1312l = _mm_unpacklo_pi16(tmp13, tmp12); \
+ tmp1312h = _mm_unpackhi_pi16(tmp13, tmp12); \
+ \
+ out2l = _mm_madd_pi16(tmp1312l, PW_F130_F054); \
+ out2h = _mm_madd_pi16(tmp1312h, PW_F130_F054); \
+ out6l = _mm_madd_pi16(tmp1312l, PW_F054_MF130); \
+ out6h = _mm_madd_pi16(tmp1312h, PW_F054_MF130); \
+ \
+ out2l = _mm_add_pi32(out2l, PD_DESCALE_P##PASS); \
+ out2h = _mm_add_pi32(out2h, PD_DESCALE_P##PASS); \
+ out2l = _mm_srai_pi32(out2l, DESCALE_P##PASS); \
+ out2h = _mm_srai_pi32(out2h, DESCALE_P##PASS); \
+ \
+ out6l = _mm_add_pi32(out6l, PD_DESCALE_P##PASS); \
+ out6h = _mm_add_pi32(out6h, PD_DESCALE_P##PASS); \
+ out6l = _mm_srai_pi32(out6l, DESCALE_P##PASS); \
+ out6h = _mm_srai_pi32(out6h, DESCALE_P##PASS); \
+ \
+ out2 = _mm_packs_pi32(out2l, out2h); \
+ out6 = _mm_packs_pi32(out6l, out6h); \
+ \
+ /* Odd part */ \
+ \
+ z3 = _mm_add_pi16(tmp4, tmp6); \
+ z4 = _mm_add_pi16(tmp5, tmp7); \
+ \
+ /* (Original) \
+ * z5 = (z3 + z4) * 1.175875602; \
+ * z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; \
+ * z3 += z5; z4 += z5; \
+ * \
+ * (This implementation) \
+ * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \
+ * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \
+ */ \
+ \
+ z34l = _mm_unpacklo_pi16(z3, z4); \
+ z34h = _mm_unpackhi_pi16(z3, z4); \
+ z3l = _mm_madd_pi16(z34l, PW_MF078_F117); \
+ z3h = _mm_madd_pi16(z34h, PW_MF078_F117); \
+ z4l = _mm_madd_pi16(z34l, PW_F117_F078); \
+ z4h = _mm_madd_pi16(z34h, PW_F117_F078); \
+ \
+ /* (Original) \
+ * z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; \
+ * tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; \
+ * tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; \
+ * z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; \
+ * out7 = tmp4 + z1 + z3; out5 = tmp5 + z2 + z4; \
+ * out3 = tmp6 + z2 + z3; out1 = tmp7 + z1 + z4; \
+ * \
+ * (This implementation) \
+ * tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; \
+ * tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; \
+ * tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); \
+ * tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); \
+ * out7 = tmp4 + z3; out5 = tmp5 + z4; \
+ * out3 = tmp6 + z3; out1 = tmp7 + z4; \
+ */ \
+ \
+ tmp47l = _mm_unpacklo_pi16(tmp4, tmp7); \
+ tmp47h = _mm_unpackhi_pi16(tmp4, tmp7); \
+ \
+ tmp4l = _mm_madd_pi16(tmp47l, PW_MF060_MF089); \
+ tmp4h = _mm_madd_pi16(tmp47h, PW_MF060_MF089); \
+ tmp7l = _mm_madd_pi16(tmp47l, PW_MF089_F060); \
+ tmp7h = _mm_madd_pi16(tmp47h, PW_MF089_F060); \
+ \
+ out7l = _mm_add_pi32(tmp4l, z3l); \
+ out7h = _mm_add_pi32(tmp4h, z3h); \
+ out1l = _mm_add_pi32(tmp7l, z4l); \
+ out1h = _mm_add_pi32(tmp7h, z4h); \
+ \
+ out7l = _mm_add_pi32(out7l, PD_DESCALE_P##PASS); \
+ out7h = _mm_add_pi32(out7h, PD_DESCALE_P##PASS); \
+ out7l = _mm_srai_pi32(out7l, DESCALE_P##PASS); \
+ out7h = _mm_srai_pi32(out7h, DESCALE_P##PASS); \
+ \
+ out1l = _mm_add_pi32(out1l, PD_DESCALE_P##PASS); \
+ out1h = _mm_add_pi32(out1h, PD_DESCALE_P##PASS); \
+ out1l = _mm_srai_pi32(out1l, DESCALE_P##PASS); \
+ out1h = _mm_srai_pi32(out1h, DESCALE_P##PASS); \
+ \
+ out7 = _mm_packs_pi32(out7l, out7h); \
+ out1 = _mm_packs_pi32(out1l, out1h); \
+ \
+ tmp56l = _mm_unpacklo_pi16(tmp5, tmp6); \
+ tmp56h = _mm_unpackhi_pi16(tmp5, tmp6); \
+ \
+ tmp5l = _mm_madd_pi16(tmp56l, PW_MF050_MF256); \
+ tmp5h = _mm_madd_pi16(tmp56h, PW_MF050_MF256); \
+ tmp6l = _mm_madd_pi16(tmp56l, PW_MF256_F050); \
+ tmp6h = _mm_madd_pi16(tmp56h, PW_MF256_F050); \
+ \
+ out5l = _mm_add_pi32(tmp5l, z4l); \
+ out5h = _mm_add_pi32(tmp5h, z4h); \
+ out3l = _mm_add_pi32(tmp6l, z3l); \
+ out3h = _mm_add_pi32(tmp6h, z3h); \
+ \
+ out5l = _mm_add_pi32(out5l, PD_DESCALE_P##PASS); \
+ out5h = _mm_add_pi32(out5h, PD_DESCALE_P##PASS); \
+ out5l = _mm_srai_pi32(out5l, DESCALE_P##PASS); \
+ out5h = _mm_srai_pi32(out5h, DESCALE_P##PASS); \
+ \
+ out3l = _mm_add_pi32(out3l, PD_DESCALE_P##PASS); \
+ out3h = _mm_add_pi32(out3h, PD_DESCALE_P##PASS); \
+ out3l = _mm_srai_pi32(out3l, DESCALE_P##PASS); \
+ out3h = _mm_srai_pi32(out3h, DESCALE_P##PASS); \
+ \
+ out5 = _mm_packs_pi32(out5l, out5h); \
+ out3 = _mm_packs_pi32(out3l, out3h); \
+}
+
+#define DO_FDCT_PASS1() { \
+ __m64 row0l, row0h, row1l, row1h, row2l, row2h, row3l, row3h; \
+ __m64 row01a, row01b, row01c, row01d, row23a, row23b, row23c, row23d; \
+ __m64 col0, col1, col2, col3, col4, col5, col6, col7; \
+ __m64 tmp10, tmp11; \
+ \
+ row0l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0]); /* (00 01 02 03) */ \
+ row0h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0 + 4]); /* (04 05 06 07) */ \
+ row1l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1]); /* (10 11 12 13) */ \
+ row1h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1 + 4]); /* (14 15 16 17) */ \
+ row2l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2]); /* (20 21 22 23) */ \
+ row2h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2 + 4]); /* (24 25 26 27) */ \
+ row3l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3]); /* (30 31 32 33) */ \
+ row3h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3 + 4]); /* (34 35 36 37) */ \
+ \
+ /* Transpose coefficients */ \
+ \
+ row23a = _mm_unpacklo_pi16(row2l, row3l); /* row23a=(20 30 21 31) */ \
+ row23b = _mm_unpackhi_pi16(row2l, row3l); /* row23b=(22 32 23 33) */ \
+ row23c = _mm_unpacklo_pi16(row2h, row3h); /* row23c=(24 34 25 35) */ \
+ row23d = _mm_unpackhi_pi16(row2h, row3h); /* row23d=(26 36 27 37) */ \
+ \
+ row01a = _mm_unpacklo_pi16(row0l, row1l); /* row01a=(00 10 01 11) */ \
+ row01b = _mm_unpackhi_pi16(row0l, row1l); /* row01b=(02 12 03 13) */ \
+ row01c = _mm_unpacklo_pi16(row0h, row1h); /* row01c=(04 14 05 15) */ \
+ row01d = _mm_unpackhi_pi16(row0h, row1h); /* row01d=(06 16 07 17) */ \
+ \
+ col0 = _mm_unpacklo_pi32(row01a, row23a); /* col0=(00 10 20 30) */ \
+ col1 = _mm_unpackhi_pi32(row01a, row23a); /* col1=(01 11 21 31) */ \
+ col6 = _mm_unpacklo_pi32(row01d, row23d); /* col6=(06 16 26 36) */ \
+ col7 = _mm_unpackhi_pi32(row01d, row23d); /* col7=(07 17 27 37) */ \
+ \
+ tmp6 = _mm_sub_pi16(col1, col6); /* tmp6=col1-col6 */ \
+ tmp7 = _mm_sub_pi16(col0, col7); /* tmp7=col0-col7 */ \
+ tmp1 = _mm_add_pi16(col1, col6); /* tmp1=col1+col6 */ \
+ tmp0 = _mm_add_pi16(col0, col7); /* tmp0=col0+col7 */ \
+ \
+ col2 = _mm_unpacklo_pi32(row01b, row23b); /* col2=(02 12 22 32) */ \
+ col3 = _mm_unpackhi_pi32(row01b, row23b); /* col3=(03 13 23 33) */ \
+ col4 = _mm_unpacklo_pi32(row01c, row23c); /* col4=(04 14 24 34) */ \
+ col5 = _mm_unpackhi_pi32(row01c, row23c); /* col5=(05 15 25 35) */ \
+ \
+ tmp3 = _mm_add_pi16(col3, col4); /* tmp3=col3+col4 */ \
+ tmp2 = _mm_add_pi16(col2, col5); /* tmp2=col2+col5 */ \
+ tmp4 = _mm_sub_pi16(col3, col4); /* tmp4=col3-col4 */ \
+ tmp5 = _mm_sub_pi16(col2, col5); /* tmp5=col2-col5 */ \
+ \
+ /* Even part */ \
+ \
+ tmp10 = _mm_add_pi16(tmp0, tmp3); /* tmp10=tmp0+tmp3 */ \
+ tmp13 = _mm_sub_pi16(tmp0, tmp3); /* tmp13=tmp0-tmp3 */ \
+ tmp11 = _mm_add_pi16(tmp1, tmp2); /* tmp11=tmp1+tmp2 */ \
+ tmp12 = _mm_sub_pi16(tmp1, tmp2); /* tmp12=tmp1-tmp2 */ \
+ \
+ out0 = _mm_add_pi16(tmp10, tmp11); /* out0=tmp10+tmp11 */ \
+ out4 = _mm_sub_pi16(tmp10, tmp11); /* out4=tmp10-tmp11 */ \
+ out0 = _mm_slli_pi16(out0, PASS1_BITS); \
+ out4 = _mm_slli_pi16(out4, PASS1_BITS); \
+ \
+ DO_FDCT_COMMON(1) \
+ \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 0], out0); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 0 + 4], out4); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 1], out1); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 1 + 4], out5); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 2], out2); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 2 + 4], out6); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 3], out3); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 3 + 4], out7); \
+}
+
+#define DO_FDCT_PASS2() { \
+ __m64 col0l, col0h, col1l, col1h, col2l, col2h, col3l, col3h; \
+ __m64 col01a, col01b, col01c, col01d, col23a, col23b, col23c, col23d; \
+ __m64 row0, row1, row2, row3, row4, row5, row6, row7; \
+ __m64 tmp10, tmp11; \
+ \
+ col0l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0]); /* (00 10 20 30) */ \
+ col1l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1]); /* (01 11 21 31) */ \
+ col2l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2]); /* (02 12 22 32) */ \
+ col3l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3]); /* (03 13 23 33) */ \
+ col0h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 4]); /* (40 50 60 70) */ \
+ col1h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 5]); /* (41 51 61 71) */ \
+ col2h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 6]); /* (42 52 62 72) */ \
+ col3h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 7]); /* (43 53 63 73) */ \
+ \
+ /* Transpose coefficients */ \
+ \
+ col23a = _mm_unpacklo_pi16(col2l, col3l); /* col23a=(02 03 12 13) */ \
+ col23b = _mm_unpackhi_pi16(col2l, col3l); /* col23b=(22 23 32 33) */ \
+ col23c = _mm_unpacklo_pi16(col2h, col3h); /* col23c=(42 43 52 53) */ \
+ col23d = _mm_unpackhi_pi16(col2h, col3h); /* col23d=(62 63 72 73) */ \
+ \
+ col01a = _mm_unpacklo_pi16(col0l, col1l); /* col01a=(00 01 10 11) */ \
+ col01b = _mm_unpackhi_pi16(col0l, col1l); /* col01b=(20 21 30 31) */ \
+ col01c = _mm_unpacklo_pi16(col0h, col1h); /* col01c=(40 41 50 51) */ \
+ col01d = _mm_unpackhi_pi16(col0h, col1h); /* col01d=(60 61 70 71) */ \
+ \
+ row0 = _mm_unpacklo_pi32(col01a, col23a); /* row0=(00 01 02 03) */ \
+ row1 = _mm_unpackhi_pi32(col01a, col23a); /* row1=(10 11 12 13) */ \
+ row6 = _mm_unpacklo_pi32(col01d, col23d); /* row6=(60 61 62 63) */ \
+ row7 = _mm_unpackhi_pi32(col01d, col23d); /* row7=(70 71 72 73) */ \
+ \
+ tmp6 = _mm_sub_pi16(row1, row6); /* tmp6=row1-row6 */ \
+ tmp7 = _mm_sub_pi16(row0, row7); /* tmp7=row0-row7 */ \
+ tmp1 = _mm_add_pi16(row1, row6); /* tmp1=row1+row6 */ \
+ tmp0 = _mm_add_pi16(row0, row7); /* tmp0=row0+row7 */ \
+ \
+ row2 = _mm_unpacklo_pi32(col01b, col23b); /* row2=(20 21 22 23) */ \
+ row3 = _mm_unpackhi_pi32(col01b, col23b); /* row3=(30 31 32 33) */ \
+ row4 = _mm_unpacklo_pi32(col01c, col23c); /* row4=(40 41 42 43) */ \
+ row5 = _mm_unpackhi_pi32(col01c, col23c); /* row5=(50 51 52 53) */ \
+ \
+ tmp3 = _mm_add_pi16(row3, row4); /* tmp3=row3+row4 */ \
+ tmp2 = _mm_add_pi16(row2, row5); /* tmp2=row2+row5 */ \
+ tmp4 = _mm_sub_pi16(row3, row4); /* tmp4=row3-row4 */ \
+ tmp5 = _mm_sub_pi16(row2, row5); /* tmp5=row2-row5 */ \
+ \
+ /* Even part */ \
+ \
+ tmp10 = _mm_add_pi16(tmp0, tmp3); /* tmp10=tmp0+tmp3 */ \
+ tmp13 = _mm_sub_pi16(tmp0, tmp3); /* tmp13=tmp0-tmp3 */ \
+ tmp11 = _mm_add_pi16(tmp1, tmp2); /* tmp11=tmp1+tmp2 */ \
+ tmp12 = _mm_sub_pi16(tmp1, tmp2); /* tmp12=tmp1-tmp2 */ \
+ \
+ out0 = _mm_add_pi16(tmp10, tmp11); /* out0=tmp10+tmp11 */ \
+ out4 = _mm_sub_pi16(tmp10, tmp11); /* out4=tmp10-tmp11 */ \
+ \
+ out0 = _mm_add_pi16(out0, PW_DESCALE_P2X); \
+ out4 = _mm_add_pi16(out4, PW_DESCALE_P2X); \
+ out0 = _mm_srai_pi16(out0, PASS1_BITS); \
+ out4 = _mm_srai_pi16(out4, PASS1_BITS); \
+ \
+ DO_FDCT_COMMON(2) \
+ \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 0], out0); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 1], out1); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 2], out2); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 3], out3); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 4], out4); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 5], out5); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 6], out6); \
+ _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 7], out7); \
+}
+
+void jsimd_fdct_islow_mmi(DCTELEM *data)
+{
+ __m64 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ __m64 out0, out1, out2, out3, out4, out5, out6, out7;
+ __m64 tmp12, tmp13;
+ DCTELEM *dataptr = data;
+
+ /* Pass 1: process rows. */
+
+ DO_FDCT_PASS1()
+ dataptr += DCTSIZE * 4;
+ DO_FDCT_PASS1()
+
+ /* Pass 2: process columns. */
+
+ dataptr = data;
+ DO_FDCT_PASS2()
+ dataptr += 4;
+ DO_FDCT_PASS2()
+}
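The unpack-then-_mm_madd_pi16 pattern that dominates DO_FDCT_COMMON above is the usual pmaddwd rotation trick: interleaving two 16-bit inputs and multiplying by a packed constant pair yields a*c0 + b*c1 directly in a 32-bit lane, so each rotated output costs one multiply-add plus a rounding add and shift. A scalar sketch of one lane, for illustration only (names are invented; the SIMD pack saturates where the plain cast here assumes the result fits):

#include <stdint.h>

/* One lane of the madd-based rotation: with c0 = FIX(0.541196100 +
 * 0.765366865) and c1 = FIX(0.541196100) this is the "out2" term from
 * the comment in DO_FDCT_COMMON; the 1 << (descale - 1) addend is what
 * PD_DESCALE_P1/PD_DESCALE_P2 supply. */
static int16_t madd_rotate_ref(int16_t a, int16_t b, int16_t c0, int16_t c1,
                               int descale)
{
  int32_t acc = (int32_t)a * c0 + (int32_t)b * c1;            /* pmaddwd */
  return (int16_t)((acc + (1 << (descale - 1))) >> descale);  /* round, shift */
}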
diff --git a/media/libjpeg/simd/mips64/jidctfst-mmi.c b/media/libjpeg/simd/mips64/jidctfst-mmi.c
new file mode 100644
index 0000000000..503bb35a3c
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jidctfst-mmi.c
@@ -0,0 +1,395 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014-2015, 2018-2019, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ * All Rights Reserved.
+ * Authors: LiuQingfa <liuqingfa-hf@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* FAST INTEGER INVERSE DCT */
+
+#include "jsimd_mmi.h"
+
+
+#define CONST_BITS 8
+#define PASS1_BITS 2
+
+#define FIX_1_082 ((short)277) /* FIX(1.082392200) */
+#define FIX_1_414 ((short)362) /* FIX(1.414213562) */
+#define FIX_1_847 ((short)473) /* FIX(1.847759065) */
+#define FIX_2_613 ((short)669) /* FIX(2.613125930) */
+#define FIX_1_613 ((short)(FIX_2_613 - 256 * 3)) /* FIX(2.613125930) - FIX(3) */
+
+#define PRE_MULTIPLY_SCALE_BITS 2
+#define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+
+enum const_index {
+ index_PW_F1082,
+ index_PW_F1414,
+ index_PW_F1847,
+ index_PW_MF1613,
+ index_PB_CENTERJSAMP
+};
+
+static uint64_t const_value[] = {
+ _uint64_set1_pi16(FIX_1_082 << CONST_SHIFT),
+ _uint64_set1_pi16(FIX_1_414 << CONST_SHIFT),
+ _uint64_set1_pi16(FIX_1_847 << CONST_SHIFT),
+ _uint64_set1_pi16(-FIX_1_613 << CONST_SHIFT),
+ _uint64_set1_pi8(CENTERJSAMPLE)
+};
+
+#define PW_F1414 get_const_value(index_PW_F1414)
+#define PW_F1847 get_const_value(index_PW_F1847)
+#define PW_MF1613 get_const_value(index_PW_MF1613)
+#define PW_F1082 get_const_value(index_PW_F1082)
+#define PB_CENTERJSAMP get_const_value(index_PB_CENTERJSAMP)
+
+
+#define test_m32_zero(mm32) (!(*(uint32_t *)&mm32))
+#define test_m64_zero(mm64) (!(*(uint64_t *)&mm64))
+
+
+#define DO_IDCT_COMMON() { \
+ tmp7 = _mm_add_pi16(z11, z13); \
+ \
+ tmp11 = _mm_sub_pi16(z11, z13); \
+ tmp11 = _mm_slli_pi16(tmp11, PRE_MULTIPLY_SCALE_BITS); \
+ tmp11 = _mm_mulhi_pi16(tmp11, PW_F1414); \
+ \
+ tmp10 = _mm_slli_pi16(z12, PRE_MULTIPLY_SCALE_BITS); \
+ tmp12 = _mm_slli_pi16(z10, PRE_MULTIPLY_SCALE_BITS); \
+ \
+ /* To avoid overflow... \
+ * \
+ * (Original) \
+ * tmp12 = -2.613125930 * z10 + z5; \
+ * \
+ * (This implementation) \
+ * tmp12 = ((3 - 2.613125930) - 3) * z10 + z5; \
+ * = (3 - 2.613125930) * z10 - z10 - z10 - z10 + z5; \
+ */ \
+ \
+ z5 = _mm_add_pi16(tmp10, tmp12); \
+ z5 = _mm_mulhi_pi16(z5, PW_F1847); \
+ \
+ tmp10 = _mm_mulhi_pi16(tmp10, PW_F1082); \
+ tmp10 = _mm_sub_pi16(tmp10, z5); \
+ tmp12 = _mm_mulhi_pi16(tmp12, PW_MF1613); \
+ tmp12 = _mm_sub_pi16(tmp12, z10); \
+ tmp12 = _mm_sub_pi16(tmp12, z10); \
+ tmp12 = _mm_sub_pi16(tmp12, z10); \
+ tmp12 = _mm_add_pi16(tmp12, z5); \
+ \
+ /* Final output stage */ \
+ \
+ tmp6 = _mm_sub_pi16(tmp12, tmp7); \
+ tmp5 = _mm_sub_pi16(tmp11, tmp6); \
+ tmp4 = _mm_add_pi16(tmp10, tmp5); \
+ \
+ out0 = _mm_add_pi16(tmp0, tmp7); \
+ out7 = _mm_sub_pi16(tmp0, tmp7); \
+ out1 = _mm_add_pi16(tmp1, tmp6); \
+ out6 = _mm_sub_pi16(tmp1, tmp6); \
+ \
+ out2 = _mm_add_pi16(tmp2, tmp5); \
+ out5 = _mm_sub_pi16(tmp2, tmp5); \
+ out4 = _mm_add_pi16(tmp3, tmp4); \
+ out3 = _mm_sub_pi16(tmp3, tmp4); \
+}
+
+#define DO_IDCT_PASS1(iter) { \
+ __m64 col0l, col1l, col2l, col3l, col4l, col5l, col6l, col7l; \
+ __m64 quant0l, quant1l, quant2l, quant3l; \
+ __m64 quant4l, quant5l, quant6l, quant7l; \
+ __m64 row01a, row01b, row01c, row01d, row23a, row23b, row23c, row23d; \
+ __m64 row0l, row0h, row1l, row1h, row2l, row2h, row3l, row3h; \
+ __m32 col0a, col1a, mm0; \
+ \
+ col0a = _mm_load_si32((__m32 *)&inptr[DCTSIZE * 1]); \
+ col1a = _mm_load_si32((__m32 *)&inptr[DCTSIZE * 2]); \
+ mm0 = _mm_or_si32(col0a, col1a); \
+ \
+ if (test_m32_zero(mm0)) { \
+ __m64 mm1, mm2; \
+ \
+ col0l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 0]); \
+ col1l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 1]); \
+ col2l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 2]); \
+ col3l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 3]); \
+ col4l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 4]); \
+ col5l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 5]); \
+ col6l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 6]); \
+ col7l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 7]); \
+ \
+ mm1 = _mm_or_si64(col1l, col3l); \
+ mm2 = _mm_or_si64(col2l, col4l); \
+ mm1 = _mm_or_si64(mm1, col5l); \
+ mm2 = _mm_or_si64(mm2, col6l); \
+ mm1 = _mm_or_si64(mm1, col7l); \
+ mm1 = _mm_or_si64(mm1, mm2); \
+ \
+ if (test_m64_zero(mm1)) { \
+ __m64 dcval, dcvall, dcvalh, row0, row1, row2, row3; \
+ \
+ /* AC terms all zero */ \
+ \
+ quant0l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 0]); \
+ \
+ dcval = _mm_mullo_pi16(col0l, quant0l); /* dcval=(00 10 20 30) */ \
+ \
+ dcvall = _mm_unpacklo_pi16(dcval, dcval); /* dcvall=(00 00 10 10) */ \
+ dcvalh = _mm_unpackhi_pi16(dcval, dcval); /* dcvalh=(20 20 30 30) */ \
+ \
+ row0 = _mm_unpacklo_pi32(dcvall, dcvall); /* row0=(00 00 00 00) */ \
+ row1 = _mm_unpackhi_pi32(dcvall, dcvall); /* row1=(10 10 10 10) */ \
+ row2 = _mm_unpacklo_pi32(dcvalh, dcvalh); /* row2=(20 20 20 20) */ \
+ row3 = _mm_unpackhi_pi32(dcvalh, dcvalh); /* row3=(30 30 30 30) */ \
+ \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0], row0); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0 + 4], row0); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1], row1); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1 + 4], row1); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2], row2); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2 + 4], row2); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3], row3); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3 + 4], row3); \
+ \
+ goto nextcolumn##iter; \
+ } \
+ } \
+ \
+ /* Even part */ \
+ \
+ col0l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 0]); /* (00 10 20 30) */ \
+ col2l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 2]); /* (02 12 22 32) */ \
+ col4l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 4]); /* (04 14 24 34) */ \
+ col6l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 6]); /* (06 16 26 36) */ \
+ \
+ quant0l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 0]); \
+ quant2l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 2]); \
+ quant4l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 4]); \
+ quant6l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 6]); \
+ \
+ tmp0 = _mm_mullo_pi16(col0l, quant0l); \
+ tmp1 = _mm_mullo_pi16(col2l, quant2l); \
+ tmp2 = _mm_mullo_pi16(col4l, quant4l); \
+ tmp3 = _mm_mullo_pi16(col6l, quant6l); \
+ \
+ tmp10 = _mm_add_pi16(tmp0, tmp2); \
+ tmp11 = _mm_sub_pi16(tmp0, tmp2); \
+ tmp13 = _mm_add_pi16(tmp1, tmp3); \
+ \
+ tmp12 = _mm_sub_pi16(tmp1, tmp3); \
+ tmp12 = _mm_slli_pi16(tmp12, PRE_MULTIPLY_SCALE_BITS); \
+ tmp12 = _mm_mulhi_pi16(tmp12, PW_F1414); \
+ tmp12 = _mm_sub_pi16(tmp12, tmp13); \
+ \
+ tmp0 = _mm_add_pi16(tmp10, tmp13); \
+ tmp3 = _mm_sub_pi16(tmp10, tmp13); \
+ tmp1 = _mm_add_pi16(tmp11, tmp12); \
+ tmp2 = _mm_sub_pi16(tmp11, tmp12); \
+ \
+ /* Odd part */ \
+ \
+ col1l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 1]); /* (01 11 21 31) */ \
+ col3l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 3]); /* (03 13 23 33) */ \
+ col5l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 5]); /* (05 15 25 35) */ \
+ col7l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 7]); /* (07 17 27 37) */ \
+ \
+ quant1l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 1]); \
+ quant3l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 3]); \
+ quant5l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 5]); \
+ quant7l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 7]); \
+ \
+ tmp4 = _mm_mullo_pi16(col1l, quant1l); \
+ tmp5 = _mm_mullo_pi16(col3l, quant3l); \
+ tmp6 = _mm_mullo_pi16(col5l, quant5l); \
+ tmp7 = _mm_mullo_pi16(col7l, quant7l); \
+ \
+ z13 = _mm_add_pi16(tmp6, tmp5); \
+ z10 = _mm_sub_pi16(tmp6, tmp5); \
+ z11 = _mm_add_pi16(tmp4, tmp7); \
+ z12 = _mm_sub_pi16(tmp4, tmp7); \
+ \
+ DO_IDCT_COMMON() \
+ \
+ /* out0=(00 10 20 30), out1=(01 11 21 31) */ \
+ /* out2=(02 12 22 32), out3=(03 13 23 33) */ \
+ /* out4=(04 14 24 34), out5=(05 15 25 35) */ \
+ /* out6=(06 16 26 36), out7=(07 17 27 37) */ \
+ \
+ /* Transpose coefficients */ \
+ \
+ row01a = _mm_unpacklo_pi16(out0, out1); /* row01a=(00 01 10 11) */ \
+ row23a = _mm_unpackhi_pi16(out0, out1); /* row23a=(20 21 30 31) */ \
+ row01d = _mm_unpacklo_pi16(out6, out7); /* row01d=(06 07 16 17) */ \
+ row23d = _mm_unpackhi_pi16(out6, out7); /* row23d=(26 27 36 37) */ \
+ \
+ row01b = _mm_unpacklo_pi16(out2, out3); /* row01b=(02 03 12 13) */ \
+ row23b = _mm_unpackhi_pi16(out2, out3); /* row23b=(22 23 32 33) */ \
+ row01c = _mm_unpacklo_pi16(out4, out5); /* row01c=(04 05 14 15) */ \
+ row23c = _mm_unpackhi_pi16(out4, out5); /* row23c=(24 25 34 35) */ \
+ \
+ row0l = _mm_unpacklo_pi32(row01a, row01b); /* row0l=(00 01 02 03) */ \
+ row1l = _mm_unpackhi_pi32(row01a, row01b); /* row1l=(10 11 12 13) */ \
+ row2l = _mm_unpacklo_pi32(row23a, row23b); /* row2l=(20 21 22 23) */ \
+ row3l = _mm_unpackhi_pi32(row23a, row23b); /* row3l=(30 31 32 33) */ \
+ \
+ row0h = _mm_unpacklo_pi32(row01c, row01d); /* row0h=(04 05 06 07) */ \
+ row1h = _mm_unpackhi_pi32(row01c, row01d); /* row1h=(14 15 16 17) */ \
+ row2h = _mm_unpacklo_pi32(row23c, row23d); /* row2h=(24 25 26 27) */ \
+ row3h = _mm_unpackhi_pi32(row23c, row23d); /* row3h=(34 35 36 37) */ \
+ \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0], row0l); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0 + 4], row0h); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1], row1l); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1 + 4], row1h); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2], row2l); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2 + 4], row2h); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3], row3l); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3 + 4], row3h); \
+}
+
+#define DO_IDCT_PASS2(ctr) { \
+ __m64 row0l, row1l, row2l, row3l, row4l, row5l, row6l, row7l; \
+ __m64 col0123a, col0123b, col0123c, col0123d; \
+ __m64 col01l, col01h, col23l, col23h; \
+ __m64 col0, col1, col2, col3; \
+ __m64 row06, row17, row24, row35; \
+ \
+ row0l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 0]); /* (00 01 02 03) */ \
+ row1l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 1]); /* (10 11 12 13) */ \
+ row2l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 2]); /* (20 21 22 23) */ \
+ row3l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 3]); /* (30 31 32 33) */ \
+ row4l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 4]); /* (40 41 42 43) */ \
+ row5l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 5]); /* (50 51 52 53) */ \
+ row6l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 6]); /* (60 61 62 63) */ \
+ row7l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 7]); /* (70 71 72 73) */ \
+ \
+ /* Even part */ \
+ \
+ tmp10 = _mm_add_pi16(row0l, row4l); \
+ tmp11 = _mm_sub_pi16(row0l, row4l); \
+ tmp13 = _mm_add_pi16(row2l, row6l); \
+ \
+ tmp12 = _mm_sub_pi16(row2l, row6l); \
+ tmp12 = _mm_slli_pi16(tmp12, PRE_MULTIPLY_SCALE_BITS); \
+ tmp12 = _mm_mulhi_pi16(tmp12, PW_F1414); \
+ tmp12 = _mm_sub_pi16(tmp12, tmp13); \
+ \
+ tmp0 = _mm_add_pi16(tmp10, tmp13); \
+ tmp3 = _mm_sub_pi16(tmp10, tmp13); \
+ tmp1 = _mm_add_pi16(tmp11, tmp12); \
+ tmp2 = _mm_sub_pi16(tmp11, tmp12); \
+ \
+ /* Odd part */ \
+ \
+ z13 = _mm_add_pi16(row5l, row3l); \
+ z10 = _mm_sub_pi16(row5l, row3l); \
+ z11 = _mm_add_pi16(row1l, row7l); \
+ z12 = _mm_sub_pi16(row1l, row7l); \
+ \
+ DO_IDCT_COMMON() \
+ \
+ /* out0=(00 01 02 03), out1=(10 11 12 13) */ \
+ /* out2=(20 21 22 23), out3=(30 31 32 33) */ \
+ /* out4=(40 41 42 43), out5=(50 51 52 53) */ \
+ /* out6=(60 61 62 63), out7=(70 71 72 73) */ \
+ \
+ out0 = _mm_srai_pi16(out0, PASS1_BITS + 3); \
+ out1 = _mm_srai_pi16(out1, PASS1_BITS + 3); \
+ out2 = _mm_srai_pi16(out2, PASS1_BITS + 3); \
+ out3 = _mm_srai_pi16(out3, PASS1_BITS + 3); \
+ out4 = _mm_srai_pi16(out4, PASS1_BITS + 3); \
+ out5 = _mm_srai_pi16(out5, PASS1_BITS + 3); \
+ out6 = _mm_srai_pi16(out6, PASS1_BITS + 3); \
+ out7 = _mm_srai_pi16(out7, PASS1_BITS + 3); \
+ \
+ row06 = _mm_packs_pi16(out0, out6); /* row06=(00 01 02 03 60 61 62 63) */ \
+ row17 = _mm_packs_pi16(out1, out7); /* row17=(10 11 12 13 70 71 72 73) */ \
+ row24 = _mm_packs_pi16(out2, out4); /* row24=(20 21 22 23 40 41 42 43) */ \
+ row35 = _mm_packs_pi16(out3, out5); /* row35=(30 31 32 33 50 51 52 53) */ \
+ \
+ row06 = _mm_add_pi8(row06, PB_CENTERJSAMP); \
+ row17 = _mm_add_pi8(row17, PB_CENTERJSAMP); \
+ row24 = _mm_add_pi8(row24, PB_CENTERJSAMP); \
+ row35 = _mm_add_pi8(row35, PB_CENTERJSAMP); \
+ \
+ /* Transpose coefficients */ \
+ \
+ col0123a = _mm_unpacklo_pi8(row06, row17); /* col0123a=(00 10 01 11 02 12 03 13) */ \
+ col0123d = _mm_unpackhi_pi8(row06, row17); /* col0123d=(60 70 61 71 62 72 63 73) */ \
+ col0123b = _mm_unpacklo_pi8(row24, row35); /* col0123b=(20 30 21 31 22 32 23 33) */ \
+ col0123c = _mm_unpackhi_pi8(row24, row35); /* col0123c=(40 50 41 51 42 52 43 53) */ \
+ \
+ col01l = _mm_unpacklo_pi16(col0123a, col0123b); /* col01l=(00 10 20 30 01 11 21 31) */ \
+ col23l = _mm_unpackhi_pi16(col0123a, col0123b); /* col23l=(02 12 22 32 03 13 23 33) */ \
+ col01h = _mm_unpacklo_pi16(col0123c, col0123d); /* col01h=(40 50 60 70 41 51 61 71) */ \
+ col23h = _mm_unpackhi_pi16(col0123c, col0123d); /* col23h=(42 52 62 72 43 53 63 73) */ \
+ \
+ col0 = _mm_unpacklo_pi32(col01l, col01h); /* col0=(00 10 20 30 40 50 60 70) */ \
+ col1 = _mm_unpackhi_pi32(col01l, col01h); /* col1=(01 11 21 31 41 51 61 71) */ \
+ col2 = _mm_unpacklo_pi32(col23l, col23h); /* col2=(02 12 22 32 42 52 62 72) */ \
+ col3 = _mm_unpackhi_pi32(col23l, col23h); /* col3=(03 13 23 33 43 53 63 73) */ \
+ \
+ _mm_store_si64((__m64 *)(output_buf[ctr + 0] + output_col), col0); \
+ _mm_store_si64((__m64 *)(output_buf[ctr + 1] + output_col), col1); \
+ _mm_store_si64((__m64 *)(output_buf[ctr + 2] + output_col), col2); \
+ _mm_store_si64((__m64 *)(output_buf[ctr + 3] + output_col), col3); \
+}
+
+void jsimd_idct_ifast_mmi(void *dct_table, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+ __m64 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ __m64 tmp10, tmp11, tmp12, tmp13;
+ __m64 out0, out1, out2, out3, out4, out5, out6, out7;
+ __m64 z5, z10, z11, z12, z13;
+ JCOEFPTR inptr;
+ ISLOW_MULT_TYPE *quantptr;
+ JCOEF *wsptr;
+ JCOEF workspace[DCTSIZE2]; /* buffers data between passes */
+
+ /* Pass 1: process columns. */
+
+ inptr = coef_block;
+ quantptr = (ISLOW_MULT_TYPE *)dct_table;
+ wsptr = workspace;
+
+ DO_IDCT_PASS1(1)
+nextcolumn1:
+ inptr += 4;
+ quantptr += 4;
+ wsptr += DCTSIZE * 4;
+ DO_IDCT_PASS1(2)
+nextcolumn2:
+
+ /* Pass 2: process rows. */
+
+ wsptr = workspace;
+
+ DO_IDCT_PASS2(0)
+ wsptr += 4;
+ DO_IDCT_PASS2(4)
+}
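Two details of this file are worth spelling out. The DC-only shortcut in DO_IDCT_PASS1 mirrors libjpeg's scalar fast path: when every AC coefficient in the 4-column group is zero, the IDCT degenerates to replicating the dequantized DC term, so the code broadcasts it and skips the arithmetic entirely. And the tmp12 sequence in DO_IDCT_COMMON is the overflow workaround its comment describes: 2.613125930 is too large a factor for the pre-multiply/mulhi scheme, so the code multiplies by 3 - 2.613125930 = 0.386874070 (PW_MF1613 encodes -(FIX_2_613 - 256 * 3)) and then subtracts z10 three times exactly. A scalar sketch, for illustration only (names are invented, and the >> 8 approximates the net pre-multiply/mulhi scaling):

#include <stdint.h>

/* tmp12 = -2.613125930 * z10 + z5, without a multiplier that overflows:
 * 99/256 ~= 3 - 2.613125930, and the -3 * z10 part is exact. */
static int16_t idct_tmp12_ref(int16_t z10, int16_t z5)
{
  int32_t small = ((int32_t)z10 * 99) >> 8;   /* mulhi by PW_MF1613 */
  return (int16_t)(small - 3 * z10 + z5);     /* three exact subtractions */
}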
diff --git a/media/libjpeg/simd/mips64/jidctint-mmi.c b/media/libjpeg/simd/mips64/jidctint-mmi.c
new file mode 100644
index 0000000000..cd3db980c5
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jidctint-mmi.c
@@ -0,0 +1,571 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014-2015, 2018, 2020, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
+ * All Rights Reserved.
+ * Authors: ZhuChen <zhuchen@loongson.cn>
+ * CaiWanwei <caiwanwei@loongson.cn>
+ * SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* ACCURATE INTEGER INVERSE DCT */
+
+#include "jsimd_mmi.h"
+
+
+#define CONST_BITS 13
+#define PASS1_BITS 2
+#define DESCALE_P1 (CONST_BITS - PASS1_BITS)
+#define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3)
+#define CENTERJSAMPLE 128
+
+#define FIX_0_298 ((short)2446) /* FIX(0.298631336) */
+#define FIX_0_390 ((short)3196) /* FIX(0.390180644) */
+#define FIX_0_899 ((short)7373) /* FIX(0.899976223) */
+#define FIX_0_541 ((short)4433) /* FIX(0.541196100) */
+#define FIX_0_765 ((short)6270) /* FIX(0.765366865) */
+#define FIX_1_175 ((short)9633) /* FIX(1.175875602) */
+#define FIX_1_501 ((short)12299) /* FIX(1.501321110) */
+#define FIX_1_847 ((short)15137) /* FIX(1.847759065) */
+#define FIX_1_961 ((short)16069) /* FIX(1.961570560) */
+#define FIX_2_053 ((short)16819) /* FIX(2.053119869) */
+#define FIX_2_562 ((short)20995) /* FIX(2.562915447) */
+#define FIX_3_072 ((short)25172) /* FIX(3.072711026) */
+
+enum const_index {
+ index_PW_F130_F054,
+ index_PW_F054_MF130,
+ index_PW_MF078_F117,
+ index_PW_F117_F078,
+ index_PW_MF060_MF089,
+ index_PW_MF089_F060,
+ index_PW_MF050_MF256,
+ index_PW_MF256_F050,
+ index_PD_DESCALE_P1,
+ index_PD_DESCALE_P2,
+ index_PB_CENTERJSAMP
+};
+
+static uint64_t const_value[] = {
+ _uint64_set_pi16(FIX_0_541, (FIX_0_541 + FIX_0_765),
+ FIX_0_541, (FIX_0_541 + FIX_0_765)),
+ _uint64_set_pi16((FIX_0_541 - FIX_1_847), FIX_0_541,
+ (FIX_0_541 - FIX_1_847), FIX_0_541),
+ _uint64_set_pi16(FIX_1_175, (FIX_1_175 - FIX_1_961),
+ FIX_1_175, (FIX_1_175 - FIX_1_961)),
+ _uint64_set_pi16((FIX_1_175 - FIX_0_390), FIX_1_175,
+ (FIX_1_175 - FIX_0_390), FIX_1_175),
+ _uint64_set_pi16(-FIX_0_899, (FIX_0_298 - FIX_0_899),
+ -FIX_0_899, (FIX_0_298 - FIX_0_899)),
+ _uint64_set_pi16((FIX_1_501 - FIX_0_899), -FIX_0_899,
+ (FIX_1_501 - FIX_0_899), -FIX_0_899),
+ _uint64_set_pi16(-FIX_2_562, (FIX_2_053 - FIX_2_562),
+ -FIX_2_562, (FIX_2_053 - FIX_2_562)),
+ _uint64_set_pi16((FIX_3_072 - FIX_2_562), -FIX_2_562,
+ (FIX_3_072 - FIX_2_562), -FIX_2_562),
+ _uint64_set_pi32((1 << (DESCALE_P1 - 1)), (1 << (DESCALE_P1 - 1))),
+ _uint64_set_pi32((1 << (DESCALE_P2 - 1)), (1 << (DESCALE_P2 - 1))),
+ _uint64_set_pi8(CENTERJSAMPLE, CENTERJSAMPLE, CENTERJSAMPLE, CENTERJSAMPLE,
+ CENTERJSAMPLE, CENTERJSAMPLE, CENTERJSAMPLE, CENTERJSAMPLE)
+};
+
+#define PW_F130_F054 get_const_value(index_PW_F130_F054)
+#define PW_F054_MF130 get_const_value(index_PW_F054_MF130)
+#define PW_MF078_F117 get_const_value(index_PW_MF078_F117)
+#define PW_F117_F078 get_const_value(index_PW_F117_F078)
+#define PW_MF060_MF089 get_const_value(index_PW_MF060_MF089)
+#define PW_MF089_F060 get_const_value(index_PW_MF089_F060)
+#define PW_MF050_MF256 get_const_value(index_PW_MF050_MF256)
+#define PW_MF256_F050 get_const_value(index_PW_MF256_F050)
+#define PD_DESCALE_P1 get_const_value(index_PD_DESCALE_P1)
+#define PD_DESCALE_P2 get_const_value(index_PD_DESCALE_P2)
+#define PB_CENTERJSAMP get_const_value(index_PB_CENTERJSAMP)
+
+
+#define test_m32_zero(mm32) (!(*(uint32_t *)&mm32))
+#define test_m64_zero(mm64) (!(*(uint64_t *)&mm64))
+
+
+#define DO_IDCT_COMMON(PASS) { \
+ __m64 tmp0_3l, tmp0_3h, tmp1_2l, tmp1_2h; \
+ __m64 tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h; \
+ __m64 z34l, z34h, z3l, z3h, z4l, z4h, z3, z4; \
+ __m64 out0l, out0h, out1l, out1h, out2l, out2h, out3l, out3h; \
+ __m64 out4l, out4h, out5l, out5h, out6l, out6h, out7l, out7h; \
+ \
+ z3 = _mm_add_pi16(tmp0, tmp2); \
+ z4 = _mm_add_pi16(tmp1, tmp3); \
+ \
+ /* (Original) \
+ * z5 = (z3 + z4) * 1.175875602; \
+ * z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; \
+ * z3 += z5; z4 += z5; \
+ * \
+ * (This implementation) \
+ * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \
+ * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \
+ */ \
+ \
+ z34l = _mm_unpacklo_pi16(z3, z4); \
+ z34h = _mm_unpackhi_pi16(z3, z4); \
+ z3l = _mm_madd_pi16(z34l, PW_MF078_F117); \
+ z3h = _mm_madd_pi16(z34h, PW_MF078_F117); \
+ z4l = _mm_madd_pi16(z34l, PW_F117_F078); \
+ z4h = _mm_madd_pi16(z34h, PW_F117_F078); \
+ \
+ /* (Original) \
+ * z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; \
+ * tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; \
+ * tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; \
+ * z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; \
+ * tmp0 += z1 + z3; tmp1 += z2 + z4; \
+ * tmp2 += z2 + z3; tmp3 += z1 + z4; \
+ * \
+ * (This implementation) \
+ * tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; \
+ * tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; \
+ * tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); \
+ * tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); \
+ * tmp0 += z3; tmp1 += z4; \
+ * tmp2 += z3; tmp3 += z4; \
+ */ \
+ \
+ tmp0_3l = _mm_unpacklo_pi16(tmp0, tmp3); \
+ tmp0_3h = _mm_unpackhi_pi16(tmp0, tmp3); \
+ \
+ tmp0l = _mm_madd_pi16(tmp0_3l, PW_MF060_MF089); \
+ tmp0h = _mm_madd_pi16(tmp0_3h, PW_MF060_MF089); \
+ tmp3l = _mm_madd_pi16(tmp0_3l, PW_MF089_F060); \
+ tmp3h = _mm_madd_pi16(tmp0_3h, PW_MF089_F060); \
+ \
+ tmp0l = _mm_add_pi32(tmp0l, z3l); \
+ tmp0h = _mm_add_pi32(tmp0h, z3h); \
+ tmp3l = _mm_add_pi32(tmp3l, z4l); \
+ tmp3h = _mm_add_pi32(tmp3h, z4h); \
+ \
+ tmp1_2l = _mm_unpacklo_pi16(tmp1, tmp2); \
+ tmp1_2h = _mm_unpackhi_pi16(tmp1, tmp2); \
+ \
+ tmp1l = _mm_madd_pi16(tmp1_2l, PW_MF050_MF256); \
+ tmp1h = _mm_madd_pi16(tmp1_2h, PW_MF050_MF256); \
+ tmp2l = _mm_madd_pi16(tmp1_2l, PW_MF256_F050); \
+ tmp2h = _mm_madd_pi16(tmp1_2h, PW_MF256_F050); \
+ \
+ tmp1l = _mm_add_pi32(tmp1l, z4l); \
+ tmp1h = _mm_add_pi32(tmp1h, z4h); \
+ tmp2l = _mm_add_pi32(tmp2l, z3l); \
+ tmp2h = _mm_add_pi32(tmp2h, z3h); \
+ \
+ /* Final output stage */ \
+ \
+ out0l = _mm_add_pi32(tmp10l, tmp3l); \
+ out0h = _mm_add_pi32(tmp10h, tmp3h); \
+ out7l = _mm_sub_pi32(tmp10l, tmp3l); \
+ out7h = _mm_sub_pi32(tmp10h, tmp3h); \
+ \
+ out0l = _mm_add_pi32(out0l, PD_DESCALE_P##PASS); \
+ out0h = _mm_add_pi32(out0h, PD_DESCALE_P##PASS); \
+ out0l = _mm_srai_pi32(out0l, DESCALE_P##PASS); \
+ out0h = _mm_srai_pi32(out0h, DESCALE_P##PASS); \
+ \
+ out7l = _mm_add_pi32(out7l, PD_DESCALE_P##PASS); \
+ out7h = _mm_add_pi32(out7h, PD_DESCALE_P##PASS); \
+ out7l = _mm_srai_pi32(out7l, DESCALE_P##PASS); \
+ out7h = _mm_srai_pi32(out7h, DESCALE_P##PASS); \
+ \
+ out0 = _mm_packs_pi32(out0l, out0h); \
+ out7 = _mm_packs_pi32(out7l, out7h); \
+ \
+ out1l = _mm_add_pi32(tmp11l, tmp2l); \
+ out1h = _mm_add_pi32(tmp11h, tmp2h); \
+ out6l = _mm_sub_pi32(tmp11l, tmp2l); \
+ out6h = _mm_sub_pi32(tmp11h, tmp2h); \
+ \
+ out1l = _mm_add_pi32(out1l, PD_DESCALE_P##PASS); \
+ out1h = _mm_add_pi32(out1h, PD_DESCALE_P##PASS); \
+ out1l = _mm_srai_pi32(out1l, DESCALE_P##PASS); \
+ out1h = _mm_srai_pi32(out1h, DESCALE_P##PASS); \
+ \
+ out6l = _mm_add_pi32(out6l, PD_DESCALE_P##PASS); \
+ out6h = _mm_add_pi32(out6h, PD_DESCALE_P##PASS); \
+ out6l = _mm_srai_pi32(out6l, DESCALE_P##PASS); \
+ out6h = _mm_srai_pi32(out6h, DESCALE_P##PASS); \
+ \
+ out1 = _mm_packs_pi32(out1l, out1h); \
+ out6 = _mm_packs_pi32(out6l, out6h); \
+ \
+ out2l = _mm_add_pi32(tmp12l, tmp1l); \
+ out2h = _mm_add_pi32(tmp12h, tmp1h); \
+ out5l = _mm_sub_pi32(tmp12l, tmp1l); \
+ out5h = _mm_sub_pi32(tmp12h, tmp1h); \
+ \
+ out2l = _mm_add_pi32(out2l, PD_DESCALE_P##PASS); \
+ out2h = _mm_add_pi32(out2h, PD_DESCALE_P##PASS); \
+ out2l = _mm_srai_pi32(out2l, DESCALE_P##PASS); \
+ out2h = _mm_srai_pi32(out2h, DESCALE_P##PASS); \
+ \
+ out5l = _mm_add_pi32(out5l, PD_DESCALE_P##PASS); \
+ out5h = _mm_add_pi32(out5h, PD_DESCALE_P##PASS); \
+ out5l = _mm_srai_pi32(out5l, DESCALE_P##PASS); \
+ out5h = _mm_srai_pi32(out5h, DESCALE_P##PASS); \
+ \
+ out2 = _mm_packs_pi32(out2l, out2h); \
+ out5 = _mm_packs_pi32(out5l, out5h); \
+ \
+ out3l = _mm_add_pi32(tmp13l, tmp0l); \
+ out3h = _mm_add_pi32(tmp13h, tmp0h); \
+ \
+ out4l = _mm_sub_pi32(tmp13l, tmp0l); \
+ out4h = _mm_sub_pi32(tmp13h, tmp0h); \
+ \
+ out3l = _mm_add_pi32(out3l, PD_DESCALE_P##PASS); \
+ out3h = _mm_add_pi32(out3h, PD_DESCALE_P##PASS); \
+ out3l = _mm_srai_pi32(out3l, DESCALE_P##PASS); \
+ out3h = _mm_srai_pi32(out3h, DESCALE_P##PASS); \
+ \
+ out4l = _mm_add_pi32(out4l, PD_DESCALE_P##PASS); \
+ out4h = _mm_add_pi32(out4h, PD_DESCALE_P##PASS); \
+ out4l = _mm_srai_pi32(out4l, DESCALE_P##PASS); \
+ out4h = _mm_srai_pi32(out4h, DESCALE_P##PASS); \
+ \
+ out3 = _mm_packs_pi32(out3l, out3h); \
+ out4 = _mm_packs_pi32(out4l, out4h); \
+}
+
+#define DO_IDCT_PASS1(iter) { \
+ __m64 col0l, col1l, col2l, col3l, col4l, col5l, col6l, col7l; \
+ __m64 quant0l, quant1l, quant2l, quant3l; \
+ __m64 quant4l, quant5l, quant6l, quant7l; \
+ __m64 z23, z2, z3, z23l, z23h; \
+ __m64 row01a, row01b, row01c, row01d, row23a, row23b, row23c, row23d; \
+ __m64 row0l, row0h, row1l, row1h, row2l, row2h, row3l, row3h; \
+ __m64 tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h; \
+ __m64 tmp10l, tmp10h, tmp11l, tmp11h, tmp12l, tmp12h, tmp13l, tmp13h; \
+ __m32 col0a, col1a, mm0; \
+ \
+ col0a = _mm_load_si32((__m32 *)&inptr[DCTSIZE * 1]); \
+ col1a = _mm_load_si32((__m32 *)&inptr[DCTSIZE * 2]); \
+ mm0 = _mm_or_si32(col0a, col1a); \
+ \
+ if (test_m32_zero(mm0)) { \
+ __m64 mm1, mm2; \
+ \
+ col0l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 0]); \
+ col1l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 1]); \
+ col2l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 2]); \
+ col3l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 3]); \
+ col4l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 4]); \
+ col5l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 5]); \
+ col6l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 6]); \
+ col7l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 7]); \
+ \
+ mm1 = _mm_or_si64(col1l, col3l); \
+ mm2 = _mm_or_si64(col2l, col4l); \
+ mm1 = _mm_or_si64(mm1, col5l); \
+ mm2 = _mm_or_si64(mm2, col6l); \
+ mm1 = _mm_or_si64(mm1, col7l); \
+ mm1 = _mm_or_si64(mm1, mm2); \
+ \
+ if (test_m64_zero(mm1)) { \
+ __m64 dcval, dcvall, dcvalh, row0, row1, row2, row3; \
+ \
+ /* AC terms all zero */ \
+ \
+ quant0l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 0]); \
+ \
+ dcval = _mm_mullo_pi16(col0l, quant0l); \
+ dcval = _mm_slli_pi16(dcval, PASS1_BITS); /* dcval=(00 10 20 30) */ \
+ \
+ dcvall = _mm_unpacklo_pi16(dcval, dcval); /* dcvall=(00 00 10 10) */ \
+ dcvalh = _mm_unpackhi_pi16(dcval, dcval); /* dcvalh=(20 20 30 30) */ \
+ \
+ row0 = _mm_unpacklo_pi32(dcvall, dcvall); /* row0=(00 00 00 00) */ \
+ row1 = _mm_unpackhi_pi32(dcvall, dcvall); /* row1=(10 10 10 10) */ \
+ row2 = _mm_unpacklo_pi32(dcvalh, dcvalh); /* row2=(20 20 20 20) */ \
+ row3 = _mm_unpackhi_pi32(dcvalh, dcvalh); /* row3=(30 30 30 30) */ \
+ \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0], row0); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0 + 4], row0); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1], row1); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1 + 4], row1); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2], row2); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2 + 4], row2); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3], row3); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3 + 4], row3); \
+ \
+ goto nextcolumn##iter; \
+ } \
+ } \
+ \
+ /* Even part \
+ * \
+ * (Original) \
+ * z1 = (z2 + z3) * 0.541196100; \
+ * tmp2 = z1 + z3 * -1.847759065; \
+ * tmp3 = z1 + z2 * 0.765366865; \
+ * \
+ * (This implementation) \
+ * tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); \
+ * tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; \
+ */ \
+ \
+ col0l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 0]); /* (00 10 20 30) */ \
+ col2l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 2]); /* (02 12 22 32) */ \
+ col4l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 4]); /* (04 14 24 34) */ \
+ col6l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 6]); /* (06 16 26 36) */ \
+ \
+ quant0l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 0]); \
+ quant2l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 2]); \
+ quant4l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 4]); \
+ quant6l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 6]); \
+ \
+ z2 = _mm_mullo_pi16(col2l, quant2l); \
+ z3 = _mm_mullo_pi16(col6l, quant6l); \
+ \
+ z23l = _mm_unpacklo_pi16(z2, z3); \
+ z23h = _mm_unpackhi_pi16(z2, z3); \
+ tmp3l = _mm_madd_pi16(z23l, PW_F130_F054); \
+ tmp3h = _mm_madd_pi16(z23h, PW_F130_F054); \
+ tmp2l = _mm_madd_pi16(z23l, PW_F054_MF130); \
+ tmp2h = _mm_madd_pi16(z23h, PW_F054_MF130); \
+ \
+ z2 = _mm_mullo_pi16(col0l, quant0l); \
+ z3 = _mm_mullo_pi16(col4l, quant4l); \
+ \
+ z23 = _mm_add_pi16(z2, z3); \
+ tmp0l = _mm_loadlo_pi16_f(z23); \
+ tmp0h = _mm_loadhi_pi16_f(z23); \
+ tmp0l = _mm_srai_pi32(tmp0l, (16 - CONST_BITS)); \
+ tmp0h = _mm_srai_pi32(tmp0h, (16 - CONST_BITS)); \
+ \
+ tmp10l = _mm_add_pi32(tmp0l, tmp3l); \
+ tmp10h = _mm_add_pi32(tmp0h, tmp3h); \
+ tmp13l = _mm_sub_pi32(tmp0l, tmp3l); \
+ tmp13h = _mm_sub_pi32(tmp0h, tmp3h); \
+ \
+ z23 = _mm_sub_pi16(z2, z3); \
+ tmp1l = _mm_loadlo_pi16_f(z23); \
+ tmp1h = _mm_loadhi_pi16_f(z23); \
+ tmp1l = _mm_srai_pi32(tmp1l, (16 - CONST_BITS)); \
+ tmp1h = _mm_srai_pi32(tmp1h, (16 - CONST_BITS)); \
+ \
+ tmp11l = _mm_add_pi32(tmp1l, tmp2l); \
+ tmp11h = _mm_add_pi32(tmp1h, tmp2h); \
+ tmp12l = _mm_sub_pi32(tmp1l, tmp2l); \
+ tmp12h = _mm_sub_pi32(tmp1h, tmp2h); \
+ \
+ /* Odd part */ \
+ \
+ col1l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 1]); /* (01 11 21 31) */ \
+ col3l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 3]); /* (03 13 23 33) */ \
+ col5l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 5]); /* (05 15 25 35) */ \
+ col7l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 7]); /* (07 17 27 37) */ \
+ \
+ quant1l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 1]); \
+ quant3l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 3]); \
+ quant5l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 5]); \
+ quant7l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 7]); \
+ \
+ tmp0 = _mm_mullo_pi16(col7l, quant7l); \
+ tmp1 = _mm_mullo_pi16(col5l, quant5l); \
+ tmp2 = _mm_mullo_pi16(col3l, quant3l); \
+ tmp3 = _mm_mullo_pi16(col1l, quant1l); \
+ \
+ DO_IDCT_COMMON(1) \
+ \
+ /* out0=(00 10 20 30), out1=(01 11 21 31) */ \
+ /* out2=(02 12 22 32), out3=(03 13 23 33) */ \
+ /* out4=(04 14 24 34), out5=(05 15 25 35) */ \
+ /* out6=(06 16 26 36), out7=(07 17 27 37) */ \
+ \
+ /* Transpose coefficients */ \
+ \
+ row01a = _mm_unpacklo_pi16(out0, out1); /* row01a=(00 01 10 11) */ \
+ row23a = _mm_unpackhi_pi16(out0, out1); /* row23a=(20 21 30 31) */ \
+ row01d = _mm_unpacklo_pi16(out6, out7); /* row01d=(06 07 16 17) */ \
+ row23d = _mm_unpackhi_pi16(out6, out7); /* row23d=(26 27 36 37) */ \
+ \
+ row01b = _mm_unpacklo_pi16(out2, out3); /* row01b=(02 03 12 13) */ \
+ row23b = _mm_unpackhi_pi16(out2, out3); /* row23b=(22 23 32 33) */ \
+ row01c = _mm_unpacklo_pi16(out4, out5); /* row01c=(04 05 14 15) */ \
+ row23c = _mm_unpackhi_pi16(out4, out5); /* row23c=(24 25 34 35) */ \
+ \
+ row0l = _mm_unpacklo_pi32(row01a, row01b); /* row0l=(00 01 02 03) */ \
+ row1l = _mm_unpackhi_pi32(row01a, row01b); /* row1l=(10 11 12 13) */ \
+ row2l = _mm_unpacklo_pi32(row23a, row23b); /* row2l=(20 21 22 23) */ \
+ row3l = _mm_unpackhi_pi32(row23a, row23b); /* row3l=(30 31 32 33) */ \
+ \
+ row0h = _mm_unpacklo_pi32(row01c, row01d); /* row0h=(04 05 06 07) */ \
+ row1h = _mm_unpackhi_pi32(row01c, row01d); /* row1h=(14 15 16 17) */ \
+ row2h = _mm_unpacklo_pi32(row23c, row23d); /* row2h=(24 25 26 27) */ \
+ row3h = _mm_unpackhi_pi32(row23c, row23d); /* row3h=(34 35 36 37) */ \
+ \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0], row0l); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0 + 4], row0h); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1], row1l); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1 + 4], row1h); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2], row2l); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2 + 4], row2h); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3], row3l); \
+ _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3 + 4], row3h); \
+}
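+
+/* Each DO_IDCT_PASS1 invocation dequantizes and transforms four columns of
+ * the 8x8 block (one __m64 lane per column), takes an early out that stores
+ * constant rows when all AC terms in those columns are zero, and leaves the
+ * descaled intermediate results in the workspace for pass 2. */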
+
+#define DO_IDCT_PASS2(ctr) { \
+ __m64 row0l, row1l, row2l, row3l, row4l, row5l, row6l, row7l; \
+ __m64 z23, z23l, z23h; \
+ __m64 col0123a, col0123b, col0123c, col0123d; \
+ __m64 col01l, col01h, col23l, col23h, row06, row17, row24, row35; \
+ __m64 col0, col1, col2, col3; \
+ __m64 tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h; \
+ __m64 tmp10l, tmp10h, tmp11l, tmp11h, tmp12l, tmp12h, tmp13l, tmp13h; \
+ \
+ row0l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 0]); /* (00 01 02 03) */ \
+ row1l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 1]); /* (10 11 12 13) */ \
+ row2l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 2]); /* (20 21 22 23) */ \
+ row3l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 3]); /* (30 31 32 33) */ \
+ row4l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 4]); /* (40 41 42 43) */ \
+ row5l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 5]); /* (50 51 52 53) */ \
+ row6l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 6]); /* (60 61 62 63) */ \
+ row7l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 7]); /* (70 71 72 73) */ \
+ \
+ /* Even part \
+ * \
+ * (Original) \
+ * z1 = (z2 + z3) * 0.541196100; \
+ * tmp2 = z1 + z3 * -1.847759065; \
+ * tmp3 = z1 + z2 * 0.765366865; \
+ * \
+ * (This implementation) \
+ * tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); \
+ * tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; \
+ */ \
+ \
+ z23l = _mm_unpacklo_pi16(row2l, row6l); \
+ z23h = _mm_unpackhi_pi16(row2l, row6l); \
+ \
+ tmp3l = _mm_madd_pi16(z23l, PW_F130_F054); \
+ tmp3h = _mm_madd_pi16(z23h, PW_F130_F054); \
+ tmp2l = _mm_madd_pi16(z23l, PW_F054_MF130); \
+ tmp2h = _mm_madd_pi16(z23h, PW_F054_MF130); \
+ \
+ z23 = _mm_add_pi16(row0l, row4l); \
+ tmp0l = _mm_loadlo_pi16_f(z23); \
+ tmp0h = _mm_loadhi_pi16_f(z23); \
+ tmp0l = _mm_srai_pi32(tmp0l, (16 - CONST_BITS)); \
+ tmp0h = _mm_srai_pi32(tmp0h, (16 - CONST_BITS)); \
+ \
+ tmp10l = _mm_add_pi32(tmp0l, tmp3l); \
+ tmp10h = _mm_add_pi32(tmp0h, tmp3h); \
+ tmp13l = _mm_sub_pi32(tmp0l, tmp3l); \
+ tmp13h = _mm_sub_pi32(tmp0h, tmp3h); \
+ \
+ z23 = _mm_sub_pi16(row0l, row4l); \
+ tmp1l = _mm_loadlo_pi16_f(z23); \
+ tmp1h = _mm_loadhi_pi16_f(z23); \
+ tmp1l = _mm_srai_pi32(tmp1l, (16 - CONST_BITS)); \
+ tmp1h = _mm_srai_pi32(tmp1h, (16 - CONST_BITS)); \
+ \
+ tmp11l = _mm_add_pi32(tmp1l, tmp2l); \
+ tmp11h = _mm_add_pi32(tmp1h, tmp2h); \
+ tmp12l = _mm_sub_pi32(tmp1l, tmp2l); \
+ tmp12h = _mm_sub_pi32(tmp1h, tmp2h); \
+ \
+ /* Odd part */ \
+ \
+ tmp0 = row7l; \
+ tmp1 = row5l; \
+ tmp2 = row3l; \
+ tmp3 = row1l; \
+ \
+ DO_IDCT_COMMON(2) \
+ \
+ /* out0=(00 01 02 03), out1=(10 11 12 13) */ \
+ /* out2=(20 21 22 23), out3=(30 31 32 33) */ \
+ /* out4=(40 41 42 43), out5=(50 51 52 53) */ \
+ /* out6=(60 61 62 63), out7=(70 71 72 73) */ \
+ \
+ row06 = _mm_packs_pi16(out0, out6); /* row06=(00 01 02 03 60 61 62 63) */ \
+ row17 = _mm_packs_pi16(out1, out7); /* row17=(10 11 12 13 70 71 72 73) */ \
+ row24 = _mm_packs_pi16(out2, out4); /* row24=(20 21 22 23 40 41 42 43) */ \
+ row35 = _mm_packs_pi16(out3, out5); /* row35=(30 31 32 33 50 51 52 53) */ \
+ \
+ row06 = _mm_add_pi8(row06, PB_CENTERJSAMP); \
+ row17 = _mm_add_pi8(row17, PB_CENTERJSAMP); \
+ row24 = _mm_add_pi8(row24, PB_CENTERJSAMP); \
+ row35 = _mm_add_pi8(row35, PB_CENTERJSAMP); \
+ \
+ /* Transpose coefficients */ \
+ \
+ col0123a = _mm_unpacklo_pi8(row06, row17); /* col0123a=(00 10 01 11 02 12 03 13) */ \
+ col0123d = _mm_unpackhi_pi8(row06, row17); /* col0123d=(60 70 61 71 62 72 63 73) */ \
+ col0123b = _mm_unpacklo_pi8(row24, row35); /* col0123b=(20 30 21 31 22 32 23 33) */ \
+ col0123c = _mm_unpackhi_pi8(row24, row35); /* col0123c=(40 50 41 51 42 52 43 53) */ \
+ \
+ col01l = _mm_unpacklo_pi16(col0123a, col0123b); /* col01l=(00 10 20 30 01 11 21 31) */ \
+ col23l = _mm_unpackhi_pi16(col0123a, col0123b); /* col23l=(02 12 22 32 03 13 23 33) */ \
+ col01h = _mm_unpacklo_pi16(col0123c, col0123d); /* col01h=(40 50 60 70 41 51 61 71) */ \
+ col23h = _mm_unpackhi_pi16(col0123c, col0123d); /* col23h=(42 52 62 72 43 53 63 73) */ \
+ \
+ col0 = _mm_unpacklo_pi32(col01l, col01h); /* col0=(00 10 20 30 40 50 60 70) */ \
+ col1 = _mm_unpackhi_pi32(col01l, col01h); /* col1=(01 11 21 31 41 51 61 71) */ \
+ col2 = _mm_unpacklo_pi32(col23l, col23h); /* col2=(02 12 22 32 42 52 62 72) */ \
+ col3 = _mm_unpackhi_pi32(col23l, col23h); /* col3=(03 13 23 33 43 53 63 73) */ \
+ \
+ _mm_store_si64((__m64 *)(output_buf[ctr + 0] + output_col), col0); \
+ _mm_store_si64((__m64 *)(output_buf[ctr + 1] + output_col), col1); \
+ _mm_store_si64((__m64 *)(output_buf[ctr + 2] + output_col), col2); \
+ _mm_store_si64((__m64 *)(output_buf[ctr + 3] + output_col), col3); \
+}
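+
+/* Each DO_IDCT_PASS2 invocation runs the second 1-D IDCT over the workspace
+ * data, packs the results to bytes, re-centers them with PB_CENTERJSAMP,
+ * transposes, and stores four finished 8-sample rows to
+ * output_buf[ctr + 0..3] + output_col. */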
+
+void jsimd_idct_islow_mmi(void *dct_table, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+ __m64 tmp0, tmp1, tmp2, tmp3;
+ __m64 out0, out1, out2, out3, out4, out5, out6, out7;
+ JCOEFPTR inptr;
+ ISLOW_MULT_TYPE *quantptr;
+ JCOEF *wsptr;
+ JCOEF workspace[DCTSIZE2]; /* buffers data between passes */
+
+ /* Pass 1: process columns. */
+
+ inptr = coef_block;
+ quantptr = (ISLOW_MULT_TYPE *)dct_table;
+ wsptr = workspace;
+
+ DO_IDCT_PASS1(1)
+nextcolumn1:
+ inptr += 4;
+ quantptr += 4;
+ wsptr += DCTSIZE * 4;
+ DO_IDCT_PASS1(2)
+nextcolumn2:
+
+ /* Pass 2: process rows. */
+
+ wsptr = workspace;
+
+ DO_IDCT_PASS2(0)
+ wsptr += 4;
+ DO_IDCT_PASS2(4)
+}
diff --git a/media/libjpeg/simd/mips64/jquanti-mmi.c b/media/libjpeg/simd/mips64/jquanti-mmi.c
new file mode 100644
index 0000000000..339002fd80
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jquanti-mmi.c
@@ -0,0 +1,124 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
+ * All Rights Reserved.
+ * Authors: ZhuChen <zhuchen@loongson.cn>
+ * CaiWanwei <caiwanwei@loongson.cn>
+ * SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ * Copyright (C) 2018-2019, D. R. Commander. All Rights Reserved.
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* INTEGER QUANTIZATION AND SAMPLE CONVERSION */
+
+#include "jsimd_mmi.h"
+
+
+#define DO_QUANT() { \
+ __m64 rowl, rowh, rowls, rowhs, rowlsave, rowhsave; \
+ __m64 corrl, corrh, recipl, reciph, scalel, scaleh; \
+ \
+ rowl = _mm_load_si64((__m64 *)&workspace[0]); \
+ rowh = _mm_load_si64((__m64 *)&workspace[4]); \
+ \
+ /* Branch-less absolute value */ \
+ rowls = _mm_srai_pi16(rowl, (WORD_BIT - 1)); /* -1 if value < 0, */ \
+ /* 0 otherwise */ \
+ rowhs = _mm_srai_pi16(rowh, (WORD_BIT - 1)); \
+ \
+ rowl = _mm_xor_si64(rowl, rowls); /* val = -val */ \
+ rowh = _mm_xor_si64(rowh, rowhs); \
+ rowl = _mm_sub_pi16(rowl, rowls); \
+ rowh = _mm_sub_pi16(rowh, rowhs); \
+ \
+ corrl = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 1]); /* correction */ \
+ corrh = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 1 + 4]); \
+ \
+ rowlsave = rowl = _mm_add_pi16(rowl, corrl); /* correction + roundfactor */ \
+ rowhsave = rowh = _mm_add_pi16(rowh, corrh); \
+ \
+ recipl = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 0]); /* reciprocal */ \
+ reciph = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 0 + 4]); \
+ \
+ rowl = _mm_mulhi_pi16(rowl, recipl); \
+ rowh = _mm_mulhi_pi16(rowh, reciph); \
+ \
+ /* reciprocal is always negative (MSB=1), so we always need to add the */ \
+ /* initial value (input value is never negative as we inverted it at the */ \
+ /* start of this routine) */ \
+ rowlsave = rowl = _mm_add_pi16(rowl, rowlsave); \
+ rowhsave = rowh = _mm_add_pi16(rowh, rowhsave); \
+ \
+ scalel = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 2]); /* scale */ \
+ scaleh = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 2 + 4]); \
+ \
+ rowl = _mm_mulhi_pi16(rowl, scalel); \
+ rowh = _mm_mulhi_pi16(rowh, scaleh); \
+ \
+ /* determine if scale is negative */ \
+ scalel = _mm_srai_pi16(scalel, (WORD_BIT - 1)); \
+ scaleh = _mm_srai_pi16(scaleh, (WORD_BIT - 1)); \
+ \
+ /* and add input if it is */ \
+ scalel = _mm_and_si64(scalel, rowlsave); \
+ scaleh = _mm_and_si64(scaleh, rowhsave); \
+ rowl = _mm_add_pi16(rowl, scalel); \
+ rowh = _mm_add_pi16(rowh, scaleh); \
+ \
+ /* then check if negative input */ \
+ rowlsave = _mm_srai_pi16(rowlsave, (WORD_BIT - 1)); \
+ rowhsave = _mm_srai_pi16(rowhsave, (WORD_BIT - 1)); \
+ \
+ /* and add scale if it is */ \
+ rowlsave = _mm_and_si64(rowlsave, scalel); \
+ rowhsave = _mm_and_si64(rowhsave, scaleh); \
+ rowl = _mm_add_pi16(rowl, rowlsave); \
+ rowh = _mm_add_pi16(rowh, rowhsave); \
+ \
+ rowl = _mm_xor_si64(rowl, rowls); /* val = -val */ \
+ rowh = _mm_xor_si64(rowh, rowhs); \
+ rowl = _mm_sub_pi16(rowl, rowls); \
+ rowh = _mm_sub_pi16(rowh, rowhs); \
+ \
+ _mm_store_si64((__m64 *)&output_ptr[0], rowl); \
+ _mm_store_si64((__m64 *)&output_ptr[4], rowh); \
+ \
+ workspace += DCTSIZE; \
+ divisors += DCTSIZE; \
+ output_ptr += DCTSIZE; \
+}
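+
+/* Roughly, each DO_QUANT step computes, per 16-bit coefficient,
+ *
+ *   y = ((abs(x) + correction) * reciprocal >> 16) * scale >> 16
+ *
+ * and restores the original sign at the end; the conditional additions
+ * above compensate for pmulhh treating both of its operands as signed.
+ * The reciprocal/correction/scale planes are assumed to use the layout
+ * that the library's compute_reciprocal() emits (DCTSIZE2 words apiece).
+ */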
+
+
+void jsimd_quantize_mmi(JCOEFPTR coef_block, DCTELEM *divisors,
+ DCTELEM *workspace)
+{
+ JCOEFPTR output_ptr = coef_block;
+
+ DO_QUANT()
+ DO_QUANT()
+ DO_QUANT()
+ DO_QUANT()
+ DO_QUANT()
+ DO_QUANT()
+ DO_QUANT()
+ DO_QUANT()
+}
diff --git a/media/libjpeg/simd/mips64/jsimd.c b/media/libjpeg/simd/mips64/jsimd.c
new file mode 100644
index 0000000000..e8f1af562b
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jsimd.c
@@ -0,0 +1,870 @@
+/*
+ * jsimd.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2009-2011, 2014, 2016, 2018, D. R. Commander.
+ * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
+ * Copyright (C) 2015, 2018, Matthieu Darbois.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on a
+ * 64-bit MIPS architecture.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+
+static unsigned int simd_support = ~0;
+
+#if defined(__linux__)
+
+#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT (1024 * 1024)
+
+LOCAL(int)
+check_feature(char *buffer, char *feature)
+{
+ char *p;
+
+ if (*feature == 0)
+ return 0;
+ if (strncmp(buffer, "ASEs implemented", 16) != 0)
+ return 0;
+ buffer += 16;
+ while (isspace(*buffer))
+ buffer++;
+
+ /* Check if 'feature' is present in the buffer as a separate word */
+ while ((p = strstr(buffer, feature))) {
+ if (p > buffer && !isspace(*(p - 1))) {
+ buffer++;
+ continue;
+ }
+ p += strlen(feature);
+ if (*p != 0 && !isspace(*p)) {
+ buffer++;
+ continue;
+ }
+ return 1;
+ }
+ return 0;
+}
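+
+/* Example of the /proc/cpuinfo line this matches (format assumed from
+ * typical MIPS kernels):
+ *
+ *   ASEs implemented : vz msa loongson-mmi loongson-ext
+ */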
+
+LOCAL(int)
+parse_proc_cpuinfo(int bufsize)
+{
+ char *buffer = (char *)malloc(bufsize);
+ FILE *fd;
+
+ simd_support = 0;
+
+ if (!buffer)
+ return 0;
+
+ fd = fopen("/proc/cpuinfo", "r");
+ if (fd) {
+ while (fgets(buffer, bufsize, fd)) {
+ if (!strchr(buffer, '\n') && !feof(fd)) {
+ /* "impossible" happened - insufficient size of the buffer! */
+ fclose(fd);
+ free(buffer);
+ return 0;
+ }
+ if (check_feature(buffer, "loongson-mmi"))
+ simd_support |= JSIMD_MMI;
+ }
+ fclose(fd);
+ }
+ free(buffer);
+ return 1;
+}
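+
+/* Returns 1 once the whole file has been scanned, and 0 if a line did not
+ * fit in the buffer, in which case init_simd() retries with a doubled
+ * buffer size. */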
+
+#endif
+
+/*
+ * Check what SIMD accelerations are supported.
+ *
+ * FIXME: This code is racy in a multi-threaded environment.
+ */
+LOCAL(void)
+init_simd(void)
+{
+#ifndef NO_GETENV
+ char *env = NULL;
+#endif
+#if defined(__linux__)
+ int bufsize = 1024; /* an initial guess for the line buffer size limit */
+#endif
+
+ if (simd_support != ~0U)
+ return;
+
+ simd_support = 0;
+
+#if defined(__linux__)
+ while (!parse_proc_cpuinfo(bufsize)) {
+ bufsize *= 2;
+ if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT)
+ break;
+ }
+#elif defined(__mips_loongson_vector_rev)
+ /* Only enable MMI by default on non-Linux platforms when the compiler flags
+ * support it. */
+ simd_support |= JSIMD_MMI;
+#endif
+
+#ifndef NO_GETENV
+ /* Force different settings through environment variables */
+ env = getenv("JSIMD_FORCEMMI");
+ if ((env != NULL) && (strcmp(env, "1") == 0))
+ simd_support = JSIMD_MMI;
+ env = getenv("JSIMD_FORCENONE");
+ if ((env != NULL) && (strcmp(env, "1") == 0))
+ simd_support = 0;
+#endif
+}
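+
+/* For testing, the detection above can be overridden from the shell, e.g.:
+ *   JSIMD_FORCEMMI=1   force the MMI code paths on
+ *   JSIMD_FORCENONE=1  disable SIMD acceleration entirely
+ */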
+
+GLOBAL(int)
+jsimd_can_rgb_ycc(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+ return 0;
+
+ if (simd_support & JSIMD_MMI)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_rgb_gray(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+ return 0;
+
+ if (simd_support & JSIMD_MMI)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+ return 0;
+
+ if (simd_support & JSIMD_MMI)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb565(void)
+{
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_c_can_null_convert(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
+{
+ void (*mmifct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+ switch (cinfo->in_color_space) {
+ case JCS_EXT_RGB:
+ mmifct = jsimd_extrgb_ycc_convert_mmi;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ mmifct = jsimd_extrgbx_ycc_convert_mmi;
+ break;
+ case JCS_EXT_BGR:
+ mmifct = jsimd_extbgr_ycc_convert_mmi;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ mmifct = jsimd_extbgrx_ycc_convert_mmi;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ mmifct = jsimd_extxbgr_ycc_convert_mmi;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ mmifct = jsimd_extxrgb_ycc_convert_mmi;
+ break;
+ default:
+ mmifct = jsimd_rgb_ycc_convert_mmi;
+ break;
+ }
+
+ mmifct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
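+
+/* The same dispatch-by-colorspace pattern recurs for the gray and
+ * YCC-to-RGB converters below; only the underlying *_mmi kernel and the
+ * conversion direction differ. */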
+
+GLOBAL(void)
+jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
+{
+ void (*mmifct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+ switch (cinfo->in_color_space) {
+ case JCS_EXT_RGB:
+ mmifct = jsimd_extrgb_gray_convert_mmi;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ mmifct = jsimd_extrgbx_gray_convert_mmi;
+ break;
+ case JCS_EXT_BGR:
+ mmifct = jsimd_extbgr_gray_convert_mmi;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ mmifct = jsimd_extbgrx_gray_convert_mmi;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ mmifct = jsimd_extxbgr_gray_convert_mmi;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ mmifct = jsimd_extxrgb_gray_convert_mmi;
+ break;
+ default:
+ mmifct = jsimd_rgb_gray_convert_mmi;
+ break;
+ }
+
+ mmifct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
+{
+ void (*mmifct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+
+ switch (cinfo->out_color_space) {
+ case JCS_EXT_RGB:
+ mmifct = jsimd_ycc_extrgb_convert_mmi;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ mmifct = jsimd_ycc_extrgbx_convert_mmi;
+ break;
+ case JCS_EXT_BGR:
+ mmifct = jsimd_ycc_extbgr_convert_mmi;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ mmifct = jsimd_ycc_extbgrx_convert_mmi;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ mmifct = jsimd_ycc_extxbgr_convert_mmi;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ mmifct = jsimd_ycc_extxrgb_convert_mmi;
+ break;
+ default:
+ mmifct = jsimd_ycc_rgb_convert_mmi;
+ break;
+ }
+
+ mmifct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
+{
+}
+
+GLOBAL(void)
+jsimd_c_null_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
+{
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_downsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_MMI)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_smooth_downsample(void)
+{
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_downsample(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+ jsimd_h2v2_downsample_mmi(cinfo->image_width, cinfo->max_v_samp_factor,
+ compptr->v_samp_factor, compptr->width_in_blocks,
+ input_data, output_data);
+}
+
+GLOBAL(void)
+jsimd_h2v2_smooth_downsample(j_compress_ptr cinfo,
+ jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+}
+
+GLOBAL(void)
+jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_upsample(void)
+{
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_upsample(void)
+{
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_int_upsample(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+}
+
+GLOBAL(void)
+jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+}
+
+GLOBAL(void)
+jsimd_int_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_MMI)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_MMI)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h2v2_fancy_upsample_mmi(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data,
+ output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h2v1_fancy_upsample_mmi(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data,
+ output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_MMI)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_MMI)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+ void (*mmifct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+ switch (cinfo->out_color_space) {
+ case JCS_EXT_RGB:
+ mmifct = jsimd_h2v2_extrgb_merged_upsample_mmi;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ mmifct = jsimd_h2v2_extrgbx_merged_upsample_mmi;
+ break;
+ case JCS_EXT_BGR:
+ mmifct = jsimd_h2v2_extbgr_merged_upsample_mmi;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ mmifct = jsimd_h2v2_extbgrx_merged_upsample_mmi;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ mmifct = jsimd_h2v2_extxbgr_merged_upsample_mmi;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ mmifct = jsimd_h2v2_extxrgb_merged_upsample_mmi;
+ break;
+ default:
+ mmifct = jsimd_h2v2_merged_upsample_mmi;
+ break;
+ }
+
+ mmifct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(void)
+jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+ void (*mmifct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+ switch (cinfo->out_color_space) {
+ case JCS_EXT_RGB:
+ mmifct = jsimd_h2v1_extrgb_merged_upsample_mmi;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ mmifct = jsimd_h2v1_extrgbx_merged_upsample_mmi;
+ break;
+ case JCS_EXT_BGR:
+ mmifct = jsimd_h2v1_extbgr_merged_upsample_mmi;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ mmifct = jsimd_h2v1_extbgrx_merged_upsample_mmi;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ mmifct = jsimd_h2v1_extxbgr_merged_upsample_mmi;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ mmifct = jsimd_h2v1_extxrgb_merged_upsample_mmi;
+ break;
+ default:
+ mmifct = jsimd_h2v1_merged_upsample_mmi;
+ break;
+ }
+
+ mmifct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(int)
+jsimd_can_convsamp(void)
+{
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_convsamp_float(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
+ DCTELEM *workspace)
+{
+}
+
+GLOBAL(void)
+jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
+ FAST_FLOAT *workspace)
+{
+}
+
+GLOBAL(int)
+jsimd_can_fdct_islow(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_MMI)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_ifast(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_MMI)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_float(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_fdct_islow(DCTELEM *data)
+{
+ jsimd_fdct_islow_mmi(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_ifast(DCTELEM *data)
+{
+ jsimd_fdct_ifast_mmi(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_float(FAST_FLOAT *data)
+{
+}
+
+GLOBAL(int)
+jsimd_can_quantize(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_MMI)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_quantize_float(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
+{
+ jsimd_quantize_mmi(coef_block, divisors, workspace);
+}
+
+GLOBAL(void)
+jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+ FAST_FLOAT *workspace)
+{
+}
+
+GLOBAL(int)
+jsimd_can_idct_2x2(void)
+{
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_4x4(void)
+{
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_6x6(void)
+{
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_12x12(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+}
+
+GLOBAL(void)
+jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+}
+
+GLOBAL(void)
+jsimd_idct_6x6(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+}
+
+GLOBAL(void)
+jsimd_idct_12x12(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+}
+
+GLOBAL(int)
+jsimd_can_idct_islow(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(ISLOW_MULT_TYPE) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_MMI)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_ifast(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(IFAST_MULT_TYPE) != 2)
+ return 0;
+ if (IFAST_SCALE_BITS != 2)
+ return 0;
+
+ if (simd_support & JSIMD_MMI)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_float(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ jsimd_idct_islow_mmi(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ jsimd_idct_ifast_mmi(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+}
+
+GLOBAL(int)
+jsimd_can_huff_encode_one_block(void)
+{
+ return 0;
+}
+
+GLOBAL(JOCTET *)
+jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
+ int last_dc_val, c_derived_tbl *dctbl,
+ c_derived_tbl *actbl)
+{
+ return NULL;
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_first_prepare(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, JCOEF *values, size_t *zerobits)
+{
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_refine_prepare(void)
+{
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, JCOEF *absvalues, size_t *bits)
+{
+ return 0;
+}
diff --git a/media/libjpeg/simd/mips64/jsimd_mmi.h b/media/libjpeg/simd/mips64/jsimd_mmi.h
new file mode 100644
index 0000000000..5e4261c9d9
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jsimd_mmi.h
@@ -0,0 +1,69 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ * All Rights Reserved.
+ * Authors: ZhuChen <zhuchen@loongson.cn>
+ * CaiWanwei <caiwanwei@loongson.cn>
+ * SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ * QingfaLiu <liuqingfa-hf@loongson.cn>
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jdct.h"
+#include "loongson-mmintrin.h"
+
+
+/* Common code */
+#if defined(_ABI64) && _MIPS_SIM == _ABI64
+# define PTR_ADDU "daddu "
+# define PTR_SLL "dsll "
+#else
+# define PTR_ADDU "addu "
+# define PTR_SLL "sll "
+#endif
+
+#define SIZEOF_MMWORD 8
+#define BYTE_BIT 8
+#define WORD_BIT 16
+#define SCALEBITS 16
+
+#define _uint64_set_pi8(a, b, c, d, e, f, g, h) \
+ (((uint64_t)(uint8_t)a << 56) | \
+ ((uint64_t)(uint8_t)b << 48) | \
+ ((uint64_t)(uint8_t)c << 40) | \
+ ((uint64_t)(uint8_t)d << 32) | \
+ ((uint64_t)(uint8_t)e << 24) | \
+ ((uint64_t)(uint8_t)f << 16) | \
+ ((uint64_t)(uint8_t)g << 8) | \
+ ((uint64_t)(uint8_t)h))
+#define _uint64_set1_pi8(a) _uint64_set_pi8(a, a, a, a, a, a, a, a)
+#define _uint64_set_pi16(a, b, c, d) \
+ (((uint64_t)(uint16_t)a << 48) | \
+ ((uint64_t)(uint16_t)b << 32) | \
+ ((uint64_t)(uint16_t)c << 16) | \
+ ((uint64_t)(uint16_t)d))
+#define _uint64_set1_pi16(a) _uint64_set_pi16(a, a, a, a)
+#define _uint64_set_pi32(a, b) \
+ (((uint64_t)(uint32_t)a << 32) | \
+ ((uint64_t)(uint32_t)b))
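+
+/* Example: _uint64_set_pi16(4, 3, 2, 1) yields 0x0004000300020001, i.e.
+ * the last argument lands in the least-significant word, mirroring the
+ * element order of the pi16 intrinsics in loongson-mmintrin.h. */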
+
+#define get_const_value(index) (*(__m64 *)&const_value[index])
diff --git a/media/libjpeg/simd/mips64/loongson-mmintrin.h b/media/libjpeg/simd/mips64/loongson-mmintrin.h
new file mode 100644
index 0000000000..db9b35ab60
--- /dev/null
+++ b/media/libjpeg/simd/mips64/loongson-mmintrin.h
@@ -0,0 +1,1334 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ * All Rights Reserved.
+ * Copyright (C) 2019, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#ifndef __LOONGSON_MMINTRIN_H__
+#define __LOONGSON_MMINTRIN_H__
+
+#include <stdint.h>
+
+
+#define FUNCTION_ATTRIBS \
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+
+
+/* Vectors are stored in 64-bit floating-point registers. */
+typedef double __m64;
+
+/* Having a 32-bit datatype allows us to use 32-bit loads in places like
+ load8888. */
+typedef float __m32;
+
+
+/********** Set Operations **********/
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_setzero_si64(void)
+{
+ return 0.0;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_set_pi8(uint8_t __b7, uint8_t __b6, uint8_t __b5, uint8_t __b4,
+ uint8_t __b3, uint8_t __b2, uint8_t __b1, uint8_t __b0)
+{
+ __m64 ret;
+ uint32_t lo = ((uint32_t)__b6 << 24) |
+ ((uint32_t)__b4 << 16) |
+ ((uint32_t)__b2 << 8) |
+ (uint32_t)__b0;
+ uint32_t hi = ((uint32_t)__b7 << 24) |
+ ((uint32_t)__b5 << 16) |
+ ((uint32_t)__b3 << 8) |
+ (uint32_t)__b1;
+
+ asm("mtc1 %1, %0\n\t"
+ "mtc1 %2, $f0\n\t"
+ "punpcklbh %0, %0, $f0\n\t"
+ : "=f" (ret)
+ : "r" (lo), "r" (hi)
+ : "$f0"
+ );
+
+ return ret;
+}
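+
+/* The two mtc1 moves put the even-indexed bytes in one register and the
+ * odd-indexed bytes in another; punpcklbh then interleaves them back into
+ * byte order __b0..__b7. */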
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_set_pi16(uint16_t __h3, uint16_t __h2, uint16_t __h1, uint16_t __h0)
+{
+ __m64 ret;
+ uint32_t lo = ((uint32_t)__h2 << 16) | (uint32_t)__h0;
+ uint32_t hi = ((uint32_t)__h3 << 16) | (uint32_t)__h1;
+
+ asm("mtc1 %1, %0\n\t"
+ "mtc1 %2, $f0\n\t"
+ "punpcklhw %0, %0, $f0\n\t"
+ : "=f" (ret)
+ : "r" (lo), "r" (hi)
+ : "$f0"
+ );
+
+ return ret;
+}
+
+#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
+ (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_set_pi32(uint32_t __i1, uint32_t __i0)
+{
+ if (__builtin_constant_p(__i1) && __builtin_constant_p(__i0)) {
+ uint64_t val = ((uint64_t)__i1 << 32) |
+ ((uint64_t)__i0 << 0);
+
+ return *(__m64 *)&val;
+ } else if (__i1 == __i0) {
+ uint64_t imm = _MM_SHUFFLE(1, 0, 1, 0);
+ __m64 ret;
+
+ asm("pshufh %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (*(__m32 *)&__i1), "f" (*(__m64 *)&imm)
+ );
+
+ return ret;
+ } else {
+ uint64_t val = ((uint64_t)__i1 << 32) |
+ ((uint64_t)__i0 << 0);
+
+ return *(__m64 *)&val;
+ }
+}
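+
+/* When the two halves are equal but not compile-time constants, a single
+ * pshufh broadcast of the 32-bit value avoids assembling the 64-bit
+ * immediate in memory first. */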
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_set1_pi8(uint8_t __b0)
+{
+ __m64 ret;
+
+ asm("sll $8, %1, 8\n\t"
+ "or %1, %1, $8\n\t"
+ "mtc1 %1, %0\n\t"
+ "mtc1 $0, $f0\n\t"
+ "pshufh %0, %0, $f0\n\t"
+ : "=f" (ret)
+ : "r" (__b0)
+ : "$8", "$f0"
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_set1_pi16(uint16_t __h0)
+{
+ __m64 ret;
+
+ asm("mtc1 %1, %0\n\t"
+ "mtc1 $0, $f0\n\t"
+ "pshufh %0, %0, $f0\n\t"
+ : "=f" (ret)
+ : "r" (__h0)
+ : "$8", "$f0"
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_set1_pi32(unsigned __i0)
+{
+ return _mm_set_pi32(__i0, __i0);
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_setr_pi8(uint8_t __h0, uint8_t __h1, uint8_t __h2, uint8_t __h3,
+ uint8_t __h4, uint8_t __h5, uint8_t __h6, uint8_t __h7)
+{
+ return _mm_set_pi8(__h7, __h6, __h5, __h4,
+ __h3, __h2, __h1, __h0);
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_setr_pi16(uint16_t __w0, uint16_t __w1, uint16_t __w2, uint16_t __w3)
+{
+ return _mm_set_pi16(__w3, __w2, __w1, __w0);
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_setr_pi32(uint32_t __i0, uint32_t __i1)
+{
+ return _mm_set_pi32(__i1, __i0);
+}
+
+
+/********** Arithmetic Operations **********/
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_add_pi8(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("paddb %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_add_pi16(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("paddh %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_add_pi32(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("paddw %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_add_si64(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("paddd %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_adds_pi8(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("paddsb %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_adds_pi16(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("paddsh %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_adds_pu8(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("paddusb %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_adds_pu16(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("paddush %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_avg_pu8(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("pavgb %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_avg_pu16(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("pavgh %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_madd_pi16(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("pmaddhw %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_max_pi16(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("pmaxsh %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_max_pu8(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("pmaxub %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_min_pi16(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("pminsh %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_min_pu8(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("pminub %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline int FUNCTION_ATTRIBS
+_mm_movemask_pi8(__m64 __m1)
+{
+ int ret;
+
+ asm("pmovmskb %0, %1\n\t"
+ : "=r" (ret)
+ : "y" (__m1)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_mulhi_pi16(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("pmulhh %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_mulhi_pu16(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("pmulhuh %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_mullo_pi16(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("pmullh %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_mul_pu32(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("pmuluw %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_sad_pu8(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("psadbh %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_asub_pu8(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("pasubub %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_biadd_pu8(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("biadd %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_sub_pi8(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("psubb %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_sub_pi16(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("psubh %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_sub_pi32(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("psubw %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_sub_si64(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("psubd %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_subs_pi8(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("psubsb %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_subs_pi16(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("psubsh %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_subs_pu8(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("psubusb %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_subs_pu16(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("psubush %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+
+/********** Logical Operations **********/
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_and_si64(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("and %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_andnot_si64(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("andn %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
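+
+/* Assumed to follow the x86 intrinsic of the same name: the Loongson
+ * `andn` computes (~__m1) & __m2, complementing its first source operand. */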
+
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_or_si32(__m32 __m1, __m32 __m2)
+{
+ __m32 ret;
+
+ asm("or %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_or_si64(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("or %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_xor_si64(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("xor %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+
+/********** Shift Operations **********/
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_slli_pi16(__m64 __m, int64_t __count)
+{
+ __m64 ret;
+
+ asm("psllh %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m), "f" (*(__m64 *)&__count)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_slli_pi32(__m64 __m, int64_t __count)
+{
+ __m64 ret;
+
+ asm("psllw %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m), "f" (*(__m64 *)&__count)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_slli_si64(__m64 __m, int64_t __count)
+{
+ __m64 ret;
+
+ asm("dsll %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m), "f" (*(__m64 *)&__count)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_srli_pi16(__m64 __m, int64_t __count)
+{
+ __m64 ret;
+
+ asm("psrlh %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m), "f" (*(__m64 *)&__count)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_srli_pi32(__m64 __m, int64_t __count)
+{
+ __m64 ret;
+
+ asm("psrlw %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m), "f" (*(__m64 *)&__count)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_srli_si64(__m64 __m, int64_t __count)
+{
+ __m64 ret;
+
+ asm("dsrl %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m), "f" (*(__m64 *)&__count)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_srai_pi16(__m64 __m, int64_t __count)
+{
+ __m64 ret;
+
+ asm("psrah %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m), "f" (*(__m64 *)&__count)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_srai_pi32(__m64 __m, int64_t __count)
+{
+ __m64 ret;
+
+ asm("psraw %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m), "f" (*(__m64 *)&__count)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_srai_si64(__m64 __m, int64_t __count)
+{
+ __m64 ret;
+
+ asm("dsra %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m), "f" (*(__m64 *)&__count)
+ );
+
+ return ret;
+}
+
+
+/********** Conversion Intrinsics **********/
+
+extern __inline __m64 FUNCTION_ATTRIBS
+to_m64(uint64_t x)
+{
+ return *(__m64 *)&x;
+}
+
+extern __inline uint64_t FUNCTION_ATTRIBS
+to_uint64(__m64 x)
+{
+ return *(uint64_t *)&x;
+}
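+
+/* These helpers type-pun between the floating-point-register __m64 view and
+ * a plain uint64_t through pointer casts; strictly this bends C's aliasing
+ * rules, so builds are assumed to tolerate it (e.g. via GCC's
+ * -fno-strict-aliasing). */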
+
+
+/********** Comparison Intrinsics **********/
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("pcmpeqb %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("pcmpeqh %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("pcmpeqw %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("pcmpgtb %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("pcmpgth %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("pcmpgtw %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_cmplt_pi8(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("pcmpltb %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_cmplt_pi16(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("pcmplth %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_cmplt_pi32(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("pcmpltw %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+
+/********** Miscellaneous Operations **********/
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_packs_pi16(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("packsshb %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_packs_pi32(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("packsswh %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_packs_pi32_f(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("packsswh %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_packs_pu16(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("packushb %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_extract_pi16(__m64 __m, int64_t __pos)
+{
+ __m64 ret;
+
+ asm("pextrh %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m), "f" (*(__m64 *)&__pos)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_insert_pi16(__m64 __m1, __m64 __m2, int64_t __pos)
+{
+ __m64 ret;
+
+ switch (__pos) {
+ case 0:
+
+ asm("pinsrh_0 %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2), "i" (__pos)
+ );
+
+ break;
+
+ case 1:
+
+ asm("pinsrh_1 %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2), "i" (__pos)
+ );
+
+ break;
+ case 2:
+
+ asm("pinsrh_2 %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2), "i" (__pos)
+ );
+
+ break;
+
+ case 3:
+
+ asm("pinsrh_3 %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2), "i" (__pos)
+ );
+
+ break;
+ }
+
+ return ret;
+}
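+
+/* __pos must be a compile-time constant in 0..3: the insertion position is
+ * encoded in the pinsrh_N opcode itself, hence one case per instruction.
+ * For any other value, ret is returned uninitialized. */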
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_shuffle_pi16(__m64 __m, int64_t __n)
+{
+ __m64 ret;
+
+ asm("pshufh %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m), "f" (*(__m64 *)&__n)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpackhi_pi8(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("punpckhbh %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpackhi_pi8_f(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("punpckhbh %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpackhi_pi16(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("punpckhhw %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpackhi_pi16_f(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("punpckhhw %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpackhi_pi32(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("punpckhwd %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpacklo_pi8(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("punpcklbh %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+/* Since punpcklbh cares about the high 32 bits, we use the __m64 datatype,
+   which preserves the data. */
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpacklo_pi8_f64(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("punpcklbh %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+/* Since punpcklbh doesn't care about the high 32 bits, we use the __m32
+   datatype, which allows load8888 to use 32-bit loads. */
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpacklo_pi8_f(__m32 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("punpcklbh %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpacklo_pi16(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("punpcklhw %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpacklo_pi16_f(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("punpcklhw %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpacklo_pi32(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("punpcklwd %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpacklo_pi32_f(__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+
+ asm("punpcklwd %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+
+ return ret;
+}
+
+extern __inline void FUNCTION_ATTRIBS
+_mm_store_pi32(__m32 *dest, __m64 src)
+{
+ src = _mm_packs_pu16(src, _mm_setzero_si64());
+
+ asm("swc1 %1, %0\n\t"
+ : "=m" (*dest)
+ : "f" (src)
+ : "memory"
+ );
+}
+
+extern __inline void FUNCTION_ATTRIBS
+_mm_store_si64(__m64 *dest, __m64 src)
+{
+ asm("sdc1 %1, %0 \n\t"
+ : "=m" (*dest)
+ : "f" (src)
+ : "memory"
+ );
+}
+
+extern __inline void FUNCTION_ATTRIBS
+_mm_storeu_si64(__m64 *dest, __m64 src)
+{
+ asm("gssdlc1 %1, 7(%0) \n\t"
+ "gssdrc1 %1, 0(%0) \n\t"
+ :
+ : "r" (dest), "f" (src)
+ : "memory"
+ );
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_load_si32(const __m32 *src)
+{
+ __m32 ret;
+
+ asm("lwc1 %0, %1\n\t"
+ : "=f" (ret)
+ : "m" (*src)
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_load_si64(const __m64 *src)
+{
+ __m64 ret;
+
+ asm("ldc1 %0, %1\n\t"
+ : "=f" (ret)
+ : "m" (*src)
+ : "memory"
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_loadu_si64(const __m64 *src)
+{
+ __m64 ret;
+
+ asm("gsldlc1 %0, 7(%1)\n\t"
+ "gsldrc1 %0, 0(%1)\n\t"
+ : "=f" (ret)
+ : "r" (src)
+ : "memory"
+ );
+
+ return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_loadlo_pi8(const uint32_t *src)
+{
+ return _mm_unpacklo_pi8_f(*(__m32 *)src, _mm_setzero_si64());
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_loadlo_pi8_f(__m64 src)
+{
+ return _mm_unpacklo_pi8_f64(src, _mm_setzero_si64());
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_loadhi_pi8_f(__m64 src)
+{
+ return _mm_unpackhi_pi8_f(src, _mm_setzero_si64());
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_loadlo_pi16(__m64 src)
+{
+ return _mm_unpacklo_pi16(src, _mm_setzero_si64());
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_loadlo_pi16_f(__m64 src)
+{
+ return _mm_unpacklo_pi16_f(_mm_setzero_si64(), src);
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_loadhi_pi16(__m64 src)
+{
+ return _mm_unpackhi_pi16(src, _mm_setzero_si64());
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_loadhi_pi16_f(__m64 src)
+{
+ return _mm_unpackhi_pi16_f(_mm_setzero_si64(), src);
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_expand_alpha(__m64 pixel)
+{
+ return _mm_shuffle_pi16(pixel, _MM_SHUFFLE(3, 3, 3, 3));
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_expand_alpha_rev(__m64 pixel)
+{
+ return _mm_shuffle_pi16(pixel, _MM_SHUFFLE(0, 0, 0, 0));
+}
+
+#endif /* __LOONGSON_MMINTRIN_H__ */
diff --git a/media/libjpeg/simd/nasm/jcolsamp.inc b/media/libjpeg/simd/nasm/jcolsamp.inc
new file mode 100644
index 0000000000..6f6d7f29d1
--- /dev/null
+++ b/media/libjpeg/simd/nasm/jcolsamp.inc
@@ -0,0 +1,135 @@
+;
+; jcolsamp.inc - private declarations for color conversion & up/downsampling
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+
+; --------------------------------------------------------------------------
+
+; pseudo-registers to make ordering of RGB configurable
+;
+%if RGB_RED == 0
+%define mmA mm0
+%define mmB mm1
+%define xmmA xmm0
+%define xmmB xmm1
+%define ymmA ymm0
+%define ymmB ymm1
+%elif RGB_GREEN == 0
+%define mmA mm2
+%define mmB mm3
+%define xmmA xmm2
+%define xmmB xmm3
+%define ymmA ymm2
+%define ymmB ymm3
+%elif RGB_BLUE == 0
+%define mmA mm4
+%define mmB mm5
+%define xmmA xmm4
+%define xmmB xmm5
+%define ymmA ymm4
+%define ymmB ymm5
+%else
+%define mmA mm6
+%define mmB mm7
+%define xmmA xmm6
+%define xmmB xmm7
+%define ymmA ymm6
+%define ymmB ymm7
+%endif
+
+%if RGB_RED == 1
+%define mmC mm0
+%define mmD mm1
+%define xmmC xmm0
+%define xmmD xmm1
+%define ymmC ymm0
+%define ymmD ymm1
+%elif RGB_GREEN == 1
+%define mmC mm2
+%define mmD mm3
+%define xmmC xmm2
+%define xmmD xmm3
+%define ymmC ymm2
+%define ymmD ymm3
+%elif RGB_BLUE == 1
+%define mmC mm4
+%define mmD mm5
+%define xmmC xmm4
+%define xmmD xmm5
+%define ymmC ymm4
+%define ymmD ymm5
+%else
+%define mmC mm6
+%define mmD mm7
+%define xmmC xmm6
+%define xmmD xmm7
+%define ymmC ymm6
+%define ymmD ymm7
+%endif
+
+%if RGB_RED == 2
+%define mmE mm0
+%define mmF mm1
+%define xmmE xmm0
+%define xmmF xmm1
+%define ymmE ymm0
+%define ymmF ymm1
+%elif RGB_GREEN == 2
+%define mmE mm2
+%define mmF mm3
+%define xmmE xmm2
+%define xmmF xmm3
+%define ymmE ymm2
+%define ymmF ymm3
+%elif RGB_BLUE == 2
+%define mmE mm4
+%define mmF mm5
+%define xmmE xmm4
+%define xmmF xmm5
+%define ymmE ymm4
+%define ymmF ymm5
+%else
+%define mmE mm6
+%define mmF mm7
+%define xmmE xmm6
+%define xmmF xmm7
+%define ymmE ymm6
+%define ymmF ymm7
+%endif
+
+%if RGB_RED == 3
+%define mmG mm0
+%define mmH mm1
+%define xmmG xmm0
+%define xmmH xmm1
+%define ymmG ymm0
+%define ymmH ymm1
+%elif RGB_GREEN == 3
+%define mmG mm2
+%define mmH mm3
+%define xmmG xmm2
+%define xmmH xmm3
+%define ymmG ymm2
+%define ymmH ymm3
+%elif RGB_BLUE == 3
+%define mmG mm4
+%define mmH mm5
+%define xmmG xmm4
+%define xmmH xmm5
+%define ymmG ymm4
+%define ymmH ymm5
+%else
+%define mmG mm6
+%define mmH mm7
+%define xmmG xmm6
+%define xmmH xmm7
+%define ymmG ymm6
+%define ymmH ymm7
+%endif
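+
+; Worked example (illustrative, not upstream text).  With the classic
+; ordering RGB_RED = 0, RGB_GREEN = 1, RGB_BLUE = 2 (RGB_PIXELSIZE = 3),
+; the aliases above resolve to:
+;
+;   mmA/mmB (xmmA/xmmB, ymmA/ymmB) = mm0/mm1  ; component 0: red
+;   mmC/mmD (xmmC/xmmD, ymmC/ymmD) = mm2/mm3  ; component 1: green
+;   mmE/mmF (xmmE/xmmF, ymmE/ymmF) = mm4/mm5  ; component 2: blue
+;   mmG/mmH (xmmG/xmmH, ymmG/ymmH) = mm6/mm7  ; no component 3 (%else case)
+;
+; The conversion code is written once against mmA..mmH, and each
+; RGB/BGR/RGBX/XBGR/... variant gets the right register assignment for free.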
+
+; --------------------------------------------------------------------------
diff --git a/media/libjpeg/simd/jdct.inc b/media/libjpeg/simd/nasm/jdct.inc
index b9761071e9..9192f66f0c 100644
--- a/media/libjpeg/simd/jdct.inc
+++ b/media/libjpeg/simd/nasm/jdct.inc
@@ -2,12 +2,11 @@
; jdct.inc - private declarations for forward & reverse DCT subsystems
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2018, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; [TAB8]
; Each IDCT routine is responsible for range-limiting its results and
; converting them to unsigned form (0..MAXJSAMPLE). The raw outputs could
@@ -17,11 +16,16 @@
;
%define RANGE_MASK (MAXJSAMPLE * 4 + 3) ; 2 bits wider than legal samples
-%define ROW(n,b,s) ((b)+(n)*(s))
-%define COL(n,b,s) ((b)+(n)*(s)*DCTSIZE)
+%define ROW(n, b, s) ((b) + (n) * (s))
+%define COL(n, b, s) ((b) + (n) * (s) * DCTSIZE)
-%define DWBLOCK(m,n,b,s) ((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_DWORD)
-%define MMBLOCK(m,n,b,s) ((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_MMWORD)
-%define XMMBLOCK(m,n,b,s) ((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_XMMWORD)
+%define DWBLOCK(m, n, b, s) \
+ ((b) + (m) * DCTSIZE * (s) + (n) * SIZEOF_DWORD)
+%define MMBLOCK(m, n, b, s) \
+ ((b) + (m) * DCTSIZE * (s) + (n) * SIZEOF_MMWORD)
+%define XMMBLOCK(m, n, b, s) \
+ ((b) + (m) * DCTSIZE * (s) + (n) * SIZEOF_XMMWORD)
+%define YMMBLOCK(m, n, b, s) \
+ ((b) + (m) * DCTSIZE * (s) + (n) * SIZEOF_YMMWORD)
; --------------------------------------------------------------------------
diff --git a/media/libjpeg/simd/jsimdcfg.inc b/media/libjpeg/simd/nasm/jsimdcfg.inc
index 9d4aedec9e..667024a5f9 100755..100644
--- a/media/libjpeg/simd/jsimdcfg.inc
+++ b/media/libjpeg/simd/nasm/jsimdcfg.inc
@@ -90,5 +90,4 @@
%define JSIMD_3DNOW 0x02
%define JSIMD_SSE 0x04
%define JSIMD_SSE2 0x08
-; Short forms of external names for systems with brain-damaged linkers.
-;
+%define JSIMD_AVX2 0x80
diff --git a/media/libjpeg/simd/nasm/jsimdcfg.inc.h b/media/libjpeg/simd/nasm/jsimdcfg.inc.h
new file mode 100644
index 0000000000..bf2a45ad50
--- /dev/null
+++ b/media/libjpeg/simd/nasm/jsimdcfg.inc.h
@@ -0,0 +1,133 @@
+/*
+ * This file generates the include file for the assembly
+ * implementations by abusing the C preprocessor.
+ *
+ * Note: Some things are manually defined as they need to
+ * be mapped to NASM types.
+ */
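+
+/* Assumed sketch of the generation step (not part of this header): the C
+ * preprocessor leaves the single-token left-hand sides alone but expands
+ * the right-hand sides, and the build then strips the protective prefix,
+ * roughly:
+ *
+ *   cpp -DJPEG_INTERNALS jsimdcfg.inc.h | sed 's/_cpp_protection_//g' \
+ *       > jsimdcfg.inc
+ *
+ * so "%define _cpp_protection_DCTSIZE DCTSIZE" comes out as
+ * "%define DCTSIZE 8".  The exact command line is build-system specific;
+ * this is only an illustration.
+ */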
+
+;
+; Automatically generated include file from jsimdcfg.inc.h
+;
+
+#define JPEG_INTERNALS
+
+#include "../jpeglib.h"
+#include "../jconfig.h"
+#include "../jmorecfg.h"
+#include "jsimd.h"
+
+;
+; -- jpeglib.h
+;
+
+%define _cpp_protection_DCTSIZE DCTSIZE
+%define _cpp_protection_DCTSIZE2 DCTSIZE2
+
+;
+; -- jmorecfg.h
+;
+
+%define _cpp_protection_RGB_RED RGB_RED
+%define _cpp_protection_RGB_GREEN RGB_GREEN
+%define _cpp_protection_RGB_BLUE RGB_BLUE
+%define _cpp_protection_RGB_PIXELSIZE RGB_PIXELSIZE
+
+%define _cpp_protection_EXT_RGB_RED EXT_RGB_RED
+%define _cpp_protection_EXT_RGB_GREEN EXT_RGB_GREEN
+%define _cpp_protection_EXT_RGB_BLUE EXT_RGB_BLUE
+%define _cpp_protection_EXT_RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+
+%define _cpp_protection_EXT_RGBX_RED EXT_RGBX_RED
+%define _cpp_protection_EXT_RGBX_GREEN EXT_RGBX_GREEN
+%define _cpp_protection_EXT_RGBX_BLUE EXT_RGBX_BLUE
+%define _cpp_protection_EXT_RGBX_PIXELSIZE EXT_RGBX_PIXELSIZE
+
+%define _cpp_protection_EXT_BGR_RED EXT_BGR_RED
+%define _cpp_protection_EXT_BGR_GREEN EXT_BGR_GREEN
+%define _cpp_protection_EXT_BGR_BLUE EXT_BGR_BLUE
+%define _cpp_protection_EXT_BGR_PIXELSIZE EXT_BGR_PIXELSIZE
+
+%define _cpp_protection_EXT_BGRX_RED EXT_BGRX_RED
+%define _cpp_protection_EXT_BGRX_GREEN EXT_BGRX_GREEN
+%define _cpp_protection_EXT_BGRX_BLUE EXT_BGRX_BLUE
+%define _cpp_protection_EXT_BGRX_PIXELSIZE EXT_BGRX_PIXELSIZE
+
+%define _cpp_protection_EXT_XBGR_RED EXT_XBGR_RED
+%define _cpp_protection_EXT_XBGR_GREEN EXT_XBGR_GREEN
+%define _cpp_protection_EXT_XBGR_BLUE EXT_XBGR_BLUE
+%define _cpp_protection_EXT_XBGR_PIXELSIZE EXT_XBGR_PIXELSIZE
+
+%define _cpp_protection_EXT_XRGB_RED EXT_XRGB_RED
+%define _cpp_protection_EXT_XRGB_GREEN EXT_XRGB_GREEN
+%define _cpp_protection_EXT_XRGB_BLUE EXT_XRGB_BLUE
+%define _cpp_protection_EXT_XRGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+
+%define RGBX_FILLER_0XFF 1
+
+; Representation of a single sample (pixel element value).
+; On this SIMD implementation, this must be 'unsigned char'.
+;
+
+%define JSAMPLE byte ; unsigned char
+%define SIZEOF_JSAMPLE SIZEOF_BYTE ; sizeof(JSAMPLE)
+
+%define _cpp_protection_CENTERJSAMPLE CENTERJSAMPLE
+
+; Representation of a DCT frequency coefficient.
+; On this SIMD implementation, this must be 'short'.
+;
+%define JCOEF word ; short
+%define SIZEOF_JCOEF SIZEOF_WORD ; sizeof(JCOEF)
+
+; Datatype used for image dimensions.
+; On this SIMD implementation, this must be 'unsigned int'.
+;
+%define JDIMENSION dword ; unsigned int
+%define SIZEOF_JDIMENSION SIZEOF_DWORD ; sizeof(JDIMENSION)
+
+%define JSAMPROW POINTER ; JSAMPLE * (jpeglib.h)
+%define JSAMPARRAY POINTER ; JSAMPROW * (jpeglib.h)
+%define JSAMPIMAGE POINTER ; JSAMPARRAY * (jpeglib.h)
+%define JCOEFPTR POINTER ; JCOEF * (jpeglib.h)
+%define SIZEOF_JSAMPROW SIZEOF_POINTER ; sizeof(JSAMPROW)
+%define SIZEOF_JSAMPARRAY SIZEOF_POINTER ; sizeof(JSAMPARRAY)
+%define SIZEOF_JSAMPIMAGE SIZEOF_POINTER ; sizeof(JSAMPIMAGE)
+%define SIZEOF_JCOEFPTR SIZEOF_POINTER ; sizeof(JCOEFPTR)
+
+;
+; -- jdct.h
+;
+
+; A forward DCT routine is given a pointer to a work area of type DCTELEM[];
+; the DCT is to be performed in-place in that buffer.
+; To maximize parallelism, the DCTELEM type is changed to short (originally, int).
+;
+%define DCTELEM word ; short
+%define SIZEOF_DCTELEM SIZEOF_WORD ; sizeof(DCTELEM)
+
+%define FAST_FLOAT FP32 ; float
+%define SIZEOF_FAST_FLOAT SIZEOF_FP32 ; sizeof(FAST_FLOAT)
+
+; To maximize parallelism, the MULTIPLIER type is changed to short.
+;
+%define ISLOW_MULT_TYPE word ; must be short
+%define SIZEOF_ISLOW_MULT_TYPE SIZEOF_WORD ; sizeof(ISLOW_MULT_TYPE)
+
+%define IFAST_MULT_TYPE word ; must be short
+%define SIZEOF_IFAST_MULT_TYPE SIZEOF_WORD ; sizeof(IFAST_MULT_TYPE)
+%define IFAST_SCALE_BITS 2 ; fractional bits in scale factors
+
+%define FLOAT_MULT_TYPE FP32 ; must be float
+%define SIZEOF_FLOAT_MULT_TYPE SIZEOF_FP32 ; sizeof(FLOAT_MULT_TYPE)
+
+;
+; -- jsimd.h
+;
+
+%define _cpp_protection_JSIMD_NONE JSIMD_NONE
+%define _cpp_protection_JSIMD_MMX JSIMD_MMX
+%define _cpp_protection_JSIMD_3DNOW JSIMD_3DNOW
+%define _cpp_protection_JSIMD_SSE JSIMD_SSE
+%define _cpp_protection_JSIMD_SSE2 JSIMD_SSE2
+%define _cpp_protection_JSIMD_AVX2 JSIMD_AVX2
diff --git a/media/libjpeg/simd/nasm/jsimdext.inc b/media/libjpeg/simd/nasm/jsimdext.inc
new file mode 100644
index 0000000000..e8d50b0349
--- /dev/null
+++ b/media/libjpeg/simd/nasm/jsimdext.inc
@@ -0,0 +1,520 @@
+;
+; jsimdext.inc - common declarations
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2010, 2016, 2018-2019, D. R. Commander.
+; Copyright (C) 2018, Matthieu Darbois.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library - version 1.02
+;
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+;
+; This software is provided 'as-is', without any express or implied
+; warranty. In no event will the authors be held liable for any damages
+; arising from the use of this software.
+;
+; Permission is granted to anyone to use this software for any purpose,
+; including commercial applications, and to alter it and redistribute it
+; freely, subject to the following restrictions:
+;
+; 1. The origin of this software must not be misrepresented; you must not
+; claim that you wrote the original software. If you use this software
+; in a product, an acknowledgment in the product documentation would be
+; appreciated but is not required.
+; 2. Altered source versions must be plainly marked as such, and must not be
+; misrepresented as being the original software.
+; 3. This notice may not be removed or altered from any source distribution.
+
+; ==========================================================================
+; System-dependent configurations
+
+%ifdef WIN32 ; ----(nasm -fwin32 -DWIN32 ...)--------
+; * Microsoft Visual C++
+; * MinGW (Minimalist GNU for Windows)
+; * CygWin
+; * LCC-Win32
+
+; -- segment definition --
+;
+%ifdef __YASM_VER__
+%define SEG_TEXT .text align=32
+%define SEG_CONST .rdata align=32
+%else
+%define SEG_TEXT .text align=32 public use32 class=CODE
+%define SEG_CONST .rdata align=32 public use32 class=CONST
+%endif
+
+%elifdef WIN64 ; ----(nasm -fwin64 -DWIN64 ...)--------
+; * Microsoft Visual C++
+
+; -- segment definition --
+;
+%ifdef __YASM_VER__
+%define SEG_TEXT .text align=32
+%define SEG_CONST .rdata align=32
+%else
+%define SEG_TEXT .text align=32 public use64 class=CODE
+%define SEG_CONST .rdata align=32 public use64 class=CONST
+%endif
+%define EXTN(name) name ; foo() -> foo
+
+%elifdef OBJ32 ; ----(nasm -fobj -DOBJ32 ...)----------
+; * Borland C++ (Win32)
+
+; -- segment definition --
+;
+%define SEG_TEXT _text align=32 public use32 class=CODE
+%define SEG_CONST _data align=32 public use32 class=DATA
+
+%elifdef ELF ; ----(nasm -felf[64] -DELF ...)------------
+; * Linux
+; * *BSD family Unix using elf format
+; * Unix System V, including Solaris x86, UnixWare and SCO Unix
+
+; mark stack as non-executable
+section .note.GNU-stack noalloc noexec nowrite progbits
+
+; -- segment definition --
+;
+%ifdef __x86_64__
+%define SEG_TEXT .text progbits align=32
+%define SEG_CONST .rodata progbits align=32
+%else
+%define SEG_TEXT .text progbits alloc exec nowrite align=32
+%define SEG_CONST .rodata progbits alloc noexec nowrite align=32
+%endif
+
+; To make the code position-independent, append -DPIC to the commandline
+;
+%define GOT_SYMBOL _GLOBAL_OFFSET_TABLE_ ; ELF supports PIC
+%define EXTN(name) name ; foo() -> foo
+
+%elifdef AOUT ; ----(nasm -faoutb/aout -DAOUT ...)----
+; * Older Linux using a.out format (nasm -f aout -DAOUT ...)
+; * *BSD family Unix using a.out format (nasm -f aoutb -DAOUT ...)
+
+; -- segment definition --
+;
+%define SEG_TEXT .text
+%define SEG_CONST .data
+
+; To make the code position-independent, append -DPIC to the commandline
+;
+%define GOT_SYMBOL __GLOBAL_OFFSET_TABLE_ ; BSD-style a.out supports PIC
+
+%elifdef MACHO ; ----(nasm -fmacho -DMACHO ...)--------
+; * NeXTstep/OpenStep/Rhapsody/Darwin/MacOS X (Mach-O format)
+
+; -- segment definition --
+;
+%define SEG_TEXT .text ;align=32 ; nasm doesn't accept align=32. why?
+%define SEG_CONST .rodata align=32
+
+; The generation of position-independent code (PIC) is the default on Darwin.
+;
+%define PIC
+%define GOT_SYMBOL _MACHO_PIC_ ; Mach-O style code-relative addressing
+
+%else ; ----(Other case)----------------------
+
+; -- segment definition --
+;
+%define SEG_TEXT .text
+%define SEG_CONST .data
+
+%endif ; ----------------------------------------------
+
+; ==========================================================================
+
+; --------------------------------------------------------------------------
+; Common types
+;
+%ifdef __x86_64__
+%ifnidn __OUTPUT_FORMAT__, elfx32
+%define POINTER qword ; general pointer type
+%define SIZEOF_POINTER SIZEOF_QWORD ; sizeof(POINTER)
+%define POINTER_BIT QWORD_BIT ; sizeof(POINTER)*BYTE_BIT
+%define resp resq
+%define dp dq
+%define raxp rax
+%define rbxp rbx
+%define rcxp rcx
+%define rdxp rdx
+%define rsip rsi
+%define rdip rdi
+%define rbpp rbp
+%define rspp rsp
+%define r8p r8
+%define r9p r9
+%define r10p r10
+%define r11p r11
+%define r12p r12
+%define r13p r13
+%define r14p r14
+%define r15p r15
+%endif
+%endif
+%ifndef raxp
+%define POINTER dword ; general pointer type
+%define SIZEOF_POINTER SIZEOF_DWORD ; sizeof(POINTER)
+%define POINTER_BIT DWORD_BIT ; sizeof(POINTER)*BYTE_BIT
+%define resp resd
+%define dp dd
+; x86_64 ILP32 ABI (x32)
+%define raxp eax
+%define rbxp ebx
+%define rcxp ecx
+%define rdxp edx
+%define rsip esi
+%define rdip edi
+%define rbpp ebp
+%define rspp esp
+%define r8p r8d
+%define r9p r9d
+%define r10p r10d
+%define r11p r11d
+%define r12p r12d
+%define r13p r13d
+%define r14p r14d
+%define r15p r15d
+%endif
+
+%define INT dword ; signed integer type
+%define SIZEOF_INT SIZEOF_DWORD ; sizeof(INT)
+%define INT_BIT DWORD_BIT ; sizeof(INT)*BYTE_BIT
+
+%define FP32 dword ; IEEE754 single
+%define SIZEOF_FP32 SIZEOF_DWORD ; sizeof(FP32)
+%define FP32_BIT DWORD_BIT ; sizeof(FP32)*BYTE_BIT
+
+%define MMWORD qword ; int64 (MMX register)
+%define SIZEOF_MMWORD SIZEOF_QWORD ; sizeof(MMWORD)
+%define MMWORD_BIT QWORD_BIT ; sizeof(MMWORD)*BYTE_BIT
+
+; NASM is buggy and doesn't properly handle operand sizes for SSE
+; instructions, so for now we have to define XMMWORD as blank.
+%define XMMWORD ; int128 (SSE register)
+%define SIZEOF_XMMWORD SIZEOF_OWORD ; sizeof(XMMWORD)
+%define XMMWORD_BIT OWORD_BIT ; sizeof(XMMWORD)*BYTE_BIT
+
+%define YMMWORD ; int256 (AVX register)
+%define SIZEOF_YMMWORD SIZEOF_YWORD ; sizeof(YMMWORD)
+%define YMMWORD_BIT YWORD_BIT ; sizeof(YMMWORD)*BYTE_BIT
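+
+; Illustration (not upstream text): with the blank defines above, a source
+; line such as
+;   movdqa  XMMWORD [rsp], xmm6
+; assembles as "movdqa [rsp], xmm6" -- the size annotation survives as
+; documentation while NASM infers the operand width from the register.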
+
+; Similar hacks for when we load a dword or MMWORD into an xmm# register
+%define XMM_DWORD
+%define XMM_MMWORD
+
+%define SIZEOF_BYTE 1 ; sizeof(byte)
+%define SIZEOF_WORD 2 ; sizeof(word)
+%define SIZEOF_DWORD 4 ; sizeof(dword)
+%define SIZEOF_QWORD 8 ; sizeof(qword)
+%define SIZEOF_OWORD 16 ; sizeof(oword)
+%define SIZEOF_YWORD 32 ; sizeof(yword)
+
+%define BYTE_BIT 8 ; CHAR_BIT in C
+%define WORD_BIT 16 ; sizeof(word)*BYTE_BIT
+%define DWORD_BIT 32 ; sizeof(dword)*BYTE_BIT
+%define QWORD_BIT 64 ; sizeof(qword)*BYTE_BIT
+%define OWORD_BIT 128 ; sizeof(oword)*BYTE_BIT
+%define YWORD_BIT 256 ; sizeof(yword)*BYTE_BIT
+
+; --------------------------------------------------------------------------
+; External Symbol Name
+;
+%ifndef EXTN
+%define EXTN(name) _ %+ name ; foo() -> _foo
+%endif
+
+; --------------------------------------------------------------------------
+; Hidden symbols
+;
+%ifdef ELF ; ----(nasm -felf[64] -DELF ...)--------
+%define GLOBAL_FUNCTION(name) global EXTN(name):function hidden
+%define GLOBAL_DATA(name) global EXTN(name):data hidden
+%elifdef MACHO ; ----(nasm -fmacho -DMACHO ...)--------
+%ifdef __YASM_VER__
+%define GLOBAL_FUNCTION(name) global EXTN(name):private_extern
+%define GLOBAL_DATA(name) global EXTN(name):private_extern
+%else
+%if __NASM_VERSION_ID__ >= 0x020E0000
+%define GLOBAL_FUNCTION(name) global EXTN(name):private_extern
+%define GLOBAL_DATA(name) global EXTN(name):private_extern
+%endif
+%endif
+%endif
+
+%ifndef GLOBAL_FUNCTION
+%define GLOBAL_FUNCTION(name) global EXTN(name)
+%endif
+%ifndef GLOBAL_DATA
+%define GLOBAL_DATA(name) global EXTN(name)
+%endif
+
+; --------------------------------------------------------------------------
+; Macros for position-independent code (PIC) support
+;
+%ifndef GOT_SYMBOL
+%undef PIC
+%endif
+
+%ifdef PIC ; -------------------------------------------
+
+%ifidn GOT_SYMBOL, _MACHO_PIC_ ; --------------------
+
+; At present, nasm doesn't seem to support PIC generation for Mach-O.
+; The PIC support code below is a little tricky.
+
+ SECTION SEG_CONST
+const_base:
+
+%define GOTOFF(got, sym) (got) + (sym) - const_base
+
+%imacro get_GOT 1
+  ; NOTE: this macro destroys the ecx register.
+ call %%geteip
+ add ecx, byte (%%ref - $)
+ jmp short %%adjust
+%%geteip:
+ mov ecx, POINTER [esp]
+ ret
+%%adjust:
+ push ebp
+ xor ebp, ebp ; ebp = 0
+%ifidni %1, ebx ; (%1 == ebx)
+ ; db 0x8D,0x9C + jmp near const_base =
+ ; lea ebx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,9C,E9,(offset32)
+ db 0x8D, 0x9C ; 8D,9C
+ jmp near const_base ; E9,(const_base-%%ref)
+%%ref:
+%else ; (%1 != ebx)
+ ; db 0x8D,0x8C + jmp near const_base =
+ ; lea ecx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,8C,E9,(offset32)
+ db 0x8D, 0x8C ; 8D,8C
+ jmp near const_base ; E9,(const_base-%%ref)
+%%ref:
+ mov %1, ecx
+%endif ; (%1 == ebx)
+ pop ebp
+%endmacro
+
+%else ; GOT_SYMBOL != _MACHO_PIC_ ----------------
+
+%define GOTOFF(got, sym) (got) + (sym) wrt ..gotoff
+
+%imacro get_GOT 1
+ extern GOT_SYMBOL
+ call %%geteip
+ add %1, GOT_SYMBOL + $$ - $ wrt ..gotpc
+ jmp short %%done
+%%geteip:
+ mov %1, POINTER [esp]
+ ret
+%%done:
+%endmacro
+
+%endif ; GOT_SYMBOL == _MACHO_PIC_ ----------------
+
+%imacro pushpic 1.nolist
+ push %1
+%endmacro
+%imacro poppic 1.nolist
+ pop %1
+%endmacro
+%imacro movpic 2.nolist
+ mov %1, %2
+%endmacro
+
+%else ; !PIC -----------------------------------------
+
+%define GOTOFF(got, sym) (sym)
+
+%imacro get_GOT 1.nolist
+%endmacro
+%imacro pushpic 1.nolist
+%endmacro
+%imacro poppic 1.nolist
+%endmacro
+%imacro movpic 2.nolist
+%endmacro
+
+%endif ; PIC -----------------------------------------
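+
+; Hypothetical usage sketch (not upstream text).  PIC-safe code loads the
+; GOT address once and then addresses constant tables relative to it:
+;
+;   get_GOT ebx                              ; clobbers ecx on Mach-O
+;   movdqa  xmm7, XMMWORD [GOTOFF(ebx, jconst_example)]
+;
+; where jconst_example stands for some constant-pool symbol.  When PIC is
+; not defined, get_GOT expands to nothing and GOTOFF(got, sym) collapses
+; to (sym), so the same source assembles to absolute addressing.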
+
+; --------------------------------------------------------------------------
+; Align the next instruction on {2,4,8,16,..}-byte boundary.
+; ".balign n,,m" in GNU as
+;
+%define MSKLE(x, y) (~(((y) & 0xFFFF) - ((x) & 0xFFFF)) >> 16)
+%define FILLB(b, n) (($$-(b)) & ((n)-1))
+
+%imacro alignx 1-2.nolist 0xFFFF
+%%bs: \
+ times MSKLE(FILLB(%%bs, %1), %2) & MSKLE(16, FILLB($, %1)) & FILLB($, %1) \
+ db 0x90 ; nop
+ times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 9 \
+ db 0x8D, 0x9C, 0x23, 0x00, 0x00, 0x00, 0x00 ; lea ebx,[ebx+0x00000000]
+ times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 7 \
+ db 0x8D, 0xAC, 0x25, 0x00, 0x00, 0x00, 0x00 ; lea ebp,[ebp+0x00000000]
+ times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 6 \
+ db 0x8D, 0xAD, 0x00, 0x00, 0x00, 0x00 ; lea ebp,[ebp+0x00000000]
+ times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 4 \
+ db 0x8D, 0x6C, 0x25, 0x00 ; lea ebp,[ebp+0x00]
+ times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 3 \
+ db 0x8D, 0x6D, 0x00 ; lea ebp,[ebp+0x00]
+ times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 2 \
+ db 0x8B, 0xED ; mov ebp,ebp
+ times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 1 \
+ db 0x90 ; nop
+%endmacro
+
+; Align the next data on {2,4,8,16,..}-byte boundary.
+;
+%imacro alignz 1.nolist
+ align %1, db 0 ; filling zeros
+%endmacro
+
+%ifdef __x86_64__
+
+%ifdef WIN64
+
+%imacro collect_args 1
+ sub rsp, SIZEOF_XMMWORD
+ movaps XMMWORD [rsp], xmm6
+ sub rsp, SIZEOF_XMMWORD
+ movaps XMMWORD [rsp], xmm7
+ mov r10, rcx
+%if %1 > 1
+ mov r11, rdx
+%endif
+%if %1 > 2
+ push r12
+ mov r12, r8
+%endif
+%if %1 > 3
+ push r13
+ mov r13, r9
+%endif
+%if %1 > 4
+ push r14
+ mov r14, [rax+48]
+%endif
+%if %1 > 5
+ push r15
+ mov r15, [rax+56]
+%endif
+ push rsi
+ push rdi
+%endmacro
+
+%imacro uncollect_args 1
+ pop rdi
+ pop rsi
+%if %1 > 5
+ pop r15
+%endif
+%if %1 > 4
+ pop r14
+%endif
+%if %1 > 3
+ pop r13
+%endif
+%if %1 > 2
+ pop r12
+%endif
+ movaps xmm7, XMMWORD [rsp]
+ add rsp, SIZEOF_XMMWORD
+ movaps xmm6, XMMWORD [rsp]
+ add rsp, SIZEOF_XMMWORD
+%endmacro
+
+%imacro push_xmm 1
+ sub rsp, %1 * SIZEOF_XMMWORD
+ movaps XMMWORD [rsp+0*SIZEOF_XMMWORD], xmm8
+%if %1 > 1
+ movaps XMMWORD [rsp+1*SIZEOF_XMMWORD], xmm9
+%endif
+%if %1 > 2
+ movaps XMMWORD [rsp+2*SIZEOF_XMMWORD], xmm10
+%endif
+%if %1 > 3
+ movaps XMMWORD [rsp+3*SIZEOF_XMMWORD], xmm11
+%endif
+%endmacro
+
+%imacro pop_xmm 1
+ movaps xmm8, XMMWORD [rsp+0*SIZEOF_XMMWORD]
+%if %1 > 1
+ movaps xmm9, XMMWORD [rsp+1*SIZEOF_XMMWORD]
+%endif
+%if %1 > 2
+ movaps xmm10, XMMWORD [rsp+2*SIZEOF_XMMWORD]
+%endif
+%if %1 > 3
+ movaps xmm11, XMMWORD [rsp+3*SIZEOF_XMMWORD]
+%endif
+ add rsp, %1 * SIZEOF_XMMWORD
+%endmacro
+
+%else
+
+%imacro collect_args 1
+ push r10
+ mov r10, rdi
+%if %1 > 1
+ push r11
+ mov r11, rsi
+%endif
+%if %1 > 2
+ push r12
+ mov r12, rdx
+%endif
+%if %1 > 3
+ push r13
+ mov r13, rcx
+%endif
+%if %1 > 4
+ push r14
+ mov r14, r8
+%endif
+%if %1 > 5
+ push r15
+ mov r15, r9
+%endif
+%endmacro
+
+%imacro uncollect_args 1
+%if %1 > 5
+ pop r15
+%endif
+%if %1 > 4
+ pop r14
+%endif
+%if %1 > 3
+ pop r13
+%endif
+%if %1 > 2
+ pop r12
+%endif
+%if %1 > 1
+ pop r11
+%endif
+ pop r10
+%endmacro
+
+%imacro push_xmm 1
+%endmacro
+
+%imacro pop_xmm 1
+%endmacro
+
+%endif
+
+%endif
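+
+; Hypothetical entry-point skeleton (illustration, not upstream text):
+;
+;   GLOBAL_FUNCTION(jsimd_example)
+;   EXTN(jsimd_example):
+;       collect_args 4      ; four C arguments now live in r10..r13
+;       ...                 ; body uses r10p..r13p as pointer-width views
+;       uncollect_args 4
+;       ret
+;
+; The macro pair hides the Win64 (rcx, rdx, r8, r9, then stack) versus
+; System V (rdi, rsi, rdx, rcx, r8, r9) calling conventions, so each SIMD
+; routine is written once against r10..r15.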
+
+; --------------------------------------------------------------------------
+; Defines picked up from the C headers
+;
+%include "jsimdcfg.inc"
+
+; --------------------------------------------------------------------------
diff --git a/media/libjpeg/simd/jccolext-altivec.c b/media/libjpeg/simd/powerpc/jccolext-altivec.c
index 849825eb06..170f90ff80 100644
--- a/media/libjpeg/simd/jccolext-altivec.c
+++ b/media/libjpeg/simd/powerpc/jccolext-altivec.c
@@ -24,9 +24,9 @@
/* This file is included by jccolor-altivec.c */
-void jsimd_rgb_ycc_convert_altivec (JDIMENSION img_width, JSAMPARRAY input_buf,
- JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows)
+void jsimd_rgb_ycc_convert_altivec(JDIMENSION img_width, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows)
{
JSAMPROW inptr, outptr0, outptr1, outptr2;
int pitch = img_width * RGB_PIXELSIZE, num_cols;
@@ -35,13 +35,13 @@ void jsimd_rgb_ycc_convert_altivec (JDIMENSION img_width, JSAMPARRAY input_buf,
#endif
unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16];
- __vector unsigned char rgb0, rgb1 = {0}, rgb2 = {0},
+ __vector unsigned char rgb0, rgb1 = { 0 }, rgb2 = { 0 },
rgbg0, rgbg1, rgbg2, rgbg3, y, cb, cr;
#if __BIG_ENDIAN__ || RGB_PIXELSIZE == 4
- __vector unsigned char rgb3 = {0};
+ __vector unsigned char rgb3 = { 0 };
#endif
#if __BIG_ENDIAN__ && RGB_PIXELSIZE == 4
- __vector unsigned char rgb4 = {0};
+ __vector unsigned char rgb4 = { 0 };
#endif
__vector short rg0, rg1, rg2, rg3, bg0, bg1, bg2, bg3;
__vector unsigned short yl, yh, crl, crh, cbl, cbh;
@@ -57,9 +57,11 @@ void jsimd_rgb_ycc_convert_altivec (JDIMENSION img_width, JSAMPARRAY input_buf,
pd_onehalfm1_cj = { __4X(ONE_HALF - 1 + (CENTERJSAMPLE << SCALEBITS)) };
__vector unsigned char pb_zero = { __16X(0) },
#if __BIG_ENDIAN__
- shift_pack_index = {0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29};
+ shift_pack_index =
+ { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
#else
- shift_pack_index = {2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31};
+ shift_pack_index =
+ { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
#endif
while (--num_rows >= 0) {
diff --git a/media/libjpeg/simd/powerpc/jccolor-altivec.c b/media/libjpeg/simd/powerpc/jccolor-altivec.c
new file mode 100644
index 0000000000..d670dbcda3
--- /dev/null
+++ b/media/libjpeg/simd/powerpc/jccolor-altivec.c
@@ -0,0 +1,116 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* RGB --> YCC CONVERSION */
+
+#include "jsimd_altivec.h"
+
+
+#define F_0_081 5329 /* FIX(0.08131) */
+#define F_0_114 7471 /* FIX(0.11400) */
+#define F_0_168 11059 /* FIX(0.16874) */
+#define F_0_250 16384 /* FIX(0.25000) */
+#define F_0_299 19595 /* FIX(0.29900) */
+#define F_0_331 21709 /* FIX(0.33126) */
+#define F_0_418 27439 /* FIX(0.41869) */
+#define F_0_500 32768 /* FIX(0.50000) */
+#define F_0_587 38470 /* FIX(0.58700) */
+#define F_0_337 (F_0_587 - F_0_250) /* FIX(0.58700) - FIX(0.25000) */
+
+#define SCALEBITS 16
+#define ONE_HALF (1 << (SCALEBITS - 1))
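+
+/* Worked example (illustration, not upstream text).  FIX(x) denotes
+   x * 2^SCALEBITS rounded to the nearest integer, so the JFIF luma
+   equation
+
+     Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+
+   becomes, in 16.16 fixed point,
+
+     Y = (F_0_299 * R + F_0_587 * G + F_0_114 * B + ONE_HALF) >> SCALEBITS
+
+   with ONE_HALF supplying round-to-nearest.  For R = G = B = 255:
+   (19595 * 255 + 38470 * 255 + 7471 * 255 + 32768) >> 16 = 255. */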
+
+
+#define RGBG_INDEX0 \
+ { 0, 1, 3, 4, 6, 7, 9, 10, 2, 1, 5, 4, 8, 7, 11, 10 }
+#define RGBG_INDEX1 \
+ { 12, 13, 15, 16, 18, 19, 21, 22, 14, 13, 17, 16, 20, 19, 23, 22 }
+#define RGBG_INDEX2 \
+ { 8, 9, 11, 12, 14, 15, 17, 18, 10, 9, 13, 12, 16, 15, 19, 18 }
+#define RGBG_INDEX3 \
+ { 4, 5, 7, 8, 10, 11, 13, 14, 6, 5, 9, 8, 12, 11, 15, 14 }
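+
+/* Illustration (not upstream text): each RGBG_INDEX vector is a vec_perm
+   control that gathers four pixels' worth of bytes from two adjacent
+   16-byte loads (indices 16-31 select from the second input), with (R,G)
+   byte pairs in the upper half and (B,G) pairs in the lower half -- e.g.
+   RGBG_INDEX0 yields R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3 for
+   3-byte RGB data, so the kernel can use 16-bit multiply-sum operations
+   on channel pairs. */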
+#include "jccolext-altivec.c"
+#undef RGB_PIXELSIZE
+
+#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+#define jsimd_rgb_ycc_convert_altivec jsimd_extrgb_ycc_convert_altivec
+#include "jccolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX0
+#undef RGBG_INDEX1
+#undef RGBG_INDEX2
+#undef RGBG_INDEX3
+#undef jsimd_rgb_ycc_convert_altivec
+
+#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+#define RGBG_INDEX \
+ { 0, 1, 4, 5, 8, 9, 12, 13, 2, 1, 6, 5, 10, 9, 14, 13 }
+#define jsimd_rgb_ycc_convert_altivec jsimd_extrgbx_ycc_convert_altivec
+#include "jccolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_ycc_convert_altivec
+
+#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+#define RGBG_INDEX0 \
+ { 2, 1, 5, 4, 8, 7, 11, 10, 0, 1, 3, 4, 6, 7, 9, 10 }
+#define RGBG_INDEX1 \
+ { 14, 13, 17, 16, 20, 19, 23, 22, 12, 13, 15, 16, 18, 19, 21, 22 }
+#define RGBG_INDEX2 \
+ { 10, 9, 13, 12, 16, 15, 19, 18, 8, 9, 11, 12, 14, 15, 17, 18 }
+#define RGBG_INDEX3 \
+ { 6, 5, 9, 8, 12, 11, 15, 14, 4, 5, 7, 8, 10, 11, 13, 14 }
+#define jsimd_rgb_ycc_convert_altivec jsimd_extbgr_ycc_convert_altivec
+#include "jccolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX0
+#undef RGBG_INDEX1
+#undef RGBG_INDEX2
+#undef RGBG_INDEX3
+#undef jsimd_rgb_ycc_convert_altivec
+
+#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+#define RGBG_INDEX \
+ { 2, 1, 6, 5, 10, 9, 14, 13, 0, 1, 4, 5, 8, 9, 12, 13 }
+#define jsimd_rgb_ycc_convert_altivec jsimd_extbgrx_ycc_convert_altivec
+#include "jccolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_ycc_convert_altivec
+
+#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+#define RGBG_INDEX \
+ { 3, 2, 7, 6, 11, 10, 15, 14, 1, 2, 5, 6, 9, 10, 13, 14 }
+#define jsimd_rgb_ycc_convert_altivec jsimd_extxbgr_ycc_convert_altivec
+#include "jccolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_ycc_convert_altivec
+
+#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+#define RGBG_INDEX \
+ { 1, 2, 5, 6, 9, 10, 13, 14, 3, 2, 7, 6, 11, 10, 15, 14 }
+#define jsimd_rgb_ycc_convert_altivec jsimd_extxrgb_ycc_convert_altivec
+#include "jccolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_ycc_convert_altivec
diff --git a/media/libjpeg/simd/powerpc/jcgray-altivec.c b/media/libjpeg/simd/powerpc/jcgray-altivec.c
new file mode 100644
index 0000000000..a11a7e7021
--- /dev/null
+++ b/media/libjpeg/simd/powerpc/jcgray-altivec.c
@@ -0,0 +1,111 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* RGB --> GRAYSCALE CONVERSION */
+
+#include "jsimd_altivec.h"
+
+
+#define F_0_114 7471 /* FIX(0.11400) */
+#define F_0_250 16384 /* FIX(0.25000) */
+#define F_0_299 19595 /* FIX(0.29900) */
+#define F_0_587 38470 /* FIX(0.58700) */
+#define F_0_337 (F_0_587 - F_0_250) /* FIX(0.58700) - FIX(0.25000) */
+
+#define SCALEBITS 16
+#define ONE_HALF (1 << (SCALEBITS - 1))
+
+
+#define RGBG_INDEX0 \
+ { 0, 1, 3, 4, 6, 7, 9, 10, 2, 1, 5, 4, 8, 7, 11, 10 }
+#define RGBG_INDEX1 \
+ { 12, 13, 15, 16, 18, 19, 21, 22, 14, 13, 17, 16, 20, 19, 23, 22 }
+#define RGBG_INDEX2 \
+ { 8, 9, 11, 12, 14, 15, 17, 18, 10, 9, 13, 12, 16, 15, 19, 18 }
+#define RGBG_INDEX3 \
+ { 4, 5, 7, 8, 10, 11, 13, 14, 6, 5, 9, 8, 12, 11, 15, 14 }
+#include "jcgryext-altivec.c"
+#undef RGB_PIXELSIZE
+
+#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+#define jsimd_rgb_gray_convert_altivec jsimd_extrgb_gray_convert_altivec
+#include "jcgryext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX0
+#undef RGBG_INDEX1
+#undef RGBG_INDEX2
+#undef RGBG_INDEX3
+#undef jsimd_rgb_gray_convert_altivec
+
+#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+#define RGBG_INDEX \
+ { 0, 1, 4, 5, 8, 9, 12, 13, 2, 1, 6, 5, 10, 9, 14, 13 }
+#define jsimd_rgb_gray_convert_altivec jsimd_extrgbx_gray_convert_altivec
+#include "jcgryext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_gray_convert_altivec
+
+#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+#define RGBG_INDEX0 \
+ { 2, 1, 5, 4, 8, 7, 11, 10, 0, 1, 3, 4, 6, 7, 9, 10 }
+#define RGBG_INDEX1 \
+ { 14, 13, 17, 16, 20, 19, 23, 22, 12, 13, 15, 16, 18, 19, 21, 22 }
+#define RGBG_INDEX2 \
+ { 10, 9, 13, 12, 16, 15, 19, 18, 8, 9, 11, 12, 14, 15, 17, 18 }
+#define RGBG_INDEX3 \
+ { 6, 5, 9, 8, 12, 11, 15, 14, 4, 5, 7, 8, 10, 11, 13, 14 }
+#define jsimd_rgb_gray_convert_altivec jsimd_extbgr_gray_convert_altivec
+#include "jcgryext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX0
+#undef RGBG_INDEX1
+#undef RGBG_INDEX2
+#undef RGBG_INDEX3
+#undef jsimd_rgb_gray_convert_altivec
+
+#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+#define RGBG_INDEX \
+ { 2, 1, 6, 5, 10, 9, 14, 13, 0, 1, 4, 5, 8, 9, 12, 13 }
+#define jsimd_rgb_gray_convert_altivec jsimd_extbgrx_gray_convert_altivec
+#include "jcgryext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_gray_convert_altivec
+
+#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+#define RGBG_INDEX \
+ { 3, 2, 7, 6, 11, 10, 15, 14, 1, 2, 5, 6, 9, 10, 13, 14 }
+#define jsimd_rgb_gray_convert_altivec jsimd_extxbgr_gray_convert_altivec
+#include "jcgryext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_gray_convert_altivec
+
+#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+#define RGBG_INDEX \
+ { 1, 2, 5, 6, 9, 10, 13, 14, 3, 2, 7, 6, 11, 10, 15, 14 }
+#define jsimd_rgb_gray_convert_altivec jsimd_extxrgb_gray_convert_altivec
+#include "jcgryext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_gray_convert_altivec
diff --git a/media/libjpeg/simd/jcgryext-altivec.c b/media/libjpeg/simd/powerpc/jcgryext-altivec.c
index 7f8232bb24..b280cbbded 100644
--- a/media/libjpeg/simd/jcgryext-altivec.c
+++ b/media/libjpeg/simd/powerpc/jcgryext-altivec.c
@@ -24,10 +24,9 @@
/* This file is included by jcgray-altivec.c */
-void jsimd_rgb_gray_convert_altivec (JDIMENSION img_width,
- JSAMPARRAY input_buf,
- JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows)
+void jsimd_rgb_gray_convert_altivec(JDIMENSION img_width, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows)
{
JSAMPROW inptr, outptr;
int pitch = img_width * RGB_PIXELSIZE, num_cols;
@@ -36,13 +35,13 @@ void jsimd_rgb_gray_convert_altivec (JDIMENSION img_width,
unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16];
#endif
- __vector unsigned char rgb0, rgb1 = {0}, rgb2 = {0},
+ __vector unsigned char rgb0, rgb1 = { 0 }, rgb2 = { 0 },
rgbg0, rgbg1, rgbg2, rgbg3, y;
#if __BIG_ENDIAN__ || RGB_PIXELSIZE == 4
- __vector unsigned char rgb3 = {0};
+ __vector unsigned char rgb3 = { 0 };
#endif
#if __BIG_ENDIAN__ && RGB_PIXELSIZE == 4
- __vector unsigned char rgb4 = {0};
+ __vector unsigned char rgb4 = { 0 };
#endif
__vector short rg0, rg1, rg2, rg3, bg0, bg1, bg2, bg3;
__vector unsigned short yl, yh;
@@ -54,9 +53,11 @@ void jsimd_rgb_gray_convert_altivec (JDIMENSION img_width,
__vector int pd_onehalf = { __4X(ONE_HALF) };
__vector unsigned char pb_zero = { __16X(0) },
#if __BIG_ENDIAN__
- shift_pack_index = {0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29};
+ shift_pack_index =
+ { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
#else
- shift_pack_index = {2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31};
+ shift_pack_index =
+ { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
#endif
while (--num_rows >= 0) {
diff --git a/media/libjpeg/simd/jcsample-altivec.c b/media/libjpeg/simd/powerpc/jcsample-altivec.c
index 11609d9dab..6e25b8db90 100644
--- a/media/libjpeg/simd/jcsample-altivec.c
+++ b/media/libjpeg/simd/powerpc/jcsample-altivec.c
@@ -26,14 +26,15 @@
#include "jcsample.h"
-void
-jsimd_h2v1_downsample_altivec (JDIMENSION image_width, int max_v_samp_factor,
- JDIMENSION v_samp_factor,
- JDIMENSION width_blocks,
- JSAMPARRAY input_data, JSAMPARRAY output_data)
+void jsimd_h2v1_downsample_altivec(JDIMENSION image_width,
+ int max_v_samp_factor,
+ JDIMENSION v_samp_factor,
+ JDIMENSION width_in_blocks,
+ JSAMPARRAY input_data,
+ JSAMPARRAY output_data)
{
int outrow, outcol;
- JDIMENSION output_cols = width_blocks * DCTSIZE;
+ JDIMENSION output_cols = width_in_blocks * DCTSIZE;
JSAMPROW inptr, outptr;
__vector unsigned char this0, next0, out;
@@ -43,7 +44,7 @@ jsimd_h2v1_downsample_altivec (JDIMENSION image_width, int max_v_samp_factor,
__vector unsigned short pw_bias = { __4X2(0, 1) },
pw_one = { __8X(1) };
__vector unsigned char even_odd_index =
- {0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15},
+ { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 },
pb_zero = { __16X(0) };
expand_right_edge(input_data, max_v_samp_factor, image_width,
@@ -83,13 +84,13 @@ jsimd_h2v1_downsample_altivec (JDIMENSION image_width, int max_v_samp_factor,
void
-jsimd_h2v2_downsample_altivec (JDIMENSION image_width, int max_v_samp_factor,
- JDIMENSION v_samp_factor,
- JDIMENSION width_blocks,
- JSAMPARRAY input_data, JSAMPARRAY output_data)
+jsimd_h2v2_downsample_altivec(JDIMENSION image_width, int max_v_samp_factor,
+ JDIMENSION v_samp_factor,
+ JDIMENSION width_in_blocks,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
{
int inrow, outrow, outcol;
- JDIMENSION output_cols = width_blocks * DCTSIZE;
+ JDIMENSION output_cols = width_in_blocks * DCTSIZE;
JSAMPROW inptr0, inptr1, outptr;
__vector unsigned char this0, next0, this1, next1, out;
@@ -100,7 +101,7 @@ jsimd_h2v2_downsample_altivec (JDIMENSION image_width, int max_v_samp_factor,
__vector unsigned short pw_bias = { __4X2(1, 2) },
pw_two = { __8X(2) };
__vector unsigned char even_odd_index =
- { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 },
+ { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 },
pb_zero = { __16X(0) };
expand_right_edge(input_data, max_v_samp_factor, image_width,
diff --git a/media/libjpeg/simd/powerpc/jcsample.h b/media/libjpeg/simd/powerpc/jcsample.h
new file mode 100644
index 0000000000..bd07fcc4ed
--- /dev/null
+++ b/media/libjpeg/simd/powerpc/jcsample.h
@@ -0,0 +1,28 @@
+/*
+ * jcsample.h
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1996, Thomas G. Lane.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ */
+
+LOCAL(void)
+expand_right_edge(JSAMPARRAY image_data, int num_rows, JDIMENSION input_cols,
+ JDIMENSION output_cols)
+{
+ register JSAMPROW ptr;
+ register JSAMPLE pixval;
+ register int count;
+ int row;
+ int numcols = (int)(output_cols - input_cols);
+
+ if (numcols > 0) {
+ for (row = 0; row < num_rows; row++) {
+ ptr = image_data[row] + input_cols;
+ pixval = ptr[-1];
+ for (count = numcols; count > 0; count--)
+ *ptr++ = pixval;
+ }
+ }
+}
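+
+/* Illustration (not upstream text): for input_cols = 13 and
+   output_cols = 16, each row's last real sample is replicated into columns
+   13..15, so the vectorized downsamplers can always consume whole 16-byte
+   groups without special-casing the right edge. */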
diff --git a/media/libjpeg/simd/jdcolext-altivec.c b/media/libjpeg/simd/powerpc/jdcolext-altivec.c
index fb121ce745..68d52bd8a2 100644
--- a/media/libjpeg/simd/jdcolext-altivec.c
+++ b/media/libjpeg/simd/powerpc/jdcolext-altivec.c
@@ -23,9 +23,9 @@
/* This file is included by jdcolor-altivec.c */
-void jsimd_ycc_rgb_convert_altivec (JDIMENSION out_width, JSAMPIMAGE input_buf,
- JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows)
+void jsimd_ycc_rgb_convert_altivec(JDIMENSION out_width, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
{
JSAMPROW outptr, inptr0, inptr1, inptr2;
int pitch = out_width * RGB_PIXELSIZE, num_cols;
@@ -61,9 +61,11 @@ void jsimd_ycc_rgb_convert_altivec (JDIMENSION out_width, JSAMPIMAGE input_buf,
__vector int pd_onehalf = { __4X(ONE_HALF) };
__vector unsigned char pb_zero = { __16X(0) },
#if __BIG_ENDIAN__
- shift_pack_index = {0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29};
+ shift_pack_index =
+ { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
#else
- shift_pack_index = {2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31};
+ shift_pack_index =
+ { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
#endif
while (--num_rows >= 0) {
diff --git a/media/libjpeg/simd/powerpc/jdcolor-altivec.c b/media/libjpeg/simd/powerpc/jdcolor-altivec.c
new file mode 100644
index 0000000000..eb35b67176
--- /dev/null
+++ b/media/libjpeg/simd/powerpc/jdcolor-altivec.c
@@ -0,0 +1,106 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* YCC --> RGB CONVERSION */
+
+#include "jsimd_altivec.h"
+
+
+#define F_0_344 22554 /* FIX(0.34414) */
+#define F_0_714 46802 /* FIX(0.71414) */
+#define F_1_402 91881 /* FIX(1.40200) */
+#define F_1_772 116130 /* FIX(1.77200) */
+#define F_0_402 (F_1_402 - 65536) /* FIX(1.40200) - FIX(1) */
+#define F_0_285 (65536 - F_0_714) /* FIX(1) - FIX(0.71414) */
+#define F_0_228 (131072 - F_1_772) /* FIX(2) - FIX(1.77200) */
+
+#define SCALEBITS 16
+#define ONE_HALF (1 << (SCALEBITS - 1))
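+
+/* For reference (illustration, not upstream text): the constants above
+   encode the JFIF YCbCr-to-RGB equations in 16.16 fixed point:
+
+     R = Y + 1.40200 * (Cr - CENTERJSAMPLE)
+     G = Y - 0.34414 * (Cb - CENTERJSAMPLE) - 0.71414 * (Cr - CENTERJSAMPLE)
+     B = Y + 1.77200 * (Cb - CENTERJSAMPLE)
+
+   F_0_402, F_0_285, and F_0_228 rewrite the multipliers >= 1 as
+   "1 + fraction" or "2 - fraction" forms so that every product fits the
+   range of the 16-bit multiply-sum intrinsics. */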
+
+#define RGB_INDEX0 \
+ { 0, 1, 8, 2, 3, 10, 4, 5, 12, 6, 7, 14, 16, 17, 24, 18 }
+#define RGB_INDEX1 \
+ { 3, 10, 4, 5, 12, 6, 7, 14, 16, 17, 24, 18, 19, 26, 20, 21 }
+#define RGB_INDEX2 \
+ { 12, 6, 7, 14, 16, 17, 24, 18, 19, 26, 20, 21, 28, 22, 23, 30 }
+#include "jdcolext-altivec.c"
+#undef RGB_PIXELSIZE
+
+#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extrgb_convert_altivec
+#include "jdcolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX0
+#undef RGB_INDEX1
+#undef RGB_INDEX2
+#undef jsimd_ycc_rgb_convert_altivec
+
+#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+#define RGB_INDEX \
+ { 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 }
+#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extrgbx_convert_altivec
+#include "jdcolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_ycc_rgb_convert_altivec
+
+#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+#define RGB_INDEX0 \
+ { 8, 1, 0, 10, 3, 2, 12, 5, 4, 14, 7, 6, 24, 17, 16, 26 }
+#define RGB_INDEX1 \
+ { 3, 2, 12, 5, 4, 14, 7, 6, 24, 17, 16, 26, 19, 18, 28, 21 }
+#define RGB_INDEX2 \
+ { 4, 14, 7, 6, 24, 17, 16, 26, 19, 18, 28, 21, 20, 30, 23, 22 }
+#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extbgr_convert_altivec
+#include "jdcolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX0
+#undef RGB_INDEX1
+#undef RGB_INDEX2
+#undef jsimd_ycc_rgb_convert_altivec
+
+#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+#define RGB_INDEX \
+ { 8, 1, 0, 9, 10, 3, 2, 11, 12, 5, 4, 13, 14, 7, 6, 15 }
+#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extbgrx_convert_altivec
+#include "jdcolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_ycc_rgb_convert_altivec
+
+#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+#define RGB_INDEX \
+ { 9, 8, 1, 0, 11, 10, 3, 2, 13, 12, 5, 4, 15, 14, 7, 6 }
+#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extxbgr_convert_altivec
+#include "jdcolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_ycc_rgb_convert_altivec
+
+#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+#define RGB_INDEX \
+ { 9, 0, 1, 8, 11, 2, 3, 10, 13, 4, 5, 12, 15, 6, 7, 14 }
+#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extxrgb_convert_altivec
+#include "jdcolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_ycc_rgb_convert_altivec
diff --git a/media/libjpeg/simd/powerpc/jdmerge-altivec.c b/media/libjpeg/simd/powerpc/jdmerge-altivec.c
new file mode 100644
index 0000000000..79c577f141
--- /dev/null
+++ b/media/libjpeg/simd/powerpc/jdmerge-altivec.c
@@ -0,0 +1,130 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* MERGED YCC --> RGB CONVERSION AND UPSAMPLING */
+
+#include "jsimd_altivec.h"
+
+
+#define F_0_344 22554 /* FIX(0.34414) */
+#define F_0_714 46802 /* FIX(0.71414) */
+#define F_1_402 91881 /* FIX(1.40200) */
+#define F_1_772 116130 /* FIX(1.77200) */
+#define F_0_402 (F_1_402 - 65536) /* FIX(1.40200) - FIX(1) */
+#define F_0_285 (65536 - F_0_714) /* FIX(1) - FIX(0.71414) */
+#define F_0_228 (131072 - F_1_772) /* FIX(2) - FIX(1.77200) */
+
+#define SCALEBITS 16
+#define ONE_HALF (1 << (SCALEBITS - 1))
+
+#define RGB_INDEX0 \
+ { 0, 1, 8, 2, 3, 10, 4, 5, 12, 6, 7, 14, 16, 17, 24, 18 }
+#define RGB_INDEX1 \
+ { 3, 10, 4, 5, 12, 6, 7, 14, 16, 17, 24, 18, 19, 26, 20, 21 }
+#define RGB_INDEX2 \
+ { 12, 6, 7, 14, 16, 17, 24, 18, 19, 26, 20, 21, 28, 22, 23, 30 }
+#include "jdmrgext-altivec.c"
+#undef RGB_PIXELSIZE
+
+#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_altivec \
+ jsimd_h2v1_extrgb_merged_upsample_altivec
+#define jsimd_h2v2_merged_upsample_altivec \
+ jsimd_h2v2_extrgb_merged_upsample_altivec
+#include "jdmrgext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX0
+#undef RGB_INDEX1
+#undef RGB_INDEX2
+#undef jsimd_h2v1_merged_upsample_altivec
+#undef jsimd_h2v2_merged_upsample_altivec
+
+#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+#define RGB_INDEX \
+ { 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 }
+#define jsimd_h2v1_merged_upsample_altivec \
+ jsimd_h2v1_extrgbx_merged_upsample_altivec
+#define jsimd_h2v2_merged_upsample_altivec \
+ jsimd_h2v2_extrgbx_merged_upsample_altivec
+#include "jdmrgext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_h2v1_merged_upsample_altivec
+#undef jsimd_h2v2_merged_upsample_altivec
+
+#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+#define RGB_INDEX0 \
+ { 8, 1, 0, 10, 3, 2, 12, 5, 4, 14, 7, 6, 24, 17, 16, 26 }
+#define RGB_INDEX1 \
+ { 3, 2, 12, 5, 4, 14, 7, 6, 24, 17, 16, 26, 19, 18, 28, 21 }
+#define RGB_INDEX2 \
+ { 4, 14, 7, 6, 24, 17, 16, 26, 19, 18, 28, 21, 20, 30, 23, 22 }
+#define jsimd_h2v1_merged_upsample_altivec \
+ jsimd_h2v1_extbgr_merged_upsample_altivec
+#define jsimd_h2v2_merged_upsample_altivec \
+ jsimd_h2v2_extbgr_merged_upsample_altivec
+#include "jdmrgext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX0
+#undef RGB_INDEX1
+#undef RGB_INDEX2
+#undef jsimd_h2v1_merged_upsample_altivec
+#undef jsimd_h2v2_merged_upsample_altivec
+
+#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+#define RGB_INDEX \
+ { 8, 1, 0, 9, 10, 3, 2, 11, 12, 5, 4, 13, 14, 7, 6, 15 }
+#define jsimd_h2v1_merged_upsample_altivec \
+ jsimd_h2v1_extbgrx_merged_upsample_altivec
+#define jsimd_h2v2_merged_upsample_altivec \
+ jsimd_h2v2_extbgrx_merged_upsample_altivec
+#include "jdmrgext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_h2v1_merged_upsample_altivec
+#undef jsimd_h2v2_merged_upsample_altivec
+
+#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+#define RGB_INDEX \
+ { 9, 8, 1, 0, 11, 10, 3, 2, 13, 12, 5, 4, 15, 14, 7, 6 }
+#define jsimd_h2v1_merged_upsample_altivec \
+ jsimd_h2v1_extxbgr_merged_upsample_altivec
+#define jsimd_h2v2_merged_upsample_altivec \
+ jsimd_h2v2_extxbgr_merged_upsample_altivec
+#include "jdmrgext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_h2v1_merged_upsample_altivec
+#undef jsimd_h2v2_merged_upsample_altivec
+
+#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+#define RGB_INDEX \
+ { 9, 0, 1, 8, 11, 2, 3, 10, 13, 4, 5, 12, 15, 6, 7, 14 }
+#define jsimd_h2v1_merged_upsample_altivec \
+ jsimd_h2v1_extxrgb_merged_upsample_altivec
+#define jsimd_h2v2_merged_upsample_altivec \
+ jsimd_h2v2_extxrgb_merged_upsample_altivec
+#include "jdmrgext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_h2v1_merged_upsample_altivec
+#undef jsimd_h2v2_merged_upsample_altivec
diff --git a/media/libjpeg/simd/jdmrgext-altivec.c b/media/libjpeg/simd/powerpc/jdmrgext-altivec.c
index 55205bb1f9..40f02c33ea 100644
--- a/media/libjpeg/simd/jdmrgext-altivec.c
+++ b/media/libjpeg/simd/powerpc/jdmrgext-altivec.c
@@ -23,10 +23,10 @@
/* This file is included by jdmerge-altivec.c */
-void jsimd_h2v1_merged_upsample_altivec (JDIMENSION output_width,
- JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr,
- JSAMPARRAY output_buf)
+void jsimd_h2v1_merged_upsample_altivec(JDIMENSION output_width,
+ JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf)
{
JSAMPROW outptr, inptr0, inptr1, inptr2;
int pitch = output_width * RGB_PIXELSIZE, num_cols, yloop;
@@ -63,13 +63,19 @@ void jsimd_h2v1_merged_upsample_altivec (JDIMENSION output_width,
__vector int pd_onehalf = { __4X(ONE_HALF) };
__vector unsigned char pb_zero = { __16X(0) },
#if __BIG_ENDIAN__
- shift_pack_index = {0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29},
- even_index = {0,16,0,18,0,20,0,22,0,24,0,26,0,28,0,30},
- odd_index = {0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31};
+ shift_pack_index =
+ { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 },
+ even_index =
+ { 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26, 0, 28, 0, 30 },
+ odd_index =
+ { 0, 17, 0, 19, 0, 21, 0, 23, 0, 25, 0, 27, 0, 29, 0, 31 };
#else
- shift_pack_index = {2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31},
- even_index = {16,0,18,0,20,0,22,0,24,0,26,0,28,0,30,0},
- odd_index = {17,0,19,0,21,0,23,0,25,0,27,0,29,0,31,0};
+ shift_pack_index =
+ { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 },
+ even_index =
+ { 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26, 0, 28, 0, 30, 0 },
+ odd_index =
+ { 17, 0, 19, 0, 21, 0, 23, 0, 25, 0, 27, 0, 29, 0, 31, 0 };
#endif
inptr0 = input_buf[0][in_row_group_ctr];
@@ -299,10 +305,10 @@ void jsimd_h2v1_merged_upsample_altivec (JDIMENSION output_width,
}
-void jsimd_h2v2_merged_upsample_altivec (JDIMENSION output_width,
- JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr,
- JSAMPARRAY output_buf)
+void jsimd_h2v2_merged_upsample_altivec(JDIMENSION output_width,
+ JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf)
{
JSAMPROW inptr, outptr;
diff --git a/media/libjpeg/simd/jdsample-altivec.c b/media/libjpeg/simd/powerpc/jdsample-altivec.c
index b40ce55c89..04df0cf108 100644
--- a/media/libjpeg/simd/jdsample-altivec.c
+++ b/media/libjpeg/simd/powerpc/jdsample-altivec.c
@@ -25,31 +25,36 @@
#include "jsimd_altivec.h"
-void
-jsimd_h2v1_fancy_upsample_altivec (int max_v_samp_factor,
- JDIMENSION downsampled_width,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
+void jsimd_h2v1_fancy_upsample_altivec(int max_v_samp_factor,
+ JDIMENSION downsampled_width,
+ JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr)
{
JSAMPARRAY output_data = *output_data_ptr;
JSAMPROW inptr, outptr;
int inrow, incol;
- __vector unsigned char this0, last0, p_last0, next0 = {0}, p_next0,
+ __vector unsigned char this0, last0, p_last0, next0 = { 0 }, p_next0,
out;
__vector short this0e, this0o, this0l, this0h, last0l, last0h,
next0l, next0h, outle, outhe, outlo, outho;
/* Constants */
__vector unsigned char pb_zero = { __16X(0) }, pb_three = { __16X(3) },
- last_index_col0 = {0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14},
- last_index = {15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30},
- next_index = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16},
- next_index_lastcol = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,15},
+ last_index_col0 =
+ { 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 },
+ last_index =
+ { 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30 },
+ next_index =
+ { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 },
+ next_index_lastcol =
+ { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15 },
#if __BIG_ENDIAN__
- merge_pack_index = {1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31};
+ merge_pack_index =
+ { 1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31 };
#else
- merge_pack_index = {0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30};
+ merge_pack_index =
+ { 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30 };
#endif
__vector short pw_one = { __8X(1) }, pw_two = { __8X(2) };
@@ -121,11 +126,10 @@ jsimd_h2v1_fancy_upsample_altivec (int max_v_samp_factor,
}
-void
-jsimd_h2v2_fancy_upsample_altivec (int max_v_samp_factor,
- JDIMENSION downsampled_width,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
+void jsimd_h2v2_fancy_upsample_altivec(int max_v_samp_factor,
+ JDIMENSION downsampled_width,
+ JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr)
{
JSAMPARRAY output_data = *output_data_ptr;
JSAMPROW inptr_1, inptr0, inptr1, outptr0, outptr1;
@@ -136,21 +140,27 @@ jsimd_h2v2_fancy_upsample_altivec (int max_v_samp_factor,
lastcolsum_1h, lastcolsum1h,
p_lastcolsum_1l, p_lastcolsum_1h, p_lastcolsum1l, p_lastcolsum1h,
thiscolsum_1l, thiscolsum_1h, thiscolsum1l, thiscolsum1h,
- nextcolsum_1l = {0}, nextcolsum_1h = {0},
- nextcolsum1l = {0}, nextcolsum1h = {0},
+ nextcolsum_1l = { 0 }, nextcolsum_1h = { 0 },
+ nextcolsum1l = { 0 }, nextcolsum1h = { 0 },
p_nextcolsum_1l, p_nextcolsum_1h, p_nextcolsum1l, p_nextcolsum1h,
tmpl, tmph, outle, outhe, outlo, outho;
/* Constants */
__vector unsigned char pb_zero = { __16X(0) },
- last_index_col0 = {0,1,0,1,2,3,4,5,6,7,8,9,10,11,12,13},
- last_index={14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29},
- next_index = {2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17},
- next_index_lastcol = {2,3,4,5,6,7,8,9,10,11,12,13,14,15,14,15},
+ last_index_col0 =
+ { 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 },
+ last_index =
+ { 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29 },
+ next_index =
+ { 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 },
+ next_index_lastcol =
+ { 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14, 15 },
#if __BIG_ENDIAN__
- merge_pack_index = {1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31};
+ merge_pack_index =
+ { 1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31 };
#else
- merge_pack_index = {0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30};
+ merge_pack_index =
+ { 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30 };
#endif
__vector short pw_zero = { __8X(0) }, pw_three = { __8X(3) },
pw_seven = { __8X(7) }, pw_eight = { __8X(8) };
@@ -306,11 +316,10 @@ jsimd_h2v2_fancy_upsample_altivec (int max_v_samp_factor,
/* These are rarely used (mainly just for decompressing YCCK images) */
-void
-jsimd_h2v1_upsample_altivec (int max_v_samp_factor,
- JDIMENSION output_width,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
+void jsimd_h2v1_upsample_altivec(int max_v_samp_factor,
+ JDIMENSION output_width,
+ JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr)
{
JSAMPARRAY output_data = *output_data_ptr;
JSAMPROW inptr, outptr;
@@ -345,11 +354,10 @@ jsimd_h2v1_upsample_altivec (int max_v_samp_factor,
}
-void
-jsimd_h2v2_upsample_altivec (int max_v_samp_factor,
- JDIMENSION output_width,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
+void jsimd_h2v2_upsample_altivec(int max_v_samp_factor,
+ JDIMENSION output_width,
+ JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr)
{
JSAMPARRAY output_data = *output_data_ptr;
JSAMPROW inptr, outptr0, outptr1;
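
The pb_three/pw_one/pw_two constants above implement the same triangle filter as the portable h2v1 fancy upsampler in jdsample.c: each input pixel yields two outputs weighted 3:1 toward itself, with the +1/+2 terms alternating the rounding direction. A scalar sketch of the h2v1 case, handling edges by neighbor replication, which is what the last_index_col0/next_index_lastcol permutations do and which reproduces the portable code's edge special cases exactly:

    static void h2v1_fancy_upsample_model(const unsigned char *in,
                                          unsigned char *out, int width)
    {
      int i;
      for (i = 0; i < width; i++) {
        int last = in[i > 0 ? i - 1 : 0];                  /* left edge */
        int next = in[i < width - 1 ? i + 1 : width - 1];  /* right edge */
        out[2 * i]     = (unsigned char)((in[i] * 3 + last + 1) >> 2);
        out[2 * i + 1] = (unsigned char)((in[i] * 3 + next + 2) >> 2);
      }
    }
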
diff --git a/media/libjpeg/simd/jfdctfst-altivec.c b/media/libjpeg/simd/powerpc/jfdctfst-altivec.c
index 04157f77ea..ad9af81e0c 100644
--- a/media/libjpeg/simd/jfdctfst-altivec.c
+++ b/media/libjpeg/simd/powerpc/jfdctfst-altivec.c
@@ -32,64 +32,62 @@
#include "jsimd_altivec.h"
-#define F_0_382 98 /* FIX(0.382683433) */
-#define F_0_541 139 /* FIX(0.541196100) */
-#define F_0_707 181 /* FIX(0.707106781) */
-#define F_1_306 334 /* FIX(1.306562965) */
+#define F_0_382 98 /* FIX(0.382683433) */
+#define F_0_541 139 /* FIX(0.541196100) */
+#define F_0_707 181 /* FIX(0.707106781) */
+#define F_1_306 334 /* FIX(1.306562965) */
-#define CONST_BITS 8
-#define PRE_MULTIPLY_SCALE_BITS 2
-#define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS - 1)
+#define CONST_BITS 8
+#define PRE_MULTIPLY_SCALE_BITS 2
+#define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS - 1)
-#define DO_FDCT() \
-{ \
- /* Even part */ \
+#define DO_FDCT() { \
+ /* Even part */ \
\
- tmp10 = vec_add(tmp0, tmp3); \
- tmp13 = vec_sub(tmp0, tmp3); \
- tmp11 = vec_add(tmp1, tmp2); \
- tmp12 = vec_sub(tmp1, tmp2); \
+ tmp10 = vec_add(tmp0, tmp3); \
+ tmp13 = vec_sub(tmp0, tmp3); \
+ tmp11 = vec_add(tmp1, tmp2); \
+ tmp12 = vec_sub(tmp1, tmp2); \
\
- out0 = vec_add(tmp10, tmp11); \
- out4 = vec_sub(tmp10, tmp11); \
+ out0 = vec_add(tmp10, tmp11); \
+ out4 = vec_sub(tmp10, tmp11); \
\
- z1 = vec_add(tmp12, tmp13); \
- z1 = vec_sl(z1, pre_multiply_scale_bits); \
- z1 = vec_madds(z1, pw_0707, pw_zero); \
+ z1 = vec_add(tmp12, tmp13); \
+ z1 = vec_sl(z1, pre_multiply_scale_bits); \
+ z1 = vec_madds(z1, pw_0707, pw_zero); \
\
- out2 = vec_add(tmp13, z1); \
- out6 = vec_sub(tmp13, z1); \
+ out2 = vec_add(tmp13, z1); \
+ out6 = vec_sub(tmp13, z1); \
\
- /* Odd part */ \
+ /* Odd part */ \
\
- tmp10 = vec_add(tmp4, tmp5); \
- tmp11 = vec_add(tmp5, tmp6); \
- tmp12 = vec_add(tmp6, tmp7); \
+ tmp10 = vec_add(tmp4, tmp5); \
+ tmp11 = vec_add(tmp5, tmp6); \
+ tmp12 = vec_add(tmp6, tmp7); \
\
- tmp10 = vec_sl(tmp10, pre_multiply_scale_bits); \
- tmp12 = vec_sl(tmp12, pre_multiply_scale_bits); \
- z5 = vec_sub(tmp10, tmp12); \
- z5 = vec_madds(z5, pw_0382, pw_zero); \
+ tmp10 = vec_sl(tmp10, pre_multiply_scale_bits); \
+ tmp12 = vec_sl(tmp12, pre_multiply_scale_bits); \
+ z5 = vec_sub(tmp10, tmp12); \
+ z5 = vec_madds(z5, pw_0382, pw_zero); \
\
- z2 = vec_madds(tmp10, pw_0541, z5); \
- z4 = vec_madds(tmp12, pw_1306, z5); \
+ z2 = vec_madds(tmp10, pw_0541, z5); \
+ z4 = vec_madds(tmp12, pw_1306, z5); \
\
- tmp11 = vec_sl(tmp11, pre_multiply_scale_bits); \
- z3 = vec_madds(tmp11, pw_0707, pw_zero); \
+ tmp11 = vec_sl(tmp11, pre_multiply_scale_bits); \
+ z3 = vec_madds(tmp11, pw_0707, pw_zero); \
\
- z11 = vec_add(tmp7, z3); \
- z13 = vec_sub(tmp7, z3); \
+ z11 = vec_add(tmp7, z3); \
+ z13 = vec_sub(tmp7, z3); \
\
- out5 = vec_add(z13, z2); \
- out3 = vec_sub(z13, z2); \
- out1 = vec_add(z11, z4); \
- out7 = vec_sub(z11, z4); \
+ out5 = vec_add(z13, z2); \
+ out3 = vec_sub(z13, z2); \
+ out1 = vec_add(z11, z4); \
+ out7 = vec_sub(z11, z4); \
}
-void
-jsimd_fdct_ifast_altivec (DCTELEM *data)
+void jsimd_fdct_ifast_altivec(DCTELEM *data)
{
__vector short row0, row1, row2, row3, row4, row5, row6, row7,
col0, col1, col2, col3, col4, col5, col6, col7,
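
The F_* values above are FIX(x) = round(x * 2^CONST_BITS) constants (F_0_707 = 181 is 0.707106781 * 256 rounded), and CONST_SHIFT exists to line their scale up with vec_madds, which per lane computes ((a * b) >> 15) + c with saturation. The input's pre-shift of 2, the constant's shift of 5, and FIX's 8 fractional bits add up to exactly the 15 bits that vec_madds discards, leaving plain in * x. A minimal scalar model of one multiply (saturation omitted):

    static short fix_mul_model(short in, double x)
    {
      /* CONST_SHIFT == 16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS - 1 == 5 */
      int b = ((int)(x * 256 + 0.5)) << 5;   /* FIX(x) << CONST_SHIFT */
      return (short)((((long long)in << 2) * b) >> 15);
    }
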
diff --git a/media/libjpeg/simd/powerpc/jfdctint-altivec.c b/media/libjpeg/simd/powerpc/jfdctint-altivec.c
new file mode 100644
index 0000000000..3d4f017103
--- /dev/null
+++ b/media/libjpeg/simd/powerpc/jfdctint-altivec.c
@@ -0,0 +1,258 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, 2020, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* ACCURATE INTEGER FORWARD DCT */
+
+#include "jsimd_altivec.h"
+
+
+#define F_0_298 2446 /* FIX(0.298631336) */
+#define F_0_390 3196 /* FIX(0.390180644) */
+#define F_0_541 4433 /* FIX(0.541196100) */
+#define F_0_765 6270 /* FIX(0.765366865) */
+#define F_0_899 7373 /* FIX(0.899976223) */
+#define F_1_175 9633 /* FIX(1.175875602) */
+#define F_1_501 12299 /* FIX(1.501321110) */
+#define F_1_847 15137 /* FIX(1.847759065) */
+#define F_1_961 16069 /* FIX(1.961570560) */
+#define F_2_053 16819 /* FIX(2.053119869) */
+#define F_2_562 20995 /* FIX(2.562915447) */
+#define F_3_072 25172 /* FIX(3.072711026) */
+
+#define CONST_BITS 13
+#define PASS1_BITS 2
+#define DESCALE_P1 (CONST_BITS - PASS1_BITS)
+#define DESCALE_P2 (CONST_BITS + PASS1_BITS)
+
+
+#define DO_FDCT_COMMON(PASS) { \
+ /* (Original) \
+ * z1 = (tmp12 + tmp13) * 0.541196100; \
+ * data2 = z1 + tmp13 * 0.765366865; \
+ * data6 = z1 + tmp12 * -1.847759065; \
+ * \
+ * (This implementation) \
+ * data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; \
+ * data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); \
+ */ \
+ \
+ tmp1312l = vec_mergeh(tmp13, tmp12); \
+ tmp1312h = vec_mergel(tmp13, tmp12); \
+ \
+ out2l = vec_msums(tmp1312l, pw_f130_f054, pd_descale_p##PASS); \
+ out2h = vec_msums(tmp1312h, pw_f130_f054, pd_descale_p##PASS); \
+ out6l = vec_msums(tmp1312l, pw_f054_mf130, pd_descale_p##PASS); \
+ out6h = vec_msums(tmp1312h, pw_f054_mf130, pd_descale_p##PASS); \
+ \
+ out2l = vec_sra(out2l, descale_p##PASS); \
+ out2h = vec_sra(out2h, descale_p##PASS); \
+ out6l = vec_sra(out6l, descale_p##PASS); \
+ out6h = vec_sra(out6h, descale_p##PASS); \
+ \
+ out2 = vec_pack(out2l, out2h); \
+ out6 = vec_pack(out6l, out6h); \
+ \
+ /* Odd part */ \
+ \
+ z3 = vec_add(tmp4, tmp6); \
+ z4 = vec_add(tmp5, tmp7); \
+ \
+ /* (Original) \
+ * z5 = (z3 + z4) * 1.175875602; \
+ * z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; \
+ * z3 += z5; z4 += z5; \
+ * \
+ * (This implementation) \
+ * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \
+ * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \
+ */ \
+ \
+ z34l = vec_mergeh(z3, z4); \
+ z34h = vec_mergel(z3, z4); \
+ \
+ z3l = vec_msums(z34l, pw_mf078_f117, pd_descale_p##PASS); \
+ z3h = vec_msums(z34h, pw_mf078_f117, pd_descale_p##PASS); \
+ z4l = vec_msums(z34l, pw_f117_f078, pd_descale_p##PASS); \
+ z4h = vec_msums(z34h, pw_f117_f078, pd_descale_p##PASS); \
+ \
+ /* (Original) \
+ * z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; \
+ * tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; \
+ * tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; \
+ * z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; \
+ * data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; \
+ * data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; \
+ * \
+ * (This implementation) \
+ * tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; \
+ * tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; \
+ * tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); \
+ * tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); \
+ * data7 = tmp4 + z3; data5 = tmp5 + z4; \
+ * data3 = tmp6 + z3; data1 = tmp7 + z4; \
+ */ \
+ \
+ tmp47l = vec_mergeh(tmp4, tmp7); \
+ tmp47h = vec_mergel(tmp4, tmp7); \
+ \
+ out7l = vec_msums(tmp47l, pw_mf060_mf089, z3l); \
+ out7h = vec_msums(tmp47h, pw_mf060_mf089, z3h); \
+ out1l = vec_msums(tmp47l, pw_mf089_f060, z4l); \
+ out1h = vec_msums(tmp47h, pw_mf089_f060, z4h); \
+ \
+ out7l = vec_sra(out7l, descale_p##PASS); \
+ out7h = vec_sra(out7h, descale_p##PASS); \
+ out1l = vec_sra(out1l, descale_p##PASS); \
+ out1h = vec_sra(out1h, descale_p##PASS); \
+ \
+ out7 = vec_pack(out7l, out7h); \
+ out1 = vec_pack(out1l, out1h); \
+ \
+ tmp56l = vec_mergeh(tmp5, tmp6); \
+ tmp56h = vec_mergel(tmp5, tmp6); \
+ \
+ out5l = vec_msums(tmp56l, pw_mf050_mf256, z4l); \
+ out5h = vec_msums(tmp56h, pw_mf050_mf256, z4h); \
+ out3l = vec_msums(tmp56l, pw_mf256_f050, z3l); \
+ out3h = vec_msums(tmp56h, pw_mf256_f050, z3h); \
+ \
+ out5l = vec_sra(out5l, descale_p##PASS); \
+ out5h = vec_sra(out5h, descale_p##PASS); \
+ out3l = vec_sra(out3l, descale_p##PASS); \
+ out3h = vec_sra(out3h, descale_p##PASS); \
+ \
+ out5 = vec_pack(out5l, out5h); \
+ out3 = vec_pack(out3l, out3h); \
+}
+
+#define DO_FDCT_PASS1() { \
+ /* Even part */ \
+ \
+ tmp10 = vec_add(tmp0, tmp3); \
+ tmp13 = vec_sub(tmp0, tmp3); \
+ tmp11 = vec_add(tmp1, tmp2); \
+ tmp12 = vec_sub(tmp1, tmp2); \
+ \
+ out0 = vec_add(tmp10, tmp11); \
+ out0 = vec_sl(out0, pass1_bits); \
+ out4 = vec_sub(tmp10, tmp11); \
+ out4 = vec_sl(out4, pass1_bits); \
+ \
+ DO_FDCT_COMMON(1); \
+}
+
+#define DO_FDCT_PASS2() { \
+ /* Even part */ \
+ \
+ tmp10 = vec_add(tmp0, tmp3); \
+ tmp13 = vec_sub(tmp0, tmp3); \
+ tmp11 = vec_add(tmp1, tmp2); \
+ tmp12 = vec_sub(tmp1, tmp2); \
+ \
+ out0 = vec_add(tmp10, tmp11); \
+ out0 = vec_add(out0, pw_descale_p2x); \
+ out0 = vec_sra(out0, pass1_bits); \
+ out4 = vec_sub(tmp10, tmp11); \
+ out4 = vec_add(out4, pw_descale_p2x); \
+ out4 = vec_sra(out4, pass1_bits); \
+ \
+ DO_FDCT_COMMON(2); \
+}
+
+
+void jsimd_fdct_islow_altivec(DCTELEM *data)
+{
+ __vector short row0, row1, row2, row3, row4, row5, row6, row7,
+ col0, col1, col2, col3, col4, col5, col6, col7,
+ tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
+ tmp47l, tmp47h, tmp56l, tmp56h, tmp1312l, tmp1312h,
+ z3, z4, z34l, z34h,
+ out0, out1, out2, out3, out4, out5, out6, out7;
+ __vector int z3l, z3h, z4l, z4h,
+ out1l, out1h, out2l, out2h, out3l, out3h, out5l, out5h, out6l, out6h,
+ out7l, out7h;
+
+ /* Constants */
+ __vector short
+ pw_f130_f054 = { __4X2(F_0_541 + F_0_765, F_0_541) },
+ pw_f054_mf130 = { __4X2(F_0_541, F_0_541 - F_1_847) },
+ pw_mf078_f117 = { __4X2(F_1_175 - F_1_961, F_1_175) },
+ pw_f117_f078 = { __4X2(F_1_175, F_1_175 - F_0_390) },
+ pw_mf060_mf089 = { __4X2(F_0_298 - F_0_899, -F_0_899) },
+ pw_mf089_f060 = { __4X2(-F_0_899, F_1_501 - F_0_899) },
+ pw_mf050_mf256 = { __4X2(F_2_053 - F_2_562, -F_2_562) },
+ pw_mf256_f050 = { __4X2(-F_2_562, F_3_072 - F_2_562) },
+ pw_descale_p2x = { __8X(1 << (PASS1_BITS - 1)) };
+ __vector unsigned short pass1_bits = { __8X(PASS1_BITS) };
+ __vector int pd_descale_p1 = { __4X(1 << (DESCALE_P1 - 1)) },
+ pd_descale_p2 = { __4X(1 << (DESCALE_P2 - 1)) };
+ __vector unsigned int descale_p1 = { __4X(DESCALE_P1) },
+ descale_p2 = { __4X(DESCALE_P2) };
+
+ /* Pass 1: process rows */
+
+ row0 = vec_ld(0, data);
+ row1 = vec_ld(16, data);
+ row2 = vec_ld(32, data);
+ row3 = vec_ld(48, data);
+ row4 = vec_ld(64, data);
+ row5 = vec_ld(80, data);
+ row6 = vec_ld(96, data);
+ row7 = vec_ld(112, data);
+
+ TRANSPOSE(row, col);
+
+ tmp0 = vec_add(col0, col7);
+ tmp7 = vec_sub(col0, col7);
+ tmp1 = vec_add(col1, col6);
+ tmp6 = vec_sub(col1, col6);
+ tmp2 = vec_add(col2, col5);
+ tmp5 = vec_sub(col2, col5);
+ tmp3 = vec_add(col3, col4);
+ tmp4 = vec_sub(col3, col4);
+
+ DO_FDCT_PASS1();
+
+ /* Pass 2: process columns */
+
+ TRANSPOSE(out, row);
+
+ tmp0 = vec_add(row0, row7);
+ tmp7 = vec_sub(row0, row7);
+ tmp1 = vec_add(row1, row6);
+ tmp6 = vec_sub(row1, row6);
+ tmp2 = vec_add(row2, row5);
+ tmp5 = vec_sub(row2, row5);
+ tmp3 = vec_add(row3, row4);
+ tmp4 = vec_sub(row3, row4);
+
+ DO_FDCT_PASS2();
+
+ vec_st(out0, 0, data);
+ vec_st(out1, 16, data);
+ vec_st(out2, 32, data);
+ vec_st(out3, 48, data);
+ vec_st(out4, 64, data);
+ vec_st(out5, 80, data);
+ vec_st(out6, 96, data);
+ vec_st(out7, 112, data);
+}
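
The comment pairs inside DO_FDCT_COMMON above record an algebraic refactoring: the shared rotation term z1 = (tmp12 + tmp13) * 0.541196100 is folded into per-output coefficients, so every output becomes a two-term dot product. vec_mergeh/vec_mergel interleave tmp13 and tmp12 into adjacent 16-bit lanes, and a single vec_msums then evaluates both terms of one output per 32-bit lane, with the rounding constant pd_descale_p* riding along in the accumulator. A scalar model of one data2 lane (using the F_* constants defined above; vec_msums saturation omitted):

    static int fdct_data2_lane(short tmp13, short tmp12,
                               int pd_descale, int descale_bits)
    {
      /* pw_f130_f054 repeats { F_0_541 + F_0_765, F_0_541 } per lane */
      int acc = tmp13 * (F_0_541 + F_0_765) + tmp12 * F_0_541 + pd_descale;
      return acc >> descale_bits;   /* vec_sra(out2l, descale_p##PASS) */
    }
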
diff --git a/media/libjpeg/simd/jidctfst-altivec.c b/media/libjpeg/simd/powerpc/jidctfst-altivec.c
index ec30c3995b..456c6c6174 100644
--- a/media/libjpeg/simd/jidctfst-altivec.c
+++ b/media/libjpeg/simd/powerpc/jidctfst-altivec.c
@@ -32,87 +32,85 @@
#include "jsimd_altivec.h"
-#define F_1_082 277 /* FIX(1.082392200) */
-#define F_1_414 362 /* FIX(1.414213562) */
-#define F_1_847 473 /* FIX(1.847759065) */
-#define F_2_613 669 /* FIX(2.613125930) */
-#define F_1_613 (F_2_613 - 256) /* FIX(2.613125930) - FIX(1) */
+#define F_1_082 277 /* FIX(1.082392200) */
+#define F_1_414 362 /* FIX(1.414213562) */
+#define F_1_847 473 /* FIX(1.847759065) */
+#define F_2_613 669 /* FIX(2.613125930) */
+#define F_1_613 (F_2_613 - 256) /* FIX(2.613125930) - FIX(1) */
-#define CONST_BITS 8
-#define PASS1_BITS 2
-#define PRE_MULTIPLY_SCALE_BITS 2
-#define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS - 1)
+#define CONST_BITS 8
+#define PASS1_BITS 2
+#define PRE_MULTIPLY_SCALE_BITS 2
+#define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS - 1)
-#define DO_IDCT(in) \
-{ \
- /* Even part */ \
+#define DO_IDCT(in) { \
+ /* Even part */ \
\
- tmp10 = vec_add(in##0, in##4); \
- tmp11 = vec_sub(in##0, in##4); \
- tmp13 = vec_add(in##2, in##6); \
+ tmp10 = vec_add(in##0, in##4); \
+ tmp11 = vec_sub(in##0, in##4); \
+ tmp13 = vec_add(in##2, in##6); \
\
- tmp12 = vec_sub(in##2, in##6); \
- tmp12 = vec_sl(tmp12, pre_multiply_scale_bits); \
- tmp12 = vec_madds(tmp12, pw_F1414, pw_zero); \
- tmp12 = vec_sub(tmp12, tmp13); \
+ tmp12 = vec_sub(in##2, in##6); \
+ tmp12 = vec_sl(tmp12, pre_multiply_scale_bits); \
+ tmp12 = vec_madds(tmp12, pw_F1414, pw_zero); \
+ tmp12 = vec_sub(tmp12, tmp13); \
\
- tmp0 = vec_add(tmp10, tmp13); \
- tmp3 = vec_sub(tmp10, tmp13); \
- tmp1 = vec_add(tmp11, tmp12); \
- tmp2 = vec_sub(tmp11, tmp12); \
+ tmp0 = vec_add(tmp10, tmp13); \
+ tmp3 = vec_sub(tmp10, tmp13); \
+ tmp1 = vec_add(tmp11, tmp12); \
+ tmp2 = vec_sub(tmp11, tmp12); \
\
- /* Odd part */ \
+ /* Odd part */ \
\
- z13 = vec_add(in##5, in##3); \
- z10 = vec_sub(in##5, in##3); \
- z10s = vec_sl(z10, pre_multiply_scale_bits); \
- z11 = vec_add(in##1, in##7); \
- z12s = vec_sub(in##1, in##7); \
- z12s = vec_sl(z12s, pre_multiply_scale_bits); \
+ z13 = vec_add(in##5, in##3); \
+ z10 = vec_sub(in##5, in##3); \
+ z10s = vec_sl(z10, pre_multiply_scale_bits); \
+ z11 = vec_add(in##1, in##7); \
+ z12s = vec_sub(in##1, in##7); \
+ z12s = vec_sl(z12s, pre_multiply_scale_bits); \
\
- tmp11 = vec_sub(z11, z13); \
- tmp11 = vec_sl(tmp11, pre_multiply_scale_bits); \
- tmp11 = vec_madds(tmp11, pw_F1414, pw_zero); \
+ tmp11 = vec_sub(z11, z13); \
+ tmp11 = vec_sl(tmp11, pre_multiply_scale_bits); \
+ tmp11 = vec_madds(tmp11, pw_F1414, pw_zero); \
\
- tmp7 = vec_add(z11, z13); \
+ tmp7 = vec_add(z11, z13); \
\
- /* To avoid overflow... \
- * \
- * (Original) \
- * tmp12 = -2.613125930 * z10 + z5; \
- * \
- * (This implementation) \
- * tmp12 = (-1.613125930 - 1) * z10 + z5; \
- * = -1.613125930 * z10 - z10 + z5; \
- */ \
+ /* To avoid overflow... \
+ * \
+ * (Original) \
+ * tmp12 = -2.613125930 * z10 + z5; \
+ * \
+ * (This implementation) \
+ * tmp12 = (-1.613125930 - 1) * z10 + z5; \
+ * = -1.613125930 * z10 - z10 + z5; \
+ */ \
\
- z5 = vec_add(z10s, z12s); \
- z5 = vec_madds(z5, pw_F1847, pw_zero); \
+ z5 = vec_add(z10s, z12s); \
+ z5 = vec_madds(z5, pw_F1847, pw_zero); \
\
- tmp10 = vec_madds(z12s, pw_F1082, pw_zero); \
- tmp10 = vec_sub(tmp10, z5); \
- tmp12 = vec_madds(z10s, pw_MF1613, z5); \
- tmp12 = vec_sub(tmp12, z10); \
+ tmp10 = vec_madds(z12s, pw_F1082, pw_zero); \
+ tmp10 = vec_sub(tmp10, z5); \
+ tmp12 = vec_madds(z10s, pw_MF1613, z5); \
+ tmp12 = vec_sub(tmp12, z10); \
\
- tmp6 = vec_sub(tmp12, tmp7); \
- tmp5 = vec_sub(tmp11, tmp6); \
- tmp4 = vec_add(tmp10, tmp5); \
+ tmp6 = vec_sub(tmp12, tmp7); \
+ tmp5 = vec_sub(tmp11, tmp6); \
+ tmp4 = vec_add(tmp10, tmp5); \
\
- out0 = vec_add(tmp0, tmp7); \
- out1 = vec_add(tmp1, tmp6); \
- out2 = vec_add(tmp2, tmp5); \
- out3 = vec_sub(tmp3, tmp4); \
- out4 = vec_add(tmp3, tmp4); \
- out5 = vec_sub(tmp2, tmp5); \
- out6 = vec_sub(tmp1, tmp6); \
- out7 = vec_sub(tmp0, tmp7); \
+ out0 = vec_add(tmp0, tmp7); \
+ out1 = vec_add(tmp1, tmp6); \
+ out2 = vec_add(tmp2, tmp5); \
+ out3 = vec_sub(tmp3, tmp4); \
+ out4 = vec_add(tmp3, tmp4); \
+ out5 = vec_sub(tmp2, tmp5); \
+ out6 = vec_sub(tmp1, tmp6); \
+ out7 = vec_sub(tmp0, tmp7); \
}
-void
-jsimd_idct_ifast_altivec (void *dct_table_, JCOEFPTR coef_block,
- JSAMPARRAY output_buf, JDIMENSION output_col)
+void jsimd_idct_ifast_altivec(void *dct_table_, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col)
{
short *dct_table = (short *)dct_table_;
int *outptr;
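
The "To avoid overflow" rewrite carried through DO_IDCT above rests on the identity below; F_1_613 = F_2_613 - 256 encodes FIX(2.613125930) - FIX(1), since FIX(1) = 256 at CONST_BITS = 8. The fixed-point multiply only ever sees the 1.613... multiplier, and the remaining -z10 is applied exactly by the trailing vec_sub:

    /* tmp12 = vec_madds(z10s, pw_MF1613, z5); tmp12 = vec_sub(tmp12, z10);
     * per lane (model, saturation omitted):
     *   tmp12 = (-1.613125930 * z10 + z5) - z10
     *         =  -2.613125930 * z10 + z5
     */
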
diff --git a/media/libjpeg/simd/jidctint-altivec.c b/media/libjpeg/simd/powerpc/jidctint-altivec.c
index 935f35d1eb..60e619f11d 100644
--- a/media/libjpeg/simd/jidctint-altivec.c
+++ b/media/libjpeg/simd/powerpc/jidctint-altivec.c
@@ -1,7 +1,7 @@
/*
* AltiVec optimizations for libjpeg-turbo
*
- * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2014-2015, 2020, D. R. Commander. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
@@ -20,194 +20,192 @@
* 3. This notice may not be removed or altered from any source distribution.
*/
-/* SLOW INTEGER INVERSE DCT */
+/* ACCURATE INTEGER INVERSE DCT */
#include "jsimd_altivec.h"
-#define F_0_298 2446 /* FIX(0.298631336) */
-#define F_0_390 3196 /* FIX(0.390180644) */
-#define F_0_541 4433 /* FIX(0.541196100) */
-#define F_0_765 6270 /* FIX(0.765366865) */
-#define F_0_899 7373 /* FIX(0.899976223) */
-#define F_1_175 9633 /* FIX(1.175875602) */
-#define F_1_501 12299 /* FIX(1.501321110) */
-#define F_1_847 15137 /* FIX(1.847759065) */
-#define F_1_961 16069 /* FIX(1.961570560) */
-#define F_2_053 16819 /* FIX(2.053119869) */
-#define F_2_562 20995 /* FIX(2.562915447) */
-#define F_3_072 25172 /* FIX(3.072711026) */
-
-#define CONST_BITS 13
-#define PASS1_BITS 2
-#define DESCALE_P1 (CONST_BITS - PASS1_BITS)
-#define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3)
-
-
-#define DO_IDCT(in, PASS) \
-{ \
- /* Even part \
- * \
- * (Original) \
- * z1 = (z2 + z3) * 0.541196100; \
- * tmp2 = z1 + z3 * -1.847759065; \
- * tmp3 = z1 + z2 * 0.765366865; \
- * \
- * (This implementation) \
- * tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); \
- * tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; \
- */ \
+#define F_0_298 2446 /* FIX(0.298631336) */
+#define F_0_390 3196 /* FIX(0.390180644) */
+#define F_0_541 4433 /* FIX(0.541196100) */
+#define F_0_765 6270 /* FIX(0.765366865) */
+#define F_0_899 7373 /* FIX(0.899976223) */
+#define F_1_175 9633 /* FIX(1.175875602) */
+#define F_1_501 12299 /* FIX(1.501321110) */
+#define F_1_847 15137 /* FIX(1.847759065) */
+#define F_1_961 16069 /* FIX(1.961570560) */
+#define F_2_053 16819 /* FIX(2.053119869) */
+#define F_2_562 20995 /* FIX(2.562915447) */
+#define F_3_072 25172 /* FIX(3.072711026) */
+
+#define CONST_BITS 13
+#define PASS1_BITS 2
+#define DESCALE_P1 (CONST_BITS - PASS1_BITS)
+#define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3)
+
+
+#define DO_IDCT(in, PASS) { \
+ /* Even part \
+ * \
+ * (Original) \
+ * z1 = (z2 + z3) * 0.541196100; \
+ * tmp2 = z1 + z3 * -1.847759065; \
+ * tmp3 = z1 + z2 * 0.765366865; \
+ * \
+ * (This implementation) \
+ * tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); \
+ * tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; \
+ */ \
\
- in##26l = vec_mergeh(in##2, in##6); \
- in##26h = vec_mergel(in##2, in##6); \
+ in##26l = vec_mergeh(in##2, in##6); \
+ in##26h = vec_mergel(in##2, in##6); \
\
- tmp3l = vec_msums(in##26l, pw_f130_f054, pd_zero); \
- tmp3h = vec_msums(in##26h, pw_f130_f054, pd_zero); \
- tmp2l = vec_msums(in##26l, pw_f054_mf130, pd_zero); \
- tmp2h = vec_msums(in##26h, pw_f054_mf130, pd_zero); \
+ tmp3l = vec_msums(in##26l, pw_f130_f054, pd_zero); \
+ tmp3h = vec_msums(in##26h, pw_f130_f054, pd_zero); \
+ tmp2l = vec_msums(in##26l, pw_f054_mf130, pd_zero); \
+ tmp2h = vec_msums(in##26h, pw_f054_mf130, pd_zero); \
\
- tmp0 = vec_add(in##0, in##4); \
- tmp1 = vec_sub(in##0, in##4); \
+ tmp0 = vec_add(in##0, in##4); \
+ tmp1 = vec_sub(in##0, in##4); \
\
- tmp0l = vec_unpackh(tmp0); \
- tmp0h = vec_unpackl(tmp0); \
- tmp0l = vec_sl(tmp0l, const_bits); \
- tmp0h = vec_sl(tmp0h, const_bits); \
- tmp0l = vec_add(tmp0l, pd_descale_p##PASS); \
- tmp0h = vec_add(tmp0h, pd_descale_p##PASS); \
+ tmp0l = vec_unpackh(tmp0); \
+ tmp0h = vec_unpackl(tmp0); \
+ tmp0l = vec_sl(tmp0l, const_bits); \
+ tmp0h = vec_sl(tmp0h, const_bits); \
+ tmp0l = vec_add(tmp0l, pd_descale_p##PASS); \
+ tmp0h = vec_add(tmp0h, pd_descale_p##PASS); \
\
- tmp10l = vec_add(tmp0l, tmp3l); \
- tmp10h = vec_add(tmp0h, tmp3h); \
- tmp13l = vec_sub(tmp0l, tmp3l); \
- tmp13h = vec_sub(tmp0h, tmp3h); \
+ tmp10l = vec_add(tmp0l, tmp3l); \
+ tmp10h = vec_add(tmp0h, tmp3h); \
+ tmp13l = vec_sub(tmp0l, tmp3l); \
+ tmp13h = vec_sub(tmp0h, tmp3h); \
\
- tmp1l = vec_unpackh(tmp1); \
- tmp1h = vec_unpackl(tmp1); \
- tmp1l = vec_sl(tmp1l, const_bits); \
- tmp1h = vec_sl(tmp1h, const_bits); \
- tmp1l = vec_add(tmp1l, pd_descale_p##PASS); \
- tmp1h = vec_add(tmp1h, pd_descale_p##PASS); \
+ tmp1l = vec_unpackh(tmp1); \
+ tmp1h = vec_unpackl(tmp1); \
+ tmp1l = vec_sl(tmp1l, const_bits); \
+ tmp1h = vec_sl(tmp1h, const_bits); \
+ tmp1l = vec_add(tmp1l, pd_descale_p##PASS); \
+ tmp1h = vec_add(tmp1h, pd_descale_p##PASS); \
\
- tmp11l = vec_add(tmp1l, tmp2l); \
- tmp11h = vec_add(tmp1h, tmp2h); \
- tmp12l = vec_sub(tmp1l, tmp2l); \
- tmp12h = vec_sub(tmp1h, tmp2h); \
+ tmp11l = vec_add(tmp1l, tmp2l); \
+ tmp11h = vec_add(tmp1h, tmp2h); \
+ tmp12l = vec_sub(tmp1l, tmp2l); \
+ tmp12h = vec_sub(tmp1h, tmp2h); \
\
- /* Odd part */ \
+ /* Odd part */ \
\
- z3 = vec_add(in##3, in##7); \
- z4 = vec_add(in##1, in##5); \
+ z3 = vec_add(in##3, in##7); \
+ z4 = vec_add(in##1, in##5); \
\
- /* (Original) \
- * z5 = (z3 + z4) * 1.175875602; \
- * z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; \
- * z3 += z5; z4 += z5; \
- * \
- * (This implementation) \
- * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \
- * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \
- */ \
+ /* (Original) \
+ * z5 = (z3 + z4) * 1.175875602; \
+ * z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; \
+ * z3 += z5; z4 += z5; \
+ * \
+ * (This implementation) \
+ * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \
+ * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \
+ */ \
\
- z34l = vec_mergeh(z3, z4); \
- z34h = vec_mergel(z3, z4); \
+ z34l = vec_mergeh(z3, z4); \
+ z34h = vec_mergel(z3, z4); \
\
- z3l = vec_msums(z34l, pw_mf078_f117, pd_zero); \
- z3h = vec_msums(z34h, pw_mf078_f117, pd_zero); \
- z4l = vec_msums(z34l, pw_f117_f078, pd_zero); \
- z4h = vec_msums(z34h, pw_f117_f078, pd_zero); \
+ z3l = vec_msums(z34l, pw_mf078_f117, pd_zero); \
+ z3h = vec_msums(z34h, pw_mf078_f117, pd_zero); \
+ z4l = vec_msums(z34l, pw_f117_f078, pd_zero); \
+ z4h = vec_msums(z34h, pw_f117_f078, pd_zero); \
\
- /* (Original) \
- * z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; \
- * tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; \
- * tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; \
- * z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; \
- * tmp0 += z1 + z3; tmp1 += z2 + z4; \
- * tmp2 += z2 + z3; tmp3 += z1 + z4; \
- * \
- * (This implementation) \
- * tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; \
- * tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; \
- * tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); \
- * tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); \
- * tmp0 += z3; tmp1 += z4; \
- * tmp2 += z3; tmp3 += z4; \
- */ \
+ /* (Original) \
+ * z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; \
+ * tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; \
+ * tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; \
+ * z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; \
+ * tmp0 += z1 + z3; tmp1 += z2 + z4; \
+ * tmp2 += z2 + z3; tmp3 += z1 + z4; \
+ * \
+ * (This implementation) \
+ * tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; \
+ * tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; \
+ * tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); \
+ * tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); \
+ * tmp0 += z3; tmp1 += z4; \
+ * tmp2 += z3; tmp3 += z4; \
+ */ \
\
- in##71l = vec_mergeh(in##7, in##1); \
- in##71h = vec_mergel(in##7, in##1); \
+ in##71l = vec_mergeh(in##7, in##1); \
+ in##71h = vec_mergel(in##7, in##1); \
\
- tmp0l = vec_msums(in##71l, pw_mf060_mf089, z3l); \
- tmp0h = vec_msums(in##71h, pw_mf060_mf089, z3h); \
- tmp3l = vec_msums(in##71l, pw_mf089_f060, z4l); \
- tmp3h = vec_msums(in##71h, pw_mf089_f060, z4h); \
+ tmp0l = vec_msums(in##71l, pw_mf060_mf089, z3l); \
+ tmp0h = vec_msums(in##71h, pw_mf060_mf089, z3h); \
+ tmp3l = vec_msums(in##71l, pw_mf089_f060, z4l); \
+ tmp3h = vec_msums(in##71h, pw_mf089_f060, z4h); \
\
- in##53l = vec_mergeh(in##5, in##3); \
- in##53h = vec_mergel(in##5, in##3); \
+ in##53l = vec_mergeh(in##5, in##3); \
+ in##53h = vec_mergel(in##5, in##3); \
\
- tmp1l = vec_msums(in##53l, pw_mf050_mf256, z4l); \
- tmp1h = vec_msums(in##53h, pw_mf050_mf256, z4h); \
- tmp2l = vec_msums(in##53l, pw_mf256_f050, z3l); \
- tmp2h = vec_msums(in##53h, pw_mf256_f050, z3h); \
+ tmp1l = vec_msums(in##53l, pw_mf050_mf256, z4l); \
+ tmp1h = vec_msums(in##53h, pw_mf050_mf256, z4h); \
+ tmp2l = vec_msums(in##53l, pw_mf256_f050, z3l); \
+ tmp2h = vec_msums(in##53h, pw_mf256_f050, z3h); \
\
- /* Final output stage */ \
+ /* Final output stage */ \
\
- out0l = vec_add(tmp10l, tmp3l); \
- out0h = vec_add(tmp10h, tmp3h); \
- out7l = vec_sub(tmp10l, tmp3l); \
- out7h = vec_sub(tmp10h, tmp3h); \
+ out0l = vec_add(tmp10l, tmp3l); \
+ out0h = vec_add(tmp10h, tmp3h); \
+ out7l = vec_sub(tmp10l, tmp3l); \
+ out7h = vec_sub(tmp10h, tmp3h); \
\
- out0l = vec_sra(out0l, descale_p##PASS); \
- out0h = vec_sra(out0h, descale_p##PASS); \
- out7l = vec_sra(out7l, descale_p##PASS); \
- out7h = vec_sra(out7h, descale_p##PASS); \
+ out0l = vec_sra(out0l, descale_p##PASS); \
+ out0h = vec_sra(out0h, descale_p##PASS); \
+ out7l = vec_sra(out7l, descale_p##PASS); \
+ out7h = vec_sra(out7h, descale_p##PASS); \
\
- out0 = vec_pack(out0l, out0h); \
- out7 = vec_pack(out7l, out7h); \
+ out0 = vec_pack(out0l, out0h); \
+ out7 = vec_pack(out7l, out7h); \
\
- out1l = vec_add(tmp11l, tmp2l); \
- out1h = vec_add(tmp11h, tmp2h); \
- out6l = vec_sub(tmp11l, tmp2l); \
- out6h = vec_sub(tmp11h, tmp2h); \
+ out1l = vec_add(tmp11l, tmp2l); \
+ out1h = vec_add(tmp11h, tmp2h); \
+ out6l = vec_sub(tmp11l, tmp2l); \
+ out6h = vec_sub(tmp11h, tmp2h); \
\
- out1l = vec_sra(out1l, descale_p##PASS); \
- out1h = vec_sra(out1h, descale_p##PASS); \
- out6l = vec_sra(out6l, descale_p##PASS); \
- out6h = vec_sra(out6h, descale_p##PASS); \
+ out1l = vec_sra(out1l, descale_p##PASS); \
+ out1h = vec_sra(out1h, descale_p##PASS); \
+ out6l = vec_sra(out6l, descale_p##PASS); \
+ out6h = vec_sra(out6h, descale_p##PASS); \
\
- out1 = vec_pack(out1l, out1h); \
- out6 = vec_pack(out6l, out6h); \
+ out1 = vec_pack(out1l, out1h); \
+ out6 = vec_pack(out6l, out6h); \
\
- out2l = vec_add(tmp12l, tmp1l); \
- out2h = vec_add(tmp12h, tmp1h); \
- out5l = vec_sub(tmp12l, tmp1l); \
- out5h = vec_sub(tmp12h, tmp1h); \
+ out2l = vec_add(tmp12l, tmp1l); \
+ out2h = vec_add(tmp12h, tmp1h); \
+ out5l = vec_sub(tmp12l, tmp1l); \
+ out5h = vec_sub(tmp12h, tmp1h); \
\
- out2l = vec_sra(out2l, descale_p##PASS); \
- out2h = vec_sra(out2h, descale_p##PASS); \
- out5l = vec_sra(out5l, descale_p##PASS); \
- out5h = vec_sra(out5h, descale_p##PASS); \
+ out2l = vec_sra(out2l, descale_p##PASS); \
+ out2h = vec_sra(out2h, descale_p##PASS); \
+ out5l = vec_sra(out5l, descale_p##PASS); \
+ out5h = vec_sra(out5h, descale_p##PASS); \
\
- out2 = vec_pack(out2l, out2h); \
- out5 = vec_pack(out5l, out5h); \
+ out2 = vec_pack(out2l, out2h); \
+ out5 = vec_pack(out5l, out5h); \
\
- out3l = vec_add(tmp13l, tmp0l); \
- out3h = vec_add(tmp13h, tmp0h); \
- out4l = vec_sub(tmp13l, tmp0l); \
- out4h = vec_sub(tmp13h, tmp0h); \
+ out3l = vec_add(tmp13l, tmp0l); \
+ out3h = vec_add(tmp13h, tmp0h); \
+ out4l = vec_sub(tmp13l, tmp0l); \
+ out4h = vec_sub(tmp13h, tmp0h); \
\
- out3l = vec_sra(out3l, descale_p##PASS); \
- out3h = vec_sra(out3h, descale_p##PASS); \
- out4l = vec_sra(out4l, descale_p##PASS); \
- out4h = vec_sra(out4h, descale_p##PASS); \
+ out3l = vec_sra(out3l, descale_p##PASS); \
+ out3h = vec_sra(out3h, descale_p##PASS); \
+ out4l = vec_sra(out4l, descale_p##PASS); \
+ out4h = vec_sra(out4h, descale_p##PASS); \
\
- out3 = vec_pack(out3l, out3h); \
- out4 = vec_pack(out4l, out4h); \
+ out3 = vec_pack(out3l, out3h); \
+ out4 = vec_pack(out4l, out4h); \
}
-void
-jsimd_idct_islow_altivec (void *dct_table_, JCOEFPTR coef_block,
- JSAMPARRAY output_buf, JDIMENSION output_col)
+void jsimd_idct_islow_altivec(void *dct_table_, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col)
{
short *dct_table = (short *)dct_table_;
int *outptr;
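
Nearly every multiply in DO_IDCT above is a vec_msums, which reduces pairs of adjacent 16-bit lanes into a 32-bit accumulator (the real instruction saturates the final add; the model below omits that). This is why the inputs are first interleaved with vec_mergeh/vec_mergel: after merging, each 32-bit lane holds one (z3, z4) or (in7, in1) pair, so one instruction evaluates both terms of a rotated output.

    /* Scalar model of vec_msums(a, b, c) on eight shorts. */
    static void vec_msums_model(const short a[8], const short b[8],
                                const int c[4], int out[4])
    {
      int i;
      for (i = 0; i < 4; i++)
        out[i] = a[2 * i] * b[2 * i] + a[2 * i + 1] * b[2 * i + 1] + c[i];
    }
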
diff --git a/media/libjpeg/simd/jquanti-altivec.c b/media/libjpeg/simd/powerpc/jquanti-altivec.c
index 25cc296f7a..7d6e32542b 100644
--- a/media/libjpeg/simd/jquanti-altivec.c
+++ b/media/libjpeg/simd/powerpc/jquanti-altivec.c
@@ -31,26 +31,25 @@
*/
#if __BIG_ENDIAN__
-#define LOAD_ROW(row) { \
- elemptr = sample_data[row] + start_col; \
- in##row = vec_ld(0, elemptr); \
- if ((size_t)elemptr & 15) \
- in##row = vec_perm(in##row, in##row, vec_lvsl(0, elemptr)); \
+#define LOAD_ROW(row) { \
+ elemptr = sample_data[row] + start_col; \
+ in##row = vec_ld(0, elemptr); \
+ if ((size_t)elemptr & 15) \
+ in##row = vec_perm(in##row, in##row, vec_lvsl(0, elemptr)); \
}
#else
-#define LOAD_ROW(row) { \
- elemptr = sample_data[row] + start_col; \
- in##row = vec_vsx_ld(0, elemptr); \
+#define LOAD_ROW(row) { \
+ elemptr = sample_data[row] + start_col; \
+ in##row = vec_vsx_ld(0, elemptr); \
}
#endif
-void
-jsimd_convsamp_altivec (JSAMPARRAY sample_data, JDIMENSION start_col,
- DCTELEM *workspace)
+void jsimd_convsamp_altivec(JSAMPARRAY sample_data, JDIMENSION start_col,
+ DCTELEM *workspace)
{
JSAMPROW elemptr;
@@ -99,24 +98,23 @@ jsimd_convsamp_altivec (JSAMPARRAY sample_data, JDIMENSION start_col,
}
-#define WORD_BIT 16
+#define WORD_BIT 16
/* There is no AltiVec 16-bit unsigned multiply instruction, hence this.
We basically need an unsigned equivalent of vec_madds(). */
-#define MULTIPLY(vs0, vs1, out) { \
- tmpe = vec_mule((__vector unsigned short)vs0, \
- (__vector unsigned short)vs1); \
- tmpo = vec_mulo((__vector unsigned short)vs0, \
- (__vector unsigned short)vs1); \
- out = (__vector short)vec_perm((__vector unsigned short)tmpe, \
- (__vector unsigned short)tmpo, \
- shift_pack_index); \
+#define MULTIPLY(vs0, vs1, out) { \
+ tmpe = vec_mule((__vector unsigned short)vs0, \
+ (__vector unsigned short)vs1); \
+ tmpo = vec_mulo((__vector unsigned short)vs0, \
+ (__vector unsigned short)vs1); \
+ out = (__vector short)vec_perm((__vector unsigned short)tmpe, \
+ (__vector unsigned short)tmpo, \
+ shift_pack_index); \
}
-void
-jsimd_quantize_altivec (JCOEFPTR coef_block, DCTELEM *divisors,
- DCTELEM *workspace)
+void jsimd_quantize_altivec(JCOEFPTR coef_block, DCTELEM *divisors,
+ DCTELEM *workspace)
{
__vector short row0, row1, row2, row3, row4, row5, row6, row7,
row0s, row1s, row2s, row3s, row4s, row5s, row6s, row7s,
@@ -129,10 +127,10 @@ jsimd_quantize_altivec (JCOEFPTR coef_block, DCTELEM *divisors,
__vector unsigned short pw_word_bit_m1 = { __8X(WORD_BIT - 1) };
#if __BIG_ENDIAN__
__vector unsigned char shift_pack_index =
- {0,1,16,17,4,5,20,21,8,9,24,25,12,13,28,29};
+ { 0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29 };
#else
__vector unsigned char shift_pack_index =
- {2,3,18,19,6,7,22,23,10,11,26,27,14,15,30,31};
+ { 2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31 };
#endif
row0 = vec_ld(0, workspace);
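
The MULTIPLY macro above builds the missing unsigned 16-bit multiply out of three real instructions: vec_mule/vec_mulo form the full 32-bit products of the even and odd lanes, and shift_pack_index permutes just the high half of each product back into lane order (bytes 0-1 of each 32-bit product on big-endian, bytes 2-3 on little-endian). Per lane this is an unsigned multiply-high, the standard building block for replacing a quantization divide with a reciprocal multiply:

    /* Scalar model of one lane of MULTIPLY(vs0, vs1, out). */
    static unsigned short umulhi16(unsigned short a, unsigned short b)
    {
      return (unsigned short)(((unsigned int)a * b) >> 16);
    }
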
diff --git a/media/libjpeg/simd/powerpc/jsimd.c b/media/libjpeg/simd/powerpc/jsimd.c
new file mode 100644
index 0000000000..b9e86dcfac
--- /dev/null
+++ b/media/libjpeg/simd/powerpc/jsimd.c
@@ -0,0 +1,881 @@
+/*
+ * jsimd_powerpc.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2009-2011, 2014-2016, 2018, D. R. Commander.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on a
+ * PowerPC architecture.
+ */
+
+#ifdef __amigaos4__
+/* This must be defined first as it re-defines GLOBAL otherwise */
+#include <proto/exec.h>
+#endif
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+
+#if defined(__OpenBSD__)
+#include <sys/param.h>
+#include <sys/sysctl.h>
+#include <machine/cpu.h>
+#elif defined(__FreeBSD__)
+#include <machine/cpu.h>
+#include <sys/auxv.h>
+#endif
+
+static unsigned int simd_support = ~0;
+
+#if !defined(__ALTIVEC__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__))
+
+#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT (1024 * 1024)
+
+LOCAL(int)
+check_feature(char *buffer, char *feature)
+{
+ char *p;
+
+ if (*feature == 0)
+ return 0;
+ if (strncmp(buffer, "cpu", 3) != 0)
+ return 0;
+ buffer += 3;
+ while (isspace(*buffer))
+ buffer++;
+
+ /* Check if 'feature' is present in the buffer as a separate word */
+ while ((p = strstr(buffer, feature))) {
+ if (p > buffer && !isspace(*(p - 1))) {
+ buffer++;
+ continue;
+ }
+ p += strlen(feature);
+ if (*p != 0 && !isspace(*p)) {
+ buffer++;
+ continue;
+ }
+ return 1;
+ }
+ return 0;
+}
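
check_feature() only considers lines whose key starts with "cpu" and only accepts the feature name as a whitespace-delimited word, so a substring inside a model name cannot match. On an AltiVec-capable PowerPC Linux system the matching /proc/cpuinfo line typically looks like the following (the model string varies by machine):

    cpu             : 7447A, altivec supported

Here check_feature(line, "altivec") returns 1, because "altivec" is bounded by whitespace on both sides.
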
+
+LOCAL(int)
+parse_proc_cpuinfo(int bufsize)
+{
+ char *buffer = (char *)malloc(bufsize);
+ FILE *fd;
+
+ simd_support = 0;
+
+ if (!buffer)
+ return 0;
+
+ fd = fopen("/proc/cpuinfo", "r");
+ if (fd) {
+ while (fgets(buffer, bufsize, fd)) {
+ if (!strchr(buffer, '\n') && !feof(fd)) {
+ /* "impossible" happened - insufficient size of the buffer! */
+ fclose(fd);
+ free(buffer);
+ return 0;
+ }
+ if (check_feature(buffer, "altivec"))
+ simd_support |= JSIMD_ALTIVEC;
+ }
+ fclose(fd);
+ }
+ free(buffer);
+ return 1;
+}
+
+#endif
+
+/*
+ * Check what SIMD accelerations are supported.
+ *
+ * FIXME: This code is racy under a multi-threaded environment.
+ */
+LOCAL(void)
+init_simd(void)
+{
+#ifndef NO_GETENV
+ char *env = NULL;
+#endif
+#if !defined(__ALTIVEC__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__))
+ int bufsize = 1024; /* an initial guess for the line buffer size limit */
+#elif defined(__amigaos4__)
+ uint32 altivec = 0;
+#elif defined(__OpenBSD__)
+ int mib[2] = { CTL_MACHDEP, CPU_ALTIVEC };
+ int altivec;
+ size_t len = sizeof(altivec);
+#elif defined(__FreeBSD__)
+ unsigned long cpufeatures = 0;
+#endif
+
+ if (simd_support != ~0U)
+ return;
+
+ simd_support = 0;
+
+#if defined(__ALTIVEC__) || defined(__APPLE__)
+ simd_support |= JSIMD_ALTIVEC;
+#elif defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
+ while (!parse_proc_cpuinfo(bufsize)) {
+ bufsize *= 2;
+ if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT)
+ break;
+ }
+#elif defined(__amigaos4__)
+ IExec->GetCPUInfoTags(GCIT_VectorUnit, &altivec, TAG_DONE);
+ if (altivec == VECTORTYPE_ALTIVEC)
+ simd_support |= JSIMD_ALTIVEC;
+#elif defined(__OpenBSD__)
+ if (sysctl(mib, 2, &altivec, &len, NULL, 0) == 0 && altivec != 0)
+ simd_support |= JSIMD_ALTIVEC;
+#elif defined(__FreeBSD__)
+ elf_aux_info(AT_HWCAP, &cpufeatures, sizeof(cpufeatures));
+ if (cpufeatures & PPC_FEATURE_HAS_ALTIVEC)
+ simd_support |= JSIMD_ALTIVEC;
+#endif
+
+#ifndef NO_GETENV
+ /* Force different settings through environment variables */
+ env = getenv("JSIMD_FORCEALTIVEC");
+ if ((env != NULL) && (strcmp(env, "1") == 0))
+ simd_support = JSIMD_ALTIVEC;
+ env = getenv("JSIMD_FORCENONE");
+ if ((env != NULL) && (strcmp(env, "1") == 0))
+ simd_support = 0;
+#endif
+}
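
Since init_simd() runs once per process, the two overrides make quick A/B comparisons easy. For example, assuming the stock cjpeg/djpeg utilities and a build without NO_GETENV:

    JSIMD_FORCENONE=1 djpeg -outfile out.ppm in.jpg
    JSIMD_FORCEALTIVEC=1 cjpeg -outfile out.jpg in.ppm

The first command exercises the portable C paths even on AltiVec hardware; the second claims AltiVec support regardless of what detection found, so it is only safe on hardware that actually has it.
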
+
+GLOBAL(int)
+jsimd_can_rgb_ycc(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+ return 0;
+
+ if (simd_support & JSIMD_ALTIVEC)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_rgb_gray(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+ return 0;
+
+ if (simd_support & JSIMD_ALTIVEC)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+ return 0;
+
+ if (simd_support & JSIMD_ALTIVEC)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb565(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
+{
+ void (*altivecfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+ switch (cinfo->in_color_space) {
+ case JCS_EXT_RGB:
+ altivecfct = jsimd_extrgb_ycc_convert_altivec;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ altivecfct = jsimd_extrgbx_ycc_convert_altivec;
+ break;
+ case JCS_EXT_BGR:
+ altivecfct = jsimd_extbgr_ycc_convert_altivec;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ altivecfct = jsimd_extbgrx_ycc_convert_altivec;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ altivecfct = jsimd_extxbgr_ycc_convert_altivec;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ altivecfct = jsimd_extxrgb_ycc_convert_altivec;
+ break;
+ default:
+ altivecfct = jsimd_rgb_ycc_convert_altivec;
+ break;
+ }
+
+ altivecfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
+{
+ void (*altivecfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+ switch (cinfo->in_color_space) {
+ case JCS_EXT_RGB:
+ altivecfct = jsimd_extrgb_gray_convert_altivec;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ altivecfct = jsimd_extrgbx_gray_convert_altivec;
+ break;
+ case JCS_EXT_BGR:
+ altivecfct = jsimd_extbgr_gray_convert_altivec;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ altivecfct = jsimd_extbgrx_gray_convert_altivec;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ altivecfct = jsimd_extxbgr_gray_convert_altivec;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ altivecfct = jsimd_extxrgb_gray_convert_altivec;
+ break;
+ default:
+ altivecfct = jsimd_rgb_gray_convert_altivec;
+ break;
+ }
+
+ altivecfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
+{
+ void (*altivecfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+
+ switch (cinfo->out_color_space) {
+ case JCS_EXT_RGB:
+ altivecfct = jsimd_ycc_extrgb_convert_altivec;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ altivecfct = jsimd_ycc_extrgbx_convert_altivec;
+ break;
+ case JCS_EXT_BGR:
+ altivecfct = jsimd_ycc_extbgr_convert_altivec;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ altivecfct = jsimd_ycc_extbgrx_convert_altivec;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ altivecfct = jsimd_ycc_extxbgr_convert_altivec;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ altivecfct = jsimd_ycc_extxrgb_convert_altivec;
+ break;
+ default:
+ altivecfct = jsimd_ycc_rgb_convert_altivec;
+ break;
+ }
+
+ altivecfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
+{
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_downsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_ALTIVEC)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_downsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_ALTIVEC)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+ jsimd_h2v2_downsample_altivec(cinfo->image_width, cinfo->max_v_samp_factor,
+ compptr->v_samp_factor,
+ compptr->width_in_blocks, input_data,
+ output_data);
+}
+
+GLOBAL(void)
+jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+ jsimd_h2v1_downsample_altivec(cinfo->image_width, cinfo->max_v_samp_factor,
+ compptr->v_samp_factor,
+ compptr->width_in_blocks, input_data,
+ output_data);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_ALTIVEC)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_ALTIVEC)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h2v2_upsample_altivec(cinfo->max_v_samp_factor, cinfo->output_width,
+ input_data, output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h2v1_upsample_altivec(cinfo->max_v_samp_factor, cinfo->output_width,
+ input_data, output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_ALTIVEC)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_ALTIVEC)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h2v2_fancy_upsample_altivec(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data,
+ output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h2v1_fancy_upsample_altivec(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data,
+ output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_ALTIVEC)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_ALTIVEC)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+ void (*altivecfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+ switch (cinfo->out_color_space) {
+ case JCS_EXT_RGB:
+ altivecfct = jsimd_h2v2_extrgb_merged_upsample_altivec;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ altivecfct = jsimd_h2v2_extrgbx_merged_upsample_altivec;
+ break;
+ case JCS_EXT_BGR:
+ altivecfct = jsimd_h2v2_extbgr_merged_upsample_altivec;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ altivecfct = jsimd_h2v2_extbgrx_merged_upsample_altivec;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ altivecfct = jsimd_h2v2_extxbgr_merged_upsample_altivec;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ altivecfct = jsimd_h2v2_extxrgb_merged_upsample_altivec;
+ break;
+ default:
+ altivecfct = jsimd_h2v2_merged_upsample_altivec;
+ break;
+ }
+
+ altivecfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(void)
+jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+ void (*altivecfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+ switch (cinfo->out_color_space) {
+ case JCS_EXT_RGB:
+ altivecfct = jsimd_h2v1_extrgb_merged_upsample_altivec;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ altivecfct = jsimd_h2v1_extrgbx_merged_upsample_altivec;
+ break;
+ case JCS_EXT_BGR:
+ altivecfct = jsimd_h2v1_extbgr_merged_upsample_altivec;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ altivecfct = jsimd_h2v1_extbgrx_merged_upsample_altivec;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ altivecfct = jsimd_h2v1_extxbgr_merged_upsample_altivec;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ altivecfct = jsimd_h2v1_extxrgb_merged_upsample_altivec;
+ break;
+ default:
+ altivecfct = jsimd_h2v1_merged_upsample_altivec;
+ break;
+ }
+
+ altivecfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(int)
+jsimd_can_convsamp(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_ALTIVEC)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_convsamp_float(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
+ DCTELEM *workspace)
+{
+ jsimd_convsamp_altivec(sample_data, start_col, workspace);
+}
+
+GLOBAL(void)
+jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
+ FAST_FLOAT *workspace)
+{
+}
+
+GLOBAL(int)
+jsimd_can_fdct_islow(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_ALTIVEC)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_ifast(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_ALTIVEC)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_float(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_fdct_islow(DCTELEM *data)
+{
+ jsimd_fdct_islow_altivec(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_ifast(DCTELEM *data)
+{
+ jsimd_fdct_ifast_altivec(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_float(FAST_FLOAT *data)
+{
+}
+
+GLOBAL(int)
+jsimd_can_quantize(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_ALTIVEC)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_quantize_float(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
+{
+ jsimd_quantize_altivec(coef_block, divisors, workspace);
+}
+
+GLOBAL(void)
+jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+ FAST_FLOAT *workspace)
+{
+}
+
+GLOBAL(int)
+jsimd_can_idct_2x2(void)
+{
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_4x4(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+}
+
+GLOBAL(void)
+jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+}
+
+GLOBAL(int)
+jsimd_can_idct_islow(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_ALTIVEC)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_ifast(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_ALTIVEC)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_float(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ jsimd_idct_islow_altivec(compptr->dct_table, coef_block, output_buf,
+ output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ jsimd_idct_ifast_altivec(compptr->dct_table, coef_block, output_buf,
+ output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+}
+
+GLOBAL(int)
+jsimd_can_huff_encode_one_block(void)
+{
+ return 0;
+}
+
+GLOBAL(JOCTET *)
+jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
+ int last_dc_val, c_derived_tbl *dctbl,
+ c_derived_tbl *actbl)
+{
+ return NULL;
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_first_prepare(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, JCOEF *values, size_t *zerobits)
+{
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_refine_prepare(void)
+{
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, JCOEF *absvalues, size_t *bits)
+{
+ return 0;
+}
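
Every pair above follows the same contract: a jsimd_can_*() probe that runs init_simd() and re-checks the build-time assumptions (DCTSIZE, type widths, RGB_PIXELSIZE), and a wrapper that is only called once the probe has returned nonzero. A sketch of how the core library consumes this interface (the real selection logic lives in jcdctmgr.c and its siblings; this shows only the shape of it):

    typedef void (*fdct_method) (DCTELEM *data);

    static fdct_method select_fdct_islow(void)
    {
      if (jsimd_can_fdct_islow())   /* sizes OK and JSIMD_ALTIVEC set */
        return jsimd_fdct_islow;    /* AltiVec wrapper from this file */
      return NULL;                  /* caller installs the portable C DCT */
    }
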
diff --git a/media/libjpeg/simd/jsimd_altivec.h b/media/libjpeg/simd/powerpc/jsimd_altivec.h
index 62dbc5cdf0..e8bdb06a54 100644
--- a/media/libjpeg/simd/jsimd_altivec.h
+++ b/media/libjpeg/simd/powerpc/jsimd_altivec.h
@@ -21,28 +21,27 @@
*/
#define JPEG_INTERNALS
-#include "../jinclude.h"
-#include "../jpeglib.h"
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
#include "../jsimd.h"
-#include "../jdct.h"
-#include "../jsimddct.h"
-#include "jsimd.h"
#include <altivec.h>
/* Common code */
-#define __4X(a) a, a, a, a
-#define __4X2(a, b) a, b, a, b, a, b, a, b
-#define __8X(a) __4X(a), __4X(a)
-#define __16X(a) __8X(a), __8X(a)
+#define __4X(a) a, a, a, a
+#define __4X2(a, b) a, b, a, b, a, b, a, b
+#define __8X(a) __4X(a), __4X(a)
+#define __16X(a) __8X(a), __8X(a)
-#define TRANSPOSE(row, col) \
-{ \
- __vector short row04l, row04h, row15l, row15h, \
- row26l, row26h, row37l, row37h; \
- __vector short col01e, col01o, col23e, col23o, \
- col45e, col45o, col67e, col67o; \
+#define TRANSPOSE(row, col) { \
+ __vector short row04l, row04h, row15l, row15h, \
+ row26l, row26h, row37l, row37h; \
+ __vector short col01e, col01o, col23e, col23o, \
+ col45e, col45o, col67e, col67o; \
\
/* transpose coefficients (phase 1) */ \
row04l = vec_mergeh(row##0, row##4); /* row04l=(00 40 01 41 02 42 03 43) */ \
@@ -65,18 +64,18 @@
col67o = vec_mergel(row15h, row37h); /* col67o=(16 36 56 76 17 37 57 77) */ \
\
/* transpose coefficients (phase 3) */ \
- col##0 = vec_mergeh(col01e, col01o); /* col0=(00 10 20 30 40 50 60 70) */ \
- col##1 = vec_mergel(col01e, col01o); /* col1=(01 11 21 31 41 51 61 71) */ \
- col##2 = vec_mergeh(col23e, col23o); /* col2=(02 12 22 32 42 52 62 72) */ \
- col##3 = vec_mergel(col23e, col23o); /* col3=(03 13 23 33 43 53 63 73) */ \
- col##4 = vec_mergeh(col45e, col45o); /* col4=(04 14 24 34 44 54 64 74) */ \
- col##5 = vec_mergel(col45e, col45o); /* col5=(05 15 25 35 45 55 65 75) */ \
- col##6 = vec_mergeh(col67e, col67o); /* col6=(06 16 26 36 46 56 66 76) */ \
- col##7 = vec_mergel(col67e, col67o); /* col7=(07 17 27 37 47 57 67 77) */ \
+ col##0 = vec_mergeh(col01e, col01o); /* col0=(00 10 20 30 40 50 60 70) */ \
+ col##1 = vec_mergel(col01e, col01o); /* col1=(01 11 21 31 41 51 61 71) */ \
+ col##2 = vec_mergeh(col23e, col23o); /* col2=(02 12 22 32 42 52 62 72) */ \
+ col##3 = vec_mergel(col23e, col23o); /* col3=(03 13 23 33 43 53 63 73) */ \
+ col##4 = vec_mergeh(col45e, col45o); /* col4=(04 14 24 34 44 54 64 74) */ \
+ col##5 = vec_mergel(col45e, col45o); /* col5=(05 15 25 35 45 55 65 75) */ \
+ col##6 = vec_mergeh(col67e, col67o); /* col6=(06 16 26 36 46 56 66 76) */ \
+ col##7 = vec_mergel(col67e, col67o); /* col7=(07 17 27 37 47 57 67 77) */ \
}
#ifndef min
-#define min(a,b) ((a) < (b) ? (a) : (b))
+#define min(a, b) ((a) < (b) ? (a) : (b))
#endif
@@ -84,16 +83,16 @@
#if __BIG_ENDIAN__
-#define VEC_LD(a, b) vec_ld(a, b)
-#define VEC_ST(a, b, c) vec_st(a, b, c)
-#define VEC_UNPACKHU(a) vec_mergeh(pb_zero, a)
-#define VEC_UNPACKLU(a) vec_mergel(pb_zero, a)
+#define VEC_LD(a, b) vec_ld(a, b)
+#define VEC_ST(a, b, c) vec_st(a, b, c)
+#define VEC_UNPACKHU(a) vec_mergeh(pb_zero, a)
+#define VEC_UNPACKLU(a) vec_mergel(pb_zero, a)
#else
-#define VEC_LD(a, b) vec_vsx_ld(a, b)
-#define VEC_ST(a, b, c) vec_vsx_st(a, b, c)
-#define VEC_UNPACKHU(a) vec_mergeh(a, pb_zero)
-#define VEC_UNPACKLU(a) vec_mergel(a, pb_zero)
+#define VEC_LD(a, b) vec_vsx_ld(a, b)
+#define VEC_ST(a, b, c) vec_vsx_st(a, b, c)
+#define VEC_UNPACKHU(a) vec_mergeh(a, pb_zero)
+#define VEC_UNPACKLU(a) vec_mergel(a, pb_zero)
#endif
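
TRANSPOSE above is a merge network: vec_mergeh/vec_mergel interleave two vectors of shorts (the phase-1 comments show the element order), and three rounds of such merges turn eight row vectors into eight column vectors with no scalar loads or stores. A scalar model of one merge step:

    /* vec_mergeh/vec_mergel on two vectors of eight shorts. */
    static void merge_model(const short a[8], const short b[8],
                            short mergeh[8], short mergel[8])
    {
      int i;
      for (i = 0; i < 4; i++) {
        mergeh[2 * i] = a[i];     mergeh[2 * i + 1] = b[i];
        mergel[2 * i] = a[i + 4]; mergel[2 * i + 1] = b[i + 4];
      }
    }
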
diff --git a/media/libjpeg/simd/x86_64/jccolext-avx2.asm b/media/libjpeg/simd/x86_64/jccolext-avx2.asm
new file mode 100644
index 0000000000..ffb527db00
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jccolext-avx2.asm
@@ -0,0 +1,559 @@
+;
+; jccolext.asm - colorspace conversion (64-bit AVX2)
+;
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it can *not*
+; be assembled with Microsoft's MASM or any compatible assembler (including
+; Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_ycc_convert_avx2(JDIMENSION img_width, JSAMPARRAY input_buf,
+; JSAMPIMAGE output_buf, JDIMENSION output_row,
+; int num_rows);
+;
+
+; r10d = JDIMENSION img_width
+; r11 = JSAMPARRAY input_buf
+; r12 = JSAMPIMAGE output_buf
+; r13d = JDIMENSION output_row
+; r14d = int num_rows
+
+%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD ; ymmword wk[WK_NUM]
+%define WK_NUM 8
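+; Editor's note (sketch, not upstream text): wk(i) addresses a scratch array
+; of WK_NUM ymmwords laid out immediately below the aligned rbp, so
+; wk(0) = rbp - 8*SIZEOF_YMMWORD and wk(7) = rbp - SIZEOF_YMMWORD.  The
+; prologue reserves this space with "lea rsp, [wk(0)]".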
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_avx2)
+
+EXTN(jsimd_rgb_ycc_convert_avx2):
+ push rbp
+ mov rax, rsp ; rax = original rbp
+ sub rsp, byte 4
+ and rsp, byte (-SIZEOF_YMMWORD) ; align to 256 bits
+ mov [rsp], rax
+ mov rbp, rsp ; rbp = aligned rbp
+ lea rsp, [wk(0)]
+ collect_args 5
+ push rbx
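+    ; Editor's note (assumption, based on the register map above): the
+    ; sequence above builds a 32-byte-aligned frame so that aligned vmovdqa
+    ; accesses to wk(i) are legal.  The pre-alignment stack pointer (the
+    ; "original rbp" value held in rax) is saved at [rbp] and restored in
+    ; the epilogue via "mov rsp, rbp" / "pop rsp", and collect_args leaves
+    ; the five arguments in r10-r14 independent of the host calling
+    ; convention.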
+
+ mov ecx, r10d
+ test rcx, rcx
+ jz near .return
+
+ push rcx
+
+ mov rsi, r12
+ mov ecx, r13d
+ mov rdip, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
+ mov rbxp, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY]
+ mov rdxp, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY]
+ lea rdi, [rdi+rcx*SIZEOF_JSAMPROW]
+ lea rbx, [rbx+rcx*SIZEOF_JSAMPROW]
+ lea rdx, [rdx+rcx*SIZEOF_JSAMPROW]
+
+ pop rcx
+
+ mov rsi, r11
+ mov eax, r14d
+ test rax, rax
+ jle near .return
+.rowloop:
+ push rdx
+ push rbx
+ push rdi
+ push rsi
+ push rcx ; col
+
+ mov rsip, JSAMPROW [rsi] ; inptr
+ mov rdip, JSAMPROW [rdi] ; outptr0
+ mov rbxp, JSAMPROW [rbx] ; outptr1
+ mov rdxp, JSAMPROW [rdx] ; outptr2
+
+ cmp rcx, byte SIZEOF_YMMWORD
+ jae near .columnloop
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+.column_ld1:
+ push rax
+ push rdx
+ lea rcx, [rcx+rcx*2] ; imul ecx,RGB_PIXELSIZE
+ test cl, SIZEOF_BYTE
+ jz short .column_ld2
+ sub rcx, byte SIZEOF_BYTE
+ movzx rax, byte [rsi+rcx]
+.column_ld2:
+ test cl, SIZEOF_WORD
+ jz short .column_ld4
+ sub rcx, byte SIZEOF_WORD
+ movzx rdx, word [rsi+rcx]
+ shl rax, WORD_BIT
+ or rax, rdx
+.column_ld4:
+ vmovd xmmA, eax
+ pop rdx
+ pop rax
+ test cl, SIZEOF_DWORD
+ jz short .column_ld8
+ sub rcx, byte SIZEOF_DWORD
+ vmovd xmmF, XMM_DWORD [rsi+rcx]
+ vpslldq xmmA, xmmA, SIZEOF_DWORD
+ vpor xmmA, xmmA, xmmF
+.column_ld8:
+ test cl, SIZEOF_MMWORD
+ jz short .column_ld16
+ sub rcx, byte SIZEOF_MMWORD
+ vmovq xmmB, XMM_MMWORD [rsi+rcx]
+ vpslldq xmmA, xmmA, SIZEOF_MMWORD
+ vpor xmmA, xmmA, xmmB
+.column_ld16:
+ test cl, SIZEOF_XMMWORD
+ jz short .column_ld32
+ sub rcx, byte SIZEOF_XMMWORD
+ vmovdqu xmmB, XMM_MMWORD [rsi+rcx]
+ vperm2i128 ymmA, ymmA, ymmA, 1
+ vpor ymmA, ymmB
+.column_ld32:
+ test cl, SIZEOF_YMMWORD
+ jz short .column_ld64
+ sub rcx, byte SIZEOF_YMMWORD
+ vmovdqa ymmF, ymmA
+ vmovdqu ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+.column_ld64:
+ test cl, 2*SIZEOF_YMMWORD
+ mov rcx, SIZEOF_YMMWORD
+ jz short .rgb_ycc_cnv
+ vmovdqa ymmB, ymmA
+ vmovdqu ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+ vmovdqu ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+ jmp short .rgb_ycc_cnv
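+    ; Editor's note (assumption): on entry to .column_ld1, rcx held the
+    ; remaining pixel count; the lea above converts it to a byte count
+    ; (rcx*3).  Each .column_ldN label tests one bit of that count and
+    ; loads a 1/2/4/8/16/32-byte chunk from the tail of the row, shifting
+    ; previously gathered bytes upward, so ymmA/ymmF end up holding the
+    ; trailing pixels as if a full block had been read.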
+
+.columnloop:
+ vmovdqu ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+ vmovdqu ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+ vmovdqu ymmB, YMMWORD [rsi+2*SIZEOF_YMMWORD]
+
+.rgb_ycc_cnv:
+ ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+ ; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+ ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+ ; ymmB=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+ ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+ vmovdqu ymmC, ymmA
+ vinserti128 ymmA, ymmF, xmmA, 0 ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+ ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+ vinserti128 ymmC, ymmC, xmmB, 0 ; ymmC=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+ ; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ vinserti128 ymmB, ymmB, xmmF, 0 ; ymmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+ ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+ vperm2i128 ymmF, ymmC, ymmC, 1 ; ymmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
+ ; 1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
+
+ vmovdqa ymmG, ymmA
+ vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12
+ ; 22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I)
+ vpsrldq ymmG, ymmG, 8 ; ymmG=(22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I
+ ; 2I 0J 1J 2J 0K 1K 2K 0L -- -- -- -- -- -- -- --)
+
+ vpunpckhbw ymmA, ymmA, ymmF ; ymmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A
+ ; 0G 0O 1G 1O 2G 2O 0H 0P 1H 1P 2H 2P 0I 0Q 1I 1Q)
+ vpslldq ymmF, ymmF, 8 ; ymmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27
+ ; 08 18 28 09 19 29 0A 1A 1L 2L 0M 1M 2M 0N 1N 2N)
+
+ vpunpcklbw ymmG, ymmG, ymmB ; ymmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D
+ ; 2I 2Q 0J 0R 1J 1R 2J 2R 0K 0S 1K 1S 2K 2S 0L 0T)
+ vpunpckhbw ymmF, ymmF, ymmB ; ymmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F
+ ; 1L 1T 2L 2T 0M 0U 1M 1U 2M 2U 0N 0V 1N 1V 2N 2V)
+
+ vmovdqa ymmD, ymmA
+ vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09
+ ; 11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P)
+ vpsrldq ymmD, ymmD, 8 ; ymmD=(11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P
+ ; 1H 1P 2H 2P 0I 0Q 1I 1Q -- -- -- -- -- -- -- --)
+
+ vpunpckhbw ymmA, ymmA, ymmG ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D
+ ; 0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 0H 0L 0P 0T)
+ vpslldq ymmG, ymmG, 8 ; ymmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B
+ ; 04 0C 14 1C 24 2C 05 0D 2I 2Q 0J 0R 1J 1R 2J 2R)
+
+ vpunpcklbw ymmD, ymmD, ymmF ; ymmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E
+ ; 1H 1L 1P 1T 2H 2L 2P 2T 0I 0M 0Q 0U 1I 1M 1Q 1U)
+ vpunpckhbw ymmG, ymmG, ymmF ; ymmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F
+ ; 2I 2M 2Q 2U 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V)
+
+ vmovdqa ymmE, ymmA
+ vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C
+ ; 20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S)
+ vpsrldq ymmE, ymmE, 8 ; ymmE=(20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S
+ ; 2G 2K 2O 2S 0H 0L 0P 0T -- -- -- -- -- -- -- --)
+
+ vpunpckhbw ymmA, ymmA, ymmD ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
+ ; 0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
+ vpslldq ymmD, ymmD, 8 ; ymmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D
+ ; 02 06 0A 0E 12 16 1A 1E 1H 1L 1P 1T 2H 2L 2P 2T)
+
+ vpunpcklbw ymmE, ymmE, ymmG ; ymmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F
+ ; 2G 2I 2K 2M 2O 2Q 2S 2U 0H 0J 0L 0N 0P 0R 0T 0V)
+ vpunpckhbw ymmD, ymmD, ymmG ; ymmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F
+ ; 1H 1J 1L 1N 1P 1R 1T 1V 2H 2J 2L 2N 2P 2R 2T 2V)
+
+ vpxor ymmH, ymmH, ymmH
+
+ vmovdqa ymmC, ymmA
+ vpunpcklbw ymmA, ymmA, ymmH ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
+ vpunpckhbw ymmC, ymmC, ymmH ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
+
+ vmovdqa ymmB, ymmE
+ vpunpcklbw ymmE, ymmE, ymmH ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
+ vpunpckhbw ymmB, ymmB, ymmH ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
+
+ vmovdqa ymmF, ymmD
+ vpunpcklbw ymmD, ymmD, ymmH ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
+ vpunpckhbw ymmF, ymmF, ymmH ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
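+    ; Editor's note (assumption): the unpack/shift ladder above is a byte
+    ; "perfect shuffle" that deinterleaves 3-byte RGB pixels into six planar
+    ; word vectors, holding the even/odd samples of each component and
+    ; matching the RE/RO/GE/GO/BE/BO summary comment below.  The ymmA..ymmH
+    ; names are register aliases (assumed to be set up by jcolsamp.inc per
+    ; RGB component order) for ymm0..ymm7.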
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+ test cl, SIZEOF_XMMWORD/16
+ jz short .column_ld2
+ sub rcx, byte SIZEOF_XMMWORD/16
+ vmovd xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
+.column_ld2:
+ test cl, SIZEOF_XMMWORD/8
+ jz short .column_ld4
+ sub rcx, byte SIZEOF_XMMWORD/8
+ vmovq xmmF, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
+ vpslldq xmmA, xmmA, SIZEOF_MMWORD
+ vpor xmmA, xmmA, xmmF
+.column_ld4:
+ test cl, SIZEOF_XMMWORD/4
+ jz short .column_ld8
+ sub rcx, byte SIZEOF_XMMWORD/4
+ vmovdqa xmmF, xmmA
+ vperm2i128 ymmF, ymmF, ymmF, 1
+ vmovdqu xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
+ vpor ymmA, ymmA, ymmF
+.column_ld8:
+ test cl, SIZEOF_XMMWORD/2
+ jz short .column_ld16
+ sub rcx, byte SIZEOF_XMMWORD/2
+ vmovdqa ymmF, ymmA
+ vmovdqu ymmA, YMMWORD [rsi+rcx*RGB_PIXELSIZE]
+.column_ld16:
+ test cl, SIZEOF_XMMWORD
+ mov rcx, SIZEOF_YMMWORD
+ jz short .rgb_ycc_cnv
+ vmovdqa ymmE, ymmA
+ vmovdqa ymmH, ymmF
+ vmovdqu ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+ vmovdqu ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+ jmp short .rgb_ycc_cnv
+
+.columnloop:
+ vmovdqu ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+ vmovdqu ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+ vmovdqu ymmE, YMMWORD [rsi+2*SIZEOF_YMMWORD]
+ vmovdqu ymmH, YMMWORD [rsi+3*SIZEOF_YMMWORD]
+
+.rgb_ycc_cnv:
+ ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ ; 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+ ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+ ; 0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+ ; ymmE=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
+ ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+ ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
+ ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+ vmovdqa ymmB, ymmA
+ vinserti128 ymmA, ymmA, xmmE, 1 ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ ; 0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
+ vperm2i128 ymmE, ymmB, ymmE, 0x31 ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+ ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+
+ vmovdqa ymmB, ymmF
+ vinserti128 ymmF, ymmF, xmmH, 1 ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+ ; 0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
+ vperm2i128 ymmH, ymmB, ymmH, 0x31 ; ymmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
+ ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+ vmovdqa ymmD, ymmA
+ vpunpcklbw ymmA, ymmA, ymmE ; ymmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35
+ ; 0G 0K 1G 1K 2G 2K 3G 3K 0H 0L 1H 1L 2H 2L 3H 3L)
+ vpunpckhbw ymmD, ymmD, ymmE ; ymmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37
+ ; 0I 0M 1I 1M 2I 2M 3I 3M 0J 0N 1J 1N 2J 2N 3J 3N)
+
+ vmovdqa ymmC, ymmF
+ vpunpcklbw ymmF, ymmF, ymmH ; ymmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D
+ ; 0O 0S 1O 1S 2O 2S 3O 3S 0P 0T 1P 1T 2P 2T 3P 3T)
+ vpunpckhbw ymmC, ymmC, ymmH ; ymmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F
+ ; 0Q 0U 1Q 1U 2Q 2U 3Q 3U 0R 0V 1R 1V 2R 2V 3R 3V)
+
+ vmovdqa ymmB, ymmA
+ vpunpcklwd ymmA, ymmA, ymmF ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C
+ ; 0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 3G 3K 3O 3S)
+ vpunpckhwd ymmB, ymmB, ymmF ; ymmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D
+ ; 0H 0L 0P 0T 1H 1L 1P 1T 2H 2L 2P 2T 3H 3L 3P 3T)
+
+ vmovdqa ymmG, ymmD
+ vpunpcklwd ymmD, ymmD, ymmC ; ymmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E
+ ; 0I 0M 0Q 0U 1I 1M 1Q 1U 2I 2M 2Q 2U 3I 3M 3Q 3U)
+ vpunpckhwd ymmG, ymmG, ymmC ; ymmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F
+ ; 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V 3J 3N 3R 3V)
+
+ vmovdqa ymmE, ymmA
+ vpunpcklbw ymmA, ymmA, ymmD ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
+ ; 0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
+ vpunpckhbw ymmE, ymmE, ymmD ; ymmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E
+ ; 2G 2I 2K 2M 2O 2Q 2S 2U 3G 3I 3K 3M 3O 3Q 3S 3U)
+
+ vmovdqa ymmH, ymmB
+ vpunpcklbw ymmB, ymmB, ymmG ; ymmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F
+ ; 0H 0J 0L 0N 0P 0R 0T 0V 1H 1J 1L 1N 1P 1R 1T 1V)
+ vpunpckhbw ymmH, ymmH, ymmG ; ymmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F
+ ; 2H 2J 2L 2N 2P 2R 2T 2V 3H 3J 3L 3N 3P 3R 3T 3V)
+
+ vpxor ymmF, ymmF, ymmF
+
+ vmovdqa ymmC, ymmA
+ vpunpcklbw ymmA, ymmA, ymmF ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
+ vpunpckhbw ymmC, ymmC, ymmF ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
+
+ vmovdqa ymmD, ymmB
+ vpunpcklbw ymmB, ymmB, ymmF ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
+ vpunpckhbw ymmD, ymmD, ymmF ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
+
+ vmovdqa ymmG, ymmE
+ vpunpcklbw ymmE, ymmE, ymmF ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
+ vpunpckhbw ymmG, ymmG, ymmF ; ymmG=(30 32 34 36 38 3A 3C 3E 3G 3I 3K 3M 3O 3Q 3S 3U)
+
+ vpunpcklbw ymmF, ymmF, ymmH
+ vpunpckhbw ymmH, ymmH, ymmH
+ vpsrlw ymmF, ymmF, BYTE_BIT ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
+ vpsrlw ymmH, ymmH, BYTE_BIT ; ymmH=(31 33 35 37 39 3B 3D 3F 3H 3J 3L 3N 3P 3R 3T 3V)
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+ ; ymm0=R(02468ACEGIKMOQSU)=RE, ymm2=G(02468ACEGIKMOQSU)=GE, ymm4=B(02468ACEGIKMOQSU)=BE
+ ; ymm1=R(13579BDFHJLNPRTV)=RO, ymm3=G(13579BDFHJLNPRTV)=GO, ymm5=B(13579BDFHJLNPRTV)=BO
+
+ ; (Original)
+ ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+ ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+ ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+ ;
+ ; (This implementation)
+ ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+ ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+ ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
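+    ; Editor's note (assumption): the green term is split as
+    ; 0.587 = 0.337 + 0.250 because FIX(0.587) = 38470 does not fit in the
+    ; signed 16-bit words that vpmaddwd multiplies; pairing 0.299/0.337 and
+    ; 0.114/0.250 keeps every constant in range while summing to the same
+    ; coefficients.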
+
+ vmovdqa YMMWORD [wk(0)], ymm0 ; wk(0)=RE
+ vmovdqa YMMWORD [wk(1)], ymm1 ; wk(1)=RO
+ vmovdqa YMMWORD [wk(2)], ymm4 ; wk(2)=BE
+ vmovdqa YMMWORD [wk(3)], ymm5 ; wk(3)=BO
+
+ vmovdqa ymm6, ymm1
+ vpunpcklwd ymm1, ymm1, ymm3
+ vpunpckhwd ymm6, ymm6, ymm3
+ vmovdqa ymm7, ymm1
+ vmovdqa ymm4, ymm6
+ vpmaddwd ymm1, ymm1, [rel PW_F0299_F0337] ; ymm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+ vpmaddwd ymm6, ymm6, [rel PW_F0299_F0337] ; ymm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+ vpmaddwd ymm7, ymm7, [rel PW_MF016_MF033] ; ymm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
+ vpmaddwd ymm4, ymm4, [rel PW_MF016_MF033] ; ymm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
+
+ vmovdqa YMMWORD [wk(4)], ymm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
+ vmovdqa YMMWORD [wk(5)], ymm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+ vpxor ymm1, ymm1, ymm1
+ vpxor ymm6, ymm6, ymm6
+ vpunpcklwd ymm1, ymm1, ymm5 ; ymm1=BOL
+ vpunpckhwd ymm6, ymm6, ymm5 ; ymm6=BOH
+ vpsrld ymm1, ymm1, 1 ; ymm1=BOL*FIX(0.500)
+ vpsrld ymm6, ymm6, 1 ; ymm6=BOH*FIX(0.500)
+
+ vmovdqa ymm5, [rel PD_ONEHALFM1_CJ] ; ymm5=[PD_ONEHALFM1_CJ]
+
+ vpaddd ymm7, ymm7, ymm1
+ vpaddd ymm4, ymm4, ymm6
+ vpaddd ymm7, ymm7, ymm5
+ vpaddd ymm4, ymm4, ymm5
+ vpsrld ymm7, ymm7, SCALEBITS ; ymm7=CbOL
+ vpsrld ymm4, ymm4, SCALEBITS ; ymm4=CbOH
+ vpackssdw ymm7, ymm7, ymm4 ; ymm7=CbO
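+    ; Editor's note (sketch): each CbO dword is computed in 16.16 fixed
+    ; point as
+    ;   (-FIX(0.168)*RO - FIX(0.331)*GO + (BO << 15)
+    ;    + (1 << 15) - 1 + (CENTERJSAMPLE << 16)) >> SCALEBITS
+    ; where BO << 15 equals FIX(0.5)*BO, obtained by interleaving zeros
+    ; below B (giving B << 16) and shifting right by one.  PD_ONEHALFM1_CJ
+    ; folds the rounding bias and the +CENTERJSAMPLE offset into a single
+    ; constant.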
+
+ vmovdqa ymm1, YMMWORD [wk(2)] ; ymm1=BE
+
+ vmovdqa ymm6, ymm0
+ vpunpcklwd ymm0, ymm0, ymm2
+ vpunpckhwd ymm6, ymm6, ymm2
+ vmovdqa ymm5, ymm0
+ vmovdqa ymm4, ymm6
+ vpmaddwd ymm0, ymm0, [rel PW_F0299_F0337] ; ymm0=REL*FIX(0.299)+GEL*FIX(0.337)
+ vpmaddwd ymm6, ymm6, [rel PW_F0299_F0337] ; ymm6=REH*FIX(0.299)+GEH*FIX(0.337)
+ vpmaddwd ymm5, ymm5, [rel PW_MF016_MF033] ; ymm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
+ vpmaddwd ymm4, ymm4, [rel PW_MF016_MF033] ; ymm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
+
+ vmovdqa YMMWORD [wk(6)], ymm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
+ vmovdqa YMMWORD [wk(7)], ymm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+ vpxor ymm0, ymm0, ymm0
+ vpxor ymm6, ymm6, ymm6
+ vpunpcklwd ymm0, ymm0, ymm1 ; ymm0=BEL
+ vpunpckhwd ymm6, ymm6, ymm1 ; ymm6=BEH
+ vpsrld ymm0, ymm0, 1 ; ymm0=BEL*FIX(0.500)
+ vpsrld ymm6, ymm6, 1 ; ymm6=BEH*FIX(0.500)
+
+ vmovdqa ymm1, [rel PD_ONEHALFM1_CJ] ; ymm1=[PD_ONEHALFM1_CJ]
+
+ vpaddd ymm5, ymm5, ymm0
+ vpaddd ymm4, ymm4, ymm6
+ vpaddd ymm5, ymm5, ymm1
+ vpaddd ymm4, ymm4, ymm1
+ vpsrld ymm5, ymm5, SCALEBITS ; ymm5=CbEL
+ vpsrld ymm4, ymm4, SCALEBITS ; ymm4=CbEH
+ vpackssdw ymm5, ymm5, ymm4 ; ymm5=CbE
+
+ vpsllw ymm7, ymm7, BYTE_BIT
+ vpor ymm5, ymm5, ymm7 ; ymm5=Cb
+ vmovdqu YMMWORD [rbx], ymm5 ; Save Cb
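+    ; Editor's note (assumption): CbE/CbO hold the even/odd samples as
+    ; words with the sample value in the low byte, so shifting CbO left by
+    ; BYTE_BIT and ORing interleaves them into natural pixel order; a
+    ; single 32-byte store then writes 32 consecutive Cb samples.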
+
+ vmovdqa ymm0, YMMWORD [wk(3)] ; ymm0=BO
+ vmovdqa ymm6, YMMWORD [wk(2)] ; ymm6=BE
+ vmovdqa ymm1, YMMWORD [wk(1)] ; ymm1=RO
+
+ vmovdqa ymm4, ymm0
+ vpunpcklwd ymm0, ymm0, ymm3
+ vpunpckhwd ymm4, ymm4, ymm3
+ vmovdqa ymm7, ymm0
+ vmovdqa ymm5, ymm4
+ vpmaddwd ymm0, ymm0, [rel PW_F0114_F0250] ; ymm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+ vpmaddwd ymm4, ymm4, [rel PW_F0114_F0250] ; ymm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+ vpmaddwd ymm7, ymm7, [rel PW_MF008_MF041] ; ymm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
+ vpmaddwd ymm5, ymm5, [rel PW_MF008_MF041] ; ymm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
+
+ vmovdqa ymm3, [rel PD_ONEHALF] ; ymm3=[PD_ONEHALF]
+
+ vpaddd ymm0, ymm0, YMMWORD [wk(4)]
+ vpaddd ymm4, ymm4, YMMWORD [wk(5)]
+ vpaddd ymm0, ymm0, ymm3
+ vpaddd ymm4, ymm4, ymm3
+ vpsrld ymm0, ymm0, SCALEBITS ; ymm0=YOL
+ vpsrld ymm4, ymm4, SCALEBITS ; ymm4=YOH
+ vpackssdw ymm0, ymm0, ymm4 ; ymm0=YO
+
+ vpxor ymm3, ymm3, ymm3
+ vpxor ymm4, ymm4, ymm4
+ vpunpcklwd ymm3, ymm3, ymm1 ; ymm3=ROL
+ vpunpckhwd ymm4, ymm4, ymm1 ; ymm4=ROH
+ vpsrld ymm3, ymm3, 1 ; ymm3=ROL*FIX(0.500)
+ vpsrld ymm4, ymm4, 1 ; ymm4=ROH*FIX(0.500)
+
+ vmovdqa ymm1, [rel PD_ONEHALFM1_CJ] ; ymm1=[PD_ONEHALFM1_CJ]
+
+ vpaddd ymm7, ymm7, ymm3
+ vpaddd ymm5, ymm5, ymm4
+ vpaddd ymm7, ymm7, ymm1
+ vpaddd ymm5, ymm5, ymm1
+ vpsrld ymm7, ymm7, SCALEBITS ; ymm7=CrOL
+ vpsrld ymm5, ymm5, SCALEBITS ; ymm5=CrOH
+ vpackssdw ymm7, ymm7, ymm5 ; ymm7=CrO
+
+ vmovdqa ymm3, YMMWORD [wk(0)] ; ymm3=RE
+
+ vmovdqa ymm4, ymm6
+ vpunpcklwd ymm6, ymm6, ymm2
+ vpunpckhwd ymm4, ymm4, ymm2
+ vmovdqa ymm1, ymm6
+ vmovdqa ymm5, ymm4
+ vpmaddwd ymm6, ymm6, [rel PW_F0114_F0250] ; ymm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+ vpmaddwd ymm4, ymm4, [rel PW_F0114_F0250] ; ymm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+ vpmaddwd ymm1, ymm1, [rel PW_MF008_MF041] ; ymm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
+ vpmaddwd ymm5, ymm5, [rel PW_MF008_MF041] ; ymm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
+
+ vmovdqa ymm2, [rel PD_ONEHALF] ; ymm2=[PD_ONEHALF]
+
+ vpaddd ymm6, ymm6, YMMWORD [wk(6)]
+ vpaddd ymm4, ymm4, YMMWORD [wk(7)]
+ vpaddd ymm6, ymm6, ymm2
+ vpaddd ymm4, ymm4, ymm2
+ vpsrld ymm6, ymm6, SCALEBITS ; ymm6=YEL
+ vpsrld ymm4, ymm4, SCALEBITS ; ymm4=YEH
+ vpackssdw ymm6, ymm6, ymm4 ; ymm6=YE
+
+ vpsllw ymm0, ymm0, BYTE_BIT
+ vpor ymm6, ymm6, ymm0 ; ymm6=Y
+ vmovdqu YMMWORD [rdi], ymm6 ; Save Y
+
+ vpxor ymm2, ymm2, ymm2
+ vpxor ymm4, ymm4, ymm4
+ vpunpcklwd ymm2, ymm2, ymm3 ; ymm2=REL
+ vpunpckhwd ymm4, ymm4, ymm3 ; ymm4=REH
+ vpsrld ymm2, ymm2, 1 ; ymm2=REL*FIX(0.500)
+ vpsrld ymm4, ymm4, 1 ; ymm4=REH*FIX(0.500)
+
+ vmovdqa ymm0, [rel PD_ONEHALFM1_CJ] ; ymm0=[PD_ONEHALFM1_CJ]
+
+ vpaddd ymm1, ymm1, ymm2
+ vpaddd ymm5, ymm5, ymm4
+ vpaddd ymm1, ymm1, ymm0
+ vpaddd ymm5, ymm5, ymm0
+ vpsrld ymm1, ymm1, SCALEBITS ; ymm1=CrEL
+ vpsrld ymm5, ymm5, SCALEBITS ; ymm5=CrEH
+ vpackssdw ymm1, ymm1, ymm5 ; ymm1=CrE
+
+ vpsllw ymm7, ymm7, BYTE_BIT
+ vpor ymm1, ymm1, ymm7 ; ymm1=Cr
+ vmovdqu YMMWORD [rdx], ymm1 ; Save Cr
+
+ sub rcx, byte SIZEOF_YMMWORD
+ add rsi, RGB_PIXELSIZE*SIZEOF_YMMWORD ; inptr
+ add rdi, byte SIZEOF_YMMWORD ; outptr0
+ add rbx, byte SIZEOF_YMMWORD ; outptr1
+ add rdx, byte SIZEOF_YMMWORD ; outptr2
+ cmp rcx, byte SIZEOF_YMMWORD
+ jae near .columnloop
+ test rcx, rcx
+ jnz near .column_ld1
+
+ pop rcx ; col
+ pop rsi
+ pop rdi
+ pop rbx
+ pop rdx
+
+ add rsi, byte SIZEOF_JSAMPROW ; input_buf
+ add rdi, byte SIZEOF_JSAMPROW
+ add rbx, byte SIZEOF_JSAMPROW
+ add rdx, byte SIZEOF_JSAMPROW
+ dec rax ; num_rows
+ jg near .rowloop
+
+.return:
+ pop rbx
+ vzeroupper
+ uncollect_args 5
+ mov rsp, rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- original rbp
+ pop rbp
+ ret
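+; Editor's note: vzeroupper in the epilogue clears the upper YMM lanes to
+; avoid the AVX-to-legacy-SSE transition penalty in callers; the SSE2
+; variant of this routine needs no such instruction.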
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jccolext-sse2.asm b/media/libjpeg/simd/x86_64/jccolext-sse2.asm
new file mode 100644
index 0000000000..af70ed6010
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jccolext-sse2.asm
@@ -0,0 +1,484 @@
+;
+; jccolext.asm - colorspace conversion (64-bit SSE2)
+;
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it can *not*
+; be assembled with Microsoft's MASM or any compatible assembler (including
+; Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_ycc_convert_sse2(JDIMENSION img_width, JSAMPARRAY input_buf,
+; JSAMPIMAGE output_buf, JDIMENSION output_row,
+; int num_rows);
+;
+
+; r10d = JDIMENSION img_width
+; r11 = JSAMPARRAY input_buf
+; r12 = JSAMPIMAGE output_buf
+; r13d = JDIMENSION output_row
+; r14d = int num_rows
+
+%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 8
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_sse2)
+
+EXTN(jsimd_rgb_ycc_convert_sse2):
+ push rbp
+ mov rax, rsp ; rax = original rbp
+ sub rsp, byte 4
+ and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [rsp], rax
+ mov rbp, rsp ; rbp = aligned rbp
+ lea rsp, [wk(0)]
+ collect_args 5
+ push rbx
+
+ mov ecx, r10d
+ test rcx, rcx
+ jz near .return
+
+ push rcx
+
+ mov rsi, r12
+ mov ecx, r13d
+ mov rdip, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
+ mov rbxp, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY]
+ mov rdxp, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY]
+ lea rdi, [rdi+rcx*SIZEOF_JSAMPROW]
+ lea rbx, [rbx+rcx*SIZEOF_JSAMPROW]
+ lea rdx, [rdx+rcx*SIZEOF_JSAMPROW]
+
+ pop rcx
+
+ mov rsi, r11
+ mov eax, r14d
+ test rax, rax
+ jle near .return
+.rowloop:
+ push rdx
+ push rbx
+ push rdi
+ push rsi
+ push rcx ; col
+
+ mov rsip, JSAMPROW [rsi] ; inptr
+ mov rdip, JSAMPROW [rdi] ; outptr0
+ mov rbxp, JSAMPROW [rbx] ; outptr1
+ mov rdxp, JSAMPROW [rdx] ; outptr2
+
+ cmp rcx, byte SIZEOF_XMMWORD
+ jae near .columnloop
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+.column_ld1:
+ push rax
+ push rdx
+ lea rcx, [rcx+rcx*2] ; imul ecx,RGB_PIXELSIZE
+ test cl, SIZEOF_BYTE
+ jz short .column_ld2
+ sub rcx, byte SIZEOF_BYTE
+ movzx rax, byte [rsi+rcx]
+.column_ld2:
+ test cl, SIZEOF_WORD
+ jz short .column_ld4
+ sub rcx, byte SIZEOF_WORD
+ movzx rdx, word [rsi+rcx]
+ shl rax, WORD_BIT
+ or rax, rdx
+.column_ld4:
+ movd xmmA, eax
+ pop rdx
+ pop rax
+ test cl, SIZEOF_DWORD
+ jz short .column_ld8
+ sub rcx, byte SIZEOF_DWORD
+ movd xmmF, XMM_DWORD [rsi+rcx]
+ pslldq xmmA, SIZEOF_DWORD
+ por xmmA, xmmF
+.column_ld8:
+ test cl, SIZEOF_MMWORD
+ jz short .column_ld16
+ sub rcx, byte SIZEOF_MMWORD
+ movq xmmB, XMM_MMWORD [rsi+rcx]
+ pslldq xmmA, SIZEOF_MMWORD
+ por xmmA, xmmB
+.column_ld16:
+ test cl, SIZEOF_XMMWORD
+ jz short .column_ld32
+ movdqa xmmF, xmmA
+ movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+ mov rcx, SIZEOF_XMMWORD
+ jmp short .rgb_ycc_cnv
+.column_ld32:
+ test cl, 2*SIZEOF_XMMWORD
+ mov rcx, SIZEOF_XMMWORD
+ jz short .rgb_ycc_cnv
+ movdqa xmmB, xmmA
+ movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+ movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+ jmp short .rgb_ycc_cnv
+
+.columnloop:
+ movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+ movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+ movdqu xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]
+
+.rgb_ycc_cnv:
+ ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+ ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+ movdqa xmmG, xmmA
+ pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
+ psrldq xmmG, 8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
+
+ punpckhbw xmmA, xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
+ pslldq xmmF, 8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
+
+ punpcklbw xmmG, xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
+ punpckhbw xmmF, xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
+
+ movdqa xmmD, xmmA
+ pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
+ psrldq xmmD, 8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
+
+ punpckhbw xmmA, xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
+ pslldq xmmG, 8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
+
+ punpcklbw xmmD, xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
+ punpckhbw xmmG, xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
+
+ movdqa xmmE, xmmA
+ pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
+ psrldq xmmE, 8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
+
+ punpckhbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+ pslldq xmmD, 8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
+
+ punpcklbw xmmE, xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
+ punpckhbw xmmD, xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
+
+ pxor xmmH, xmmH
+
+ movdqa xmmC, xmmA
+ punpcklbw xmmA, xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E)
+ punpckhbw xmmC, xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+ movdqa xmmB, xmmE
+ punpcklbw xmmE, xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E)
+ punpckhbw xmmB, xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F)
+
+ movdqa xmmF, xmmD
+ punpcklbw xmmD, xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F)
+ punpckhbw xmmF, xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F)
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+ test cl, SIZEOF_XMMWORD/16
+ jz short .column_ld2
+ sub rcx, byte SIZEOF_XMMWORD/16
+ movd xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
+.column_ld2:
+ test cl, SIZEOF_XMMWORD/8
+ jz short .column_ld4
+ sub rcx, byte SIZEOF_XMMWORD/8
+ movq xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
+ pslldq xmmA, SIZEOF_MMWORD
+ por xmmA, xmmE
+.column_ld4:
+ test cl, SIZEOF_XMMWORD/4
+ jz short .column_ld8
+ sub rcx, byte SIZEOF_XMMWORD/4
+ movdqa xmmE, xmmA
+ movdqu xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
+.column_ld8:
+ test cl, SIZEOF_XMMWORD/2
+ mov rcx, SIZEOF_XMMWORD
+ jz short .rgb_ycc_cnv
+ movdqa xmmF, xmmA
+ movdqa xmmH, xmmE
+ movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+ movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+ jmp short .rgb_ycc_cnv
+
+.columnloop:
+ movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+ movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+ movdqu xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
+ movdqu xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]
+
+.rgb_ycc_cnv:
+ ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+ ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+ ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+ ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+ movdqa xmmD, xmmA
+ punpcklbw xmmA, xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
+ punpckhbw xmmD, xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
+
+ movdqa xmmC, xmmF
+ punpcklbw xmmF, xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
+ punpckhbw xmmC, xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
+
+ movdqa xmmB, xmmA
+ punpcklwd xmmA, xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
+ punpckhwd xmmB, xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
+
+ movdqa xmmG, xmmD
+ punpcklwd xmmD, xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
+ punpckhwd xmmG, xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
+
+ movdqa xmmE, xmmA
+ punpcklbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+ punpckhbw xmmE, xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
+
+ movdqa xmmH, xmmB
+ punpcklbw xmmB, xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
+ punpckhbw xmmH, xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
+
+ pxor xmmF, xmmF
+
+ movdqa xmmC, xmmA
+ punpcklbw xmmA, xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E)
+ punpckhbw xmmC, xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+ movdqa xmmD, xmmB
+ punpcklbw xmmB, xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F)
+ punpckhbw xmmD, xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F)
+
+ movdqa xmmG, xmmE
+ punpcklbw xmmE, xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E)
+ punpckhbw xmmG, xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E)
+
+ punpcklbw xmmF, xmmH
+ punpckhbw xmmH, xmmH
+ psrlw xmmF, BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
+ psrlw xmmH, BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+ ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
+ ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
+
+ ; (Original)
+ ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+ ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+ ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+ ;
+ ; (This implementation)
+ ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+ ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+ ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+
+ movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=RE
+ movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=RO
+ movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=BE
+ movdqa XMMWORD [wk(3)], xmm5 ; wk(3)=BO
+
+ movdqa xmm6, xmm1
+ punpcklwd xmm1, xmm3
+ punpckhwd xmm6, xmm3
+ movdqa xmm7, xmm1
+ movdqa xmm4, xmm6
+ pmaddwd xmm1, [rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+ pmaddwd xmm6, [rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+ pmaddwd xmm7, [rel PW_MF016_MF033] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
+ pmaddwd xmm4, [rel PW_MF016_MF033] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
+
+ movdqa XMMWORD [wk(4)], xmm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
+ movdqa XMMWORD [wk(5)], xmm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+ pxor xmm1, xmm1
+ pxor xmm6, xmm6
+ punpcklwd xmm1, xmm5 ; xmm1=BOL
+ punpckhwd xmm6, xmm5 ; xmm6=BOH
+ psrld xmm1, 1 ; xmm1=BOL*FIX(0.500)
+ psrld xmm6, 1 ; xmm6=BOH*FIX(0.500)
+
+ movdqa xmm5, [rel PD_ONEHALFM1_CJ] ; xmm5=[PD_ONEHALFM1_CJ]
+
+ paddd xmm7, xmm1
+ paddd xmm4, xmm6
+ paddd xmm7, xmm5
+ paddd xmm4, xmm5
+ psrld xmm7, SCALEBITS ; xmm7=CbOL
+ psrld xmm4, SCALEBITS ; xmm4=CbOH
+ packssdw xmm7, xmm4 ; xmm7=CbO
+
+ movdqa xmm1, XMMWORD [wk(2)] ; xmm1=BE
+
+ movdqa xmm6, xmm0
+ punpcklwd xmm0, xmm2
+ punpckhwd xmm6, xmm2
+ movdqa xmm5, xmm0
+ movdqa xmm4, xmm6
+ pmaddwd xmm0, [rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
+ pmaddwd xmm6, [rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
+ pmaddwd xmm5, [rel PW_MF016_MF033] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
+ pmaddwd xmm4, [rel PW_MF016_MF033] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
+
+ movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
+ movdqa XMMWORD [wk(7)], xmm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+ pxor xmm0, xmm0
+ pxor xmm6, xmm6
+ punpcklwd xmm0, xmm1 ; xmm0=BEL
+ punpckhwd xmm6, xmm1 ; xmm6=BEH
+ psrld xmm0, 1 ; xmm0=BEL*FIX(0.500)
+ psrld xmm6, 1 ; xmm6=BEH*FIX(0.500)
+
+ movdqa xmm1, [rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ]
+
+ paddd xmm5, xmm0
+ paddd xmm4, xmm6
+ paddd xmm5, xmm1
+ paddd xmm4, xmm1
+ psrld xmm5, SCALEBITS ; xmm5=CbEL
+ psrld xmm4, SCALEBITS ; xmm4=CbEH
+ packssdw xmm5, xmm4 ; xmm5=CbE
+
+ psllw xmm7, BYTE_BIT
+ por xmm5, xmm7 ; xmm5=Cb
+ movdqa XMMWORD [rbx], xmm5 ; Save Cb
+
+ movdqa xmm0, XMMWORD [wk(3)] ; xmm0=BO
+ movdqa xmm6, XMMWORD [wk(2)] ; xmm6=BE
+ movdqa xmm1, XMMWORD [wk(1)] ; xmm1=RO
+
+ movdqa xmm4, xmm0
+ punpcklwd xmm0, xmm3
+ punpckhwd xmm4, xmm3
+ movdqa xmm7, xmm0
+ movdqa xmm5, xmm4
+ pmaddwd xmm0, [rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+ pmaddwd xmm4, [rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+ pmaddwd xmm7, [rel PW_MF008_MF041] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
+ pmaddwd xmm5, [rel PW_MF008_MF041] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
+
+ movdqa xmm3, [rel PD_ONEHALF] ; xmm3=[PD_ONEHALF]
+
+ paddd xmm0, XMMWORD [wk(4)]
+ paddd xmm4, XMMWORD [wk(5)]
+ paddd xmm0, xmm3
+ paddd xmm4, xmm3
+ psrld xmm0, SCALEBITS ; xmm0=YOL
+ psrld xmm4, SCALEBITS ; xmm4=YOH
+ packssdw xmm0, xmm4 ; xmm0=YO
+
+ pxor xmm3, xmm3
+ pxor xmm4, xmm4
+ punpcklwd xmm3, xmm1 ; xmm3=ROL
+ punpckhwd xmm4, xmm1 ; xmm4=ROH
+ psrld xmm3, 1 ; xmm3=ROL*FIX(0.500)
+ psrld xmm4, 1 ; xmm4=ROH*FIX(0.500)
+
+ movdqa xmm1, [rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ]
+
+ paddd xmm7, xmm3
+ paddd xmm5, xmm4
+ paddd xmm7, xmm1
+ paddd xmm5, xmm1
+ psrld xmm7, SCALEBITS ; xmm7=CrOL
+ psrld xmm5, SCALEBITS ; xmm5=CrOH
+ packssdw xmm7, xmm5 ; xmm7=CrO
+
+ movdqa xmm3, XMMWORD [wk(0)] ; xmm3=RE
+
+ movdqa xmm4, xmm6
+ punpcklwd xmm6, xmm2
+ punpckhwd xmm4, xmm2
+ movdqa xmm1, xmm6
+ movdqa xmm5, xmm4
+ pmaddwd xmm6, [rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+ pmaddwd xmm4, [rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+ pmaddwd xmm1, [rel PW_MF008_MF041] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
+ pmaddwd xmm5, [rel PW_MF008_MF041] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
+
+ movdqa xmm2, [rel PD_ONEHALF] ; xmm2=[PD_ONEHALF]
+
+ paddd xmm6, XMMWORD [wk(6)]
+ paddd xmm4, XMMWORD [wk(7)]
+ paddd xmm6, xmm2
+ paddd xmm4, xmm2
+ psrld xmm6, SCALEBITS ; xmm6=YEL
+ psrld xmm4, SCALEBITS ; xmm4=YEH
+ packssdw xmm6, xmm4 ; xmm6=YE
+
+ psllw xmm0, BYTE_BIT
+ por xmm6, xmm0 ; xmm6=Y
+ movdqa XMMWORD [rdi], xmm6 ; Save Y
+
+ pxor xmm2, xmm2
+ pxor xmm4, xmm4
+ punpcklwd xmm2, xmm3 ; xmm2=REL
+ punpckhwd xmm4, xmm3 ; xmm4=REH
+ psrld xmm2, 1 ; xmm2=REL*FIX(0.500)
+ psrld xmm4, 1 ; xmm4=REH*FIX(0.500)
+
+ movdqa xmm0, [rel PD_ONEHALFM1_CJ] ; xmm0=[PD_ONEHALFM1_CJ]
+
+ paddd xmm1, xmm2
+ paddd xmm5, xmm4
+ paddd xmm1, xmm0
+ paddd xmm5, xmm0
+ psrld xmm1, SCALEBITS ; xmm1=CrEL
+ psrld xmm5, SCALEBITS ; xmm5=CrEH
+ packssdw xmm1, xmm5 ; xmm1=CrE
+
+ psllw xmm7, BYTE_BIT
+ por xmm1, xmm7 ; xmm1=Cr
+ movdqa XMMWORD [rdx], xmm1 ; Save Cr
+
+ sub rcx, byte SIZEOF_XMMWORD
+ add rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr
+ add rdi, byte SIZEOF_XMMWORD ; outptr0
+ add rbx, byte SIZEOF_XMMWORD ; outptr1
+ add rdx, byte SIZEOF_XMMWORD ; outptr2
+ cmp rcx, byte SIZEOF_XMMWORD
+ jae near .columnloop
+ test rcx, rcx
+ jnz near .column_ld1
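+    ; Editor's note (assumption): after each full-width iteration, control
+    ; either stays in .columnloop (at least SIZEOF_XMMWORD pixels remain)
+    ; or re-enters .column_ld1 to gather the sub-16-pixel tail; the flow
+    ; mirrors the AVX2 variant at half the vector width.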
+
+ pop rcx ; col
+ pop rsi
+ pop rdi
+ pop rbx
+ pop rdx
+
+ add rsi, byte SIZEOF_JSAMPROW ; input_buf
+ add rdi, byte SIZEOF_JSAMPROW
+ add rbx, byte SIZEOF_JSAMPROW
+ add rdx, byte SIZEOF_JSAMPROW
+ dec rax ; num_rows
+ jg near .rowloop
+
+.return:
+ pop rbx
+ uncollect_args 5
+ mov rsp, rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- original rbp
+ pop rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jccolor-avx2.asm b/media/libjpeg/simd/x86_64/jccolor-avx2.asm
new file mode 100644
index 0000000000..16b78298dc
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jccolor-avx2.asm
@@ -0,0 +1,121 @@
+;
+; jccolor.asm - colorspace conversion (64-bit AVX2)
+;
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it can *not*
+; be assembled with Microsoft's MASM or any compatible assembler (including
+; Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS 16
+
+F_0_081 equ 5329 ; FIX(0.08131)
+F_0_114 equ 7471 ; FIX(0.11400)
+F_0_168 equ 11059 ; FIX(0.16874)
+F_0_250 equ 16384 ; FIX(0.25000)
+F_0_299 equ 19595 ; FIX(0.29900)
+F_0_331 equ 21709 ; FIX(0.33126)
+F_0_418 equ 27439 ; FIX(0.41869)
+F_0_587 equ 38470 ; FIX(0.58700)
+F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_rgb_ycc_convert_avx2)
+
+EXTN(jconst_rgb_ycc_convert_avx2):
+
+PW_F0299_F0337 times 8 dw F_0_299, F_0_337
+PW_F0114_F0250 times 8 dw F_0_114, F_0_250
+PW_MF016_MF033 times 8 dw -F_0_168, -F_0_331
+PW_MF008_MF041 times 8 dw -F_0_081, -F_0_418
+PD_ONEHALFM1_CJ times 8 dd (1 << (SCALEBITS - 1)) - 1 + \
+ (CENTERJSAMPLE << SCALEBITS)
+PD_ONEHALF times 8 dd (1 << (SCALEBITS - 1))
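+; Editor's note (sketch): FIX(x) denotes round(x * 2^SCALEBITS), e.g.
+; FIX(0.29900) = round(0.299 * 65536) = 19595.  PD_ONEHALFM1_CJ folds the
+; rounding bias (1 << (SCALEBITS-1)) - 1 and the chroma offset
+; (CENTERJSAMPLE << SCALEBITS) into a single per-lane constant.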
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2 jsimd_extrgb_ycc_convert_avx2
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2 jsimd_extrgbx_ycc_convert_avx2
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2 jsimd_extbgr_ycc_convert_avx2
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2 jsimd_extbgrx_ycc_convert_avx2
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2 jsimd_extxbgr_ycc_convert_avx2
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2 jsimd_extxrgb_ycc_convert_avx2
+%include "jccolext-avx2.asm"
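+; Editor's note: the %undef/%define/%include pattern above is textual
+; templating: jccolext-avx2.asm is expanded seven times, once for the
+; default RGB layout and once per extended layout, with the entry point
+; renamed each time so a single source file yields all of the
+; jsimd_ext*_ycc_convert_avx2 variants.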
diff --git a/media/libjpeg/simd/x86_64/jccolor-sse2.asm b/media/libjpeg/simd/x86_64/jccolor-sse2.asm
new file mode 100644
index 0000000000..e2955c2134
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jccolor-sse2.asm
@@ -0,0 +1,120 @@
+;
+; jccolor.asm - colorspace conversion (64-bit SSE2)
+;
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it can *not*
+; be assembled with Microsoft's MASM or any compatible assembler (including
+; Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS 16
+
+F_0_081 equ 5329 ; FIX(0.08131)
+F_0_114 equ 7471 ; FIX(0.11400)
+F_0_168 equ 11059 ; FIX(0.16874)
+F_0_250 equ 16384 ; FIX(0.25000)
+F_0_299 equ 19595 ; FIX(0.29900)
+F_0_331 equ 21709 ; FIX(0.33126)
+F_0_418 equ 27439 ; FIX(0.41869)
+F_0_587 equ 38470 ; FIX(0.58700)
+F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_rgb_ycc_convert_sse2)
+
+EXTN(jconst_rgb_ycc_convert_sse2):
+
+PW_F0299_F0337 times 4 dw F_0_299, F_0_337
+PW_F0114_F0250 times 4 dw F_0_114, F_0_250
+PW_MF016_MF033 times 4 dw -F_0_168, -F_0_331
+PW_MF008_MF041 times 4 dw -F_0_081, -F_0_418
+PD_ONEHALFM1_CJ times 4 dd (1 << (SCALEBITS - 1)) - 1 + \
+ (CENTERJSAMPLE << SCALEBITS)
+PD_ONEHALF times 4 dd (1 << (SCALEBITS - 1))
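+; Editor's note: these are the same constants as in jccolor-avx2.asm,
+; replicated with "times 4" instead of "times 8" because an xmmword holds
+; half as many words and dwords as a ymmword.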
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+
+%include "jccolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extrgb_ycc_convert_sse2
+%include "jccolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extrgbx_ycc_convert_sse2
+%include "jccolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extbgr_ycc_convert_sse2
+%include "jccolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extbgrx_ycc_convert_sse2
+%include "jccolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extxbgr_ycc_convert_sse2
+%include "jccolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extxrgb_ycc_convert_sse2
+%include "jccolext-sse2.asm"
diff --git a/media/libjpeg/simd/x86_64/jcgray-avx2.asm b/media/libjpeg/simd/x86_64/jcgray-avx2.asm
new file mode 100644
index 0000000000..591255bb11
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jcgray-avx2.asm
@@ -0,0 +1,113 @@
+;
+; jcgray.asm - grayscale colorspace conversion (64-bit AVX2)
+;
+; Copyright (C) 2011, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it can *not*
+; be assembled with Microsoft's MASM or any compatible assembler (including
+; Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS 16
+
+F_0_114 equ 7471 ; FIX(0.11400)
+F_0_250 equ 16384 ; FIX(0.25000)
+F_0_299 equ 19595 ; FIX(0.29900)
+F_0_587 equ 38470 ; FIX(0.58700)
+F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_rgb_gray_convert_avx2)
+
+EXTN(jconst_rgb_gray_convert_avx2):
+
+PW_F0299_F0337 times 8 dw F_0_299, F_0_337
+PW_F0114_F0250 times 8 dw F_0_114, F_0_250
+PD_ONEHALF times 8 dd (1 << (SCALEBITS - 1))
+
+ alignz 32
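+; Editor's note: the grayscale path computes only the Y channel, so the
+; Cb/Cr constants of jccolor-avx2.asm (PW_MF016_MF033, PW_MF008_MF041,
+; PD_ONEHALFM1_CJ) are intentionally absent here.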
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2 jsimd_extrgb_gray_convert_avx2
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2 jsimd_extrgbx_gray_convert_avx2
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2 jsimd_extbgr_gray_convert_avx2
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2 jsimd_extbgrx_gray_convert_avx2
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2 jsimd_extxbgr_gray_convert_avx2
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2 jsimd_extxrgb_gray_convert_avx2
+%include "jcgryext-avx2.asm"
diff --git a/media/libjpeg/simd/x86_64/jcgray-sse2.asm b/media/libjpeg/simd/x86_64/jcgray-sse2.asm
new file mode 100644
index 0000000000..e389904f2f
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jcgray-sse2.asm
@@ -0,0 +1,112 @@
+;
+; jcgray.asm - grayscale colorspace conversion (64-bit SSE2)
+;
+; Copyright (C) 2011, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it can *not*
+; be assembled with Microsoft's MASM or any compatible assembler (including
+; Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS 16
+
+F_0_114 equ 7471 ; FIX(0.11400)
+F_0_250 equ 16384 ; FIX(0.25000)
+F_0_299 equ 19595 ; FIX(0.29900)
+F_0_587 equ 38470 ; FIX(0.58700)
+F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000)
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_rgb_gray_convert_sse2)
+
+EXTN(jconst_rgb_gray_convert_sse2):
+
+PW_F0299_F0337 times 4 dw F_0_299, F_0_337
+PW_F0114_F0250 times 4 dw F_0_114, F_0_250
+PD_ONEHALF times 4 dd (1 << (SCALEBITS - 1))
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+
+%include "jcgryext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_rgb_gray_convert_sse2 jsimd_extrgb_gray_convert_sse2
+%include "jcgryext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_rgb_gray_convert_sse2 jsimd_extrgbx_gray_convert_sse2
+%include "jcgryext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_rgb_gray_convert_sse2 jsimd_extbgr_gray_convert_sse2
+%include "jcgryext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_rgb_gray_convert_sse2 jsimd_extbgrx_gray_convert_sse2
+%include "jcgryext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_rgb_gray_convert_sse2 jsimd_extxbgr_gray_convert_sse2
+%include "jcgryext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_rgb_gray_convert_sse2 jsimd_extxrgb_gray_convert_sse2
+%include "jcgryext-sse2.asm"
diff --git a/media/libjpeg/simd/x86_64/jcgryext-avx2.asm b/media/libjpeg/simd/x86_64/jcgryext-avx2.asm
new file mode 100644
index 0000000000..ddcc2c0a2f
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jcgryext-avx2.asm
@@ -0,0 +1,438 @@
+;
+; jcgryext.asm - grayscale colorspace conversion (64-bit AVX2)
+;
+; Copyright (C) 2011, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it can *not*
+; be assembled with Microsoft's MASM or any compatible assembler (including
+; Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_gray_convert_avx2(JDIMENSION img_width, JSAMPARRAY input_buf,
+; JSAMPIMAGE output_buf, JDIMENSION output_row,
+; int num_rows);
+;
+
+; r10d = JDIMENSION img_width
+; r11 = JSAMPARRAY input_buf
+; r12 = JSAMPIMAGE output_buf
+; r13d = JDIMENSION output_row
+; r14d = int num_rows
+
+%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD ; ymmword wk[WK_NUM]
+%define WK_NUM 2
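+; Editor's note (assumption): WK_NUM is 2 rather than 8 because the
+; grayscale kernel spills far less intermediate state than the full YCC
+; conversion, and the prologue below fetches only one output plane
+; (output_buf[0]).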
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_rgb_gray_convert_avx2)
+
+EXTN(jsimd_rgb_gray_convert_avx2):
+ push rbp
+ mov rax, rsp ; rax = original rbp
+ sub rsp, byte 4
+ and rsp, byte (-SIZEOF_YMMWORD) ; align to 256 bits
+ mov [rsp], rax
+ mov rbp, rsp ; rbp = aligned rbp
+ lea rsp, [wk(0)]
+ collect_args 5
+ push rbx
+
+ mov ecx, r10d
+ test rcx, rcx
+ jz near .return
+
+ push rcx
+
+ mov rsi, r12
+ mov ecx, r13d
+ mov rdip, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
+ lea rdi, [rdi+rcx*SIZEOF_JSAMPROW]
+
+ pop rcx
+
+ mov rsi, r11
+ mov eax, r14d
+ test rax, rax
+ jle near .return
+.rowloop:
+ push rdi
+ push rsi
+ push rcx ; col
+
+ mov rsip, JSAMPROW [rsi] ; inptr
+ mov rdip, JSAMPROW [rdi] ; outptr0
+
+ cmp rcx, byte SIZEOF_YMMWORD
+ jae near .columnloop
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+.column_ld1:
+ push rax
+ push rdx
+ lea rcx, [rcx+rcx*2] ; imul ecx,RGB_PIXELSIZE
+ test cl, SIZEOF_BYTE
+ jz short .column_ld2
+ sub rcx, byte SIZEOF_BYTE
+ movzx rax, byte [rsi+rcx]
+.column_ld2:
+ test cl, SIZEOF_WORD
+ jz short .column_ld4
+ sub rcx, byte SIZEOF_WORD
+ movzx rdx, word [rsi+rcx]
+ shl rax, WORD_BIT
+ or rax, rdx
+.column_ld4:
+ vmovd xmmA, eax
+ pop rdx
+ pop rax
+ test cl, SIZEOF_DWORD
+ jz short .column_ld8
+ sub rcx, byte SIZEOF_DWORD
+ vmovd xmmF, XMM_DWORD [rsi+rcx]
+ vpslldq xmmA, xmmA, SIZEOF_DWORD
+ vpor xmmA, xmmA, xmmF
+.column_ld8:
+ test cl, SIZEOF_MMWORD
+ jz short .column_ld16
+ sub rcx, byte SIZEOF_MMWORD
+ vmovq xmmB, XMM_MMWORD [rsi+rcx]
+ vpslldq xmmA, xmmA, SIZEOF_MMWORD
+ vpor xmmA, xmmA, xmmB
+.column_ld16:
+ test cl, SIZEOF_XMMWORD
+ jz short .column_ld32
+ sub rcx, byte SIZEOF_XMMWORD
+ vmovdqu xmmB, XMM_MMWORD [rsi+rcx]
+ vperm2i128 ymmA, ymmA, ymmA, 1
+ vpor ymmA, ymmB
+.column_ld32:
+ test cl, SIZEOF_YMMWORD
+ jz short .column_ld64
+ sub rcx, byte SIZEOF_YMMWORD
+ vmovdqa ymmF, ymmA
+ vmovdqu ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+.column_ld64:
+ test cl, 2*SIZEOF_YMMWORD
+ mov rcx, SIZEOF_YMMWORD
+ jz short .rgb_gray_cnv
+ vmovdqa ymmB, ymmA
+ vmovdqu ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+ vmovdqu ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+ jmp short .rgb_gray_cnv
+
+.columnloop:
+ vmovdqu ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+ vmovdqu ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+ vmovdqu ymmB, YMMWORD [rsi+2*SIZEOF_YMMWORD]
+
+.rgb_gray_cnv:
+ ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+ ; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+ ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+ ; ymmB=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+ ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+ vmovdqu ymmC, ymmA
+ vinserti128 ymmA, ymmF, xmmA, 0 ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+ ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+ vinserti128 ymmC, ymmC, xmmB, 0 ; ymmC=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+ ; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ vinserti128 ymmB, ymmB, xmmF, 0 ; ymmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+ ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+ vperm2i128 ymmF, ymmC, ymmC, 1 ; ymmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
+ ; 1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
+
+ vmovdqa ymmG, ymmA
+ vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12
+ ; 22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I)
+ vpsrldq ymmG, ymmG, 8 ; ymmG=(22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I
+ ; 2I 0J 1J 2J 0K 1K 2K 0L -- -- -- -- -- -- -- --)
+
+ vpunpckhbw ymmA, ymmA, ymmF ; ymmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A
+ ; 0G 0O 1G 1O 2G 2O 0H 0P 1H 1P 2H 2P 0I 0Q 1I 1Q)
+ vpslldq ymmF, ymmF, 8 ; ymmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27
+ ; 08 18 28 09 19 29 0A 1A 1L 2L 0M 1M 2M 0N 1N 2N)
+
+ vpunpcklbw ymmG, ymmG, ymmB ; ymmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D
+ ; 2I 2Q 0J 0R 1J 1R 2J 2R 0K 0S 1K 1S 2K 2S 0L 0T)
+ vpunpckhbw ymmF, ymmF, ymmB ; ymmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F
+ ; 1L 1T 2L 2T 0M 0U 1M 1U 2M 2U 0N 0V 1N 1V 2N 2V)
+
+ vmovdqa ymmD, ymmA
+ vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09
+ ; 11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P)
+ vpsrldq ymmD, ymmD, 8 ; ymmD=(11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P
+ ; 1H 1P 2H 2P 0I 0Q 1I 1Q -- -- -- -- -- -- -- --)
+
+ vpunpckhbw ymmA, ymmA, ymmG ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D
+ ; 0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 0H 0L 0P 0T)
+ vpslldq ymmG, ymmG, 8 ; ymmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B
+ ; 04 0C 14 1C 24 2C 05 0D 2I 2Q 0J 0R 1J 1R 2J 2R)
+
+ vpunpcklbw ymmD, ymmD, ymmF ; ymmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E
+ ; 1H 1L 1P 1T 2H 2L 2P 2T 0I 0M 0Q 0U 1I 1M 1Q 1U)
+ vpunpckhbw ymmG, ymmG, ymmF ; ymmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F
+ ; 2I 2M 2Q 2U 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V)
+
+ vmovdqa ymmE, ymmA
+ vpslldq ymmA, ymmA, 8 ; ymmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C
+ ; 20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S)
+ vpsrldq ymmE, ymmE, 8 ; ymmE=(20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S
+ ; 2G 2K 2O 2S 0H 0L 0P 0T -- -- -- -- -- -- -- --)
+
+ vpunpckhbw ymmA, ymmA, ymmD ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
+ ; 0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
+ vpslldq ymmD, ymmD, 8 ; ymmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D
+ ; 02 06 0A 0E 12 16 1A 1E 1H 1L 1P 1T 2H 2L 2P 2T)
+
+ vpunpcklbw ymmE, ymmE, ymmG ; ymmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F
+ ; 2G 2I 2K 2M 2O 2Q 2S 2U 0H 0J 0L 0N 0P 0R 0T 0V)
+ vpunpckhbw ymmD, ymmD, ymmG ; ymmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F
+ ; 1H 1J 1L 1N 1P 1R 1T 1V 2H 2J 2L 2N 2P 2R 2T 2V)
+
+ vpxor ymmH, ymmH, ymmH
+
+ vmovdqa ymmC, ymmA
+ vpunpcklbw ymmA, ymmA, ymmH ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
+ vpunpckhbw ymmC, ymmC, ymmH ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
+
+ vmovdqa ymmB, ymmE
+ vpunpcklbw ymmE, ymmE, ymmH ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
+ vpunpckhbw ymmB, ymmB, ymmH ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
+
+ vmovdqa ymmF, ymmD
+ vpunpcklbw ymmD, ymmD, ymmH ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
+ vpunpckhbw ymmF, ymmF, ymmH ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+ test cl, SIZEOF_XMMWORD/16
+ jz short .column_ld2
+ sub rcx, byte SIZEOF_XMMWORD/16
+ vmovd xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
+.column_ld2:
+ test cl, SIZEOF_XMMWORD/8
+ jz short .column_ld4
+ sub rcx, byte SIZEOF_XMMWORD/8
+ vmovq xmmF, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
+ vpslldq xmmA, xmmA, SIZEOF_MMWORD
+ vpor xmmA, xmmA, xmmF
+.column_ld4:
+ test cl, SIZEOF_XMMWORD/4
+ jz short .column_ld8
+ sub rcx, byte SIZEOF_XMMWORD/4
+ vmovdqa xmmF, xmmA
+ vperm2i128 ymmF, ymmF, ymmF, 1
+ vmovdqu xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
+ vpor ymmA, ymmA, ymmF
+.column_ld8:
+ test cl, SIZEOF_XMMWORD/2
+ jz short .column_ld16
+ sub rcx, byte SIZEOF_XMMWORD/2
+ vmovdqa ymmF, ymmA
+ vmovdqu ymmA, YMMWORD [rsi+rcx*RGB_PIXELSIZE]
+.column_ld16:
+ test cl, SIZEOF_XMMWORD
+ mov rcx, SIZEOF_YMMWORD
+ jz short .rgb_gray_cnv
+ vmovdqa ymmE, ymmA
+ vmovdqa ymmH, ymmF
+ vmovdqu ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+ vmovdqu ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+ jmp short .rgb_gray_cnv
+
+.columnloop:
+ vmovdqu ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+ vmovdqu ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+ vmovdqu ymmE, YMMWORD [rsi+2*SIZEOF_YMMWORD]
+ vmovdqu ymmH, YMMWORD [rsi+3*SIZEOF_YMMWORD]
+
+.rgb_gray_cnv:
+ ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ ; 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+ ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+ ; 0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+ ; ymmE=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
+ ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+ ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
+ ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+ vmovdqa ymmB, ymmA
+ vinserti128 ymmA, ymmA, xmmE, 1 ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ ; 0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
+ vperm2i128 ymmE, ymmB, ymmE, 0x31 ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+ ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+
+ vmovdqa ymmB, ymmF
+ vinserti128 ymmF, ymmF, xmmH, 1 ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+ ; 0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
+ vperm2i128 ymmH, ymmB, ymmH, 0x31 ; ymmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
+ ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+ vmovdqa ymmD, ymmA
+ vpunpcklbw ymmA, ymmA, ymmE ; ymmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35
+ ; 0G 0K 1G 1K 2G 2K 3G 3K 0H 0L 1H 1L 2H 2L 3H 3L)
+ vpunpckhbw ymmD, ymmD, ymmE ; ymmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37
+ ; 0I 0M 1I 1M 2I 2M 3I 3M 0J 0N 1J 1N 2J 2N 3J 3N)
+
+ vmovdqa ymmC, ymmF
+ vpunpcklbw ymmF, ymmF, ymmH ; ymmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D
+ ; 0O 0S 1O 1S 2O 2S 3O 3S 0P 0T 1P 1T 2P 2T 3P 3T)
+ vpunpckhbw ymmC, ymmC, ymmH ; ymmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F
+ ; 0Q 0U 1Q 1U 2Q 2U 3Q 3U 0R 0V 1R 1V 2R 2V 3R 3V)
+
+ vmovdqa ymmB, ymmA
+ vpunpcklwd ymmA, ymmA, ymmF ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C
+ ; 0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 3G 3K 3O 3S)
+ vpunpckhwd ymmB, ymmB, ymmF ; ymmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D
+ ; 0H 0L 0P 0T 1H 1L 1P 1T 2H 2L 2P 2T 3H 3L 3P 3T)
+
+ vmovdqa ymmG, ymmD
+ vpunpcklwd ymmD, ymmD, ymmC ; ymmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E
+ ; 0I 0M 0Q 0U 1I 1M 1Q 1U 2I 2M 2Q 2U 3I 3M 3Q 3U)
+ vpunpckhwd ymmG, ymmG, ymmC ; ymmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F
+ ; 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V 3J 3N 3R 3V)
+
+ vmovdqa ymmE, ymmA
+ vpunpcklbw ymmA, ymmA, ymmD ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
+ ; 0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
+ vpunpckhbw ymmE, ymmE, ymmD ; ymmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E
+ ; 2G 2I 2K 2M 2O 2Q 2S 2U 3G 3I 3K 3M 3O 3Q 3S 3U)
+
+ vmovdqa ymmH, ymmB
+ vpunpcklbw ymmB, ymmB, ymmG ; ymmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F
+ ; 0H 0J 0L 0N 0P 0R 0T 0V 1H 1J 1L 1N 1P 1R 1T 1V)
+ vpunpckhbw ymmH, ymmH, ymmG ; ymmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F
+ ; 2H 2J 2L 2N 2P 2R 2T 2V 3H 3J 3L 3N 3P 3R 3T 3V)
+
+ vpxor ymmF, ymmF, ymmF
+
+ vmovdqa ymmC, ymmA
+ vpunpcklbw ymmA, ymmA, ymmF ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
+ vpunpckhbw ymmC, ymmC, ymmF ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
+
+ vmovdqa ymmD, ymmB
+ vpunpcklbw ymmB, ymmB, ymmF ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
+ vpunpckhbw ymmD, ymmD, ymmF ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
+
+ vmovdqa ymmG, ymmE
+ vpunpcklbw ymmE, ymmE, ymmF ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
+ vpunpckhbw ymmG, ymmG, ymmF ; ymmG=(30 32 34 36 38 3A 3C 3E 3G 3I 3K 3M 3O 3Q 3S 3U)
+
+ vpunpcklbw ymmF, ymmF, ymmH
+ vpunpckhbw ymmH, ymmH, ymmH
+ vpsrlw ymmF, ymmF, BYTE_BIT ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
+ vpsrlw ymmH, ymmH, BYTE_BIT ; ymmH=(31 33 35 37 39 3B 3D 3F 3H 3J 3L 3N 3P 3R 3T 3V)
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+ ; ymm0=R(02468ACEGIKMOQSU)=RE, ymm2=G(02468ACEGIKMOQSU)=GE, ymm4=B(02468ACEGIKMOQSU)=BE
+ ; ymm1=R(13579BDFHJLNPRTV)=RO, ymm3=G(13579BDFHJLNPRTV)=GO, ymm5=B(13579BDFHJLNPRTV)=BO
+
+ ; (Original)
+ ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+ ;
+ ; (This implementation)
+ ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
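+ ;
+ ; (G is split as 0.587 = 0.337 + 0.250 so that each vpmaddwd can pair
+ ; G with another channel: {R,G} uses PW_F0299_F0337 and {B,G} uses
+ ; PW_F0114_F0250.  A rough scalar sketch of the fixed-point math,
+ ; assuming SCALEBITS = 16 and FIX(x) = (int)((x) * 65536 + 0.5) as in
+ ; jccolor.c:
+ ;
+ ; int y = FIX(0.29900) * r + FIX(0.33700) * g  /* first vpmaddwd  */
+ ;       + FIX(0.11400) * b + FIX(0.25000) * g  /* second vpmaddwd */
+ ;       + ONE_HALF;                            /* PD_ONEHALF      */
+ ; Y = (JSAMPLE)(y >> SCALEBITS);               /* vpsrld + pack   */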
+
+ vmovdqa ymm6, ymm1
+ vpunpcklwd ymm1, ymm1, ymm3
+ vpunpckhwd ymm6, ymm6, ymm3
+ vpmaddwd ymm1, ymm1, [rel PW_F0299_F0337] ; ymm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+ vpmaddwd ymm6, ymm6, [rel PW_F0299_F0337] ; ymm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+ vmovdqa ymm7, ymm6 ; ymm7=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+ vmovdqa ymm6, ymm0
+ vpunpcklwd ymm0, ymm0, ymm2
+ vpunpckhwd ymm6, ymm6, ymm2
+ vpmaddwd ymm0, ymm0, [rel PW_F0299_F0337] ; ymm0=REL*FIX(0.299)+GEL*FIX(0.337)
+ vpmaddwd ymm6, ymm6, [rel PW_F0299_F0337] ; ymm6=REH*FIX(0.299)+GEH*FIX(0.337)
+
+ vmovdqa YMMWORD [wk(0)], ymm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
+ vmovdqa YMMWORD [wk(1)], ymm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+ vmovdqa ymm0, ymm5 ; ymm0=BO
+ vmovdqa ymm6, ymm4 ; ymm6=BE
+
+ vmovdqa ymm4, ymm0
+ vpunpcklwd ymm0, ymm0, ymm3
+ vpunpckhwd ymm4, ymm4, ymm3
+ vpmaddwd ymm0, ymm0, [rel PW_F0114_F0250] ; ymm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+ vpmaddwd ymm4, ymm4, [rel PW_F0114_F0250] ; ymm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+
+ vmovdqa ymm3, [rel PD_ONEHALF] ; ymm3=[PD_ONEHALF]
+
+ vpaddd ymm0, ymm0, ymm1
+ vpaddd ymm4, ymm4, ymm7
+ vpaddd ymm0, ymm0, ymm3
+ vpaddd ymm4, ymm4, ymm3
+ vpsrld ymm0, ymm0, SCALEBITS ; ymm0=YOL
+ vpsrld ymm4, ymm4, SCALEBITS ; ymm4=YOH
+ vpackssdw ymm0, ymm0, ymm4 ; ymm0=YO
+
+ vmovdqa ymm4, ymm6
+ vpunpcklwd ymm6, ymm6, ymm2
+ vpunpckhwd ymm4, ymm4, ymm2
+ vpmaddwd ymm6, ymm6, [rel PW_F0114_F0250] ; ymm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+ vpmaddwd ymm4, ymm4, [rel PW_F0114_F0250] ; ymm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+
+ vmovdqa ymm2, [rel PD_ONEHALF] ; ymm2=[PD_ONEHALF]
+
+ vpaddd ymm6, ymm6, YMMWORD [wk(0)]
+ vpaddd ymm4, ymm4, YMMWORD [wk(1)]
+ vpaddd ymm6, ymm6, ymm2
+ vpaddd ymm4, ymm4, ymm2
+ vpsrld ymm6, ymm6, SCALEBITS ; ymm6=YEL
+ vpsrld ymm4, ymm4, SCALEBITS ; ymm4=YEH
+ vpackssdw ymm6, ymm6, ymm4 ; ymm6=YE
+
+ vpsllw ymm0, ymm0, BYTE_BIT
+ vpor ymm6, ymm6, ymm0 ; ymm6=Y
+ vmovdqu YMMWORD [rdi], ymm6 ; Save Y
+
+ sub rcx, byte SIZEOF_YMMWORD
+ add rsi, RGB_PIXELSIZE*SIZEOF_YMMWORD ; inptr
+ add rdi, byte SIZEOF_YMMWORD ; outptr0
+ cmp rcx, byte SIZEOF_YMMWORD
+ jae near .columnloop
+ test rcx, rcx
+ jnz near .column_ld1
+
+ pop rcx ; col
+ pop rsi
+ pop rdi
+
+ add rsi, byte SIZEOF_JSAMPROW ; input_buf
+ add rdi, byte SIZEOF_JSAMPROW
+ dec rax ; num_rows
+ jg near .rowloop
+
+.return:
+ pop rbx
+ vzeroupper
+ uncollect_args 5
+ mov rsp, rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- original rbp
+ pop rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jcgryext-sse2.asm b/media/libjpeg/simd/x86_64/jcgryext-sse2.asm
new file mode 100644
index 0000000000..f1d399a63b
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jcgryext-sse2.asm
@@ -0,0 +1,363 @@
+;
+; jcgryext.asm - grayscale colorspace conversion (64-bit SSE2)
+;
+; Copyright (C) 2011, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_gray_convert_sse2(JDIMENSION img_width, JSAMPARRAY input_buf,
+; JSAMPIMAGE output_buf, JDIMENSION output_row,
+; int num_rows);
+;
+
+; r10d = JDIMENSION img_width
+; r11 = JSAMPARRAY input_buf
+; r12 = JSAMPIMAGE output_buf
+; r13d = JDIMENSION output_row
+; r14d = int num_rows
+
+%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 2
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_rgb_gray_convert_sse2)
+
+EXTN(jsimd_rgb_gray_convert_sse2):
+ push rbp
+ mov rax, rsp ; rax = original rbp
+ sub rsp, byte 4
+ and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [rsp], rax
+ mov rbp, rsp ; rbp = aligned rbp
+ lea rsp, [wk(0)]
+ collect_args 5
+ push rbx
+
+ mov ecx, r10d
+ test rcx, rcx
+ jz near .return
+
+ push rcx
+
+ mov rsi, r12
+ mov ecx, r13d
+ mov rdip, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
+ lea rdi, [rdi+rcx*SIZEOF_JSAMPROW]
+
+ pop rcx
+
+ mov rsi, r11
+ mov eax, r14d
+ test rax, rax
+ jle near .return
+.rowloop:
+ push rdi
+ push rsi
+ push rcx ; col
+
+ mov rsip, JSAMPROW [rsi] ; inptr
+ mov rdip, JSAMPROW [rdi] ; outptr0
+
+ cmp rcx, byte SIZEOF_XMMWORD
+ jae near .columnloop
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+.column_ld1:
+ push rax
+ push rdx
+ lea rcx, [rcx+rcx*2] ; imul ecx,RGB_PIXELSIZE
+ test cl, SIZEOF_BYTE
+ jz short .column_ld2
+ sub rcx, byte SIZEOF_BYTE
+ movzx rax, byte [rsi+rcx]
+.column_ld2:
+ test cl, SIZEOF_WORD
+ jz short .column_ld4
+ sub rcx, byte SIZEOF_WORD
+ movzx rdx, word [rsi+rcx]
+ shl rax, WORD_BIT
+ or rax, rdx
+.column_ld4:
+ movd xmmA, eax
+ pop rdx
+ pop rax
+ test cl, SIZEOF_DWORD
+ jz short .column_ld8
+ sub rcx, byte SIZEOF_DWORD
+ movd xmmF, XMM_DWORD [rsi+rcx]
+ pslldq xmmA, SIZEOF_DWORD
+ por xmmA, xmmF
+.column_ld8:
+ test cl, SIZEOF_MMWORD
+ jz short .column_ld16
+ sub rcx, byte SIZEOF_MMWORD
+ movq xmmB, XMM_MMWORD [rsi+rcx]
+ pslldq xmmA, SIZEOF_MMWORD
+ por xmmA, xmmB
+.column_ld16:
+ test cl, SIZEOF_XMMWORD
+ jz short .column_ld32
+ movdqa xmmF, xmmA
+ movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+ mov rcx, SIZEOF_XMMWORD
+ jmp short .rgb_gray_cnv
+.column_ld32:
+ test cl, 2*SIZEOF_XMMWORD
+ mov rcx, SIZEOF_XMMWORD
+ jz short .rgb_gray_cnv
+ movdqa xmmB, xmmA
+ movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+ movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+ jmp short .rgb_gray_cnv
+
+.columnloop:
+ movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+ movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+ movdqu xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]
+
+.rgb_gray_cnv:
+ ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+ ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+ movdqa xmmG, xmmA
+ pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
+ psrldq xmmG, 8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
+
+ punpckhbw xmmA, xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
+ pslldq xmmF, 8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
+
+ punpcklbw xmmG, xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
+ punpckhbw xmmF, xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
+
+ movdqa xmmD, xmmA
+ pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
+ psrldq xmmD, 8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
+
+ punpckhbw xmmA, xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
+ pslldq xmmG, 8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
+
+ punpcklbw xmmD, xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
+ punpckhbw xmmG, xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
+
+ movdqa xmmE, xmmA
+ pslldq xmmA, 8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
+ psrldq xmmE, 8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
+
+ punpckhbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+ pslldq xmmD, 8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
+
+ punpcklbw xmmE, xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
+ punpckhbw xmmD, xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
+
+ pxor xmmH, xmmH
+
+ movdqa xmmC, xmmA
+ punpcklbw xmmA, xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E)
+ punpckhbw xmmC, xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+ movdqa xmmB, xmmE
+ punpcklbw xmmE, xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E)
+ punpckhbw xmmB, xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F)
+
+ movdqa xmmF, xmmD
+ punpcklbw xmmD, xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F)
+ punpckhbw xmmF, xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F)
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+ test cl, SIZEOF_XMMWORD/16
+ jz short .column_ld2
+ sub rcx, byte SIZEOF_XMMWORD/16
+ movd xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
+.column_ld2:
+ test cl, SIZEOF_XMMWORD/8
+ jz short .column_ld4
+ sub rcx, byte SIZEOF_XMMWORD/8
+ movq xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
+ pslldq xmmA, SIZEOF_MMWORD
+ por xmmA, xmmE
+.column_ld4:
+ test cl, SIZEOF_XMMWORD/4
+ jz short .column_ld8
+ sub rcx, byte SIZEOF_XMMWORD/4
+ movdqa xmmE, xmmA
+ movdqu xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
+.column_ld8:
+ test cl, SIZEOF_XMMWORD/2
+ mov rcx, SIZEOF_XMMWORD
+ jz short .rgb_gray_cnv
+ movdqa xmmF, xmmA
+ movdqa xmmH, xmmE
+ movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+ movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+ jmp short .rgb_gray_cnv
+
+.columnloop:
+ movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+ movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+ movdqu xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
+ movdqu xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]
+
+.rgb_gray_cnv:
+ ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+ ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+ ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+ ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+ movdqa xmmD, xmmA
+ punpcklbw xmmA, xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
+ punpckhbw xmmD, xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
+
+ movdqa xmmC, xmmF
+ punpcklbw xmmF, xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
+ punpckhbw xmmC, xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
+
+ movdqa xmmB, xmmA
+ punpcklwd xmmA, xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
+ punpckhwd xmmB, xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
+
+ movdqa xmmG, xmmD
+ punpcklwd xmmD, xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
+ punpckhwd xmmG, xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
+
+ movdqa xmmE, xmmA
+ punpcklbw xmmA, xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+ punpckhbw xmmE, xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
+
+ movdqa xmmH, xmmB
+ punpcklbw xmmB, xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
+ punpckhbw xmmH, xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
+
+ pxor xmmF, xmmF
+
+ movdqa xmmC, xmmA
+ punpcklbw xmmA, xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E)
+ punpckhbw xmmC, xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+ movdqa xmmD, xmmB
+ punpcklbw xmmB, xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F)
+ punpckhbw xmmD, xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F)
+
+ movdqa xmmG, xmmE
+ punpcklbw xmmE, xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E)
+ punpckhbw xmmG, xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E)
+
+ punpcklbw xmmF, xmmH
+ punpckhbw xmmH, xmmH
+ psrlw xmmF, BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
+ psrlw xmmH, BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+ ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
+ ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
+
+ ; (Original)
+ ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+ ;
+ ; (This implementation)
+ ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
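+ ;
+ ; (Same decomposition as the AVX2 flavor: splitting G as
+ ; 0.587 = 0.337 + 0.250 lets each pmaddwd pair G with another channel
+ ; via the PW_F0299_F0337 and PW_F0114_F0250 constant pairs, with
+ ; PD_ONEHALF supplying the rounding term before the shift by SCALEBITS.)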
+
+ movdqa xmm6, xmm1
+ punpcklwd xmm1, xmm3
+ punpckhwd xmm6, xmm3
+ pmaddwd xmm1, [rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+ pmaddwd xmm6, [rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+ movdqa xmm7, xmm6 ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+ movdqa xmm6, xmm0
+ punpcklwd xmm0, xmm2
+ punpckhwd xmm6, xmm2
+ pmaddwd xmm0, [rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
+ pmaddwd xmm6, [rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
+
+ movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
+ movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+ movdqa xmm0, xmm5 ; xmm0=BO
+ movdqa xmm6, xmm4 ; xmm6=BE
+
+ movdqa xmm4, xmm0
+ punpcklwd xmm0, xmm3
+ punpckhwd xmm4, xmm3
+ pmaddwd xmm0, [rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+ pmaddwd xmm4, [rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+
+ movdqa xmm3, [rel PD_ONEHALF] ; xmm3=[PD_ONEHALF]
+
+ paddd xmm0, xmm1
+ paddd xmm4, xmm7
+ paddd xmm0, xmm3
+ paddd xmm4, xmm3
+ psrld xmm0, SCALEBITS ; xmm0=YOL
+ psrld xmm4, SCALEBITS ; xmm4=YOH
+ packssdw xmm0, xmm4 ; xmm0=YO
+
+ movdqa xmm4, xmm6
+ punpcklwd xmm6, xmm2
+ punpckhwd xmm4, xmm2
+ pmaddwd xmm6, [rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+ pmaddwd xmm4, [rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+
+ movdqa xmm2, [rel PD_ONEHALF] ; xmm2=[PD_ONEHALF]
+
+ paddd xmm6, XMMWORD [wk(0)]
+ paddd xmm4, XMMWORD [wk(1)]
+ paddd xmm6, xmm2
+ paddd xmm4, xmm2
+ psrld xmm6, SCALEBITS ; xmm6=YEL
+ psrld xmm4, SCALEBITS ; xmm4=YEH
+ packssdw xmm6, xmm4 ; xmm6=YE
+
+ psllw xmm0, BYTE_BIT
+ por xmm6, xmm0 ; xmm6=Y
+ movdqa XMMWORD [rdi], xmm6 ; Save Y
+
+ sub rcx, byte SIZEOF_XMMWORD
+ add rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr
+ add rdi, byte SIZEOF_XMMWORD ; outptr0
+ cmp rcx, byte SIZEOF_XMMWORD
+ jae near .columnloop
+ test rcx, rcx
+ jnz near .column_ld1
+
+ pop rcx ; col
+ pop rsi
+ pop rdi
+
+ add rsi, byte SIZEOF_JSAMPROW ; input_buf
+ add rdi, byte SIZEOF_JSAMPROW
+ dec rax ; num_rows
+ jg near .rowloop
+
+.return:
+ pop rbx
+ uncollect_args 5
+ mov rsp, rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- original rbp
+ pop rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jchuff-sse2.asm b/media/libjpeg/simd/x86_64/jchuff-sse2.asm
new file mode 100644
index 0000000000..9ea6df946e
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jchuff-sse2.asm
@@ -0,0 +1,583 @@
+;
+; jchuff-sse2.asm - Huffman entropy encoding (64-bit SSE2)
+;
+; Copyright (C) 2009-2011, 2014-2016, 2019, 2021, D. R. Commander.
+; Copyright (C) 2015, Matthieu Darbois.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains an SSE2 implementation for Huffman coding of one block.
+; The following code is based on jchuff.c; see jchuff.c for more details.
+
+%include "jsimdext.inc"
+
+struc working_state
+.next_output_byte: resp 1 ; => next byte to write in buffer
+.free_in_buffer: resp 1 ; # of byte spaces remaining in buffer
+.cur.put_buffer.simd resq 1 ; current bit accumulation buffer
+.cur.free_bits resd 1 ; # of bits available in it
+.cur.last_dc_val resd 4 ; last DC coef for each component
+.cinfo: resp 1 ; dump_buffer needs access to this
+endstruc
+
+struc c_derived_tbl
+.ehufco: resd 256 ; code for each symbol
+.ehufsi: resb 256 ; length of code for each symbol
+; If no code has been allocated for a symbol S, ehufsi[S] contains 0
+endstruc
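+
+; (For reference, this must stay in sync with the C declaration of
+; c_derived_tbl in jchuff.h, roughly:
+;
+;   typedef struct {
+;     unsigned int ehufco[256];   /* code for each symbol */
+;     char ehufsi[256];           /* length of code for each symbol */
+;   } c_derived_tbl;
+; )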
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_huff_encode_one_block)
+
+EXTN(jconst_huff_encode_one_block):
+
+jpeg_mask_bits dd 0x0000, 0x0001, 0x0003, 0x0007
+ dd 0x000f, 0x001f, 0x003f, 0x007f
+ dd 0x00ff, 0x01ff, 0x03ff, 0x07ff
+ dd 0x0fff, 0x1fff, 0x3fff, 0x7fff
+
+ alignz 32
+
+times 1 << 14 db 15
+times 1 << 13 db 14
+times 1 << 12 db 13
+times 1 << 11 db 12
+times 1 << 10 db 11
+times 1 << 9 db 10
+times 1 << 8 db 9
+times 1 << 7 db 8
+times 1 << 6 db 7
+times 1 << 5 db 6
+times 1 << 4 db 5
+times 1 << 3 db 4
+times 1 << 2 db 3
+times 1 << 1 db 2
+times 1 << 0 db 1
+times 1 db 0
+jpeg_nbits_table:
+times 1 db 0
+times 1 << 0 db 1
+times 1 << 1 db 2
+times 1 << 2 db 3
+times 1 << 3 db 4
+times 1 << 4 db 5
+times 1 << 5 db 6
+times 1 << 6 db 7
+times 1 << 7 db 8
+times 1 << 8 db 9
+times 1 << 9 db 10
+times 1 << 10 db 11
+times 1 << 11 db 12
+times 1 << 12 db 13
+times 1 << 13 db 14
+times 1 << 14 db 15
+times 1 << 15 db 16
+
+ alignz 32
+
+%define NBITS(x) nbits_base + x
+%define MASK_BITS(x) NBITS((x) * 4) + (jpeg_mask_bits - jpeg_nbits_table)
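+
+; (The byte table above is mirrored around the jpeg_nbits_table label so
+; that NBITS() also works for the negative inputs produced by the
+; coefficient pre-pass, which decrements negative coefficients by one:
+; e.g. coefficient -3 is stored as -4, and byte [nbits_base - 4] falls in
+; the "times 1 << 1 db 2" run above the label, giving the correct 2-bit
+; category.  jpeg_mask_bits[n] = (1 << n) - 1; MASK_BITS() addresses it
+; relative to nbits_base, the only table base kept in a register.)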
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+
+; Shorthand used to describe SIMD operations:
+; wN: xmmN treated as eight signed 16-bit values
+; wN[i]: perform the same operation on all eight signed 16-bit values, i=0..7
+; bN: xmmN treated as 16 unsigned 8-bit values
+; bN[i]: perform the same operation on all 16 unsigned 8-bit values, i=0..15
+; Contents of SIMD registers are shown in memory order.
+
+; Fill the bit buffer to capacity with the leading bits from code, then output
+; the bit buffer and put the remaining bits from code into the bit buffer.
+;
+; Usage:
+; code - contains the bits to shift into the bit buffer (LSB-aligned)
+; %1 - the label to which to jump when the macro completes
+; %2 (optional) - extra instructions to execute after nbits has been set
+;
+; Upon completion, free_bits will be set to the number of remaining bits from
+; code, and put_buffer will contain those remaining bits. temp and code will
+; be clobbered.
+;
+; This macro encodes any 0xFF bytes as 0xFF 0x00, as does the EMIT_BYTE()
+; macro in jchuff.c.
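+;
+; In rough C (illustrative only; emit_with_ff_stuffing() is a
+; hypothetical helper standing in for the inline byte loop below):
+;
+;   nbits += free_bits;                  /* free_bits <= 0 on entry */
+;   uint64_t out = (put_buffer << nbits) | ((unsigned)code >> -free_bits);
+;   emit_with_ff_stuffing(buffer, __builtin_bswap64(out));
+;   put_buffer = code;                   /* low -free_bits bits remain */
+;   free_bits += 64;  buffer += 8;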
+
+%macro EMIT_QWORD 1-2
+ add nbitsb, free_bitsb ; nbits += free_bits;
+ neg free_bitsb ; free_bits = -free_bits;
+ mov tempd, code ; temp = code;
+ shl put_buffer, nbitsb ; put_buffer <<= nbits;
+ mov nbitsb, free_bitsb ; nbits = free_bits;
+ neg free_bitsb ; free_bits = -free_bits;
+ shr tempd, nbitsb ; temp >>= nbits;
+ or tempq, put_buffer ; temp |= put_buffer;
+ movq xmm0, tempq ; xmm0.u64 = { temp, 0 };
+ bswap tempq ; temp = htobe64(temp);
+ mov put_buffer, codeq ; put_buffer = code;
+ pcmpeqb xmm0, xmm1 ; b0[i] = (b0[i] == 0xFF ? 0xFF : 0);
+ %2
+ pmovmskb code, xmm0 ; code = 0; code |= ((b0[i] >> 7) << i);
+ mov qword [buffer], tempq ; memcpy(buffer, &temp, 8);
+ ; (speculative; will be overwritten if
+ ; code contains any 0xFF bytes)
+ add free_bitsb, 64 ; free_bits += 64;
+ add bufferp, 8 ; buffer += 8;
+ test code, code ; if (code == 0) /* No 0xFF bytes */
+ jz %1 ; return;
+ ; Execute the equivalent of the EMIT_BYTE() macro in jchuff.c for all 8
+ ; bytes in the qword.
+ cmp tempb, 0xFF ; Set CF if temp[0] < 0xFF
+ mov byte [buffer-7], 0 ; buffer[-7] = 0;
+ sbb bufferp, 6 ; buffer -= (6 + (temp[0] < 0xFF ? 1 : 0));
+ mov byte [buffer], temph ; buffer[0] = temp[1];
+ cmp temph, 0xFF ; Set CF if temp[1] < 0xFF
+ mov byte [buffer+1], 0 ; buffer[1] = 0;
+ sbb bufferp, -2 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
+ shr tempq, 16 ; temp >>= 16;
+ mov byte [buffer], tempb ; buffer[0] = temp[0];
+ cmp tempb, 0xFF ; Set CF if temp[0] < 0xFF
+ mov byte [buffer+1], 0 ; buffer[1] = 0;
+ sbb bufferp, -2 ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
+ mov byte [buffer], temph ; buffer[0] = temp[1];
+ cmp temph, 0xFF ; Set CF if temp[1] < 0xFF
+ mov byte [buffer+1], 0 ; buffer[1] = 0;
+ sbb bufferp, -2 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
+ shr tempq, 16 ; temp >>= 16;
+ mov byte [buffer], tempb ; buffer[0] = temp[0];
+ cmp tempb, 0xFF ; Set CF if temp[0] < 0xFF
+ mov byte [buffer+1], 0 ; buffer[1] = 0;
+ sbb bufferp, -2 ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
+ mov byte [buffer], temph ; buffer[0] = temp[1];
+ cmp temph, 0xFF ; Set CF if temp[1] < 0xFF
+ mov byte [buffer+1], 0 ; buffer[1] = 0;
+ sbb bufferp, -2 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
+ shr tempd, 16 ; temp >>= 16;
+ mov byte [buffer], tempb ; buffer[0] = temp[0];
+ cmp tempb, 0xFF ; Set CF if temp[0] < 0xFF
+ mov byte [buffer+1], 0 ; buffer[1] = 0;
+ sbb bufferp, -2 ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
+ mov byte [buffer], temph ; buffer[0] = temp[1];
+ cmp temph, 0xFF ; Set CF if temp[1] < 0xFF
+ mov byte [buffer+1], 0 ; buffer[1] = 0;
+ sbb bufferp, -2 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
+ jmp %1 ; return;
+%endmacro
+
+;
+; Encode a single block's worth of coefficients.
+;
+; GLOBAL(JOCTET *)
+; jsimd_huff_encode_one_block_sse2(working_state *state, JOCTET *buffer,
+; JCOEFPTR block, int last_dc_val,
+; c_derived_tbl *dctbl, c_derived_tbl *actbl)
+;
+; NOTES:
+; When shuffling data, we try to avoid pinsrw as much as possible, since it is
+; slow on many CPUs. Its reciprocal throughput (issue latency) is 1 even on
+; modern CPUs, so chains of pinsrw instructions (even with different outputs)
+; can limit performance. pinsrw is a VectorPath instruction on AMD K8 and
+; requires 2 µops (with memory operand) on Intel. In either case, only one
+; pinsrw instruction can be decoded per cycle (and nothing else if they are
+; back-to-back), so out-of-order execution cannot be used to work around long
+; pinsrw chains (though for Sandy Bridge and later, this may be less of a
+; problem if the code runs from the µop cache.)
+;
+; We use tzcnt instead of bsf without checking for support. The instruction is
+; executed as bsf on CPUs that don't support tzcnt (encoding is equivalent to
+; rep bsf.) The destination (first) operand of bsf (and tzcnt on some CPUs) is
+; an input dependency (although the behavior is not formally defined, Intel
+; CPUs usually leave the destination unmodified if the source is zero.) This
+; can prevent out-of-order execution, so we clear the destination before
+; invoking tzcnt.
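+;
+; The resulting scan loop (.BLOOP below) is, in rough C (names follow
+; the register %defines):
+;
+;   while (index) {
+;     nbits = __builtin_ctzll(index) + 1;  /* tzcnt; +1 skips the hit */
+;     t += nbits;  index >>= nbits;
+;     /* ... emit the Huffman code for the run/size of *t ... */
+;   }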
+;
+; Initial register allocation
+; rax - buffer
+; rbx - temp
+; rcx - nbits
+; rdx - block --> free_bits
+; rsi - nbits_base
+; rdi - t
+; rbp - code
+; r8 - dctbl --> code_temp
+; r9 - actbl
+; r10 - state
+; r11 - index
+; r12 - put_buffer
+
+%define buffer rax
+%ifdef WIN64
+%define bufferp rax
+%else
+%define bufferp raxp
+%endif
+%define tempq rbx
+%define tempd ebx
+%define tempb bl
+%define temph bh
+%define nbitsq rcx
+%define nbits ecx
+%define nbitsb cl
+%define block rdx
+%define nbits_base rsi
+%define t rdi
+%define td edi
+%define codeq rbp
+%define code ebp
+%define dctbl r8
+%define actbl r9
+%define state r10
+%define index r11
+%define indexd r11d
+%define put_buffer r12
+%define put_bufferd r12d
+
+; Step 1: Re-arrange input data according to jpeg_natural_order
+; xx 01 02 03 04 05 06 07 xx 01 08 16 09 02 03 10
+; 08 09 10 11 12 13 14 15 17 24 32 25 18 11 04 05
+; 16 17 18 19 20 21 22 23 12 19 26 33 40 48 41 34
+; 24 25 26 27 28 29 30 31 ==> 27 20 13 06 07 14 21 28
+; 32 33 34 35 36 37 38 39 35 42 49 56 57 50 43 36
+; 40 41 42 43 44 45 46 47 29 22 15 23 30 37 44 51
+; 48 49 50 51 52 53 54 55 58 59 52 45 38 31 39 46
+; 56 57 58 59 60 61 62 63 53 60 61 54 47 55 62 63
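+;
+; (In the body below, the ;A: .. ;H: comment prefixes tag eight
+; independent shuffle chains, one per group of eight reordered
+; coefficients, and ;Z: tags the interleaved scalar code stream, so each
+; software-pipelined sequence can be read per stream.)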
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_huff_encode_one_block_sse2)
+
+EXTN(jsimd_huff_encode_one_block_sse2):
+
+%ifdef WIN64
+
+; rcx = working_state *state
+; rdx = JOCTET *buffer
+; r8 = JCOEFPTR block
+; r9 = int last_dc_val
+; [rax+48] = c_derived_tbl *dctbl
+; [rax+56] = c_derived_tbl *actbl
+
+ ;X: X = code stream
+ mov buffer, rdx
+ mov block, r8
+ movups xmm3, XMMWORD [block + 0 * SIZEOF_WORD] ;D: w3 = xx 01 02 03 04 05 06 07
+ push rbx
+ push rbp
+ movdqa xmm0, xmm3 ;A: w0 = xx 01 02 03 04 05 06 07
+ push rsi
+ push rdi
+ push r12
+ movups xmm1, XMMWORD [block + 8 * SIZEOF_WORD] ;B: w1 = 08 09 10 11 12 13 14 15
+ mov state, rcx
+ movsx code, word [block] ;Z: code = block[0];
+ pxor xmm4, xmm4 ;A: w4[i] = 0;
+ sub code, r9d ;Z: code -= last_dc_val;
+ mov dctbl, POINTER [rsp+6*8+4*8]
+ mov actbl, POINTER [rsp+6*8+5*8]
+ punpckldq xmm0, xmm1 ;A: w0 = xx 01 08 09 02 03 10 11
+ lea nbits_base, [rel jpeg_nbits_table]
+ add rsp, -DCTSIZE2 * SIZEOF_WORD
+ mov t, rsp
+
+%else
+
+; rdi = working_state *state
+; rsi = JOCTET *buffer
+; rdx = JCOEFPTR block
+; rcx = int last_dc_val
+; r8 = c_derived_tbl *dctbl
+; r9 = c_derived_tbl *actbl
+
+ ;X: X = code stream
+ movups xmm3, XMMWORD [block + 0 * SIZEOF_WORD] ;D: w3 = xx 01 02 03 04 05 06 07
+ push rbx
+ push rbp
+ movdqa xmm0, xmm3 ;A: w0 = xx 01 02 03 04 05 06 07
+ push r12
+ mov state, rdi
+ mov buffer, rsi
+ movups xmm1, XMMWORD [block + 8 * SIZEOF_WORD] ;B: w1 = 08 09 10 11 12 13 14 15
+ movsx codeq, word [block] ;Z: code = block[0];
+ lea nbits_base, [rel jpeg_nbits_table]
+ pxor xmm4, xmm4 ;A: w4[i] = 0;
+ sub codeq, rcx ;Z: code -= last_dc_val;
+ punpckldq xmm0, xmm1 ;A: w0 = xx 01 08 09 02 03 10 11
+ lea t, [rsp - DCTSIZE2 * SIZEOF_WORD] ; use red zone for t_
+
+%endif
+
+ pshuflw xmm0, xmm0, 11001001b ;A: w0 = 01 08 xx 09 02 03 10 11
+ pinsrw xmm0, word [block + 16 * SIZEOF_WORD], 2 ;A: w0 = 01 08 16 09 02 03 10 11
+ punpckhdq xmm3, xmm1 ;D: w3 = 04 05 12 13 06 07 14 15
+ punpcklqdq xmm1, xmm3 ;B: w1 = 08 09 10 11 04 05 12 13
+ pinsrw xmm0, word [block + 17 * SIZEOF_WORD], 7 ;A: w0 = 01 08 16 09 02 03 10 17
+ ;A: (Row 0, offset 1)
+ pcmpgtw xmm4, xmm0 ;A: w4[i] = (w0[i] < 0 ? -1 : 0);
+ paddw xmm0, xmm4 ;A: w0[i] += w4[i];
+ movaps XMMWORD [t + 0 * SIZEOF_WORD], xmm0 ;A: t[i] = w0[i];
+
+ movq xmm2, qword [block + 24 * SIZEOF_WORD] ;B: w2 = 24 25 26 27 -- -- -- --
+ pshuflw xmm2, xmm2, 11011000b ;B: w2 = 24 26 25 27 -- -- -- --
+ pslldq xmm1, 1 * SIZEOF_WORD ;B: w1 = -- 08 09 10 11 04 05 12
+ movups xmm5, XMMWORD [block + 48 * SIZEOF_WORD] ;H: w5 = 48 49 50 51 52 53 54 55
+ movsd xmm1, xmm2 ;B: w1 = 24 26 25 27 11 04 05 12
+ punpcklqdq xmm2, xmm5 ;C: w2 = 24 26 25 27 48 49 50 51
+ pinsrw xmm1, word [block + 32 * SIZEOF_WORD], 1 ;B: w1 = 24 32 25 27 11 04 05 12
+ pxor xmm4, xmm4 ;A: w4[i] = 0;
+ psrldq xmm3, 2 * SIZEOF_WORD ;D: w3 = 12 13 06 07 14 15 -- --
+ pcmpeqw xmm0, xmm4 ;A: w0[i] = (w0[i] == 0 ? -1 : 0);
+ pinsrw xmm1, word [block + 18 * SIZEOF_WORD], 3 ;B: w1 = 24 32 25 18 11 04 05 12
+ ; (Row 1, offset 1)
+ pcmpgtw xmm4, xmm1 ;B: w4[i] = (w1[i] < 0 ? -1 : 0);
+ paddw xmm1, xmm4 ;B: w1[i] += w4[i];
+ movaps XMMWORD [t + 8 * SIZEOF_WORD], xmm1 ;B: t[i+8] = w1[i];
+ pxor xmm4, xmm4 ;B: w4[i] = 0;
+ pcmpeqw xmm1, xmm4 ;B: w1[i] = (w1[i] == 0 ? -1 : 0);
+
+ packsswb xmm0, xmm1 ;AB: b0[i] = w0[i], b0[i+8] = w1[i]
+ ; w/ signed saturation
+
+ pinsrw xmm3, word [block + 20 * SIZEOF_WORD], 0 ;D: w3 = 20 13 06 07 14 15 -- --
+ pinsrw xmm3, word [block + 21 * SIZEOF_WORD], 5 ;D: w3 = 20 13 06 07 14 21 -- --
+ pinsrw xmm3, word [block + 28 * SIZEOF_WORD], 6 ;D: w3 = 20 13 06 07 14 21 28 --
+ pinsrw xmm3, word [block + 35 * SIZEOF_WORD], 7 ;D: w3 = 20 13 06 07 14 21 28 35
+ ; (Row 3, offset 1)
+ pcmpgtw xmm4, xmm3 ;D: w4[i] = (w3[i] < 0 ? -1 : 0);
+ paddw xmm3, xmm4 ;D: w3[i] += w4[i];
+ movaps XMMWORD [t + 24 * SIZEOF_WORD], xmm3 ;D: t[i+24] = w3[i];
+ pxor xmm4, xmm4 ;D: w4[i] = 0;
+ pcmpeqw xmm3, xmm4 ;D: w3[i] = (w3[i] == 0 ? -1 : 0);
+
+ pinsrw xmm2, word [block + 19 * SIZEOF_WORD], 0 ;C: w2 = 19 26 25 27 48 49 50 51
+ cmp code, 1 << 31 ;Z: Set CF if code < 0x80000000,
+ ;Z: i.e. if code is positive
+ pinsrw xmm2, word [block + 33 * SIZEOF_WORD], 2 ;C: w2 = 19 26 33 27 48 49 50 51
+ pinsrw xmm2, word [block + 40 * SIZEOF_WORD], 3 ;C: w2 = 19 26 33 40 48 49 50 51
+ adc code, -1 ;Z: code += -1 + (code >= 0 ? 1 : 0);
+ pinsrw xmm2, word [block + 41 * SIZEOF_WORD], 5 ;C: w2 = 19 26 33 40 48 41 50 51
+ pinsrw xmm2, word [block + 34 * SIZEOF_WORD], 6 ;C: w2 = 19 26 33 40 48 41 34 51
+ movsxd codeq, code ;Z: sign extend code
+ pinsrw xmm2, word [block + 27 * SIZEOF_WORD], 7 ;C: w2 = 19 26 33 40 48 41 34 27
+ ; (Row 2, offset 1)
+ pcmpgtw xmm4, xmm2 ;C: w4[i] = (w2[i] < 0 ? -1 : 0);
+ paddw xmm2, xmm4 ;C: w2[i] += w4[i];
+ movaps XMMWORD [t + 16 * SIZEOF_WORD], xmm2 ;C: t[i+16] = w2[i];
+ pxor xmm4, xmm4 ;C: w4[i] = 0;
+ pcmpeqw xmm2, xmm4 ;C: w2[i] = (w2[i] == 0 ? -1 : 0);
+
+ packsswb xmm2, xmm3 ;CD: b2[i] = w2[i], b2[i+8] = w3[i]
+ ; w/ signed saturation
+
+ movzx nbitsq, byte [NBITS(codeq)] ;Z: nbits = JPEG_NBITS(code);
+ movdqa xmm3, xmm5 ;H: w3 = 48 49 50 51 52 53 54 55
+ pmovmskb tempd, xmm2 ;Z: temp = 0; temp |= ((b2[i] >> 7) << i);
+ pmovmskb put_bufferd, xmm0 ;Z: put_buffer = 0; put_buffer |= ((b0[i] >> 7) << i);
+ movups xmm0, XMMWORD [block + 56 * SIZEOF_WORD] ;H: w0 = 56 57 58 59 60 61 62 63
+ punpckhdq xmm3, xmm0 ;H: w3 = 52 53 60 61 54 55 62 63
+ shl tempd, 16 ;Z: temp <<= 16;
+ psrldq xmm3, 1 * SIZEOF_WORD ;H: w3 = 53 60 61 54 55 62 63 --
+ pxor xmm2, xmm2 ;H: w2[i] = 0;
+ or put_bufferd, tempd ;Z: put_buffer |= temp;
+ pshuflw xmm3, xmm3, 00111001b ;H: w3 = 60 61 54 53 55 62 63 --
+ movq xmm1, qword [block + 44 * SIZEOF_WORD] ;G: w1 = 44 45 46 47 -- -- -- --
+ unpcklps xmm5, xmm0 ;E: w5 = 48 49 56 57 50 51 58 59
+ pxor xmm0, xmm0 ;H: w0[i] = 0;
+ pinsrw xmm3, word [block + 47 * SIZEOF_WORD], 3 ;H: w3 = 60 61 54 47 55 62 63 --
+ ; (Row 7, offset 1)
+ pcmpgtw xmm2, xmm3 ;H: w2[i] = (w3[i] < 0 ? -1 : 0);
+ paddw xmm3, xmm2 ;H: w3[i] += w2[i];
+ movaps XMMWORD [t + 56 * SIZEOF_WORD], xmm3 ;H: t[i+56] = w3[i];
+ movq xmm4, qword [block + 36 * SIZEOF_WORD] ;G: w4 = 36 37 38 39 -- -- -- --
+ pcmpeqw xmm3, xmm0 ;H: w3[i] = (w3[i] == 0 ? -1 : 0);
+ punpckldq xmm4, xmm1 ;G: w4 = 36 37 44 45 38 39 46 47
+ mov tempd, [dctbl + c_derived_tbl.ehufco + nbitsq * 4]
+ ;Z: temp = dctbl->ehufco[nbits];
+ movdqa xmm1, xmm4 ;F: w1 = 36 37 44 45 38 39 46 47
+ psrldq xmm4, 1 * SIZEOF_WORD ;G: w4 = 37 44 45 38 39 46 47 --
+ shufpd xmm1, xmm5, 10b ;F: w1 = 36 37 44 45 50 51 58 59
+ and code, dword [MASK_BITS(nbitsq)] ;Z: code &= (1 << nbits) - 1;
+ pshufhw xmm4, xmm4, 11010011b ;G: w4 = 37 44 45 38 -- 39 46 --
+ pslldq xmm1, 1 * SIZEOF_WORD ;F: w1 = -- 36 37 44 45 50 51 58
+ shl tempq, nbitsb ;Z: temp <<= nbits;
+ pinsrw xmm4, word [block + 59 * SIZEOF_WORD], 0 ;G: w4 = 59 44 45 38 -- 39 46 --
+ pshufd xmm1, xmm1, 11011000b ;F: w1 = -- 36 45 50 37 44 51 58
+ pinsrw xmm4, word [block + 52 * SIZEOF_WORD], 1 ;G: w4 = 59 52 45 38 -- 39 46 --
+ or code, tempd ;Z: code |= temp;
+ movlps xmm1, qword [block + 20 * SIZEOF_WORD] ;F: w1 = 20 21 22 23 37 44 51 58
+ pinsrw xmm4, word [block + 31 * SIZEOF_WORD], 4 ;G: w4 = 59 52 45 38 31 39 46 --
+ pshuflw xmm1, xmm1, 01110010b ;F: w1 = 22 20 23 21 37 44 51 58
+ pinsrw xmm4, word [block + 53 * SIZEOF_WORD], 7 ;G: w4 = 59 52 45 38 31 39 46 53
+ ; (Row 6, offset 1)
+ pxor xmm2, xmm2 ;G: w2[i] = 0;
+ pcmpgtw xmm0, xmm4 ;G: w0[i] = (w4[i] < 0 ? -1 : 0);
+ pinsrw xmm1, word [block + 15 * SIZEOF_WORD], 1 ;F: w1 = 22 15 23 21 37 44 51 58
+ paddw xmm4, xmm0 ;G: w4[i] += w0[i];
+ movaps XMMWORD [t + 48 * SIZEOF_WORD], xmm4 ;G: t[48+i] = w4[i];
+ pinsrw xmm1, word [block + 30 * SIZEOF_WORD], 3 ;F: w1 = 22 15 23 30 37 44 51 58
+ ; (Row 5, offset 1)
+ pcmpeqw xmm4, xmm2 ;G: w4[i] = (w4[i] == 0 ? -1 : 0);
+ pinsrw xmm5, word [block + 42 * SIZEOF_WORD], 0 ;E: w5 = 42 49 56 57 50 51 58 59
+
+ packsswb xmm4, xmm3 ;GH: b4[i] = w4[i], b4[i+8] = w3[i]
+ ; w/ signed saturation
+
+ pxor xmm0, xmm0 ;F: w0[i] = 0;
+ pinsrw xmm5, word [block + 43 * SIZEOF_WORD], 5 ;E: w5 = 42 49 56 57 50 43 58 59
+ pcmpgtw xmm2, xmm1 ;F: w2[i] = (w1[i] < 0 ? -1 : 0);
+ pmovmskb tempd, xmm4 ;Z: temp = 0; temp |= ((b4[i] >> 7) << i);
+ pinsrw xmm5, word [block + 36 * SIZEOF_WORD], 6 ;E: w5 = 42 49 56 57 50 43 36 59
+ paddw xmm1, xmm2 ;F: w1[i] += w2[i];
+ movaps XMMWORD [t + 40 * SIZEOF_WORD], xmm1 ;F: t[40+i] = w1[i];
+ pinsrw xmm5, word [block + 29 * SIZEOF_WORD], 7 ;E: w5 = 42 49 56 57 50 43 36 29
+ ; (Row 4, offset 1)
+%undef block
+%define free_bitsq rdx
+%define free_bitsd edx
+%define free_bitsb dl
+ pcmpeqw xmm1, xmm0 ;F: w1[i] = (w1[i] == 0 ? -1 : 0);
+ shl tempq, 48 ;Z: temp <<= 48;
+ pxor xmm2, xmm2 ;E: w2[i] = 0;
+ pcmpgtw xmm0, xmm5 ;E: w0[i] = (w5[i] < 0 ? -1 : 0);
+ paddw xmm5, xmm0 ;E: w5[i] += w0[i];
+ or tempq, put_buffer ;Z: temp |= put_buffer;
+ movaps XMMWORD [t + 32 * SIZEOF_WORD], xmm5 ;E: t[32+i] = w5[i];
+ lea t, [dword t - 2] ;Z: t = &t[-1];
+ pcmpeqw xmm5, xmm2 ;E: w5[i] = (w5[i] == 0 ? -1 : 0);
+
+ packsswb xmm5, xmm1 ;EF: b5[i] = w5[i], b5[i+8] = w1[i]
+ ; w/ signed saturation
+
+ add nbitsb, byte [dctbl + c_derived_tbl.ehufsi + nbitsq]
+ ;Z: nbits += dctbl->ehufsi[nbits];
+%undef dctbl
+%define code_temp r8d
+ pmovmskb indexd, xmm5 ;Z: index = 0; index |= ((b5[i] >> 7) << i);
+ mov free_bitsd, [state+working_state.cur.free_bits]
+ ;Z: free_bits = state->cur.free_bits;
+ pcmpeqw xmm1, xmm1 ;Z: b1[i] = 0xFF;
+ shl index, 32 ;Z: index <<= 32;
+ mov put_buffer, [state+working_state.cur.put_buffer.simd]
+ ;Z: put_buffer = state->cur.put_buffer.simd;
+ or index, tempq ;Z: index |= temp;
+ not index ;Z: index = ~index;
+ sub free_bitsb, nbitsb ;Z: if ((free_bits -= nbits) >= 0)
+ jnl .ENTRY_SKIP_EMIT_CODE ;Z: goto .ENTRY_SKIP_EMIT_CODE;
+ align 16
+.EMIT_CODE: ;Z: .EMIT_CODE:
+ EMIT_QWORD .BLOOP_COND ;Z: insert code, flush buffer, goto .BLOOP_COND
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ align 16
+.BRLOOP: ; do {
+ lea code_temp, [nbitsq - 16] ; code_temp = nbits - 16;
+ movzx nbits, byte [actbl + c_derived_tbl.ehufsi + 0xf0]
+ ; nbits = actbl->ehufsi[0xf0];
+ mov code, [actbl + c_derived_tbl.ehufco + 0xf0 * 4]
+ ; code = actbl->ehufco[0xf0];
+ sub free_bitsb, nbitsb ; if ((free_bits -= nbits) <= 0)
+ jle .EMIT_BRLOOP_CODE ; goto .EMIT_BRLOOP_CODE;
+ shl put_buffer, nbitsb ; put_buffer <<= nbits;
+ mov nbits, code_temp ; nbits = code_temp;
+ or put_buffer, codeq ; put_buffer |= code;
+ cmp nbits, 16 ; if (nbits <= 16)
+ jle .ERLOOP ; break;
+ jmp .BRLOOP ; } while (1);
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ align 16
+ times 5 nop
+.ENTRY_SKIP_EMIT_CODE: ; .ENTRY_SKIP_EMIT_CODE:
+ shl put_buffer, nbitsb ; put_buffer <<= nbits;
+ or put_buffer, codeq ; put_buffer |= code;
+.BLOOP_COND: ; .BLOOP_COND:
+ test index, index ; if (index != 0)
+ jz .ELOOP ; {
+.BLOOP: ; do {
+ xor nbits, nbits ; nbits = 0; /* kill tzcnt input dependency */
+ tzcnt nbitsq, index ; nbits = # of trailing 0 bits in index
+ inc nbits ; ++nbits;
+ lea t, [t + nbitsq * 2] ; t = &t[nbits];
+ shr index, nbitsb ; index >>= nbits;
+.EMIT_BRLOOP_CODE_END: ; .EMIT_BRLOOP_CODE_END:
+ cmp nbits, 16 ; if (nbits > 16)
+ jg .BRLOOP ; goto .BRLOOP;
+.ERLOOP: ; .ERLOOP:
+ movsx codeq, word [t] ; code = *t;
+ lea tempd, [nbitsq * 2] ; temp = nbits * 2;
+ movzx nbits, byte [NBITS(codeq)] ; nbits = JPEG_NBITS(code);
+ lea tempd, [nbitsq + tempq * 8] ; temp = temp * 8 + nbits;
+ mov code_temp, [actbl + c_derived_tbl.ehufco + (tempq - 16) * 4]
+ ; code_temp = actbl->ehufco[temp-16];
+ shl code_temp, nbitsb ; code_temp <<= nbits;
+ and code, dword [MASK_BITS(nbitsq)] ; code &= (1 << nbits) - 1;
+ add nbitsb, [actbl + c_derived_tbl.ehufsi + (tempq - 16)]
+ ; nbits += actbl->ehufsi[temp-16];
+ or code, code_temp ; code |= code_temp;
+ sub free_bitsb, nbitsb ; if ((free_bits -= nbits) <= 0)
+ jle .EMIT_CODE ; goto .EMIT_CODE;
+ shl put_buffer, nbitsb ; put_buffer <<= nbits;
+ or put_buffer, codeq ; put_buffer |= code;
+ test index, index
+ jnz .BLOOP ; } while (index != 0);
+.ELOOP: ; } /* index != 0 */
+ sub td, esp ; t -= (WIN64: &t_[0], UNIX: &t_[64]);
+%ifdef WIN64
+ cmp td, (DCTSIZE2 - 2) * SIZEOF_WORD ; if (t != 62)
+%else
+ cmp td, -2 * SIZEOF_WORD ; if (t != -2)
+%endif
+ je .EFN ; {
+ movzx nbits, byte [actbl + c_derived_tbl.ehufsi + 0]
+ ; nbits = actbl->ehufsi[0];
+ mov code, [actbl + c_derived_tbl.ehufco + 0] ; code = actbl->ehufco[0];
+ sub free_bitsb, nbitsb ; if ((free_bits -= nbits) <= 0)
+ jg .EFN_SKIP_EMIT_CODE ; {
+ EMIT_QWORD .EFN ; insert code, flush buffer
+ align 16
+.EFN_SKIP_EMIT_CODE: ; } else {
+ shl put_buffer, nbitsb ; put_buffer <<= nbits;
+ or put_buffer, codeq ; put_buffer |= code;
+.EFN: ; } }
+ mov [state + working_state.cur.put_buffer.simd], put_buffer
+ ; state->cur.put_buffer.simd = put_buffer;
+ mov byte [state + working_state.cur.free_bits], free_bitsb
+ ; state->cur.free_bits = free_bits;
+%ifdef WIN64
+ sub rsp, -DCTSIZE2 * SIZEOF_WORD
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbp
+ pop rbx
+%else
+ pop r12
+ pop rbp
+ pop rbx
+%endif
+ ret
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ align 16
+.EMIT_BRLOOP_CODE:
+ EMIT_QWORD .EMIT_BRLOOP_CODE_END, { mov nbits, code_temp }
+ ; insert code, flush buffer,
+ ; nbits = code_temp, goto .EMIT_BRLOOP_CODE_END
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jcphuff-sse2.asm b/media/libjpeg/simd/x86_64/jcphuff-sse2.asm
new file mode 100644
index 0000000000..01b5c0235f
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jcphuff-sse2.asm
@@ -0,0 +1,639 @@
+;
+; jcphuff-sse2.asm - prepare data for progressive Huffman encoding
+; (64-bit SSE2)
+;
+; Copyright (C) 2016, 2018, Matthieu Darbois
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains an SSE2 implementation of data preparation for progressive
+; Huffman encoding. See jcphuff.c for more details.
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+
+; --------------------------------------------------------------------------
+; Macros to load data for jsimd_encode_mcu_AC_first_prepare_sse2() and
+; jsimd_encode_mcu_AC_refine_prepare_sse2()
+
+%macro LOAD16 0
+ pxor N0, N0
+ pxor N1, N1
+
+ mov T0d, INT [LUT + 0*SIZEOF_INT]
+ mov T1d, INT [LUT + 8*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 0
+ pinsrw X1, word [BLOCK + T1 * 2], 0
+
+ mov T0d, INT [LUT + 1*SIZEOF_INT]
+ mov T1d, INT [LUT + 9*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 1
+ pinsrw X1, word [BLOCK + T1 * 2], 1
+
+ mov T0d, INT [LUT + 2*SIZEOF_INT]
+ mov T1d, INT [LUT + 10*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 2
+ pinsrw X1, word [BLOCK + T1 * 2], 2
+
+ mov T0d, INT [LUT + 3*SIZEOF_INT]
+ mov T1d, INT [LUT + 11*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 3
+ pinsrw X1, word [BLOCK + T1 * 2], 3
+
+ mov T0d, INT [LUT + 4*SIZEOF_INT]
+ mov T1d, INT [LUT + 12*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 4
+ pinsrw X1, word [BLOCK + T1 * 2], 4
+
+ mov T0d, INT [LUT + 5*SIZEOF_INT]
+ mov T1d, INT [LUT + 13*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 5
+ pinsrw X1, word [BLOCK + T1 * 2], 5
+
+ mov T0d, INT [LUT + 6*SIZEOF_INT]
+ mov T1d, INT [LUT + 14*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 6
+ pinsrw X1, word [BLOCK + T1 * 2], 6
+
+ mov T0d, INT [LUT + 7*SIZEOF_INT]
+ mov T1d, INT [LUT + 15*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 7
+ pinsrw X1, word [BLOCK + T1 * 2], 7
+%endmacro
+
+%macro LOAD15 0
+ pxor N0, N0
+ pxor N1, N1
+ pxor X1, X1
+
+ mov T0d, INT [LUT + 0*SIZEOF_INT]
+ mov T1d, INT [LUT + 8*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 0
+ pinsrw X1, word [BLOCK + T1 * 2], 0
+
+ mov T0d, INT [LUT + 1*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 1
+
+ mov T0d, INT [LUT + 2*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 2
+
+ mov T0d, INT [LUT + 3*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 3
+
+ mov T0d, INT [LUT + 4*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 4
+
+ mov T0d, INT [LUT + 5*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 5
+
+ mov T0d, INT [LUT + 6*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 6
+
+ mov T0d, INT [LUT + 7*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 7
+
+ cmp LENEND, 2
+ jl %%.ELOAD15
+ mov T1d, INT [LUT + 9*SIZEOF_INT]
+ pinsrw X1, word [BLOCK + T1 * 2], 1
+
+ cmp LENEND, 3
+ jl %%.ELOAD15
+ mov T1d, INT [LUT + 10*SIZEOF_INT]
+ pinsrw X1, word [BLOCK + T1 * 2], 2
+
+ cmp LENEND, 4
+ jl %%.ELOAD15
+ mov T1d, INT [LUT + 11*SIZEOF_INT]
+ pinsrw X1, word [BLOCK + T1 * 2], 3
+
+ cmp LENEND, 5
+ jl %%.ELOAD15
+ mov T1d, INT [LUT + 12*SIZEOF_INT]
+ pinsrw X1, word [BLOCK + T1 * 2], 4
+
+ cmp LENEND, 6
+ jl %%.ELOAD15
+ mov T1d, INT [LUT + 13*SIZEOF_INT]
+ pinsrw X1, word [BLOCK + T1 * 2], 5
+
+ cmp LENEND, 7
+ jl %%.ELOAD15
+ mov T1d, INT [LUT + 14*SIZEOF_INT]
+ pinsrw X1, word [BLOCK + T1 * 2], 6
+%%.ELOAD15:
+%endmacro
+
+%macro LOAD8 0
+ pxor N0, N0
+
+ mov T0d, INT [LUT + 0*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 0
+
+ mov T0d, INT [LUT + 1*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 1
+
+ mov T0d, INT [LUT + 2*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 2
+
+ mov T0d, INT [LUT + 3*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 3
+
+ mov T0d, INT [LUT + 4*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 4
+
+ mov T0d, INT [LUT + 5*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 5
+
+ mov T0d, INT [LUT + 6*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 6
+
+ mov T0d, INT [LUT + 7*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T0 * 2], 7
+%endmacro
+
+%macro LOAD7 0
+ pxor N0, N0
+ pxor X0, X0
+
+ mov T1d, INT [LUT + 0*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T1 * 2], 0
+
+ cmp LENEND, 2
+ jl %%.ELOAD7
+ mov T1d, INT [LUT + 1*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T1 * 2], 1
+
+ cmp LENEND, 3
+ jl %%.ELOAD7
+ mov T1d, INT [LUT + 2*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T1 * 2], 2
+
+ cmp LENEND, 4
+ jl %%.ELOAD7
+ mov T1d, INT [LUT + 3*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T1 * 2], 3
+
+ cmp LENEND, 5
+ jl %%.ELOAD7
+ mov T1d, INT [LUT + 4*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T1 * 2], 4
+
+ cmp LENEND, 6
+ jl %%.ELOAD7
+ mov T1d, INT [LUT + 5*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T1 * 2], 5
+
+ cmp LENEND, 7
+ jl %%.ELOAD7
+ mov T1d, INT [LUT + 6*SIZEOF_INT]
+ pinsrw X0, word [BLOCK + T1 * 2], 6
+%%.ELOAD7:
+%endmacro
+
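+; REDUCE0 collapses the 64 prepared values into the 64-bit "zerobits"
+; bitmap stored through r15, using pcmpeqw/packsswb/pmovmskb and a final
+; NOT; in rough C:
+;
+;   uint64_t zerobits = 0;
+;   for (k = 0; k < DCTSIZE2; k++)
+;     if (values[k] != 0) zerobits |= 1ULL << k;
+;   *zerobits_ptr = zerobits;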
+%macro REDUCE0 0
+ movdqa xmm0, XMMWORD [VALUES + ( 0*2)]
+ movdqa xmm1, XMMWORD [VALUES + ( 8*2)]
+ movdqa xmm2, XMMWORD [VALUES + (16*2)]
+ movdqa xmm3, XMMWORD [VALUES + (24*2)]
+ movdqa xmm4, XMMWORD [VALUES + (32*2)]
+ movdqa xmm5, XMMWORD [VALUES + (40*2)]
+ movdqa xmm6, XMMWORD [VALUES + (48*2)]
+ movdqa xmm7, XMMWORD [VALUES + (56*2)]
+
+ pcmpeqw xmm0, ZERO
+ pcmpeqw xmm1, ZERO
+ pcmpeqw xmm2, ZERO
+ pcmpeqw xmm3, ZERO
+ pcmpeqw xmm4, ZERO
+ pcmpeqw xmm5, ZERO
+ pcmpeqw xmm6, ZERO
+ pcmpeqw xmm7, ZERO
+
+ packsswb xmm0, xmm1
+ packsswb xmm2, xmm3
+ packsswb xmm4, xmm5
+ packsswb xmm6, xmm7
+
+ pmovmskb eax, xmm0
+ pmovmskb ecx, xmm2
+ pmovmskb edx, xmm4
+ pmovmskb esi, xmm6
+
+ shl rcx, 16
+ shl rdx, 32
+ shl rsi, 48
+
+ or rax, rcx
+ or rdx, rsi
+ or rax, rdx
+
+ not rax
+
+ mov MMWORD [r15], rax
+%endmacro
+
+;
+; Prepare data for jsimd_encode_mcu_AC_first().
+;
+; GLOBAL(void)
+; jsimd_encode_mcu_AC_first_prepare_sse2(const JCOEF *block,
+; const int *jpeg_natural_order_start,
+; int Sl, int Al, JCOEF *values,
+; size_t *zerobits)
+;
+; r10 = const JCOEF *block
+; r11 = const int *jpeg_natural_order_start
+; r12 = int Sl
+; r13 = int Al
+; r14 = JCOEF *values
+; r15 = size_t *zerobits
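+;
+; A rough scalar equivalent of the preparation (sketch; mirrors the C
+; code in jcphuff.c, assuming 32-bit int):
+;
+;   for (k = 0; k < Sl; k++) {
+;     int temp = block[jpeg_natural_order_start[k]];
+;     int sign = temp >> 31;              /* -1 if negative, else 0 */
+;     temp = (temp + sign) ^ sign;        /* abs(temp)              */
+;     values[k] = temp >>= Al;            /* point transform        */
+;     values[k + DCTSIZE2] = temp ^ sign; /* sign info for encoder  */
+;   }
+;   /* *zerobits is then built by REDUCE0 above */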
+
+%define ZERO xmm9
+%define X0 xmm0
+%define X1 xmm1
+%define N0 xmm2
+%define N1 xmm3
+%define AL xmm4
+%define K eax
+%define LUT r11
+%define T0 rcx
+%define T0d ecx
+%define T1 rdx
+%define T1d edx
+%define BLOCK r10
+%define VALUES r14
+%define LEN r12d
+%define LENEND r13d
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_encode_mcu_AC_first_prepare_sse2)
+
+EXTN(jsimd_encode_mcu_AC_first_prepare_sse2):
+ push rbp
+ mov rax, rsp ; rax = original rbp
+ sub rsp, byte 4
+ and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [rsp], rax
+ mov rbp, rsp ; rbp = aligned rbp
+ lea rsp, [rbp - 16]
+ collect_args 6
+
+ movdqa XMMWORD [rbp - 16], ZERO
+
+ movd AL, r13d
+ pxor ZERO, ZERO
+ mov K, LEN
+ mov LENEND, LEN
+ and K, -16
+ and LENEND, 7
+ shr K, 4
+ jz .ELOOP16
+.BLOOP16:
+ LOAD16
+ pcmpgtw N0, X0
+ pcmpgtw N1, X1
+ paddw X0, N0
+ paddw X1, N1
+ pxor X0, N0
+ pxor X1, N1
+ psrlw X0, AL
+ psrlw X1, AL
+ pxor N0, X0
+ pxor N1, X1
+ movdqa XMMWORD [VALUES + (0) * 2], X0
+ movdqa XMMWORD [VALUES + (8) * 2], X1
+ movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
+ movdqa XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
+ add VALUES, 16*2
+ add LUT, 16*SIZEOF_INT
+ dec K
+ jnz .BLOOP16
+ test LEN, 15
+ je .PADDING
+.ELOOP16:
+ test LEN, 8
+ jz .TRY7
+ test LEN, 7
+ jz .TRY8
+
+ LOAD15
+ pcmpgtw N0, X0
+ pcmpgtw N1, X1
+ paddw X0, N0
+ paddw X1, N1
+ pxor X0, N0
+ pxor X1, N1
+ psrlw X0, AL
+ psrlw X1, AL
+ pxor N0, X0
+ pxor N1, X1
+ movdqa XMMWORD [VALUES + (0) * 2], X0
+ movdqa XMMWORD [VALUES + (8) * 2], X1
+ movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
+ movdqa XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
+ add VALUES, 16*2
+ jmp .PADDING
+.TRY8:
+ LOAD8
+ pcmpgtw N0, X0
+ paddw X0, N0
+ pxor X0, N0
+ psrlw X0, AL
+ pxor N0, X0
+ movdqa XMMWORD [VALUES + (0) * 2], X0
+ movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
+ add VALUES, 8*2
+ jmp .PADDING
+.TRY7:
+ LOAD7
+ pcmpgtw N0, X0
+ paddw X0, N0
+ pxor X0, N0
+ psrlw X0, AL
+ pxor N0, X0
+ movdqa XMMWORD [VALUES + (0) * 2], X0
+ movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
+ add VALUES, 8*2
+.PADDING:
+ mov K, LEN
+ add K, 7
+ and K, -8
+ shr K, 3
+ sub K, DCTSIZE2/8
+ jz .EPADDING
+ align 16
+.ZEROLOOP:
+ movdqa XMMWORD [VALUES + 0], ZERO
+ add VALUES, 8*2
+ inc K
+ jnz .ZEROLOOP
+.EPADDING:
+ sub VALUES, DCTSIZE2*2
+
+ REDUCE0
+
+ movdqa ZERO, XMMWORD [rbp - 16]
+ uncollect_args 6
+ mov rsp, rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- original rbp
+ pop rbp
+ ret
+
+%undef ZERO
+%undef X0
+%undef X1
+%undef N0
+%undef N1
+%undef AL
+%undef K
+%undef LUT
+%undef T0
+%undef T0d
+%undef T1
+%undef T1d
+%undef BLOCK
+%undef VALUES
+%undef LEN
+%undef LENEND
+
+;
+; Prepare data for jsimd_encode_mcu_AC_refine().
+;
+; GLOBAL(int)
+; jsimd_encode_mcu_AC_refine_prepare_sse2(const JCOEF *block,
+; const int *jpeg_natural_order_start,
+; int Sl, int Al, JCOEF *absvalues,
+; size_t *bits)
+;
+; r10 = const JCOEF *block
+; r11 = const int *jpeg_natural_order_start
+; r12 = int Sl
+; r13 = int Al
+; r14 = JCOEF *values
+; r15 = size_t *bits
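+;
+; Rough scalar sketch (illustrative only; see jcphuff.c for the exact
+; reference implementation):
+;
+;   for (k = 0; k < Sl; k++) {
+;     absvalues[k] = temp = abs(block[jpeg_natural_order_start[k]]) >> Al;
+;     if (temp == 1) EOB = k;   /* index of last shifted value == 1 */
+;   }
+;   bits[0] = zero/nonzero bitmap (REDUCE0);
+;   bits[1] = packed, complemented sign bits;  return EOB;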
+
+%define ZERO xmm9
+%define ONE xmm5
+%define X0 xmm0
+%define X1 xmm1
+%define N0 xmm2
+%define N1 xmm3
+%define AL xmm4
+%define K eax
+%define KK r9d
+%define EOB r8d
+%define SIGN rdi
+%define LUT r11
+%define T0 rcx
+%define T0d ecx
+%define T1 rdx
+%define T1d edx
+%define BLOCK r10
+%define VALUES r14
+%define LEN r12d
+%define LENEND r13d
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_encode_mcu_AC_refine_prepare_sse2)
+
+EXTN(jsimd_encode_mcu_AC_refine_prepare_sse2):
+ push rbp
+ mov rax, rsp ; rax = original rbp
+ sub rsp, byte 4
+ and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [rsp], rax
+ mov rbp, rsp ; rbp = aligned rbp
+ lea rsp, [rbp - 16]
+ collect_args 6
+
+ movdqa XMMWORD [rbp - 16], ZERO
+
+ xor SIGN, SIGN
+ xor EOB, EOB
+ xor KK, KK
+ movd AL, r13d
+ pxor ZERO, ZERO
+ pcmpeqw ONE, ONE
+ psrlw ONE, 15
+ mov K, LEN
+ mov LENEND, LEN
+ and K, -16
+ and LENEND, 7
+ shr K, 4
+ jz .ELOOPR16
+.BLOOPR16:
+ LOAD16
+ pcmpgtw N0, X0
+ pcmpgtw N1, X1
+ paddw X0, N0
+ paddw X1, N1
+ pxor X0, N0
+ pxor X1, N1
+ psrlw X0, AL
+ psrlw X1, AL
+ movdqa XMMWORD [VALUES + (0) * 2], X0
+ movdqa XMMWORD [VALUES + (8) * 2], X1
+ pcmpeqw X0, ONE
+ pcmpeqw X1, ONE
+ packsswb N0, N1
+ packsswb X0, X1
+ pmovmskb T0d, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
+ pmovmskb T1d, X0 ; idx = _mm_movemask_epi8(x1);
+ shr SIGN, 16 ; make room for sizebits
+ shl T0, 48
+ or SIGN, T0
+ bsr T1d, T1d ; idx = 16 - (__builtin_clz(idx)>>1);
+ jz .CONTINUER16 ; if (idx) {
+ mov EOB, KK
+ add EOB, T1d ; EOB = k + idx;
+.CONTINUER16:
+ add VALUES, 16*2
+ add LUT, 16*SIZEOF_INT
+ add KK, 16
+ dec K
+ jnz .BLOOPR16
+ test LEN, 15
+ je .PADDINGR
+.ELOOPR16:
+ test LEN, 8
+ jz .TRYR7
+ test LEN, 7
+ jz .TRYR8
+
+ LOAD15
+ pcmpgtw N0, X0
+ pcmpgtw N1, X1
+ paddw X0, N0
+ paddw X1, N1
+ pxor X0, N0
+ pxor X1, N1
+ psrlw X0, AL
+ psrlw X1, AL
+ movdqa XMMWORD [VALUES + (0) * 2], X0
+ movdqa XMMWORD [VALUES + (8) * 2], X1
+ pcmpeqw X0, ONE
+ pcmpeqw X1, ONE
+ packsswb N0, N1
+ packsswb X0, X1
+ pmovmskb T0d, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
+ pmovmskb T1d, X0 ; idx = _mm_movemask_epi8(x1);
+ shr SIGN, 16 ; make room for sizebits
+ shl T0, 48
+ or SIGN, T0
+ bsr T1d, T1d ; idx = 16 - (__builtin_clz(idx)>>1);
+ jz .CONTINUER15 ; if (idx) {
+ mov EOB, KK
+ add EOB, T1d ; EOB = k + idx;
+.CONTINUER15:
+ add VALUES, 16*2
+ jmp .PADDINGR
+.TRYR8:
+ LOAD8
+
+ pcmpgtw N0, X0
+ paddw X0, N0
+ pxor X0, N0
+ psrlw X0, AL
+ movdqa XMMWORD [VALUES + (0) * 2], X0
+ pcmpeqw X0, ONE
+ packsswb N0, ZERO
+ packsswb X0, ZERO
+ pmovmskb T0d, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
+ pmovmskb T1d, X0 ; idx = _mm_movemask_epi8(x1);
+ shr SIGN, 8 ; make room for sizebits
+ shl T0, 56
+ or SIGN, T0
+ bsr T1d, T1d ; idx = 16 - (__builtin_clz(idx)>>1);
+ jz .CONTINUER8 ; if (idx) {
+ mov EOB, KK
+ add EOB, T1d ; EOB = k + idx;
+.CONTINUER8:
+ add VALUES, 8*2
+ jmp .PADDINGR
+.TRYR7:
+ LOAD7
+
+ pcmpgtw N0, X0
+ paddw X0, N0
+ pxor X0, N0
+ psrlw X0, AL
+ movdqa XMMWORD [VALUES + (0) * 2], X0
+ pcmpeqw X0, ONE
+ packsswb N0, ZERO
+ packsswb X0, ZERO
+ pmovmskb T0d, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
+ pmovmskb T1d, X0 ; idx = _mm_movemask_epi8(x1);
+ shr SIGN, 8 ; make room for sizebits
+ shl T0, 56
+ or SIGN, T0
+ bsr T1d, T1d ; idx = 16 - (__builtin_clz(idx)>>1);
+ jz .CONTINUER7 ; if (idx) {
+ mov EOB, KK
+ add EOB, T1d ; EOB = k + idx;
+.CONTINUER7:
+ add VALUES, 8*2
+.PADDINGR:
+ mov K, LEN
+ add K, 7
+ and K, -8
+ shr K, 3
+ sub K, DCTSIZE2/8
+ jz .EPADDINGR
+ align 16
+.ZEROLOOPR:
+ movdqa XMMWORD [VALUES + 0], ZERO
+ shr SIGN, 8
+ add VALUES, 8*2
+ inc K
+ jnz .ZEROLOOPR
+.EPADDINGR:
+ not SIGN
+ sub VALUES, DCTSIZE2*2
+ mov MMWORD [r15+SIZEOF_MMWORD], SIGN
+
+ REDUCE0
+
+ mov eax, EOB
+ movdqa ZERO, XMMWORD [rbp - 16]
+ uncollect_args 6
+ mov rsp, rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- original rbp
+ pop rbp
+ ret
+
+%undef ZERO
+%undef ONE
+%undef X0
+%undef X1
+%undef N0
+%undef N1
+%undef AL
+%undef K
+%undef KK
+%undef EOB
+%undef SIGN
+%undef LUT
+%undef T0
+%undef T0d
+%undef T1
+%undef T1d
+%undef BLOCK
+%undef VALUES
+%undef LEN
+%undef LENEND
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jcsample-avx2.asm b/media/libjpeg/simd/x86_64/jcsample-avx2.asm
new file mode 100644
index 0000000000..b32527aebe
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jcsample-avx2.asm
@@ -0,0 +1,367 @@
+;
+; jcsample.asm - downsampling (64-bit AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+;
+; Downsample pixel values of a single component.
+; This version handles the common case of 2:1 horizontal and 1:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v1_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
+; JDIMENSION v_samp_factor,
+; JDIMENSION width_in_blocks, JSAMPARRAY input_data,
+; JSAMPARRAY output_data);
+;
+
+; r10d = JDIMENSION image_width
+; r11 = int max_v_samp_factor
+; r12d = JDIMENSION v_samp_factor
+; r13d = JDIMENSION width_in_blocks
+; r14 = JSAMPARRAY input_data
+; r15 = JSAMPARRAY output_data
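+;
+; Rough scalar equivalent of the vector loop below (cf. h2v1_downsample()
+; in jcsample.c; "bias" alternates 0,1 so the halving rounds without a
+; systematic drift, and expand_right_edge has already padded each row):
+;
+;   for (outcol = 0; outcol < output_cols; outcol++) {
+;     *outptr++ = (JSAMPLE)((inptr[0] + inptr[1] + bias) >> 1);
+;     bias ^= 1;
+;     inptr += 2;
+;   }
+;
+; The vector code below produces 32 output samples per iteration.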
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v1_downsample_avx2)
+
+EXTN(jsimd_h2v1_downsample_avx2):
+ push rbp
+ mov rax, rsp
+ mov rbp, rsp
+ collect_args 6
+
+ mov ecx, r13d
+ shl rcx, 3 ; imul rcx,DCTSIZE (rcx = output_cols)
+ jz near .return
+
+ mov edx, r10d
+
+ ; -- expand_right_edge
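+    ; (replicate each row's last valid sample into the padding columns so
+    ; that the downsampling loop can always read 2 * output_cols bytes;
+    ; this mirrors expand_right_edge() in the C code)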
+
+ push rcx
+ shl rcx, 1 ; output_cols * 2
+ sub rcx, rdx
+ jle short .expand_end
+
+ mov rax, r11
+ test rax, rax
+ jle short .expand_end
+
+ cld
+ mov rsi, r14 ; input_data
+.expandloop:
+ push rax
+ push rcx
+
+ mov rdip, JSAMPROW [rsi]
+ add rdi, rdx
+ mov al, JSAMPLE [rdi-1]
+
+ rep stosb
+
+ pop rcx
+ pop rax
+
+ add rsi, byte SIZEOF_JSAMPROW
+ dec rax
+ jg short .expandloop
+
+.expand_end:
+ pop rcx ; output_cols
+
+ ; -- h2v1_downsample
+
+ mov eax, r12d ; rowctr
+ test eax, eax
+ jle near .return
+
+ mov rdx, 0x00010000 ; bias pattern
+ vmovd xmm7, edx
+ vpshufd xmm7, xmm7, 0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
+ vperm2i128 ymm7, ymm7, ymm7, 0 ; ymm7={xmm7, xmm7}
+ vpcmpeqw ymm6, ymm6, ymm6
+ vpsrlw ymm6, ymm6, BYTE_BIT ; ymm6={0xFF 0x00 0xFF 0x00 ..}
+
+ mov rsi, r14 ; input_data
+ mov rdi, r15 ; output_data
+.rowloop:
+ push rcx
+ push rdi
+ push rsi
+
+ mov rsip, JSAMPROW [rsi] ; inptr
+ mov rdip, JSAMPROW [rdi] ; outptr
+
+ cmp rcx, byte SIZEOF_YMMWORD
+ jae short .columnloop
+
+.columnloop_r24:
+    ; rcx can only be 8, 16, or 24 at this point
+ cmp rcx, 24
+ jne .columnloop_r16
+ vmovdqu ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+ vmovdqu xmm1, XMMWORD [rsi+1*SIZEOF_YMMWORD]
+ mov rcx, SIZEOF_YMMWORD
+ jmp short .downsample
+
+.columnloop_r16:
+ cmp rcx, 16
+ jne .columnloop_r8
+ vmovdqu ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+ vpxor ymm1, ymm1, ymm1
+ mov rcx, SIZEOF_YMMWORD
+ jmp short .downsample
+
+.columnloop_r8:
+    vmovdqu xmm0, XMMWORD [rsi+0*SIZEOF_YMMWORD]
+ vpxor ymm1, ymm1, ymm1
+ mov rcx, SIZEOF_YMMWORD
+ jmp short .downsample
+
+.columnloop:
+ vmovdqu ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+ vmovdqu ymm1, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+
+.downsample:
+ vpsrlw ymm2, ymm0, BYTE_BIT
+ vpand ymm0, ymm0, ymm6
+ vpsrlw ymm3, ymm1, BYTE_BIT
+ vpand ymm1, ymm1, ymm6
+
+ vpaddw ymm0, ymm0, ymm2
+ vpaddw ymm1, ymm1, ymm3
+ vpaddw ymm0, ymm0, ymm7
+ vpaddw ymm1, ymm1, ymm7
+ vpsrlw ymm0, ymm0, 1
+ vpsrlw ymm1, ymm1, 1
+
+ vpackuswb ymm0, ymm0, ymm1
+ vpermq ymm0, ymm0, 0xd8
+
+ vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0
+
+ sub rcx, byte SIZEOF_YMMWORD ; outcol
+ add rsi, byte 2*SIZEOF_YMMWORD ; inptr
+ add rdi, byte 1*SIZEOF_YMMWORD ; outptr
+ cmp rcx, byte SIZEOF_YMMWORD
+ jae short .columnloop
+ test rcx, rcx
+ jnz near .columnloop_r24
+
+ pop rsi
+ pop rdi
+ pop rcx
+
+ add rsi, byte SIZEOF_JSAMPROW ; input_data
+ add rdi, byte SIZEOF_JSAMPROW ; output_data
+ dec rax ; rowctr
+ jg near .rowloop
+
+.return:
+ vzeroupper
+ uncollect_args 6
+ pop rbp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Downsample pixel values of a single component.
+; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v2_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
+; JDIMENSION v_samp_factor,
+; JDIMENSION width_in_blocks, JSAMPARRAY input_data,
+; JSAMPARRAY output_data);
+;
+
+; r10d = JDIMENSION image_width
+; r11 = int max_v_samp_factor
+; r12d = JDIMENSION v_samp_factor
+; r13d = JDIMENSION width_in_blocks
+; r14 = JSAMPARRAY input_data
+; r15 = JSAMPARRAY output_data
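+;
+; Rough scalar equivalent (cf. h2v2_downsample() in jcsample.c; the bias
+; alternates 1,2 so the divide-by-4 rounds without a systematic drift):
+;
+;   for (outcol = 0; outcol < output_cols; outcol++) {
+;     *outptr++ = (JSAMPLE)((inptr0[0] + inptr0[1] +
+;                            inptr1[0] + inptr1[1] + bias) >> 2);
+;     bias ^= 3;
+;     inptr0 += 2;  inptr1 += 2;
+;   }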
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v2_downsample_avx2)
+
+EXTN(jsimd_h2v2_downsample_avx2):
+ push rbp
+ mov rax, rsp
+ mov rbp, rsp
+ collect_args 6
+
+ mov ecx, r13d
+ shl rcx, 3 ; imul rcx,DCTSIZE (rcx = output_cols)
+ jz near .return
+
+ mov edx, r10d
+
+ ; -- expand_right_edge
+
+ push rcx
+ shl rcx, 1 ; output_cols * 2
+ sub rcx, rdx
+ jle short .expand_end
+
+ mov rax, r11
+ test rax, rax
+ jle short .expand_end
+
+ cld
+ mov rsi, r14 ; input_data
+.expandloop:
+ push rax
+ push rcx
+
+ mov rdip, JSAMPROW [rsi]
+ add rdi, rdx
+ mov al, JSAMPLE [rdi-1]
+
+ rep stosb
+
+ pop rcx
+ pop rax
+
+ add rsi, byte SIZEOF_JSAMPROW
+ dec rax
+ jg short .expandloop
+
+.expand_end:
+ pop rcx ; output_cols
+
+ ; -- h2v2_downsample
+
+ mov eax, r12d ; rowctr
+ test rax, rax
+ jle near .return
+
+ mov rdx, 0x00020001 ; bias pattern
+ vmovd xmm7, edx
+ vpcmpeqw ymm6, ymm6, ymm6
+ vpshufd xmm7, xmm7, 0x00 ; ymm7={1, 2, 1, 2, 1, 2, 1, 2}
+ vperm2i128 ymm7, ymm7, ymm7, 0
+ vpsrlw ymm6, ymm6, BYTE_BIT ; ymm6={0xFF 0x00 0xFF 0x00 ..}
+
+ mov rsi, r14 ; input_data
+ mov rdi, r15 ; output_data
+.rowloop:
+ push rcx
+ push rdi
+ push rsi
+
+ mov rdxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; inptr0
+ mov rsip, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; inptr1
+ mov rdip, JSAMPROW [rdi] ; outptr
+
+ cmp rcx, byte SIZEOF_YMMWORD
+ jae short .columnloop
+
+.columnloop_r24:
+ cmp rcx, 24
+ jne .columnloop_r16
+ vmovdqu ymm0, YMMWORD [rdx+0*SIZEOF_YMMWORD]
+ vmovdqu ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+ vmovdqu xmm2, XMMWORD [rdx+1*SIZEOF_YMMWORD]
+ vmovdqu xmm3, XMMWORD [rsi+1*SIZEOF_YMMWORD]
+ mov rcx, SIZEOF_YMMWORD
+ jmp short .downsample
+
+.columnloop_r16:
+ cmp rcx, 16
+ jne .columnloop_r8
+ vmovdqu ymm0, YMMWORD [rdx+0*SIZEOF_YMMWORD]
+ vmovdqu ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+ vpxor ymm2, ymm2, ymm2
+ vpxor ymm3, ymm3, ymm3
+ mov rcx, SIZEOF_YMMWORD
+ jmp short .downsample
+
+.columnloop_r8:
+ vmovdqu xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
+ vmovdqu xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+ vpxor ymm2, ymm2, ymm2
+ vpxor ymm3, ymm3, ymm3
+ mov rcx, SIZEOF_YMMWORD
+ jmp short .downsample
+
+.columnloop:
+ vmovdqu ymm0, YMMWORD [rdx+0*SIZEOF_YMMWORD]
+ vmovdqu ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+ vmovdqu ymm2, YMMWORD [rdx+1*SIZEOF_YMMWORD]
+ vmovdqu ymm3, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+
+.downsample:
+ vpand ymm4, ymm0, ymm6
+ vpsrlw ymm0, ymm0, BYTE_BIT
+ vpand ymm5, ymm1, ymm6
+ vpsrlw ymm1, ymm1, BYTE_BIT
+ vpaddw ymm0, ymm0, ymm4
+ vpaddw ymm1, ymm1, ymm5
+
+ vpand ymm4, ymm2, ymm6
+ vpsrlw ymm2, ymm2, BYTE_BIT
+ vpand ymm5, ymm3, ymm6
+ vpsrlw ymm3, ymm3, BYTE_BIT
+ vpaddw ymm2, ymm2, ymm4
+ vpaddw ymm3, ymm3, ymm5
+
+ vpaddw ymm0, ymm0, ymm1
+ vpaddw ymm2, ymm2, ymm3
+ vpaddw ymm0, ymm0, ymm7
+ vpaddw ymm2, ymm2, ymm7
+ vpsrlw ymm0, ymm0, 2
+ vpsrlw ymm2, ymm2, 2
+
+ vpackuswb ymm0, ymm0, ymm2
+ vpermq ymm0, ymm0, 0xd8
+
+ vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0
+
+ sub rcx, byte SIZEOF_YMMWORD ; outcol
+ add rdx, byte 2*SIZEOF_YMMWORD ; inptr0
+ add rsi, byte 2*SIZEOF_YMMWORD ; inptr1
+ add rdi, byte 1*SIZEOF_YMMWORD ; outptr
+ cmp rcx, byte SIZEOF_YMMWORD
+ jae near .columnloop
+ test rcx, rcx
+ jnz near .columnloop_r24
+
+ pop rsi
+ pop rdi
+ pop rcx
+
+ add rsi, byte 2*SIZEOF_JSAMPROW ; input_data
+ add rdi, byte 1*SIZEOF_JSAMPROW ; output_data
+ dec rax ; rowctr
+ jg near .rowloop
+
+.return:
+ vzeroupper
+ uncollect_args 6
+ pop rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jcsample-sse2.asm b/media/libjpeg/simd/x86_64/jcsample-sse2.asm
new file mode 100644
index 0000000000..2fcfe4567a
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jcsample-sse2.asm
@@ -0,0 +1,330 @@
+;
+; jcsample.asm - downsampling (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+;
+; Downsample pixel values of a single component.
+; This version handles the common case of 2:1 horizontal and 1:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v1_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor,
+; JDIMENSION v_samp_factor,
+; JDIMENSION width_in_blocks, JSAMPARRAY input_data,
+; JSAMPARRAY output_data);
+;
+
+; r10d = JDIMENSION image_width
+; r11 = int max_v_samp_factor
+; r12d = JDIMENSION v_samp_factor
+; r13d = JDIMENSION width_in_blocks
+; r14 = JSAMPARRAY input_data
+; r15 = JSAMPARRAY output_data
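+;
+; The XMM counterpart of the AVX2 kernel above: the same averaging with
+; an alternating bias, but each iteration consumes 32 input samples and
+; produces 16 output samples.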
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v1_downsample_sse2)
+
+EXTN(jsimd_h2v1_downsample_sse2):
+ push rbp
+ mov rax, rsp
+ mov rbp, rsp
+ collect_args 6
+
+ mov ecx, r13d
+ shl rcx, 3 ; imul rcx,DCTSIZE (rcx = output_cols)
+ jz near .return
+
+ mov edx, r10d
+
+ ; -- expand_right_edge
+
+ push rcx
+ shl rcx, 1 ; output_cols * 2
+ sub rcx, rdx
+ jle short .expand_end
+
+ mov rax, r11
+ test rax, rax
+ jle short .expand_end
+
+ cld
+ mov rsi, r14 ; input_data
+.expandloop:
+ push rax
+ push rcx
+
+ mov rdip, JSAMPROW [rsi]
+ add rdi, rdx
+ mov al, JSAMPLE [rdi-1]
+
+ rep stosb
+
+ pop rcx
+ pop rax
+
+ add rsi, byte SIZEOF_JSAMPROW
+ dec rax
+ jg short .expandloop
+
+.expand_end:
+ pop rcx ; output_cols
+
+ ; -- h2v1_downsample
+
+ mov eax, r12d ; rowctr
+ test eax, eax
+ jle near .return
+
+ mov rdx, 0x00010000 ; bias pattern
+ movd xmm7, edx
+ pcmpeqw xmm6, xmm6
+ pshufd xmm7, xmm7, 0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
+ psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
+
+ mov rsi, r14 ; input_data
+ mov rdi, r15 ; output_data
+.rowloop:
+ push rcx
+ push rdi
+ push rsi
+
+ mov rsip, JSAMPROW [rsi] ; inptr
+ mov rdip, JSAMPROW [rdi] ; outptr
+
+ cmp rcx, byte SIZEOF_XMMWORD
+ jae short .columnloop
+
+.columnloop_r8:
+ movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+ pxor xmm1, xmm1
+ mov rcx, SIZEOF_XMMWORD
+ jmp short .downsample
+
+.columnloop:
+ movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+ movdqa xmm1, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+
+.downsample:
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+
+ pand xmm0, xmm6
+ psrlw xmm2, BYTE_BIT
+ pand xmm1, xmm6
+ psrlw xmm3, BYTE_BIT
+
+ paddw xmm0, xmm2
+ paddw xmm1, xmm3
+ paddw xmm0, xmm7
+ paddw xmm1, xmm7
+ psrlw xmm0, 1
+ psrlw xmm1, 1
+
+ packuswb xmm0, xmm1
+
+ movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
+
+ sub rcx, byte SIZEOF_XMMWORD ; outcol
+ add rsi, byte 2*SIZEOF_XMMWORD ; inptr
+ add rdi, byte 1*SIZEOF_XMMWORD ; outptr
+ cmp rcx, byte SIZEOF_XMMWORD
+ jae short .columnloop
+ test rcx, rcx
+ jnz short .columnloop_r8
+
+ pop rsi
+ pop rdi
+ pop rcx
+
+ add rsi, byte SIZEOF_JSAMPROW ; input_data
+ add rdi, byte SIZEOF_JSAMPROW ; output_data
+ dec rax ; rowctr
+ jg near .rowloop
+
+.return:
+ uncollect_args 6
+ pop rbp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Downsample pixel values of a single component.
+; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v2_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor,
+; JDIMENSION v_samp_factor,
+; JDIMENSION width_in_blocks, JSAMPARRAY input_data,
+; JSAMPARRAY output_data);
+;
+
+; r10d = JDIMENSION image_width
+; r11 = int max_v_samp_factor
+; r12d = JDIMENSION v_samp_factor
+; r13d = JDIMENSION width_in_blocks
+; r14 = JSAMPARRAY input_data
+; r15 = JSAMPARRAY output_data
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v2_downsample_sse2)
+
+EXTN(jsimd_h2v2_downsample_sse2):
+ push rbp
+ mov rax, rsp
+ mov rbp, rsp
+ collect_args 6
+
+ mov ecx, r13d
+ shl rcx, 3 ; imul rcx,DCTSIZE (rcx = output_cols)
+ jz near .return
+
+ mov edx, r10d
+
+ ; -- expand_right_edge
+
+ push rcx
+ shl rcx, 1 ; output_cols * 2
+ sub rcx, rdx
+ jle short .expand_end
+
+ mov rax, r11
+ test rax, rax
+ jle short .expand_end
+
+ cld
+ mov rsi, r14 ; input_data
+.expandloop:
+ push rax
+ push rcx
+
+ mov rdip, JSAMPROW [rsi]
+ add rdi, rdx
+ mov al, JSAMPLE [rdi-1]
+
+ rep stosb
+
+ pop rcx
+ pop rax
+
+ add rsi, byte SIZEOF_JSAMPROW
+ dec rax
+ jg short .expandloop
+
+.expand_end:
+ pop rcx ; output_cols
+
+ ; -- h2v2_downsample
+
+ mov eax, r12d ; rowctr
+ test rax, rax
+ jle near .return
+
+ mov rdx, 0x00020001 ; bias pattern
+ movd xmm7, edx
+ pcmpeqw xmm6, xmm6
+ pshufd xmm7, xmm7, 0x00 ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
+ psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
+
+ mov rsi, r14 ; input_data
+ mov rdi, r15 ; output_data
+.rowloop:
+ push rcx
+ push rdi
+ push rsi
+
+ mov rdxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; inptr0
+ mov rsip, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; inptr1
+ mov rdip, JSAMPROW [rdi] ; outptr
+
+ cmp rcx, byte SIZEOF_XMMWORD
+ jae short .columnloop
+
+.columnloop_r8:
+ movdqa xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
+ movdqa xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+ pxor xmm2, xmm2
+ pxor xmm3, xmm3
+ mov rcx, SIZEOF_XMMWORD
+ jmp short .downsample
+
+.columnloop:
+ movdqa xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
+ movdqa xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+ movdqa xmm2, XMMWORD [rdx+1*SIZEOF_XMMWORD]
+ movdqa xmm3, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+
+.downsample:
+ movdqa xmm4, xmm0
+ movdqa xmm5, xmm1
+ pand xmm0, xmm6
+ psrlw xmm4, BYTE_BIT
+ pand xmm1, xmm6
+ psrlw xmm5, BYTE_BIT
+ paddw xmm0, xmm4
+ paddw xmm1, xmm5
+
+ movdqa xmm4, xmm2
+ movdqa xmm5, xmm3
+ pand xmm2, xmm6
+ psrlw xmm4, BYTE_BIT
+ pand xmm3, xmm6
+ psrlw xmm5, BYTE_BIT
+ paddw xmm2, xmm4
+ paddw xmm3, xmm5
+
+ paddw xmm0, xmm1
+ paddw xmm2, xmm3
+ paddw xmm0, xmm7
+ paddw xmm2, xmm7
+ psrlw xmm0, 2
+ psrlw xmm2, 2
+
+ packuswb xmm0, xmm2
+
+ movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
+
+ sub rcx, byte SIZEOF_XMMWORD ; outcol
+ add rdx, byte 2*SIZEOF_XMMWORD ; inptr0
+ add rsi, byte 2*SIZEOF_XMMWORD ; inptr1
+ add rdi, byte 1*SIZEOF_XMMWORD ; outptr
+ cmp rcx, byte SIZEOF_XMMWORD
+ jae near .columnloop
+ test rcx, rcx
+ jnz near .columnloop_r8
+
+ pop rsi
+ pop rdi
+ pop rcx
+
+ add rsi, byte 2*SIZEOF_JSAMPROW ; input_data
+ add rdi, byte 1*SIZEOF_JSAMPROW ; output_data
+ dec rax ; rowctr
+ jg near .rowloop
+
+.return:
+ uncollect_args 6
+ pop rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jdcolext-avx2.asm b/media/libjpeg/simd/x86_64/jdcolext-avx2.asm
new file mode 100644
index 0000000000..2370fda642
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jdcolext-avx2.asm
@@ -0,0 +1,496 @@
+;
+; jdcolext.asm - colorspace conversion (64-bit AVX2)
+;
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2012, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_ycc_rgb_convert_avx2(JDIMENSION out_width, JSAMPIMAGE input_buf,
+; JDIMENSION input_row, JSAMPARRAY output_buf,
+; int num_rows)
+;
+
+; r10d = JDIMENSION out_width
+; r11 = JSAMPIMAGE input_buf
+; r12d = JDIMENSION input_row
+; r13 = JSAMPARRAY output_buf
+; r14d = int num_rows
+
+%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD ; ymmword wk[WK_NUM]
+%define WK_NUM 2
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_ycc_rgb_convert_avx2)
+
+EXTN(jsimd_ycc_rgb_convert_avx2):
+ push rbp
+ mov rax, rsp ; rax = original rbp
+ sub rsp, byte 4
+ and rsp, byte (-SIZEOF_YMMWORD) ; align to 256 bits
+ mov [rsp], rax
+ mov rbp, rsp ; rbp = aligned rbp
+ lea rsp, [wk(0)]
+ collect_args 5
+ push rbx
+
+ mov ecx, r10d ; num_cols
+ test rcx, rcx
+ jz near .return
+
+ push rcx
+
+ mov rdi, r11
+ mov ecx, r12d
+ mov rsip, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
+ mov rbxp, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
+ mov rdxp, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
+ lea rsi, [rsi+rcx*SIZEOF_JSAMPROW]
+ lea rbx, [rbx+rcx*SIZEOF_JSAMPROW]
+ lea rdx, [rdx+rcx*SIZEOF_JSAMPROW]
+
+ pop rcx
+
+ mov rdi, r13
+ mov eax, r14d
+ test rax, rax
+ jle near .return
+.rowloop:
+ push rax
+ push rdi
+ push rdx
+ push rbx
+ push rsi
+ push rcx ; col
+
+ mov rsip, JSAMPROW [rsi] ; inptr0
+ mov rbxp, JSAMPROW [rbx] ; inptr1
+ mov rdxp, JSAMPROW [rdx] ; inptr2
+ mov rdip, JSAMPROW [rdi] ; outptr
+.columnloop:
+
+ vmovdqu ymm5, YMMWORD [rbx] ; ymm5=Cb(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+ vmovdqu ymm1, YMMWORD [rdx] ; ymm1=Cr(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+
+ vpcmpeqw ymm0, ymm0, ymm0
+ vpcmpeqw ymm7, ymm7, ymm7
+ vpsrlw ymm0, ymm0, BYTE_BIT ; ymm0={0xFF 0x00 0xFF 0x00 ..}
+ vpsllw ymm7, ymm7, 7 ; ymm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+ vpand ymm4, ymm0, ymm5 ; ymm4=Cb(02468ACEGIKMOQSU)=CbE
+ vpsrlw ymm5, ymm5, BYTE_BIT ; ymm5=Cb(13579BDFHJLNPRTV)=CbO
+ vpand ymm0, ymm0, ymm1 ; ymm0=Cr(02468ACEGIKMOQSU)=CrE
+ vpsrlw ymm1, ymm1, BYTE_BIT ; ymm1=Cr(13579BDFHJLNPRTV)=CrO
+
+ vpaddw ymm2, ymm4, ymm7
+ vpaddw ymm3, ymm5, ymm7
+ vpaddw ymm6, ymm0, ymm7
+ vpaddw ymm7, ymm1, ymm7
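+        ; (0xFF80 is -128 as a signed word, so the four adds above
+        ; recenter the unsigned Cb/Cr samples from [0, 255] to the
+        ; signed range [-128, 127] before the fixed-point arithmetic)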
+
+ ; (Original)
+ ; R = Y + 1.40200 * Cr
+ ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+ ; B = Y + 1.77200 * Cb
+ ;
+ ; (This implementation)
+ ; R = Y + 0.40200 * Cr + Cr
+ ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+ ; B = Y - 0.22800 * Cb + Cb + Cb
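+        ;
+        ; The FIX() constants are fractions scaled by 2^16, and vpmulhw
+        ; keeps only the high 16 bits of each 32-bit product, so a term
+        ; such as CbE * -FIX(0.22800) is evaluated per word lane roughly as
+        ;
+        ;   hi = ((2 * cb) * -F_0_228) >> 16;  /* vpmulhw on doubled input */
+        ;   t  = (hi + 1) >> 1;                /* add PW_ONE, vpsraw by 1 */
+        ;
+        ; Doubling the input first preserves one extra bit through the
+        ; truncating multiply; the final (hi + 1) >> 1 rounds the halving.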
+
+ vpaddw ymm4, ymm2, ymm2 ; ymm4=2*CbE
+ vpaddw ymm5, ymm3, ymm3 ; ymm5=2*CbO
+ vpaddw ymm0, ymm6, ymm6 ; ymm0=2*CrE
+ vpaddw ymm1, ymm7, ymm7 ; ymm1=2*CrO
+
+ vpmulhw ymm4, ymm4, [rel PW_MF0228] ; ymm4=(2*CbE * -FIX(0.22800))
+ vpmulhw ymm5, ymm5, [rel PW_MF0228] ; ymm5=(2*CbO * -FIX(0.22800))
+ vpmulhw ymm0, ymm0, [rel PW_F0402] ; ymm0=(2*CrE * FIX(0.40200))
+ vpmulhw ymm1, ymm1, [rel PW_F0402] ; ymm1=(2*CrO * FIX(0.40200))
+
+ vpaddw ymm4, ymm4, [rel PW_ONE]
+ vpaddw ymm5, ymm5, [rel PW_ONE]
+ vpsraw ymm4, ymm4, 1 ; ymm4=(CbE * -FIX(0.22800))
+ vpsraw ymm5, ymm5, 1 ; ymm5=(CbO * -FIX(0.22800))
+ vpaddw ymm0, ymm0, [rel PW_ONE]
+ vpaddw ymm1, ymm1, [rel PW_ONE]
+ vpsraw ymm0, ymm0, 1 ; ymm0=(CrE * FIX(0.40200))
+ vpsraw ymm1, ymm1, 1 ; ymm1=(CrO * FIX(0.40200))
+
+ vpaddw ymm4, ymm4, ymm2
+ vpaddw ymm5, ymm5, ymm3
+ vpaddw ymm4, ymm4, ymm2 ; ymm4=(CbE * FIX(1.77200))=(B-Y)E
+ vpaddw ymm5, ymm5, ymm3 ; ymm5=(CbO * FIX(1.77200))=(B-Y)O
+ vpaddw ymm0, ymm0, ymm6 ; ymm0=(CrE * FIX(1.40200))=(R-Y)E
+ vpaddw ymm1, ymm1, ymm7 ; ymm1=(CrO * FIX(1.40200))=(R-Y)O
+
+ vmovdqa YMMWORD [wk(0)], ymm4 ; wk(0)=(B-Y)E
+ vmovdqa YMMWORD [wk(1)], ymm5 ; wk(1)=(B-Y)O
+
+ vpunpckhwd ymm4, ymm2, ymm6
+ vpunpcklwd ymm2, ymm2, ymm6
+ vpmaddwd ymm2, ymm2, [rel PW_MF0344_F0285]
+ vpmaddwd ymm4, ymm4, [rel PW_MF0344_F0285]
+ vpunpckhwd ymm5, ymm3, ymm7
+ vpunpcklwd ymm3, ymm3, ymm7
+ vpmaddwd ymm3, ymm3, [rel PW_MF0344_F0285]
+ vpmaddwd ymm5, ymm5, [rel PW_MF0344_F0285]
+
+ vpaddd ymm2, ymm2, [rel PD_ONEHALF]
+ vpaddd ymm4, ymm4, [rel PD_ONEHALF]
+ vpsrad ymm2, ymm2, SCALEBITS
+ vpsrad ymm4, ymm4, SCALEBITS
+ vpaddd ymm3, ymm3, [rel PD_ONEHALF]
+ vpaddd ymm5, ymm5, [rel PD_ONEHALF]
+ vpsrad ymm3, ymm3, SCALEBITS
+ vpsrad ymm5, ymm5, SCALEBITS
+
+ vpackssdw ymm2, ymm2, ymm4 ; ymm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
+ vpackssdw ymm3, ymm3, ymm5 ; ymm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
+ vpsubw ymm2, ymm2, ymm6 ; ymm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
+ vpsubw ymm3, ymm3, ymm7 ; ymm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
+
+ vmovdqu ymm5, YMMWORD [rsi] ; ymm5=Y(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+
+ vpcmpeqw ymm4, ymm4, ymm4
+ vpsrlw ymm4, ymm4, BYTE_BIT ; ymm4={0xFF 0x00 0xFF 0x00 ..}
+ vpand ymm4, ymm4, ymm5 ; ymm4=Y(02468ACEGIKMOQSU)=YE
+ vpsrlw ymm5, ymm5, BYTE_BIT ; ymm5=Y(13579BDFHJLNPRTV)=YO
+
+ vpaddw ymm0, ymm0, ymm4 ; ymm0=((R-Y)E+YE)=RE=R(02468ACEGIKMOQSU)
+ vpaddw ymm1, ymm1, ymm5 ; ymm1=((R-Y)O+YO)=RO=R(13579BDFHJLNPRTV)
+ vpackuswb ymm0, ymm0, ymm0 ; ymm0=R(02468ACE********GIKMOQSU********)
+ vpackuswb ymm1, ymm1, ymm1 ; ymm1=R(13579BDF********HJLNPRTV********)
+
+ vpaddw ymm2, ymm2, ymm4 ; ymm2=((G-Y)E+YE)=GE=G(02468ACEGIKMOQSU)
+ vpaddw ymm3, ymm3, ymm5 ; ymm3=((G-Y)O+YO)=GO=G(13579BDFHJLNPRTV)
+ vpackuswb ymm2, ymm2, ymm2 ; ymm2=G(02468ACE********GIKMOQSU********)
+ vpackuswb ymm3, ymm3, ymm3 ; ymm3=G(13579BDF********HJLNPRTV********)
+
+ vpaddw ymm4, ymm4, YMMWORD [wk(0)] ; ymm4=(YE+(B-Y)E)=BE=B(02468ACEGIKMOQSU)
+ vpaddw ymm5, ymm5, YMMWORD [wk(1)] ; ymm5=(YO+(B-Y)O)=BO=B(13579BDFHJLNPRTV)
+ vpackuswb ymm4, ymm4, ymm4 ; ymm4=B(02468ACE********GIKMOQSU********)
+ vpackuswb ymm5, ymm5, ymm5 ; ymm5=B(13579BDF********HJLNPRTV********)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+ ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **)
+ ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **)
+ ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **)
+ ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **)
+ ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **)
+ ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **)
+ ; ymmG=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **)
+ ; ymmH=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **)
+
+ vpunpcklbw ymmA, ymmA, ymmC ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
+ ; 0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U)
+ vpunpcklbw ymmE, ymmE, ymmB ; ymmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F
+ ; 2G 0H 2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V)
+ vpunpcklbw ymmD, ymmD, ymmF ; ymmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F
+ ; 1H 2H 1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V)
+
+ vpsrldq ymmH, ymmA, 2 ; ymmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E 0G 1G
+ ; 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U -- --)
+ vpunpckhwd ymmG, ymmA, ymmE ; ymmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F
+ ; 0O 1O 2O 0P 0Q 1Q 2Q 0R 0S 1S 2S 0T 0U 1U 2U 0V)
+ vpunpcklwd ymmA, ymmA, ymmE ; ymmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07
+ ; 0G 1G 2G 0H 0I 1I 2I 0J 0K 1K 2K 0L 0M 1M 2M 0N)
+
+ vpsrldq ymmE, ymmE, 2 ; ymmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F 2G 0H
+ ; 2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V -- --)
+
+ vpsrldq ymmB, ymmD, 2 ; ymmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F 1H 2H
+ ; 1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V -- --)
+ vpunpckhwd ymmC, ymmD, ymmH ; ymmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F 0G 1G
+ ; 1P 2P 0Q 1Q 1R 2R 0S 1S 1T 2T 0U 1U 1V 2V -- --)
+ vpunpcklwd ymmD, ymmD, ymmH ; ymmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18
+ ; 1H 2H 0I 1I 1J 2J 0K 1K 1L 2L 0M 1M 1N 2N 0O 1O)
+
+ vpunpckhwd ymmF, ymmE, ymmB ; ymmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F 2G 0H 1H 2H
+ ; 2Q 0R 1R 2R 2S 0T 1T 2T 2U 0V 1V 2V -- -- -- --)
+ vpunpcklwd ymmE, ymmE, ymmB ; ymmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29
+ ; 2I 0J 1J 2J 2K 0L 1L 2L 2M 0N 1N 2N 2O 0P 1P 2P)
+
+ vpshufd ymmH, ymmA, 0x4E ; ymmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03
+ ; 0K 1K 2K 0L 0M 1M 2M 0N 0G 1G 2G 0H 0I 1I 2I 0J)
+ vpunpckldq ymmA, ymmA, ymmD ; ymmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14
+ ; 0G 1G 2G 0H 1H 2H 0I 1I 0I 1I 2I 0J 1J 2J 0K 1K)
+ vpunpckhdq ymmD, ymmD, ymmE ; ymmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29
+ ; 1L 2L 0M 1M 2M 0N 1N 2N 1N 2N 0O 1O 2O 0P 1P 2P)
+ vpunpckldq ymmE, ymmE, ymmH ; ymmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07
+ ; 2I 0J 1J 2J 0K 1K 2K 0L 2K 0L 1L 2L 0M 1M 2M 0N)
+
+ vpshufd ymmH, ymmG, 0x4E ; ymmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B
+ ; 0S 1S 2S 0T 0U 1U 2U 0V 0O 1O 2O 0P 0Q 1Q 2Q 0R)
+ vpunpckldq ymmG, ymmG, ymmC ; ymmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C
+ ; 0O 1O 2O 0P 1P 2P 0Q 1Q 0Q 1Q 2Q 0R 1R 2R 0S 1S)
+ vpunpckhdq ymmC, ymmC, ymmF ; ymmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F 0G 1G 2G 0H 1H 2H
+ ; 1T 2T 0U 1U 2U 0V 1V 2V 1V 2V -- -- -- -- -- --)
+ vpunpckldq ymmF, ymmF, ymmH ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F
+ ; 2Q 0R 1R 2R 0S 1S 2S 0T 2S 0T 1T 2T 0U 1U 2U 0V)
+
+ vpunpcklqdq ymmH, ymmA, ymmE ; ymmH=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+ ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+ vpunpcklqdq ymmG, ymmD, ymmG ; ymmG=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
+ ; 1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
+ vpunpcklqdq ymmC, ymmF, ymmC ; ymmC=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+ ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+ vperm2i128 ymmA, ymmH, ymmG, 0x20 ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+ ; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ vperm2i128 ymmD, ymmC, ymmH, 0x30 ; ymmD=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+ ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+ vperm2i128 ymmF, ymmG, ymmC, 0x31 ; ymmF=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+ ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+ cmp rcx, byte SIZEOF_YMMWORD
+ jb short .column_st64
+
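+    ; Use non-temporal stores when the output pointer is 32-byte aligned;
+    ; the sfence executed before returning orders them with later stores.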
+ test rdi, SIZEOF_YMMWORD-1
+ jnz short .out1
+ ; --(aligned)-------------------
+ vmovntdq YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+ vmovntdq YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+ vmovntdq YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmF
+ jmp short .out0
+.out1: ; --(unaligned)-----------------
+ vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+ vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+ vmovdqu YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmF
+.out0:
+ add rdi, byte RGB_PIXELSIZE*SIZEOF_YMMWORD ; outptr
+ sub rcx, byte SIZEOF_YMMWORD
+ jz near .nextrow
+
+ add rsi, byte SIZEOF_YMMWORD ; inptr0
+ add rbx, byte SIZEOF_YMMWORD ; inptr1
+ add rdx, byte SIZEOF_YMMWORD ; inptr2
+ jmp near .columnloop
+
+.column_st64:
+    lea rcx, [rcx+rcx*2]            ; imul rcx, RGB_PIXELSIZE
+ cmp rcx, byte 2*SIZEOF_YMMWORD
+ jb short .column_st32
+ vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+ vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+ add rdi, byte 2*SIZEOF_YMMWORD ; outptr
+ vmovdqa ymmA, ymmF
+ sub rcx, byte 2*SIZEOF_YMMWORD
+ jmp short .column_st31
+.column_st32:
+ cmp rcx, byte SIZEOF_YMMWORD
+ jb short .column_st31
+ vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+ add rdi, byte SIZEOF_YMMWORD ; outptr
+ vmovdqa ymmA, ymmD
+ sub rcx, byte SIZEOF_YMMWORD
+ jmp short .column_st31
+.column_st31:
+ cmp rcx, byte SIZEOF_XMMWORD
+ jb short .column_st15
+ vmovdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ add rdi, byte SIZEOF_XMMWORD ; outptr
+ vperm2i128 ymmA, ymmA, ymmA, 1
+ sub rcx, byte SIZEOF_XMMWORD
+.column_st15:
+ ; Store the lower 8 bytes of xmmA to the output when it has enough
+ ; space.
+ cmp rcx, byte SIZEOF_MMWORD
+ jb short .column_st7
+ vmovq XMM_MMWORD [rdi], xmmA
+ add rdi, byte SIZEOF_MMWORD
+ sub rcx, byte SIZEOF_MMWORD
+ vpsrldq xmmA, xmmA, SIZEOF_MMWORD
+.column_st7:
+ ; Store the lower 4 bytes of xmmA to the output when it has enough
+ ; space.
+ cmp rcx, byte SIZEOF_DWORD
+ jb short .column_st3
+ vmovd XMM_DWORD [rdi], xmmA
+ add rdi, byte SIZEOF_DWORD
+ sub rcx, byte SIZEOF_DWORD
+ vpsrldq xmmA, xmmA, SIZEOF_DWORD
+.column_st3:
+ ; Store the lower 2 bytes of rax to the output when it has enough
+ ; space.
+ vmovd eax, xmmA
+ cmp rcx, byte SIZEOF_WORD
+ jb short .column_st1
+ mov word [rdi], ax
+ add rdi, byte SIZEOF_WORD
+ sub rcx, byte SIZEOF_WORD
+ shr rax, 16
+.column_st1:
+ ; Store the lower 1 byte of rax to the output when it has enough
+ ; space.
+ test rcx, rcx
+ jz short .nextrow
+ mov byte [rdi], al
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+ vpcmpeqb ymm6, ymm6, ymm6 ; ymm6=XE=X(02468ACE********GIKMOQSU********)
+ vpcmpeqb ymm7, ymm7, ymm7 ; ymm7=XO=X(13579BDF********HJLNPRTV********)
+%else
+ vpxor ymm6, ymm6, ymm6 ; ymm6=XE=X(02468ACE********GIKMOQSU********)
+ vpxor ymm7, ymm7, ymm7 ; ymm7=XO=X(13579BDF********HJLNPRTV********)
+%endif
+ ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **)
+ ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **)
+ ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **)
+ ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **)
+ ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **)
+ ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **)
+ ; ymmG=(30 32 34 36 38 3A 3C 3E ** 3G 3I 3K 3M 3O 3Q 3S 3U **)
+ ; ymmH=(31 33 35 37 39 3B 3D 3F ** 3H 3J 3L 3N 3P 3R 3T 3V **)
+
+ vpunpcklbw ymmA, ymmA, ymmC ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
+ ; 0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U)
+ vpunpcklbw ymmE, ymmE, ymmG ; ymmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E
+ ; 2G 3G 2I 3I 2K 3K 2M 3M 2O 3O 2Q 3Q 2S 3S 2U 3U)
+ vpunpcklbw ymmB, ymmB, ymmD ; ymmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F
+ ; 0H 1H 0J 1J 0L 1L 0N 1N 0P 1P 0R 1R 0T 1T 0V 1V)
+ vpunpcklbw ymmF, ymmF, ymmH ; ymmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F
+ ; 2H 3H 2J 3J 2L 3L 2N 3N 2P 3P 2R 3R 2T 3T 2V 3V)
+
+ vpunpckhwd ymmC, ymmA, ymmE ; ymmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E
+ ; 0O 1O 2O 3O 0Q 1Q 2Q 3Q 0S 1S 2S 3S 0U 1U 2U 3U)
+ vpunpcklwd ymmA, ymmA, ymmE ; ymmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36
+ ; 0G 1G 2G 3G 0I 1I 2I 3I 0K 1K 2K 3K 0M 1M 2M 3M)
+ vpunpckhwd ymmG, ymmB, ymmF ; ymmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F
+ ; 0P 1P 2P 3P 0R 1R 2R 3R 0T 1T 2T 3T 0V 1V 2V 3V)
+ vpunpcklwd ymmB, ymmB, ymmF ; ymmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37
+ ; 0H 1H 2H 3H 0J 1J 2J 3J 0L 1L 2L 3L 0N 1N 2N 3N)
+
+ vpunpckhdq ymmE, ymmA, ymmB ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+ ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+ vpunpckldq ymmB, ymmA, ymmB ; ymmB=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ ; 0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
+ vpunpckhdq ymmF, ymmC, ymmG ; ymmF=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
+ ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+ vpunpckldq ymmG, ymmC, ymmG ; ymmG=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+ ; 0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
+
+ vperm2i128 ymmA, ymmB, ymmE, 0x20 ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ ; 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+ vperm2i128 ymmD, ymmG, ymmF, 0x20 ; ymmD=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+ ; 0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+ vperm2i128 ymmC, ymmB, ymmE, 0x31 ; ymmC=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
+ ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+ vperm2i128 ymmH, ymmG, ymmF, 0x31 ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
+ ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+ cmp rcx, byte SIZEOF_YMMWORD
+ jb short .column_st64
+
+ test rdi, SIZEOF_YMMWORD-1
+ jnz short .out1
+ ; --(aligned)-------------------
+ vmovntdq YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+ vmovntdq YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+ vmovntdq YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmC
+ vmovntdq YMMWORD [rdi+3*SIZEOF_YMMWORD], ymmH
+ jmp short .out0
+.out1: ; --(unaligned)-----------------
+ vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+ vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+ vmovdqu YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmC
+ vmovdqu YMMWORD [rdi+3*SIZEOF_YMMWORD], ymmH
+.out0:
+ add rdi, RGB_PIXELSIZE*SIZEOF_YMMWORD ; outptr
+ sub rcx, byte SIZEOF_YMMWORD
+ jz near .nextrow
+
+ add rsi, byte SIZEOF_YMMWORD ; inptr0
+ add rbx, byte SIZEOF_YMMWORD ; inptr1
+ add rdx, byte SIZEOF_YMMWORD ; inptr2
+ jmp near .columnloop
+
+.column_st64:
+ cmp rcx, byte SIZEOF_YMMWORD/2
+ jb short .column_st32
+ vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+ vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+ add rdi, byte 2*SIZEOF_YMMWORD ; outptr
+ vmovdqa ymmA, ymmC
+ vmovdqa ymmD, ymmH
+ sub rcx, byte SIZEOF_YMMWORD/2
+.column_st32:
+ cmp rcx, byte SIZEOF_YMMWORD/4
+ jb short .column_st16
+ vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+ add rdi, byte SIZEOF_YMMWORD ; outptr
+ vmovdqa ymmA, ymmD
+ sub rcx, byte SIZEOF_YMMWORD/4
+.column_st16:
+ cmp rcx, byte SIZEOF_YMMWORD/8
+ jb short .column_st15
+ vmovdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ vperm2i128 ymmA, ymmA, ymmA, 1
+ add rdi, byte SIZEOF_XMMWORD ; outptr
+ sub rcx, byte SIZEOF_YMMWORD/8
+.column_st15:
+ ; Store two pixels (8 bytes) of ymmA to the output when it has enough
+ ; space.
+ cmp rcx, byte SIZEOF_YMMWORD/16
+ jb short .column_st7
+ vmovq MMWORD [rdi], xmmA
+ add rdi, byte SIZEOF_YMMWORD/16*4
+ sub rcx, byte SIZEOF_YMMWORD/16
+ vpsrldq xmmA, SIZEOF_YMMWORD/16*4
+.column_st7:
+ ; Store one pixel (4 bytes) of ymmA to the output when it has enough
+ ; space.
+ test rcx, rcx
+ jz short .nextrow
+ vmovd XMM_DWORD [rdi], xmmA
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+.nextrow:
+ pop rcx
+ pop rsi
+ pop rbx
+ pop rdx
+ pop rdi
+ pop rax
+
+ add rsi, byte SIZEOF_JSAMPROW
+ add rbx, byte SIZEOF_JSAMPROW
+ add rdx, byte SIZEOF_JSAMPROW
+ add rdi, byte SIZEOF_JSAMPROW ; output_buf
+ dec rax ; num_rows
+ jg near .rowloop
+
+ sfence ; flush the write buffer
+
+.return:
+ pop rbx
+ vzeroupper
+ uncollect_args 5
+ mov rsp, rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- original rbp
+ pop rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jdcolext-sse2.asm b/media/libjpeg/simd/x86_64/jdcolext-sse2.asm
new file mode 100644
index 0000000000..e07c8d7518
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jdcolext-sse2.asm
@@ -0,0 +1,439 @@
+;
+; jdcolext.asm - colorspace conversion (64-bit SSE2)
+;
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2012, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_ycc_rgb_convert_sse2(JDIMENSION out_width, JSAMPIMAGE input_buf,
+; JDIMENSION input_row, JSAMPARRAY output_buf,
+; int num_rows)
+;
+
+; r10d = JDIMENSION out_width
+; r11 = JSAMPIMAGE input_buf
+; r12d = JDIMENSION input_row
+; r13 = JSAMPARRAY output_buf
+; r14d = int num_rows
+
+%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 2
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_ycc_rgb_convert_sse2)
+
+EXTN(jsimd_ycc_rgb_convert_sse2):
+ push rbp
+ mov rax, rsp ; rax = original rbp
+ sub rsp, byte 4
+ and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [rsp], rax
+ mov rbp, rsp ; rbp = aligned rbp
+ lea rsp, [wk(0)]
+ collect_args 5
+ push rbx
+
+ mov ecx, r10d ; num_cols
+ test rcx, rcx
+ jz near .return
+
+ push rcx
+
+ mov rdi, r11
+ mov ecx, r12d
+ mov rsip, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
+ mov rbxp, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
+ mov rdxp, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
+ lea rsi, [rsi+rcx*SIZEOF_JSAMPROW]
+ lea rbx, [rbx+rcx*SIZEOF_JSAMPROW]
+ lea rdx, [rdx+rcx*SIZEOF_JSAMPROW]
+
+ pop rcx
+
+ mov rdi, r13
+ mov eax, r14d
+ test rax, rax
+ jle near .return
+.rowloop:
+ push rax
+ push rdi
+ push rdx
+ push rbx
+ push rsi
+ push rcx ; col
+
+ mov rsip, JSAMPROW [rsi] ; inptr0
+ mov rbxp, JSAMPROW [rbx] ; inptr1
+ mov rdxp, JSAMPROW [rdx] ; inptr2
+ mov rdip, JSAMPROW [rdi] ; outptr
+.columnloop:
+
+ movdqa xmm5, XMMWORD [rbx] ; xmm5=Cb(0123456789ABCDEF)
+ movdqa xmm1, XMMWORD [rdx] ; xmm1=Cr(0123456789ABCDEF)
+
+ pcmpeqw xmm4, xmm4
+ pcmpeqw xmm7, xmm7
+ psrlw xmm4, BYTE_BIT
+ psllw xmm7, 7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+ movdqa xmm0, xmm4 ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..}
+
+ pand xmm4, xmm5 ; xmm4=Cb(02468ACE)=CbE
+ psrlw xmm5, BYTE_BIT ; xmm5=Cb(13579BDF)=CbO
+ pand xmm0, xmm1 ; xmm0=Cr(02468ACE)=CrE
+ psrlw xmm1, BYTE_BIT ; xmm1=Cr(13579BDF)=CrO
+
+ paddw xmm4, xmm7
+ paddw xmm5, xmm7
+ paddw xmm0, xmm7
+ paddw xmm1, xmm7
+
+ ; (Original)
+ ; R = Y + 1.40200 * Cr
+ ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+ ; B = Y + 1.77200 * Cb
+ ;
+ ; (This implementation)
+ ; R = Y + 0.40200 * Cr + Cr
+ ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+ ; B = Y - 0.22800 * Cb + Cb + Cb
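+        ;
+        ; (Even- and odd-numbered samples travel in separate registers
+        ; throughout -- the E/O suffixes -- and are re-interleaved only
+        ; when the R, G and B bytes are packed into pixels further down.)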
+
+ movdqa xmm2, xmm4 ; xmm2=CbE
+ movdqa xmm3, xmm5 ; xmm3=CbO
+ paddw xmm4, xmm4 ; xmm4=2*CbE
+ paddw xmm5, xmm5 ; xmm5=2*CbO
+ movdqa xmm6, xmm0 ; xmm6=CrE
+ movdqa xmm7, xmm1 ; xmm7=CrO
+ paddw xmm0, xmm0 ; xmm0=2*CrE
+ paddw xmm1, xmm1 ; xmm1=2*CrO
+
+ pmulhw xmm4, [rel PW_MF0228] ; xmm4=(2*CbE * -FIX(0.22800))
+ pmulhw xmm5, [rel PW_MF0228] ; xmm5=(2*CbO * -FIX(0.22800))
+ pmulhw xmm0, [rel PW_F0402] ; xmm0=(2*CrE * FIX(0.40200))
+ pmulhw xmm1, [rel PW_F0402] ; xmm1=(2*CrO * FIX(0.40200))
+
+ paddw xmm4, [rel PW_ONE]
+ paddw xmm5, [rel PW_ONE]
+ psraw xmm4, 1 ; xmm4=(CbE * -FIX(0.22800))
+ psraw xmm5, 1 ; xmm5=(CbO * -FIX(0.22800))
+ paddw xmm0, [rel PW_ONE]
+ paddw xmm1, [rel PW_ONE]
+ psraw xmm0, 1 ; xmm0=(CrE * FIX(0.40200))
+ psraw xmm1, 1 ; xmm1=(CrO * FIX(0.40200))
+
+ paddw xmm4, xmm2
+ paddw xmm5, xmm3
+ paddw xmm4, xmm2 ; xmm4=(CbE * FIX(1.77200))=(B-Y)E
+ paddw xmm5, xmm3 ; xmm5=(CbO * FIX(1.77200))=(B-Y)O
+ paddw xmm0, xmm6 ; xmm0=(CrE * FIX(1.40200))=(R-Y)E
+ paddw xmm1, xmm7 ; xmm1=(CrO * FIX(1.40200))=(R-Y)O
+
+ movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=(B-Y)E
+ movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(B-Y)O
+
+ movdqa xmm4, xmm2
+ movdqa xmm5, xmm3
+ punpcklwd xmm2, xmm6
+ punpckhwd xmm4, xmm6
+ pmaddwd xmm2, [rel PW_MF0344_F0285]
+ pmaddwd xmm4, [rel PW_MF0344_F0285]
+ punpcklwd xmm3, xmm7
+ punpckhwd xmm5, xmm7
+ pmaddwd xmm3, [rel PW_MF0344_F0285]
+ pmaddwd xmm5, [rel PW_MF0344_F0285]
+
+ paddd xmm2, [rel PD_ONEHALF]
+ paddd xmm4, [rel PD_ONEHALF]
+ psrad xmm2, SCALEBITS
+ psrad xmm4, SCALEBITS
+ paddd xmm3, [rel PD_ONEHALF]
+ paddd xmm5, [rel PD_ONEHALF]
+ psrad xmm3, SCALEBITS
+ psrad xmm5, SCALEBITS
+
+ packssdw xmm2, xmm4 ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
+ packssdw xmm3, xmm5 ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
+ psubw xmm2, xmm6 ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
+ psubw xmm3, xmm7 ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
+
+ movdqa xmm5, XMMWORD [rsi] ; xmm5=Y(0123456789ABCDEF)
+
+ pcmpeqw xmm4, xmm4
+ psrlw xmm4, BYTE_BIT ; xmm4={0xFF 0x00 0xFF 0x00 ..}
+ pand xmm4, xmm5 ; xmm4=Y(02468ACE)=YE
+ psrlw xmm5, BYTE_BIT ; xmm5=Y(13579BDF)=YO
+
+ paddw xmm0, xmm4 ; xmm0=((R-Y)E+YE)=RE=R(02468ACE)
+ paddw xmm1, xmm5 ; xmm1=((R-Y)O+YO)=RO=R(13579BDF)
+ packuswb xmm0, xmm0 ; xmm0=R(02468ACE********)
+ packuswb xmm1, xmm1 ; xmm1=R(13579BDF********)
+
+ paddw xmm2, xmm4 ; xmm2=((G-Y)E+YE)=GE=G(02468ACE)
+ paddw xmm3, xmm5 ; xmm3=((G-Y)O+YO)=GO=G(13579BDF)
+ packuswb xmm2, xmm2 ; xmm2=G(02468ACE********)
+ packuswb xmm3, xmm3 ; xmm3=G(13579BDF********)
+
+ paddw xmm4, XMMWORD [wk(0)] ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE)
+ paddw xmm5, XMMWORD [wk(1)] ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF)
+ packuswb xmm4, xmm4 ; xmm4=B(02468ACE********)
+ packuswb xmm5, xmm5 ; xmm5=B(13579BDF********)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+ ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+ ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+ ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+ ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
+
+ punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+ punpcklbw xmmE, xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
+ punpcklbw xmmD, xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
+
+ movdqa xmmG, xmmA
+ movdqa xmmH, xmmA
+ punpcklwd xmmA, xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
+ punpckhwd xmmG, xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
+
+ psrldq xmmH, 2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
+ psrldq xmmE, 2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
+
+ movdqa xmmC, xmmD
+ movdqa xmmB, xmmD
+ punpcklwd xmmD, xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
+ punpckhwd xmmC, xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
+
+ psrldq xmmB, 2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
+
+ movdqa xmmF, xmmE
+ punpcklwd xmmE, xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
+ punpckhwd xmmF, xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
+
+ pshufd xmmH, xmmA, 0x4E ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
+ movdqa xmmB, xmmE
+ punpckldq xmmA, xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
+ punpckldq xmmE, xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
+ punpckhdq xmmD, xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
+
+ pshufd xmmH, xmmG, 0x4E ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
+ movdqa xmmB, xmmF
+ punpckldq xmmG, xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
+ punpckldq xmmF, xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
+ punpckhdq xmmC, xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
+
+ punpcklqdq xmmA, xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+ punpcklqdq xmmD, xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ punpcklqdq xmmF, xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+ cmp rcx, byte SIZEOF_XMMWORD
+ jb short .column_st32
+
+ test rdi, SIZEOF_XMMWORD-1
+ jnz short .out1
+ ; --(aligned)-------------------
+ movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
+ jmp short .out0
+.out1: ; --(unaligned)-----------------
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
+.out0:
+ add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
+ sub rcx, byte SIZEOF_XMMWORD
+ jz near .nextrow
+
+ add rsi, byte SIZEOF_XMMWORD ; inptr0
+ add rbx, byte SIZEOF_XMMWORD ; inptr1
+ add rdx, byte SIZEOF_XMMWORD ; inptr2
+ jmp near .columnloop
+
+.column_st32:
+    lea rcx, [rcx+rcx*2]            ; imul rcx, RGB_PIXELSIZE
+ cmp rcx, byte 2*SIZEOF_XMMWORD
+ jb short .column_st16
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ add rdi, byte 2*SIZEOF_XMMWORD ; outptr
+ movdqa xmmA, xmmF
+ sub rcx, byte 2*SIZEOF_XMMWORD
+ jmp short .column_st15
+.column_st16:
+ cmp rcx, byte SIZEOF_XMMWORD
+ jb short .column_st15
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ add rdi, byte SIZEOF_XMMWORD ; outptr
+ movdqa xmmA, xmmD
+ sub rcx, byte SIZEOF_XMMWORD
+.column_st15:
+ ; Store the lower 8 bytes of xmmA to the output when it has enough
+ ; space.
+ cmp rcx, byte SIZEOF_MMWORD
+ jb short .column_st7
+ movq XMM_MMWORD [rdi], xmmA
+ add rdi, byte SIZEOF_MMWORD
+ sub rcx, byte SIZEOF_MMWORD
+ psrldq xmmA, SIZEOF_MMWORD
+.column_st7:
+ ; Store the lower 4 bytes of xmmA to the output when it has enough
+ ; space.
+ cmp rcx, byte SIZEOF_DWORD
+ jb short .column_st3
+ movd XMM_DWORD [rdi], xmmA
+ add rdi, byte SIZEOF_DWORD
+ sub rcx, byte SIZEOF_DWORD
+ psrldq xmmA, SIZEOF_DWORD
+.column_st3:
+ ; Store the lower 2 bytes of rax to the output when it has enough
+ ; space.
+ movd eax, xmmA
+ cmp rcx, byte SIZEOF_WORD
+ jb short .column_st1
+ mov word [rdi], ax
+ add rdi, byte SIZEOF_WORD
+ sub rcx, byte SIZEOF_WORD
+ shr rax, 16
+.column_st1:
+ ; Store the lower 1 byte of rax to the output when it has enough
+ ; space.
+ test rcx, rcx
+ jz short .nextrow
+ mov byte [rdi], al
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+ pcmpeqb xmm6, xmm6 ; xmm6=XE=X(02468ACE********)
+ pcmpeqb xmm7, xmm7 ; xmm7=XO=X(13579BDF********)
+%else
+ pxor xmm6, xmm6 ; xmm6=XE=X(02468ACE********)
+ pxor xmm7, xmm7 ; xmm7=XO=X(13579BDF********)
+%endif
+ ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+ ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+ ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+ ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
+
+ punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+ punpcklbw xmmE, xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
+ punpcklbw xmmB, xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
+ punpcklbw xmmF, xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
+
+ movdqa xmmC, xmmA
+ punpcklwd xmmA, xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
+ punpckhwd xmmC, xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
+ movdqa xmmG, xmmB
+ punpcklwd xmmB, xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
+ punpckhwd xmmG, xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
+
+ movdqa xmmD, xmmA
+ punpckldq xmmA, xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+ punpckhdq xmmD, xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+ movdqa xmmH, xmmC
+ punpckldq xmmC, xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+ punpckhdq xmmH, xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+ cmp rcx, byte SIZEOF_XMMWORD
+ jb short .column_st32
+
+ test rdi, SIZEOF_XMMWORD-1
+ jnz short .out1
+ ; --(aligned)-------------------
+ movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
+ movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
+ jmp short .out0
+.out1: ; --(unaligned)-----------------
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
+ movdqu XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
+.out0:
+ add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
+ sub rcx, byte SIZEOF_XMMWORD
+ jz near .nextrow
+
+ add rsi, byte SIZEOF_XMMWORD ; inptr0
+ add rbx, byte SIZEOF_XMMWORD ; inptr1
+ add rdx, byte SIZEOF_XMMWORD ; inptr2
+ jmp near .columnloop
+
+.column_st32:
+ cmp rcx, byte SIZEOF_XMMWORD/2
+ jb short .column_st16
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ add rdi, byte 2*SIZEOF_XMMWORD ; outptr
+ movdqa xmmA, xmmC
+ movdqa xmmD, xmmH
+ sub rcx, byte SIZEOF_XMMWORD/2
+.column_st16:
+ cmp rcx, byte SIZEOF_XMMWORD/4
+ jb short .column_st15
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ add rdi, byte SIZEOF_XMMWORD ; outptr
+ movdqa xmmA, xmmD
+ sub rcx, byte SIZEOF_XMMWORD/4
+.column_st15:
+ ; Store two pixels (8 bytes) of xmmA to the output when it has enough
+ ; space.
+ cmp rcx, byte SIZEOF_XMMWORD/8
+ jb short .column_st7
+ movq MMWORD [rdi], xmmA
+ add rdi, byte SIZEOF_XMMWORD/8*4
+ sub rcx, byte SIZEOF_XMMWORD/8
+ psrldq xmmA, SIZEOF_XMMWORD/8*4
+.column_st7:
+ ; Store one pixel (4 bytes) of xmmA to the output when it has enough
+ ; space.
+ test rcx, rcx
+ jz short .nextrow
+ movd XMM_DWORD [rdi], xmmA
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+.nextrow:
+ pop rcx
+ pop rsi
+ pop rbx
+ pop rdx
+ pop rdi
+ pop rax
+
+ add rsi, byte SIZEOF_JSAMPROW
+ add rbx, byte SIZEOF_JSAMPROW
+ add rdx, byte SIZEOF_JSAMPROW
+ add rdi, byte SIZEOF_JSAMPROW ; output_buf
+ dec rax ; num_rows
+ jg near .rowloop
+
+ sfence ; flush the write buffer
+
+.return:
+ pop rbx
+ uncollect_args 5
+ mov rsp, rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- original rbp
+ pop rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jdcolor-avx2.asm b/media/libjpeg/simd/x86_64/jdcolor-avx2.asm
new file mode 100644
index 0000000000..43de9db04d
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jdcolor-avx2.asm
@@ -0,0 +1,118 @@
+;
+; jdcolor.asm - colorspace conversion (64-bit AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS 16
+
+F_0_344 equ 22554 ; FIX(0.34414)
+F_0_714 equ 46802 ; FIX(0.71414)
+F_1_402 equ 91881 ; FIX(1.40200)
+F_1_772 equ 116130 ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1)
+F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414)
+F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
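+
+; (FIX(x) denotes x scaled by 2^16 and rounded to the nearest integer,
+; e.g. FIX(0.34414) = round(0.34414 * 65536) = 22554.)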
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_ycc_rgb_convert_avx2)
+
+EXTN(jconst_ycc_rgb_convert_avx2):
+
+PW_F0402 times 16 dw F_0_402
+PW_MF0228 times 16 dw -F_0_228
+PW_MF0344_F0285 times 8 dw -F_0_344, F_0_285
+PW_ONE times 16 dw 1
+PD_ONEHALF times 8 dd 1 << (SCALEBITS - 1)
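+
+; PD_ONEHALF is the round-to-nearest term added before the final
+; arithmetic right shift by SCALEBITS in the convert routines.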
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+
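+; jdcolext-avx2.asm below is a template: it is included once with the
+; default RGB layout and then re-included with RGB_RED/RGB_GREEN/RGB_BLUE
+; and RGB_PIXELSIZE redefined, generating one jsimd_ycc_ext*_convert_avx2
+; routine per supported extended pixel format.
+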
+%include "jdcolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extrgb_convert_avx2
+%include "jdcolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extrgbx_convert_avx2
+%include "jdcolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extbgr_convert_avx2
+%include "jdcolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extbgrx_convert_avx2
+%include "jdcolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extxbgr_convert_avx2
+%include "jdcolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extxrgb_convert_avx2
+%include "jdcolext-avx2.asm"
diff --git a/media/libjpeg/simd/x86_64/jdcolor-sse2.asm b/media/libjpeg/simd/x86_64/jdcolor-sse2.asm
new file mode 100644
index 0000000000..b3f1fec07e
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jdcolor-sse2.asm
@@ -0,0 +1,117 @@
+;
+; jdcolor.asm - colorspace conversion (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS 16
+
+F_0_344 equ 22554 ; FIX(0.34414)
+F_0_714 equ 46802 ; FIX(0.71414)
+F_1_402 equ 91881 ; FIX(1.40200)
+F_1_772 equ 116130 ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1)
+F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414)
+F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_ycc_rgb_convert_sse2)
+
+EXTN(jconst_ycc_rgb_convert_sse2):
+
+PW_F0402 times 8 dw F_0_402
+PW_MF0228 times 8 dw -F_0_228
+PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285
+PW_ONE times 8 dw 1
+PD_ONEHALF times 4 dd 1 << (SCALEBITS - 1)
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+
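+; jdcolext-sse2.asm below is used as a template in the same fashion as
+; the AVX2 flavor, re-included once per supported extended pixel format.
+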
+%include "jdcolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgb_convert_sse2
+%include "jdcolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgbx_convert_sse2
+%include "jdcolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgr_convert_sse2
+%include "jdcolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgrx_convert_sse2
+%include "jdcolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxbgr_convert_sse2
+%include "jdcolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxrgb_convert_sse2
+%include "jdcolext-sse2.asm"
diff --git a/media/libjpeg/simd/x86_64/jdmerge-avx2.asm b/media/libjpeg/simd/x86_64/jdmerge-avx2.asm
new file mode 100644
index 0000000000..9515a17013
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jdmerge-avx2.asm
@@ -0,0 +1,136 @@
+;
+; jdmerge.asm - merged upsampling/color conversion (64-bit AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; and can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS 16
+
+F_0_344 equ 22554 ; FIX(0.34414)
+F_0_714 equ 46802 ; FIX(0.71414)
+F_1_402 equ 91881 ; FIX(1.40200)
+F_1_772 equ 116130 ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1)
+F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414)
+F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_merged_upsample_avx2)
+
+EXTN(jconst_merged_upsample_avx2):
+
+PW_F0402 times 16 dw F_0_402
+PW_MF0228 times 16 dw -F_0_228
+PW_MF0344_F0285 times 8 dw -F_0_344, F_0_285
+PW_ONE times 16 dw 1
+PD_ONEHALF times 8 dd 1 << (SCALEBITS - 1)
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+
+%include "jdmrgext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+ jsimd_h2v1_extrgb_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+ jsimd_h2v2_extrgb_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+ jsimd_h2v1_extrgbx_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+ jsimd_h2v2_extrgbx_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+ jsimd_h2v1_extbgr_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+ jsimd_h2v2_extbgr_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+ jsimd_h2v1_extbgrx_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+ jsimd_h2v2_extbgrx_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+ jsimd_h2v1_extxbgr_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+ jsimd_h2v2_extxbgr_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+ jsimd_h2v1_extxrgb_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+ jsimd_h2v2_extxrgb_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
diff --git a/media/libjpeg/simd/x86_64/jdmerge-sse2.asm b/media/libjpeg/simd/x86_64/jdmerge-sse2.asm
new file mode 100644
index 0000000000..aedccc20f6
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jdmerge-sse2.asm
@@ -0,0 +1,135 @@
+;
+; jdmerge.asm - merged upsampling/color conversion (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; and can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS 16
+
+F_0_344 equ 22554 ; FIX(0.34414)
+F_0_714 equ 46802 ; FIX(0.71414)
+F_1_402 equ 91881 ; FIX(1.40200)
+F_1_772 equ 116130 ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1)
+F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414)
+F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_merged_upsample_sse2)
+
+EXTN(jconst_merged_upsample_sse2):
+
+PW_F0402 times 8 dw F_0_402
+PW_MF0228 times 8 dw -F_0_228
+PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285
+PW_ONE times 8 dw 1
+PD_ONEHALF times 4 dd 1 << (SCALEBITS - 1)
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+
+%include "jdmrgext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+ jsimd_h2v1_extrgb_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+ jsimd_h2v2_extrgb_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+ jsimd_h2v1_extrgbx_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+ jsimd_h2v2_extrgbx_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+ jsimd_h2v1_extbgr_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+ jsimd_h2v2_extbgr_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+ jsimd_h2v1_extbgrx_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+ jsimd_h2v2_extbgrx_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+ jsimd_h2v1_extxbgr_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+ jsimd_h2v2_extxbgr_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+ jsimd_h2v1_extxrgb_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+ jsimd_h2v2_extxrgb_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"
diff --git a/media/libjpeg/simd/x86_64/jdmrgext-avx2.asm b/media/libjpeg/simd/x86_64/jdmrgext-avx2.asm
new file mode 100644
index 0000000000..8b264b4f03
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jdmrgext-avx2.asm
@@ -0,0 +1,596 @@
+;
+; jdmrgext.asm - merged upsampling/color conversion (64-bit AVX2)
+;
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2012, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; and can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v1_merged_upsample_avx2(JDIMENSION output_width,
+; JSAMPIMAGE input_buf,
+; JDIMENSION in_row_group_ctr,
+; JSAMPARRAY output_buf);
+;
+
+; r10d = JDIMENSION output_width
+; r11 = JSAMPIMAGE input_buf
+; r12d = JDIMENSION in_row_group_ctr
+; r13 = JSAMPARRAY output_buf
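+;
+; (The r10-r13 mapping above is set up by the collect_args macro from
+; jsimdext.inc, which is assumed to copy the first four integer arguments
+; out of the native ABI registers (rdi/rsi/rdx/rcx on SysV, rcx/rdx/r8/r9
+; on Win64) so that the function body is calling-convention-neutral.)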
+
+%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD ; ymmword wk[WK_NUM]
+%define WK_NUM 3
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v1_merged_upsample_avx2)
+
+EXTN(jsimd_h2v1_merged_upsample_avx2):
+ push rbp
+ mov rax, rsp ; rax = original rbp
+ sub rsp, byte 4
+ and rsp, byte (-SIZEOF_YMMWORD) ; align to 256 bits
+ mov [rsp], rax
+ mov rbp, rsp ; rbp = aligned rbp
+ lea rsp, [wk(0)]
+ collect_args 4
+ push rbx
+
+ mov ecx, r10d ; col
+ test rcx, rcx
+ jz near .return
+
+ push rcx
+
+ mov rdi, r11
+ mov ecx, r12d
+ mov rsip, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
+ mov rbxp, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
+ mov rdxp, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
+ mov rdi, r13
+ mov rsip, JSAMPROW [rsi+rcx*SIZEOF_JSAMPROW] ; inptr0
+ mov rbxp, JSAMPROW [rbx+rcx*SIZEOF_JSAMPROW] ; inptr1
+ mov rdxp, JSAMPROW [rdx+rcx*SIZEOF_JSAMPROW] ; inptr2
+ mov rdip, JSAMPROW [rdi] ; outptr
+
+ pop rcx ; col
+
+.columnloop:
+
+ vmovdqu ymm6, YMMWORD [rbx] ; ymm6=Cb(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+ vmovdqu ymm7, YMMWORD [rdx] ; ymm7=Cr(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+
+ vpxor ymm1, ymm1, ymm1 ; ymm1=(all 0's)
+ vpcmpeqw ymm3, ymm3, ymm3
+ vpsllw ymm3, ymm3, 7 ; ymm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+ vpermq ymm6, ymm6, 0xd8 ; ymm6=Cb(01234567GHIJKLMN89ABCDEFOPQRSTUV)
+ vpermq ymm7, ymm7, 0xd8 ; ymm7=Cr(01234567GHIJKLMN89ABCDEFOPQRSTUV)
+ vpunpcklbw ymm4, ymm6, ymm1 ; ymm4=Cb(0123456789ABCDEF)=CbL
+ vpunpckhbw ymm6, ymm6, ymm1 ; ymm6=Cb(GHIJKLMNOPQRSTUV)=CbH
+ vpunpcklbw ymm0, ymm7, ymm1 ; ymm0=Cr(0123456789ABCDEF)=CrL
+ vpunpckhbw ymm7, ymm7, ymm1 ; ymm7=Cr(GHIJKLMNOPQRSTUV)=CrH
+
+ vpaddw ymm5, ymm6, ymm3
+ vpaddw ymm2, ymm4, ymm3
+ vpaddw ymm1, ymm7, ymm3
+ vpaddw ymm3, ymm0, ymm3
+
+ ; (Original)
+ ; R = Y + 1.40200 * Cr
+ ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+ ; B = Y + 1.77200 * Cb
+ ;
+ ; (This implementation)
+ ; R = Y + 0.40200 * Cr + Cr
+ ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+ ; B = Y - 0.22800 * Cb + Cb + Cb
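+ ;
+ ; The rewritten coefficients all have magnitude < 1, so each fits in a
+ ; signed 16-bit word as required by vpmulhw, which keeps only the high
+ ; word of every 32-bit product, i.e. (x * c) >> 16; the dropped integer
+ ; parts (the bare Cr and Cb terms) are restored with plain word adds.
+ ; Multiplying the doubled values 2*Cb/2*Cr gains one bit of precision,
+ ; and the "add PW_ONE, shift right 1" pairs below halve the result with
+ ; rounding.  Sketch for the R term:
+ ;   (2*Cr * F_0_402) >> 16 = (Cr * 26345) >> 15 ~= Cr * 0.80400
+ ;   (the above + 1) >> 1 ~= Cr * 0.40200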
+
+ vpaddw ymm6, ymm5, ymm5 ; ymm6=2*CbH
+ vpaddw ymm4, ymm2, ymm2 ; ymm4=2*CbL
+ vpaddw ymm7, ymm1, ymm1 ; ymm7=2*CrH
+ vpaddw ymm0, ymm3, ymm3 ; ymm0=2*CrL
+
+ vpmulhw ymm6, ymm6, [rel PW_MF0228] ; ymm6=(2*CbH * -FIX(0.22800))
+ vpmulhw ymm4, ymm4, [rel PW_MF0228] ; ymm4=(2*CbL * -FIX(0.22800))
+ vpmulhw ymm7, ymm7, [rel PW_F0402] ; ymm7=(2*CrH * FIX(0.40200))
+ vpmulhw ymm0, ymm0, [rel PW_F0402] ; ymm0=(2*CrL * FIX(0.40200))
+
+ vpaddw ymm6, ymm6, [rel PW_ONE]
+ vpaddw ymm4, ymm4, [rel PW_ONE]
+ vpsraw ymm6, ymm6, 1 ; ymm6=(CbH * -FIX(0.22800))
+ vpsraw ymm4, ymm4, 1 ; ymm4=(CbL * -FIX(0.22800))
+ vpaddw ymm7, ymm7, [rel PW_ONE]
+ vpaddw ymm0, ymm0, [rel PW_ONE]
+ vpsraw ymm7, ymm7, 1 ; ymm7=(CrH * FIX(0.40200))
+ vpsraw ymm0, ymm0, 1 ; ymm0=(CrL * FIX(0.40200))
+
+ vpaddw ymm6, ymm6, ymm5
+ vpaddw ymm4, ymm4, ymm2
+ vpaddw ymm6, ymm6, ymm5 ; ymm6=(CbH * FIX(1.77200))=(B-Y)H
+ vpaddw ymm4, ymm4, ymm2 ; ymm4=(CbL * FIX(1.77200))=(B-Y)L
+ vpaddw ymm7, ymm7, ymm1 ; ymm7=(CrH * FIX(1.40200))=(R-Y)H
+ vpaddw ymm0, ymm0, ymm3 ; ymm0=(CrL * FIX(1.40200))=(R-Y)L
+
+ vmovdqa YMMWORD [wk(0)], ymm6 ; wk(0)=(B-Y)H
+ vmovdqa YMMWORD [wk(1)], ymm7 ; wk(1)=(R-Y)H
+
+ vpunpckhwd ymm6, ymm5, ymm1
+ vpunpcklwd ymm5, ymm5, ymm1
+ vpmaddwd ymm5, ymm5, [rel PW_MF0344_F0285]
+ vpmaddwd ymm6, ymm6, [rel PW_MF0344_F0285]
+ vpunpckhwd ymm7, ymm2, ymm3
+ vpunpcklwd ymm2, ymm2, ymm3
+ vpmaddwd ymm2, ymm2, [rel PW_MF0344_F0285]
+ vpmaddwd ymm7, ymm7, [rel PW_MF0344_F0285]
+
+ vpaddd ymm5, ymm5, [rel PD_ONEHALF]
+ vpaddd ymm6, ymm6, [rel PD_ONEHALF]
+ vpsrad ymm5, ymm5, SCALEBITS
+ vpsrad ymm6, ymm6, SCALEBITS
+ vpaddd ymm2, ymm2, [rel PD_ONEHALF]
+ vpaddd ymm7, ymm7, [rel PD_ONEHALF]
+ vpsrad ymm2, ymm2, SCALEBITS
+ vpsrad ymm7, ymm7, SCALEBITS
+
+ vpackssdw ymm5, ymm5, ymm6 ; ymm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
+ vpackssdw ymm2, ymm2, ymm7 ; ymm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
+ vpsubw ymm5, ymm5, ymm1 ; ymm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
+ vpsubw ymm2, ymm2, ymm3 ; ymm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
+
+ vmovdqa YMMWORD [wk(2)], ymm5 ; wk(2)=(G-Y)H
+
+ mov al, 2 ; Yctr
+ jmp short .Yloop_1st
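+ ; (Chroma is subsampled 2:1 horizontally, so one block of chroma
+ ; difference terms covers two Y vectors: the first pass consumes the
+ ; low halves computed above, the second reloads the high halves saved
+ ; in wk(0)..wk(2).)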
+
+.Yloop_2nd:
+ vmovdqa ymm0, YMMWORD [wk(1)] ; ymm0=(R-Y)H
+ vmovdqa ymm2, YMMWORD [wk(2)] ; ymm2=(G-Y)H
+ vmovdqa ymm4, YMMWORD [wk(0)] ; ymm4=(B-Y)H
+
+.Yloop_1st:
+ vmovdqu ymm7, YMMWORD [rsi] ; ymm7=Y(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+
+ vpcmpeqw ymm6, ymm6, ymm6
+ vpsrlw ymm6, ymm6, BYTE_BIT ; ymm6={0xFF 0x00 0xFF 0x00 ..}
+ vpand ymm6, ymm6, ymm7 ; ymm6=Y(02468ACEGIKMOQSU)=YE
+ vpsrlw ymm7, ymm7, BYTE_BIT ; ymm7=Y(13579BDFHJLNPRTV)=YO
+
+ vmovdqa ymm1, ymm0 ; ymm1=ymm0=(R-Y)(L/H)
+ vmovdqa ymm3, ymm2 ; ymm3=ymm2=(G-Y)(L/H)
+ vmovdqa ymm5, ymm4 ; ymm5=ymm4=(B-Y)(L/H)
+
+ vpaddw ymm0, ymm0, ymm6 ; ymm0=((R-Y)+YE)=RE=R(02468ACEGIKMOQSU)
+ vpaddw ymm1, ymm1, ymm7 ; ymm1=((R-Y)+YO)=RO=R(13579BDFHJLNPRTV)
+ vpackuswb ymm0, ymm0, ymm0 ; ymm0=R(02468ACE********GIKMOQSU********)
+ vpackuswb ymm1, ymm1, ymm1 ; ymm1=R(13579BDF********HJLNPRTV********)
+
+ vpaddw ymm2, ymm2, ymm6 ; ymm2=((G-Y)+YE)=GE=G(02468ACEGIKMOQSU)
+ vpaddw ymm3, ymm3, ymm7 ; ymm3=((G-Y)+YO)=GO=G(13579BDFHJLNPRTV)
+ vpackuswb ymm2, ymm2, ymm2 ; ymm2=G(02468ACE********GIKMOQSU********)
+ vpackuswb ymm3, ymm3, ymm3 ; ymm3=G(13579BDF********HJLNPRTV********)
+
+ vpaddw ymm4, ymm4, ymm6 ; ymm4=((B-Y)+YE)=BE=B(02468ACEGIKMOQSU)
+ vpaddw ymm5, ymm5, ymm7 ; ymm5=((B-Y)+YO)=BO=B(13579BDFHJLNPRTV)
+ vpackuswb ymm4, ymm4, ymm4 ; ymm4=B(02468ACE********GIKMOQSU********)
+ vpackuswb ymm5, ymm5, ymm5 ; ymm5=B(13579BDF********HJLNPRTV********)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+ ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **)
+ ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **)
+ ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **)
+ ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **)
+ ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **)
+ ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **)
+ ; ymmG=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **)
+ ; ymmH=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **)
+
+ vpunpcklbw ymmA, ymmA, ymmC ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
+ ; 0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U)
+ vpunpcklbw ymmE, ymmE, ymmB ; ymmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F
+ ; 2G 0H 2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V)
+ vpunpcklbw ymmD, ymmD, ymmF ; ymmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F
+ ; 1H 2H 1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V)
+
+ vpsrldq ymmH, ymmA, 2 ; ymmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E 0G 1G
+ ; 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U -- --)
+ vpunpckhwd ymmG, ymmA, ymmE ; ymmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F
+ ; 0O 1O 2O 0P 0Q 1Q 2Q 0R 0S 1S 2S 0T 0U 1U 2U 0V)
+ vpunpcklwd ymmA, ymmA, ymmE ; ymmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07
+ ; 0G 1G 2G 0H 0I 1I 2I 0J 0K 1K 2K 0L 0M 1M 2M 0N)
+
+ vpsrldq ymmE, ymmE, 2 ; ymmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F 2G 0H
+ ; 2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V -- --)
+
+ vpsrldq ymmB, ymmD, 2 ; ymmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F 1H 2H
+ ; 1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V -- --)
+ vpunpckhwd ymmC, ymmD, ymmH ; ymmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F 0G 1G
+ ; 1P 2P 0Q 1Q 1R 2R 0S 1S 1T 2T 0U 1U 1V 2V -- --)
+ vpunpcklwd ymmD, ymmD, ymmH ; ymmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18
+ ; 1H 2H 0I 1I 1J 2J 0K 1K 1L 2L 0M 1M 1N 2N 0O 1O)
+
+ vpunpckhwd ymmF, ymmE, ymmB ; ymmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F 2G 0H 1H 2H
+ ; 2Q 0R 1R 2R 2S 0T 1T 2T 2U 0V 1V 2V -- -- -- --)
+ vpunpcklwd ymmE, ymmE, ymmB ; ymmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29
+ ; 2I 0J 1J 2J 2K 0L 1L 2L 2M 0N 1N 2N 2O 0P 1P 2P)
+
+ vpshufd ymmH, ymmA, 0x4E ; ymmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03
+ ; 0K 1K 2K 0L 0M 1M 2M 0N 0G 1G 2G 0H 0I 1I 2I 0J)
+ vpunpckldq ymmA, ymmA, ymmD ; ymmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14
+ ; 0G 1G 2G 0H 1H 2H 0I 1I 0I 1I 2I 0J 1J 2J 0K 1K)
+ vpunpckhdq ymmD, ymmD, ymmE ; ymmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29
+ ; 1L 2L 0M 1M 2M 0N 1N 2N 1N 2N 0O 1O 2O 0P 1P 2P)
+ vpunpckldq ymmE, ymmE, ymmH ; ymmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07
+ ; 2I 0J 1J 2J 0K 1K 2K 0L 2K 0L 1L 2L 0M 1M 2M 0N)
+
+ vpshufd ymmH, ymmG, 0x4E ; ymmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B
+ ; 0S 1S 2S 0T 0U 1U 2U 0V 0O 1O 2O 0P 0Q 1Q 2Q 0R)
+ vpunpckldq ymmG, ymmG, ymmC ; ymmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C
+ ; 0O 1O 2O 0P 1P 2P 0Q 1Q 0Q 1Q 2Q 0R 1R 2R 0S 1S)
+ vpunpckhdq ymmC, ymmC, ymmF ; ymmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F 0G 1G 2G 0H 1H 2H
+ ; 1T 2T 0U 1U 2U 0V 1V 2V 1V 2V -- -- -- -- -- --)
+ vpunpckldq ymmF, ymmF, ymmH ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F
+ ; 2Q 0R 1R 2R 0S 1S 2S 0T 2S 0T 1T 2T 0U 1U 2U 0V)
+
+ vpunpcklqdq ymmH, ymmA, ymmE ; ymmH=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+ ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+ vpunpcklqdq ymmG, ymmD, ymmG ; ymmG=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
+ ; 1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
+ vpunpcklqdq ymmC, ymmF, ymmC ; ymmC=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+ ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+ vperm2i128 ymmA, ymmH, ymmG, 0x20 ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+ ; 15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ vperm2i128 ymmD, ymmC, ymmH, 0x30 ; ymmD=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+ ; 0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+ vperm2i128 ymmF, ymmG, ymmC, 0x31 ; ymmF=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+ ; 2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+ cmp rcx, byte SIZEOF_YMMWORD
+ jb short .column_st64
+
+ test rdi, SIZEOF_YMMWORD-1
+ jnz short .out1
+ ; --(aligned)-------------------
+ vmovntdq YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+ vmovntdq YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+ vmovntdq YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmF
+ jmp short .out0
+.out1: ; --(unaligned)-----------------
+ vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+ vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+ vmovdqu YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmF
+.out0:
+ add rdi, byte RGB_PIXELSIZE*SIZEOF_YMMWORD ; outptr
+ sub rcx, byte SIZEOF_YMMWORD
+ jz near .endcolumn
+
+ add rsi, byte SIZEOF_YMMWORD ; inptr0
+ dec al ; Yctr
+ jnz near .Yloop_2nd
+
+ add rbx, byte SIZEOF_YMMWORD ; inptr1
+ add rdx, byte SIZEOF_YMMWORD ; inptr2
+ jmp near .columnloop
+
+.column_st64:
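+ ; Tail handling for the last partial block of the row: rcx is converted
+ ; from pixels to bytes, then progressively narrower stores (64, 32, 16,
+ ; 8, 4, 2, and finally 1 byte) are used so that no write ever lands past
+ ; the end of the output row.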
+ lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE
+ cmp rcx, byte 2*SIZEOF_YMMWORD
+ jb short .column_st32
+ vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+ vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+ add rdi, byte 2*SIZEOF_YMMWORD ; outptr
+ vmovdqa ymmA, ymmF
+ sub rcx, byte 2*SIZEOF_YMMWORD
+ jmp short .column_st31
+.column_st32:
+ cmp rcx, byte SIZEOF_YMMWORD
+ jb short .column_st31
+ vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+ add rdi, byte SIZEOF_YMMWORD ; outptr
+ vmovdqa ymmA, ymmD
+ sub rcx, byte SIZEOF_YMMWORD
+ jmp short .column_st31
+.column_st31:
+ cmp rcx, byte SIZEOF_XMMWORD
+ jb short .column_st15
+ vmovdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ add rdi, byte SIZEOF_XMMWORD ; outptr
+ vperm2i128 ymmA, ymmA, ymmA, 1
+ sub rcx, byte SIZEOF_XMMWORD
+.column_st15:
+ ; Store the lower 8 bytes of xmmA if at least 8 bytes of output space
+ ; remain.
+ cmp rcx, byte SIZEOF_MMWORD
+ jb short .column_st7
+ vmovq XMM_MMWORD [rdi], xmmA
+ add rdi, byte SIZEOF_MMWORD
+ sub rcx, byte SIZEOF_MMWORD
+ vpsrldq xmmA, xmmA, SIZEOF_MMWORD
+.column_st7:
+ ; Store the lower 4 bytes of xmmA if at least 4 bytes of output space
+ ; remain.
+ cmp rcx, byte SIZEOF_DWORD
+ jb short .column_st3
+ vmovd XMM_DWORD [rdi], xmmA
+ add rdi, byte SIZEOF_DWORD
+ sub rcx, byte SIZEOF_DWORD
+ vpsrldq xmmA, xmmA, SIZEOF_DWORD
+.column_st3:
+ ; Store the lower 2 bytes of rax if at least 2 bytes of output space
+ ; remain.
+ vmovd eax, xmmA
+ cmp rcx, byte SIZEOF_WORD
+ jb short .column_st1
+ mov word [rdi], ax
+ add rdi, byte SIZEOF_WORD
+ sub rcx, byte SIZEOF_WORD
+ shr rax, 16
+.column_st1:
+ ; Store the low byte of rax if any output space remains.
+ test rcx, rcx
+ jz short .endcolumn
+ mov byte [rdi], al
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+ vpcmpeqb ymm6, ymm6, ymm6 ; ymm6=XE=X(02468ACE********GIKMOQSU********)
+ vpcmpeqb ymm7, ymm7, ymm7 ; ymm7=XO=X(13579BDF********HJLNPRTV********)
+%else
+ vpxor ymm6, ymm6, ymm6 ; ymm6=XE=X(02468ACE********GIKMOQSU********)
+ vpxor ymm7, ymm7, ymm7 ; ymm7=XO=X(13579BDF********HJLNPRTV********)
+%endif
+ ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **)
+ ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **)
+ ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **)
+ ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **)
+ ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **)
+ ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **)
+ ; ymmG=(30 32 34 36 38 3A 3C 3E ** 3G 3I 3K 3M 3O 3Q 3S 3U **)
+ ; ymmH=(31 33 35 37 39 3B 3D 3F ** 3H 3J 3L 3N 3P 3R 3T 3V **)
+
+ vpunpcklbw ymmA, ymmA, ymmC ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
+ ; 0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U)
+ vpunpcklbw ymmE, ymmE, ymmG ; ymmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E
+ ; 2G 3G 2I 3I 2K 3K 2M 3M 2O 3O 2Q 3Q 2S 3S 2U 3U)
+ vpunpcklbw ymmB, ymmB, ymmD ; ymmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F
+ ; 0H 1H 0J 1J 0L 1L 0N 1N 0P 1P 0R 1R 0T 1T 0V 1V)
+ vpunpcklbw ymmF, ymmF, ymmH ; ymmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F
+ ; 2H 3H 2J 3J 2L 3L 2N 3N 2P 3P 2R 3R 2T 3T 2V 3V)
+
+ vpunpckhwd ymmC, ymmA, ymmE ; ymmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E
+ ; 0O 1O 2O 3O 0Q 1Q 2Q 3Q 0S 1S 2S 3S 0U 1U 2U 3U)
+ vpunpcklwd ymmA, ymmA, ymmE ; ymmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36
+ ; 0G 1G 2G 3G 0I 1I 2I 3I 0K 1K 2K 3K 0M 1M 2M 3M)
+ vpunpckhwd ymmG, ymmB, ymmF ; ymmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F
+ ; 0P 1P 2P 3P 0R 1R 2R 3R 0T 1T 2T 3T 0V 1V 2V 3V)
+ vpunpcklwd ymmB, ymmB, ymmF ; ymmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37
+ ; 0H 1H 2H 3H 0J 1J 2J 3J 0L 1L 2L 3L 0N 1N 2N 3N)
+
+ vpunpckhdq ymmE, ymmA, ymmB ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+ ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+ vpunpckldq ymmB, ymmA, ymmB ; ymmB=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ ; 0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
+ vpunpckhdq ymmF, ymmC, ymmG ; ymmF=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
+ ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+ vpunpckldq ymmG, ymmC, ymmG ; ymmG=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+ ; 0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
+
+ vperm2i128 ymmA, ymmB, ymmE, 0x20 ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ ; 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+ vperm2i128 ymmD, ymmG, ymmF, 0x20 ; ymmD=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+ ; 0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+ vperm2i128 ymmC, ymmB, ymmE, 0x31 ; ymmC=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
+ ; 0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+ vperm2i128 ymmH, ymmG, ymmF, 0x31 ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
+ ; 0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+ cmp rcx, byte SIZEOF_YMMWORD
+ jb short .column_st64
+
+ test rdi, SIZEOF_YMMWORD-1
+ jnz short .out1
+ ; --(aligned)-------------------
+ vmovntdq YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+ vmovntdq YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+ vmovntdq YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmC
+ vmovntdq YMMWORD [rdi+3*SIZEOF_YMMWORD], ymmH
+ jmp short .out0
+.out1: ; --(unaligned)-----------------
+ vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+ vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+ vmovdqu YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmC
+ vmovdqu YMMWORD [rdi+3*SIZEOF_YMMWORD], ymmH
+.out0:
+ add rdi, RGB_PIXELSIZE*SIZEOF_YMMWORD ; outptr
+ sub rcx, byte SIZEOF_YMMWORD
+ jz near .endcolumn
+
+ add rsi, byte SIZEOF_YMMWORD ; inptr0
+ dec al
+ jnz near .Yloop_2nd
+
+ add rbx, byte SIZEOF_YMMWORD ; inptr1
+ add rdx, byte SIZEOF_YMMWORD ; inptr2
+ jmp near .columnloop
+
+.column_st64:
+ cmp rcx, byte SIZEOF_YMMWORD/2
+ jb short .column_st32
+ vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+ vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+ add rdi, byte 2*SIZEOF_YMMWORD ; outptr
+ vmovdqa ymmA, ymmC
+ vmovdqa ymmD, ymmH
+ sub rcx, byte SIZEOF_YMMWORD/2
+.column_st32:
+ cmp rcx, byte SIZEOF_YMMWORD/4
+ jb short .column_st16
+ vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+ add rdi, byte SIZEOF_YMMWORD ; outptr
+ vmovdqa ymmA, ymmD
+ sub rcx, byte SIZEOF_YMMWORD/4
+.column_st16:
+ cmp rcx, byte SIZEOF_YMMWORD/8
+ jb short .column_st15
+ vmovdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ add rdi, byte SIZEOF_XMMWORD ; outptr
+ vperm2i128 ymmA, ymmA, ymmA, 1
+ sub rcx, byte SIZEOF_YMMWORD/8
+.column_st15:
+ ; Store two pixels (8 bytes) of ymmA if at least two pixels of output
+ ; space remain.
+ cmp rcx, byte SIZEOF_YMMWORD/16
+ jb short .column_st7
+ vmovq MMWORD [rdi], xmmA
+ add rdi, byte SIZEOF_YMMWORD/16*4
+ sub rcx, byte SIZEOF_YMMWORD/16
+ vpsrldq xmmA, SIZEOF_YMMWORD/16*4
+.column_st7:
+ ; Store one pixel (4 bytes) of ymmA if any output space remains.
+ test rcx, rcx
+ jz short .endcolumn
+ vmovd XMM_DWORD [rdi], xmmA
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+.endcolumn:
+ sfence ; flush the write buffer
+
+.return:
+ pop rbx
+ vzeroupper
+ uncollect_args 4
+ mov rsp, rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- original rbp
+ pop rbp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v2_merged_upsample_avx2(JDIMENSION output_width,
+; JSAMPIMAGE input_buf,
+; JDIMENSION in_row_group_ctr,
+; JSAMPARRAY output_buf);
+;
+
+; r10d = JDIMENSION output_width
+; r11 = JSAMPIMAGE input_buf
+; r12d = JDIMENSION in_row_group_ctr
+; r13 = JSAMPARRAY output_buf
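+;
+; This routine is a thin wrapper: it builds a three-entry row-pointer table
+; on the stack in which the luma array is pre-biased by the row group
+; counter, so the h2v1 kernel reads luma row 2*ctr (and, on the second
+; call, row 2*ctr+1) while both chroma arrays are still indexed at row
+; ctr; the output pointer advances by one JSAMPROW between the two calls.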
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v2_merged_upsample_avx2)
+
+EXTN(jsimd_h2v2_merged_upsample_avx2):
+ push rbp
+ mov rax, rsp
+ mov rbp, rsp
+ collect_args 4
+ push rbx
+
+ mov eax, r10d
+
+ mov rdi, r11
+ mov ecx, r12d
+ mov rsip, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
+ mov rbxp, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
+ mov rdxp, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
+ mov rdi, r13
+ lea rsi, [rsi+rcx*SIZEOF_JSAMPROW]
+
+ sub rsp, SIZEOF_JSAMPARRAY*4
+ mov JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY], rsip ; inptr00
+ mov JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY], rbxp ; inptr1
+ mov JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY], rdxp ; inptr2
+ mov rbx, rsp
+
+ push rdi
+ push rcx
+ push rax
+
+ %ifdef WIN64
+ mov r8, rcx
+ mov r9, rdi
+ mov rcx, rax
+ mov rdx, rbx
+ %else
+ mov rdx, rcx
+ mov rcx, rdi
+ mov rdi, rax
+ mov rsi, rbx
+ %endif
+
+ call EXTN(jsimd_h2v1_merged_upsample_avx2)
+
+ pop rax
+ pop rcx
+ pop rdi
+ mov rsip, JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY]
+ mov rbxp, JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY]
+ mov rdxp, JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY]
+
+ add rdi, byte SIZEOF_JSAMPROW ; outptr1
+ add rsi, byte SIZEOF_JSAMPROW ; inptr01
+
+ mov JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY], rsip ; inptr01
+ mov JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY], rbxp ; inptr1
+ mov JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY], rdxp ; inptr2
+ mov rbx, rsp
+
+ push rdi
+ push rcx
+ push rax
+
+ %ifdef WIN64
+ mov r8, rcx
+ mov r9, rdi
+ mov rcx, rax
+ mov rdx, rbx
+ %else
+ mov rdx, rcx
+ mov rcx, rdi
+ mov rdi, rax
+ mov rsi, rbx
+ %endif
+
+ call EXTN(jsimd_h2v1_merged_upsample_avx2)
+
+ pop rax
+ pop rcx
+ pop rdi
+ mov rsip, JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY]
+ mov rbxp, JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY]
+ mov rdxp, JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY]
+ add rsp, SIZEOF_JSAMPARRAY*4
+
+ pop rbx
+ uncollect_args 4
+ pop rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jdmrgext-sse2.asm b/media/libjpeg/simd/x86_64/jdmrgext-sse2.asm
new file mode 100644
index 0000000000..eb3ab9dbd9
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jdmrgext-sse2.asm
@@ -0,0 +1,538 @@
+;
+; jdmrgext.asm - merged upsampling/color conversion (64-bit SSE2)
+;
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2012, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; and can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v1_merged_upsample_sse2(JDIMENSION output_width,
+; JSAMPIMAGE input_buf,
+; JDIMENSION in_row_group_ctr,
+; JSAMPARRAY output_buf);
+;
+
+; r10d = JDIMENSION output_width
+; r11 = JSAMPIMAGE input_buf
+; r12d = JDIMENSION in_row_group_ctr
+; r13 = JSAMPARRAY output_buf
+
+%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 3
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v1_merged_upsample_sse2)
+
+EXTN(jsimd_h2v1_merged_upsample_sse2):
+ push rbp
+ mov rax, rsp ; rax = original rbp
+ sub rsp, byte 4
+ and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [rsp], rax
+ mov rbp, rsp ; rbp = aligned rbp
+ lea rsp, [wk(0)]
+ collect_args 4
+ push rbx
+
+ mov ecx, r10d ; col
+ test rcx, rcx
+ jz near .return
+
+ push rcx
+
+ mov rdi, r11
+ mov ecx, r12d
+ mov rsip, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
+ mov rbxp, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
+ mov rdxp, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
+ mov rdi, r13
+ mov rsip, JSAMPROW [rsi+rcx*SIZEOF_JSAMPROW] ; inptr0
+ mov rbxp, JSAMPROW [rbx+rcx*SIZEOF_JSAMPROW] ; inptr1
+ mov rdxp, JSAMPROW [rdx+rcx*SIZEOF_JSAMPROW] ; inptr2
+ mov rdip, JSAMPROW [rdi] ; outptr
+
+ pop rcx ; col
+
+.columnloop:
+
+ movdqa xmm6, XMMWORD [rbx] ; xmm6=Cb(0123456789ABCDEF)
+ movdqa xmm7, XMMWORD [rdx] ; xmm7=Cr(0123456789ABCDEF)
+
+ pxor xmm1, xmm1 ; xmm1=(all 0's)
+ pcmpeqw xmm3, xmm3
+ psllw xmm3, 7 ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+ movdqa xmm4, xmm6
+ punpckhbw xmm6, xmm1 ; xmm6=Cb(89ABCDEF)=CbH
+ punpcklbw xmm4, xmm1 ; xmm4=Cb(01234567)=CbL
+ movdqa xmm0, xmm7
+ punpckhbw xmm7, xmm1 ; xmm7=Cr(89ABCDEF)=CrH
+ punpcklbw xmm0, xmm1 ; xmm0=Cr(01234567)=CrL
+
+ paddw xmm6, xmm3
+ paddw xmm4, xmm3
+ paddw xmm7, xmm3
+ paddw xmm0, xmm3
+
+ ; (Original)
+ ; R = Y + 1.40200 * Cr
+ ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+ ; B = Y + 1.77200 * Cb
+ ;
+ ; (This implementation)
+ ; R = Y + 0.40200 * Cr + Cr
+ ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+ ; B = Y - 0.22800 * Cb + Cb + Cb
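+ ;
+ ; The G term uses pmaddwd: punpcklwd/punpckhwd interleave Cb and Cr
+ ; words, and PW_MF0344_F0285 holds the matching constant pair
+ ; (-FIX(0.34414), FIX(0.28586)), so each dword result is
+ ; Cb * -0.34414 + Cr * 0.28586 scaled by 2^16.  PD_ONEHALF adds the
+ ; rounding constant 2^(SCALEBITS-1) before the psrad by SCALEBITS, and
+ ; the final psubw of Cr turns the +0.28586 into the required -0.71414.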
+
+ movdqa xmm5, xmm6 ; xmm5=CbH
+ movdqa xmm2, xmm4 ; xmm2=CbL
+ paddw xmm6, xmm6 ; xmm6=2*CbH
+ paddw xmm4, xmm4 ; xmm4=2*CbL
+ movdqa xmm1, xmm7 ; xmm1=CrH
+ movdqa xmm3, xmm0 ; xmm3=CrL
+ paddw xmm7, xmm7 ; xmm7=2*CrH
+ paddw xmm0, xmm0 ; xmm0=2*CrL
+
+ pmulhw xmm6, [rel PW_MF0228] ; xmm6=(2*CbH * -FIX(0.22800))
+ pmulhw xmm4, [rel PW_MF0228] ; xmm4=(2*CbL * -FIX(0.22800))
+ pmulhw xmm7, [rel PW_F0402] ; xmm7=(2*CrH * FIX(0.40200))
+ pmulhw xmm0, [rel PW_F0402] ; xmm0=(2*CrL * FIX(0.40200))
+
+ paddw xmm6, [rel PW_ONE]
+ paddw xmm4, [rel PW_ONE]
+ psraw xmm6, 1 ; xmm6=(CbH * -FIX(0.22800))
+ psraw xmm4, 1 ; xmm4=(CbL * -FIX(0.22800))
+ paddw xmm7, [rel PW_ONE]
+ paddw xmm0, [rel PW_ONE]
+ psraw xmm7, 1 ; xmm7=(CrH * FIX(0.40200))
+ psraw xmm0, 1 ; xmm0=(CrL * FIX(0.40200))
+
+ paddw xmm6, xmm5
+ paddw xmm4, xmm2
+ paddw xmm6, xmm5 ; xmm6=(CbH * FIX(1.77200))=(B-Y)H
+ paddw xmm4, xmm2 ; xmm4=(CbL * FIX(1.77200))=(B-Y)L
+ paddw xmm7, xmm1 ; xmm7=(CrH * FIX(1.40200))=(R-Y)H
+ paddw xmm0, xmm3 ; xmm0=(CrL * FIX(1.40200))=(R-Y)L
+
+ movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=(B-Y)H
+ movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(R-Y)H
+
+ movdqa xmm6, xmm5
+ movdqa xmm7, xmm2
+ punpcklwd xmm5, xmm1
+ punpckhwd xmm6, xmm1
+ pmaddwd xmm5, [rel PW_MF0344_F0285]
+ pmaddwd xmm6, [rel PW_MF0344_F0285]
+ punpcklwd xmm2, xmm3
+ punpckhwd xmm7, xmm3
+ pmaddwd xmm2, [rel PW_MF0344_F0285]
+ pmaddwd xmm7, [rel PW_MF0344_F0285]
+
+ paddd xmm5, [rel PD_ONEHALF]
+ paddd xmm6, [rel PD_ONEHALF]
+ psrad xmm5, SCALEBITS
+ psrad xmm6, SCALEBITS
+ paddd xmm2, [rel PD_ONEHALF]
+ paddd xmm7, [rel PD_ONEHALF]
+ psrad xmm2, SCALEBITS
+ psrad xmm7, SCALEBITS
+
+ packssdw xmm5, xmm6 ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
+ packssdw xmm2, xmm7 ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
+ psubw xmm5, xmm1 ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
+ psubw xmm2, xmm3 ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
+
+ movdqa XMMWORD [wk(2)], xmm5 ; wk(2)=(G-Y)H
+
+ mov al, 2 ; Yctr
+ jmp short .Yloop_1st
+
+.Yloop_2nd:
+ movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(R-Y)H
+ movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(G-Y)H
+ movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(B-Y)H
+
+.Yloop_1st:
+ movdqa xmm7, XMMWORD [rsi] ; xmm7=Y(0123456789ABCDEF)
+
+ pcmpeqw xmm6, xmm6
+ psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
+ pand xmm6, xmm7 ; xmm6=Y(02468ACE)=YE
+ psrlw xmm7, BYTE_BIT ; xmm7=Y(13579BDF)=YO
+
+ movdqa xmm1, xmm0 ; xmm1=xmm0=(R-Y)(L/H)
+ movdqa xmm3, xmm2 ; xmm3=xmm2=(G-Y)(L/H)
+ movdqa xmm5, xmm4 ; xmm5=xmm4=(B-Y)(L/H)
+
+ paddw xmm0, xmm6 ; xmm0=((R-Y)+YE)=RE=R(02468ACE)
+ paddw xmm1, xmm7 ; xmm1=((R-Y)+YO)=RO=R(13579BDF)
+ packuswb xmm0, xmm0 ; xmm0=R(02468ACE********)
+ packuswb xmm1, xmm1 ; xmm1=R(13579BDF********)
+
+ paddw xmm2, xmm6 ; xmm2=((G-Y)+YE)=GE=G(02468ACE)
+ paddw xmm3, xmm7 ; xmm3=((G-Y)+YO)=GO=G(13579BDF)
+ packuswb xmm2, xmm2 ; xmm2=G(02468ACE********)
+ packuswb xmm3, xmm3 ; xmm3=G(13579BDF********)
+
+ paddw xmm4, xmm6 ; xmm4=((B-Y)+YE)=BE=B(02468ACE)
+ paddw xmm5, xmm7 ; xmm5=((B-Y)+YO)=BO=B(13579BDF)
+ packuswb xmm4, xmm4 ; xmm4=B(02468ACE********)
+ packuswb xmm5, xmm5 ; xmm5=B(13579BDF********)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+ ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+ ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+ ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+ ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
+
+ punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+ punpcklbw xmmE, xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
+ punpcklbw xmmD, xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
+
+ movdqa xmmG, xmmA
+ movdqa xmmH, xmmA
+ punpcklwd xmmA, xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
+ punpckhwd xmmG, xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
+
+ psrldq xmmH, 2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
+ psrldq xmmE, 2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
+
+ movdqa xmmC, xmmD
+ movdqa xmmB, xmmD
+ punpcklwd xmmD, xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
+ punpckhwd xmmC, xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
+
+ psrldq xmmB, 2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
+
+ movdqa xmmF, xmmE
+ punpcklwd xmmE, xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
+ punpckhwd xmmF, xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
+
+ pshufd xmmH, xmmA, 0x4E ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
+ movdqa xmmB, xmmE
+ punpckldq xmmA, xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
+ punpckldq xmmE, xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
+ punpckhdq xmmD, xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
+
+ pshufd xmmH, xmmG, 0x4E ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
+ movdqa xmmB, xmmF
+ punpckldq xmmG, xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
+ punpckldq xmmF, xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
+ punpckhdq xmmC, xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
+
+ punpcklqdq xmmA, xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+ punpcklqdq xmmD, xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+ punpcklqdq xmmF, xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+ cmp rcx, byte SIZEOF_XMMWORD
+ jb short .column_st32
+
+ test rdi, SIZEOF_XMMWORD-1
+ jnz short .out1
+ ; --(aligned)-------------------
+ movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
+ jmp short .out0
+.out1: ; --(unaligned)-----------------
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
+.out0:
+ add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
+ sub rcx, byte SIZEOF_XMMWORD
+ jz near .endcolumn
+
+ add rsi, byte SIZEOF_XMMWORD ; inptr0
+ dec al ; Yctr
+ jnz near .Yloop_2nd
+
+ add rbx, byte SIZEOF_XMMWORD ; inptr1
+ add rdx, byte SIZEOF_XMMWORD ; inptr2
+ jmp near .columnloop
+
+.column_st32:
+ lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE
+ cmp rcx, byte 2*SIZEOF_XMMWORD
+ jb short .column_st16
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ add rdi, byte 2*SIZEOF_XMMWORD ; outptr
+ movdqa xmmA, xmmF
+ sub rcx, byte 2*SIZEOF_XMMWORD
+ jmp short .column_st15
+.column_st16:
+ cmp rcx, byte SIZEOF_XMMWORD
+ jb short .column_st15
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ add rdi, byte SIZEOF_XMMWORD ; outptr
+ movdqa xmmA, xmmD
+ sub rcx, byte SIZEOF_XMMWORD
+.column_st15:
+ ; Store the lower 8 bytes of xmmA if at least 8 bytes of output space
+ ; remain.
+ cmp rcx, byte SIZEOF_MMWORD
+ jb short .column_st7
+ movq XMM_MMWORD [rdi], xmmA
+ add rdi, byte SIZEOF_MMWORD
+ sub rcx, byte SIZEOF_MMWORD
+ psrldq xmmA, SIZEOF_MMWORD
+.column_st7:
+ ; Store the lower 4 bytes of xmmA if at least 4 bytes of output space
+ ; remain.
+ cmp rcx, byte SIZEOF_DWORD
+ jb short .column_st3
+ movd XMM_DWORD [rdi], xmmA
+ add rdi, byte SIZEOF_DWORD
+ sub rcx, byte SIZEOF_DWORD
+ psrldq xmmA, SIZEOF_DWORD
+.column_st3:
+ ; Store the lower 2 bytes of rax if at least 2 bytes of output space
+ ; remain.
+ movd eax, xmmA
+ cmp rcx, byte SIZEOF_WORD
+ jb short .column_st1
+ mov word [rdi], ax
+ add rdi, byte SIZEOF_WORD
+ sub rcx, byte SIZEOF_WORD
+ shr rax, 16
+.column_st1:
+ ; Store the low byte of rax if any output space remains.
+ test rcx, rcx
+ jz short .endcolumn
+ mov byte [rdi], al
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+ pcmpeqb xmm6, xmm6 ; xmm6=XE=X(02468ACE********)
+ pcmpeqb xmm7, xmm7 ; xmm7=XO=X(13579BDF********)
+%else
+ pxor xmm6, xmm6 ; xmm6=XE=X(02468ACE********)
+ pxor xmm7, xmm7 ; xmm7=XO=X(13579BDF********)
+%endif
+ ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+ ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+ ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+ ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
+
+ punpcklbw xmmA, xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+ punpcklbw xmmE, xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
+ punpcklbw xmmB, xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
+ punpcklbw xmmF, xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
+
+ movdqa xmmC, xmmA
+ punpcklwd xmmA, xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
+ punpckhwd xmmC, xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
+ movdqa xmmG, xmmB
+ punpcklwd xmmB, xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
+ punpckhwd xmmG, xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
+
+ movdqa xmmD, xmmA
+ punpckldq xmmA, xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+ punpckhdq xmmD, xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+ movdqa xmmH, xmmC
+ punpckldq xmmC, xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+ punpckhdq xmmH, xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+ cmp rcx, byte SIZEOF_XMMWORD
+ jb short .column_st32
+
+ test rdi, SIZEOF_XMMWORD-1
+ jnz short .out1
+ ; --(aligned)-------------------
+ movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
+ movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
+ jmp short .out0
+.out1: ; --(unaligned)-----------------
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
+ movdqu XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
+.out0:
+ add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
+ sub rcx, byte SIZEOF_XMMWORD
+ jz near .endcolumn
+
+ add rsi, byte SIZEOF_XMMWORD ; inptr0
+ dec al ; Yctr
+ jnz near .Yloop_2nd
+
+ add rbx, byte SIZEOF_XMMWORD ; inptr1
+ add rdx, byte SIZEOF_XMMWORD ; inptr2
+ jmp near .columnloop
+
+.column_st32:
+ cmp rcx, byte SIZEOF_XMMWORD/2
+ jb short .column_st16
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ add rdi, byte 2*SIZEOF_XMMWORD ; outptr
+ movdqa xmmA, xmmC
+ movdqa xmmD, xmmH
+ sub rcx, byte SIZEOF_XMMWORD/2
+.column_st16:
+ cmp rcx, byte SIZEOF_XMMWORD/4
+ jb short .column_st15
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ add rdi, byte SIZEOF_XMMWORD ; outptr
+ movdqa xmmA, xmmD
+ sub rcx, byte SIZEOF_XMMWORD/4
+.column_st15:
+ ; Store two pixels (8 bytes) of xmmA if at least two pixels of output
+ ; space remain.
+ cmp rcx, byte SIZEOF_XMMWORD/8
+ jb short .column_st7
+ movq XMM_MMWORD [rdi], xmmA
+ add rdi, byte SIZEOF_XMMWORD/8*4
+ sub rcx, byte SIZEOF_XMMWORD/8
+ psrldq xmmA, SIZEOF_XMMWORD/8*4
+.column_st7:
+ ; Store one pixel (4 bytes) of xmmA if any output space remains.
+ test rcx, rcx
+ jz short .endcolumn
+ movd XMM_DWORD [rdi], xmmA
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+.endcolumn:
+ sfence ; flush the write buffer
+
+.return:
+ pop rbx
+ uncollect_args 4
+ mov rsp, rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- original rbp
+ pop rbp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v2_merged_upsample_sse2(JDIMENSION output_width,
+; JSAMPIMAGE input_buf,
+; JDIMENSION in_row_group_ctr,
+; JSAMPARRAY output_buf);
+;
+
+; r10d = JDIMENSION output_width
+; r11 = JSAMPIMAGE input_buf
+; r12d = JDIMENSION in_row_group_ctr
+; r13 = JSAMPARRAY output_buf
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v2_merged_upsample_sse2)
+
+EXTN(jsimd_h2v2_merged_upsample_sse2):
+ push rbp
+ mov rax, rsp
+ mov rbp, rsp
+ collect_args 4
+ push rbx
+
+ mov eax, r10d
+
+ mov rdi, r11
+ mov ecx, r12d
+ mov rsip, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
+ mov rbxp, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
+ mov rdxp, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
+ mov rdi, r13
+ lea rsi, [rsi+rcx*SIZEOF_JSAMPROW]
+
+ sub rsp, SIZEOF_JSAMPARRAY*4
+ mov JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY], rsip ; inptr00
+ mov JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY], rbxp ; inptr1
+ mov JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY], rdxp ; inptr2
+ mov rbx, rsp
+
+ push rdi
+ push rcx
+ push rax
+
+ %ifdef WIN64
+ mov r8, rcx
+ mov r9, rdi
+ mov rcx, rax
+ mov rdx, rbx
+ %else
+ mov rdx, rcx
+ mov rcx, rdi
+ mov rdi, rax
+ mov rsi, rbx
+ %endif
+
+ call EXTN(jsimd_h2v1_merged_upsample_sse2)
+
+ pop rax
+ pop rcx
+ pop rdi
+ mov rsip, JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY]
+ mov rbxp, JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY]
+ mov rdxp, JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY]
+
+ add rdi, byte SIZEOF_JSAMPROW ; outptr1
+ add rsi, byte SIZEOF_JSAMPROW ; inptr01
+
+ mov JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY], rsip ; inptr01
+ mov JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY], rbxp ; inptr1
+ mov JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY], rdxp ; inptr2
+ mov rbx, rsp
+
+ push rdi
+ push rcx
+ push rax
+
+ %ifdef WIN64
+ mov r8, rcx
+ mov r9, rdi
+ mov rcx, rax
+ mov rdx, rbx
+ %else
+ mov rdx, rcx
+ mov rcx, rdi
+ mov rdi, rax
+ mov rsi, rbx
+ %endif
+
+ call EXTN(jsimd_h2v1_merged_upsample_sse2)
+
+ pop rax
+ pop rcx
+ pop rdi
+ mov rsip, JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY]
+ mov rbxp, JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY]
+ mov rdxp, JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY]
+ add rsp, SIZEOF_JSAMPARRAY*4
+
+ pop rbx
+ uncollect_args 4
+ pop rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jdsample-avx2.asm b/media/libjpeg/simd/x86_64/jdsample-avx2.asm
new file mode 100644
index 0000000000..1e4979f933
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jdsample-avx2.asm
@@ -0,0 +1,696 @@
+;
+; jdsample.asm - upsampling (64-bit AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; and can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_fancy_upsample_avx2)
+
+EXTN(jconst_fancy_upsample_avx2):
+
+PW_ONE times 16 dw 1
+PW_TWO times 16 dw 2
+PW_THREE times 16 dw 3
+PW_SEVEN times 16 dw 7
+PW_EIGHT times 16 dw 8
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+;
+; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
+;
+; The upsampling algorithm is linear interpolation between pixel centers,
+; also known as a "triangle filter". This is a good compromise between
+; speed and visual quality. The centers of the output pixels are 1/4 and 3/4
+; of the way between input pixel centers.
+;
+; GLOBAL(void)
+; jsimd_h2v1_fancy_upsample_avx2(int max_v_samp_factor,
+; JDIMENSION downsampled_width,
+; JSAMPARRAY input_data,
+; JSAMPARRAY *output_data_ptr);
+;
+
+; r10 = int max_v_samp_factor
+; r11d = JDIMENSION downsampled_width
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY *output_data_ptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_avx2)
+
+EXTN(jsimd_h2v1_fancy_upsample_avx2):
+ push rbp
+ mov rax, rsp
+ mov rbp, rsp
+ push_xmm 3
+ collect_args 4
+
+ mov eax, r11d ; colctr
+ test rax, rax
+ jz near .return
+
+ mov rcx, r10 ; rowctr
+ test rcx, rcx
+ jz near .return
+
+ mov rsi, r12 ; input_data
+ mov rdi, r13
+ mov rdip, JSAMPARRAY [rdi] ; output_data
+
+ vpxor ymm0, ymm0, ymm0 ; ymm0=(all 0's)
+ vpcmpeqb xmm9, xmm9, xmm9
+ vpsrldq xmm10, xmm9, (SIZEOF_XMMWORD-1) ; (ff -- -- -- ... -- --) LSB is ff
+
+ vpslldq xmm9, xmm9, (SIZEOF_XMMWORD-1)
+ vperm2i128 ymm9, ymm9, ymm9, 1 ; (---- ---- ... ---- ---- ff) MSB is ff
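+ ; ymm10 isolates the first byte of a block and ymm9 the last; they are
+ ; used below to splice the replicated edge sample into the in[i-1] and
+ ; in[i+1] neighbor vectors at the row ends (for the first output pixel
+ ; this gives (3*x + x + 1) >> 2 = x, a clean pass-through).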
+
+.rowloop:
+ push rax ; colctr
+ push rdi
+ push rsi
+
+ mov rsip, JSAMPROW [rsi] ; inptr
+ mov rdip, JSAMPROW [rdi] ; outptr
+
+ test rax, SIZEOF_YMMWORD-1
+ jz short .skip
+ mov dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
+ mov JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
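+ ; (The dummy sample replicates the last real column so that the final
+ ; full-width vector block reads a valid right-hand neighbor; the last
+ ; output pixel then evaluates to (3*x + x + 2) >> 2 = x, the expected
+ ; edge value.)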
+.skip:
+ vpand ymm7, ymm10, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+
+ add rax, byte SIZEOF_YMMWORD-1
+ and rax, byte -SIZEOF_YMMWORD
+ cmp rax, byte SIZEOF_YMMWORD
+ ja short .columnloop
+
+.columnloop_last:
+ vpand ymm6, ymm9, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+ jmp short .upsample
+
+.columnloop:
+ vmovdqu ymm6, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+ vperm2i128 ymm6, ymm0, ymm6, 0x20
+ vpslldq ymm6, ymm6, 15
+
+.upsample:
+ vmovdqu ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD] ; ymm1=( 0 1 2 ... 29 30 31)
+
+ vperm2i128 ymm2, ymm0, ymm1, 0x20
+ vpalignr ymm2, ymm1, ymm2, 15 ; ymm2=(-- 0 1 ... 28 29 30)
+ vperm2i128 ymm4, ymm0, ymm1, 0x03
+ vpalignr ymm3, ymm4, ymm1, 1 ; ymm3=( 1 2 3 ... 30 31 --)
+
+ vpor ymm2, ymm2, ymm7 ; ymm2=(-1 0 1 ... 28 29 30)
+ vpor ymm3, ymm3, ymm6 ; ymm3=( 1 2 3 ... 30 31 32)
+
+ vpsrldq ymm7, ymm4, (SIZEOF_XMMWORD-1) ; ymm7=(31 -- -- ... -- -- --)
+
+ vpunpckhbw ymm4, ymm1, ymm0 ; ymm4=( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+ vpunpcklbw ymm5, ymm1, ymm0 ; ymm5=( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
+ vperm2i128 ymm1, ymm5, ymm4, 0x20 ; ymm1=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vperm2i128 ymm4, ymm5, ymm4, 0x31 ; ymm4=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+ vpunpckhbw ymm5, ymm2, ymm0 ; ymm5=( 7 8 9 10 11 12 13 14 23 24 25 26 27 28 29 30)
+ vpunpcklbw ymm6, ymm2, ymm0 ; ymm6=(-1 0 1 2 3 4 5 6 15 16 17 18 19 20 21 22)
+ vperm2i128 ymm2, ymm6, ymm5, 0x20 ; ymm2=(-1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14)
+ vperm2i128 ymm5, ymm6, ymm5, 0x31 ; ymm5=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
+
+ vpunpckhbw ymm6, ymm3, ymm0 ; ymm6=( 1 2 3 4 5 6 7 8 17 18 19 20 21 22 23 24)
+ vpunpcklbw ymm8, ymm3, ymm0 ; ymm8=( 9 10 11 12 13 14 15 16 25 26 27 28 29 30 31 32)
+ vperm2i128 ymm3, ymm8, ymm6, 0x20 ; ymm3=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16)
+ vperm2i128 ymm6, ymm8, ymm6, 0x31 ; ymm6=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)
+
+ vpmullw ymm1, ymm1, [rel PW_THREE]
+ vpmullw ymm4, ymm4, [rel PW_THREE]
+ vpaddw ymm2, ymm2, [rel PW_ONE]
+ vpaddw ymm5, ymm5, [rel PW_ONE]
+ vpaddw ymm3, ymm3, [rel PW_TWO]
+ vpaddw ymm6, ymm6, [rel PW_TWO]
+
+ vpaddw ymm2, ymm2, ymm1
+ vpaddw ymm5, ymm5, ymm4
+ vpsrlw ymm2, ymm2, 2 ; ymm2=OutLE=( 0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30)
+ vpsrlw ymm5, ymm5, 2 ; ymm5=OutHE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
+ vpaddw ymm3, ymm3, ymm1
+ vpaddw ymm6, ymm6, ymm4
+ vpsrlw ymm3, ymm3, 2 ; ymm3=OutLO=( 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31)
+ vpsrlw ymm6, ymm6, 2 ; ymm6=OutHO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)
+
+ vpsllw ymm3, ymm3, BYTE_BIT
+ vpsllw ymm6, ymm6, BYTE_BIT
+ vpor ymm2, ymm2, ymm3 ; ymm2=OutL=( 0 1 2 ... 29 30 31)
+ vpor ymm5, ymm5, ymm6 ; ymm5=OutH=(32 33 34 ... 61 62 63)
+
+ vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm2
+ vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm5
+
+ sub rax, byte SIZEOF_YMMWORD
+ add rsi, byte 1*SIZEOF_YMMWORD ; inptr
+ add rdi, byte 2*SIZEOF_YMMWORD ; outptr
+ cmp rax, byte SIZEOF_YMMWORD
+ ja near .columnloop
+ test eax, eax
+ jnz near .columnloop_last
+
+ pop rsi
+ pop rdi
+ pop rax
+
+ add rsi, byte SIZEOF_JSAMPROW ; input_data
+ add rdi, byte SIZEOF_JSAMPROW ; output_data
+ dec rcx ; rowctr
+ jg near .rowloop
+
+.return:
+ vzeroupper
+ uncollect_args 4
+ pop_xmm 3
+ pop rbp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
+; Again a triangle filter; see comments for h2v1 case, above.
+;
+; GLOBAL(void)
+; jsimd_h2v2_fancy_upsample_avx2(int max_v_samp_factor,
+; JDIMENSION downsampled_width,
+; JSAMPARRAY input_data,
+; JSAMPARRAY *output_data_ptr);
+;
+
+; r10 = int max_v_samp_factor
+; r11d = JDIMENSION downsampled_width
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY *output_data_ptr
+
+%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD ; ymmword wk[WK_NUM]
+%define WK_NUM 4
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_avx2)
+
+EXTN(jsimd_h2v2_fancy_upsample_avx2):
+ push rbp
+ mov rax, rsp ; rax = original rbp
+ sub rsp, byte 4
+ and rsp, byte (-SIZEOF_YMMWORD) ; align to 256 bits
+ mov [rsp], rax
+ mov rbp, rsp ; rbp = aligned rbp
+ lea rsp, [wk(0)]
+ push_xmm 3
+ collect_args 4
+ push rbx
+
+ mov eax, r11d ; colctr
+ test rax, rax
+ jz near .return
+
+ mov rcx, r10 ; rowctr
+ test rcx, rcx
+ jz near .return
+
+ mov rsi, r12 ; input_data
+ mov rdi, r13
+ mov rdip, JSAMPARRAY [rdi] ; output_data
+.rowloop:
+ push rax ; colctr
+ push rcx
+ push rdi
+ push rsi
+
+ mov rcxp, JSAMPROW [rsi-1*SIZEOF_JSAMPROW] ; inptr1(above)
+ mov rbxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; inptr0
+ mov rsip, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; inptr1(below)
+ mov rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0
+ mov rdip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1
+
+ vpxor ymm8, ymm8, ymm8 ; ymm8=(all 0's)
+ vpcmpeqb xmm9, xmm9, xmm9
+ vpsrldq xmm10, xmm9, (SIZEOF_XMMWORD-2) ; (ffff ---- ---- ... ---- ----) LSB is ffff
+ vpslldq xmm9, xmm9, (SIZEOF_XMMWORD-2)
+ vperm2i128 ymm9, ymm9, ymm9, 1 ; (---- ---- ... ---- ---- ffff) MSB is ffff
+
+ test rax, SIZEOF_YMMWORD-1
+ jz short .skip
+ push rdx
+ mov dl, JSAMPLE [rcx+(rax-1)*SIZEOF_JSAMPLE]
+ mov JSAMPLE [rcx+rax*SIZEOF_JSAMPLE], dl
+ mov dl, JSAMPLE [rbx+(rax-1)*SIZEOF_JSAMPLE]
+ mov JSAMPLE [rbx+rax*SIZEOF_JSAMPLE], dl
+ mov dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
+ mov JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
+ pop rdx
+.skip:
+ ; -- process the first column block
+
+ vmovdqu ymm0, YMMWORD [rbx+0*SIZEOF_YMMWORD] ; ymm0=row[ 0][0]
+ vmovdqu ymm1, YMMWORD [rcx+0*SIZEOF_YMMWORD] ; ymm1=row[-1][0]
+ vmovdqu ymm2, YMMWORD [rsi+0*SIZEOF_YMMWORD] ; ymm2=row[+1][0]
+
+ vpunpckhbw ymm4, ymm0, ymm8 ; ymm4=row[ 0]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+ vpunpcklbw ymm5, ymm0, ymm8 ; ymm5=row[ 0]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
+ vperm2i128 ymm0, ymm5, ymm4, 0x20 ; ymm0=row[ 0]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vperm2i128 ymm4, ymm5, ymm4, 0x31 ; ymm4=row[ 0](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+ vpunpckhbw ymm5, ymm1, ymm8 ; ymm5=row[-1]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+ vpunpcklbw ymm6, ymm1, ymm8 ; ymm6=row[-1]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
+ vperm2i128 ymm1, ymm6, ymm5, 0x20 ; ymm1=row[-1]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vperm2i128 ymm5, ymm6, ymm5, 0x31 ; ymm5=row[-1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+ vpunpckhbw ymm6, ymm2, ymm8 ; ymm6=row[+1]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+ vpunpcklbw ymm3, ymm2, ymm8 ; ymm3=row[+1]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
+ vperm2i128 ymm2, ymm3, ymm6, 0x20 ; ymm2=row[+1]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vperm2i128 ymm6, ymm3, ymm6, 0x31 ; ymm6=row[+1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+ vpmullw ymm0, ymm0, [rel PW_THREE]
+ vpmullw ymm4, ymm4, [rel PW_THREE]
+
+ vpaddw ymm1, ymm1, ymm0 ; ymm1=Int0L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vpaddw ymm5, ymm5, ymm4 ; ymm5=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+ vpaddw ymm2, ymm2, ymm0 ; ymm2=Int1L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vpaddw ymm6, ymm6, ymm4 ; ymm6=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+ vmovdqu YMMWORD [rdx+0*SIZEOF_YMMWORD], ymm1 ; temporarily save
+ vmovdqu YMMWORD [rdx+1*SIZEOF_YMMWORD], ymm5 ; the intermediate data
+ vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm2
+ vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm6
+
+ vpand ymm1, ymm1, ymm10 ; ymm1=( 0 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+ vpand ymm2, ymm2, ymm10 ; ymm2=( 0 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+
+ vmovdqa YMMWORD [wk(0)], ymm1
+ vmovdqa YMMWORD [wk(1)], ymm2
+
+ add rax, byte SIZEOF_YMMWORD-1
+ and rax, byte -SIZEOF_YMMWORD
+ cmp rax, byte SIZEOF_YMMWORD
+ ja short .columnloop
+
+.columnloop_last:
+ ; -- process the last column block
+
+ vpand ymm1, ymm9, YMMWORD [rdx+1*SIZEOF_YMMWORD]
+ vpand ymm2, ymm9, YMMWORD [rdi+1*SIZEOF_YMMWORD]
+
+ vmovdqa YMMWORD [wk(2)], ymm1 ; ymm1=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 31)
+ vmovdqa YMMWORD [wk(3)], ymm2 ; ymm2=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 31)
+
+ jmp near .upsample
+
+.columnloop:
+ ; -- process the next column block
+
+ vmovdqu ymm0, YMMWORD [rbx+1*SIZEOF_YMMWORD] ; ymm0=row[ 0][1]
+ vmovdqu ymm1, YMMWORD [rcx+1*SIZEOF_YMMWORD] ; ymm1=row[-1][1]
+ vmovdqu ymm2, YMMWORD [rsi+1*SIZEOF_YMMWORD] ; ymm2=row[+1][1]
+
+ vpunpckhbw ymm4, ymm0, ymm8 ; ymm4=row[ 0]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+ vpunpcklbw ymm5, ymm0, ymm8 ; ymm5=row[ 0]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
+ vperm2i128 ymm0, ymm5, ymm4, 0x20 ; ymm0=row[ 0]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vperm2i128 ymm4, ymm5, ymm4, 0x31 ; ymm4=row[ 0](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+ vpunpckhbw ymm5, ymm1, ymm8 ; ymm5=row[-1]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+ vpunpcklbw ymm6, ymm1, ymm8 ; ymm6=row[-1]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
+ vperm2i128 ymm1, ymm6, ymm5, 0x20 ; ymm1=row[-1]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vperm2i128 ymm5, ymm6, ymm5, 0x31 ; ymm5=row[-1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+ vpunpckhbw ymm6, ymm2, ymm8 ; ymm6=row[+1]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+ vpunpcklbw ymm7, ymm2, ymm8 ; ymm7=row[+1]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
+ vperm2i128 ymm2, ymm7, ymm6, 0x20 ; ymm2=row[+1]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vperm2i128 ymm6, ymm7, ymm6, 0x31 ; ymm6=row[+1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+ vpmullw ymm0, ymm0, [rel PW_THREE]
+ vpmullw ymm4, ymm4, [rel PW_THREE]
+
+ vpaddw ymm1, ymm1, ymm0 ; ymm1=Int0L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vpaddw ymm5, ymm5, ymm4 ; ymm5=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+ vpaddw ymm2, ymm2, ymm0 ; ymm2=Int1L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vpaddw ymm6, ymm6, ymm4 ; ymm6=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+ vmovdqu YMMWORD [rdx+2*SIZEOF_YMMWORD], ymm1 ; temporarily save
+ vmovdqu YMMWORD [rdx+3*SIZEOF_YMMWORD], ymm5 ; the intermediate data
+ vmovdqu YMMWORD [rdi+2*SIZEOF_YMMWORD], ymm2
+ vmovdqu YMMWORD [rdi+3*SIZEOF_YMMWORD], ymm6
+
+ vperm2i128 ymm1, ymm8, ymm1, 0x20
+ vpslldq ymm1, ymm1, 14 ; ymm1=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 0)
+ vperm2i128 ymm2, ymm8, ymm2, 0x20
+ vpslldq ymm2, ymm2, 14 ; ymm2=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 0)
+
+ vmovdqa YMMWORD [wk(2)], ymm1
+ vmovdqa YMMWORD [wk(3)], ymm2
+
+.upsample:
+ ; -- process the upper row
+
+ vmovdqu ymm7, YMMWORD [rdx+0*SIZEOF_YMMWORD] ; ymm7=Int0L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vmovdqu ymm3, YMMWORD [rdx+1*SIZEOF_YMMWORD] ; ymm3=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+ vperm2i128 ymm0, ymm8, ymm7, 0x03
+ vpalignr ymm0, ymm0, ymm7, 2 ; ymm0=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 --)
+ vperm2i128 ymm4, ymm8, ymm3, 0x20
+ vpslldq ymm4, ymm4, 14 ; ymm4=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 16)
+
+ vperm2i128 ymm5, ymm8, ymm7, 0x03
+ vpsrldq ymm5, ymm5, 14 ; ymm5=(15 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+ vperm2i128 ymm6, ymm8, ymm3, 0x20
+ vpalignr ymm6, ymm3, ymm6, 14 ; ymm6=(-- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
+
+ vpor ymm0, ymm0, ymm4 ; ymm0=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16)
+ vpor ymm5, ymm5, ymm6 ; ymm5=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
+
+ vperm2i128 ymm2, ymm8, ymm3, 0x03
+ vpalignr ymm2, ymm2, ymm3, 2 ; ymm2=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 --)
+ vperm2i128 ymm4, ymm8, ymm3, 0x03
+ vpsrldq ymm4, ymm4, 14 ; ymm4=(31 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+ vperm2i128 ymm1, ymm8, ymm7, 0x20
+ vpalignr ymm1, ymm7, ymm1, 14 ; ymm1=(-- 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14)
+
+ vpor ymm1, ymm1, YMMWORD [wk(0)] ; ymm1=(-1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14)
+ vpor ymm2, ymm2, YMMWORD [wk(2)] ; ymm2=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)
+
+ vmovdqa YMMWORD [wk(0)], ymm4
+
+ vpmullw ymm7, ymm7, [rel PW_THREE]
+ vpmullw ymm3, ymm3, [rel PW_THREE]
+ vpaddw ymm1, ymm1, [rel PW_EIGHT]
+ vpaddw ymm5, ymm5, [rel PW_EIGHT]
+ vpaddw ymm0, ymm0, [rel PW_SEVEN]
+ vpaddw ymm2, ymm2, [rel PW_SEVEN]
+
+ vpaddw ymm1, ymm1, ymm7
+ vpaddw ymm5, ymm5, ymm3
+ vpsrlw ymm1, ymm1, 4 ; ymm1=Out0LE=( 0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30)
+ vpsrlw ymm5, ymm5, 4 ; ymm5=Out0HE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
+ vpaddw ymm0, ymm0, ymm7
+ vpaddw ymm2, ymm2, ymm3
+ vpsrlw ymm0, ymm0, 4 ; ymm0=Out0LO=( 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31)
+ vpsrlw ymm2, ymm2, 4 ; ymm2=Out0HO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)
+
+ vpsllw ymm0, ymm0, BYTE_BIT
+ vpsllw ymm2, ymm2, BYTE_BIT
+ vpor ymm1, ymm1, ymm0 ; ymm1=Out0L=( 0 1 2 ... 29 30 31)
+ vpor ymm5, ymm5, ymm2 ; ymm5=Out0H=(32 33 34 ... 61 62 63)
+
+ vmovdqu YMMWORD [rdx+0*SIZEOF_YMMWORD], ymm1
+ vmovdqu YMMWORD [rdx+1*SIZEOF_YMMWORD], ymm5
+
+ ; -- process the lower row
+
+ vmovdqu ymm6, YMMWORD [rdi+0*SIZEOF_YMMWORD] ; ymm6=Int1L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+ vmovdqu ymm4, YMMWORD [rdi+1*SIZEOF_YMMWORD] ; ymm4=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+ vperm2i128 ymm7, ymm8, ymm6, 0x03
+ vpalignr ymm7, ymm7, ymm6, 2 ; ymm7=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 --)
+ vperm2i128 ymm3, ymm8, ymm4, 0x20
+ vpslldq ymm3, ymm3, 14 ; ymm3=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 16)
+
+ vperm2i128 ymm0, ymm8, ymm6, 0x03
+ vpsrldq ymm0, ymm0, 14 ; ymm0=(15 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+ vperm2i128 ymm2, ymm8, ymm4, 0x20
+ vpalignr ymm2, ymm4, ymm2, 14 ; ymm2=(-- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
+
+ vpor ymm7, ymm7, ymm3 ; ymm7=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16)
+ vpor ymm0, ymm0, ymm2 ; ymm0=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
+
+ vperm2i128 ymm5, ymm8, ymm4, 0x03
+ vpalignr ymm5, ymm5, ymm4, 2 ; ymm5=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 --)
+ vperm2i128 ymm3, ymm8, ymm4, 0x03
+ vpsrldq ymm3, ymm3, 14 ; ymm3=(31 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+ vperm2i128 ymm1, ymm8, ymm6, 0x20
+ vpalignr ymm1, ymm6, ymm1, 14 ; ymm1=(-- 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14)
+
+ vpor ymm1, ymm1, YMMWORD [wk(1)] ; ymm1=(-1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14)
+ vpor ymm5, ymm5, YMMWORD [wk(3)] ; ymm5=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)
+
+ vmovdqa YMMWORD [wk(1)], ymm3
+
+ vpmullw ymm6, ymm6, [rel PW_THREE]
+ vpmullw ymm4, ymm4, [rel PW_THREE]
+ vpaddw ymm1, ymm1, [rel PW_EIGHT]
+ vpaddw ymm0, ymm0, [rel PW_EIGHT]
+ vpaddw ymm7, ymm7, [rel PW_SEVEN]
+ vpaddw ymm5, ymm5, [rel PW_SEVEN]
+
+ vpaddw ymm1, ymm1, ymm6
+ vpaddw ymm0, ymm0, ymm4
+ vpsrlw ymm1, ymm1, 4 ; ymm1=Out1LE=( 0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30)
+ vpsrlw ymm0, ymm0, 4 ; ymm0=Out1HE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
+ vpaddw ymm7, ymm7, ymm6
+ vpaddw ymm5, ymm5, ymm4
+ vpsrlw ymm7, ymm7, 4 ; ymm7=Out1LO=( 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31)
+ vpsrlw ymm5, ymm5, 4 ; ymm5=Out1HO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)
+
+ vpsllw ymm7, ymm7, BYTE_BIT
+ vpsllw ymm5, ymm5, BYTE_BIT
+ vpor ymm1, ymm1, ymm7 ; ymm1=Out1L=( 0 1 2 ... 29 30 31)
+ vpor ymm0, ymm0, ymm5 ; ymm0=Out1H=(32 33 34 ... 61 62 63)
+
+ vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm1
+ vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm0
+
+ sub rax, byte SIZEOF_YMMWORD
+ add rcx, byte 1*SIZEOF_YMMWORD ; inptr1(above)
+ add rbx, byte 1*SIZEOF_YMMWORD ; inptr0
+ add rsi, byte 1*SIZEOF_YMMWORD ; inptr1(below)
+ add rdx, byte 2*SIZEOF_YMMWORD ; outptr0
+ add rdi, byte 2*SIZEOF_YMMWORD ; outptr1
+ cmp rax, byte SIZEOF_YMMWORD
+ ja near .columnloop
+ test rax, rax
+ jnz near .columnloop_last
+
+ pop rsi
+ pop rdi
+ pop rcx
+ pop rax
+
+ add rsi, byte 1*SIZEOF_JSAMPROW ; input_data
+ add rdi, byte 2*SIZEOF_JSAMPROW ; output_data
+ sub rcx, byte 2 ; rowctr
+ jg near .rowloop
+
+.return:
+ pop rbx
+ vzeroupper
+ uncollect_args 4
+ pop_xmm 3
+ mov rsp, rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- original rbp
+ pop rbp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v1_upsample_avx2(int max_v_samp_factor, JDIMENSION output_width,
+; JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+;
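+; As a rough scalar sketch of what the SIMD below computes, each input
+; sample is simply replicated:
+;
+;   out[2 * i]     = in[i];
+;   out[2 * i + 1] = in[i];
+;
+; The vpunpcklbw/vpunpckhbw pairs below perform this byte duplication 16
+; or 32 input bytes at a time.
+;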
+
+; r10 = int max_v_samp_factor
+; r11d = JDIMENSION output_width
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY *output_data_ptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v1_upsample_avx2)
+
+EXTN(jsimd_h2v1_upsample_avx2):
+ push rbp
+ mov rax, rsp
+ mov rbp, rsp
+ collect_args 4
+
+ mov edx, r11d
+ add rdx, byte (SIZEOF_YMMWORD-1)
+ and rdx, -SIZEOF_YMMWORD
+ jz near .return
+
+ mov rcx, r10 ; rowctr
+ test rcx, rcx
+ jz short .return
+
+ mov rsi, r12 ; input_data
+ mov rdi, r13
+ mov rdip, JSAMPARRAY [rdi] ; output_data
+.rowloop:
+ push rdi
+ push rsi
+
+ mov rsip, JSAMPROW [rsi] ; inptr
+ mov rdip, JSAMPROW [rdi] ; outptr
+ mov rax, rdx ; colctr
+.columnloop:
+
+ cmp rax, byte SIZEOF_YMMWORD
+ ja near .above_16
+
+ vmovdqu xmm0, XMMWORD [rsi+0*SIZEOF_YMMWORD]
+ vpunpckhbw xmm1, xmm0, xmm0
+ vpunpcklbw xmm0, xmm0, xmm0
+
+ vmovdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
+ vmovdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
+
+ jmp short .nextrow
+
+.above_16:
+ vmovdqu ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+
+ vpermq ymm0, ymm0, 0xd8
+ vpunpckhbw ymm1, ymm0, ymm0
+ vpunpcklbw ymm0, ymm0, ymm0
+
+ vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0
+ vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm1
+
+ sub rax, byte 2*SIZEOF_YMMWORD
+ jz short .nextrow
+
+ add rsi, byte SIZEOF_YMMWORD ; inptr
+ add rdi, byte 2*SIZEOF_YMMWORD ; outptr
+ jmp short .columnloop
+
+.nextrow:
+ pop rsi
+ pop rdi
+
+ add rsi, byte SIZEOF_JSAMPROW ; input_data
+ add rdi, byte SIZEOF_JSAMPROW ; output_data
+ dec rcx ; rowctr
+ jg short .rowloop
+
+.return:
+ vzeroupper
+ uncollect_args 4
+ pop rbp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v2_upsample_avx2(int max_v_samp_factor, JDIMENSION output_width,
+; JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+;
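+; In scalar terms this is the same replication as the h2v1 case, with the
+; row also duplicated vertically -- both output rows receive identical
+; data:
+;
+;   out0[2 * i] = out0[2 * i + 1] = in[i];
+;   out1[2 * i] = out1[2 * i + 1] = in[i];
+;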
+
+; r10 = int max_v_samp_factor
+; r11d = JDIMENSION output_width
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY *output_data_ptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v2_upsample_avx2)
+
+EXTN(jsimd_h2v2_upsample_avx2):
+ push rbp
+ mov rax, rsp
+ mov rbp, rsp
+ collect_args 4
+ push rbx
+
+ mov edx, r11d
+ add rdx, byte (SIZEOF_YMMWORD-1)
+ and rdx, -SIZEOF_YMMWORD
+ jz near .return
+
+ mov rcx, r10 ; rowctr
+ test rcx, rcx
+ jz near .return
+
+ mov rsi, r12 ; input_data
+ mov rdi, r13
+ mov rdip, JSAMPARRAY [rdi] ; output_data
+.rowloop:
+ push rdi
+ push rsi
+
+ mov rsip, JSAMPROW [rsi] ; inptr
+ mov rbxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0
+ mov rdip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1
+ mov rax, rdx ; colctr
+.columnloop:
+
+ cmp rax, byte SIZEOF_YMMWORD
+ ja short .above_16
+
+ vmovdqu xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+ vpunpckhbw xmm1, xmm0, xmm0
+ vpunpcklbw xmm0, xmm0, xmm0
+
+ vmovdqu XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0
+ vmovdqu XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1
+ vmovdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
+ vmovdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
+
+ jmp near .nextrow
+
+.above_16:
+ vmovdqu ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+
+ vpermq ymm0, ymm0, 0xd8
+ vpunpckhbw ymm1, ymm0, ymm0
+ vpunpcklbw ymm0, ymm0, ymm0
+
+ vmovdqu YMMWORD [rbx+0*SIZEOF_YMMWORD], ymm0
+ vmovdqu YMMWORD [rbx+1*SIZEOF_YMMWORD], ymm1
+ vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0
+ vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm1
+
+ sub rax, byte 2*SIZEOF_YMMWORD
+ jz short .nextrow
+
+ add rsi, byte SIZEOF_YMMWORD ; inptr
+ add rbx, 2*SIZEOF_YMMWORD ; outptr0
+ add rdi, 2*SIZEOF_YMMWORD ; outptr1
+ jmp short .columnloop
+
+.nextrow:
+ pop rsi
+ pop rdi
+
+ add rsi, byte 1*SIZEOF_JSAMPROW ; input_data
+ add rdi, byte 2*SIZEOF_JSAMPROW ; output_data
+ sub rcx, byte 2 ; rowctr
+ jg near .rowloop
+
+.return:
+ pop rbx
+ vzeroupper
+ uncollect_args 4
+ pop rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jdsample-sse2.asm b/media/libjpeg/simd/x86_64/jdsample-sse2.asm
new file mode 100644
index 0000000000..38dbceec26
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jdsample-sse2.asm
@@ -0,0 +1,665 @@
+;
+; jdsample.asm - upsampling (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it can
+; *not* be assembled with Microsoft's MASM or any compatible assembler
+; (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_fancy_upsample_sse2)
+
+EXTN(jconst_fancy_upsample_sse2):
+
+PW_ONE times 8 dw 1
+PW_TWO times 8 dw 2
+PW_THREE times 8 dw 3
+PW_SEVEN times 8 dw 7
+PW_EIGHT times 8 dw 8
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+;
+; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
+;
+; The upsampling algorithm is linear interpolation between pixel centers,
+; also known as a "triangle filter". This is a good compromise between
+; speed and visual quality. The centers of the output pixels are 1/4 and 3/4
+; of the way between input pixel centers.
+;
+; GLOBAL(void)
+; jsimd_h2v1_fancy_upsample_sse2(int max_v_samp_factor,
+; JDIMENSION downsampled_width,
+; JSAMPARRAY input_data,
+; JSAMPARRAY *output_data_ptr);
+;
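+; As a scalar sketch (mirroring the C upsampler that this routine
+; replaces), each input sample yields two outputs, with rounding constants
+; 1 and 2 chosen so that the two phases round in opposite directions:
+;
+;   out[2 * i]     = (3 * in[i] + in[i - 1] + 1) >> 2;
+;   out[2 * i + 1] = (3 * in[i] + in[i + 1] + 2) >> 2;
+;
+; The PW_ONE/PW_TWO/PW_THREE constants above encode exactly these weights.
+;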
+
+; r10 = int max_v_samp_factor
+; r11d = JDIMENSION downsampled_width
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY *output_data_ptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_sse2)
+
+EXTN(jsimd_h2v1_fancy_upsample_sse2):
+ push rbp
+ mov rax, rsp
+ mov rbp, rsp
+ collect_args 4
+
+ mov eax, r11d ; colctr
+ test rax, rax
+ jz near .return
+
+ mov rcx, r10 ; rowctr
+ test rcx, rcx
+ jz near .return
+
+ mov rsi, r12 ; input_data
+ mov rdi, r13
+ mov rdip, JSAMPARRAY [rdi] ; output_data
+.rowloop:
+ push rax ; colctr
+ push rdi
+ push rsi
+
+ mov rsip, JSAMPROW [rsi] ; inptr
+ mov rdip, JSAMPROW [rdi] ; outptr
+
+ test rax, SIZEOF_XMMWORD-1
+ jz short .skip
+ mov dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
+ mov JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
+.skip:
+ pxor xmm0, xmm0 ; xmm0=(all 0's)
+ pcmpeqb xmm7, xmm7
+ psrldq xmm7, (SIZEOF_XMMWORD-1)
+ pand xmm7, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+
+ add rax, byte SIZEOF_XMMWORD-1
+ and rax, byte -SIZEOF_XMMWORD
+ cmp rax, byte SIZEOF_XMMWORD
+ ja short .columnloop
+
+.columnloop_last:
+ pcmpeqb xmm6, xmm6
+ pslldq xmm6, (SIZEOF_XMMWORD-1)
+ pand xmm6, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+ jmp short .upsample
+
+.columnloop:
+ movdqa xmm6, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+ pslldq xmm6, (SIZEOF_XMMWORD-1)
+
+.upsample:
+ movdqa xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+ movdqa xmm2, xmm1
+ movdqa xmm3, xmm1 ; xmm1=( 0 1 2 ... 13 14 15)
+ pslldq xmm2, 1 ; xmm2=(-- 0 1 ... 12 13 14)
+ psrldq xmm3, 1 ; xmm3=( 1 2 3 ... 14 15 --)
+
+ por xmm2, xmm7 ; xmm2=(-1 0 1 ... 12 13 14)
+ por xmm3, xmm6 ; xmm3=( 1 2 3 ... 14 15 16)
+
+ movdqa xmm7, xmm1
+ psrldq xmm7, (SIZEOF_XMMWORD-1) ; xmm7=(15 -- -- ... -- -- --)
+
+ movdqa xmm4, xmm1
+ punpcklbw xmm1, xmm0 ; xmm1=( 0 1 2 3 4 5 6 7)
+ punpckhbw xmm4, xmm0 ; xmm4=( 8 9 10 11 12 13 14 15)
+ movdqa xmm5, xmm2
+ punpcklbw xmm2, xmm0 ; xmm2=(-1 0 1 2 3 4 5 6)
+ punpckhbw xmm5, xmm0 ; xmm5=( 7 8 9 10 11 12 13 14)
+ movdqa xmm6, xmm3
+ punpcklbw xmm3, xmm0 ; xmm3=( 1 2 3 4 5 6 7 8)
+ punpckhbw xmm6, xmm0 ; xmm6=( 9 10 11 12 13 14 15 16)
+
+ pmullw xmm1, [rel PW_THREE]
+ pmullw xmm4, [rel PW_THREE]
+ paddw xmm2, [rel PW_ONE]
+ paddw xmm5, [rel PW_ONE]
+ paddw xmm3, [rel PW_TWO]
+ paddw xmm6, [rel PW_TWO]
+
+ paddw xmm2, xmm1
+ paddw xmm5, xmm4
+ psrlw xmm2, 2 ; xmm2=OutLE=( 0 2 4 6 8 10 12 14)
+ psrlw xmm5, 2 ; xmm5=OutHE=(16 18 20 22 24 26 28 30)
+ paddw xmm3, xmm1
+ paddw xmm6, xmm4
+ psrlw xmm3, 2 ; xmm3=OutLO=( 1 3 5 7 9 11 13 15)
+ psrlw xmm6, 2 ; xmm6=OutHO=(17 19 21 23 25 27 29 31)
+
+ psllw xmm3, BYTE_BIT
+ psllw xmm6, BYTE_BIT
+ por xmm2, xmm3 ; xmm2=OutL=( 0 1 2 ... 13 14 15)
+ por xmm5, xmm6 ; xmm5=OutH=(16 17 18 ... 29 30 31)
+
+ movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
+ movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm5
+
+ sub rax, byte SIZEOF_XMMWORD
+ add rsi, byte 1*SIZEOF_XMMWORD ; inptr
+ add rdi, byte 2*SIZEOF_XMMWORD ; outptr
+ cmp rax, byte SIZEOF_XMMWORD
+ ja near .columnloop
+ test rax, rax
+ jnz near .columnloop_last
+
+ pop rsi
+ pop rdi
+ pop rax
+
+ add rsi, byte SIZEOF_JSAMPROW ; input_data
+ add rdi, byte SIZEOF_JSAMPROW ; output_data
+ dec rcx ; rowctr
+ jg near .rowloop
+
+.return:
+ uncollect_args 4
+ pop rbp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
+; Again a triangle filter; see comments for h2v1 case, above.
+;
+; GLOBAL(void)
+; jsimd_h2v2_fancy_upsample_sse2(int max_v_samp_factor,
+; JDIMENSION downsampled_width,
+; JSAMPARRAY input_data,
+; JSAMPARRAY *output_data_ptr);
+;
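+; As a scalar sketch, the kernel first blends each row with its neighbor
+; row into 16-bit intermediates (int[] below; the row above for outptr0,
+; the row below for outptr1), then interpolates horizontally with a final
+; descale:
+;
+;   int[i]         = 3 * row[0][i] + row[adj][i];
+;   out[2 * i]     = (3 * int[i] + int[i - 1] + 8) >> 4;
+;   out[2 * i + 1] = (3 * int[i] + int[i + 1] + 7) >> 4;
+;
+; which is why PW_THREE, PW_SEVEN, and PW_EIGHT appear in the code below.
+;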
+
+; r10 = int max_v_samp_factor
+; r11d = JDIMENSION downsampled_width
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY *output_data_ptr
+
+%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 4
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_sse2)
+
+EXTN(jsimd_h2v2_fancy_upsample_sse2):
+ push rbp
+ mov rax, rsp ; rax = original rbp
+ sub rsp, byte 4
+ and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [rsp], rax
+ mov rbp, rsp ; rbp = aligned rbp
+ lea rsp, [wk(0)]
+ collect_args 4
+ push rbx
+
+ mov eax, r11d ; colctr
+ test rax, rax
+ jz near .return
+
+ mov rcx, r10 ; rowctr
+ test rcx, rcx
+ jz near .return
+
+ mov rsi, r12 ; input_data
+ mov rdi, r13
+ mov rdip, JSAMPARRAY [rdi] ; output_data
+.rowloop:
+ push rax ; colctr
+ push rcx
+ push rdi
+ push rsi
+
+ mov rcxp, JSAMPROW [rsi-1*SIZEOF_JSAMPROW] ; inptr1(above)
+ mov rbxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; inptr0
+ mov rsip, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; inptr1(below)
+ mov rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0
+ mov rdip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1
+
+ test rax, SIZEOF_XMMWORD-1
+ jz short .skip
+ push rdx
+ mov dl, JSAMPLE [rcx+(rax-1)*SIZEOF_JSAMPLE]
+ mov JSAMPLE [rcx+rax*SIZEOF_JSAMPLE], dl
+ mov dl, JSAMPLE [rbx+(rax-1)*SIZEOF_JSAMPLE]
+ mov JSAMPLE [rbx+rax*SIZEOF_JSAMPLE], dl
+ mov dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
+ mov JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
+ pop rdx
+.skip:
+ ; -- process the first column block
+
+ movdqa xmm0, XMMWORD [rbx+0*SIZEOF_XMMWORD] ; xmm0=row[ 0][0]
+ movdqa xmm1, XMMWORD [rcx+0*SIZEOF_XMMWORD] ; xmm1=row[-1][0]
+ movdqa xmm2, XMMWORD [rsi+0*SIZEOF_XMMWORD] ; xmm2=row[+1][0]
+
+ pxor xmm3, xmm3 ; xmm3=(all 0's)
+ movdqa xmm4, xmm0
+ punpcklbw xmm0, xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7)
+ punpckhbw xmm4, xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15)
+ movdqa xmm5, xmm1
+ punpcklbw xmm1, xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7)
+ punpckhbw xmm5, xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15)
+ movdqa xmm6, xmm2
+ punpcklbw xmm2, xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7)
+ punpckhbw xmm6, xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15)
+
+ pmullw xmm0, [rel PW_THREE]
+ pmullw xmm4, [rel PW_THREE]
+
+ pcmpeqb xmm7, xmm7
+ psrldq xmm7, (SIZEOF_XMMWORD-2)
+
+ paddw xmm1, xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7)
+ paddw xmm5, xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15)
+ paddw xmm2, xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7)
+ paddw xmm6, xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15)
+
+ movdqa XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1 ; temporarily save
+ movdqa XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5 ; the intermediate data
+ movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
+ movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm6
+
+ pand xmm1, xmm7 ; xmm1=( 0 -- -- -- -- -- -- --)
+ pand xmm2, xmm7 ; xmm2=( 0 -- -- -- -- -- -- --)
+
+ movdqa XMMWORD [wk(0)], xmm1
+ movdqa XMMWORD [wk(1)], xmm2
+
+ add rax, byte SIZEOF_XMMWORD-1
+ and rax, byte -SIZEOF_XMMWORD
+ cmp rax, byte SIZEOF_XMMWORD
+ ja short .columnloop
+
+.columnloop_last:
+ ; -- process the last column block
+
+ pcmpeqb xmm1, xmm1
+ pslldq xmm1, (SIZEOF_XMMWORD-2)
+ movdqa xmm2, xmm1
+
+ pand xmm1, XMMWORD [rdx+1*SIZEOF_XMMWORD]
+ pand xmm2, XMMWORD [rdi+1*SIZEOF_XMMWORD]
+
+ movdqa XMMWORD [wk(2)], xmm1 ; xmm1=(-- -- -- -- -- -- -- 15)
+ movdqa XMMWORD [wk(3)], xmm2 ; xmm2=(-- -- -- -- -- -- -- 15)
+
+ jmp near .upsample
+
+.columnloop:
+ ; -- process the next column block
+
+ movdqa xmm0, XMMWORD [rbx+1*SIZEOF_XMMWORD] ; xmm0=row[ 0][1]
+ movdqa xmm1, XMMWORD [rcx+1*SIZEOF_XMMWORD] ; xmm1=row[-1][1]
+ movdqa xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD] ; xmm2=row[+1][1]
+
+ pxor xmm3, xmm3 ; xmm3=(all 0's)
+ movdqa xmm4, xmm0
+ punpcklbw xmm0, xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7)
+ punpckhbw xmm4, xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15)
+ movdqa xmm5, xmm1
+ punpcklbw xmm1, xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7)
+ punpckhbw xmm5, xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15)
+ movdqa xmm6, xmm2
+ punpcklbw xmm2, xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7)
+ punpckhbw xmm6, xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15)
+
+ pmullw xmm0, [rel PW_THREE]
+ pmullw xmm4, [rel PW_THREE]
+
+ paddw xmm1, xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7)
+ paddw xmm5, xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15)
+ paddw xmm2, xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7)
+ paddw xmm6, xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15)
+
+ movdqa XMMWORD [rdx+2*SIZEOF_XMMWORD], xmm1 ; temporarily save
+ movdqa XMMWORD [rdx+3*SIZEOF_XMMWORD], xmm5 ; the intermediate data
+ movdqa XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
+ movdqa XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm6
+
+ pslldq xmm1, (SIZEOF_XMMWORD-2) ; xmm1=(-- -- -- -- -- -- -- 0)
+ pslldq xmm2, (SIZEOF_XMMWORD-2) ; xmm2=(-- -- -- -- -- -- -- 0)
+
+ movdqa XMMWORD [wk(2)], xmm1
+ movdqa XMMWORD [wk(3)], xmm2
+
+.upsample:
+ ; -- process the upper row
+
+ movdqa xmm7, XMMWORD [rdx+0*SIZEOF_XMMWORD]
+ movdqa xmm3, XMMWORD [rdx+1*SIZEOF_XMMWORD]
+
+ movdqa xmm0, xmm7 ; xmm7=Int0L=( 0 1 2 3 4 5 6 7)
+ movdqa xmm4, xmm3 ; xmm3=Int0H=( 8 9 10 11 12 13 14 15)
+ psrldq xmm0, 2 ; xmm0=( 1 2 3 4 5 6 7 --)
+ pslldq xmm4, (SIZEOF_XMMWORD-2) ; xmm4=(-- -- -- -- -- -- -- 8)
+ movdqa xmm5, xmm7
+ movdqa xmm6, xmm3
+ psrldq xmm5, (SIZEOF_XMMWORD-2) ; xmm5=( 7 -- -- -- -- -- -- --)
+ pslldq xmm6, 2 ; xmm6=(-- 8 9 10 11 12 13 14)
+
+ por xmm0, xmm4 ; xmm0=( 1 2 3 4 5 6 7 8)
+ por xmm5, xmm6 ; xmm5=( 7 8 9 10 11 12 13 14)
+
+ movdqa xmm1, xmm7
+ movdqa xmm2, xmm3
+ pslldq xmm1, 2 ; xmm1=(-- 0 1 2 3 4 5 6)
+ psrldq xmm2, 2 ; xmm2=( 9 10 11 12 13 14 15 --)
+ movdqa xmm4, xmm3
+ psrldq xmm4, (SIZEOF_XMMWORD-2) ; xmm4=(15 -- -- -- -- -- -- --)
+
+ por xmm1, XMMWORD [wk(0)] ; xmm1=(-1 0 1 2 3 4 5 6)
+ por xmm2, XMMWORD [wk(2)] ; xmm2=( 9 10 11 12 13 14 15 16)
+
+ movdqa XMMWORD [wk(0)], xmm4
+
+ pmullw xmm7, [rel PW_THREE]
+ pmullw xmm3, [rel PW_THREE]
+ paddw xmm1, [rel PW_EIGHT]
+ paddw xmm5, [rel PW_EIGHT]
+ paddw xmm0, [rel PW_SEVEN]
+ paddw xmm2, [rel PW_SEVEN]
+
+ paddw xmm1, xmm7
+ paddw xmm5, xmm3
+ psrlw xmm1, 4 ; xmm1=Out0LE=( 0 2 4 6 8 10 12 14)
+ psrlw xmm5, 4 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
+ paddw xmm0, xmm7
+ paddw xmm2, xmm3
+ psrlw xmm0, 4 ; xmm0=Out0LO=( 1 3 5 7 9 11 13 15)
+ psrlw xmm2, 4 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31)
+
+ psllw xmm0, BYTE_BIT
+ psllw xmm2, BYTE_BIT
+ por xmm1, xmm0 ; xmm1=Out0L=( 0 1 2 ... 13 14 15)
+ por xmm5, xmm2 ; xmm5=Out0H=(16 17 18 ... 29 30 31)
+
+ movdqa XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1
+ movdqa XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5
+
+ ; -- process the lower row
+
+ movdqa xmm6, XMMWORD [rdi+0*SIZEOF_XMMWORD]
+ movdqa xmm4, XMMWORD [rdi+1*SIZEOF_XMMWORD]
+
+ movdqa xmm7, xmm6 ; xmm6=Int1L=( 0 1 2 3 4 5 6 7)
+ movdqa xmm3, xmm4 ; xmm4=Int1H=( 8 9 10 11 12 13 14 15)
+ psrldq xmm7, 2 ; xmm7=( 1 2 3 4 5 6 7 --)
+ pslldq xmm3, (SIZEOF_XMMWORD-2) ; xmm3=(-- -- -- -- -- -- -- 8)
+ movdqa xmm0, xmm6
+ movdqa xmm2, xmm4
+ psrldq xmm0, (SIZEOF_XMMWORD-2) ; xmm0=( 7 -- -- -- -- -- -- --)
+ pslldq xmm2, 2 ; xmm2=(-- 8 9 10 11 12 13 14)
+
+ por xmm7, xmm3 ; xmm7=( 1 2 3 4 5 6 7 8)
+ por xmm0, xmm2 ; xmm0=( 7 8 9 10 11 12 13 14)
+
+ movdqa xmm1, xmm6
+ movdqa xmm5, xmm4
+ pslldq xmm1, 2 ; xmm1=(-- 0 1 2 3 4 5 6)
+ psrldq xmm5, 2 ; xmm5=( 9 10 11 12 13 14 15 --)
+ movdqa xmm3, xmm4
+ psrldq xmm3, (SIZEOF_XMMWORD-2) ; xmm3=(15 -- -- -- -- -- -- --)
+
+ por xmm1, XMMWORD [wk(1)] ; xmm1=(-1 0 1 2 3 4 5 6)
+ por xmm5, XMMWORD [wk(3)] ; xmm5=( 9 10 11 12 13 14 15 16)
+
+ movdqa XMMWORD [wk(1)], xmm3
+
+ pmullw xmm6, [rel PW_THREE]
+ pmullw xmm4, [rel PW_THREE]
+ paddw xmm1, [rel PW_EIGHT]
+ paddw xmm0, [rel PW_EIGHT]
+ paddw xmm7, [rel PW_SEVEN]
+ paddw xmm5, [rel PW_SEVEN]
+
+ paddw xmm1, xmm6
+ paddw xmm0, xmm4
+ psrlw xmm1, 4 ; xmm1=Out1LE=( 0 2 4 6 8 10 12 14)
+ psrlw xmm0, 4 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
+ paddw xmm7, xmm6
+ paddw xmm5, xmm4
+ psrlw xmm7, 4 ; xmm7=Out1LO=( 1 3 5 7 9 11 13 15)
+ psrlw xmm5, 4 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31)
+
+ psllw xmm7, BYTE_BIT
+ psllw xmm5, BYTE_BIT
+ por xmm1, xmm7 ; xmm1=Out1L=( 0 1 2 ... 13 14 15)
+ por xmm0, xmm5 ; xmm0=Out1H=(16 17 18 ... 29 30 31)
+
+ movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm1
+ movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm0
+
+ sub rax, byte SIZEOF_XMMWORD
+ add rcx, byte 1*SIZEOF_XMMWORD ; inptr1(above)
+ add rbx, byte 1*SIZEOF_XMMWORD ; inptr0
+ add rsi, byte 1*SIZEOF_XMMWORD ; inptr1(below)
+ add rdx, byte 2*SIZEOF_XMMWORD ; outptr0
+ add rdi, byte 2*SIZEOF_XMMWORD ; outptr1
+ cmp rax, byte SIZEOF_XMMWORD
+ ja near .columnloop
+ test rax, rax
+ jnz near .columnloop_last
+
+ pop rsi
+ pop rdi
+ pop rcx
+ pop rax
+
+ add rsi, byte 1*SIZEOF_JSAMPROW ; input_data
+ add rdi, byte 2*SIZEOF_JSAMPROW ; output_data
+ sub rcx, byte 2 ; rowctr
+ jg near .rowloop
+
+.return:
+ pop rbx
+ uncollect_args 4
+ mov rsp, rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- original rbp
+ pop rbp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v1_upsample_sse2(int max_v_samp_factor, JDIMENSION output_width,
+; JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+;
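+; Scalar sketch, as in the AVX2 version: out[2 * i] = out[2 * i + 1] =
+; in[i], processed here with SSE2 unpacks 16 input bytes at a time.
+;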
+
+; r10 = int max_v_samp_factor
+; r11d = JDIMENSION output_width
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY *output_data_ptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v1_upsample_sse2)
+
+EXTN(jsimd_h2v1_upsample_sse2):
+ push rbp
+ mov rax, rsp
+ mov rbp, rsp
+ collect_args 4
+
+ mov edx, r11d
+ add rdx, byte (2*SIZEOF_XMMWORD)-1
+ and rdx, byte -(2*SIZEOF_XMMWORD)
+ jz near .return
+
+ mov rcx, r10 ; rowctr
+ test rcx, rcx
+ jz short .return
+
+ mov rsi, r12 ; input_data
+ mov rdi, r13
+ mov rdip, JSAMPARRAY [rdi] ; output_data
+.rowloop:
+ push rdi
+ push rsi
+
+ mov rsip, JSAMPROW [rsi] ; inptr
+ mov rdip, JSAMPROW [rdi] ; outptr
+ mov rax, rdx ; colctr
+.columnloop:
+
+ movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm0
+ punpckhbw xmm1, xmm1
+
+ movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
+ movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
+
+ sub rax, byte 2*SIZEOF_XMMWORD
+ jz short .nextrow
+
+ movdqa xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+
+ movdqa xmm3, xmm2
+ punpcklbw xmm2, xmm2
+ punpckhbw xmm3, xmm3
+
+ movdqa XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
+ movdqa XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3
+
+ sub rax, byte 2*SIZEOF_XMMWORD
+ jz short .nextrow
+
+ add rsi, byte 2*SIZEOF_XMMWORD ; inptr
+ add rdi, byte 4*SIZEOF_XMMWORD ; outptr
+ jmp short .columnloop
+
+.nextrow:
+ pop rsi
+ pop rdi
+
+ add rsi, byte SIZEOF_JSAMPROW ; input_data
+ add rdi, byte SIZEOF_JSAMPROW ; output_data
+ dec rcx ; rowctr
+ jg short .rowloop
+
+.return:
+ uncollect_args 4
+ pop rbp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v2_upsample_sse2(int max_v_samp_factor, JDIMENSION output_width,
+; JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+;
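+; Scalar sketch: identical replication, written to two output rows at once
+; (outptr0 and outptr1 receive the same data).
+;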
+
+; r10 = int max_v_samp_factor
+; r11d = JDIMENSION output_width
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY *output_data_ptr
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_h2v2_upsample_sse2)
+
+EXTN(jsimd_h2v2_upsample_sse2):
+ push rbp
+ mov rax, rsp
+ mov rbp, rsp
+ collect_args 4
+ push rbx
+
+ mov edx, r11d
+ add rdx, byte (2*SIZEOF_XMMWORD)-1
+ and rdx, byte -(2*SIZEOF_XMMWORD)
+ jz near .return
+
+ mov rcx, r10 ; rowctr
+ test rcx, rcx
+ jz near .return
+
+ mov rsi, r12 ; input_data
+ mov rdi, r13
+ mov rdip, JSAMPARRAY [rdi] ; output_data
+.rowloop:
+ push rdi
+ push rsi
+
+ mov rsip, JSAMPROW [rsi] ; inptr
+ mov rbxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0
+ mov rdip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1
+ mov rax, rdx ; colctr
+.columnloop:
+
+ movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm0
+ punpckhbw xmm1, xmm1
+
+ movdqa XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0
+ movdqa XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1
+ movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
+ movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
+
+ sub rax, byte 2*SIZEOF_XMMWORD
+ jz short .nextrow
+
+ movdqa xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+
+ movdqa xmm3, xmm2
+ punpcklbw xmm2, xmm2
+ punpckhbw xmm3, xmm3
+
+ movdqa XMMWORD [rbx+2*SIZEOF_XMMWORD], xmm2
+ movdqa XMMWORD [rbx+3*SIZEOF_XMMWORD], xmm3
+ movdqa XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
+ movdqa XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3
+
+ sub rax, byte 2*SIZEOF_XMMWORD
+ jz short .nextrow
+
+ add rsi, byte 2*SIZEOF_XMMWORD ; inptr
+ add rbx, byte 4*SIZEOF_XMMWORD ; outptr0
+ add rdi, byte 4*SIZEOF_XMMWORD ; outptr1
+ jmp short .columnloop
+
+.nextrow:
+ pop rsi
+ pop rdi
+
+ add rsi, byte 1*SIZEOF_JSAMPROW ; input_data
+ add rdi, byte 2*SIZEOF_JSAMPROW ; output_data
+ sub rcx, byte 2 ; rowctr
+ jg near .rowloop
+
+.return:
+ pop rbx
+ uncollect_args 4
+ pop rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jfdctflt-sse.asm b/media/libjpeg/simd/x86_64/jfdctflt-sse.asm
new file mode 100644
index 0000000000..ef2796649b
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jfdctflt-sse.asm
@@ -0,0 +1,355 @@
+;
+; jfdctflt.asm - floating-point FDCT (64-bit SSE)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it can
+; *not* be assembled with Microsoft's MASM or any compatible assembler
+; (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the forward DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jfdctflt.c; see that file for more details.
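+;
+; For reference (not from the original source): the four constants defined
+; below come from the Arai/Agui/Nakajima (AAN) factorization of the
+; 8-point DCT -- 0.707107 ~= cos(pi/4), 0.382683 ~= sin(pi/8),
+; 0.541196 ~= sqrt(2) * sin(pi/8), and 1.306563 ~= sqrt(2) * cos(pi/8).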
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
+ shufps %1, %2, 0x44
+%endmacro
+
+%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
+ shufps %1, %2, 0xEE
+%endmacro
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_fdct_float_sse)
+
+EXTN(jconst_fdct_float_sse):
+
+PD_0_382 times 4 dd 0.382683432365089771728460
+PD_0_707 times 4 dd 0.707106781186547524400844
+PD_0_541 times 4 dd 0.541196100146196984399723
+PD_1_306 times 4 dd 1.306562964876376527856643
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_float_sse(FAST_FLOAT *data)
+;
+
+; r10 = FAST_FLOAT *data
+
+%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 2
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_fdct_float_sse)
+
+EXTN(jsimd_fdct_float_sse):
+ push rbp
+ mov rax, rsp ; rax = original rbp
+ sub rsp, byte 4
+ and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [rsp], rax
+ mov rbp, rsp ; rbp = aligned rbp
+ lea rsp, [wk(0)]
+ collect_args 1
+
+ ; ---- Pass 1: process rows.
+
+ mov rdx, r10 ; (FAST_FLOAT *)
+ mov rcx, DCTSIZE/4
+.rowloop:
+
+ movaps xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)]
+ movaps xmm2, XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)]
+
+ ; xmm0=(20 21 22 23), xmm2=(24 25 26 27)
+ ; xmm1=(30 31 32 33), xmm3=(34 35 36 37)
+
+ movaps xmm4, xmm0 ; transpose coefficients(phase 1)
+ unpcklps xmm0, xmm1 ; xmm0=(20 30 21 31)
+ unpckhps xmm4, xmm1 ; xmm4=(22 32 23 33)
+ movaps xmm5, xmm2 ; transpose coefficients(phase 1)
+ unpcklps xmm2, xmm3 ; xmm2=(24 34 25 35)
+ unpckhps xmm5, xmm3 ; xmm5=(26 36 27 37)
+
+ movaps xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
+ movaps xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)]
+
+ ; xmm6=(00 01 02 03), xmm1=(04 05 06 07)
+ ; xmm7=(10 11 12 13), xmm3=(14 15 16 17)
+
+ movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 32 23 33)
+ movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(24 34 25 35)
+
+ movaps xmm4, xmm6 ; transpose coefficients(phase 1)
+ unpcklps xmm6, xmm7 ; xmm6=(00 10 01 11)
+ unpckhps xmm4, xmm7 ; xmm4=(02 12 03 13)
+ movaps xmm2, xmm1 ; transpose coefficients(phase 1)
+ unpcklps xmm1, xmm3 ; xmm1=(04 14 05 15)
+ unpckhps xmm2, xmm3 ; xmm2=(06 16 07 17)
+
+ movaps xmm7, xmm6 ; transpose coefficients(phase 2)
+ unpcklps2 xmm6, xmm0 ; xmm6=(00 10 20 30)=data0
+ unpckhps2 xmm7, xmm0 ; xmm7=(01 11 21 31)=data1
+ movaps xmm3, xmm2 ; transpose coefficients(phase 2)
+ unpcklps2 xmm2, xmm5 ; xmm2=(06 16 26 36)=data6
+ unpckhps2 xmm3, xmm5 ; xmm3=(07 17 27 37)=data7
+
+ movaps xmm0, xmm7
+ movaps xmm5, xmm6
+ subps xmm7, xmm2 ; xmm7=data1-data6=tmp6
+ subps xmm6, xmm3 ; xmm6=data0-data7=tmp7
+ addps xmm0, xmm2 ; xmm0=data1+data6=tmp1
+ addps xmm5, xmm3 ; xmm5=data0+data7=tmp0
+
+ movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 32 23 33)
+ movaps xmm3, XMMWORD [wk(1)] ; xmm3=(24 34 25 35)
+ movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6
+ movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
+
+ movaps xmm7, xmm4 ; transpose coefficients(phase 2)
+ unpcklps2 xmm4, xmm2 ; xmm4=(02 12 22 32)=data2
+ unpckhps2 xmm7, xmm2 ; xmm7=(03 13 23 33)=data3
+ movaps xmm6, xmm1 ; transpose coefficients(phase 2)
+ unpcklps2 xmm1, xmm3 ; xmm1=(04 14 24 34)=data4
+ unpckhps2 xmm6, xmm3 ; xmm6=(05 15 25 35)=data5
+
+ movaps xmm2, xmm7
+ movaps xmm3, xmm4
+ addps xmm7, xmm1 ; xmm7=data3+data4=tmp3
+ addps xmm4, xmm6 ; xmm4=data2+data5=tmp2
+ subps xmm2, xmm1 ; xmm2=data3-data4=tmp4
+ subps xmm3, xmm6 ; xmm3=data2-data5=tmp5
+
+ ; -- Even part
+
+ movaps xmm1, xmm5
+ movaps xmm6, xmm0
+ subps xmm5, xmm7 ; xmm5=tmp13
+ subps xmm0, xmm4 ; xmm0=tmp12
+ addps xmm1, xmm7 ; xmm1=tmp10
+ addps xmm6, xmm4 ; xmm6=tmp11
+
+ addps xmm0, xmm5
+ mulps xmm0, [rel PD_0_707] ; xmm0=z1
+
+ movaps xmm7, xmm1
+ movaps xmm4, xmm5
+ subps xmm1, xmm6 ; xmm1=data4
+ subps xmm5, xmm0 ; xmm5=data6
+ addps xmm7, xmm6 ; xmm7=data0
+ addps xmm4, xmm0 ; xmm4=data2
+
+ movaps XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)], xmm1
+ movaps XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)], xmm5
+ movaps XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
+ movaps XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
+
+ ; -- Odd part
+
+ movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6
+ movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7
+
+ addps xmm2, xmm3 ; xmm2=tmp10
+ addps xmm3, xmm6 ; xmm3=tmp11
+ addps xmm6, xmm0 ; xmm6=tmp12, xmm0=tmp7
+
+ mulps xmm3, [rel PD_0_707] ; xmm3=z3
+
+ movaps xmm1, xmm2 ; xmm1=tmp10
+ subps xmm2, xmm6
+ mulps xmm2, [rel PD_0_382] ; xmm2=z5
+ mulps xmm1, [rel PD_0_541] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
+ mulps xmm6, [rel PD_1_306] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
+ addps xmm1, xmm2 ; xmm1=z2
+ addps xmm6, xmm2 ; xmm6=z4
+
+ movaps xmm5, xmm0
+ subps xmm0, xmm3 ; xmm0=z13
+ addps xmm5, xmm3 ; xmm5=z11
+
+ movaps xmm7, xmm0
+ movaps xmm4, xmm5
+ subps xmm0, xmm1 ; xmm0=data3
+ subps xmm5, xmm6 ; xmm5=data7
+ addps xmm7, xmm1 ; xmm7=data5
+ addps xmm4, xmm6 ; xmm4=data1
+
+ movaps XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0
+ movaps XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)], xmm5
+ movaps XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)], xmm7
+ movaps XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
+
+ add rdx, 4*DCTSIZE*SIZEOF_FAST_FLOAT
+ dec rcx
+ jnz near .rowloop
+
+ ; ---- Pass 2: process columns.
+
+ mov rdx, r10 ; (FAST_FLOAT *)
+ mov rcx, DCTSIZE/4
+.columnloop:
+
+ movaps xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)]
+ movaps xmm2, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)]
+
+ ; xmm0=(02 12 22 32), xmm2=(42 52 62 72)
+ ; xmm1=(03 13 23 33), xmm3=(43 53 63 73)
+
+ movaps xmm4, xmm0 ; transpose coefficients(phase 1)
+ unpcklps xmm0, xmm1 ; xmm0=(02 03 12 13)
+ unpckhps xmm4, xmm1 ; xmm4=(22 23 32 33)
+ movaps xmm5, xmm2 ; transpose coefficients(phase 1)
+ unpcklps xmm2, xmm3 ; xmm2=(42 43 52 53)
+ unpckhps xmm5, xmm3 ; xmm5=(62 63 72 73)
+
+ movaps xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
+ movaps xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)]
+
+ ; xmm6=(00 10 20 30), xmm1=(40 50 60 70)
+ ; xmm7=(01 11 21 31), xmm3=(41 51 61 71)
+
+ movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 23 32 33)
+ movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(42 43 52 53)
+
+ movaps xmm4, xmm6 ; transpose coefficients(phase 1)
+ unpcklps xmm6, xmm7 ; xmm6=(00 01 10 11)
+ unpckhps xmm4, xmm7 ; xmm4=(20 21 30 31)
+ movaps xmm2, xmm1 ; transpose coefficients(phase 1)
+ unpcklps xmm1, xmm3 ; xmm1=(40 41 50 51)
+ unpckhps xmm2, xmm3 ; xmm2=(60 61 70 71)
+
+ movaps xmm7, xmm6 ; transpose coefficients(phase 2)
+ unpcklps2 xmm6, xmm0 ; xmm6=(00 01 02 03)=data0
+ unpckhps2 xmm7, xmm0 ; xmm7=(10 11 12 13)=data1
+ movaps xmm3, xmm2 ; transpose coefficients(phase 2)
+ unpcklps2 xmm2, xmm5 ; xmm2=(60 61 62 63)=data6
+ unpckhps2 xmm3, xmm5 ; xmm3=(70 71 72 73)=data7
+
+ movaps xmm0, xmm7
+ movaps xmm5, xmm6
+ subps xmm7, xmm2 ; xmm7=data1-data6=tmp6
+ subps xmm6, xmm3 ; xmm6=data0-data7=tmp7
+ addps xmm0, xmm2 ; xmm0=data1+data6=tmp1
+ addps xmm5, xmm3 ; xmm5=data0+data7=tmp0
+
+ movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 23 32 33)
+ movaps xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53)
+ movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6
+ movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
+
+ movaps xmm7, xmm4 ; transpose coefficients(phase 2)
+ unpcklps2 xmm4, xmm2 ; xmm4=(20 21 22 23)=data2
+ unpckhps2 xmm7, xmm2 ; xmm7=(30 31 32 33)=data3
+ movaps xmm6, xmm1 ; transpose coefficients(phase 2)
+ unpcklps2 xmm1, xmm3 ; xmm1=(40 41 42 43)=data4
+ unpckhps2 xmm6, xmm3 ; xmm6=(50 51 52 53)=data5
+
+ movaps xmm2, xmm7
+ movaps xmm3, xmm4
+ addps xmm7, xmm1 ; xmm7=data3+data4=tmp3
+ addps xmm4, xmm6 ; xmm4=data2+data5=tmp2
+ subps xmm2, xmm1 ; xmm2=data3-data4=tmp4
+ subps xmm3, xmm6 ; xmm3=data2-data5=tmp5
+
+ ; -- Even part
+
+ movaps xmm1, xmm5
+ movaps xmm6, xmm0
+ subps xmm5, xmm7 ; xmm5=tmp13
+ subps xmm0, xmm4 ; xmm0=tmp12
+ addps xmm1, xmm7 ; xmm1=tmp10
+ addps xmm6, xmm4 ; xmm6=tmp11
+
+ addps xmm0, xmm5
+ mulps xmm0, [rel PD_0_707] ; xmm0=z1
+
+ movaps xmm7, xmm1
+ movaps xmm4, xmm5
+ subps xmm1, xmm6 ; xmm1=data4
+ subps xmm5, xmm0 ; xmm5=data6
+ addps xmm7, xmm6 ; xmm7=data0
+ addps xmm4, xmm0 ; xmm4=data2
+
+ movaps XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)], xmm1
+ movaps XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)], xmm5
+ movaps XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
+ movaps XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
+
+ ; -- Odd part
+
+ movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6
+ movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7
+
+ addps xmm2, xmm3 ; xmm2=tmp10
+ addps xmm3, xmm6 ; xmm3=tmp11
+ addps xmm6, xmm0 ; xmm6=tmp12, xmm0=tmp7
+
+ mulps xmm3, [rel PD_0_707] ; xmm3=z3
+
+ movaps xmm1, xmm2 ; xmm1=tmp10
+ subps xmm2, xmm6
+ mulps xmm2, [rel PD_0_382] ; xmm2=z5
+ mulps xmm1, [rel PD_0_541] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
+ mulps xmm6, [rel PD_1_306] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
+ addps xmm1, xmm2 ; xmm1=z2
+ addps xmm6, xmm2 ; xmm6=z4
+
+ movaps xmm5, xmm0
+ subps xmm0, xmm3 ; xmm0=z13
+ addps xmm5, xmm3 ; xmm5=z11
+
+ movaps xmm7, xmm0
+ movaps xmm4, xmm5
+ subps xmm0, xmm1 ; xmm0=data3
+ subps xmm5, xmm6 ; xmm5=data7
+ addps xmm7, xmm1 ; xmm7=data5
+ addps xmm4, xmm6 ; xmm4=data1
+
+ movaps XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0
+ movaps XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)], xmm5
+ movaps XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
+ movaps XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
+
+ add rdx, byte 4*SIZEOF_FAST_FLOAT
+ dec rcx
+ jnz near .columnloop
+
+ uncollect_args 1
+ mov rsp, rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- original rbp
+ pop rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jfdctfst-sse2.asm b/media/libjpeg/simd/x86_64/jfdctfst-sse2.asm
new file mode 100644
index 0000000000..2e1bfe6e8c
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jfdctfst-sse2.asm
@@ -0,0 +1,389 @@
+;
+; jfdctfst.asm - fast integer FDCT (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it can
+; *not* be assembled with Microsoft's MASM or any compatible assembler
+; (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast but less accurate integer implementation of
+; the forward DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jfdctfst.c; see that file for
+; more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 8 ; 14 is also OK.
+
+%if CONST_BITS == 8
+F_0_382 equ 98 ; FIX(0.382683433)
+F_0_541 equ 139 ; FIX(0.541196100)
+F_0_707 equ 181 ; FIX(0.707106781)
+F_1_306 equ 334 ; FIX(1.306562965)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
+F_0_382 equ DESCALE( 410903207, 30 - CONST_BITS) ; FIX(0.382683433)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100)
+F_0_707 equ DESCALE( 759250124, 30 - CONST_BITS) ; FIX(0.707106781)
+F_1_306 equ DESCALE(1402911301, 30 - CONST_BITS) ; FIX(1.306562965)
+%endif
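+
+; As a worked example of the FIX() encoding above (with CONST_BITS == 8):
+; FIX(0.541196100) = round(0.541196100 * 2^8) = round(138.55) = 139,
+; which is the F_0_541 value above.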
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
+
+%define PRE_MULTIPLY_SCALE_BITS 2
+%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
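+
+; A worked example of how these factors combine (a sketch, with the
+; default CONST_BITS == 8): F_0_707 = 181 and CONST_SHIFT == 6, so the
+; PW_F0707 constant below is 181 << 6 = 11584.  Inputs are pre-shifted
+; left by 2 bits, and pmulhw keeps the high 16 bits of the 32-bit
+; product, so
+;
+;   ((x << 2) * (181 << 6)) >> 16  =  (x * 181) >> 8  ~=  x * 0.707
+;
+; i.e. 2 + 8 + 6 == 16, matching the constraint stated above.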
+
+ alignz 32
+ GLOBAL_DATA(jconst_fdct_ifast_sse2)
+
+EXTN(jconst_fdct_ifast_sse2):
+
+PW_F0707 times 8 dw F_0_707 << CONST_SHIFT
+PW_F0382 times 8 dw F_0_382 << CONST_SHIFT
+PW_F0541 times 8 dw F_0_541 << CONST_SHIFT
+PW_F1306 times 8 dw F_1_306 << CONST_SHIFT
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_ifast_sse2(DCTELEM *data)
+;
+
+; r10 = DCTELEM *data
+
+%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 2
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_fdct_ifast_sse2)
+
+EXTN(jsimd_fdct_ifast_sse2):
+ push rbp
+ mov rax, rsp ; rax = original rbp
+ sub rsp, byte 4
+ and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [rsp], rax
+ mov rbp, rsp ; rbp = aligned rbp
+ lea rsp, [wk(0)]
+ collect_args 1
+
+ ; ---- Pass 1: process rows.
+
+ mov rdx, r10 ; (DCTELEM *)
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)]
+ movdqa xmm2, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)]
+
+ ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
+ ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
+
+ movdqa xmm4, xmm0 ; transpose coefficients(phase 1)
+ punpcklwd xmm0, xmm1 ; xmm0=(00 10 01 11 02 12 03 13)
+ punpckhwd xmm4, xmm1 ; xmm4=(04 14 05 15 06 16 07 17)
+ movdqa xmm5, xmm2 ; transpose coefficients(phase 1)
+ punpcklwd xmm2, xmm3 ; xmm2=(20 30 21 31 22 32 23 33)
+ punpckhwd xmm5, xmm3 ; xmm5=(24 34 25 35 26 36 27 37)
+
+ movdqa xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)]
+ movdqa xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)]
+
+ ; xmm6=(40 41 42 43 44 45 46 47), xmm1=(60 61 62 63 64 65 66 67)
+ ; xmm7=(50 51 52 53 54 55 56 57), xmm3=(70 71 72 73 74 75 76 77)
+
+ movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33)
+ movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37)
+
+ movdqa xmm2, xmm6 ; transpose coefficients(phase 1)
+ punpcklwd xmm6, xmm7 ; xmm6=(40 50 41 51 42 52 43 53)
+ punpckhwd xmm2, xmm7 ; xmm2=(44 54 45 55 46 56 47 57)
+ movdqa xmm5, xmm1 ; transpose coefficients(phase 1)
+ punpcklwd xmm1, xmm3 ; xmm1=(60 70 61 71 62 72 63 73)
+ punpckhwd xmm5, xmm3 ; xmm5=(64 74 65 75 66 76 67 77)
+
+ movdqa xmm7, xmm6 ; transpose coefficients(phase 2)
+ punpckldq xmm6, xmm1 ; xmm6=(40 50 60 70 41 51 61 71)
+ punpckhdq xmm7, xmm1 ; xmm7=(42 52 62 72 43 53 63 73)
+ movdqa xmm3, xmm2 ; transpose coefficients(phase 2)
+ punpckldq xmm2, xmm5 ; xmm2=(44 54 64 74 45 55 65 75)
+ punpckhdq xmm3, xmm5 ; xmm3=(46 56 66 76 47 57 67 77)
+
+ movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33)
+ movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37)
+ movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(42 52 62 72 43 53 63 73)
+ movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=(44 54 64 74 45 55 65 75)
+
+ movdqa xmm7, xmm0 ; transpose coefficients(phase 2)
+ punpckldq xmm0, xmm1 ; xmm0=(00 10 20 30 01 11 21 31)
+ punpckhdq xmm7, xmm1 ; xmm7=(02 12 22 32 03 13 23 33)
+ movdqa xmm2, xmm4 ; transpose coefficients(phase 2)
+ punpckldq xmm4, xmm5 ; xmm4=(04 14 24 34 05 15 25 35)
+ punpckhdq xmm2, xmm5 ; xmm2=(06 16 26 36 07 17 27 37)
+
+ movdqa xmm1, xmm0 ; transpose coefficients(phase 3)
+ punpcklqdq xmm0, xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0
+ punpckhqdq xmm1, xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1
+ movdqa xmm5, xmm2 ; transpose coefficients(phase 3)
+ punpcklqdq xmm2, xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6
+ punpckhqdq xmm5, xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7
+
+ movdqa xmm6, xmm1
+ movdqa xmm3, xmm0
+ psubw xmm1, xmm2 ; xmm1=data1-data6=tmp6
+ psubw xmm0, xmm5 ; xmm0=data0-data7=tmp7
+ paddw xmm6, xmm2 ; xmm6=data1+data6=tmp1
+ paddw xmm3, xmm5 ; xmm3=data0+data7=tmp0
+
+ movdqa xmm2, XMMWORD [wk(0)] ; xmm2=(42 52 62 72 43 53 63 73)
+ movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(44 54 64 74 45 55 65 75)
+ movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6
+ movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7
+
+ movdqa xmm1, xmm7 ; transpose coefficients(phase 3)
+ punpcklqdq xmm7, xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2
+ punpckhqdq xmm1, xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3
+ movdqa xmm0, xmm4 ; transpose coefficients(phase 3)
+ punpcklqdq xmm4, xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4
+ punpckhqdq xmm0, xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5
+
+ movdqa xmm2, xmm1
+ movdqa xmm5, xmm7
+ paddw xmm1, xmm4 ; xmm1=data3+data4=tmp3
+ paddw xmm7, xmm0 ; xmm7=data2+data5=tmp2
+ psubw xmm2, xmm4 ; xmm2=data3-data4=tmp4
+ psubw xmm5, xmm0 ; xmm5=data2-data5=tmp5
+
+ ; -- Even part
+
+ movdqa xmm4, xmm3
+ movdqa xmm0, xmm6
+ psubw xmm3, xmm1 ; xmm3=tmp13
+ psubw xmm6, xmm7 ; xmm6=tmp12
+ paddw xmm4, xmm1 ; xmm4=tmp10
+ paddw xmm0, xmm7 ; xmm0=tmp11
+
+ paddw xmm6, xmm3
+ psllw xmm6, PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm6, [rel PW_F0707] ; xmm6=z1
+
+ movdqa xmm1, xmm4
+ movdqa xmm7, xmm3
+ psubw xmm4, xmm0 ; xmm4=data4
+ psubw xmm3, xmm6 ; xmm3=data6
+ paddw xmm1, xmm0 ; xmm1=data0
+ paddw xmm7, xmm6 ; xmm7=data2
+
+ movdqa xmm0, XMMWORD [wk(0)] ; xmm0=tmp6
+ movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp7
+ movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=data4
+ movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=data6
+
+ ; -- Odd part
+
+ paddw xmm2, xmm5 ; xmm2=tmp10
+ paddw xmm5, xmm0 ; xmm5=tmp11
+ paddw xmm0, xmm6 ; xmm0=tmp12, xmm6=tmp7
+
+ psllw xmm2, PRE_MULTIPLY_SCALE_BITS
+ psllw xmm0, PRE_MULTIPLY_SCALE_BITS
+
+ psllw xmm5, PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm5, [rel PW_F0707] ; xmm5=z3
+
+ movdqa xmm4, xmm2 ; xmm4=tmp10
+ psubw xmm2, xmm0
+ pmulhw xmm2, [rel PW_F0382] ; xmm2=z5
+ pmulhw xmm4, [rel PW_F0541] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
+ pmulhw xmm0, [rel PW_F1306] ; xmm0=MULTIPLY(tmp12,FIX_1_306562)
+ paddw xmm4, xmm2 ; xmm4=z2
+ paddw xmm0, xmm2 ; xmm0=z4
+
+ movdqa xmm3, xmm6
+ psubw xmm6, xmm5 ; xmm6=z13
+ paddw xmm3, xmm5 ; xmm3=z11
+
+ movdqa xmm2, xmm6
+ movdqa xmm5, xmm3
+ psubw xmm6, xmm4 ; xmm6=data3
+ psubw xmm3, xmm0 ; xmm3=data7
+ paddw xmm2, xmm4 ; xmm2=data5
+ paddw xmm5, xmm0 ; xmm5=data1
+
+ ; ---- Pass 2: process columns.
+
+ ; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72)
+ ; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73)
+
+ movdqa xmm4, xmm1 ; transpose coefficients(phase 1)
+ punpcklwd xmm1, xmm5 ; xmm1=(00 01 10 11 20 21 30 31)
+ punpckhwd xmm4, xmm5 ; xmm4=(40 41 50 51 60 61 70 71)
+ movdqa xmm0, xmm7 ; transpose coefficients(phase 1)
+ punpcklwd xmm7, xmm6 ; xmm7=(02 03 12 13 22 23 32 33)
+ punpckhwd xmm0, xmm6 ; xmm0=(42 43 52 53 62 63 72 73)
+
+ movdqa xmm5, XMMWORD [wk(0)] ; xmm5=col4
+ movdqa xmm6, XMMWORD [wk(1)] ; xmm6=col6
+
+ ; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76)
+ ; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77)
+
+ movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(02 03 12 13 22 23 32 33)
+ movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(42 43 52 53 62 63 72 73)
+
+ movdqa xmm7, xmm5 ; transpose coefficients(phase 1)
+ punpcklwd xmm5, xmm2 ; xmm5=(04 05 14 15 24 25 34 35)
+ punpckhwd xmm7, xmm2 ; xmm7=(44 45 54 55 64 65 74 75)
+ movdqa xmm0, xmm6 ; transpose coefficients(phase 1)
+ punpcklwd xmm6, xmm3 ; xmm6=(06 07 16 17 26 27 36 37)
+ punpckhwd xmm0, xmm3 ; xmm0=(46 47 56 57 66 67 76 77)
+
+ movdqa xmm2, xmm5 ; transpose coefficients(phase 2)
+ punpckldq xmm5, xmm6 ; xmm5=(04 05 06 07 14 15 16 17)
+ punpckhdq xmm2, xmm6 ; xmm2=(24 25 26 27 34 35 36 37)
+ movdqa xmm3, xmm7 ; transpose coefficients(phase 2)
+ punpckldq xmm7, xmm0 ; xmm7=(44 45 46 47 54 55 56 57)
+ punpckhdq xmm3, xmm0 ; xmm3=(64 65 66 67 74 75 76 77)
+
+ movdqa xmm6, XMMWORD [wk(0)] ; xmm6=(02 03 12 13 22 23 32 33)
+ movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(42 43 52 53 62 63 72 73)
+ movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(24 25 26 27 34 35 36 37)
+ movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(44 45 46 47 54 55 56 57)
+
+ movdqa xmm2, xmm1 ; transpose coefficients(phase 2)
+ punpckldq xmm1, xmm6 ; xmm1=(00 01 02 03 10 11 12 13)
+ punpckhdq xmm2, xmm6 ; xmm2=(20 21 22 23 30 31 32 33)
+ movdqa xmm7, xmm4 ; transpose coefficients(phase 2)
+ punpckldq xmm4, xmm0 ; xmm4=(40 41 42 43 50 51 52 53)
+ punpckhdq xmm7, xmm0 ; xmm7=(60 61 62 63 70 71 72 73)
+
+ movdqa xmm6, xmm1 ; transpose coefficients(phase 3)
+ punpcklqdq xmm1, xmm5 ; xmm1=(00 01 02 03 04 05 06 07)=data0
+ punpckhqdq xmm6, xmm5 ; xmm6=(10 11 12 13 14 15 16 17)=data1
+ movdqa xmm0, xmm7 ; transpose coefficients(phase 3)
+ punpcklqdq xmm7, xmm3 ; xmm7=(60 61 62 63 64 65 66 67)=data6
+ punpckhqdq xmm0, xmm3 ; xmm0=(70 71 72 73 74 75 76 77)=data7
+
+ movdqa xmm5, xmm6
+ movdqa xmm3, xmm1
+ psubw xmm6, xmm7 ; xmm6=data1-data6=tmp6
+ psubw xmm1, xmm0 ; xmm1=data0-data7=tmp7
+ paddw xmm5, xmm7 ; xmm5=data1+data6=tmp1
+ paddw xmm3, xmm0 ; xmm3=data0+data7=tmp0
+
+ movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(24 25 26 27 34 35 36 37)
+ movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(44 45 46 47 54 55 56 57)
+ movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=tmp6
+ movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=tmp7
+
+ movdqa xmm6, xmm2 ; transpose coefficients(phase 3)
+ punpcklqdq xmm2, xmm7 ; xmm2=(20 21 22 23 24 25 26 27)=data2
+ punpckhqdq xmm6, xmm7 ; xmm6=(30 31 32 33 34 35 36 37)=data3
+ movdqa xmm1, xmm4 ; transpose coefficients(phase 3)
+ punpcklqdq xmm4, xmm0 ; xmm4=(40 41 42 43 44 45 46 47)=data4
+ punpckhqdq xmm1, xmm0 ; xmm1=(50 51 52 53 54 55 56 57)=data5
+
+ movdqa xmm7, xmm6
+ movdqa xmm0, xmm2
+ paddw xmm6, xmm4 ; xmm6=data3+data4=tmp3
+ paddw xmm2, xmm1 ; xmm2=data2+data5=tmp2
+ psubw xmm7, xmm4 ; xmm7=data3-data4=tmp4
+ psubw xmm0, xmm1 ; xmm0=data2-data5=tmp5
+
+ ; -- Even part
+
+ movdqa xmm4, xmm3
+ movdqa xmm1, xmm5
+ psubw xmm3, xmm6 ; xmm3=tmp13
+ psubw xmm5, xmm2 ; xmm5=tmp12
+ paddw xmm4, xmm6 ; xmm4=tmp10
+ paddw xmm1, xmm2 ; xmm1=tmp11
+
+ paddw xmm5, xmm3
+ psllw xmm5, PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm5, [rel PW_F0707] ; xmm5=z1
+
+ movdqa xmm6, xmm4
+ movdqa xmm2, xmm3
+ psubw xmm4, xmm1 ; xmm4=data4
+ psubw xmm3, xmm5 ; xmm3=data6
+ paddw xmm6, xmm1 ; xmm6=data0
+ paddw xmm2, xmm5 ; xmm2=data2
+
+ movdqa XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm4
+ movdqa XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm3
+ movdqa XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)], xmm6
+ movdqa XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)], xmm2
+
+ ; -- Odd part
+
+ movdqa xmm1, XMMWORD [wk(0)] ; xmm1=tmp6
+ movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7
+
+ paddw xmm7, xmm0 ; xmm7=tmp10
+ paddw xmm0, xmm1 ; xmm0=tmp11
+ paddw xmm1, xmm5 ; xmm1=tmp12, xmm5=tmp7
+
+ psllw xmm7, PRE_MULTIPLY_SCALE_BITS
+ psllw xmm1, PRE_MULTIPLY_SCALE_BITS
+
+ psllw xmm0, PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm0, [rel PW_F0707] ; xmm0=z3
+
+ movdqa xmm4, xmm7 ; xmm4=tmp10
+ psubw xmm7, xmm1
+ pmulhw xmm7, [rel PW_F0382] ; xmm7=z5
+ pmulhw xmm4, [rel PW_F0541] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
+ pmulhw xmm1, [rel PW_F1306] ; xmm1=MULTIPLY(tmp12,FIX_1_306562)
+ paddw xmm4, xmm7 ; xmm4=z2
+ paddw xmm1, xmm7 ; xmm1=z4
+
+ movdqa xmm3, xmm5
+ psubw xmm5, xmm0 ; xmm5=z13
+ paddw xmm3, xmm0 ; xmm3=z11
+
+ movdqa xmm6, xmm5
+ movdqa xmm2, xmm3
+ psubw xmm5, xmm4 ; xmm5=data3
+ psubw xmm3, xmm1 ; xmm3=data7
+ paddw xmm6, xmm4 ; xmm6=data5
+ paddw xmm2, xmm1 ; xmm2=data1
+
+ movdqa XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm5
+ movdqa XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm3
+ movdqa XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm6
+ movdqa XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm2
+
+ uncollect_args 1
+ mov rsp, rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- original rbp
+ pop rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jfdctint-avx2.asm b/media/libjpeg/simd/x86_64/jfdctint-avx2.asm
new file mode 100644
index 0000000000..e56258b48a
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jfdctint-avx2.asm
@@ -0,0 +1,320 @@
+;
+; jfdctint.asm - accurate integer FDCT (64-bit AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, 2018, 2020, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; and can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slower but more accurate integer implementation of the
+; forward DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jfdctint.c; see jfdctint.c for
+; more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 13
+%define PASS1_BITS 2
+
+%define DESCALE_P1 (CONST_BITS - PASS1_BITS)
+%define DESCALE_P2 (CONST_BITS + PASS1_BITS)
+
+%if CONST_BITS == 13
+F_0_298 equ 2446 ; FIX(0.298631336)
+F_0_390 equ 3196 ; FIX(0.390180644)
+F_0_541 equ 4433 ; FIX(0.541196100)
+F_0_765 equ 6270 ; FIX(0.765366865)
+F_0_899 equ 7373 ; FIX(0.899976223)
+F_1_175 equ 9633 ; FIX(1.175875602)
+F_1_501 equ 12299 ; FIX(1.501321110)
+F_1_847 equ 15137 ; FIX(1.847759065)
+F_1_961 equ 16069 ; FIX(1.961570560)
+F_2_053 equ 16819 ; FIX(2.053119869)
+F_2_562 equ 20995 ; FIX(2.562915447)
+F_3_072 equ 25172 ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
+F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS) ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS) ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS) ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS) ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS) ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS) ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026)
+%endif
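+
+; (DESCALE(x, n) shifts right by n bits with round-to-nearest: e.g.
+; DESCALE(320652955, 30 - 13) = (320652955 + 65536) >> 17 = 2446, which
+; matches the precomputed F_0_298 value in the CONST_BITS == 13 table above.)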
+
+; --------------------------------------------------------------------------
+; In-place 8x8x16-bit matrix transpose using AVX2 instructions
+; %1-%4: Input/output registers
+; %5-%8: Temp registers
+
+%macro dotranspose 8
+ ; %1=(00 01 02 03 04 05 06 07 40 41 42 43 44 45 46 47)
+ ; %2=(10 11 12 13 14 15 16 17 50 51 52 53 54 55 56 57)
+ ; %3=(20 21 22 23 24 25 26 27 60 61 62 63 64 65 66 67)
+ ; %4=(30 31 32 33 34 35 36 37 70 71 72 73 74 75 76 77)
+
+ vpunpcklwd %5, %1, %2
+ vpunpckhwd %6, %1, %2
+ vpunpcklwd %7, %3, %4
+ vpunpckhwd %8, %3, %4
+ ; transpose coefficients(phase 1)
+ ; %5=(00 10 01 11 02 12 03 13 40 50 41 51 42 52 43 53)
+ ; %6=(04 14 05 15 06 16 07 17 44 54 45 55 46 56 47 57)
+ ; %7=(20 30 21 31 22 32 23 33 60 70 61 71 62 72 63 73)
+ ; %8=(24 34 25 35 26 36 27 37 64 74 65 75 66 76 67 77)
+
+ vpunpckldq %1, %5, %7
+ vpunpckhdq %2, %5, %7
+ vpunpckldq %3, %6, %8
+ vpunpckhdq %4, %6, %8
+ ; transpose coefficients(phase 2)
+ ; %1=(00 10 20 30 01 11 21 31 40 50 60 70 41 51 61 71)
+ ; %2=(02 12 22 32 03 13 23 33 42 52 62 72 43 53 63 73)
+ ; %3=(04 14 24 34 05 15 25 35 44 54 64 74 45 55 65 75)
+ ; %4=(06 16 26 36 07 17 27 37 46 56 66 76 47 57 67 77)
+
+ vpermq %1, %1, 0x8D
+ vpermq %2, %2, 0x8D
+ vpermq %3, %3, 0xD8
+ vpermq %4, %4, 0xD8
+ ; transpose coefficients(phase 3)
+ ; %1=(01 11 21 31 41 51 61 71 00 10 20 30 40 50 60 70)
+ ; %2=(03 13 23 33 43 53 63 73 02 12 22 32 42 52 62 72)
+ ; %3=(04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75)
+ ; %4=(06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77)
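+ ; (The vpermq immediates select 64-bit qwords from the source, two bits
+ ; per destination qword: 0x8D picks (1, 3, 0, 2) and 0xD8 picks
+ ; (0, 2, 1, 3), producing the half-row orderings shown above.)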
+%endmacro
+
+; --------------------------------------------------------------------------
+; In-place 8x8x16-bit accurate integer forward DCT using AVX2 instructions
+; %1-%4: Input/output registers
+; %5-%8: Temp registers
+; %9: Pass (1 or 2)
+
+%macro dodct 9
+ vpsubw %5, %1, %4 ; %5=data1_0-data6_7=tmp6_7
+ vpaddw %6, %1, %4 ; %6=data1_0+data6_7=tmp1_0
+ vpaddw %7, %2, %3 ; %7=data3_2+data4_5=tmp3_2
+ vpsubw %8, %2, %3 ; %8=data3_2-data4_5=tmp4_5
+
+ ; -- Even part
+
+ vperm2i128 %6, %6, %6, 0x01 ; %6=tmp0_1
+ vpaddw %1, %6, %7 ; %1=tmp0_1+tmp3_2=tmp10_11
+ vpsubw %6, %6, %7 ; %6=tmp0_1-tmp3_2=tmp13_12
+
+ vperm2i128 %7, %1, %1, 0x01 ; %7=tmp11_10
+ vpsignw %1, %1, [rel PW_1_NEG1] ; %1=tmp10_neg11
+ vpaddw %7, %7, %1 ; %7=(tmp10+tmp11)_(tmp10-tmp11)
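+ ; (PW_1_NEG1 makes vpsignw negate only the upper lane, so the vpaddw
+ ; above yields the sum in the low lane and the difference in the high
+ ; lane in a single instruction.)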
+%if %9 == 1
+ vpsllw %1, %7, PASS1_BITS ; %1=data0_4
+%else
+ vpaddw %7, %7, [rel PW_DESCALE_P2X]
+ vpsraw %1, %7, PASS1_BITS ; %1=data0_4
+%endif
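+ ; (Pass 1 keeps PASS1_BITS extra fraction bits in the intermediate
+ ; results; pass 2 removes them again, rounding via PW_DESCALE_P2X.)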
+
+ ; (Original)
+ ; z1 = (tmp12 + tmp13) * 0.541196100;
+ ; data2 = z1 + tmp13 * 0.765366865;
+ ; data6 = z1 + tmp12 * -1.847759065;
+ ;
+ ; (This implementation)
+ ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+ ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
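+ ;
+ ; (The two forms are identical: substituting z1 into data2 gives
+ ;  (tmp12 + tmp13) * 0.541196100 + tmp13 * 0.765366865
+ ;  = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100, so one
+ ;  vpmaddwd per half computes data2 and data6 from the paired constants
+ ;  in PW_F130_F054_MF130_F054.)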
+
+ vperm2i128 %7, %6, %6, 0x01 ; %7=tmp12_13
+ vpunpcklwd %2, %6, %7
+ vpunpckhwd %6, %6, %7
+ vpmaddwd %2, %2, [rel PW_F130_F054_MF130_F054] ; %2=data2_6L
+ vpmaddwd %6, %6, [rel PW_F130_F054_MF130_F054] ; %6=data2_6H
+
+ vpaddd %2, %2, [rel PD_DESCALE_P %+ %9]
+ vpaddd %6, %6, [rel PD_DESCALE_P %+ %9]
+ vpsrad %2, %2, DESCALE_P %+ %9
+ vpsrad %6, %6, DESCALE_P %+ %9
+
+ vpackssdw %3, %2, %6 ; %3=data2_6
+
+ ; -- Odd part
+
+ vpaddw %7, %8, %5 ; %7=tmp4_5+tmp6_7=z3_4
+
+ ; (Original)
+ ; z5 = (z3 + z4) * 1.175875602;
+ ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ ; z3 += z5; z4 += z5;
+ ;
+ ; (This implementation)
+ ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
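+ ;
+ ; (z5 is folded into both outputs; the z3 on the right-hand side of the
+ ;  second line is the original, pre-update z3, so each output again
+ ;  reduces to one vpmaddwd against PW_MF078_F117_F078_F117.)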
+
+ vperm2i128 %2, %7, %7, 0x01 ; %2=z4_3
+ vpunpcklwd %6, %7, %2
+ vpunpckhwd %7, %7, %2
+ vpmaddwd %6, %6, [rel PW_MF078_F117_F078_F117] ; %6=z3_4L
+ vpmaddwd %7, %7, [rel PW_MF078_F117_F078_F117] ; %7=z3_4H
+
+ ; (Original)
+ ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6;
+ ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869;
+ ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110;
+ ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4;
+ ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4;
+ ;
+ ; (This implementation)
+ ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+ ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+ ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+ ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+ ; data7 = tmp4 + z3; data5 = tmp5 + z4;
+ ; data3 = tmp6 + z3; data1 = tmp7 + z4;
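+ ;
+ ; (z1 and z2 are eliminated by substituting z1 = tmp4 + tmp7 and
+ ;  z2 = tmp5 + tmp6 into the products, collapsing each output to one
+ ;  vpmaddwd plus the z3/z4 addition below.)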
+
+ vperm2i128 %4, %5, %5, 0x01 ; %4=tmp7_6
+ vpunpcklwd %2, %8, %4
+ vpunpckhwd %4, %8, %4
+ vpmaddwd %2, %2, [rel PW_MF060_MF089_MF050_MF256] ; %2=tmp4_5L
+ vpmaddwd %4, %4, [rel PW_MF060_MF089_MF050_MF256] ; %4=tmp4_5H
+
+ vpaddd %2, %2, %6 ; %2=data7_5L
+ vpaddd %4, %4, %7 ; %4=data7_5H
+
+ vpaddd %2, %2, [rel PD_DESCALE_P %+ %9]
+ vpaddd %4, %4, [rel PD_DESCALE_P %+ %9]
+ vpsrad %2, %2, DESCALE_P %+ %9
+ vpsrad %4, %4, DESCALE_P %+ %9
+
+ vpackssdw %4, %2, %4 ; %4=data7_5
+
+ vperm2i128 %2, %8, %8, 0x01 ; %2=tmp5_4
+ vpunpcklwd %8, %5, %2
+ vpunpckhwd %5, %5, %2
+ vpmaddwd %8, %8, [rel PW_F050_MF256_F060_MF089] ; %8=tmp6_7L
+ vpmaddwd %5, %5, [rel PW_F050_MF256_F060_MF089] ; %5=tmp6_7H
+
+ vpaddd %8, %8, %6 ; %8=data3_1L
+ vpaddd %5, %5, %7 ; %5=data3_1H
+
+ vpaddd %8, %8, [rel PD_DESCALE_P %+ %9]
+ vpaddd %5, %5, [rel PD_DESCALE_P %+ %9]
+ vpsrad %8, %8, DESCALE_P %+ %9
+ vpsrad %5, %5, DESCALE_P %+ %9
+
+ vpackssdw %2, %8, %5 ; %2=data3_1
+%endmacro
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_fdct_islow_avx2)
+
+EXTN(jconst_fdct_islow_avx2):
+
+PW_F130_F054_MF130_F054 times 4 dw (F_0_541 + F_0_765), F_0_541
+ times 4 dw (F_0_541 - F_1_847), F_0_541
+PW_MF078_F117_F078_F117 times 4 dw (F_1_175 - F_1_961), F_1_175
+ times 4 dw (F_1_175 - F_0_390), F_1_175
+PW_MF060_MF089_MF050_MF256 times 4 dw (F_0_298 - F_0_899), -F_0_899
+ times 4 dw (F_2_053 - F_2_562), -F_2_562
+PW_F050_MF256_F060_MF089 times 4 dw (F_3_072 - F_2_562), -F_2_562
+ times 4 dw (F_1_501 - F_0_899), -F_0_899
+PD_DESCALE_P1 times 8 dd 1 << (DESCALE_P1 - 1)
+PD_DESCALE_P2 times 8 dd 1 << (DESCALE_P2 - 1)
+PW_DESCALE_P2X times 16 dw 1 << (PASS1_BITS - 1)
+PW_1_NEG1 times 8 dw 1
+ times 8 dw -1
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_islow_avx2(DCTELEM *data)
+;
+
+; r10 = DCTELEM *data
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_fdct_islow_avx2)
+
+EXTN(jsimd_fdct_islow_avx2):
+ push rbp
+ mov rax, rsp
+ mov rbp, rsp
+ collect_args 1
+
+ ; ---- Pass 1: process rows.
+
+ vmovdqu ymm4, YMMWORD [YMMBLOCK(0,0,r10,SIZEOF_DCTELEM)]
+ vmovdqu ymm5, YMMWORD [YMMBLOCK(2,0,r10,SIZEOF_DCTELEM)]
+ vmovdqu ymm6, YMMWORD [YMMBLOCK(4,0,r10,SIZEOF_DCTELEM)]
+ vmovdqu ymm7, YMMWORD [YMMBLOCK(6,0,r10,SIZEOF_DCTELEM)]
+ ; ymm4=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+ ; ymm5=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+ ; ymm6=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
+ ; ymm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
+
+ vperm2i128 ymm0, ymm4, ymm6, 0x20
+ vperm2i128 ymm1, ymm4, ymm6, 0x31
+ vperm2i128 ymm2, ymm5, ymm7, 0x20
+ vperm2i128 ymm3, ymm5, ymm7, 0x31
+ ; ymm0=(00 01 02 03 04 05 06 07 40 41 42 43 44 45 46 47)
+ ; ymm1=(10 11 12 13 14 15 16 17 50 51 52 53 54 55 56 57)
+ ; ymm2=(20 21 22 23 24 25 26 27 60 61 62 63 64 65 66 67)
+ ; ymm3=(30 31 32 33 34 35 36 37 70 71 72 73 74 75 76 77)
+
+ dotranspose ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7
+
+ dodct ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, 1
+ ; ymm0=data0_4, ymm1=data3_1, ymm2=data2_6, ymm3=data7_5
+
+ ; ---- Pass 2: process columns.
+
+ vperm2i128 ymm4, ymm1, ymm3, 0x20 ; ymm4=data3_7
+ vperm2i128 ymm1, ymm1, ymm3, 0x31 ; ymm1=data1_5
+
+ dotranspose ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7
+
+ dodct ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7, 2
+ ; ymm0=data0_4, ymm1=data3_1, ymm2=data2_6, ymm4=data7_5
+
+ vperm2i128 ymm3, ymm0, ymm1, 0x30 ; ymm3=data0_1
+ vperm2i128 ymm5, ymm2, ymm1, 0x20 ; ymm5=data2_3
+ vperm2i128 ymm6, ymm0, ymm4, 0x31 ; ymm6=data4_5
+ vperm2i128 ymm7, ymm2, ymm4, 0x21 ; ymm7=data6_7
+
+ vmovdqu YMMWORD [YMMBLOCK(0,0,r10,SIZEOF_DCTELEM)], ymm3
+ vmovdqu YMMWORD [YMMBLOCK(2,0,r10,SIZEOF_DCTELEM)], ymm5
+ vmovdqu YMMWORD [YMMBLOCK(4,0,r10,SIZEOF_DCTELEM)], ymm6
+ vmovdqu YMMWORD [YMMBLOCK(6,0,r10,SIZEOF_DCTELEM)], ymm7
+
+ vzeroupper
+ uncollect_args 1
+ pop rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jfdctint-sse2.asm b/media/libjpeg/simd/x86_64/jfdctint-sse2.asm
new file mode 100644
index 0000000000..ec1f383ccb
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jfdctint-sse2.asm
@@ -0,0 +1,619 @@
+;
+; jfdctint.asm - accurate integer FDCT (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, 2020, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; and can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slower but more accurate integer implementation of the
+; forward DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jfdctint.c; see jfdctint.c for
+; more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 13
+%define PASS1_BITS 2
+
+%define DESCALE_P1 (CONST_BITS - PASS1_BITS)
+%define DESCALE_P2 (CONST_BITS + PASS1_BITS)
+
+%if CONST_BITS == 13
+F_0_298 equ 2446 ; FIX(0.298631336)
+F_0_390 equ 3196 ; FIX(0.390180644)
+F_0_541 equ 4433 ; FIX(0.541196100)
+F_0_765 equ 6270 ; FIX(0.765366865)
+F_0_899 equ 7373 ; FIX(0.899976223)
+F_1_175 equ 9633 ; FIX(1.175875602)
+F_1_501 equ 12299 ; FIX(1.501321110)
+F_1_847 equ 15137 ; FIX(1.847759065)
+F_1_961 equ 16069 ; FIX(1.961570560)
+F_2_053 equ 16819 ; FIX(2.053119869)
+F_2_562 equ 20995 ; FIX(2.562915447)
+F_3_072 equ 25172 ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
+F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS) ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS) ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS) ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS) ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS) ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS) ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_fdct_islow_sse2)
+
+EXTN(jconst_fdct_islow_sse2):
+
+PW_F130_F054 times 4 dw (F_0_541 + F_0_765), F_0_541
+PW_F054_MF130 times 4 dw F_0_541, (F_0_541 - F_1_847)
+PW_MF078_F117 times 4 dw (F_1_175 - F_1_961), F_1_175
+PW_F117_F078 times 4 dw F_1_175, (F_1_175 - F_0_390)
+PW_MF060_MF089 times 4 dw (F_0_298 - F_0_899), -F_0_899
+PW_MF089_F060 times 4 dw -F_0_899, (F_1_501 - F_0_899)
+PW_MF050_MF256 times 4 dw (F_2_053 - F_2_562), -F_2_562
+PW_MF256_F050 times 4 dw -F_2_562, (F_3_072 - F_2_562)
+PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1 - 1)
+PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2 - 1)
+PW_DESCALE_P2X times 8 dw 1 << (PASS1_BITS - 1)
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_islow_sse2(DCTELEM *data)
+;
+
+; r10 = DCTELEM *data
+
+%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM 6
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_fdct_islow_sse2)
+
+EXTN(jsimd_fdct_islow_sse2):
+ push rbp
+ mov rax, rsp ; rax = original rbp
+ sub rsp, byte 4
+ and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [rsp], rax
+ mov rbp, rsp ; rbp = aligned rbp
+ lea rsp, [wk(0)]
+ collect_args 1
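+
+ ; (The stack pointer is aligned down to a 16-byte boundary so the wk()
+ ; spill slots can be accessed with movdqa; the pre-alignment value saved
+ ; at [rbp] is restored by the "pop rsp" in the epilogue.)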
+
+ ; ---- Pass 1: process rows.
+
+ mov rdx, r10 ; (DCTELEM *)
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)]
+ movdqa xmm2, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)]
+
+ ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
+ ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
+
+ movdqa xmm4, xmm0 ; transpose coefficients(phase 1)
+ punpcklwd xmm0, xmm1 ; xmm0=(00 10 01 11 02 12 03 13)
+ punpckhwd xmm4, xmm1 ; xmm4=(04 14 05 15 06 16 07 17)
+ movdqa xmm5, xmm2 ; transpose coefficients(phase 1)
+ punpcklwd xmm2, xmm3 ; xmm2=(20 30 21 31 22 32 23 33)
+ punpckhwd xmm5, xmm3 ; xmm5=(24 34 25 35 26 36 27 37)
+
+ movdqa xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)]
+ movdqa xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)]
+
+ ; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62)
+ ; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63)
+
+ movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33)
+ movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37)
+
+ movdqa xmm2, xmm6 ; transpose coefficients(phase 1)
+ punpcklwd xmm6, xmm7 ; xmm6=(40 50 41 51 42 52 43 53)
+ punpckhwd xmm2, xmm7 ; xmm2=(44 54 45 55 46 56 47 57)
+ movdqa xmm5, xmm1 ; transpose coefficients(phase 1)
+ punpcklwd xmm1, xmm3 ; xmm1=(60 70 61 71 62 72 63 73)
+ punpckhwd xmm5, xmm3 ; xmm5=(64 74 65 75 66 76 67 77)
+
+ movdqa xmm7, xmm6 ; transpose coefficients(phase 2)
+ punpckldq xmm6, xmm1 ; xmm6=(40 50 60 70 41 51 61 71)
+ punpckhdq xmm7, xmm1 ; xmm7=(42 52 62 72 43 53 63 73)
+ movdqa xmm3, xmm2 ; transpose coefficients(phase 2)
+ punpckldq xmm2, xmm5 ; xmm2=(44 54 64 74 45 55 65 75)
+ punpckhdq xmm3, xmm5 ; xmm3=(46 56 66 76 47 57 67 77)
+
+ movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33)
+ movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37)
+ movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=(42 52 62 72 43 53 63 73)
+ movdqa XMMWORD [wk(3)], xmm2 ; wk(3)=(44 54 64 74 45 55 65 75)
+
+ movdqa xmm7, xmm0 ; transpose coefficients(phase 2)
+ punpckldq xmm0, xmm1 ; xmm0=(00 10 20 30 01 11 21 31)
+ punpckhdq xmm7, xmm1 ; xmm7=(02 12 22 32 03 13 23 33)
+ movdqa xmm2, xmm4 ; transpose coefficients(phase 2)
+ punpckldq xmm4, xmm5 ; xmm4=(04 14 24 34 05 15 25 35)
+ punpckhdq xmm2, xmm5 ; xmm2=(06 16 26 36 07 17 27 37)
+
+ movdqa xmm1, xmm0 ; transpose coefficients(phase 3)
+ punpcklqdq xmm0, xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0
+ punpckhqdq xmm1, xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1
+ movdqa xmm5, xmm2 ; transpose coefficients(phase 3)
+ punpcklqdq xmm2, xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6
+ punpckhqdq xmm5, xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7
+
+ movdqa xmm6, xmm1
+ movdqa xmm3, xmm0
+ psubw xmm1, xmm2 ; xmm1=data1-data6=tmp6
+ psubw xmm0, xmm5 ; xmm0=data0-data7=tmp7
+ paddw xmm6, xmm2 ; xmm6=data1+data6=tmp1
+ paddw xmm3, xmm5 ; xmm3=data0+data7=tmp0
+
+ movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(42 52 62 72 43 53 63 73)
+ movdqa xmm5, XMMWORD [wk(3)] ; xmm5=(44 54 64 74 45 55 65 75)
+ movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6
+ movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7
+
+ movdqa xmm1, xmm7 ; transpose coefficients(phase 3)
+ punpcklqdq xmm7, xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2
+ punpckhqdq xmm1, xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3
+ movdqa xmm0, xmm4 ; transpose coefficients(phase 3)
+ punpcklqdq xmm4, xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4
+ punpckhqdq xmm0, xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5
+
+ movdqa xmm2, xmm1
+ movdqa xmm5, xmm7
+ paddw xmm1, xmm4 ; xmm1=data3+data4=tmp3
+ paddw xmm7, xmm0 ; xmm7=data2+data5=tmp2
+ psubw xmm2, xmm4 ; xmm2=data3-data4=tmp4
+ psubw xmm5, xmm0 ; xmm5=data2-data5=tmp5
+
+ ; -- Even part
+
+ movdqa xmm4, xmm3
+ movdqa xmm0, xmm6
+ paddw xmm3, xmm1 ; xmm3=tmp10
+ paddw xmm6, xmm7 ; xmm6=tmp11
+ psubw xmm4, xmm1 ; xmm4=tmp13
+ psubw xmm0, xmm7 ; xmm0=tmp12
+
+ movdqa xmm1, xmm3
+ paddw xmm3, xmm6 ; xmm3=tmp10+tmp11
+ psubw xmm1, xmm6 ; xmm1=tmp10-tmp11
+
+ psllw xmm3, PASS1_BITS ; xmm3=data0
+ psllw xmm1, PASS1_BITS ; xmm1=data4
+
+ movdqa XMMWORD [wk(2)], xmm3 ; wk(2)=data0
+ movdqa XMMWORD [wk(3)], xmm1 ; wk(3)=data4
+
+ ; (Original)
+ ; z1 = (tmp12 + tmp13) * 0.541196100;
+ ; data2 = z1 + tmp13 * 0.765366865;
+ ; data6 = z1 + tmp12 * -1.847759065;
+ ;
+ ; (This implementation)
+ ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+ ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+ movdqa xmm7, xmm4 ; xmm4=tmp13
+ movdqa xmm6, xmm4
+ punpcklwd xmm7, xmm0 ; xmm0=tmp12
+ punpckhwd xmm6, xmm0
+ movdqa xmm4, xmm7
+ movdqa xmm0, xmm6
+ pmaddwd xmm7, [rel PW_F130_F054] ; xmm7=data2L
+ pmaddwd xmm6, [rel PW_F130_F054] ; xmm6=data2H
+ pmaddwd xmm4, [rel PW_F054_MF130] ; xmm4=data6L
+ pmaddwd xmm0, [rel PW_F054_MF130] ; xmm0=data6H
+
+ paddd xmm7, [rel PD_DESCALE_P1]
+ paddd xmm6, [rel PD_DESCALE_P1]
+ psrad xmm7, DESCALE_P1
+ psrad xmm6, DESCALE_P1
+ paddd xmm4, [rel PD_DESCALE_P1]
+ paddd xmm0, [rel PD_DESCALE_P1]
+ psrad xmm4, DESCALE_P1
+ psrad xmm0, DESCALE_P1
+
+ packssdw xmm7, xmm6 ; xmm7=data2
+ packssdw xmm4, xmm0 ; xmm4=data6
+
+ movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=data2
+ movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=data6
+
+ ; -- Odd part
+
+ movdqa xmm3, XMMWORD [wk(0)] ; xmm3=tmp6
+ movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp7
+
+ movdqa xmm6, xmm2 ; xmm2=tmp4
+ movdqa xmm0, xmm5 ; xmm5=tmp5
+ paddw xmm6, xmm3 ; xmm6=z3
+ paddw xmm0, xmm1 ; xmm0=z4
+
+ ; (Original)
+ ; z5 = (z3 + z4) * 1.175875602;
+ ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ ; z3 += z5; z4 += z5;
+ ;
+ ; (This implementation)
+ ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+ movdqa xmm7, xmm6
+ movdqa xmm4, xmm6
+ punpcklwd xmm7, xmm0
+ punpckhwd xmm4, xmm0
+ movdqa xmm6, xmm7
+ movdqa xmm0, xmm4
+ pmaddwd xmm7, [rel PW_MF078_F117] ; xmm7=z3L
+ pmaddwd xmm4, [rel PW_MF078_F117] ; xmm4=z3H
+ pmaddwd xmm6, [rel PW_F117_F078] ; xmm6=z4L
+ pmaddwd xmm0, [rel PW_F117_F078] ; xmm0=z4H
+
+ movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=z3L
+ movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=z3H
+
+ ; (Original)
+ ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6;
+ ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869;
+ ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110;
+ ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4;
+ ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4;
+ ;
+ ; (This implementation)
+ ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+ ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+ ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+ ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+ ; data7 = tmp4 + z3; data5 = tmp5 + z4;
+ ; data3 = tmp6 + z3; data1 = tmp7 + z4;
+
+ movdqa xmm7, xmm2
+ movdqa xmm4, xmm2
+ punpcklwd xmm7, xmm1
+ punpckhwd xmm4, xmm1
+ movdqa xmm2, xmm7
+ movdqa xmm1, xmm4
+ pmaddwd xmm7, [rel PW_MF060_MF089] ; xmm7=tmp4L
+ pmaddwd xmm4, [rel PW_MF060_MF089] ; xmm4=tmp4H
+ pmaddwd xmm2, [rel PW_MF089_F060] ; xmm2=tmp7L
+ pmaddwd xmm1, [rel PW_MF089_F060] ; xmm1=tmp7H
+
+ paddd xmm7, XMMWORD [wk(0)] ; xmm7=data7L
+ paddd xmm4, XMMWORD [wk(1)] ; xmm4=data7H
+ paddd xmm2, xmm6 ; xmm2=data1L
+ paddd xmm1, xmm0 ; xmm1=data1H
+
+ paddd xmm7, [rel PD_DESCALE_P1]
+ paddd xmm4, [rel PD_DESCALE_P1]
+ psrad xmm7, DESCALE_P1
+ psrad xmm4, DESCALE_P1
+ paddd xmm2, [rel PD_DESCALE_P1]
+ paddd xmm1, [rel PD_DESCALE_P1]
+ psrad xmm2, DESCALE_P1
+ psrad xmm1, DESCALE_P1
+
+ packssdw xmm7, xmm4 ; xmm7=data7
+ packssdw xmm2, xmm1 ; xmm2=data1
+
+ movdqa xmm4, xmm5
+ movdqa xmm1, xmm5
+ punpcklwd xmm4, xmm3
+ punpckhwd xmm1, xmm3
+ movdqa xmm5, xmm4
+ movdqa xmm3, xmm1
+ pmaddwd xmm4, [rel PW_MF050_MF256] ; xmm4=tmp5L
+ pmaddwd xmm1, [rel PW_MF050_MF256] ; xmm1=tmp5H
+ pmaddwd xmm5, [rel PW_MF256_F050] ; xmm5=tmp6L
+ pmaddwd xmm3, [rel PW_MF256_F050] ; xmm3=tmp6H
+
+ paddd xmm4, xmm6 ; xmm4=data5L
+ paddd xmm1, xmm0 ; xmm1=data5H
+ paddd xmm5, XMMWORD [wk(0)] ; xmm5=data3L
+ paddd xmm3, XMMWORD [wk(1)] ; xmm3=data3H
+
+ paddd xmm4, [rel PD_DESCALE_P1]
+ paddd xmm1, [rel PD_DESCALE_P1]
+ psrad xmm4, DESCALE_P1
+ psrad xmm1, DESCALE_P1
+ paddd xmm5, [rel PD_DESCALE_P1]
+ paddd xmm3, [rel PD_DESCALE_P1]
+ psrad xmm5, DESCALE_P1
+ psrad xmm3, DESCALE_P1
+
+ packssdw xmm4, xmm1 ; xmm4=data5
+ packssdw xmm5, xmm3 ; xmm5=data3
+
+ ; ---- Pass 2: process columns.
+
+ movdqa xmm6, XMMWORD [wk(2)] ; xmm6=col0
+ movdqa xmm0, XMMWORD [wk(4)] ; xmm0=col2
+
+ ; xmm6=(00 10 20 30 40 50 60 70), xmm0=(02 12 22 32 42 52 62 72)
+ ; xmm2=(01 11 21 31 41 51 61 71), xmm5=(03 13 23 33 43 53 63 73)
+
+ movdqa xmm1, xmm6 ; transpose coefficients(phase 1)
+ punpcklwd xmm6, xmm2 ; xmm6=(00 01 10 11 20 21 30 31)
+ punpckhwd xmm1, xmm2 ; xmm1=(40 41 50 51 60 61 70 71)
+ movdqa xmm3, xmm0 ; transpose coefficients(phase 1)
+ punpcklwd xmm0, xmm5 ; xmm0=(02 03 12 13 22 23 32 33)
+ punpckhwd xmm3, xmm5 ; xmm3=(42 43 52 53 62 63 72 73)
+
+ movdqa xmm2, XMMWORD [wk(3)] ; xmm2=col4
+ movdqa xmm5, XMMWORD [wk(5)] ; xmm5=col6
+
+ ; xmm2=(04 14 24 34 44 54 64 74), xmm5=(06 16 26 36 46 56 66 76)
+ ; xmm4=(05 15 25 35 45 55 65 75), xmm7=(07 17 27 37 47 57 67 77)
+
+ movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=(02 03 12 13 22 23 32 33)
+ movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(42 43 52 53 62 63 72 73)
+
+ movdqa xmm0, xmm2 ; transpose coefficients(phase 1)
+ punpcklwd xmm2, xmm4 ; xmm2=(04 05 14 15 24 25 34 35)
+ punpckhwd xmm0, xmm4 ; xmm0=(44 45 54 55 64 65 74 75)
+ movdqa xmm3, xmm5 ; transpose coefficients(phase 1)
+ punpcklwd xmm5, xmm7 ; xmm5=(06 07 16 17 26 27 36 37)
+ punpckhwd xmm3, xmm7 ; xmm3=(46 47 56 57 66 67 76 77)
+
+ movdqa xmm4, xmm2 ; transpose coefficients(phase 2)
+ punpckldq xmm2, xmm5 ; xmm2=(04 05 06 07 14 15 16 17)
+ punpckhdq xmm4, xmm5 ; xmm4=(24 25 26 27 34 35 36 37)
+ movdqa xmm7, xmm0 ; transpose coefficients(phase 2)
+ punpckldq xmm0, xmm3 ; xmm0=(44 45 46 47 54 55 56 57)
+ punpckhdq xmm7, xmm3 ; xmm7=(64 65 66 67 74 75 76 77)
+
+ movdqa xmm5, XMMWORD [wk(0)] ; xmm5=(02 03 12 13 22 23 32 33)
+ movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53 62 63 72 73)
+ movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=(24 25 26 27 34 35 36 37)
+ movdqa XMMWORD [wk(3)], xmm0 ; wk(3)=(44 45 46 47 54 55 56 57)
+
+ movdqa xmm4, xmm6 ; transpose coefficients(phase 2)
+ punpckldq xmm6, xmm5 ; xmm6=(00 01 02 03 10 11 12 13)
+ punpckhdq xmm4, xmm5 ; xmm4=(20 21 22 23 30 31 32 33)
+ movdqa xmm0, xmm1 ; transpose coefficients(phase 2)
+ punpckldq xmm1, xmm3 ; xmm1=(40 41 42 43 50 51 52 53)
+ punpckhdq xmm0, xmm3 ; xmm0=(60 61 62 63 70 71 72 73)
+
+ movdqa xmm5, xmm6 ; transpose coefficients(phase 3)
+ punpcklqdq xmm6, xmm2 ; xmm6=(00 01 02 03 04 05 06 07)=data0
+ punpckhqdq xmm5, xmm2 ; xmm5=(10 11 12 13 14 15 16 17)=data1
+ movdqa xmm3, xmm0 ; transpose coefficients(phase 3)
+ punpcklqdq xmm0, xmm7 ; xmm0=(60 61 62 63 64 65 66 67)=data6
+ punpckhqdq xmm3, xmm7 ; xmm3=(70 71 72 73 74 75 76 77)=data7
+
+ movdqa xmm2, xmm5
+ movdqa xmm7, xmm6
+ psubw xmm5, xmm0 ; xmm5=data1-data6=tmp6
+ psubw xmm6, xmm3 ; xmm6=data0-data7=tmp7
+ paddw xmm2, xmm0 ; xmm2=data1+data6=tmp1
+ paddw xmm7, xmm3 ; xmm7=data0+data7=tmp0
+
+ movdqa xmm0, XMMWORD [wk(2)] ; xmm0=(24 25 26 27 34 35 36 37)
+ movdqa xmm3, XMMWORD [wk(3)] ; xmm3=(44 45 46 47 54 55 56 57)
+ movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=tmp6
+ movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7
+
+ movdqa xmm5, xmm4 ; transpose coefficients(phase 3)
+ punpcklqdq xmm4, xmm0 ; xmm4=(20 21 22 23 24 25 26 27)=data2
+ punpckhqdq xmm5, xmm0 ; xmm5=(30 31 32 33 34 35 36 37)=data3
+ movdqa xmm6, xmm1 ; transpose coefficients(phase 3)
+ punpcklqdq xmm1, xmm3 ; xmm1=(40 41 42 43 44 45 46 47)=data4
+ punpckhqdq xmm6, xmm3 ; xmm6=(50 51 52 53 54 55 56 57)=data5
+
+ movdqa xmm0, xmm5
+ movdqa xmm3, xmm4
+ paddw xmm5, xmm1 ; xmm5=data3+data4=tmp3
+ paddw xmm4, xmm6 ; xmm4=data2+data5=tmp2
+ psubw xmm0, xmm1 ; xmm0=data3-data4=tmp4
+ psubw xmm3, xmm6 ; xmm3=data2-data5=tmp5
+
+ ; -- Even part
+
+ movdqa xmm1, xmm7
+ movdqa xmm6, xmm2
+ paddw xmm7, xmm5 ; xmm7=tmp10
+ paddw xmm2, xmm4 ; xmm2=tmp11
+ psubw xmm1, xmm5 ; xmm1=tmp13
+ psubw xmm6, xmm4 ; xmm6=tmp12
+
+ movdqa xmm5, xmm7
+ paddw xmm7, xmm2 ; xmm7=tmp10+tmp11
+ psubw xmm5, xmm2 ; xmm5=tmp10-tmp11
+
+ paddw xmm7, [rel PW_DESCALE_P2X]
+ paddw xmm5, [rel PW_DESCALE_P2X]
+ psraw xmm7, PASS1_BITS ; xmm7=data0
+ psraw xmm5, PASS1_BITS ; xmm5=data4
+
+ movdqa XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)], xmm7
+ movdqa XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm5
+
+ ; (Original)
+ ; z1 = (tmp12 + tmp13) * 0.541196100;
+ ; data2 = z1 + tmp13 * 0.765366865;
+ ; data6 = z1 + tmp12 * -1.847759065;
+ ;
+ ; (This implementation)
+ ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+ ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+ movdqa xmm4, xmm1 ; xmm1=tmp13
+ movdqa xmm2, xmm1
+ punpcklwd xmm4, xmm6 ; xmm6=tmp12
+ punpckhwd xmm2, xmm6
+ movdqa xmm1, xmm4
+ movdqa xmm6, xmm2
+ pmaddwd xmm4, [rel PW_F130_F054] ; xmm4=data2L
+ pmaddwd xmm2, [rel PW_F130_F054] ; xmm2=data2H
+ pmaddwd xmm1, [rel PW_F054_MF130] ; xmm1=data6L
+ pmaddwd xmm6, [rel PW_F054_MF130] ; xmm6=data6H
+
+ paddd xmm4, [rel PD_DESCALE_P2]
+ paddd xmm2, [rel PD_DESCALE_P2]
+ psrad xmm4, DESCALE_P2
+ psrad xmm2, DESCALE_P2
+ paddd xmm1, [rel PD_DESCALE_P2]
+ paddd xmm6, [rel PD_DESCALE_P2]
+ psrad xmm1, DESCALE_P2
+ psrad xmm6, DESCALE_P2
+
+ packssdw xmm4, xmm2 ; xmm4=data2
+ packssdw xmm1, xmm6 ; xmm1=data6
+
+ movdqa XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)], xmm4
+ movdqa XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm1
+
+ ; -- Odd part
+
+ movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp6
+ movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7
+
+ movdqa xmm2, xmm0 ; xmm0=tmp4
+ movdqa xmm6, xmm3 ; xmm3=tmp5
+ paddw xmm2, xmm7 ; xmm2=z3
+ paddw xmm6, xmm5 ; xmm6=z4
+
+ ; (Original)
+ ; z5 = (z3 + z4) * 1.175875602;
+ ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ ; z3 += z5; z4 += z5;
+ ;
+ ; (This implementation)
+ ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+ movdqa xmm4, xmm2
+ movdqa xmm1, xmm2
+ punpcklwd xmm4, xmm6
+ punpckhwd xmm1, xmm6
+ movdqa xmm2, xmm4
+ movdqa xmm6, xmm1
+ pmaddwd xmm4, [rel PW_MF078_F117] ; xmm4=z3L
+ pmaddwd xmm1, [rel PW_MF078_F117] ; xmm1=z3H
+ pmaddwd xmm2, [rel PW_F117_F078] ; xmm2=z4L
+ pmaddwd xmm6, [rel PW_F117_F078] ; xmm6=z4H
+
+ movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=z3L
+ movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=z3H
+
+ ; (Original)
+ ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6;
+ ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869;
+ ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110;
+ ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4;
+ ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4;
+ ;
+ ; (This implementation)
+ ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+ ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+ ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+ ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+ ; data7 = tmp4 + z3; data5 = tmp5 + z4;
+ ; data3 = tmp6 + z3; data1 = tmp7 + z4;
+
+ movdqa xmm4, xmm0
+ movdqa xmm1, xmm0
+ punpcklwd xmm4, xmm5
+ punpckhwd xmm1, xmm5
+ movdqa xmm0, xmm4
+ movdqa xmm5, xmm1
+ pmaddwd xmm4, [rel PW_MF060_MF089] ; xmm4=tmp4L
+ pmaddwd xmm1, [rel PW_MF060_MF089] ; xmm1=tmp4H
+ pmaddwd xmm0, [rel PW_MF089_F060] ; xmm0=tmp7L
+ pmaddwd xmm5, [rel PW_MF089_F060] ; xmm5=tmp7H
+
+ paddd xmm4, XMMWORD [wk(0)] ; xmm4=data7L
+ paddd xmm1, XMMWORD [wk(1)] ; xmm1=data7H
+ paddd xmm0, xmm2 ; xmm0=data1L
+ paddd xmm5, xmm6 ; xmm5=data1H
+
+ paddd xmm4, [rel PD_DESCALE_P2]
+ paddd xmm1, [rel PD_DESCALE_P2]
+ psrad xmm4, DESCALE_P2
+ psrad xmm1, DESCALE_P2
+ paddd xmm0, [rel PD_DESCALE_P2]
+ paddd xmm5, [rel PD_DESCALE_P2]
+ psrad xmm0, DESCALE_P2
+ psrad xmm5, DESCALE_P2
+
+ packssdw xmm4, xmm1 ; xmm4=data7
+ packssdw xmm0, xmm5 ; xmm0=data1
+
+ movdqa XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm4
+ movdqa XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm0
+
+ movdqa xmm1, xmm3
+ movdqa xmm5, xmm3
+ punpcklwd xmm1, xmm7
+ punpckhwd xmm5, xmm7
+ movdqa xmm3, xmm1
+ movdqa xmm7, xmm5
+ pmaddwd xmm1, [rel PW_MF050_MF256] ; xmm1=tmp5L
+ pmaddwd xmm5, [rel PW_MF050_MF256] ; xmm5=tmp5H
+ pmaddwd xmm3, [rel PW_MF256_F050] ; xmm3=tmp6L
+ pmaddwd xmm7, [rel PW_MF256_F050] ; xmm7=tmp6H
+
+ paddd xmm1, xmm2 ; xmm1=data5L
+ paddd xmm5, xmm6 ; xmm5=data5H
+ paddd xmm3, XMMWORD [wk(0)] ; xmm3=data3L
+ paddd xmm7, XMMWORD [wk(1)] ; xmm7=data3H
+
+ paddd xmm1, [rel PD_DESCALE_P2]
+ paddd xmm5, [rel PD_DESCALE_P2]
+ psrad xmm1, DESCALE_P2
+ psrad xmm5, DESCALE_P2
+ paddd xmm3, [rel PD_DESCALE_P2]
+ paddd xmm7, [rel PD_DESCALE_P2]
+ psrad xmm3, DESCALE_P2
+ psrad xmm7, DESCALE_P2
+
+ packssdw xmm1, xmm5 ; xmm1=data5
+ packssdw xmm3, xmm7 ; xmm3=data3
+
+ movdqa XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm1
+ movdqa XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm3
+
+ uncollect_args 1
+ mov rsp, rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- original rbp
+ pop rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jidctflt-sse2.asm b/media/libjpeg/simd/x86_64/jidctflt-sse2.asm
new file mode 100644
index 0000000000..60bf961896
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jidctflt-sse2.asm
@@ -0,0 +1,482 @@
+;
+; jidctflt.asm - floating-point IDCT (64-bit SSE & SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; and can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the inverse DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jidctflt.c; see jidctflt.c for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
+ shufps %1, %2, 0x44
+%endmacro
+
+%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
+ shufps %1, %2, 0xEE
+%endmacro
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_idct_float_sse2)
+
+EXTN(jconst_idct_float_sse2):
+
+PD_1_414 times 4 dd 1.414213562373095048801689
+PD_1_847 times 4 dd 1.847759065022573512256366
+PD_1_082 times 4 dd 1.082392200292393968799446
+PD_M2_613 times 4 dd -2.613125929752753055713286
+PD_RNDINT_MAGIC times 4 dd 100663296.0 ; (float)(0x00C00000 << 3)
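+; (This constant is 1.5 * 2^26, whose float mantissa ULP is 2^3 = 8.
+; Adding it to a float x of small magnitude pins the exponent, so the low
+; 16 bits of the sum hold roundint(x/8); the row pass below extracts them
+; with a word mask.)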
+PB_CENTERJSAMP times 16 db CENTERJSAMPLE
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_float_sse2(void *dct_table, JCOEFPTR coef_block,
+; JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+; r10 = void *dct_table
+; r11 = JCOEFPTR coef_block
+; r12 = JSAMPARRAY output_buf
+; r13d = JDIMENSION output_col
+
+%define original_rbp rbp + 0
+%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+ ; xmmword wk[WK_NUM]
+%define WK_NUM 2
+%define workspace wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT
+ ; FAST_FLOAT workspace[DCTSIZE2]
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_idct_float_sse2)
+
+EXTN(jsimd_idct_float_sse2):
+ push rbp
+ mov rax, rsp ; rax = original rbp
+ sub rsp, byte 4
+ and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [rsp], rax
+ mov rbp, rsp ; rbp = aligned rbp
+ lea rsp, [workspace]
+ collect_args 4
+ push rbx
+
+ ; ---- Pass 1: process columns from input, store into work array.
+
+ mov rdx, r10 ; quantptr
+ mov rsi, r11 ; inptr
+ lea rdi, [workspace] ; FAST_FLOAT *wsptr
+ mov rcx, DCTSIZE/4 ; ctr
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
+ mov eax, dword [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+ or eax, dword [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+ jnz near .columnDCT
+
+ movq xmm1, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+ movq xmm2, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+ movq xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+ movq xmm4, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+ movq xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+ movq xmm6, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+ movq xmm7, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+ por xmm1, xmm2
+ por xmm3, xmm4
+ por xmm5, xmm6
+ por xmm1, xmm3
+ por xmm5, xmm7
+ por xmm1, xmm5
+ packsswb xmm1, xmm1
+ movd eax, xmm1
+ test rax, rax
+ jnz short .columnDCT
+
+ ; -- AC terms all zero
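+ ; (A block with only a DC term transforms to a constant block, so the
+ ; four dequantized DC values are simply broadcast down their columns.)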
+
+ movq xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+
+ punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
+ psrad xmm0, (DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)
+ cvtdq2ps xmm0, xmm0 ; xmm0=in0=(00 01 02 03)
+
+ mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+
+ movaps xmm1, xmm0
+ movaps xmm2, xmm0
+ movaps xmm3, xmm0
+
+ shufps xmm0, xmm0, 0x00 ; xmm0=(00 00 00 00)
+ shufps xmm1, xmm1, 0x55 ; xmm1=(01 01 01 01)
+ shufps xmm2, xmm2, 0xAA ; xmm2=(02 02 02 02)
+ shufps xmm3, xmm3, 0xFF ; xmm3=(03 03 03 03)
+
+ movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
+ movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
+ movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
+ movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
+ movaps XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
+ movaps XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm2
+ movaps XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
+ movaps XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
+ jmp near .nextcolumn
+%endif
+.columnDCT:
+
+ ; -- Even part
+
+ movq xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+ movq xmm1, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+ movq xmm2, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+ movq xmm3, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+
+ punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
+ punpcklwd xmm1, xmm1 ; xmm1=(20 20 21 21 22 22 23 23)
+ psrad xmm0, (DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03)
+ psrad xmm1, (DWORD_BIT-WORD_BIT) ; xmm1=in2=(20 21 22 23)
+ cvtdq2ps xmm0, xmm0 ; xmm0=in0=(00 01 02 03)
+ cvtdq2ps xmm1, xmm1 ; xmm1=in2=(20 21 22 23)
+
+ punpcklwd xmm2, xmm2 ; xmm2=(40 40 41 41 42 42 43 43)
+ punpcklwd xmm3, xmm3 ; xmm3=(60 60 61 61 62 62 63 63)
+ psrad xmm2, (DWORD_BIT-WORD_BIT) ; xmm2=in4=(40 41 42 43)
+ psrad xmm3, (DWORD_BIT-WORD_BIT) ; xmm3=in6=(60 61 62 63)
+ cvtdq2ps xmm2, xmm2 ; xmm2=in4=(40 41 42 43)
+ cvtdq2ps xmm3, xmm3 ; xmm3=in6=(60 61 62 63)
+
+ mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+ mulps xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+ mulps xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+ mulps xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+
+ movaps xmm4, xmm0
+ movaps xmm5, xmm1
+ subps xmm0, xmm2 ; xmm0=tmp11
+ subps xmm1, xmm3
+ addps xmm4, xmm2 ; xmm4=tmp10
+ addps xmm5, xmm3 ; xmm5=tmp13
+
+ mulps xmm1, [rel PD_1_414]
+ subps xmm1, xmm5 ; xmm1=tmp12
+
+ movaps xmm6, xmm4
+ movaps xmm7, xmm0
+ subps xmm4, xmm5 ; xmm4=tmp3
+ subps xmm0, xmm1 ; xmm0=tmp2
+ addps xmm6, xmm5 ; xmm6=tmp0
+ addps xmm7, xmm1 ; xmm7=tmp1
+
+ movaps XMMWORD [wk(1)], xmm4 ; tmp3
+ movaps XMMWORD [wk(0)], xmm0 ; tmp2
+
+ ; -- Odd part
+
+ movq xmm2, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+ movq xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+ movq xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+ movq xmm1, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+
+ punpcklwd xmm2, xmm2 ; xmm2=(10 10 11 11 12 12 13 13)
+ punpcklwd xmm3, xmm3 ; xmm3=(30 30 31 31 32 32 33 33)
+ psrad xmm2, (DWORD_BIT-WORD_BIT) ; xmm2=in1=(10 11 12 13)
+ psrad xmm3, (DWORD_BIT-WORD_BIT) ; xmm3=in3=(30 31 32 33)
+ cvtdq2ps xmm2, xmm2 ; xmm2=in1=(10 11 12 13)
+ cvtdq2ps xmm3, xmm3 ; xmm3=in3=(30 31 32 33)
+
+ punpcklwd xmm5, xmm5 ; xmm5=(50 50 51 51 52 52 53 53)
+ punpcklwd xmm1, xmm1 ; xmm1=(70 70 71 71 72 72 73 73)
+ psrad xmm5, (DWORD_BIT-WORD_BIT) ; xmm5=in5=(50 51 52 53)
+ psrad xmm1, (DWORD_BIT-WORD_BIT) ; xmm1=in7=(70 71 72 73)
+ cvtdq2ps xmm5, xmm5 ; xmm5=in5=(50 51 52 53)
+ cvtdq2ps xmm1, xmm1 ; xmm1=in7=(70 71 72 73)
+
+ mulps xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+ mulps xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+ mulps xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+ mulps xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+
+ movaps xmm4, xmm2
+ movaps xmm0, xmm5
+ addps xmm2, xmm1 ; xmm2=z11
+ addps xmm5, xmm3 ; xmm5=z13
+ subps xmm4, xmm1 ; xmm4=z12
+ subps xmm0, xmm3 ; xmm0=z10
+
+ movaps xmm1, xmm2
+ subps xmm2, xmm5
+ addps xmm1, xmm5 ; xmm1=tmp7
+
+ mulps xmm2, [rel PD_1_414] ; xmm2=tmp11
+
+ movaps xmm3, xmm0
+ addps xmm0, xmm4
+ mulps xmm0, [rel PD_1_847] ; xmm0=z5
+ mulps xmm3, [rel PD_M2_613] ; xmm3=(z10 * -2.613125930)
+ mulps xmm4, [rel PD_1_082] ; xmm4=(z12 * 1.082392200)
+ addps xmm3, xmm0 ; xmm3=tmp12
+ subps xmm4, xmm0 ; xmm4=tmp10
+
+ ; -- Final output stage
+
+ subps xmm3, xmm1 ; xmm3=tmp6
+ movaps xmm5, xmm6
+ movaps xmm0, xmm7
+ addps xmm6, xmm1 ; xmm6=data0=(00 01 02 03)
+ addps xmm7, xmm3 ; xmm7=data1=(10 11 12 13)
+ subps xmm5, xmm1 ; xmm5=data7=(70 71 72 73)
+ subps xmm0, xmm3 ; xmm0=data6=(60 61 62 63)
+ subps xmm2, xmm3 ; xmm2=tmp5
+
+ movaps xmm1, xmm6 ; transpose coefficients(phase 1)
+ unpcklps xmm6, xmm7 ; xmm6=(00 10 01 11)
+ unpckhps xmm1, xmm7 ; xmm1=(02 12 03 13)
+ movaps xmm3, xmm0 ; transpose coefficients(phase 1)
+ unpcklps xmm0, xmm5 ; xmm0=(60 70 61 71)
+ unpckhps xmm3, xmm5 ; xmm3=(62 72 63 73)
+
+ movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2
+ movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3
+
+ movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71)
+ movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73)
+
+ addps xmm4, xmm2 ; xmm4=tmp4
+ movaps xmm0, xmm7
+ movaps xmm3, xmm5
+ addps xmm7, xmm2 ; xmm7=data2=(20 21 22 23)
+ addps xmm5, xmm4 ; xmm5=data4=(40 41 42 43)
+ subps xmm0, xmm2 ; xmm0=data5=(50 51 52 53)
+ subps xmm3, xmm4 ; xmm3=data3=(30 31 32 33)
+
+ movaps xmm2, xmm7 ; transpose coefficients(phase 1)
+ unpcklps xmm7, xmm3 ; xmm7=(20 30 21 31)
+ unpckhps xmm2, xmm3 ; xmm2=(22 32 23 33)
+ movaps xmm4, xmm5 ; transpose coefficients(phase 1)
+ unpcklps xmm5, xmm0 ; xmm5=(40 50 41 51)
+ unpckhps xmm4, xmm0 ; xmm4=(42 52 43 53)
+
+ movaps xmm3, xmm6 ; transpose coefficients(phase 2)
+ unpcklps2 xmm6, xmm7 ; xmm6=(00 10 20 30)
+ unpckhps2 xmm3, xmm7 ; xmm3=(01 11 21 31)
+ movaps xmm0, xmm1 ; transpose coefficients(phase 2)
+ unpcklps2 xmm1, xmm2 ; xmm1=(02 12 22 32)
+ unpckhps2 xmm0, xmm2 ; xmm0=(03 13 23 33)
+
+ movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71)
+ movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73)
+
+ movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm6
+ movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
+ movaps XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
+ movaps XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
+
+ movaps xmm6, xmm5 ; transpose coefficients(phase 2)
+ unpcklps2 xmm5, xmm7 ; xmm5=(40 50 60 70)
+ unpckhps2 xmm6, xmm7 ; xmm6=(41 51 61 71)
+ movaps xmm3, xmm4 ; transpose coefficients(phase 2)
+ unpcklps2 xmm4, xmm2 ; xmm4=(42 52 62 72)
+ unpckhps2 xmm3, xmm2 ; xmm3=(43 53 63 73)
+
+ movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm5
+ movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm6
+ movaps XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm4
+ movaps XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
+
+.nextcolumn:
+ add rsi, byte 4*SIZEOF_JCOEF ; coef_block
+ add rdx, byte 4*SIZEOF_FLOAT_MULT_TYPE ; quantptr
+ add rdi, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr
+ dec rcx ; ctr
+ jnz near .columnloop
+
+ ; -- Prefetch the next coefficient block
+
+ prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
+ prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
+ prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
+ prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
+
+ ; ---- Pass 2: process rows from work array, store into output array.
+
+ mov rax, [original_rbp]
+ lea rsi, [workspace] ; FAST_FLOAT *wsptr
+ mov rdi, r12 ; (JSAMPROW *)
+ mov eax, r13d
+ mov rcx, DCTSIZE/4 ; ctr
+.rowloop:
+
+ ; -- Even part
+
+ movaps xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_FAST_FLOAT)]
+ movaps xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_FAST_FLOAT)]
+
+ movaps xmm4, xmm0
+ movaps xmm5, xmm1
+ subps xmm0, xmm2 ; xmm0=tmp11
+ subps xmm1, xmm3
+ addps xmm4, xmm2 ; xmm4=tmp10
+ addps xmm5, xmm3 ; xmm5=tmp13
+
+ mulps xmm1, [rel PD_1_414]
+ subps xmm1, xmm5 ; xmm1=tmp12
+
+ movaps xmm6, xmm4
+ movaps xmm7, xmm0
+ subps xmm4, xmm5 ; xmm4=tmp3
+ subps xmm0, xmm1 ; xmm0=tmp2
+ addps xmm6, xmm5 ; xmm6=tmp0
+ addps xmm7, xmm1 ; xmm7=tmp1
+
+ movaps XMMWORD [wk(1)], xmm4 ; tmp3
+ movaps XMMWORD [wk(0)], xmm0 ; tmp2
+
+ ; -- Odd part
+
+ movaps xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_FAST_FLOAT)]
+ movaps xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_FAST_FLOAT)]
+
+ movaps xmm4, xmm2
+ movaps xmm0, xmm5
+ addps xmm2, xmm1 ; xmm2=z11
+ addps xmm5, xmm3 ; xmm5=z13
+ subps xmm4, xmm1 ; xmm4=z12
+ subps xmm0, xmm3 ; xmm0=z10
+
+ movaps xmm1, xmm2
+ subps xmm2, xmm5
+ addps xmm1, xmm5 ; xmm1=tmp7
+
+ mulps xmm2, [rel PD_1_414] ; xmm2=tmp11
+
+ movaps xmm3, xmm0
+ addps xmm0, xmm4
+ mulps xmm0, [rel PD_1_847] ; xmm0=z5
+ mulps xmm3, [rel PD_M2_613] ; xmm3=(z10 * -2.613125930)
+ mulps xmm4, [rel PD_1_082] ; xmm4=(z12 * 1.082392200)
+ addps xmm3, xmm0 ; xmm3=tmp12
+ subps xmm4, xmm0 ; xmm4=tmp10
+
+ ; -- Final output stage
+
+ subps xmm3, xmm1 ; xmm3=tmp6
+ movaps xmm5, xmm6
+ movaps xmm0, xmm7
+ addps xmm6, xmm1 ; xmm6=data0=(00 10 20 30)
+ addps xmm7, xmm3 ; xmm7=data1=(01 11 21 31)
+ subps xmm5, xmm1 ; xmm5=data7=(07 17 27 37)
+ subps xmm0, xmm3 ; xmm0=data6=(06 16 26 36)
+ subps xmm2, xmm3 ; xmm2=tmp5
+
+ movaps xmm1, [rel PD_RNDINT_MAGIC] ; xmm1=[rel PD_RNDINT_MAGIC]
+ pcmpeqd xmm3, xmm3
+ psrld xmm3, WORD_BIT ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
+
+ addps xmm6, xmm1 ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
+ addps xmm7, xmm1 ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
+ addps xmm0, xmm1 ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
+ addps xmm5, xmm1 ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
+
+ pand xmm6, xmm3 ; xmm6=(00 -- 10 -- 20 -- 30 --)
+ pslld xmm7, WORD_BIT ; xmm7=(-- 01 -- 11 -- 21 -- 31)
+ pand xmm0, xmm3 ; xmm0=(06 -- 16 -- 26 -- 36 --)
+ pslld xmm5, WORD_BIT ; xmm5=(-- 07 -- 17 -- 27 -- 37)
+ por xmm6, xmm7 ; xmm6=(00 01 10 11 20 21 30 31)
+ por xmm0, xmm5 ; xmm0=(06 07 16 17 26 27 36 37)
+
+ movaps xmm1, XMMWORD [wk(0)] ; xmm1=tmp2
+ movaps xmm3, XMMWORD [wk(1)] ; xmm3=tmp3
+
+ addps xmm4, xmm2 ; xmm4=tmp4
+ movaps xmm7, xmm1
+ movaps xmm5, xmm3
+ addps xmm1, xmm2 ; xmm1=data2=(02 12 22 32)
+ addps xmm3, xmm4 ; xmm3=data4=(04 14 24 34)
+ subps xmm7, xmm2 ; xmm7=data5=(05 15 25 35)
+ subps xmm5, xmm4 ; xmm5=data3=(03 13 23 33)
+
+ movaps xmm2, [rel PD_RNDINT_MAGIC] ; xmm2=[rel PD_RNDINT_MAGIC]
+ pcmpeqd xmm4, xmm4
+ psrld xmm4, WORD_BIT ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
+
+ addps xmm3, xmm2 ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
+ addps xmm7, xmm2 ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
+ addps xmm1, xmm2 ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
+ addps xmm5, xmm2 ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
+
+ pand xmm3, xmm4 ; xmm3=(04 -- 14 -- 24 -- 34 --)
+ pslld xmm7, WORD_BIT ; xmm7=(-- 05 -- 15 -- 25 -- 35)
+ pand xmm1, xmm4 ; xmm1=(02 -- 12 -- 22 -- 32 --)
+ pslld xmm5, WORD_BIT ; xmm5=(-- 03 -- 13 -- 23 -- 33)
+ por xmm3, xmm7 ; xmm3=(04 05 14 15 24 25 34 35)
+ por xmm1, xmm5 ; xmm1=(02 03 12 13 22 23 32 33)
+
+ movdqa xmm2, [rel PB_CENTERJSAMP] ; xmm2=[rel PB_CENTERJSAMP]
+
+ packsswb xmm6, xmm3 ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
+ packsswb xmm1, xmm0 ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
+ paddb xmm6, xmm2
+ paddb xmm1, xmm2
+
+ movdqa xmm4, xmm6 ; transpose coefficients(phase 2)
+ punpcklwd xmm6, xmm1 ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+ punpckhwd xmm4, xmm1 ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+
+ movdqa xmm7, xmm6 ; transpose coefficients(phase 3)
+ punpckldq xmm6, xmm4 ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+ punpckhdq xmm7, xmm4 ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+
+ pshufd xmm5, xmm6, 0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+ pshufd xmm3, xmm7, 0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+
+ mov rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+ mov rbxp, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
+ movq XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm7
+ mov rdxp, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+ mov rbxp, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
+ movq XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm3
+
+ add rsi, byte 4*SIZEOF_FAST_FLOAT ; wsptr
+ add rdi, byte 4*SIZEOF_JSAMPROW
+ dec rcx ; ctr
+ jnz near .rowloop
+
+ pop rbx
+ uncollect_args 4
+ mov rsp, rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- original rbp
+ pop rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jidctfst-sse2.asm b/media/libjpeg/simd/x86_64/jidctfst-sse2.asm
new file mode 100644
index 0000000000..cb97fdfbb2
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jidctfst-sse2.asm
@@ -0,0 +1,491 @@
+;
+; jidctfst.asm - fast integer IDCT (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; and can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the inverse DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jidctfst.c; see jidctfst.c
+; for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 8 ; 14 is also OK.
+%define PASS1_BITS 2
+
+%if IFAST_SCALE_BITS != PASS1_BITS
+%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
+%endif
+
+%if CONST_BITS == 8
+F_1_082 equ 277 ; FIX(1.082392200)
+F_1_414 equ 362 ; FIX(1.414213562)
+F_1_847 equ 473 ; FIX(1.847759065)
+F_2_613 equ 669 ; FIX(2.613125930)
+F_1_613 equ (F_2_613 - 256) ; FIX(2.613125930) - FIX(1)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
+F_1_082 equ DESCALE(1162209775, 30 - CONST_BITS) ; FIX(1.082392200)
+F_1_414 equ DESCALE(1518500249, 30 - CONST_BITS) ; FIX(1.414213562)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065)
+F_2_613 equ DESCALE(2805822602, 30 - CONST_BITS) ; FIX(2.613125930)
+F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1)
+%endif
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
+
+%define PRE_MULTIPLY_SCALE_BITS 2
+%define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
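+
+; For example, with CONST_BITS == 8: pmulhw(x << 2, F_1_414 << 6) =
+; (x * 4 * 23168) >> 16 = (x * 92672) >> 16 ~= x * 1.41406, returning the
+; product to the input's scale.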
+
+ alignz 32
+ GLOBAL_DATA(jconst_idct_ifast_sse2)
+
+EXTN(jconst_idct_ifast_sse2):
+
+PW_F1414 times 8 dw F_1_414 << CONST_SHIFT
+PW_F1847 times 8 dw F_1_847 << CONST_SHIFT
+PW_MF1613 times 8 dw -F_1_613 << CONST_SHIFT
+PW_F1082 times 8 dw F_1_082 << CONST_SHIFT
+PB_CENTERJSAMP times 16 db CENTERJSAMPLE
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_ifast_sse2(void *dct_table, JCOEFPTR coef_block,
+; JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+; r10 = void *dct_table
+; r11 = JCOEFPTR coef_block
+; r12 = JSAMPARRAY output_buf
+; r13d = JDIMENSION output_col
+
+%define original_rbp rbp + 0
+%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+ ; xmmword wk[WK_NUM]
+%define WK_NUM 2
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_idct_ifast_sse2)
+
+EXTN(jsimd_idct_ifast_sse2):
+ push rbp
+ mov rax, rsp ; rax = original rbp
+ sub rsp, byte 4
+ and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [rsp], rax
+ mov rbp, rsp ; rbp = aligned rbp
+ lea rsp, [wk(0)]
+ collect_args 4
+
+ ; ---- Pass 1: process columns from input.
+
+ mov rdx, r10 ; quantptr
+ mov rsi, r11 ; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2
+ mov eax, dword [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+ or eax, dword [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+ jnz near .columnDCT
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+ por xmm1, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+ por xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+ por xmm1, xmm0
+ packsswb xmm1, xmm1
+ packsswb xmm1, xmm1
+ movd eax, xmm1
+ test rax, rax
+ jnz short .columnDCT
+
+ ; -- AC terms all zero
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ movdqa xmm7, xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07)
+ punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
+ punpckhwd xmm7, xmm7 ; xmm7=(04 04 05 05 06 06 07 07)
+
+ pshufd xmm6, xmm0, 0x00 ; xmm6=col0=(00 00 00 00 00 00 00 00)
+ pshufd xmm2, xmm0, 0x55 ; xmm2=col1=(01 01 01 01 01 01 01 01)
+ pshufd xmm5, xmm0, 0xAA ; xmm5=col2=(02 02 02 02 02 02 02 02)
+ pshufd xmm0, xmm0, 0xFF ; xmm0=col3=(03 03 03 03 03 03 03 03)
+ pshufd xmm1, xmm7, 0x00 ; xmm1=col4=(04 04 04 04 04 04 04 04)
+ pshufd xmm4, xmm7, 0x55 ; xmm4=col5=(05 05 05 05 05 05 05 05)
+ pshufd xmm3, xmm7, 0xAA ; xmm3=col6=(06 06 06 06 06 06 06 06)
+ pshufd xmm7, xmm7, 0xFF ; xmm7=col7=(07 07 07 07 07 07 07 07)
+
+ movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=col1
+ movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=col3
+ jmp near .column_end
+%endif
+.columnDCT:
+
+ ; -- Even part
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+ pmullw xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+ movdqa xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+ pmullw xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+
+ movdqa xmm4, xmm0
+ movdqa xmm5, xmm1
+ psubw xmm0, xmm2 ; xmm0=tmp11
+ psubw xmm1, xmm3
+ paddw xmm4, xmm2 ; xmm4=tmp10
+ paddw xmm5, xmm3 ; xmm5=tmp13
+
+ psllw xmm1, PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm1, [rel PW_F1414]
+ psubw xmm1, xmm5 ; xmm1=tmp12
+
+ movdqa xmm6, xmm4
+ movdqa xmm7, xmm0
+ psubw xmm4, xmm5 ; xmm4=tmp3
+ psubw xmm0, xmm1 ; xmm0=tmp2
+ paddw xmm6, xmm5 ; xmm6=tmp0
+ paddw xmm7, xmm1 ; xmm7=tmp1
+
+ movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=tmp3
+ movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=tmp2
+
+ ; -- Odd part
+
+ movdqa xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+ pmullw xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+ movdqa xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+ pmullw xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+
+ movdqa xmm4, xmm2
+ movdqa xmm0, xmm5
+ psubw xmm2, xmm1 ; xmm2=z12
+ psubw xmm5, xmm3 ; xmm5=z10
+ paddw xmm4, xmm1 ; xmm4=z11
+ paddw xmm0, xmm3 ; xmm0=z13
+
+ movdqa xmm1, xmm5 ; xmm1=z10(unscaled)
+ psllw xmm2, PRE_MULTIPLY_SCALE_BITS
+ psllw xmm5, PRE_MULTIPLY_SCALE_BITS
+
+ movdqa xmm3, xmm4
+ psubw xmm4, xmm0
+ paddw xmm3, xmm0 ; xmm3=tmp7
+
+ psllw xmm4, PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm4, [rel PW_F1414] ; xmm4=tmp11
+
+ ; To avoid overflow...
+ ;
+ ; (Original)
+ ; tmp12 = -2.613125930 * z10 + z5;
+ ;
+ ; (This implementation)
+ ; tmp12 = (-1.613125930 - 1) * z10 + z5;
+ ; = -1.613125930 * z10 - z10 + z5;
+
+ movdqa xmm0, xmm5
+ paddw xmm5, xmm2
+ pmulhw xmm5, [rel PW_F1847] ; xmm5=z5
+ pmulhw xmm0, [rel PW_MF1613]
+ pmulhw xmm2, [rel PW_F1082]
+ psubw xmm0, xmm1
+ psubw xmm2, xmm5 ; xmm2=tmp10
+ paddw xmm0, xmm5 ; xmm0=tmp12
+
+ ; -- Final output stage
+
+ psubw xmm0, xmm3 ; xmm0=tmp6
+ movdqa xmm1, xmm6
+ movdqa xmm5, xmm7
+ paddw xmm6, xmm3 ; xmm6=data0=(00 01 02 03 04 05 06 07)
+ paddw xmm7, xmm0 ; xmm7=data1=(10 11 12 13 14 15 16 17)
+ psubw xmm1, xmm3 ; xmm1=data7=(70 71 72 73 74 75 76 77)
+ psubw xmm5, xmm0 ; xmm5=data6=(60 61 62 63 64 65 66 67)
+ psubw xmm4, xmm0 ; xmm4=tmp5
+
+ movdqa xmm3, xmm6 ; transpose coefficients(phase 1)
+ punpcklwd xmm6, xmm7 ; xmm6=(00 10 01 11 02 12 03 13)
+ punpckhwd xmm3, xmm7 ; xmm3=(04 14 05 15 06 16 07 17)
+ movdqa xmm0, xmm5 ; transpose coefficients(phase 1)
+ punpcklwd xmm5, xmm1 ; xmm5=(60 70 61 71 62 72 63 73)
+ punpckhwd xmm0, xmm1 ; xmm0=(64 74 65 75 66 76 67 77)
+
+ movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp2
+ movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp3
+
+ movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(60 70 61 71 62 72 63 73)
+ movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(64 74 65 75 66 76 67 77)
+
+ paddw xmm2, xmm4 ; xmm2=tmp4
+ movdqa xmm5, xmm7
+ movdqa xmm0, xmm1
+ paddw xmm7, xmm4 ; xmm7=data2=(20 21 22 23 24 25 26 27)
+ paddw xmm1, xmm2 ; xmm1=data4=(40 41 42 43 44 45 46 47)
+ psubw xmm5, xmm4 ; xmm5=data5=(50 51 52 53 54 55 56 57)
+ psubw xmm0, xmm2 ; xmm0=data3=(30 31 32 33 34 35 36 37)
+
+ movdqa xmm4, xmm7 ; transpose coefficients(phase 1)
+ punpcklwd xmm7, xmm0 ; xmm7=(20 30 21 31 22 32 23 33)
+ punpckhwd xmm4, xmm0 ; xmm4=(24 34 25 35 26 36 27 37)
+ movdqa xmm2, xmm1 ; transpose coefficients(phase 1)
+ punpcklwd xmm1, xmm5 ; xmm1=(40 50 41 51 42 52 43 53)
+ punpckhwd xmm2, xmm5 ; xmm2=(44 54 45 55 46 56 47 57)
+
+ movdqa xmm0, xmm3 ; transpose coefficients(phase 2)
+ punpckldq xmm3, xmm4 ; xmm3=(04 14 24 34 05 15 25 35)
+ punpckhdq xmm0, xmm4 ; xmm0=(06 16 26 36 07 17 27 37)
+ movdqa xmm5, xmm6 ; transpose coefficients(phase 2)
+ punpckldq xmm6, xmm7 ; xmm6=(00 10 20 30 01 11 21 31)
+ punpckhdq xmm5, xmm7 ; xmm5=(02 12 22 32 03 13 23 33)
+
+ movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(60 70 61 71 62 72 63 73)
+ movdqa xmm7, XMMWORD [wk(1)] ; xmm7=(64 74 65 75 66 76 67 77)
+
+ movdqa XMMWORD [wk(0)], xmm3 ; wk(0)=(04 14 24 34 05 15 25 35)
+ movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(06 16 26 36 07 17 27 37)
+
+ movdqa xmm3, xmm1 ; transpose coefficients(phase 2)
+ punpckldq xmm1, xmm4 ; xmm1=(40 50 60 70 41 51 61 71)
+ punpckhdq xmm3, xmm4 ; xmm3=(42 52 62 72 43 53 63 73)
+ movdqa xmm0, xmm2 ; transpose coefficients(phase 2)
+ punpckldq xmm2, xmm7 ; xmm2=(44 54 64 74 45 55 65 75)
+ punpckhdq xmm0, xmm7 ; xmm0=(46 56 66 76 47 57 67 77)
+
+ movdqa xmm4, xmm6 ; transpose coefficients(phase 3)
+ punpcklqdq xmm6, xmm1 ; xmm6=col0=(00 10 20 30 40 50 60 70)
+ punpckhqdq xmm4, xmm1 ; xmm4=col1=(01 11 21 31 41 51 61 71)
+ movdqa xmm7, xmm5 ; transpose coefficients(phase 3)
+ punpcklqdq xmm5, xmm3 ; xmm5=col2=(02 12 22 32 42 52 62 72)
+ punpckhqdq xmm7, xmm3 ; xmm7=col3=(03 13 23 33 43 53 63 73)
+
+ movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(04 14 24 34 05 15 25 35)
+ movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(06 16 26 36 07 17 27 37)
+
+ movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=col1
+ movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=col3
+
+ movdqa xmm4, xmm1 ; transpose coefficients(phase 3)
+ punpcklqdq xmm1, xmm2 ; xmm1=col4=(04 14 24 34 44 54 64 74)
+ punpckhqdq xmm4, xmm2 ; xmm4=col5=(05 15 25 35 45 55 65 75)
+ movdqa xmm7, xmm3 ; transpose coefficients(phase 3)
+ punpcklqdq xmm3, xmm0 ; xmm3=col6=(06 16 26 36 46 56 66 76)
+ punpckhqdq xmm7, xmm0 ; xmm7=col7=(07 17 27 37 47 57 67 77)
+.column_end:
+
+ ; -- Prefetch the next coefficient block
+
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+ ; ---- Pass 2: process rows from work array, store into output array.
+
+ mov rax, [original_rbp]
+ mov rdi, r12 ; (JSAMPROW *)
+ mov eax, r13d
+
+ ; -- Even part
+
+ ; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6
+
+ movdqa xmm2, xmm6
+ movdqa xmm0, xmm5
+ psubw xmm6, xmm1 ; xmm6=tmp11
+ psubw xmm5, xmm3
+ paddw xmm2, xmm1 ; xmm2=tmp10
+ paddw xmm0, xmm3 ; xmm0=tmp13
+
+ psllw xmm5, PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm5, [rel PW_F1414]
+ psubw xmm5, xmm0 ; xmm5=tmp12
+
+ movdqa xmm1, xmm2
+ movdqa xmm3, xmm6
+ psubw xmm2, xmm0 ; xmm2=tmp3
+ psubw xmm6, xmm5 ; xmm6=tmp2
+ paddw xmm1, xmm0 ; xmm1=tmp0
+ paddw xmm3, xmm5 ; xmm3=tmp1
+
+ movdqa xmm0, XMMWORD [wk(0)] ; xmm0=col1
+ movdqa xmm5, XMMWORD [wk(1)] ; xmm5=col3
+
+ movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp3
+ movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp2
+
+ ; -- Odd part
+
+ ; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7
+
+ movdqa xmm2, xmm0
+ movdqa xmm6, xmm4
+ psubw xmm0, xmm7 ; xmm0=z12
+ psubw xmm4, xmm5 ; xmm4=z10
+ paddw xmm2, xmm7 ; xmm2=z11
+ paddw xmm6, xmm5 ; xmm6=z13
+
+ movdqa xmm7, xmm4 ; xmm7=z10(unscaled)
+ psllw xmm0, PRE_MULTIPLY_SCALE_BITS
+ psllw xmm4, PRE_MULTIPLY_SCALE_BITS
+
+ movdqa xmm5, xmm2
+ psubw xmm2, xmm6
+ paddw xmm5, xmm6 ; xmm5=tmp7
+
+ psllw xmm2, PRE_MULTIPLY_SCALE_BITS
+ pmulhw xmm2, [rel PW_F1414] ; xmm2=tmp11
+
+ ; To avoid overflow...
+ ;
+ ; (Original)
+ ; tmp12 = -2.613125930 * z10 + z5;
+ ;
+ ; (This implementation)
+ ; tmp12 = (-1.613125930 - 1) * z10 + z5;
+ ; = -1.613125930 * z10 - z10 + z5;
+
+ movdqa xmm6, xmm4
+ paddw xmm4, xmm0
+ pmulhw xmm4, [rel PW_F1847] ; xmm4=z5
+ pmulhw xmm6, [rel PW_MF1613]
+ pmulhw xmm0, [rel PW_F1082]
+ psubw xmm6, xmm7
+ psubw xmm0, xmm4 ; xmm0=tmp10
+ paddw xmm6, xmm4 ; xmm6=tmp12
+
+ ; -- Final output stage
+
+ psubw xmm6, xmm5 ; xmm6=tmp6
+ movdqa xmm7, xmm1
+ movdqa xmm4, xmm3
+ paddw xmm1, xmm5 ; xmm1=data0=(00 10 20 30 40 50 60 70)
+ paddw xmm3, xmm6 ; xmm3=data1=(01 11 21 31 41 51 61 71)
+ psraw xmm1, (PASS1_BITS+3) ; descale
+ psraw xmm3, (PASS1_BITS+3) ; descale
+ psubw xmm7, xmm5 ; xmm7=data7=(07 17 27 37 47 57 67 77)
+ psubw xmm4, xmm6 ; xmm4=data6=(06 16 26 36 46 56 66 76)
+ psraw xmm7, (PASS1_BITS+3) ; descale
+ psraw xmm4, (PASS1_BITS+3) ; descale
+ psubw xmm2, xmm6 ; xmm2=tmp5
+
+ packsswb xmm1, xmm4 ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+ packsswb xmm3, xmm7 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+ movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp2
+ movdqa xmm6, XMMWORD [wk(0)] ; xmm6=tmp3
+
+ paddw xmm0, xmm2 ; xmm0=tmp4
+ movdqa xmm4, xmm5
+ movdqa xmm7, xmm6
+ paddw xmm5, xmm2 ; xmm5=data2=(02 12 22 32 42 52 62 72)
+ paddw xmm6, xmm0 ; xmm6=data4=(04 14 24 34 44 54 64 74)
+ psraw xmm5, (PASS1_BITS+3) ; descale
+ psraw xmm6, (PASS1_BITS+3) ; descale
+ psubw xmm4, xmm2 ; xmm4=data5=(05 15 25 35 45 55 65 75)
+ psubw xmm7, xmm0 ; xmm7=data3=(03 13 23 33 43 53 63 73)
+ psraw xmm4, (PASS1_BITS+3) ; descale
+ psraw xmm7, (PASS1_BITS+3) ; descale
+
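+; (The samples are computed centered on zero: packsswb saturates the
+; descaled words to signed bytes, then adding CENTERJSAMPLE (128) per
+; byte via PB_CENTERJSAMP shifts them into the unsigned 0..255 range.)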
+ movdqa xmm2, [rel PB_CENTERJSAMP] ; xmm2=[rel PB_CENTERJSAMP]
+
+ packsswb xmm5, xmm6 ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
+ packsswb xmm7, xmm4 ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
+
+ paddb xmm1, xmm2
+ paddb xmm3, xmm2
+ paddb xmm5, xmm2
+ paddb xmm7, xmm2
+
+ movdqa xmm0, xmm1 ; transpose coefficients(phase 1)
+ punpcklbw xmm1, xmm3 ; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
+ punpckhbw xmm0, xmm3 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
+ movdqa xmm6, xmm5 ; transpose coefficients(phase 1)
+ punpcklbw xmm5, xmm7 ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
+ punpckhbw xmm6, xmm7 ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
+
+ movdqa xmm4, xmm1 ; transpose coefficients(phase 2)
+ punpcklwd xmm1, xmm5 ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+ punpckhwd xmm4, xmm5 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
+ movdqa xmm2, xmm6 ; transpose coefficients(phase 2)
+ punpcklwd xmm6, xmm0 ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+ punpckhwd xmm2, xmm0 ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
+
+ movdqa xmm3, xmm1 ; transpose coefficients(phase 3)
+ punpckldq xmm1, xmm6 ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+ punpckhdq xmm3, xmm6 ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+ movdqa xmm7, xmm4 ; transpose coefficients(phase 3)
+ punpckldq xmm4, xmm2 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
+ punpckhdq xmm7, xmm2 ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
+
+ pshufd xmm5, xmm1, 0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+ pshufd xmm0, xmm3, 0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+ pshufd xmm6, xmm4, 0x4E ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
+ pshufd xmm2, xmm7, 0x4E ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
+
+ mov rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+ mov rsip, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1
+ movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
+ mov rdxp, JSAMPROW [rdi+4*SIZEOF_JSAMPROW]
+ mov rsip, JSAMPROW [rdi+6*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
+ movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm7
+
+ mov rdxp, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+ mov rsip, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
+ movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0
+ mov rdxp, JSAMPROW [rdi+5*SIZEOF_JSAMPROW]
+ mov rsip, JSAMPROW [rdi+7*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
+ movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2
+
+ uncollect_args 4
+ mov rsp, rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- original rbp
+ pop rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jidctint-avx2.asm b/media/libjpeg/simd/x86_64/jidctint-avx2.asm
new file mode 100644
index 0000000000..ca7e317f6e
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jidctint-avx2.asm
@@ -0,0 +1,418 @@
+;
+; jidctint.asm - accurate integer IDCT (64-bit AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, 2018, 2020, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slower but more accurate integer implementation of the
+; inverse DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jidctint.c; see jidctint.c for more
+; details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 13
+%define PASS1_BITS 2
+
+%define DESCALE_P1 (CONST_BITS - PASS1_BITS)
+%define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3)
+
+%if CONST_BITS == 13
+F_0_298 equ 2446 ; FIX(0.298631336)
+F_0_390 equ 3196 ; FIX(0.390180644)
+F_0_541 equ 4433 ; FIX(0.541196100)
+F_0_765 equ 6270 ; FIX(0.765366865)
+F_0_899 equ 7373 ; FIX(0.899976223)
+F_1_175 equ 9633 ; FIX(1.175875602)
+F_1_501 equ 12299 ; FIX(1.501321110)
+F_1_847 equ 15137 ; FIX(1.847759065)
+F_1_961 equ 16069 ; FIX(1.961570560)
+F_2_053 equ 16819 ; FIX(2.053119869)
+F_2_562 equ 20995 ; FIX(2.562915447)
+F_3_072 equ 25172 ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
+F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS) ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS) ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS) ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS) ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS) ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS) ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026)
+%endif
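+
+; (For reference, these values mirror the fixed-point macros of the C
+; implementation in jidctint.c, roughly:
+;
+;   #define FIX(x)         ((int32_t)((x) * (1L << CONST_BITS) + 0.5))
+;   #define DESCALE(x, n)  (((x) + (1L << ((n) - 1))) >> (n))
+;
+; e.g. F_0_541 = FIX(0.541196100) = 4433 at CONST_BITS = 13; the
+; PD_DESCALE_P* constants defined below hold the 1 << (n - 1) rounding
+; term that is added before each arithmetic right shift.)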
+
+; --------------------------------------------------------------------------
+; In-place 8x8x16-bit inverse matrix transpose using AVX2 instructions
+; %1-%4: Input/output registers
+; %5-%8: Temp registers
+
+%macro dotranspose 8
+ ; %5=(00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71)
+ ; %6=(03 13 23 33 43 53 63 73 02 12 22 32 42 52 62 72)
+ ; %7=(04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75)
+ ; %8=(07 17 27 37 47 57 67 77 06 16 26 36 46 56 66 76)
+
+ vpermq %5, %1, 0xD8
+ vpermq %6, %2, 0x72
+ vpermq %7, %3, 0xD8
+ vpermq %8, %4, 0x72
+ ; transpose coefficients(phase 1)
+ ; %5=(00 10 20 30 01 11 21 31 40 50 60 70 41 51 61 71)
+ ; %6=(02 12 22 32 03 13 23 33 42 52 62 72 43 53 63 73)
+ ; %7=(04 14 24 34 05 15 25 35 44 54 64 74 45 55 65 75)
+ ; %8=(06 16 26 36 07 17 27 37 46 56 66 76 47 57 67 77)
+
+ vpunpcklwd %1, %5, %6
+ vpunpckhwd %2, %5, %6
+ vpunpcklwd %3, %7, %8
+ vpunpckhwd %4, %7, %8
+ ; transpose coefficients(phase 2)
+ ; %1=(00 02 10 12 20 22 30 32 40 42 50 52 60 62 70 72)
+ ; %2=(01 03 11 13 21 23 31 33 41 43 51 53 61 63 71 73)
+ ; %3=(04 06 14 16 24 26 34 36 44 46 54 56 64 66 74 76)
+ ; %4=(05 07 15 17 25 27 35 37 45 47 55 57 65 67 75 77)
+
+ vpunpcklwd %5, %1, %2
+ vpunpcklwd %6, %3, %4
+ vpunpckhwd %7, %1, %2
+ vpunpckhwd %8, %3, %4
+ ; transpose coefficients(phase 3)
+ ; %5=(00 01 02 03 10 11 12 13 40 41 42 43 50 51 52 53)
+ ; %6=(04 05 06 07 14 15 16 17 44 45 46 47 54 55 56 57)
+ ; %7=(20 21 22 23 30 31 32 33 60 61 62 63 70 71 72 73)
+ ; %8=(24 25 26 27 34 35 36 37 64 65 66 67 74 75 76 77)
+
+ vpunpcklqdq %1, %5, %6
+ vpunpckhqdq %2, %5, %6
+ vpunpcklqdq %3, %7, %8
+ vpunpckhqdq %4, %7, %8
+ ; transpose coefficients(phase 4)
+ ; %1=(00 01 02 03 04 05 06 07 40 41 42 43 44 45 46 47)
+ ; %2=(10 11 12 13 14 15 16 17 50 51 52 53 54 55 56 57)
+ ; %3=(20 21 22 23 24 25 26 27 60 61 62 63 64 65 66 67)
+ ; %4=(30 31 32 33 34 35 36 37 70 71 72 73 74 75 76 77)
+%endmacro
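+
+; (Net effect: an ordinary 8x8 transpose of 16-bit elements, i.e. the
+; scalar loop below (illustration only), with the vpermq steps
+; pre-arranging the 128-bit lanes so that every vpunpck stage can stay
+; in-lane:
+;
+;   for (i = 0; i < 8; i++)
+;     for (j = 0; j < i; j++) {
+;       int16_t t = m[i][j];  m[i][j] = m[j][i];  m[j][i] = t;
+;     }
+; )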
+
+; --------------------------------------------------------------------------
+; In-place 8x8x16-bit accurate integer inverse DCT using AVX2 instructions
+; %1-%4: Input/output registers
+; %5-%12: Temp registers
+; %13: Pass (1 or 2)
+
+%macro dodct 13
+ ; -- Even part
+
+ ; (Original)
+ ; z1 = (z2 + z3) * 0.541196100;
+ ; tmp2 = z1 + z3 * -1.847759065;
+ ; tmp3 = z1 + z2 * 0.765366865;
+ ;
+ ; (This implementation)
+ ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+ ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
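+;
+; (Check: distributing z1 = (z2 + z3) * 0.541196100 gives
+;   tmp2 = 0.541196100 * z2 + (0.541196100 - 1.847759065) * z3
+;   tmp3 = (0.541196100 + 0.765366865) * z2 + 0.541196100 * z3
+; i.e. each output is a two-term dot product of (z2, z3).)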
+
+ vperm2i128 %6, %3, %3, 0x01 ; %6=in6_2
+ vpunpcklwd %5, %3, %6 ; %5=in26_62L
+ vpunpckhwd %6, %3, %6 ; %6=in26_62H
+ vpmaddwd %5, %5, [rel PW_F130_F054_MF130_F054] ; %5=tmp3_2L
+ vpmaddwd %6, %6, [rel PW_F130_F054_MF130_F054] ; %6=tmp3_2H
+
+ vperm2i128 %7, %1, %1, 0x01 ; %7=in4_0
+ vpsignw %1, %1, [rel PW_1_NEG1]
+ vpaddw %7, %7, %1 ; %7=(in0+in4)_(in0-in4)
+
+ vpxor %1, %1, %1
+ vpunpcklwd %8, %1, %7 ; %8=tmp0_1L
+ vpunpckhwd %1, %1, %7 ; %1=tmp0_1H
+ vpsrad %8, %8, (16-CONST_BITS) ; vpsrad %8,16 & vpslld %8,CONST_BITS
+ vpsrad %1, %1, (16-CONST_BITS) ; vpsrad %1,16 & vpslld %1,CONST_BITS
+
+ vpsubd %11, %8, %5 ; %11=tmp0_1L-tmp3_2L=tmp13_12L
+ vpaddd %9, %8, %5 ; %9=tmp0_1L+tmp3_2L=tmp10_11L
+ vpsubd %12, %1, %6 ; %12=tmp0_1H-tmp3_2H=tmp13_12H
+ vpaddd %10, %1, %6 ; %10=tmp0_1H+tmp3_2H=tmp10_11H
+
+ ; -- Odd part
+
+ vpaddw %1, %4, %2 ; %1=in7_5+in3_1=z3_4
+
+ ; (Original)
+ ; z5 = (z3 + z4) * 1.175875602;
+ ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ ; z3 += z5; z4 += z5;
+ ;
+ ; (This implementation)
+ ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+ vperm2i128 %8, %1, %1, 0x01 ; %8=z4_3
+ vpunpcklwd %7, %1, %8 ; %7=z34_43L
+ vpunpckhwd %8, %1, %8 ; %8=z34_43H
+ vpmaddwd %7, %7, [rel PW_MF078_F117_F078_F117] ; %7=z3_4L
+ vpmaddwd %8, %8, [rel PW_MF078_F117_F078_F117] ; %8=z3_4H
+
+ ; (Original)
+ ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
+ ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
+ ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
+ ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ ; tmp0 += z1 + z3; tmp1 += z2 + z4;
+ ; tmp2 += z2 + z3; tmp3 += z1 + z4;
+ ;
+ ; (This implementation)
+ ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+ ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+ ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+ ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+ ; tmp0 += z3; tmp1 += z4;
+ ; tmp2 += z3; tmp3 += z4;
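+;
+; (Each vpmaddwd below evaluates those two-term forms directly: with
+; words interleaved as (a0, b0, a1, b1, ...) and a constant pattern of
+; (c, d) pairs, every dword lane of the result is a*c + b*d as a 32-bit
+; sum.)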
+
+ vperm2i128 %2, %2, %2, 0x01 ; %2=in1_3
+ vpunpcklwd %3, %4, %2 ; %3=in71_53L
+ vpunpckhwd %4, %4, %2 ; %4=in71_53H
+
+ vpmaddwd %5, %3, [rel PW_MF060_MF089_MF050_MF256] ; %5=tmp0_1L
+ vpmaddwd %6, %4, [rel PW_MF060_MF089_MF050_MF256] ; %6=tmp0_1H
+ vpaddd %5, %5, %7 ; %5=tmp0_1L+z3_4L=tmp0_1L
+ vpaddd %6, %6, %8 ; %6=tmp0_1H+z3_4H=tmp0_1H
+
+ vpmaddwd %3, %3, [rel PW_MF089_F060_MF256_F050] ; %3=tmp3_2L
+ vpmaddwd %4, %4, [rel PW_MF089_F060_MF256_F050] ; %4=tmp3_2H
+ vperm2i128 %7, %7, %7, 0x01 ; %7=z4_3L
+ vperm2i128 %8, %8, %8, 0x01 ; %8=z4_3H
+ vpaddd %7, %3, %7 ; %7=tmp3_2L+z4_3L=tmp3_2L
+ vpaddd %8, %4, %8 ; %8=tmp3_2H+z4_3H=tmp3_2H
+
+ ; -- Final output stage
+
+ vpaddd %1, %9, %7 ; %1=tmp10_11L+tmp3_2L=data0_1L
+ vpaddd %2, %10, %8 ; %2=tmp10_11H+tmp3_2H=data0_1H
+ vpaddd %1, %1, [rel PD_DESCALE_P %+ %13]
+ vpaddd %2, %2, [rel PD_DESCALE_P %+ %13]
+ vpsrad %1, %1, DESCALE_P %+ %13
+ vpsrad %2, %2, DESCALE_P %+ %13
+ vpackssdw %1, %1, %2 ; %1=data0_1
+
+ vpsubd %3, %9, %7 ; %3=tmp10_11L-tmp3_2L=data7_6L
+ vpsubd %4, %10, %8 ; %4=tmp10_11H-tmp3_2H=data7_6H
+ vpaddd %3, %3, [rel PD_DESCALE_P %+ %13]
+ vpaddd %4, %4, [rel PD_DESCALE_P %+ %13]
+ vpsrad %3, %3, DESCALE_P %+ %13
+ vpsrad %4, %4, DESCALE_P %+ %13
+ vpackssdw %4, %3, %4 ; %4=data7_6
+
+ vpaddd %7, %11, %5 ; %7=tmp13_12L+tmp0_1L=data3_2L
+ vpaddd %8, %12, %6 ; %8=tmp13_12H+tmp0_1H=data3_2H
+ vpaddd %7, %7, [rel PD_DESCALE_P %+ %13]
+ vpaddd %8, %8, [rel PD_DESCALE_P %+ %13]
+ vpsrad %7, %7, DESCALE_P %+ %13
+ vpsrad %8, %8, DESCALE_P %+ %13
+ vpackssdw %2, %7, %8 ; %2=data3_2
+
+ vpsubd %7, %11, %5 ; %7=tmp13_12L-tmp0_1L=data4_5L
+ vpsubd %8, %12, %6 ; %8=tmp13_12H-tmp0_1H=data4_5H
+ vpaddd %7, %7, [rel PD_DESCALE_P %+ %13]
+ vpaddd %8, %8, [rel PD_DESCALE_P %+ %13]
+ vpsrad %7, %7, DESCALE_P %+ %13
+ vpsrad %8, %8, DESCALE_P %+ %13
+ vpackssdw %3, %7, %8 ; %3=data4_5
+%endmacro
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_idct_islow_avx2)
+
+EXTN(jconst_idct_islow_avx2):
+
+PW_F130_F054_MF130_F054 times 4 dw (F_0_541 + F_0_765), F_0_541
+ times 4 dw (F_0_541 - F_1_847), F_0_541
+PW_MF078_F117_F078_F117 times 4 dw (F_1_175 - F_1_961), F_1_175
+ times 4 dw (F_1_175 - F_0_390), F_1_175
+PW_MF060_MF089_MF050_MF256 times 4 dw (F_0_298 - F_0_899), -F_0_899
+ times 4 dw (F_2_053 - F_2_562), -F_2_562
+PW_MF089_F060_MF256_F050 times 4 dw -F_0_899, (F_1_501 - F_0_899)
+ times 4 dw -F_2_562, (F_3_072 - F_2_562)
+PD_DESCALE_P1 times 8 dd 1 << (DESCALE_P1 - 1)
+PD_DESCALE_P2 times 8 dd 1 << (DESCALE_P2 - 1)
+PB_CENTERJSAMP times 32 db CENTERJSAMPLE
+PW_1_NEG1 times 8 dw 1
+ times 8 dw -1
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_islow_avx2(void *dct_table, JCOEFPTR coef_block,
+; JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+; r10 = void *dct_table
+; r11 = JCOEFPTR coef_block
+; r12 = JSAMPARRAY output_buf
+; r13d = JDIMENSION output_col
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_idct_islow_avx2)
+
+EXTN(jsimd_idct_islow_avx2):
+ push rbp
+ mov rax, rsp ; rax = original rbp
+ mov rbp, rsp ; rbp = aligned rbp
+ push_xmm 4
+ collect_args 4
+
+ ; ---- Pass 1: process columns.
+
+%ifndef NO_ZERO_COLUMN_TEST_ISLOW_AVX2
+ mov eax, dword [DWBLOCK(1,0,r11,SIZEOF_JCOEF)]
+ or eax, dword [DWBLOCK(2,0,r11,SIZEOF_JCOEF)]
+ jnz near .columnDCT
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(1,0,r11,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(2,0,r11,SIZEOF_JCOEF)]
+ vpor xmm0, xmm0, XMMWORD [XMMBLOCK(3,0,r11,SIZEOF_JCOEF)]
+ vpor xmm1, xmm1, XMMWORD [XMMBLOCK(4,0,r11,SIZEOF_JCOEF)]
+ vpor xmm0, xmm0, XMMWORD [XMMBLOCK(5,0,r11,SIZEOF_JCOEF)]
+ vpor xmm1, xmm1, XMMWORD [XMMBLOCK(6,0,r11,SIZEOF_JCOEF)]
+ vpor xmm0, xmm0, XMMWORD [XMMBLOCK(7,0,r11,SIZEOF_JCOEF)]
+ vpor xmm1, xmm1, xmm0
+ vpacksswb xmm1, xmm1, xmm1
+ vpacksswb xmm1, xmm1, xmm1
+ movd eax, xmm1
+ test rax, rax
+ jnz short .columnDCT
+
+ ; -- AC terms all zero
+
+ movdqa xmm5, XMMWORD [XMMBLOCK(0,0,r11,SIZEOF_JCOEF)]
+ vpmullw xmm5, xmm5, XMMWORD [XMMBLOCK(0,0,r10,SIZEOF_ISLOW_MULT_TYPE)]
+
+ vpsllw xmm5, xmm5, PASS1_BITS
+
+ vpunpcklwd xmm4, xmm5, xmm5 ; xmm4=(00 00 01 01 02 02 03 03)
+ vpunpckhwd xmm5, xmm5, xmm5 ; xmm5=(04 04 05 05 06 06 07 07)
+ vinserti128 ymm4, ymm4, xmm5, 1
+
+ vpshufd ymm0, ymm4, 0x00 ; ymm0=col0_4=(00 00 00 00 00 00 00 00 04 04 04 04 04 04 04 04)
+ vpshufd ymm1, ymm4, 0x55 ; ymm1=col1_5=(01 01 01 01 01 01 01 01 05 05 05 05 05 05 05 05)
+ vpshufd ymm2, ymm4, 0xAA ; ymm2=col2_6=(02 02 02 02 02 02 02 02 06 06 06 06 06 06 06 06)
+ vpshufd ymm3, ymm4, 0xFF ; ymm3=col3_7=(03 03 03 03 03 03 03 03 07 07 07 07 07 07 07 07)
+
+ jmp near .column_end
+%endif
+.columnDCT:
+
+ vmovdqu ymm4, YMMWORD [YMMBLOCK(0,0,r11,SIZEOF_JCOEF)] ; ymm4=in0_1
+ vmovdqu ymm5, YMMWORD [YMMBLOCK(2,0,r11,SIZEOF_JCOEF)] ; ymm5=in2_3
+ vmovdqu ymm6, YMMWORD [YMMBLOCK(4,0,r11,SIZEOF_JCOEF)] ; ymm6=in4_5
+ vmovdqu ymm7, YMMWORD [YMMBLOCK(6,0,r11,SIZEOF_JCOEF)] ; ymm7=in6_7
+ vpmullw ymm4, ymm4, YMMWORD [YMMBLOCK(0,0,r10,SIZEOF_ISLOW_MULT_TYPE)]
+ vpmullw ymm5, ymm5, YMMWORD [YMMBLOCK(2,0,r10,SIZEOF_ISLOW_MULT_TYPE)]
+ vpmullw ymm6, ymm6, YMMWORD [YMMBLOCK(4,0,r10,SIZEOF_ISLOW_MULT_TYPE)]
+ vpmullw ymm7, ymm7, YMMWORD [YMMBLOCK(6,0,r10,SIZEOF_ISLOW_MULT_TYPE)]
+
+ vperm2i128 ymm0, ymm4, ymm6, 0x20 ; ymm0=in0_4
+ vperm2i128 ymm1, ymm5, ymm4, 0x31 ; ymm1=in3_1
+ vperm2i128 ymm2, ymm5, ymm7, 0x20 ; ymm2=in2_6
+ vperm2i128 ymm3, ymm7, ymm6, 0x31 ; ymm3=in7_5
+
+ dodct ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm8, ymm9, ymm10, ymm11, 1
+ ; ymm0=data0_1, ymm1=data3_2, ymm2=data4_5, ymm3=data7_6
+
+ dotranspose ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7
+ ; ymm0=data0_4, ymm1=data1_5, ymm2=data2_6, ymm3=data3_7
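+
+; (Pass 1 leaves the block transposed, so the row pass below can reuse
+; the same dodct macro; only the descale shift differs, selected by its
+; final argument: DESCALE_P1 for pass 1, DESCALE_P2 for pass 2.)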
+
+.column_end:
+
+ ; -- Prefetch the next coefficient block
+
+ prefetchnta [r11 + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+ prefetchnta [r11 + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+ prefetchnta [r11 + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+ prefetchnta [r11 + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+ ; ---- Pass 2: process rows.
+
+ vperm2i128 ymm4, ymm3, ymm1, 0x31 ; ymm4=in7_5
+ vperm2i128 ymm1, ymm3, ymm1, 0x20 ; ymm1=in3_1
+
+ dodct ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7, ymm8, ymm9, ymm10, ymm11, 2
+ ; ymm0=data0_1, ymm1=data3_2, ymm2=data4_5, ymm4=data7_6
+
+ dotranspose ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7
+ ; ymm0=data0_4, ymm1=data1_5, ymm2=data2_6, ymm4=data3_7
+
+ vpacksswb ymm0, ymm0, ymm1 ; ymm0=data01_45
+ vpacksswb ymm1, ymm2, ymm4 ; ymm1=data23_67
+ vpaddb ymm0, ymm0, [rel PB_CENTERJSAMP]
+ vpaddb ymm1, ymm1, [rel PB_CENTERJSAMP]
+
+ vextracti128 xmm6, ymm1, 1 ; xmm6=data67
+ vextracti128 xmm4, ymm0, 1 ; xmm4=data45
+ vextracti128 xmm2, ymm1, 0 ; xmm2=data23
+ vextracti128 xmm0, ymm0, 0 ; xmm0=data01
+
+ vpshufd xmm1, xmm0, 0x4E ; xmm1=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+ vpshufd xmm3, xmm2, 0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+ vpshufd xmm5, xmm4, 0x4E ; xmm5=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
+ vpshufd xmm7, xmm6, 0x4E ; xmm7=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
+
+ vzeroupper
+
+ mov eax, r13d
+
+ mov rdxp, JSAMPROW [r12+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov rsip, JSAMPROW [r12+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm0
+ movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm1
+
+ mov rdxp, JSAMPROW [r12+2*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov rsip, JSAMPROW [r12+3*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm2
+ movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
+
+ mov rdxp, JSAMPROW [r12+4*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov rsip, JSAMPROW [r12+5*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
+ movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm5
+
+ mov rdxp, JSAMPROW [r12+6*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov rsip, JSAMPROW [r12+7*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
+ movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm7
+
+ uncollect_args 4
+ pop_xmm 4
+ pop rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jidctint-sse2.asm b/media/libjpeg/simd/x86_64/jidctint-sse2.asm
new file mode 100644
index 0000000000..7aa869bc0b
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jidctint-sse2.asm
@@ -0,0 +1,847 @@
+;
+; jidctint.asm - accurate integer IDCT (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, 2020, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slower but more accurate integer implementation of the
+; inverse DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jidctint.c; see jidctint.c for more
+; details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 13
+%define PASS1_BITS 2
+
+%define DESCALE_P1 (CONST_BITS - PASS1_BITS)
+%define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3)
+
+%if CONST_BITS == 13
+F_0_298 equ 2446 ; FIX(0.298631336)
+F_0_390 equ 3196 ; FIX(0.390180644)
+F_0_541 equ 4433 ; FIX(0.541196100)
+F_0_765 equ 6270 ; FIX(0.765366865)
+F_0_899 equ 7373 ; FIX(0.899976223)
+F_1_175 equ 9633 ; FIX(1.175875602)
+F_1_501 equ 12299 ; FIX(1.501321110)
+F_1_847 equ 15137 ; FIX(1.847759065)
+F_1_961 equ 16069 ; FIX(1.961570560)
+F_2_053 equ 16819 ; FIX(2.053119869)
+F_2_562 equ 20995 ; FIX(2.562915447)
+F_3_072 equ 25172 ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
+F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS) ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS) ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS) ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS) ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS) ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS) ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_idct_islow_sse2)
+
+EXTN(jconst_idct_islow_sse2):
+
+PW_F130_F054 times 4 dw (F_0_541 + F_0_765), F_0_541
+PW_F054_MF130 times 4 dw F_0_541, (F_0_541 - F_1_847)
+PW_MF078_F117 times 4 dw (F_1_175 - F_1_961), F_1_175
+PW_F117_F078 times 4 dw F_1_175, (F_1_175 - F_0_390)
+PW_MF060_MF089 times 4 dw (F_0_298 - F_0_899), -F_0_899
+PW_MF089_F060 times 4 dw -F_0_899, (F_1_501 - F_0_899)
+PW_MF050_MF256 times 4 dw (F_2_053 - F_2_562), -F_2_562
+PW_MF256_F050 times 4 dw -F_2_562, (F_3_072 - F_2_562)
+PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1 - 1)
+PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2 - 1)
+PB_CENTERJSAMP times 16 db CENTERJSAMPLE
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_islow_sse2(void *dct_table, JCOEFPTR coef_block,
+; JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+; r10 = void *dct_table
+; r11 = JCOEFPTR coef_block
+; r12 = JSAMPARRAY output_buf
+; r13d = JDIMENSION output_col
+
+%define original_rbp rbp + 0
+%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+ ; xmmword wk[WK_NUM]
+%define WK_NUM 12
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_idct_islow_sse2)
+
+EXTN(jsimd_idct_islow_sse2):
+ push rbp
+ mov rax, rsp ; rax = original rbp
+ sub rsp, byte 4
+ and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [rsp], rax
+ mov rbp, rsp ; rbp = aligned rbp
+ lea rsp, [wk(0)]
+ collect_args 4
+
+ ; ---- Pass 1: process columns from input.
+
+ mov rdx, r10 ; quantptr
+ mov rsi, r11 ; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2
+ mov eax, dword [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+ or eax, dword [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+ jnz near .columnDCT
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+ por xmm1, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+ por xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+ por xmm1, xmm0
+ packsswb xmm1, xmm1
+ packsswb xmm1, xmm1
+ movd eax, xmm1
+ test rax, rax
+ jnz short .columnDCT
+
+ ; -- AC terms all zero
+
+ movdqa xmm5, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm5, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ psllw xmm5, PASS1_BITS
+
+ movdqa xmm4, xmm5 ; xmm5=in0=(00 01 02 03 04 05 06 07)
+ punpcklwd xmm5, xmm5 ; xmm5=(00 00 01 01 02 02 03 03)
+ punpckhwd xmm4, xmm4 ; xmm4=(04 04 05 05 06 06 07 07)
+
+ pshufd xmm7, xmm5, 0x00 ; xmm7=col0=(00 00 00 00 00 00 00 00)
+ pshufd xmm6, xmm5, 0x55 ; xmm6=col1=(01 01 01 01 01 01 01 01)
+ pshufd xmm1, xmm5, 0xAA ; xmm1=col2=(02 02 02 02 02 02 02 02)
+ pshufd xmm5, xmm5, 0xFF ; xmm5=col3=(03 03 03 03 03 03 03 03)
+ pshufd xmm0, xmm4, 0x00 ; xmm0=col4=(04 04 04 04 04 04 04 04)
+ pshufd xmm3, xmm4, 0x55 ; xmm3=col5=(05 05 05 05 05 05 05 05)
+ pshufd xmm2, xmm4, 0xAA ; xmm2=col6=(06 06 06 06 06 06 06 06)
+ pshufd xmm4, xmm4, 0xFF ; xmm4=col7=(07 07 07 07 07 07 07 07)
+
+ movdqa XMMWORD [wk(8)], xmm6 ; wk(8)=col1
+ movdqa XMMWORD [wk(9)], xmm5 ; wk(9)=col3
+ movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5
+ movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7
+ jmp near .column_end
+%endif
+.columnDCT:
+
+ ; -- Even part
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+ movdqa xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ ; (Original)
+ ; z1 = (z2 + z3) * 0.541196100;
+ ; tmp2 = z1 + z3 * -1.847759065;
+ ; tmp3 = z1 + z2 * 0.765366865;
+ ;
+ ; (This implementation)
+ ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+ ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+ movdqa xmm4, xmm1 ; xmm1=in2=z2
+ movdqa xmm5, xmm1
+ punpcklwd xmm4, xmm3 ; xmm3=in6=z3
+ punpckhwd xmm5, xmm3
+ movdqa xmm1, xmm4
+ movdqa xmm3, xmm5
+ pmaddwd xmm4, [rel PW_F130_F054] ; xmm4=tmp3L
+ pmaddwd xmm5, [rel PW_F130_F054] ; xmm5=tmp3H
+ pmaddwd xmm1, [rel PW_F054_MF130] ; xmm1=tmp2L
+ pmaddwd xmm3, [rel PW_F054_MF130] ; xmm3=tmp2H
+
+ movdqa xmm6, xmm0
+ paddw xmm0, xmm2 ; xmm0=in0+in4
+ psubw xmm6, xmm2 ; xmm6=in0-in4
+
+ pxor xmm7, xmm7
+ pxor xmm2, xmm2
+ punpcklwd xmm7, xmm0 ; xmm7=tmp0L
+ punpckhwd xmm2, xmm0 ; xmm2=tmp0H
+ psrad xmm7, (16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS
+ psrad xmm2, (16-CONST_BITS) ; psrad xmm2,16 & pslld xmm2,CONST_BITS
+
+ movdqa xmm0, xmm7
+ paddd xmm7, xmm4 ; xmm7=tmp10L
+ psubd xmm0, xmm4 ; xmm0=tmp13L
+ movdqa xmm4, xmm2
+ paddd xmm2, xmm5 ; xmm2=tmp10H
+ psubd xmm4, xmm5 ; xmm4=tmp13H
+
+ movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=tmp10L
+ movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=tmp10H
+ movdqa XMMWORD [wk(2)], xmm0 ; wk(2)=tmp13L
+ movdqa XMMWORD [wk(3)], xmm4 ; wk(3)=tmp13H
+
+ pxor xmm5, xmm5
+ pxor xmm7, xmm7
+ punpcklwd xmm5, xmm6 ; xmm5=tmp1L
+ punpckhwd xmm7, xmm6 ; xmm7=tmp1H
+ psrad xmm5, (16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS
+ psrad xmm7, (16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS
+
+ movdqa xmm2, xmm5
+ paddd xmm5, xmm1 ; xmm5=tmp11L
+ psubd xmm2, xmm1 ; xmm2=tmp12L
+ movdqa xmm0, xmm7
+ paddd xmm7, xmm3 ; xmm7=tmp11H
+ psubd xmm0, xmm3 ; xmm0=tmp12H
+
+ movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L
+ movdqa XMMWORD [wk(5)], xmm7 ; wk(5)=tmp11H
+ movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=tmp12L
+ movdqa XMMWORD [wk(7)], xmm0 ; wk(7)=tmp12H
+
+ ; -- Odd part
+
+ movdqa xmm4, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm6, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm4, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm6, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm1, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ movdqa xmm5, xmm6
+ movdqa xmm7, xmm4
+ paddw xmm5, xmm3 ; xmm5=z3
+ paddw xmm7, xmm1 ; xmm7=z4
+
+ ; (Original)
+ ; z5 = (z3 + z4) * 1.175875602;
+ ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ ; z3 += z5; z4 += z5;
+ ;
+ ; (This implementation)
+ ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+ movdqa xmm2, xmm5
+ movdqa xmm0, xmm5
+ punpcklwd xmm2, xmm7
+ punpckhwd xmm0, xmm7
+ movdqa xmm5, xmm2
+ movdqa xmm7, xmm0
+ pmaddwd xmm2, [rel PW_MF078_F117] ; xmm2=z3L
+ pmaddwd xmm0, [rel PW_MF078_F117] ; xmm0=z3H
+ pmaddwd xmm5, [rel PW_F117_F078] ; xmm5=z4L
+ pmaddwd xmm7, [rel PW_F117_F078] ; xmm7=z4H
+
+ movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=z3L
+ movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=z3H
+
+ ; (Original)
+ ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
+ ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
+ ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
+ ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ ; tmp0 += z1 + z3; tmp1 += z2 + z4;
+ ; tmp2 += z2 + z3; tmp3 += z1 + z4;
+ ;
+ ; (This implementation)
+ ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+ ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+ ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+ ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+ ; tmp0 += z3; tmp1 += z4;
+ ; tmp2 += z3; tmp3 += z4;
+
+ movdqa xmm2, xmm3
+ movdqa xmm0, xmm3
+ punpcklwd xmm2, xmm4
+ punpckhwd xmm0, xmm4
+ movdqa xmm3, xmm2
+ movdqa xmm4, xmm0
+ pmaddwd xmm2, [rel PW_MF060_MF089] ; xmm2=tmp0L
+ pmaddwd xmm0, [rel PW_MF060_MF089] ; xmm0=tmp0H
+ pmaddwd xmm3, [rel PW_MF089_F060] ; xmm3=tmp3L
+ pmaddwd xmm4, [rel PW_MF089_F060] ; xmm4=tmp3H
+
+ paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp0L
+ paddd xmm0, XMMWORD [wk(11)] ; xmm0=tmp0H
+ paddd xmm3, xmm5 ; xmm3=tmp3L
+ paddd xmm4, xmm7 ; xmm4=tmp3H
+
+ movdqa XMMWORD [wk(8)], xmm2 ; wk(8)=tmp0L
+ movdqa XMMWORD [wk(9)], xmm0 ; wk(9)=tmp0H
+
+ movdqa xmm2, xmm1
+ movdqa xmm0, xmm1
+ punpcklwd xmm2, xmm6
+ punpckhwd xmm0, xmm6
+ movdqa xmm1, xmm2
+ movdqa xmm6, xmm0
+ pmaddwd xmm2, [rel PW_MF050_MF256] ; xmm2=tmp1L
+ pmaddwd xmm0, [rel PW_MF050_MF256] ; xmm0=tmp1H
+ pmaddwd xmm1, [rel PW_MF256_F050] ; xmm1=tmp2L
+ pmaddwd xmm6, [rel PW_MF256_F050] ; xmm6=tmp2H
+
+ paddd xmm2, xmm5 ; xmm2=tmp1L
+ paddd xmm0, xmm7 ; xmm0=tmp1H
+ paddd xmm1, XMMWORD [wk(10)] ; xmm1=tmp2L
+ paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H
+
+ movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=tmp1L
+ movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=tmp1H
+
+ ; -- Final output stage
+
+ movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L
+ movdqa xmm7, XMMWORD [wk(1)] ; xmm7=tmp10H
+
+ movdqa xmm2, xmm5
+ movdqa xmm0, xmm7
+ paddd xmm5, xmm3 ; xmm5=data0L
+ paddd xmm7, xmm4 ; xmm7=data0H
+ psubd xmm2, xmm3 ; xmm2=data7L
+ psubd xmm0, xmm4 ; xmm0=data7H
+
+ movdqa xmm3, [rel PD_DESCALE_P1] ; xmm3=[rel PD_DESCALE_P1]
+
+ paddd xmm5, xmm3
+ paddd xmm7, xmm3
+ psrad xmm5, DESCALE_P1
+ psrad xmm7, DESCALE_P1
+ paddd xmm2, xmm3
+ paddd xmm0, xmm3
+ psrad xmm2, DESCALE_P1
+ psrad xmm0, DESCALE_P1
+
+ packssdw xmm5, xmm7 ; xmm5=data0=(00 01 02 03 04 05 06 07)
+ packssdw xmm2, xmm0 ; xmm2=data7=(70 71 72 73 74 75 76 77)
+
+ movdqa xmm4, XMMWORD [wk(4)] ; xmm4=tmp11L
+ movdqa xmm3, XMMWORD [wk(5)] ; xmm3=tmp11H
+
+ movdqa xmm7, xmm4
+ movdqa xmm0, xmm3
+ paddd xmm4, xmm1 ; xmm4=data1L
+ paddd xmm3, xmm6 ; xmm3=data1H
+ psubd xmm7, xmm1 ; xmm7=data6L
+ psubd xmm0, xmm6 ; xmm0=data6H
+
+ movdqa xmm1, [rel PD_DESCALE_P1] ; xmm1=[rel PD_DESCALE_P1]
+
+ paddd xmm4, xmm1
+ paddd xmm3, xmm1
+ psrad xmm4, DESCALE_P1
+ psrad xmm3, DESCALE_P1
+ paddd xmm7, xmm1
+ paddd xmm0, xmm1
+ psrad xmm7, DESCALE_P1
+ psrad xmm0, DESCALE_P1
+
+ packssdw xmm4, xmm3 ; xmm4=data1=(10 11 12 13 14 15 16 17)
+ packssdw xmm7, xmm0 ; xmm7=data6=(60 61 62 63 64 65 66 67)
+
+ movdqa xmm6, xmm5 ; transpose coefficients(phase 1)
+ punpcklwd xmm5, xmm4 ; xmm5=(00 10 01 11 02 12 03 13)
+ punpckhwd xmm6, xmm4 ; xmm6=(04 14 05 15 06 16 07 17)
+ movdqa xmm1, xmm7 ; transpose coefficients(phase 1)
+ punpcklwd xmm7, xmm2 ; xmm7=(60 70 61 71 62 72 63 73)
+ punpckhwd xmm1, xmm2 ; xmm1=(64 74 65 75 66 76 67 77)
+
+ movdqa xmm3, XMMWORD [wk(6)] ; xmm3=tmp12L
+ movdqa xmm0, XMMWORD [wk(7)] ; xmm0=tmp12H
+ movdqa xmm4, XMMWORD [wk(10)] ; xmm4=tmp1L
+ movdqa xmm2, XMMWORD [wk(11)] ; xmm2=tmp1H
+
+ movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 01 11 02 12 03 13)
+ movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=(04 14 05 15 06 16 07 17)
+ movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=(60 70 61 71 62 72 63 73)
+ movdqa XMMWORD [wk(5)], xmm1 ; wk(5)=(64 74 65 75 66 76 67 77)
+
+ movdqa xmm5, xmm3
+ movdqa xmm6, xmm0
+ paddd xmm3, xmm4 ; xmm3=data2L
+ paddd xmm0, xmm2 ; xmm0=data2H
+ psubd xmm5, xmm4 ; xmm5=data5L
+ psubd xmm6, xmm2 ; xmm6=data5H
+
+ movdqa xmm7, [rel PD_DESCALE_P1] ; xmm7=[rel PD_DESCALE_P1]
+
+ paddd xmm3, xmm7
+ paddd xmm0, xmm7
+ psrad xmm3, DESCALE_P1
+ psrad xmm0, DESCALE_P1
+ paddd xmm5, xmm7
+ paddd xmm6, xmm7
+ psrad xmm5, DESCALE_P1
+ psrad xmm6, DESCALE_P1
+
+ packssdw xmm3, xmm0 ; xmm3=data2=(20 21 22 23 24 25 26 27)
+ packssdw xmm5, xmm6 ; xmm5=data5=(50 51 52 53 54 55 56 57)
+
+ movdqa xmm1, XMMWORD [wk(2)] ; xmm1=tmp13L
+ movdqa xmm4, XMMWORD [wk(3)] ; xmm4=tmp13H
+ movdqa xmm2, XMMWORD [wk(8)] ; xmm2=tmp0L
+ movdqa xmm7, XMMWORD [wk(9)] ; xmm7=tmp0H
+
+ movdqa xmm0, xmm1
+ movdqa xmm6, xmm4
+ paddd xmm1, xmm2 ; xmm1=data3L
+ paddd xmm4, xmm7 ; xmm4=data3H
+ psubd xmm0, xmm2 ; xmm0=data4L
+ psubd xmm6, xmm7 ; xmm6=data4H
+
+ movdqa xmm2, [rel PD_DESCALE_P1] ; xmm2=[rel PD_DESCALE_P1]
+
+ paddd xmm1, xmm2
+ paddd xmm4, xmm2
+ psrad xmm1, DESCALE_P1
+ psrad xmm4, DESCALE_P1
+ paddd xmm0, xmm2
+ paddd xmm6, xmm2
+ psrad xmm0, DESCALE_P1
+ psrad xmm6, DESCALE_P1
+
+ packssdw xmm1, xmm4 ; xmm1=data3=(30 31 32 33 34 35 36 37)
+ packssdw xmm0, xmm6 ; xmm0=data4=(40 41 42 43 44 45 46 47)
+
+ movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 01 11 02 12 03 13)
+ movdqa xmm2, XMMWORD [wk(1)] ; xmm2=(04 14 05 15 06 16 07 17)
+
+ movdqa xmm4, xmm3 ; transpose coefficients(phase 1)
+ punpcklwd xmm3, xmm1 ; xmm3=(20 30 21 31 22 32 23 33)
+ punpckhwd xmm4, xmm1 ; xmm4=(24 34 25 35 26 36 27 37)
+ movdqa xmm6, xmm0 ; transpose coefficients(phase 1)
+ punpcklwd xmm0, xmm5 ; xmm0=(40 50 41 51 42 52 43 53)
+ punpckhwd xmm6, xmm5 ; xmm6=(44 54 45 55 46 56 47 57)
+
+ movdqa xmm1, xmm7 ; transpose coefficients(phase 2)
+ punpckldq xmm7, xmm3 ; xmm7=(00 10 20 30 01 11 21 31)
+ punpckhdq xmm1, xmm3 ; xmm1=(02 12 22 32 03 13 23 33)
+ movdqa xmm5, xmm2 ; transpose coefficients(phase 2)
+ punpckldq xmm2, xmm4 ; xmm2=(04 14 24 34 05 15 25 35)
+ punpckhdq xmm5, xmm4 ; xmm5=(06 16 26 36 07 17 27 37)
+
+ movdqa xmm3, XMMWORD [wk(4)] ; xmm3=(60 70 61 71 62 72 63 73)
+ movdqa xmm4, XMMWORD [wk(5)] ; xmm4=(64 74 65 75 66 76 67 77)
+
+ movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=(04 14 24 34 05 15 25 35)
+ movdqa XMMWORD [wk(7)], xmm5 ; wk(7)=(06 16 26 36 07 17 27 37)
+
+ movdqa xmm2, xmm0 ; transpose coefficients(phase 2)
+ punpckldq xmm0, xmm3 ; xmm0=(40 50 60 70 41 51 61 71)
+ punpckhdq xmm2, xmm3 ; xmm2=(42 52 62 72 43 53 63 73)
+ movdqa xmm5, xmm6 ; transpose coefficients(phase 2)
+ punpckldq xmm6, xmm4 ; xmm6=(44 54 64 74 45 55 65 75)
+ punpckhdq xmm5, xmm4 ; xmm5=(46 56 66 76 47 57 67 77)
+
+ movdqa xmm3, xmm7 ; transpose coefficients(phase 3)
+ punpcklqdq xmm7, xmm0 ; xmm7=col0=(00 10 20 30 40 50 60 70)
+ punpckhqdq xmm3, xmm0 ; xmm3=col1=(01 11 21 31 41 51 61 71)
+ movdqa xmm4, xmm1 ; transpose coefficients(phase 3)
+ punpcklqdq xmm1, xmm2 ; xmm1=col2=(02 12 22 32 42 52 62 72)
+ punpckhqdq xmm4, xmm2 ; xmm4=col3=(03 13 23 33 43 53 63 73)
+
+ movdqa xmm0, XMMWORD [wk(6)] ; xmm0=(04 14 24 34 05 15 25 35)
+ movdqa xmm2, XMMWORD [wk(7)] ; xmm2=(06 16 26 36 07 17 27 37)
+
+ movdqa XMMWORD [wk(8)], xmm3 ; wk(8)=col1
+ movdqa XMMWORD [wk(9)], xmm4 ; wk(9)=col3
+
+ movdqa xmm3, xmm0 ; transpose coefficients(phase 3)
+ punpcklqdq xmm0, xmm6 ; xmm0=col4=(04 14 24 34 44 54 64 74)
+ punpckhqdq xmm3, xmm6 ; xmm3=col5=(05 15 25 35 45 55 65 75)
+ movdqa xmm4, xmm2 ; transpose coefficients(phase 3)
+ punpcklqdq xmm2, xmm5 ; xmm2=col6=(06 16 26 36 46 56 66 76)
+ punpckhqdq xmm4, xmm5 ; xmm4=col7=(07 17 27 37 47 57 67 77)
+
+ movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5
+ movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7
+.column_end:
+
+ ; -- Prefetch the next coefficient block
+
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+ ; ---- Pass 2: process rows from work array, store into output array.
+
+ mov rax, [original_rbp]
+ mov rdi, r12 ; (JSAMPROW *)
+ mov eax, r13d
+
+ ; -- Even part
+
+ ; xmm7=col0, xmm1=col2, xmm0=col4, xmm2=col6
+
+ ; (Original)
+ ; z1 = (z2 + z3) * 0.541196100;
+ ; tmp2 = z1 + z3 * -1.847759065;
+ ; tmp3 = z1 + z2 * 0.765366865;
+ ;
+ ; (This implementation)
+ ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+ ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+ movdqa xmm6, xmm1 ; xmm1=in2=z2
+ movdqa xmm5, xmm1
+ punpcklwd xmm6, xmm2 ; xmm2=in6=z3
+ punpckhwd xmm5, xmm2
+ movdqa xmm1, xmm6
+ movdqa xmm2, xmm5
+ pmaddwd xmm6, [rel PW_F130_F054] ; xmm6=tmp3L
+ pmaddwd xmm5, [rel PW_F130_F054] ; xmm5=tmp3H
+ pmaddwd xmm1, [rel PW_F054_MF130] ; xmm1=tmp2L
+ pmaddwd xmm2, [rel PW_F054_MF130] ; xmm2=tmp2H
+
+ movdqa xmm3, xmm7
+ paddw xmm7, xmm0 ; xmm7=in0+in4
+ psubw xmm3, xmm0 ; xmm3=in0-in4
+
+ pxor xmm4, xmm4
+ pxor xmm0, xmm0
+ punpcklwd xmm4, xmm7 ; xmm4=tmp0L
+ punpckhwd xmm0, xmm7 ; xmm0=tmp0H
+ psrad xmm4, (16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS
+ psrad xmm0, (16-CONST_BITS) ; psrad xmm0,16 & pslld xmm0,CONST_BITS
+
+ movdqa xmm7, xmm4
+ paddd xmm4, xmm6 ; xmm4=tmp10L
+ psubd xmm7, xmm6 ; xmm7=tmp13L
+ movdqa xmm6, xmm0
+ paddd xmm0, xmm5 ; xmm0=tmp10H
+ psubd xmm6, xmm5 ; xmm6=tmp13H
+
+ movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=tmp10L
+ movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp10H
+ movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=tmp13L
+ movdqa XMMWORD [wk(3)], xmm6 ; wk(3)=tmp13H
+
+ pxor xmm5, xmm5
+ pxor xmm4, xmm4
+ punpcklwd xmm5, xmm3 ; xmm5=tmp1L
+ punpckhwd xmm4, xmm3 ; xmm4=tmp1H
+ psrad xmm5, (16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS
+ psrad xmm4, (16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS
+
+ movdqa xmm0, xmm5
+ paddd xmm5, xmm1 ; xmm5=tmp11L
+ psubd xmm0, xmm1 ; xmm0=tmp12L
+ movdqa xmm7, xmm4
+ paddd xmm4, xmm2 ; xmm4=tmp11H
+ psubd xmm7, xmm2 ; xmm7=tmp12H
+
+ movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L
+ movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=tmp11H
+ movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=tmp12L
+ movdqa XMMWORD [wk(7)], xmm7 ; wk(7)=tmp12H
+
+ ; -- Odd part
+
+ movdqa xmm6, XMMWORD [wk(9)] ; xmm6=col3
+ movdqa xmm3, XMMWORD [wk(8)] ; xmm3=col1
+ movdqa xmm1, XMMWORD [wk(11)] ; xmm1=col7
+ movdqa xmm2, XMMWORD [wk(10)] ; xmm2=col5
+
+ movdqa xmm5, xmm6
+ movdqa xmm4, xmm3
+ paddw xmm5, xmm1 ; xmm5=z3
+ paddw xmm4, xmm2 ; xmm4=z4
+
+ ; (Original)
+ ; z5 = (z3 + z4) * 1.175875602;
+ ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ ; z3 += z5; z4 += z5;
+ ;
+ ; (This implementation)
+ ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+ movdqa xmm0, xmm5
+ movdqa xmm7, xmm5
+ punpcklwd xmm0, xmm4
+ punpckhwd xmm7, xmm4
+ movdqa xmm5, xmm0
+ movdqa xmm4, xmm7
+ pmaddwd xmm0, [rel PW_MF078_F117] ; xmm0=z3L
+ pmaddwd xmm7, [rel PW_MF078_F117] ; xmm7=z3H
+ pmaddwd xmm5, [rel PW_F117_F078] ; xmm5=z4L
+ pmaddwd xmm4, [rel PW_F117_F078] ; xmm4=z4H
+
+ movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=z3L
+ movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=z3H
+
+ ; (Original)
+ ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
+ ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
+ ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
+ ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ ; tmp0 += z1 + z3; tmp1 += z2 + z4;
+ ; tmp2 += z2 + z3; tmp3 += z1 + z4;
+ ;
+ ; (This implementation)
+ ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+ ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+ ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+ ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+ ; tmp0 += z3; tmp1 += z4;
+ ; tmp2 += z3; tmp3 += z4;
+
+ movdqa xmm0, xmm1
+ movdqa xmm7, xmm1
+ punpcklwd xmm0, xmm3
+ punpckhwd xmm7, xmm3
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm7
+ pmaddwd xmm0, [rel PW_MF060_MF089] ; xmm0=tmp0L
+ pmaddwd xmm7, [rel PW_MF060_MF089] ; xmm7=tmp0H
+ pmaddwd xmm1, [rel PW_MF089_F060] ; xmm1=tmp3L
+ pmaddwd xmm3, [rel PW_MF089_F060] ; xmm3=tmp3H
+
+ paddd xmm0, XMMWORD [wk(10)] ; xmm0=tmp0L
+ paddd xmm7, XMMWORD [wk(11)] ; xmm7=tmp0H
+ paddd xmm1, xmm5 ; xmm1=tmp3L
+ paddd xmm3, xmm4 ; xmm3=tmp3H
+
+ movdqa XMMWORD [wk(8)], xmm0 ; wk(8)=tmp0L
+ movdqa XMMWORD [wk(9)], xmm7 ; wk(9)=tmp0H
+
+ movdqa xmm0, xmm2
+ movdqa xmm7, xmm2
+ punpcklwd xmm0, xmm6
+ punpckhwd xmm7, xmm6
+ movdqa xmm2, xmm0
+ movdqa xmm6, xmm7
+ pmaddwd xmm0, [rel PW_MF050_MF256] ; xmm0=tmp1L
+ pmaddwd xmm7, [rel PW_MF050_MF256] ; xmm7=tmp1H
+ pmaddwd xmm2, [rel PW_MF256_F050] ; xmm2=tmp2L
+ pmaddwd xmm6, [rel PW_MF256_F050] ; xmm6=tmp2H
+
+ paddd xmm0, xmm5 ; xmm0=tmp1L
+ paddd xmm7, xmm4 ; xmm7=tmp1H
+ paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp2L
+ paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H
+
+ movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=tmp1L
+ movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=tmp1H
+
+ ; -- Final output stage
+
+ movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L
+ movdqa xmm4, XMMWORD [wk(1)] ; xmm4=tmp10H
+
+ movdqa xmm0, xmm5
+ movdqa xmm7, xmm4
+ paddd xmm5, xmm1 ; xmm5=data0L
+ paddd xmm4, xmm3 ; xmm4=data0H
+ psubd xmm0, xmm1 ; xmm0=data7L
+ psubd xmm7, xmm3 ; xmm7=data7H
+
+ movdqa xmm1, [rel PD_DESCALE_P2] ; xmm1=[rel PD_DESCALE_P2]
+
+ paddd xmm5, xmm1
+ paddd xmm4, xmm1
+ psrad xmm5, DESCALE_P2
+ psrad xmm4, DESCALE_P2
+ paddd xmm0, xmm1
+ paddd xmm7, xmm1
+ psrad xmm0, DESCALE_P2
+ psrad xmm7, DESCALE_P2
+
+ packssdw xmm5, xmm4 ; xmm5=data0=(00 10 20 30 40 50 60 70)
+ packssdw xmm0, xmm7 ; xmm0=data7=(07 17 27 37 47 57 67 77)
+
+ movdqa xmm3, XMMWORD [wk(4)] ; xmm3=tmp11L
+ movdqa xmm1, XMMWORD [wk(5)] ; xmm1=tmp11H
+
+ movdqa xmm4, xmm3
+ movdqa xmm7, xmm1
+ paddd xmm3, xmm2 ; xmm3=data1L
+ paddd xmm1, xmm6 ; xmm1=data1H
+ psubd xmm4, xmm2 ; xmm4=data6L
+ psubd xmm7, xmm6 ; xmm7=data6H
+
+ movdqa xmm2, [rel PD_DESCALE_P2] ; xmm2=[rel PD_DESCALE_P2]
+
+ paddd xmm3, xmm2
+ paddd xmm1, xmm2
+ psrad xmm3, DESCALE_P2
+ psrad xmm1, DESCALE_P2
+ paddd xmm4, xmm2
+ paddd xmm7, xmm2
+ psrad xmm4, DESCALE_P2
+ psrad xmm7, DESCALE_P2
+
+ packssdw xmm3, xmm1 ; xmm3=data1=(01 11 21 31 41 51 61 71)
+ packssdw xmm4, xmm7 ; xmm4=data6=(06 16 26 36 46 56 66 76)
+
+ packsswb xmm5, xmm4 ; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+ packsswb xmm3, xmm0 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+ movdqa xmm6, XMMWORD [wk(6)] ; xmm6=tmp12L
+ movdqa xmm2, XMMWORD [wk(7)] ; xmm2=tmp12H
+ movdqa xmm1, XMMWORD [wk(10)] ; xmm1=tmp1L
+ movdqa xmm7, XMMWORD [wk(11)] ; xmm7=tmp1H
+
+ movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+ movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+ movdqa xmm4, xmm6
+ movdqa xmm0, xmm2
+ paddd xmm6, xmm1 ; xmm6=data2L
+ paddd xmm2, xmm7 ; xmm2=data2H
+ psubd xmm4, xmm1 ; xmm4=data5L
+ psubd xmm0, xmm7 ; xmm0=data5H
+
+ movdqa xmm5, [rel PD_DESCALE_P2] ; xmm5=[rel PD_DESCALE_P2]
+
+ paddd xmm6, xmm5
+ paddd xmm2, xmm5
+ psrad xmm6, DESCALE_P2
+ psrad xmm2, DESCALE_P2
+ paddd xmm4, xmm5
+ paddd xmm0, xmm5
+ psrad xmm4, DESCALE_P2
+ psrad xmm0, DESCALE_P2
+
+ packssdw xmm6, xmm2 ; xmm6=data2=(02 12 22 32 42 52 62 72)
+ packssdw xmm4, xmm0 ; xmm4=data5=(05 15 25 35 45 55 65 75)
+
+ movdqa xmm3, XMMWORD [wk(2)] ; xmm3=tmp13L
+ movdqa xmm1, XMMWORD [wk(3)] ; xmm1=tmp13H
+ movdqa xmm7, XMMWORD [wk(8)] ; xmm7=tmp0L
+ movdqa xmm5, XMMWORD [wk(9)] ; xmm5=tmp0H
+
+ movdqa xmm2, xmm3
+ movdqa xmm0, xmm1
+ paddd xmm3, xmm7 ; xmm3=data3L
+ paddd xmm1, xmm5 ; xmm1=data3H
+ psubd xmm2, xmm7 ; xmm2=data4L
+ psubd xmm0, xmm5 ; xmm0=data4H
+
+ movdqa xmm7, [rel PD_DESCALE_P2] ; xmm7=[rel PD_DESCALE_P2]
+
+ paddd xmm3, xmm7
+ paddd xmm1, xmm7
+ psrad xmm3, DESCALE_P2
+ psrad xmm1, DESCALE_P2
+ paddd xmm2, xmm7
+ paddd xmm0, xmm7
+ psrad xmm2, DESCALE_P2
+ psrad xmm0, DESCALE_P2
+
+ movdqa xmm5, [rel PB_CENTERJSAMP] ; xmm5=[rel PB_CENTERJSAMP]
+
+ packssdw xmm3, xmm1 ; xmm3=data3=(03 13 23 33 43 53 63 73)
+ packssdw xmm2, xmm0 ; xmm2=data4=(04 14 24 34 44 54 64 74)
+
+ movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+ movdqa xmm1, XMMWORD [wk(1)] ; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+ packsswb xmm6, xmm2 ; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
+ packsswb xmm3, xmm4 ; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
+
+ paddb xmm7, xmm5
+ paddb xmm1, xmm5
+ paddb xmm6, xmm5
+ paddb xmm3, xmm5
+
+ movdqa xmm0, xmm7 ; transpose coefficients(phase 1)
+ punpcklbw xmm7, xmm1 ; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
+ punpckhbw xmm0, xmm1 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
+ movdqa xmm2, xmm6 ; transpose coefficients(phase 1)
+ punpcklbw xmm6, xmm3 ; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
+ punpckhbw xmm2, xmm3 ; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
+
+ movdqa xmm4, xmm7 ; transpose coefficients(phase 2)
+ punpcklwd xmm7, xmm6 ; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+ punpckhwd xmm4, xmm6 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
+ movdqa xmm5, xmm2 ; transpose coefficients(phase 2)
+ punpcklwd xmm2, xmm0 ; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+ punpckhwd xmm5, xmm0 ; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
+
+ movdqa xmm1, xmm7 ; transpose coefficients(phase 3)
+ punpckldq xmm7, xmm2 ; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+ punpckhdq xmm1, xmm2 ; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+ movdqa xmm3, xmm4 ; transpose coefficients(phase 3)
+ punpckldq xmm4, xmm5 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
+ punpckhdq xmm3, xmm5 ; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
+
+ pshufd xmm6, xmm7, 0x4E ; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+ pshufd xmm0, xmm1, 0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+ pshufd xmm2, xmm4, 0x4E ; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
+ pshufd xmm5, xmm3, 0x4E ; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
+
+ mov rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+ mov rsip, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm7
+ movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm1
+ mov rdxp, JSAMPROW [rdi+4*SIZEOF_JSAMPROW]
+ mov rsip, JSAMPROW [rdi+6*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
+ movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
+
+ mov rdxp, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+ mov rsip, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
+ movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0
+ mov rdxp, JSAMPROW [rdi+5*SIZEOF_JSAMPROW]
+ mov rsip, JSAMPROW [rdi+7*SIZEOF_JSAMPROW]
+ movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm2
+ movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm5
+
+ uncollect_args 4
+ mov rsp, rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- original rbp
+ pop rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jidctred-sse2.asm b/media/libjpeg/simd/x86_64/jidctred-sse2.asm
new file mode 100644
index 0000000000..4ece9d891c
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jidctred-sse2.asm
@@ -0,0 +1,574 @@
+;
+; jidctred.asm - reduced-size IDCT (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler) and
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains inverse-DCT routines that produce reduced-size
+; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
+; The following code is based directly on the IJG's original jidctred.c;
+; see jidctred.c for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS 13
+%define PASS1_BITS 2
+
+%define DESCALE_P1_4 (CONST_BITS - PASS1_BITS + 1)
+%define DESCALE_P2_4 (CONST_BITS + PASS1_BITS + 3 + 1)
+%define DESCALE_P1_2 (CONST_BITS - PASS1_BITS + 2)
+%define DESCALE_P2_2 (CONST_BITS + PASS1_BITS + 3 + 2)
+
+%if CONST_BITS == 13
+F_0_211 equ 1730 ; FIX(0.211164243)
+F_0_509 equ 4176 ; FIX(0.509795579)
+F_0_601 equ 4926 ; FIX(0.601344887)
+F_0_720 equ 5906 ; FIX(0.720959822)
+F_0_765 equ 6270 ; FIX(0.765366865)
+F_0_850 equ 6967 ; FIX(0.850430095)
+F_0_899 equ 7373 ; FIX(0.899976223)
+F_1_061 equ 8697 ; FIX(1.061594337)
+F_1_272 equ 10426 ; FIX(1.272758580)
+F_1_451 equ 11893 ; FIX(1.451774981)
+F_1_847 equ 15137 ; FIX(1.847759065)
+F_2_172 equ 17799 ; FIX(2.172734803)
+F_2_562 equ 20995 ; FIX(2.562915447)
+F_3_624 equ 29692 ; FIX(3.624509785)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
+F_0_211 equ DESCALE( 226735879, 30 - CONST_BITS) ; FIX(0.211164243)
+F_0_509 equ DESCALE( 547388834, 30 - CONST_BITS) ; FIX(0.509795579)
+F_0_601 equ DESCALE( 645689155, 30 - CONST_BITS) ; FIX(0.601344887)
+F_0_720 equ DESCALE( 774124714, 30 - CONST_BITS) ; FIX(0.720959822)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865)
+F_0_850 equ DESCALE( 913142361, 30 - CONST_BITS) ; FIX(0.850430095)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223)
+F_1_061 equ DESCALE(1139878239, 30 - CONST_BITS) ; FIX(1.061594337)
+F_1_272 equ DESCALE(1366614119, 30 - CONST_BITS) ; FIX(1.272758580)
+F_1_451 equ DESCALE(1558831516, 30 - CONST_BITS) ; FIX(1.451774981)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065)
+F_2_172 equ DESCALE(2332956230, 30 - CONST_BITS) ; FIX(2.172734803)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447)
+F_3_624 equ DESCALE(3891787747, 30 - CONST_BITS) ; FIX(3.624509785)
+%endif
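+
+; The F_* values above are the IDCT multipliers in CONST_BITS-bit fixed
+; point, i.e. FIX(x) = round(x * 2^CONST_BITS).  As a quick check with
+; CONST_BITS == 13:  1.847759065 * 8192 = 15136.8, which rounds to the
+; 15137 defined for F_1_847.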
+
+; --------------------------------------------------------------------------
+ SECTION SEG_CONST
+
+ alignz 32
+ GLOBAL_DATA(jconst_idct_red_sse2)
+
+EXTN(jconst_idct_red_sse2):
+
+PW_F184_MF076 times 4 dw F_1_847, -F_0_765
+PW_F256_F089 times 4 dw F_2_562, F_0_899
+PW_F106_MF217 times 4 dw F_1_061, -F_2_172
+PW_MF060_MF050 times 4 dw -F_0_601, -F_0_509
+PW_F145_MF021 times 4 dw F_1_451, -F_0_211
+PW_F362_MF127 times 4 dw F_3_624, -F_1_272
+PW_F085_MF072 times 4 dw F_0_850, -F_0_720
+PD_DESCALE_P1_4 times 4 dd 1 << (DESCALE_P1_4 - 1)
+PD_DESCALE_P2_4 times 4 dd 1 << (DESCALE_P2_4 - 1)
+PD_DESCALE_P1_2 times 4 dd 1 << (DESCALE_P1_2 - 1)
+PD_DESCALE_P2_2 times 4 dd 1 << (DESCALE_P2_2 - 1)
+PB_CENTERJSAMP times 16 db CENTERJSAMPLE
+
+ alignz 32
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 4x4 output block.
+;
+; GLOBAL(void)
+; jsimd_idct_4x4_sse2(void *dct_table, JCOEFPTR coef_block,
+; JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+; r10 = void *dct_table
+; r11 = JCOEFPTR coef_block
+; r12 = JSAMPARRAY output_buf
+; r13d = JDIMENSION output_col
+
+%define original_rbp rbp + 0
+%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+ ; xmmword wk[WK_NUM]
+%define WK_NUM 2
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_idct_4x4_sse2)
+
+EXTN(jsimd_idct_4x4_sse2):
+ push rbp
+ mov rax, rsp ; rax = original rbp
+ sub rsp, byte 4
+ and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
+ mov [rsp], rax
+ mov rbp, rsp ; rbp = aligned rbp
+ lea rsp, [wk(0)]
+ collect_args 4
+
+ ; ---- Pass 1: process columns from input.
+
+ mov rdx, r10 ; quantptr
+ mov rsi, r11 ; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2
+ mov eax, dword [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+ or eax, dword [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+ jnz short .columnDCT
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+ por xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+ por xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+ por xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+ por xmm0, xmm1
+ packsswb xmm0, xmm0
+ packsswb xmm0, xmm0
+ movd eax, xmm0
+ test rax, rax
+ jnz short .columnDCT
+
+ ; -- AC terms all zero
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ psllw xmm0, PASS1_BITS
+
+ movdqa xmm3, xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07)
+ punpcklwd xmm0, xmm0 ; xmm0=(00 00 01 01 02 02 03 03)
+ punpckhwd xmm3, xmm3 ; xmm3=(04 04 05 05 06 06 07 07)
+
+ pshufd xmm1, xmm0, 0x50 ; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01)
+ pshufd xmm0, xmm0, 0xFA ; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03)
+ pshufd xmm6, xmm3, 0x50 ; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05)
+ pshufd xmm3, xmm3, 0xFA ; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07)
+
+ jmp near .column_end
+%endif
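+
+; (The zero test above deliberately skips row 0, the DC term, and row 4:
+; the 4x4 reduced IDCT never uses the row-4 coefficients, so they cannot
+; affect the output.  See jidctred.c.)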
+.columnDCT:
+
+ ; -- Odd part
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm0, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+ movdqa xmm2, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ movdqa xmm4, xmm0
+ movdqa xmm5, xmm0
+ punpcklwd xmm4, xmm1
+ punpckhwd xmm5, xmm1
+ movdqa xmm0, xmm4
+ movdqa xmm1, xmm5
+ pmaddwd xmm4, [rel PW_F256_F089] ; xmm4=(tmp2L)
+ pmaddwd xmm5, [rel PW_F256_F089] ; xmm5=(tmp2H)
+ pmaddwd xmm0, [rel PW_F106_MF217] ; xmm0=(tmp0L)
+ pmaddwd xmm1, [rel PW_F106_MF217] ; xmm1=(tmp0H)
+
+ movdqa xmm6, xmm2
+ movdqa xmm7, xmm2
+ punpcklwd xmm6, xmm3
+ punpckhwd xmm7, xmm3
+ movdqa xmm2, xmm6
+ movdqa xmm3, xmm7
+ pmaddwd xmm6, [rel PW_MF060_MF050] ; xmm6=(tmp2L)
+ pmaddwd xmm7, [rel PW_MF060_MF050] ; xmm7=(tmp2H)
+ pmaddwd xmm2, [rel PW_F145_MF021] ; xmm2=(tmp0L)
+ pmaddwd xmm3, [rel PW_F145_MF021] ; xmm3=(tmp0H)
+
+ paddd xmm6, xmm4 ; xmm6=tmp2L
+ paddd xmm7, xmm5 ; xmm7=tmp2H
+ paddd xmm2, xmm0 ; xmm2=tmp0L
+ paddd xmm3, xmm1 ; xmm3=tmp0H
+
+ movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp0L
+ movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=tmp0H
+
+ ; -- Even part
+
+ movdqa xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm5, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm4, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm5, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm0, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ pxor xmm1, xmm1
+ pxor xmm2, xmm2
+ punpcklwd xmm1, xmm4 ; xmm1=tmp0L
+ punpckhwd xmm2, xmm4 ; xmm2=tmp0H
+ psrad xmm1, (16-CONST_BITS-1) ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1
+ psrad xmm2, (16-CONST_BITS-1) ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1
+
+ movdqa xmm3, xmm5 ; xmm5=in2=z2
+ punpcklwd xmm5, xmm0 ; xmm0=in6=z3
+ punpckhwd xmm3, xmm0
+ pmaddwd xmm5, [rel PW_F184_MF076] ; xmm5=tmp2L
+ pmaddwd xmm3, [rel PW_F184_MF076] ; xmm3=tmp2H
+
+ movdqa xmm4, xmm1
+ movdqa xmm0, xmm2
+ paddd xmm1, xmm5 ; xmm1=tmp10L
+ paddd xmm2, xmm3 ; xmm2=tmp10H
+ psubd xmm4, xmm5 ; xmm4=tmp12L
+ psubd xmm0, xmm3 ; xmm0=tmp12H
+
+ ; -- Final output stage
+
+ movdqa xmm5, xmm1
+ movdqa xmm3, xmm2
+ paddd xmm1, xmm6 ; xmm1=data0L
+ paddd xmm2, xmm7 ; xmm2=data0H
+ psubd xmm5, xmm6 ; xmm5=data3L
+ psubd xmm3, xmm7 ; xmm3=data3H
+
+ movdqa xmm6, [rel PD_DESCALE_P1_4] ; xmm6=[rel PD_DESCALE_P1_4]
+
+ paddd xmm1, xmm6
+ paddd xmm2, xmm6
+ psrad xmm1, DESCALE_P1_4
+ psrad xmm2, DESCALE_P1_4
+ paddd xmm5, xmm6
+ paddd xmm3, xmm6
+ psrad xmm5, DESCALE_P1_4
+ psrad xmm3, DESCALE_P1_4
+
+ packssdw xmm1, xmm2 ; xmm1=data0=(00 01 02 03 04 05 06 07)
+ packssdw xmm5, xmm3 ; xmm5=data3=(30 31 32 33 34 35 36 37)
+
+ movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp0L
+ movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp0H
+
+ movdqa xmm2, xmm4
+ movdqa xmm3, xmm0
+ paddd xmm4, xmm7 ; xmm4=data1L
+ paddd xmm0, xmm6 ; xmm0=data1H
+ psubd xmm2, xmm7 ; xmm2=data2L
+ psubd xmm3, xmm6 ; xmm3=data2H
+
+ movdqa xmm7, [rel PD_DESCALE_P1_4] ; xmm7=[rel PD_DESCALE_P1_4]
+
+ paddd xmm4, xmm7
+ paddd xmm0, xmm7
+ psrad xmm4, DESCALE_P1_4
+ psrad xmm0, DESCALE_P1_4
+ paddd xmm2, xmm7
+ paddd xmm3, xmm7
+ psrad xmm2, DESCALE_P1_4
+ psrad xmm3, DESCALE_P1_4
+
+ packssdw xmm4, xmm0 ; xmm4=data1=(10 11 12 13 14 15 16 17)
+ packssdw xmm2, xmm3 ; xmm2=data2=(20 21 22 23 24 25 26 27)
+
+ movdqa xmm6, xmm1 ; transpose coefficients(phase 1)
+ punpcklwd xmm1, xmm4 ; xmm1=(00 10 01 11 02 12 03 13)
+ punpckhwd xmm6, xmm4 ; xmm6=(04 14 05 15 06 16 07 17)
+ movdqa xmm7, xmm2 ; transpose coefficients(phase 1)
+ punpcklwd xmm2, xmm5 ; xmm2=(20 30 21 31 22 32 23 33)
+ punpckhwd xmm7, xmm5 ; xmm7=(24 34 25 35 26 36 27 37)
+
+ movdqa xmm0, xmm1 ; transpose coefficients(phase 2)
+ punpckldq xmm1, xmm2 ; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31)
+ punpckhdq xmm0, xmm2 ; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33)
+ movdqa xmm3, xmm6 ; transpose coefficients(phase 2)
+ punpckldq xmm6, xmm7 ; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35)
+ punpckhdq xmm3, xmm7 ; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37)
+.column_end:
+
+ ; -- Prefetch the next coefficient block
+
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+ ; ---- Pass 2: process rows, store into output array.
+
+ mov rax, [original_rbp]
+ mov rdi, r12 ; (JSAMPROW *)
+ mov eax, r13d
+
+ ; -- Even part
+
+ pxor xmm4, xmm4
+ punpcklwd xmm4, xmm1 ; xmm4=tmp0
+ psrad xmm4, (16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1
+
+ ; -- Odd part
+
+ punpckhwd xmm1, xmm0
+ punpckhwd xmm6, xmm3
+ movdqa xmm5, xmm1
+ movdqa xmm2, xmm6
+ pmaddwd xmm1, [rel PW_F256_F089] ; xmm1=(tmp2)
+ pmaddwd xmm6, [rel PW_MF060_MF050] ; xmm6=(tmp2)
+ pmaddwd xmm5, [rel PW_F106_MF217] ; xmm5=(tmp0)
+ pmaddwd xmm2, [rel PW_F145_MF021] ; xmm2=(tmp0)
+
+ paddd xmm6, xmm1 ; xmm6=tmp2
+ paddd xmm2, xmm5 ; xmm2=tmp0
+
+ ; -- Even part
+
+ punpcklwd xmm0, xmm3
+ pmaddwd xmm0, [rel PW_F184_MF076] ; xmm0=tmp2
+
+ movdqa xmm7, xmm4
+ paddd xmm4, xmm0 ; xmm4=tmp10
+ psubd xmm7, xmm0 ; xmm7=tmp12
+
+ ; -- Final output stage
+
+ movdqa xmm1, [rel PD_DESCALE_P2_4] ; xmm1=[rel PD_DESCALE_P2_4]
+
+ movdqa xmm5, xmm4
+ movdqa xmm3, xmm7
+ paddd xmm4, xmm6 ; xmm4=data0=(00 10 20 30)
+ paddd xmm7, xmm2 ; xmm7=data1=(01 11 21 31)
+ psubd xmm5, xmm6 ; xmm5=data3=(03 13 23 33)
+ psubd xmm3, xmm2 ; xmm3=data2=(02 12 22 32)
+
+ paddd xmm4, xmm1
+ paddd xmm7, xmm1
+ psrad xmm4, DESCALE_P2_4
+ psrad xmm7, DESCALE_P2_4
+ paddd xmm5, xmm1
+ paddd xmm3, xmm1
+ psrad xmm5, DESCALE_P2_4
+ psrad xmm3, DESCALE_P2_4
+
+ packssdw xmm4, xmm3 ; xmm4=(00 10 20 30 02 12 22 32)
+ packssdw xmm7, xmm5 ; xmm7=(01 11 21 31 03 13 23 33)
+
+ movdqa xmm0, xmm4 ; transpose coefficients(phase 1)
+ punpcklwd xmm4, xmm7 ; xmm4=(00 01 10 11 20 21 30 31)
+ punpckhwd xmm0, xmm7 ; xmm0=(02 03 12 13 22 23 32 33)
+
+ movdqa xmm6, xmm4 ; transpose coefficients(phase 2)
+ punpckldq xmm4, xmm0 ; xmm4=(00 01 02 03 10 11 12 13)
+ punpckhdq xmm6, xmm0 ; xmm6=(20 21 22 23 30 31 32 33)
+
+ packsswb xmm4, xmm6 ; xmm4=(00 01 02 03 10 11 12 13 20 ..)
+ paddb xmm4, [rel PB_CENTERJSAMP]
+
+ pshufd xmm2, xmm4, 0x39 ; xmm2=(10 11 12 13 20 21 22 23 30 ..)
+ pshufd xmm1, xmm4, 0x4E ; xmm1=(20 21 22 23 30 31 32 33 00 ..)
+ pshufd xmm3, xmm4, 0x93 ; xmm3=(30 31 32 33 00 01 02 03 10 ..)
+
+ mov rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+ mov rsip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+ movd XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
+ movd XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2
+ mov rdxp, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
+ mov rsip, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
+ movd XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1
+ movd XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
+
+ uncollect_args 4
+ mov rsp, rbp ; rsp <- aligned rbp
+ pop rsp ; rsp <- original rbp
+ pop rbp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 2x2 output block.
+;
+; GLOBAL(void)
+; jsimd_idct_2x2_sse2(void *dct_table, JCOEFPTR coef_block,
+; JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+; r10 = void *dct_table
+; r11 = JCOEFPTR coef_block
+; r12 = JSAMPARRAY output_buf
+; r13d = JDIMENSION output_col
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_idct_2x2_sse2)
+
+EXTN(jsimd_idct_2x2_sse2):
+ push rbp
+ mov rax, rsp
+ mov rbp, rsp
+ collect_args 4
+ push rbx
+
+ ; ---- Pass 1: process columns from input.
+
+ mov rdx, r10 ; quantptr
+ mov rsi, r11 ; inptr
+
+ ; | input: | result: |
+ ; | 00 01 ** 03 ** 05 ** 07 | |
+ ; | 10 11 ** 13 ** 15 ** 17 | |
+ ; | ** ** ** ** ** ** ** ** | |
+ ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
+ ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
+ ; | 50 51 ** 53 ** 55 ** 57 | |
+ ; | ** ** ** ** ** ** ** ** | |
+ ; | 70 71 ** 73 ** 75 ** 77 | |
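+ ;
+ ; (Only rows/columns 0, 1, 3, 5, and 7 contribute to the 2x2 output;
+ ; the coefficients marked ** above are discarded by the reduced
+ ; transform.)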
+
+ ; -- Odd part
+
+ movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm1, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm0, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+ movdqa xmm2, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+ movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+ pmullw xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ ; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37)
+ ; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77)
+
+ pcmpeqd xmm7, xmm7
+ pslld xmm7, WORD_BIT ; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..}
+
+ movdqa xmm4, xmm0 ; xmm4=(10 11 ** 13 ** 15 ** 17)
+ movdqa xmm5, xmm2 ; xmm5=(50 51 ** 53 ** 55 ** 57)
+ punpcklwd xmm4, xmm1 ; xmm4=(10 30 11 31 ** ** 13 33)
+ punpcklwd xmm5, xmm3 ; xmm5=(50 70 51 71 ** ** 53 73)
+ pmaddwd xmm4, [rel PW_F362_MF127]
+ pmaddwd xmm5, [rel PW_F085_MF072]
+
+ psrld xmm0, WORD_BIT ; xmm0=(11 -- 13 -- 15 -- 17 --)
+ pand xmm1, xmm7 ; xmm1=(-- 31 -- 33 -- 35 -- 37)
+ psrld xmm2, WORD_BIT ; xmm2=(51 -- 53 -- 55 -- 57 --)
+ pand xmm3, xmm7 ; xmm3=(-- 71 -- 73 -- 75 -- 77)
+ por xmm0, xmm1 ; xmm0=(11 31 13 33 15 35 17 37)
+ por xmm2, xmm3 ; xmm2=(51 71 53 73 55 75 57 77)
+ pmaddwd xmm0, [rel PW_F362_MF127]
+ pmaddwd xmm2, [rel PW_F085_MF072]
+
+ paddd xmm4, xmm5 ; xmm4=tmp0[col0 col1 **** col3]
+ paddd xmm0, xmm2 ; xmm0=tmp0[col1 col3 col5 col7]
+
+ ; -- Even part
+
+ movdqa xmm6, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+ pmullw xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+ ; xmm6=(00 01 ** 03 ** 05 ** 07)
+
+ movdqa xmm1, xmm6 ; xmm1=(00 01 ** 03 ** 05 ** 07)
+ pslld xmm6, WORD_BIT ; xmm6=(-- 00 -- ** -- ** -- **)
+ pand xmm1, xmm7 ; xmm1=(-- 01 -- 03 -- 05 -- 07)
+ psrad xmm6, (WORD_BIT-CONST_BITS-2) ; xmm6=tmp10[col0 **** **** ****]
+ psrad xmm1, (WORD_BIT-CONST_BITS-2) ; xmm1=tmp10[col1 col3 col5 col7]
+
+ ; -- Final output stage
+
+ movdqa xmm3, xmm6
+ movdqa xmm5, xmm1
+ paddd xmm6, xmm4 ; xmm6=data0[col0 **** **** ****]=(A0 ** ** **)
+ paddd xmm1, xmm0 ; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7)
+ psubd xmm3, xmm4 ; xmm3=data1[col0 **** **** ****]=(B0 ** ** **)
+ psubd xmm5, xmm0 ; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7)
+
+ movdqa xmm2, [rel PD_DESCALE_P1_2] ; xmm2=[rel PD_DESCALE_P1_2]
+
+ punpckldq xmm6, xmm3 ; xmm6=(A0 B0 ** **)
+
+ movdqa xmm7, xmm1
+ punpcklqdq xmm1, xmm5 ; xmm1=(A1 A3 B1 B3)
+ punpckhqdq xmm7, xmm5 ; xmm7=(A5 A7 B5 B7)
+
+ paddd xmm6, xmm2
+ psrad xmm6, DESCALE_P1_2
+
+ paddd xmm1, xmm2
+ paddd xmm7, xmm2
+ psrad xmm1, DESCALE_P1_2
+ psrad xmm7, DESCALE_P1_2
+
+ ; -- Prefetch the next coefficient block
+
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+ ; ---- Pass 2: process rows, store into output array.
+
+ mov rdi, r12 ; (JSAMPROW *)
+ mov eax, r13d
+
+ ; | input:| result:|
+ ; | A0 B0 | |
+ ; | A1 B1 | C0 C1 |
+ ; | A3 B3 | D0 D1 |
+ ; | A5 B5 | |
+ ; | A7 B7 | |
+
+ ; -- Odd part
+
+ packssdw xmm1, xmm1 ; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3)
+ packssdw xmm7, xmm7 ; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7)
+ pmaddwd xmm1, [rel PW_F362_MF127]
+ pmaddwd xmm7, [rel PW_F085_MF072]
+
+ paddd xmm1, xmm7 ; xmm1=tmp0[row0 row1 row0 row1]
+
+ ; -- Even part
+
+ pslld xmm6, (CONST_BITS+2) ; xmm6=tmp10[row0 row1 **** ****]
+
+ ; -- Final output stage
+
+ movdqa xmm4, xmm6
+ paddd xmm6, xmm1 ; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **)
+ psubd xmm4, xmm1 ; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **)
+
+ punpckldq xmm6, xmm4 ; xmm6=(C0 D0 C1 D1)
+
+ paddd xmm6, [rel PD_DESCALE_P2_2]
+ psrad xmm6, DESCALE_P2_2
+
+ packssdw xmm6, xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1)
+ packsswb xmm6, xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..)
+ paddb xmm6, [rel PB_CENTERJSAMP]
+
+ pextrw ebx, xmm6, 0x00 ; ebx=(C0 D0 -- --)
+ pextrw ecx, xmm6, 0x01 ; ecx=(C1 D1 -- --)
+
+ mov rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+ mov rsip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+ mov word [rdx+rax*SIZEOF_JSAMPLE], bx
+ mov word [rsi+rax*SIZEOF_JSAMPLE], cx
+
+ pop rbx
+ uncollect_args 4
+ pop rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jquantf-sse2.asm b/media/libjpeg/simd/x86_64/jquantf-sse2.asm
new file mode 100644
index 0000000000..ab2e3954f6
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jquantf-sse2.asm
@@ -0,0 +1,155 @@
+;
+; jquantf.asm - sample data conversion and quantization (64-bit SSE & SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; and can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_float_sse2(JSAMPARRAY sample_data, JDIMENSION start_col,
+; FAST_FLOAT *workspace);
+;
+
+; r10 = JSAMPARRAY sample_data
+; r11d = JDIMENSION start_col
+; r12 = FAST_FLOAT *workspace
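+
+; Roughly, in C (a sketch of the semantics, not the actual library code):
+;
+;   workspace[i] = (FAST_FLOAT)(sample_data[row][start_col + col] -
+;                               CENTERJSAMPLE);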
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_convsamp_float_sse2)
+
+EXTN(jsimd_convsamp_float_sse2):
+ push rbp
+ mov rax, rsp
+ mov rbp, rsp
+ collect_args 3
+ push rbx
+
+ pcmpeqw xmm7, xmm7
+ psllw xmm7, 7
+ packsswb xmm7, xmm7 ; xmm7 = PB_CENTERJSAMPLE (0x808080..)
+
+ mov rsi, r10
+ mov eax, r11d
+ mov rdi, r12
+ mov rcx, DCTSIZE/2
+.convloop:
+ mov rbxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov rdxp, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+
+ movq xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]
+ movq xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]
+
+ psubb xmm0, xmm7 ; xmm0=(01234567)
+ psubb xmm1, xmm7 ; xmm1=(89ABCDEF)
+
+ punpcklbw xmm0, xmm0 ; xmm0=(*0*1*2*3*4*5*6*7)
+ punpcklbw xmm1, xmm1 ; xmm1=(*8*9*A*B*C*D*E*F)
+
+ punpcklwd xmm2, xmm0 ; xmm2=(***0***1***2***3)
+ punpckhwd xmm0, xmm0 ; xmm0=(***4***5***6***7)
+ punpcklwd xmm3, xmm1 ; xmm3=(***8***9***A***B)
+ punpckhwd xmm1, xmm1 ; xmm1=(***C***D***E***F)
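+
+ ; (The PUNPCK doubling above and the PSRADs below sign-extend each
+ ; sample byte into a dword: the byte lands in the top byte of its
+ ; dword, and an arithmetic shift right by 24 brings it back down.)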
+
+ psrad xmm2, (DWORD_BIT-BYTE_BIT) ; xmm2=(0123)
+ psrad xmm0, (DWORD_BIT-BYTE_BIT) ; xmm0=(4567)
+ cvtdq2ps xmm2, xmm2 ; xmm2=(0123)
+ cvtdq2ps xmm0, xmm0 ; xmm0=(4567)
+ psrad xmm3, (DWORD_BIT-BYTE_BIT) ; xmm3=(89AB)
+ psrad xmm1, (DWORD_BIT-BYTE_BIT) ; xmm1=(CDEF)
+ cvtdq2ps xmm3, xmm3 ; xmm3=(89AB)
+ cvtdq2ps xmm1, xmm1 ; xmm1=(CDEF)
+
+ movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
+ movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
+ movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
+ movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
+
+ add rsi, byte 2*SIZEOF_JSAMPROW
+ add rdi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
+ dec rcx
+ jnz short .convloop
+
+ pop rbx
+ uncollect_args 3
+ pop rbp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; GLOBAL(void)
+; jsimd_quantize_float_sse2(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+; FAST_FLOAT *workspace);
+;
+
+; r10 = JCOEFPTR coef_block
+; r11 = FAST_FLOAT *divisors
+; r12 = FAST_FLOAT *workspace
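+
+; Roughly, in C (a sketch only; the actual rounding below is performed
+; by CVTPS2DQ, i.e. MXCSR round-to-nearest):
+;
+;   coef_block[i] = (JCOEF)lrintf(workspace[i] * divisors[i]);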
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_quantize_float_sse2)
+
+EXTN(jsimd_quantize_float_sse2):
+ push rbp
+ mov rax, rsp
+ mov rbp, rsp
+ collect_args 3
+
+ mov rsi, r12
+ mov rdx, r11
+ mov rdi, r10
+ mov rax, DCTSIZE2/16
+.quantloop:
+ movaps xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
+ movaps xmm1, XMMWORD [XMMBLOCK(0,1,rsi,SIZEOF_FAST_FLOAT)]
+ mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
+ mulps xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)]
+ movaps xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
+ movaps xmm3, XMMWORD [XMMBLOCK(1,1,rsi,SIZEOF_FAST_FLOAT)]
+ mulps xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
+ mulps xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)]
+
+ cvtps2dq xmm0, xmm0
+ cvtps2dq xmm1, xmm1
+ cvtps2dq xmm2, xmm2
+ cvtps2dq xmm3, xmm3
+
+ packssdw xmm0, xmm1
+ packssdw xmm2, xmm3
+
+ movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_JCOEF)], xmm0
+ movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_JCOEF)], xmm2
+
+ add rsi, byte 16*SIZEOF_FAST_FLOAT
+ add rdx, byte 16*SIZEOF_FAST_FLOAT
+ add rdi, byte 16*SIZEOF_JCOEF
+ dec rax
+ jnz short .quantloop
+
+ uncollect_args 3
+ pop rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jquanti-avx2.asm b/media/libjpeg/simd/x86_64/jquanti-avx2.asm
new file mode 100644
index 0000000000..70fe81139c
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jquanti-avx2.asm
@@ -0,0 +1,163 @@
+;
+; jquanti.asm - sample data conversion and quantization (64-bit AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, 2018, D. R. Commander.
+; Copyright (C) 2016, Matthieu Darbois.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; and can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_avx2(JSAMPARRAY sample_data, JDIMENSION start_col,
+; DCTELEM *workspace);
+;
+
+; r10 = JSAMPARRAY sample_data
+; r11d = JDIMENSION start_col
+; r12 = DCTELEM *workspace
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_convsamp_avx2)
+
+EXTN(jsimd_convsamp_avx2):
+ push rbp
+ mov rax, rsp
+ mov rbp, rsp
+ collect_args 3
+
+ mov eax, r11d
+
+ mov rsip, JSAMPROW [r10+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov rdip, JSAMPROW [r10+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ movq xmm0, XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE]
+ pinsrq xmm0, XMM_MMWORD [rdi+rax*SIZEOF_JSAMPLE], 1
+
+ mov rsip, JSAMPROW [r10+2*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov rdip, JSAMPROW [r10+3*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ movq xmm1, XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE]
+ pinsrq xmm1, XMM_MMWORD [rdi+rax*SIZEOF_JSAMPLE], 1
+
+ mov rsip, JSAMPROW [r10+4*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov rdip, JSAMPROW [r10+5*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ movq xmm2, XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE]
+ pinsrq xmm2, XMM_MMWORD [rdi+rax*SIZEOF_JSAMPLE], 1
+
+ mov rsip, JSAMPROW [r10+6*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov rdip, JSAMPROW [r10+7*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ movq xmm3, XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE]
+ pinsrq xmm3, XMM_MMWORD [rdi+rax*SIZEOF_JSAMPLE], 1
+
+ vpmovzxbw ymm0, xmm0 ; ymm0=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+ vpmovzxbw ymm1, xmm1 ; ymm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+ vpmovzxbw ymm2, xmm2 ; ymm2=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
+ vpmovzxbw ymm3, xmm3 ; ymm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
+
+ vpcmpeqw ymm7, ymm7, ymm7
+ vpsllw ymm7, ymm7, 7 ; ymm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
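+ ; (All-ones words shifted left by 7 give 0xFF80 = -128, so the
+ ; VPADDWs below subtract CENTERJSAMPLE from every sample.)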
+
+ vpaddw ymm0, ymm0, ymm7
+ vpaddw ymm1, ymm1, ymm7
+ vpaddw ymm2, ymm2, ymm7
+ vpaddw ymm3, ymm3, ymm7
+
+ vmovdqu YMMWORD [YMMBLOCK(0,0,r12,SIZEOF_DCTELEM)], ymm0
+ vmovdqu YMMWORD [YMMBLOCK(2,0,r12,SIZEOF_DCTELEM)], ymm1
+ vmovdqu YMMWORD [YMMBLOCK(4,0,r12,SIZEOF_DCTELEM)], ymm2
+ vmovdqu YMMWORD [YMMBLOCK(6,0,r12,SIZEOF_DCTELEM)], ymm3
+
+ vzeroupper
+ uncollect_args 3
+ pop rbp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; This implementation is based on an algorithm described in
+; "How to optimize for the Pentium family of microprocessors"
+; (http://www.agner.org/assem/).
+;
+; GLOBAL(void)
+; jsimd_quantize_avx2(JCOEFPTR coef_block, DCTELEM *divisors,
+; DCTELEM *workspace);
+;
+
+%define RECIPROCAL(m, n, b) \
+ YMMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM)
+%define CORRECTION(m, n, b) \
+ YMMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM)
+%define SCALE(m, n, b) \
+ YMMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM)
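+
+; For each coefficient, the divisors block stores a 16-bit reciprocal, a
+; correction/rounding term, and a post-scale, so the code below computes
+; (roughly, in C):
+;
+;   t = abs(x) + correction[i];
+;   t = ((unsigned)t * reciprocal[i]) >> 16;
+;   t = ((unsigned)t * scale[i]) >> 16;
+;   coef[i] = (x < 0) ? -t : t;
+;
+; i.e. a division by the quantization step without a hardware divide.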
+
+; r10 = JCOEFPTR coef_block
+; r11 = DCTELEM *divisors
+; r12 = DCTELEM *workspace
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_quantize_avx2)
+
+EXTN(jsimd_quantize_avx2):
+ push rbp
+ mov rax, rsp
+ mov rbp, rsp
+ collect_args 3
+
+ vmovdqu ymm4, [YMMBLOCK(0,0,r12,SIZEOF_DCTELEM)]
+ vmovdqu ymm5, [YMMBLOCK(2,0,r12,SIZEOF_DCTELEM)]
+ vmovdqu ymm6, [YMMBLOCK(4,0,r12,SIZEOF_DCTELEM)]
+ vmovdqu ymm7, [YMMBLOCK(6,0,r12,SIZEOF_DCTELEM)]
+ vpabsw ymm0, ymm4
+ vpabsw ymm1, ymm5
+ vpabsw ymm2, ymm6
+ vpabsw ymm3, ymm7
+
+ vpaddw ymm0, YMMWORD [CORRECTION(0,0,r11)] ; correction + roundfactor
+ vpaddw ymm1, YMMWORD [CORRECTION(2,0,r11)]
+ vpaddw ymm2, YMMWORD [CORRECTION(4,0,r11)]
+ vpaddw ymm3, YMMWORD [CORRECTION(6,0,r11)]
+ vpmulhuw ymm0, YMMWORD [RECIPROCAL(0,0,r11)] ; reciprocal
+ vpmulhuw ymm1, YMMWORD [RECIPROCAL(2,0,r11)]
+ vpmulhuw ymm2, YMMWORD [RECIPROCAL(4,0,r11)]
+ vpmulhuw ymm3, YMMWORD [RECIPROCAL(6,0,r11)]
+ vpmulhuw ymm0, YMMWORD [SCALE(0,0,r11)] ; scale
+ vpmulhuw ymm1, YMMWORD [SCALE(2,0,r11)]
+ vpmulhuw ymm2, YMMWORD [SCALE(4,0,r11)]
+ vpmulhuw ymm3, YMMWORD [SCALE(6,0,r11)]
+
+ vpsignw ymm0, ymm0, ymm4
+ vpsignw ymm1, ymm1, ymm5
+ vpsignw ymm2, ymm2, ymm6
+ vpsignw ymm3, ymm3, ymm7
+
+ vmovdqu [YMMBLOCK(0,0,r10,SIZEOF_DCTELEM)], ymm0
+ vmovdqu [YMMBLOCK(2,0,r10,SIZEOF_DCTELEM)], ymm1
+ vmovdqu [YMMBLOCK(4,0,r10,SIZEOF_DCTELEM)], ymm2
+ vmovdqu [YMMBLOCK(6,0,r10,SIZEOF_DCTELEM)], ymm3
+
+ vzeroupper
+ uncollect_args 3
+ pop rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jquanti-sse2.asm b/media/libjpeg/simd/x86_64/jquanti-sse2.asm
new file mode 100644
index 0000000000..3ee442027a
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jquanti-sse2.asm
@@ -0,0 +1,188 @@
+;
+; jquanti.asm - sample data conversion and quantization (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; and can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_sse2(JSAMPARRAY sample_data, JDIMENSION start_col,
+; DCTELEM *workspace);
+;
+
+; r10 = JSAMPARRAY sample_data
+; r11d = JDIMENSION start_col
+; r12 = DCTELEM *workspace
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_convsamp_sse2)
+
+EXTN(jsimd_convsamp_sse2):
+ push rbp
+ mov rax, rsp
+ mov rbp, rsp
+ collect_args 3
+ push rbx
+
+ pxor xmm6, xmm6 ; xmm6=(all 0's)
+ pcmpeqw xmm7, xmm7
+ psllw xmm7, 7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+ mov rsi, r10
+ mov eax, r11d
+ mov rdi, r12
+ mov rcx, DCTSIZE/4
+.convloop:
+ mov rbxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov rdxp, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+
+ movq xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE] ; xmm0=(01234567)
+ movq xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE] ; xmm1=(89ABCDEF)
+
+ mov rbxp, JSAMPROW [rsi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov rdxp, JSAMPROW [rsi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+
+ movq xmm2, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE] ; xmm2=(GHIJKLMN)
+ movq xmm3, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE] ; xmm3=(OPQRSTUV)
+
+ punpcklbw xmm0, xmm6 ; xmm0=(01234567)
+ punpcklbw xmm1, xmm6 ; xmm1=(89ABCDEF)
+ paddw xmm0, xmm7
+ paddw xmm1, xmm7
+ punpcklbw xmm2, xmm6 ; xmm2=(GHIJKLMN)
+ punpcklbw xmm3, xmm6 ; xmm3=(OPQRSTUV)
+ paddw xmm2, xmm7
+ paddw xmm3, xmm7
+
+ movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
+ movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
+ movdqa XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
+ movdqa XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3
+
+ add rsi, byte 4*SIZEOF_JSAMPROW
+ add rdi, byte 4*DCTSIZE*SIZEOF_DCTELEM
+ dec rcx
+ jnz short .convloop
+
+ pop rbx
+ uncollect_args 3
+ pop rbp
+ ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; This implementation is based on an algorithm described in
+; "How to optimize for the Pentium family of microprocessors"
+; (http://www.agner.org/assem/).
+;
+; GLOBAL(void)
+; jsimd_quantize_sse2(JCOEFPTR coef_block, DCTELEM *divisors,
+; DCTELEM *workspace);
+;
+
+%define RECIPROCAL(m, n, b) \
+ XMMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM)
+%define CORRECTION(m, n, b) \
+ XMMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM)
+%define SCALE(m, n, b) \
+ XMMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM)
+
+; r10 = JCOEFPTR coef_block
+; r11 = DCTELEM *divisors
+; r12 = DCTELEM *workspace
+
+ align 32
+ GLOBAL_FUNCTION(jsimd_quantize_sse2)
+
+EXTN(jsimd_quantize_sse2):
+ push rbp
+ mov rax, rsp
+ mov rbp, rsp
+ collect_args 3
+
+ mov rsi, r12
+ mov rdx, r11
+ mov rdi, r10
+ mov rax, DCTSIZE2/32
+.quantloop:
+ movdqa xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_DCTELEM)]
+ movdqa xmm5, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_DCTELEM)]
+ movdqa xmm6, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_DCTELEM)]
+ movdqa xmm7, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_DCTELEM)]
+ movdqa xmm0, xmm4
+ movdqa xmm1, xmm5
+ movdqa xmm2, xmm6
+ movdqa xmm3, xmm7
+ psraw xmm4, (WORD_BIT-1)
+ psraw xmm5, (WORD_BIT-1)
+ psraw xmm6, (WORD_BIT-1)
+ psraw xmm7, (WORD_BIT-1)
+ pxor xmm0, xmm4
+ pxor xmm1, xmm5
+ pxor xmm2, xmm6
+ pxor xmm3, xmm7
+ psubw xmm0, xmm4 ; if (xmm0 < 0) xmm0 = -xmm0;
+ psubw xmm1, xmm5 ; if (xmm1 < 0) xmm1 = -xmm1;
+ psubw xmm2, xmm6 ; if (xmm2 < 0) xmm2 = -xmm2;
+ psubw xmm3, xmm7 ; if (xmm3 < 0) xmm3 = -xmm3;
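+
+ ; (The PSRAW/PXOR/PSUBW sequence above is the classic branchless
+ ; abs(): the shift yields an all-ones mask for negative words, and
+ ; (x ^ -1) - (-1) == -x.)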
+
+ paddw xmm0, XMMWORD [CORRECTION(0,0,rdx)] ; correction + roundfactor
+ paddw xmm1, XMMWORD [CORRECTION(1,0,rdx)]
+ paddw xmm2, XMMWORD [CORRECTION(2,0,rdx)]
+ paddw xmm3, XMMWORD [CORRECTION(3,0,rdx)]
+ pmulhuw xmm0, XMMWORD [RECIPROCAL(0,0,rdx)] ; reciprocal
+ pmulhuw xmm1, XMMWORD [RECIPROCAL(1,0,rdx)]
+ pmulhuw xmm2, XMMWORD [RECIPROCAL(2,0,rdx)]
+ pmulhuw xmm3, XMMWORD [RECIPROCAL(3,0,rdx)]
+ pmulhuw xmm0, XMMWORD [SCALE(0,0,rdx)] ; scale
+ pmulhuw xmm1, XMMWORD [SCALE(1,0,rdx)]
+ pmulhuw xmm2, XMMWORD [SCALE(2,0,rdx)]
+ pmulhuw xmm3, XMMWORD [SCALE(3,0,rdx)]
+
+ pxor xmm0, xmm4
+ pxor xmm1, xmm5
+ pxor xmm2, xmm6
+ pxor xmm3, xmm7
+ psubw xmm0, xmm4
+ psubw xmm1, xmm5
+ psubw xmm2, xmm6
+ psubw xmm3, xmm7
+ movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
+ movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
+ movdqa XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
+ movdqa XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3
+
+ add rsi, byte 32*SIZEOF_DCTELEM
+ add rdx, byte 32*SIZEOF_DCTELEM
+ add rdi, byte 32*SIZEOF_JCOEF
+ dec rax
+ jnz near .quantloop
+
+ uncollect_args 3
+ pop rbp
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
diff --git a/media/libjpeg/simd/x86_64/jsimd.c b/media/libjpeg/simd/x86_64/jsimd.c
new file mode 100644
index 0000000000..584a010ad3
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jsimd.c
@@ -0,0 +1,1068 @@
+/*
+ * jsimd_x86_64.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2009-2011, 2014, 2016, 2018, 2022, D. R. Commander.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on a
+ * 64-bit x86 architecture.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "jconfigint.h"
+
+/*
+ * In the PIC cases, we have no guarantee that constants will keep
+ * their alignment. This macro allows us to verify it at runtime.
+ */
+#define IS_ALIGNED(ptr, order) (((size_t)ptr & ((1 << order) - 1)) == 0)
+
+#define IS_ALIGNED_SSE(ptr) (IS_ALIGNED(ptr, 4)) /* 16-byte alignment */
+#define IS_ALIGNED_AVX(ptr) (IS_ALIGNED(ptr, 5)) /* 32-byte alignment */
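+
+/*
+ * For example, IS_ALIGNED(ptr, 4) tests that the low 4 address bits are
+ * zero, i.e. that ptr is 16-byte-aligned, as the aligned SSE2 loads in
+ * the assembly code require.
+ */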
+
+static unsigned int simd_support = (unsigned int)(~0);
+static unsigned int simd_huffman = 1;
+
+/*
+ * Check what SIMD accelerations are supported.
+ *
+ * FIXME: This code is racy in a multi-threaded environment.
+ */
+LOCAL(void)
+init_simd(void)
+{
+#ifndef NO_GETENV
+ char env[2] = { 0 };
+#endif
+
+ if (simd_support != ~0U)
+ return;
+
+ simd_support = jpeg_simd_cpu_support();
+
+#ifndef NO_GETENV
+ /* Force different settings through environment variables */
+ if (!GETENV_S(env, 2, "JSIMD_FORCESSE2") && !strcmp(env, "1"))
+ simd_support &= JSIMD_SSE2;
+ if (!GETENV_S(env, 2, "JSIMD_FORCEAVX2") && !strcmp(env, "1"))
+ simd_support &= JSIMD_AVX2;
+ if (!GETENV_S(env, 2, "JSIMD_FORCENONE") && !strcmp(env, "1"))
+ simd_support = 0;
+ if (!GETENV_S(env, 2, "JSIMD_NOHUFFENC") && !strcmp(env, "1"))
+ simd_huffman = 0;
+#endif
+}
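+
+/*
+ * init_simd() is called lazily from each jsimd_can_*() query below and
+ * caches the CPUID result in simd_support; in practice the race noted
+ * above can only cause redundant re-detection of the same flags.
+ */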
+
+GLOBAL(int)
+jsimd_can_rgb_ycc(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+ return 0;
+
+ if ((simd_support & JSIMD_AVX2) &&
+ IS_ALIGNED_AVX(jconst_rgb_ycc_convert_avx2))
+ return 1;
+ if ((simd_support & JSIMD_SSE2) &&
+ IS_ALIGNED_SSE(jconst_rgb_ycc_convert_sse2))
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_rgb_gray(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+ return 0;
+
+ if ((simd_support & JSIMD_AVX2) &&
+ IS_ALIGNED_AVX(jconst_rgb_gray_convert_avx2))
+ return 1;
+ if ((simd_support & JSIMD_SSE2) &&
+ IS_ALIGNED_SSE(jconst_rgb_gray_convert_sse2))
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+ return 0;
+
+ if ((simd_support & JSIMD_AVX2) &&
+ IS_ALIGNED_AVX(jconst_ycc_rgb_convert_avx2))
+ return 1;
+ if ((simd_support & JSIMD_SSE2) &&
+ IS_ALIGNED_SSE(jconst_ycc_rgb_convert_sse2))
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb565(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
+{
+ void (*avx2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+ void (*sse2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+ switch (cinfo->in_color_space) {
+ case JCS_EXT_RGB:
+ avx2fct = jsimd_extrgb_ycc_convert_avx2;
+ sse2fct = jsimd_extrgb_ycc_convert_sse2;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ avx2fct = jsimd_extrgbx_ycc_convert_avx2;
+ sse2fct = jsimd_extrgbx_ycc_convert_sse2;
+ break;
+ case JCS_EXT_BGR:
+ avx2fct = jsimd_extbgr_ycc_convert_avx2;
+ sse2fct = jsimd_extbgr_ycc_convert_sse2;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ avx2fct = jsimd_extbgrx_ycc_convert_avx2;
+ sse2fct = jsimd_extbgrx_ycc_convert_sse2;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ avx2fct = jsimd_extxbgr_ycc_convert_avx2;
+ sse2fct = jsimd_extxbgr_ycc_convert_sse2;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ avx2fct = jsimd_extxrgb_ycc_convert_avx2;
+ sse2fct = jsimd_extxrgb_ycc_convert_sse2;
+ break;
+ default:
+ avx2fct = jsimd_rgb_ycc_convert_avx2;
+ sse2fct = jsimd_rgb_ycc_convert_sse2;
+ break;
+ }
+
+ if (simd_support & JSIMD_AVX2)
+ avx2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+ else
+ sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
+{
+ void (*avx2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+ void (*sse2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+ switch (cinfo->in_color_space) {
+ case JCS_EXT_RGB:
+ avx2fct = jsimd_extrgb_gray_convert_avx2;
+ sse2fct = jsimd_extrgb_gray_convert_sse2;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ avx2fct = jsimd_extrgbx_gray_convert_avx2;
+ sse2fct = jsimd_extrgbx_gray_convert_sse2;
+ break;
+ case JCS_EXT_BGR:
+ avx2fct = jsimd_extbgr_gray_convert_avx2;
+ sse2fct = jsimd_extbgr_gray_convert_sse2;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ avx2fct = jsimd_extbgrx_gray_convert_avx2;
+ sse2fct = jsimd_extbgrx_gray_convert_sse2;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ avx2fct = jsimd_extxbgr_gray_convert_avx2;
+ sse2fct = jsimd_extxbgr_gray_convert_sse2;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ avx2fct = jsimd_extxrgb_gray_convert_avx2;
+ sse2fct = jsimd_extxrgb_gray_convert_sse2;
+ break;
+ default:
+ avx2fct = jsimd_rgb_gray_convert_avx2;
+ sse2fct = jsimd_rgb_gray_convert_sse2;
+ break;
+ }
+
+ if (simd_support & JSIMD_AVX2)
+ avx2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+ else
+ sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
+{
+ void (*avx2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+ void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+
+ switch (cinfo->out_color_space) {
+ case JCS_EXT_RGB:
+ avx2fct = jsimd_ycc_extrgb_convert_avx2;
+ sse2fct = jsimd_ycc_extrgb_convert_sse2;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ avx2fct = jsimd_ycc_extrgbx_convert_avx2;
+ sse2fct = jsimd_ycc_extrgbx_convert_sse2;
+ break;
+ case JCS_EXT_BGR:
+ avx2fct = jsimd_ycc_extbgr_convert_avx2;
+ sse2fct = jsimd_ycc_extbgr_convert_sse2;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ avx2fct = jsimd_ycc_extbgrx_convert_avx2;
+ sse2fct = jsimd_ycc_extbgrx_convert_sse2;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ avx2fct = jsimd_ycc_extxbgr_convert_avx2;
+ sse2fct = jsimd_ycc_extxbgr_convert_sse2;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ avx2fct = jsimd_ycc_extxrgb_convert_avx2;
+ sse2fct = jsimd_ycc_extxrgb_convert_sse2;
+ break;
+ default:
+ avx2fct = jsimd_ycc_rgb_convert_avx2;
+ sse2fct = jsimd_ycc_rgb_convert_sse2;
+ break;
+ }
+
+ if (simd_support & JSIMD_AVX2)
+ avx2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+ else
+ sse2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
+{
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_downsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_AVX2)
+ return 1;
+ if (simd_support & JSIMD_SSE2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_downsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_AVX2)
+ return 1;
+ if (simd_support & JSIMD_SSE2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+ if (simd_support & JSIMD_AVX2)
+ jsimd_h2v2_downsample_avx2(cinfo->image_width, cinfo->max_v_samp_factor,
+ compptr->v_samp_factor,
+ compptr->width_in_blocks, input_data,
+ output_data);
+ else
+ jsimd_h2v2_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor,
+ compptr->v_samp_factor,
+ compptr->width_in_blocks, input_data,
+ output_data);
+}
+
+GLOBAL(void)
+jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+ if (simd_support & JSIMD_AVX2)
+ jsimd_h2v1_downsample_avx2(cinfo->image_width, cinfo->max_v_samp_factor,
+ compptr->v_samp_factor,
+ compptr->width_in_blocks, input_data,
+ output_data);
+ else
+ jsimd_h2v1_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor,
+ compptr->v_samp_factor,
+ compptr->width_in_blocks, input_data,
+ output_data);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_AVX2)
+ return 1;
+ if (simd_support & JSIMD_SSE2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_AVX2)
+ return 1;
+ if (simd_support & JSIMD_SSE2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ if (simd_support & JSIMD_AVX2)
+ jsimd_h2v2_upsample_avx2(cinfo->max_v_samp_factor, cinfo->output_width,
+ input_data, output_data_ptr);
+ else
+ jsimd_h2v2_upsample_sse2(cinfo->max_v_samp_factor, cinfo->output_width,
+ input_data, output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ if (simd_support & JSIMD_AVX2)
+ jsimd_h2v1_upsample_avx2(cinfo->max_v_samp_factor, cinfo->output_width,
+ input_data, output_data_ptr);
+ else
+ jsimd_h2v1_upsample_sse2(cinfo->max_v_samp_factor, cinfo->output_width,
+ input_data, output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if ((simd_support & JSIMD_AVX2) &&
+ IS_ALIGNED_AVX(jconst_fancy_upsample_avx2))
+ return 1;
+ if ((simd_support & JSIMD_SSE2) &&
+ IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if ((simd_support & JSIMD_AVX2) &&
+ IS_ALIGNED_AVX(jconst_fancy_upsample_avx2))
+ return 1;
+ if ((simd_support & JSIMD_SSE2) &&
+ IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ if (simd_support & JSIMD_AVX2)
+ jsimd_h2v2_fancy_upsample_avx2(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data,
+ output_data_ptr);
+ else
+ jsimd_h2v2_fancy_upsample_sse2(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data,
+ output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ if (simd_support & JSIMD_AVX2)
+ jsimd_h2v1_fancy_upsample_avx2(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data,
+ output_data_ptr);
+ else
+ jsimd_h2v1_fancy_upsample_sse2(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data,
+ output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if ((simd_support & JSIMD_AVX2) &&
+ IS_ALIGNED_AVX(jconst_merged_upsample_avx2))
+ return 1;
+ if ((simd_support & JSIMD_SSE2) &&
+ IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if ((simd_support & JSIMD_AVX2) &&
+ IS_ALIGNED_AVX(jconst_merged_upsample_avx2))
+ return 1;
+ if ((simd_support & JSIMD_SSE2) &&
+ IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+ void (*avx2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+ void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+ switch (cinfo->out_color_space) {
+ case JCS_EXT_RGB:
+ avx2fct = jsimd_h2v2_extrgb_merged_upsample_avx2;
+ sse2fct = jsimd_h2v2_extrgb_merged_upsample_sse2;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ avx2fct = jsimd_h2v2_extrgbx_merged_upsample_avx2;
+ sse2fct = jsimd_h2v2_extrgbx_merged_upsample_sse2;
+ break;
+ case JCS_EXT_BGR:
+ avx2fct = jsimd_h2v2_extbgr_merged_upsample_avx2;
+ sse2fct = jsimd_h2v2_extbgr_merged_upsample_sse2;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ avx2fct = jsimd_h2v2_extbgrx_merged_upsample_avx2;
+ sse2fct = jsimd_h2v2_extbgrx_merged_upsample_sse2;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ avx2fct = jsimd_h2v2_extxbgr_merged_upsample_avx2;
+ sse2fct = jsimd_h2v2_extxbgr_merged_upsample_sse2;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ avx2fct = jsimd_h2v2_extxrgb_merged_upsample_avx2;
+ sse2fct = jsimd_h2v2_extxrgb_merged_upsample_sse2;
+ break;
+ default:
+ avx2fct = jsimd_h2v2_merged_upsample_avx2;
+ sse2fct = jsimd_h2v2_merged_upsample_sse2;
+ break;
+ }
+
+ if (simd_support & JSIMD_AVX2)
+ avx2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+ else
+ sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(void)
+jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+ void (*avx2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+ void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+ switch (cinfo->out_color_space) {
+ case JCS_EXT_RGB:
+ avx2fct = jsimd_h2v1_extrgb_merged_upsample_avx2;
+ sse2fct = jsimd_h2v1_extrgb_merged_upsample_sse2;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ avx2fct = jsimd_h2v1_extrgbx_merged_upsample_avx2;
+ sse2fct = jsimd_h2v1_extrgbx_merged_upsample_sse2;
+ break;
+ case JCS_EXT_BGR:
+ avx2fct = jsimd_h2v1_extbgr_merged_upsample_avx2;
+ sse2fct = jsimd_h2v1_extbgr_merged_upsample_sse2;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ avx2fct = jsimd_h2v1_extbgrx_merged_upsample_avx2;
+ sse2fct = jsimd_h2v1_extbgrx_merged_upsample_sse2;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ avx2fct = jsimd_h2v1_extxbgr_merged_upsample_avx2;
+ sse2fct = jsimd_h2v1_extxbgr_merged_upsample_sse2;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ avx2fct = jsimd_h2v1_extxrgb_merged_upsample_avx2;
+ sse2fct = jsimd_h2v1_extxrgb_merged_upsample_sse2;
+ break;
+ default:
+ avx2fct = jsimd_h2v1_merged_upsample_avx2;
+ sse2fct = jsimd_h2v1_merged_upsample_sse2;
+ break;
+ }
+
+ if (simd_support & JSIMD_AVX2)
+ avx2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+ else
+ sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(int)
+jsimd_can_convsamp(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_AVX2)
+ return 1;
+ if (simd_support & JSIMD_SSE2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_convsamp_float(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(FAST_FLOAT) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_SSE2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
+ DCTELEM *workspace)
+{
+ if (simd_support & JSIMD_AVX2)
+ jsimd_convsamp_avx2(sample_data, start_col, workspace);
+ else
+ jsimd_convsamp_sse2(sample_data, start_col, workspace);
+}
+
+GLOBAL(void)
+jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
+ FAST_FLOAT *workspace)
+{
+ jsimd_convsamp_float_sse2(sample_data, start_col, workspace);
+}
+
+GLOBAL(int)
+jsimd_can_fdct_islow(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
+ if ((simd_support & JSIMD_AVX2) && IS_ALIGNED_AVX(jconst_fdct_islow_avx2))
+ return 1;
+ if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_islow_sse2))
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_ifast(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
+ if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_ifast_sse2))
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_float(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(FAST_FLOAT) != 4)
+ return 0;
+
+ if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_fdct_float_sse))
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_fdct_islow(DCTELEM *data)
+{
+ if (simd_support & JSIMD_AVX2)
+ jsimd_fdct_islow_avx2(data);
+ else
+ jsimd_fdct_islow_sse2(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_ifast(DCTELEM *data)
+{
+ jsimd_fdct_ifast_sse2(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_float(FAST_FLOAT *data)
+{
+ jsimd_fdct_float_sse(data);
+}
+
+GLOBAL(int)
+jsimd_can_quantize(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_AVX2)
+ return 1;
+ if (simd_support & JSIMD_SSE2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_quantize_float(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (sizeof(FAST_FLOAT) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_SSE2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
+{
+ if (simd_support & JSIMD_AVX2)
+ jsimd_quantize_avx2(coef_block, divisors, workspace);
+ else
+ jsimd_quantize_sse2(coef_block, divisors, workspace);
+}
+
+GLOBAL(void)
+jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+ FAST_FLOAT *workspace)
+{
+ jsimd_quantize_float_sse2(coef_block, divisors, workspace);
+}
+
+GLOBAL(int)
+jsimd_can_idct_2x2(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(ISLOW_MULT_TYPE) != 2)
+ return 0;
+
+ if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_4x4(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(ISLOW_MULT_TYPE) != 2)
+ return 0;
+
+ if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ jsimd_idct_2x2_sse2(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ jsimd_idct_4x4_sse2(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(int)
+jsimd_can_idct_islow(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(ISLOW_MULT_TYPE) != 2)
+ return 0;
+
+ if ((simd_support & JSIMD_AVX2) && IS_ALIGNED_AVX(jconst_idct_islow_avx2))
+ return 1;
+ if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_islow_sse2))
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_ifast(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(IFAST_MULT_TYPE) != 2)
+ return 0;
+ if (IFAST_SCALE_BITS != 2)
+ return 0;
+
+ if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_ifast_sse2))
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_float(void)
+{
+ init_simd();
+
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(FAST_FLOAT) != 4)
+ return 0;
+ if (sizeof(FLOAT_MULT_TYPE) != 4)
+ return 0;
+
+ if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_float_sse2))
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ if (simd_support & JSIMD_AVX2)
+ jsimd_idct_islow_avx2(compptr->dct_table, coef_block, output_buf,
+ output_col);
+ else
+ jsimd_idct_islow_sse2(compptr->dct_table, coef_block, output_buf,
+ output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ jsimd_idct_ifast_sse2(compptr->dct_table, coef_block, output_buf,
+ output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ jsimd_idct_float_sse2(compptr->dct_table, coef_block, output_buf,
+ output_col);
+}
+
+GLOBAL(int)
+jsimd_can_huff_encode_one_block(void)
+{
+ init_simd();
+
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+
+ if ((simd_support & JSIMD_SSE2) && simd_huffman &&
+ IS_ALIGNED_SSE(jconst_huff_encode_one_block))
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(JOCTET *)
+jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
+ int last_dc_val, c_derived_tbl *dctbl,
+ c_derived_tbl *actbl)
+{
+ return jsimd_huff_encode_one_block_sse2(state, buffer, block, last_dc_val,
+ dctbl, actbl);
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_first_prepare(void)
+{
+ init_simd();
+
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (simd_support & JSIMD_SSE2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, JCOEF *values, size_t *zerobits)
+{
+ jsimd_encode_mcu_AC_first_prepare_sse2(block, jpeg_natural_order_start,
+ Sl, Al, values, zerobits);
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_refine_prepare(void)
+{
+ init_simd();
+
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (simd_support & JSIMD_SSE2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, JCOEF *absvalues, size_t *bits)
+{
+ return jsimd_encode_mcu_AC_refine_prepare_sse2(block,
+ jpeg_natural_order_start,
+ Sl, Al, absvalues, bits);
+}
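Each jsimd_can_*() probe above advertises whether a SIMD path is safe to
use; the core library pairs it with the same-named dispatcher once, at
setup time (for the IDCT case this happens in jddctmgr.c). A minimal
sketch of that pairing in C, illustrative only: jpeg_idct_islow() is the
portable fallback from jidctint.c, and the types come from
jpeglib.h/jpegint.h/jsimd.h.

    /* Sketch: choose the SIMD islow IDCT when the probe says it is
       usable, otherwise fall back to the portable C implementation. */
    static void
    choose_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                      JCOEFPTR coef_block, JSAMPARRAY output_buf,
                      JDIMENSION output_col)
    {
      if (jsimd_can_idct_islow())
        jsimd_idct_islow(cinfo, compptr, coef_block, output_buf, output_col);
      else
        jpeg_idct_islow(cinfo, compptr, coef_block, output_buf, output_col);
    }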
diff --git a/media/libjpeg/simd/x86_64/jsimdcpu.asm b/media/libjpeg/simd/x86_64/jsimdcpu.asm
new file mode 100644
index 0000000000..705f813d7d
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jsimdcpu.asm
@@ -0,0 +1,86 @@
+;
+; jsimdcpu.asm - SIMD instruction support check
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; and can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
+;
+; Check if the CPU supports SIMD instructions
+;
+; GLOBAL(unsigned int)
+; jpeg_simd_cpu_support(void)
+;
+
+ align 32
+ GLOBAL_FUNCTION(jpeg_simd_cpu_support)
+
+EXTN(jpeg_simd_cpu_support):
+ push rbx
+ push rdi
+
+ xor rdi, rdi ; simd support flag
+
+ ; Assume that all x86-64 processors support SSE & SSE2 instructions
+ or rdi, JSIMD_SSE2
+ or rdi, JSIMD_SSE
+
+ ; Check whether CPUID leaf 07H is supported
+ ; (leaf 07H is used to check for AVX2 instruction support)
+ mov rax, 0
+ cpuid
+ cmp rax, 7
+ jl short .return ; Maximum leaf < 07H
+
+ ; Check for AVX2 instruction support
+ mov rax, 7
+ xor rcx, rcx
+ cpuid
+ mov rax, rbx ; rax = Extended feature flags
+
+ test rax, 1<<5 ; bit5:AVX2
+ jz short .return
+
+ ; Check for AVX2 O/S support
+ mov rax, 1
+ xor rcx, rcx
+ cpuid
+ test rcx, 1<<27
+ jz short .return ; O/S does not support XSAVE
+ test rcx, 1<<28
+ jz short .return ; CPU does not support AVX
+
+ xor rcx, rcx
+ xgetbv
+ and rax, 6
+ cmp rax, 6 ; O/S does not manage XMM/YMM state
+ ; using XSAVE
+ jnz short .return
+
+ or rdi, JSIMD_AVX2
+
+.return:
+ mov rax, rdi
+
+ pop rdi
+ pop rbx
+ ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+ align 32
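The routine above assumes SSE/SSE2 unconditionally (architecturally
guaranteed on x86-64) and then walks CPUID leaf 07H, CPUID leaf 01H and
XGETBV to confirm that both the CPU and the OS support AVX2. For readers
who don't speak NASM, here is a rough C analogue using GCC/Clang
builtins; a sketch only, not part of this tree (the builtin's libgcc
runtime performs a comparable CPUID and xgetbv validation internally):

    /* Sketch of jpeg_simd_cpu_support() in C, assuming GCC or Clang
       and the JSIMD_* flag constants from simd/jsimd.h. */
    unsigned int simd_cpu_support_sketch(void)
    {
      unsigned int flags = JSIMD_SSE | JSIMD_SSE2;  /* x86-64 baseline */

      __builtin_cpu_init();
      /* True only when the CPUID.7:EBX AVX2 bit is set and the OS
         manages YMM state via XSAVE. */
      if (__builtin_cpu_supports("avx2"))
        flags |= JSIMD_AVX2;
      return flags;
    }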
diff --git a/media/libogg/CHANGES b/media/libogg/CHANGES
index 3f2e0fb261..48f671e59b 100644
--- a/media/libogg/CHANGES
+++ b/media/libogg/CHANGES
@@ -1,3 +1,27 @@
+Version 1.3.5 (2020 June 3)
+
+ * Fix unsigned typedef problem on macOS.
+ * Fix overflow check in ogg_sync_buffer.
+ * Clean up cmake and autotools build files.
+ * Remove Symbian and Apple XCode build files.
+ * Fix documentation cross-reference links.
+
+Version 1.3.4 (2019 August 30)
+
+ * Faster slice-by-8 CRC32 implementation;
+   see https://lwn.net/Articles/453931/ for motivation.
+ * Add CMake build.
+ * Deprecate Visual Studio project files in favor of CMake.
+ * Add a configure --disable-crc option for fuzzing.
+ * Various build fixes.
+ * Documentation and example code fixes.
+
+Version 1.3.3 (2017 November 7)
+
+ * Fix an issue with corrupt continued packet handling.
+ * Update Windows projects and build settings.
+ * Remove Mac OS 9 build support.
+
Version 1.3.2 (2014 May 27)
 * Fix a bug in oggpack_writecopy().
@@ -76,7 +100,7 @@ Version 1.1 (2003 November 17)
* big-endian bitpacker routines for Theora
* various portability fixes
- * improved API documenation
+ * improved API documentation
* RFC 3533 documentation of the format by Silvia Pfeiffer at CSIRO
* RFC 3534 documentation of the application/ogg mime-type by Linus Walleij
diff --git a/media/libogg/README b/media/libogg/README
deleted file mode 100644
index 2db22e65f4..0000000000
--- a/media/libogg/README
+++ /dev/null
@@ -1,97 +0,0 @@
-********************************************************************
-* *
-* THIS FILE IS PART OF THE OggVorbis SOFTWARE CODEC SOURCE CODE. *
-* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
-* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
-* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
-* *
-* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2011 *
-* by the Xiph.Org Foundation http://www.xiph.org/ *
-* *
-********************************************************************
-
-= WHAT'S HERE =
-
-This source distribution includes libogg and nothing else. Other modules
-(eg, the modules libvorbis, vorbis-tools for the Vorbis music codec,
-libtheora for the Theora video codec) contain the codec libraries for
-use with Ogg bitstreams.
-
-Directory:
-
-./src The source for libogg, a BSD-license inplementation of
- the public domain Ogg bitstream format
-
-./include Library API headers
-
-./doc Ogg specification and libogg API documents
-
-./win32 Win32 projects and build automation
-
-./macosx Mac OS X project and build files
-
-= WHAT IS OGG? =
-
-Ogg project codecs use the Ogg bitstream format to arrange the raw,
-compressed bitstream into a more robust, useful form. For example,
-the Ogg bitstream makes seeking, time stamping and error recovery
-possible, as well as mixing several sepearate, concurrent media
-streams into a single physical bitstream.
-
-= CONTACT =
-
-The Ogg homepage is located at 'https://www.xiph.org/ogg/'.
-Up to date technical documents, contact information, source code and
-pre-built utilities may be found there.
-
-BUILDING FROM TARBALL DISTRIBUTIONS:
-
-./configure
-make
-
-and optionally (as root):
-make install
-
-This will install the Ogg libraries (static and shared) into
-/usr/local/lib, includes into /usr/local/include and API
-documentation into /usr/local/share/doc.
-
-BUILDING FROM REPOSITORY SOURCE:
-
-A standard svn build should consist of nothing more than:
-
-./autogen.sh
-make
-
-and as root if desired :
-
-make install
-
-BUILDING ON WIN32:
-
-Use the project file in the win32 directory. It should compile out of the box.
-
-CROSS COMPILING FROM LINUX TO WIN32:
-
-It is also possible to cross compile from Linux to windows using the MinGW
-cross tools and even to run the test suite under Wine, the Linux/*nix
-windows emulator.
-
-On Debian and Ubuntu systems, these cross compiler tools can be installed
-by doing:
-
- sudo apt-get mingw32 mingw32-binutils mingw32-runtime wine
-
-Once these tools are installed its possible to compile and test by
-executing the following commands, or something similar depending on
-your system:
-
- ./configure --host=i586-mingw32msvc --target=i586-mingw32msvc \
- --build=i586-linux
- make
- make check
-
-(Build instructions for Ogg codecs such as vorbis are similar and may
-be found in those source modules' README files)
-
-$Id: README 18096 2011-09-22 23:32:51Z giles $
diff --git a/media/libogg/README.md b/media/libogg/README.md
new file mode 100644
index 0000000000..0101cb14dc
--- /dev/null
+++ b/media/libogg/README.md
@@ -0,0 +1,160 @@
+# Ogg
+
+[![Travis Build Status](https://travis-ci.org/xiph/ogg.svg?branch=master)](https://travis-ci.org/xiph/ogg)
+[![Jenkins Build Status](https://mf4.xiph.org/jenkins/job/libogg/badge/icon)](https://mf4.xiph.org/jenkins/job/libogg/)
+[![AppVeyor Build Status](https://ci.appveyor.com/api/projects/status/github/xiph/ogg?branch=master&svg=true)](https://ci.appveyor.com/project/rillian/ogg)
+
+Ogg project codecs use the Ogg bitstream format to arrange the raw,
+compressed bitstream into a more robust, useful form. For example,
+the Ogg bitstream makes seeking, time stamping and error recovery
+possible, as well as mixing several separate, concurrent media
+streams into a single physical bitstream.
+
+## What's here ##
+This source distribution includes libogg and nothing else. Other modules
+(e.g., libvorbis and vorbis-tools for the Vorbis music codec,
+libtheora for the Theora video codec) contain the codec libraries for
+use with Ogg bitstreams.
+
+Directory:
+
+- `src` The source for libogg, a BSD-licensed implementation of the public domain Ogg bitstream format
+
+- `include` Library API headers
+
+- `doc` Ogg specification and libogg API documents
+
+- `win32` Win32 projects and build automation
+
+## Contact ##
+
+The Ogg homepage is located at https://www.xiph.org/ogg/ .
+Up to date technical documents, contact information, source code and
+pre-built utilities may be found there.
+
+## Building ##
+
+#### Building from tarball distributions ####
+
+ ./configure
+ make
+
+and optionally (as root):
+
+ make install
+
+This will install the Ogg libraries (static and shared) into
+/usr/local/lib, includes into /usr/local/include and API
+documentation into /usr/local/share/doc.
+
+#### Building from repository source ####
+
+A standard build from the git repository should consist of nothing more than:
+
+ ./autogen.sh
+ ./configure
+ make
+
+and, as root if desired:
+
+ make install
+
+#### Building on Windows ####
+
+Use the project file in the win32 directory. It should compile out of the box.
+
+#### Cross-compiling from Linux to Windows ####
+
+It is also possible to cross-compile from Linux to Windows using the MinGW
+cross tools, and even to run the test suite under Wine, the Windows
+compatibility layer for Linux/*nix.
+
+On Debian and Ubuntu systems, these cross compiler tools can be installed
+by doing:
+
+ sudo apt-get install mingw32 mingw32-binutils mingw32-runtime wine
+
+Once these tools are installed, it's possible to compile and test by
+executing the following commands, or something similar depending on
+your system:
+
+ ./configure --host=i586-mingw32msvc --target=i586-mingw32msvc --build=i586-linux
+ make
+ make check
+
+(Build instructions for Ogg codecs such as vorbis are similar and may
+be found in those source modules' README files)
+
+## Building with CMake ##
+
+Ogg supports building using [CMake](http://www.cmake.org/). CMake is a meta build system that generates native projects for each platform.
+To generate projects, run cmake, replacing `YOUR-PROJECT-GENERATOR` with a generator from the list [here](http://www.cmake.org/cmake/help/v3.2/manual/cmake-generators.7.html):
+
+ mkdir build
+ cd build
+ cmake -G YOUR-PROJECT-GENERATOR ..
+
+Note that by default cmake generates projects that will build static libraries.
+To generate projects that will build a dynamic library, use the `BUILD_SHARED_LIBS` option like this:
+
+ cmake -G YOUR-PROJECT-GENERATOR -DBUILD_SHARED_LIBS=1 ..
+
+After the projects are generated, use them as usual.
+
+#### Building on Windows ####
+
+Use the proper generator for your Visual Studio version, for example:
+
+ cmake -G "Visual Studio 12 2013" ..
+
+#### Building on Mac OS X ####
+
+Use the Xcode generator. To build a framework, run:
+
+ cmake -G Xcode -DBUILD_FRAMEWORK=1 ..
+
+#### Building on Linux ####
+
+Use the Makefile generator, which is the default:
+
+ cmake ..
+ make
+
+## Testing ##
+
+This package includes a collection of automated tests.
+Running them is optional; they are not part of the build or install steps.
+
+### Unix-like System or MinGW ###
+
+If built with automake:
+
+ make check
+
+If built with CMake:
+
+ make test
+
+or:
+
+ ctest
+
+### Windows with MSBuild ###
+
+If build with configuration type "Debug", then:
+
+ ctest -C Debug
+
+If build with configuration type "Release", then:
+
+ ctest -C Release
+
+## License ##
+
+THIS FILE IS PART OF THE OggVorbis SOFTWARE CODEC SOURCE CODE.
+USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS
+GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE
+IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.
+
+THE OggVorbis SOURCE CODE IS COPYRIGHT (C) 1994-2019
+by the Xiph.Org Foundation https://www.xiph.org/
diff --git a/media/libogg/README_MOZILLA b/media/libogg/README_MOZILLA
index 6213fdc777..7b10a7c19e 100644
--- a/media/libogg/README_MOZILLA
+++ b/media/libogg/README_MOZILLA
@@ -1,4 +1,4 @@
-Version: 1.3.2
+Version: 1.3.5
The source from this directory was extracted from the official source
package downloaded from xiph.org and copied using the update.sh script.
diff --git a/media/libogg/include/crctable.h b/media/libogg/include/crctable.h
new file mode 100644
index 0000000000..dcc378b309
--- /dev/null
+++ b/media/libogg/include/crctable.h
@@ -0,0 +1,278 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE Ogg CONTAINER SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2018 *
+ * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * *
+ ********************************************************************/
+
+#include <ogg/os_types.h>
+
+static const ogg_uint32_t crc_lookup[8][256]={
+{0x00000000,0x04c11db7,0x09823b6e,0x0d4326d9,0x130476dc,0x17c56b6b,0x1a864db2,0x1e475005,
+ 0x2608edb8,0x22c9f00f,0x2f8ad6d6,0x2b4bcb61,0x350c9b64,0x31cd86d3,0x3c8ea00a,0x384fbdbd,
+ 0x4c11db70,0x48d0c6c7,0x4593e01e,0x4152fda9,0x5f15adac,0x5bd4b01b,0x569796c2,0x52568b75,
+ 0x6a1936c8,0x6ed82b7f,0x639b0da6,0x675a1011,0x791d4014,0x7ddc5da3,0x709f7b7a,0x745e66cd,
+ 0x9823b6e0,0x9ce2ab57,0x91a18d8e,0x95609039,0x8b27c03c,0x8fe6dd8b,0x82a5fb52,0x8664e6e5,
+ 0xbe2b5b58,0xbaea46ef,0xb7a96036,0xb3687d81,0xad2f2d84,0xa9ee3033,0xa4ad16ea,0xa06c0b5d,
+ 0xd4326d90,0xd0f37027,0xddb056fe,0xd9714b49,0xc7361b4c,0xc3f706fb,0xceb42022,0xca753d95,
+ 0xf23a8028,0xf6fb9d9f,0xfbb8bb46,0xff79a6f1,0xe13ef6f4,0xe5ffeb43,0xe8bccd9a,0xec7dd02d,
+ 0x34867077,0x30476dc0,0x3d044b19,0x39c556ae,0x278206ab,0x23431b1c,0x2e003dc5,0x2ac12072,
+ 0x128e9dcf,0x164f8078,0x1b0ca6a1,0x1fcdbb16,0x018aeb13,0x054bf6a4,0x0808d07d,0x0cc9cdca,
+ 0x7897ab07,0x7c56b6b0,0x71159069,0x75d48dde,0x6b93dddb,0x6f52c06c,0x6211e6b5,0x66d0fb02,
+ 0x5e9f46bf,0x5a5e5b08,0x571d7dd1,0x53dc6066,0x4d9b3063,0x495a2dd4,0x44190b0d,0x40d816ba,
+ 0xaca5c697,0xa864db20,0xa527fdf9,0xa1e6e04e,0xbfa1b04b,0xbb60adfc,0xb6238b25,0xb2e29692,
+ 0x8aad2b2f,0x8e6c3698,0x832f1041,0x87ee0df6,0x99a95df3,0x9d684044,0x902b669d,0x94ea7b2a,
+ 0xe0b41de7,0xe4750050,0xe9362689,0xedf73b3e,0xf3b06b3b,0xf771768c,0xfa325055,0xfef34de2,
+ 0xc6bcf05f,0xc27dede8,0xcf3ecb31,0xcbffd686,0xd5b88683,0xd1799b34,0xdc3abded,0xd8fba05a,
+ 0x690ce0ee,0x6dcdfd59,0x608edb80,0x644fc637,0x7a089632,0x7ec98b85,0x738aad5c,0x774bb0eb,
+ 0x4f040d56,0x4bc510e1,0x46863638,0x42472b8f,0x5c007b8a,0x58c1663d,0x558240e4,0x51435d53,
+ 0x251d3b9e,0x21dc2629,0x2c9f00f0,0x285e1d47,0x36194d42,0x32d850f5,0x3f9b762c,0x3b5a6b9b,
+ 0x0315d626,0x07d4cb91,0x0a97ed48,0x0e56f0ff,0x1011a0fa,0x14d0bd4d,0x19939b94,0x1d528623,
+ 0xf12f560e,0xf5ee4bb9,0xf8ad6d60,0xfc6c70d7,0xe22b20d2,0xe6ea3d65,0xeba91bbc,0xef68060b,
+ 0xd727bbb6,0xd3e6a601,0xdea580d8,0xda649d6f,0xc423cd6a,0xc0e2d0dd,0xcda1f604,0xc960ebb3,
+ 0xbd3e8d7e,0xb9ff90c9,0xb4bcb610,0xb07daba7,0xae3afba2,0xaafbe615,0xa7b8c0cc,0xa379dd7b,
+ 0x9b3660c6,0x9ff77d71,0x92b45ba8,0x9675461f,0x8832161a,0x8cf30bad,0x81b02d74,0x857130c3,
+ 0x5d8a9099,0x594b8d2e,0x5408abf7,0x50c9b640,0x4e8ee645,0x4a4ffbf2,0x470cdd2b,0x43cdc09c,
+ 0x7b827d21,0x7f436096,0x7200464f,0x76c15bf8,0x68860bfd,0x6c47164a,0x61043093,0x65c52d24,
+ 0x119b4be9,0x155a565e,0x18197087,0x1cd86d30,0x029f3d35,0x065e2082,0x0b1d065b,0x0fdc1bec,
+ 0x3793a651,0x3352bbe6,0x3e119d3f,0x3ad08088,0x2497d08d,0x2056cd3a,0x2d15ebe3,0x29d4f654,
+ 0xc5a92679,0xc1683bce,0xcc2b1d17,0xc8ea00a0,0xd6ad50a5,0xd26c4d12,0xdf2f6bcb,0xdbee767c,
+ 0xe3a1cbc1,0xe760d676,0xea23f0af,0xeee2ed18,0xf0a5bd1d,0xf464a0aa,0xf9278673,0xfde69bc4,
+ 0x89b8fd09,0x8d79e0be,0x803ac667,0x84fbdbd0,0x9abc8bd5,0x9e7d9662,0x933eb0bb,0x97ffad0c,
+ 0xafb010b1,0xab710d06,0xa6322bdf,0xa2f33668,0xbcb4666d,0xb8757bda,0xb5365d03,0xb1f740b4},
+
+{0x00000000,0xd219c1dc,0xa0f29e0f,0x72eb5fd3,0x452421a9,0x973de075,0xe5d6bfa6,0x37cf7e7a,
+ 0x8a484352,0x5851828e,0x2abadd5d,0xf8a31c81,0xcf6c62fb,0x1d75a327,0x6f9efcf4,0xbd873d28,
+ 0x10519b13,0xc2485acf,0xb0a3051c,0x62bac4c0,0x5575baba,0x876c7b66,0xf58724b5,0x279ee569,
+ 0x9a19d841,0x4800199d,0x3aeb464e,0xe8f28792,0xdf3df9e8,0x0d243834,0x7fcf67e7,0xadd6a63b,
+ 0x20a33626,0xf2baf7fa,0x8051a829,0x524869f5,0x6587178f,0xb79ed653,0xc5758980,0x176c485c,
+ 0xaaeb7574,0x78f2b4a8,0x0a19eb7b,0xd8002aa7,0xefcf54dd,0x3dd69501,0x4f3dcad2,0x9d240b0e,
+ 0x30f2ad35,0xe2eb6ce9,0x9000333a,0x4219f2e6,0x75d68c9c,0xa7cf4d40,0xd5241293,0x073dd34f,
+ 0xbabaee67,0x68a32fbb,0x1a487068,0xc851b1b4,0xff9ecfce,0x2d870e12,0x5f6c51c1,0x8d75901d,
+ 0x41466c4c,0x935fad90,0xe1b4f243,0x33ad339f,0x04624de5,0xd67b8c39,0xa490d3ea,0x76891236,
+ 0xcb0e2f1e,0x1917eec2,0x6bfcb111,0xb9e570cd,0x8e2a0eb7,0x5c33cf6b,0x2ed890b8,0xfcc15164,
+ 0x5117f75f,0x830e3683,0xf1e56950,0x23fca88c,0x1433d6f6,0xc62a172a,0xb4c148f9,0x66d88925,
+ 0xdb5fb40d,0x094675d1,0x7bad2a02,0xa9b4ebde,0x9e7b95a4,0x4c625478,0x3e890bab,0xec90ca77,
+ 0x61e55a6a,0xb3fc9bb6,0xc117c465,0x130e05b9,0x24c17bc3,0xf6d8ba1f,0x8433e5cc,0x562a2410,
+ 0xebad1938,0x39b4d8e4,0x4b5f8737,0x994646eb,0xae893891,0x7c90f94d,0x0e7ba69e,0xdc626742,
+ 0x71b4c179,0xa3ad00a5,0xd1465f76,0x035f9eaa,0x3490e0d0,0xe689210c,0x94627edf,0x467bbf03,
+ 0xfbfc822b,0x29e543f7,0x5b0e1c24,0x8917ddf8,0xbed8a382,0x6cc1625e,0x1e2a3d8d,0xcc33fc51,
+ 0x828cd898,0x50951944,0x227e4697,0xf067874b,0xc7a8f931,0x15b138ed,0x675a673e,0xb543a6e2,
+ 0x08c49bca,0xdadd5a16,0xa83605c5,0x7a2fc419,0x4de0ba63,0x9ff97bbf,0xed12246c,0x3f0be5b0,
+ 0x92dd438b,0x40c48257,0x322fdd84,0xe0361c58,0xd7f96222,0x05e0a3fe,0x770bfc2d,0xa5123df1,
+ 0x189500d9,0xca8cc105,0xb8679ed6,0x6a7e5f0a,0x5db12170,0x8fa8e0ac,0xfd43bf7f,0x2f5a7ea3,
+ 0xa22feebe,0x70362f62,0x02dd70b1,0xd0c4b16d,0xe70bcf17,0x35120ecb,0x47f95118,0x95e090c4,
+ 0x2867adec,0xfa7e6c30,0x889533e3,0x5a8cf23f,0x6d438c45,0xbf5a4d99,0xcdb1124a,0x1fa8d396,
+ 0xb27e75ad,0x6067b471,0x128ceba2,0xc0952a7e,0xf75a5404,0x254395d8,0x57a8ca0b,0x85b10bd7,
+ 0x383636ff,0xea2ff723,0x98c4a8f0,0x4add692c,0x7d121756,0xaf0bd68a,0xdde08959,0x0ff94885,
+ 0xc3cab4d4,0x11d37508,0x63382adb,0xb121eb07,0x86ee957d,0x54f754a1,0x261c0b72,0xf405caae,
+ 0x4982f786,0x9b9b365a,0xe9706989,0x3b69a855,0x0ca6d62f,0xdebf17f3,0xac544820,0x7e4d89fc,
+ 0xd39b2fc7,0x0182ee1b,0x7369b1c8,0xa1707014,0x96bf0e6e,0x44a6cfb2,0x364d9061,0xe45451bd,
+ 0x59d36c95,0x8bcaad49,0xf921f29a,0x2b383346,0x1cf74d3c,0xceee8ce0,0xbc05d333,0x6e1c12ef,
+ 0xe36982f2,0x3170432e,0x439b1cfd,0x9182dd21,0xa64da35b,0x74546287,0x06bf3d54,0xd4a6fc88,
+ 0x6921c1a0,0xbb38007c,0xc9d35faf,0x1bca9e73,0x2c05e009,0xfe1c21d5,0x8cf77e06,0x5eeebfda,
+ 0xf33819e1,0x2121d83d,0x53ca87ee,0x81d34632,0xb61c3848,0x6405f994,0x16eea647,0xc4f7679b,
+ 0x79705ab3,0xab699b6f,0xd982c4bc,0x0b9b0560,0x3c547b1a,0xee4dbac6,0x9ca6e515,0x4ebf24c9},
+
+{0x00000000,0x01d8ac87,0x03b1590e,0x0269f589,0x0762b21c,0x06ba1e9b,0x04d3eb12,0x050b4795,
+ 0x0ec56438,0x0f1dc8bf,0x0d743d36,0x0cac91b1,0x09a7d624,0x087f7aa3,0x0a168f2a,0x0bce23ad,
+ 0x1d8ac870,0x1c5264f7,0x1e3b917e,0x1fe33df9,0x1ae87a6c,0x1b30d6eb,0x19592362,0x18818fe5,
+ 0x134fac48,0x129700cf,0x10fef546,0x112659c1,0x142d1e54,0x15f5b2d3,0x179c475a,0x1644ebdd,
+ 0x3b1590e0,0x3acd3c67,0x38a4c9ee,0x397c6569,0x3c7722fc,0x3daf8e7b,0x3fc67bf2,0x3e1ed775,
+ 0x35d0f4d8,0x3408585f,0x3661add6,0x37b90151,0x32b246c4,0x336aea43,0x31031fca,0x30dbb34d,
+ 0x269f5890,0x2747f417,0x252e019e,0x24f6ad19,0x21fdea8c,0x2025460b,0x224cb382,0x23941f05,
+ 0x285a3ca8,0x2982902f,0x2beb65a6,0x2a33c921,0x2f388eb4,0x2ee02233,0x2c89d7ba,0x2d517b3d,
+ 0x762b21c0,0x77f38d47,0x759a78ce,0x7442d449,0x714993dc,0x70913f5b,0x72f8cad2,0x73206655,
+ 0x78ee45f8,0x7936e97f,0x7b5f1cf6,0x7a87b071,0x7f8cf7e4,0x7e545b63,0x7c3daeea,0x7de5026d,
+ 0x6ba1e9b0,0x6a794537,0x6810b0be,0x69c81c39,0x6cc35bac,0x6d1bf72b,0x6f7202a2,0x6eaaae25,
+ 0x65648d88,0x64bc210f,0x66d5d486,0x670d7801,0x62063f94,0x63de9313,0x61b7669a,0x606fca1d,
+ 0x4d3eb120,0x4ce61da7,0x4e8fe82e,0x4f5744a9,0x4a5c033c,0x4b84afbb,0x49ed5a32,0x4835f6b5,
+ 0x43fbd518,0x4223799f,0x404a8c16,0x41922091,0x44996704,0x4541cb83,0x47283e0a,0x46f0928d,
+ 0x50b47950,0x516cd5d7,0x5305205e,0x52dd8cd9,0x57d6cb4c,0x560e67cb,0x54679242,0x55bf3ec5,
+ 0x5e711d68,0x5fa9b1ef,0x5dc04466,0x5c18e8e1,0x5913af74,0x58cb03f3,0x5aa2f67a,0x5b7a5afd,
+ 0xec564380,0xed8eef07,0xefe71a8e,0xee3fb609,0xeb34f19c,0xeaec5d1b,0xe885a892,0xe95d0415,
+ 0xe29327b8,0xe34b8b3f,0xe1227eb6,0xe0fad231,0xe5f195a4,0xe4293923,0xe640ccaa,0xe798602d,
+ 0xf1dc8bf0,0xf0042777,0xf26dd2fe,0xf3b57e79,0xf6be39ec,0xf766956b,0xf50f60e2,0xf4d7cc65,
+ 0xff19efc8,0xfec1434f,0xfca8b6c6,0xfd701a41,0xf87b5dd4,0xf9a3f153,0xfbca04da,0xfa12a85d,
+ 0xd743d360,0xd69b7fe7,0xd4f28a6e,0xd52a26e9,0xd021617c,0xd1f9cdfb,0xd3903872,0xd24894f5,
+ 0xd986b758,0xd85e1bdf,0xda37ee56,0xdbef42d1,0xdee40544,0xdf3ca9c3,0xdd555c4a,0xdc8df0cd,
+ 0xcac91b10,0xcb11b797,0xc978421e,0xc8a0ee99,0xcdaba90c,0xcc73058b,0xce1af002,0xcfc25c85,
+ 0xc40c7f28,0xc5d4d3af,0xc7bd2626,0xc6658aa1,0xc36ecd34,0xc2b661b3,0xc0df943a,0xc10738bd,
+ 0x9a7d6240,0x9ba5cec7,0x99cc3b4e,0x981497c9,0x9d1fd05c,0x9cc77cdb,0x9eae8952,0x9f7625d5,
+ 0x94b80678,0x9560aaff,0x97095f76,0x96d1f3f1,0x93dab464,0x920218e3,0x906bed6a,0x91b341ed,
+ 0x87f7aa30,0x862f06b7,0x8446f33e,0x859e5fb9,0x8095182c,0x814db4ab,0x83244122,0x82fceda5,
+ 0x8932ce08,0x88ea628f,0x8a839706,0x8b5b3b81,0x8e507c14,0x8f88d093,0x8de1251a,0x8c39899d,
+ 0xa168f2a0,0xa0b05e27,0xa2d9abae,0xa3010729,0xa60a40bc,0xa7d2ec3b,0xa5bb19b2,0xa463b535,
+ 0xafad9698,0xae753a1f,0xac1ccf96,0xadc46311,0xa8cf2484,0xa9178803,0xab7e7d8a,0xaaa6d10d,
+ 0xbce23ad0,0xbd3a9657,0xbf5363de,0xbe8bcf59,0xbb8088cc,0xba58244b,0xb831d1c2,0xb9e97d45,
+ 0xb2275ee8,0xb3fff26f,0xb19607e6,0xb04eab61,0xb545ecf4,0xb49d4073,0xb6f4b5fa,0xb72c197d},
+
+{0x00000000,0xdc6d9ab7,0xbc1a28d9,0x6077b26e,0x7cf54c05,0xa098d6b2,0xc0ef64dc,0x1c82fe6b,
+ 0xf9ea980a,0x258702bd,0x45f0b0d3,0x999d2a64,0x851fd40f,0x59724eb8,0x3905fcd6,0xe5686661,
+ 0xf7142da3,0x2b79b714,0x4b0e057a,0x97639fcd,0x8be161a6,0x578cfb11,0x37fb497f,0xeb96d3c8,
+ 0x0efeb5a9,0xd2932f1e,0xb2e49d70,0x6e8907c7,0x720bf9ac,0xae66631b,0xce11d175,0x127c4bc2,
+ 0xeae946f1,0x3684dc46,0x56f36e28,0x8a9ef49f,0x961c0af4,0x4a719043,0x2a06222d,0xf66bb89a,
+ 0x1303defb,0xcf6e444c,0xaf19f622,0x73746c95,0x6ff692fe,0xb39b0849,0xd3ecba27,0x0f812090,
+ 0x1dfd6b52,0xc190f1e5,0xa1e7438b,0x7d8ad93c,0x61082757,0xbd65bde0,0xdd120f8e,0x017f9539,
+ 0xe417f358,0x387a69ef,0x580ddb81,0x84604136,0x98e2bf5d,0x448f25ea,0x24f89784,0xf8950d33,
+ 0xd1139055,0x0d7e0ae2,0x6d09b88c,0xb164223b,0xade6dc50,0x718b46e7,0x11fcf489,0xcd916e3e,
+ 0x28f9085f,0xf49492e8,0x94e32086,0x488eba31,0x540c445a,0x8861deed,0xe8166c83,0x347bf634,
+ 0x2607bdf6,0xfa6a2741,0x9a1d952f,0x46700f98,0x5af2f1f3,0x869f6b44,0xe6e8d92a,0x3a85439d,
+ 0xdfed25fc,0x0380bf4b,0x63f70d25,0xbf9a9792,0xa31869f9,0x7f75f34e,0x1f024120,0xc36fdb97,
+ 0x3bfad6a4,0xe7974c13,0x87e0fe7d,0x5b8d64ca,0x470f9aa1,0x9b620016,0xfb15b278,0x277828cf,
+ 0xc2104eae,0x1e7dd419,0x7e0a6677,0xa267fcc0,0xbee502ab,0x6288981c,0x02ff2a72,0xde92b0c5,
+ 0xcceefb07,0x108361b0,0x70f4d3de,0xac994969,0xb01bb702,0x6c762db5,0x0c019fdb,0xd06c056c,
+ 0x3504630d,0xe969f9ba,0x891e4bd4,0x5573d163,0x49f12f08,0x959cb5bf,0xf5eb07d1,0x29869d66,
+ 0xa6e63d1d,0x7a8ba7aa,0x1afc15c4,0xc6918f73,0xda137118,0x067eebaf,0x660959c1,0xba64c376,
+ 0x5f0ca517,0x83613fa0,0xe3168dce,0x3f7b1779,0x23f9e912,0xff9473a5,0x9fe3c1cb,0x438e5b7c,
+ 0x51f210be,0x8d9f8a09,0xede83867,0x3185a2d0,0x2d075cbb,0xf16ac60c,0x911d7462,0x4d70eed5,
+ 0xa81888b4,0x74751203,0x1402a06d,0xc86f3ada,0xd4edc4b1,0x08805e06,0x68f7ec68,0xb49a76df,
+ 0x4c0f7bec,0x9062e15b,0xf0155335,0x2c78c982,0x30fa37e9,0xec97ad5e,0x8ce01f30,0x508d8587,
+ 0xb5e5e3e6,0x69887951,0x09ffcb3f,0xd5925188,0xc910afe3,0x157d3554,0x750a873a,0xa9671d8d,
+ 0xbb1b564f,0x6776ccf8,0x07017e96,0xdb6ce421,0xc7ee1a4a,0x1b8380fd,0x7bf43293,0xa799a824,
+ 0x42f1ce45,0x9e9c54f2,0xfeebe69c,0x22867c2b,0x3e048240,0xe26918f7,0x821eaa99,0x5e73302e,
+ 0x77f5ad48,0xab9837ff,0xcbef8591,0x17821f26,0x0b00e14d,0xd76d7bfa,0xb71ac994,0x6b775323,
+ 0x8e1f3542,0x5272aff5,0x32051d9b,0xee68872c,0xf2ea7947,0x2e87e3f0,0x4ef0519e,0x929dcb29,
+ 0x80e180eb,0x5c8c1a5c,0x3cfba832,0xe0963285,0xfc14ccee,0x20795659,0x400ee437,0x9c637e80,
+ 0x790b18e1,0xa5668256,0xc5113038,0x197caa8f,0x05fe54e4,0xd993ce53,0xb9e47c3d,0x6589e68a,
+ 0x9d1cebb9,0x4171710e,0x2106c360,0xfd6b59d7,0xe1e9a7bc,0x3d843d0b,0x5df38f65,0x819e15d2,
+ 0x64f673b3,0xb89be904,0xd8ec5b6a,0x0481c1dd,0x18033fb6,0xc46ea501,0xa419176f,0x78748dd8,
+ 0x6a08c61a,0xb6655cad,0xd612eec3,0x0a7f7474,0x16fd8a1f,0xca9010a8,0xaae7a2c6,0x768a3871,
+ 0x93e25e10,0x4f8fc4a7,0x2ff876c9,0xf395ec7e,0xef171215,0x337a88a2,0x530d3acc,0x8f60a07b},
+
+{0x00000000,0x490d678d,0x921acf1a,0xdb17a897,0x20f48383,0x69f9e40e,0xb2ee4c99,0xfbe32b14,
+ 0x41e90706,0x08e4608b,0xd3f3c81c,0x9afeaf91,0x611d8485,0x2810e308,0xf3074b9f,0xba0a2c12,
+ 0x83d20e0c,0xcadf6981,0x11c8c116,0x58c5a69b,0xa3268d8f,0xea2bea02,0x313c4295,0x78312518,
+ 0xc23b090a,0x8b366e87,0x5021c610,0x192ca19d,0xe2cf8a89,0xabc2ed04,0x70d54593,0x39d8221e,
+ 0x036501af,0x4a686622,0x917fceb5,0xd872a938,0x2391822c,0x6a9ce5a1,0xb18b4d36,0xf8862abb,
+ 0x428c06a9,0x0b816124,0xd096c9b3,0x999bae3e,0x6278852a,0x2b75e2a7,0xf0624a30,0xb96f2dbd,
+ 0x80b70fa3,0xc9ba682e,0x12adc0b9,0x5ba0a734,0xa0438c20,0xe94eebad,0x3259433a,0x7b5424b7,
+ 0xc15e08a5,0x88536f28,0x5344c7bf,0x1a49a032,0xe1aa8b26,0xa8a7ecab,0x73b0443c,0x3abd23b1,
+ 0x06ca035e,0x4fc764d3,0x94d0cc44,0xddddabc9,0x263e80dd,0x6f33e750,0xb4244fc7,0xfd29284a,
+ 0x47230458,0x0e2e63d5,0xd539cb42,0x9c34accf,0x67d787db,0x2edae056,0xf5cd48c1,0xbcc02f4c,
+ 0x85180d52,0xcc156adf,0x1702c248,0x5e0fa5c5,0xa5ec8ed1,0xece1e95c,0x37f641cb,0x7efb2646,
+ 0xc4f10a54,0x8dfc6dd9,0x56ebc54e,0x1fe6a2c3,0xe40589d7,0xad08ee5a,0x761f46cd,0x3f122140,
+ 0x05af02f1,0x4ca2657c,0x97b5cdeb,0xdeb8aa66,0x255b8172,0x6c56e6ff,0xb7414e68,0xfe4c29e5,
+ 0x444605f7,0x0d4b627a,0xd65ccaed,0x9f51ad60,0x64b28674,0x2dbfe1f9,0xf6a8496e,0xbfa52ee3,
+ 0x867d0cfd,0xcf706b70,0x1467c3e7,0x5d6aa46a,0xa6898f7e,0xef84e8f3,0x34934064,0x7d9e27e9,
+ 0xc7940bfb,0x8e996c76,0x558ec4e1,0x1c83a36c,0xe7608878,0xae6deff5,0x757a4762,0x3c7720ef,
+ 0x0d9406bc,0x44996131,0x9f8ec9a6,0xd683ae2b,0x2d60853f,0x646de2b2,0xbf7a4a25,0xf6772da8,
+ 0x4c7d01ba,0x05706637,0xde67cea0,0x976aa92d,0x6c898239,0x2584e5b4,0xfe934d23,0xb79e2aae,
+ 0x8e4608b0,0xc74b6f3d,0x1c5cc7aa,0x5551a027,0xaeb28b33,0xe7bfecbe,0x3ca84429,0x75a523a4,
+ 0xcfaf0fb6,0x86a2683b,0x5db5c0ac,0x14b8a721,0xef5b8c35,0xa656ebb8,0x7d41432f,0x344c24a2,
+ 0x0ef10713,0x47fc609e,0x9cebc809,0xd5e6af84,0x2e058490,0x6708e31d,0xbc1f4b8a,0xf5122c07,
+ 0x4f180015,0x06156798,0xdd02cf0f,0x940fa882,0x6fec8396,0x26e1e41b,0xfdf64c8c,0xb4fb2b01,
+ 0x8d23091f,0xc42e6e92,0x1f39c605,0x5634a188,0xadd78a9c,0xe4daed11,0x3fcd4586,0x76c0220b,
+ 0xccca0e19,0x85c76994,0x5ed0c103,0x17dda68e,0xec3e8d9a,0xa533ea17,0x7e244280,0x3729250d,
+ 0x0b5e05e2,0x4253626f,0x9944caf8,0xd049ad75,0x2baa8661,0x62a7e1ec,0xb9b0497b,0xf0bd2ef6,
+ 0x4ab702e4,0x03ba6569,0xd8adcdfe,0x91a0aa73,0x6a438167,0x234ee6ea,0xf8594e7d,0xb15429f0,
+ 0x888c0bee,0xc1816c63,0x1a96c4f4,0x539ba379,0xa878886d,0xe175efe0,0x3a624777,0x736f20fa,
+ 0xc9650ce8,0x80686b65,0x5b7fc3f2,0x1272a47f,0xe9918f6b,0xa09ce8e6,0x7b8b4071,0x328627fc,
+ 0x083b044d,0x413663c0,0x9a21cb57,0xd32cacda,0x28cf87ce,0x61c2e043,0xbad548d4,0xf3d82f59,
+ 0x49d2034b,0x00df64c6,0xdbc8cc51,0x92c5abdc,0x692680c8,0x202be745,0xfb3c4fd2,0xb231285f,
+ 0x8be90a41,0xc2e46dcc,0x19f3c55b,0x50fea2d6,0xab1d89c2,0xe210ee4f,0x390746d8,0x700a2155,
+ 0xca000d47,0x830d6aca,0x581ac25d,0x1117a5d0,0xeaf48ec4,0xa3f9e949,0x78ee41de,0x31e32653},
+
+{0x00000000,0x1b280d78,0x36501af0,0x2d781788,0x6ca035e0,0x77883898,0x5af02f10,0x41d82268,
+ 0xd9406bc0,0xc26866b8,0xef107130,0xf4387c48,0xb5e05e20,0xaec85358,0x83b044d0,0x989849a8,
+ 0xb641ca37,0xad69c74f,0x8011d0c7,0x9b39ddbf,0xdae1ffd7,0xc1c9f2af,0xecb1e527,0xf799e85f,
+ 0x6f01a1f7,0x7429ac8f,0x5951bb07,0x4279b67f,0x03a19417,0x1889996f,0x35f18ee7,0x2ed9839f,
+ 0x684289d9,0x736a84a1,0x5e129329,0x453a9e51,0x04e2bc39,0x1fcab141,0x32b2a6c9,0x299aabb1,
+ 0xb102e219,0xaa2aef61,0x8752f8e9,0x9c7af591,0xdda2d7f9,0xc68ada81,0xebf2cd09,0xf0dac071,
+ 0xde0343ee,0xc52b4e96,0xe853591e,0xf37b5466,0xb2a3760e,0xa98b7b76,0x84f36cfe,0x9fdb6186,
+ 0x0743282e,0x1c6b2556,0x311332de,0x2a3b3fa6,0x6be31dce,0x70cb10b6,0x5db3073e,0x469b0a46,
+ 0xd08513b2,0xcbad1eca,0xe6d50942,0xfdfd043a,0xbc252652,0xa70d2b2a,0x8a753ca2,0x915d31da,
+ 0x09c57872,0x12ed750a,0x3f956282,0x24bd6ffa,0x65654d92,0x7e4d40ea,0x53355762,0x481d5a1a,
+ 0x66c4d985,0x7decd4fd,0x5094c375,0x4bbcce0d,0x0a64ec65,0x114ce11d,0x3c34f695,0x271cfbed,
+ 0xbf84b245,0xa4acbf3d,0x89d4a8b5,0x92fca5cd,0xd32487a5,0xc80c8add,0xe5749d55,0xfe5c902d,
+ 0xb8c79a6b,0xa3ef9713,0x8e97809b,0x95bf8de3,0xd467af8b,0xcf4fa2f3,0xe237b57b,0xf91fb803,
+ 0x6187f1ab,0x7aaffcd3,0x57d7eb5b,0x4cffe623,0x0d27c44b,0x160fc933,0x3b77debb,0x205fd3c3,
+ 0x0e86505c,0x15ae5d24,0x38d64aac,0x23fe47d4,0x622665bc,0x790e68c4,0x54767f4c,0x4f5e7234,
+ 0xd7c63b9c,0xccee36e4,0xe196216c,0xfabe2c14,0xbb660e7c,0xa04e0304,0x8d36148c,0x961e19f4,
+ 0xa5cb3ad3,0xbee337ab,0x939b2023,0x88b32d5b,0xc96b0f33,0xd243024b,0xff3b15c3,0xe41318bb,
+ 0x7c8b5113,0x67a35c6b,0x4adb4be3,0x51f3469b,0x102b64f3,0x0b03698b,0x267b7e03,0x3d53737b,
+ 0x138af0e4,0x08a2fd9c,0x25daea14,0x3ef2e76c,0x7f2ac504,0x6402c87c,0x497adff4,0x5252d28c,
+ 0xcaca9b24,0xd1e2965c,0xfc9a81d4,0xe7b28cac,0xa66aaec4,0xbd42a3bc,0x903ab434,0x8b12b94c,
+ 0xcd89b30a,0xd6a1be72,0xfbd9a9fa,0xe0f1a482,0xa12986ea,0xba018b92,0x97799c1a,0x8c519162,
+ 0x14c9d8ca,0x0fe1d5b2,0x2299c23a,0x39b1cf42,0x7869ed2a,0x6341e052,0x4e39f7da,0x5511faa2,
+ 0x7bc8793d,0x60e07445,0x4d9863cd,0x56b06eb5,0x17684cdd,0x0c4041a5,0x2138562d,0x3a105b55,
+ 0xa28812fd,0xb9a01f85,0x94d8080d,0x8ff00575,0xce28271d,0xd5002a65,0xf8783ded,0xe3503095,
+ 0x754e2961,0x6e662419,0x431e3391,0x58363ee9,0x19ee1c81,0x02c611f9,0x2fbe0671,0x34960b09,
+ 0xac0e42a1,0xb7264fd9,0x9a5e5851,0x81765529,0xc0ae7741,0xdb867a39,0xf6fe6db1,0xedd660c9,
+ 0xc30fe356,0xd827ee2e,0xf55ff9a6,0xee77f4de,0xafafd6b6,0xb487dbce,0x99ffcc46,0x82d7c13e,
+ 0x1a4f8896,0x016785ee,0x2c1f9266,0x37379f1e,0x76efbd76,0x6dc7b00e,0x40bfa786,0x5b97aafe,
+ 0x1d0ca0b8,0x0624adc0,0x2b5cba48,0x3074b730,0x71ac9558,0x6a849820,0x47fc8fa8,0x5cd482d0,
+ 0xc44ccb78,0xdf64c600,0xf21cd188,0xe934dcf0,0xa8ecfe98,0xb3c4f3e0,0x9ebce468,0x8594e910,
+ 0xab4d6a8f,0xb06567f7,0x9d1d707f,0x86357d07,0xc7ed5f6f,0xdcc55217,0xf1bd459f,0xea9548e7,
+ 0x720d014f,0x69250c37,0x445d1bbf,0x5f7516c7,0x1ead34af,0x058539d7,0x28fd2e5f,0x33d52327},
+
+{0x00000000,0x4f576811,0x9eaed022,0xd1f9b833,0x399cbdf3,0x76cbd5e2,0xa7326dd1,0xe86505c0,
+ 0x73397be6,0x3c6e13f7,0xed97abc4,0xa2c0c3d5,0x4aa5c615,0x05f2ae04,0xd40b1637,0x9b5c7e26,
+ 0xe672f7cc,0xa9259fdd,0x78dc27ee,0x378b4fff,0xdfee4a3f,0x90b9222e,0x41409a1d,0x0e17f20c,
+ 0x954b8c2a,0xda1ce43b,0x0be55c08,0x44b23419,0xacd731d9,0xe38059c8,0x3279e1fb,0x7d2e89ea,
+ 0xc824f22f,0x87739a3e,0x568a220d,0x19dd4a1c,0xf1b84fdc,0xbeef27cd,0x6f169ffe,0x2041f7ef,
+ 0xbb1d89c9,0xf44ae1d8,0x25b359eb,0x6ae431fa,0x8281343a,0xcdd65c2b,0x1c2fe418,0x53788c09,
+ 0x2e5605e3,0x61016df2,0xb0f8d5c1,0xffafbdd0,0x17cab810,0x589dd001,0x89646832,0xc6330023,
+ 0x5d6f7e05,0x12381614,0xc3c1ae27,0x8c96c636,0x64f3c3f6,0x2ba4abe7,0xfa5d13d4,0xb50a7bc5,
+ 0x9488f9e9,0xdbdf91f8,0x0a2629cb,0x457141da,0xad14441a,0xe2432c0b,0x33ba9438,0x7cedfc29,
+ 0xe7b1820f,0xa8e6ea1e,0x791f522d,0x36483a3c,0xde2d3ffc,0x917a57ed,0x4083efde,0x0fd487cf,
+ 0x72fa0e25,0x3dad6634,0xec54de07,0xa303b616,0x4b66b3d6,0x0431dbc7,0xd5c863f4,0x9a9f0be5,
+ 0x01c375c3,0x4e941dd2,0x9f6da5e1,0xd03acdf0,0x385fc830,0x7708a021,0xa6f11812,0xe9a67003,
+ 0x5cac0bc6,0x13fb63d7,0xc202dbe4,0x8d55b3f5,0x6530b635,0x2a67de24,0xfb9e6617,0xb4c90e06,
+ 0x2f957020,0x60c21831,0xb13ba002,0xfe6cc813,0x1609cdd3,0x595ea5c2,0x88a71df1,0xc7f075e0,
+ 0xbadefc0a,0xf589941b,0x24702c28,0x6b274439,0x834241f9,0xcc1529e8,0x1dec91db,0x52bbf9ca,
+ 0xc9e787ec,0x86b0effd,0x574957ce,0x181e3fdf,0xf07b3a1f,0xbf2c520e,0x6ed5ea3d,0x2182822c,
+ 0x2dd0ee65,0x62878674,0xb37e3e47,0xfc295656,0x144c5396,0x5b1b3b87,0x8ae283b4,0xc5b5eba5,
+ 0x5ee99583,0x11befd92,0xc04745a1,0x8f102db0,0x67752870,0x28224061,0xf9dbf852,0xb68c9043,
+ 0xcba219a9,0x84f571b8,0x550cc98b,0x1a5ba19a,0xf23ea45a,0xbd69cc4b,0x6c907478,0x23c71c69,
+ 0xb89b624f,0xf7cc0a5e,0x2635b26d,0x6962da7c,0x8107dfbc,0xce50b7ad,0x1fa90f9e,0x50fe678f,
+ 0xe5f41c4a,0xaaa3745b,0x7b5acc68,0x340da479,0xdc68a1b9,0x933fc9a8,0x42c6719b,0x0d91198a,
+ 0x96cd67ac,0xd99a0fbd,0x0863b78e,0x4734df9f,0xaf51da5f,0xe006b24e,0x31ff0a7d,0x7ea8626c,
+ 0x0386eb86,0x4cd18397,0x9d283ba4,0xd27f53b5,0x3a1a5675,0x754d3e64,0xa4b48657,0xebe3ee46,
+ 0x70bf9060,0x3fe8f871,0xee114042,0xa1462853,0x49232d93,0x06744582,0xd78dfdb1,0x98da95a0,
+ 0xb958178c,0xf60f7f9d,0x27f6c7ae,0x68a1afbf,0x80c4aa7f,0xcf93c26e,0x1e6a7a5d,0x513d124c,
+ 0xca616c6a,0x8536047b,0x54cfbc48,0x1b98d459,0xf3fdd199,0xbcaab988,0x6d5301bb,0x220469aa,
+ 0x5f2ae040,0x107d8851,0xc1843062,0x8ed35873,0x66b65db3,0x29e135a2,0xf8188d91,0xb74fe580,
+ 0x2c139ba6,0x6344f3b7,0xb2bd4b84,0xfdea2395,0x158f2655,0x5ad84e44,0x8b21f677,0xc4769e66,
+ 0x717ce5a3,0x3e2b8db2,0xefd23581,0xa0855d90,0x48e05850,0x07b73041,0xd64e8872,0x9919e063,
+ 0x02459e45,0x4d12f654,0x9ceb4e67,0xd3bc2676,0x3bd923b6,0x748e4ba7,0xa577f394,0xea209b85,
+ 0x970e126f,0xd8597a7e,0x09a0c24d,0x46f7aa5c,0xae92af9c,0xe1c5c78d,0x303c7fbe,0x7f6b17af,
+ 0xe4376989,0xab600198,0x7a99b9ab,0x35ced1ba,0xddabd47a,0x92fcbc6b,0x43050458,0x0c526c49},
+
+{0x00000000,0x5ba1dcca,0xb743b994,0xece2655e,0x6a466e9f,0x31e7b255,0xdd05d70b,0x86a40bc1,
+ 0xd48cdd3e,0x8f2d01f4,0x63cf64aa,0x386eb860,0xbecab3a1,0xe56b6f6b,0x09890a35,0x5228d6ff,
+ 0xadd8a7cb,0xf6797b01,0x1a9b1e5f,0x413ac295,0xc79ec954,0x9c3f159e,0x70dd70c0,0x2b7cac0a,
+ 0x79547af5,0x22f5a63f,0xce17c361,0x95b61fab,0x1312146a,0x48b3c8a0,0xa451adfe,0xfff07134,
+ 0x5f705221,0x04d18eeb,0xe833ebb5,0xb392377f,0x35363cbe,0x6e97e074,0x8275852a,0xd9d459e0,
+ 0x8bfc8f1f,0xd05d53d5,0x3cbf368b,0x671eea41,0xe1bae180,0xba1b3d4a,0x56f95814,0x0d5884de,
+ 0xf2a8f5ea,0xa9092920,0x45eb4c7e,0x1e4a90b4,0x98ee9b75,0xc34f47bf,0x2fad22e1,0x740cfe2b,
+ 0x262428d4,0x7d85f41e,0x91679140,0xcac64d8a,0x4c62464b,0x17c39a81,0xfb21ffdf,0xa0802315,
+ 0xbee0a442,0xe5417888,0x09a31dd6,0x5202c11c,0xd4a6cadd,0x8f071617,0x63e57349,0x3844af83,
+ 0x6a6c797c,0x31cda5b6,0xdd2fc0e8,0x868e1c22,0x002a17e3,0x5b8bcb29,0xb769ae77,0xecc872bd,
+ 0x13380389,0x4899df43,0xa47bba1d,0xffda66d7,0x797e6d16,0x22dfb1dc,0xce3dd482,0x959c0848,
+ 0xc7b4deb7,0x9c15027d,0x70f76723,0x2b56bbe9,0xadf2b028,0xf6536ce2,0x1ab109bc,0x4110d576,
+ 0xe190f663,0xba312aa9,0x56d34ff7,0x0d72933d,0x8bd698fc,0xd0774436,0x3c952168,0x6734fda2,
+ 0x351c2b5d,0x6ebdf797,0x825f92c9,0xd9fe4e03,0x5f5a45c2,0x04fb9908,0xe819fc56,0xb3b8209c,
+ 0x4c4851a8,0x17e98d62,0xfb0be83c,0xa0aa34f6,0x260e3f37,0x7dafe3fd,0x914d86a3,0xcaec5a69,
+ 0x98c48c96,0xc365505c,0x2f873502,0x7426e9c8,0xf282e209,0xa9233ec3,0x45c15b9d,0x1e608757,
+ 0x79005533,0x22a189f9,0xce43eca7,0x95e2306d,0x13463bac,0x48e7e766,0xa4058238,0xffa45ef2,
+ 0xad8c880d,0xf62d54c7,0x1acf3199,0x416eed53,0xc7cae692,0x9c6b3a58,0x70895f06,0x2b2883cc,
+ 0xd4d8f2f8,0x8f792e32,0x639b4b6c,0x383a97a6,0xbe9e9c67,0xe53f40ad,0x09dd25f3,0x527cf939,
+ 0x00542fc6,0x5bf5f30c,0xb7179652,0xecb64a98,0x6a124159,0x31b39d93,0xdd51f8cd,0x86f02407,
+ 0x26700712,0x7dd1dbd8,0x9133be86,0xca92624c,0x4c36698d,0x1797b547,0xfb75d019,0xa0d40cd3,
+ 0xf2fcda2c,0xa95d06e6,0x45bf63b8,0x1e1ebf72,0x98bab4b3,0xc31b6879,0x2ff90d27,0x7458d1ed,
+ 0x8ba8a0d9,0xd0097c13,0x3ceb194d,0x674ac587,0xe1eece46,0xba4f128c,0x56ad77d2,0x0d0cab18,
+ 0x5f247de7,0x0485a12d,0xe867c473,0xb3c618b9,0x35621378,0x6ec3cfb2,0x8221aaec,0xd9807626,
+ 0xc7e0f171,0x9c412dbb,0x70a348e5,0x2b02942f,0xada69fee,0xf6074324,0x1ae5267a,0x4144fab0,
+ 0x136c2c4f,0x48cdf085,0xa42f95db,0xff8e4911,0x792a42d0,0x228b9e1a,0xce69fb44,0x95c8278e,
+ 0x6a3856ba,0x31998a70,0xdd7bef2e,0x86da33e4,0x007e3825,0x5bdfe4ef,0xb73d81b1,0xec9c5d7b,
+ 0xbeb48b84,0xe515574e,0x09f73210,0x5256eeda,0xd4f2e51b,0x8f5339d1,0x63b15c8f,0x38108045,
+ 0x9890a350,0xc3317f9a,0x2fd31ac4,0x7472c60e,0xf2d6cdcf,0xa9771105,0x4595745b,0x1e34a891,
+ 0x4c1c7e6e,0x17bda2a4,0xfb5fc7fa,0xa0fe1b30,0x265a10f1,0x7dfbcc3b,0x9119a965,0xcab875af,
+ 0x3548049b,0x6ee9d851,0x820bbd0f,0xd9aa61c5,0x5f0e6a04,0x04afb6ce,0xe84dd390,0xb3ec0f5a,
+ 0xe1c4d9a5,0xba65056f,0x56876031,0x0d26bcfb,0x8b82b73a,0xd0236bf0,0x3cc10eae,0x6760d264}};
diff --git a/media/libogg/include/ogg/config_types.h b/media/libogg/include/ogg/config_types.h
index 1e7d490989..1a87df6423 100644
--- a/media/libogg/include/ogg/config_types.h
+++ b/media/libogg/include/ogg/config_types.h
@@ -1,7 +1,7 @@
#ifndef __CONFIG_TYPES_H__
#define __CONFIG_TYPES_H__
-/* these are filled in by configure */
+/* these are filled in by configure or cmake */
#define INCLUDE_INTTYPES_H 1
#define INCLUDE_STDINT_H 1
#define INCLUDE_SYS_TYPES_H 1
@@ -16,10 +16,11 @@
# include <sys/types.h>
#endif
-typedef short ogg_int16_t;
-typedef unsigned short ogg_uint16_t;
-typedef int ogg_int32_t;
-typedef unsigned int ogg_uint32_t;
-typedef long long ogg_int64_t;
+typedef int16_t ogg_int16_t;
+typedef uint16_t ogg_uint16_t;
+typedef int32_t ogg_int32_t;
+typedef uint32_t ogg_uint32_t;
+typedef int64_t ogg_int64_t;
+typedef uint64_t ogg_uint64_t;
#endif
diff --git a/media/libogg/include/ogg/ogg.h b/media/libogg/include/ogg/ogg.h
index cebe38ee1e..2789fc1ca2 100644
--- a/media/libogg/include/ogg/ogg.h
+++ b/media/libogg/include/ogg/ogg.h
@@ -11,7 +11,6 @@
********************************************************************
function: toplevel libogg include
- last mod: $Id: ogg.h 18044 2011-08-01 17:55:20Z gmaxwell $
********************************************************************/
#ifndef _OGG_H
diff --git a/media/libogg/include/ogg/os_types.h b/media/libogg/include/ogg/os_types.h
index 62d717b684..a8d4600fb0 100644
--- a/media/libogg/include/ogg/os_types.h
+++ b/media/libogg/include/ogg/os_types.h
@@ -10,8 +10,7 @@
* *
********************************************************************
- function: #ifdef jail to whip a few platforms into the UNIX ideal.
- last mod: $Id: os_types.h 19098 2014-02-26 19:06:45Z giles $
+ function: Define a consistent set of types on each platform.
********************************************************************/
#ifndef _OS_TYPES_H
@@ -65,36 +64,40 @@ extern ogg_free_function_type *ogg_free_func;
typedef unsigned long long ogg_uint64_t;
# elif defined(__MWERKS__)
typedef long long ogg_int64_t;
+ typedef unsigned long long ogg_uint64_t;
typedef int ogg_int32_t;
typedef unsigned int ogg_uint32_t;
typedef short ogg_int16_t;
typedef unsigned short ogg_uint16_t;
# else
- /* MSVC/Borland */
- typedef __int64 ogg_int64_t;
- typedef __int32 ogg_int32_t;
- typedef unsigned __int32 ogg_uint32_t;
- typedef __int16 ogg_int16_t;
- typedef unsigned __int16 ogg_uint16_t;
+# if defined(_MSC_VER) && (_MSC_VER >= 1800) /* MSVC 2013 and newer */
+# include <stdint.h>
+ typedef int16_t ogg_int16_t;
+ typedef uint16_t ogg_uint16_t;
+ typedef int32_t ogg_int32_t;
+ typedef uint32_t ogg_uint32_t;
+ typedef int64_t ogg_int64_t;
+ typedef uint64_t ogg_uint64_t;
+# else
+ /* MSVC/Borland */
+ typedef __int64 ogg_int64_t;
+ typedef __int32 ogg_int32_t;
+ typedef unsigned __int32 ogg_uint32_t;
+ typedef unsigned __int64 ogg_uint64_t;
+ typedef __int16 ogg_int16_t;
+ typedef unsigned __int16 ogg_uint16_t;
+# endif
# endif
-#elif defined(__MACOS__)
-
-# include <sys/types.h>
- typedef SInt16 ogg_int16_t;
- typedef UInt16 ogg_uint16_t;
- typedef SInt32 ogg_int32_t;
- typedef UInt32 ogg_uint32_t;
- typedef SInt64 ogg_int64_t;
-
#elif (defined(__APPLE__) && defined(__MACH__)) /* MacOS X Framework build */
-# include <inttypes.h>
+# include <sys/types.h>
typedef int16_t ogg_int16_t;
- typedef uint16_t ogg_uint16_t;
+ typedef u_int16_t ogg_uint16_t;
typedef int32_t ogg_int32_t;
- typedef uint32_t ogg_uint32_t;
+ typedef u_int32_t ogg_uint32_t;
typedef int64_t ogg_int64_t;
+ typedef u_int64_t ogg_uint64_t;
#elif defined(__sun__)
@@ -115,6 +118,7 @@ extern ogg_free_function_type *ogg_free_func;
typedef int ogg_int32_t;
typedef unsigned int ogg_uint32_t;
typedef long long ogg_int64_t;
+ typedef unsigned long long ogg_uint64_t;
#elif defined(__BEOS__)
@@ -125,6 +129,7 @@ extern ogg_free_function_type *ogg_free_func;
typedef int32_t ogg_int32_t;
typedef uint32_t ogg_uint32_t;
typedef int64_t ogg_int64_t;
+ typedef uint64_t ogg_uint64_t;
#elif defined (__EMX__)
@@ -134,6 +139,8 @@ extern ogg_free_function_type *ogg_free_func;
typedef int ogg_int32_t;
typedef unsigned int ogg_uint32_t;
typedef long long ogg_int64_t;
+ typedef unsigned long long ogg_uint64_t;
+
#elif defined (DJGPP)
@@ -142,11 +149,13 @@ extern ogg_free_function_type *ogg_free_func;
typedef int ogg_int32_t;
typedef unsigned int ogg_uint32_t;
typedef long long ogg_int64_t;
+ typedef unsigned long long ogg_uint64_t;
#elif defined(R5900)
/* PS2 EE */
typedef long ogg_int64_t;
+ typedef unsigned long ogg_uint64_t;
typedef int ogg_int32_t;
typedef unsigned ogg_uint32_t;
typedef short ogg_int16_t;
@@ -159,6 +168,7 @@ extern ogg_free_function_type *ogg_free_func;
typedef signed int ogg_int32_t;
typedef unsigned int ogg_uint32_t;
typedef long long int ogg_int64_t;
+ typedef unsigned long long int ogg_uint64_t;
#elif defined(__TMS320C6X__)
@@ -168,6 +178,7 @@ extern ogg_free_function_type *ogg_free_func;
typedef signed int ogg_int32_t;
typedef unsigned int ogg_uint32_t;
typedef long long int ogg_int64_t;
+ typedef unsigned long long int ogg_uint64_t;
#else
diff --git a/media/libogg/moz.build b/media/libogg/moz.build
index abc8d02850..839d2b2aab 100644
--- a/media/libogg/moz.build
+++ b/media/libogg/moz.build
@@ -6,6 +6,10 @@
with Files('*'):
BUG_COMPONENT = ('Core', 'Video/Audio')
+EXPORTS += [
+ 'include/crctable.h',
+]
+
EXPORTS.ogg += [
'include/ogg/config_types.h',
'include/ogg/ogg.h',
diff --git a/media/libogg/src/ogg_bitwise.c b/media/libogg/src/ogg_bitwise.c
index 145901d185..f5ef79122e 100644
--- a/media/libogg/src/ogg_bitwise.c
+++ b/media/libogg/src/ogg_bitwise.c
@@ -11,7 +11,6 @@
********************************************************************
function: packing variable sized words into an octet stream
- last mod: $Id: bitwise.c 19149 2014-05-27 16:26:23Z giles $
********************************************************************/
@@ -890,7 +889,7 @@ int main(void){
for(i=0;i<test2size;i++){
if(oggpack_look(&r,32)==-1)report("out of data. failed!");
if(oggpack_look(&r,32)!=large[i]){
- fprintf(stderr,"%ld != %ld (%lx!=%lx):",oggpack_look(&r,32),large[i],
+ fprintf(stderr,"%ld != %lu (%lx!=%lx):",oggpack_look(&r,32),large[i],
oggpack_look(&r,32),large[i]);
report("read incorrect value!\n");
}
@@ -1000,7 +999,7 @@ int main(void){
for(i=0;i<test2size;i++){
if(oggpackB_look(&r,32)==-1)report("out of data. failed!");
if(oggpackB_look(&r,32)!=large[i]){
- fprintf(stderr,"%ld != %ld (%lx!=%lx):",oggpackB_look(&r,32),large[i],
+ fprintf(stderr,"%ld != %lu (%lx!=%lx):",oggpackB_look(&r,32),large[i],
oggpackB_look(&r,32),large[i]);
report("read incorrect value!\n");
}
diff --git a/media/libogg/src/ogg_framing.c b/media/libogg/src/ogg_framing.c
index 3a2f0a6058..724d116d7f 100644
--- a/media/libogg/src/ogg_framing.c
+++ b/media/libogg/src/ogg_framing.c
@@ -5,14 +5,13 @@
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
- * THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2010 *
+ * THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2018 *
* by the Xiph.Org Foundation http://www.xiph.org/ *
* *
********************************************************************
function: code raw packets into framed OggSquish stream and
decode Ogg streams back into raw packets
- last mod: $Id: framing.c 18758 2013-01-08 16:29:56Z tterribe $
note: The CRC code is directly derived from public domain code by
Ross Williams (ross@guest.adelaide.edu.au). See docs/framing.html
@@ -20,6 +19,10 @@
********************************************************************/
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
#include <stdlib.h>
#include <limits.h>
#include <string.h>
@@ -45,7 +48,7 @@ int ogg_page_eos(const ogg_page *og){
ogg_int64_t ogg_page_granulepos(const ogg_page *og){
unsigned char *page=og->header;
- ogg_int64_t granulepos=page[13]&(0xff);
+ ogg_uint64_t granulepos=page[13]&(0xff);
granulepos= (granulepos<<8)|(page[12]&0xff);
granulepos= (granulepos<<8)|(page[11]&0xff);
granulepos= (granulepos<<8)|(page[10]&0xff);
@@ -53,21 +56,21 @@ ogg_int64_t ogg_page_granulepos(const ogg_page *og){
granulepos= (granulepos<<8)|(page[8]&0xff);
granulepos= (granulepos<<8)|(page[7]&0xff);
granulepos= (granulepos<<8)|(page[6]&0xff);
- return(granulepos);
+ return((ogg_int64_t)granulepos);
}
int ogg_page_serialno(const ogg_page *og){
- return(og->header[14] |
- (og->header[15]<<8) |
- (og->header[16]<<16) |
- (og->header[17]<<24));
+ return((int)((ogg_uint32_t)og->header[14]) |
+ ((ogg_uint32_t)og->header[15]<<8) |
+ ((ogg_uint32_t)og->header[16]<<16) |
+ ((ogg_uint32_t)og->header[17]<<24));
}
long ogg_page_pageno(const ogg_page *og){
- return(og->header[18] |
- (og->header[19]<<8) |
- (og->header[20]<<16) |
- (og->header[21]<<24));
+ return((long)((ogg_uint32_t)og->header[18]) |
+ ((ogg_uint32_t)og->header[19]<<8) |
+ ((ogg_uint32_t)og->header[20]<<16) |
+ ((ogg_uint32_t)og->header[21]<<24));
}
@@ -99,90 +102,31 @@ int ogg_page_packets(const ogg_page *og){
#if 0
/* helper to initialize lookup for direct-table CRC (illustrative; we
- use the static init below) */
-
-static ogg_uint32_t _ogg_crc_entry(unsigned long index){
- int i;
- unsigned long r;
-
- r = index << 24;
- for (i=0; i<8; i++)
- if (r & 0x80000000UL)
- r = (r << 1) ^ 0x04c11db7; /* The same as the ethernet generator
- polynomial, although we use an
- unreflected alg and an init/final
- of 0, not 0xffffffff */
- else
- r<<=1;
- return (r & 0xffffffffUL);
+ use the static init in crctable.h) */
+
+static void _ogg_crc_init(){
+ int i, j;
+ ogg_uint32_t polynomial, crc;
+ polynomial = 0x04c11db7; /* The same as the ethernet generator
+ polynomial, although we use an
+ unreflected alg and an init/final
+ of 0, not 0xffffffff */
+ for (i = 0; i <= 0xFF; i++){
+ crc = i << 24;
+
+ for (j = 0; j < 8; j++)
+ crc = (crc << 1) ^ (crc & (1 << 31) ? polynomial : 0);
+
+ crc_lookup[0][i] = crc;
+ }
+
+ for (i = 0; i <= 0xFF; i++)
+ for (j = 1; j < 8; j++)
+ crc_lookup[j][i] = crc_lookup[0][(crc_lookup[j - 1][i] >> 24) & 0xFF] ^ (crc_lookup[j - 1][i] << 8);
}
#endif
-static const ogg_uint32_t crc_lookup[256]={
- 0x00000000,0x04c11db7,0x09823b6e,0x0d4326d9,
- 0x130476dc,0x17c56b6b,0x1a864db2,0x1e475005,
- 0x2608edb8,0x22c9f00f,0x2f8ad6d6,0x2b4bcb61,
- 0x350c9b64,0x31cd86d3,0x3c8ea00a,0x384fbdbd,
- 0x4c11db70,0x48d0c6c7,0x4593e01e,0x4152fda9,
- 0x5f15adac,0x5bd4b01b,0x569796c2,0x52568b75,
- 0x6a1936c8,0x6ed82b7f,0x639b0da6,0x675a1011,
- 0x791d4014,0x7ddc5da3,0x709f7b7a,0x745e66cd,
- 0x9823b6e0,0x9ce2ab57,0x91a18d8e,0x95609039,
- 0x8b27c03c,0x8fe6dd8b,0x82a5fb52,0x8664e6e5,
- 0xbe2b5b58,0xbaea46ef,0xb7a96036,0xb3687d81,
- 0xad2f2d84,0xa9ee3033,0xa4ad16ea,0xa06c0b5d,
- 0xd4326d90,0xd0f37027,0xddb056fe,0xd9714b49,
- 0xc7361b4c,0xc3f706fb,0xceb42022,0xca753d95,
- 0xf23a8028,0xf6fb9d9f,0xfbb8bb46,0xff79a6f1,
- 0xe13ef6f4,0xe5ffeb43,0xe8bccd9a,0xec7dd02d,
- 0x34867077,0x30476dc0,0x3d044b19,0x39c556ae,
- 0x278206ab,0x23431b1c,0x2e003dc5,0x2ac12072,
- 0x128e9dcf,0x164f8078,0x1b0ca6a1,0x1fcdbb16,
- 0x018aeb13,0x054bf6a4,0x0808d07d,0x0cc9cdca,
- 0x7897ab07,0x7c56b6b0,0x71159069,0x75d48dde,
- 0x6b93dddb,0x6f52c06c,0x6211e6b5,0x66d0fb02,
- 0x5e9f46bf,0x5a5e5b08,0x571d7dd1,0x53dc6066,
- 0x4d9b3063,0x495a2dd4,0x44190b0d,0x40d816ba,
- 0xaca5c697,0xa864db20,0xa527fdf9,0xa1e6e04e,
- 0xbfa1b04b,0xbb60adfc,0xb6238b25,0xb2e29692,
- 0x8aad2b2f,0x8e6c3698,0x832f1041,0x87ee0df6,
- 0x99a95df3,0x9d684044,0x902b669d,0x94ea7b2a,
- 0xe0b41de7,0xe4750050,0xe9362689,0xedf73b3e,
- 0xf3b06b3b,0xf771768c,0xfa325055,0xfef34de2,
- 0xc6bcf05f,0xc27dede8,0xcf3ecb31,0xcbffd686,
- 0xd5b88683,0xd1799b34,0xdc3abded,0xd8fba05a,
- 0x690ce0ee,0x6dcdfd59,0x608edb80,0x644fc637,
- 0x7a089632,0x7ec98b85,0x738aad5c,0x774bb0eb,
- 0x4f040d56,0x4bc510e1,0x46863638,0x42472b8f,
- 0x5c007b8a,0x58c1663d,0x558240e4,0x51435d53,
- 0x251d3b9e,0x21dc2629,0x2c9f00f0,0x285e1d47,
- 0x36194d42,0x32d850f5,0x3f9b762c,0x3b5a6b9b,
- 0x0315d626,0x07d4cb91,0x0a97ed48,0x0e56f0ff,
- 0x1011a0fa,0x14d0bd4d,0x19939b94,0x1d528623,
- 0xf12f560e,0xf5ee4bb9,0xf8ad6d60,0xfc6c70d7,
- 0xe22b20d2,0xe6ea3d65,0xeba91bbc,0xef68060b,
- 0xd727bbb6,0xd3e6a601,0xdea580d8,0xda649d6f,
- 0xc423cd6a,0xc0e2d0dd,0xcda1f604,0xc960ebb3,
- 0xbd3e8d7e,0xb9ff90c9,0xb4bcb610,0xb07daba7,
- 0xae3afba2,0xaafbe615,0xa7b8c0cc,0xa379dd7b,
- 0x9b3660c6,0x9ff77d71,0x92b45ba8,0x9675461f,
- 0x8832161a,0x8cf30bad,0x81b02d74,0x857130c3,
- 0x5d8a9099,0x594b8d2e,0x5408abf7,0x50c9b640,
- 0x4e8ee645,0x4a4ffbf2,0x470cdd2b,0x43cdc09c,
- 0x7b827d21,0x7f436096,0x7200464f,0x76c15bf8,
- 0x68860bfd,0x6c47164a,0x61043093,0x65c52d24,
- 0x119b4be9,0x155a565e,0x18197087,0x1cd86d30,
- 0x029f3d35,0x065e2082,0x0b1d065b,0x0fdc1bec,
- 0x3793a651,0x3352bbe6,0x3e119d3f,0x3ad08088,
- 0x2497d08d,0x2056cd3a,0x2d15ebe3,0x29d4f654,
- 0xc5a92679,0xc1683bce,0xcc2b1d17,0xc8ea00a0,
- 0xd6ad50a5,0xd26c4d12,0xdf2f6bcb,0xdbee767c,
- 0xe3a1cbc1,0xe760d676,0xea23f0af,0xeee2ed18,
- 0xf0a5bd1d,0xf464a0aa,0xf9278673,0xfde69bc4,
- 0x89b8fd09,0x8d79e0be,0x803ac667,0x84fbdbd0,
- 0x9abc8bd5,0x9e7d9662,0x933eb0bb,0x97ffad0c,
- 0xafb010b1,0xab710d06,0xa6322bdf,0xa2f33668,
- 0xbcb4666d,0xb8757bda,0xb5365d03,0xb1f740b4};
+#include "crctable.h"
/* init the encode/decode logical stream state */
@@ -290,10 +234,27 @@ static int _os_lacing_expand(ogg_stream_state *os,long needed){
/* Direct table CRC; note that this will be faster in the future if we
perform the checksum simultaneously with other copies */
+static ogg_uint32_t _os_update_crc(ogg_uint32_t crc, unsigned char *buffer, int size){
+ while (size>=8){
+ crc^=((ogg_uint32_t)buffer[0]<<24)|((ogg_uint32_t)buffer[1]<<16)|((ogg_uint32_t)buffer[2]<<8)|((ogg_uint32_t)buffer[3]);
+
+ crc=crc_lookup[7][ crc>>24 ]^crc_lookup[6][(crc>>16)&0xFF]^
+ crc_lookup[5][(crc>> 8)&0xFF]^crc_lookup[4][ crc &0xFF]^
+ crc_lookup[3][buffer[4] ]^crc_lookup[2][buffer[5] ]^
+ crc_lookup[1][buffer[6] ]^crc_lookup[0][buffer[7] ];
+
+ buffer+=8;
+ size-=8;
+ }
+
+ while (size--)
+ crc=(crc<<8)^crc_lookup[0][((crc >> 24)&0xff)^*buffer++];
+ return crc;
+}
+
void ogg_page_checksum_set(ogg_page *og){
if(og){
ogg_uint32_t crc_reg=0;
- int i;
/* safety; needed for API behavior, but not framing code */
og->header[22]=0;
@@ -301,10 +262,8 @@ void ogg_page_checksum_set(ogg_page *og){
og->header[24]=0;
og->header[25]=0;
- for(i=0;i<og->header_len;i++)
- crc_reg=(crc_reg<<8)^crc_lookup[((crc_reg >> 24)&0xff)^og->header[i]];
- for(i=0;i<og->body_len;i++)
- crc_reg=(crc_reg<<8)^crc_lookup[((crc_reg >> 24)&0xff)^og->body[i]];
+ crc_reg=_os_update_crc(crc_reg,og->header,og->header_len);
+ crc_reg=_os_update_crc(crc_reg,og->body,og->body_len);
og->header[22]=(unsigned char)(crc_reg&0xff);
og->header[23]=(unsigned char)((crc_reg>>8)&0xff);
@@ -414,9 +373,9 @@ static int ogg_stream_flush_i(ogg_stream_state *os,ogg_page *og, int force, int
}else{
/* The extra packets_done, packet_just_done logic here attempts to do two things:
- 1) Don't unneccessarily span pages.
+ 1) Don't unnecessarily span pages.
2) Unless necessary, don't flush pages if there are less than four packets on
- them; this expands page size to reduce unneccessary overhead if incoming packets
+ them; this expands page size to reduce unnecessary overhead if incoming packets
are large.
These are not necessary behaviors, just 'always better than naive flushing'
without requiring an application to explicitly request a specific optimized
@@ -638,9 +597,14 @@ char *ogg_sync_buffer(ogg_sync_state *oy, long size){
if(size>oy->storage-oy->fill){
/* We need to extend the internal buffer */
- long newsize=size+oy->fill+4096; /* an extra page to be nice */
+ long newsize;
void *ret;
+ if(size>INT_MAX-4096-oy->fill){
+ ogg_sync_clear(oy);
+ return NULL;
+ }
+ newsize=size+oy->fill+4096; /* an extra page to be nice */
if(oy->data)
ret=_ogg_realloc(oy->data,newsize);
else
@@ -723,16 +687,15 @@ long ogg_sync_pageseek(ogg_sync_state *oy,ogg_page *og){
/* replace the computed checksum with the one actually read in */
memcpy(page+22,chksum,4);
+#ifndef DISABLE_CRC
/* Bad checksum. Lose sync */
goto sync_fail;
+#endif
}
}
/* yes, have a whole page all ready to go */
{
- unsigned char *page=oy->data+oy->returned;
- long bytes;
-
if(og){
og->header=page;
og->header_len=oy->headerbytes;
@@ -875,6 +838,7 @@ int ogg_stream_pagein(ogg_stream_state *os, ogg_page *og){
some segments */
if(continued){
if(os->lacing_fill<1 ||
+ (os->lacing_vals[os->lacing_fill-1]&0xff)<255 ||
os->lacing_vals[os->lacing_fill-1]==0x400){
bos=0;
for(;segptr<segments;segptr++){
@@ -1492,6 +1456,34 @@ const int head3_7[] = {0x4f,0x67,0x67,0x53,0,0x05,
1,
0};
+int compare_packet(const ogg_packet *op1, const ogg_packet *op2){
+ if(op1->packet!=op2->packet){
+ fprintf(stderr,"op1->packet != op2->packet\n");
+ return(1);
+ }
+ if(op1->bytes!=op2->bytes){
+ fprintf(stderr,"op1->bytes != op2->bytes\n");
+ return(1);
+ }
+ if(op1->b_o_s!=op2->b_o_s){
+ fprintf(stderr,"op1->b_o_s != op2->b_o_s\n");
+ return(1);
+ }
+ if(op1->e_o_s!=op2->e_o_s){
+ fprintf(stderr,"op1->e_o_s != op2->e_o_s\n");
+ return(1);
+ }
+ if(op1->granulepos!=op2->granulepos){
+ fprintf(stderr,"op1->granulepos != op2->granulepos\n");
+ return(1);
+ }
+ if(op1->packetno!=op2->packetno){
+ fprintf(stderr,"op1->packetno != op2->packetno\n");
+ return(1);
+ }
+ return(0);
+}
+
void test_pack(const int *pl, const int **headers, int byteskip,
int pageskip, int packetskip){
unsigned char *data=_ogg_malloc(1024*1024); /* for scripted test cases only */
@@ -1577,7 +1569,7 @@ void test_pack(const int *pl, const int **headers, int byteskip,
byteskipcount=byteskip;
}
- ogg_sync_wrote(&oy,next-buf);
+ ogg_sync_wrote(&oy,(long)(next-buf));
while(1){
int ret=ogg_sync_pageout(&oy,&og_de);
@@ -1600,7 +1592,7 @@ void test_pack(const int *pl, const int **headers, int byteskip,
ogg_stream_packetout(&os_de,&op_de); /* just catching them all */
/* verify peek and out match */
- if(memcmp(&op_de,&op_de2,sizeof(op_de))){
+ if(compare_packet(&op_de,&op_de2)){
fprintf(stderr,"packetout != packetpeek! pos=%ld\n",
depacket);
exit(1);
@@ -1785,6 +1777,7 @@ int main(void){
test_pack(packets,headret,0,0,0);
}
+#ifndef DISABLE_CRC
{
/* test for the libogg 1.1.1 resync in large continuation bug
found by Josh Coalson) */
@@ -1794,6 +1787,9 @@ int main(void){
fprintf(stderr,"testing continuation resync in very large packets... ");
test_pack(packets,headret,100,2,3);
}
+#else
+ fprintf(stderr,"Skipping continuation resync test due to --disable-crc\n");
+#endif
{
/* term only page. why not? */
@@ -2055,6 +2051,7 @@ int main(void){
fprintf(stderr,"ok.\n");
}
+#ifndef DISABLE_CRC
/* Test recapture: page + garbage + page */
{
ogg_page og_de;
@@ -2096,6 +2093,9 @@ int main(void){
fprintf(stderr,"ok.\n");
}
+#else
+ fprintf(stderr,"Skipping recapture test due to --disable-crc\n");
+#endif
/* Free page data that was previously copied */
{
@@ -2104,6 +2104,9 @@ int main(void){
}
}
}
+ ogg_sync_clear(&oy);
+ ogg_stream_clear(&os_en);
+ ogg_stream_clear(&os_de);
return(0);
}
diff --git a/media/libogg/update.sh b/media/libogg/update.sh
index 5a7184bcbd..5e12863a03 100644..100755
--- a/media/libogg/update.sh
+++ b/media/libogg/update.sh
@@ -2,12 +2,16 @@
#
# Copies the needed files from a directory containing the original
# libogg source that we need for the Mozilla HTML5 media support.
+#
+# Before executing this script, make sure you've already run ./configure
+# on the libogg source to ensure config_types.h exists.
+cp $1/src/crctable.h ./include/crctable.h
cp $1/include/ogg/config_types.h ./include/ogg/config_types.h
cp $1/include/ogg/ogg.h ./include/ogg/ogg.h
cp $1/include/ogg/os_types.h ./include/ogg/os_types.h
cp $1/CHANGES ./CHANGES
cp $1/COPYING ./COPYING
-cp $1/README ./README
+cp $1/README.md ./README.md
cp $1/src/bitwise.c ./src/ogg_bitwise.c
cp $1/src/framing.c ./src/ogg_framing.c
cp $1/AUTHORS ./AUTHORS
diff --git a/media/libvorbis/CHANGES b/media/libvorbis/CHANGES
new file mode 100644
index 0000000000..ba0c3ca01a
--- /dev/null
+++ b/media/libvorbis/CHANGES
@@ -0,0 +1,185 @@
+libvorbis 1.3.7 (2020-07-04) -- "Xiph.Org libVorbis I 20200704 (Reducing Environment)"
+
+* Fix CVE-2018-10393 - out-of-bounds read encoding very low sample rates.
+* Fix CVE-2017-14160 - out-of-bounds read encoding very low sample rates.
+* Fix CVE-2018-10392 - out-of-bounds access encoding invalid channel count.
+* Fix handling invalid bytes per sample arguments.
+* Fix handling invalid channel count arguments.
+* Fix invalid free on seek failure.
+* Fix negative shift reading blocksize.
+* Fix accepting unreasonable float32 values.
+* Fix tag comparison depending on locale.
+* Fix unnecessarily linking libm.
+* Fix memory leak in test_sharedbook.
+* Update Visual Studio projects for ogg library filename change.
+* Distribute CMake build files with the source package.
+* Remove unnecessary configure --target switch.
+* Add gitlab CI support.
+* Add OSS-Fuzz support.
+* Build system and integration updates.
+
+libvorbis 1.3.6 (2018-03-16) -- "Xiph.Org libVorbis I 20180316 (Now 100% fewer shells)"
+
+* Fix CVE-2018-5146 - out-of-bounds write on codebook decoding.
+* Fix CVE-2017-14632 - free() on uninitialized data.
+* Fix CVE-2017-14633 - out-of-bounds read.
+* Fix bitrate metadata parsing.
+* Fix out-of-bounds read in codebook parsing.
+* Fix residue vector size in Vorbis I spec.
+* Appveyor support
+* Travis CI support
+* Add secondary CMake build system.
+* Build system fixes
+
+libvorbis 1.3.5 (2015-03-03) -- "Xiph.Org libVorbis I 20150105 (⛄⛄⛄⛄)"
+
+* Tolerate single-entry codebooks.
+* Fix decoder crash with invalid input.
+* Fix encoder crash with non-positive sample rates.
+* Fix issues in vorbisfile's seek bisection code.
+* Spec errata.
+* Reject multiple headers of the same type.
+* Various build fixes and code cleanup.
+
+libvorbis 1.3.4 (2014-01-22) -- "Xiph.Org libVorbis I 20140122 (Turpakäräjiin)"
+
+* Reduce codebook footprint in library code.
+* Various build and documentation fixes.
+
+libvorbis 1.3.3 (2012-02-03) -- "Xiph.Org libVorbis I 20120203 (Omnipresent)"
+
+* vorbis: additional proofing against invalid/malicious
+ streams in decode (see SVN for details).
+* vorbis: fix a memory leak in vorbis_commentheader_out().
+* updates, corrections and clarifications in the Vorbis I specification
+ document
+* win32: fixed project configuration which referenced two CRT versions
+ in output binaries.
+* build warning fixes
+
+libvorbis 1.3.2 (2010-11-01) -- "Xiph.Org libVorbis I 20101101 (Schaufenugget)"
+
+ * vorbis: additional proofing against invalid/malicious
+ streams in floor, residue, and bos/eos packet trimming
+ code (see SVN for details).
+ * vorbis: Added programming documentation tree for the
+ low-level calls
+ * vorbisfile: Correct handling of serial numbers array
+ element [0] on non-seekable streams
+ * vorbisenc: Back out an [old] AoTuV HF weighting that was
+ first enabled in 1.3.0; there are a few samples where I
+ really don't like the effect it causes.
+ * vorbis: return correct timestamp for granule positions
+ with high bit set.
+ * vorbisfile: the [undocumented] half-rate decode api made no
+ attempt to keep the pcm offset tracking consistent in seeks.
+ Fix and add a testing mode to seeking_example.c to torture
+ test seeking in halfrate mode. Also remove requirement that
+ halfrate mode only work with seekable files.
+ * vorbisfile: Fix a chaining bug in raw_seeks where seeking
+ out of the current link would fail due to not
+ reinitializing the decode machinery.
+ * vorbisfile: improve seeking strategy. Reduces the
+ necessary number of seek callbacks in an open or seek
+ operation by well over 2/3.
+
+libvorbis 1.3.1 (2010-02-26) -- "Xiph.Org libVorbis I 20100325 (Everywhere)"
+
+ * tweak + minor arithmetic fix in floor1 fit
+ * revert noise norm to conservative 1.2.3 behavior pending
+ more listening testing
+
+libvorbis 1.3.0 (2010-02-25) -- unreleased staging snapshot
+
+ * Optimized surround support for 5.1 encoding at 44.1/48kHz
+ * Added encoder control call to disable channel coupling
+ * Correct an overflow bug in very low-bitrate encoding on 32 bit
+ machines that caused inflated bitrates
+ * Numerous API hardening, leak and build fixes
+ * Correct bug in 22kHz compand setup that could cause a crash
+ * Correct bug in 16kHz codebooks that could cause unstable pure
+ tones at high bitrates
+
+libvorbis 1.2.3 (2009-07-09) -- "Xiph.Org libVorbis I 20090709"
+
+ * correct a vorbisfile bug that prevented proper playback of
+ Vorbis files where all audio in a logical stream is in a
+ single page
+ * Additional decode setup hardening against malicious streams
+ * Add 'OV_EXCLUDE_STATIC_CALLBACKS' define for developers who
+ wish to avoid unused symbol warnings from the static callbacks
+ defined in vorbisfile.h
+
+libvorbis 1.2.2 (2009-06-24) -- "Xiph.Org libVorbis I 20090624"
+
+ * define VENDOR and ENCODER strings
+ * seek correctly in files bigger than 2 GB (Windows)
+ * fix regression from CVE-2008-1420; 1.0b1 files work again
+ * mark all tables as constant to reduce memory occupation
+ * additional decoder hardening against malicious streams
+ * substantially reduce amount of seeking performed by Vorbisfile
+ * Multichannel decode bugfix
+ * build system updates
+ * minor specification clarifications/fixes
+
+libvorbis 1.2.1 (unreleased) -- "Xiph.Org libVorbis I 20080501"
+
+ * Improved robustness with corrupt streams.
+ * New ov_read_filter() vorbisfile call allows filtering decoded
+ audio as floats before converting to integer samples.
+ * Fix an encoder bug with multichannel streams.
+ * Replaced RTP payload format draft with RFC 5215.
+ * Bare bones self test under 'make check'.
+ * Fix a problem encoding some streams between 14 and 28 kHz.
+ * Fix a numerical instability in the edge extrapolation filter.
+ * Build system improvements.
+ * Specification correction.
+
+libvorbis 1.2.0 (2007-07-25) -- "Xiph.Org libVorbis I 20070622"
+
+ * new ov_fopen() convenience call that avoids the common
+ stdio conflicts with ov_open() and MSVC runtimes.
+ * libvorbisfile now handles multiplexed streams
+ * improve robustness to corrupt input streams
+ * fix a minor encoder bug
+ * updated RTP draft
+ * build system updates
+ * minor corrections to the specification
+
+libvorbis 1.1.2 (2005-11-27) -- "Xiph.Org libVorbis I 20050304"
+
+ * fix a serious encoder bug with gcc 4 optimized builds
+ * documentation and spec fixes
+ * updated VS2003 and XCode builds
+ * new draft RTP encapsulation spec
+
+libvorbis 1.1.1 (2005-06-27) -- "Xiph.Org libVorbis I 20050304"
+
+ * bug fix to the bitrate management encoder interface
+ * bug fix to properly set packetno field in the encoder
+ * new draft RTP encapsulation spec
+ * library API documentation improvements
+
+libvorbis 1.1.0 (2004-09-22) -- "Xiph.Org libVorbis I 20040629"
+
+ * merges tuning improvements from Aoyumi's aoTuV with fixups
+ * new managed bitrate (CBR) mode support
+ * new vorbis_encoder_ctl() interface
+ * extensive documentation updates
+ * application/ogg mimetype is now official
+ * autotools cleanup from Thomas Vander Stichele
+ * SymbianOS build support from Colin Ward at CSIRO
+ * various bugfixes
+ * various packaging improvements
+
+libvorbis 1.0.1 (2003-11-17) -- "Xiph.Org libVorbis I 20030909"
+
+ * numerous bug fixes
+ * specification corrections
+ * new crosslap and halfrate APIs for game use
+ * packaging and build updates
+
+libvorbis 1.0.0 (2002-07-19) -- "Xiph.Org libVorbis I 20020717"
+
+ * first stable release
+
diff --git a/media/libvorbis/COPYING b/media/libvorbis/COPYING
index 8f1d18cc2b..fb456a87bd 100644
--- a/media/libvorbis/COPYING
+++ b/media/libvorbis/COPYING
@@ -1,4 +1,4 @@
-Copyright (c) 2002-2015 Xiph.org Foundation
+Copyright (c) 2002-2020 Xiph.org Foundation
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
diff --git a/media/libvorbis/README b/media/libvorbis/README
deleted file mode 100644
index 343be9a452..0000000000
--- a/media/libvorbis/README
+++ /dev/null
@@ -1,134 +0,0 @@
-********************************************************************
-* *
-* THIS FILE IS PART OF THE OggVorbis SOFTWARE CODEC SOURCE CODE. *
-* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
-* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
-* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
-* *
-* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2015 *
-* by the Xiph.org Foundation, http://www.xiph.org/ *
-* *
-********************************************************************
-
-Vorbis is a general purpose audio and music encoding format
-contemporary to MPEG-4's AAC and TwinVQ, the next generation beyond
-MPEG audio layer 3. Unlike the MPEG sponsored formats (and other
-proprietary formats such as RealAudio G2 and Windows' flavor of the
-month), the Vorbis CODEC specification belongs to the public domain.
-All the technical details are published and documented, and any
-software entity may make full use of the format without license
-fee, royalty or patent concerns.
-
-This package contains:
-
-* libvorbis, a BSD-style license software implementation of
- the Vorbis specification by the Xiph.Org Foundation
- (http://www.xiph.org/)
-
-* libvorbisfile, a BSD-style license convenience library
- built on Vorbis designed to simplify common uses
-
-* libvorbisenc, a BSD-style license library that provides a simple,
- programmatic encoding setup interface
-
-* example code making use of libogg, libvorbis, libvorbisfile and
- libvorbisenc
-
-WHAT'S HERE:
-
-This source distribution includes libvorbis and an example
-encoder/player to demonstrate use of libvorbis as well as
-documentation on the Ogg Vorbis audio coding format.
-
-You'll need libogg (distributed separately) to compile this library.
-A more comprehensive set of utilities is available in the vorbis-tools
-package.
-
-Directory:
-
-./lib The source for the libraries, a BSD-license implementation
- of the public domain Ogg Vorbis audio encoding format.
-
-./include Library API headers
-
-./debian Rules/spec files for building Debian .deb packages
-
-./doc Vorbis documentation
-
-./examples Example code illustrating programmatic use of libvorbis,
- libvorbisfile and libvorbisenc
-
-./mac Codewarrior project files and build tweaks for MacOS.
-
-./macosx Project files for MacOS X.
-
-./win32 Win32 projects files and build automation
-
-./vq Internal utilities for training/building new LSP/residue
- and auxiliary codebooks.
-
-CONTACT:
-
-The Ogg homepage is located at 'http://www.xiph.org/ogg/'.
-Vorbis's homepage is located at 'http://www.xiph.org/vorbis/'.
-Up to date technical documents, contact information, source code and
-pre-built utilities may be found there.
-
-The user website for Ogg Vorbis software and audio is http://vorbis.com/
-
-BUILDING FROM TRUNK:
-
-Development source is under subversion revision control at
-https://svn.xiph.org/trunk/vorbis/. You will also need the
-newest versions of autoconf, automake, libtool and pkg-config in
-order to compile Vorbis from development source. A configure script
-is provided for you in the source tarball distributions.
-
- [update or checkout latest source]
- ./autogen.sh
- make
-
-and as root if desired:
-
- make install
-
-This will install the Vorbis libraries (static and shared) into
-/usr/local/lib, includes into /usr/local/include and API manpages
-(once we write some) into /usr/local/man.
-
-Documentation building requires xsltproc and pdfxmltex.
-
-BUILDING FROM TARBALL DISTRIBUTIONS:
-
- ./configure
- make
-
-and optionally (as root):
- make install
-
-BUILDING RPMS:
-
-after normal configuring:
-
- make dist
- rpm -ta libvorbis-<version>.tar.gz
-
-BUILDING ON MACOS 9:
-
-Vorbis on MacOS 9 is built using Metroworks CodeWarrior. To build it,
-first verify that the Ogg libraries are already built following the
-instructions in the Ogg module README. Open vorbis/mac/libvorbis.mcp,
-switch to the "Targets" pane, select everything, and make the project.
-Do the same thing to build libvorbisenc.mcp, and libvorbisfile.mcp (in
-that order). In vorbis/mac/Output you will now have both debug and final
-versions of Vorbis shared libraries to link your projects against.
-
-To build a project using Ogg Vorbis, add access paths to your
-CodeWarrior project for the ogg/include, ogg/mac/Output,
-vorbis/include, and vorbis/mac/Output folders. Be sure that
-"interpret DOS and Unix paths" is turned on in your project; it can
-be found in the "access paths" pane in your project settings. Now
-simply add the shared libraries you need to your project (OggLib and
-VorbisLib at least) and #include "ogg/ogg.h" and "vorbis/codec.h"
-wherever you need to access Ogg and Vorbis functionality.
-
diff --git a/media/libvorbis/README.md b/media/libvorbis/README.md
new file mode 100644
index 0000000000..30c88d391d
--- /dev/null
+++ b/media/libvorbis/README.md
@@ -0,0 +1,147 @@
+# Vorbis
+
+[![GitLab Build Status](https://gitlab.xiph.org/xiph/vorbis/badges/master/pipeline.svg)](https://gitlab.xiph.org/xiph/vorbis/-/pipelines)
+[![Travis Build Status](https://travis-ci.org/xiph/vorbis.svg?branch=master)](https://travis-ci.org/xiph/vorbis)
+[![AppVeyor Build status](https://ci.appveyor.com/api/projects/status/github/xiph/vorbis?branch=master&svg=true)](https://ci.appveyor.com/project/rillian/vorbis)
+
+Vorbis is a general purpose audio and music encoding format
+contemporary to MPEG-4's AAC and TwinVQ, the next generation beyond
+MPEG audio layer 3. Unlike the MPEG sponsored formats (and other
+proprietary formats such as RealAudio G2 and Windows' flavor of the
+month), the Vorbis CODEC specification belongs to the public domain.
+All the technical details are published and documented, and any
+software entity may make full use of the format without license
+fee, royalty or patent concerns.
+
+This package contains:
+
+- libvorbis, a BSD-style license software implementation of
+ the Vorbis specification by the Xiph.Org Foundation
+ (https://xiph.org/)
+
+- libvorbisfile, a BSD-style license convenience library
+ built on Vorbis designed to simplify common uses
+
+- libvorbisenc, a BSD-style license library that provides a simple,
+ programmatic encoding setup interface
+
+- example code making use of libogg, libvorbis, libvorbisfile and
+ libvorbisenc
+
+## What's here ##
+
+This source distribution includes libvorbis and an example
+encoder/player to demonstrate use of libvorbis as well as
+documentation on the Ogg Vorbis audio coding format.
+
+You'll need libogg (distributed separately) to compile this library.
+A more comprehensive set of utilities is available in the vorbis-tools
+package.
+
+Directory:
+
+- `lib` The source for the libraries, a BSD-license implementation of the public domain Ogg Vorbis audio encoding format.
+
+- `include` Library API headers
+
+- `debian` Rules/spec files for building Debian .deb packages
+
+- `doc` Vorbis documentation
+
+- `examples` Example code illustrating programmatic use of libvorbis, libvorbisfile and libvorbisenc
+
+- `macosx` Project files for MacOS X.
+
+- `win32` Win32 projects files and build automation
+
+- `vq` Internal utilities for training/building new LSP/residue and auxiliary codebooks.
+
+## Contact ##
+
+The Ogg homepage is located at 'https://xiph.org/ogg/'.
+Vorbis's homepage is located at 'https://xiph.org/vorbis/'.
+Up to date technical documents, contact information, source code and
+pre-built utilities may be found there.
+
+## Building ##
+
+#### Building from master ####
+
+Development source is under git revision control at
+https://gitlab.xiph.org/xiph/vorbis.git. You will also need the
+newest versions of autoconf, automake, libtool and pkg-config in
+order to compile Vorbis from development source. A configure script
+is provided for you in the source tarball distributions.
+
+ ./autogen.sh
+ ./configure
+ make
+
+and as root if desired:
+
+ make install
+
+This will install the Vorbis libraries (static and shared) into
+/usr/local/lib, includes into /usr/local/include and API manpages
+(once we write some) into /usr/local/man.
+
+Documentation building requires xsltproc and pdfxmltex.
+
+#### Building from tarball distributions ####
+
+ ./configure
+ make
+
+and optionally (as root):
+
+ make install
+
+#### Building RPM packages ####
+
+after normal configuring:
+
+ make dist
+ rpm -ta libvorbis-<version>.tar.gz
+
+## Building with CMake ##
+
+Vorbis supports building using [CMake](https://cmake.org/). CMake is a meta build system that generates native projects for each platform.
+To generate projects, just run cmake, replacing `YOUR-PROJECT-GENERATOR` with a proper generator from the list [here](https://cmake.org/cmake/help/latest/manual/cmake-generators.7.html):
+
+ cmake -G YOUR-PROJECT-GENERATOR .
+
+Note that by default cmake generates projects that will build static libraries.
+To generate projects that will build a dynamic library instead, use the `BUILD_SHARED_LIBS` option like this:
+
+ cmake -G YOUR-PROJECT-GENERATOR -DBUILD_SHARED_LIBS=1 .
+
+After the projects are generated, use them as usual.
+
+#### Building on Windows ####
+
+Use the proper generator for your Visual Studio version, for example:
+
+ cmake -G "Visual Studio 12 2013" .
+
+#### Building on Mac OS X ####
+
+Use the Xcode generator. To build a framework, run:
+
+ cmake -G Xcode -DBUILD_FRAMEWORK=1 .
+
+#### Building on Linux ####
+
+Use the Makefile generator, which is the default:
+
+ cmake .
+ make
+
+## License ##
+
+THIS FILE IS PART OF THE OggVorbis SOFTWARE CODEC SOURCE CODE.
+USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS
+GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE
+IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.
+
+THE OggVorbis SOURCE CODE IS COPYRIGHT (C) 1994-2020
+by the Xiph.Org Foundation https://xiph.org/
diff --git a/media/libvorbis/README_MOZILLA b/media/libvorbis/README_MOZILLA
index 1211ac074b..34c86dad62 100644
--- a/media/libvorbis/README_MOZILLA
+++ b/media/libvorbis/README_MOZILLA
@@ -1,10 +1,10 @@
-The source from this directory was copied from the libvorbis
-subversion repository using the update.sh script. The only changes
+The source from this directory was copied from the vorbis
+git repository using the update.sh script. The only changes
made were those applied by update.sh and the addition/update of
Makefile.in and moz.build files for the Mozilla build system.
-The upstream version used was libvorbis 1.3.5.
-https://svn.xiph.org/tags/vorbis/libvorbis-1.3.5@19464
+The upstream version used was vorbis v1.3.7 (84c02369)
+from https://gitlab.xiph.org/xiph/vorbis
Some files are renamed during the copy to prevent clashes with object
file names with other Mozilla libraries.
diff --git a/media/libvorbis/include/vorbis/codec.h b/media/libvorbis/include/vorbis/codec.h
index 999aa33510..f8a912bc26 100644
--- a/media/libvorbis/include/vorbis/codec.h
+++ b/media/libvorbis/include/vorbis/codec.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2001 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
********************************************************************
function: libvorbis codec headers
- last mod: $Id: codec.h 17021 2010-03-24 09:29:41Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/include/vorbis/vorbisenc.h b/media/libvorbis/include/vorbis/vorbisenc.h
index 02332b50ca..085b15e669 100644
--- a/media/libvorbis/include/vorbis/vorbisenc.h
+++ b/media/libvorbis/include/vorbis/vorbisenc.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2001 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: vorbis encode-engine setup
- last mod: $Id: vorbisenc.h 17021 2010-03-24 09:29:41Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/backends.h b/media/libvorbis/lib/backends.h
index ff5bcc95fe..670b0b902e 100644
--- a/media/libvorbis/lib/backends.h
+++ b/media/libvorbis/lib/backends.h
@@ -6,13 +6,12 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2009 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: libvorbis backend and mapping structures; needed for
static mode headers
- last mod: $Id: backends.h 16962 2010-03-11 07:30:34Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/bitrate.h b/media/libvorbis/lib/bitrate.h
index db48fcb645..48fa150596 100644
--- a/media/libvorbis/lib/bitrate.h
+++ b/media/libvorbis/lib/bitrate.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2007 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: bitrate tracking and management
- last mod: $Id: bitrate.h 13293 2007-07-24 00:09:47Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/books/coupled/res_books_51.h b/media/libvorbis/lib/books/coupled/res_books_51.h
index 93910ff481..eb569c6f04 100644
--- a/media/libvorbis/lib/books/coupled/res_books_51.h
+++ b/media/libvorbis/lib/books/coupled/res_books_51.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2010 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
*
* function: static codebooks for 5.1 surround
- * last modified: $Id: res_books_51.h 19057 2014-01-22 12:32:31Z xiphmont $
*
********************************************************************/
diff --git a/media/libvorbis/lib/books/coupled/res_books_stereo.h b/media/libvorbis/lib/books/coupled/res_books_stereo.h
index 9a9049f6ed..7b53cb972b 100644
--- a/media/libvorbis/lib/books/coupled/res_books_stereo.h
+++ b/media/libvorbis/lib/books/coupled/res_books_stereo.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2007 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: static codebooks autogenerated by huff/huffbuld
- last modified: $Id: res_books_stereo.h 19057 2014-01-22 12:32:31Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/books/floor/floor_books.h b/media/libvorbis/lib/books/floor/floor_books.h
index e925313f7b..d26664f766 100644
--- a/media/libvorbis/lib/books/floor/floor_books.h
+++ b/media/libvorbis/lib/books/floor/floor_books.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2007 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: static codebooks autogenerated by huff/huffbuld
- last modified: $Id: floor_books.h 19057 2014-01-22 12:32:31Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/books/uncoupled/res_books_uncoupled.h b/media/libvorbis/lib/books/uncoupled/res_books_uncoupled.h
index 736353b675..107e22f9e3 100644
--- a/media/libvorbis/lib/books/uncoupled/res_books_uncoupled.h
+++ b/media/libvorbis/lib/books/uncoupled/res_books_uncoupled.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2007 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: static codebooks autogenerated by huff/huffbuld
- last modified: $Id: res_books_uncoupled.h 19057 2014-01-22 12:32:31Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/codebook.h b/media/libvorbis/lib/codebook.h
index 537d6c12d3..7d4e2aae4f 100644
--- a/media/libvorbis/lib/codebook.h
+++ b/media/libvorbis/lib/codebook.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2015 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: basic shared codebook operations
- last mod: $Id: codebook.h 19457 2015-03-03 00:15:29Z giles $
********************************************************************/
diff --git a/media/libvorbis/lib/codec_internal.h b/media/libvorbis/lib/codec_internal.h
index de1bccaedf..2ecf5e5c73 100644
--- a/media/libvorbis/lib/codec_internal.h
+++ b/media/libvorbis/lib/codec_internal.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2009 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: libvorbis codec headers
- last mod: $Id: codec_internal.h 16227 2009-07-08 06:58:46Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/envelope.h b/media/libvorbis/lib/envelope.h
index fd15fb32a7..2ef60a82ca 100644
--- a/media/libvorbis/lib/envelope.h
+++ b/media/libvorbis/lib/envelope.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2009 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: PCM data envelope analysis and manipulation
- last mod: $Id: envelope.h 16227 2009-07-08 06:58:46Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/highlevel.h b/media/libvorbis/lib/highlevel.h
index e38f370fd6..7690e3ebfb 100644
--- a/media/libvorbis/lib/highlevel.h
+++ b/media/libvorbis/lib/highlevel.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2009 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: highlevel encoder setup struct separated out for vorbisenc clarity
- last mod: $Id: highlevel.h 17195 2010-05-05 21:49:51Z giles $
********************************************************************/
diff --git a/media/libvorbis/lib/lookup.h b/media/libvorbis/lib/lookup.h
index f8b5b82730..ec05014f44 100644
--- a/media/libvorbis/lib/lookup.h
+++ b/media/libvorbis/lib/lookup.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2009 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: lookup based functions
- last mod: $Id: lookup.h 16227 2009-07-08 06:58:46Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/lookup_data.h b/media/libvorbis/lib/lookup_data.h
index 2424a1b386..7935715a70 100644
--- a/media/libvorbis/lib/lookup_data.h
+++ b/media/libvorbis/lib/lookup_data.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2007 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: lookup data; generated by lookups.pl; edit there
- last mod: $Id: lookup_data.h 16037 2009-05-26 21:10:58Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/lpc.h b/media/libvorbis/lib/lpc.h
index 39d237601b..4f59e6d32d 100644
--- a/media/libvorbis/lib/lpc.h
+++ b/media/libvorbis/lib/lpc.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2007 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: LPC low level routines
- last mod: $Id: lpc.h 16037 2009-05-26 21:10:58Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/lsp.h b/media/libvorbis/lib/lsp.h
index bacfb0971f..68b38daf16 100644
--- a/media/libvorbis/lib/lsp.h
+++ b/media/libvorbis/lib/lsp.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2009 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: LSP (also called LSF) conversion routines
- last mod: $Id: lsp.h 16227 2009-07-08 06:58:46Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/masking.h b/media/libvorbis/lib/masking.h
index 3576ab7885..7a196a37eb 100644
--- a/media/libvorbis/lib/masking.h
+++ b/media/libvorbis/lib/masking.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2009 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: masking curve data for psychoacoustics
- last mod: $Id: masking.h 16227 2009-07-08 06:58:46Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/mdct.h b/media/libvorbis/lib/mdct.h
index 3ed94333c5..ceaea617a3 100644
--- a/media/libvorbis/lib/mdct.h
+++ b/media/libvorbis/lib/mdct.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2009 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: modified discrete cosine transform prototypes
- last mod: $Id: mdct.h 16227 2009-07-08 06:58:46Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/misc.h b/media/libvorbis/lib/misc.h
index 73b4519898..eac5160e88 100644
--- a/media/libvorbis/lib/misc.h
+++ b/media/libvorbis/lib/misc.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2015 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: miscellaneous prototypes
- last mod: $Id: misc.h 19457 2015-03-03 00:15:29Z giles $
********************************************************************/
diff --git a/media/libvorbis/lib/modes/floor_all.h b/media/libvorbis/lib/modes/floor_all.h
index 4292be326e..2e3d4a5012 100644
--- a/media/libvorbis/lib/modes/floor_all.h
+++ b/media/libvorbis/lib/modes/floor_all.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2009 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: key floor settings
- last mod: $Id: floor_all.h 17050 2010-03-26 01:34:42Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/modes/psych_11.h b/media/libvorbis/lib/modes/psych_11.h
index 844a8ed3cd..9d8ed357ee 100644
--- a/media/libvorbis/lib/modes/psych_11.h
+++ b/media/libvorbis/lib/modes/psych_11.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2009 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: 11kHz settings
- last mod: $Id: psych_11.h 16227 2009-07-08 06:58:46Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/modes/psych_16.h b/media/libvorbis/lib/modes/psych_16.h
index 1c10b3954e..49cbf7c4b2 100644
--- a/media/libvorbis/lib/modes/psych_16.h
+++ b/media/libvorbis/lib/modes/psych_16.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2009 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: 16kHz settings
- last mod: $Id: psych_16.h 16227 2009-07-08 06:58:46Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/modes/psych_44.h b/media/libvorbis/lib/modes/psych_44.h
index f05c032653..d15509b71d 100644
--- a/media/libvorbis/lib/modes/psych_44.h
+++ b/media/libvorbis/lib/modes/psych_44.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2009 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: key psychoacoustic settings for 44.1/48kHz
- last mod: $Id: psych_44.h 16962 2010-03-11 07:30:34Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/modes/psych_8.h b/media/libvorbis/lib/modes/psych_8.h
index 0e2dd57371..a19817f760 100644
--- a/media/libvorbis/lib/modes/psych_8.h
+++ b/media/libvorbis/lib/modes/psych_8.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2009 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: 8kHz psychoacoustic settings
- last mod: $Id: psych_8.h 16227 2009-07-08 06:58:46Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/modes/residue_16.h b/media/libvorbis/lib/modes/residue_16.h
index dcaca5451e..15e161c862 100644
--- a/media/libvorbis/lib/modes/residue_16.h
+++ b/media/libvorbis/lib/modes/residue_16.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2009 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: toplevel residue templates 16/22kHz
- last mod: $Id: residue_16.h 16962 2010-03-11 07:30:34Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/modes/residue_44.h b/media/libvorbis/lib/modes/residue_44.h
index 236c18341b..3f982695a7 100644
--- a/media/libvorbis/lib/modes/residue_44.h
+++ b/media/libvorbis/lib/modes/residue_44.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2009 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: toplevel residue templates for 32/44.1/48kHz
- last mod: $Id: residue_44.h 16962 2010-03-11 07:30:34Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/modes/residue_44p51.h b/media/libvorbis/lib/modes/residue_44p51.h
index a52cc5245e..8ac5f65e62 100644
--- a/media/libvorbis/lib/modes/residue_44p51.h
+++ b/media/libvorbis/lib/modes/residue_44p51.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2010 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: toplevel residue templates for 32/44.1/48kHz uncoupled
- last mod: $Id: residue_44p51.h 19013 2013-11-12 04:04:50Z giles $
********************************************************************/
diff --git a/media/libvorbis/lib/modes/residue_44u.h b/media/libvorbis/lib/modes/residue_44u.h
index 92c4a09ce3..2f3595e49f 100644
--- a/media/libvorbis/lib/modes/residue_44u.h
+++ b/media/libvorbis/lib/modes/residue_44u.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2007 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: toplevel residue templates for 32/44.1/48kHz uncoupled
- last mod: $Id: residue_44u.h 16962 2010-03-11 07:30:34Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/modes/residue_8.h b/media/libvorbis/lib/modes/residue_8.h
index 94c6d84c44..b836f79c84 100644
--- a/media/libvorbis/lib/modes/residue_8.h
+++ b/media/libvorbis/lib/modes/residue_8.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2007 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: toplevel residue templates 8/11kHz
- last mod: $Id: residue_8.h 16962 2010-03-11 07:30:34Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/modes/setup_11.h b/media/libvorbis/lib/modes/setup_11.h
index 4c2d619ca2..5ade5dd169 100644
--- a/media/libvorbis/lib/modes/setup_11.h
+++ b/media/libvorbis/lib/modes/setup_11.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2009 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: 11kHz settings
- last mod: $Id: setup_11.h 16894 2010-02-12 20:32:12Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/modes/setup_16.h b/media/libvorbis/lib/modes/setup_16.h
index 336007f98e..8b2daafa3f 100644
--- a/media/libvorbis/lib/modes/setup_16.h
+++ b/media/libvorbis/lib/modes/setup_16.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2009 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: 16kHz settings
- last mod: $Id: setup_16.h 16894 2010-02-12 20:32:12Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/modes/setup_22.h b/media/libvorbis/lib/modes/setup_22.h
index 4fd5e57111..eef5a4e7da 100644
--- a/media/libvorbis/lib/modes/setup_22.h
+++ b/media/libvorbis/lib/modes/setup_22.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2009 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: 22kHz settings
- last mod: $Id: setup_22.h 17026 2010-03-25 05:00:27Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/modes/setup_32.h b/media/libvorbis/lib/modes/setup_32.h
index 2275ac9615..f87cb767d0 100644
--- a/media/libvorbis/lib/modes/setup_32.h
+++ b/media/libvorbis/lib/modes/setup_32.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2009 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: toplevel settings for 32kHz
- last mod: $Id: setup_32.h 16894 2010-02-12 20:32:12Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/modes/setup_44.h b/media/libvorbis/lib/modes/setup_44.h
index 3b88a89ac5..12d592808e 100644
--- a/media/libvorbis/lib/modes/setup_44.h
+++ b/media/libvorbis/lib/modes/setup_44.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2009 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: toplevel settings for 44.1/48kHz
- last mod: $Id: setup_44.h 16962 2010-03-11 07:30:34Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/modes/setup_44p51.h b/media/libvorbis/lib/modes/setup_44p51.h
index 67d9979608..4d49173ffb 100644
--- a/media/libvorbis/lib/modes/setup_44p51.h
+++ b/media/libvorbis/lib/modes/setup_44p51.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2010 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: toplevel settings for 44.1/48kHz 5.1 surround modes
- last mod: $Id: setup_44p51.h 19013 2013-11-12 04:04:50Z giles $
********************************************************************/
diff --git a/media/libvorbis/lib/modes/setup_44u.h b/media/libvorbis/lib/modes/setup_44u.h
index 568b5f8959..2dd8bf701f 100644
--- a/media/libvorbis/lib/modes/setup_44u.h
+++ b/media/libvorbis/lib/modes/setup_44u.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2009 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: toplevel settings for 44.1/48kHz uncoupled modes
- last mod: $Id: setup_44u.h 16962 2010-03-11 07:30:34Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/modes/setup_8.h b/media/libvorbis/lib/modes/setup_8.h
index 14c48374fa..16b02e01b7 100644
--- a/media/libvorbis/lib/modes/setup_8.h
+++ b/media/libvorbis/lib/modes/setup_8.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2009 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: 8kHz settings
- last mod: $Id: setup_8.h 16894 2010-02-12 20:32:12Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/modes/setup_X.h b/media/libvorbis/lib/modes/setup_X.h
index a69f5d40a2..27807c10b4 100644
--- a/media/libvorbis/lib/modes/setup_X.h
+++ b/media/libvorbis/lib/modes/setup_X.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2009 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: catch-all toplevel settings for q modes only
- last mod: $Id: setup_X.h 16894 2010-02-12 20:32:12Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/os.h b/media/libvorbis/lib/os.h
index 8bc3e5fe9c..9ded7358d4 100644
--- a/media/libvorbis/lib/os.h
+++ b/media/libvorbis/lib/os.h
@@ -8,12 +8,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2015 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: #ifdef jail to whip a few platforms into the UNIX ideal.
- last mod: $Id: os.h 19457 2015-03-03 00:15:29Z giles $
********************************************************************/
@@ -31,7 +30,7 @@
# ifdef __GNUC__
# define STIN static __inline__
-# elif _WIN32
+# elif defined(_WIN32)
# define STIN static __inline
# else
# define STIN static
@@ -61,7 +60,7 @@ void *_alloca(size_t size);
# define FAST_HYPOT hypot
#endif
-#endif
+#endif /* _V_IFDEFJAIL_H_ */
#ifdef HAVE_ALLOCA_H
# include <alloca.h>
@@ -81,7 +80,7 @@ void *_alloca(size_t size);
/* Special i386 GCC implementation */
-#if defined(__i386__) && defined(__GNUC__) && !defined(__BEOS__)
+#if defined(__i386__) && defined(__GNUC__) && !defined(__BEOS__) && !defined(__SSE2_MATH__)
# define VORBIS_FPU_CONTROL
/* both GCC and MSVC are kinda stupid about rounding/casting to int.
Because of encapsulation constraints (GCC can't see inside the asm
@@ -120,8 +119,7 @@ static inline int vorbis_ftoi(double f){ /* yes, double! Otherwise,
/* MSVC inline assembly. 32 bit only; inline ASM isn't implemented in the
* 64 bit compiler and doesn't work on arm. */
-#if defined(_MSC_VER) && !defined(_WIN64) && \
- !defined(_WIN32_WCE) && !defined(_M_ARM)
+#if defined(_MSC_VER) && defined(_M_IX86) && !defined(_WIN32_WCE)
# define VORBIS_FPU_CONTROL
typedef ogg_int16_t vorbis_fpu_control;
@@ -148,7 +146,7 @@ static __inline void vorbis_fpu_restore(vorbis_fpu_control fpu){
/* Optimized code path for x86_64 builds. Uses SSE2 intrinsics. This can be
done safely because all x86_64 CPUs supports SSE2. */
-#if (defined(_MSC_VER) && defined(_WIN64)) || (defined(__GNUC__) && defined (__x86_64__))
+#if (defined(_MSC_VER) && defined(_M_X64)) || (defined(__GNUC__) && defined (__SSE2_MATH__))
# define VORBIS_FPU_CONTROL
typedef ogg_int16_t vorbis_fpu_control;
@@ -175,7 +173,7 @@ static __inline void vorbis_fpu_restore(vorbis_fpu_control fpu){
typedef int vorbis_fpu_control;
-static int vorbis_ftoi(double f){
+STIN int vorbis_ftoi(double f){
/* Note: MSVC and GCC (at least on some systems) round towards zero, thus,
the floor() call is required to ensure correct roudning of
negative numbers */
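A minimal sketch of the SSE2 conversion path now keyed off __SSE2_MATH__ / _M_X64 in the hunks above: _mm_cvtsd_si32() rounds according to MXCSR (round-to-nearest by default), so no x87 control-word save/restore is needed on this path. The helper name is illustrative, not libvorbis API.

    #include <emmintrin.h>

    static inline int ftoi_sse2(double f){
      return _mm_cvtsd_si32(_mm_load_sd(&f));
    }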
diff --git a/media/libvorbis/lib/psy.h b/media/libvorbis/lib/psy.h
index c1ea824401..d9a04e8b74 100644
--- a/media/libvorbis/lib/psy.h
+++ b/media/libvorbis/lib/psy.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2009 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: random psychoacoustics (not including preecho)
- last mod: $Id: psy.h 16946 2010-03-03 16:12:40Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/registry.h b/media/libvorbis/lib/registry.h
index 3ae04776d8..b823aa6091 100644
--- a/media/libvorbis/lib/registry.h
+++ b/media/libvorbis/lib/registry.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2007 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: registry for time, floor, res backends and channel mappings
- last mod: $Id: registry.h 15531 2008-11-24 23:50:06Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/scales.h b/media/libvorbis/lib/scales.h
index 613f796e77..3c2ae48d9e 100644
--- a/media/libvorbis/lib/scales.h
+++ b/media/libvorbis/lib/scales.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2009 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: linear scale -> dB, Bark and Mel scales
- last mod: $Id: scales.h 16227 2009-07-08 06:58:46Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/smallft.h b/media/libvorbis/lib/smallft.h
index 456497326c..02fe8f9cd4 100644
--- a/media/libvorbis/lib/smallft.h
+++ b/media/libvorbis/lib/smallft.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2007 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: fft transform
- last mod: $Id: smallft.h 13293 2007-07-24 00:09:47Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/vorbis_analysis.c b/media/libvorbis/lib/vorbis_analysis.c
index 01aa6f30db..14919737eb 100644
--- a/media/libvorbis/lib/vorbis_analysis.c
+++ b/media/libvorbis/lib/vorbis_analysis.c
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2007 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: single-block PCM analysis mode dispatch
- last mod: $Id: analysis.c 16226 2009-07-08 06:43:49Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/vorbis_bitrate.c b/media/libvorbis/lib/vorbis_bitrate.c
index 3a71b1dc23..132553cbee 100644
--- a/media/libvorbis/lib/vorbis_bitrate.c
+++ b/media/libvorbis/lib/vorbis_bitrate.c
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2009 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: bitrate tracking and management
- last mod: $Id: bitrate.c 16227 2009-07-08 06:58:46Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/vorbis_block.c b/media/libvorbis/lib/vorbis_block.c
index 345c042769..6a50da0843 100644
--- a/media/libvorbis/lib/vorbis_block.c
+++ b/media/libvorbis/lib/vorbis_block.c
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2015 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: PCM data vector blocking, windowing and dis/reassembly
- last mod: $Id: block.c 19457 2015-03-03 00:15:29Z giles $
Handle windowing, overlap-add, etc of the PCM vectors. This is made
more amusing by Vorbis' current two allowed block sizes.
diff --git a/media/libvorbis/lib/vorbis_codebook.c b/media/libvorbis/lib/vorbis_codebook.c
index 49b7b09b27..7a0c206783 100644
--- a/media/libvorbis/lib/vorbis_codebook.c
+++ b/media/libvorbis/lib/vorbis_codebook.c
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2015 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: basic codebook pack/unpack/code/decode operations
- last mod: $Id: codebook.c 19457 2015-03-03 00:15:29Z giles $
********************************************************************/
diff --git a/media/libvorbis/lib/vorbis_envelope.c b/media/libvorbis/lib/vorbis_envelope.c
index 010c66e2d6..22d39aa6e0 100644
--- a/media/libvorbis/lib/vorbis_envelope.c
+++ b/media/libvorbis/lib/vorbis_envelope.c
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2009 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: PCM data envelope analysis
- last mod: $Id: envelope.c 16227 2009-07-08 06:58:46Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/vorbis_floor0.c b/media/libvorbis/lib/vorbis_floor0.c
index 213cce4ec8..f4a6d4d559 100644
--- a/media/libvorbis/lib/vorbis_floor0.c
+++ b/media/libvorbis/lib/vorbis_floor0.c
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2015 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: floor backend 0 implementation
- last mod: $Id: floor0.c 19457 2015-03-03 00:15:29Z giles $
********************************************************************/
diff --git a/media/libvorbis/lib/vorbis_floor1.c b/media/libvorbis/lib/vorbis_floor1.c
index d8bd4645c1..c4fe3ea7e7 100644
--- a/media/libvorbis/lib/vorbis_floor1.c
+++ b/media/libvorbis/lib/vorbis_floor1.c
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2015 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: floor backend 1 implementation
- last mod: $Id: floor1.c 19457 2015-03-03 00:15:29Z giles $
********************************************************************/
diff --git a/media/libvorbis/lib/vorbis_info.c b/media/libvorbis/lib/vorbis_info.c
index 8a2a001f99..f2e39e387e 100644
--- a/media/libvorbis/lib/vorbis_info.c
+++ b/media/libvorbis/lib/vorbis_info.c
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2015 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: maintain the info structure, info <-> header packets
- last mod: $Id: info.c 19441 2015-01-21 01:17:41Z xiphmont $
********************************************************************/
@@ -20,7 +19,6 @@
#include <stdlib.h>
#include <string.h>
-#include <ctype.h>
#include <ogg/ogg.h>
#include "vorbis/codec.h"
#include "codec_internal.h"
@@ -31,8 +29,8 @@
#include "misc.h"
#include "os.h"
-#define GENERAL_VENDOR_STRING "Xiph.Org libVorbis 1.3.5"
-#define ENCODE_VENDOR_STRING "Xiph.Org libVorbis I 20150105 (⛄⛄⛄⛄)"
+#define GENERAL_VENDOR_STRING "Xiph.Org libVorbis 1.3.7"
+#define ENCODE_VENDOR_STRING "Xiph.Org libVorbis I 20200704 (Reducing Environment)"
/* helpers */
static void _v_writestring(oggpack_buffer *o,const char *s, int bytes){
@@ -48,6 +46,10 @@ static void _v_readstring(oggpack_buffer *o,char *buf,int bytes){
}
}
+static int _v_toupper(int c) {
+ return (c >= 'a' && c <= 'z') ? (c & ~('a' - 'A')) : c;
+}
+
void vorbis_comment_init(vorbis_comment *vc){
memset(vc,0,sizeof(*vc));
}
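
The new _v_toupper above replaces ctype.h's toupper, whose result depends on the current locale and is undefined for negative char values; Vorbis comment tags are case-insensitive over ASCII only, so a fixed ASCII mapping is the right tool. A minimal standalone sketch of the same mapping (the main and sample characters are illustrative only):

    #include <stdio.h>

    /* ASCII-only upcase: clear bit 5 for 'a'..'z', pass everything else through. */
    static int _v_toupper(int c) {
      return (c >= 'a' && c <= 'z') ? (c & ~('a' - 'A')) : c;
    }

    int main(void) {
      printf("%c %c %c\n", _v_toupper('a'), _v_toupper('Z'), _v_toupper('=')); /* A Z = */
      return 0;
    }
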
@@ -65,11 +67,13 @@ void vorbis_comment_add(vorbis_comment *vc,const char *comment){
}
void vorbis_comment_add_tag(vorbis_comment *vc, const char *tag, const char *contents){
- char *comment=alloca(strlen(tag)+strlen(contents)+2); /* +2 for = and \0 */
+ /* Length for key and value +2 for = and \0 */
+ char *comment=_ogg_malloc(strlen(tag)+strlen(contents)+2);
strcpy(comment, tag);
strcat(comment, "=");
strcat(comment, contents);
vorbis_comment_add(vc, comment);
+ _ogg_free(comment);
}
/* This is more or less the same as strncasecmp - but that doesn't exist
@@ -77,7 +81,7 @@ void vorbis_comment_add_tag(vorbis_comment *vc, const char *tag, const char *con
static int tagcompare(const char *s1, const char *s2, int n){
int c=0;
while(c < n){
- if(toupper(s1[c]) != toupper(s2[c]))
+ if(_v_toupper(s1[c]) != _v_toupper(s2[c]))
return !0;
c++;
}
@@ -88,27 +92,30 @@ char *vorbis_comment_query(vorbis_comment *vc, const char *tag, int count){
long i;
int found = 0;
int taglen = strlen(tag)+1; /* +1 for the = we append */
- char *fulltag = alloca(taglen+ 1);
+ char *fulltag = _ogg_malloc(taglen+1);
strcpy(fulltag, tag);
strcat(fulltag, "=");
for(i=0;i<vc->comments;i++){
if(!tagcompare(vc->user_comments[i], fulltag, taglen)){
- if(count == found)
+ if(count == found) {
/* We return a pointer to the data, not a copy */
- return vc->user_comments[i] + taglen;
- else
+ _ogg_free(fulltag);
+ return vc->user_comments[i] + taglen;
+ } else {
found++;
+ }
}
}
+ _ogg_free(fulltag);
return NULL; /* didn't find anything */
}
int vorbis_comment_query_count(vorbis_comment *vc, const char *tag){
int i,count=0;
int taglen = strlen(tag)+1; /* +1 for the = we append */
- char *fulltag = alloca(taglen+1);
+ char *fulltag = _ogg_malloc(taglen+1);
strcpy(fulltag,tag);
strcat(fulltag, "=");
@@ -117,6 +124,7 @@ int vorbis_comment_query_count(vorbis_comment *vc, const char *tag){
count++;
}
+ _ogg_free(fulltag);
return count;
}
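
The alloca calls removed in vorbis_comment_add_tag, vorbis_comment_query and vorbis_comment_query_count sized a stack buffer from caller-supplied string lengths; a long enough tag could overflow the stack with no way to detect failure. The heap variants fail cleanly and are freed on every exit path. A minimal sketch of the pattern, using plain malloc/free as stand-ins for the _ogg_malloc/_ogg_free wrappers:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Build "TAG=" on the heap; returns NULL instead of overflowing the stack. */
    static char *make_fulltag(const char *tag) {
      char *fulltag = malloc(strlen(tag) + 2);   /* +2 for '=' and NUL */
      if (fulltag == NULL) return NULL;
      strcpy(fulltag, tag);
      strcat(fulltag, "=");
      return fulltag;
    }

    int main(void) {
      char *t = make_fulltag("ARTIST");
      if (t != NULL) {
        puts(t);   /* ARTIST= */
        free(t);   /* freed on every path, as in the patched functions */
      }
      return 0;
    }
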
@@ -198,6 +206,7 @@ void vorbis_info_clear(vorbis_info *vi){
static int _vorbis_unpack_info(vorbis_info *vi,oggpack_buffer *opb){
codec_setup_info *ci=vi->codec_setup;
+ int bs;
if(!ci)return(OV_EFAULT);
vi->version=oggpack_read(opb,32);
@@ -206,12 +215,16 @@ static int _vorbis_unpack_info(vorbis_info *vi,oggpack_buffer *opb){
vi->channels=oggpack_read(opb,8);
vi->rate=oggpack_read(opb,32);
- vi->bitrate_upper=oggpack_read(opb,32);
- vi->bitrate_nominal=oggpack_read(opb,32);
- vi->bitrate_lower=oggpack_read(opb,32);
+ vi->bitrate_upper=(ogg_int32_t)oggpack_read(opb,32);
+ vi->bitrate_nominal=(ogg_int32_t)oggpack_read(opb,32);
+ vi->bitrate_lower=(ogg_int32_t)oggpack_read(opb,32);
- ci->blocksizes[0]=1<<oggpack_read(opb,4);
- ci->blocksizes[1]=1<<oggpack_read(opb,4);
+ bs = oggpack_read(opb,4);
+ if(bs<0)goto err_out;
+ ci->blocksizes[0]=1<<bs;
+ bs = oggpack_read(opb,4);
+ if(bs<0)goto err_out;
+ ci->blocksizes[1]=1<<bs;
if(vi->rate<1)goto err_out;
if(vi->channels<1)goto err_out;
@@ -583,7 +596,8 @@ int vorbis_analysis_headerout(vorbis_dsp_state *v,
oggpack_buffer opb;
private_state *b=v->backend_state;
- if(!b||vi->channels<=0){
+ if(!b||vi->channels<=0||vi->channels>256){
+ b = NULL;
ret=OV_EFAULT;
goto err_out;
}
@@ -642,7 +656,7 @@ int vorbis_analysis_headerout(vorbis_dsp_state *v,
memset(op_code,0,sizeof(*op_code));
if(b){
- oggpack_writeclear(&opb);
+ if(vi->channels>0)oggpack_writeclear(&opb);
if(b->header)_ogg_free(b->header);
if(b->header1)_ogg_free(b->header1);
if(b->header2)_ogg_free(b->header2);
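
Two related hardening points in this file: the identity header stores the channel count in an 8-bit field, so counts the header cannot represent are now rejected before any header is built, and nulling b on that early-error path keeps the cleanup code below from clearing an opb that was never initialized. A tiny illustration of why the 8-bit field forces a bound (300 is an arbitrary out-of-range value):

    #include <stdio.h>

    int main(void) {
      int channels = 300;                              /* out of range */
      unsigned char field = (unsigned char)channels;   /* what an 8-bit header keeps */
      printf("%d -> %u\n", channels, field);           /* 300 -> 44: cannot round-trip */
      return 0;
    }
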
diff --git a/media/libvorbis/lib/vorbis_lookup.c b/media/libvorbis/lib/vorbis_lookup.c
index 3321ed3dbc..7cd01a44d3 100644
--- a/media/libvorbis/lib/vorbis_lookup.c
+++ b/media/libvorbis/lib/vorbis_lookup.c
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2009 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: lookup based functions
- last mod: $Id: lookup.c 16227 2009-07-08 06:58:46Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/vorbis_lpc.c b/media/libvorbis/lib/vorbis_lpc.c
index f5199ec235..877da47f8e 100644
--- a/media/libvorbis/lib/vorbis_lpc.c
+++ b/media/libvorbis/lib/vorbis_lpc.c
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2009 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: LPC low level routines
- last mod: $Id: lpc.c 16227 2009-07-08 06:58:46Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/vorbis_lsp.c b/media/libvorbis/lib/vorbis_lsp.c
index bf5ab861c9..a92882a131 100644
--- a/media/libvorbis/lib/vorbis_lsp.c
+++ b/media/libvorbis/lib/vorbis_lsp.c
@@ -6,19 +6,19 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2009 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: LSP (also called LSF) conversion routines
- last mod: $Id: lsp.c 19453 2015-03-02 22:35:34Z xiphmont $
The LSP generation code is taken (with minimal modification and a
few bugfixes) from "On the Computation of the LSP Frequencies" by
Joseph Rothweiler (see http://www.rothweiler.us for contact info).
+
The paper is available at:
- http://www.myown1.com/joe/lsf
+ https://web.archive.org/web/20110810174000/http://home.myfairpoint.net/vzenxj75/myown1/joe/lsf/index.html
********************************************************************/
diff --git a/media/libvorbis/lib/vorbis_mapping0.c b/media/libvorbis/lib/vorbis_mapping0.c
index 85c7d22d83..efa0fbcd93 100644
--- a/media/libvorbis/lib/vorbis_mapping0.c
+++ b/media/libvorbis/lib/vorbis_mapping0.c
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2010 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: channel mapping 0 implementation
- last mod: $Id: mapping0.c 19441 2015-01-21 01:17:41Z xiphmont $
********************************************************************/
@@ -93,7 +92,6 @@ static vorbis_info_mapping *mapping0_unpack(vorbis_info *vi,oggpack_buffer *opb)
int i,b;
vorbis_info_mapping0 *info=_ogg_calloc(1,sizeof(*info));
codec_setup_info *ci=vi->codec_setup;
- memset(info,0,sizeof(*info));
if(vi->channels<=0)goto err_out;
b=oggpack_read(opb,1);
diff --git a/media/libvorbis/lib/vorbis_mdct.c b/media/libvorbis/lib/vorbis_mdct.c
index 0816331805..2a0ff8d01b 100644
--- a/media/libvorbis/lib/vorbis_mdct.c
+++ b/media/libvorbis/lib/vorbis_mdct.c
@@ -6,13 +6,12 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2009 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: normalized modified discrete cosine transform
power of two length transform only [64 <= n ]
- last mod: $Id: mdct.c 16227 2009-07-08 06:58:46Z xiphmont $
Original algorithm adapted long ago from _The use of multirate filter
banks for coding of high quality digital audio_, by T. Sporer,
diff --git a/media/libvorbis/lib/vorbis_psy.c b/media/libvorbis/lib/vorbis_psy.c
index f7a44c6d00..036b094aa7 100644
--- a/media/libvorbis/lib/vorbis_psy.c
+++ b/media/libvorbis/lib/vorbis_psy.c
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2010 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: psychoacoustics not including preecho
- last mod: $Id: psy.c 18077 2011-09-02 02:49:00Z giles $
********************************************************************/
@@ -600,11 +599,12 @@ static void bark_noise_hybridmp(int n,const long *b,
XY[i] = tXY;
}
- for (i = 0, x = 0.f;; i++, x += 1.f) {
+ for (i = 0, x = 0.f; i < n; i++, x += 1.f) {
lo = b[i] >> 16;
- if( lo>=0 ) break;
hi = b[i] & 0xffff;
+ if( lo>=0 || -lo>=n ) break;
+ if( hi>=n ) break;
tN = N[hi] + N[-lo];
tX = X[hi] - X[-lo];
@@ -616,17 +616,17 @@ static void bark_noise_hybridmp(int n,const long *b,
B = tN * tXY - tX * tY;
D = tN * tXX - tX * tX;
R = (A + x * B) / D;
- if (R < 0.f)
- R = 0.f;
+ if (R < 0.f) R = 0.f;
noise[i] = R - offset;
}
- for ( ;; i++, x += 1.f) {
+ for ( ; i < n; i++, x += 1.f) {
lo = b[i] >> 16;
hi = b[i] & 0xffff;
- if(hi>=n)break;
+ if( lo<0 || lo>=n ) break;
+ if( hi>=n ) break;
tN = N[hi] - N[lo];
tX = X[hi] - X[lo];
@@ -642,6 +642,7 @@ static void bark_noise_hybridmp(int n,const long *b,
noise[i] = R - offset;
}
+
for ( ; i < n; i++, x += 1.f) {
R = (A + x * B) / D;
@@ -652,10 +653,11 @@ static void bark_noise_hybridmp(int n,const long *b,
if (fixed <= 0) return;
- for (i = 0, x = 0.f;; i++, x += 1.f) {
+ for (i = 0, x = 0.f; i < n; i++, x += 1.f) {
hi = i + fixed / 2;
lo = hi - fixed;
- if(lo>=0)break;
+ if ( hi>=n ) break;
+ if ( lo>=0 ) break;
tN = N[hi] + N[-lo];
tX = X[hi] - X[-lo];
@@ -671,11 +673,12 @@ static void bark_noise_hybridmp(int n,const long *b,
if (R - offset < noise[i]) noise[i] = R - offset;
}
- for ( ;; i++, x += 1.f) {
+ for ( ; i < n; i++, x += 1.f) {
hi = i + fixed / 2;
lo = hi - fixed;
- if(hi>=n)break;
+ if ( hi>=n ) break;
+ if ( lo<0 ) break;
tN = N[hi] - N[lo];
tX = X[hi] - X[lo];
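
All four scan loops in bark_noise_hybridmp previously ran with no upper bound on i and trusted the packed index table b[], where each entry carries two indices as (lo << 16) | hi; a malformed table could walk b[], N[] and X[] out of bounds. The rewritten loops bound i by n and validate both unpacked indices before use. A minimal sketch of the unpack-and-validate step, assuming the same packing (sample values are hypothetical):

    #include <stdio.h>

    /* Unpack one b[] entry and check both indices against n before use.
       A negative lo mirrors: the code indexes N[-lo], so -lo must be < n too. */
    static int indices_ok(long bi, int n, int *lo, int *hi) {
      *lo = (int)(bi >> 16);
      *hi = (int)(bi & 0xffff);
      if (*hi >= n) return 0;
      if (*lo < 0) return -*lo < n;
      return *lo < n;
    }

    int main(void) {
      int lo, hi;
      printf("%d\n", indices_ok((5L << 16) | 9, 16, &lo, &hi));  /* 1: in range */
      printf("%d\n", indices_ok((5L << 16) | 99, 16, &lo, &hi)); /* 0: hi too big */
      return 0;
    }
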
diff --git a/media/libvorbis/lib/vorbis_registry.c b/media/libvorbis/lib/vorbis_registry.c
index 3961ed1403..db0f67b2e2 100644
--- a/media/libvorbis/lib/vorbis_registry.c
+++ b/media/libvorbis/lib/vorbis_registry.c
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2009 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: registry for time, floor, res backends and channel mappings
- last mod: $Id: registry.c 16227 2009-07-08 06:58:46Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/vorbis_res0.c b/media/libvorbis/lib/vorbis_res0.c
index ec11488c2f..c931aded38 100644
--- a/media/libvorbis/lib/vorbis_res0.c
+++ b/media/libvorbis/lib/vorbis_res0.c
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2010 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: residue backend 0, 1 and 2 implementation
- last mod: $Id: res0.c 19441 2015-01-21 01:17:41Z xiphmont $
********************************************************************/
@@ -31,9 +30,6 @@
#include "misc.h"
#include "os.h"
-//#define TRAIN_RES 1
-//#define TRAIN_RESAUX 1
-
#if defined(TRAIN_RES) || defined (TRAIN_RESAUX)
#include <stdio.h>
#endif
diff --git a/media/libvorbis/lib/vorbis_sharedbook.c b/media/libvorbis/lib/vorbis_sharedbook.c
index 6bfdf7311e..444f42b5aa 100644
--- a/media/libvorbis/lib/vorbis_sharedbook.c
+++ b/media/libvorbis/lib/vorbis_sharedbook.c
@@ -6,16 +6,16 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2015 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: basic shared codebook operations
- last mod: $Id: sharedbook.c 19457 2015-03-03 00:15:29Z giles $
********************************************************************/
#include <stdlib.h>
+#include <limits.h>
#include <math.h>
#include <string.h>
#include <ogg/ogg.h>
@@ -50,7 +50,7 @@ long _float32_pack(float val){
sign=0x80000000;
val= -val;
}
- exp= floor(log(val)/log(2.f)+.001); //+epsilon
+ exp= floor(log(val)/log(2.f)+.001); /* +epsilon */
mant=rint(ldexp(val,(VQ_FMAN-1)-exp));
exp=(exp+VQ_FEXP_BIAS)<<VQ_FMAN;
@@ -62,7 +62,15 @@ float _float32_unpack(long val){
int sign=val&0x80000000;
long exp =(val&0x7fe00000L)>>VQ_FMAN;
if(sign)mant= -mant;
- return(ldexp(mant,exp-(VQ_FMAN-1)-VQ_FEXP_BIAS));
+ exp=exp-(VQ_FMAN-1)-VQ_FEXP_BIAS;
+ /* clamp excessive exponent values */
+ if (exp>63){
+ exp=63;
+ }
+ if (exp<-63){
+ exp=-63;
+ }
+ return(ldexp(mant,exp));
}
/* given a list of word lengths, generate a list of codewords. Works
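
The _float32_unpack change above matters because the function feeds ldexp an exponent decoded straight from the bitstream; a crafted codebook can request an exponent far outside anything a valid stream produces, yielding infinities or denormal-range values downstream. Clamping to +/-63 bounds the result without affecting well-formed streams. A standalone sketch, with the constants assumed to match codebook.h:

    #include <math.h>
    #include <stdio.h>

    #define VQ_FMAN      21   /* mantissa bits, as in codebook.h */
    #define VQ_FEXP_BIAS 768  /* exponent bias, as in codebook.h */

    static float float32_unpack(long val) {
      double mant = val & 0x1fffff;
      long   exp  = (val & 0x7fe00000L) >> VQ_FMAN;
      if (val & 0x80000000L) mant = -mant;
      exp = exp - (VQ_FMAN - 1) - VQ_FEXP_BIAS;
      if (exp >  63) exp =  63;   /* clamp what a hostile stream can request */
      if (exp < -63) exp = -63;
      return (float)ldexp(mant, (int)exp);
    }

    int main(void) {
      printf("%g\n", float32_unpack(0x7fe00001L)); /* max exponent: clamped, finite */
      return 0;
    }
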
@@ -158,25 +166,34 @@ ogg_uint32_t *_make_words(char *l,long n,long sparsecount){
that's portable and totally safe against roundoff, but I haven't
thought of it. Therefore, we opt on the side of caution */
long _book_maptype1_quantvals(const static_codebook *b){
- long vals=floor(pow((float)b->entries,1.f/b->dim));
+ long vals;
+ if(b->entries<1){
+ return(0);
+ }
+ vals=floor(pow((float)b->entries,1.f/b->dim));
/* the above *should* be reliable, but we'll not assume that FP is
ever reliable when bitstream sync is at stake; verify via integer
means that vals really is the greatest value of dim for which
vals^b->bim <= b->entries */
/* treat the above as an initial guess */
+ if(vals<1){
+ vals=1;
+ }
while(1){
long acc=1;
long acc1=1;
int i;
for(i=0;i<b->dim;i++){
+ if(b->entries/vals<acc)break;
acc*=vals;
- acc1*=vals+1;
+ if(LONG_MAX/(vals+1)<acc1)acc1=LONG_MAX;
+ else acc1*=vals+1;
}
- if(acc<=b->entries && acc1>b->entries){
+ if(i>=b->dim && acc<=b->entries && acc1>b->entries){
return(vals);
}else{
- if(acc>b->entries){
+ if(i<b->dim || acc>b->entries){
vals--;
}else{
vals++;
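
_book_maptype1_quantvals verifies its floating-point guess by computing vals^dim in a loop; with hostile entries/dim values the products could overflow long, making the verification itself wrong. The patched loop rejects a multiply that would exceed entries by dividing first, and saturates the vals+1 product at LONG_MAX. A minimal sketch of the division-based overflow guard (assumes vals >= 1, which the patch now enforces):

    #include <stdio.h>

    /* Does vals^dim stay <= entries, without ever overflowing? */
    static int pow_le(long vals, int dim, long entries) {
      long acc = 1;
      int i;
      for (i = 0; i < dim; i++) {
        if (entries / vals < acc) return 0;  /* acc * vals would exceed entries */
        acc *= vals;
      }
      return acc <= entries;
    }

    int main(void) {
      printf("%d %d\n", pow_le(3, 4, 100), pow_le(3, 4, 80)); /* 1 0 (3^4 = 81) */
      return 0;
    }
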
@@ -285,7 +302,7 @@ int vorbis_book_init_encode(codebook *c,const static_codebook *s){
c->used_entries=s->entries;
c->dim=s->dim;
c->codelist=_make_words(s->lengthlist,s->entries,0);
- //c->valuelist=_book_unquantize(s,s->entries,NULL);
+ /* c->valuelist=_book_unquantize(s,s->entries,NULL); */
c->quantvals=_book_maptype1_quantvals(s);
c->minval=(int)rint(_float32_unpack(s->q_min));
c->delta=(int)rint(_float32_unpack(s->q_delta));
@@ -564,6 +581,7 @@ void run_test(static_codebook *b,float *comp){
exit(1);
}
}
+ _ogg_free(out);
}
int main(){
diff --git a/media/libvorbis/lib/vorbis_smallft.c b/media/libvorbis/lib/vorbis_smallft.c
index ae2bc41b6b..4ffabab4bb 100644
--- a/media/libvorbis/lib/vorbis_smallft.c
+++ b/media/libvorbis/lib/vorbis_smallft.c
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2009 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: *unnormalized* fft transform
- last mod: $Id: smallft.c 16227 2009-07-08 06:58:46Z xiphmont $
********************************************************************/
diff --git a/media/libvorbis/lib/vorbis_synthesis.c b/media/libvorbis/lib/vorbis_synthesis.c
index 932d271a63..3e2d681270 100644
--- a/media/libvorbis/lib/vorbis_synthesis.c
+++ b/media/libvorbis/lib/vorbis_synthesis.c
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2015 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: single-block PCM synthesis
- last mod: $Id: synthesis.c 19441 2015-01-21 01:17:41Z xiphmont $
********************************************************************/
@@ -117,7 +116,7 @@ int vorbis_synthesis_trackonly(vorbis_block *vb,ogg_packet *op){
if(!ci->mode_param[mode]){
return(OV_EBADPACKET);
}
-
+
vb->W=ci->mode_param[mode]->blockflag;
if(vb->W){
vb->lW=oggpack_read(opb,1);
diff --git a/media/libvorbis/lib/vorbis_window.c b/media/libvorbis/lib/vorbis_window.c
index 0305b79297..2151b278d1 100644
--- a/media/libvorbis/lib/vorbis_window.c
+++ b/media/libvorbis/lib/vorbis_window.c
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2009 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: window functions
- last mod: $Id: window.c 19028 2013-12-02 23:23:39Z tterribe $
********************************************************************/
diff --git a/media/libvorbis/lib/vorbisenc.c b/media/libvorbis/lib/vorbisenc.c
index b5d621e900..cf3806a6e1 100644
--- a/media/libvorbis/lib/vorbisenc.c
+++ b/media/libvorbis/lib/vorbisenc.c
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2015 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: simple programmatic interface for encoder mode setup
- last mod: $Id: vorbisenc.c 19457 2015-03-03 00:15:29Z giles $
********************************************************************/
@@ -685,6 +684,7 @@ int vorbis_encode_setup_init(vorbis_info *vi){
highlevel_encode_setup *hi=&ci->hi;
if(ci==NULL)return(OV_EINVAL);
+ if(vi->channels<1||vi->channels>255)return(OV_EINVAL);
if(!hi->impulse_block_p)i0=1;
/* too low/high an ATH floater is nonsensical, but doesn't break anything */
@@ -1211,7 +1211,7 @@ int vorbis_encode_ctl(vorbis_info *vi,int number,void *arg){
hi->req,
hi->managed,
&new_base);
- if(!hi->setup)return OV_EIMPL;
+ if(!new_template)return OV_EIMPL;
hi->setup=new_template;
hi->base_setting=new_base;
vorbis_encode_setup_setting(vi,vi->channels,vi->rate);
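
The old test in vorbis_encode_ctl checked hi->setup, which still held the previous (non-NULL) template, so a failed lookup was never reported and the NULL result was committed anyway. The fix tests the value actually being installed. A minimal sketch of the check-before-commit shape, with hl_state and the lookup result as stand-in names:

    #include <stddef.h>
    #include <stdio.h>

    typedef struct { const void *setup; double base_setting; } hl_state;

    /* Validate the *new* value before overwriting state; returning an error
       here plays the role of OV_EIMPL in the real code. */
    static int commit_setup(hl_state *hi, const void *new_template, double new_base) {
      if (new_template == NULL) return -1;
      hi->setup = new_template;
      hi->base_setting = new_base;
      return 0;
    }

    int main(void) {
      hl_state hi = { "old", 1.0 };
      printf("%d\n", commit_setup(&hi, NULL, 2.0)); /* -1: stale state preserved */
      return 0;
    }
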
diff --git a/media/libvorbis/lib/window.h b/media/libvorbis/lib/window.h
index 51f97599f5..33d83f85f9 100644
--- a/media/libvorbis/lib/window.h
+++ b/media/libvorbis/lib/window.h
@@ -6,12 +6,11 @@
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2007 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * by the Xiph.Org Foundation https://xiph.org/ *
* *
********************************************************************
function: window functions
- last mod: $Id: window.h 19028 2013-12-02 23:23:39Z tterribe $
********************************************************************/
diff --git a/media/libvorbis/todo.txt b/media/libvorbis/todo.txt
deleted file mode 100644
index b0e1f93cda..0000000000
--- a/media/libvorbis/todo.txt
+++ /dev/null
@@ -1,22 +0,0 @@
-Open project list for further development:
-
-libvorbis:
-
-Meaningful error code returns
-
-still some padding at EOS
-
-Option for brute-forcing vq search on maptype 2 (helps on undertrained
-sets).
-
-encoder switch interface for binary compat through changes; ioctl()-like?
-
-API changes:
- break up some of the more monolithic calls (eg, allow access
- to MDCT domain data, additional low level framing capability)
- convenience calls for text comments
-
-other:
-
-command line suite
-'crashme'
diff --git a/media/libvorbis/update.sh b/media/libvorbis/update.sh
index ae82a8d8b3..b2ad0b23cb 100644..100755
--- a/media/libvorbis/update.sh
+++ b/media/libvorbis/update.sh
@@ -1,4 +1,8 @@
-# Usage: /bin/sh update.sh <vorbis_src_directory>
+#!/bin/sh
+if test $# -ne 1; then
+ echo "Usage: /bin/sh update.sh <vorbis_src_directory>"
+ exit 1
+fi
#
# Copies the needed files from a directory containing the original
# libvorbis source that we need for the Mozilla HTML5 media support.
@@ -44,9 +48,9 @@ cp $1/lib/codebook.c ./lib/vorbis_codebook.c
cp $1/lib/bitrate.c ./lib/vorbis_bitrate.c
cp $1/lib/block.c ./lib/vorbis_block.c
cp $1/include/vorbis/codec.h ./include/vorbis/codec.h
-cp $1/todo.txt ./todo.txt
+cp $1/CHANGES ./CHANGES
cp $1/COPYING ./COPYING
-cp $1/README ./README
+cp $1/README.md ./README.md
cp $1/AUTHORS ./AUTHORS
# Encoder support
@@ -81,4 +85,4 @@ cp $1/lib/books/floor/floor_books.h ./lib/books/floor/
cp $1/lib/books/uncoupled/res_books_uncoupled.h ./lib/books/uncoupled/
# Add any patches against upstream here.
-# ...nothing to apply...
+
diff --git a/media/libwebp/AUTHORS b/media/libwebp/AUTHORS
index 0d70b7fb2a..8307c2099d 100644
--- a/media/libwebp/AUTHORS
+++ b/media/libwebp/AUTHORS
@@ -1,9 +1,15 @@
Contributors:
+- Aidan O'Loan (aidanol at gmail dot com)
- Alan Browning (browning at google dot com)
- Charles Munger (clm at google dot com)
+- Cheng Yi (cyi at google dot com)
- Christian Duvivier (cduvivier at google dot com)
+- Christopher Degawa (ccom at randomderp dot com)
+- Clement Courbet (courbet at google dot com)
- Djordje Pesut (djordje dot pesut at imgtec dot com)
- Hui Su (huisu at google dot com)
+- Ilya Kurdyukov (jpegqs at gmail dot com)
+- Ingvar Stepanyan (rreverser at google dot com)
- James Zern (jzern at google dot com)
- Jan Engelhardt (jengelh at medozas dot de)
- Jehan (jehan at girinstud dot io)
@@ -20,11 +26,13 @@ Contributors:
- Mislav Bradac (mislavm at google dot com)
- Nico Weber (thakis at chromium dot org)
- Noel Chromium (noel at chromium dot org)
+- Oliver Wolff (oliver dot wolff at qt dot io)
- Owen Rodley (orodley at google dot com)
- Parag Salasakar (img dot mips1 at gmail dot com)
- Pascal Massimino (pascal dot massimino at gmail dot com)
- Paweł Hajdan, Jr (phajdan dot jr at chromium dot org)
- Pierre Joye (pierre dot php at gmail dot com)
+- Roberto Alanis (alanisbaez at google dot com)
- Sam Clegg (sbc at chromium dot org)
- Scott Hancher (seh at google dot com)
- Scott LaVarnway (slavarnway at google dot com)
@@ -38,5 +46,7 @@ Contributors:
- Vikas Arora (vikasa at google dot com)
- Vincent Rabaud (vrabaud at google dot com)
- Vlad Tsyrklevich (vtsyrklevich at chromium dot org)
+- Wan-Teh Chang (wtc at google dot com)
- Yang Zhang (yang dot zhang at arm dot com)
- Yannis Guyon (yguyon at google dot com)
+- Zhi An Ng (zhin at chromium dot org)
diff --git a/media/libwebp/NEWS b/media/libwebp/NEWS
index aa393c819f..5b36c5cf30 100644
--- a/media/libwebp/NEWS
+++ b/media/libwebp/NEWS
@@ -1,3 +1,58 @@
+- 1/11/2022: version 1.2.2
+ This is a binary compatible release.
+ * webpmux: add "-set bgcolor A,R,G,B"
+ * add ARM64 NEON support for MSVC builds (#539)
+ * fix duplicate include error in Xcode when using multiple XCFrameworks in a
+ project (#542)
+ * doc updates and bug fixes (#538, #544, #548, #550)
+
+- 7/20/2021: version 1.2.1
+ This is a binary compatible release.
+ * minor lossless encoder improvements and x86 color conversion speed up
+ * add ARM64 simulator support to xcframeworkbuild.sh (#510)
+ * further security related hardening in libwebp & examples
+ (issues: #497, #508, #518)
+ (chromium: #1196480, #1196773, #1196775, #1196777, #1196778, #1196850)
+ (oss-fuzz: #28658, #28978)
+ * toolchain updates and bug fixes (#498, #501, #502, #504, #505, #506, #509,
+ #533)
+ * use more inclusive language within the source (#507)
+
+- 12/23/2020: version 1.2.0
+ * API changes:
+ - libwebp:
+ encode.h: add a qmin / qmax range for quality factor (cwebp adds -qrange)
+ * lossless encoder improvements
+ * SIMD support for Wasm builds
+ * add xcframeworkbuild.sh, supports Mac Catalyst builds
+ * import fuzzers from oss-fuzz & chromium (#409)
+ * webpmux: add an '-set loop <value>' option (#494)
+ * toolchain updates and bug fixes (#449, #463, #470, #475, #477, #478, #479,
+ #488, #491)
+
+- 12/18/2019: version 1.1.0
+ * API changes:
+ - libwebp:
+ WebPMalloc (issue #442)
+ - extras:
+ WebPUnmultiplyARGB
+ * alpha decode fix (issue #439)
+ * toolchain updates and bug fixes
+ (chromium: #1026858, #1027136, #1027409, #1028620, #1028716, #995200)
+ (oss-fuzz: #19430, #19447)
+
+- 7/4/2019: version 1.0.3
+ This is a binary compatible release.
+ * resize fixes for Nx1 sizes and the addition of non-opaque alpha values for
+ odd sizes (issues #418, #434)
+ * lossless encode/decode performance improvements
+ * lossy compression performance improvement at low quality levels with flat
+ content (issue #432)
+ * python swig files updated to support python 3
+ Tool updates:
+ vwebp will now preserve the aspect ratio of images that exceed monitor
+ resolution by scaling the image to fit (issue #433)
+
- 1/14/2019: version 1.0.2
This is a binary compatible release.
* (Windows) unicode file support in the tools (linux and mac already had
diff --git a/media/libwebp/README b/media/libwebp/README
index 502a4c1c20..f6eaf2c049 100644
--- a/media/libwebp/README
+++ b/media/libwebp/README
@@ -4,7 +4,7 @@
\__\__/\____/\_____/__/ ____ ___
/ _/ / \ \ / _ \/ _/
/ \_/ / / \ \ __/ \__
- \____/____/\_____/_____/____/v1.0.2
+ \____/____/\_____/_____/____/v1.2.2
Description:
============
@@ -13,13 +13,13 @@ WebP codec: library to encode and decode images in WebP format. This package
contains the library that can be used in other programs to add WebP support,
as well as the command line tools 'cwebp' and 'dwebp'.
-See http://developers.google.com/speed/webp
+See https://developers.google.com/speed/webp
The latest source tree is available at
https://chromium.googlesource.com/webm/libwebp
It is released under the same license as the WebM project.
-See http://www.webmproject.org/license/software/ or the
+See https://www.webmproject.org/license/software/ or the
"COPYING" file for details. An additional intellectual
property rights grant can be found in the file PATENTS.
@@ -113,7 +113,7 @@ make install
CMake:
------
-With CMake, you can compile libwebp, cwebp, dwebp, gif2web, img2webp, webpinfo
+With CMake, you can compile libwebp, cwebp, dwebp, gif2webp, img2webp, webpinfo
and the JS bindings.
Prerequisites:
@@ -225,6 +225,7 @@ Usage:
If input size (-s) for an image is not specified, it is
assumed to be a PNG, JPEG, TIFF or WebP file.
+Note: Animated PNG and WebP files are not supported.
Options:
-h / -help ............. short help
@@ -254,6 +255,8 @@ Options:
-partition_limit <int> . limit quality to fit the 512k limit on
the first partition (0=no degradation ... 100=full)
-pass <int> ............ analysis pass number (1..10)
+ -qrange <min> <max> .... specifies the permissible quality range
+ (default: 0 100)
-crop <x> <y> <w> <h> .. crop picture with the given rectangle
-resize <w> <h> ........ resize picture (after any cropping)
-mt .................... use multi-threading if available
@@ -294,6 +297,7 @@ Experimental Options:
-af .................... auto-adjust filter strength
-pre <int> ............. pre-processing filter
+
The main options you might want to try in order to further tune the
visual quality are:
-preset
@@ -341,7 +345,9 @@ The full list of options is available using -h:
> dwebp -h
Usage: dwebp in_file [options] [-o out_file]
-Decodes the WebP image file to PNG format [Default]
+Decodes the WebP image file to PNG format [Default].
+Note: Animated WebP files are not supported.
+
Use following options to convert into alternate image formats:
-pam ......... save the raw RGBA samples as a color PAM
-ppm ......... save the raw RGB samples as a color PPM
@@ -423,15 +429,15 @@ Prerequisites:
1) OpenGL & OpenGL Utility Toolkit (GLUT)
Linux:
$ sudo apt-get install freeglut3-dev mesa-common-dev
- Mac + XCode:
+ Mac + Xcode:
- These libraries should be available in the OpenGL / GLUT frameworks.
Windows:
http://freeglut.sourceforge.net/index.php#download
2) (Optional) qcms (Quick Color Management System)
i. Download qcms from Mozilla / Chromium:
- http://hg.mozilla.org/mozilla-central/file/0e7639e3bdfb/gfx/qcms
- http://src.chromium.org/viewvc/chrome/trunk/src/third_party/qcms
+ https://hg.mozilla.org/mozilla-central/file/0e7639e3bdfb/gfx/qcms
+ https://source.chromium.org/chromium/chromium/src/+/main:third_party/qcms/;drc=d4a2f8e1ed461d8fc05ed88d1ae2dc94c9773825
ii. Build and archive the source files as libqcms.a / qcms.lib
iii. Update makefile.unix / Makefile.vc
a) Define WEBP_HAVE_QCMS
@@ -450,7 +456,7 @@ modes, etc.
Usage:
- img2webp [file-level options] [image files...] [per-frame options...]
+ img2webp [file_options] [[frame_options] frame_file]...
File-level options (only used at the start of compression):
-min_size ............ minimize size
@@ -597,7 +603,7 @@ The encoding flow looks like:
// Setup a config, starting form a preset and tuning some additional
// parameters
WebPConfig config;
- if (!WebPConfigPreset(&config, WEBP_PRESET_PHOTO, quality_factor))
+ if (!WebPConfigPreset(&config, WEBP_PRESET_PHOTO, quality_factor)) {
return 0; // version error
}
// ... additional tuning
@@ -613,7 +619,7 @@ The encoding flow looks like:
pic.width = width;
pic.height = height;
// allocated picture of dimension width x height
- if (!WebPPictureAllocate(&pic)) {
+ if (!WebPPictureAlloc(&pic)) {
return 0; // memory error
}
// at this point, 'pic' has been initialized as a container,
@@ -780,10 +786,10 @@ Bugs:
Please report all bugs to the issue tracker:
https://bugs.chromium.org/p/webp
Patches welcome! See this page to get started:
- http://www.webmproject.org/code/contribute/submitting-patches/
+ https://www.webmproject.org/code/contribute/submitting-patches/
Discuss:
========
Email: webp-discuss@webmproject.org
-Web: http://groups.google.com/a/webmproject.org/group/webp-discuss
+Web: https://groups.google.com/a/webmproject.org/group/webp-discuss
diff --git a/media/libwebp/README.mux b/media/libwebp/README.mux
index 7e9c3c903b..099d8e061d 100644
--- a/media/libwebp/README.mux
+++ b/media/libwebp/README.mux
@@ -1,7 +1,7 @@
 __ __ ____ ____ ____ __ __ _ __ __
/ \\/ \/ _ \/ _ \/ _ \/ \ \/ \___/_ / _\
\ / __/ _ \ __/ / / (_/ /__
- \__\__/\_____/_____/__/ \__//_/\_____/__/___/v1.0.2
+ \__\__/\_____/_____/__/ \__//_/\_____/__/___/v1.2.2
Description:
@@ -43,10 +43,12 @@ GET_OPTIONS:
frame n get nth frame
SET_OPTIONS:
- Set color profile/metadata:
- icc file.icc set ICC profile
- exif file.exif set EXIF metadata
- xmp file.xmp set XMP metadata
+ Set color profile/metadata/parameters:
+ loop LOOP_COUNT set the loop count
+ bgcolor BACKGROUND_COLOR set the animation background color
+ icc file.icc set ICC profile
+ exif file.exif set EXIF metadata
+ xmp file.xmp set XMP metadata
where: 'file.icc' contains the ICC profile to be set,
'file.exif' contains the EXIF metadata to be set
'file.xmp' contains the XMP metadata to be set
@@ -247,10 +249,10 @@ Bugs:
Please report all bugs to the issue tracker:
https://bugs.chromium.org/p/webp
Patches welcome! See this page to get started:
- http://www.webmproject.org/code/contribute/submitting-patches/
+ https://www.webmproject.org/code/contribute/submitting-patches/
Discuss:
========
Email: webp-discuss@webmproject.org
-Web: http://groups.google.com/a/webmproject.org/group/webp-discuss
+Web: https://groups.google.com/a/webmproject.org/group/webp-discuss
diff --git a/media/libwebp/UXPCHANGES b/media/libwebp/UXPCHANGES
index 78b7823c8d..bf1fe22d6e 100644
--- a/media/libwebp/UXPCHANGES
+++ b/media/libwebp/UXPCHANGES
@@ -3,3 +3,4 @@ Changes made to pristine libwebp source by Moonchild Productions and mozilla.org
2017/01/27 -- Synced with libwebp-0.6.0 (BZ #1294490).
2018/06/29 -- Synced with libwebp-1.0.0 + BUG=webp:381,383,384.
2019/01/21 -- Synced with libwebp-1.0.2
+2022/06/26 -- Synced with libwebp-1.2.2
diff --git a/media/libwebp/dec/alpha_dec.c b/media/libwebp/dec/alpha_dec.c
index 1ff7c62d8b..52c24037e4 100644
--- a/media/libwebp/dec/alpha_dec.c
+++ b/media/libwebp/dec/alpha_dec.c
@@ -183,7 +183,7 @@ const uint8_t* VP8DecompressAlphaRows(VP8Decoder* const dec,
assert(dec != NULL && io != NULL);
if (row < 0 || num_rows <= 0 || row + num_rows > height) {
- return NULL; // sanity check.
+ return NULL;
}
if (!dec->is_alpha_decoded_) {
diff --git a/media/libwebp/dec/buffer_dec.c b/media/libwebp/dec/buffer_dec.c
index d72d32b0a9..0f3eed2cfe 100644
--- a/media/libwebp/dec/buffer_dec.c
+++ b/media/libwebp/dec/buffer_dec.c
@@ -102,7 +102,7 @@ static VP8StatusCode AllocateBuffer(WebPDecBuffer* const buffer) {
int stride;
uint64_t size;
- if ((uint64_t)w * kModeBpp[mode] >= (1ull << 32)) {
+ if ((uint64_t)w * kModeBpp[mode] >= (1ull << 31)) {
return VP8_STATUS_INVALID_PARAM;
}
stride = w * kModeBpp[mode];
@@ -117,7 +117,6 @@ static VP8StatusCode AllocateBuffer(WebPDecBuffer* const buffer) {
}
total_size = size + 2 * uv_size + a_size;
- // Security/sanity checks
output = (uint8_t*)WebPSafeMalloc(total_size, sizeof(*output));
if (output == NULL) {
return VP8_STATUS_OUT_OF_MEMORY;
@@ -156,11 +155,11 @@ VP8StatusCode WebPFlipBuffer(WebPDecBuffer* const buffer) {
}
if (WebPIsRGBMode(buffer->colorspace)) {
WebPRGBABuffer* const buf = &buffer->u.RGBA;
- buf->rgba += (buffer->height - 1) * buf->stride;
+ buf->rgba += (int64_t)(buffer->height - 1) * buf->stride;
buf->stride = -buf->stride;
} else {
WebPYUVABuffer* const buf = &buffer->u.YUVA;
- const int H = buffer->height;
+ const int64_t H = buffer->height;
buf->y += (H - 1) * buf->y_stride;
buf->y_stride = -buf->y_stride;
buf->u += ((H - 1) >> 1) * buf->u_stride;
@@ -188,8 +187,7 @@ VP8StatusCode WebPAllocateDecBuffer(int width, int height,
const int ch = options->crop_height;
const int x = options->crop_left & ~1;
const int y = options->crop_top & ~1;
- if (x < 0 || y < 0 || cw <= 0 || ch <= 0 ||
- x + cw > width || y + ch > height) {
+ if (!WebPCheckCropDimensions(width, height, x, y, cw, ch)) {
return VP8_STATUS_INVALID_PARAM; // out of frame boundary.
}
width = cw;
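
In buffer_dec.c the stride is a signed int and WebPFlipBuffer negates it, so a row byte count of 2^31 would already overflow; the bound therefore tightens from 2^32 to 2^31, and the flip offsets are computed in 64 bits. A tiny sketch of the tightened check (4 bytes per pixel stands in for kModeBpp[mode]):

    #include <stdint.h>
    #include <stdio.h>

    /* Row size must fit a signed 32-bit int, since the stride can be negated. */
    static int stride_ok(uint64_t width, uint64_t bytes_per_pixel) {
      return width * bytes_per_pixel < (1ull << 31);
    }

    int main(void) {
      printf("%d\n", stride_ok(0x20000000ull, 4)); /* 2^29 * 4 = 2^31: rejected */
      printf("%d\n", stride_ok(0x1fffffffull, 4)); /* just under: accepted */
      return 0;
    }
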
diff --git a/media/libwebp/dec/frame_dec.c b/media/libwebp/dec/frame_dec.c
index 3d1d662746..d4cdc15344 100644
--- a/media/libwebp/dec/frame_dec.c
+++ b/media/libwebp/dec/frame_dec.c
@@ -705,7 +705,7 @@ static int AllocateMemory(VP8Decoder* const dec) {
+ cache_size + alpha_size + WEBP_ALIGN_CST;
uint8_t* mem;
- if (needed != (size_t)needed) return 0; // check for overflow
+ if (!CheckSizeOverflow(needed)) return 0; // check for overflow
if (needed > dec->mem_size_) {
WebPSafeFree(dec->mem_);
dec->mem_size_ = 0;
@@ -732,7 +732,7 @@ static int AllocateMemory(VP8Decoder* const dec) {
mem += f_info_size;
dec->thread_ctx_.id_ = 0;
dec->thread_ctx_.f_info_ = dec->f_info_;
- if (dec->mt_method_ > 0) {
+ if (dec->filter_type_ > 0 && dec->mt_method_ > 0) {
// secondary cache line. The deblocking process need to make use of the
// filtering strength from previous macroblock row, while the new ones
// are being decoded in parallel. We'll just swap the pointers.
diff --git a/media/libwebp/dec/idec_dec.c b/media/libwebp/dec/idec_dec.c
index ee0d33eac4..3a592d59ed 100644
--- a/media/libwebp/dec/idec_dec.c
+++ b/media/libwebp/dec/idec_dec.c
@@ -166,9 +166,11 @@ static int AppendToMemBuffer(WebPIDecoder* const idec,
VP8Decoder* const dec = (VP8Decoder*)idec->dec_;
MemBuffer* const mem = &idec->mem_;
const int need_compressed_alpha = NeedCompressedAlpha(idec);
- const uint8_t* const old_start = mem->buf_ + mem->start_;
+ const uint8_t* const old_start =
+ (mem->buf_ == NULL) ? NULL : mem->buf_ + mem->start_;
const uint8_t* const old_base =
need_compressed_alpha ? dec->alpha_data_ : old_start;
+ assert(mem->buf_ != NULL || mem->start_ == 0);
assert(mem->mode_ == MEM_MODE_APPEND);
if (data_size > MAX_CHUNK_PAYLOAD) {
// security safeguard: trying to allocate more than what the format
@@ -184,7 +186,7 @@ static int AppendToMemBuffer(WebPIDecoder* const idec,
uint8_t* const new_buf =
(uint8_t*)WebPSafeMalloc(extra_size, sizeof(*new_buf));
if (new_buf == NULL) return 0;
- memcpy(new_buf, old_base, current_size);
+ if (old_base != NULL) memcpy(new_buf, old_base, current_size);
WebPSafeFree(mem->buf_);
mem->buf_ = new_buf;
mem->buf_size_ = (size_t)extra_size;
@@ -192,6 +194,7 @@ static int AppendToMemBuffer(WebPIDecoder* const idec,
mem->end_ = current_size;
}
+ assert(mem->buf_ != NULL);
memcpy(mem->buf_ + mem->end_, data, data_size);
mem->end_ += data_size;
assert(mem->end_ <= mem->buf_size_);
@@ -204,7 +207,9 @@ static int RemapMemBuffer(WebPIDecoder* const idec,
const uint8_t* const data, size_t data_size) {
MemBuffer* const mem = &idec->mem_;
const uint8_t* const old_buf = mem->buf_;
- const uint8_t* const old_start = old_buf + mem->start_;
+ const uint8_t* const old_start =
+ (old_buf == NULL) ? NULL : old_buf + mem->start_;
+ assert(old_buf != NULL || mem->start_ == 0);
assert(mem->mode_ == MEM_MODE_MAP);
if (data_size < mem->buf_size_) return 0; // can't remap to a shorter buffer!
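
Before any data arrives, mem->buf_ is NULL, and NULL + mem->start_ is undefined behavior in C even when the offset is zero (UBSan flags it); the same applied to the memcpy from a NULL source. The guards above compute the derived pointer only when the base exists. A minimal sketch of the guarded-base pattern:

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Never form base + offset when base may be NULL. */
    static const uint8_t *start_ptr(const uint8_t *buf, size_t start) {
      assert(buf != NULL || start == 0);
      return (buf == NULL) ? NULL : buf + start;
    }

    int main(void) {
      uint8_t data[8] = {0};
      printf("%p\n", (const void *)start_ptr(NULL, 0)); /* NULL, no UB */
      printf("%p\n", (const void *)start_ptr(data, 4));
      return 0;
    }
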
diff --git a/media/libwebp/dec/io_dec.c b/media/libwebp/dec/io_dec.c
index 0edd9f526e..6124c61393 100644
--- a/media/libwebp/dec/io_dec.c
+++ b/media/libwebp/dec/io_dec.c
@@ -25,21 +25,16 @@
static int EmitYUV(const VP8Io* const io, WebPDecParams* const p) {
WebPDecBuffer* output = p->output;
const WebPYUVABuffer* const buf = &output->u.YUVA;
- uint8_t* const y_dst = buf->y + io->mb_y * buf->y_stride;
- uint8_t* const u_dst = buf->u + (io->mb_y >> 1) * buf->u_stride;
- uint8_t* const v_dst = buf->v + (io->mb_y >> 1) * buf->v_stride;
+ uint8_t* const y_dst = buf->y + (size_t)io->mb_y * buf->y_stride;
+ uint8_t* const u_dst = buf->u + (size_t)(io->mb_y >> 1) * buf->u_stride;
+ uint8_t* const v_dst = buf->v + (size_t)(io->mb_y >> 1) * buf->v_stride;
const int mb_w = io->mb_w;
const int mb_h = io->mb_h;
const int uv_w = (mb_w + 1) / 2;
const int uv_h = (mb_h + 1) / 2;
- int j;
- for (j = 0; j < mb_h; ++j) {
- memcpy(y_dst + j * buf->y_stride, io->y + j * io->y_stride, mb_w);
- }
- for (j = 0; j < uv_h; ++j) {
- memcpy(u_dst + j * buf->u_stride, io->u + j * io->uv_stride, uv_w);
- memcpy(v_dst + j * buf->v_stride, io->v + j * io->uv_stride, uv_w);
- }
+ WebPCopyPlane(io->y, io->y_stride, y_dst, buf->y_stride, mb_w, mb_h);
+ WebPCopyPlane(io->u, io->uv_stride, u_dst, buf->u_stride, uv_w, uv_h);
+ WebPCopyPlane(io->v, io->uv_stride, v_dst, buf->v_stride, uv_w, uv_h);
return io->mb_h;
}
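
The recurring (size_t) casts throughout this file all fix the same latent bug: io->mb_y * buf->y_stride was an int-by-int multiply, which overflows before the pointer addition ever sees a wide type once the row offset passes INT_MAX on very large images. Casting one operand first makes the whole product pointer-width. (The WebPCopyPlane change is simply deduplication of the row-copy loops.) A tiny illustration with made-up dimensions:

    #include <stddef.h>
    #include <stdio.h>

    int main(void) {
      int row = 70000, stride = 40000;       /* hypothetical large image */
      /* (size_t)(row * stride) would overflow the int multiply first (UB); */
      size_t off = (size_t)row * stride;     /* 64-bit product on LP64 */
      printf("%zu\n", off);                  /* 2800000000 */
      return 0;
    }
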
@@ -47,7 +42,7 @@ static int EmitYUV(const VP8Io* const io, WebPDecParams* const p) {
static int EmitSampledRGB(const VP8Io* const io, WebPDecParams* const p) {
WebPDecBuffer* const output = p->output;
WebPRGBABuffer* const buf = &output->u.RGBA;
- uint8_t* const dst = buf->rgba + io->mb_y * buf->stride;
+ uint8_t* const dst = buf->rgba + (size_t)io->mb_y * buf->stride;
WebPSamplerProcessPlane(io->y, io->y_stride,
io->u, io->v, io->uv_stride,
dst, buf->stride, io->mb_w, io->mb_h,
@@ -62,7 +57,7 @@ static int EmitSampledRGB(const VP8Io* const io, WebPDecParams* const p) {
static int EmitFancyRGB(const VP8Io* const io, WebPDecParams* const p) {
int num_lines_out = io->mb_h; // a priori guess
const WebPRGBABuffer* const buf = &p->output->u.RGBA;
- uint8_t* dst = buf->rgba + io->mb_y * buf->stride;
+ uint8_t* dst = buf->rgba + (size_t)io->mb_y * buf->stride;
WebPUpsampleLinePairFunc upsample = WebPUpsamplers[p->output->colorspace];
const uint8_t* cur_y = io->y;
const uint8_t* cur_u = io->u;
@@ -133,7 +128,7 @@ static int EmitAlphaYUV(const VP8Io* const io, WebPDecParams* const p,
const WebPYUVABuffer* const buf = &p->output->u.YUVA;
const int mb_w = io->mb_w;
const int mb_h = io->mb_h;
- uint8_t* dst = buf->a + io->mb_y * buf->a_stride;
+ uint8_t* dst = buf->a + (size_t)io->mb_y * buf->a_stride;
int j;
(void)expected_num_lines_out;
assert(expected_num_lines_out == mb_h);
@@ -186,7 +181,7 @@ static int EmitAlphaRGB(const VP8Io* const io, WebPDecParams* const p,
(colorspace == MODE_ARGB || colorspace == MODE_Argb);
const WebPRGBABuffer* const buf = &p->output->u.RGBA;
int num_rows;
- const int start_y = GetAlphaSourceRow(io, &alpha, &num_rows);
+ const size_t start_y = GetAlphaSourceRow(io, &alpha, &num_rows);
uint8_t* const base_rgba = buf->rgba + start_y * buf->stride;
uint8_t* const dst = base_rgba + (alpha_first ? 0 : 3);
const int has_alpha = WebPDispatchAlpha(alpha, io->width, mb_w,
@@ -210,7 +205,7 @@ static int EmitAlphaRGBA4444(const VP8Io* const io, WebPDecParams* const p,
const WEBP_CSP_MODE colorspace = p->output->colorspace;
const WebPRGBABuffer* const buf = &p->output->u.RGBA;
int num_rows;
- const int start_y = GetAlphaSourceRow(io, &alpha, &num_rows);
+ const size_t start_y = GetAlphaSourceRow(io, &alpha, &num_rows);
uint8_t* const base_rgba = buf->rgba + start_y * buf->stride;
#if (WEBP_SWAP_16BIT_CSP == 1)
uint8_t* alpha_dst = base_rgba;
@@ -276,9 +271,9 @@ static int EmitRescaledYUV(const VP8Io* const io, WebPDecParams* const p) {
static int EmitRescaledAlphaYUV(const VP8Io* const io, WebPDecParams* const p,
int expected_num_lines_out) {
const WebPYUVABuffer* const buf = &p->output->u.YUVA;
- uint8_t* const dst_a = buf->a + p->last_y * buf->a_stride;
+ uint8_t* const dst_a = buf->a + (size_t)p->last_y * buf->a_stride;
if (io->a != NULL) {
- uint8_t* const dst_y = buf->y + p->last_y * buf->y_stride;
+ uint8_t* const dst_y = buf->y + (size_t)p->last_y * buf->y_stride;
const int num_lines_out = Rescale(io->a, io->width, io->mb_h, p->scaler_a);
assert(expected_num_lines_out == num_lines_out);
if (num_lines_out > 0) { // unmultiply the Y
@@ -303,46 +298,57 @@ static int InitYUVRescaler(const VP8Io* const io, WebPDecParams* const p) {
const int uv_out_height = (out_height + 1) >> 1;
const int uv_in_width = (io->mb_w + 1) >> 1;
const int uv_in_height = (io->mb_h + 1) >> 1;
- const size_t work_size = 2 * out_width; // scratch memory for luma rescaler
+ // scratch memory for luma rescaler
+ const size_t work_size = 2 * (size_t)out_width;
const size_t uv_work_size = 2 * uv_out_width; // and for each u/v ones
- size_t tmp_size, rescaler_size;
+ uint64_t total_size;
+ size_t rescaler_size;
rescaler_t* work;
WebPRescaler* scalers;
const int num_rescalers = has_alpha ? 4 : 3;
- tmp_size = (work_size + 2 * uv_work_size) * sizeof(*work);
+ total_size = ((uint64_t)work_size + 2 * uv_work_size) * sizeof(*work);
if (has_alpha) {
- tmp_size += work_size * sizeof(*work);
+ total_size += (uint64_t)work_size * sizeof(*work);
}
rescaler_size = num_rescalers * sizeof(*p->scaler_y) + WEBP_ALIGN_CST;
+ total_size += rescaler_size;
+ if (!CheckSizeOverflow(total_size)) {
+ return 0;
+ }
- p->memory = WebPSafeMalloc(1ULL, tmp_size + rescaler_size);
+ p->memory = WebPSafeMalloc(1ULL, (size_t)total_size);
if (p->memory == NULL) {
return 0; // memory error
}
work = (rescaler_t*)p->memory;
- scalers = (WebPRescaler*)WEBP_ALIGN((const uint8_t*)work + tmp_size);
+ scalers = (WebPRescaler*)WEBP_ALIGN(
+ (const uint8_t*)work + total_size - rescaler_size);
p->scaler_y = &scalers[0];
p->scaler_u = &scalers[1];
p->scaler_v = &scalers[2];
p->scaler_a = has_alpha ? &scalers[3] : NULL;
- WebPRescalerInit(p->scaler_y, io->mb_w, io->mb_h,
- buf->y, out_width, out_height, buf->y_stride, 1,
- work);
- WebPRescalerInit(p->scaler_u, uv_in_width, uv_in_height,
- buf->u, uv_out_width, uv_out_height, buf->u_stride, 1,
- work + work_size);
- WebPRescalerInit(p->scaler_v, uv_in_width, uv_in_height,
- buf->v, uv_out_width, uv_out_height, buf->v_stride, 1,
- work + work_size + uv_work_size);
+ if (!WebPRescalerInit(p->scaler_y, io->mb_w, io->mb_h,
+ buf->y, out_width, out_height, buf->y_stride, 1,
+ work) ||
+ !WebPRescalerInit(p->scaler_u, uv_in_width, uv_in_height,
+ buf->u, uv_out_width, uv_out_height, buf->u_stride, 1,
+ work + work_size) ||
+ !WebPRescalerInit(p->scaler_v, uv_in_width, uv_in_height,
+ buf->v, uv_out_width, uv_out_height, buf->v_stride, 1,
+ work + work_size + uv_work_size)) {
+ return 0;
+ }
p->emit = EmitRescaledYUV;
if (has_alpha) {
- WebPRescalerInit(p->scaler_a, io->mb_w, io->mb_h,
- buf->a, out_width, out_height, buf->a_stride, 1,
- work + work_size + 2 * uv_work_size);
+ if (!WebPRescalerInit(p->scaler_a, io->mb_w, io->mb_h,
+ buf->a, out_width, out_height, buf->a_stride, 1,
+ work + work_size + 2 * uv_work_size)) {
+ return 0;
+ }
p->emit_alpha = EmitRescaledAlphaYUV;
WebPInitAlphaProcessing();
}
@@ -356,7 +362,7 @@ static int ExportRGB(WebPDecParams* const p, int y_pos) {
const WebPYUV444Converter convert =
WebPYUV444Converters[p->output->colorspace];
const WebPRGBABuffer* const buf = &p->output->u.RGBA;
- uint8_t* dst = buf->rgba + y_pos * buf->stride;
+ uint8_t* dst = buf->rgba + (size_t)y_pos * buf->stride;
int num_lines_out = 0;
// For RGB rescaling, because of the YUV420, current scan position
// U/V can be +1/-1 line from the Y one. Hence the double test.
@@ -383,15 +389,15 @@ static int EmitRescaledRGB(const VP8Io* const io, WebPDecParams* const p) {
while (j < mb_h) {
const int y_lines_in =
WebPRescalerImport(p->scaler_y, mb_h - j,
- io->y + j * io->y_stride, io->y_stride);
+ io->y + (size_t)j * io->y_stride, io->y_stride);
j += y_lines_in;
if (WebPRescaleNeededLines(p->scaler_u, uv_mb_h - uv_j)) {
- const int u_lines_in =
- WebPRescalerImport(p->scaler_u, uv_mb_h - uv_j,
- io->u + uv_j * io->uv_stride, io->uv_stride);
- const int v_lines_in =
- WebPRescalerImport(p->scaler_v, uv_mb_h - uv_j,
- io->v + uv_j * io->uv_stride, io->uv_stride);
+ const int u_lines_in = WebPRescalerImport(
+ p->scaler_u, uv_mb_h - uv_j, io->u + (size_t)uv_j * io->uv_stride,
+ io->uv_stride);
+ const int v_lines_in = WebPRescalerImport(
+ p->scaler_v, uv_mb_h - uv_j, io->v + (size_t)uv_j * io->uv_stride,
+ io->uv_stride);
(void)v_lines_in; // remove a gcc warning
assert(u_lines_in == v_lines_in);
uv_j += u_lines_in;
@@ -403,7 +409,7 @@ static int EmitRescaledRGB(const VP8Io* const io, WebPDecParams* const p) {
static int ExportAlpha(WebPDecParams* const p, int y_pos, int max_lines_out) {
const WebPRGBABuffer* const buf = &p->output->u.RGBA;
- uint8_t* const base_rgba = buf->rgba + y_pos * buf->stride;
+ uint8_t* const base_rgba = buf->rgba + (size_t)y_pos * buf->stride;
const WEBP_CSP_MODE colorspace = p->output->colorspace;
const int alpha_first =
(colorspace == MODE_ARGB || colorspace == MODE_Argb);
@@ -431,7 +437,7 @@ static int ExportAlpha(WebPDecParams* const p, int y_pos, int max_lines_out) {
static int ExportAlphaRGBA4444(WebPDecParams* const p, int y_pos,
int max_lines_out) {
const WebPRGBABuffer* const buf = &p->output->u.RGBA;
- uint8_t* const base_rgba = buf->rgba + y_pos * buf->stride;
+ uint8_t* const base_rgba = buf->rgba + (size_t)y_pos * buf->stride;
#if (WEBP_SWAP_16BIT_CSP == 1)
uint8_t* alpha_dst = base_rgba;
#else
@@ -470,7 +476,7 @@ static int EmitRescaledAlphaRGB(const VP8Io* const io, WebPDecParams* const p,
int lines_left = expected_num_out_lines;
const int y_end = p->last_y + lines_left;
while (lines_left > 0) {
- const int row_offset = scaler->src_y - io->mb_y;
+ const int64_t row_offset = (int64_t)scaler->src_y - io->mb_y;
WebPRescalerImport(scaler, io->mb_h + io->mb_y - scaler->src_y,
io->a + row_offset * io->width, io->width);
lines_left -= p->emit_alpha_row(p, y_end - lines_left, lines_left);
@@ -485,51 +491,58 @@ static int InitRGBRescaler(const VP8Io* const io, WebPDecParams* const p) {
const int out_height = io->scaled_height;
const int uv_in_width = (io->mb_w + 1) >> 1;
const int uv_in_height = (io->mb_h + 1) >> 1;
- const size_t work_size = 2 * out_width; // scratch memory for one rescaler
+ // scratch memory for one rescaler
+ const size_t work_size = 2 * (size_t)out_width;
rescaler_t* work; // rescalers work area
uint8_t* tmp; // tmp storage for scaled YUV444 samples before RGB conversion
- size_t tmp_size1, tmp_size2, total_size, rescaler_size;
+ uint64_t tmp_size1, tmp_size2, total_size;
+ size_t rescaler_size;
WebPRescaler* scalers;
const int num_rescalers = has_alpha ? 4 : 3;
- tmp_size1 = 3 * work_size;
- tmp_size2 = 3 * out_width;
- if (has_alpha) {
- tmp_size1 += work_size;
- tmp_size2 += out_width;
- }
+ tmp_size1 = (uint64_t)num_rescalers * work_size;
+ tmp_size2 = (uint64_t)num_rescalers * out_width;
total_size = tmp_size1 * sizeof(*work) + tmp_size2 * sizeof(*tmp);
rescaler_size = num_rescalers * sizeof(*p->scaler_y) + WEBP_ALIGN_CST;
+ total_size += rescaler_size;
+ if (!CheckSizeOverflow(total_size)) {
+ return 0;
+ }
- p->memory = WebPSafeMalloc(1ULL, total_size + rescaler_size);
+ p->memory = WebPSafeMalloc(1ULL, (size_t)total_size);
if (p->memory == NULL) {
return 0; // memory error
}
work = (rescaler_t*)p->memory;
tmp = (uint8_t*)(work + tmp_size1);
- scalers = (WebPRescaler*)WEBP_ALIGN((const uint8_t*)work + total_size);
+ scalers = (WebPRescaler*)WEBP_ALIGN(
+ (const uint8_t*)work + total_size - rescaler_size);
p->scaler_y = &scalers[0];
p->scaler_u = &scalers[1];
p->scaler_v = &scalers[2];
p->scaler_a = has_alpha ? &scalers[3] : NULL;
- WebPRescalerInit(p->scaler_y, io->mb_w, io->mb_h,
- tmp + 0 * out_width, out_width, out_height, 0, 1,
- work + 0 * work_size);
- WebPRescalerInit(p->scaler_u, uv_in_width, uv_in_height,
- tmp + 1 * out_width, out_width, out_height, 0, 1,
- work + 1 * work_size);
- WebPRescalerInit(p->scaler_v, uv_in_width, uv_in_height,
- tmp + 2 * out_width, out_width, out_height, 0, 1,
- work + 2 * work_size);
+ if (!WebPRescalerInit(p->scaler_y, io->mb_w, io->mb_h,
+ tmp + 0 * out_width, out_width, out_height, 0, 1,
+ work + 0 * work_size) ||
+ !WebPRescalerInit(p->scaler_u, uv_in_width, uv_in_height,
+ tmp + 1 * out_width, out_width, out_height, 0, 1,
+ work + 1 * work_size) ||
+ !WebPRescalerInit(p->scaler_v, uv_in_width, uv_in_height,
+ tmp + 2 * out_width, out_width, out_height, 0, 1,
+ work + 2 * work_size)) {
+ return 0;
+ }
p->emit = EmitRescaledRGB;
WebPInitYUV444Converters();
if (has_alpha) {
- WebPRescalerInit(p->scaler_a, io->mb_w, io->mb_h,
- tmp + 3 * out_width, out_width, out_height, 0, 1,
- work + 3 * work_size);
+ if (!WebPRescalerInit(p->scaler_a, io->mb_w, io->mb_h,
+ tmp + 3 * out_width, out_width, out_height, 0, 1,
+ work + 3 * work_size)) {
+ return 0;
+ }
p->emit_alpha = EmitRescaledAlphaRGB;
if (p->output->colorspace == MODE_RGBA_4444 ||
p->output->colorspace == MODE_rgbA_4444) {
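
Both rescaler initializers now accumulate their scratch-buffer sizes in uint64_t, fold the aligned rescaler block into the same total, and verify the sum fits size_t before the single allocation; WebPRescalerInit additionally reports failure instead of returning void. A minimal sketch of the overflow-checked allocation (CheckSizeOverflow in the patch is essentially the comparison shown):

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Allocate only if the 64-bit total survives the round-trip to size_t. */
    static void *alloc_checked(uint64_t total) {
      if (total != (uint64_t)(size_t)total) return NULL; /* truncates on 32-bit */
      return malloc((size_t)total);
    }

    int main(void) {
      void *p = alloc_checked((uint64_t)1 << 20);
      printf("%s\n", p != NULL ? "ok" : "overflow/oom");
      free(p);
      return 0;
    }
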
diff --git a/media/libwebp/dec/quant_dec.c b/media/libwebp/dec/quant_dec.c
index 6ecaf1c453..351da5f561 100644
--- a/media/libwebp/dec/quant_dec.c
+++ b/media/libwebp/dec/quant_dec.c
@@ -61,12 +61,17 @@ static const uint16_t kAcTable[128] = {
void VP8ParseQuant(VP8Decoder* const dec) {
VP8BitReader* const br = &dec->br_;
- const int base_q0 = VP8GetValue(br, 7);
- const int dqy1_dc = VP8Get(br) ? VP8GetSignedValue(br, 4) : 0;
- const int dqy2_dc = VP8Get(br) ? VP8GetSignedValue(br, 4) : 0;
- const int dqy2_ac = VP8Get(br) ? VP8GetSignedValue(br, 4) : 0;
- const int dquv_dc = VP8Get(br) ? VP8GetSignedValue(br, 4) : 0;
- const int dquv_ac = VP8Get(br) ? VP8GetSignedValue(br, 4) : 0;
+ const int base_q0 = VP8GetValue(br, 7, "global-header");
+ const int dqy1_dc = VP8Get(br, "global-header") ?
+ VP8GetSignedValue(br, 4, "global-header") : 0;
+ const int dqy2_dc = VP8Get(br, "global-header") ?
+ VP8GetSignedValue(br, 4, "global-header") : 0;
+ const int dqy2_ac = VP8Get(br, "global-header") ?
+ VP8GetSignedValue(br, 4, "global-header") : 0;
+ const int dquv_dc = VP8Get(br, "global-header") ?
+ VP8GetSignedValue(br, 4, "global-header") : 0;
+ const int dquv_ac = VP8Get(br, "global-header") ?
+ VP8GetSignedValue(br, 4, "global-header") : 0;
const VP8SegmentHeader* const hdr = &dec->segment_hdr_;
int i;
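
The extra string argument added to every VP8Get/VP8GetValue/VP8GetBit/VP8GetSignedValue call here and in the files below is a section label for libwebp's bit-accounting build (the BITTRACE option); in normal builds the parameter compiles away, so the calls are otherwise unchanged. A simplified stand-in showing the shape (the real reader pulls the returned bits from the stream):

    #include <stdio.h>

    #define BITTRACE 1   /* 0 in normal builds */

    static unsigned GetValue(unsigned *pos, int bits, const char *label) {
    #if BITTRACE
      fprintf(stderr, "%-16s %d bits at offset %u\n", label, bits, *pos);
    #else
      (void)label;       /* label costs nothing when tracing is off */
    #endif
      *pos += bits;
      return 0;          /* stand-in: a real reader returns the decoded value */
    }

    int main(void) {
      unsigned pos = 0;
      GetValue(&pos, 7, "global-header");
      GetValue(&pos, 2, "global-header");
      return 0;
    }
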
diff --git a/media/libwebp/dec/tree_dec.c b/media/libwebp/dec/tree_dec.c
index 5818860254..b219cdd2c9 100644
--- a/media/libwebp/dec/tree_dec.c
+++ b/media/libwebp/dec/tree_dec.c
@@ -296,20 +296,21 @@ static void ParseIntraMode(VP8BitReader* const br,
// to decode more than 1 keyframe.
if (dec->segment_hdr_.update_map_) {
// Hardcoded tree parsing
- block->segment_ = !VP8GetBit(br, dec->proba_.segments_[0])
- ? VP8GetBit(br, dec->proba_.segments_[1])
- : 2 + VP8GetBit(br, dec->proba_.segments_[2]);
+ block->segment_ = !VP8GetBit(br, dec->proba_.segments_[0], "segments")
+ ? VP8GetBit(br, dec->proba_.segments_[1], "segments")
+ : VP8GetBit(br, dec->proba_.segments_[2], "segments") + 2;
} else {
block->segment_ = 0; // default for intra
}
- if (dec->use_skip_proba_) block->skip_ = VP8GetBit(br, dec->skip_p_);
+ if (dec->use_skip_proba_) block->skip_ = VP8GetBit(br, dec->skip_p_, "skip");
- block->is_i4x4_ = !VP8GetBit(br, 145); // decide for B_PRED first
+ block->is_i4x4_ = !VP8GetBit(br, 145, "block-size");
if (!block->is_i4x4_) {
// Hardcoded 16x16 intra-mode decision tree.
const int ymode =
- VP8GetBit(br, 156) ? (VP8GetBit(br, 128) ? TM_PRED : H_PRED)
- : (VP8GetBit(br, 163) ? V_PRED : DC_PRED);
+ VP8GetBit(br, 156, "pred-modes") ?
+ (VP8GetBit(br, 128, "pred-modes") ? TM_PRED : H_PRED) :
+ (VP8GetBit(br, 163, "pred-modes") ? V_PRED : DC_PRED);
block->imodes_[0] = ymode;
memset(top, ymode, 4 * sizeof(*top));
memset(left, ymode, 4 * sizeof(*left));
@@ -323,22 +324,25 @@ static void ParseIntraMode(VP8BitReader* const br,
const uint8_t* const prob = kBModesProba[top[x]][ymode];
#if (USE_GENERIC_TREE == 1)
// Generic tree-parsing
- int i = kYModesIntra4[VP8GetBit(br, prob[0])];
+ int i = kYModesIntra4[VP8GetBit(br, prob[0], "pred-modes")];
while (i > 0) {
- i = kYModesIntra4[2 * i + VP8GetBit(br, prob[i])];
+ i = kYModesIntra4[2 * i + VP8GetBit(br, prob[i], "pred-modes")];
}
ymode = -i;
#else
// Hardcoded tree parsing
- ymode = !VP8GetBit(br, prob[0]) ? B_DC_PRED :
- !VP8GetBit(br, prob[1]) ? B_TM_PRED :
- !VP8GetBit(br, prob[2]) ? B_VE_PRED :
- !VP8GetBit(br, prob[3]) ?
- (!VP8GetBit(br, prob[4]) ? B_HE_PRED :
- (!VP8GetBit(br, prob[5]) ? B_RD_PRED : B_VR_PRED)) :
- (!VP8GetBit(br, prob[6]) ? B_LD_PRED :
- (!VP8GetBit(br, prob[7]) ? B_VL_PRED :
- (!VP8GetBit(br, prob[8]) ? B_HD_PRED : B_HU_PRED)));
+ ymode = !VP8GetBit(br, prob[0], "pred-modes") ? B_DC_PRED :
+ !VP8GetBit(br, prob[1], "pred-modes") ? B_TM_PRED :
+ !VP8GetBit(br, prob[2], "pred-modes") ? B_VE_PRED :
+ !VP8GetBit(br, prob[3], "pred-modes") ?
+ (!VP8GetBit(br, prob[4], "pred-modes") ? B_HE_PRED :
+ (!VP8GetBit(br, prob[5], "pred-modes") ? B_RD_PRED
+ : B_VR_PRED)) :
+ (!VP8GetBit(br, prob[6], "pred-modes") ? B_LD_PRED :
+ (!VP8GetBit(br, prob[7], "pred-modes") ? B_VL_PRED :
+ (!VP8GetBit(br, prob[8], "pred-modes") ? B_HD_PRED
+ : B_HU_PRED))
+ );
#endif // USE_GENERIC_TREE
top[x] = ymode;
}
@@ -348,9 +352,9 @@ static void ParseIntraMode(VP8BitReader* const br,
}
}
// Hardcoded UVMode decision tree
- block->uvmode_ = !VP8GetBit(br, 142) ? DC_PRED
- : !VP8GetBit(br, 114) ? V_PRED
- : VP8GetBit(br, 183) ? TM_PRED : H_PRED;
+ block->uvmode_ = !VP8GetBit(br, 142, "pred-modes-uv") ? DC_PRED
+ : !VP8GetBit(br, 114, "pred-modes-uv") ? V_PRED
+ : VP8GetBit(br, 183, "pred-modes-uv") ? TM_PRED : H_PRED;
}
int VP8ParseIntraModeRow(VP8BitReader* const br, VP8Decoder* const dec) {
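
Each branch of the "hardcoded tree parsing" above walks one edge of a small decision tree, with a fixed probability per node (142, 114, 183 for the UV modes). A sketch of the same shape, assuming a hypothetical get_bit(prob) boolean decoder standing in for VP8GetBit:

    enum { MY_DC_PRED, MY_V_PRED, MY_H_PRED, MY_TM_PRED };

    /* Mirrors the uvmode_ tree: !bit(142) -> DC, else !bit(114) -> V,
     * else bit(183) picks TM over H. */
    static int decode_uv_mode(int (*get_bit)(int prob)) {
      if (!get_bit(142)) return MY_DC_PRED;
      if (!get_bit(114)) return MY_V_PRED;
      return get_bit(183) ? MY_TM_PRED : MY_H_PRED;
    }
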
@@ -514,8 +518,10 @@ void VP8ParseProba(VP8BitReader* const br, VP8Decoder* const dec) {
for (b = 0; b < NUM_BANDS; ++b) {
for (c = 0; c < NUM_CTX; ++c) {
for (p = 0; p < NUM_PROBAS; ++p) {
- const int v = VP8GetBit(br, CoeffsUpdateProba[t][b][c][p]) ?
- VP8GetValue(br, 8) : CoeffsProba0[t][b][c][p];
+ const int v =
+ VP8GetBit(br, CoeffsUpdateProba[t][b][c][p], "global-header") ?
+ VP8GetValue(br, 8, "global-header") :
+ CoeffsProba0[t][b][c][p];
proba->bands_[t][b].probas_[c][p] = v;
}
}
@@ -524,9 +530,8 @@ void VP8ParseProba(VP8BitReader* const br, VP8Decoder* const dec) {
proba->bands_ptr_[t][b] = &proba->bands_[t][kBands[b]];
}
}
- dec->use_skip_proba_ = VP8Get(br);
+ dec->use_skip_proba_ = VP8Get(br, "global-header");
if (dec->use_skip_proba_) {
- dec->skip_p_ = VP8GetValue(br, 8);
+ dec->skip_p_ = VP8GetValue(br, 8, "global-header");
}
}
-
diff --git a/media/libwebp/dec/vp8_dec.c b/media/libwebp/dec/vp8_dec.c
index e7958be6b0..5f51363e53 100644
--- a/media/libwebp/dec/vp8_dec.c
+++ b/media/libwebp/dec/vp8_dec.c
@@ -161,23 +161,26 @@ static int ParseSegmentHeader(VP8BitReader* br,
VP8SegmentHeader* hdr, VP8Proba* proba) {
assert(br != NULL);
assert(hdr != NULL);
- hdr->use_segment_ = VP8Get(br);
+ hdr->use_segment_ = VP8Get(br, "global-header");
if (hdr->use_segment_) {
- hdr->update_map_ = VP8Get(br);
- if (VP8Get(br)) { // update data
+ hdr->update_map_ = VP8Get(br, "global-header");
+ if (VP8Get(br, "global-header")) { // update data
int s;
- hdr->absolute_delta_ = VP8Get(br);
+ hdr->absolute_delta_ = VP8Get(br, "global-header");
for (s = 0; s < NUM_MB_SEGMENTS; ++s) {
- hdr->quantizer_[s] = VP8Get(br) ? VP8GetSignedValue(br, 7) : 0;
+ hdr->quantizer_[s] = VP8Get(br, "global-header") ?
+ VP8GetSignedValue(br, 7, "global-header") : 0;
}
for (s = 0; s < NUM_MB_SEGMENTS; ++s) {
- hdr->filter_strength_[s] = VP8Get(br) ? VP8GetSignedValue(br, 6) : 0;
+ hdr->filter_strength_[s] = VP8Get(br, "global-header") ?
+ VP8GetSignedValue(br, 6, "global-header") : 0;
}
}
if (hdr->update_map_) {
int s;
for (s = 0; s < MB_FEATURE_TREE_PROBS; ++s) {
- proba->segments_[s] = VP8Get(br) ? VP8GetValue(br, 8) : 255u;
+ proba->segments_[s] = VP8Get(br, "global-header") ?
+ VP8GetValue(br, 8, "global-header") : 255u;
}
}
} else {
@@ -205,7 +208,7 @@ static VP8StatusCode ParsePartitions(VP8Decoder* const dec,
size_t last_part;
size_t p;
- dec->num_parts_minus_one_ = (1 << VP8GetValue(br, 2)) - 1;
+ dec->num_parts_minus_one_ = (1 << VP8GetValue(br, 2, "global-header")) - 1;
last_part = dec->num_parts_minus_one_;
if (size < 3 * last_part) {
// we can't even read the sizes with sz[]! That's a failure.
@@ -229,21 +232,21 @@ static VP8StatusCode ParsePartitions(VP8Decoder* const dec,
// Paragraph 9.4
static int ParseFilterHeader(VP8BitReader* br, VP8Decoder* const dec) {
VP8FilterHeader* const hdr = &dec->filter_hdr_;
- hdr->simple_ = VP8Get(br);
- hdr->level_ = VP8GetValue(br, 6);
- hdr->sharpness_ = VP8GetValue(br, 3);
- hdr->use_lf_delta_ = VP8Get(br);
+ hdr->simple_ = VP8Get(br, "global-header");
+ hdr->level_ = VP8GetValue(br, 6, "global-header");
+ hdr->sharpness_ = VP8GetValue(br, 3, "global-header");
+ hdr->use_lf_delta_ = VP8Get(br, "global-header");
if (hdr->use_lf_delta_) {
- if (VP8Get(br)) { // update lf-delta?
+ if (VP8Get(br, "global-header")) { // update lf-delta?
int i;
for (i = 0; i < NUM_REF_LF_DELTAS; ++i) {
- if (VP8Get(br)) {
- hdr->ref_lf_delta_[i] = VP8GetSignedValue(br, 6);
+ if (VP8Get(br, "global-header")) {
+ hdr->ref_lf_delta_[i] = VP8GetSignedValue(br, 6, "global-header");
}
}
for (i = 0; i < NUM_MODE_LF_DELTAS; ++i) {
- if (VP8Get(br)) {
- hdr->mode_lf_delta_[i] = VP8GetSignedValue(br, 6);
+ if (VP8Get(br, "global-header")) {
+ hdr->mode_lf_delta_[i] = VP8GetSignedValue(br, 6, "global-header");
}
}
}
@@ -332,7 +335,7 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) {
io->scaled_width = io->width;
io->scaled_height = io->height;
- io->mb_w = io->width; // sanity check
+ io->mb_w = io->width; // for soundness
io->mb_h = io->height; // ditto
VP8ResetProba(&dec->proba_);
@@ -352,8 +355,8 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) {
buf_size -= frm_hdr->partition_length_;
if (frm_hdr->key_frame_) {
- pic_hdr->colorspace_ = VP8Get(br);
- pic_hdr->clamp_type_ = VP8Get(br);
+ pic_hdr->colorspace_ = VP8Get(br, "global-header");
+ pic_hdr->clamp_type_ = VP8Get(br, "global-header");
}
if (!ParseSegmentHeader(br, &dec->segment_hdr_, &dec->proba_)) {
return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR,
@@ -378,7 +381,7 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) {
"Not a key frame.");
}
- VP8Get(br); // ignore the value of update_proba_
+ VP8Get(br, "global-header"); // ignore the value of update_proba_
VP8ParseProba(br, dec);
@@ -400,31 +403,31 @@ static const uint8_t kZigzag[16] = {
0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
};
-// See section 13-2: http://tools.ietf.org/html/rfc6386#section-13.2
+// See section 13-2: https://datatracker.ietf.org/doc/html/rfc6386#section-13.2
static int GetLargeValue(VP8BitReader* const br, const uint8_t* const p) {
int v;
- if (!VP8GetBit(br, p[3])) {
- if (!VP8GetBit(br, p[4])) {
+ if (!VP8GetBit(br, p[3], "coeffs")) {
+ if (!VP8GetBit(br, p[4], "coeffs")) {
v = 2;
} else {
- v = 3 + VP8GetBit(br, p[5]);
+ v = 3 + VP8GetBit(br, p[5], "coeffs");
}
} else {
- if (!VP8GetBit(br, p[6])) {
- if (!VP8GetBit(br, p[7])) {
- v = 5 + VP8GetBit(br, 159);
+ if (!VP8GetBit(br, p[6], "coeffs")) {
+ if (!VP8GetBit(br, p[7], "coeffs")) {
+ v = 5 + VP8GetBit(br, 159, "coeffs");
} else {
- v = 7 + 2 * VP8GetBit(br, 165);
- v += VP8GetBit(br, 145);
+ v = 7 + 2 * VP8GetBit(br, 165, "coeffs");
+ v += VP8GetBit(br, 145, "coeffs");
}
} else {
const uint8_t* tab;
- const int bit1 = VP8GetBit(br, p[8]);
- const int bit0 = VP8GetBit(br, p[9 + bit1]);
+ const int bit1 = VP8GetBit(br, p[8], "coeffs");
+ const int bit0 = VP8GetBit(br, p[9 + bit1], "coeffs");
const int cat = 2 * bit1 + bit0;
v = 0;
for (tab = kCat3456[cat]; *tab; ++tab) {
- v += v + VP8GetBit(br, *tab);
+ v += v + VP8GetBit(br, *tab, "coeffs");
}
v += 3 + (8 << cat);
}
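
The `v += v + VP8GetBit(br, *tab)` loop in GetLargeValue() accumulates the category's extra bits MSB-first, each coded under its own probability from the zero-terminated kCat3456 row (RFC 6386, section 13.2). The same idiom in isolation, with get_bit again a hypothetical stand-in:

    /* Builds a value one coded bit at a time; probs is 0-terminated. */
    static int read_extra_bits(int (*get_bit)(int prob),
                               const unsigned char* probs) {
      int v = 0;
      for (; *probs != 0; ++probs) {
        v += v + get_bit(*probs);  /* i.e. v = (v << 1) | bit */
      }
      return v;
    }
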
@@ -438,24 +441,24 @@ static int GetCoeffsFast(VP8BitReader* const br,
int ctx, const quant_t dq, int n, int16_t* out) {
const uint8_t* p = prob[n]->probas_[ctx];
for (; n < 16; ++n) {
- if (!VP8GetBit(br, p[0])) {
+ if (!VP8GetBit(br, p[0], "coeffs")) {
return n; // previous coeff was last non-zero coeff
}
- while (!VP8GetBit(br, p[1])) { // sequence of zero coeffs
+ while (!VP8GetBit(br, p[1], "coeffs")) { // sequence of zero coeffs
p = prob[++n]->probas_[0];
if (n == 16) return 16;
}
{ // non zero coeff
const VP8ProbaArray* const p_ctx = &prob[n + 1]->probas_[0];
int v;
- if (!VP8GetBit(br, p[2])) {
+ if (!VP8GetBit(br, p[2], "coeffs")) {
v = 1;
p = p_ctx[1];
} else {
v = GetLargeValue(br, p);
p = p_ctx[2];
}
- out[kZigzag[n]] = VP8GetSigned(br, v) * dq[n > 0];
+ out[kZigzag[n]] = VP8GetSigned(br, v, "coeffs") * dq[n > 0];
}
}
return 16;
@@ -468,36 +471,34 @@ static int GetCoeffsAlt(VP8BitReader* const br,
int ctx, const quant_t dq, int n, int16_t* out) {
const uint8_t* p = prob[n]->probas_[ctx];
for (; n < 16; ++n) {
- if (!VP8GetBitAlt(br, p[0])) {
+ if (!VP8GetBitAlt(br, p[0], "coeffs")) {
return n; // previous coeff was last non-zero coeff
}
- while (!VP8GetBitAlt(br, p[1])) { // sequence of zero coeffs
+ while (!VP8GetBitAlt(br, p[1], "coeffs")) { // sequence of zero coeffs
p = prob[++n]->probas_[0];
if (n == 16) return 16;
}
{ // non zero coeff
const VP8ProbaArray* const p_ctx = &prob[n + 1]->probas_[0];
int v;
- if (!VP8GetBitAlt(br, p[2])) {
+ if (!VP8GetBitAlt(br, p[2], "coeffs")) {
v = 1;
p = p_ctx[1];
} else {
v = GetLargeValue(br, p);
p = p_ctx[2];
}
- out[kZigzag[n]] = VP8GetSigned(br, v) * dq[n > 0];
+ out[kZigzag[n]] = VP8GetSigned(br, v, "coeffs") * dq[n > 0];
}
}
return 16;
}
-static WEBP_TSAN_IGNORE_FUNCTION void InitGetCoeffs(void) {
- if (GetCoeffs == NULL) {
- if (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kSlowSSSE3)) {
- GetCoeffs = GetCoeffsAlt;
- } else {
- GetCoeffs = GetCoeffsFast;
- }
+WEBP_DSP_INIT_FUNC(InitGetCoeffs) {
+ if (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kSlowSSSE3)) {
+ GetCoeffs = GetCoeffsAlt;
+ } else {
+ GetCoeffs = GetCoeffsFast;
}
}
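
The switch to WEBP_DSP_INIT_FUNC replaces the lazy `if (GetCoeffs == NULL)` test, a racy-looking lazy initialization, with a guaranteed once-only setup. A minimal sketch of that guarantee using POSIX threads (the real macro is more portable than this; have_slow_ssse3() is a hypothetical CPU probe):

    #include <pthread.h>

    typedef int (*CoeffsFunc)(void);
    static CoeffsFunc get_coeffs;

    static int coeffs_fast(void) { return 0; }
    static int coeffs_alt(void)  { return 1; }
    static int have_slow_ssse3(void) { return 0; }  /* hypothetical probe */

    static void init_get_coeffs(void) {
      get_coeffs = have_slow_ssse3() ? coeffs_alt : coeffs_fast;
    }

    static pthread_once_t once = PTHREAD_ONCE_INIT;
    static void ensure_get_coeffs(void) { pthread_once(&once, init_get_coeffs); }
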
diff --git a/media/libwebp/dec/vp8i_dec.h b/media/libwebp/dec/vp8i_dec.h
index fabee44a0b..31d9080ca1 100644
--- a/media/libwebp/dec/vp8i_dec.h
+++ b/media/libwebp/dec/vp8i_dec.h
@@ -31,7 +31,7 @@ extern "C" {
// version numbers
#define DEC_MAJ_VERSION 1
-#define DEC_MIN_VERSION 0
+#define DEC_MIN_VERSION 2
#define DEC_REV_VERSION 2
// YUV-cache parameters. Cache is 32-bytes wide (= one cacheline).
diff --git a/media/libwebp/dec/vp8l_dec.c b/media/libwebp/dec/vp8l_dec.c
index 0502cb9a52..32371a67fe 100644
--- a/media/libwebp/dec/vp8l_dec.c
+++ b/media/libwebp/dec/vp8l_dec.c
@@ -84,7 +84,7 @@ static const uint8_t kCodeToPlane[CODE_TO_PLANE_CODES] = {
// to 256 (green component values) + 24 (length prefix values)
// + color_cache_size (between 0 and 2048).
// All values computed for 8-bit first level lookup with Mark Adler's tool:
-// http://www.hdfgroup.org/ftp/lib-external/zlib/zlib-1.2.5/examples/enough.c
+// https://github.com/madler/zlib/blob/v1.2.5/examples/enough.c
#define FIXED_TABLE_SIZE (630 * 3 + 410)
static const uint16_t kTableSize[12] = {
FIXED_TABLE_SIZE + 654,
@@ -362,12 +362,8 @@ static int ReadHuffmanCodes(VP8LDecoder* const dec, int xsize, int ysize,
VP8LMetadata* const hdr = &dec->hdr_;
uint32_t* huffman_image = NULL;
HTreeGroup* htree_groups = NULL;
- // When reading htrees, some might be unused, as the format allows it.
- // We will still read them but put them in this htree_group_bogus.
- HTreeGroup htree_group_bogus;
HuffmanCode* huffman_tables = NULL;
- HuffmanCode* huffman_tables_bogus = NULL;
- HuffmanCode* next = NULL;
+ HuffmanCode* huffman_table = NULL;
int num_htree_groups = 1;
int num_htree_groups_max = 1;
int max_alphabet_size = 0;
@@ -418,12 +414,6 @@ static int ReadHuffmanCodes(VP8LDecoder* const dec, int xsize, int ysize,
if (*mapped_group == -1) *mapped_group = num_htree_groups++;
huffman_image[i] = *mapped_group;
}
- huffman_tables_bogus = (HuffmanCode*)WebPSafeMalloc(
- table_size, sizeof(*huffman_tables_bogus));
- if (huffman_tables_bogus == NULL) {
- dec->status_ = VP8_STATUS_OUT_OF_MEMORY;
- goto Error;
- }
} else {
num_htree_groups = num_htree_groups_max;
}
@@ -453,63 +443,71 @@ static int ReadHuffmanCodes(VP8LDecoder* const dec, int xsize, int ysize,
goto Error;
}
- next = huffman_tables;
+ huffman_table = huffman_tables;
for (i = 0; i < num_htree_groups_max; ++i) {
- // If the index "i" is unused in the Huffman image, read the coefficients
- // but store them to a bogus htree_group.
- const int is_bogus = (mapping != NULL && mapping[i] == -1);
- HTreeGroup* const htree_group =
- is_bogus ? &htree_group_bogus :
- &htree_groups[(mapping == NULL) ? i : mapping[i]];
- HuffmanCode** const htrees = htree_group->htrees;
- HuffmanCode* huffman_tables_i = is_bogus ? huffman_tables_bogus : next;
- int size;
- int total_size = 0;
- int is_trivial_literal = 1;
- int max_bits = 0;
- for (j = 0; j < HUFFMAN_CODES_PER_META_CODE; ++j) {
- int alphabet_size = kAlphabetSize[j];
- htrees[j] = huffman_tables_i;
- if (j == 0 && color_cache_bits > 0) {
- alphabet_size += 1 << color_cache_bits;
- }
- size =
- ReadHuffmanCode(alphabet_size, dec, code_lengths, huffman_tables_i);
- if (size == 0) {
- goto Error;
- }
- if (is_trivial_literal && kLiteralMap[j] == 1) {
- is_trivial_literal = (huffman_tables_i->bits == 0);
+ // If the index "i" is unused in the Huffman image, just make sure the
+ // coefficients are valid but do not store them.
+ if (mapping != NULL && mapping[i] == -1) {
+ for (j = 0; j < HUFFMAN_CODES_PER_META_CODE; ++j) {
+ int alphabet_size = kAlphabetSize[j];
+ if (j == 0 && color_cache_bits > 0) {
+ alphabet_size += (1 << color_cache_bits);
+ }
+ // Passing in NULL so that nothing gets filled.
+ if (!ReadHuffmanCode(alphabet_size, dec, code_lengths, NULL)) {
+ goto Error;
+ }
}
- total_size += huffman_tables_i->bits;
- huffman_tables_i += size;
- if (j <= ALPHA) {
- int local_max_bits = code_lengths[0];
- int k;
- for (k = 1; k < alphabet_size; ++k) {
- if (code_lengths[k] > local_max_bits) {
- local_max_bits = code_lengths[k];
+ } else {
+ HTreeGroup* const htree_group =
+ &htree_groups[(mapping == NULL) ? i : mapping[i]];
+ HuffmanCode** const htrees = htree_group->htrees;
+ int size;
+ int total_size = 0;
+ int is_trivial_literal = 1;
+ int max_bits = 0;
+ for (j = 0; j < HUFFMAN_CODES_PER_META_CODE; ++j) {
+ int alphabet_size = kAlphabetSize[j];
+ htrees[j] = huffman_table;
+ if (j == 0 && color_cache_bits > 0) {
+ alphabet_size += (1 << color_cache_bits);
+ }
+ size = ReadHuffmanCode(alphabet_size, dec, code_lengths, huffman_table);
+ if (size == 0) {
+ goto Error;
+ }
+ if (is_trivial_literal && kLiteralMap[j] == 1) {
+ is_trivial_literal = (huffman_table->bits == 0);
+ }
+ total_size += huffman_table->bits;
+ huffman_table += size;
+ if (j <= ALPHA) {
+ int local_max_bits = code_lengths[0];
+ int k;
+ for (k = 1; k < alphabet_size; ++k) {
+ if (code_lengths[k] > local_max_bits) {
+ local_max_bits = code_lengths[k];
+ }
}
+ max_bits += local_max_bits;
}
- max_bits += local_max_bits;
}
- }
- if (!is_bogus) next = huffman_tables_i;
- htree_group->is_trivial_literal = is_trivial_literal;
- htree_group->is_trivial_code = 0;
- if (is_trivial_literal) {
- const int red = htrees[RED][0].value;
- const int blue = htrees[BLUE][0].value;
- const int alpha = htrees[ALPHA][0].value;
- htree_group->literal_arb = ((uint32_t)alpha << 24) | (red << 16) | blue;
- if (total_size == 0 && htrees[GREEN][0].value < NUM_LITERAL_CODES) {
- htree_group->is_trivial_code = 1;
- htree_group->literal_arb |= htrees[GREEN][0].value << 8;
+ htree_group->is_trivial_literal = is_trivial_literal;
+ htree_group->is_trivial_code = 0;
+ if (is_trivial_literal) {
+ const int red = htrees[RED][0].value;
+ const int blue = htrees[BLUE][0].value;
+ const int alpha = htrees[ALPHA][0].value;
+ htree_group->literal_arb = ((uint32_t)alpha << 24) | (red << 16) | blue;
+ if (total_size == 0 && htrees[GREEN][0].value < NUM_LITERAL_CODES) {
+ htree_group->is_trivial_code = 1;
+ htree_group->literal_arb |= htrees[GREEN][0].value << 8;
+ }
}
+ htree_group->use_packed_table =
+ !htree_group->is_trivial_code && (max_bits < HUFFMAN_PACKED_BITS);
+ if (htree_group->use_packed_table) BuildPackedTable(htree_group);
}
- htree_group->use_packed_table =
- !htree_group->is_trivial_code && (max_bits < HUFFMAN_PACKED_BITS);
- if (htree_group->use_packed_table) BuildPackedTable(htree_group);
}
ok = 1;
@@ -521,7 +519,6 @@ static int ReadHuffmanCodes(VP8LDecoder* const dec, int xsize, int ysize,
Error:
WebPSafeFree(code_lengths);
- WebPSafeFree(huffman_tables_bogus);
WebPSafeFree(mapping);
if (!ok) {
WebPSafeFree(huffman_image);
@@ -562,8 +559,11 @@ static int AllocateAndInitRescaler(VP8LDecoder* const dec, VP8Io* const io) {
memory += work_size * sizeof(*work);
scaled_data = (uint32_t*)memory;
- WebPRescalerInit(dec->rescaler, in_width, in_height, (uint8_t*)scaled_data,
- out_width, out_height, 0, num_channels, work);
+ if (!WebPRescalerInit(dec->rescaler, in_width, in_height,
+ (uint8_t*)scaled_data, out_width, out_height,
+ 0, num_channels, work)) {
+ return 0;
+ }
return 1;
}
#endif // WEBP_REDUCE_SIZE
@@ -577,13 +577,14 @@ static int AllocateAndInitRescaler(VP8LDecoder* const dec, VP8Io* const io) {
static int Export(WebPRescaler* const rescaler, WEBP_CSP_MODE colorspace,
int rgba_stride, uint8_t* const rgba) {
uint32_t* const src = (uint32_t*)rescaler->dst;
+ uint8_t* dst = rgba;
const int dst_width = rescaler->dst_width;
int num_lines_out = 0;
while (WebPRescalerHasPendingOutput(rescaler)) {
- uint8_t* const dst = rgba + num_lines_out * rgba_stride;
WebPRescalerExportRow(rescaler);
WebPMultARGBRow(src, dst_width, 1);
VP8LConvertFromBGRA(src, dst_width, colorspace, dst);
+ dst += rgba_stride;
++num_lines_out;
}
return num_lines_out;
@@ -597,8 +598,8 @@ static int EmitRescaledRowsRGBA(const VP8LDecoder* const dec,
int num_lines_in = 0;
int num_lines_out = 0;
while (num_lines_in < mb_h) {
- uint8_t* const row_in = in + num_lines_in * in_stride;
- uint8_t* const row_out = out + num_lines_out * out_stride;
+ uint8_t* const row_in = in + (uint64_t)num_lines_in * in_stride;
+ uint8_t* const row_out = out + (uint64_t)num_lines_out * out_stride;
const int lines_left = mb_h - num_lines_in;
const int needed_lines = WebPRescaleNeededLines(dec->rescaler, lines_left);
int lines_imported;
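
The (uint64_t) casts above exist because `num_lines_in * in_stride` is an int-by-int product: for tall images with wide strides it can overflow before being added to the pointer. Promoting one operand first keeps the whole computation in 64 bits. A before/after sketch:

    #include <stdint.h>

    static uint8_t* row_ptr(uint8_t* base, int line, int stride) {
      /* base + line * stride would multiply in 32 bits and may wrap;  */
      /* casting one operand forces a 64-bit product before the add.   */
      return base + (uint64_t)line * stride;
    }
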
@@ -757,11 +758,11 @@ static WEBP_INLINE HTreeGroup* GetHtreeGroupForPos(VP8LMetadata* const hdr,
typedef void (*ProcessRowsFunc)(VP8LDecoder* const dec, int row);
-static void ApplyInverseTransforms(VP8LDecoder* const dec, int num_rows,
+static void ApplyInverseTransforms(VP8LDecoder* const dec,
+ int start_row, int num_rows,
const uint32_t* const rows) {
int n = dec->next_transform_;
const int cache_pixs = dec->width_ * num_rows;
- const int start_row = dec->last_row_;
const int end_row = start_row + num_rows;
const uint32_t* rows_in = rows;
uint32_t* const rows_out = dec->argb_cache_;
@@ -792,15 +793,15 @@ static void ProcessRows(VP8LDecoder* const dec, int row) {
VP8Io* const io = dec->io_;
uint8_t* rows_data = (uint8_t*)dec->argb_cache_;
const int in_stride = io->width * sizeof(uint32_t); // in unit of RGBA
-
- ApplyInverseTransforms(dec, num_rows, rows);
+ ApplyInverseTransforms(dec, dec->last_row_, num_rows, rows);
if (!SetCropWindow(io, dec->last_row_, row, &rows_data, in_stride)) {
// Nothing to output (this time).
} else {
const WebPDecBuffer* const output = dec->output_;
if (WebPIsRGBMode(output->colorspace)) { // convert to RGBA
const WebPRGBABuffer* const buf = &output->u.RGBA;
- uint8_t* const rgba = buf->rgba + dec->last_out_row_ * buf->stride;
+ uint8_t* const rgba =
+ buf->rgba + (int64_t)dec->last_out_row_ * buf->stride;
const int num_rows_out =
#if !defined(WEBP_REDUCE_SIZE)
io->use_scaling ?
@@ -951,7 +952,6 @@ static WEBP_INLINE void CopyBlock8b(uint8_t* const dst, int dist, int length) {
break;
default:
goto Copy;
- break;
}
CopySmallPattern8b(src, dst, length, pattern);
return;
@@ -1196,6 +1196,7 @@ static int DecodeImageData(VP8LDecoder* const dec, uint32_t* const data,
VP8LFillBitWindow(br);
dist_code = GetCopyDistance(dist_symbol, br);
dist = PlaneCodeToDistance(width, dist_code);
+
if (VP8LIsEndOfStream(br)) break;
if (src - data < (ptrdiff_t)dist || src_end - src < (ptrdiff_t)length) {
goto Error;
@@ -1518,7 +1519,7 @@ static int AllocateInternalBuffers32b(VP8LDecoder* const dec, int final_width) {
assert(dec->width_ <= final_width);
dec->pixels_ = (uint32_t*)WebPSafeMalloc(total_num_pixels, sizeof(uint32_t));
if (dec->pixels_ == NULL) {
- dec->argb_cache_ = NULL; // for sanity check
+ dec->argb_cache_ = NULL; // for soundness
dec->status_ = VP8_STATUS_OUT_OF_MEMORY;
return 0;
}
@@ -1528,7 +1529,7 @@ static int AllocateInternalBuffers32b(VP8LDecoder* const dec, int final_width) {
static int AllocateInternalBuffers8b(VP8LDecoder* const dec) {
const uint64_t total_num_pixels = (uint64_t)dec->width_ * dec->height_;
- dec->argb_cache_ = NULL; // for sanity check
+ dec->argb_cache_ = NULL; // for soundness
dec->pixels_ = (uint32_t*)WebPSafeMalloc(total_num_pixels, sizeof(uint8_t));
if (dec->pixels_ == NULL) {
dec->status_ = VP8_STATUS_OUT_OF_MEMORY;
@@ -1556,7 +1557,7 @@ static void ExtractAlphaRows(VP8LDecoder* const dec, int last_row) {
const int cache_pixs = width * num_rows_to_process;
uint8_t* const dst = output + width * cur_row;
const uint32_t* const src = dec->argb_cache_;
- ApplyInverseTransforms(dec, num_rows_to_process, in);
+ ApplyInverseTransforms(dec, cur_row, num_rows_to_process, in);
WebPExtractGreen(src, dst, cache_pixs);
AlphaApplyFilter(alph_dec,
cur_row, cur_row + num_rows_to_process, dst, width);
@@ -1670,7 +1671,6 @@ int VP8LDecodeImage(VP8LDecoder* const dec) {
VP8Io* io = NULL;
WebPDecParams* params = NULL;
- // Sanity checks.
if (dec == NULL) return 0;
assert(dec->hdr_.huffman_tables_ != NULL);
diff --git a/media/libwebp/dec/vp8li_dec.h b/media/libwebp/dec/vp8li_dec.h
index 2b9c95a44b..8df713beb8 100644
--- a/media/libwebp/dec/vp8li_dec.h
+++ b/media/libwebp/dec/vp8li_dec.h
@@ -37,7 +37,7 @@ struct VP8LTransform {
int bits_; // subsampling bits defining transform window.
int xsize_; // transform window X index.
int ysize_; // transform window Y index.
- uint32_t *data_; // transform data.
+ uint32_t* data_; // transform data.
};
typedef struct {
@@ -48,23 +48,23 @@ typedef struct {
int huffman_mask_;
int huffman_subsample_bits_;
int huffman_xsize_;
- uint32_t *huffman_image_;
+ uint32_t* huffman_image_;
int num_htree_groups_;
- HTreeGroup *htree_groups_;
- HuffmanCode *huffman_tables_;
+ HTreeGroup* htree_groups_;
+ HuffmanCode* huffman_tables_;
} VP8LMetadata;
typedef struct VP8LDecoder VP8LDecoder;
struct VP8LDecoder {
VP8StatusCode status_;
VP8LDecodeState state_;
- VP8Io *io_;
+ VP8Io* io_;
- const WebPDecBuffer *output_; // shortcut to io->opaque->output
+ const WebPDecBuffer* output_; // shortcut to io->opaque->output
- uint32_t *pixels_; // Internal data: either uint8_t* for alpha
+ uint32_t* pixels_; // Internal data: either uint8_t* for alpha
// or uint32_t* for BGRA.
- uint32_t *argb_cache_; // Scratch buffer for temporary BGRA storage.
+ uint32_t* argb_cache_; // Scratch buffer for temporary BGRA storage.
VP8LBitReader br_;
int incremental_; // if true, incremental decoding is expected
@@ -86,8 +86,8 @@ struct VP8LDecoder {
// or'd bitset storing the transforms types.
uint32_t transforms_seen_;
- uint8_t *rescaler_memory; // Working memory for rescaling work.
- WebPRescaler *rescaler; // Common rescaler for all channels.
+ uint8_t* rescaler_memory; // Working memory for rescaling work.
+ WebPRescaler* rescaler; // Common rescaler for all channels.
};
//------------------------------------------------------------------------------
diff --git a/media/libwebp/dec/webp_dec.c b/media/libwebp/dec/webp_dec.c
index 89c264d0a0..6857960774 100644
--- a/media/libwebp/dec/webp_dec.c
+++ b/media/libwebp/dec/webp_dec.c
@@ -785,6 +785,13 @@ VP8StatusCode WebPDecode(const uint8_t* data, size_t data_size,
//------------------------------------------------------------------------------
// Cropping and rescaling.
+int WebPCheckCropDimensions(int image_width, int image_height,
+ int x, int y, int w, int h) {
+ return !(x < 0 || y < 0 || w <= 0 || h <= 0 ||
+ x >= image_width || w > image_width || w > image_width - x ||
+ y >= image_height || h > image_height || h > image_height - y);
+}
+
int WebPIoInitFromOptions(const WebPDecoderOptions* const options,
VP8Io* const io, WEBP_CSP_MODE src_colorspace) {
const int W = io->width;
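
WebPCheckCropDimensions() is written with `w > image_width - x` rather than the old `x + w > W` on purpose: once x is known to be non-negative and less than image_width, the subtraction cannot wrap, whereas the addition could overflow int for hostile values. A standalone sketch of the same shape:

    static int crop_ok(int W, int H, int x, int y, int w, int h) {
      if (x < 0 || y < 0 || w <= 0 || h <= 0) return 0;
      if (x >= W || w > W || w > W - x) return 0;  /* no x + w overflow */
      if (y >= H || h > H || h > H - y) return 0;
      return 1;
    }
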
@@ -792,7 +799,7 @@ int WebPIoInitFromOptions(const WebPDecoderOptions* const options,
int x = 0, y = 0, w = W, h = H;
// Cropping
- io->use_cropping = (options != NULL) && (options->use_cropping > 0);
+ io->use_cropping = (options != NULL) && options->use_cropping;
if (io->use_cropping) {
w = options->crop_width;
h = options->crop_height;
@@ -802,7 +809,7 @@ int WebPIoInitFromOptions(const WebPDecoderOptions* const options,
x &= ~1;
y &= ~1;
}
- if (x < 0 || y < 0 || w <= 0 || h <= 0 || x + w > W || y + h > H) {
+ if (!WebPCheckCropDimensions(W, H, x, y, w, h)) {
return 0; // out of frame boundary error
}
}
@@ -814,7 +821,7 @@ int WebPIoInitFromOptions(const WebPDecoderOptions* const options,
io->mb_h = h;
// Scaling
- io->use_scaling = (options != NULL) && (options->use_scaling > 0);
+ io->use_scaling = (options != NULL) && options->use_scaling;
if (io->use_scaling) {
int scaled_width = options->scaled_width;
int scaled_height = options->scaled_height;
@@ -835,8 +842,8 @@ int WebPIoInitFromOptions(const WebPDecoderOptions* const options,
if (io->use_scaling) {
// disable filter (only for large downscaling ratio).
- io->bypass_filtering = (io->scaled_width < W * 3 / 4) &&
- (io->scaled_height < H * 3 / 4);
+ io->bypass_filtering |= (io->scaled_width < W * 3 / 4) &&
+ (io->scaled_height < H * 3 / 4);
io->fancy_upsampling = 0;
}
return 1;
diff --git a/media/libwebp/dec/webpi_dec.h b/media/libwebp/dec/webpi_dec.h
index 83d7444e51..a1b7c83fcd 100644
--- a/media/libwebp/dec/webpi_dec.h
+++ b/media/libwebp/dec/webpi_dec.h
@@ -77,6 +77,10 @@ VP8StatusCode WebPParseHeaders(WebPHeaderStructure* const headers);
//------------------------------------------------------------------------------
// Misc utils
+// Returns true if crop dimensions are within image bounds.
+int WebPCheckCropDimensions(int image_width, int image_height,
+ int x, int y, int w, int h);
+
// Initializes VP8Io with custom setup, io and teardown functions. The default
// hooks will use the supplied 'params' as io->opaque handle.
void WebPInitCustomIo(WebPDecParams* const params, VP8Io* const io);
diff --git a/media/libwebp/demux/demux.c b/media/libwebp/demux/demux.c
index 2034024d06..13953b1c54 100644
--- a/media/libwebp/demux/demux.c
+++ b/media/libwebp/demux/demux.c
@@ -24,7 +24,7 @@
#include "../webp/format_constants.h"
#define DMUX_MAJ_VERSION 1
-#define DMUX_MIN_VERSION 0
+#define DMUX_MIN_VERSION 2
#define DMUX_REV_VERSION 2
typedef struct {
@@ -221,12 +221,16 @@ static ParseStatus StoreFrame(int frame_num, uint32_t min_size,
const size_t chunk_start_offset = mem->start_;
const uint32_t fourcc = ReadLE32(mem);
const uint32_t payload_size = ReadLE32(mem);
- const uint32_t payload_size_padded = payload_size + (payload_size & 1);
- const size_t payload_available = (payload_size_padded > MemDataSize(mem))
- ? MemDataSize(mem) : payload_size_padded;
- const size_t chunk_size = CHUNK_HEADER_SIZE + payload_available;
+ uint32_t payload_size_padded;
+ size_t payload_available;
+ size_t chunk_size;
if (payload_size > MAX_CHUNK_PAYLOAD) return PARSE_ERROR;
+
+ payload_size_padded = payload_size + (payload_size & 1);
+ payload_available = (payload_size_padded > MemDataSize(mem))
+ ? MemDataSize(mem) : payload_size_padded;
+ chunk_size = CHUNK_HEADER_SIZE + payload_available;
if (SizeIsInvalid(mem, payload_size_padded)) return PARSE_ERROR;
if (payload_size_padded > MemDataSize(mem)) status = PARSE_NEED_MORE_DATA;
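
The reordering in StoreFrame() matters because `payload_size + (payload_size & 1)` wraps to 0 when payload_size is 0xffffffff, so the padded size must only be computed after the MAX_CHUNK_PAYLOAD rejection. A minimal sketch of the safe ordering (MAX_PAYLOAD is illustrative, not libwebp's actual constant):

    #include <stdint.h>

    #define MAX_PAYLOAD (1u << 30)  /* illustrative bound */

    static int padded_chunk_size(uint32_t payload_size, uint32_t* padded) {
      if (payload_size > MAX_PAYLOAD) return 0;     /* reject first...    */
      *padded = payload_size + (payload_size & 1);  /* ...then pad safely */
      return 1;
    }
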
@@ -312,6 +316,7 @@ static ParseStatus ParseAnimationFrame(
int bits;
MemBuffer* const mem = &dmux->mem_;
Frame* frame;
+ size_t start_offset;
ParseStatus status =
NewFrame(mem, ANMF_CHUNK_SIZE, frame_chunk_size, &frame);
if (status != PARSE_OK) return status;
@@ -332,7 +337,11 @@ static ParseStatus ParseAnimationFrame(
// Store a frame only if the animation flag is set and some data for this
// frame is available.
+ start_offset = mem->start_;
status = StoreFrame(dmux->num_frames_ + 1, anmf_payload_size, mem, frame);
+ if (status != PARSE_ERROR && mem->start_ - start_offset > anmf_payload_size) {
+ status = PARSE_ERROR;
+ }
if (status != PARSE_ERROR && is_animation && frame->frame_num_ > 0) {
added_frame = AddFrame(dmux, frame);
if (added_frame) {
@@ -446,9 +455,11 @@ static ParseStatus ParseVP8XChunks(WebPDemuxer* const dmux) {
const size_t chunk_start_offset = mem->start_;
const uint32_t fourcc = ReadLE32(mem);
const uint32_t chunk_size = ReadLE32(mem);
- const uint32_t chunk_size_padded = chunk_size + (chunk_size & 1);
+ uint32_t chunk_size_padded;
if (chunk_size > MAX_CHUNK_PAYLOAD) return PARSE_ERROR;
+
+ chunk_size_padded = chunk_size + (chunk_size & 1);
if (SizeIsInvalid(mem, chunk_size_padded)) return PARSE_ERROR;
switch (fourcc) {
diff --git a/media/libwebp/dsp/alpha_processing.c b/media/libwebp/dsp/alpha_processing.c
index 6ff1352ae2..8c5e90210f 100644
--- a/media/libwebp/dsp/alpha_processing.c
+++ b/media/libwebp/dsp/alpha_processing.c
@@ -157,7 +157,8 @@ void WebPMultARGBRow_C(uint32_t* const ptr, int width, int inverse) {
}
}
-void WebPMultRow_C(uint8_t* const ptr, const uint8_t* const alpha,
+void WebPMultRow_C(uint8_t* WEBP_RESTRICT const ptr,
+ const uint8_t* WEBP_RESTRICT const alpha,
int width, int inverse) {
int x;
for (x = 0; x < width; ++x) {
@@ -178,7 +179,8 @@ void WebPMultRow_C(uint8_t* const ptr, const uint8_t* const alpha,
#undef MFIX
void (*WebPMultARGBRow)(uint32_t* const ptr, int width, int inverse);
-void (*WebPMultRow)(uint8_t* const ptr, const uint8_t* const alpha,
+void (*WebPMultRow)(uint8_t* WEBP_RESTRICT const ptr,
+ const uint8_t* WEBP_RESTRICT const alpha,
int width, int inverse);
//------------------------------------------------------------------------------
@@ -193,8 +195,8 @@ void WebPMultARGBRows(uint8_t* ptr, int stride, int width, int num_rows,
}
}
-void WebPMultRows(uint8_t* ptr, int stride,
- const uint8_t* alpha, int alpha_stride,
+void WebPMultRows(uint8_t* WEBP_RESTRICT ptr, int stride,
+ const uint8_t* WEBP_RESTRICT alpha, int alpha_stride,
int width, int num_rows, int inverse) {
int n;
for (n = 0; n < num_rows; ++n) {
@@ -290,9 +292,9 @@ static void ApplyAlphaMultiply_16b_C(uint8_t* rgba4444,
}
#if !WEBP_NEON_OMIT_C_CODE
-static int DispatchAlpha_C(const uint8_t* alpha, int alpha_stride,
+static int DispatchAlpha_C(const uint8_t* WEBP_RESTRICT alpha, int alpha_stride,
int width, int height,
- uint8_t* dst, int dst_stride) {
+ uint8_t* WEBP_RESTRICT dst, int dst_stride) {
uint32_t alpha_mask = 0xff;
int i, j;
@@ -309,9 +311,10 @@ static int DispatchAlpha_C(const uint8_t* alpha, int alpha_stride,
return (alpha_mask != 0xff);
}
-static void DispatchAlphaToGreen_C(const uint8_t* alpha, int alpha_stride,
- int width, int height,
- uint32_t* dst, int dst_stride) {
+static void DispatchAlphaToGreen_C(const uint8_t* WEBP_RESTRICT alpha,
+ int alpha_stride, int width, int height,
+ uint32_t* WEBP_RESTRICT dst,
+ int dst_stride) {
int i, j;
for (j = 0; j < height; ++j) {
for (i = 0; i < width; ++i) {
@@ -322,9 +325,9 @@ static void DispatchAlphaToGreen_C(const uint8_t* alpha, int alpha_stride,
}
}
-static int ExtractAlpha_C(const uint8_t* argb, int argb_stride,
+static int ExtractAlpha_C(const uint8_t* WEBP_RESTRICT argb, int argb_stride,
int width, int height,
- uint8_t* alpha, int alpha_stride) {
+ uint8_t* WEBP_RESTRICT alpha, int alpha_stride) {
uint8_t alpha_mask = 0xff;
int i, j;
@@ -340,7 +343,8 @@ static int ExtractAlpha_C(const uint8_t* argb, int argb_stride,
return (alpha_mask == 0xff);
}
-static void ExtractGreen_C(const uint32_t* argb, uint8_t* alpha, int size) {
+static void ExtractGreen_C(const uint32_t* WEBP_RESTRICT argb,
+ uint8_t* WEBP_RESTRICT alpha, int size) {
int i;
for (i = 0; i < size; ++i) alpha[i] = argb[i] >> 8;
}
@@ -359,6 +363,11 @@ static int HasAlpha32b_C(const uint8_t* src, int length) {
return 0;
}
+static void AlphaReplace_C(uint32_t* src, int length, uint32_t color) {
+ int x;
+ for (x = 0; x < length; ++x) if ((src[x] >> 24) == 0) src[x] = color;
+}
+
//------------------------------------------------------------------------------
// Simple channel manipulations.
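
The new WebPAlphaReplace hook rewrites every pixel whose alpha byte (bits 24..31) is zero with a caller-supplied color. A hypothetical driver over a 4-pixel buffer, using the same scalar logic as AlphaReplace_C above:

    #include <stdint.h>
    #include <stdio.h>

    static void alpha_replace(uint32_t* src, int length, uint32_t color) {
      int x;
      for (x = 0; x < length; ++x) if ((src[x] >> 24) == 0) src[x] = color;
    }

    int main(void) {
      uint32_t px[4] = { 0x00123456u, 0xff00ff00u, 0x00000000u, 0x80aabbccu };
      alpha_replace(px, 4, 0xffffffffu);  /* transparent -> opaque white */
      /* prints: ffffffff ff00ff00 ffffffff 80aabbcc */
      printf("%08x %08x %08x %08x\n", (unsigned)px[0], (unsigned)px[1],
             (unsigned)px[2], (unsigned)px[3]);
      return 0;
    }
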
@@ -367,8 +376,11 @@ static WEBP_INLINE uint32_t MakeARGB32(int a, int r, int g, int b) {
}
#ifdef WORDS_BIGENDIAN
-static void PackARGB_C(const uint8_t* a, const uint8_t* r, const uint8_t* g,
- const uint8_t* b, int len, uint32_t* out) {
+static void PackARGB_C(const uint8_t* WEBP_RESTRICT a,
+ const uint8_t* WEBP_RESTRICT r,
+ const uint8_t* WEBP_RESTRICT g,
+ const uint8_t* WEBP_RESTRICT b,
+ int len, uint32_t* WEBP_RESTRICT out) {
int i;
for (i = 0; i < len; ++i) {
out[i] = MakeARGB32(a[4 * i], r[4 * i], g[4 * i], b[4 * i]);
@@ -376,8 +388,10 @@ static void PackARGB_C(const uint8_t* a, const uint8_t* r, const uint8_t* g,
}
#endif
-static void PackRGB_C(const uint8_t* r, const uint8_t* g, const uint8_t* b,
- int len, int step, uint32_t* out) {
+static void PackRGB_C(const uint8_t* WEBP_RESTRICT r,
+ const uint8_t* WEBP_RESTRICT g,
+ const uint8_t* WEBP_RESTRICT b,
+ int len, int step, uint32_t* WEBP_RESTRICT out) {
int i, offset = 0;
for (i = 0; i < len; ++i) {
out[i] = MakeARGB32(0xff, r[offset], g[offset], b[offset]);
@@ -387,19 +401,26 @@ static void PackRGB_C(const uint8_t* r, const uint8_t* g, const uint8_t* b,
void (*WebPApplyAlphaMultiply)(uint8_t*, int, int, int, int);
void (*WebPApplyAlphaMultiply4444)(uint8_t*, int, int, int);
-int (*WebPDispatchAlpha)(const uint8_t*, int, int, int, uint8_t*, int);
-void (*WebPDispatchAlphaToGreen)(const uint8_t*, int, int, int, uint32_t*, int);
-int (*WebPExtractAlpha)(const uint8_t*, int, int, int, uint8_t*, int);
-void (*WebPExtractGreen)(const uint32_t* argb, uint8_t* alpha, int size);
+int (*WebPDispatchAlpha)(const uint8_t* WEBP_RESTRICT, int, int, int,
+ uint8_t* WEBP_RESTRICT, int);
+void (*WebPDispatchAlphaToGreen)(const uint8_t* WEBP_RESTRICT, int, int, int,
+ uint32_t* WEBP_RESTRICT, int);
+int (*WebPExtractAlpha)(const uint8_t* WEBP_RESTRICT, int, int, int,
+ uint8_t* WEBP_RESTRICT, int);
+void (*WebPExtractGreen)(const uint32_t* WEBP_RESTRICT argb,
+ uint8_t* WEBP_RESTRICT alpha, int size);
#ifdef WORDS_BIGENDIAN
void (*WebPPackARGB)(const uint8_t* a, const uint8_t* r, const uint8_t* g,
const uint8_t* b, int, uint32_t*);
#endif
-void (*WebPPackRGB)(const uint8_t* r, const uint8_t* g, const uint8_t* b,
- int len, int step, uint32_t* out);
+void (*WebPPackRGB)(const uint8_t* WEBP_RESTRICT r,
+ const uint8_t* WEBP_RESTRICT g,
+ const uint8_t* WEBP_RESTRICT b,
+ int len, int step, uint32_t* WEBP_RESTRICT out);
int (*WebPHasAlpha8b)(const uint8_t* src, int length);
int (*WebPHasAlpha32b)(const uint8_t* src, int length);
+void (*WebPAlphaReplace)(uint32_t* src, int length, uint32_t color);
//------------------------------------------------------------------------------
// Init function
@@ -428,13 +449,14 @@ WEBP_DSP_INIT_FUNC(WebPInitAlphaProcessing) {
WebPHasAlpha8b = HasAlpha8b_C;
WebPHasAlpha32b = HasAlpha32b_C;
+ WebPAlphaReplace = AlphaReplace_C;
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
+#if defined(WEBP_HAVE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
WebPInitAlphaProcessingSSE2();
-#if defined(WEBP_USE_SSE41)
+#if defined(WEBP_HAVE_SSE41)
if (VP8GetCPUInfo(kSSE4_1)) {
WebPInitAlphaProcessingSSE41();
}
@@ -448,7 +470,7 @@ WEBP_DSP_INIT_FUNC(WebPInitAlphaProcessing) {
#endif
}
-#if defined(WEBP_USE_NEON)
+#if defined(WEBP_HAVE_NEON)
if (WEBP_NEON_OMIT_C_CODE ||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
WebPInitAlphaProcessingNEON();
@@ -469,4 +491,5 @@ WEBP_DSP_INIT_FUNC(WebPInitAlphaProcessing) {
assert(WebPPackRGB != NULL);
assert(WebPHasAlpha8b != NULL);
assert(WebPHasAlpha32b != NULL);
+ assert(WebPAlphaReplace != NULL);
}
diff --git a/media/libwebp/dsp/alpha_processing_mips_dsp_r2.c b/media/libwebp/dsp/alpha_processing_mips_dsp_r2.c
new file mode 100644
index 0000000000..ab597e68bb
--- /dev/null
+++ b/media/libwebp/dsp/alpha_processing_mips_dsp_r2.c
@@ -0,0 +1,228 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Utilities for processing transparent channel.
+//
+// Author(s): Branimir Vasic (branimir.vasic@imgtec.com)
+// Djordje Pesut (djordje.pesut@imgtec.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_MIPS_DSP_R2)
+
+static int DispatchAlpha_MIPSdspR2(const uint8_t* alpha, int alpha_stride,
+ int width, int height,
+ uint8_t* dst, int dst_stride) {
+ uint32_t alpha_mask = 0xffffffff;
+ int i, j, temp0;
+
+ for (j = 0; j < height; ++j) {
+ uint8_t* pdst = dst;
+ const uint8_t* palpha = alpha;
+ for (i = 0; i < (width >> 2); ++i) {
+ int temp1, temp2, temp3;
+
+ __asm__ volatile (
+ "ulw %[temp0], 0(%[palpha]) \n\t"
+ "addiu %[palpha], %[palpha], 4 \n\t"
+ "addiu %[pdst], %[pdst], 16 \n\t"
+ "srl %[temp1], %[temp0], 8 \n\t"
+ "srl %[temp2], %[temp0], 16 \n\t"
+ "srl %[temp3], %[temp0], 24 \n\t"
+ "and %[alpha_mask], %[alpha_mask], %[temp0] \n\t"
+ "sb %[temp0], -16(%[pdst]) \n\t"
+ "sb %[temp1], -12(%[pdst]) \n\t"
+ "sb %[temp2], -8(%[pdst]) \n\t"
+ "sb %[temp3], -4(%[pdst]) \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [palpha]"+r"(palpha), [pdst]"+r"(pdst),
+ [alpha_mask]"+r"(alpha_mask)
+ :
+ : "memory"
+ );
+ }
+
+ for (i = 0; i < (width & 3); ++i) {
+ __asm__ volatile (
+ "lbu %[temp0], 0(%[palpha]) \n\t"
+ "addiu %[palpha], %[palpha], 1 \n\t"
+ "sb %[temp0], 0(%[pdst]) \n\t"
+ "and %[alpha_mask], %[alpha_mask], %[temp0] \n\t"
+ "addiu %[pdst], %[pdst], 4 \n\t"
+ : [temp0]"=&r"(temp0), [palpha]"+r"(palpha), [pdst]"+r"(pdst),
+ [alpha_mask]"+r"(alpha_mask)
+ :
+ : "memory"
+ );
+ }
+ alpha += alpha_stride;
+ dst += dst_stride;
+ }
+
+ __asm__ volatile (
+ "ext %[temp0], %[alpha_mask], 0, 16 \n\t"
+ "srl %[alpha_mask], %[alpha_mask], 16 \n\t"
+ "and %[alpha_mask], %[alpha_mask], %[temp0] \n\t"
+ "ext %[temp0], %[alpha_mask], 0, 8 \n\t"
+ "srl %[alpha_mask], %[alpha_mask], 8 \n\t"
+ "and %[alpha_mask], %[alpha_mask], %[temp0] \n\t"
+ : [temp0]"=&r"(temp0), [alpha_mask]"+r"(alpha_mask)
+ :
+ );
+
+ return (alpha_mask != 0xff);
+}
+
+static void MultARGBRow_MIPSdspR2(uint32_t* const ptr, int width,
+ int inverse) {
+ int x;
+ const uint32_t c_00ffffff = 0x00ffffffu;
+ const uint32_t c_ff000000 = 0xff000000u;
+ const uint32_t c_8000000 = 0x00800000u;
+ const uint32_t c_8000080 = 0x00800080u;
+ for (x = 0; x < width; ++x) {
+ const uint32_t argb = ptr[x];
+ if (argb < 0xff000000u) { // alpha < 255
+ if (argb <= 0x00ffffffu) { // alpha == 0
+ ptr[x] = 0;
+ } else {
+ int temp0, temp1, temp2, temp3, alpha;
+ __asm__ volatile (
+ "srl %[alpha], %[argb], 24 \n\t"
+ "replv.qb %[temp0], %[alpha] \n\t"
+ "and %[temp0], %[temp0], %[c_00ffffff] \n\t"
+ "beqz %[inverse], 0f \n\t"
+ "divu $zero, %[c_ff000000], %[alpha] \n\t"
+ "mflo %[temp0] \n\t"
+ "0: \n\t"
+ "andi %[temp1], %[argb], 0xff \n\t"
+ "ext %[temp2], %[argb], 8, 8 \n\t"
+ "ext %[temp3], %[argb], 16, 8 \n\t"
+ "mul %[temp1], %[temp1], %[temp0] \n\t"
+ "mul %[temp2], %[temp2], %[temp0] \n\t"
+ "mul %[temp3], %[temp3], %[temp0] \n\t"
+ "precrq.ph.w %[temp1], %[temp2], %[temp1] \n\t"
+ "addu %[temp3], %[temp3], %[c_8000000] \n\t"
+ "addu %[temp1], %[temp1], %[c_8000080] \n\t"
+ "precrq.ph.w %[temp3], %[argb], %[temp3] \n\t"
+ "precrq.qb.ph %[temp1], %[temp3], %[temp1] \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [alpha]"=&r"(alpha)
+ : [inverse]"r"(inverse), [c_00ffffff]"r"(c_00ffffff),
+ [c_8000000]"r"(c_8000000), [c_8000080]"r"(c_8000080),
+ [c_ff000000]"r"(c_ff000000), [argb]"r"(argb)
+ : "memory", "hi", "lo"
+ );
+ ptr[x] = temp1;
+ }
+ }
+ }
+}
+
+#ifdef WORDS_BIGENDIAN
+static void PackARGB_MIPSdspR2(const uint8_t* a, const uint8_t* r,
+ const uint8_t* g, const uint8_t* b, int len,
+ uint32_t* out) {
+ int temp0, temp1, temp2, temp3, offset;
+ const int rest = len & 1;
+ const uint32_t* const loop_end = out + len - rest;
+ const int step = 4;
+ __asm__ volatile (
+ "xor %[offset], %[offset], %[offset] \n\t"
+ "beq %[loop_end], %[out], 0f \n\t"
+ "2: \n\t"
+ "lbux %[temp0], %[offset](%[a]) \n\t"
+ "lbux %[temp1], %[offset](%[r]) \n\t"
+ "lbux %[temp2], %[offset](%[g]) \n\t"
+ "lbux %[temp3], %[offset](%[b]) \n\t"
+ "ins %[temp1], %[temp0], 16, 16 \n\t"
+ "ins %[temp3], %[temp2], 16, 16 \n\t"
+ "addiu %[out], %[out], 4 \n\t"
+ "precr.qb.ph %[temp0], %[temp1], %[temp3] \n\t"
+ "sw %[temp0], -4(%[out]) \n\t"
+ "addu %[offset], %[offset], %[step] \n\t"
+ "bne %[loop_end], %[out], 2b \n\t"
+ "0: \n\t"
+ "beq %[rest], $zero, 1f \n\t"
+ "lbux %[temp0], %[offset](%[a]) \n\t"
+ "lbux %[temp1], %[offset](%[r]) \n\t"
+ "lbux %[temp2], %[offset](%[g]) \n\t"
+ "lbux %[temp3], %[offset](%[b]) \n\t"
+ "ins %[temp1], %[temp0], 16, 16 \n\t"
+ "ins %[temp3], %[temp2], 16, 16 \n\t"
+ "precr.qb.ph %[temp0], %[temp1], %[temp3] \n\t"
+ "sw %[temp0], 0(%[out]) \n\t"
+ "1: \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [offset]"=&r"(offset), [out]"+&r"(out)
+ : [a]"r"(a), [r]"r"(r), [g]"r"(g), [b]"r"(b), [step]"r"(step),
+ [loop_end]"r"(loop_end), [rest]"r"(rest)
+ : "memory"
+ );
+}
+#endif // WORDS_BIGENDIAN
+
+static void PackRGB_MIPSdspR2(const uint8_t* r, const uint8_t* g,
+ const uint8_t* b, int len, int step,
+ uint32_t* out) {
+ int temp0, temp1, temp2, offset;
+ const int rest = len & 1;
+ const int a = 0xff;
+ const uint32_t* const loop_end = out + len - rest;
+ __asm__ volatile (
+ "xor %[offset], %[offset], %[offset] \n\t"
+ "beq %[loop_end], %[out], 0f \n\t"
+ "2: \n\t"
+ "lbux %[temp0], %[offset](%[r]) \n\t"
+ "lbux %[temp1], %[offset](%[g]) \n\t"
+ "lbux %[temp2], %[offset](%[b]) \n\t"
+ "ins %[temp0], %[a], 16, 16 \n\t"
+ "ins %[temp2], %[temp1], 16, 16 \n\t"
+ "addiu %[out], %[out], 4 \n\t"
+ "precr.qb.ph %[temp0], %[temp0], %[temp2] \n\t"
+ "sw %[temp0], -4(%[out]) \n\t"
+ "addu %[offset], %[offset], %[step] \n\t"
+ "bne %[loop_end], %[out], 2b \n\t"
+ "0: \n\t"
+ "beq %[rest], $zero, 1f \n\t"
+ "lbux %[temp0], %[offset](%[r]) \n\t"
+ "lbux %[temp1], %[offset](%[g]) \n\t"
+ "lbux %[temp2], %[offset](%[b]) \n\t"
+ "ins %[temp0], %[a], 16, 16 \n\t"
+ "ins %[temp2], %[temp1], 16, 16 \n\t"
+ "precr.qb.ph %[temp0], %[temp0], %[temp2] \n\t"
+ "sw %[temp0], 0(%[out]) \n\t"
+ "1: \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [offset]"=&r"(offset), [out]"+&r"(out)
+ : [a]"r"(a), [r]"r"(r), [g]"r"(g), [b]"r"(b), [step]"r"(step),
+ [loop_end]"r"(loop_end), [rest]"r"(rest)
+ : "memory"
+ );
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void WebPInitAlphaProcessingMIPSdspR2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingMIPSdspR2(void) {
+ WebPDispatchAlpha = DispatchAlpha_MIPSdspR2;
+ WebPMultARGBRow = MultARGBRow_MIPSdspR2;
+#ifdef WORDS_BIGENDIAN
+ WebPPackARGB = PackARGB_MIPSdspR2;
+#endif
+ WebPPackRGB = PackRGB_MIPSdspR2;
+}
+
+#else // !WEBP_USE_MIPS_DSP_R2
+
+WEBP_DSP_INIT_STUB(WebPInitAlphaProcessingMIPSdspR2)
+
+#endif // WEBP_USE_MIPS_DSP_R2
diff --git a/media/libwebp/dsp/alpha_processing_neon.c b/media/libwebp/dsp/alpha_processing_neon.c
index 53dfce2b36..c900279a35 100644
--- a/media/libwebp/dsp/alpha_processing_neon.c
+++ b/media/libwebp/dsp/alpha_processing_neon.c
@@ -80,9 +80,9 @@ static void ApplyAlphaMultiply_NEON(uint8_t* rgba, int alpha_first,
//------------------------------------------------------------------------------
-static int DispatchAlpha_NEON(const uint8_t* alpha, int alpha_stride,
- int width, int height,
- uint8_t* dst, int dst_stride) {
+static int DispatchAlpha_NEON(const uint8_t* WEBP_RESTRICT alpha,
+ int alpha_stride, int width, int height,
+ uint8_t* WEBP_RESTRICT dst, int dst_stride) {
uint32_t alpha_mask = 0xffffffffu;
uint8x8_t mask8 = vdup_n_u8(0xff);
uint32_t tmp[2];
@@ -112,9 +112,10 @@ static int DispatchAlpha_NEON(const uint8_t* alpha, int alpha_stride,
return (alpha_mask != 0xffffffffu);
}
-static void DispatchAlphaToGreen_NEON(const uint8_t* alpha, int alpha_stride,
- int width, int height,
- uint32_t* dst, int dst_stride) {
+static void DispatchAlphaToGreen_NEON(const uint8_t* WEBP_RESTRICT alpha,
+ int alpha_stride, int width, int height,
+ uint32_t* WEBP_RESTRICT dst,
+ int dst_stride) {
int i, j;
uint8x8x4_t greens; // leave A/R/B channels zero'd.
greens.val[0] = vdup_n_u8(0);
@@ -131,9 +132,9 @@ static void DispatchAlphaToGreen_NEON(const uint8_t* alpha, int alpha_stride,
}
}
-static int ExtractAlpha_NEON(const uint8_t* argb, int argb_stride,
+static int ExtractAlpha_NEON(const uint8_t* WEBP_RESTRICT argb, int argb_stride,
int width, int height,
- uint8_t* alpha, int alpha_stride) {
+ uint8_t* WEBP_RESTRICT alpha, int alpha_stride) {
uint32_t alpha_mask = 0xffffffffu;
uint8x8_t mask8 = vdup_n_u8(0xff);
uint32_t tmp[2];
@@ -161,8 +162,8 @@ static int ExtractAlpha_NEON(const uint8_t* argb, int argb_stride,
return (alpha_mask == 0xffffffffu);
}
-static void ExtractGreen_NEON(const uint32_t* argb,
- uint8_t* alpha, int size) {
+static void ExtractGreen_NEON(const uint32_t* WEBP_RESTRICT argb,
+ uint8_t* WEBP_RESTRICT alpha, int size) {
int i;
for (i = 0; i + 16 <= size; i += 16) {
const uint8x16x4_t rgbX = vld4q_u8((const uint8_t*)(argb + i));
diff --git a/media/libwebp/dsp/alpha_processing_sse2.c b/media/libwebp/dsp/alpha_processing_sse2.c
index 9a3bc4485a..56d9ee5e98 100644
--- a/media/libwebp/dsp/alpha_processing_sse2.c
+++ b/media/libwebp/dsp/alpha_processing_sse2.c
@@ -18,9 +18,9 @@
//------------------------------------------------------------------------------
-static int DispatchAlpha_SSE2(const uint8_t* alpha, int alpha_stride,
- int width, int height,
- uint8_t* dst, int dst_stride) {
+static int DispatchAlpha_SSE2(const uint8_t* WEBP_RESTRICT alpha,
+ int alpha_stride, int width, int height,
+ uint8_t* WEBP_RESTRICT dst, int dst_stride) {
// alpha_and stores an 'and' operation of all the alpha[] values. The final
// value is not 0xff if any of the alpha[] is not equal to 0xff.
uint32_t alpha_and = 0xff;
@@ -72,9 +72,10 @@ static int DispatchAlpha_SSE2(const uint8_t* alpha, int alpha_stride,
return (alpha_and != 0xff);
}
-static void DispatchAlphaToGreen_SSE2(const uint8_t* alpha, int alpha_stride,
- int width, int height,
- uint32_t* dst, int dst_stride) {
+static void DispatchAlphaToGreen_SSE2(const uint8_t* WEBP_RESTRICT alpha,
+ int alpha_stride, int width, int height,
+ uint32_t* WEBP_RESTRICT dst,
+ int dst_stride) {
int i, j;
const __m128i zero = _mm_setzero_si128();
const int limit = width & ~15;
@@ -98,9 +99,9 @@ static void DispatchAlphaToGreen_SSE2(const uint8_t* alpha, int alpha_stride,
}
}
-static int ExtractAlpha_SSE2(const uint8_t* argb, int argb_stride,
+static int ExtractAlpha_SSE2(const uint8_t* WEBP_RESTRICT argb, int argb_stride,
int width, int height,
- uint8_t* alpha, int alpha_stride) {
+ uint8_t* WEBP_RESTRICT alpha, int alpha_stride) {
// alpha_and stores an 'and' operation of all the alpha[] values. The final
// value is not 0xff if any of the alpha[] is not equal to 0xff.
uint32_t alpha_and = 0xff;
@@ -214,7 +215,7 @@ static void ApplyAlphaMultiply_SSE2(uint8_t* rgba, int alpha_first,
// Alpha detection
static int HasAlpha8b_SSE2(const uint8_t* src, int length) {
- const __m128i all_0xff = _mm_set1_epi8(0xff);
+ const __m128i all_0xff = _mm_set1_epi8((char)0xff);
int i = 0;
for (; i + 16 <= length; i += 16) {
const __m128i v = _mm_loadu_si128((const __m128i*)(src + i));
@@ -228,7 +229,7 @@ static int HasAlpha8b_SSE2(const uint8_t* src, int length) {
static int HasAlpha32b_SSE2(const uint8_t* src, int length) {
const __m128i alpha_mask = _mm_set1_epi32(0xff);
- const __m128i all_0xff = _mm_set1_epi8(0xff);
+ const __m128i all_0xff = _mm_set1_epi8((char)0xff);
int i = 0;
// We don't know if we can access the last 3 bytes after the last alpha
// value 'src[4 * length - 4]' (because we don't know if alpha is the first
@@ -265,6 +266,27 @@ static int HasAlpha32b_SSE2(const uint8_t* src, int length) {
return 0;
}
+static void AlphaReplace_SSE2(uint32_t* src, int length, uint32_t color) {
+ const __m128i m_color = _mm_set1_epi32(color);
+ const __m128i zero = _mm_setzero_si128();
+ int i = 0;
+ for (; i + 8 <= length; i += 8) {
+ const __m128i a0 = _mm_loadu_si128((const __m128i*)(src + i + 0));
+ const __m128i a1 = _mm_loadu_si128((const __m128i*)(src + i + 4));
+ const __m128i b0 = _mm_srai_epi32(a0, 24);
+ const __m128i b1 = _mm_srai_epi32(a1, 24);
+ const __m128i c0 = _mm_cmpeq_epi32(b0, zero);
+ const __m128i c1 = _mm_cmpeq_epi32(b1, zero);
+ const __m128i d0 = _mm_and_si128(c0, m_color);
+ const __m128i d1 = _mm_and_si128(c1, m_color);
+ const __m128i e0 = _mm_andnot_si128(c0, a0);
+ const __m128i e1 = _mm_andnot_si128(c1, a1);
+ _mm_storeu_si128((__m128i*)(src + i + 0), _mm_or_si128(d0, e0));
+ _mm_storeu_si128((__m128i*)(src + i + 4), _mm_or_si128(d1, e1));
+ }
+ for (; i < length; ++i) if ((src[i] >> 24) == 0) src[i] = color;
+}
+
// -----------------------------------------------------------------------------
// Apply alpha value to rows
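
AlphaReplace_SSE2 above is built on the classic branchless select, result = (mask & replacement) | (andnot(mask, original)), expressed with _mm_and_si128/_mm_andnot_si128/_mm_or_si128. The same idiom in scalar form:

    #include <stdint.h>

    /* mask is all-ones to pick a, all-zeros to pick b (per 32-bit lane). */
    static uint32_t select32(uint32_t mask, uint32_t a, uint32_t b) {
      return (mask & a) | (~mask & b);
    }
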
@@ -296,7 +318,8 @@ static void MultARGBRow_SSE2(uint32_t* const ptr, int width, int inverse) {
if (width > 0) WebPMultARGBRow_C(ptr + x, width, inverse);
}
-static void MultRow_SSE2(uint8_t* const ptr, const uint8_t* const alpha,
+static void MultRow_SSE2(uint8_t* WEBP_RESTRICT const ptr,
+ const uint8_t* WEBP_RESTRICT const alpha,
int width, int inverse) {
int x = 0;
if (!inverse) {
@@ -334,6 +357,7 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingSSE2(void) {
WebPHasAlpha8b = HasAlpha8b_SSE2;
WebPHasAlpha32b = HasAlpha32b_SSE2;
+ WebPAlphaReplace = AlphaReplace_SSE2;
}
#else // !WEBP_USE_SSE2
diff --git a/media/libwebp/dsp/alpha_processing_sse41.c b/media/libwebp/dsp/alpha_processing_sse41.c
index e33c1aba4d..307d200f1f 100644
--- a/media/libwebp/dsp/alpha_processing_sse41.c
+++ b/media/libwebp/dsp/alpha_processing_sse41.c
@@ -19,9 +19,9 @@
//------------------------------------------------------------------------------
-static int ExtractAlpha_SSE41(const uint8_t* argb, int argb_stride,
- int width, int height,
- uint8_t* alpha, int alpha_stride) {
+static int ExtractAlpha_SSE41(const uint8_t* WEBP_RESTRICT argb,
+ int argb_stride, int width, int height,
+ uint8_t* WEBP_RESTRICT alpha, int alpha_stride) {
// alpha_and stores an 'and' operation of all the alpha[] values. The final
// value is not 0xff if any of the alpha[] is not equal to 0xff.
uint32_t alpha_and = 0xff;
diff --git a/media/libwebp/dsp/cost.c b/media/libwebp/dsp/cost.c
new file mode 100644
index 0000000000..bf112c7f0c
--- /dev/null
+++ b/media/libwebp/dsp/cost.c
@@ -0,0 +1,411 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "../dsp/dsp.h"
+#include "../enc/cost_enc.h"
+
+//------------------------------------------------------------------------------
+// Boolean-cost cost table
+
+const uint16_t VP8EntropyCost[256] = {
+ 1792, 1792, 1792, 1536, 1536, 1408, 1366, 1280, 1280, 1216,
+ 1178, 1152, 1110, 1076, 1061, 1024, 1024, 992, 968, 951,
+ 939, 911, 896, 878, 871, 854, 838, 820, 811, 794,
+ 786, 768, 768, 752, 740, 732, 720, 709, 704, 690,
+ 683, 672, 666, 655, 647, 640, 631, 622, 615, 607,
+ 598, 592, 586, 576, 572, 564, 559, 555, 547, 541,
+ 534, 528, 522, 512, 512, 504, 500, 494, 488, 483,
+ 477, 473, 467, 461, 458, 452, 448, 443, 438, 434,
+ 427, 424, 419, 415, 410, 406, 403, 399, 394, 390,
+ 384, 384, 377, 374, 370, 366, 362, 359, 355, 351,
+ 347, 342, 342, 336, 333, 330, 326, 323, 320, 316,
+ 312, 308, 305, 302, 299, 296, 293, 288, 287, 283,
+ 280, 277, 274, 272, 268, 266, 262, 256, 256, 256,
+ 251, 248, 245, 242, 240, 237, 234, 232, 228, 226,
+ 223, 221, 218, 216, 214, 211, 208, 205, 203, 201,
+ 198, 196, 192, 191, 188, 187, 183, 181, 179, 176,
+ 175, 171, 171, 168, 165, 163, 160, 159, 156, 154,
+ 152, 150, 148, 146, 144, 142, 139, 138, 135, 133,
+ 131, 128, 128, 125, 123, 121, 119, 117, 115, 113,
+ 111, 110, 107, 105, 103, 102, 100, 98, 96, 94,
+ 92, 91, 89, 86, 86, 83, 82, 80, 77, 76,
+ 74, 73, 71, 69, 67, 66, 64, 63, 61, 59,
+ 57, 55, 54, 52, 51, 49, 47, 46, 44, 43,
+ 41, 40, 38, 36, 35, 33, 32, 30, 29, 27,
+ 25, 24, 22, 21, 19, 18, 16, 15, 13, 12,
+ 10, 9, 7, 6, 4, 3
+};
+
+//------------------------------------------------------------------------------
+// Level cost tables
+
+// fixed costs for coding levels, deduced from the coding tree.
+// This is only the part that doesn't depend on the probability state.
+const uint16_t VP8LevelFixedCosts[MAX_LEVEL + 1] = {
+ 0, 256, 256, 256, 256, 432, 618, 630,
+ 731, 640, 640, 828, 901, 948, 1021, 1101,
+ 1174, 1221, 1294, 1042, 1085, 1115, 1158, 1202,
+ 1245, 1275, 1318, 1337, 1380, 1410, 1453, 1497,
+ 1540, 1570, 1613, 1280, 1295, 1317, 1332, 1358,
+ 1373, 1395, 1410, 1454, 1469, 1491, 1506, 1532,
+ 1547, 1569, 1584, 1601, 1616, 1638, 1653, 1679,
+ 1694, 1716, 1731, 1775, 1790, 1812, 1827, 1853,
+ 1868, 1890, 1905, 1727, 1733, 1742, 1748, 1759,
+ 1765, 1774, 1780, 1800, 1806, 1815, 1821, 1832,
+ 1838, 1847, 1853, 1878, 1884, 1893, 1899, 1910,
+ 1916, 1925, 1931, 1951, 1957, 1966, 1972, 1983,
+ 1989, 1998, 2004, 2027, 2033, 2042, 2048, 2059,
+ 2065, 2074, 2080, 2100, 2106, 2115, 2121, 2132,
+ 2138, 2147, 2153, 2178, 2184, 2193, 2199, 2210,
+ 2216, 2225, 2231, 2251, 2257, 2266, 2272, 2283,
+ 2289, 2298, 2304, 2168, 2174, 2183, 2189, 2200,
+ 2206, 2215, 2221, 2241, 2247, 2256, 2262, 2273,
+ 2279, 2288, 2294, 2319, 2325, 2334, 2340, 2351,
+ 2357, 2366, 2372, 2392, 2398, 2407, 2413, 2424,
+ 2430, 2439, 2445, 2468, 2474, 2483, 2489, 2500,
+ 2506, 2515, 2521, 2541, 2547, 2556, 2562, 2573,
+ 2579, 2588, 2594, 2619, 2625, 2634, 2640, 2651,
+ 2657, 2666, 2672, 2692, 2698, 2707, 2713, 2724,
+ 2730, 2739, 2745, 2540, 2546, 2555, 2561, 2572,
+ 2578, 2587, 2593, 2613, 2619, 2628, 2634, 2645,
+ 2651, 2660, 2666, 2691, 2697, 2706, 2712, 2723,
+ 2729, 2738, 2744, 2764, 2770, 2779, 2785, 2796,
+ 2802, 2811, 2817, 2840, 2846, 2855, 2861, 2872,
+ 2878, 2887, 2893, 2913, 2919, 2928, 2934, 2945,
+ 2951, 2960, 2966, 2991, 2997, 3006, 3012, 3023,
+ 3029, 3038, 3044, 3064, 3070, 3079, 3085, 3096,
+ 3102, 3111, 3117, 2981, 2987, 2996, 3002, 3013,
+ 3019, 3028, 3034, 3054, 3060, 3069, 3075, 3086,
+ 3092, 3101, 3107, 3132, 3138, 3147, 3153, 3164,
+ 3170, 3179, 3185, 3205, 3211, 3220, 3226, 3237,
+ 3243, 3252, 3258, 3281, 3287, 3296, 3302, 3313,
+ 3319, 3328, 3334, 3354, 3360, 3369, 3375, 3386,
+ 3392, 3401, 3407, 3432, 3438, 3447, 3453, 3464,
+ 3470, 3479, 3485, 3505, 3511, 3520, 3526, 3537,
+ 3543, 3552, 3558, 2816, 2822, 2831, 2837, 2848,
+ 2854, 2863, 2869, 2889, 2895, 2904, 2910, 2921,
+ 2927, 2936, 2942, 2967, 2973, 2982, 2988, 2999,
+ 3005, 3014, 3020, 3040, 3046, 3055, 3061, 3072,
+ 3078, 3087, 3093, 3116, 3122, 3131, 3137, 3148,
+ 3154, 3163, 3169, 3189, 3195, 3204, 3210, 3221,
+ 3227, 3236, 3242, 3267, 3273, 3282, 3288, 3299,
+ 3305, 3314, 3320, 3340, 3346, 3355, 3361, 3372,
+ 3378, 3387, 3393, 3257, 3263, 3272, 3278, 3289,
+ 3295, 3304, 3310, 3330, 3336, 3345, 3351, 3362,
+ 3368, 3377, 3383, 3408, 3414, 3423, 3429, 3440,
+ 3446, 3455, 3461, 3481, 3487, 3496, 3502, 3513,
+ 3519, 3528, 3534, 3557, 3563, 3572, 3578, 3589,
+ 3595, 3604, 3610, 3630, 3636, 3645, 3651, 3662,
+ 3668, 3677, 3683, 3708, 3714, 3723, 3729, 3740,
+ 3746, 3755, 3761, 3781, 3787, 3796, 3802, 3813,
+ 3819, 3828, 3834, 3629, 3635, 3644, 3650, 3661,
+ 3667, 3676, 3682, 3702, 3708, 3717, 3723, 3734,
+ 3740, 3749, 3755, 3780, 3786, 3795, 3801, 3812,
+ 3818, 3827, 3833, 3853, 3859, 3868, 3874, 3885,
+ 3891, 3900, 3906, 3929, 3935, 3944, 3950, 3961,
+ 3967, 3976, 3982, 4002, 4008, 4017, 4023, 4034,
+ 4040, 4049, 4055, 4080, 4086, 4095, 4101, 4112,
+ 4118, 4127, 4133, 4153, 4159, 4168, 4174, 4185,
+ 4191, 4200, 4206, 4070, 4076, 4085, 4091, 4102,
+ 4108, 4117, 4123, 4143, 4149, 4158, 4164, 4175,
+ 4181, 4190, 4196, 4221, 4227, 4236, 4242, 4253,
+ 4259, 4268, 4274, 4294, 4300, 4309, 4315, 4326,
+ 4332, 4341, 4347, 4370, 4376, 4385, 4391, 4402,
+ 4408, 4417, 4423, 4443, 4449, 4458, 4464, 4475,
+ 4481, 4490, 4496, 4521, 4527, 4536, 4542, 4553,
+ 4559, 4568, 4574, 4594, 4600, 4609, 4615, 4626,
+ 4632, 4641, 4647, 3515, 3521, 3530, 3536, 3547,
+ 3553, 3562, 3568, 3588, 3594, 3603, 3609, 3620,
+ 3626, 3635, 3641, 3666, 3672, 3681, 3687, 3698,
+ 3704, 3713, 3719, 3739, 3745, 3754, 3760, 3771,
+ 3777, 3786, 3792, 3815, 3821, 3830, 3836, 3847,
+ 3853, 3862, 3868, 3888, 3894, 3903, 3909, 3920,
+ 3926, 3935, 3941, 3966, 3972, 3981, 3987, 3998,
+ 4004, 4013, 4019, 4039, 4045, 4054, 4060, 4071,
+ 4077, 4086, 4092, 3956, 3962, 3971, 3977, 3988,
+ 3994, 4003, 4009, 4029, 4035, 4044, 4050, 4061,
+ 4067, 4076, 4082, 4107, 4113, 4122, 4128, 4139,
+ 4145, 4154, 4160, 4180, 4186, 4195, 4201, 4212,
+ 4218, 4227, 4233, 4256, 4262, 4271, 4277, 4288,
+ 4294, 4303, 4309, 4329, 4335, 4344, 4350, 4361,
+ 4367, 4376, 4382, 4407, 4413, 4422, 4428, 4439,
+ 4445, 4454, 4460, 4480, 4486, 4495, 4501, 4512,
+ 4518, 4527, 4533, 4328, 4334, 4343, 4349, 4360,
+ 4366, 4375, 4381, 4401, 4407, 4416, 4422, 4433,
+ 4439, 4448, 4454, 4479, 4485, 4494, 4500, 4511,
+ 4517, 4526, 4532, 4552, 4558, 4567, 4573, 4584,
+ 4590, 4599, 4605, 4628, 4634, 4643, 4649, 4660,
+ 4666, 4675, 4681, 4701, 4707, 4716, 4722, 4733,
+ 4739, 4748, 4754, 4779, 4785, 4794, 4800, 4811,
+ 4817, 4826, 4832, 4852, 4858, 4867, 4873, 4884,
+ 4890, 4899, 4905, 4769, 4775, 4784, 4790, 4801,
+ 4807, 4816, 4822, 4842, 4848, 4857, 4863, 4874,
+ 4880, 4889, 4895, 4920, 4926, 4935, 4941, 4952,
+ 4958, 4967, 4973, 4993, 4999, 5008, 5014, 5025,
+ 5031, 5040, 5046, 5069, 5075, 5084, 5090, 5101,
+ 5107, 5116, 5122, 5142, 5148, 5157, 5163, 5174,
+ 5180, 5189, 5195, 5220, 5226, 5235, 5241, 5252,
+ 5258, 5267, 5273, 5293, 5299, 5308, 5314, 5325,
+ 5331, 5340, 5346, 4604, 4610, 4619, 4625, 4636,
+ 4642, 4651, 4657, 4677, 4683, 4692, 4698, 4709,
+ 4715, 4724, 4730, 4755, 4761, 4770, 4776, 4787,
+ 4793, 4802, 4808, 4828, 4834, 4843, 4849, 4860,
+ 4866, 4875, 4881, 4904, 4910, 4919, 4925, 4936,
+ 4942, 4951, 4957, 4977, 4983, 4992, 4998, 5009,
+ 5015, 5024, 5030, 5055, 5061, 5070, 5076, 5087,
+ 5093, 5102, 5108, 5128, 5134, 5143, 5149, 5160,
+ 5166, 5175, 5181, 5045, 5051, 5060, 5066, 5077,
+ 5083, 5092, 5098, 5118, 5124, 5133, 5139, 5150,
+ 5156, 5165, 5171, 5196, 5202, 5211, 5217, 5228,
+ 5234, 5243, 5249, 5269, 5275, 5284, 5290, 5301,
+ 5307, 5316, 5322, 5345, 5351, 5360, 5366, 5377,
+ 5383, 5392, 5398, 5418, 5424, 5433, 5439, 5450,
+ 5456, 5465, 5471, 5496, 5502, 5511, 5517, 5528,
+ 5534, 5543, 5549, 5569, 5575, 5584, 5590, 5601,
+ 5607, 5616, 5622, 5417, 5423, 5432, 5438, 5449,
+ 5455, 5464, 5470, 5490, 5496, 5505, 5511, 5522,
+ 5528, 5537, 5543, 5568, 5574, 5583, 5589, 5600,
+ 5606, 5615, 5621, 5641, 5647, 5656, 5662, 5673,
+ 5679, 5688, 5694, 5717, 5723, 5732, 5738, 5749,
+ 5755, 5764, 5770, 5790, 5796, 5805, 5811, 5822,
+ 5828, 5837, 5843, 5868, 5874, 5883, 5889, 5900,
+ 5906, 5915, 5921, 5941, 5947, 5956, 5962, 5973,
+ 5979, 5988, 5994, 5858, 5864, 5873, 5879, 5890,
+ 5896, 5905, 5911, 5931, 5937, 5946, 5952, 5963,
+ 5969, 5978, 5984, 6009, 6015, 6024, 6030, 6041,
+ 6047, 6056, 6062, 6082, 6088, 6097, 6103, 6114,
+ 6120, 6129, 6135, 6158, 6164, 6173, 6179, 6190,
+ 6196, 6205, 6211, 6231, 6237, 6246, 6252, 6263,
+ 6269, 6278, 6284, 6309, 6315, 6324, 6330, 6341,
+ 6347, 6356, 6362, 6382, 6388, 6397, 6403, 6414,
+ 6420, 6429, 6435, 3515, 3521, 3530, 3536, 3547,
+ 3553, 3562, 3568, 3588, 3594, 3603, 3609, 3620,
+ 3626, 3635, 3641, 3666, 3672, 3681, 3687, 3698,
+ 3704, 3713, 3719, 3739, 3745, 3754, 3760, 3771,
+ 3777, 3786, 3792, 3815, 3821, 3830, 3836, 3847,
+ 3853, 3862, 3868, 3888, 3894, 3903, 3909, 3920,
+ 3926, 3935, 3941, 3966, 3972, 3981, 3987, 3998,
+ 4004, 4013, 4019, 4039, 4045, 4054, 4060, 4071,
+ 4077, 4086, 4092, 3956, 3962, 3971, 3977, 3988,
+ 3994, 4003, 4009, 4029, 4035, 4044, 4050, 4061,
+ 4067, 4076, 4082, 4107, 4113, 4122, 4128, 4139,
+ 4145, 4154, 4160, 4180, 4186, 4195, 4201, 4212,
+ 4218, 4227, 4233, 4256, 4262, 4271, 4277, 4288,
+ 4294, 4303, 4309, 4329, 4335, 4344, 4350, 4361,
+ 4367, 4376, 4382, 4407, 4413, 4422, 4428, 4439,
+ 4445, 4454, 4460, 4480, 4486, 4495, 4501, 4512,
+ 4518, 4527, 4533, 4328, 4334, 4343, 4349, 4360,
+ 4366, 4375, 4381, 4401, 4407, 4416, 4422, 4433,
+ 4439, 4448, 4454, 4479, 4485, 4494, 4500, 4511,
+ 4517, 4526, 4532, 4552, 4558, 4567, 4573, 4584,
+ 4590, 4599, 4605, 4628, 4634, 4643, 4649, 4660,
+ 4666, 4675, 4681, 4701, 4707, 4716, 4722, 4733,
+ 4739, 4748, 4754, 4779, 4785, 4794, 4800, 4811,
+ 4817, 4826, 4832, 4852, 4858, 4867, 4873, 4884,
+ 4890, 4899, 4905, 4769, 4775, 4784, 4790, 4801,
+ 4807, 4816, 4822, 4842, 4848, 4857, 4863, 4874,
+ 4880, 4889, 4895, 4920, 4926, 4935, 4941, 4952,
+ 4958, 4967, 4973, 4993, 4999, 5008, 5014, 5025,
+ 5031, 5040, 5046, 5069, 5075, 5084, 5090, 5101,
+ 5107, 5116, 5122, 5142, 5148, 5157, 5163, 5174,
+ 5180, 5189, 5195, 5220, 5226, 5235, 5241, 5252,
+ 5258, 5267, 5273, 5293, 5299, 5308, 5314, 5325,
+ 5331, 5340, 5346, 4604, 4610, 4619, 4625, 4636,
+ 4642, 4651, 4657, 4677, 4683, 4692, 4698, 4709,
+ 4715, 4724, 4730, 4755, 4761, 4770, 4776, 4787,
+ 4793, 4802, 4808, 4828, 4834, 4843, 4849, 4860,
+ 4866, 4875, 4881, 4904, 4910, 4919, 4925, 4936,
+ 4942, 4951, 4957, 4977, 4983, 4992, 4998, 5009,
+ 5015, 5024, 5030, 5055, 5061, 5070, 5076, 5087,
+ 5093, 5102, 5108, 5128, 5134, 5143, 5149, 5160,
+ 5166, 5175, 5181, 5045, 5051, 5060, 5066, 5077,
+ 5083, 5092, 5098, 5118, 5124, 5133, 5139, 5150,
+ 5156, 5165, 5171, 5196, 5202, 5211, 5217, 5228,
+ 5234, 5243, 5249, 5269, 5275, 5284, 5290, 5301,
+ 5307, 5316, 5322, 5345, 5351, 5360, 5366, 5377,
+ 5383, 5392, 5398, 5418, 5424, 5433, 5439, 5450,
+ 5456, 5465, 5471, 5496, 5502, 5511, 5517, 5528,
+ 5534, 5543, 5549, 5569, 5575, 5584, 5590, 5601,
+ 5607, 5616, 5622, 5417, 5423, 5432, 5438, 5449,
+ 5455, 5464, 5470, 5490, 5496, 5505, 5511, 5522,
+ 5528, 5537, 5543, 5568, 5574, 5583, 5589, 5600,
+ 5606, 5615, 5621, 5641, 5647, 5656, 5662, 5673,
+ 5679, 5688, 5694, 5717, 5723, 5732, 5738, 5749,
+ 5755, 5764, 5770, 5790, 5796, 5805, 5811, 5822,
+ 5828, 5837, 5843, 5868, 5874, 5883, 5889, 5900,
+ 5906, 5915, 5921, 5941, 5947, 5956, 5962, 5973,
+ 5979, 5988, 5994, 5858, 5864, 5873, 5879, 5890,
+ 5896, 5905, 5911, 5931, 5937, 5946, 5952, 5963,
+ 5969, 5978, 5984, 6009, 6015, 6024, 6030, 6041,
+ 6047, 6056, 6062, 6082, 6088, 6097, 6103, 6114,
+ 6120, 6129, 6135, 6158, 6164, 6173, 6179, 6190,
+ 6196, 6205, 6211, 6231, 6237, 6246, 6252, 6263,
+ 6269, 6278, 6284, 6309, 6315, 6324, 6330, 6341,
+ 6347, 6356, 6362, 6382, 6388, 6397, 6403, 6414,
+ 6420, 6429, 6435, 5303, 5309, 5318, 5324, 5335,
+ 5341, 5350, 5356, 5376, 5382, 5391, 5397, 5408,
+ 5414, 5423, 5429, 5454, 5460, 5469, 5475, 5486,
+ 5492, 5501, 5507, 5527, 5533, 5542, 5548, 5559,
+ 5565, 5574, 5580, 5603, 5609, 5618, 5624, 5635,
+ 5641, 5650, 5656, 5676, 5682, 5691, 5697, 5708,
+ 5714, 5723, 5729, 5754, 5760, 5769, 5775, 5786,
+ 5792, 5801, 5807, 5827, 5833, 5842, 5848, 5859,
+ 5865, 5874, 5880, 5744, 5750, 5759, 5765, 5776,
+ 5782, 5791, 5797, 5817, 5823, 5832, 5838, 5849,
+ 5855, 5864, 5870, 5895, 5901, 5910, 5916, 5927,
+ 5933, 5942, 5948, 5968, 5974, 5983, 5989, 6000,
+ 6006, 6015, 6021, 6044, 6050, 6059, 6065, 6076,
+ 6082, 6091, 6097, 6117, 6123, 6132, 6138, 6149,
+ 6155, 6164, 6170, 6195, 6201, 6210, 6216, 6227,
+ 6233, 6242, 6248, 6268, 6274, 6283, 6289, 6300,
+ 6306, 6315, 6321, 6116, 6122, 6131, 6137, 6148,
+ 6154, 6163, 6169, 6189, 6195, 6204, 6210, 6221,
+ 6227, 6236, 6242, 6267, 6273, 6282, 6288, 6299,
+ 6305, 6314, 6320, 6340, 6346, 6355, 6361, 6372,
+ 6378, 6387, 6393, 6416, 6422, 6431, 6437, 6448,
+ 6454, 6463, 6469, 6489, 6495, 6504, 6510, 6521,
+ 6527, 6536, 6542, 6567, 6573, 6582, 6588, 6599,
+ 6605, 6614, 6620, 6640, 6646, 6655, 6661, 6672,
+ 6678, 6687, 6693, 6557, 6563, 6572, 6578, 6589,
+ 6595, 6604, 6610, 6630, 6636, 6645, 6651, 6662,
+ 6668, 6677, 6683, 6708, 6714, 6723, 6729, 6740,
+ 6746, 6755, 6761, 6781, 6787, 6796, 6802, 6813,
+ 6819, 6828, 6834, 6857, 6863, 6872, 6878, 6889,
+ 6895, 6904, 6910, 6930, 6936, 6945, 6951, 6962,
+ 6968, 6977, 6983, 7008, 7014, 7023, 7029, 7040,
+ 7046, 7055, 7061, 7081, 7087, 7096, 7102, 7113,
+ 7119, 7128, 7134, 6392, 6398, 6407, 6413, 6424,
+ 6430, 6439, 6445, 6465, 6471, 6480, 6486, 6497,
+ 6503, 6512, 6518, 6543, 6549, 6558, 6564, 6575,
+ 6581, 6590, 6596, 6616, 6622, 6631, 6637, 6648,
+ 6654, 6663, 6669, 6692, 6698, 6707, 6713, 6724,
+ 6730, 6739, 6745, 6765, 6771, 6780, 6786, 6797,
+ 6803, 6812, 6818, 6843, 6849, 6858, 6864, 6875,
+ 6881, 6890, 6896, 6916, 6922, 6931, 6937, 6948,
+ 6954, 6963, 6969, 6833, 6839, 6848, 6854, 6865,
+ 6871, 6880, 6886, 6906, 6912, 6921, 6927, 6938,
+ 6944, 6953, 6959, 6984, 6990, 6999, 7005, 7016,
+ 7022, 7031, 7037, 7057, 7063, 7072, 7078, 7089,
+ 7095, 7104, 7110, 7133, 7139, 7148, 7154, 7165,
+ 7171, 7180, 7186, 7206, 7212, 7221, 7227, 7238,
+ 7244, 7253, 7259, 7284, 7290, 7299, 7305, 7316,
+ 7322, 7331, 7337, 7357, 7363, 7372, 7378, 7389,
+ 7395, 7404, 7410, 7205, 7211, 7220, 7226, 7237,
+ 7243, 7252, 7258, 7278, 7284, 7293, 7299, 7310,
+ 7316, 7325, 7331, 7356, 7362, 7371, 7377, 7388,
+ 7394, 7403, 7409, 7429, 7435, 7444, 7450, 7461,
+ 7467, 7476, 7482, 7505, 7511, 7520, 7526, 7537,
+ 7543, 7552, 7558, 7578, 7584, 7593, 7599, 7610,
+ 7616, 7625, 7631, 7656, 7662, 7671, 7677, 7688,
+ 7694, 7703, 7709, 7729, 7735, 7744, 7750, 7761
+};
+
+//------------------------------------------------------------------------------
+// Tables for level coding
+
+const uint8_t VP8EncBands[16 + 1] = {
+ 0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7,
+ 0 // sentinel
+};
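+// The bands group the 16 coefficient positions (zigzag scan order) for
+// probability/cost table selection; the trailing 0 is a sentinel so that
+// VP8EncBands[n + 1] lookups stay in bounds for n == 15.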
+
+//------------------------------------------------------------------------------
+// Mode costs
+
+static int GetResidualCost_C(int ctx0, const VP8Residual* const res) {
+ int n = res->first;
+ // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
+ const int p0 = res->prob[n][ctx0][0];
+ CostArrayPtr const costs = res->costs;
+ const uint16_t* t = costs[n][ctx0];
+ // bit_cost(1, p0) is already incorporated in t[] tables, but only if ctx != 0
+ // (as required by the syntax). For ctx0 == 0, we need to add it here or it'll
+ // be missing during the loop.
+ int cost = (ctx0 == 0) ? VP8BitCost(1, p0) : 0;
+
+ if (res->last < 0) {
+ return VP8BitCost(0, p0);
+ }
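+ // Walk the coefficients, chaining contexts: the magnitude of coefficient n
+ // (clamped to 2) selects the cost table used for coefficient n + 1. E.g.
+ // (illustrative) coeffs = {5, -1}: n = 0 is costed with t and yields
+ // ctx = 2, so the last coefficient is costed with costs[1][2].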
+ for (; n < res->last; ++n) {
+ const int v = abs(res->coeffs[n]);
+ const int ctx = (v >= 2) ? 2 : v;
+ cost += VP8LevelCost(t, v);
+ t = costs[n + 1][ctx];
+ }
+ // Last coefficient is always non-zero
+ {
+ const int v = abs(res->coeffs[n]);
+ assert(v != 0);
+ cost += VP8LevelCost(t, v);
+ if (n < 15) {
+ const int b = VP8EncBands[n + 1];
+ const int ctx = (v == 1) ? 1 : 2;
+ const int last_p0 = res->prob[b][ctx][0];
+ cost += VP8BitCost(0, last_p0);
+ }
+ }
+ return cost;
+}
+
+static void SetResidualCoeffs_C(const int16_t* const coeffs,
+ VP8Residual* const res) {
+ int n;
+ res->last = -1;
+ assert(res->first == 0 || coeffs[0] == 0);
+ for (n = 15; n >= 0; --n) {
+ if (coeffs[n]) {
+ res->last = n;
+ break;
+ }
+ }
+ res->coeffs = coeffs;
+}
+
+//------------------------------------------------------------------------------
+// init function
+
+VP8GetResidualCostFunc VP8GetResidualCost;
+VP8SetResidualCoeffsFunc VP8SetResidualCoeffs;
+
+extern void VP8EncDspCostInitMIPS32(void);
+extern void VP8EncDspCostInitMIPSdspR2(void);
+extern void VP8EncDspCostInitSSE2(void);
+extern void VP8EncDspCostInitNEON(void);
+
+WEBP_DSP_INIT_FUNC(VP8EncDspCostInit) {
+ VP8GetResidualCost = GetResidualCost_C;
+ VP8SetResidualCoeffs = SetResidualCoeffs_C;
+
+ // If defined, use CPUInfo() to overwrite some pointers with faster versions.
+ if (VP8GetCPUInfo != NULL) {
+#if defined(WEBP_USE_MIPS32)
+ if (VP8GetCPUInfo(kMIPS32)) {
+ VP8EncDspCostInitMIPS32();
+ }
+#endif
+#if defined(WEBP_USE_MIPS_DSP_R2)
+ if (VP8GetCPUInfo(kMIPSdspR2)) {
+ VP8EncDspCostInitMIPSdspR2();
+ }
+#endif
+#if defined(WEBP_HAVE_SSE2)
+ if (VP8GetCPUInfo(kSSE2)) {
+ VP8EncDspCostInitSSE2();
+ }
+#endif
+#if defined(WEBP_HAVE_NEON)
+ if (VP8GetCPUInfo(kNEON)) {
+ VP8EncDspCostInitNEON();
+ }
+#endif
+ }
+}
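+// (WEBP_DSP_INIT_FUNC wraps the body above so the setup is expected to run
+// effectively once before VP8GetResidualCost/VP8SetResidualCoeffs are used.)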
+
+//------------------------------------------------------------------------------
diff --git a/media/libwebp/dsp/cost_mips32.c b/media/libwebp/dsp/cost_mips32.c
new file mode 100644
index 0000000000..4e97e8a756
--- /dev/null
+++ b/media/libwebp/dsp/cost_mips32.c
@@ -0,0 +1,154 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Author: Djordje Pesut (djordje.pesut@imgtec.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_MIPS32)
+
+#include "../enc/cost_enc.h"
+
+static int GetResidualCost_MIPS32(int ctx0, const VP8Residual* const res) {
+ int temp0, temp1;
+ int v_reg, ctx_reg;
+ int n = res->first;
+ // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
+ int p0 = res->prob[n][ctx0][0];
+ CostArrayPtr const costs = res->costs;
+ const uint16_t* t = costs[n][ctx0];
+ // bit_cost(1, p0) is already incorporated in t[] tables, but only if ctx != 0
+ // (as required by the syntax). For ctx0 == 0, we need to add it here or it'll
+ // be missing during the loop.
+ int cost = (ctx0 == 0) ? VP8BitCost(1, p0) : 0;
+ const int16_t* res_coeffs = res->coeffs;
+ const int res_last = res->last;
+ const int const_max_level = MAX_VARIABLE_LEVEL;
+ const int const_2 = 2;
+ const uint16_t** p_costs = &costs[n][0];
+ const size_t inc_p_costs = NUM_CTX * sizeof(*p_costs);
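+ // The assembly below advances p_costs by one row (NUM_CTX pointers) per
+ // coefficient, then loads t = costs[n + 1][ctx] from that row, mirroring
+ // the table chaining of the C version.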
+
+ if (res->last < 0) {
+ return VP8BitCost(0, p0);
+ }
+
+ __asm__ volatile (
+ ".set push \n\t"
+ ".set noreorder \n\t"
+ "subu %[temp1], %[res_last], %[n] \n\t"
+ "sll %[temp0], %[n], 1 \n\t"
+ "blez %[temp1], 2f \n\t"
+ " addu %[res_coeffs], %[res_coeffs], %[temp0] \n\t"
+ "1: \n\t"
+ "lh %[v_reg], 0(%[res_coeffs]) \n\t"
+ "addiu %[n], %[n], 1 \n\t"
+ "negu %[temp0], %[v_reg] \n\t"
+ "slti %[temp1], %[v_reg], 0 \n\t"
+ "movn %[v_reg], %[temp0], %[temp1] \n\t"
+ "sltiu %[temp0], %[v_reg], 2 \n\t"
+ "move %[ctx_reg], %[v_reg] \n\t"
+ "movz %[ctx_reg], %[const_2], %[temp0] \n\t"
+ "sll %[temp1], %[v_reg], 1 \n\t"
+ "addu %[temp1], %[temp1], %[VP8LevelFixedCosts] \n\t"
+ "lhu %[temp1], 0(%[temp1]) \n\t"
+ "slt %[temp0], %[v_reg], %[const_max_level] \n\t"
+ "movz %[v_reg], %[const_max_level], %[temp0] \n\t"
+ "addu %[cost], %[cost], %[temp1] \n\t"
+ "sll %[v_reg], %[v_reg], 1 \n\t"
+ "sll %[ctx_reg], %[ctx_reg], 2 \n\t"
+ "addu %[v_reg], %[v_reg], %[t] \n\t"
+ "lhu %[temp0], 0(%[v_reg]) \n\t"
+ "addu %[p_costs], %[p_costs], %[inc_p_costs] \n\t"
+ "addu %[t], %[p_costs], %[ctx_reg] \n\t"
+ "addu %[cost], %[cost], %[temp0] \n\t"
+ "addiu %[res_coeffs], %[res_coeffs], 2 \n\t"
+ "bne %[n], %[res_last], 1b \n\t"
+ " lw %[t], 0(%[t]) \n\t"
+ "2: \n\t"
+ ".set pop \n\t"
+ : [cost]"+&r"(cost), [t]"+&r"(t), [n]"+&r"(n), [v_reg]"=&r"(v_reg),
+ [ctx_reg]"=&r"(ctx_reg), [p_costs]"+&r"(p_costs), [temp0]"=&r"(temp0),
+ [temp1]"=&r"(temp1), [res_coeffs]"+&r"(res_coeffs)
+ : [const_2]"r"(const_2), [const_max_level]"r"(const_max_level),
+ [VP8LevelFixedCosts]"r"(VP8LevelFixedCosts), [res_last]"r"(res_last),
+ [inc_p_costs]"r"(inc_p_costs)
+ : "memory"
+ );
+
+ // Last coefficient is always non-zero
+ {
+ const int v = abs(res->coeffs[n]);
+ assert(v != 0);
+ cost += VP8LevelCost(t, v);
+ if (n < 15) {
+ const int b = VP8EncBands[n + 1];
+ const int ctx = (v == 1) ? 1 : 2;
+ const int last_p0 = res->prob[b][ctx][0];
+ cost += VP8BitCost(0, last_p0);
+ }
+ }
+ return cost;
+}
+
+static void SetResidualCoeffs_MIPS32(const int16_t* const coeffs,
+ VP8Residual* const res) {
+ const int16_t* p_coeffs = coeffs;
+ int temp0, temp1, temp2, n, n1;
+ assert(res->first == 0 || coeffs[0] == 0);
+
+ __asm__ volatile (
+ ".set push \n\t"
+ ".set noreorder \n\t"
+ "addiu %[p_coeffs], %[p_coeffs], 28 \n\t"
+ "li %[n], 15 \n\t"
+ "li %[temp2], -1 \n\t"
+ "0: \n\t"
+ "ulw %[temp0], 0(%[p_coeffs]) \n\t"
+ "beqz %[temp0], 1f \n\t"
+#if defined(WORDS_BIGENDIAN)
+ " sll %[temp1], %[temp0], 16 \n\t"
+#else
+ " srl %[temp1], %[temp0], 16 \n\t"
+#endif
+ "addiu %[n1], %[n], -1 \n\t"
+ "movz %[temp0], %[n1], %[temp1] \n\t"
+ "movn %[temp0], %[n], %[temp1] \n\t"
+ "j 2f \n\t"
+ " addiu %[temp2], %[temp0], 0 \n\t"
+ "1: \n\t"
+ "addiu %[n], %[n], -2 \n\t"
+ "bgtz %[n], 0b \n\t"
+ " addiu %[p_coeffs], %[p_coeffs], -4 \n\t"
+ "2: \n\t"
+ ".set pop \n\t"
+ : [p_coeffs]"+&r"(p_coeffs), [temp0]"=&r"(temp0),
+ [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [n]"=&r"(n), [n1]"=&r"(n1)
+ :
+ : "memory"
+ );
+ res->last = temp2;
+ res->coeffs = coeffs;
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8EncDspCostInitMIPS32(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInitMIPS32(void) {
+ VP8GetResidualCost = GetResidualCost_MIPS32;
+ VP8SetResidualCoeffs = SetResidualCoeffs_MIPS32;
+}
+
+#else // !WEBP_USE_MIPS32
+
+WEBP_DSP_INIT_STUB(VP8EncDspCostInitMIPS32)
+
+#endif // WEBP_USE_MIPS32
diff --git a/media/libwebp/dsp/cost_mips_dsp_r2.c b/media/libwebp/dsp/cost_mips_dsp_r2.c
new file mode 100644
index 0000000000..e9ee99f6ac
--- /dev/null
+++ b/media/libwebp/dsp/cost_mips_dsp_r2.c
@@ -0,0 +1,107 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Author: Djordje Pesut (djordje.pesut@imgtec.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_MIPS_DSP_R2)
+
+#include "../enc/cost_enc.h"
+
+static int GetResidualCost_MIPSdspR2(int ctx0, const VP8Residual* const res) {
+ int temp0, temp1;
+ int v_reg, ctx_reg;
+ int n = res->first;
+ // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
+ int p0 = res->prob[n][ctx0][0];
+ CostArrayPtr const costs = res->costs;
+ const uint16_t* t = costs[n][ctx0];
+ // bit_cost(1, p0) is already incorporated in t[] tables, but only if ctx != 0
+ // (as required by the syntax). For ctx0 == 0, we need to add it here or it'll
+ // be missing during the loop.
+ int cost = (ctx0 == 0) ? VP8BitCost(1, p0) : 0;
+ const int16_t* res_coeffs = res->coeffs;
+ const int res_last = res->last;
+ const int const_max_level = MAX_VARIABLE_LEVEL;
+ const int const_2 = 2;
+ const uint16_t** p_costs = &costs[n][0];
+ const size_t inc_p_costs = NUM_CTX * sizeof(*p_costs);
+
+ if (res->last < 0) {
+ return VP8BitCost(0, p0);
+ }
+
+ __asm__ volatile (
+ ".set push \n\t"
+ ".set noreorder \n\t"
+ "subu %[temp1], %[res_last], %[n] \n\t"
+ "blez %[temp1], 2f \n\t"
+ " nop \n\t"
+ "1: \n\t"
+ "sll %[temp0], %[n], 1 \n\t"
+ "lhx %[v_reg], %[temp0](%[res_coeffs]) \n\t"
+ "addiu %[n], %[n], 1 \n\t"
+ "absq_s.w %[v_reg], %[v_reg] \n\t"
+ "sltiu %[temp0], %[v_reg], 2 \n\t"
+ "move %[ctx_reg], %[v_reg] \n\t"
+ "movz %[ctx_reg], %[const_2], %[temp0] \n\t"
+ "sll %[temp1], %[v_reg], 1 \n\t"
+ "lhx %[temp1], %[temp1](%[VP8LevelFixedCosts]) \n\t"
+ "slt %[temp0], %[v_reg], %[const_max_level] \n\t"
+ "movz %[v_reg], %[const_max_level], %[temp0] \n\t"
+ "addu %[cost], %[cost], %[temp1] \n\t"
+ "sll %[v_reg], %[v_reg], 1 \n\t"
+ "sll %[ctx_reg], %[ctx_reg], 2 \n\t"
+ "lhx %[temp0], %[v_reg](%[t]) \n\t"
+ "addu %[p_costs], %[p_costs], %[inc_p_costs] \n\t"
+ "addu %[t], %[p_costs], %[ctx_reg] \n\t"
+ "addu %[cost], %[cost], %[temp0] \n\t"
+ "bne %[n], %[res_last], 1b \n\t"
+ " lw %[t], 0(%[t]) \n\t"
+ "2: \n\t"
+ ".set pop \n\t"
+ : [cost]"+&r"(cost), [t]"+&r"(t), [n]"+&r"(n), [v_reg]"=&r"(v_reg),
+ [ctx_reg]"=&r"(ctx_reg), [p_costs]"+&r"(p_costs), [temp0]"=&r"(temp0),
+ [temp1]"=&r"(temp1)
+ : [const_2]"r"(const_2), [const_max_level]"r"(const_max_level),
+ [VP8LevelFixedCosts]"r"(VP8LevelFixedCosts), [res_last]"r"(res_last),
+ [res_coeffs]"r"(res_coeffs), [inc_p_costs]"r"(inc_p_costs)
+ : "memory"
+ );
+
+ // Last coefficient is always non-zero
+ {
+ const int v = abs(res->coeffs[n]);
+ assert(v != 0);
+ cost += VP8LevelCost(t, v);
+ if (n < 15) {
+ const int b = VP8EncBands[n + 1];
+ const int ctx = (v == 1) ? 1 : 2;
+ const int last_p0 = res->prob[b][ctx][0];
+ cost += VP8BitCost(0, last_p0);
+ }
+ }
+ return cost;
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8EncDspCostInitMIPSdspR2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInitMIPSdspR2(void) {
+ VP8GetResidualCost = GetResidualCost_MIPSdspR2;
+}
+
+#else // !WEBP_USE_MIPS_DSP_R2
+
+WEBP_DSP_INIT_STUB(VP8EncDspCostInitMIPSdspR2)
+
+#endif // WEBP_USE_MIPS_DSP_R2
diff --git a/media/libwebp/dsp/cost_neon.c b/media/libwebp/dsp/cost_neon.c
new file mode 100644
index 0000000000..78f715ff27
--- /dev/null
+++ b/media/libwebp/dsp/cost_neon.c
@@ -0,0 +1,122 @@
+// Copyright 2018 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// ARM NEON version of cost functions
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_NEON)
+
+#include "../dsp/neon.h"
+#include "../enc/cost_enc.h"
+
+static const uint8_t position[16] = { 1, 2, 3, 4, 5, 6, 7, 8,
+ 9, 10, 11, 12, 13, 14, 15, 16 };
+
+static void SetResidualCoeffs_NEON(const int16_t* const coeffs,
+ VP8Residual* const res) {
+ const int16x8_t minus_one = vdupq_n_s16(-1);
+ const int16x8_t coeffs_0 = vld1q_s16(coeffs);
+ const int16x8_t coeffs_1 = vld1q_s16(coeffs + 8);
+ const uint16x8_t eob_0 = vtstq_s16(coeffs_0, minus_one);
+ const uint16x8_t eob_1 = vtstq_s16(coeffs_1, minus_one);
+ const uint8x16_t eob = vcombine_u8(vqmovn_u16(eob_0), vqmovn_u16(eob_1));
+ const uint8x16_t masked = vandq_u8(eob, vld1q_u8(position));
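+ // Each byte of 'masked' now holds (index + 1) where the coefficient is
+ // non-zero and 0 elsewhere, so the horizontal maximum below is last + 1
+ // (0 when all coefficients are zero, giving res->last == -1).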
+
+#ifdef __aarch64__
+ res->last = vmaxvq_u8(masked) - 1;
+#else
+ const uint8x8_t eob_8x8 = vmax_u8(vget_low_u8(masked), vget_high_u8(masked));
+ const uint16x8_t eob_16x8 = vmovl_u8(eob_8x8);
+ const uint16x4_t eob_16x4 =
+ vmax_u16(vget_low_u16(eob_16x8), vget_high_u16(eob_16x8));
+ const uint32x4_t eob_32x4 = vmovl_u16(eob_16x4);
+ uint32x2_t eob_32x2 =
+ vmax_u32(vget_low_u32(eob_32x4), vget_high_u32(eob_32x4));
+ eob_32x2 = vpmax_u32(eob_32x2, eob_32x2);
+
+ vst1_lane_s32(&res->last, vreinterpret_s32_u32(eob_32x2), 0);
+ --res->last;
+#endif // __aarch64__
+
+ res->coeffs = coeffs;
+}
+
+static int GetResidualCost_NEON(int ctx0, const VP8Residual* const res) {
+ uint8_t levels[16], ctxs[16];
+ uint16_t abs_levels[16];
+ int n = res->first;
+ // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
+ const int p0 = res->prob[n][ctx0][0];
+ CostArrayPtr const costs = res->costs;
+ const uint16_t* t = costs[n][ctx0];
+ // bit_cost(1, p0) is already incorporated in t[] tables, but only if ctx != 0
+ // (as required by the syntax). For ctx0 == 0, we need to add it here or it'll
+ // be missing during the loop.
+ int cost = (ctx0 == 0) ? VP8BitCost(1, p0) : 0;
+
+ if (res->last < 0) {
+ return VP8BitCost(0, p0);
+ }
+
+ { // precompute clamped levels and contexts, packed to 8b.
+ const uint8x16_t kCst2 = vdupq_n_u8(2);
+ const uint8x16_t kCst67 = vdupq_n_u8(MAX_VARIABLE_LEVEL);
+ const int16x8_t c0 = vld1q_s16(res->coeffs);
+ const int16x8_t c1 = vld1q_s16(res->coeffs + 8);
+ const uint16x8_t E0 = vreinterpretq_u16_s16(vabsq_s16(c0));
+ const uint16x8_t E1 = vreinterpretq_u16_s16(vabsq_s16(c1));
+ const uint8x16_t F = vcombine_u8(vqmovn_u16(E0), vqmovn_u16(E1));
+ const uint8x16_t G = vminq_u8(F, kCst2); // context = 0,1,2
+ const uint8x16_t H = vminq_u8(F, kCst67); // clamp_level in [0..67]
+
+ vst1q_u8(ctxs, G);
+ vst1q_u8(levels, H);
+
+ vst1q_u16(abs_levels, E0);
+ vst1q_u16(abs_levels + 8, E1);
+ }
+ for (; n < res->last; ++n) {
+ const int ctx = ctxs[n];
+ const int level = levels[n];
+ const int flevel = abs_levels[n]; // full level
+ cost += VP8LevelFixedCosts[flevel] + t[level]; // simplified VP8LevelCost()
+ t = costs[n + 1][ctx];
+ }
+ // Last coefficient is always non-zero
+ {
+ const int level = levels[n];
+ const int flevel = abs_levels[n];
+ assert(flevel != 0);
+ cost += VP8LevelFixedCosts[flevel] + t[level];
+ if (n < 15) {
+ const int b = VP8EncBands[n + 1];
+ const int ctx = ctxs[n];
+ const int last_p0 = res->prob[b][ctx][0];
+ cost += VP8BitCost(0, last_p0);
+ }
+ }
+ return cost;
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8EncDspCostInitNEON(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInitNEON(void) {
+ VP8SetResidualCoeffs = SetResidualCoeffs_NEON;
+ VP8GetResidualCost = GetResidualCost_NEON;
+}
+
+#else // !WEBP_USE_NEON
+
+WEBP_DSP_INIT_STUB(VP8EncDspCostInitNEON)
+
+#endif // WEBP_USE_NEON
diff --git a/media/libwebp/dsp/cost_sse2.c b/media/libwebp/dsp/cost_sse2.c
new file mode 100644
index 0000000000..8cfe4e0091
--- /dev/null
+++ b/media/libwebp/dsp/cost_sse2.c
@@ -0,0 +1,119 @@
+// Copyright 2015 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// SSE2 version of cost functions
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_SSE2)
+#include <emmintrin.h>
+
+#include "../enc/cost_enc.h"
+#include "../enc/vp8i_enc.h"
+#include "../utils/utils.h"
+
+//------------------------------------------------------------------------------
+
+static void SetResidualCoeffs_SSE2(const int16_t* const coeffs,
+ VP8Residual* const res) {
+ const __m128i c0 = _mm_loadu_si128((const __m128i*)(coeffs + 0));
+ const __m128i c1 = _mm_loadu_si128((const __m128i*)(coeffs + 8));
+ // Use SSE2 to compare 16 values with a single instruction.
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i m0 = _mm_packs_epi16(c0, c1);
+ const __m128i m1 = _mm_cmpeq_epi8(m0, zero);
+ // Get the comparison results as a bitmask into the low 16 bits, then
+ // complement it so that the set bits mark the entries that are not equal
+ // to zero. We don't need to mask out the least significant bits according
+ // to res->first, since coeffs[0] is 0 if res->first > 0.
+ const uint32_t mask = 0x0000ffffu ^ (uint32_t)_mm_movemask_epi8(m1);
+ // The position of the most significant non-zero bit indicates the position of
+ // the last non-zero value.
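+ // E.g. (illustrative) coeffs = {7, 0, 0, -2, 0, ...}: cmpeq-with-zero sets
+ // 0xFF in the zero lanes, movemask yields 0xfff6, and the XOR leaves
+ // mask == 0x0009, so BitsLog2Floor(mask) == 3, the last non-zero index.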
+ assert(res->first == 0 || coeffs[0] == 0);
+ res->last = mask ? BitsLog2Floor(mask) : -1;
+ res->coeffs = coeffs;
+}
+
+static int GetResidualCost_SSE2(int ctx0, const VP8Residual* const res) {
+ uint8_t levels[16], ctxs[16];
+ uint16_t abs_levels[16];
+ int n = res->first;
+ // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
+ const int p0 = res->prob[n][ctx0][0];
+ CostArrayPtr const costs = res->costs;
+ const uint16_t* t = costs[n][ctx0];
+ // bit_cost(1, p0) is already incorporated in t[] tables, but only if ctx != 0
+ // (as required by the syntax). For ctx0 == 0, we need to add it here or it'll
+ // be missing during the loop.
+ int cost = (ctx0 == 0) ? VP8BitCost(1, p0) : 0;
+
+ if (res->last < 0) {
+ return VP8BitCost(0, p0);
+ }
+
+ { // precompute clamped levels and contexts, packed to 8b.
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i kCst2 = _mm_set1_epi8(2);
+ const __m128i kCst67 = _mm_set1_epi8(MAX_VARIABLE_LEVEL);
+ const __m128i c0 = _mm_loadu_si128((const __m128i*)&res->coeffs[0]);
+ const __m128i c1 = _mm_loadu_si128((const __m128i*)&res->coeffs[8]);
+ const __m128i D0 = _mm_sub_epi16(zero, c0);
+ const __m128i D1 = _mm_sub_epi16(zero, c1);
+ const __m128i E0 = _mm_max_epi16(c0, D0); // abs(v), 16b
+ const __m128i E1 = _mm_max_epi16(c1, D1);
+ const __m128i F = _mm_packs_epi16(E0, E1);
+ const __m128i G = _mm_min_epu8(F, kCst2); // context = 0,1,2
+ const __m128i H = _mm_min_epu8(F, kCst67); // clamp_level in [0..67]
+
+ _mm_storeu_si128((__m128i*)&ctxs[0], G);
+ _mm_storeu_si128((__m128i*)&levels[0], H);
+
+ _mm_storeu_si128((__m128i*)&abs_levels[0], E0);
+ _mm_storeu_si128((__m128i*)&abs_levels[8], E1);
+ }
+ for (; n < res->last; ++n) {
+ const int ctx = ctxs[n];
+ const int level = levels[n];
+ const int flevel = abs_levels[n]; // full level
+ cost += VP8LevelFixedCosts[flevel] + t[level]; // simplified VP8LevelCost()
+ t = costs[n + 1][ctx];
+ }
+ // Last coefficient is always non-zero
+ {
+ const int level = levels[n];
+ const int flevel = abs_levels[n];
+ assert(flevel != 0);
+ cost += VP8LevelFixedCosts[flevel] + t[level];
+ if (n < 15) {
+ const int b = VP8EncBands[n + 1];
+ const int ctx = ctxs[n];
+ const int last_p0 = res->prob[b][ctx][0];
+ cost += VP8BitCost(0, last_p0);
+ }
+ }
+ return cost;
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8EncDspCostInitSSE2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInitSSE2(void) {
+ VP8SetResidualCoeffs = SetResidualCoeffs_SSE2;
+ VP8GetResidualCost = GetResidualCost_SSE2;
+}
+
+#else // !WEBP_USE_SSE2
+
+WEBP_DSP_INIT_STUB(VP8EncDspCostInitSSE2)
+
+#endif // WEBP_USE_SSE2
diff --git a/media/libwebp/dsp/cpu.c b/media/libwebp/dsp/cpu.c
new file mode 100644
index 0000000000..ff57d90224
--- /dev/null
+++ b/media/libwebp/dsp/cpu.c
@@ -0,0 +1,253 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// CPU detection
+//
+// Author: Christian Duvivier (cduvivier@google.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_HAVE_NEON_RTCD)
+#include <stdio.h>
+#include <string.h>
+#endif
+
+#if defined(WEBP_ANDROID_NEON)
+#include <cpu-features.h>
+#endif
+
+//------------------------------------------------------------------------------
+// SSE2 detection.
+//
+
+// apple/darwin gcc-4.0.1 defines __PIC__, but not __pic__ with -fPIC.
+#if (defined(__pic__) || defined(__PIC__)) && defined(__i386__)
+static WEBP_INLINE void GetCPUInfo(int cpu_info[4], int info_type) {
+ __asm__ volatile (
+ "mov %%ebx, %%edi\n"
+ "cpuid\n"
+ "xchg %%edi, %%ebx\n"
+ : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
+ : "a"(info_type), "c"(0));
+}
+#elif defined(__x86_64__) && \
+ (defined(__code_model_medium__) || defined(__code_model_large__)) && \
+ defined(__PIC__)
+static WEBP_INLINE void GetCPUInfo(int cpu_info[4], int info_type) {
+ __asm__ volatile (
+ "xchg{q}\t{%%rbx}, %q1\n"
+ "cpuid\n"
+ "xchg{q}\t{%%rbx}, %q1\n"
+ : "=a"(cpu_info[0]), "=&r"(cpu_info[1]), "=c"(cpu_info[2]),
+ "=d"(cpu_info[3])
+ : "a"(info_type), "c"(0));
+}
+#elif defined(__i386__) || defined(__x86_64__)
+static WEBP_INLINE void GetCPUInfo(int cpu_info[4], int info_type) {
+ __asm__ volatile (
+ "cpuid\n"
+ : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
+ : "a"(info_type), "c"(0));
+}
+#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
+
+#if defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 150030729 // >= VS2008 SP1
+#include <intrin.h>
+#define GetCPUInfo(info, type) __cpuidex(info, type, 0) // set ecx=0
+#define WEBP_HAVE_MSC_CPUID
+#elif _MSC_VER > 1310
+#include <intrin.h>
+#define GetCPUInfo __cpuid
+#define WEBP_HAVE_MSC_CPUID
+#endif
+
+#endif
+
+// NaCl has no support for xgetbv or the raw opcode.
+#if !defined(__native_client__) && (defined(__i386__) || defined(__x86_64__))
+static WEBP_INLINE uint64_t xgetbv(void) {
+ const uint32_t ecx = 0;
+ uint32_t eax, edx;
+ // Use the raw opcode for xgetbv for compatibility with older toolchains.
+ __asm__ volatile (
+ ".byte 0x0f, 0x01, 0xd0\n"
+ : "=a"(eax), "=d"(edx) : "c" (ecx));
+ return ((uint64_t)edx << 32) | eax;
+}
+#elif (defined(_M_X64) || defined(_M_IX86)) && \
+ defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 160040219 // >= VS2010 SP1
+#include <immintrin.h>
+#define xgetbv() _xgetbv(0)
+#elif defined(_MSC_VER) && defined(_M_IX86)
+static WEBP_INLINE uint64_t xgetbv(void) {
+ uint32_t eax_, edx_;
+ __asm {
+ xor ecx, ecx // ecx = 0
+ // Use the raw opcode for xgetbv for compatibility with older toolchains.
+ __asm _emit 0x0f __asm _emit 0x01 __asm _emit 0xd0
+ mov eax_, eax
+ mov edx_, edx
+ }
+ return ((uint64_t)edx_ << 32) | eax_;
+}
+#else
+#define xgetbv() 0U // no AVX for older x64 or unrecognized toolchains.
+#endif
+
+#if defined(__i386__) || defined(__x86_64__) || defined(WEBP_HAVE_MSC_CPUID)
+
+// helper function for run-time detection of slow SSSE3 platforms
+static int CheckSlowModel(int info) {
+ // Table listing CPU models with longer latencies for the bsr instruction
+ // (10/16 cycles instead of the usual 2) and for some SSSE3 instructions
+ // like pshufb.
+ // Refer to Intel 64 and IA-32 Architectures Optimization Reference Manual.
+ static const uint8_t kSlowModels[] = {
+ 0x37, 0x4a, 0x4d, // Silvermont Microarchitecture
+ 0x1c, 0x26, 0x27 // Atom Microarchitecture
+ };
+ const uint32_t model = ((info & 0xf0000) >> 12) | ((info >> 4) & 0xf);
+ const uint32_t family = (info >> 8) & 0xf;
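+ // E.g. (illustrative) info == 0x00030673: extended model 0x3 and base
+ // model 0x7 combine into model 0x37 (Silvermont), family 0x6.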
+ if (family == 0x06) {
+ size_t i;
+ for (i = 0; i < sizeof(kSlowModels) / sizeof(kSlowModels[0]); ++i) {
+ if (model == kSlowModels[i]) return 1;
+ }
+ }
+ return 0;
+}
+
+static int x86CPUInfo(CPUFeature feature) {
+ int max_cpuid_value;
+ int cpu_info[4];
+ int is_intel = 0;
+
+ // get the highest feature value cpuid supports
+ GetCPUInfo(cpu_info, 0);
+ max_cpuid_value = cpu_info[0];
+ if (max_cpuid_value < 1) {
+ return 0;
+ } else {
+ const int VENDOR_ID_INTEL_EBX = 0x756e6547; // uneG
+ const int VENDOR_ID_INTEL_EDX = 0x49656e69; // Ieni
+ const int VENDOR_ID_INTEL_ECX = 0x6c65746e; // letn
+ is_intel = (cpu_info[1] == VENDOR_ID_INTEL_EBX &&
+ cpu_info[2] == VENDOR_ID_INTEL_ECX &&
+ cpu_info[3] == VENDOR_ID_INTEL_EDX); // genuine Intel?
+ }
+
+ GetCPUInfo(cpu_info, 1);
+ if (feature == kSSE2) {
+ return !!(cpu_info[3] & (1 << 26));
+ }
+ if (feature == kSSE3) {
+ return !!(cpu_info[2] & (1 << 0));
+ }
+ if (feature == kSlowSSSE3) {
+ if (is_intel && (cpu_info[2] & (1 << 9))) { // SSSE3?
+ return CheckSlowModel(cpu_info[0]);
+ }
+ return 0;
+ }
+
+ if (feature == kSSE4_1) {
+ return !!(cpu_info[2] & (1 << 19));
+ }
+ if (feature == kAVX) {
+ // bits 27 (OSXSAVE) & 28 (256-bit AVX)
+ if ((cpu_info[2] & 0x18000000) == 0x18000000) {
+ // XMM state and YMM state enabled by the OS.
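+ // (XCR0 bit 1 = XMM state, bit 2 = YMM state, hence the 0x6 mask.)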
+ return (xgetbv() & 0x6) == 0x6;
+ }
+ }
+ if (feature == kAVX2) {
+ if (x86CPUInfo(kAVX) && max_cpuid_value >= 7) {
+ GetCPUInfo(cpu_info, 7);
+ return !!(cpu_info[1] & (1 << 5));
+ }
+ }
+ return 0;
+}
+VP8CPUInfo VP8GetCPUInfo = x86CPUInfo;
+#elif defined(WEBP_ANDROID_NEON) // NB: needs to be before generic NEON test.
+static int AndroidCPUInfo(CPUFeature feature) {
+ const AndroidCpuFamily cpu_family = android_getCpuFamily();
+ const uint64_t cpu_features = android_getCpuFeatures();
+ if (feature == kNEON) {
+ return cpu_family == ANDROID_CPU_FAMILY_ARM &&
+ (cpu_features & ANDROID_CPU_ARM_FEATURE_NEON) != 0;
+ }
+ return 0;
+}
+VP8CPUInfo VP8GetCPUInfo = AndroidCPUInfo;
+#elif defined(EMSCRIPTEN) // also needs to be before generic NEON test
+// Use compile flags as an indicator of SIMD support instead of a runtime check.
+static int wasmCPUInfo(CPUFeature feature) {
+ switch (feature) {
+#ifdef WEBP_HAVE_SSE2
+ case kSSE2:
+ return 1;
+#endif
+#ifdef WEBP_HAVE_SSE41
+ case kSSE3:
+ case kSlowSSSE3:
+ case kSSE4_1:
+ return 1;
+#endif
+#ifdef WEBP_HAVE_NEON
+ case kNEON:
+ return 1;
+#endif
+ default:
+ break;
+ }
+ return 0;
+}
+VP8CPUInfo VP8GetCPUInfo = wasmCPUInfo;
+#elif defined(WEBP_HAVE_NEON)
+// In most cases this function doesn't check for NEON support (it's assumed
+// by the configuration), but it allows NEON to be turned off at runtime for
+// testing purposes, by setting VP8GetCPUInfo = NULL.
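+// On Linux, the runtime check looks for "neon" in the "Features" line of
+// /proc/cpuinfo, e.g. (illustrative):
+//   Features : half thumb fastmult vfp edsp neon vfpv3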
+static int armCPUInfo(CPUFeature feature) {
+ if (feature != kNEON) return 0;
+#if defined(__linux__) && defined(WEBP_HAVE_NEON_RTCD)
+ {
+ int has_neon = 0;
+ char line[200];
+ FILE* const cpuinfo = fopen("/proc/cpuinfo", "r");
+ if (cpuinfo == NULL) return 0;
+ while (fgets(line, sizeof(line), cpuinfo)) {
+ if (!strncmp(line, "Features", 8)) {
+ if (strstr(line, " neon ") != NULL) {
+ has_neon = 1;
+ break;
+ }
+ }
+ }
+ fclose(cpuinfo);
+ return has_neon;
+ }
+#else
+ return 1;
+#endif
+}
+VP8CPUInfo VP8GetCPUInfo = armCPUInfo;
+#elif defined(WEBP_USE_MIPS32) || defined(WEBP_USE_MIPS_DSP_R2) || \
+ defined(WEBP_USE_MSA)
+static int mipsCPUInfo(CPUFeature feature) {
+ if ((feature == kMIPS32) || (feature == kMIPSdspR2) || (feature == kMSA)) {
+ return 1;
+ } else {
+ return 0;
+ }
+}
+VP8CPUInfo VP8GetCPUInfo = mipsCPUInfo;
+#else
+VP8CPUInfo VP8GetCPUInfo = NULL;
+#endif
diff --git a/media/libwebp/dsp/dec.c b/media/libwebp/dsp/dec.c
index a599d26bc0..5c94b6d403 100644
--- a/media/libwebp/dsp/dec.c
+++ b/media/libwebp/dsp/dec.c
@@ -807,10 +807,10 @@ WEBP_DSP_INIT_FUNC(VP8DspInit) {
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
+#if defined(WEBP_HAVE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
VP8DspInitSSE2();
-#if defined(WEBP_USE_SSE41)
+#if defined(WEBP_HAVE_SSE41)
if (VP8GetCPUInfo(kSSE4_1)) {
VP8DspInitSSE41();
}
@@ -834,7 +834,7 @@ WEBP_DSP_INIT_FUNC(VP8DspInit) {
#endif
}
-#if defined(WEBP_USE_NEON)
+#if defined(WEBP_HAVE_NEON)
if (WEBP_NEON_OMIT_C_CODE ||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
VP8DspInitNEON();
diff --git a/media/libwebp/dsp/dec_mips32.c b/media/libwebp/dsp/dec_mips32.c
new file mode 100644
index 0000000000..2d55214faa
--- /dev/null
+++ b/media/libwebp/dsp/dec_mips32.c
@@ -0,0 +1,587 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MIPS version of dsp functions
+//
+// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
+// Jovan Zelincevic (jovan.zelincevic@imgtec.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_MIPS32)
+
+#include "../dsp/mips_macro.h"
+
+static const int kC1 = 20091 + (1 << 16);
+static const int kC2 = 35468;
+
+static WEBP_INLINE int abs_mips32(int x) {
+ const int sign = x >> 31;
+ return (x ^ sign) - sign;
+}
+
+// 4 pixels in, 2 pixels out
+static WEBP_INLINE void do_filter2(uint8_t* p, int step) {
+ const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
+ const int a = 3 * (q0 - p0) + VP8ksclip1[p1 - q1];
+ const int a1 = VP8ksclip2[(a + 4) >> 3];
+ const int a2 = VP8ksclip2[(a + 3) >> 3];
+ p[-step] = VP8kclip1[p0 + a2];
+ p[ 0] = VP8kclip1[q0 - a1];
+}
+
+// 4 pixels in, 4 pixels out
+static WEBP_INLINE void do_filter4(uint8_t* p, int step) {
+ const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
+ const int a = 3 * (q0 - p0);
+ const int a1 = VP8ksclip2[(a + 4) >> 3];
+ const int a2 = VP8ksclip2[(a + 3) >> 3];
+ const int a3 = (a1 + 1) >> 1;
+ p[-2 * step] = VP8kclip1[p1 + a3];
+ p[- step] = VP8kclip1[p0 + a2];
+ p[ 0] = VP8kclip1[q0 - a1];
+ p[ step] = VP8kclip1[q1 - a3];
+}
+
+// 6 pixels in, 6 pixels out
+static WEBP_INLINE void do_filter6(uint8_t* p, int step) {
+ const int p2 = p[-3 * step], p1 = p[-2 * step], p0 = p[-step];
+ const int q0 = p[0], q1 = p[step], q2 = p[2 * step];
+ const int a = VP8ksclip1[3 * (q0 - p0) + VP8ksclip1[p1 - q1]];
+ // a is in [-128,127], a1 in [-27,27], a2 in [-18,18] and a3 in [-9,9]
+ const int a1 = (27 * a + 63) >> 7; // eq. to ((3 * a + 7) * 9) >> 7
+ const int a2 = (18 * a + 63) >> 7; // eq. to ((2 * a + 7) * 9) >> 7
+ const int a3 = (9 * a + 63) >> 7; // eq. to ((1 * a + 7) * 9) >> 7
+ p[-3 * step] = VP8kclip1[p2 + a3];
+ p[-2 * step] = VP8kclip1[p1 + a2];
+ p[- step] = VP8kclip1[p0 + a1];
+ p[ 0] = VP8kclip1[q0 - a1];
+ p[ step] = VP8kclip1[q1 - a2];
+ p[ 2 * step] = VP8kclip1[q2 - a3];
+}
+
+static WEBP_INLINE int hev(const uint8_t* p, int step, int thresh) {
+ const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
+ return (abs_mips32(p1 - p0) > thresh) || (abs_mips32(q1 - q0) > thresh);
+}
+
+static WEBP_INLINE int needs_filter(const uint8_t* p, int step, int t) {
+ const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
+ return ((4 * abs_mips32(p0 - q0) + abs_mips32(p1 - q1)) <= t);
+}
+
+static WEBP_INLINE int needs_filter2(const uint8_t* p,
+ int step, int t, int it) {
+ const int p3 = p[-4 * step], p2 = p[-3 * step];
+ const int p1 = p[-2 * step], p0 = p[-step];
+ const int q0 = p[0], q1 = p[step], q2 = p[2 * step], q3 = p[3 * step];
+ if ((4 * abs_mips32(p0 - q0) + abs_mips32(p1 - q1)) > t) {
+ return 0;
+ }
+ return abs_mips32(p3 - p2) <= it && abs_mips32(p2 - p1) <= it &&
+ abs_mips32(p1 - p0) <= it && abs_mips32(q3 - q2) <= it &&
+ abs_mips32(q2 - q1) <= it && abs_mips32(q1 - q0) <= it;
+}
+
+static WEBP_INLINE void FilterLoop26(uint8_t* p,
+ int hstride, int vstride, int size,
+ int thresh, int ithresh, int hev_thresh) {
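+ // hstride steps across the edge being filtered and vstride walks along it;
+ // size is the number of pixels processed (e.g. VFilter16 below passes
+ // stride, 1, 16 for a pass over a 16-pixel macroblock edge).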
+ const int thresh2 = 2 * thresh + 1;
+ while (size-- > 0) {
+ if (needs_filter2(p, hstride, thresh2, ithresh)) {
+ if (hev(p, hstride, hev_thresh)) {
+ do_filter2(p, hstride);
+ } else {
+ do_filter6(p, hstride);
+ }
+ }
+ p += vstride;
+ }
+}
+
+static WEBP_INLINE void FilterLoop24(uint8_t* p,
+ int hstride, int vstride, int size,
+ int thresh, int ithresh, int hev_thresh) {
+ const int thresh2 = 2 * thresh + 1;
+ while (size-- > 0) {
+ if (needs_filter2(p, hstride, thresh2, ithresh)) {
+ if (hev(p, hstride, hev_thresh)) {
+ do_filter2(p, hstride);
+ } else {
+ do_filter4(p, hstride);
+ }
+ }
+ p += vstride;
+ }
+}
+
+// on macroblock edges
+static void VFilter16(uint8_t* p, int stride,
+ int thresh, int ithresh, int hev_thresh) {
+ FilterLoop26(p, stride, 1, 16, thresh, ithresh, hev_thresh);
+}
+
+static void HFilter16(uint8_t* p, int stride,
+ int thresh, int ithresh, int hev_thresh) {
+ FilterLoop26(p, 1, stride, 16, thresh, ithresh, hev_thresh);
+}
+
+// 8-pixels wide variant, for chroma filtering
+static void VFilter8(uint8_t* u, uint8_t* v, int stride,
+ int thresh, int ithresh, int hev_thresh) {
+ FilterLoop26(u, stride, 1, 8, thresh, ithresh, hev_thresh);
+ FilterLoop26(v, stride, 1, 8, thresh, ithresh, hev_thresh);
+}
+
+static void HFilter8(uint8_t* u, uint8_t* v, int stride,
+ int thresh, int ithresh, int hev_thresh) {
+ FilterLoop26(u, 1, stride, 8, thresh, ithresh, hev_thresh);
+ FilterLoop26(v, 1, stride, 8, thresh, ithresh, hev_thresh);
+}
+
+static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
+ int thresh, int ithresh, int hev_thresh) {
+ FilterLoop24(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
+ FilterLoop24(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
+}
+
+static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
+ int thresh, int ithresh, int hev_thresh) {
+ FilterLoop24(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
+ FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
+}
+
+// on three inner edges
+static void VFilter16i(uint8_t* p, int stride,
+ int thresh, int ithresh, int hev_thresh) {
+ int k;
+ for (k = 3; k > 0; --k) {
+ p += 4 * stride;
+ FilterLoop24(p, stride, 1, 16, thresh, ithresh, hev_thresh);
+ }
+}
+
+static void HFilter16i(uint8_t* p, int stride,
+ int thresh, int ithresh, int hev_thresh) {
+ int k;
+ for (k = 3; k > 0; --k) {
+ p += 4;
+ FilterLoop24(p, 1, stride, 16, thresh, ithresh, hev_thresh);
+ }
+}
+
+//------------------------------------------------------------------------------
+// Simple In-loop filtering (Paragraph 15.2)
+
+static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
+ int i;
+ const int thresh2 = 2 * thresh + 1;
+ for (i = 0; i < 16; ++i) {
+ if (needs_filter(p + i, stride, thresh2)) {
+ do_filter2(p + i, stride);
+ }
+ }
+}
+
+static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
+ int i;
+ const int thresh2 = 2 * thresh + 1;
+ for (i = 0; i < 16; ++i) {
+ if (needs_filter(p + i * stride, 1, thresh2)) {
+ do_filter2(p + i * stride, 1);
+ }
+ }
+}
+
+static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
+ int k;
+ for (k = 3; k > 0; --k) {
+ p += 4 * stride;
+ SimpleVFilter16(p, stride, thresh);
+ }
+}
+
+static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
+ int k;
+ for (k = 3; k > 0; --k) {
+ p += 4;
+ SimpleHFilter16(p, stride, thresh);
+ }
+}
+
+static void TransformOne(const int16_t* in, uint8_t* dst) {
+ int temp0, temp1, temp2, temp3, temp4;
+ int temp5, temp6, temp7, temp8, temp9;
+ int temp10, temp11, temp12, temp13, temp14;
+ int temp15, temp16, temp17, temp18;
+ int16_t* p_in = (int16_t*)in;
+
+ // Loops are unrolled and merged to avoid using a tmp buffer and to reduce
+ // the number of stalls; the MUL operation is written in assembly and
+ // inlined.
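+ //
+ // Scalar sketch of one column pass computed below (illustrative names;
+ // MUL(x, k) = (x * k) >> 16, and kC1 folds in an extra +(1 << 16) so that
+ // MUL(x, kC1) == ((x * 20091) >> 16) + x):
+ //   a = in[0] + in[8];           b = in[0] - in[8];
+ //   c = MUL(in[4], kC2) - MUL(in[12], kC1);
+ //   d = MUL(in[4], kC1) + MUL(in[12], kC2);
+ //   col = { a + d, b + c, b - c, a - d };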
+ __asm__ volatile(
+ "lh %[temp0], 0(%[in]) \n\t"
+ "lh %[temp8], 16(%[in]) \n\t"
+ "lh %[temp4], 8(%[in]) \n\t"
+ "lh %[temp12], 24(%[in]) \n\t"
+ "addu %[temp16], %[temp0], %[temp8] \n\t"
+ "subu %[temp0], %[temp0], %[temp8] \n\t"
+ "mul %[temp8], %[temp4], %[kC2] \n\t"
+ "mul %[temp17], %[temp12], %[kC1] \n\t"
+ "mul %[temp4], %[temp4], %[kC1] \n\t"
+ "mul %[temp12], %[temp12], %[kC2] \n\t"
+ "lh %[temp1], 2(%[in]) \n\t"
+ "lh %[temp5], 10(%[in]) \n\t"
+ "lh %[temp9], 18(%[in]) \n\t"
+ "lh %[temp13], 26(%[in]) \n\t"
+ "sra %[temp8], %[temp8], 16 \n\t"
+ "sra %[temp17], %[temp17], 16 \n\t"
+ "sra %[temp4], %[temp4], 16 \n\t"
+ "sra %[temp12], %[temp12], 16 \n\t"
+ "lh %[temp2], 4(%[in]) \n\t"
+ "lh %[temp6], 12(%[in]) \n\t"
+ "lh %[temp10], 20(%[in]) \n\t"
+ "lh %[temp14], 28(%[in]) \n\t"
+ "subu %[temp17], %[temp8], %[temp17] \n\t"
+ "addu %[temp4], %[temp4], %[temp12] \n\t"
+ "addu %[temp8], %[temp16], %[temp4] \n\t"
+ "subu %[temp4], %[temp16], %[temp4] \n\t"
+ "addu %[temp16], %[temp1], %[temp9] \n\t"
+ "subu %[temp1], %[temp1], %[temp9] \n\t"
+ "lh %[temp3], 6(%[in]) \n\t"
+ "lh %[temp7], 14(%[in]) \n\t"
+ "lh %[temp11], 22(%[in]) \n\t"
+ "lh %[temp15], 30(%[in]) \n\t"
+ "addu %[temp12], %[temp0], %[temp17] \n\t"
+ "subu %[temp0], %[temp0], %[temp17] \n\t"
+ "mul %[temp9], %[temp5], %[kC2] \n\t"
+ "mul %[temp17], %[temp13], %[kC1] \n\t"
+ "mul %[temp5], %[temp5], %[kC1] \n\t"
+ "mul %[temp13], %[temp13], %[kC2] \n\t"
+ "sra %[temp9], %[temp9], 16 \n\t"
+ "sra %[temp17], %[temp17], 16 \n\t"
+ "subu %[temp17], %[temp9], %[temp17] \n\t"
+ "sra %[temp5], %[temp5], 16 \n\t"
+ "sra %[temp13], %[temp13], 16 \n\t"
+ "addu %[temp5], %[temp5], %[temp13] \n\t"
+ "addu %[temp13], %[temp1], %[temp17] \n\t"
+ "subu %[temp1], %[temp1], %[temp17] \n\t"
+ "mul %[temp17], %[temp14], %[kC1] \n\t"
+ "mul %[temp14], %[temp14], %[kC2] \n\t"
+ "addu %[temp9], %[temp16], %[temp5] \n\t"
+ "subu %[temp5], %[temp16], %[temp5] \n\t"
+ "addu %[temp16], %[temp2], %[temp10] \n\t"
+ "subu %[temp2], %[temp2], %[temp10] \n\t"
+ "mul %[temp10], %[temp6], %[kC2] \n\t"
+ "mul %[temp6], %[temp6], %[kC1] \n\t"
+ "sra %[temp17], %[temp17], 16 \n\t"
+ "sra %[temp14], %[temp14], 16 \n\t"
+ "sra %[temp10], %[temp10], 16 \n\t"
+ "sra %[temp6], %[temp6], 16 \n\t"
+ "subu %[temp17], %[temp10], %[temp17] \n\t"
+ "addu %[temp6], %[temp6], %[temp14] \n\t"
+ "addu %[temp10], %[temp16], %[temp6] \n\t"
+ "subu %[temp6], %[temp16], %[temp6] \n\t"
+ "addu %[temp14], %[temp2], %[temp17] \n\t"
+ "subu %[temp2], %[temp2], %[temp17] \n\t"
+ "mul %[temp17], %[temp15], %[kC1] \n\t"
+ "mul %[temp15], %[temp15], %[kC2] \n\t"
+ "addu %[temp16], %[temp3], %[temp11] \n\t"
+ "subu %[temp3], %[temp3], %[temp11] \n\t"
+ "mul %[temp11], %[temp7], %[kC2] \n\t"
+ "mul %[temp7], %[temp7], %[kC1] \n\t"
+ "addiu %[temp8], %[temp8], 4 \n\t"
+ "addiu %[temp12], %[temp12], 4 \n\t"
+ "addiu %[temp0], %[temp0], 4 \n\t"
+ "addiu %[temp4], %[temp4], 4 \n\t"
+ "sra %[temp17], %[temp17], 16 \n\t"
+ "sra %[temp15], %[temp15], 16 \n\t"
+ "sra %[temp11], %[temp11], 16 \n\t"
+ "sra %[temp7], %[temp7], 16 \n\t"
+ "subu %[temp17], %[temp11], %[temp17] \n\t"
+ "addu %[temp7], %[temp7], %[temp15] \n\t"
+ "addu %[temp15], %[temp3], %[temp17] \n\t"
+ "subu %[temp3], %[temp3], %[temp17] \n\t"
+ "addu %[temp11], %[temp16], %[temp7] \n\t"
+ "subu %[temp7], %[temp16], %[temp7] \n\t"
+ "addu %[temp16], %[temp8], %[temp10] \n\t"
+ "subu %[temp8], %[temp8], %[temp10] \n\t"
+ "mul %[temp10], %[temp9], %[kC2] \n\t"
+ "mul %[temp17], %[temp11], %[kC1] \n\t"
+ "mul %[temp9], %[temp9], %[kC1] \n\t"
+ "mul %[temp11], %[temp11], %[kC2] \n\t"
+ "sra %[temp10], %[temp10], 16 \n\t"
+ "sra %[temp17], %[temp17], 16 \n\t"
+ "sra %[temp9], %[temp9], 16 \n\t"
+ "sra %[temp11], %[temp11], 16 \n\t"
+ "subu %[temp17], %[temp10], %[temp17] \n\t"
+ "addu %[temp11], %[temp9], %[temp11] \n\t"
+ "addu %[temp10], %[temp12], %[temp14] \n\t"
+ "subu %[temp12], %[temp12], %[temp14] \n\t"
+ "mul %[temp14], %[temp13], %[kC2] \n\t"
+ "mul %[temp9], %[temp15], %[kC1] \n\t"
+ "mul %[temp13], %[temp13], %[kC1] \n\t"
+ "mul %[temp15], %[temp15], %[kC2] \n\t"
+ "sra %[temp14], %[temp14], 16 \n\t"
+ "sra %[temp9], %[temp9], 16 \n\t"
+ "sra %[temp13], %[temp13], 16 \n\t"
+ "sra %[temp15], %[temp15], 16 \n\t"
+ "subu %[temp9], %[temp14], %[temp9] \n\t"
+ "addu %[temp15], %[temp13], %[temp15] \n\t"
+ "addu %[temp14], %[temp0], %[temp2] \n\t"
+ "subu %[temp0], %[temp0], %[temp2] \n\t"
+ "mul %[temp2], %[temp1], %[kC2] \n\t"
+ "mul %[temp13], %[temp3], %[kC1] \n\t"
+ "mul %[temp1], %[temp1], %[kC1] \n\t"
+ "mul %[temp3], %[temp3], %[kC2] \n\t"
+ "sra %[temp2], %[temp2], 16 \n\t"
+ "sra %[temp13], %[temp13], 16 \n\t"
+ "sra %[temp1], %[temp1], 16 \n\t"
+ "sra %[temp3], %[temp3], 16 \n\t"
+ "subu %[temp13], %[temp2], %[temp13] \n\t"
+ "addu %[temp3], %[temp1], %[temp3] \n\t"
+ "addu %[temp2], %[temp4], %[temp6] \n\t"
+ "subu %[temp4], %[temp4], %[temp6] \n\t"
+ "mul %[temp6], %[temp5], %[kC2] \n\t"
+ "mul %[temp1], %[temp7], %[kC1] \n\t"
+ "mul %[temp5], %[temp5], %[kC1] \n\t"
+ "mul %[temp7], %[temp7], %[kC2] \n\t"
+ "sra %[temp6], %[temp6], 16 \n\t"
+ "sra %[temp1], %[temp1], 16 \n\t"
+ "sra %[temp5], %[temp5], 16 \n\t"
+ "sra %[temp7], %[temp7], 16 \n\t"
+ "subu %[temp1], %[temp6], %[temp1] \n\t"
+ "addu %[temp7], %[temp5], %[temp7] \n\t"
+ "addu %[temp5], %[temp16], %[temp11] \n\t"
+ "subu %[temp16], %[temp16], %[temp11] \n\t"
+ "addu %[temp11], %[temp8], %[temp17] \n\t"
+ "subu %[temp8], %[temp8], %[temp17] \n\t"
+ "sra %[temp5], %[temp5], 3 \n\t"
+ "sra %[temp16], %[temp16], 3 \n\t"
+ "sra %[temp11], %[temp11], 3 \n\t"
+ "sra %[temp8], %[temp8], 3 \n\t"
+ "addu %[temp17], %[temp10], %[temp15] \n\t"
+ "subu %[temp10], %[temp10], %[temp15] \n\t"
+ "addu %[temp15], %[temp12], %[temp9] \n\t"
+ "subu %[temp12], %[temp12], %[temp9] \n\t"
+ "sra %[temp17], %[temp17], 3 \n\t"
+ "sra %[temp10], %[temp10], 3 \n\t"
+ "sra %[temp15], %[temp15], 3 \n\t"
+ "sra %[temp12], %[temp12], 3 \n\t"
+ "addu %[temp9], %[temp14], %[temp3] \n\t"
+ "subu %[temp14], %[temp14], %[temp3] \n\t"
+ "addu %[temp3], %[temp0], %[temp13] \n\t"
+ "subu %[temp0], %[temp0], %[temp13] \n\t"
+ "sra %[temp9], %[temp9], 3 \n\t"
+ "sra %[temp14], %[temp14], 3 \n\t"
+ "sra %[temp3], %[temp3], 3 \n\t"
+ "sra %[temp0], %[temp0], 3 \n\t"
+ "addu %[temp13], %[temp2], %[temp7] \n\t"
+ "subu %[temp2], %[temp2], %[temp7] \n\t"
+ "addu %[temp7], %[temp4], %[temp1] \n\t"
+ "subu %[temp4], %[temp4], %[temp1] \n\t"
+ "sra %[temp13], %[temp13], 3 \n\t"
+ "sra %[temp2], %[temp2], 3 \n\t"
+ "sra %[temp7], %[temp7], 3 \n\t"
+ "sra %[temp4], %[temp4], 3 \n\t"
+ "addiu %[temp6], $zero, 255 \n\t"
+ "lbu %[temp1], 0+0*" XSTR(BPS) "(%[dst]) \n\t"
+ "addu %[temp1], %[temp1], %[temp5] \n\t"
+ "sra %[temp5], %[temp1], 8 \n\t"
+ "sra %[temp18], %[temp1], 31 \n\t"
+ "beqz %[temp5], 1f \n\t"
+ "xor %[temp1], %[temp1], %[temp1] \n\t"
+ "movz %[temp1], %[temp6], %[temp18] \n\t"
+ "1: \n\t"
+ "lbu %[temp18], 1+0*" XSTR(BPS) "(%[dst]) \n\t"
+ "sb %[temp1], 0+0*" XSTR(BPS) "(%[dst]) \n\t"
+ "addu %[temp18], %[temp18], %[temp11] \n\t"
+ "sra %[temp11], %[temp18], 8 \n\t"
+ "sra %[temp1], %[temp18], 31 \n\t"
+ "beqz %[temp11], 2f \n\t"
+ "xor %[temp18], %[temp18], %[temp18] \n\t"
+ "movz %[temp18], %[temp6], %[temp1] \n\t"
+ "2: \n\t"
+ "lbu %[temp1], 2+0*" XSTR(BPS) "(%[dst]) \n\t"
+ "sb %[temp18], 1+0*" XSTR(BPS) "(%[dst]) \n\t"
+ "addu %[temp1], %[temp1], %[temp8] \n\t"
+ "sra %[temp8], %[temp1], 8 \n\t"
+ "sra %[temp18], %[temp1], 31 \n\t"
+ "beqz %[temp8], 3f \n\t"
+ "xor %[temp1], %[temp1], %[temp1] \n\t"
+ "movz %[temp1], %[temp6], %[temp18] \n\t"
+ "3: \n\t"
+ "lbu %[temp18], 3+0*" XSTR(BPS) "(%[dst]) \n\t"
+ "sb %[temp1], 2+0*" XSTR(BPS) "(%[dst]) \n\t"
+ "addu %[temp18], %[temp18], %[temp16] \n\t"
+ "sra %[temp16], %[temp18], 8 \n\t"
+ "sra %[temp1], %[temp18], 31 \n\t"
+ "beqz %[temp16], 4f \n\t"
+ "xor %[temp18], %[temp18], %[temp18] \n\t"
+ "movz %[temp18], %[temp6], %[temp1] \n\t"
+ "4: \n\t"
+ "sb %[temp18], 3+0*" XSTR(BPS) "(%[dst]) \n\t"
+ "lbu %[temp5], 0+1*" XSTR(BPS) "(%[dst]) \n\t"
+ "lbu %[temp8], 1+1*" XSTR(BPS) "(%[dst]) \n\t"
+ "lbu %[temp11], 2+1*" XSTR(BPS) "(%[dst]) \n\t"
+ "lbu %[temp16], 3+1*" XSTR(BPS) "(%[dst]) \n\t"
+ "addu %[temp5], %[temp5], %[temp17] \n\t"
+ "addu %[temp8], %[temp8], %[temp15] \n\t"
+ "addu %[temp11], %[temp11], %[temp12] \n\t"
+ "addu %[temp16], %[temp16], %[temp10] \n\t"
+ "sra %[temp18], %[temp5], 8 \n\t"
+ "sra %[temp1], %[temp5], 31 \n\t"
+ "beqz %[temp18], 5f \n\t"
+ "xor %[temp5], %[temp5], %[temp5] \n\t"
+ "movz %[temp5], %[temp6], %[temp1] \n\t"
+ "5: \n\t"
+ "sra %[temp18], %[temp8], 8 \n\t"
+ "sra %[temp1], %[temp8], 31 \n\t"
+ "beqz %[temp18], 6f \n\t"
+ "xor %[temp8], %[temp8], %[temp8] \n\t"
+ "movz %[temp8], %[temp6], %[temp1] \n\t"
+ "6: \n\t"
+ "sra %[temp18], %[temp11], 8 \n\t"
+ "sra %[temp1], %[temp11], 31 \n\t"
+ "sra %[temp17], %[temp16], 8 \n\t"
+ "sra %[temp15], %[temp16], 31 \n\t"
+ "beqz %[temp18], 7f \n\t"
+ "xor %[temp11], %[temp11], %[temp11] \n\t"
+ "movz %[temp11], %[temp6], %[temp1] \n\t"
+ "7: \n\t"
+ "beqz %[temp17], 8f \n\t"
+ "xor %[temp16], %[temp16], %[temp16] \n\t"
+ "movz %[temp16], %[temp6], %[temp15] \n\t"
+ "8: \n\t"
+ "sb %[temp5], 0+1*" XSTR(BPS) "(%[dst]) \n\t"
+ "sb %[temp8], 1+1*" XSTR(BPS) "(%[dst]) \n\t"
+ "sb %[temp11], 2+1*" XSTR(BPS) "(%[dst]) \n\t"
+ "sb %[temp16], 3+1*" XSTR(BPS) "(%[dst]) \n\t"
+ "lbu %[temp5], 0+2*" XSTR(BPS) "(%[dst]) \n\t"
+ "lbu %[temp8], 1+2*" XSTR(BPS) "(%[dst]) \n\t"
+ "lbu %[temp11], 2+2*" XSTR(BPS) "(%[dst]) \n\t"
+ "lbu %[temp16], 3+2*" XSTR(BPS) "(%[dst]) \n\t"
+ "addu %[temp5], %[temp5], %[temp9] \n\t"
+ "addu %[temp8], %[temp8], %[temp3] \n\t"
+ "addu %[temp11], %[temp11], %[temp0] \n\t"
+ "addu %[temp16], %[temp16], %[temp14] \n\t"
+ "sra %[temp18], %[temp5], 8 \n\t"
+ "sra %[temp1], %[temp5], 31 \n\t"
+ "sra %[temp17], %[temp8], 8 \n\t"
+ "sra %[temp15], %[temp8], 31 \n\t"
+ "sra %[temp12], %[temp11], 8 \n\t"
+ "sra %[temp10], %[temp11], 31 \n\t"
+ "sra %[temp9], %[temp16], 8 \n\t"
+ "sra %[temp3], %[temp16], 31 \n\t"
+ "beqz %[temp18], 9f \n\t"
+ "xor %[temp5], %[temp5], %[temp5] \n\t"
+ "movz %[temp5], %[temp6], %[temp1] \n\t"
+ "9: \n\t"
+ "beqz %[temp17], 10f \n\t"
+ "xor %[temp8], %[temp8], %[temp8] \n\t"
+ "movz %[temp8], %[temp6], %[temp15] \n\t"
+ "10: \n\t"
+ "beqz %[temp12], 11f \n\t"
+ "xor %[temp11], %[temp11], %[temp11] \n\t"
+ "movz %[temp11], %[temp6], %[temp10] \n\t"
+ "11: \n\t"
+ "beqz %[temp9], 12f \n\t"
+ "xor %[temp16], %[temp16], %[temp16] \n\t"
+ "movz %[temp16], %[temp6], %[temp3] \n\t"
+ "12: \n\t"
+ "sb %[temp5], 0+2*" XSTR(BPS) "(%[dst]) \n\t"
+ "sb %[temp8], 1+2*" XSTR(BPS) "(%[dst]) \n\t"
+ "sb %[temp11], 2+2*" XSTR(BPS) "(%[dst]) \n\t"
+ "sb %[temp16], 3+2*" XSTR(BPS) "(%[dst]) \n\t"
+ "lbu %[temp5], 0+3*" XSTR(BPS) "(%[dst]) \n\t"
+ "lbu %[temp8], 1+3*" XSTR(BPS) "(%[dst]) \n\t"
+ "lbu %[temp11], 2+3*" XSTR(BPS) "(%[dst]) \n\t"
+ "lbu %[temp16], 3+3*" XSTR(BPS) "(%[dst]) \n\t"
+ "addu %[temp5], %[temp5], %[temp13] \n\t"
+ "addu %[temp8], %[temp8], %[temp7] \n\t"
+ "addu %[temp11], %[temp11], %[temp4] \n\t"
+ "addu %[temp16], %[temp16], %[temp2] \n\t"
+ "sra %[temp18], %[temp5], 8 \n\t"
+ "sra %[temp1], %[temp5], 31 \n\t"
+ "sra %[temp17], %[temp8], 8 \n\t"
+ "sra %[temp15], %[temp8], 31 \n\t"
+ "sra %[temp12], %[temp11], 8 \n\t"
+ "sra %[temp10], %[temp11], 31 \n\t"
+ "sra %[temp9], %[temp16], 8 \n\t"
+ "sra %[temp3], %[temp16], 31 \n\t"
+ "beqz %[temp18], 13f \n\t"
+ "xor %[temp5], %[temp5], %[temp5] \n\t"
+ "movz %[temp5], %[temp6], %[temp1] \n\t"
+ "13: \n\t"
+ "beqz %[temp17], 14f \n\t"
+ "xor %[temp8], %[temp8], %[temp8] \n\t"
+ "movz %[temp8], %[temp6], %[temp15] \n\t"
+ "14: \n\t"
+ "beqz %[temp12], 15f \n\t"
+ "xor %[temp11], %[temp11], %[temp11] \n\t"
+ "movz %[temp11], %[temp6], %[temp10] \n\t"
+ "15: \n\t"
+ "beqz %[temp9], 16f \n\t"
+ "xor %[temp16], %[temp16], %[temp16] \n\t"
+ "movz %[temp16], %[temp6], %[temp3] \n\t"
+ "16: \n\t"
+ "sb %[temp5], 0+3*" XSTR(BPS) "(%[dst]) \n\t"
+ "sb %[temp8], 1+3*" XSTR(BPS) "(%[dst]) \n\t"
+ "sb %[temp11], 2+3*" XSTR(BPS) "(%[dst]) \n\t"
+ "sb %[temp16], 3+3*" XSTR(BPS) "(%[dst]) \n\t"
+
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
+ [temp9]"=&r"(temp9), [temp10]"=&r"(temp10), [temp11]"=&r"(temp11),
+ [temp12]"=&r"(temp12), [temp13]"=&r"(temp13), [temp14]"=&r"(temp14),
+ [temp15]"=&r"(temp15), [temp16]"=&r"(temp16), [temp17]"=&r"(temp17),
+ [temp18]"=&r"(temp18)
+ : [in]"r"(p_in), [kC1]"r"(kC1), [kC2]"r"(kC2), [dst]"r"(dst)
+ : "memory", "hi", "lo"
+ );
+}
+
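+// Runs one or two 4x4 inverse transforms: the second block's sixteen
+// coefficients follow the first in 'in', and its output lands four pixels
+// to the right in 'dst'.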
+static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
+ TransformOne(in, dst);
+ if (do_two) {
+ TransformOne(in + 16, dst + 4);
+ }
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8DspInitMIPS32(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitMIPS32(void) {
+ VP8InitClipTables();
+
+ VP8Transform = TransformTwo;
+
+ VP8VFilter16 = VFilter16;
+ VP8HFilter16 = HFilter16;
+ VP8VFilter8 = VFilter8;
+ VP8HFilter8 = HFilter8;
+ VP8VFilter16i = VFilter16i;
+ VP8HFilter16i = HFilter16i;
+ VP8VFilter8i = VFilter8i;
+ VP8HFilter8i = HFilter8i;
+
+ VP8SimpleVFilter16 = SimpleVFilter16;
+ VP8SimpleHFilter16 = SimpleHFilter16;
+ VP8SimpleVFilter16i = SimpleVFilter16i;
+ VP8SimpleHFilter16i = SimpleHFilter16i;
+}
+
+#else // !WEBP_USE_MIPS32
+
+WEBP_DSP_INIT_STUB(VP8DspInitMIPS32)
+
+#endif // WEBP_USE_MIPS32
diff --git a/media/libwebp/dsp/dec_mips_dsp_r2.c b/media/libwebp/dsp/dec_mips_dsp_r2.c
new file mode 100644
index 0000000000..dcc3041019
--- /dev/null
+++ b/media/libwebp/dsp/dec_mips_dsp_r2.c
@@ -0,0 +1,994 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MIPS DSP R2 version of dsp functions
+//
+// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
+// Jovan Zelincevic (jovan.zelincevic@imgtec.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_MIPS_DSP_R2)
+
+#include "../dsp/mips_macro.h"
+
+static const int kC1 = 20091 + (1 << 16);
+static const int kC2 = 35468;
+
+#define MUL(a, b) (((a) * (b)) >> 16)
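+// kC2 / 65536 ~= sqrt(2) * sin(pi / 8) ~= 0.5412, and the extra (1 << 16) in
+// kC1 makes MUL(x, kC1) ~= sqrt(2) * cos(pi / 8) * x ~= 1.3066 * x.
+// e.g. MUL(100, kC2) = (100 * 35468) >> 16 = 54 ~= 0.5412 * 100.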
+
+static void TransformDC(const int16_t* in, uint8_t* dst) {
+ int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9, temp10;
+
+ __asm__ volatile (
+ LOAD_WITH_OFFSET_X4(temp1, temp2, temp3, temp4, dst,
+ 0, 0, 0, 0,
+ 0, 1, 2, 3,
+ BPS)
+ "lh %[temp5], 0(%[in]) \n\t"
+ "addiu %[temp5], %[temp5], 4 \n\t"
+ "ins %[temp5], %[temp5], 16, 16 \n\t"
+ "shra.ph %[temp5], %[temp5], 3 \n\t"
+ CONVERT_2_BYTES_TO_HALF(temp6, temp7, temp8, temp9, temp10, temp1, temp2,
+ temp3, temp1, temp2, temp3, temp4)
+ STORE_SAT_SUM_X2(temp6, temp7, temp8, temp9, temp10, temp1, temp2, temp3,
+ temp5, temp5, temp5, temp5, temp5, temp5, temp5, temp5,
+ dst, 0, 1, 2, 3, BPS)
+
+ OUTPUT_EARLY_CLOBBER_REGS_10()
+ : [in]"r"(in), [dst]"r"(dst)
+ : "memory"
+ );
+}
+
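+// Special case for blocks where only in[0], in[1] and in[4] are non-zero:
+// the DC level plus the first AC coefficient in each direction.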
+static void TransformAC3(const int16_t* in, uint8_t* dst) {
+ const int a = in[0] + 4;
+ int c4 = MUL(in[4], kC2);
+ const int d4 = MUL(in[4], kC1);
+ const int c1 = MUL(in[1], kC2);
+ const int d1 = MUL(in[1], kC1);
+ int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
+ int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18;
+
+ __asm__ volatile (
+ "ins %[c4], %[d4], 16, 16 \n\t"
+ "replv.ph %[temp1], %[a] \n\t"
+ "replv.ph %[temp4], %[d1] \n\t"
+ ADD_SUB_HALVES(temp2, temp3, temp1, c4)
+ "replv.ph %[temp5], %[c1] \n\t"
+ SHIFT_R_SUM_X2(temp1, temp6, temp7, temp8, temp2, temp9, temp10, temp4,
+ temp2, temp2, temp3, temp3, temp4, temp5, temp4, temp5)
+ LOAD_WITH_OFFSET_X4(temp3, temp5, temp11, temp12, dst,
+ 0, 0, 0, 0,
+ 0, 1, 2, 3,
+ BPS)
+ CONVERT_2_BYTES_TO_HALF(temp13, temp14, temp3, temp15, temp5, temp16,
+ temp11, temp17, temp3, temp5, temp11, temp12)
+ PACK_2_HALVES_TO_WORD(temp12, temp18, temp7, temp6, temp1, temp8, temp2,
+ temp4, temp7, temp6, temp10, temp9)
+ STORE_SAT_SUM_X2(temp13, temp14, temp3, temp15, temp5, temp16, temp11,
+ temp17, temp12, temp18, temp1, temp8, temp2, temp4,
+ temp7, temp6, dst, 0, 1, 2, 3, BPS)
+
+ OUTPUT_EARLY_CLOBBER_REGS_18(),
+ [c4]"+&r"(c4)
+ : [dst]"r"(dst), [a]"r"(a), [d1]"r"(d1), [d4]"r"(d4), [c1]"r"(c1)
+ : "memory"
+ );
+}
+
+static void TransformOne(const int16_t* in, uint8_t* dst) {
+ int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
+ int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18;
+
+ __asm__ volatile (
+ "ulw %[temp1], 0(%[in]) \n\t"
+ "ulw %[temp2], 16(%[in]) \n\t"
+ LOAD_IN_X2(temp5, temp6, 24, 26)
+ ADD_SUB_HALVES(temp3, temp4, temp1, temp2)
+ LOAD_IN_X2(temp1, temp2, 8, 10)
+ MUL_SHIFT_SUM(temp7, temp8, temp9, temp10, temp11, temp12, temp13, temp14,
+ temp10, temp8, temp9, temp7, temp1, temp2, temp5, temp6,
+ temp13, temp11, temp14, temp12)
+ INSERT_HALF_X2(temp8, temp7, temp10, temp9)
+ "ulw %[temp17], 4(%[in]) \n\t"
+ "ulw %[temp18], 20(%[in]) \n\t"
+ ADD_SUB_HALVES(temp1, temp2, temp3, temp8)
+ ADD_SUB_HALVES(temp5, temp6, temp4, temp7)
+ ADD_SUB_HALVES(temp7, temp8, temp17, temp18)
+ LOAD_IN_X2(temp17, temp18, 12, 14)
+ LOAD_IN_X2(temp9, temp10, 28, 30)
+ MUL_SHIFT_SUM(temp11, temp12, temp13, temp14, temp15, temp16, temp4, temp17,
+ temp12, temp14, temp11, temp13, temp17, temp18, temp9, temp10,
+ temp15, temp4, temp16, temp17)
+ INSERT_HALF_X2(temp11, temp12, temp13, temp14)
+ ADD_SUB_HALVES(temp17, temp8, temp8, temp11)
+ ADD_SUB_HALVES(temp3, temp4, temp7, temp12)
+
+ // horizontal
+ SRA_16(temp9, temp10, temp11, temp12, temp1, temp2, temp5, temp6)
+ INSERT_HALF_X2(temp1, temp6, temp5, temp2)
+ SRA_16(temp13, temp14, temp15, temp16, temp3, temp4, temp17, temp8)
+ "repl.ph %[temp2], 0x4 \n\t"
+ INSERT_HALF_X2(temp3, temp8, temp17, temp4)
+ "addq.ph %[temp1], %[temp1], %[temp2] \n\t"
+ "addq.ph %[temp6], %[temp6], %[temp2] \n\t"
+ ADD_SUB_HALVES(temp2, temp4, temp1, temp3)
+ ADD_SUB_HALVES(temp5, temp7, temp6, temp8)
+ MUL_SHIFT_SUM(temp1, temp3, temp6, temp8, temp9, temp13, temp17, temp18,
+ temp3, temp13, temp1, temp9, temp9, temp13, temp11, temp15,
+ temp6, temp17, temp8, temp18)
+ MUL_SHIFT_SUM(temp6, temp8, temp18, temp17, temp11, temp15, temp12, temp16,
+ temp8, temp15, temp6, temp11, temp12, temp16, temp10, temp14,
+ temp18, temp12, temp17, temp16)
+ INSERT_HALF_X2(temp1, temp3, temp9, temp13)
+ INSERT_HALF_X2(temp6, temp8, temp11, temp15)
+ SHIFT_R_SUM_X2(temp9, temp10, temp11, temp12, temp13, temp14, temp15,
+ temp16, temp2, temp4, temp5, temp7, temp3, temp1, temp8,
+ temp6)
+ PACK_2_HALVES_TO_WORD(temp1, temp2, temp3, temp4, temp9, temp12, temp13,
+ temp16, temp11, temp10, temp15, temp14)
+ LOAD_WITH_OFFSET_X4(temp10, temp11, temp14, temp15, dst,
+ 0, 0, 0, 0,
+ 0, 1, 2, 3,
+ BPS)
+ CONVERT_2_BYTES_TO_HALF(temp5, temp6, temp7, temp8, temp17, temp18, temp10,
+ temp11, temp10, temp11, temp14, temp15)
+ STORE_SAT_SUM_X2(temp5, temp6, temp7, temp8, temp17, temp18, temp10, temp11,
+ temp9, temp12, temp1, temp2, temp13, temp16, temp3, temp4,
+ dst, 0, 1, 2, 3, BPS)
+
+ OUTPUT_EARLY_CLOBBER_REGS_18()
+ : [dst]"r"(dst), [in]"r"(in), [kC1]"r"(kC1), [kC2]"r"(kC2)
+ : "memory", "hi", "lo"
+ );
+}
+
+static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
+ TransformOne(in, dst);
+ if (do_two) {
+ TransformOne(in + 16, dst + 4);
+ }
+}
+
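+// In-loop filter used on macroblock edges: 'hstride' steps across the edge,
+// 'vstride' advances along it and 'size' positions are filtered. Each
+// position updates either 2 pixels (high edge variance) or 6, hence,
+// presumably, the "26" in the name.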
+static WEBP_INLINE void FilterLoop26(uint8_t* p,
+ int hstride, int vstride, int size,
+ int thresh, int ithresh, int hev_thresh) {
+ const int thresh2 = 2 * thresh + 1;
+ int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
+ int temp10, temp11, temp12, temp13, temp14, temp15;
+
+ __asm__ volatile (
+ ".set push \n\t"
+ ".set noreorder \n\t"
+ "1: \n\t"
+ "negu %[temp1], %[hstride] \n\t"
+ "addiu %[size], %[size], -1 \n\t"
+ "sll %[temp2], %[hstride], 1 \n\t"
+ "sll %[temp3], %[temp1], 1 \n\t"
+ "addu %[temp4], %[temp2], %[hstride] \n\t"
+ "addu %[temp5], %[temp3], %[temp1] \n\t"
+ "lbu %[temp7], 0(%[p]) \n\t"
+ "sll %[temp6], %[temp3], 1 \n\t"
+ "lbux %[temp8], %[temp5](%[p]) \n\t"
+ "lbux %[temp9], %[temp3](%[p]) \n\t"
+ "lbux %[temp10], %[temp1](%[p]) \n\t"
+ "lbux %[temp11], %[temp6](%[p]) \n\t"
+ "lbux %[temp12], %[hstride](%[p]) \n\t"
+ "lbux %[temp13], %[temp2](%[p]) \n\t"
+ "lbux %[temp14], %[temp4](%[p]) \n\t"
+ "subu %[temp1], %[temp10], %[temp7] \n\t"
+ "subu %[temp2], %[temp9], %[temp12] \n\t"
+ "absq_s.w %[temp3], %[temp1] \n\t"
+ "absq_s.w %[temp4], %[temp2] \n\t"
+ "negu %[temp1], %[temp1] \n\t"
+ "sll %[temp3], %[temp3], 2 \n\t"
+ "addu %[temp15], %[temp3], %[temp4] \n\t"
+ "subu %[temp3], %[temp15], %[thresh2] \n\t"
+ "sll %[temp6], %[temp1], 1 \n\t"
+ "bgtz %[temp3], 3f \n\t"
+ " subu %[temp4], %[temp11], %[temp8] \n\t"
+ "absq_s.w %[temp4], %[temp4] \n\t"
+ "shll_s.w %[temp2], %[temp2], 24 \n\t"
+ "subu %[temp4], %[temp4], %[ithresh] \n\t"
+ "bgtz %[temp4], 3f \n\t"
+ " subu %[temp3], %[temp8], %[temp9] \n\t"
+ "absq_s.w %[temp3], %[temp3] \n\t"
+ "subu %[temp3], %[temp3], %[ithresh] \n\t"
+ "bgtz %[temp3], 3f \n\t"
+ " subu %[temp5], %[temp9], %[temp10] \n\t"
+ "absq_s.w %[temp3], %[temp5] \n\t"
+ "absq_s.w %[temp5], %[temp5] \n\t"
+ "subu %[temp3], %[temp3], %[ithresh] \n\t"
+ "bgtz %[temp3], 3f \n\t"
+ " subu %[temp3], %[temp14], %[temp13] \n\t"
+ "absq_s.w %[temp3], %[temp3] \n\t"
+ "slt %[temp5], %[hev_thresh], %[temp5] \n\t"
+ "subu %[temp3], %[temp3], %[ithresh] \n\t"
+ "bgtz %[temp3], 3f \n\t"
+ " subu %[temp3], %[temp13], %[temp12] \n\t"
+ "absq_s.w %[temp3], %[temp3] \n\t"
+ "sra %[temp4], %[temp2], 24 \n\t"
+ "subu %[temp3], %[temp3], %[ithresh] \n\t"
+ "bgtz %[temp3], 3f \n\t"
+ " subu %[temp15], %[temp12], %[temp7] \n\t"
+ "absq_s.w %[temp3], %[temp15] \n\t"
+ "absq_s.w %[temp15], %[temp15] \n\t"
+ "subu %[temp3], %[temp3], %[ithresh] \n\t"
+ "bgtz %[temp3], 3f \n\t"
+ " slt %[temp15], %[hev_thresh], %[temp15] \n\t"
+ "addu %[temp3], %[temp6], %[temp1] \n\t"
+ "or %[temp2], %[temp5], %[temp15] \n\t"
+ "addu %[temp5], %[temp4], %[temp3] \n\t"
+ "beqz %[temp2], 4f \n\t"
+ " shra_r.w %[temp1], %[temp5], 3 \n\t"
+ "addiu %[temp2], %[temp5], 3 \n\t"
+ "sra %[temp2], %[temp2], 3 \n\t"
+ "shll_s.w %[temp1], %[temp1], 27 \n\t"
+ "shll_s.w %[temp2], %[temp2], 27 \n\t"
+ "subu %[temp3], %[p], %[hstride] \n\t"
+ "sra %[temp1], %[temp1], 27 \n\t"
+ "sra %[temp2], %[temp2], 27 \n\t"
+ "subu %[temp1], %[temp7], %[temp1] \n\t"
+ "addu %[temp2], %[temp10], %[temp2] \n\t"
+ "lbux %[temp2], %[temp2](%[VP8kclip1]) \n\t"
+ "lbux %[temp1], %[temp1](%[VP8kclip1]) \n\t"
+ "sb %[temp2], 0(%[temp3]) \n\t"
+ "j 3f \n\t"
+ " sb %[temp1], 0(%[p]) \n\t"
+ "4: \n\t"
+ "shll_s.w %[temp5], %[temp5], 24 \n\t"
+ "subu %[temp14], %[p], %[hstride] \n\t"
+ "subu %[temp11], %[temp14], %[hstride] \n\t"
+ "sra %[temp6], %[temp5], 24 \n\t"
+ "sll %[temp1], %[temp6], 3 \n\t"
+ "subu %[temp15], %[temp11], %[hstride] \n\t"
+ "addu %[temp2], %[temp6], %[temp1] \n\t"
+ "sll %[temp3], %[temp2], 1 \n\t"
+ "addu %[temp4], %[temp3], %[temp2] \n\t"
+ "addiu %[temp2], %[temp2], 63 \n\t"
+ "addiu %[temp3], %[temp3], 63 \n\t"
+ "addiu %[temp4], %[temp4], 63 \n\t"
+ "sra %[temp2], %[temp2], 7 \n\t"
+ "sra %[temp3], %[temp3], 7 \n\t"
+ "sra %[temp4], %[temp4], 7 \n\t"
+ "addu %[temp1], %[temp8], %[temp2] \n\t"
+ "addu %[temp5], %[temp9], %[temp3] \n\t"
+ "addu %[temp6], %[temp10], %[temp4] \n\t"
+ "subu %[temp8], %[temp7], %[temp4] \n\t"
+ "subu %[temp7], %[temp12], %[temp3] \n\t"
+ "addu %[temp10], %[p], %[hstride] \n\t"
+ "subu %[temp9], %[temp13], %[temp2] \n\t"
+ "addu %[temp12], %[temp10], %[hstride] \n\t"
+ "lbux %[temp2], %[temp1](%[VP8kclip1]) \n\t"
+ "lbux %[temp3], %[temp5](%[VP8kclip1]) \n\t"
+ "lbux %[temp4], %[temp6](%[VP8kclip1]) \n\t"
+ "lbux %[temp5], %[temp8](%[VP8kclip1]) \n\t"
+ "lbux %[temp6], %[temp7](%[VP8kclip1]) \n\t"
+ "lbux %[temp8], %[temp9](%[VP8kclip1]) \n\t"
+ "sb %[temp2], 0(%[temp15]) \n\t"
+ "sb %[temp3], 0(%[temp11]) \n\t"
+ "sb %[temp4], 0(%[temp14]) \n\t"
+ "sb %[temp5], 0(%[p]) \n\t"
+ "sb %[temp6], 0(%[temp10]) \n\t"
+ "sb %[temp8], 0(%[temp12]) \n\t"
+ "3: \n\t"
+ "bgtz %[size], 1b \n\t"
+ " addu %[p], %[p], %[vstride] \n\t"
+ ".set pop \n\t"
+ : [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),[temp3]"=&r"(temp3),
+ [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [temp6]"=&r"(temp6),
+ [temp7]"=&r"(temp7),[temp8]"=&r"(temp8),[temp9]"=&r"(temp9),
+ [temp10]"=&r"(temp10),[temp11]"=&r"(temp11),[temp12]"=&r"(temp12),
+ [temp13]"=&r"(temp13),[temp14]"=&r"(temp14),[temp15]"=&r"(temp15),
+ [size]"+&r"(size), [p]"+&r"(p)
+ : [hstride]"r"(hstride), [thresh2]"r"(thresh2),
+ [ithresh]"r"(ithresh),[vstride]"r"(vstride), [hev_thresh]"r"(hev_thresh),
+ [VP8kclip1]"r"(VP8kclip1)
+ : "memory"
+ );
+}
+
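+// Inner-edge variant of the loop above: each position updates either 2 or 4
+// pixels across the edge.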
+static WEBP_INLINE void FilterLoop24(uint8_t* p,
+ int hstride, int vstride, int size,
+ int thresh, int ithresh, int hev_thresh) {
+ int p0, q0, p1, q1, p2, q2, p3, q3;
+ int step1, step2, temp1, temp2, temp3, temp4;
+ uint8_t* pTemp0;
+ uint8_t* pTemp1;
+ const int thresh2 = 2 * thresh + 1;
+
+ __asm__ volatile (
+ ".set push \n\t"
+ ".set noreorder \n\t"
+ "bltz %[size], 3f \n\t"
+ " nop \n\t"
+ "2: \n\t"
+ "negu %[step1], %[hstride] \n\t"
+ "lbu %[q0], 0(%[p]) \n\t"
+ "lbux %[p0], %[step1](%[p]) \n\t"
+ "subu %[step1], %[step1], %[hstride] \n\t"
+ "lbux %[q1], %[hstride](%[p]) \n\t"
+ "subu %[temp1], %[p0], %[q0] \n\t"
+ "lbux %[p1], %[step1](%[p]) \n\t"
+ "addu %[step2], %[hstride], %[hstride] \n\t"
+ "absq_s.w %[temp2], %[temp1] \n\t"
+ "subu %[temp3], %[p1], %[q1] \n\t"
+ "absq_s.w %[temp4], %[temp3] \n\t"
+ "sll %[temp2], %[temp2], 2 \n\t"
+ "addu %[temp2], %[temp2], %[temp4] \n\t"
+ "subu %[temp4], %[temp2], %[thresh2] \n\t"
+ "subu %[step1], %[step1], %[hstride] \n\t"
+ "bgtz %[temp4], 0f \n\t"
+ " lbux %[p2], %[step1](%[p]) \n\t"
+ "subu %[step1], %[step1], %[hstride] \n\t"
+ "lbux %[q2], %[step2](%[p]) \n\t"
+ "lbux %[p3], %[step1](%[p]) \n\t"
+ "subu %[temp4], %[p2], %[p1] \n\t"
+ "addu %[step2], %[step2], %[hstride] \n\t"
+ "subu %[temp2], %[p3], %[p2] \n\t"
+ "absq_s.w %[temp4], %[temp4] \n\t"
+ "absq_s.w %[temp2], %[temp2] \n\t"
+ "lbux %[q3], %[step2](%[p]) \n\t"
+ "subu %[temp4], %[temp4], %[ithresh] \n\t"
+ "negu %[temp1], %[temp1] \n\t"
+ "bgtz %[temp4], 0f \n\t"
+ " subu %[temp2], %[temp2], %[ithresh] \n\t"
+ "subu %[p3], %[p1], %[p0] \n\t"
+ "bgtz %[temp2], 0f \n\t"
+ " absq_s.w %[p3], %[p3] \n\t"
+ "subu %[temp4], %[q3], %[q2] \n\t"
+ "subu %[pTemp0], %[p], %[hstride] \n\t"
+ "absq_s.w %[temp4], %[temp4] \n\t"
+ "subu %[temp2], %[p3], %[ithresh] \n\t"
+ "sll %[step1], %[temp1], 1 \n\t"
+ "bgtz %[temp2], 0f \n\t"
+ " subu %[temp4], %[temp4], %[ithresh] \n\t"
+ "subu %[temp2], %[q2], %[q1] \n\t"
+ "bgtz %[temp4], 0f \n\t"
+ " absq_s.w %[temp2], %[temp2] \n\t"
+ "subu %[q3], %[q1], %[q0] \n\t"
+ "absq_s.w %[q3], %[q3] \n\t"
+ "subu %[temp2], %[temp2], %[ithresh] \n\t"
+ "addu %[temp1], %[temp1], %[step1] \n\t"
+ "bgtz %[temp2], 0f \n\t"
+ " subu %[temp4], %[q3], %[ithresh] \n\t"
+ "slt %[p3], %[hev_thresh], %[p3] \n\t"
+ "bgtz %[temp4], 0f \n\t"
+ " slt %[q3], %[hev_thresh], %[q3] \n\t"
+ "or %[q3], %[q3], %[p3] \n\t"
+ "bgtz %[q3], 1f \n\t"
+ " shra_r.w %[temp2], %[temp1], 3 \n\t"
+ "addiu %[temp1], %[temp1], 3 \n\t"
+ "sra %[temp1], %[temp1], 3 \n\t"
+ "shll_s.w %[temp2], %[temp2], 27 \n\t"
+ "shll_s.w %[temp1], %[temp1], 27 \n\t"
+ "addu %[pTemp1], %[p], %[hstride] \n\t"
+ "sra %[temp2], %[temp2], 27 \n\t"
+ "sra %[temp1], %[temp1], 27 \n\t"
+ "addiu %[step1], %[temp2], 1 \n\t"
+ "sra %[step1], %[step1], 1 \n\t"
+ "addu %[p0], %[p0], %[temp1] \n\t"
+ "addu %[p1], %[p1], %[step1] \n\t"
+ "subu %[q0], %[q0], %[temp2] \n\t"
+ "subu %[q1], %[q1], %[step1] \n\t"
+ "lbux %[temp2], %[p0](%[VP8kclip1]) \n\t"
+ "lbux %[temp3], %[q0](%[VP8kclip1]) \n\t"
+ "lbux %[temp4], %[q1](%[VP8kclip1]) \n\t"
+ "sb %[temp2], 0(%[pTemp0]) \n\t"
+ "lbux %[temp1], %[p1](%[VP8kclip1]) \n\t"
+ "subu %[pTemp0], %[pTemp0], %[hstride] \n\t"
+ "sb %[temp3], 0(%[p]) \n\t"
+ "sb %[temp4], 0(%[pTemp1]) \n\t"
+ "j 0f \n\t"
+ " sb %[temp1], 0(%[pTemp0]) \n\t"
+ "1: \n\t"
+ "shll_s.w %[temp3], %[temp3], 24 \n\t"
+ "sra %[temp3], %[temp3], 24 \n\t"
+ "addu %[temp1], %[temp1], %[temp3] \n\t"
+ "shra_r.w %[temp2], %[temp1], 3 \n\t"
+ "addiu %[temp1], %[temp1], 3 \n\t"
+ "shll_s.w %[temp2], %[temp2], 27 \n\t"
+ "sra %[temp1], %[temp1], 3 \n\t"
+ "shll_s.w %[temp1], %[temp1], 27 \n\t"
+ "sra %[temp2], %[temp2], 27 \n\t"
+ "sra %[temp1], %[temp1], 27 \n\t"
+ "addu %[p0], %[p0], %[temp1] \n\t"
+ "subu %[q0], %[q0], %[temp2] \n\t"
+ "lbux %[temp1], %[p0](%[VP8kclip1]) \n\t"
+ "lbux %[temp2], %[q0](%[VP8kclip1]) \n\t"
+ "sb %[temp2], 0(%[p]) \n\t"
+ "sb %[temp1], 0(%[pTemp0]) \n\t"
+ "0: \n\t"
+ "subu %[size], %[size], 1 \n\t"
+ "bgtz %[size], 2b \n\t"
+ " addu %[p], %[p], %[vstride] \n\t"
+ "3: \n\t"
+ ".set pop \n\t"
+ : [p0]"=&r"(p0), [q0]"=&r"(q0), [p1]"=&r"(p1), [q1]"=&r"(q1),
+ [p2]"=&r"(p2), [q2]"=&r"(q2), [p3]"=&r"(p3), [q3]"=&r"(q3),
+ [step2]"=&r"(step2), [step1]"=&r"(step1), [temp1]"=&r"(temp1),
+ [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [temp4]"=&r"(temp4),
+ [pTemp0]"=&r"(pTemp0), [pTemp1]"=&r"(pTemp1), [p]"+&r"(p),
+ [size]"+&r"(size)
+ : [vstride]"r"(vstride), [ithresh]"r"(ithresh),
+ [hev_thresh]"r"(hev_thresh), [hstride]"r"(hstride),
+ [VP8kclip1]"r"(VP8kclip1), [thresh2]"r"(thresh2)
+ : "memory"
+ );
+}
+
+// on macroblock edges
+static void VFilter16(uint8_t* p, int stride,
+ int thresh, int ithresh, int hev_thresh) {
+ FilterLoop26(p, stride, 1, 16, thresh, ithresh, hev_thresh);
+}
+
+static void HFilter16(uint8_t* p, int stride,
+ int thresh, int ithresh, int hev_thresh) {
+ FilterLoop26(p, 1, stride, 16, thresh, ithresh, hev_thresh);
+}
+
+// 8-pixels wide variant, for chroma filtering
+static void VFilter8(uint8_t* u, uint8_t* v, int stride,
+ int thresh, int ithresh, int hev_thresh) {
+ FilterLoop26(u, stride, 1, 8, thresh, ithresh, hev_thresh);
+ FilterLoop26(v, stride, 1, 8, thresh, ithresh, hev_thresh);
+}
+
+static void HFilter8(uint8_t* u, uint8_t* v, int stride,
+ int thresh, int ithresh, int hev_thresh) {
+ FilterLoop26(u, 1, stride, 8, thresh, ithresh, hev_thresh);
+ FilterLoop26(v, 1, stride, 8, thresh, ithresh, hev_thresh);
+}
+
+// on three inner edges
+static void VFilter16i(uint8_t* p, int stride,
+ int thresh, int ithresh, int hev_thresh) {
+ int k;
+ for (k = 3; k > 0; --k) {
+ p += 4 * stride;
+ FilterLoop24(p, stride, 1, 16, thresh, ithresh, hev_thresh);
+ }
+}
+
+static void HFilter16i(uint8_t* p, int stride,
+ int thresh, int ithresh, int hev_thresh) {
+ int k;
+ for (k = 3; k > 0; --k) {
+ p += 4;
+ FilterLoop24(p, 1, stride, 16, thresh, ithresh, hev_thresh);
+ }
+}
+
+static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
+ int thresh, int ithresh, int hev_thresh) {
+ FilterLoop24(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
+ FilterLoop24(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
+}
+
+static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
+ int thresh, int ithresh, int hev_thresh) {
+ FilterLoop24(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
+ FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
+}
+
+#undef MUL
+
+//------------------------------------------------------------------------------
+// Simple In-loop filtering (Paragraph 15.2)
+
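+// The simple filter only adjusts the two pixels adjacent to the edge; a
+// position is filtered when 4 * |p0 - q0| + |p1 - q1| <= thresh2, i.e. the
+// usual 2 * |p0 - q0| + |p1 - q1| / 2 <= thresh test scaled by two.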
+static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
+ int i;
+ const int thresh2 = 2 * thresh + 1;
+ int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
+ uint8_t* p1 = p - stride;
+ __asm__ volatile (
+ ".set push \n\t"
+ ".set noreorder \n\t"
+ "li %[i], 16 \n\t"
+ "0: \n\t"
+ "negu %[temp4], %[stride] \n\t"
+ "sll %[temp5], %[temp4], 1 \n\t"
+ "lbu %[temp2], 0(%[p]) \n\t"
+ "lbux %[temp3], %[stride](%[p]) \n\t"
+ "lbux %[temp1], %[temp4](%[p]) \n\t"
+ "lbux %[temp0], %[temp5](%[p]) \n\t"
+ "subu %[temp7], %[temp1], %[temp2] \n\t"
+ "subu %[temp6], %[temp0], %[temp3] \n\t"
+ "absq_s.w %[temp4], %[temp7] \n\t"
+ "absq_s.w %[temp5], %[temp6] \n\t"
+ "sll %[temp4], %[temp4], 2 \n\t"
+ "subu %[temp5], %[temp5], %[thresh2] \n\t"
+ "addu %[temp5], %[temp4], %[temp5] \n\t"
+ "negu %[temp8], %[temp7] \n\t"
+ "bgtz %[temp5], 1f \n\t"
+ " addiu %[i], %[i], -1 \n\t"
+ "sll %[temp4], %[temp8], 1 \n\t"
+ "shll_s.w %[temp5], %[temp6], 24 \n\t"
+ "addu %[temp3], %[temp4], %[temp8] \n\t"
+ "sra %[temp5], %[temp5], 24 \n\t"
+ "addu %[temp3], %[temp3], %[temp5] \n\t"
+ "addiu %[temp7], %[temp3], 3 \n\t"
+ "sra %[temp7], %[temp7], 3 \n\t"
+ "shra_r.w %[temp8], %[temp3], 3 \n\t"
+ "shll_s.w %[temp0], %[temp7], 27 \n\t"
+ "shll_s.w %[temp4], %[temp8], 27 \n\t"
+ "sra %[temp0], %[temp0], 27 \n\t"
+ "sra %[temp4], %[temp4], 27 \n\t"
+ "addu %[temp7], %[temp1], %[temp0] \n\t"
+ "subu %[temp2], %[temp2], %[temp4] \n\t"
+ "lbux %[temp3], %[temp7](%[VP8kclip1]) \n\t"
+ "lbux %[temp4], %[temp2](%[VP8kclip1]) \n\t"
+ "sb %[temp3], 0(%[p1]) \n\t"
+ "sb %[temp4], 0(%[p]) \n\t"
+ "1: \n\t"
+ "addiu %[p1], %[p1], 1 \n\t"
+ "bgtz %[i], 0b \n\t"
+ " addiu %[p], %[p], 1 \n\t"
+    ".set      pop                                     \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
+ [p]"+&r"(p), [i]"=&r"(i), [p1]"+&r"(p1)
+ : [stride]"r"(stride), [VP8kclip1]"r"(VP8kclip1), [thresh2]"r"(thresh2)
+ : "memory"
+ );
+}
+
+// TEMP0 = SRC[A + A1 * BPS]
+// TEMP1 = SRC[B + B1 * BPS]
+// TEMP2 = SRC[C + C1 * BPS]
+// TEMP3 = SRC[D + D1 * BPS]
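+// e.g. LOAD_4_BYTES(t0, t1, t2, t3, -2, 0, -1, 0, 0, 0, 1, 0, p) reads the
+// horizontally adjacent bytes p[-2], p[-1], p[0] and p[1].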
+#define LOAD_4_BYTES(TEMP0, TEMP1, TEMP2, TEMP3, \
+ A, A1, B, B1, C, C1, D, D1, SRC) \
+ "lbu %[" #TEMP0 "], " #A "+" #A1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t" \
+ "lbu %[" #TEMP1 "], " #B "+" #B1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t" \
+ "lbu %[" #TEMP2 "], " #C "+" #C1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t" \
+ "lbu %[" #TEMP3 "], " #D "+" #D1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t" \
+
+static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
+ int i;
+ const int thresh2 = 2 * thresh + 1;
+ int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
+ __asm__ volatile (
+ ".set push \n\t"
+ ".set noreorder \n\t"
+ "li %[i], 16 \n\t"
+ "0: \n\t"
+ LOAD_4_BYTES(temp0, temp1, temp2, temp3, -2, 0, -1, 0, 0, 0, 1, 0, p)
+ "subu %[temp7], %[temp1], %[temp2] \n\t"
+ "subu %[temp6], %[temp0], %[temp3] \n\t"
+ "absq_s.w %[temp4], %[temp7] \n\t"
+ "absq_s.w %[temp5], %[temp6] \n\t"
+ "sll %[temp4], %[temp4], 2 \n\t"
+ "addu %[temp5], %[temp4], %[temp5] \n\t"
+ "subu %[temp5], %[temp5], %[thresh2] \n\t"
+ "negu %[temp8], %[temp7] \n\t"
+ "bgtz %[temp5], 1f \n\t"
+ " addiu %[i], %[i], -1 \n\t"
+ "sll %[temp4], %[temp8], 1 \n\t"
+ "shll_s.w %[temp5], %[temp6], 24 \n\t"
+ "addu %[temp3], %[temp4], %[temp8] \n\t"
+ "sra %[temp5], %[temp5], 24 \n\t"
+ "addu %[temp3], %[temp3], %[temp5] \n\t"
+ "addiu %[temp7], %[temp3], 3 \n\t"
+ "sra %[temp7], %[temp7], 3 \n\t"
+ "shra_r.w %[temp8], %[temp3], 3 \n\t"
+ "shll_s.w %[temp0], %[temp7], 27 \n\t"
+ "shll_s.w %[temp4], %[temp8], 27 \n\t"
+ "sra %[temp0], %[temp0], 27 \n\t"
+ "sra %[temp4], %[temp4], 27 \n\t"
+ "addu %[temp7], %[temp1], %[temp0] \n\t"
+ "subu %[temp2], %[temp2], %[temp4] \n\t"
+ "lbux %[temp3], %[temp7](%[VP8kclip1]) \n\t"
+ "lbux %[temp4], %[temp2](%[VP8kclip1]) \n\t"
+ "sb %[temp3], -1(%[p]) \n\t"
+ "sb %[temp4], 0(%[p]) \n\t"
+ "1: \n\t"
+ "bgtz %[i], 0b \n\t"
+ " addu %[p], %[p], %[stride] \n\t"
+ ".set pop \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
+ [p]"+&r"(p), [i]"=&r"(i)
+ : [stride]"r"(stride), [VP8kclip1]"r"(VP8kclip1), [thresh2]"r"(thresh2)
+ : "memory"
+ );
+}
+
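+// Simple filter applied to the three inner edges (offsets 4, 8 and 12).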
+static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
+ int k;
+ for (k = 3; k > 0; --k) {
+ p += 4 * stride;
+ SimpleVFilter16(p, stride, thresh);
+ }
+}
+
+static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
+ int k;
+ for (k = 3; k > 0; --k) {
+ p += 4;
+ SimpleHFilter16(p, stride, thresh);
+ }
+}
+
+// DST[A * BPS + 0..3]     = TEMP0
+// DST[B + C * BPS + 0..3] = TEMP1
+#define STORE_8_BYTES(TEMP0, TEMP1, A, B, C, DST) \
+ "usw %[" #TEMP0 "], " #A "*" XSTR(BPS) "(%[" #DST "]) \n\t" \
+ "usw %[" #TEMP1 "], " #B "+" #C "*" XSTR(BPS) "(%[" #DST "]) \n\t"
+
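+// Vertical prediction: all four rows repeat the 3-tap smoothed top row,
+// (top[i - 1] + 2 * top[i] + top[i + 1] + 2) >> 2.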
+static void VE4(uint8_t* dst) { // vertical
+ const uint8_t* top = dst - BPS;
+ int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
+ __asm__ volatile (
+ "ulw %[temp0], -1(%[top]) \n\t"
+ "ulh %[temp1], 3(%[top]) \n\t"
+ "preceu.ph.qbr %[temp2], %[temp0] \n\t"
+ "preceu.ph.qbl %[temp3], %[temp0] \n\t"
+ "preceu.ph.qbr %[temp4], %[temp1] \n\t"
+ "packrl.ph %[temp5], %[temp3], %[temp2] \n\t"
+ "packrl.ph %[temp6], %[temp4], %[temp3] \n\t"
+ "shll.ph %[temp5], %[temp5], 1 \n\t"
+ "shll.ph %[temp6], %[temp6], 1 \n\t"
+ "addq.ph %[temp2], %[temp5], %[temp2] \n\t"
+ "addq.ph %[temp6], %[temp6], %[temp4] \n\t"
+ "addq.ph %[temp2], %[temp2], %[temp3] \n\t"
+ "addq.ph %[temp6], %[temp6], %[temp3] \n\t"
+ "shra_r.ph %[temp2], %[temp2], 2 \n\t"
+ "shra_r.ph %[temp6], %[temp6], 2 \n\t"
+ "precr.qb.ph %[temp4], %[temp6], %[temp2] \n\t"
+ STORE_8_BYTES(temp4, temp4, 0, 0, 1, dst)
+ STORE_8_BYTES(temp4, temp4, 2, 0, 3, dst)
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6)
+ : [top]"r"(top), [dst]"r"(dst)
+ : "memory"
+ );
+}
+
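+// Fill the 4x4 block with the rounded average of the 4 top and 4 left
+// neighbours: (sum + 4) >> 3.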
+static void DC4(uint8_t* dst) { // DC
+ int temp0, temp1, temp2, temp3, temp4;
+ __asm__ volatile (
+ "ulw %[temp0], -1*" XSTR(BPS) "(%[dst]) \n\t"
+ LOAD_4_BYTES(temp1, temp2, temp3, temp4, -1, 0, -1, 1, -1, 2, -1, 3, dst)
+ "ins %[temp1], %[temp2], 8, 8 \n\t"
+ "ins %[temp1], %[temp3], 16, 8 \n\t"
+ "ins %[temp1], %[temp4], 24, 8 \n\t"
+ "raddu.w.qb %[temp0], %[temp0] \n\t"
+ "raddu.w.qb %[temp1], %[temp1] \n\t"
+ "addu %[temp0], %[temp0], %[temp1] \n\t"
+ "shra_r.w %[temp0], %[temp0], 3 \n\t"
+ "replv.qb %[temp0], %[temp0] \n\t"
+ STORE_8_BYTES(temp0, temp0, 0, 0, 1, dst)
+ STORE_8_BYTES(temp0, temp0, 2, 0, 3, dst)
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4)
+ : [dst]"r"(dst)
+ : "memory"
+ );
+}
+
+static void RD4(uint8_t* dst) { // Down-right
+ int temp0, temp1, temp2, temp3, temp4;
+ int temp5, temp6, temp7, temp8;
+ __asm__ volatile (
+ LOAD_4_BYTES(temp0, temp1, temp2, temp3, -1, 0, -1, 1, -1, 2, -1, 3, dst)
+ "ulw %[temp7], -1-" XSTR(BPS) "(%[dst]) \n\t"
+ "ins %[temp1], %[temp0], 16, 16 \n\t"
+ "preceu.ph.qbr %[temp5], %[temp7] \n\t"
+ "ins %[temp2], %[temp1], 16, 16 \n\t"
+ "preceu.ph.qbl %[temp4], %[temp7] \n\t"
+ "ins %[temp3], %[temp2], 16, 16 \n\t"
+ "shll.ph %[temp2], %[temp2], 1 \n\t"
+ "addq.ph %[temp3], %[temp3], %[temp1] \n\t"
+ "packrl.ph %[temp6], %[temp5], %[temp1] \n\t"
+ "addq.ph %[temp3], %[temp3], %[temp2] \n\t"
+ "addq.ph %[temp1], %[temp1], %[temp5] \n\t"
+ "shll.ph %[temp6], %[temp6], 1 \n\t"
+ "addq.ph %[temp1], %[temp1], %[temp6] \n\t"
+ "packrl.ph %[temp0], %[temp4], %[temp5] \n\t"
+ "addq.ph %[temp8], %[temp5], %[temp4] \n\t"
+ "shra_r.ph %[temp3], %[temp3], 2 \n\t"
+ "shll.ph %[temp0], %[temp0], 1 \n\t"
+ "shra_r.ph %[temp1], %[temp1], 2 \n\t"
+ "addq.ph %[temp8], %[temp0], %[temp8] \n\t"
+ "lbu %[temp5], 3-" XSTR(BPS) "(%[dst]) \n\t"
+ "precrq.ph.w %[temp7], %[temp7], %[temp7] \n\t"
+ "shra_r.ph %[temp8], %[temp8], 2 \n\t"
+ "ins %[temp7], %[temp5], 0, 8 \n\t"
+ "precr.qb.ph %[temp2], %[temp1], %[temp3] \n\t"
+ "raddu.w.qb %[temp4], %[temp7] \n\t"
+ "precr.qb.ph %[temp6], %[temp8], %[temp1] \n\t"
+ "shra_r.w %[temp4], %[temp4], 2 \n\t"
+ STORE_8_BYTES(temp2, temp6, 3, 0, 1, dst)
+ "prepend %[temp2], %[temp8], 8 \n\t"
+ "prepend %[temp6], %[temp4], 8 \n\t"
+ STORE_8_BYTES(temp2, temp6, 2, 0, 0, dst)
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8)
+ : [dst]"r"(dst)
+ : "memory"
+ );
+}
+
+// TEMP0 = SRC[A * BPS + 0..3]
+// TEMP1 = SRC[B + C * BPS + 0..3]
+#define LOAD_8_BYTES(TEMP0, TEMP1, A, B, C, SRC) \
+ "ulw %[" #TEMP0 "], " #A "*" XSTR(BPS) "(%[" #SRC "]) \n\t" \
+ "ulw %[" #TEMP1 "], " #B "+" #C "*" XSTR(BPS) "(%[" #SRC "]) \n\t"
+
+static void LD4(uint8_t* dst) { // Down-Left
+ int temp0, temp1, temp2, temp3, temp4;
+ int temp5, temp6, temp7, temp8, temp9;
+ __asm__ volatile (
+ LOAD_8_BYTES(temp0, temp1, -1, 4, -1, dst)
+ "preceu.ph.qbl %[temp2], %[temp0] \n\t"
+ "preceu.ph.qbr %[temp3], %[temp0] \n\t"
+ "preceu.ph.qbr %[temp4], %[temp1] \n\t"
+ "preceu.ph.qbl %[temp5], %[temp1] \n\t"
+ "packrl.ph %[temp6], %[temp2], %[temp3] \n\t"
+ "packrl.ph %[temp7], %[temp4], %[temp2] \n\t"
+ "packrl.ph %[temp8], %[temp5], %[temp4] \n\t"
+ "shll.ph %[temp6], %[temp6], 1 \n\t"
+ "addq.ph %[temp9], %[temp2], %[temp6] \n\t"
+ "shll.ph %[temp7], %[temp7], 1 \n\t"
+ "addq.ph %[temp9], %[temp9], %[temp3] \n\t"
+ "shll.ph %[temp8], %[temp8], 1 \n\t"
+ "shra_r.ph %[temp9], %[temp9], 2 \n\t"
+ "addq.ph %[temp3], %[temp4], %[temp7] \n\t"
+ "addq.ph %[temp0], %[temp5], %[temp8] \n\t"
+ "addq.ph %[temp3], %[temp3], %[temp2] \n\t"
+ "addq.ph %[temp0], %[temp0], %[temp4] \n\t"
+ "shra_r.ph %[temp3], %[temp3], 2 \n\t"
+ "shra_r.ph %[temp0], %[temp0], 2 \n\t"
+ "srl %[temp1], %[temp1], 24 \n\t"
+ "sll %[temp1], %[temp1], 1 \n\t"
+ "raddu.w.qb %[temp5], %[temp5] \n\t"
+ "precr.qb.ph %[temp9], %[temp3], %[temp9] \n\t"
+ "precr.qb.ph %[temp3], %[temp0], %[temp3] \n\t"
+ "addu %[temp1], %[temp1], %[temp5] \n\t"
+ "shra_r.w %[temp1], %[temp1], 2 \n\t"
+ STORE_8_BYTES(temp9, temp3, 0, 0, 2, dst)
+ "prepend %[temp9], %[temp0], 8 \n\t"
+ "prepend %[temp3], %[temp1], 8 \n\t"
+ STORE_8_BYTES(temp9, temp3, 1, 0, 3, dst)
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
+ [temp9]"=&r"(temp9)
+ : [dst]"r"(dst)
+ : "memory"
+ );
+}
+
+//------------------------------------------------------------------------------
+// Chroma
+
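+// Fill the 8x8 chroma block with the rounded average of the 8 top and 8 left
+// neighbours: (sum + 8) >> 4.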
+static void DC8uv(uint8_t* dst) { // DC
+ int temp0, temp1, temp2, temp3, temp4;
+ int temp5, temp6, temp7, temp8, temp9;
+ __asm__ volatile (
+ LOAD_8_BYTES(temp0, temp1, -1, 4, -1, dst)
+ LOAD_4_BYTES(temp2, temp3, temp4, temp5, -1, 0, -1, 1, -1, 2, -1, 3, dst)
+ LOAD_4_BYTES(temp6, temp7, temp8, temp9, -1, 4, -1, 5, -1, 6, -1, 7, dst)
+ "raddu.w.qb %[temp0], %[temp0] \n\t"
+ "raddu.w.qb %[temp1], %[temp1] \n\t"
+ "addu %[temp2], %[temp2], %[temp3] \n\t"
+ "addu %[temp4], %[temp4], %[temp5] \n\t"
+ "addu %[temp6], %[temp6], %[temp7] \n\t"
+ "addu %[temp8], %[temp8], %[temp9] \n\t"
+ "addu %[temp0], %[temp0], %[temp1] \n\t"
+ "addu %[temp2], %[temp2], %[temp4] \n\t"
+ "addu %[temp6], %[temp6], %[temp8] \n\t"
+ "addu %[temp0], %[temp0], %[temp2] \n\t"
+ "addu %[temp0], %[temp0], %[temp6] \n\t"
+ "shra_r.w %[temp0], %[temp0], 4 \n\t"
+ "replv.qb %[temp0], %[temp0] \n\t"
+ STORE_8_BYTES(temp0, temp0, 0, 4, 0, dst)
+ STORE_8_BYTES(temp0, temp0, 1, 4, 1, dst)
+ STORE_8_BYTES(temp0, temp0, 2, 4, 2, dst)
+ STORE_8_BYTES(temp0, temp0, 3, 4, 3, dst)
+ STORE_8_BYTES(temp0, temp0, 4, 4, 4, dst)
+ STORE_8_BYTES(temp0, temp0, 5, 4, 5, dst)
+ STORE_8_BYTES(temp0, temp0, 6, 4, 6, dst)
+ STORE_8_BYTES(temp0, temp0, 7, 4, 7, dst)
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
+ [temp9]"=&r"(temp9)
+ : [dst]"r"(dst)
+ : "memory"
+ );
+}
+
+static void DC8uvNoLeft(uint8_t* dst) { // DC with no left samples
+ int temp0, temp1;
+ __asm__ volatile (
+ LOAD_8_BYTES(temp0, temp1, -1, 4, -1, dst)
+ "raddu.w.qb %[temp0], %[temp0] \n\t"
+ "raddu.w.qb %[temp1], %[temp1] \n\t"
+ "addu %[temp0], %[temp0], %[temp1] \n\t"
+ "shra_r.w %[temp0], %[temp0], 3 \n\t"
+ "replv.qb %[temp0], %[temp0] \n\t"
+ STORE_8_BYTES(temp0, temp0, 0, 4, 0, dst)
+ STORE_8_BYTES(temp0, temp0, 1, 4, 1, dst)
+ STORE_8_BYTES(temp0, temp0, 2, 4, 2, dst)
+ STORE_8_BYTES(temp0, temp0, 3, 4, 3, dst)
+ STORE_8_BYTES(temp0, temp0, 4, 4, 4, dst)
+ STORE_8_BYTES(temp0, temp0, 5, 4, 5, dst)
+ STORE_8_BYTES(temp0, temp0, 6, 4, 6, dst)
+ STORE_8_BYTES(temp0, temp0, 7, 4, 7, dst)
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1)
+ : [dst]"r"(dst)
+ : "memory"
+ );
+}
+
+static void DC8uvNoTop(uint8_t* dst) { // DC with no top samples
+ int temp0, temp1, temp2, temp3, temp4;
+ int temp5, temp6, temp7, temp8;
+ __asm__ volatile (
+ LOAD_4_BYTES(temp2, temp3, temp4, temp5, -1, 0, -1, 1, -1, 2, -1, 3, dst)
+ LOAD_4_BYTES(temp6, temp7, temp8, temp1, -1, 4, -1, 5, -1, 6, -1, 7, dst)
+ "addu %[temp2], %[temp2], %[temp3] \n\t"
+ "addu %[temp4], %[temp4], %[temp5] \n\t"
+ "addu %[temp6], %[temp6], %[temp7] \n\t"
+ "addu %[temp8], %[temp8], %[temp1] \n\t"
+ "addu %[temp2], %[temp2], %[temp4] \n\t"
+ "addu %[temp6], %[temp6], %[temp8] \n\t"
+ "addu %[temp0], %[temp6], %[temp2] \n\t"
+ "shra_r.w %[temp0], %[temp0], 3 \n\t"
+ "replv.qb %[temp0], %[temp0] \n\t"
+ STORE_8_BYTES(temp0, temp0, 0, 4, 0, dst)
+ STORE_8_BYTES(temp0, temp0, 1, 4, 1, dst)
+ STORE_8_BYTES(temp0, temp0, 2, 4, 2, dst)
+ STORE_8_BYTES(temp0, temp0, 3, 4, 3, dst)
+ STORE_8_BYTES(temp0, temp0, 4, 4, 4, dst)
+ STORE_8_BYTES(temp0, temp0, 5, 4, 5, dst)
+ STORE_8_BYTES(temp0, temp0, 6, 4, 6, dst)
+ STORE_8_BYTES(temp0, temp0, 7, 4, 7, dst)
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8)
+ : [dst]"r"(dst)
+ : "memory"
+ );
+}
+
+#undef LOAD_8_BYTES
+#undef STORE_8_BYTES
+#undef LOAD_4_BYTES
+
+#define CLIPPING(SIZE) \
+ "preceu.ph.qbl %[temp2], %[temp0] \n\t" \
+ "preceu.ph.qbr %[temp0], %[temp0] \n\t" \
+".if " #SIZE " == 8 \n\t" \
+ "preceu.ph.qbl %[temp3], %[temp1] \n\t" \
+ "preceu.ph.qbr %[temp1], %[temp1] \n\t" \
+".endif \n\t" \
+ "addu.ph %[temp2], %[temp2], %[dst_1] \n\t" \
+ "addu.ph %[temp0], %[temp0], %[dst_1] \n\t" \
+".if " #SIZE " == 8 \n\t" \
+ "addu.ph %[temp3], %[temp3], %[dst_1] \n\t" \
+ "addu.ph %[temp1], %[temp1], %[dst_1] \n\t" \
+".endif \n\t" \
+ "shll_s.ph %[temp2], %[temp2], 7 \n\t" \
+ "shll_s.ph %[temp0], %[temp0], 7 \n\t" \
+".if " #SIZE " == 8 \n\t" \
+ "shll_s.ph %[temp3], %[temp3], 7 \n\t" \
+ "shll_s.ph %[temp1], %[temp1], 7 \n\t" \
+".endif \n\t" \
+ "precrqu_s.qb.ph %[temp0], %[temp2], %[temp0] \n\t" \
+".if " #SIZE " == 8 \n\t" \
+ "precrqu_s.qb.ph %[temp1], %[temp3], %[temp1] \n\t" \
+".endif \n\t"
+
+#define CLIP_8B_TO_DST(DST, TOP, SIZE) do { \
+ int dst_1 = ((int)(DST)[-1] << 16) + (DST)[-1]; \
+ int temp0, temp1, temp2, temp3; \
+ __asm__ volatile ( \
+ ".if " #SIZE " < 8 \n\t" \
+ "ulw %[temp0], 0(%[top]) \n\t" \
+ "subu.ph %[dst_1], %[dst_1], %[top_1] \n\t" \
+ CLIPPING(4) \
+ "usw %[temp0], 0(%[dst]) \n\t" \
+ ".else \n\t" \
+ "ulw %[temp0], 0(%[top]) \n\t" \
+ "ulw %[temp1], 4(%[top]) \n\t" \
+ "subu.ph %[dst_1], %[dst_1], %[top_1] \n\t" \
+ CLIPPING(8) \
+ "usw %[temp0], 0(%[dst]) \n\t" \
+ "usw %[temp1], 4(%[dst]) \n\t" \
+ ".if " #SIZE " == 16 \n\t" \
+ "ulw %[temp0], 8(%[top]) \n\t" \
+ "ulw %[temp1], 12(%[top]) \n\t" \
+ CLIPPING(8) \
+ "usw %[temp0], 8(%[dst]) \n\t" \
+ "usw %[temp1], 12(%[dst]) \n\t" \
+ ".endif \n\t" \
+ ".endif \n\t" \
+ : [dst_1]"+&r"(dst_1), [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), \
+ [temp2]"=&r"(temp2), [temp3]"=&r"(temp3) \
+ : [top_1]"r"(top_1), [top]"r"((TOP)), [dst]"r"((DST)) \
+ : "memory" \
+ ); \
+} while (0)
+
+#define CLIP_TO_DST(DST, SIZE) do { \
+ int y; \
+ const uint8_t* top = (DST) - BPS; \
+ const int top_1 = ((int)top[-1] << 16) + top[-1]; \
+ for (y = 0; y < (SIZE); ++y) { \
+ CLIP_8B_TO_DST((DST), top, (SIZE)); \
+ (DST) += BPS; \
+ } \
+} while (0)
+
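+// TrueMotion prediction: dst(x, y) = clip(top(x) + left(y) - top(-1)),
+// evaluated row by row with the saturating shifts and packs in CLIPPING.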
+#define TRUE_MOTION(DST, SIZE) \
+static void TrueMotion##SIZE(uint8_t* (DST)) { \
+ CLIP_TO_DST((DST), (SIZE)); \
+}
+
+TRUE_MOTION(dst, 4)
+TRUE_MOTION(dst, 8)
+TRUE_MOTION(dst, 16)
+
+#undef TRUE_MOTION
+#undef CLIP_TO_DST
+#undef CLIP_8B_TO_DST
+#undef CLIPPING
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8DspInitMIPSdspR2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitMIPSdspR2(void) {
+ VP8TransformDC = TransformDC;
+ VP8TransformAC3 = TransformAC3;
+ VP8Transform = TransformTwo;
+
+ VP8VFilter16 = VFilter16;
+ VP8HFilter16 = HFilter16;
+ VP8VFilter8 = VFilter8;
+ VP8HFilter8 = HFilter8;
+ VP8VFilter16i = VFilter16i;
+ VP8HFilter16i = HFilter16i;
+ VP8VFilter8i = VFilter8i;
+ VP8HFilter8i = HFilter8i;
+ VP8SimpleVFilter16 = SimpleVFilter16;
+ VP8SimpleHFilter16 = SimpleHFilter16;
+ VP8SimpleVFilter16i = SimpleVFilter16i;
+ VP8SimpleHFilter16i = SimpleHFilter16i;
+
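+  // Only a subset of the predictors has a DSP R2 version; the remaining
+  // VP8PredLuma4 / VP8PredChroma8 / VP8PredLuma16 entries keep their
+  // portable C implementations.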
+ VP8PredLuma4[0] = DC4;
+ VP8PredLuma4[1] = TrueMotion4;
+ VP8PredLuma4[2] = VE4;
+ VP8PredLuma4[4] = RD4;
+ VP8PredLuma4[6] = LD4;
+
+ VP8PredChroma8[0] = DC8uv;
+ VP8PredChroma8[1] = TrueMotion8;
+ VP8PredChroma8[4] = DC8uvNoTop;
+ VP8PredChroma8[5] = DC8uvNoLeft;
+
+ VP8PredLuma16[1] = TrueMotion16;
+}
+
+#else // !WEBP_USE_MIPS_DSP_R2
+
+WEBP_DSP_INIT_STUB(VP8DspInitMIPSdspR2)
+
+#endif // WEBP_USE_MIPS_DSP_R2
diff --git a/media/libwebp/dsp/dec_msa.c b/media/libwebp/dsp/dec_msa.c
new file mode 100644
index 0000000000..5b0b14cc93
--- /dev/null
+++ b/media/libwebp/dsp/dec_msa.c
@@ -0,0 +1,1020 @@
+// Copyright 2016 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MSA version of dsp functions
+//
+// Author(s): Prashant Patil (prashant.patil@imgtec.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_MSA)
+
+#include "../dsp/msa_macro.h"
+
+//------------------------------------------------------------------------------
+// Transforms
+
+#define IDCT_1D_W(in0, in1, in2, in3, out0, out1, out2, out3) { \
+ v4i32 a1_m, b1_m, c1_m, d1_m; \
+ v4i32 c_tmp1_m, c_tmp2_m, d_tmp1_m, d_tmp2_m; \
+ const v4i32 cospi8sqrt2minus1 = __msa_fill_w(20091); \
+ const v4i32 sinpi8sqrt2 = __msa_fill_w(35468); \
+ \
+ a1_m = in0 + in2; \
+ b1_m = in0 - in2; \
+ c_tmp1_m = (in1 * sinpi8sqrt2) >> 16; \
+ c_tmp2_m = in3 + ((in3 * cospi8sqrt2minus1) >> 16); \
+ c1_m = c_tmp1_m - c_tmp2_m; \
+ d_tmp1_m = in1 + ((in1 * cospi8sqrt2minus1) >> 16); \
+ d_tmp2_m = (in3 * sinpi8sqrt2) >> 16; \
+ d1_m = d_tmp1_m + d_tmp2_m; \
+ BUTTERFLY_4(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3); \
+}
+#define MULT1(a) ((((a) * 20091) >> 16) + (a))
+#define MULT2(a) (((a) * 35468) >> 16)
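+// In Q16 fixed point: MULT1(a) ~= sqrt(2) * cos(pi / 8) * a and
+// MULT2(a) ~= sqrt(2) * sin(pi / 8) * a, matching kC1/kC2 in the other ports.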
+
+static void TransformOne(const int16_t* in, uint8_t* dst) {
+ v8i16 input0, input1;
+ v4i32 in0, in1, in2, in3, hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3;
+ v4i32 res0, res1, res2, res3;
+ const v16i8 zero = { 0 };
+ v16i8 dest0, dest1, dest2, dest3;
+
+ LD_SH2(in, 8, input0, input1);
+ UNPCK_SH_SW(input0, in0, in1);
+ UNPCK_SH_SW(input1, in2, in3);
+ IDCT_1D_W(in0, in1, in2, in3, hz0, hz1, hz2, hz3);
+ TRANSPOSE4x4_SW_SW(hz0, hz1, hz2, hz3, hz0, hz1, hz2, hz3);
+ IDCT_1D_W(hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3);
+ SRARI_W4_SW(vt0, vt1, vt2, vt3, 3);
+ TRANSPOSE4x4_SW_SW(vt0, vt1, vt2, vt3, vt0, vt1, vt2, vt3);
+ LD_SB4(dst, BPS, dest0, dest1, dest2, dest3);
+ ILVR_B4_SW(zero, dest0, zero, dest1, zero, dest2, zero, dest3,
+ res0, res1, res2, res3);
+ ILVR_H4_SW(zero, res0, zero, res1, zero, res2, zero, res3,
+ res0, res1, res2, res3);
+ ADD4(res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0, res1, res2, res3);
+ CLIP_SW4_0_255(res0, res1, res2, res3);
+ PCKEV_B2_SW(res0, res1, res2, res3, vt0, vt1);
+ res0 = (v4i32)__msa_pckev_b((v16i8)vt0, (v16i8)vt1);
+ ST4x4_UB(res0, res0, 3, 2, 1, 0, dst, BPS);
+}
+
+static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
+ TransformOne(in, dst);
+ if (do_two) {
+ TransformOne(in + 16, dst + 4);
+ }
+}
+
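+// Inverse Walsh-Hadamard transform of the 16 DC levels; result k is scattered
+// to out[16 * k], the DC slot of the k-th 4x4 block.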
+static void TransformWHT(const int16_t* in, int16_t* out) {
+ v8i16 input0, input1;
+ const v8i16 mask0 = { 0, 1, 2, 3, 8, 9, 10, 11 };
+ const v8i16 mask1 = { 4, 5, 6, 7, 12, 13, 14, 15 };
+ const v8i16 mask2 = { 0, 4, 8, 12, 1, 5, 9, 13 };
+ const v8i16 mask3 = { 3, 7, 11, 15, 2, 6, 10, 14 };
+ v8i16 tmp0, tmp1, tmp2, tmp3;
+ v8i16 out0, out1;
+
+ LD_SH2(in, 8, input0, input1);
+ input1 = SLDI_SH(input1, input1, 8);
+ tmp0 = input0 + input1;
+ tmp1 = input0 - input1;
+ VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
+ out0 = tmp2 + tmp3;
+ out1 = tmp2 - tmp3;
+ VSHF_H2_SH(out0, out1, out0, out1, mask2, mask3, input0, input1);
+ tmp0 = input0 + input1;
+ tmp1 = input0 - input1;
+ VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
+ tmp0 = tmp2 + tmp3;
+ tmp1 = tmp2 - tmp3;
+ ADDVI_H2_SH(tmp0, 3, tmp1, 3, out0, out1);
+ SRAI_H2_SH(out0, out1, 3);
+ out[0] = __msa_copy_s_h(out0, 0);
+ out[16] = __msa_copy_s_h(out0, 4);
+ out[32] = __msa_copy_s_h(out1, 0);
+ out[48] = __msa_copy_s_h(out1, 4);
+ out[64] = __msa_copy_s_h(out0, 1);
+ out[80] = __msa_copy_s_h(out0, 5);
+ out[96] = __msa_copy_s_h(out1, 1);
+ out[112] = __msa_copy_s_h(out1, 5);
+ out[128] = __msa_copy_s_h(out0, 2);
+ out[144] = __msa_copy_s_h(out0, 6);
+ out[160] = __msa_copy_s_h(out1, 2);
+ out[176] = __msa_copy_s_h(out1, 6);
+ out[192] = __msa_copy_s_h(out0, 3);
+ out[208] = __msa_copy_s_h(out0, 7);
+ out[224] = __msa_copy_s_h(out1, 3);
+ out[240] = __msa_copy_s_h(out1, 7);
+}
+
+static void TransformDC(const int16_t* in, uint8_t* dst) {
+ const int DC = (in[0] + 4) >> 3;
+ const v8i16 tmp0 = __msa_fill_h(DC);
+ ADDBLK_ST4x4_UB(tmp0, tmp0, tmp0, tmp0, dst, BPS);
+}
+
+static void TransformAC3(const int16_t* in, uint8_t* dst) {
+ const int a = in[0] + 4;
+ const int c4 = MULT2(in[4]);
+ const int d4 = MULT1(in[4]);
+ const int in2 = MULT2(in[1]);
+ const int in3 = MULT1(in[1]);
+ v4i32 tmp0 = { 0 };
+ v4i32 out0 = __msa_fill_w(a + d4);
+ v4i32 out1 = __msa_fill_w(a + c4);
+ v4i32 out2 = __msa_fill_w(a - c4);
+ v4i32 out3 = __msa_fill_w(a - d4);
+ v4i32 res0, res1, res2, res3;
+ const v4i32 zero = { 0 };
+ v16u8 dest0, dest1, dest2, dest3;
+
+ INSERT_W4_SW(in3, in2, -in2, -in3, tmp0);
+ ADD4(out0, tmp0, out1, tmp0, out2, tmp0, out3, tmp0,
+ out0, out1, out2, out3);
+ SRAI_W4_SW(out0, out1, out2, out3, 3);
+ LD_UB4(dst, BPS, dest0, dest1, dest2, dest3);
+ ILVR_B4_SW(zero, dest0, zero, dest1, zero, dest2, zero, dest3,
+ res0, res1, res2, res3);
+ ILVR_H4_SW(zero, res0, zero, res1, zero, res2, zero, res3,
+ res0, res1, res2, res3);
+ ADD4(res0, out0, res1, out1, res2, out2, res3, out3, res0, res1, res2, res3);
+ CLIP_SW4_0_255(res0, res1, res2, res3);
+ PCKEV_B2_SW(res0, res1, res2, res3, out0, out1);
+ res0 = (v4i32)__msa_pckev_b((v16i8)out0, (v16i8)out1);
+ ST4x4_UB(res0, res0, 3, 2, 1, 0, dst, BPS);
+}
+
+//------------------------------------------------------------------------------
+// Edge filtering functions
+
+#define FLIP_SIGN2(in0, in1, out0, out1) { \
+ out0 = (v16i8)__msa_xori_b(in0, 0x80); \
+ out1 = (v16i8)__msa_xori_b(in1, 0x80); \
+}
+
+#define FLIP_SIGN4(in0, in1, in2, in3, out0, out1, out2, out3) { \
+ FLIP_SIGN2(in0, in1, out0, out1); \
+ FLIP_SIGN2(in2, in3, out2, out3); \
+}
+
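+// filt = saturating filt + 3 * (q0 - p0), then masked to the positions that
+// passed the filter threshold.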
+#define FILT_VAL(q0_m, p0_m, mask, filt) do { \
+ v16i8 q0_sub_p0; \
+ q0_sub_p0 = __msa_subs_s_b(q0_m, p0_m); \
+ filt = __msa_adds_s_b(filt, q0_sub_p0); \
+ filt = __msa_adds_s_b(filt, q0_sub_p0); \
+ filt = __msa_adds_s_b(filt, q0_sub_p0); \
+ filt = filt & mask; \
+} while (0)
+
+#define FILT2(q_m, p_m, q, p) do { \
+ u_r = SRAI_H(temp1, 7); \
+ u_r = __msa_sat_s_h(u_r, 7); \
+ u_l = SRAI_H(temp3, 7); \
+ u_l = __msa_sat_s_h(u_l, 7); \
+ u = __msa_pckev_b((v16i8)u_l, (v16i8)u_r); \
+ q_m = __msa_subs_s_b(q_m, u); \
+ p_m = __msa_adds_s_b(p_m, u); \
+ q = __msa_xori_b((v16u8)q_m, 0x80); \
+ p = __msa_xori_b((v16u8)p_m, 0x80); \
+} while (0)
+
+#define LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev) do { \
+ v16i8 p1_m, p0_m, q0_m, q1_m; \
+ v16i8 filt, t1, t2; \
+ const v16i8 cnst4b = __msa_ldi_b(4); \
+ const v16i8 cnst3b = __msa_ldi_b(3); \
+ \
+ FLIP_SIGN4(p1, p0, q0, q1, p1_m, p0_m, q0_m, q1_m); \
+ filt = __msa_subs_s_b(p1_m, q1_m); \
+ filt = filt & hev; \
+ FILT_VAL(q0_m, p0_m, mask, filt); \
+ t1 = __msa_adds_s_b(filt, cnst4b); \
+ t1 = SRAI_B(t1, 3); \
+ t2 = __msa_adds_s_b(filt, cnst3b); \
+ t2 = SRAI_B(t2, 3); \
+ q0_m = __msa_subs_s_b(q0_m, t1); \
+ q0 = __msa_xori_b((v16u8)q0_m, 0x80); \
+ p0_m = __msa_adds_s_b(p0_m, t2); \
+ p0 = __msa_xori_b((v16u8)p0_m, 0x80); \
+ filt = __msa_srari_b(t1, 1); \
+ hev = __msa_xori_b(hev, 0xff); \
+ filt = filt & hev; \
+ q1_m = __msa_subs_s_b(q1_m, filt); \
+ q1 = __msa_xori_b((v16u8)q1_m, 0x80); \
+ p1_m = __msa_adds_s_b(p1_m, filt); \
+ p1 = __msa_xori_b((v16u8)p1_m, 0x80); \
+} while (0)
+
+#define LPF_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev) do { \
+ v16i8 p2_m, p1_m, p0_m, q2_m, q1_m, q0_m; \
+ v16i8 u, filt, t1, t2, filt_sign; \
+ v8i16 filt_r, filt_l, u_r, u_l; \
+ v8i16 temp0, temp1, temp2, temp3; \
+ const v16i8 cnst4b = __msa_ldi_b(4); \
+ const v16i8 cnst3b = __msa_ldi_b(3); \
+ const v8i16 cnst9h = __msa_ldi_h(9); \
+ const v8i16 cnst63h = __msa_ldi_h(63); \
+ \
+ FLIP_SIGN4(p1, p0, q0, q1, p1_m, p0_m, q0_m, q1_m); \
+ filt = __msa_subs_s_b(p1_m, q1_m); \
+ FILT_VAL(q0_m, p0_m, mask, filt); \
+ FLIP_SIGN2(p2, q2, p2_m, q2_m); \
+ t2 = filt & hev; \
+ /* filt_val &= ~hev */ \
+ hev = __msa_xori_b(hev, 0xff); \
+ filt = filt & hev; \
+ t1 = __msa_adds_s_b(t2, cnst4b); \
+ t1 = SRAI_B(t1, 3); \
+ t2 = __msa_adds_s_b(t2, cnst3b); \
+ t2 = SRAI_B(t2, 3); \
+ q0_m = __msa_subs_s_b(q0_m, t1); \
+ p0_m = __msa_adds_s_b(p0_m, t2); \
+ filt_sign = __msa_clti_s_b(filt, 0); \
+ ILVRL_B2_SH(filt_sign, filt, filt_r, filt_l); \
+ /* update q2/p2 */ \
+ temp0 = filt_r * cnst9h; \
+ temp1 = temp0 + cnst63h; \
+ temp2 = filt_l * cnst9h; \
+ temp3 = temp2 + cnst63h; \
+ FILT2(q2_m, p2_m, q2, p2); \
+ /* update q1/p1 */ \
+ temp1 = temp1 + temp0; \
+ temp3 = temp3 + temp2; \
+ FILT2(q1_m, p1_m, q1, p1); \
+ /* update q0/p0 */ \
+ temp1 = temp1 + temp0; \
+ temp3 = temp3 + temp2; \
+ FILT2(q0_m, p0_m, q0, p0); \
+} while (0)
+
+#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, \
+ q0_in, q1_in, q2_in, q3_in, \
+ limit_in, b_limit_in, thresh_in, \
+ hev_out, mask_out) do { \
+ v16u8 p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m; \
+ v16u8 p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m; \
+ v16u8 flat_out; \
+ \
+ /* absolute subtraction of pixel values */ \
+ p3_asub_p2_m = __msa_asub_u_b(p3_in, p2_in); \
+ p2_asub_p1_m = __msa_asub_u_b(p2_in, p1_in); \
+ p1_asub_p0_m = __msa_asub_u_b(p1_in, p0_in); \
+ q1_asub_q0_m = __msa_asub_u_b(q1_in, q0_in); \
+ q2_asub_q1_m = __msa_asub_u_b(q2_in, q1_in); \
+ q3_asub_q2_m = __msa_asub_u_b(q3_in, q2_in); \
+ p0_asub_q0_m = __msa_asub_u_b(p0_in, q0_in); \
+ p1_asub_q1_m = __msa_asub_u_b(p1_in, q1_in); \
+ /* calculation of hev */ \
+ flat_out = __msa_max_u_b(p1_asub_p0_m, q1_asub_q0_m); \
+ hev_out = (thresh_in < flat_out); \
+ /* calculation of mask */ \
+ p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p0_asub_q0_m); \
+ p1_asub_q1_m = SRAI_B(p1_asub_q1_m, 1); \
+ p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p1_asub_q1_m); \
+ mask_out = (b_limit_in < p0_asub_q0_m); \
+ mask_out = __msa_max_u_b(flat_out, mask_out); \
+ p3_asub_p2_m = __msa_max_u_b(p3_asub_p2_m, p2_asub_p1_m); \
+ mask_out = __msa_max_u_b(p3_asub_p2_m, mask_out); \
+ q2_asub_q1_m = __msa_max_u_b(q2_asub_q1_m, q3_asub_q2_m); \
+ mask_out = __msa_max_u_b(q2_asub_q1_m, mask_out); \
+ mask_out = (limit_in < mask_out); \
+ mask_out = __msa_xori_b(mask_out, 0xff); \
+} while (0)
+
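+// Stores 6 bytes: a word from in0 followed by a halfword from in1 at
+// pdst + stride (callers pass stride == 4, making the six bytes contiguous).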
+#define ST6x1_UB(in0, in0_idx, in1, in1_idx, pdst, stride) do { \
+ const uint16_t tmp0_h = __msa_copy_s_h((v8i16)in1, in1_idx); \
+ const uint32_t tmp0_w = __msa_copy_s_w((v4i32)in0, in0_idx); \
+ SW(tmp0_w, pdst); \
+ SH(tmp0_h, pdst + stride); \
+} while (0)
+
+#define ST6x4_UB(in0, start_in0_idx, in1, start_in1_idx, pdst, stride) do { \
+ uint8_t* ptmp1 = (uint8_t*)pdst; \
+ ST6x1_UB(in0, start_in0_idx, in1, start_in1_idx, ptmp1, 4); \
+ ptmp1 += stride; \
+ ST6x1_UB(in0, start_in0_idx + 1, in1, start_in1_idx + 1, ptmp1, 4); \
+ ptmp1 += stride; \
+ ST6x1_UB(in0, start_in0_idx + 2, in1, start_in1_idx + 2, ptmp1, 4); \
+ ptmp1 += stride; \
+ ST6x1_UB(in0, start_in0_idx + 3, in1, start_in1_idx + 3, ptmp1, 4); \
+} while (0)
+
+#define LPF_SIMPLE_FILT(p1_in, p0_in, q0_in, q1_in, mask) do { \
+ v16i8 p1_m, p0_m, q0_m, q1_m, filt, filt1, filt2; \
+ const v16i8 cnst4b = __msa_ldi_b(4); \
+ const v16i8 cnst3b = __msa_ldi_b(3); \
+ \
+ FLIP_SIGN4(p1_in, p0_in, q0_in, q1_in, p1_m, p0_m, q0_m, q1_m); \
+ filt = __msa_subs_s_b(p1_m, q1_m); \
+ FILT_VAL(q0_m, p0_m, mask, filt); \
+ filt1 = __msa_adds_s_b(filt, cnst4b); \
+ filt1 = SRAI_B(filt1, 3); \
+ filt2 = __msa_adds_s_b(filt, cnst3b); \
+ filt2 = SRAI_B(filt2, 3); \
+ q0_m = __msa_subs_s_b(q0_m, filt1); \
+ p0_m = __msa_adds_s_b(p0_m, filt2); \
+ q0_in = __msa_xori_b((v16u8)q0_m, 0x80); \
+ p0_in = __msa_xori_b((v16u8)p0_m, 0x80); \
+} while (0)
+
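+// mask selects positions where 2 * |p0 - q0| + |p1 - q1| / 2 <= b_limit,
+// the simple-filter threshold test.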
+#define LPF_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask) do { \
+ v16u8 p1_a_sub_q1, p0_a_sub_q0; \
+ \
+ p0_a_sub_q0 = __msa_asub_u_b(p0, q0); \
+ p1_a_sub_q1 = __msa_asub_u_b(p1, q1); \
+ p1_a_sub_q1 = (v16u8)__msa_srli_b((v16i8)p1_a_sub_q1, 1); \
+ p0_a_sub_q0 = __msa_adds_u_b(p0_a_sub_q0, p0_a_sub_q0); \
+ mask = __msa_adds_u_b(p0_a_sub_q0, p1_a_sub_q1); \
+ mask = (mask <= b_limit); \
+} while (0)
+
+static void VFilter16(uint8_t* src, int stride,
+ int b_limit_in, int limit_in, int thresh_in) {
+ uint8_t* ptemp = src - 4 * stride;
+ v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+ v16u8 mask, hev;
+ const v16u8 thresh = (v16u8)__msa_fill_b(thresh_in);
+ const v16u8 limit = (v16u8)__msa_fill_b(limit_in);
+ const v16u8 b_limit = (v16u8)__msa_fill_b(b_limit_in);
+
+ LD_UB8(ptemp, stride, p3, p2, p1, p0, q0, q1, q2, q3);
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+ hev, mask);
+ LPF_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
+ ptemp = src - 3 * stride;
+ ST_UB4(p2, p1, p0, q0, ptemp, stride);
+ ptemp += (4 * stride);
+ ST_UB2(q1, q2, ptemp, stride);
+}
+
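+// Vertical-edge counterpart: transposes 16 rows into pixel vectors, filters,
+// and writes the 6 modified bytes of each row back.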
+static void HFilter16(uint8_t* src, int stride,
+ int b_limit_in, int limit_in, int thresh_in) {
+ uint8_t* ptmp = src - 4;
+ v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+ v16u8 mask, hev;
+ v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
+ v16u8 row9, row10, row11, row12, row13, row14, row15;
+ v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ const v16u8 b_limit = (v16u8)__msa_fill_b(b_limit_in);
+ const v16u8 limit = (v16u8)__msa_fill_b(limit_in);
+ const v16u8 thresh = (v16u8)__msa_fill_b(thresh_in);
+
+ LD_UB8(ptmp, stride, row0, row1, row2, row3, row4, row5, row6, row7);
+ ptmp += (8 * stride);
+ LD_UB8(ptmp, stride, row8, row9, row10, row11, row12, row13, row14, row15);
+ TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
+ row8, row9, row10, row11, row12, row13, row14, row15,
+ p3, p2, p1, p0, q0, q1, q2, q3);
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+ hev, mask);
+ LPF_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
+ ILVR_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
+ ILVRL_H2_SH(tmp1, tmp0, tmp3, tmp4);
+ ILVL_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
+ ILVRL_H2_SH(tmp1, tmp0, tmp6, tmp7);
+ ILVRL_B2_SH(q2, q1, tmp2, tmp5);
+ ptmp = src - 3;
+ ST6x1_UB(tmp3, 0, tmp2, 0, ptmp, 4);
+ ptmp += stride;
+ ST6x1_UB(tmp3, 1, tmp2, 1, ptmp, 4);
+ ptmp += stride;
+ ST6x1_UB(tmp3, 2, tmp2, 2, ptmp, 4);
+ ptmp += stride;
+ ST6x1_UB(tmp3, 3, tmp2, 3, ptmp, 4);
+ ptmp += stride;
+ ST6x1_UB(tmp4, 0, tmp2, 4, ptmp, 4);
+ ptmp += stride;
+ ST6x1_UB(tmp4, 1, tmp2, 5, ptmp, 4);
+ ptmp += stride;
+ ST6x1_UB(tmp4, 2, tmp2, 6, ptmp, 4);
+ ptmp += stride;
+ ST6x1_UB(tmp4, 3, tmp2, 7, ptmp, 4);
+ ptmp += stride;
+ ST6x1_UB(tmp6, 0, tmp5, 0, ptmp, 4);
+ ptmp += stride;
+ ST6x1_UB(tmp6, 1, tmp5, 1, ptmp, 4);
+ ptmp += stride;
+ ST6x1_UB(tmp6, 2, tmp5, 2, ptmp, 4);
+ ptmp += stride;
+ ST6x1_UB(tmp6, 3, tmp5, 3, ptmp, 4);
+ ptmp += stride;
+ ST6x1_UB(tmp7, 0, tmp5, 4, ptmp, 4);
+ ptmp += stride;
+ ST6x1_UB(tmp7, 1, tmp5, 5, ptmp, 4);
+ ptmp += stride;
+ ST6x1_UB(tmp7, 2, tmp5, 6, ptmp, 4);
+ ptmp += stride;
+ ST6x1_UB(tmp7, 3, tmp5, 7, ptmp, 4);
+}
+
+// on the three inner edges (one helper call per edge)
+static void VFilterHorEdge16i(uint8_t* src, int stride,
+ int b_limit, int limit, int thresh) {
+ v16u8 mask, hev;
+ v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+ const v16u8 thresh0 = (v16u8)__msa_fill_b(thresh);
+ const v16u8 b_limit0 = (v16u8)__msa_fill_b(b_limit);
+ const v16u8 limit0 = (v16u8)__msa_fill_b(limit);
+
+ LD_UB8((src - 4 * stride), stride, p3, p2, p1, p0, q0, q1, q2, q3);
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
+ hev, mask);
+ LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
+ ST_UB4(p1, p0, q0, q1, (src - 2 * stride), stride);
+}
+
+static void VFilter16i(uint8_t* src_y, int stride,
+ int b_limit, int limit, int thresh) {
+ VFilterHorEdge16i(src_y + 4 * stride, stride, b_limit, limit, thresh);
+ VFilterHorEdge16i(src_y + 8 * stride, stride, b_limit, limit, thresh);
+ VFilterHorEdge16i(src_y + 12 * stride, stride, b_limit, limit, thresh);
+}
+
+static void HFilterVertEdge16i(uint8_t* src, int stride,
+ int b_limit, int limit, int thresh) {
+ v16u8 mask, hev;
+ v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+ v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
+ v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
+ v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+ const v16u8 thresh0 = (v16u8)__msa_fill_b(thresh);
+ const v16u8 b_limit0 = (v16u8)__msa_fill_b(b_limit);
+ const v16u8 limit0 = (v16u8)__msa_fill_b(limit);
+
+ LD_UB8(src - 4, stride, row0, row1, row2, row3, row4, row5, row6, row7);
+ LD_UB8(src - 4 + (8 * stride), stride,
+ row8, row9, row10, row11, row12, row13, row14, row15);
+ TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
+ row8, row9, row10, row11, row12, row13, row14, row15,
+ p3, p2, p1, p0, q0, q1, q2, q3);
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
+ hev, mask);
+ LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
+ ILVR_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
+ ILVRL_H2_SH(tmp1, tmp0, tmp2, tmp3);
+ ILVL_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
+ ILVRL_H2_SH(tmp1, tmp0, tmp4, tmp5);
+ src -= 2;
+ ST4x8_UB(tmp2, tmp3, src, stride);
+ src += (8 * stride);
+ ST4x8_UB(tmp4, tmp5, src, stride);
+}
+
+static void HFilter16i(uint8_t* src_y, int stride,
+ int b_limit, int limit, int thresh) {
+ HFilterVertEdge16i(src_y + 4, stride, b_limit, limit, thresh);
+ HFilterVertEdge16i(src_y + 8, stride, b_limit, limit, thresh);
+ HFilterVertEdge16i(src_y + 12, stride, b_limit, limit, thresh);
+}
+
+// 8-pixels wide variants, for chroma filtering
+static void VFilter8(uint8_t* src_u, uint8_t* src_v, int stride,
+ int b_limit_in, int limit_in, int thresh_in) {
+ uint8_t* ptmp_src_u = src_u - 4 * stride;
+ uint8_t* ptmp_src_v = src_v - 4 * stride;
+ uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
+ v16u8 p3, p2, p1, p0, q3, q2, q1, q0, mask, hev;
+ v16u8 p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u;
+ v16u8 p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v;
+ const v16u8 b_limit = (v16u8)__msa_fill_b(b_limit_in);
+ const v16u8 limit = (v16u8)__msa_fill_b(limit_in);
+ const v16u8 thresh = (v16u8)__msa_fill_b(thresh_in);
+
+ LD_UB8(ptmp_src_u, stride, p3_u, p2_u, p1_u, p0_u, q0_u, q1_u, q2_u, q3_u);
+ LD_UB8(ptmp_src_v, stride, p3_v, p2_v, p1_v, p0_v, q0_v, q1_v, q2_v, q3_v);
+ ILVR_D4_UB(p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3, p2, p1, p0);
+ ILVR_D4_UB(q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0, q1, q2, q3);
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+ hev, mask);
+ LPF_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
+ p2_d = __msa_copy_s_d((v2i64)p2, 0);
+ p1_d = __msa_copy_s_d((v2i64)p1, 0);
+ p0_d = __msa_copy_s_d((v2i64)p0, 0);
+ q0_d = __msa_copy_s_d((v2i64)q0, 0);
+ q1_d = __msa_copy_s_d((v2i64)q1, 0);
+ q2_d = __msa_copy_s_d((v2i64)q2, 0);
+ ptmp_src_u += stride;
+ SD4(p2_d, p1_d, p0_d, q0_d, ptmp_src_u, stride);
+ ptmp_src_u += (4 * stride);
+ SD(q1_d, ptmp_src_u);
+ ptmp_src_u += stride;
+ SD(q2_d, ptmp_src_u);
+ p2_d = __msa_copy_s_d((v2i64)p2, 1);
+ p1_d = __msa_copy_s_d((v2i64)p1, 1);
+ p0_d = __msa_copy_s_d((v2i64)p0, 1);
+ q0_d = __msa_copy_s_d((v2i64)q0, 1);
+ q1_d = __msa_copy_s_d((v2i64)q1, 1);
+ q2_d = __msa_copy_s_d((v2i64)q2, 1);
+ ptmp_src_v += stride;
+ SD4(p2_d, p1_d, p0_d, q0_d, ptmp_src_v, stride);
+ ptmp_src_v += (4 * stride);
+ SD(q1_d, ptmp_src_v);
+ ptmp_src_v += stride;
+ SD(q2_d, ptmp_src_v);
+}
+
+static void HFilter8(uint8_t* src_u, uint8_t* src_v, int stride,
+ int b_limit_in, int limit_in, int thresh_in) {
+ uint8_t* ptmp_src_u = src_u - 4;
+ uint8_t* ptmp_src_v = src_v - 4;
+ v16u8 p3, p2, p1, p0, q3, q2, q1, q0, mask, hev;
+ v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
+ v16u8 row9, row10, row11, row12, row13, row14, row15;
+ v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ const v16u8 b_limit = (v16u8)__msa_fill_b(b_limit_in);
+ const v16u8 limit = (v16u8)__msa_fill_b(limit_in);
+ const v16u8 thresh = (v16u8)__msa_fill_b(thresh_in);
+
+ LD_UB8(ptmp_src_u, stride, row0, row1, row2, row3, row4, row5, row6, row7);
+ LD_UB8(ptmp_src_v, stride,
+ row8, row9, row10, row11, row12, row13, row14, row15);
+ TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
+ row8, row9, row10, row11, row12, row13, row14, row15,
+ p3, p2, p1, p0, q0, q1, q2, q3);
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+ hev, mask);
+ LPF_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
+ ILVR_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
+ ILVRL_H2_SH(tmp1, tmp0, tmp3, tmp4);
+ ILVL_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
+ ILVRL_H2_SH(tmp1, tmp0, tmp6, tmp7);
+ ILVRL_B2_SH(q2, q1, tmp2, tmp5);
+ ptmp_src_u += 1;
+ ST6x4_UB(tmp3, 0, tmp2, 0, ptmp_src_u, stride);
+ ptmp_src_u += 4 * stride;
+ ST6x4_UB(tmp4, 0, tmp2, 4, ptmp_src_u, stride);
+ ptmp_src_v += 1;
+ ST6x4_UB(tmp6, 0, tmp5, 0, ptmp_src_v, stride);
+ ptmp_src_v += 4 * stride;
+ ST6x4_UB(tmp7, 0, tmp5, 4, ptmp_src_v, stride);
+}
+
+static void VFilter8i(uint8_t* src_u, uint8_t* src_v, int stride,
+ int b_limit_in, int limit_in, int thresh_in) {
+ uint64_t p1_d, p0_d, q0_d, q1_d;
+ v16u8 p3, p2, p1, p0, q3, q2, q1, q0, mask, hev;
+ v16u8 p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u;
+ v16u8 p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v;
+ const v16u8 thresh = (v16u8)__msa_fill_b(thresh_in);
+ const v16u8 limit = (v16u8)__msa_fill_b(limit_in);
+ const v16u8 b_limit = (v16u8)__msa_fill_b(b_limit_in);
+
+ LD_UB8(src_u, stride, p3_u, p2_u, p1_u, p0_u, q0_u, q1_u, q2_u, q3_u);
+ src_u += (5 * stride);
+ LD_UB8(src_v, stride, p3_v, p2_v, p1_v, p0_v, q0_v, q1_v, q2_v, q3_v);
+ src_v += (5 * stride);
+ ILVR_D4_UB(p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3, p2, p1, p0);
+ ILVR_D4_UB(q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0, q1, q2, q3);
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+ hev, mask);
+ LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
+ p1_d = __msa_copy_s_d((v2i64)p1, 0);
+ p0_d = __msa_copy_s_d((v2i64)p0, 0);
+ q0_d = __msa_copy_s_d((v2i64)q0, 0);
+ q1_d = __msa_copy_s_d((v2i64)q1, 0);
+ SD4(q1_d, q0_d, p0_d, p1_d, src_u, -stride);
+ p1_d = __msa_copy_s_d((v2i64)p1, 1);
+ p0_d = __msa_copy_s_d((v2i64)p0, 1);
+ q0_d = __msa_copy_s_d((v2i64)q0, 1);
+ q1_d = __msa_copy_s_d((v2i64)q1, 1);
+ SD4(q1_d, q0_d, p0_d, p1_d, src_v, -stride);
+}
+
+static void HFilter8i(uint8_t* src_u, uint8_t* src_v, int stride,
+ int b_limit_in, int limit_in, int thresh_in) {
+ v16u8 p3, p2, p1, p0, q3, q2, q1, q0, mask, hev;
+ v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
+ v16u8 row9, row10, row11, row12, row13, row14, row15;
+ v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+ const v16u8 thresh = (v16u8)__msa_fill_b(thresh_in);
+ const v16u8 limit = (v16u8)__msa_fill_b(limit_in);
+ const v16u8 b_limit = (v16u8)__msa_fill_b(b_limit_in);
+
+ LD_UB8(src_u, stride, row0, row1, row2, row3, row4, row5, row6, row7);
+ LD_UB8(src_v, stride,
+ row8, row9, row10, row11, row12, row13, row14, row15);
+ TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
+ row8, row9, row10, row11, row12, row13, row14, row15,
+ p3, p2, p1, p0, q0, q1, q2, q3);
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+ hev, mask);
+ LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
+ ILVR_B2_SW(p0, p1, q1, q0, tmp0, tmp1);
+ ILVRL_H2_SW(tmp1, tmp0, tmp2, tmp3);
+ ILVL_B2_SW(p0, p1, q1, q0, tmp0, tmp1);
+ ILVRL_H2_SW(tmp1, tmp0, tmp4, tmp5);
+ src_u += 2;
+ ST4x4_UB(tmp2, tmp2, 0, 1, 2, 3, src_u, stride);
+ src_u += 4 * stride;
+ ST4x4_UB(tmp3, tmp3, 0, 1, 2, 3, src_u, stride);
+ src_v += 2;
+ ST4x4_UB(tmp4, tmp4, 0, 1, 2, 3, src_v, stride);
+ src_v += 4 * stride;
+ ST4x4_UB(tmp5, tmp5, 0, 1, 2, 3, src_v, stride);
+}
+
+static void SimpleVFilter16(uint8_t* src, int stride, int b_limit_in) {
+ v16u8 p1, p0, q1, q0, mask;
+ const v16u8 b_limit = (v16u8)__msa_fill_b(b_limit_in);
+
+ LD_UB4(src - 2 * stride, stride, p1, p0, q0, q1);
+ LPF_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask);
+ LPF_SIMPLE_FILT(p1, p0, q0, q1, mask);
+ ST_UB2(p0, q0, src - stride, stride);
+}
+
+static void SimpleHFilter16(uint8_t* src, int stride, int b_limit_in) {
+ v16u8 p1, p0, q1, q0, mask, row0, row1, row2, row3, row4, row5, row6, row7;
+ v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
+ v8i16 tmp0, tmp1;
+ const v16u8 b_limit = (v16u8)__msa_fill_b(b_limit_in);
+ uint8_t* ptemp_src = src - 2;
+
+ LD_UB8(ptemp_src, stride, row0, row1, row2, row3, row4, row5, row6, row7);
+ LD_UB8(ptemp_src + 8 * stride, stride,
+ row8, row9, row10, row11, row12, row13, row14, row15);
+ TRANSPOSE16x4_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
+ row8, row9, row10, row11, row12, row13, row14, row15,
+ p1, p0, q0, q1);
+ LPF_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask);
+ LPF_SIMPLE_FILT(p1, p0, q0, q1, mask);
+ ILVRL_B2_SH(q0, p0, tmp1, tmp0);
+ ptemp_src += 1;
+ ST2x4_UB(tmp1, 0, ptemp_src, stride);
+ ptemp_src += 4 * stride;
+ ST2x4_UB(tmp1, 4, ptemp_src, stride);
+ ptemp_src += 4 * stride;
+ ST2x4_UB(tmp0, 0, ptemp_src, stride);
+ ptemp_src += 4 * stride;
+ ST2x4_UB(tmp0, 4, ptemp_src, stride);
+ ptemp_src += 4 * stride;
+}
+
+static void SimpleVFilter16i(uint8_t* src_y, int stride, int b_limit_in) {
+ SimpleVFilter16(src_y + 4 * stride, stride, b_limit_in);
+ SimpleVFilter16(src_y + 8 * stride, stride, b_limit_in);
+ SimpleVFilter16(src_y + 12 * stride, stride, b_limit_in);
+}
+
+static void SimpleHFilter16i(uint8_t* src_y, int stride, int b_limit_in) {
+ SimpleHFilter16(src_y + 4, stride, b_limit_in);
+ SimpleHFilter16(src_y + 8, stride, b_limit_in);
+ SimpleHFilter16(src_y + 12, stride, b_limit_in);
+}
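+// The Simple*Filter functions above implement VP8's "simple" in-loop filter,
+// which only adjusts the two pixels straddling the edge. A scalar sketch of
+// the per-column math (paraphrasing RFC 6386; not part of upstream -- the
+// MSA macros above vectorize this across 16 columns at once):
+static WEBP_INLINE int SketchClamp128(int v) {
+  return (v < -128) ? -128 : (v > 127) ? 127 : v;
+}
+static WEBP_INLINE void SketchSimpleFilter(uint8_t* p1, uint8_t* p0,
+                                           uint8_t* q0, uint8_t* q1) {
+  // work on sign-centered values: u - 128
+  const int P1 = *p1 - 128, P0 = *p0 - 128;
+  const int Q0 = *q0 - 128, Q1 = *q1 - 128;
+  const int a = SketchClamp128(SketchClamp128(P1 - Q1) + 3 * (Q0 - P0));
+  const int F1 = SketchClamp128(a + 4) >> 3;  // correction applied to q0
+  const int F2 = SketchClamp128(a + 3) >> 3;  // correction applied to p0
+  *q0 = (uint8_t)(SketchClamp128(Q0 - F1) + 128);
+  *p0 = (uint8_t)(SketchClamp128(P0 + F2) + 128);
+}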
+
+//------------------------------------------------------------------------------
+// Intra predictions
+//------------------------------------------------------------------------------
+
+// 4x4
+
+static void DC4(uint8_t* dst) { // DC
+ uint32_t dc = 4;
+ int i;
+ for (i = 0; i < 4; ++i) dc += dst[i - BPS] + dst[-1 + i * BPS];
+ dc >>= 3;
+ dc = dc | (dc << 8) | (dc << 16) | (dc << 24);
+ SW4(dc, dc, dc, dc, dst, BPS);
+}
+
+static void TM4(uint8_t* dst) {
+ const uint8_t* const ptemp = dst - BPS - 1;
+ v8i16 T, d, r0, r1, r2, r3;
+ const v16i8 zero = { 0 };
+ const v8i16 TL = (v8i16)__msa_fill_h(ptemp[0 * BPS]);
+ const v8i16 L0 = (v8i16)__msa_fill_h(ptemp[1 * BPS]);
+ const v8i16 L1 = (v8i16)__msa_fill_h(ptemp[2 * BPS]);
+ const v8i16 L2 = (v8i16)__msa_fill_h(ptemp[3 * BPS]);
+ const v8i16 L3 = (v8i16)__msa_fill_h(ptemp[4 * BPS]);
+ const v16u8 T1 = LD_UB(ptemp + 1);
+
+ T = (v8i16)__msa_ilvr_b(zero, (v16i8)T1);
+ d = T - TL;
+ ADD4(d, L0, d, L1, d, L2, d, L3, r0, r1, r2, r3);
+ CLIP_SH4_0_255(r0, r1, r2, r3);
+ PCKEV_ST4x4_UB(r0, r1, r2, r3, dst, BPS);
+}
+
+static void VE4(uint8_t* dst) { // vertical
+ const uint8_t* const ptop = dst - BPS - 1;
+ const uint32_t val0 = LW(ptop + 0);
+ const uint32_t val1 = LW(ptop + 4);
+ uint32_t out;
+ v16u8 A = { 0 }, B, C, AC, B2, R;
+
+ INSERT_W2_UB(val0, val1, A);
+ B = SLDI_UB(A, A, 1);
+ C = SLDI_UB(A, A, 2);
+ AC = __msa_ave_u_b(A, C);
+ B2 = __msa_ave_u_b(B, B);
+ R = __msa_aver_u_b(AC, B2);
+ out = __msa_copy_s_w((v4i32)R, 0);
+ SW4(out, out, out, out, dst, BPS);
+}
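+// VE4 needs AVG3(a, b, c) = (a + 2*b + c + 2) >> 2 in every lane, but MSA
+// has no 3-tap average. It is assembled from two 2-tap ones: __msa_ave_u_b
+// truncates ((x + y) >> 1) while __msa_aver_u_b rounds ((x + y + 1) >> 1),
+// and the identity
+//   aver(ave(a, c), b) == ((((a + c) >> 1) + b + 1) >> 1)
+//                      == (a + 2*b + c + 2) >> 2
+// holds exactly for all byte values (easily brute-force checked over
+// 0..255 for a, b, c), so the three intrinsics compute AVG3 per lane.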
+
+static void RD4(uint8_t* dst) { // Down-right
+ const uint8_t* const ptop = dst - 1 - BPS;
+ uint32_t val0 = LW(ptop + 0);
+ uint32_t val1 = LW(ptop + 4);
+ uint32_t val2, val3;
+ v16u8 A, B, C, AC, B2, R, A1 = { 0 };
+
+ INSERT_W2_UB(val0, val1, A1);
+ A = SLDI_UB(A1, A1, 12);
+ A = (v16u8)__msa_insert_b((v16i8)A, 3, ptop[1 * BPS]);
+ A = (v16u8)__msa_insert_b((v16i8)A, 2, ptop[2 * BPS]);
+ A = (v16u8)__msa_insert_b((v16i8)A, 1, ptop[3 * BPS]);
+ A = (v16u8)__msa_insert_b((v16i8)A, 0, ptop[4 * BPS]);
+ B = SLDI_UB(A, A, 1);
+ C = SLDI_UB(A, A, 2);
+ AC = __msa_ave_u_b(A, C);
+ B2 = __msa_ave_u_b(B, B);
+ R = __msa_aver_u_b(AC, B2);
+ val3 = __msa_copy_s_w((v4i32)R, 0);
+ R = SLDI_UB(R, R, 1);
+ val2 = __msa_copy_s_w((v4i32)R, 0);
+ R = SLDI_UB(R, R, 1);
+ val1 = __msa_copy_s_w((v4i32)R, 0);
+ R = SLDI_UB(R, R, 1);
+ val0 = __msa_copy_s_w((v4i32)R, 0);
+ SW4(val0, val1, val2, val3, dst, BPS);
+}
+
+static void LD4(uint8_t* dst) { // Down-Left
+ const uint8_t* const ptop = dst - BPS;
+ uint32_t val0 = LW(ptop + 0);
+ uint32_t val1 = LW(ptop + 4);
+ uint32_t val2, val3;
+ v16u8 A = { 0 }, B, C, AC, B2, R;
+
+ INSERT_W2_UB(val0, val1, A);
+ B = SLDI_UB(A, A, 1);
+ C = SLDI_UB(A, A, 2);
+ C = (v16u8)__msa_insert_b((v16i8)C, 6, ptop[7]);
+ AC = __msa_ave_u_b(A, C);
+ B2 = __msa_ave_u_b(B, B);
+ R = __msa_aver_u_b(AC, B2);
+ val0 = __msa_copy_s_w((v4i32)R, 0);
+ R = SLDI_UB(R, R, 1);
+ val1 = __msa_copy_s_w((v4i32)R, 0);
+ R = SLDI_UB(R, R, 1);
+ val2 = __msa_copy_s_w((v4i32)R, 0);
+ R = SLDI_UB(R, R, 1);
+ val3 = __msa_copy_s_w((v4i32)R, 0);
+ SW4(val0, val1, val2, val3, dst, BPS);
+}
+
+// 16x16
+
+static void DC16(uint8_t* dst) { // DC
+ uint32_t dc = 16;
+ int i;
+ const v16u8 rtop = LD_UB(dst - BPS);
+ const v8u16 dctop = __msa_hadd_u_h(rtop, rtop);
+ v16u8 out;
+
+ for (i = 0; i < 16; ++i) {
+ dc += dst[-1 + i * BPS];
+ }
+ dc += HADD_UH_U32(dctop);
+ out = (v16u8)__msa_fill_b(dc >> 5);
+ ST_UB8(out, out, out, out, out, out, out, out, dst, BPS);
+ ST_UB8(out, out, out, out, out, out, out, out, dst + 8 * BPS, BPS);
+}
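+// DC16 reduces to dc = (sum(top[0..15]) + sum(left[0..15]) + 16) >> 5,
+// broadcast over the block: the top row is summed with vector hadd
+// reductions while the left column, whose samples are BPS apart, is summed
+// scalar. Plain-C reference of the same value (sketch, not part of
+// upstream):
+static WEBP_INLINE uint8_t SketchDC16Value(const uint8_t* dst) {
+  uint32_t dc = 16;  // rounding bias
+  int i;
+  for (i = 0; i < 16; ++i) dc += dst[i - BPS] + dst[-1 + i * BPS];
+  return (uint8_t)(dc >> 5);
+}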
+
+static void TM16(uint8_t* dst) {
+ int j;
+ v8i16 d1, d2;
+ const v16i8 zero = { 0 };
+ const v8i16 TL = (v8i16)__msa_fill_h(dst[-1 - BPS]);
+ const v16i8 T = LD_SB(dst - BPS);
+
+ ILVRL_B2_SH(zero, T, d1, d2);
+ SUB2(d1, TL, d2, TL, d1, d2);
+ for (j = 0; j < 16; j += 4) {
+ v16i8 t0, t1, t2, t3;
+ v8i16 r0, r1, r2, r3, r4, r5, r6, r7;
+ const v8i16 L0 = (v8i16)__msa_fill_h(dst[-1 + 0 * BPS]);
+ const v8i16 L1 = (v8i16)__msa_fill_h(dst[-1 + 1 * BPS]);
+ const v8i16 L2 = (v8i16)__msa_fill_h(dst[-1 + 2 * BPS]);
+ const v8i16 L3 = (v8i16)__msa_fill_h(dst[-1 + 3 * BPS]);
+ ADD4(d1, L0, d1, L1, d1, L2, d1, L3, r0, r1, r2, r3);
+ ADD4(d2, L0, d2, L1, d2, L2, d2, L3, r4, r5, r6, r7);
+ CLIP_SH4_0_255(r0, r1, r2, r3);
+ CLIP_SH4_0_255(r4, r5, r6, r7);
+ PCKEV_B4_SB(r4, r0, r5, r1, r6, r2, r7, r3, t0, t1, t2, t3);
+ ST_SB4(t0, t1, t2, t3, dst, BPS);
+ dst += 4 * BPS;
+ }
+}
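+// TM16 is TrueMotion prediction: dst[y][x] = clip255(left[y] + top[x] - tl),
+// where tl is the top-left corner sample. The vector code computes
+// d = top - tl once, then adds the broadcast left[y] per row and clips.
+// A plain-C reference of the same result (sketch, not part of upstream):
+static WEBP_INLINE void SketchTM16(uint8_t* dst) {
+  int x, y;
+  for (y = 0; y < 16; ++y) {
+    for (x = 0; x < 16; ++x) {
+      const int v = dst[-1 + y * BPS] + dst[x - BPS] - dst[-1 - BPS];
+      dst[x + y * BPS] = (uint8_t)((v < 0) ? 0 : (v > 255) ? 255 : v);
+    }
+  }
+}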
+
+static void VE16(uint8_t* dst) { // vertical
+ const v16u8 rtop = LD_UB(dst - BPS);
+ ST_UB8(rtop, rtop, rtop, rtop, rtop, rtop, rtop, rtop, dst, BPS);
+ ST_UB8(rtop, rtop, rtop, rtop, rtop, rtop, rtop, rtop, dst + 8 * BPS, BPS);
+}
+
+static void HE16(uint8_t* dst) { // horizontal
+ int j;
+ for (j = 16; j > 0; j -= 4) {
+ const v16u8 L0 = (v16u8)__msa_fill_b(dst[-1 + 0 * BPS]);
+ const v16u8 L1 = (v16u8)__msa_fill_b(dst[-1 + 1 * BPS]);
+ const v16u8 L2 = (v16u8)__msa_fill_b(dst[-1 + 2 * BPS]);
+ const v16u8 L3 = (v16u8)__msa_fill_b(dst[-1 + 3 * BPS]);
+ ST_UB4(L0, L1, L2, L3, dst, BPS);
+ dst += 4 * BPS;
+ }
+}
+
+static void DC16NoTop(uint8_t* dst) { // DC with top samples not available
+ int j;
+ uint32_t dc = 8;
+ v16u8 out;
+
+ for (j = 0; j < 16; ++j) {
+ dc += dst[-1 + j * BPS];
+ }
+ out = (v16u8)__msa_fill_b(dc >> 4);
+ ST_UB8(out, out, out, out, out, out, out, out, dst, BPS);
+ ST_UB8(out, out, out, out, out, out, out, out, dst + 8 * BPS, BPS);
+}
+
+static void DC16NoLeft(uint8_t* dst) { // DC with left samples not available
+ uint32_t dc = 8;
+ const v16u8 rtop = LD_UB(dst - BPS);
+ const v8u16 dctop = __msa_hadd_u_h(rtop, rtop);
+ v16u8 out;
+
+ dc += HADD_UH_U32(dctop);
+ out = (v16u8)__msa_fill_b(dc >> 4);
+ ST_UB8(out, out, out, out, out, out, out, out, dst, BPS);
+ ST_UB8(out, out, out, out, out, out, out, out, dst + 8 * BPS, BPS);
+}
+
+static void DC16NoTopLeft(uint8_t* dst) { // DC with nothing
+ const v16u8 out = (v16u8)__msa_fill_b(0x80);
+ ST_UB8(out, out, out, out, out, out, out, out, dst, BPS);
+ ST_UB8(out, out, out, out, out, out, out, out, dst + 8 * BPS, BPS);
+}
+
+// Chroma
+
+#define STORE8x8(out, dst) do { \
+ SD4(out, out, out, out, dst + 0 * BPS, BPS); \
+ SD4(out, out, out, out, dst + 4 * BPS, BPS); \
+} while (0)
+
+static void DC8uv(uint8_t* dst) { // DC
+ uint32_t dc = 8;
+ int i;
+ uint64_t out;
+ const v16u8 rtop = LD_UB(dst - BPS);
+ const v8u16 temp0 = __msa_hadd_u_h(rtop, rtop);
+ const v4u32 temp1 = __msa_hadd_u_w(temp0, temp0);
+ const v2u64 temp2 = __msa_hadd_u_d(temp1, temp1);
+ v16u8 dctemp;
+
+ for (i = 0; i < 8; ++i) {
+ dc += dst[-1 + i * BPS];
+ }
+ dc += __msa_copy_s_w((v4i32)temp2, 0);
+ dctemp = (v16u8)__msa_fill_b(dc >> 4);
+ out = __msa_copy_s_d((v2i64)dctemp, 0);
+ STORE8x8(out, dst);
+}
+
+static void TM8uv(uint8_t* dst) {
+ int j;
+ const v16i8 T1 = LD_SB(dst - BPS);
+ const v16i8 zero = { 0 };
+ const v8i16 T = (v8i16)__msa_ilvr_b(zero, T1);
+ const v8i16 TL = (v8i16)__msa_fill_h(dst[-1 - BPS]);
+ const v8i16 d = T - TL;
+
+ for (j = 0; j < 8; j += 4) {
+ v16i8 t0, t1;
+ v8i16 r0 = (v8i16)__msa_fill_h(dst[-1 + 0 * BPS]);
+ v8i16 r1 = (v8i16)__msa_fill_h(dst[-1 + 1 * BPS]);
+ v8i16 r2 = (v8i16)__msa_fill_h(dst[-1 + 2 * BPS]);
+ v8i16 r3 = (v8i16)__msa_fill_h(dst[-1 + 3 * BPS]);
+ ADD4(d, r0, d, r1, d, r2, d, r3, r0, r1, r2, r3);
+ CLIP_SH4_0_255(r0, r1, r2, r3);
+ PCKEV_B2_SB(r1, r0, r3, r2, t0, t1);
+ ST4x4_UB(t0, t1, 0, 2, 0, 2, dst, BPS);
+ ST4x4_UB(t0, t1, 1, 3, 1, 3, dst + 4, BPS);
+ dst += 4 * BPS;
+ }
+}
+
+static void VE8uv(uint8_t* dst) { // vertical
+ const v16u8 rtop = LD_UB(dst - BPS);
+ const uint64_t out = __msa_copy_s_d((v2i64)rtop, 0);
+ STORE8x8(out, dst);
+}
+
+static void HE8uv(uint8_t* dst) { // horizontal
+ int j;
+ for (j = 0; j < 8; j += 4) {
+ const v16u8 L0 = (v16u8)__msa_fill_b(dst[-1 + 0 * BPS]);
+ const v16u8 L1 = (v16u8)__msa_fill_b(dst[-1 + 1 * BPS]);
+ const v16u8 L2 = (v16u8)__msa_fill_b(dst[-1 + 2 * BPS]);
+ const v16u8 L3 = (v16u8)__msa_fill_b(dst[-1 + 3 * BPS]);
+ const uint64_t out0 = __msa_copy_s_d((v2i64)L0, 0);
+ const uint64_t out1 = __msa_copy_s_d((v2i64)L1, 0);
+ const uint64_t out2 = __msa_copy_s_d((v2i64)L2, 0);
+ const uint64_t out3 = __msa_copy_s_d((v2i64)L3, 0);
+ SD4(out0, out1, out2, out3, dst, BPS);
+ dst += 4 * BPS;
+ }
+}
+
+static void DC8uvNoLeft(uint8_t* dst) { // DC with no left samples
+ const uint32_t dc = 4;
+ const v16u8 rtop = LD_UB(dst - BPS);
+ const v8u16 temp0 = __msa_hadd_u_h(rtop, rtop);
+ const v4u32 temp1 = __msa_hadd_u_w(temp0, temp0);
+ const v2u64 temp2 = __msa_hadd_u_d(temp1, temp1);
+ const uint32_t sum_m = __msa_copy_s_w((v4i32)temp2, 0);
+ const v16u8 dcval = (v16u8)__msa_fill_b((dc + sum_m) >> 3);
+ const uint64_t out = __msa_copy_s_d((v2i64)dcval, 0);
+ STORE8x8(out, dst);
+}
+
+static void DC8uvNoTop(uint8_t* dst) { // DC with no top samples
+ uint32_t dc = 4;
+ int i;
+ uint64_t out;
+ v16u8 dctemp;
+
+ for (i = 0; i < 8; ++i) {
+ dc += dst[-1 + i * BPS];
+ }
+ dctemp = (v16u8)__msa_fill_b(dc >> 3);
+ out = __msa_copy_s_d((v2i64)dctemp, 0);
+ STORE8x8(out, dst);
+}
+
+static void DC8uvNoTopLeft(uint8_t* dst) { // DC with nothing
+ const uint64_t out = 0x8080808080808080ULL;
+ STORE8x8(out, dst);
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8DspInitMSA(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitMSA(void) {
+ VP8TransformWHT = TransformWHT;
+ VP8Transform = TransformTwo;
+ VP8TransformDC = TransformDC;
+ VP8TransformAC3 = TransformAC3;
+
+ VP8VFilter16 = VFilter16;
+ VP8HFilter16 = HFilter16;
+ VP8VFilter16i = VFilter16i;
+ VP8HFilter16i = HFilter16i;
+ VP8VFilter8 = VFilter8;
+ VP8HFilter8 = HFilter8;
+ VP8VFilter8i = VFilter8i;
+ VP8HFilter8i = HFilter8i;
+ VP8SimpleVFilter16 = SimpleVFilter16;
+ VP8SimpleHFilter16 = SimpleHFilter16;
+ VP8SimpleVFilter16i = SimpleVFilter16i;
+ VP8SimpleHFilter16i = SimpleHFilter16i;
+
+ VP8PredLuma4[0] = DC4;
+ VP8PredLuma4[1] = TM4;
+ VP8PredLuma4[2] = VE4;
+ VP8PredLuma4[4] = RD4;
+ VP8PredLuma4[6] = LD4;
+ VP8PredLuma16[0] = DC16;
+ VP8PredLuma16[1] = TM16;
+ VP8PredLuma16[2] = VE16;
+ VP8PredLuma16[3] = HE16;
+ VP8PredLuma16[4] = DC16NoTop;
+ VP8PredLuma16[5] = DC16NoLeft;
+ VP8PredLuma16[6] = DC16NoTopLeft;
+ VP8PredChroma8[0] = DC8uv;
+ VP8PredChroma8[1] = TM8uv;
+ VP8PredChroma8[2] = VE8uv;
+ VP8PredChroma8[3] = HE8uv;
+ VP8PredChroma8[4] = DC8uvNoTop;
+ VP8PredChroma8[5] = DC8uvNoLeft;
+ VP8PredChroma8[6] = DC8uvNoTopLeft;
+}
+
+#else // !WEBP_USE_MSA
+
+WEBP_DSP_INIT_STUB(VP8DspInitMSA)
+
+#endif // WEBP_USE_MSA
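+// When MSA support is not compiled in, WEBP_DSP_INIT_STUB (from dsp.h) still
+// emits the entry point so callers can invoke it unconditionally; it expands
+// to roughly (paraphrased, see dsp.h for the exact definition):
+//   extern void VP8DspInitMSA(void);
+//   void VP8DspInitMSA(void) {}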
diff --git a/media/libwebp/dsp/dec_neon.c b/media/libwebp/dsp/dec_neon.c
index e8341327e4..ea690646d6 100644
--- a/media/libwebp/dsp/dec_neon.c
+++ b/media/libwebp/dsp/dec_neon.c
@@ -1283,12 +1283,12 @@ static void DC4_NEON(uint8_t* dst) { // DC
const uint8x8_t A = vld1_u8(dst - BPS); // top row
const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top
const uint16x4_t p1 = vpadd_u16(p0, p0);
- const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + 0 * BPS - 1));
- const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + 1 * BPS - 1));
- const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + 2 * BPS - 1));
- const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + 3 * BPS - 1));
- const uint16x8_t s0 = vaddq_u16(L0, L1);
- const uint16x8_t s1 = vaddq_u16(L2, L3);
+ const uint8x8_t L0 = vld1_u8(dst + 0 * BPS - 1);
+ const uint8x8_t L1 = vld1_u8(dst + 1 * BPS - 1);
+ const uint8x8_t L2 = vld1_u8(dst + 2 * BPS - 1);
+ const uint8x8_t L3 = vld1_u8(dst + 3 * BPS - 1);
+ const uint16x8_t s0 = vaddl_u8(L0, L1);
+ const uint16x8_t s1 = vaddl_u8(L2, L3);
const uint16x8_t s01 = vaddq_u16(s0, s1);
const uint16x8_t sum = vaddq_u16(s01, vcombine_u16(p1, p1));
const uint8x8_t dc0 = vrshrn_n_u16(sum, 3); // (sum + 4) >> 3
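// The rewrite above folds each widen-then-add pair into a single widening
// add: vaddl_u8(a, b) is exactly vaddq_u16(vmovl_u8(a), vmovl_u8(b)), but
// compiles to one UADDL/VADDL.U8 instruction instead of three.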
@@ -1361,7 +1361,8 @@ static void RD4_NEON(uint8_t* dst) { // Down-right
const uint32_t J = dst[-1 + 1 * BPS];
const uint32_t K = dst[-1 + 2 * BPS];
const uint32_t L = dst[-1 + 3 * BPS];
- const uint64x1_t LKJI____ = vcreate_u64(L | (K << 8) | (J << 16) | (I << 24));
+ const uint64x1_t LKJI____ =
+ vcreate_u64((uint64_t)L | (K << 8) | (J << 16) | (I << 24));
const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC);
const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8));
const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16));
@@ -1427,25 +1428,30 @@ static WEBP_INLINE void DC8_NEON(uint8_t* dst, int do_top, int do_left) {
if (do_top) {
const uint8x8_t A = vld1_u8(dst - BPS); // top row
+#if defined(__aarch64__)
+ const uint16_t p2 = vaddlv_u8(A);
+ sum_top = vdupq_n_u16(p2);
+#else
const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top
const uint16x4_t p1 = vpadd_u16(p0, p0);
const uint16x4_t p2 = vpadd_u16(p1, p1);
sum_top = vcombine_u16(p2, p2);
+#endif
}
if (do_left) {
- const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + 0 * BPS - 1));
- const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + 1 * BPS - 1));
- const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + 2 * BPS - 1));
- const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + 3 * BPS - 1));
- const uint16x8_t L4 = vmovl_u8(vld1_u8(dst + 4 * BPS - 1));
- const uint16x8_t L5 = vmovl_u8(vld1_u8(dst + 5 * BPS - 1));
- const uint16x8_t L6 = vmovl_u8(vld1_u8(dst + 6 * BPS - 1));
- const uint16x8_t L7 = vmovl_u8(vld1_u8(dst + 7 * BPS - 1));
- const uint16x8_t s0 = vaddq_u16(L0, L1);
- const uint16x8_t s1 = vaddq_u16(L2, L3);
- const uint16x8_t s2 = vaddq_u16(L4, L5);
- const uint16x8_t s3 = vaddq_u16(L6, L7);
+ const uint8x8_t L0 = vld1_u8(dst + 0 * BPS - 1);
+ const uint8x8_t L1 = vld1_u8(dst + 1 * BPS - 1);
+ const uint8x8_t L2 = vld1_u8(dst + 2 * BPS - 1);
+ const uint8x8_t L3 = vld1_u8(dst + 3 * BPS - 1);
+ const uint8x8_t L4 = vld1_u8(dst + 4 * BPS - 1);
+ const uint8x8_t L5 = vld1_u8(dst + 5 * BPS - 1);
+ const uint8x8_t L6 = vld1_u8(dst + 6 * BPS - 1);
+ const uint8x8_t L7 = vld1_u8(dst + 7 * BPS - 1);
+ const uint16x8_t s0 = vaddl_u8(L0, L1);
+ const uint16x8_t s1 = vaddl_u8(L2, L3);
+ const uint16x8_t s2 = vaddl_u8(L4, L5);
+ const uint16x8_t s3 = vaddl_u8(L6, L7);
const uint16x8_t s01 = vaddq_u16(s0, s1);
const uint16x8_t s23 = vaddq_u16(s2, s3);
sum_left = vaddq_u16(s01, s23);
@@ -1505,29 +1511,34 @@ static WEBP_INLINE void DC16_NEON(uint8_t* dst, int do_top, int do_left) {
if (do_top) {
const uint8x16_t A = vld1q_u8(dst - BPS); // top row
+#if defined(__aarch64__)
+ const uint16_t p3 = vaddlvq_u8(A);
+ sum_top = vdupq_n_u16(p3);
+#else
const uint16x8_t p0 = vpaddlq_u8(A); // cascading summation of the top
const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
const uint16x4_t p2 = vpadd_u16(p1, p1);
const uint16x4_t p3 = vpadd_u16(p2, p2);
sum_top = vcombine_u16(p3, p3);
+#endif
}
if (do_left) {
int i;
sum_left = vdupq_n_u16(0);
for (i = 0; i < 16; i += 8) {
- const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + (i + 0) * BPS - 1));
- const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + (i + 1) * BPS - 1));
- const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + (i + 2) * BPS - 1));
- const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + (i + 3) * BPS - 1));
- const uint16x8_t L4 = vmovl_u8(vld1_u8(dst + (i + 4) * BPS - 1));
- const uint16x8_t L5 = vmovl_u8(vld1_u8(dst + (i + 5) * BPS - 1));
- const uint16x8_t L6 = vmovl_u8(vld1_u8(dst + (i + 6) * BPS - 1));
- const uint16x8_t L7 = vmovl_u8(vld1_u8(dst + (i + 7) * BPS - 1));
- const uint16x8_t s0 = vaddq_u16(L0, L1);
- const uint16x8_t s1 = vaddq_u16(L2, L3);
- const uint16x8_t s2 = vaddq_u16(L4, L5);
- const uint16x8_t s3 = vaddq_u16(L6, L7);
+ const uint8x8_t L0 = vld1_u8(dst + (i + 0) * BPS - 1);
+ const uint8x8_t L1 = vld1_u8(dst + (i + 1) * BPS - 1);
+ const uint8x8_t L2 = vld1_u8(dst + (i + 2) * BPS - 1);
+ const uint8x8_t L3 = vld1_u8(dst + (i + 3) * BPS - 1);
+ const uint8x8_t L4 = vld1_u8(dst + (i + 4) * BPS - 1);
+ const uint8x8_t L5 = vld1_u8(dst + (i + 5) * BPS - 1);
+ const uint8x8_t L6 = vld1_u8(dst + (i + 6) * BPS - 1);
+ const uint8x8_t L7 = vld1_u8(dst + (i + 7) * BPS - 1);
+ const uint16x8_t s0 = vaddl_u8(L0, L1);
+ const uint16x8_t s1 = vaddl_u8(L2, L3);
+ const uint16x8_t s2 = vaddl_u8(L4, L5);
+ const uint16x8_t s3 = vaddl_u8(L6, L7);
const uint16x8_t s01 = vaddq_u16(s0, s1);
const uint16x8_t s23 = vaddq_u16(s2, s3);
const uint16x8_t sum = vaddq_u16(s01, s23);
diff --git a/media/libwebp/dsp/dec_sse2.c b/media/libwebp/dsp/dec_sse2.c
index f187a5bb48..b90c082793 100644
--- a/media/libwebp/dsp/dec_sse2.c
+++ b/media/libwebp/dsp/dec_sse2.c
@@ -326,7 +326,7 @@ static WEBP_INLINE void Update2Pixels_SSE2(__m128i* const pi, __m128i* const qi,
const __m128i a1_lo = _mm_srai_epi16(*a0_lo, 7);
const __m128i a1_hi = _mm_srai_epi16(*a0_hi, 7);
const __m128i delta = _mm_packs_epi16(a1_lo, a1_hi);
- const __m128i sign_bit = _mm_set1_epi8(0x80);
+ const __m128i sign_bit = _mm_set1_epi8((char)0x80);
*pi = _mm_adds_epi8(*pi, delta);
*qi = _mm_subs_epi8(*qi, delta);
FLIP_SIGN_BIT2(*pi, *qi);
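// _mm_set1_epi8() takes a char argument, and 0x80 == 128 does not fit in a
// signed char, so the (char) casts added above make the narrowing explicit
// without changing the bit pattern. The 0x80 constant itself is the usual
// unsigned<->signed pivot: XOR-ing a byte with 0x80 maps [0, 255] onto
// [-128, 127] while preserving order (e.g. 200 = 0xC8 becomes 0x48 = 72 =
// 200 - 128), letting unsigned pixels use the saturating signed byte ops
// _mm_adds_epi8 / _mm_subs_epi8 seen in Update2Pixels_SSE2 above.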
@@ -338,9 +338,9 @@ static WEBP_INLINE void NeedsFilter_SSE2(const __m128i* const p1,
const __m128i* const q0,
const __m128i* const q1,
int thresh, __m128i* const mask) {
- const __m128i m_thresh = _mm_set1_epi8(thresh);
+ const __m128i m_thresh = _mm_set1_epi8((char)thresh);
const __m128i t1 = MM_ABS(*p1, *q1); // abs(p1 - q1)
- const __m128i kFE = _mm_set1_epi8(0xFE);
+ const __m128i kFE = _mm_set1_epi8((char)0xFE);
const __m128i t2 = _mm_and_si128(t1, kFE); // set lsb of each byte to zero
const __m128i t3 = _mm_srli_epi16(t2, 1); // abs(p1 - q1) / 2
@@ -360,7 +360,7 @@ static WEBP_INLINE void DoFilter2_SSE2(__m128i* const p1, __m128i* const p0,
__m128i* const q0, __m128i* const q1,
int thresh) {
__m128i a, mask;
- const __m128i sign_bit = _mm_set1_epi8(0x80);
+ const __m128i sign_bit = _mm_set1_epi8((char)0x80);
// convert p1/q1 to int8_t (for GetBaseDelta_SSE2)
const __m128i p1s = _mm_xor_si128(*p1, sign_bit);
const __m128i q1s = _mm_xor_si128(*q1, sign_bit);
@@ -380,7 +380,7 @@ static WEBP_INLINE void DoFilter4_SSE2(__m128i* const p1, __m128i* const p0,
const __m128i* const mask,
int hev_thresh) {
const __m128i zero = _mm_setzero_si128();
- const __m128i sign_bit = _mm_set1_epi8(0x80);
+ const __m128i sign_bit = _mm_set1_epi8((char)0x80);
const __m128i k64 = _mm_set1_epi8(64);
const __m128i k3 = _mm_set1_epi8(3);
const __m128i k4 = _mm_set1_epi8(4);
@@ -427,7 +427,7 @@ static WEBP_INLINE void DoFilter6_SSE2(__m128i* const p2, __m128i* const p1,
const __m128i* const mask,
int hev_thresh) {
const __m128i zero = _mm_setzero_si128();
- const __m128i sign_bit = _mm_set1_epi8(0x80);
+ const __m128i sign_bit = _mm_set1_epi8((char)0x80);
__m128i a, not_hev;
// compute hev mask
@@ -941,7 +941,7 @@ static void VR4_SSE2(uint8_t* dst) { // Vertical-Right
const __m128i ABCD0 = _mm_srli_si128(XABCD, 1);
const __m128i abcd = _mm_avg_epu8(XABCD, ABCD0);
const __m128i _XABCD = _mm_slli_si128(XABCD, 1);
- const __m128i IXABCD = _mm_insert_epi16(_XABCD, I | (X << 8), 0);
+ const __m128i IXABCD = _mm_insert_epi16(_XABCD, (short)(I | (X << 8)), 0);
const __m128i avg1 = _mm_avg_epu8(IXABCD, ABCD0);
const __m128i lsb = _mm_and_si128(_mm_xor_si128(IXABCD, ABCD0), one);
const __m128i avg2 = _mm_subs_epu8(avg1, lsb);
diff --git a/media/libwebp/dsp/dsp.h b/media/libwebp/dsp/dsp.h
index 4e509bd2ca..ce1679ea53 100644
--- a/media/libwebp/dsp/dsp.h
+++ b/media/libwebp/dsp/dsp.h
@@ -27,6 +27,23 @@ extern "C" {
#define BPS 32 // this is the common stride for enc/dec
//------------------------------------------------------------------------------
+// WEBP_RESTRICT
+
+// Declares a pointer with the restrict type qualifier if available.
+// This allows code to hint to the compiler that only this pointer references a
+// particular object or memory region within the scope of the block in which it
+// is declared. This may allow for improved optimizations due to the lack of
+// pointer aliasing. See also:
+// https://en.cppreference.com/w/c/language/restrict
+#if defined(__GNUC__)
+#define WEBP_RESTRICT __restrict__
+#elif defined(_MSC_VER)
+#define WEBP_RESTRICT __restrict
+#else
+#define WEBP_RESTRICT
+#endif
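+// For instance (hypothetical helper, not part of this header), qualifying
+// both pointers tells the compiler that dst[] and src[] never overlap, so a
+// loop like this can be vectorized without runtime aliasing checks:
+//   static void AddRows(const uint8_t* WEBP_RESTRICT src,
+//                       uint8_t* WEBP_RESTRICT dst, int len) {
+//     int i;
+//     for (i = 0; i < len; ++i) dst[i] = (uint8_t)(dst[i] + src[i]);
+//   }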
+
+//------------------------------------------------------------------------------
// CPU detection
#if defined(__GNUC__)
@@ -51,9 +68,7 @@ extern "C" {
# define __has_builtin(x) 0
#endif
-// for now, none of the optimizations below are available in emscripten
-#if !defined(EMSCRIPTEN)
-
+#if !defined(HAVE_CONFIG_H)
#if defined(_MSC_VER) && _MSC_VER > 1310 && \
(defined(_M_X64) || defined(_M_IX86))
#define WEBP_MSC_SSE2 // Visual C++ SSE2 targets
@@ -63,23 +78,37 @@ extern "C" {
(defined(_M_X64) || defined(_M_IX86))
#define WEBP_MSC_SSE41 // Visual C++ SSE4.1 targets
#endif
+#endif
// WEBP_HAVE_* are used to indicate the presence of the instruction set in dsp
// files without intrinsics, allowing the corresponding Init() to be called.
// Files containing intrinsics will need to be built targeting the instruction
// set so should succeed on one of the earlier tests.
-#if defined(__SSE2__) || defined(WEBP_MSC_SSE2) || defined(WEBP_HAVE_SSE2)
+#if (defined(__SSE2__) || defined(WEBP_MSC_SSE2)) && \
+ (!defined(HAVE_CONFIG_H) || defined(WEBP_HAVE_SSE2))
#define WEBP_USE_SSE2
#endif
-#if defined(__SSE4_1__) || defined(WEBP_MSC_SSE41) || defined(WEBP_HAVE_SSE41)
+#if defined(WEBP_USE_SSE2) && !defined(WEBP_HAVE_SSE2)
+#define WEBP_HAVE_SSE2
+#endif
+
+#if (defined(__SSE4_1__) || defined(WEBP_MSC_SSE41)) && \
+ (!defined(HAVE_CONFIG_H) || defined(WEBP_HAVE_SSE41))
#define WEBP_USE_SSE41
#endif
+#if defined(WEBP_USE_SSE41) && !defined(WEBP_HAVE_SSE41)
+#define WEBP_HAVE_SSE41
+#endif
+
+#undef WEBP_MSC_SSE41
+#undef WEBP_MSC_SSE2
+
// The intrinsics currently cause compiler errors with arm-nacl-gcc and the
// inline assembly would need to be modified for use with Native Client.
-#if (defined(__ARM_NEON__) || \
- defined(__aarch64__) || defined(WEBP_HAVE_NEON)) && \
+#if ((defined(__ARM_NEON__) || defined(__aarch64__)) && \
+ (!defined(HAVE_CONFIG_H) || defined(WEBP_HAVE_NEON))) && \
!defined(__native_client__)
#define WEBP_USE_NEON
#endif
@@ -90,11 +119,20 @@ extern "C" {
#define WEBP_USE_NEON
#endif
-#if defined(_MSC_VER) && _MSC_VER >= 1700 && defined(_M_ARM)
+// Note: ARM64 is supported in Visual Studio 2017, but requires the direct
+// inclusion of arm64_neon.h; Visual Studio 2019 includes this file in
+// arm_neon.h.
+#if defined(_MSC_VER) && \
+ ((_MSC_VER >= 1700 && defined(_M_ARM)) || \
+ (_MSC_VER >= 1920 && defined(_M_ARM64)))
#define WEBP_USE_NEON
#define WEBP_USE_INTRINSICS
#endif
+#if defined(WEBP_USE_NEON) && !defined(WEBP_HAVE_NEON)
+#define WEBP_HAVE_NEON
+#endif
+
#if defined(__mips__) && !defined(__mips64) && \
defined(__mips_isa_rev) && (__mips_isa_rev >= 1) && (__mips_isa_rev < 6)
#define WEBP_USE_MIPS32
@@ -110,13 +148,11 @@ extern "C" {
#define WEBP_USE_MSA
#endif
-#endif /* EMSCRIPTEN */
-
#ifndef WEBP_DSP_OMIT_C_CODE
#define WEBP_DSP_OMIT_C_CODE 1
#endif
-#if (defined(__aarch64__) || defined(__ARM_NEON__)) && WEBP_DSP_OMIT_C_CODE
+#if defined(WEBP_USE_NEON) && WEBP_DSP_OMIT_C_CODE
#define WEBP_NEON_OMIT_C_CODE 1
#else
#define WEBP_NEON_OMIT_C_CODE 0
@@ -193,6 +229,12 @@ extern "C" {
#endif
#endif
+// If 'ptr' is NULL, returns NULL. Otherwise returns 'ptr + off'.
+// Prevents undefined behavior sanitizer nullptr-with-nonzero-offset warning.
+#if !defined(WEBP_OFFSET_PTR)
+#define WEBP_OFFSET_PTR(ptr, off) (((ptr) == NULL) ? NULL : ((ptr) + (off)))
+#endif
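+// Example: computing a row pointer from a plane that may legitimately be
+// NULL. Plain 'alpha + y * stride' is undefined behavior when alpha == NULL
+// (even if the result is never dereferenced) and UBSan reports it; the macro
+// avoids that:
+//   const uint8_t* const row = WEBP_OFFSET_PTR(alpha, y * stride);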
+
// Regularize the definition of WEBP_SWAP_16BIT_CSP (backward compatibility)
#if !defined(WEBP_SWAP_16BIT_CSP)
#define WEBP_SWAP_16BIT_CSP 0
@@ -246,9 +288,9 @@ extern VP8Fdct VP8FTransform2; // performs two transforms at a time
extern VP8WHT VP8FTransformWHT;
// Predictions
// *dst is the destination block. *top and *left can be NULL.
-typedef void (*VP8IntraPreds)(uint8_t *dst, const uint8_t* left,
+typedef void (*VP8IntraPreds)(uint8_t* dst, const uint8_t* left,
const uint8_t* top);
-typedef void (*VP8Intra4Preds)(uint8_t *dst, const uint8_t* top);
+typedef void (*VP8Intra4Preds)(uint8_t* dst, const uint8_t* top);
extern VP8Intra4Preds VP8EncPredLuma4;
extern VP8IntraPreds VP8EncPredLuma16;
extern VP8IntraPreds VP8EncPredChroma8;
@@ -572,26 +614,29 @@ extern void (*WebPApplyAlphaMultiply4444)(
// Dispatch the values from alpha[] plane to the ARGB destination 'dst'.
// Returns true if alpha[] plane has non-trivial values different from 0xff.
-extern int (*WebPDispatchAlpha)(const uint8_t* alpha, int alpha_stride,
- int width, int height,
- uint8_t* dst, int dst_stride);
+extern int (*WebPDispatchAlpha)(const uint8_t* WEBP_RESTRICT alpha,
+ int alpha_stride, int width, int height,
+ uint8_t* WEBP_RESTRICT dst, int dst_stride);
// Transfer packed 8b alpha[] values to green channel in dst[], zero'ing the
// A/R/B values. 'dst_stride' is the stride for dst[] in uint32_t units.
-extern void (*WebPDispatchAlphaToGreen)(const uint8_t* alpha, int alpha_stride,
- int width, int height,
- uint32_t* dst, int dst_stride);
+extern void (*WebPDispatchAlphaToGreen)(const uint8_t* WEBP_RESTRICT alpha,
+ int alpha_stride, int width, int height,
+ uint32_t* WEBP_RESTRICT dst,
+ int dst_stride);
// Extract the alpha values from 32b values in argb[] and pack them into alpha[]
// (this is the opposite of WebPDispatchAlpha).
// Returns true if there's only trivial 0xff alpha values.
-extern int (*WebPExtractAlpha)(const uint8_t* argb, int argb_stride,
- int width, int height,
- uint8_t* alpha, int alpha_stride);
+extern int (*WebPExtractAlpha)(const uint8_t* WEBP_RESTRICT argb,
+ int argb_stride, int width, int height,
+ uint8_t* WEBP_RESTRICT alpha,
+ int alpha_stride);
// Extract the green values from 32b values in argb[] and pack them into alpha[]
// (this is the opposite of WebPDispatchAlphaToGreen).
-extern void (*WebPExtractGreen)(const uint32_t* argb, uint8_t* alpha, int size);
+extern void (*WebPExtractGreen)(const uint32_t* WEBP_RESTRICT argb,
+ uint8_t* WEBP_RESTRICT alpha, int size);
// Pre-Multiply operation transforms x into x * A / 255 (where x=Y,R,G or B).
// Un-Multiply operation transforms x into x * 255 / A.
@@ -604,34 +649,42 @@ void WebPMultARGBRows(uint8_t* ptr, int stride, int width, int num_rows,
int inverse);
// Same for a row of single values, with side alpha values.
-extern void (*WebPMultRow)(uint8_t* const ptr, const uint8_t* const alpha,
+extern void (*WebPMultRow)(uint8_t* WEBP_RESTRICT const ptr,
+ const uint8_t* WEBP_RESTRICT const alpha,
int width, int inverse);
// Same a WebPMultRow(), but for several 'num_rows' rows.
-void WebPMultRows(uint8_t* ptr, int stride,
- const uint8_t* alpha, int alpha_stride,
+void WebPMultRows(uint8_t* WEBP_RESTRICT ptr, int stride,
+ const uint8_t* WEBP_RESTRICT alpha, int alpha_stride,
int width, int num_rows, int inverse);
// Plain-C versions, used as fallback by some implementations.
-void WebPMultRow_C(uint8_t* const ptr, const uint8_t* const alpha,
+void WebPMultRow_C(uint8_t* WEBP_RESTRICT const ptr,
+ const uint8_t* WEBP_RESTRICT const alpha,
int width, int inverse);
void WebPMultARGBRow_C(uint32_t* const ptr, int width, int inverse);
#ifdef WORDS_BIGENDIAN
// ARGB packing function: a/r/g/b input is rgba or bgra order.
-extern void (*WebPPackARGB)(const uint8_t* a, const uint8_t* r,
- const uint8_t* g, const uint8_t* b, int len,
- uint32_t* out);
+extern void (*WebPPackARGB)(const uint8_t* WEBP_RESTRICT a,
+ const uint8_t* WEBP_RESTRICT r,
+ const uint8_t* WEBP_RESTRICT g,
+ const uint8_t* WEBP_RESTRICT b,
+ int len, uint32_t* WEBP_RESTRICT out);
#endif
// RGB packing function. 'step' can be 3 or 4. r/g/b input is rgb or bgr order.
-extern void (*WebPPackRGB)(const uint8_t* r, const uint8_t* g, const uint8_t* b,
- int len, int step, uint32_t* out);
+extern void (*WebPPackRGB)(const uint8_t* WEBP_RESTRICT r,
+ const uint8_t* WEBP_RESTRICT g,
+ const uint8_t* WEBP_RESTRICT b,
+ int len, int step, uint32_t* WEBP_RESTRICT out);
// This function returns true if src[i] contains a value different from 0xff.
extern int (*WebPHasAlpha8b)(const uint8_t* src, int length);
// This function returns true if src[4*i] contains a value different from 0xff.
extern int (*WebPHasAlpha32b)(const uint8_t* src, int length);
+// replaces transparent values in src[] by 'color'.
+extern void (*WebPAlphaReplace)(uint32_t* src, int length, uint32_t color);
// To be called first before using the above.
void WebPInitAlphaProcessing(void);
diff --git a/media/libwebp/dsp/enc.c b/media/libwebp/dsp/enc.c
new file mode 100644
index 0000000000..69a2f5c577
--- /dev/null
+++ b/media/libwebp/dsp/enc.c
@@ -0,0 +1,830 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Speed-critical encoding functions.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <assert.h>
+#include <stdlib.h> // for abs()
+
+#include "../dsp/dsp.h"
+#include "../enc/vp8i_enc.h"
+
+static WEBP_INLINE uint8_t clip_8b(int v) {
+ return (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255;
+}
+
+#if !WEBP_NEON_OMIT_C_CODE
+static WEBP_INLINE int clip_max(int v, int max) {
+ return (v > max) ? max : v;
+}
+#endif // !WEBP_NEON_OMIT_C_CODE
+
+//------------------------------------------------------------------------------
+// Compute susceptibility based on DCT-coeff histograms:
+// the higher, the "easier" the macroblock is to compress.
+
+const int VP8DspScan[16 + 4 + 4] = {
+ // Luma
+ 0 + 0 * BPS, 4 + 0 * BPS, 8 + 0 * BPS, 12 + 0 * BPS,
+ 0 + 4 * BPS, 4 + 4 * BPS, 8 + 4 * BPS, 12 + 4 * BPS,
+ 0 + 8 * BPS, 4 + 8 * BPS, 8 + 8 * BPS, 12 + 8 * BPS,
+ 0 + 12 * BPS, 4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS,
+
+ 0 + 0 * BPS, 4 + 0 * BPS, 0 + 4 * BPS, 4 + 4 * BPS, // U
+ 8 + 0 * BPS, 12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS // V
+};
+
+// general-purpose util function
+void VP8SetHistogramData(const int distribution[MAX_COEFF_THRESH + 1],
+ VP8Histogram* const histo) {
+ int max_value = 0, last_non_zero = 1;
+ int k;
+ for (k = 0; k <= MAX_COEFF_THRESH; ++k) {
+ const int value = distribution[k];
+ if (value > 0) {
+ if (value > max_value) max_value = value;
+ last_non_zero = k;
+ }
+ }
+ histo->max_value = max_value;
+ histo->last_non_zero = last_non_zero;
+}
+
+#if !WEBP_NEON_OMIT_C_CODE
+static void CollectHistogram_C(const uint8_t* ref, const uint8_t* pred,
+ int start_block, int end_block,
+ VP8Histogram* const histo) {
+ int j;
+ int distribution[MAX_COEFF_THRESH + 1] = { 0 };
+ for (j = start_block; j < end_block; ++j) {
+ int k;
+ int16_t out[16];
+
+ VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
+
+ // Convert coefficients to bin.
+ for (k = 0; k < 16; ++k) {
+ const int v = abs(out[k]) >> 3;
+ const int clipped_value = clip_max(v, MAX_COEFF_THRESH);
+ ++distribution[clipped_value];
+ }
+ }
+ VP8SetHistogramData(distribution, histo);
+}
+#endif // !WEBP_NEON_OMIT_C_CODE
+
+//------------------------------------------------------------------------------
+// run-time tables (~4k)
+
+static uint8_t clip1[255 + 510 + 1]; // clips [-255,510] to [0,255]
+
+// We declare this variable 'volatile' to prevent instruction reordering
+// and make sure it's set to true _last_ (so as to be thread-safe)
+static volatile int tables_ok = 0;
+
+static WEBP_TSAN_IGNORE_FUNCTION void InitTables(void) {
+ if (!tables_ok) {
+ int i;
+ for (i = -255; i <= 255 + 255; ++i) {
+ clip1[255 + i] = clip_8b(i);
+ }
+ tables_ok = 1;
+ }
+}
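+// clip1[] is laid out so that clip1[255 + v] == clip_8b(v) for any
+// v in [-255, 510]. The TrueMotion predictors below exploit this with offset
+// pointers: clip = clip1 + 255 - topleft and clip_table = clip + left[y]
+// make clip_table[top[x]] resolve to clip1[255 + top[x] + left[y] - topleft],
+// i.e. the clipped TM prediction without any per-pixel branching.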
+
+
+//------------------------------------------------------------------------------
+// Transforms (Paragraph 14.4)
+
+#if !WEBP_NEON_OMIT_C_CODE
+
+#define STORE(x, y, v) \
+ dst[(x) + (y) * BPS] = clip_8b(ref[(x) + (y) * BPS] + ((v) >> 3))
+
+static const int kC1 = 20091 + (1 << 16);
+static const int kC2 = 35468;
+#define MUL(a, b) (((a) * (b)) >> 16)
+
+static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
+ uint8_t* dst) {
+ int C[4 * 4], *tmp;
+ int i;
+ tmp = C;
+ for (i = 0; i < 4; ++i) { // vertical pass
+ const int a = in[0] + in[8];
+ const int b = in[0] - in[8];
+ const int c = MUL(in[4], kC2) - MUL(in[12], kC1);
+ const int d = MUL(in[4], kC1) + MUL(in[12], kC2);
+ tmp[0] = a + d;
+ tmp[1] = b + c;
+ tmp[2] = b - c;
+ tmp[3] = a - d;
+ tmp += 4;
+ in++;
+ }
+
+ tmp = C;
+ for (i = 0; i < 4; ++i) { // horizontal pass
+ const int dc = tmp[0] + 4;
+ const int a = dc + tmp[8];
+ const int b = dc - tmp[8];
+ const int c = MUL(tmp[4], kC2) - MUL(tmp[12], kC1);
+ const int d = MUL(tmp[4], kC1) + MUL(tmp[12], kC2);
+ STORE(0, i, a + d);
+ STORE(1, i, b + c);
+ STORE(2, i, b - c);
+ STORE(3, i, a - d);
+ tmp++;
+ }
+}
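+// kC1 and kC2 are Q16 fixed-point constants: (20091 + 65536) / 65536 ~=
+// 1.30656 ~= sqrt(2) * cos(pi / 8) and 35468 / 65536 ~= 0.54120 ~=
+// sqrt(2) * sin(pi / 8), with MUL(a, b) = (a * b) >> 16 as the fixed-point
+// multiply. For example, MUL(100, kC2) = (100 * 35468) >> 16 = 54, i.e.
+// roughly 100 * 0.5412.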
+
+static void ITransform_C(const uint8_t* ref, const int16_t* in, uint8_t* dst,
+ int do_two) {
+ ITransformOne(ref, in, dst);
+ if (do_two) {
+ ITransformOne(ref + 4, in + 16, dst + 4);
+ }
+}
+
+static void FTransform_C(const uint8_t* src, const uint8_t* ref, int16_t* out) {
+ int i;
+ int tmp[16];
+ for (i = 0; i < 4; ++i, src += BPS, ref += BPS) {
+ const int d0 = src[0] - ref[0]; // 9bit dynamic range ([-255,255])
+ const int d1 = src[1] - ref[1];
+ const int d2 = src[2] - ref[2];
+ const int d3 = src[3] - ref[3];
+ const int a0 = (d0 + d3); // 10b [-510,510]
+ const int a1 = (d1 + d2);
+ const int a2 = (d1 - d2);
+ const int a3 = (d0 - d3);
+ tmp[0 + i * 4] = (a0 + a1) * 8; // 14b [-8160,8160]
+ tmp[1 + i * 4] = (a2 * 2217 + a3 * 5352 + 1812) >> 9; // [-7536,7542]
+ tmp[2 + i * 4] = (a0 - a1) * 8;
+ tmp[3 + i * 4] = (a3 * 2217 - a2 * 5352 + 937) >> 9;
+ }
+ for (i = 0; i < 4; ++i) {
+ const int a0 = (tmp[0 + i] + tmp[12 + i]); // 15b
+ const int a1 = (tmp[4 + i] + tmp[ 8 + i]);
+ const int a2 = (tmp[4 + i] - tmp[ 8 + i]);
+ const int a3 = (tmp[0 + i] - tmp[12 + i]);
+ out[0 + i] = (a0 + a1 + 7) >> 4; // 12b
+ out[4 + i] = ((a2 * 2217 + a3 * 5352 + 12000) >> 16) + (a3 != 0);
+ out[8 + i] = (a0 - a1 + 7) >> 4;
+ out[12+ i] = ((a3 * 2217 - a2 * 5352 + 51000) >> 16);
+ }
+}
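+// The forward transform uses the matching constants in Q12 precision:
+// 5352/4096 ~= 1.3066 ~= sqrt(2)*cos(pi/8) and 2217/4096 ~= 0.5412 ~=
+// sqrt(2)*sin(pi/8). The first pass shifts right by 9 rather than 12,
+// folding in the same x8 up-scaling that tmp[0] and tmp[2] get explicitly;
+// the remaining additive constants (1812, 937, 12000, 51000) act as
+// rounding/bias terms for those shifts.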
+#endif // !WEBP_NEON_OMIT_C_CODE
+
+static void FTransform2_C(const uint8_t* src, const uint8_t* ref,
+ int16_t* out) {
+ VP8FTransform(src, ref, out);
+ VP8FTransform(src + 4, ref + 4, out + 16);
+}
+
+#if !WEBP_NEON_OMIT_C_CODE
+static void FTransformWHT_C(const int16_t* in, int16_t* out) {
+ // input is 12b signed
+ int32_t tmp[16];
+ int i;
+ for (i = 0; i < 4; ++i, in += 64) {
+ const int a0 = (in[0 * 16] + in[2 * 16]); // 13b
+ const int a1 = (in[1 * 16] + in[3 * 16]);
+ const int a2 = (in[1 * 16] - in[3 * 16]);
+ const int a3 = (in[0 * 16] - in[2 * 16]);
+ tmp[0 + i * 4] = a0 + a1; // 14b
+ tmp[1 + i * 4] = a3 + a2;
+ tmp[2 + i * 4] = a3 - a2;
+ tmp[3 + i * 4] = a0 - a1;
+ }
+ for (i = 0; i < 4; ++i) {
+ const int a0 = (tmp[0 + i] + tmp[8 + i]); // 15b
+ const int a1 = (tmp[4 + i] + tmp[12+ i]);
+ const int a2 = (tmp[4 + i] - tmp[12+ i]);
+ const int a3 = (tmp[0 + i] - tmp[8 + i]);
+ const int b0 = a0 + a1; // 16b
+ const int b1 = a3 + a2;
+ const int b2 = a3 - a2;
+ const int b3 = a0 - a1;
+ out[ 0 + i] = b0 >> 1; // 15b
+ out[ 4 + i] = b1 >> 1;
+ out[ 8 + i] = b2 >> 1;
+ out[12 + i] = b3 >> 1;
+ }
+}
+#endif // !WEBP_NEON_OMIT_C_CODE
+
+#undef MUL
+#undef STORE
+
+//------------------------------------------------------------------------------
+// Intra predictions
+
+static WEBP_INLINE void Fill(uint8_t* dst, int value, int size) {
+ int j;
+ for (j = 0; j < size; ++j) {
+ memset(dst + j * BPS, value, size);
+ }
+}
+
+static WEBP_INLINE void VerticalPred(uint8_t* dst,
+ const uint8_t* top, int size) {
+ int j;
+ if (top != NULL) {
+ for (j = 0; j < size; ++j) memcpy(dst + j * BPS, top, size);
+ } else {
+ Fill(dst, 127, size);
+ }
+}
+
+static WEBP_INLINE void HorizontalPred(uint8_t* dst,
+ const uint8_t* left, int size) {
+ if (left != NULL) {
+ int j;
+ for (j = 0; j < size; ++j) {
+ memset(dst + j * BPS, left[j], size);
+ }
+ } else {
+ Fill(dst, 129, size);
+ }
+}
+
+static WEBP_INLINE void TrueMotion(uint8_t* dst, const uint8_t* left,
+ const uint8_t* top, int size) {
+ int y;
+ if (left != NULL) {
+ if (top != NULL) {
+ const uint8_t* const clip = clip1 + 255 - left[-1];
+ for (y = 0; y < size; ++y) {
+ const uint8_t* const clip_table = clip + left[y];
+ int x;
+ for (x = 0; x < size; ++x) {
+ dst[x] = clip_table[top[x]];
+ }
+ dst += BPS;
+ }
+ } else {
+ HorizontalPred(dst, left, size);
+ }
+ } else {
+ // true motion without left samples (hence: with default 129 value)
+ // is equivalent to VE prediction where you just copy the top samples.
+ // Note that if top samples are not available, the default value is
+ // then 129, and not 127 as in the VerticalPred case.
+ if (top != NULL) {
+ VerticalPred(dst, top, size);
+ } else {
+ Fill(dst, 129, size);
+ }
+ }
+}
+
+static WEBP_INLINE void DCMode(uint8_t* dst, const uint8_t* left,
+ const uint8_t* top,
+ int size, int round, int shift) {
+ int DC = 0;
+ int j;
+ if (top != NULL) {
+ for (j = 0; j < size; ++j) DC += top[j];
+ if (left != NULL) { // top and left present
+ for (j = 0; j < size; ++j) DC += left[j];
+ } else { // top, but no left
+ DC += DC;
+ }
+ DC = (DC + round) >> shift;
+ } else if (left != NULL) { // left but no top
+ for (j = 0; j < size; ++j) DC += left[j];
+ DC += DC;
+ DC = (DC + round) >> shift;
+ } else { // no top, no left, nothing.
+ DC = 0x80;
+ }
+ Fill(dst, DC, size);
+}
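+// The round/shift pair encodes the divisor: 16x16 luma calls
+// DCMode(dst, left, top, 16, 16, 5), i.e. (sum of 32 samples + 16) >> 5,
+// and chroma uses (size, round, shift) = (8, 8, 4). When only one side is
+// available, 'DC += DC' doubles the partial sum so the same shift still
+// divides by the number of samples actually read. Worked example: all 32
+// neighbors equal to 100 gives DC = (3200 + 16) >> 5 = 100.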
+
+//------------------------------------------------------------------------------
+// Chroma 8x8 prediction (paragraph 12.2)
+
+static void IntraChromaPreds_C(uint8_t* dst, const uint8_t* left,
+ const uint8_t* top) {
+ // U block
+ DCMode(C8DC8 + dst, left, top, 8, 8, 4);
+ VerticalPred(C8VE8 + dst, top, 8);
+ HorizontalPred(C8HE8 + dst, left, 8);
+ TrueMotion(C8TM8 + dst, left, top, 8);
+ // V block
+ dst += 8;
+ if (top != NULL) top += 8;
+ if (left != NULL) left += 16;
+ DCMode(C8DC8 + dst, left, top, 8, 8, 4);
+ VerticalPred(C8VE8 + dst, top, 8);
+ HorizontalPred(C8HE8 + dst, left, 8);
+ TrueMotion(C8TM8 + dst, left, top, 8);
+}
+
+//------------------------------------------------------------------------------
+// luma 16x16 prediction (paragraph 12.3)
+
+static void Intra16Preds_C(uint8_t* dst,
+ const uint8_t* left, const uint8_t* top) {
+ DCMode(I16DC16 + dst, left, top, 16, 16, 5);
+ VerticalPred(I16VE16 + dst, top, 16);
+ HorizontalPred(I16HE16 + dst, left, 16);
+ TrueMotion(I16TM16 + dst, left, top, 16);
+}
+
+//------------------------------------------------------------------------------
+// luma 4x4 prediction
+
+#define DST(x, y) dst[(x) + (y) * BPS]
+#define AVG3(a, b, c) ((uint8_t)(((a) + 2 * (b) + (c) + 2) >> 2))
+#define AVG2(a, b) (((a) + (b) + 1) >> 1)
+
+static void VE4(uint8_t* dst, const uint8_t* top) { // vertical
+ const uint8_t vals[4] = {
+ AVG3(top[-1], top[0], top[1]),
+ AVG3(top[ 0], top[1], top[2]),
+ AVG3(top[ 1], top[2], top[3]),
+ AVG3(top[ 2], top[3], top[4])
+ };
+ int i;
+ for (i = 0; i < 4; ++i) {
+ memcpy(dst + i * BPS, vals, 4);
+ }
+}
+
+static void HE4(uint8_t* dst, const uint8_t* top) { // horizontal
+ const int X = top[-1];
+ const int I = top[-2];
+ const int J = top[-3];
+ const int K = top[-4];
+ const int L = top[-5];
+ WebPUint32ToMem(dst + 0 * BPS, 0x01010101U * AVG3(X, I, J));
+ WebPUint32ToMem(dst + 1 * BPS, 0x01010101U * AVG3(I, J, K));
+ WebPUint32ToMem(dst + 2 * BPS, 0x01010101U * AVG3(J, K, L));
+ WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(K, L, L));
+}
+
+static void DC4(uint8_t* dst, const uint8_t* top) {
+ uint32_t dc = 4;
+ int i;
+ for (i = 0; i < 4; ++i) dc += top[i] + top[-5 + i];
+ Fill(dst, dc >> 3, 4);
+}
+
+static void RD4(uint8_t* dst, const uint8_t* top) {
+ const int X = top[-1];
+ const int I = top[-2];
+ const int J = top[-3];
+ const int K = top[-4];
+ const int L = top[-5];
+ const int A = top[0];
+ const int B = top[1];
+ const int C = top[2];
+ const int D = top[3];
+ DST(0, 3) = AVG3(J, K, L);
+ DST(0, 2) = DST(1, 3) = AVG3(I, J, K);
+ DST(0, 1) = DST(1, 2) = DST(2, 3) = AVG3(X, I, J);
+ DST(0, 0) = DST(1, 1) = DST(2, 2) = DST(3, 3) = AVG3(A, X, I);
+ DST(1, 0) = DST(2, 1) = DST(3, 2) = AVG3(B, A, X);
+ DST(2, 0) = DST(3, 1) = AVG3(C, B, A);
+ DST(3, 0) = AVG3(D, C, B);
+}
+
+static void LD4(uint8_t* dst, const uint8_t* top) {
+ const int A = top[0];
+ const int B = top[1];
+ const int C = top[2];
+ const int D = top[3];
+ const int E = top[4];
+ const int F = top[5];
+ const int G = top[6];
+ const int H = top[7];
+ DST(0, 0) = AVG3(A, B, C);
+ DST(1, 0) = DST(0, 1) = AVG3(B, C, D);
+ DST(2, 0) = DST(1, 1) = DST(0, 2) = AVG3(C, D, E);
+ DST(3, 0) = DST(2, 1) = DST(1, 2) = DST(0, 3) = AVG3(D, E, F);
+ DST(3, 1) = DST(2, 2) = DST(1, 3) = AVG3(E, F, G);
+ DST(3, 2) = DST(2, 3) = AVG3(F, G, H);
+ DST(3, 3) = AVG3(G, H, H);
+}
+
+static void VR4(uint8_t* dst, const uint8_t* top) {
+ const int X = top[-1];
+ const int I = top[-2];
+ const int J = top[-3];
+ const int K = top[-4];
+ const int A = top[0];
+ const int B = top[1];
+ const int C = top[2];
+ const int D = top[3];
+ DST(0, 0) = DST(1, 2) = AVG2(X, A);
+ DST(1, 0) = DST(2, 2) = AVG2(A, B);
+ DST(2, 0) = DST(3, 2) = AVG2(B, C);
+ DST(3, 0) = AVG2(C, D);
+
+ DST(0, 3) = AVG3(K, J, I);
+ DST(0, 2) = AVG3(J, I, X);
+ DST(0, 1) = DST(1, 3) = AVG3(I, X, A);
+ DST(1, 1) = DST(2, 3) = AVG3(X, A, B);
+ DST(2, 1) = DST(3, 3) = AVG3(A, B, C);
+ DST(3, 1) = AVG3(B, C, D);
+}
+
+static void VL4(uint8_t* dst, const uint8_t* top) {
+ const int A = top[0];
+ const int B = top[1];
+ const int C = top[2];
+ const int D = top[3];
+ const int E = top[4];
+ const int F = top[5];
+ const int G = top[6];
+ const int H = top[7];
+ DST(0, 0) = AVG2(A, B);
+ DST(1, 0) = DST(0, 2) = AVG2(B, C);
+ DST(2, 0) = DST(1, 2) = AVG2(C, D);
+ DST(3, 0) = DST(2, 2) = AVG2(D, E);
+
+ DST(0, 1) = AVG3(A, B, C);
+ DST(1, 1) = DST(0, 3) = AVG3(B, C, D);
+ DST(2, 1) = DST(1, 3) = AVG3(C, D, E);
+ DST(3, 1) = DST(2, 3) = AVG3(D, E, F);
+ DST(3, 2) = AVG3(E, F, G);
+ DST(3, 3) = AVG3(F, G, H);
+}
+
+static void HU4(uint8_t* dst, const uint8_t* top) {
+ const int I = top[-2];
+ const int J = top[-3];
+ const int K = top[-4];
+ const int L = top[-5];
+ DST(0, 0) = AVG2(I, J);
+ DST(2, 0) = DST(0, 1) = AVG2(J, K);
+ DST(2, 1) = DST(0, 2) = AVG2(K, L);
+ DST(1, 0) = AVG3(I, J, K);
+ DST(3, 0) = DST(1, 1) = AVG3(J, K, L);
+ DST(3, 1) = DST(1, 2) = AVG3(K, L, L);
+ DST(3, 2) = DST(2, 2) =
+ DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
+}
+
+static void HD4(uint8_t* dst, const uint8_t* top) {
+ const int X = top[-1];
+ const int I = top[-2];
+ const int J = top[-3];
+ const int K = top[-4];
+ const int L = top[-5];
+ const int A = top[0];
+ const int B = top[1];
+ const int C = top[2];
+
+ DST(0, 0) = DST(2, 1) = AVG2(I, X);
+ DST(0, 1) = DST(2, 2) = AVG2(J, I);
+ DST(0, 2) = DST(2, 3) = AVG2(K, J);
+ DST(0, 3) = AVG2(L, K);
+
+ DST(3, 0) = AVG3(A, B, C);
+ DST(2, 0) = AVG3(X, A, B);
+ DST(1, 0) = DST(3, 1) = AVG3(I, X, A);
+ DST(1, 1) = DST(3, 2) = AVG3(J, I, X);
+ DST(1, 2) = DST(3, 3) = AVG3(K, J, I);
+ DST(1, 3) = AVG3(L, K, J);
+}
+
+static void TM4(uint8_t* dst, const uint8_t* top) {
+ int x, y;
+ const uint8_t* const clip = clip1 + 255 - top[-1];
+ for (y = 0; y < 4; ++y) {
+ const uint8_t* const clip_table = clip + top[-2 - y];
+ for (x = 0; x < 4; ++x) {
+ dst[x] = clip_table[top[x]];
+ }
+ dst += BPS;
+ }
+}
+
+#undef DST
+#undef AVG3
+#undef AVG2
+
+// Left samples are at top[-5 .. -2], the top-left sample is top[-1], the
+// top samples are at top[0..3], and the top-right samples at top[4..7].
+static void Intra4Preds_C(uint8_t* dst, const uint8_t* top) {
+ DC4(I4DC4 + dst, top);
+ TM4(I4TM4 + dst, top);
+ VE4(I4VE4 + dst, top);
+ HE4(I4HE4 + dst, top);
+ RD4(I4RD4 + dst, top);
+ VR4(I4VR4 + dst, top);
+ LD4(I4LD4 + dst, top);
+ VL4(I4VL4 + dst, top);
+ HD4(I4HD4 + dst, top);
+ HU4(I4HU4 + dst, top);
+}
+
+//------------------------------------------------------------------------------
+// Metric
+
+#if !WEBP_NEON_OMIT_C_CODE
+static WEBP_INLINE int GetSSE(const uint8_t* a, const uint8_t* b,
+ int w, int h) {
+ int count = 0;
+ int y, x;
+ for (y = 0; y < h; ++y) {
+ for (x = 0; x < w; ++x) {
+ const int diff = (int)a[x] - b[x];
+ count += diff * diff;
+ }
+ a += BPS;
+ b += BPS;
+ }
+ return count;
+}
+
+static int SSE16x16_C(const uint8_t* a, const uint8_t* b) {
+ return GetSSE(a, b, 16, 16);
+}
+static int SSE16x8_C(const uint8_t* a, const uint8_t* b) {
+ return GetSSE(a, b, 16, 8);
+}
+static int SSE8x8_C(const uint8_t* a, const uint8_t* b) {
+ return GetSSE(a, b, 8, 8);
+}
+static int SSE4x4_C(const uint8_t* a, const uint8_t* b) {
+ return GetSSE(a, b, 4, 4);
+}
+#endif // !WEBP_NEON_OMIT_C_CODE
+
+static void Mean16x4_C(const uint8_t* ref, uint32_t dc[4]) {
+ int k, x, y;
+ for (k = 0; k < 4; ++k) {
+ uint32_t avg = 0;
+ for (y = 0; y < 4; ++y) {
+ for (x = 0; x < 4; ++x) {
+ avg += ref[x + y * BPS];
+ }
+ }
+ dc[k] = avg;
+ ref += 4; // go to next 4x4 block.
+ }
+}
+
+//------------------------------------------------------------------------------
+// Texture distortion
+//
+// We try to match the spectral content (weighted) between source and
+// reconstructed samples.
+
+#if !WEBP_NEON_OMIT_C_CODE
+// Hadamard transform
+// Returns the weighted sum of the absolute value of transformed coefficients.
+// w[] contains a row-major 4 by 4 symmetric matrix.
+static int TTransform(const uint8_t* in, const uint16_t* w) {
+ int sum = 0;
+ int tmp[16];
+ int i;
+ // horizontal pass
+ for (i = 0; i < 4; ++i, in += BPS) {
+ const int a0 = in[0] + in[2];
+ const int a1 = in[1] + in[3];
+ const int a2 = in[1] - in[3];
+ const int a3 = in[0] - in[2];
+ tmp[0 + i * 4] = a0 + a1;
+ tmp[1 + i * 4] = a3 + a2;
+ tmp[2 + i * 4] = a3 - a2;
+ tmp[3 + i * 4] = a0 - a1;
+ }
+ // vertical pass
+ for (i = 0; i < 4; ++i, ++w) {
+ const int a0 = tmp[0 + i] + tmp[8 + i];
+ const int a1 = tmp[4 + i] + tmp[12+ i];
+ const int a2 = tmp[4 + i] - tmp[12+ i];
+ const int a3 = tmp[0 + i] - tmp[8 + i];
+ const int b0 = a0 + a1;
+ const int b1 = a3 + a2;
+ const int b2 = a3 - a2;
+ const int b3 = a0 - a1;
+
+ sum += w[ 0] * abs(b0);
+ sum += w[ 4] * abs(b1);
+ sum += w[ 8] * abs(b2);
+ sum += w[12] * abs(b3);
+ }
+ return sum;
+}
+
+static int Disto4x4_C(const uint8_t* const a, const uint8_t* const b,
+ const uint16_t* const w) {
+ const int sum1 = TTransform(a, w);
+ const int sum2 = TTransform(b, w);
+ return abs(sum2 - sum1) >> 5;
+}
+
+static int Disto16x16_C(const uint8_t* const a, const uint8_t* const b,
+ const uint16_t* const w) {
+ int D = 0;
+ int x, y;
+ for (y = 0; y < 16 * BPS; y += 4 * BPS) {
+ for (x = 0; x < 16; x += 4) {
+ D += Disto4x4_C(a + x + y, b + x + y, w);
+ }
+ }
+ return D;
+}
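+// TTransform is an unnormalized 4x4 Walsh-Hadamard transform (pure butterfly
+// adds/subtracts), so Disto4x4 compares the weighted spectral magnitudes of
+// the two blocks rather than raw pixel differences; the final '>> 5' roughly
+// compensates for the gain the two unnormalized passes introduce.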
+#endif // !WEBP_NEON_OMIT_C_CODE
+
+//------------------------------------------------------------------------------
+// Quantization
+//
+
+static const uint8_t kZigzag[16] = {
+ 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
+};
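+// kZigzag maps scan position n to the raster index j inside the 4x4 block,
+// walking anti-diagonals so low-frequency coefficients are scanned first:
+//   raster layout:   0  1  2  3     scan order visits 0, 1, 4, 8, 5, 2, ...
+//                    4  5  6  7
+//                    8  9 10 11
+//                   12 13 14 15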
+
+// Simple quantization
+static int QuantizeBlock_C(int16_t in[16], int16_t out[16],
+ const VP8Matrix* const mtx) {
+ int last = -1;
+ int n;
+ for (n = 0; n < 16; ++n) {
+ const int j = kZigzag[n];
+ const int sign = (in[j] < 0);
+ const uint32_t coeff = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
+ if (coeff > mtx->zthresh_[j]) {
+ const uint32_t Q = mtx->q_[j];
+ const uint32_t iQ = mtx->iq_[j];
+ const uint32_t B = mtx->bias_[j];
+ int level = QUANTDIV(coeff, iQ, B);
+ if (level > MAX_LEVEL) level = MAX_LEVEL;
+ if (sign) level = -level;
+ in[j] = level * (int)Q;
+ out[n] = level;
+ if (level) last = n;
+ } else {
+ out[n] = 0;
+ in[j] = 0;
+ }
+ }
+ return (last >= 0);
+}
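+// QUANTDIV (defined in vp8i_enc.h) turns the per-coefficient division by the
+// quantizer Q into a fixed-point multiply by a precomputed reciprocal iQ with
+// bias B, roughly level = (coeff * iQ + B) >> QFIX. Worked sketch with a
+// hypothetical Q = 8 (so iQ ~= (1 << QFIX) / 8): coeff = 100 quantizes to
+// level ~= 100 / 8 = 12 (the exact value depends on the bias), and the
+// coefficient is rebuilt in place as in[j] = level * Q = 96.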
+
+#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
+static int Quantize2Blocks_C(int16_t in[32], int16_t out[32],
+ const VP8Matrix* const mtx) {
+ int nz;
+ nz = VP8EncQuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
+ nz |= VP8EncQuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
+ return nz;
+}
+#endif // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
+
+//------------------------------------------------------------------------------
+// Block copy
+
+static WEBP_INLINE void Copy(const uint8_t* src, uint8_t* dst, int w, int h) {
+ int y;
+ for (y = 0; y < h; ++y) {
+ memcpy(dst, src, w);
+ src += BPS;
+ dst += BPS;
+ }
+}
+
+static void Copy4x4_C(const uint8_t* src, uint8_t* dst) {
+ Copy(src, dst, 4, 4);
+}
+
+static void Copy16x8_C(const uint8_t* src, uint8_t* dst) {
+ Copy(src, dst, 16, 8);
+}
+
+//------------------------------------------------------------------------------
+// Initialization
+
+// Speed-critical function pointers. We have to initialize them to the default
+// implementations within VP8EncDspInit().
+VP8CHisto VP8CollectHistogram;
+VP8Idct VP8ITransform;
+VP8Fdct VP8FTransform;
+VP8Fdct VP8FTransform2;
+VP8WHT VP8FTransformWHT;
+VP8Intra4Preds VP8EncPredLuma4;
+VP8IntraPreds VP8EncPredLuma16;
+VP8IntraPreds VP8EncPredChroma8;
+VP8Metric VP8SSE16x16;
+VP8Metric VP8SSE8x8;
+VP8Metric VP8SSE16x8;
+VP8Metric VP8SSE4x4;
+VP8WMetric VP8TDisto4x4;
+VP8WMetric VP8TDisto16x16;
+VP8MeanMetric VP8Mean16x4;
+VP8QuantizeBlock VP8EncQuantizeBlock;
+VP8Quantize2Blocks VP8EncQuantize2Blocks;
+VP8QuantizeBlockWHT VP8EncQuantizeBlockWHT;
+VP8BlockCopy VP8Copy4x4;
+VP8BlockCopy VP8Copy16x8;
+
+extern void VP8EncDspInitSSE2(void);
+extern void VP8EncDspInitSSE41(void);
+extern void VP8EncDspInitNEON(void);
+extern void VP8EncDspInitMIPS32(void);
+extern void VP8EncDspInitMIPSdspR2(void);
+extern void VP8EncDspInitMSA(void);
+
+WEBP_DSP_INIT_FUNC(VP8EncDspInit) {
+ VP8DspInit(); // common inverse transforms
+ InitTables();
+
+ // default C implementations
+#if !WEBP_NEON_OMIT_C_CODE
+ VP8ITransform = ITransform_C;
+ VP8FTransform = FTransform_C;
+ VP8FTransformWHT = FTransformWHT_C;
+ VP8TDisto4x4 = Disto4x4_C;
+ VP8TDisto16x16 = Disto16x16_C;
+ VP8CollectHistogram = CollectHistogram_C;
+ VP8SSE16x16 = SSE16x16_C;
+ VP8SSE16x8 = SSE16x8_C;
+ VP8SSE8x8 = SSE8x8_C;
+ VP8SSE4x4 = SSE4x4_C;
+#endif
+
+#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
+ VP8EncQuantizeBlock = QuantizeBlock_C;
+ VP8EncQuantize2Blocks = Quantize2Blocks_C;
+#endif
+
+ VP8FTransform2 = FTransform2_C;
+ VP8EncPredLuma4 = Intra4Preds_C;
+ VP8EncPredLuma16 = Intra16Preds_C;
+ VP8EncPredChroma8 = IntraChromaPreds_C;
+ VP8Mean16x4 = Mean16x4_C;
+ VP8EncQuantizeBlockWHT = QuantizeBlock_C;
+ VP8Copy4x4 = Copy4x4_C;
+ VP8Copy16x8 = Copy16x8_C;
+
+ // If defined, use CPUInfo() to overwrite some pointers with faster versions.
+ if (VP8GetCPUInfo != NULL) {
+#if defined(WEBP_HAVE_SSE2)
+ if (VP8GetCPUInfo(kSSE2)) {
+ VP8EncDspInitSSE2();
+#if defined(WEBP_HAVE_SSE41)
+ if (VP8GetCPUInfo(kSSE4_1)) {
+ VP8EncDspInitSSE41();
+ }
+#endif
+ }
+#endif
+#if defined(WEBP_USE_MIPS32)
+ if (VP8GetCPUInfo(kMIPS32)) {
+ VP8EncDspInitMIPS32();
+ }
+#endif
+#if defined(WEBP_USE_MIPS_DSP_R2)
+ if (VP8GetCPUInfo(kMIPSdspR2)) {
+ VP8EncDspInitMIPSdspR2();
+ }
+#endif
+#if defined(WEBP_USE_MSA)
+ if (VP8GetCPUInfo(kMSA)) {
+ VP8EncDspInitMSA();
+ }
+#endif
+ }
+
+#if defined(WEBP_HAVE_NEON)
+ if (WEBP_NEON_OMIT_C_CODE ||
+ (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
+ VP8EncDspInitNEON();
+ }
+#endif
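+
+  // NEON is handled outside the VP8GetCPUInfo block above: when
+  // WEBP_NEON_OMIT_C_CODE is set there are no C fallbacks, so the NEON entry
+  // points must be installed unconditionally.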
+
+ assert(VP8ITransform != NULL);
+ assert(VP8FTransform != NULL);
+ assert(VP8FTransformWHT != NULL);
+ assert(VP8TDisto4x4 != NULL);
+ assert(VP8TDisto16x16 != NULL);
+ assert(VP8CollectHistogram != NULL);
+ assert(VP8SSE16x16 != NULL);
+ assert(VP8SSE16x8 != NULL);
+ assert(VP8SSE8x8 != NULL);
+ assert(VP8SSE4x4 != NULL);
+ assert(VP8EncQuantizeBlock != NULL);
+ assert(VP8EncQuantize2Blocks != NULL);
+ assert(VP8FTransform2 != NULL);
+ assert(VP8EncPredLuma4 != NULL);
+ assert(VP8EncPredLuma16 != NULL);
+ assert(VP8EncPredChroma8 != NULL);
+ assert(VP8Mean16x4 != NULL);
+ assert(VP8EncQuantizeBlockWHT != NULL);
+ assert(VP8Copy4x4 != NULL);
+ assert(VP8Copy16x8 != NULL);
+}
diff --git a/media/libwebp/dsp/enc_mips32.c b/media/libwebp/dsp/enc_mips32.c
new file mode 100644
index 0000000000..ee26dfb493
--- /dev/null
+++ b/media/libwebp/dsp/enc_mips32.c
@@ -0,0 +1,677 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MIPS version of speed-critical encoding functions.
+//
+// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
+// Jovan Zelincevic (jovan.zelincevic@imgtec.com)
+// Slobodan Prijic (slobodan.prijic@imgtec.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_MIPS32)
+
+#include "../dsp/mips_macro.h"
+#include "../enc/vp8i_enc.h"
+#include "../enc/cost_enc.h"
+
+static const int kC1 = 20091 + (1 << 16);
+static const int kC2 = 35468;
+
+// macro for one vertical pass in ITransformOne
+// MUL macro inlined
+// temp0..temp15 holds tmp[0]..tmp[15]
+// A..D - offsets in bytes to load from in buffer
+// TEMP0..TEMP3 - registers for corresponding tmp elements
+// TEMP4..TEMP5 - temporary registers
+#define VERTICAL_PASS(A, B, C, D, TEMP4, TEMP0, TEMP1, TEMP2, TEMP3) \
+ "lh %[temp16], " #A "(%[temp20]) \n\t" \
+ "lh %[temp18], " #B "(%[temp20]) \n\t" \
+ "lh %[temp17], " #C "(%[temp20]) \n\t" \
+ "lh %[temp19], " #D "(%[temp20]) \n\t" \
+ "addu %[" #TEMP4 "], %[temp16], %[temp18] \n\t" \
+ "subu %[temp16], %[temp16], %[temp18] \n\t" \
+ "mul %[" #TEMP0 "], %[temp17], %[kC2] \n\t" \
+ "mul %[temp18], %[temp19], %[kC1] \n\t" \
+ "mul %[temp17], %[temp17], %[kC1] \n\t" \
+ "mul %[temp19], %[temp19], %[kC2] \n\t" \
+ "sra %[" #TEMP0 "], %[" #TEMP0 "], 16 \n\n" \
+ "sra %[temp18], %[temp18], 16 \n\n" \
+ "sra %[temp17], %[temp17], 16 \n\n" \
+ "sra %[temp19], %[temp19], 16 \n\n" \
+ "subu %[" #TEMP2 "], %[" #TEMP0 "], %[temp18] \n\t" \
+ "addu %[" #TEMP3 "], %[temp17], %[temp19] \n\t" \
+ "addu %[" #TEMP0 "], %[" #TEMP4 "], %[" #TEMP3 "] \n\t" \
+ "addu %[" #TEMP1 "], %[temp16], %[" #TEMP2 "] \n\t" \
+ "subu %[" #TEMP2 "], %[temp16], %[" #TEMP2 "] \n\t" \
+ "subu %[" #TEMP3 "], %[" #TEMP4 "], %[" #TEMP3 "] \n\t"
+
+// macro for one horizontal pass in ITransformOne
+// MUL and STORE macros inlined
+// a = clip_8b(a) is replaced with: a = max(a, 0); a = min(a, 255)
+// temp0..temp15 holds tmp[0]..tmp[15]
+// A - offset in bytes to load from ref and store to dst buffer
+// TEMP0, TEMP4, TEMP8 and TEMP12 - registers for corresponding tmp elements
+#define HORIZONTAL_PASS(A, TEMP0, TEMP4, TEMP8, TEMP12) \
+ "addiu %[" #TEMP0 "], %[" #TEMP0 "], 4 \n\t" \
+ "addu %[temp16], %[" #TEMP0 "], %[" #TEMP8 "] \n\t" \
+ "subu %[temp17], %[" #TEMP0 "], %[" #TEMP8 "] \n\t" \
+ "mul %[" #TEMP0 "], %[" #TEMP4 "], %[kC2] \n\t" \
+ "mul %[" #TEMP8 "], %[" #TEMP12 "], %[kC1] \n\t" \
+ "mul %[" #TEMP4 "], %[" #TEMP4 "], %[kC1] \n\t" \
+ "mul %[" #TEMP12 "], %[" #TEMP12 "], %[kC2] \n\t" \
+ "sra %[" #TEMP0 "], %[" #TEMP0 "], 16 \n\t" \
+ "sra %[" #TEMP8 "], %[" #TEMP8 "], 16 \n\t" \
+ "sra %[" #TEMP4 "], %[" #TEMP4 "], 16 \n\t" \
+ "sra %[" #TEMP12 "], %[" #TEMP12 "], 16 \n\t" \
+ "subu %[temp18], %[" #TEMP0 "], %[" #TEMP8 "] \n\t" \
+ "addu %[temp19], %[" #TEMP4 "], %[" #TEMP12 "] \n\t" \
+ "addu %[" #TEMP0 "], %[temp16], %[temp19] \n\t" \
+ "addu %[" #TEMP4 "], %[temp17], %[temp18] \n\t" \
+ "subu %[" #TEMP8 "], %[temp17], %[temp18] \n\t" \
+ "subu %[" #TEMP12 "], %[temp16], %[temp19] \n\t" \
+ "lw %[temp20], 0(%[args]) \n\t" \
+ "sra %[" #TEMP0 "], %[" #TEMP0 "], 3 \n\t" \
+ "sra %[" #TEMP4 "], %[" #TEMP4 "], 3 \n\t" \
+ "sra %[" #TEMP8 "], %[" #TEMP8 "], 3 \n\t" \
+ "sra %[" #TEMP12 "], %[" #TEMP12 "], 3 \n\t" \
+ "lbu %[temp16], 0+" XSTR(BPS) "*" #A "(%[temp20]) \n\t" \
+ "lbu %[temp17], 1+" XSTR(BPS) "*" #A "(%[temp20]) \n\t" \
+ "lbu %[temp18], 2+" XSTR(BPS) "*" #A "(%[temp20]) \n\t" \
+ "lbu %[temp19], 3+" XSTR(BPS) "*" #A "(%[temp20]) \n\t" \
+ "addu %[" #TEMP0 "], %[temp16], %[" #TEMP0 "] \n\t" \
+ "addu %[" #TEMP4 "], %[temp17], %[" #TEMP4 "] \n\t" \
+ "addu %[" #TEMP8 "], %[temp18], %[" #TEMP8 "] \n\t" \
+ "addu %[" #TEMP12 "], %[temp19], %[" #TEMP12 "] \n\t" \
+ "slt %[temp16], %[" #TEMP0 "], $zero \n\t" \
+ "slt %[temp17], %[" #TEMP4 "], $zero \n\t" \
+ "slt %[temp18], %[" #TEMP8 "], $zero \n\t" \
+ "slt %[temp19], %[" #TEMP12 "], $zero \n\t" \
+ "movn %[" #TEMP0 "], $zero, %[temp16] \n\t" \
+ "movn %[" #TEMP4 "], $zero, %[temp17] \n\t" \
+ "movn %[" #TEMP8 "], $zero, %[temp18] \n\t" \
+ "movn %[" #TEMP12 "], $zero, %[temp19] \n\t" \
+ "addiu %[temp20], $zero, 255 \n\t" \
+ "slt %[temp16], %[" #TEMP0 "], %[temp20] \n\t" \
+ "slt %[temp17], %[" #TEMP4 "], %[temp20] \n\t" \
+ "slt %[temp18], %[" #TEMP8 "], %[temp20] \n\t" \
+ "slt %[temp19], %[" #TEMP12 "], %[temp20] \n\t" \
+ "movz %[" #TEMP0 "], %[temp20], %[temp16] \n\t" \
+ "movz %[" #TEMP4 "], %[temp20], %[temp17] \n\t" \
+ "lw %[temp16], 8(%[args]) \n\t" \
+ "movz %[" #TEMP8 "], %[temp20], %[temp18] \n\t" \
+ "movz %[" #TEMP12 "], %[temp20], %[temp19] \n\t" \
+ "sb %[" #TEMP0 "], 0+" XSTR(BPS) "*" #A "(%[temp16]) \n\t" \
+ "sb %[" #TEMP4 "], 1+" XSTR(BPS) "*" #A "(%[temp16]) \n\t" \
+ "sb %[" #TEMP8 "], 2+" XSTR(BPS) "*" #A "(%[temp16]) \n\t" \
+ "sb %[" #TEMP12 "], 3+" XSTR(BPS) "*" #A "(%[temp16]) \n\t"
+
+// Does a single inverse transform; the ITransform wrapper below does one or two.
+static WEBP_INLINE void ITransformOne_MIPS32(const uint8_t* ref,
+ const int16_t* in,
+ uint8_t* dst) {
+ int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
+ int temp7, temp8, temp9, temp10, temp11, temp12, temp13;
+ int temp14, temp15, temp16, temp17, temp18, temp19, temp20;
+ const int* args[3] = {(const int*)ref, (const int*)in, (const int*)dst};
+
+ __asm__ volatile(
+ "lw %[temp20], 4(%[args]) \n\t"
+ VERTICAL_PASS(0, 16, 8, 24, temp4, temp0, temp1, temp2, temp3)
+ VERTICAL_PASS(2, 18, 10, 26, temp8, temp4, temp5, temp6, temp7)
+ VERTICAL_PASS(4, 20, 12, 28, temp12, temp8, temp9, temp10, temp11)
+ VERTICAL_PASS(6, 22, 14, 30, temp20, temp12, temp13, temp14, temp15)
+
+ HORIZONTAL_PASS(0, temp0, temp4, temp8, temp12)
+ HORIZONTAL_PASS(1, temp1, temp5, temp9, temp13)
+ HORIZONTAL_PASS(2, temp2, temp6, temp10, temp14)
+ HORIZONTAL_PASS(3, temp3, temp7, temp11, temp15)
+
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
+ [temp9]"=&r"(temp9), [temp10]"=&r"(temp10), [temp11]"=&r"(temp11),
+ [temp12]"=&r"(temp12), [temp13]"=&r"(temp13), [temp14]"=&r"(temp14),
+ [temp15]"=&r"(temp15), [temp16]"=&r"(temp16), [temp17]"=&r"(temp17),
+ [temp18]"=&r"(temp18), [temp19]"=&r"(temp19), [temp20]"=&r"(temp20)
+ : [args]"r"(args), [kC1]"r"(kC1), [kC2]"r"(kC2)
+ : "memory", "hi", "lo"
+ );
+}
+
+static void ITransform_MIPS32(const uint8_t* ref, const int16_t* in,
+ uint8_t* dst, int do_two) {
+ ITransformOne_MIPS32(ref, in, dst);
+ if (do_two) {
+ ITransformOne_MIPS32(ref + 4, in + 16, dst + 4);
+ }
+}
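+
+// Note: for do_two the second block sits 4 pixels to the right in ref/dst but
+// 16 coefficients further on in 'in'.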
+
+#undef VERTICAL_PASS
+#undef HORIZONTAL_PASS
+
+// macro for one pass through the for loop in QuantizeBlock
+// QUANTDIV macro inlined
+// J - offset in bytes (kZigzag[n] * 2)
+// K - offset in bytes (kZigzag[n] * 4)
+// N - offset in bytes (n * 2)
+#define QUANTIZE_ONE(J, K, N) \
+ "lh %[temp0], " #J "(%[ppin]) \n\t" \
+ "lhu %[temp1], " #J "(%[ppsharpen]) \n\t" \
+ "lw %[temp2], " #K "(%[ppzthresh]) \n\t" \
+ "sra %[sign], %[temp0], 15 \n\t" \
+ "xor %[coeff], %[temp0], %[sign] \n\t" \
+ "subu %[coeff], %[coeff], %[sign] \n\t" \
+ "addu %[coeff], %[coeff], %[temp1] \n\t" \
+ "slt %[temp4], %[temp2], %[coeff] \n\t" \
+ "addiu %[temp5], $zero, 0 \n\t" \
+ "addiu %[level], $zero, 0 \n\t" \
+ "beqz %[temp4], 2f \n\t" \
+ "lhu %[temp1], " #J "(%[ppiq]) \n\t" \
+ "lw %[temp2], " #K "(%[ppbias]) \n\t" \
+ "lhu %[temp3], " #J "(%[ppq]) \n\t" \
+ "mul %[level], %[coeff], %[temp1] \n\t" \
+ "addu %[level], %[level], %[temp2] \n\t" \
+ "sra %[level], %[level], 17 \n\t" \
+ "slt %[temp4], %[max_level], %[level] \n\t" \
+ "movn %[level], %[max_level], %[temp4] \n\t" \
+ "xor %[level], %[level], %[sign] \n\t" \
+ "subu %[level], %[level], %[sign] \n\t" \
+ "mul %[temp5], %[level], %[temp3] \n\t" \
+"2: \n\t" \
+ "sh %[temp5], " #J "(%[ppin]) \n\t" \
+ "sh %[level], " #N "(%[pout]) \n\t"
+
+static int QuantizeBlock_MIPS32(int16_t in[16], int16_t out[16],
+ const VP8Matrix* const mtx) {
+ int temp0, temp1, temp2, temp3, temp4, temp5;
+ int sign, coeff, level, i;
+ int max_level = MAX_LEVEL;
+
+ int16_t* ppin = &in[0];
+ int16_t* pout = &out[0];
+ const uint16_t* ppsharpen = &mtx->sharpen_[0];
+ const uint32_t* ppzthresh = &mtx->zthresh_[0];
+ const uint16_t* ppq = &mtx->q_[0];
+ const uint16_t* ppiq = &mtx->iq_[0];
+ const uint32_t* ppbias = &mtx->bias_[0];
+
+ __asm__ volatile(
+ QUANTIZE_ONE( 0, 0, 0)
+ QUANTIZE_ONE( 2, 4, 2)
+ QUANTIZE_ONE( 8, 16, 4)
+ QUANTIZE_ONE(16, 32, 6)
+ QUANTIZE_ONE(10, 20, 8)
+ QUANTIZE_ONE( 4, 8, 10)
+ QUANTIZE_ONE( 6, 12, 12)
+ QUANTIZE_ONE(12, 24, 14)
+ QUANTIZE_ONE(18, 36, 16)
+ QUANTIZE_ONE(24, 48, 18)
+ QUANTIZE_ONE(26, 52, 20)
+ QUANTIZE_ONE(20, 40, 22)
+ QUANTIZE_ONE(14, 28, 24)
+ QUANTIZE_ONE(22, 44, 26)
+ QUANTIZE_ONE(28, 56, 28)
+ QUANTIZE_ONE(30, 60, 30)
+
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+ [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+ [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [sign]"=&r"(sign), [coeff]"=&r"(coeff),
+ [level]"=&r"(level)
+ : [pout]"r"(pout), [ppin]"r"(ppin),
+ [ppiq]"r"(ppiq), [max_level]"r"(max_level),
+ [ppbias]"r"(ppbias), [ppzthresh]"r"(ppzthresh),
+ [ppsharpen]"r"(ppsharpen), [ppq]"r"(ppq)
+ : "memory", "hi", "lo"
+ );
+
+ // moved out of the macro so the scan can break out of the loop earlier
+ for (i = 15; i >= 0; i--) {
+ if (out[i]) return 1;
+ }
+ return 0;
+}
+
+static int Quantize2Blocks_MIPS32(int16_t in[32], int16_t out[32],
+ const VP8Matrix* const mtx) {
+ int nz;
+ nz = QuantizeBlock_MIPS32(in + 0 * 16, out + 0 * 16, mtx) << 0;
+ nz |= QuantizeBlock_MIPS32(in + 1 * 16, out + 1 * 16, mtx) << 1;
+ return nz;
+}
+
+#undef QUANTIZE_ONE
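+
+// The (J, K, N) offsets passed to QUANTIZE_ONE above follow the VP8 zigzag
+// scan and can be derived as (sketch):
+//   static const uint8_t kZigzag[16] = {
+//     0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
+//   };
+//   J = kZigzag[n] * sizeof(int16_t);   // in/sharpen/q/iq offsets
+//   K = kZigzag[n] * sizeof(uint32_t);  // zthresh/bias offsets
+//   N = n * sizeof(int16_t);            // out offset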
+
+// macro for one horizontal pass in Disto4x4 (TTransform)
+// the two TTransform calls are merged into a single pass
+// A - offset in bytes to load from a and b buffers
+// E..H - offsets in bytes to store first results to tmp buffer
+// E1..H1 - offsets in bytes to store second results to tmp buffer
+#define HORIZONTAL_PASS(A, E, F, G, H, E1, F1, G1, H1) \
+ "lbu %[temp0], 0+" XSTR(BPS) "*" #A "(%[a]) \n\t" \
+ "lbu %[temp1], 1+" XSTR(BPS) "*" #A "(%[a]) \n\t" \
+ "lbu %[temp2], 2+" XSTR(BPS) "*" #A "(%[a]) \n\t" \
+ "lbu %[temp3], 3+" XSTR(BPS) "*" #A "(%[a]) \n\t" \
+ "lbu %[temp4], 0+" XSTR(BPS) "*" #A "(%[b]) \n\t" \
+ "lbu %[temp5], 1+" XSTR(BPS) "*" #A "(%[b]) \n\t" \
+ "lbu %[temp6], 2+" XSTR(BPS) "*" #A "(%[b]) \n\t" \
+ "lbu %[temp7], 3+" XSTR(BPS) "*" #A "(%[b]) \n\t" \
+ "addu %[temp8], %[temp0], %[temp2] \n\t" \
+ "subu %[temp0], %[temp0], %[temp2] \n\t" \
+ "addu %[temp2], %[temp1], %[temp3] \n\t" \
+ "subu %[temp1], %[temp1], %[temp3] \n\t" \
+ "addu %[temp3], %[temp4], %[temp6] \n\t" \
+ "subu %[temp4], %[temp4], %[temp6] \n\t" \
+ "addu %[temp6], %[temp5], %[temp7] \n\t" \
+ "subu %[temp5], %[temp5], %[temp7] \n\t" \
+ "addu %[temp7], %[temp8], %[temp2] \n\t" \
+ "subu %[temp2], %[temp8], %[temp2] \n\t" \
+ "addu %[temp8], %[temp0], %[temp1] \n\t" \
+ "subu %[temp0], %[temp0], %[temp1] \n\t" \
+ "addu %[temp1], %[temp3], %[temp6] \n\t" \
+ "subu %[temp3], %[temp3], %[temp6] \n\t" \
+ "addu %[temp6], %[temp4], %[temp5] \n\t" \
+ "subu %[temp4], %[temp4], %[temp5] \n\t" \
+ "sw %[temp7], " #E "(%[tmp]) \n\t" \
+ "sw %[temp2], " #H "(%[tmp]) \n\t" \
+ "sw %[temp8], " #F "(%[tmp]) \n\t" \
+ "sw %[temp0], " #G "(%[tmp]) \n\t" \
+ "sw %[temp1], " #E1 "(%[tmp]) \n\t" \
+ "sw %[temp3], " #H1 "(%[tmp]) \n\t" \
+ "sw %[temp6], " #F1 "(%[tmp]) \n\t" \
+ "sw %[temp4], " #G1 "(%[tmp]) \n\t"
+
+// macro for one vertical pass in Disto4x4 (TTransform)
+// the two TTransform calls are merged into a single pass
+// since only one accumulator is available in the mips32r1 instruction set,
+// the second TTransform call is evaluated first and the first one
+// after it.
+// const int sum1 = TTransform(a, w);
+// const int sum2 = TTransform(b, w);
+// return abs(sum2 - sum1) >> 5;
+// (sum2 - sum1) is calculated with madds (sub2) and msubs (sub1)
+// A..D - offsets in bytes to load first results from tmp buffer
+// A1..D1 - offsets in bytes to load second results from tmp buffer
+// E..H - offsets in bytes to load from w buffer
+#define VERTICAL_PASS(A, B, C, D, A1, B1, C1, D1, E, F, G, H) \
+ "lw %[temp0], " #A1 "(%[tmp]) \n\t" \
+ "lw %[temp1], " #C1 "(%[tmp]) \n\t" \
+ "lw %[temp2], " #B1 "(%[tmp]) \n\t" \
+ "lw %[temp3], " #D1 "(%[tmp]) \n\t" \
+ "addu %[temp8], %[temp0], %[temp1] \n\t" \
+ "subu %[temp0], %[temp0], %[temp1] \n\t" \
+ "addu %[temp1], %[temp2], %[temp3] \n\t" \
+ "subu %[temp2], %[temp2], %[temp3] \n\t" \
+ "addu %[temp3], %[temp8], %[temp1] \n\t" \
+ "subu %[temp8], %[temp8], %[temp1] \n\t" \
+ "addu %[temp1], %[temp0], %[temp2] \n\t" \
+ "subu %[temp0], %[temp0], %[temp2] \n\t" \
+ "sra %[temp4], %[temp3], 31 \n\t" \
+ "sra %[temp5], %[temp1], 31 \n\t" \
+ "sra %[temp6], %[temp0], 31 \n\t" \
+ "sra %[temp7], %[temp8], 31 \n\t" \
+ "xor %[temp3], %[temp3], %[temp4] \n\t" \
+ "xor %[temp1], %[temp1], %[temp5] \n\t" \
+ "xor %[temp0], %[temp0], %[temp6] \n\t" \
+ "xor %[temp8], %[temp8], %[temp7] \n\t" \
+ "subu %[temp3], %[temp3], %[temp4] \n\t" \
+ "subu %[temp1], %[temp1], %[temp5] \n\t" \
+ "subu %[temp0], %[temp0], %[temp6] \n\t" \
+ "subu %[temp8], %[temp8], %[temp7] \n\t" \
+ "lhu %[temp4], " #E "(%[w]) \n\t" \
+ "lhu %[temp5], " #F "(%[w]) \n\t" \
+ "lhu %[temp6], " #G "(%[w]) \n\t" \
+ "lhu %[temp7], " #H "(%[w]) \n\t" \
+ "madd %[temp4], %[temp3] \n\t" \
+ "madd %[temp5], %[temp1] \n\t" \
+ "madd %[temp6], %[temp0] \n\t" \
+ "madd %[temp7], %[temp8] \n\t" \
+ "lw %[temp0], " #A "(%[tmp]) \n\t" \
+ "lw %[temp1], " #C "(%[tmp]) \n\t" \
+ "lw %[temp2], " #B "(%[tmp]) \n\t" \
+ "lw %[temp3], " #D "(%[tmp]) \n\t" \
+ "addu %[temp8], %[temp0], %[temp1] \n\t" \
+ "subu %[temp0], %[temp0], %[temp1] \n\t" \
+ "addu %[temp1], %[temp2], %[temp3] \n\t" \
+ "subu %[temp2], %[temp2], %[temp3] \n\t" \
+ "addu %[temp3], %[temp8], %[temp1] \n\t" \
+ "subu %[temp1], %[temp8], %[temp1] \n\t" \
+ "addu %[temp8], %[temp0], %[temp2] \n\t" \
+ "subu %[temp0], %[temp0], %[temp2] \n\t" \
+ "sra %[temp2], %[temp3], 31 \n\t" \
+ "xor %[temp3], %[temp3], %[temp2] \n\t" \
+ "subu %[temp3], %[temp3], %[temp2] \n\t" \
+ "msub %[temp4], %[temp3] \n\t" \
+ "sra %[temp2], %[temp8], 31 \n\t" \
+ "sra %[temp3], %[temp0], 31 \n\t" \
+ "sra %[temp4], %[temp1], 31 \n\t" \
+ "xor %[temp8], %[temp8], %[temp2] \n\t" \
+ "xor %[temp0], %[temp0], %[temp3] \n\t" \
+ "xor %[temp1], %[temp1], %[temp4] \n\t" \
+ "subu %[temp8], %[temp8], %[temp2] \n\t" \
+ "subu %[temp0], %[temp0], %[temp3] \n\t" \
+ "subu %[temp1], %[temp1], %[temp4] \n\t" \
+ "msub %[temp5], %[temp8] \n\t" \
+ "msub %[temp6], %[temp0] \n\t" \
+ "msub %[temp7], %[temp1] \n\t"
+
+static int Disto4x4_MIPS32(const uint8_t* const a, const uint8_t* const b,
+ const uint16_t* const w) {
+ int tmp[32];
+ int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
+
+ __asm__ volatile(
+ HORIZONTAL_PASS(0, 0, 4, 8, 12, 64, 68, 72, 76)
+ HORIZONTAL_PASS(1, 16, 20, 24, 28, 80, 84, 88, 92)
+ HORIZONTAL_PASS(2, 32, 36, 40, 44, 96, 100, 104, 108)
+ HORIZONTAL_PASS(3, 48, 52, 56, 60, 112, 116, 120, 124)
+ "mthi $zero \n\t"
+ "mtlo $zero \n\t"
+ VERTICAL_PASS( 0, 16, 32, 48, 64, 80, 96, 112, 0, 8, 16, 24)
+ VERTICAL_PASS( 4, 20, 36, 52, 68, 84, 100, 116, 2, 10, 18, 26)
+ VERTICAL_PASS( 8, 24, 40, 56, 72, 88, 104, 120, 4, 12, 20, 28)
+ VERTICAL_PASS(12, 28, 44, 60, 76, 92, 108, 124, 6, 14, 22, 30)
+ "mflo %[temp0] \n\t"
+ "sra %[temp1], %[temp0], 31 \n\t"
+ "xor %[temp0], %[temp0], %[temp1] \n\t"
+ "subu %[temp0], %[temp0], %[temp1] \n\t"
+ "sra %[temp0], %[temp0], 5 \n\t"
+
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8)
+ : [a]"r"(a), [b]"r"(b), [w]"r"(w), [tmp]"r"(tmp)
+ : "memory", "hi", "lo"
+ );
+
+ return temp0;
+}
+
+#undef VERTICAL_PASS
+#undef HORIZONTAL_PASS
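+
+// For reference, each merged TTransform pass amounts to a 4x4 Hadamard-style
+// transform followed by a weighted sum of magnitudes, i.e. roughly (sketch,
+// with hadamard4x4() standing in for the passes above):
+//   sum_a = 0; for (i = 0; i < 16; ++i) sum_a += w[i] * abs(hadamard4x4(a)[i]);
+//   sum_b = 0; for (i = 0; i < 16; ++i) sum_b += w[i] * abs(hadamard4x4(b)[i]);
+//   return abs(sum_b - sum_a) >> 5;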
+
+static int Disto16x16_MIPS32(const uint8_t* const a, const uint8_t* const b,
+ const uint16_t* const w) {
+ int D = 0;
+ int x, y;
+ for (y = 0; y < 16 * BPS; y += 4 * BPS) {
+ for (x = 0; x < 16; x += 4) {
+ D += Disto4x4_MIPS32(a + x + y, b + x + y, w);
+ }
+ }
+ return D;
+}
+
+// macro for one horizontal pass in FTransform
+// temp0..temp15 holds tmp[0]..tmp[15]
+// A - offset in bytes to load from src and ref buffers
+// TEMP0..TEMP3 - registers for corresponding tmp elements
+#define HORIZONTAL_PASS(A, TEMP0, TEMP1, TEMP2, TEMP3) \
+ "lw %[" #TEMP1 "], 0(%[args]) \n\t" \
+ "lw %[" #TEMP2 "], 4(%[args]) \n\t" \
+ "lbu %[temp16], 0+" XSTR(BPS) "*" #A "(%[" #TEMP1 "]) \n\t" \
+ "lbu %[temp17], 0+" XSTR(BPS) "*" #A "(%[" #TEMP2 "]) \n\t" \
+ "lbu %[temp18], 1+" XSTR(BPS) "*" #A "(%[" #TEMP1 "]) \n\t" \
+ "lbu %[temp19], 1+" XSTR(BPS) "*" #A "(%[" #TEMP2 "]) \n\t" \
+ "subu %[temp20], %[temp16], %[temp17] \n\t" \
+ "lbu %[temp16], 2+" XSTR(BPS) "*" #A "(%[" #TEMP1 "]) \n\t" \
+ "lbu %[temp17], 2+" XSTR(BPS) "*" #A "(%[" #TEMP2 "]) \n\t" \
+ "subu %[" #TEMP0 "], %[temp18], %[temp19] \n\t" \
+ "lbu %[temp18], 3+" XSTR(BPS) "*" #A "(%[" #TEMP1 "]) \n\t" \
+ "lbu %[temp19], 3+" XSTR(BPS) "*" #A "(%[" #TEMP2 "]) \n\t" \
+ "subu %[" #TEMP1 "], %[temp16], %[temp17] \n\t" \
+ "subu %[" #TEMP2 "], %[temp18], %[temp19] \n\t" \
+ "addu %[" #TEMP3 "], %[temp20], %[" #TEMP2 "] \n\t" \
+ "subu %[" #TEMP2 "], %[temp20], %[" #TEMP2 "] \n\t" \
+ "addu %[temp20], %[" #TEMP0 "], %[" #TEMP1 "] \n\t" \
+ "subu %[" #TEMP0 "], %[" #TEMP0 "], %[" #TEMP1 "] \n\t" \
+ "mul %[temp16], %[" #TEMP2 "], %[c5352] \n\t" \
+ "mul %[temp17], %[" #TEMP2 "], %[c2217] \n\t" \
+ "mul %[temp18], %[" #TEMP0 "], %[c5352] \n\t" \
+ "mul %[temp19], %[" #TEMP0 "], %[c2217] \n\t" \
+ "addu %[" #TEMP1 "], %[" #TEMP3 "], %[temp20] \n\t" \
+ "subu %[temp20], %[" #TEMP3 "], %[temp20] \n\t" \
+ "sll %[" #TEMP0 "], %[" #TEMP1 "], 3 \n\t" \
+ "sll %[" #TEMP2 "], %[temp20], 3 \n\t" \
+ "addiu %[temp16], %[temp16], 1812 \n\t" \
+ "addiu %[temp17], %[temp17], 937 \n\t" \
+ "addu %[temp16], %[temp16], %[temp19] \n\t" \
+ "subu %[temp17], %[temp17], %[temp18] \n\t" \
+ "sra %[" #TEMP1 "], %[temp16], 9 \n\t" \
+ "sra %[" #TEMP3 "], %[temp17], 9 \n\t"
+
+// macro for one vertical pass in FTransform
+// temp0..temp15 holds tmp[0]..tmp[15]
+// A..D - offsets in bytes to store to out buffer
+// TEMP0, TEMP4, TEMP8 and TEMP12 - registers for corresponding tmp elements
+#define VERTICAL_PASS(A, B, C, D, TEMP0, TEMP4, TEMP8, TEMP12) \
+ "addu %[temp16], %[" #TEMP0 "], %[" #TEMP12 "] \n\t" \
+ "subu %[temp19], %[" #TEMP0 "], %[" #TEMP12 "] \n\t" \
+ "addu %[temp17], %[" #TEMP4 "], %[" #TEMP8 "] \n\t" \
+ "subu %[temp18], %[" #TEMP4 "], %[" #TEMP8 "] \n\t" \
+ "mul %[" #TEMP8 "], %[temp19], %[c2217] \n\t" \
+ "mul %[" #TEMP12 "], %[temp18], %[c2217] \n\t" \
+ "mul %[" #TEMP4 "], %[temp19], %[c5352] \n\t" \
+ "mul %[temp18], %[temp18], %[c5352] \n\t" \
+ "addiu %[temp16], %[temp16], 7 \n\t" \
+ "addu %[" #TEMP0 "], %[temp16], %[temp17] \n\t" \
+ "sra %[" #TEMP0 "], %[" #TEMP0 "], 4 \n\t" \
+ "addu %[" #TEMP12 "], %[" #TEMP12 "], %[" #TEMP4 "] \n\t" \
+ "subu %[" #TEMP4 "], %[temp16], %[temp17] \n\t" \
+ "sra %[" #TEMP4 "], %[" #TEMP4 "], 4 \n\t" \
+ "addiu %[" #TEMP8 "], %[" #TEMP8 "], 30000 \n\t" \
+ "addiu %[" #TEMP12 "], %[" #TEMP12 "], 12000 \n\t" \
+ "addiu %[" #TEMP8 "], %[" #TEMP8 "], 21000 \n\t" \
+ "subu %[" #TEMP8 "], %[" #TEMP8 "], %[temp18] \n\t" \
+ "sra %[" #TEMP12 "], %[" #TEMP12 "], 16 \n\t" \
+ "sra %[" #TEMP8 "], %[" #TEMP8 "], 16 \n\t" \
+ "addiu %[temp16], %[" #TEMP12 "], 1 \n\t" \
+ "movn %[" #TEMP12 "], %[temp16], %[temp19] \n\t" \
+ "sh %[" #TEMP0 "], " #A "(%[temp20]) \n\t" \
+ "sh %[" #TEMP4 "], " #C "(%[temp20]) \n\t" \
+ "sh %[" #TEMP8 "], " #D "(%[temp20]) \n\t" \
+ "sh %[" #TEMP12 "], " #B "(%[temp20]) \n\t"
+
+static void FTransform_MIPS32(const uint8_t* src, const uint8_t* ref,
+ int16_t* out) {
+ int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
+ int temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16;
+ int temp17, temp18, temp19, temp20;
+ const int c2217 = 2217;
+ const int c5352 = 5352;
+ const int* const args[3] =
+ { (const int*)src, (const int*)ref, (const int*)out };
+
+ __asm__ volatile(
+ HORIZONTAL_PASS(0, temp0, temp1, temp2, temp3)
+ HORIZONTAL_PASS(1, temp4, temp5, temp6, temp7)
+ HORIZONTAL_PASS(2, temp8, temp9, temp10, temp11)
+ HORIZONTAL_PASS(3, temp12, temp13, temp14, temp15)
+ "lw %[temp20], 8(%[args]) \n\t"
+ VERTICAL_PASS(0, 8, 16, 24, temp0, temp4, temp8, temp12)
+ VERTICAL_PASS(2, 10, 18, 26, temp1, temp5, temp9, temp13)
+ VERTICAL_PASS(4, 12, 20, 28, temp2, temp6, temp10, temp14)
+ VERTICAL_PASS(6, 14, 22, 30, temp3, temp7, temp11, temp15)
+
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
+ [temp9]"=&r"(temp9), [temp10]"=&r"(temp10), [temp11]"=&r"(temp11),
+ [temp12]"=&r"(temp12), [temp13]"=&r"(temp13), [temp14]"=&r"(temp14),
+ [temp15]"=&r"(temp15), [temp16]"=&r"(temp16), [temp17]"=&r"(temp17),
+ [temp18]"=&r"(temp18), [temp19]"=&r"(temp19), [temp20]"=&r"(temp20)
+ : [args]"r"(args), [c2217]"r"(c2217), [c5352]"r"(c5352)
+ : "memory", "hi", "lo"
+ );
+}
+
+#undef VERTICAL_PASS
+#undef HORIZONTAL_PASS
+
+#if !defined(WORK_AROUND_GCC)
+
+#define GET_SSE_INNER(A, B, C, D) \
+ "lbu %[temp0], " #A "(%[a]) \n\t" \
+ "lbu %[temp1], " #A "(%[b]) \n\t" \
+ "lbu %[temp2], " #B "(%[a]) \n\t" \
+ "lbu %[temp3], " #B "(%[b]) \n\t" \
+ "lbu %[temp4], " #C "(%[a]) \n\t" \
+ "lbu %[temp5], " #C "(%[b]) \n\t" \
+ "lbu %[temp6], " #D "(%[a]) \n\t" \
+ "lbu %[temp7], " #D "(%[b]) \n\t" \
+ "subu %[temp0], %[temp0], %[temp1] \n\t" \
+ "subu %[temp2], %[temp2], %[temp3] \n\t" \
+ "subu %[temp4], %[temp4], %[temp5] \n\t" \
+ "subu %[temp6], %[temp6], %[temp7] \n\t" \
+ "madd %[temp0], %[temp0] \n\t" \
+ "madd %[temp2], %[temp2] \n\t" \
+ "madd %[temp4], %[temp4] \n\t" \
+ "madd %[temp6], %[temp6] \n\t"
+
+#define GET_SSE(A, B, C, D) \
+ GET_SSE_INNER(A, A + 1, A + 2, A + 3) \
+ GET_SSE_INNER(B, B + 1, B + 2, B + 3) \
+ GET_SSE_INNER(C, C + 1, C + 2, C + 3) \
+ GET_SSE_INNER(D, D + 1, D + 2, D + 3)
+
+static int SSE16x16_MIPS32(const uint8_t* a, const uint8_t* b) {
+ int count;
+ int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+
+ __asm__ volatile(
+ "mult $zero, $zero \n\t"
+
+ GET_SSE( 0 * BPS, 4 + 0 * BPS, 8 + 0 * BPS, 12 + 0 * BPS)
+ GET_SSE( 1 * BPS, 4 + 1 * BPS, 8 + 1 * BPS, 12 + 1 * BPS)
+ GET_SSE( 2 * BPS, 4 + 2 * BPS, 8 + 2 * BPS, 12 + 2 * BPS)
+ GET_SSE( 3 * BPS, 4 + 3 * BPS, 8 + 3 * BPS, 12 + 3 * BPS)
+ GET_SSE( 4 * BPS, 4 + 4 * BPS, 8 + 4 * BPS, 12 + 4 * BPS)
+ GET_SSE( 5 * BPS, 4 + 5 * BPS, 8 + 5 * BPS, 12 + 5 * BPS)
+ GET_SSE( 6 * BPS, 4 + 6 * BPS, 8 + 6 * BPS, 12 + 6 * BPS)
+ GET_SSE( 7 * BPS, 4 + 7 * BPS, 8 + 7 * BPS, 12 + 7 * BPS)
+ GET_SSE( 8 * BPS, 4 + 8 * BPS, 8 + 8 * BPS, 12 + 8 * BPS)
+ GET_SSE( 9 * BPS, 4 + 9 * BPS, 8 + 9 * BPS, 12 + 9 * BPS)
+ GET_SSE(10 * BPS, 4 + 10 * BPS, 8 + 10 * BPS, 12 + 10 * BPS)
+ GET_SSE(11 * BPS, 4 + 11 * BPS, 8 + 11 * BPS, 12 + 11 * BPS)
+ GET_SSE(12 * BPS, 4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS)
+ GET_SSE(13 * BPS, 4 + 13 * BPS, 8 + 13 * BPS, 12 + 13 * BPS)
+ GET_SSE(14 * BPS, 4 + 14 * BPS, 8 + 14 * BPS, 12 + 14 * BPS)
+ GET_SSE(15 * BPS, 4 + 15 * BPS, 8 + 15 * BPS, 12 + 15 * BPS)
+
+ "mflo %[count] \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [count]"=&r"(count)
+ : [a]"r"(a), [b]"r"(b)
+ : "memory", "hi", "lo"
+ );
+ return count;
+}
+
+static int SSE16x8_MIPS32(const uint8_t* a, const uint8_t* b) {
+ int count;
+ int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+
+ __asm__ volatile(
+ "mult $zero, $zero \n\t"
+
+ GET_SSE( 0 * BPS, 4 + 0 * BPS, 8 + 0 * BPS, 12 + 0 * BPS)
+ GET_SSE( 1 * BPS, 4 + 1 * BPS, 8 + 1 * BPS, 12 + 1 * BPS)
+ GET_SSE( 2 * BPS, 4 + 2 * BPS, 8 + 2 * BPS, 12 + 2 * BPS)
+ GET_SSE( 3 * BPS, 4 + 3 * BPS, 8 + 3 * BPS, 12 + 3 * BPS)
+ GET_SSE( 4 * BPS, 4 + 4 * BPS, 8 + 4 * BPS, 12 + 4 * BPS)
+ GET_SSE( 5 * BPS, 4 + 5 * BPS, 8 + 5 * BPS, 12 + 5 * BPS)
+ GET_SSE( 6 * BPS, 4 + 6 * BPS, 8 + 6 * BPS, 12 + 6 * BPS)
+ GET_SSE( 7 * BPS, 4 + 7 * BPS, 8 + 7 * BPS, 12 + 7 * BPS)
+
+ "mflo %[count] \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [count]"=&r"(count)
+ : [a]"r"(a), [b]"r"(b)
+ : "memory", "hi", "lo"
+ );
+ return count;
+}
+
+static int SSE8x8_MIPS32(const uint8_t* a, const uint8_t* b) {
+ int count;
+ int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+
+ __asm__ volatile(
+ "mult $zero, $zero \n\t"
+
+ GET_SSE(0 * BPS, 4 + 0 * BPS, 1 * BPS, 4 + 1 * BPS)
+ GET_SSE(2 * BPS, 4 + 2 * BPS, 3 * BPS, 4 + 3 * BPS)
+ GET_SSE(4 * BPS, 4 + 4 * BPS, 5 * BPS, 4 + 5 * BPS)
+ GET_SSE(6 * BPS, 4 + 6 * BPS, 7 * BPS, 4 + 7 * BPS)
+
+ "mflo %[count] \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [count]"=&r"(count)
+ : [a]"r"(a), [b]"r"(b)
+ : "memory", "hi", "lo"
+ );
+ return count;
+}
+
+static int SSE4x4_MIPS32(const uint8_t* a, const uint8_t* b) {
+ int count;
+ int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+
+ __asm__ volatile(
+ "mult $zero, $zero \n\t"
+
+ GET_SSE(0 * BPS, 1 * BPS, 2 * BPS, 3 * BPS)
+
+ "mflo %[count] \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [count]"=&r"(count)
+ : [a]"r"(a), [b]"r"(b)
+ : "memory", "hi", "lo"
+ );
+ return count;
+}
+
+#undef GET_SSE
+#undef GET_SSE_INNER
+
+#endif // !WORK_AROUND_GCC
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8EncDspInitMIPS32(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMIPS32(void) {
+ VP8ITransform = ITransform_MIPS32;
+ VP8FTransform = FTransform_MIPS32;
+
+ VP8EncQuantizeBlock = QuantizeBlock_MIPS32;
+ VP8EncQuantize2Blocks = Quantize2Blocks_MIPS32;
+
+ VP8TDisto4x4 = Disto4x4_MIPS32;
+ VP8TDisto16x16 = Disto16x16_MIPS32;
+
+#if !defined(WORK_AROUND_GCC)
+ VP8SSE16x16 = SSE16x16_MIPS32;
+ VP8SSE8x8 = SSE8x8_MIPS32;
+ VP8SSE16x8 = SSE16x8_MIPS32;
+ VP8SSE4x4 = SSE4x4_MIPS32;
+#endif
+}
+
+#else // !WEBP_USE_MIPS32
+
+WEBP_DSP_INIT_STUB(VP8EncDspInitMIPS32)
+
+#endif // WEBP_USE_MIPS32
diff --git a/media/libwebp/dsp/enc_mips_dsp_r2.c b/media/libwebp/dsp/enc_mips_dsp_r2.c
new file mode 100644
index 0000000000..cf4c85b59c
--- /dev/null
+++ b/media/libwebp/dsp/enc_mips_dsp_r2.c
@@ -0,0 +1,1517 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MIPS version of speed-critical encoding functions.
+//
+// Author(s): Darko Laus (darko.laus@imgtec.com)
+// Mirko Raus (mirko.raus@imgtec.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_MIPS_DSP_R2)
+
+#include "../dsp/mips_macro.h"
+#include "../enc/cost_enc.h"
+#include "../enc/vp8i_enc.h"
+
+static const int kC1 = 20091 + (1 << 16);
+static const int kC2 = 35468;
+
+// O - output
+// I - input (macro doesn't change it)
+#define ADD_SUB_HALVES_X4(O0, O1, O2, O3, O4, O5, O6, O7, \
+ I0, I1, I2, I3, I4, I5, I6, I7) \
+ "addq.ph %[" #O0 "], %[" #I0 "], %[" #I1 "] \n\t" \
+ "subq.ph %[" #O1 "], %[" #I0 "], %[" #I1 "] \n\t" \
+ "addq.ph %[" #O2 "], %[" #I2 "], %[" #I3 "] \n\t" \
+ "subq.ph %[" #O3 "], %[" #I2 "], %[" #I3 "] \n\t" \
+ "addq.ph %[" #O4 "], %[" #I4 "], %[" #I5 "] \n\t" \
+ "subq.ph %[" #O5 "], %[" #I4 "], %[" #I5 "] \n\t" \
+ "addq.ph %[" #O6 "], %[" #I6 "], %[" #I7 "] \n\t" \
+ "subq.ph %[" #O7 "], %[" #I6 "], %[" #I7 "] \n\t"
+
+// IO - input/output
+#define ABS_X8(IO0, IO1, IO2, IO3, IO4, IO5, IO6, IO7) \
+ "absq_s.ph %[" #IO0 "], %[" #IO0 "] \n\t" \
+ "absq_s.ph %[" #IO1 "], %[" #IO1 "] \n\t" \
+ "absq_s.ph %[" #IO2 "], %[" #IO2 "] \n\t" \
+ "absq_s.ph %[" #IO3 "], %[" #IO3 "] \n\t" \
+ "absq_s.ph %[" #IO4 "], %[" #IO4 "] \n\t" \
+ "absq_s.ph %[" #IO5 "], %[" #IO5 "] \n\t" \
+ "absq_s.ph %[" #IO6 "], %[" #IO6 "] \n\t" \
+ "absq_s.ph %[" #IO7 "], %[" #IO7 "] \n\t"
+
+// dpa.w.ph $ac0, temp0, temp1
+// $ac += temp0[31..16] * temp1[31..16] + temp0[15..0] * temp1[15..0]
+// dpax.w.ph $ac0, temp0, temp1
+// $ac += temp0[31..16] * temp1[15..0] + temp0[15..0] * temp1[31..16]
+// O - output
+// I - input (macro doesn't change it)
+#define MUL_HALF(O0, I0, I1, I2, I3, I4, I5, I6, I7, \
+ I8, I9, I10, I11, I12, I13, I14, I15) \
+ "mult $ac0, $zero, $zero \n\t" \
+ "dpa.w.ph $ac0, %[" #I2 "], %[" #I0 "] \n\t" \
+ "dpax.w.ph $ac0, %[" #I5 "], %[" #I6 "] \n\t" \
+ "dpa.w.ph $ac0, %[" #I8 "], %[" #I9 "] \n\t" \
+ "dpax.w.ph $ac0, %[" #I11 "], %[" #I4 "] \n\t" \
+ "dpa.w.ph $ac0, %[" #I12 "], %[" #I7 "] \n\t" \
+ "dpax.w.ph $ac0, %[" #I13 "], %[" #I1 "] \n\t" \
+ "dpa.w.ph $ac0, %[" #I14 "], %[" #I3 "] \n\t" \
+ "dpax.w.ph $ac0, %[" #I15 "], %[" #I10 "] \n\t" \
+ "mflo %[" #O0 "], $ac0 \n\t"
+
+#define OUTPUT_EARLY_CLOBBER_REGS_17() \
+ OUTPUT_EARLY_CLOBBER_REGS_10(), \
+ [temp11]"=&r"(temp11), [temp12]"=&r"(temp12), [temp13]"=&r"(temp13), \
+ [temp14]"=&r"(temp14), [temp15]"=&r"(temp15), [temp16]"=&r"(temp16), \
+ [temp17]"=&r"(temp17)
+
+// macro for one horizontal pass in FTransform
+// temp0..temp15 holds tmp[0]..tmp[15]
+// A - offset in bytes to load from src and ref buffers
+// TEMP0..TEMP3 - registers for corresponding tmp elements
+#define HORIZONTAL_PASS(A, TEMP0, TEMP1, TEMP2, TEMP3) \
+ "lw %[" #TEMP0 "], 0(%[args]) \n\t" \
+ "lw %[" #TEMP1 "], 4(%[args]) \n\t" \
+ "lw %[" #TEMP2 "], " XSTR(BPS) "*" #A "(%[" #TEMP0 "]) \n\t" \
+ "lw %[" #TEMP3 "], " XSTR(BPS) "*" #A "(%[" #TEMP1 "]) \n\t" \
+ "preceu.ph.qbl %[" #TEMP0 "], %[" #TEMP2 "] \n\t" \
+ "preceu.ph.qbl %[" #TEMP1 "], %[" #TEMP3 "] \n\t" \
+ "preceu.ph.qbr %[" #TEMP2 "], %[" #TEMP2 "] \n\t" \
+ "preceu.ph.qbr %[" #TEMP3 "], %[" #TEMP3 "] \n\t" \
+ "subq.ph %[" #TEMP0 "], %[" #TEMP0 "], %[" #TEMP1 "] \n\t" \
+ "subq.ph %[" #TEMP2 "], %[" #TEMP2 "], %[" #TEMP3 "] \n\t" \
+ "rotr %[" #TEMP0 "], %[" #TEMP0 "], 16 \n\t" \
+ "addq.ph %[" #TEMP1 "], %[" #TEMP2 "], %[" #TEMP0 "] \n\t" \
+ "subq.ph %[" #TEMP3 "], %[" #TEMP2 "], %[" #TEMP0 "] \n\t" \
+ "seh %[" #TEMP0 "], %[" #TEMP1 "] \n\t" \
+ "sra %[temp16], %[" #TEMP1 "], 16 \n\t" \
+ "seh %[temp19], %[" #TEMP3 "] \n\t" \
+ "sra %[" #TEMP3 "], %[" #TEMP3 "], 16 \n\t" \
+ "subu %[" #TEMP2 "], %[" #TEMP0 "], %[temp16] \n\t" \
+ "addu %[" #TEMP0 "], %[" #TEMP0 "], %[temp16] \n\t" \
+ "mul %[temp17], %[temp19], %[c2217] \n\t" \
+ "mul %[temp18], %[" #TEMP3 "], %[c5352] \n\t" \
+ "mul %[" #TEMP1 "], %[temp19], %[c5352] \n\t" \
+ "mul %[temp16], %[" #TEMP3 "], %[c2217] \n\t" \
+ "sll %[" #TEMP2 "], %[" #TEMP2 "], 3 \n\t" \
+ "sll %[" #TEMP0 "], %[" #TEMP0 "], 3 \n\t" \
+ "subu %[" #TEMP3 "], %[temp17], %[temp18] \n\t" \
+ "addu %[" #TEMP1 "], %[temp16], %[" #TEMP1 "] \n\t" \
+ "addiu %[" #TEMP3 "], %[" #TEMP3 "], 937 \n\t" \
+ "addiu %[" #TEMP1 "], %[" #TEMP1 "], 1812 \n\t" \
+ "sra %[" #TEMP3 "], %[" #TEMP3 "], 9 \n\t" \
+ "sra %[" #TEMP1 "], %[" #TEMP1 "], 9 \n\t"
+
+// macro for one vertical pass in FTransform
+// temp0..temp15 holds tmp[0]..tmp[15]
+// A..D - offsets in bytes to store to out buffer
+// TEMP0, TEMP4, TEMP8 and TEMP12 - registers for corresponding tmp elements
+#define VERTICAL_PASS(A, B, C, D, TEMP0, TEMP4, TEMP8, TEMP12) \
+ "addu %[temp16], %[" #TEMP0 "], %[" #TEMP12 "] \n\t" \
+ "subu %[temp19], %[" #TEMP0 "], %[" #TEMP12 "] \n\t" \
+ "addu %[temp17], %[" #TEMP4 "], %[" #TEMP8 "] \n\t" \
+ "subu %[temp18], %[" #TEMP4 "], %[" #TEMP8 "] \n\t" \
+ "mul %[" #TEMP8 "], %[temp19], %[c2217] \n\t" \
+ "mul %[" #TEMP12 "], %[temp18], %[c2217] \n\t" \
+ "mul %[" #TEMP4 "], %[temp19], %[c5352] \n\t" \
+ "mul %[temp18], %[temp18], %[c5352] \n\t" \
+ "addiu %[temp16], %[temp16], 7 \n\t" \
+ "addu %[" #TEMP0 "], %[temp16], %[temp17] \n\t" \
+ "sra %[" #TEMP0 "], %[" #TEMP0 "], 4 \n\t" \
+ "addu %[" #TEMP12 "], %[" #TEMP12 "], %[" #TEMP4 "] \n\t" \
+ "subu %[" #TEMP4 "], %[temp16], %[temp17] \n\t" \
+ "sra %[" #TEMP4 "], %[" #TEMP4 "], 4 \n\t" \
+ "addiu %[" #TEMP8 "], %[" #TEMP8 "], 30000 \n\t" \
+ "addiu %[" #TEMP12 "], %[" #TEMP12 "], 12000 \n\t" \
+ "addiu %[" #TEMP8 "], %[" #TEMP8 "], 21000 \n\t" \
+ "subu %[" #TEMP8 "], %[" #TEMP8 "], %[temp18] \n\t" \
+ "sra %[" #TEMP12 "], %[" #TEMP12 "], 16 \n\t" \
+ "sra %[" #TEMP8 "], %[" #TEMP8 "], 16 \n\t" \
+ "addiu %[temp16], %[" #TEMP12 "], 1 \n\t" \
+ "movn %[" #TEMP12 "], %[temp16], %[temp19] \n\t" \
+ "sh %[" #TEMP0 "], " #A "(%[temp20]) \n\t" \
+ "sh %[" #TEMP4 "], " #C "(%[temp20]) \n\t" \
+ "sh %[" #TEMP8 "], " #D "(%[temp20]) \n\t" \
+ "sh %[" #TEMP12 "], " #B "(%[temp20]) \n\t"
+
+static void FTransform_MIPSdspR2(const uint8_t* src, const uint8_t* ref,
+ int16_t* out) {
+ const int c2217 = 2217;
+ const int c5352 = 5352;
+ int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
+ int temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16;
+ int temp17, temp18, temp19, temp20;
+ const int* const args[3] =
+ { (const int*)src, (const int*)ref, (const int*)out };
+
+ __asm__ volatile (
+ HORIZONTAL_PASS(0, temp0, temp1, temp2, temp3)
+ HORIZONTAL_PASS(1, temp4, temp5, temp6, temp7)
+ HORIZONTAL_PASS(2, temp8, temp9, temp10, temp11)
+ HORIZONTAL_PASS(3, temp12, temp13, temp14, temp15)
+ "lw %[temp20], 8(%[args]) \n\t"
+ VERTICAL_PASS(0, 8, 16, 24, temp0, temp4, temp8, temp12)
+ VERTICAL_PASS(2, 10, 18, 26, temp1, temp5, temp9, temp13)
+ VERTICAL_PASS(4, 12, 20, 28, temp2, temp6, temp10, temp14)
+ VERTICAL_PASS(6, 14, 22, 30, temp3, temp7, temp11, temp15)
+ OUTPUT_EARLY_CLOBBER_REGS_18(),
+ [temp0]"=&r"(temp0), [temp19]"=&r"(temp19), [temp20]"=&r"(temp20)
+ : [args]"r"(args), [c2217]"r"(c2217), [c5352]"r"(c5352)
+ : "memory", "hi", "lo"
+ );
+}
+
+#undef VERTICAL_PASS
+#undef HORIZONTAL_PASS
+
+static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
+ uint8_t* dst) {
+ int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
+ int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18;
+
+ __asm__ volatile (
+ "ulw %[temp1], 0(%[in]) \n\t"
+ "ulw %[temp2], 16(%[in]) \n\t"
+ LOAD_IN_X2(temp5, temp6, 24, 26)
+ ADD_SUB_HALVES(temp3, temp4, temp1, temp2)
+ LOAD_IN_X2(temp1, temp2, 8, 10)
+ MUL_SHIFT_SUM(temp7, temp8, temp9, temp10, temp11, temp12, temp13, temp14,
+ temp10, temp8, temp9, temp7, temp1, temp2, temp5, temp6,
+ temp13, temp11, temp14, temp12)
+ INSERT_HALF_X2(temp8, temp7, temp10, temp9)
+ "ulw %[temp17], 4(%[in]) \n\t"
+ "ulw %[temp18], 20(%[in]) \n\t"
+ ADD_SUB_HALVES(temp1, temp2, temp3, temp8)
+ ADD_SUB_HALVES(temp5, temp6, temp4, temp7)
+ ADD_SUB_HALVES(temp7, temp8, temp17, temp18)
+ LOAD_IN_X2(temp17, temp18, 12, 14)
+ LOAD_IN_X2(temp9, temp10, 28, 30)
+ MUL_SHIFT_SUM(temp11, temp12, temp13, temp14, temp15, temp16, temp4, temp17,
+ temp12, temp14, temp11, temp13, temp17, temp18, temp9, temp10,
+ temp15, temp4, temp16, temp17)
+ INSERT_HALF_X2(temp11, temp12, temp13, temp14)
+ ADD_SUB_HALVES(temp17, temp8, temp8, temp11)
+ ADD_SUB_HALVES(temp3, temp4, temp7, temp12)
+
+ // horizontal
+ SRA_16(temp9, temp10, temp11, temp12, temp1, temp2, temp5, temp6)
+ INSERT_HALF_X2(temp1, temp6, temp5, temp2)
+ SRA_16(temp13, temp14, temp15, temp16, temp3, temp4, temp17, temp8)
+ "repl.ph %[temp2], 0x4 \n\t"
+ INSERT_HALF_X2(temp3, temp8, temp17, temp4)
+ "addq.ph %[temp1], %[temp1], %[temp2] \n\t"
+ "addq.ph %[temp6], %[temp6], %[temp2] \n\t"
+ ADD_SUB_HALVES(temp2, temp4, temp1, temp3)
+ ADD_SUB_HALVES(temp5, temp7, temp6, temp8)
+ MUL_SHIFT_SUM(temp1, temp3, temp6, temp8, temp9, temp13, temp17, temp18,
+ temp3, temp13, temp1, temp9, temp9, temp13, temp11, temp15,
+ temp6, temp17, temp8, temp18)
+ MUL_SHIFT_SUM(temp6, temp8, temp18, temp17, temp11, temp15, temp12, temp16,
+ temp8, temp15, temp6, temp11, temp12, temp16, temp10, temp14,
+ temp18, temp12, temp17, temp16)
+ INSERT_HALF_X2(temp1, temp3, temp9, temp13)
+ INSERT_HALF_X2(temp6, temp8, temp11, temp15)
+ SHIFT_R_SUM_X2(temp9, temp10, temp11, temp12, temp13, temp14, temp15,
+ temp16, temp2, temp4, temp5, temp7, temp3, temp1, temp8,
+ temp6)
+ PACK_2_HALVES_TO_WORD(temp1, temp2, temp3, temp4, temp9, temp12, temp13,
+ temp16, temp11, temp10, temp15, temp14)
+ LOAD_WITH_OFFSET_X4(temp10, temp11, temp14, temp15, ref,
+ 0, 0, 0, 0,
+ 0, 1, 2, 3,
+ BPS)
+ CONVERT_2_BYTES_TO_HALF(temp5, temp6, temp7, temp8, temp17, temp18, temp10,
+ temp11, temp10, temp11, temp14, temp15)
+ STORE_SAT_SUM_X2(temp5, temp6, temp7, temp8, temp17, temp18, temp10, temp11,
+ temp9, temp12, temp1, temp2, temp13, temp16, temp3, temp4,
+ dst, 0, 1, 2, 3, BPS)
+
+ OUTPUT_EARLY_CLOBBER_REGS_18()
+ : [dst]"r"(dst), [in]"r"(in), [kC1]"r"(kC1), [kC2]"r"(kC2), [ref]"r"(ref)
+ : "memory", "hi", "lo"
+ );
+}
+
+static void ITransform_MIPSdspR2(const uint8_t* ref, const int16_t* in,
+ uint8_t* dst, int do_two) {
+ ITransformOne(ref, in, dst);
+ if (do_two) {
+ ITransformOne(ref + 4, in + 16, dst + 4);
+ }
+}
+
+static int Disto4x4_MIPSdspR2(const uint8_t* const a, const uint8_t* const b,
+ const uint16_t* const w) {
+ int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
+ int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17;
+
+ __asm__ volatile (
+ LOAD_WITH_OFFSET_X4(temp1, temp2, temp3, temp4, a,
+ 0, 0, 0, 0,
+ 0, 1, 2, 3,
+ BPS)
+ CONVERT_2_BYTES_TO_HALF(temp5, temp6, temp7, temp8, temp9, temp10, temp11,
+ temp12, temp1, temp2, temp3, temp4)
+ ADD_SUB_HALVES_X4(temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8,
+ temp5, temp6, temp7, temp8, temp9, temp10, temp11, temp12)
+ PACK_2_HALVES_TO_WORD(temp9, temp10, temp11, temp12, temp1, temp3, temp5,
+ temp7, temp2, temp4, temp6, temp8)
+ ADD_SUB_HALVES_X4(temp2, temp4, temp6, temp8, temp9, temp1, temp3, temp10,
+ temp1, temp9, temp3, temp10, temp5, temp11, temp7, temp12)
+ ADD_SUB_HALVES_X4(temp5, temp11, temp7, temp2, temp9, temp3, temp6, temp12,
+ temp2, temp9, temp6, temp3, temp4, temp1, temp8, temp10)
+ ADD_SUB_HALVES_X4(temp1, temp4, temp10, temp8, temp7, temp11, temp5, temp2,
+ temp5, temp7, temp11, temp2, temp9, temp6, temp3, temp12)
+ ABS_X8(temp1, temp4, temp10, temp8, temp7, temp11, temp5, temp2)
+ LOAD_WITH_OFFSET_X4(temp3, temp6, temp9, temp12, w,
+ 0, 4, 8, 12,
+ 0, 0, 0, 0,
+ 0)
+ LOAD_WITH_OFFSET_X4(temp13, temp14, temp15, temp16, w,
+ 0, 4, 8, 12,
+ 1, 1, 1, 1,
+ 16)
+ MUL_HALF(temp17, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8,
+ temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16)
+ LOAD_WITH_OFFSET_X4(temp1, temp2, temp3, temp4, b,
+ 0, 0, 0, 0,
+ 0, 1, 2, 3,
+ BPS)
+ CONVERT_2_BYTES_TO_HALF(temp5, temp6, temp7, temp8, temp9, temp10, temp11,
+ temp12, temp1, temp2, temp3, temp4)
+ ADD_SUB_HALVES_X4(temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8,
+ temp5, temp6, temp7, temp8, temp9, temp10, temp11, temp12)
+ PACK_2_HALVES_TO_WORD(temp9, temp10, temp11, temp12, temp1, temp3, temp5,
+ temp7, temp2, temp4, temp6, temp8)
+ ADD_SUB_HALVES_X4(temp2, temp4, temp6, temp8, temp9, temp1, temp3, temp10,
+ temp1, temp9, temp3, temp10, temp5, temp11, temp7, temp12)
+ ADD_SUB_HALVES_X4(temp5, temp11, temp7, temp2, temp9, temp3, temp6, temp12,
+ temp2, temp9, temp6, temp3, temp4, temp1, temp8, temp10)
+ ADD_SUB_HALVES_X4(temp1, temp4, temp10, temp8, temp7, temp11, temp5, temp2,
+ temp5, temp7, temp11, temp2, temp9, temp6, temp3, temp12)
+ ABS_X8(temp1, temp4, temp10, temp8, temp7, temp11, temp5, temp2)
+ LOAD_WITH_OFFSET_X4(temp3, temp6, temp9, temp12, w,
+ 0, 4, 8, 12,
+ 0, 0, 0, 0,
+ 0)
+ LOAD_WITH_OFFSET_X4(temp13, temp14, temp15, temp16, w,
+ 0, 4, 8, 12,
+ 1, 1, 1, 1,
+ 16)
+ MUL_HALF(temp3, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8,
+ temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16)
+ OUTPUT_EARLY_CLOBBER_REGS_17()
+ : [a]"r"(a), [b]"r"(b), [w]"r"(w)
+ : "memory", "hi", "lo"
+ );
+ return abs(temp3 - temp17) >> 5;
+}
+
+static int Disto16x16_MIPSdspR2(const uint8_t* const a,
+ const uint8_t* const b,
+ const uint16_t* const w) {
+ int D = 0;
+ int x, y;
+ for (y = 0; y < 16 * BPS; y += 4 * BPS) {
+ for (x = 0; x < 16; x += 4) {
+ D += Disto4x4_MIPSdspR2(a + x + y, b + x + y, w);
+ }
+ }
+ return D;
+}
+
+//------------------------------------------------------------------------------
+// Intra predictions
+
+#define FILL_PART(J, SIZE) \
+ "usw %[value], 0+" #J "*" XSTR(BPS) "(%[dst]) \n\t" \
+ "usw %[value], 4+" #J "*" XSTR(BPS) "(%[dst]) \n\t" \
+ ".if " #SIZE " == 16 \n\t" \
+ "usw %[value], 8+" #J "*" XSTR(BPS) "(%[dst]) \n\t" \
+ "usw %[value], 12+" #J "*" XSTR(BPS) "(%[dst]) \n\t" \
+ ".endif \n\t"
+
+#define FILL_8_OR_16(DST, VALUE, SIZE) do { \
+ int value = (VALUE); \
+ __asm__ volatile ( \
+ "replv.qb %[value], %[value] \n\t" \
+ FILL_PART( 0, SIZE) \
+ FILL_PART( 1, SIZE) \
+ FILL_PART( 2, SIZE) \
+ FILL_PART( 3, SIZE) \
+ FILL_PART( 4, SIZE) \
+ FILL_PART( 5, SIZE) \
+ FILL_PART( 6, SIZE) \
+ FILL_PART( 7, SIZE) \
+ ".if " #SIZE " == 16 \n\t" \
+ FILL_PART( 8, 16) \
+ FILL_PART( 9, 16) \
+ FILL_PART(10, 16) \
+ FILL_PART(11, 16) \
+ FILL_PART(12, 16) \
+ FILL_PART(13, 16) \
+ FILL_PART(14, 16) \
+ FILL_PART(15, 16) \
+ ".endif \n\t" \
+ : [value]"+&r"(value) \
+ : [dst]"r"((DST)) \
+ : "memory" \
+ ); \
+} while (0)
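+
+// Scalar equivalent of FILL_8_OR_16 (sketch):
+//   for (j = 0; j < SIZE; ++j) memset((DST) + j * BPS, VALUE, SIZE);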
+
+#define VERTICAL_PRED(DST, TOP, SIZE) \
+static WEBP_INLINE void VerticalPred##SIZE(uint8_t* (DST), \
+ const uint8_t* (TOP)) { \
+ int j; \
+ if ((TOP)) { \
+ for (j = 0; j < (SIZE); ++j) memcpy((DST) + j * BPS, (TOP), (SIZE)); \
+ } else { \
+ FILL_8_OR_16((DST), 127, (SIZE)); \
+ } \
+}
+
+VERTICAL_PRED(dst, top, 8)
+VERTICAL_PRED(dst, top, 16)
+
+#undef VERTICAL_PRED
+
+#define HORIZONTAL_PRED(DST, LEFT, SIZE) \
+static WEBP_INLINE void HorizontalPred##SIZE(uint8_t* (DST), \
+ const uint8_t* (LEFT)) { \
+ if (LEFT) { \
+ int j; \
+ for (j = 0; j < (SIZE); ++j) { \
+ memset((DST) + j * BPS, (LEFT)[j], (SIZE)); \
+ } \
+ } else { \
+ FILL_8_OR_16((DST), 129, (SIZE)); \
+ } \
+}
+
+HORIZONTAL_PRED(dst, left, 8)
+HORIZONTAL_PRED(dst, left, 16)
+
+#undef HORIZONTAL_PRED
+
+#define CLIPPING() \
+ "preceu.ph.qbl %[temp2], %[temp0] \n\t" \
+ "preceu.ph.qbr %[temp0], %[temp0] \n\t" \
+ "preceu.ph.qbl %[temp3], %[temp1] \n\t" \
+ "preceu.ph.qbr %[temp1], %[temp1] \n\t" \
+ "addu.ph %[temp2], %[temp2], %[leftY_1] \n\t" \
+ "addu.ph %[temp0], %[temp0], %[leftY_1] \n\t" \
+ "addu.ph %[temp3], %[temp3], %[leftY_1] \n\t" \
+ "addu.ph %[temp1], %[temp1], %[leftY_1] \n\t" \
+ "shll_s.ph %[temp2], %[temp2], 7 \n\t" \
+ "shll_s.ph %[temp0], %[temp0], 7 \n\t" \
+ "shll_s.ph %[temp3], %[temp3], 7 \n\t" \
+ "shll_s.ph %[temp1], %[temp1], 7 \n\t" \
+ "precrqu_s.qb.ph %[temp0], %[temp2], %[temp0] \n\t" \
+ "precrqu_s.qb.ph %[temp1], %[temp3], %[temp1] \n\t"
+
+#define CLIP_8B_TO_DST(DST, LEFT, TOP, SIZE) do { \
+ int leftY_1 = ((int)(LEFT)[y] << 16) + (LEFT)[y]; \
+ int temp0, temp1, temp2, temp3; \
+ __asm__ volatile ( \
+ "replv.ph %[leftY_1], %[leftY_1] \n\t" \
+ "ulw %[temp0], 0(%[top]) \n\t" \
+ "ulw %[temp1], 4(%[top]) \n\t" \
+ "subu.ph %[leftY_1], %[leftY_1], %[left_1] \n\t" \
+ CLIPPING() \
+ "usw %[temp0], 0(%[dst]) \n\t" \
+ "usw %[temp1], 4(%[dst]) \n\t" \
+ ".if " #SIZE " == 16 \n\t" \
+ "ulw %[temp0], 8(%[top]) \n\t" \
+ "ulw %[temp1], 12(%[top]) \n\t" \
+ CLIPPING() \
+ "usw %[temp0], 8(%[dst]) \n\t" \
+ "usw %[temp1], 12(%[dst]) \n\t" \
+ ".endif \n\t" \
+ : [leftY_1]"+&r"(leftY_1), [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), \
+ [temp2]"=&r"(temp2), [temp3]"=&r"(temp3) \
+ : [left_1]"r"(left_1), [top]"r"((TOP)), [dst]"r"((DST)) \
+ : "memory" \
+ ); \
+} while (0)
+
+#define CLIP_TO_DST(DST, LEFT, TOP, SIZE) do { \
+ int y; \
+ const int left_1 = ((int)(LEFT)[-1] << 16) + (LEFT)[-1]; \
+ for (y = 0; y < (SIZE); ++y) { \
+ CLIP_8B_TO_DST((DST), (LEFT), (TOP), (SIZE)); \
+ (DST) += BPS; \
+ } \
+} while (0)
+
+#define TRUE_MOTION(DST, LEFT, TOP, SIZE) \
+static WEBP_INLINE void TrueMotion##SIZE(uint8_t* (DST), const uint8_t* (LEFT),\
+ const uint8_t* (TOP)) { \
+ if ((LEFT) != NULL) { \
+ if ((TOP) != NULL) { \
+ CLIP_TO_DST((DST), (LEFT), (TOP), (SIZE)); \
+ } else { \
+ HorizontalPred##SIZE((DST), (LEFT)); \
+ } \
+ } else { \
+ /* true motion without left samples (hence: with default 129 value) */ \
+ /* is equivalent to VE prediction where you just copy the top samples. */ \
+ /* Note that if top samples are not available, the default value is */ \
+ /* then 129, and not 127 as in the VerticalPred case. */ \
+ if ((TOP) != NULL) { \
+ VerticalPred##SIZE((DST), (TOP)); \
+ } else { \
+ FILL_8_OR_16((DST), 129, (SIZE)); \
+ } \
+ } \
+}
+
+TRUE_MOTION(dst, left, top, 8)
+TRUE_MOTION(dst, left, top, 16)
+
+#undef TRUE_MOTION
+#undef CLIP_TO_DST
+#undef CLIP_8B_TO_DST
+#undef CLIPPING
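+
+// Per pixel, the TrueMotion kernel above computes (sketch):
+//   dst[x] = clip_8b(left[y] + top[x] - left[-1]);
+// the shll_s.ph (saturating shift) / precrqu_s.qb.ph pair implements the
+// clip to [0, 255].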
+
+static WEBP_INLINE void DCMode16(uint8_t* dst, const uint8_t* left,
+ const uint8_t* top) {
+ int DC, DC1;
+ int temp0, temp1, temp2, temp3;
+
+ __asm__ volatile(
+ "beqz %[top], 2f \n\t"
+ LOAD_WITH_OFFSET_X4(temp0, temp1, temp2, temp3, top,
+ 0, 4, 8, 12,
+ 0, 0, 0, 0,
+ 0)
+ "raddu.w.qb %[temp0], %[temp0] \n\t"
+ "raddu.w.qb %[temp1], %[temp1] \n\t"
+ "raddu.w.qb %[temp2], %[temp2] \n\t"
+ "raddu.w.qb %[temp3], %[temp3] \n\t"
+ "addu %[temp0], %[temp0], %[temp1] \n\t"
+ "addu %[temp2], %[temp2], %[temp3] \n\t"
+ "addu %[DC], %[temp0], %[temp2] \n\t"
+ "move %[DC1], %[DC] \n\t"
+ "beqz %[left], 1f \n\t"
+ LOAD_WITH_OFFSET_X4(temp0, temp1, temp2, temp3, left,
+ 0, 4, 8, 12,
+ 0, 0, 0, 0,
+ 0)
+ "raddu.w.qb %[temp0], %[temp0] \n\t"
+ "raddu.w.qb %[temp1], %[temp1] \n\t"
+ "raddu.w.qb %[temp2], %[temp2] \n\t"
+ "raddu.w.qb %[temp3], %[temp3] \n\t"
+ "addu %[temp0], %[temp0], %[temp1] \n\t"
+ "addu %[temp2], %[temp2], %[temp3] \n\t"
+ "addu %[DC1], %[temp0], %[temp2] \n\t"
+ "1: \n\t"
+ "addu %[DC], %[DC], %[DC1] \n\t"
+ "j 3f \n\t"
+ "2: \n\t"
+ "beqz %[left], 4f \n\t"
+ LOAD_WITH_OFFSET_X4(temp0, temp1, temp2, temp3, left,
+ 0, 4, 8, 12,
+ 0, 0, 0, 0,
+ 0)
+ "raddu.w.qb %[temp0], %[temp0] \n\t"
+ "raddu.w.qb %[temp1], %[temp1] \n\t"
+ "raddu.w.qb %[temp2], %[temp2] \n\t"
+ "raddu.w.qb %[temp3], %[temp3] \n\t"
+ "addu %[temp0], %[temp0], %[temp1] \n\t"
+ "addu %[temp2], %[temp2], %[temp3] \n\t"
+ "addu %[DC], %[temp0], %[temp2] \n\t"
+ "addu %[DC], %[DC], %[DC] \n\t"
+ "3: \n\t"
+ "shra_r.w %[DC], %[DC], 5 \n\t"
+ "j 5f \n\t"
+ "4: \n\t"
+ "li %[DC], 0x80 \n\t"
+ "5: \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [DC]"=&r"(DC),
+ [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [DC1]"=&r"(DC1)
+ : [left]"r"(left), [top]"r"(top)
+ : "memory"
+ );
+
+ FILL_8_OR_16(dst, DC, 16);
+}
+
+static WEBP_INLINE void DCMode8(uint8_t* dst, const uint8_t* left,
+ const uint8_t* top) {
+ int DC, DC1;
+ int temp0, temp1, temp2, temp3;
+
+ __asm__ volatile(
+ "beqz %[top], 2f \n\t"
+ "ulw %[temp0], 0(%[top]) \n\t"
+ "ulw %[temp1], 4(%[top]) \n\t"
+ "raddu.w.qb %[temp0], %[temp0] \n\t"
+ "raddu.w.qb %[temp1], %[temp1] \n\t"
+ "addu %[DC], %[temp0], %[temp1] \n\t"
+ "move %[DC1], %[DC] \n\t"
+ "beqz %[left], 1f \n\t"
+ "ulw %[temp2], 0(%[left]) \n\t"
+ "ulw %[temp3], 4(%[left]) \n\t"
+ "raddu.w.qb %[temp2], %[temp2] \n\t"
+ "raddu.w.qb %[temp3], %[temp3] \n\t"
+ "addu %[DC1], %[temp2], %[temp3] \n\t"
+ "1: \n\t"
+ "addu %[DC], %[DC], %[DC1] \n\t"
+ "j 3f \n\t"
+ "2: \n\t"
+ "beqz %[left], 4f \n\t"
+ "ulw %[temp2], 0(%[left]) \n\t"
+ "ulw %[temp3], 4(%[left]) \n\t"
+ "raddu.w.qb %[temp2], %[temp2] \n\t"
+ "raddu.w.qb %[temp3], %[temp3] \n\t"
+ "addu %[DC], %[temp2], %[temp3] \n\t"
+ "addu %[DC], %[DC], %[DC] \n\t"
+ "3: \n\t"
+ "shra_r.w %[DC], %[DC], 4 \n\t"
+ "j 5f \n\t"
+ "4: \n\t"
+ "li %[DC], 0x80 \n\t"
+ "5: \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [DC]"=&r"(DC),
+ [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [DC1]"=&r"(DC1)
+ : [left]"r"(left), [top]"r"(top)
+ : "memory"
+ );
+
+ FILL_8_OR_16(dst, DC, 8);
+}
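+
+// Both DCMode kernels reduce to (sketch, SIZE = 16 or 8):
+//   DC = sum(top[0..SIZE-1]) + sum(left[0..SIZE-1]);  // one side is doubled
+//                                                     // when the other is
+//                                                     // missing
+//   DC = (DC + SIZE) >> (SIZE == 16 ? 5 : 4);         // shra_r.w rounding
+//   if (top == NULL && left == NULL) DC = 0x80;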
+
+static void DC4(uint8_t* dst, const uint8_t* top) {
+ int temp0, temp1;
+ __asm__ volatile(
+ "ulw %[temp0], 0(%[top]) \n\t"
+ "ulw %[temp1], -5(%[top]) \n\t"
+ "raddu.w.qb %[temp0], %[temp0] \n\t"
+ "raddu.w.qb %[temp1], %[temp1] \n\t"
+ "addu %[temp0], %[temp0], %[temp1] \n\t"
+ "addiu %[temp0], %[temp0], 4 \n\t"
+ "srl %[temp0], %[temp0], 3 \n\t"
+ "replv.qb %[temp0], %[temp0] \n\t"
+ "usw %[temp0], 0*" XSTR(BPS) "(%[dst]) \n\t"
+ "usw %[temp0], 1*" XSTR(BPS) "(%[dst]) \n\t"
+ "usw %[temp0], 2*" XSTR(BPS) "(%[dst]) \n\t"
+ "usw %[temp0], 3*" XSTR(BPS) "(%[dst]) \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1)
+ : [top]"r"(top), [dst]"r"(dst)
+ : "memory"
+ );
+}
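+
+// DC4 averages the four top and four left samples; in the i4 boundary layout
+// the left samples sit at top[-5..-2], hence the "ulw ... -5(%[top])" load:
+//   dc = (sum(top[0..3]) + sum(top[-5..-2]) + 4) >> 3;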
+
+static void TM4(uint8_t* dst, const uint8_t* top) {
+ int a10, a32, temp0, temp1, temp2, temp3, temp4, temp5;
+ const int c35 = 0xff00ff;
+ __asm__ volatile (
+ "lbu %[temp1], 0(%[top]) \n\t"
+ "lbu %[a10], 1(%[top]) \n\t"
+ "lbu %[temp2], 2(%[top]) \n\t"
+ "lbu %[a32], 3(%[top]) \n\t"
+ "ulw %[temp0], -5(%[top]) \n\t"
+ "lbu %[temp4], -1(%[top]) \n\t"
+ "append %[a10], %[temp1], 16 \n\t"
+ "append %[a32], %[temp2], 16 \n\t"
+ "replv.ph %[temp4], %[temp4] \n\t"
+ "shrl.ph %[temp1], %[temp0], 8 \n\t"
+ "and %[temp0], %[temp0], %[c35] \n\t"
+ "subu.ph %[temp1], %[temp1], %[temp4] \n\t"
+ "subu.ph %[temp0], %[temp0], %[temp4] \n\t"
+ "srl %[temp2], %[temp1], 16 \n\t"
+ "srl %[temp3], %[temp0], 16 \n\t"
+ "replv.ph %[temp2], %[temp2] \n\t"
+ "replv.ph %[temp3], %[temp3] \n\t"
+ "replv.ph %[temp4], %[temp1] \n\t"
+ "replv.ph %[temp5], %[temp0] \n\t"
+ "addu.ph %[temp0], %[temp3], %[a10] \n\t"
+ "addu.ph %[temp1], %[temp3], %[a32] \n\t"
+ "addu.ph %[temp3], %[temp2], %[a10] \n\t"
+ "addu.ph %[temp2], %[temp2], %[a32] \n\t"
+ "shll_s.ph %[temp0], %[temp0], 7 \n\t"
+ "shll_s.ph %[temp1], %[temp1], 7 \n\t"
+ "shll_s.ph %[temp3], %[temp3], 7 \n\t"
+ "shll_s.ph %[temp2], %[temp2], 7 \n\t"
+ "precrqu_s.qb.ph %[temp0], %[temp1], %[temp0] \n\t"
+ "precrqu_s.qb.ph %[temp1], %[temp2], %[temp3] \n\t"
+ "addu.ph %[temp2], %[temp5], %[a10] \n\t"
+ "addu.ph %[temp3], %[temp5], %[a32] \n\t"
+ "addu.ph %[temp5], %[temp4], %[a10] \n\t"
+ "addu.ph %[temp4], %[temp4], %[a32] \n\t"
+ "shll_s.ph %[temp2], %[temp2], 7 \n\t"
+ "shll_s.ph %[temp3], %[temp3], 7 \n\t"
+ "shll_s.ph %[temp4], %[temp4], 7 \n\t"
+ "shll_s.ph %[temp5], %[temp5], 7 \n\t"
+ "precrqu_s.qb.ph %[temp2], %[temp3], %[temp2] \n\t"
+ "precrqu_s.qb.ph %[temp3], %[temp4], %[temp5] \n\t"
+ "usw %[temp1], 0*" XSTR(BPS) "(%[dst]) \n\t"
+ "usw %[temp0], 1*" XSTR(BPS) "(%[dst]) \n\t"
+ "usw %[temp3], 2*" XSTR(BPS) "(%[dst]) \n\t"
+ "usw %[temp2], 3*" XSTR(BPS) "(%[dst]) \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [a10]"=&r"(a10), [a32]"=&r"(a32)
+ : [c35]"r"(c35), [top]"r"(top), [dst]"r"(dst)
+ : "memory"
+ );
+}
+
+static void VE4(uint8_t* dst, const uint8_t* top) {
+ int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
+ __asm__ volatile(
+ "ulw %[temp0], -1(%[top]) \n\t"
+ "ulh %[temp1], 3(%[top]) \n\t"
+ "preceu.ph.qbr %[temp2], %[temp0] \n\t"
+ "preceu.ph.qbl %[temp3], %[temp0] \n\t"
+ "preceu.ph.qbr %[temp4], %[temp1] \n\t"
+ "packrl.ph %[temp5], %[temp3], %[temp2] \n\t"
+ "packrl.ph %[temp6], %[temp4], %[temp3] \n\t"
+ "shll.ph %[temp5], %[temp5], 1 \n\t"
+ "shll.ph %[temp6], %[temp6], 1 \n\t"
+ "addq.ph %[temp2], %[temp5], %[temp2] \n\t"
+ "addq.ph %[temp6], %[temp6], %[temp4] \n\t"
+ "addq.ph %[temp2], %[temp2], %[temp3] \n\t"
+ "addq.ph %[temp6], %[temp6], %[temp3] \n\t"
+ "shra_r.ph %[temp2], %[temp2], 2 \n\t"
+ "shra_r.ph %[temp6], %[temp6], 2 \n\t"
+ "precr.qb.ph %[temp4], %[temp6], %[temp2] \n\t"
+ "usw %[temp4], 0*" XSTR(BPS) "(%[dst]) \n\t"
+ "usw %[temp4], 1*" XSTR(BPS) "(%[dst]) \n\t"
+ "usw %[temp4], 2*" XSTR(BPS) "(%[dst]) \n\t"
+ "usw %[temp4], 3*" XSTR(BPS) "(%[dst]) \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6)
+ : [top]"r"(top), [dst]"r"(dst)
+ : "memory"
+ );
+}
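+
+// VE4 smooths the top row with the usual (1, 2, 1) filter and replicates the
+// result over all four rows (sketch):
+//   dst[x] = (top[x - 1] + 2 * top[x] + top[x + 1] + 2) >> 2;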
+
+static void HE4(uint8_t* dst, const uint8_t* top) {
+ int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
+ __asm__ volatile(
+ "ulw %[temp0], -4(%[top]) \n\t"
+ "lbu %[temp1], -5(%[top]) \n\t"
+ "preceu.ph.qbr %[temp2], %[temp0] \n\t"
+ "preceu.ph.qbl %[temp3], %[temp0] \n\t"
+ "replv.ph %[temp4], %[temp1] \n\t"
+ "packrl.ph %[temp5], %[temp3], %[temp2] \n\t"
+ "packrl.ph %[temp6], %[temp2], %[temp4] \n\t"
+ "shll.ph %[temp5], %[temp5], 1 \n\t"
+ "shll.ph %[temp6], %[temp6], 1 \n\t"
+ "addq.ph %[temp3], %[temp3], %[temp5] \n\t"
+ "addq.ph %[temp3], %[temp3], %[temp2] \n\t"
+ "addq.ph %[temp2], %[temp2], %[temp6] \n\t"
+ "addq.ph %[temp2], %[temp2], %[temp4] \n\t"
+ "shra_r.ph %[temp3], %[temp3], 2 \n\t"
+ "shra_r.ph %[temp2], %[temp2], 2 \n\t"
+ "replv.qb %[temp0], %[temp3] \n\t"
+ "replv.qb %[temp1], %[temp2] \n\t"
+ "srl %[temp3], %[temp3], 16 \n\t"
+ "srl %[temp2], %[temp2], 16 \n\t"
+ "replv.qb %[temp3], %[temp3] \n\t"
+ "replv.qb %[temp2], %[temp2] \n\t"
+ "usw %[temp3], 0*" XSTR(BPS) "(%[dst]) \n\t"
+ "usw %[temp0], 1*" XSTR(BPS) "(%[dst]) \n\t"
+ "usw %[temp2], 2*" XSTR(BPS) "(%[dst]) \n\t"
+ "usw %[temp1], 3*" XSTR(BPS) "(%[dst]) \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6)
+ : [top]"r"(top), [dst]"r"(dst)
+ : "memory"
+ );
+}
+
+static void RD4(uint8_t* dst, const uint8_t* top) {
+ int temp0, temp1, temp2, temp3, temp4, temp5;
+ int temp6, temp7, temp8, temp9, temp10, temp11;
+ __asm__ volatile(
+ "ulw %[temp0], -5(%[top]) \n\t"
+ "ulw %[temp1], -1(%[top]) \n\t"
+ "preceu.ph.qbl %[temp2], %[temp0] \n\t"
+ "preceu.ph.qbr %[temp3], %[temp0] \n\t"
+ "preceu.ph.qbr %[temp4], %[temp1] \n\t"
+ "preceu.ph.qbl %[temp5], %[temp1] \n\t"
+ "packrl.ph %[temp6], %[temp2], %[temp3] \n\t"
+ "packrl.ph %[temp7], %[temp4], %[temp2] \n\t"
+ "packrl.ph %[temp8], %[temp5], %[temp4] \n\t"
+ "shll.ph %[temp6], %[temp6], 1 \n\t"
+ "addq.ph %[temp9], %[temp2], %[temp6] \n\t"
+ "shll.ph %[temp7], %[temp7], 1 \n\t"
+ "addq.ph %[temp9], %[temp9], %[temp3] \n\t"
+ "shll.ph %[temp8], %[temp8], 1 \n\t"
+ "shra_r.ph %[temp9], %[temp9], 2 \n\t"
+ "addq.ph %[temp10], %[temp4], %[temp7] \n\t"
+ "addq.ph %[temp11], %[temp5], %[temp8] \n\t"
+ "addq.ph %[temp10], %[temp10], %[temp2] \n\t"
+ "addq.ph %[temp11], %[temp11], %[temp4] \n\t"
+ "shra_r.ph %[temp10], %[temp10], 2 \n\t"
+ "shra_r.ph %[temp11], %[temp11], 2 \n\t"
+ "lbu %[temp0], 3(%[top]) \n\t"
+ "lbu %[temp1], 2(%[top]) \n\t"
+ "lbu %[temp2], 1(%[top]) \n\t"
+ "sll %[temp1], %[temp1], 1 \n\t"
+ "addu %[temp0], %[temp0], %[temp1] \n\t"
+ "addu %[temp0], %[temp0], %[temp2] \n\t"
+ "precr.qb.ph %[temp9], %[temp10], %[temp9] \n\t"
+ "shra_r.w %[temp0], %[temp0], 2 \n\t"
+ "precr.qb.ph %[temp10], %[temp11], %[temp10] \n\t"
+ "usw %[temp9], 3*" XSTR(BPS) "(%[dst]) \n\t"
+ "usw %[temp10], 1*" XSTR(BPS) "(%[dst]) \n\t"
+ "prepend %[temp9], %[temp11], 8 \n\t"
+ "prepend %[temp10], %[temp0], 8 \n\t"
+ "usw %[temp9], 2*" XSTR(BPS) "(%[dst]) \n\t"
+ "usw %[temp10], 0*" XSTR(BPS) "(%[dst]) \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
+ [temp9]"=&r"(temp9), [temp10]"=&r"(temp10), [temp11]"=&r"(temp11)
+ : [top]"r"(top), [dst]"r"(dst)
+ : "memory"
+ );
+}
+
+static void VR4(uint8_t* dst, const uint8_t* top) {
+ int temp0, temp1, temp2, temp3, temp4;
+ int temp5, temp6, temp7, temp8, temp9;
+ __asm__ volatile (
+ "ulw %[temp0], -4(%[top]) \n\t"
+ "ulw %[temp1], 0(%[top]) \n\t"
+ "preceu.ph.qbl %[temp2], %[temp0] \n\t"
+ "preceu.ph.qbr %[temp0], %[temp0] \n\t"
+ "preceu.ph.qbla %[temp3], %[temp1] \n\t"
+ "preceu.ph.qbra %[temp1], %[temp1] \n\t"
+ "packrl.ph %[temp7], %[temp3], %[temp2] \n\t"
+ "addqh_r.ph %[temp4], %[temp1], %[temp3] \n\t"
+ "move %[temp6], %[temp1] \n\t"
+ "append %[temp1], %[temp2], 16 \n\t"
+ "shll.ph %[temp9], %[temp6], 1 \n\t"
+ "addqh_r.ph %[temp5], %[temp7], %[temp6] \n\t"
+ "shll.ph %[temp8], %[temp7], 1 \n\t"
+ "addu.ph %[temp3], %[temp7], %[temp3] \n\t"
+ "addu.ph %[temp1], %[temp1], %[temp6] \n\t"
+ "packrl.ph %[temp7], %[temp2], %[temp0] \n\t"
+ "addu.ph %[temp6], %[temp0], %[temp2] \n\t"
+ "addu.ph %[temp3], %[temp3], %[temp9] \n\t"
+ "addu.ph %[temp1], %[temp1], %[temp8] \n\t"
+ "shll.ph %[temp7], %[temp7], 1 \n\t"
+ "shra_r.ph %[temp3], %[temp3], 2 \n\t"
+ "shra_r.ph %[temp1], %[temp1], 2 \n\t"
+ "addu.ph %[temp6], %[temp6], %[temp7] \n\t"
+ "shra_r.ph %[temp6], %[temp6], 2 \n\t"
+ "precrq.ph.w %[temp8], %[temp4], %[temp5] \n\t"
+ "append %[temp4], %[temp5], 16 \n\t"
+ "precrq.ph.w %[temp2], %[temp3], %[temp1] \n\t"
+ "append %[temp3], %[temp1], 16 \n\t"
+ "precr.qb.ph %[temp8], %[temp8], %[temp4] \n\t"
+ "precr.qb.ph %[temp3], %[temp2], %[temp3] \n\t"
+ "usw %[temp8], 0*" XSTR(BPS) "(%[dst]) \n\t"
+ "usw %[temp3], 1*" XSTR(BPS) "(%[dst]) \n\t"
+ "append %[temp3], %[temp6], 8 \n\t"
+ "srl %[temp6], %[temp6], 16 \n\t"
+ "append %[temp8], %[temp6], 8 \n\t"
+ "usw %[temp3], 3*" XSTR(BPS) "(%[dst]) \n\t"
+ "usw %[temp8], 2*" XSTR(BPS) "(%[dst]) \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
+ [temp9]"=&r"(temp9)
+ : [top]"r"(top), [dst]"r"(dst)
+ : "memory"
+ );
+}
+
+static void LD4(uint8_t* dst, const uint8_t* top) {
+ int temp0, temp1, temp2, temp3, temp4, temp5;
+ int temp6, temp7, temp8, temp9, temp10, temp11;
+ __asm__ volatile(
+ "ulw %[temp0], 0(%[top]) \n\t"
+ "ulw %[temp1], 4(%[top]) \n\t"
+ "preceu.ph.qbl %[temp2], %[temp0] \n\t"
+ "preceu.ph.qbr %[temp3], %[temp0] \n\t"
+ "preceu.ph.qbr %[temp4], %[temp1] \n\t"
+ "preceu.ph.qbl %[temp5], %[temp1] \n\t"
+ "packrl.ph %[temp6], %[temp2], %[temp3] \n\t"
+ "packrl.ph %[temp7], %[temp4], %[temp2] \n\t"
+ "packrl.ph %[temp8], %[temp5], %[temp4] \n\t"
+ "shll.ph %[temp6], %[temp6], 1 \n\t"
+ "addq.ph %[temp9], %[temp2], %[temp6] \n\t"
+ "shll.ph %[temp7], %[temp7], 1 \n\t"
+ "addq.ph %[temp9], %[temp9], %[temp3] \n\t"
+ "shll.ph %[temp8], %[temp8], 1 \n\t"
+ "shra_r.ph %[temp9], %[temp9], 2 \n\t"
+ "addq.ph %[temp10], %[temp4], %[temp7] \n\t"
+ "addq.ph %[temp11], %[temp5], %[temp8] \n\t"
+ "addq.ph %[temp10], %[temp10], %[temp2] \n\t"
+ "addq.ph %[temp11], %[temp11], %[temp4] \n\t"
+ "shra_r.ph %[temp10], %[temp10], 2 \n\t"
+ "shra_r.ph %[temp11], %[temp11], 2 \n\t"
+ "srl %[temp1], %[temp1], 24 \n\t"
+ "sll %[temp1], %[temp1], 1 \n\t"
+ "raddu.w.qb %[temp5], %[temp5] \n\t"
+ "precr.qb.ph %[temp9], %[temp10], %[temp9] \n\t"
+ "precr.qb.ph %[temp10], %[temp11], %[temp10] \n\t"
+ "addu %[temp1], %[temp1], %[temp5] \n\t"
+ "shra_r.w %[temp1], %[temp1], 2 \n\t"
+ "usw %[temp9], 0*" XSTR(BPS) "(%[dst]) \n\t"
+ "usw %[temp10], 2*" XSTR(BPS) "(%[dst]) \n\t"
+ "prepend %[temp9], %[temp11], 8 \n\t"
+ "prepend %[temp10], %[temp1], 8 \n\t"
+ "usw %[temp9], 1*" XSTR(BPS) "(%[dst]) \n\t"
+ "usw %[temp10], 3*" XSTR(BPS) "(%[dst]) \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
+ [temp9]"=&r"(temp9), [temp10]"=&r"(temp10), [temp11]"=&r"(temp11)
+ : [top]"r"(top), [dst]"r"(dst)
+ : "memory"
+ );
+}
+
+static void VL4(uint8_t* dst, const uint8_t* top) {
+ int temp0, temp1, temp2, temp3, temp4;
+ int temp5, temp6, temp7, temp8, temp9;
+ __asm__ volatile (
+ "ulw %[temp0], 0(%[top]) \n\t"
+ "ulw %[temp1], 4(%[top]) \n\t"
+ "preceu.ph.qbla %[temp2], %[temp0] \n\t"
+ "preceu.ph.qbra %[temp0], %[temp0] \n\t"
+ "preceu.ph.qbl %[temp3], %[temp1] \n\t"
+ "preceu.ph.qbr %[temp1], %[temp1] \n\t"
+ "addqh_r.ph %[temp4], %[temp0], %[temp2] \n\t"
+ "packrl.ph %[temp7], %[temp1], %[temp0] \n\t"
+ "precrq.ph.w %[temp6], %[temp1], %[temp2] \n\t"
+ "shll.ph %[temp9], %[temp2], 1 \n\t"
+ "addqh_r.ph %[temp5], %[temp7], %[temp2] \n\t"
+ "shll.ph %[temp8], %[temp7], 1 \n\t"
+ "addu.ph %[temp2], %[temp2], %[temp6] \n\t"
+ "addu.ph %[temp0], %[temp0], %[temp7] \n\t"
+ "packrl.ph %[temp7], %[temp3], %[temp1] \n\t"
+ "addu.ph %[temp6], %[temp1], %[temp3] \n\t"
+ "addu.ph %[temp2], %[temp2], %[temp8] \n\t"
+ "addu.ph %[temp0], %[temp0], %[temp9] \n\t"
+ "shll.ph %[temp7], %[temp7], 1 \n\t"
+ "shra_r.ph %[temp2], %[temp2], 2 \n\t"
+ "shra_r.ph %[temp0], %[temp0], 2 \n\t"
+ "addu.ph %[temp6], %[temp6], %[temp7] \n\t"
+ "shra_r.ph %[temp6], %[temp6], 2 \n\t"
+ "precrq.ph.w %[temp8], %[temp5], %[temp4] \n\t"
+ "append %[temp5], %[temp4], 16 \n\t"
+ "precrq.ph.w %[temp3], %[temp2], %[temp0] \n\t"
+ "append %[temp2], %[temp0], 16 \n\t"
+ "precr.qb.ph %[temp8], %[temp8], %[temp5] \n\t"
+ "precr.qb.ph %[temp3], %[temp3], %[temp2] \n\t"
+ "usw %[temp8], 0*" XSTR(BPS) "(%[dst]) \n\t"
+ "prepend %[temp8], %[temp6], 8 \n\t"
+ "usw %[temp3], 1*" XSTR(BPS) "(%[dst]) \n\t"
+ "srl %[temp6], %[temp6], 16 \n\t"
+ "prepend %[temp3], %[temp6], 8 \n\t"
+ "usw %[temp8], 2*" XSTR(BPS) "(%[dst]) \n\t"
+ "usw %[temp3], 3*" XSTR(BPS) "(%[dst]) \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
+ [temp9]"=&r"(temp9)
+ : [top]"r"(top), [dst]"r"(dst)
+ : "memory"
+ );
+}
+
+static void HD4(uint8_t* dst, const uint8_t* top) {
+ int temp0, temp1, temp2, temp3, temp4;
+ int temp5, temp6, temp7, temp8, temp9;
+ __asm__ volatile (
+ "ulw %[temp0], -5(%[top]) \n\t"
+ "ulw %[temp1], -1(%[top]) \n\t"
+ "preceu.ph.qbla %[temp2], %[temp0] \n\t"
+ "preceu.ph.qbra %[temp0], %[temp0] \n\t"
+ "preceu.ph.qbl %[temp3], %[temp1] \n\t"
+ "preceu.ph.qbr %[temp1], %[temp1] \n\t"
+ "addqh_r.ph %[temp4], %[temp0], %[temp2] \n\t"
+ "packrl.ph %[temp7], %[temp1], %[temp0] \n\t"
+ "precrq.ph.w %[temp6], %[temp1], %[temp2] \n\t"
+ "shll.ph %[temp9], %[temp2], 1 \n\t"
+ "addqh_r.ph %[temp5], %[temp7], %[temp2] \n\t"
+ "shll.ph %[temp8], %[temp7], 1 \n\t"
+ "addu.ph %[temp2], %[temp2], %[temp6] \n\t"
+ "addu.ph %[temp0], %[temp0], %[temp7] \n\t"
+ "packrl.ph %[temp7], %[temp3], %[temp1] \n\t"
+ "addu.ph %[temp6], %[temp1], %[temp3] \n\t"
+ "addu.ph %[temp2], %[temp2], %[temp8] \n\t"
+ "addu.ph %[temp0], %[temp0], %[temp9] \n\t"
+ "shll.ph %[temp7], %[temp7], 1 \n\t"
+ "shra_r.ph %[temp2], %[temp2], 2 \n\t"
+ "shra_r.ph %[temp0], %[temp0], 2 \n\t"
+ "addu.ph %[temp6], %[temp6], %[temp7] \n\t"
+ "shra_r.ph %[temp6], %[temp6], 2 \n\t"
+ "precrq.ph.w %[temp1], %[temp2], %[temp5] \n\t"
+ "precrq.ph.w %[temp3], %[temp0], %[temp4] \n\t"
+ "precr.qb.ph %[temp7], %[temp6], %[temp1] \n\t"
+ "precr.qb.ph %[temp6], %[temp1], %[temp3] \n\t"
+ "usw %[temp7], 0*" XSTR(BPS) "(%[dst]) \n\t"
+ "usw %[temp6], 1*" XSTR(BPS) "(%[dst]) \n\t"
+ "append %[temp2], %[temp5], 16 \n\t"
+ "append %[temp0], %[temp4], 16 \n\t"
+ "precr.qb.ph %[temp5], %[temp3], %[temp2] \n\t"
+ "precr.qb.ph %[temp4], %[temp2], %[temp0] \n\t"
+ "usw %[temp5], 2*" XSTR(BPS) "(%[dst]) \n\t"
+ "usw %[temp4], 3*" XSTR(BPS) "(%[dst]) \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
+ [temp9]"=&r"(temp9)
+ : [top]"r"(top), [dst]"r"(dst)
+ : "memory"
+ );
+}
+
+static void HU4(uint8_t* dst, const uint8_t* top) {
+ int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+ __asm__ volatile (
+ "ulw %[temp0], -5(%[top]) \n\t"
+ "preceu.ph.qbl %[temp1], %[temp0] \n\t"
+ "preceu.ph.qbr %[temp2], %[temp0] \n\t"
+ "packrl.ph %[temp3], %[temp1], %[temp2] \n\t"
+ "replv.qb %[temp7], %[temp2] \n\t"
+ "addqh_r.ph %[temp4], %[temp1], %[temp3] \n\t"
+ "addqh_r.ph %[temp5], %[temp3], %[temp2] \n\t"
+ "shll.ph %[temp6], %[temp3], 1 \n\t"
+ "addu.ph %[temp3], %[temp2], %[temp3] \n\t"
+ "addu.ph %[temp6], %[temp1], %[temp6] \n\t"
+ "shll.ph %[temp0], %[temp2], 1 \n\t"
+ "addu.ph %[temp6], %[temp6], %[temp2] \n\t"
+ "addu.ph %[temp0], %[temp3], %[temp0] \n\t"
+ "shra_r.ph %[temp6], %[temp6], 2 \n\t"
+ "shra_r.ph %[temp0], %[temp0], 2 \n\t"
+ "packrl.ph %[temp3], %[temp6], %[temp5] \n\t"
+ "precrq.ph.w %[temp2], %[temp6], %[temp4] \n\t"
+ "append %[temp0], %[temp5], 16 \n\t"
+ "precr.qb.ph %[temp3], %[temp3], %[temp2] \n\t"
+ "usw %[temp3], 0*" XSTR(BPS) "(%[dst]) \n\t"
+ "precr.qb.ph %[temp1], %[temp7], %[temp0] \n\t"
+ "usw %[temp7], 3*" XSTR(BPS) "(%[dst]) \n\t"
+ "packrl.ph %[temp2], %[temp1], %[temp3] \n\t"
+ "usw %[temp1], 2*" XSTR(BPS) "(%[dst]) \n\t"
+ "usw %[temp2], 1*" XSTR(BPS) "(%[dst]) \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7)
+ : [top]"r"(top), [dst]"r"(dst)
+ : "memory"
+ );
+}
+
+//------------------------------------------------------------------------------
+// Chroma 8x8 prediction (paragraph 12.2)
+
+static void IntraChromaPreds_MIPSdspR2(uint8_t* dst, const uint8_t* left,
+ const uint8_t* top) {
+ // U block
+ DCMode8(C8DC8 + dst, left, top);
+ VerticalPred8(C8VE8 + dst, top);
+ HorizontalPred8(C8HE8 + dst, left);
+ TrueMotion8(C8TM8 + dst, left, top);
+ // V block
+ dst += 8;
+ if (top) top += 8;
+ if (left) left += 16;
+ DCMode8(C8DC8 + dst, left, top);
+ VerticalPred8(C8VE8 + dst, top);
+ HorizontalPred8(C8HE8 + dst, left);
+ TrueMotion8(C8TM8 + dst, left, top);
+}
+
+//------------------------------------------------------------------------------
+// luma 16x16 prediction (paragraph 12.3)
+
+static void Intra16Preds_MIPSdspR2(uint8_t* dst,
+ const uint8_t* left, const uint8_t* top) {
+ DCMode16(I16DC16 + dst, left, top);
+ VerticalPred16(I16VE16 + dst, top);
+ HorizontalPred16(I16HE16 + dst, left);
+ TrueMotion16(I16TM16 + dst, left, top);
+}
+
+// The left samples are at top[-5..-2], the top-left sample is top[-1], the
+// top samples are at top[0..3], and the top-right samples are at top[4..7].
+static void Intra4Preds_MIPSdspR2(uint8_t* dst, const uint8_t* top) {
+ DC4(I4DC4 + dst, top);
+ TM4(I4TM4 + dst, top);
+ VE4(I4VE4 + dst, top);
+ HE4(I4HE4 + dst, top);
+ RD4(I4RD4 + dst, top);
+ VR4(I4VR4 + dst, top);
+ LD4(I4LD4 + dst, top);
+ VL4(I4VL4 + dst, top);
+ HD4(I4HD4 + dst, top);
+ HU4(I4HU4 + dst, top);
+}
+
+//------------------------------------------------------------------------------
+// Metric
+
+#if !defined(WORK_AROUND_GCC)
+
+#define GET_SSE_INNER(A) \
+ "lw %[temp0], " #A "(%[a]) \n\t" \
+ "lw %[temp1], " #A "(%[b]) \n\t" \
+ "preceu.ph.qbr %[temp2], %[temp0] \n\t" \
+ "preceu.ph.qbl %[temp0], %[temp0] \n\t" \
+ "preceu.ph.qbr %[temp3], %[temp1] \n\t" \
+ "preceu.ph.qbl %[temp1], %[temp1] \n\t" \
+ "subq.ph %[temp2], %[temp2], %[temp3] \n\t" \
+ "subq.ph %[temp0], %[temp0], %[temp1] \n\t" \
+ "dpa.w.ph $ac0, %[temp2], %[temp2] \n\t" \
+ "dpa.w.ph $ac0, %[temp0], %[temp0] \n\t"
+
+#define GET_SSE(A, B, C, D) \
+ GET_SSE_INNER(A) \
+ GET_SSE_INNER(B) \
+ GET_SSE_INNER(C) \
+ GET_SSE_INNER(D)
+
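+// For reference, a scalar sketch of what the SSE kernels below compute
+// (illustrative only, not part of the build): 'mult $zero, $zero' clears the
+// accumulator, then each GET_SSE_INNER step loads four byte pairs, subtracts
+// them in parallel and accumulates the squared differences into $ac0 via the
+// two dpa.w.ph instructions.
+//
+//   int sse = 0;
+//   for (y = 0; y < h; ++y) {
+//     for (x = 0; x < w; ++x) {
+//       const int diff = a[x + y * BPS] - b[x + y * BPS];
+//       sse += diff * diff;
+//     }
+//   }
+//   return sse;   // with (w, h) = (16,16), (16,8), (8,8) or (4,4) below
+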
+static int SSE16x16_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
+ int count;
+ int temp0, temp1, temp2, temp3;
+ __asm__ volatile (
+ "mult $zero, $zero \n\t"
+ GET_SSE( 0 * BPS, 4 + 0 * BPS, 8 + 0 * BPS, 12 + 0 * BPS)
+ GET_SSE( 1 * BPS, 4 + 1 * BPS, 8 + 1 * BPS, 12 + 1 * BPS)
+ GET_SSE( 2 * BPS, 4 + 2 * BPS, 8 + 2 * BPS, 12 + 2 * BPS)
+ GET_SSE( 3 * BPS, 4 + 3 * BPS, 8 + 3 * BPS, 12 + 3 * BPS)
+ GET_SSE( 4 * BPS, 4 + 4 * BPS, 8 + 4 * BPS, 12 + 4 * BPS)
+ GET_SSE( 5 * BPS, 4 + 5 * BPS, 8 + 5 * BPS, 12 + 5 * BPS)
+ GET_SSE( 6 * BPS, 4 + 6 * BPS, 8 + 6 * BPS, 12 + 6 * BPS)
+ GET_SSE( 7 * BPS, 4 + 7 * BPS, 8 + 7 * BPS, 12 + 7 * BPS)
+ GET_SSE( 8 * BPS, 4 + 8 * BPS, 8 + 8 * BPS, 12 + 8 * BPS)
+ GET_SSE( 9 * BPS, 4 + 9 * BPS, 8 + 9 * BPS, 12 + 9 * BPS)
+ GET_SSE(10 * BPS, 4 + 10 * BPS, 8 + 10 * BPS, 12 + 10 * BPS)
+ GET_SSE(11 * BPS, 4 + 11 * BPS, 8 + 11 * BPS, 12 + 11 * BPS)
+ GET_SSE(12 * BPS, 4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS)
+ GET_SSE(13 * BPS, 4 + 13 * BPS, 8 + 13 * BPS, 12 + 13 * BPS)
+ GET_SSE(14 * BPS, 4 + 14 * BPS, 8 + 14 * BPS, 12 + 14 * BPS)
+ GET_SSE(15 * BPS, 4 + 15 * BPS, 8 + 15 * BPS, 12 + 15 * BPS)
+ "mflo %[count] \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [count]"=&r"(count)
+ : [a]"r"(a), [b]"r"(b)
+ : "memory", "hi", "lo"
+ );
+ return count;
+}
+
+static int SSE16x8_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
+ int count;
+ int temp0, temp1, temp2, temp3;
+ __asm__ volatile (
+ "mult $zero, $zero \n\t"
+ GET_SSE( 0 * BPS, 4 + 0 * BPS, 8 + 0 * BPS, 12 + 0 * BPS)
+ GET_SSE( 1 * BPS, 4 + 1 * BPS, 8 + 1 * BPS, 12 + 1 * BPS)
+ GET_SSE( 2 * BPS, 4 + 2 * BPS, 8 + 2 * BPS, 12 + 2 * BPS)
+ GET_SSE( 3 * BPS, 4 + 3 * BPS, 8 + 3 * BPS, 12 + 3 * BPS)
+ GET_SSE( 4 * BPS, 4 + 4 * BPS, 8 + 4 * BPS, 12 + 4 * BPS)
+ GET_SSE( 5 * BPS, 4 + 5 * BPS, 8 + 5 * BPS, 12 + 5 * BPS)
+ GET_SSE( 6 * BPS, 4 + 6 * BPS, 8 + 6 * BPS, 12 + 6 * BPS)
+ GET_SSE( 7 * BPS, 4 + 7 * BPS, 8 + 7 * BPS, 12 + 7 * BPS)
+ "mflo %[count] \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [count]"=&r"(count)
+ : [a]"r"(a), [b]"r"(b)
+ : "memory", "hi", "lo"
+ );
+ return count;
+}
+
+static int SSE8x8_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
+ int count;
+ int temp0, temp1, temp2, temp3;
+ __asm__ volatile (
+ "mult $zero, $zero \n\t"
+ GET_SSE(0 * BPS, 4 + 0 * BPS, 1 * BPS, 4 + 1 * BPS)
+ GET_SSE(2 * BPS, 4 + 2 * BPS, 3 * BPS, 4 + 3 * BPS)
+ GET_SSE(4 * BPS, 4 + 4 * BPS, 5 * BPS, 4 + 5 * BPS)
+ GET_SSE(6 * BPS, 4 + 6 * BPS, 7 * BPS, 4 + 7 * BPS)
+ "mflo %[count] \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [count]"=&r"(count)
+ : [a]"r"(a), [b]"r"(b)
+ : "memory", "hi", "lo"
+ );
+ return count;
+}
+
+static int SSE4x4_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
+ int count;
+ int temp0, temp1, temp2, temp3;
+ __asm__ volatile (
+ "mult $zero, $zero \n\t"
+ GET_SSE(0 * BPS, 1 * BPS, 2 * BPS, 3 * BPS)
+ "mflo %[count] \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [count]"=&r"(count)
+ : [a]"r"(a), [b]"r"(b)
+ : "memory", "hi", "lo"
+ );
+ return count;
+}
+
+#undef GET_SSE
+#undef GET_SSE_INNER
+
+#endif // !WORK_AROUND_GCC
+
+#undef FILL_8_OR_16
+#undef FILL_PART
+#undef OUTPUT_EARLY_CLOBBER_REGS_17
+#undef MUL_HALF
+#undef ABS_X8
+#undef ADD_SUB_HALVES_X4
+
+//------------------------------------------------------------------------------
+// Quantization
+//
+
+// Macro for one pass through the QuantizeBlock loop, reading two values at a
+// time, with the QUANTDIV macro inlined.
+// J - offset in bytes (kZigzag[n] * 2)
+// K - offset in bytes (kZigzag[n] * 4)
+// N - offset in bytes (n * 2)
+// N1 - offset in bytes ((n + 1) * 2)
+#define QUANTIZE_ONE(J, K, N, N1) \
+ "ulw %[temp1], " #J "(%[ppin]) \n\t" \
+ "ulw %[temp2], " #J "(%[ppsharpen]) \n\t" \
+ "lhu %[temp3], " #K "(%[ppzthresh]) \n\t" \
+ "lhu %[temp6], " #K "+4(%[ppzthresh]) \n\t" \
+ "absq_s.ph %[temp4], %[temp1] \n\t" \
+ "ins %[temp3], %[temp6], 16, 16 \n\t" \
+ "addu.ph %[coeff], %[temp4], %[temp2] \n\t" \
+ "shra.ph %[sign], %[temp1], 15 \n\t" \
+ "li %[level], 0x10001 \n\t" \
+ "cmp.lt.ph %[temp3], %[coeff] \n\t" \
+ "lhu %[temp1], " #J "(%[ppiq]) \n\t" \
+ "pick.ph %[temp5], %[level], $0 \n\t" \
+ "lw %[temp2], " #K "(%[ppbias]) \n\t" \
+ "beqz %[temp5], 0f \n\t" \
+ "lhu %[temp3], " #J "(%[ppq]) \n\t" \
+ "beq %[temp5], %[level], 1f \n\t" \
+ "andi %[temp5], %[temp5], 0x1 \n\t" \
+ "andi %[temp4], %[coeff], 0xffff \n\t" \
+ "beqz %[temp5], 2f \n\t" \
+ "mul %[level], %[temp4], %[temp1] \n\t" \
+ "sh $0, " #J "+2(%[ppin]) \n\t" \
+ "sh $0, " #N1 "(%[pout]) \n\t" \
+ "addu %[level], %[level], %[temp2] \n\t" \
+ "sra %[level], %[level], 17 \n\t" \
+ "slt %[temp4], %[max_level], %[level] \n\t" \
+ "movn %[level], %[max_level], %[temp4] \n\t" \
+ "andi %[temp6], %[sign], 0xffff \n\t" \
+ "xor %[level], %[level], %[temp6] \n\t" \
+ "subu %[level], %[level], %[temp6] \n\t" \
+ "mul %[temp5], %[level], %[temp3] \n\t" \
+ "or %[ret], %[ret], %[level] \n\t" \
+ "sh %[level], " #N "(%[pout]) \n\t" \
+ "sh %[temp5], " #J "(%[ppin]) \n\t" \
+ "j 3f \n\t" \
+"2: \n\t" \
+ "lhu %[temp1], " #J "+2(%[ppiq]) \n\t" \
+ "srl %[temp5], %[coeff], 16 \n\t" \
+ "mul %[level], %[temp5], %[temp1] \n\t" \
+ "lw %[temp2], " #K "+4(%[ppbias]) \n\t" \
+ "lhu %[temp3], " #J "+2(%[ppq]) \n\t" \
+ "addu %[level], %[level], %[temp2] \n\t" \
+ "sra %[level], %[level], 17 \n\t" \
+ "srl %[temp6], %[sign], 16 \n\t" \
+ "slt %[temp4], %[max_level], %[level] \n\t" \
+ "movn %[level], %[max_level], %[temp4] \n\t" \
+ "xor %[level], %[level], %[temp6] \n\t" \
+ "subu %[level], %[level], %[temp6] \n\t" \
+ "mul %[temp5], %[level], %[temp3] \n\t" \
+ "sh $0, " #J "(%[ppin]) \n\t" \
+ "sh $0, " #N "(%[pout]) \n\t" \
+ "or %[ret], %[ret], %[level] \n\t" \
+ "sh %[temp5], " #J "+2(%[ppin]) \n\t" \
+ "sh %[level], " #N1 "(%[pout]) \n\t" \
+ "j 3f \n\t" \
+"1: \n\t" \
+ "lhu %[temp1], " #J "(%[ppiq]) \n\t" \
+ "lw %[temp2], " #K "(%[ppbias]) \n\t" \
+ "ulw %[temp3], " #J "(%[ppq]) \n\t" \
+ "andi %[temp5], %[coeff], 0xffff \n\t" \
+ "srl %[temp0], %[coeff], 16 \n\t" \
+ "lhu %[temp6], " #J "+2(%[ppiq]) \n\t" \
+ "lw %[coeff], " #K "+4(%[ppbias]) \n\t" \
+ "mul %[level], %[temp5], %[temp1] \n\t" \
+ "mul %[temp4], %[temp0], %[temp6] \n\t" \
+ "addu %[level], %[level], %[temp2] \n\t" \
+ "addu %[temp4], %[temp4], %[coeff] \n\t" \
+ "precrq.ph.w %[level], %[temp4], %[level] \n\t" \
+ "shra.ph %[level], %[level], 1 \n\t" \
+ "cmp.lt.ph %[max_level1],%[level] \n\t" \
+ "pick.ph %[level], %[max_level], %[level] \n\t" \
+ "xor %[level], %[level], %[sign] \n\t" \
+ "subu.ph %[level], %[level], %[sign] \n\t" \
+ "mul.ph %[temp3], %[level], %[temp3] \n\t" \
+ "or %[ret], %[ret], %[level] \n\t" \
+ "sh %[level], " #N "(%[pout]) \n\t" \
+ "srl %[level], %[level], 16 \n\t" \
+ "sh %[level], " #N1 "(%[pout]) \n\t" \
+ "usw %[temp3], " #J "(%[ppin]) \n\t" \
+ "j 3f \n\t" \
+"0: \n\t" \
+ "sh $0, " #N "(%[pout]) \n\t" \
+ "sh $0, " #N1 "(%[pout]) \n\t" \
+ "usw $0, " #J "(%[ppin]) \n\t" \
+"3: \n\t"
+
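+// For reference, a scalar sketch of the per-coefficient logic that
+// QUANTIZE_ONE applies two coefficients at a time (this mirrors the portable
+// path; QFIX is 17, matching the 'sra ..., 17' above; j = kZigzag[n]):
+//
+//   const int sign = (in[j] < 0);
+//   const uint32_t coeff = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
+//   if (coeff > mtx->zthresh_[j]) {
+//     int level = (coeff * mtx->iq_[j] + mtx->bias_[j]) >> QFIX;
+//     if (level > MAX_LEVEL) level = MAX_LEVEL;
+//     if (sign) level = -level;
+//     in[j] = level * mtx->q_[j];
+//     out[n] = level;
+//   } else {
+//     in[j] = out[n] = 0;
+//   }
+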
+static int QuantizeBlock_MIPSdspR2(int16_t in[16], int16_t out[16],
+ const VP8Matrix* const mtx) {
+  int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
+ int sign, coeff, level;
+ int max_level = MAX_LEVEL;
+ int max_level1 = max_level << 16 | max_level;
+ int ret = 0;
+
+ int16_t* ppin = &in[0];
+ int16_t* pout = &out[0];
+ const uint16_t* ppsharpen = &mtx->sharpen_[0];
+ const uint32_t* ppzthresh = &mtx->zthresh_[0];
+ const uint16_t* ppq = &mtx->q_[0];
+ const uint16_t* ppiq = &mtx->iq_[0];
+ const uint32_t* ppbias = &mtx->bias_[0];
+
+ __asm__ volatile (
+ QUANTIZE_ONE( 0, 0, 0, 2)
+ QUANTIZE_ONE( 4, 8, 10, 12)
+ QUANTIZE_ONE( 8, 16, 4, 8)
+ QUANTIZE_ONE(12, 24, 14, 24)
+ QUANTIZE_ONE(16, 32, 6, 16)
+ QUANTIZE_ONE(20, 40, 22, 26)
+ QUANTIZE_ONE(24, 48, 18, 20)
+ QUANTIZE_ONE(28, 56, 28, 30)
+
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+ [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+ [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [sign]"=&r"(sign), [coeff]"=&r"(coeff),
+ [level]"=&r"(level), [temp6]"=&r"(temp6), [ret]"+&r"(ret)
+ : [ppin]"r"(ppin), [pout]"r"(pout), [max_level1]"r"(max_level1),
+ [ppiq]"r"(ppiq), [max_level]"r"(max_level),
+ [ppbias]"r"(ppbias), [ppzthresh]"r"(ppzthresh),
+ [ppsharpen]"r"(ppsharpen), [ppq]"r"(ppq)
+ : "memory", "hi", "lo"
+ );
+
+ return (ret != 0);
+}
+
+static int Quantize2Blocks_MIPSdspR2(int16_t in[32], int16_t out[32],
+ const VP8Matrix* const mtx) {
+ int nz;
+ nz = QuantizeBlock_MIPSdspR2(in + 0 * 16, out + 0 * 16, mtx) << 0;
+ nz |= QuantizeBlock_MIPSdspR2(in + 1 * 16, out + 1 * 16, mtx) << 1;
+ return nz;
+}
+
+#undef QUANTIZE_ONE
+
+// Macro for one horizontal pass in FTransformWHT.
+// temp0..temp7 hold tmp[0]..tmp[15]
+// A, B, C, D - offsets in bytes to load from the in buffer
+// TEMP0, TEMP1 - registers for the corresponding tmp elements
+#define HORIZONTAL_PASS_WHT(A, B, C, D, TEMP0, TEMP1) \
+ "lh %[" #TEMP0 "], " #A "(%[in]) \n\t" \
+ "lh %[" #TEMP1 "], " #B "(%[in]) \n\t" \
+ "lh %[temp8], " #C "(%[in]) \n\t" \
+ "lh %[temp9], " #D "(%[in]) \n\t" \
+ "ins %[" #TEMP1 "], %[" #TEMP0 "], 16, 16 \n\t" \
+ "ins %[temp9], %[temp8], 16, 16 \n\t" \
+ "subq.ph %[temp8], %[" #TEMP1 "], %[temp9] \n\t" \
+ "addq.ph %[temp9], %[" #TEMP1 "], %[temp9] \n\t" \
+ "precrq.ph.w %[" #TEMP0 "], %[temp8], %[temp9] \n\t" \
+ "append %[temp8], %[temp9], 16 \n\t" \
+ "subq.ph %[" #TEMP1 "], %[" #TEMP0 "], %[temp8] \n\t" \
+ "addq.ph %[" #TEMP0 "], %[" #TEMP0 "], %[temp8] \n\t" \
+ "rotr %[" #TEMP1 "], %[" #TEMP1 "], 16 \n\t"
+
+// Macro for one vertical pass in FTransformWHT.
+// temp0..temp7 hold tmp[0]..tmp[15]
+// A, B, C, D - offsets in bytes to store to the out buffer
+// TEMP0, TEMP2, TEMP4 and TEMP6 - registers for the corresponding tmp elements
+#define VERTICAL_PASS_WHT(A, B, C, D, TEMP0, TEMP2, TEMP4, TEMP6) \
+ "addq.ph %[temp8], %[" #TEMP0 "], %[" #TEMP4 "] \n\t" \
+ "addq.ph %[temp9], %[" #TEMP2 "], %[" #TEMP6 "] \n\t" \
+ "subq.ph %[" #TEMP2 "], %[" #TEMP2 "], %[" #TEMP6 "] \n\t" \
+ "subq.ph %[" #TEMP6 "], %[" #TEMP0 "], %[" #TEMP4 "] \n\t" \
+ "addqh.ph %[" #TEMP0 "], %[temp8], %[temp9] \n\t" \
+ "subqh.ph %[" #TEMP4 "], %[" #TEMP6 "], %[" #TEMP2 "] \n\t" \
+ "addqh.ph %[" #TEMP2 "], %[" #TEMP2 "], %[" #TEMP6 "] \n\t" \
+ "subqh.ph %[" #TEMP6 "], %[temp8], %[temp9] \n\t" \
+ "usw %[" #TEMP0 "], " #A "(%[out]) \n\t" \
+ "usw %[" #TEMP2 "], " #B "(%[out]) \n\t" \
+ "usw %[" #TEMP4 "], " #C "(%[out]) \n\t" \
+ "usw %[" #TEMP6 "], " #D "(%[out]) \n\t"
+
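+// Together, the four horizontal and two vertical passes below compute the
+// forward 4x4 Walsh-Hadamard transform of the sixteen DC values gathered
+// from in[] (columns 16 int16s apart, rows 64 int16s apart). As a scalar
+// sketch, each pass is the butterfly
+//
+//   a0 = x0 + x2;  a1 = x1 + x3;  a2 = x1 - x3;  a3 = x0 - x2;
+//   y0 = a0 + a1;  y1 = a3 + a2;  y2 = a3 - a2;  y3 = a0 - a1;
+//
+// with the vertical pass halving its results via the addqh.ph/subqh.ph
+// halving adds.
+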
+static void FTransformWHT_MIPSdspR2(const int16_t* in, int16_t* out) {
+ int temp0, temp1, temp2, temp3, temp4;
+ int temp5, temp6, temp7, temp8, temp9;
+
+ __asm__ volatile (
+ HORIZONTAL_PASS_WHT( 0, 32, 64, 96, temp0, temp1)
+ HORIZONTAL_PASS_WHT(128, 160, 192, 224, temp2, temp3)
+ HORIZONTAL_PASS_WHT(256, 288, 320, 352, temp4, temp5)
+ HORIZONTAL_PASS_WHT(384, 416, 448, 480, temp6, temp7)
+ VERTICAL_PASS_WHT(0, 8, 16, 24, temp0, temp2, temp4, temp6)
+ VERTICAL_PASS_WHT(4, 12, 20, 28, temp1, temp3, temp5, temp7)
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
+ [temp9]"=&r"(temp9)
+ : [in]"r"(in), [out]"r"(out)
+ : "memory"
+ );
+}
+
+#undef VERTICAL_PASS_WHT
+#undef HORIZONTAL_PASS_WHT
+
+// Macro for converting coefficients to histogram bins, eight coefficients at
+// a time.
+// A, B, C, D - offsets in bytes to load from the out buffer
+#define CONVERT_COEFFS_TO_BIN(A, B, C, D) \
+ "ulw %[temp0], " #A "(%[out]) \n\t" \
+ "ulw %[temp1], " #B "(%[out]) \n\t" \
+ "ulw %[temp2], " #C "(%[out]) \n\t" \
+ "ulw %[temp3], " #D "(%[out]) \n\t" \
+ "absq_s.ph %[temp0], %[temp0] \n\t" \
+ "absq_s.ph %[temp1], %[temp1] \n\t" \
+ "absq_s.ph %[temp2], %[temp2] \n\t" \
+ "absq_s.ph %[temp3], %[temp3] \n\t" \
+ "shra.ph %[temp0], %[temp0], 3 \n\t" \
+ "shra.ph %[temp1], %[temp1], 3 \n\t" \
+ "shra.ph %[temp2], %[temp2], 3 \n\t" \
+ "shra.ph %[temp3], %[temp3], 3 \n\t" \
+ "shll_s.ph %[temp0], %[temp0], 10 \n\t" \
+ "shll_s.ph %[temp1], %[temp1], 10 \n\t" \
+ "shll_s.ph %[temp2], %[temp2], 10 \n\t" \
+ "shll_s.ph %[temp3], %[temp3], 10 \n\t" \
+ "shrl.ph %[temp0], %[temp0], 10 \n\t" \
+ "shrl.ph %[temp1], %[temp1], 10 \n\t" \
+ "shrl.ph %[temp2], %[temp2], 10 \n\t" \
+ "shrl.ph %[temp3], %[temp3], 10 \n\t" \
+ "shll.ph %[temp0], %[temp0], 2 \n\t" \
+ "shll.ph %[temp1], %[temp1], 2 \n\t" \
+ "shll.ph %[temp2], %[temp2], 2 \n\t" \
+ "shll.ph %[temp3], %[temp3], 2 \n\t" \
+ "ext %[temp4], %[temp0], 0, 16 \n\t" \
+ "ext %[temp0], %[temp0], 16, 16 \n\t" \
+ "addu %[temp4], %[temp4], %[dist] \n\t" \
+ "addu %[temp0], %[temp0], %[dist] \n\t" \
+ "ext %[temp5], %[temp1], 0, 16 \n\t" \
+ "lw %[temp8], 0(%[temp4]) \n\t" \
+ "ext %[temp1], %[temp1], 16, 16 \n\t" \
+ "addu %[temp5], %[temp5], %[dist] \n\t" \
+ "addiu %[temp8], %[temp8], 1 \n\t" \
+ "sw %[temp8], 0(%[temp4]) \n\t" \
+ "lw %[temp8], 0(%[temp0]) \n\t" \
+ "addu %[temp1], %[temp1], %[dist] \n\t" \
+ "ext %[temp6], %[temp2], 0, 16 \n\t" \
+ "addiu %[temp8], %[temp8], 1 \n\t" \
+ "sw %[temp8], 0(%[temp0]) \n\t" \
+ "lw %[temp8], 0(%[temp5]) \n\t" \
+ "ext %[temp2], %[temp2], 16, 16 \n\t" \
+ "addu %[temp6], %[temp6], %[dist] \n\t" \
+ "addiu %[temp8], %[temp8], 1 \n\t" \
+ "sw %[temp8], 0(%[temp5]) \n\t" \
+ "lw %[temp8], 0(%[temp1]) \n\t" \
+ "addu %[temp2], %[temp2], %[dist] \n\t" \
+ "ext %[temp7], %[temp3], 0, 16 \n\t" \
+ "addiu %[temp8], %[temp8], 1 \n\t" \
+ "sw %[temp8], 0(%[temp1]) \n\t" \
+ "lw %[temp8], 0(%[temp6]) \n\t" \
+ "ext %[temp3], %[temp3], 16, 16 \n\t" \
+ "addu %[temp7], %[temp7], %[dist] \n\t" \
+ "addiu %[temp8], %[temp8], 1 \n\t" \
+ "sw %[temp8], 0(%[temp6]) \n\t" \
+ "lw %[temp8], 0(%[temp2]) \n\t" \
+ "addu %[temp3], %[temp3], %[dist] \n\t" \
+ "addiu %[temp8], %[temp8], 1 \n\t" \
+ "sw %[temp8], 0(%[temp2]) \n\t" \
+ "lw %[temp8], 0(%[temp7]) \n\t" \
+ "addiu %[temp8], %[temp8], 1 \n\t" \
+ "sw %[temp8], 0(%[temp7]) \n\t" \
+ "lw %[temp8], 0(%[temp3]) \n\t" \
+ "addiu %[temp8], %[temp8], 1 \n\t" \
+ "sw %[temp8], 0(%[temp3]) \n\t"
+
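+// In scalar terms, each step above is roughly (a sketch; the shll_s.ph /
+// shrl.ph pair saturates the bin to MAX_COEFF_THRESH):
+//
+//   int v = abs(out[k]) >> 3;
+//   if (v > MAX_COEFF_THRESH) v = MAX_COEFF_THRESH;
+//   ++distribution[v];   // 'shll.ph ..., 2' scales v to a byte offset
+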
+static void CollectHistogram_MIPSdspR2(const uint8_t* ref, const uint8_t* pred,
+ int start_block, int end_block,
+ VP8Histogram* const histo) {
+ int j;
+ int distribution[MAX_COEFF_THRESH + 1] = { 0 };
+ const int max_coeff = (MAX_COEFF_THRESH << 16) + MAX_COEFF_THRESH;
+ for (j = start_block; j < end_block; ++j) {
+ int16_t out[16];
+ int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
+
+ VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
+
+ // Convert coefficients to bin.
+ __asm__ volatile (
+ CONVERT_COEFFS_TO_BIN( 0, 4, 8, 12)
+ CONVERT_COEFFS_TO_BIN(16, 20, 24, 28)
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8)
+ : [dist]"r"(distribution), [out]"r"(out), [max_coeff]"r"(max_coeff)
+ : "memory"
+ );
+ }
+ VP8SetHistogramData(distribution, histo);
+}
+
+#undef CONVERT_COEFFS_TO_BIN
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8EncDspInitMIPSdspR2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMIPSdspR2(void) {
+ VP8FTransform = FTransform_MIPSdspR2;
+ VP8FTransformWHT = FTransformWHT_MIPSdspR2;
+ VP8ITransform = ITransform_MIPSdspR2;
+
+ VP8TDisto4x4 = Disto4x4_MIPSdspR2;
+ VP8TDisto16x16 = Disto16x16_MIPSdspR2;
+
+ VP8EncPredLuma16 = Intra16Preds_MIPSdspR2;
+ VP8EncPredChroma8 = IntraChromaPreds_MIPSdspR2;
+ VP8EncPredLuma4 = Intra4Preds_MIPSdspR2;
+
+#if !defined(WORK_AROUND_GCC)
+ VP8SSE16x16 = SSE16x16_MIPSdspR2;
+ VP8SSE8x8 = SSE8x8_MIPSdspR2;
+ VP8SSE16x8 = SSE16x8_MIPSdspR2;
+ VP8SSE4x4 = SSE4x4_MIPSdspR2;
+#endif
+
+ VP8EncQuantizeBlock = QuantizeBlock_MIPSdspR2;
+ VP8EncQuantize2Blocks = Quantize2Blocks_MIPSdspR2;
+
+ VP8CollectHistogram = CollectHistogram_MIPSdspR2;
+}
+
+#else // !WEBP_USE_MIPS_DSP_R2
+
+WEBP_DSP_INIT_STUB(VP8EncDspInitMIPSdspR2)
+
+#endif // WEBP_USE_MIPS_DSP_R2
diff --git a/media/libwebp/dsp/enc_msa.c b/media/libwebp/dsp/enc_msa.c
new file mode 100644
index 0000000000..229582e4a6
--- /dev/null
+++ b/media/libwebp/dsp/enc_msa.c
@@ -0,0 +1,896 @@
+// Copyright 2016 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MSA version of encoder dsp functions.
+//
+// Author: Prashant Patil (prashant.patil@imgtec.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_MSA)
+
+#include <stdlib.h>
+#include "../dsp/msa_macro.h"
+#include "../enc/vp8i_enc.h"
+
+//------------------------------------------------------------------------------
+// Transforms
+
+#define IDCT_1D_W(in0, in1, in2, in3, out0, out1, out2, out3) do { \
+ v4i32 a1_m, b1_m, c1_m, d1_m; \
+ const v4i32 cospi8sqrt2minus1 = __msa_fill_w(20091); \
+ const v4i32 sinpi8sqrt2 = __msa_fill_w(35468); \
+ v4i32 c_tmp1_m = in1 * sinpi8sqrt2; \
+ v4i32 c_tmp2_m = in3 * cospi8sqrt2minus1; \
+ v4i32 d_tmp1_m = in1 * cospi8sqrt2minus1; \
+ v4i32 d_tmp2_m = in3 * sinpi8sqrt2; \
+ \
+ ADDSUB2(in0, in2, a1_m, b1_m); \
+ SRAI_W2_SW(c_tmp1_m, c_tmp2_m, 16); \
+ c_tmp2_m = c_tmp2_m + in3; \
+ c1_m = c_tmp1_m - c_tmp2_m; \
+ SRAI_W2_SW(d_tmp1_m, d_tmp2_m, 16); \
+ d_tmp1_m = d_tmp1_m + in1; \
+ d1_m = d_tmp1_m + d_tmp2_m; \
+ BUTTERFLY_4(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3); \
+} while (0)
+
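+// For reference, a scalar sketch of one IDCT_1D_W pass (the constants match
+// the portable inverse transform: 20091/65536 ~= sqrt(2)*cos(pi/8) - 1 and
+// 35468/65536 ~= sqrt(2)*sin(pi/8)):
+//
+//   a = in0 + in2;  b = in0 - in2;
+//   c = ((in1 * 35468) >> 16) - (((in3 * 20091) >> 16) + in3);
+//   d = (((in1 * 20091) >> 16) + in1) + ((in3 * 35468) >> 16);
+//   out0 = a + d;  out1 = b + c;  out2 = b - c;  out3 = a - d;
+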
+static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
+ uint8_t* dst) {
+ v8i16 input0, input1;
+ v4i32 in0, in1, in2, in3, hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3;
+ v4i32 res0, res1, res2, res3;
+ v16i8 dest0, dest1, dest2, dest3;
+ const v16i8 zero = { 0 };
+
+ LD_SH2(in, 8, input0, input1);
+ UNPCK_SH_SW(input0, in0, in1);
+ UNPCK_SH_SW(input1, in2, in3);
+ IDCT_1D_W(in0, in1, in2, in3, hz0, hz1, hz2, hz3);
+ TRANSPOSE4x4_SW_SW(hz0, hz1, hz2, hz3, hz0, hz1, hz2, hz3);
+ IDCT_1D_W(hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3);
+ SRARI_W4_SW(vt0, vt1, vt2, vt3, 3);
+ TRANSPOSE4x4_SW_SW(vt0, vt1, vt2, vt3, vt0, vt1, vt2, vt3);
+ LD_SB4(ref, BPS, dest0, dest1, dest2, dest3);
+ ILVR_B4_SW(zero, dest0, zero, dest1, zero, dest2, zero, dest3,
+ res0, res1, res2, res3);
+ ILVR_H4_SW(zero, res0, zero, res1, zero, res2, zero, res3,
+ res0, res1, res2, res3);
+ ADD4(res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0, res1, res2, res3);
+ CLIP_SW4_0_255(res0, res1, res2, res3);
+ PCKEV_B2_SW(res0, res1, res2, res3, vt0, vt1);
+ res0 = (v4i32)__msa_pckev_b((v16i8)vt0, (v16i8)vt1);
+ ST4x4_UB(res0, res0, 3, 2, 1, 0, dst, BPS);
+}
+
+static void ITransform_MSA(const uint8_t* ref, const int16_t* in, uint8_t* dst,
+ int do_two) {
+ ITransformOne(ref, in, dst);
+ if (do_two) {
+ ITransformOne(ref + 4, in + 16, dst + 4);
+ }
+}
+
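+// FTransform_MSA below follows the portable forward transform. One detail
+// worth noting: the __msa_ceqi_w / nor / and / fill(1) sequence near the end
+// appears to implement the '+ (a3 != 0)' rounding correction the portable
+// version applies to the second output row, roughly
+//
+//   out[4 + i] = ((a2 * 2217 + a3 * 5352 + 12000) >> 16) + (a3 != 0);
+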
+static void FTransform_MSA(const uint8_t* src, const uint8_t* ref,
+ int16_t* out) {
+ uint64_t out0, out1, out2, out3;
+ uint32_t in0, in1, in2, in3;
+ v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+ v8i16 t0, t1, t2, t3;
+ v16u8 srcl0, srcl1, src0 = { 0 }, src1 = { 0 };
+ const v8i16 mask0 = { 0, 4, 8, 12, 1, 5, 9, 13 };
+ const v8i16 mask1 = { 3, 7, 11, 15, 2, 6, 10, 14 };
+ const v8i16 mask2 = { 4, 0, 5, 1, 6, 2, 7, 3 };
+ const v8i16 mask3 = { 0, 4, 1, 5, 2, 6, 3, 7 };
+ const v8i16 cnst0 = { 2217, -5352, 2217, -5352, 2217, -5352, 2217, -5352 };
+ const v8i16 cnst1 = { 5352, 2217, 5352, 2217, 5352, 2217, 5352, 2217 };
+
+ LW4(src, BPS, in0, in1, in2, in3);
+ INSERT_W4_UB(in0, in1, in2, in3, src0);
+ LW4(ref, BPS, in0, in1, in2, in3);
+ INSERT_W4_UB(in0, in1, in2, in3, src1);
+ ILVRL_B2_UB(src0, src1, srcl0, srcl1);
+ HSUB_UB2_SH(srcl0, srcl1, t0, t1);
+ VSHF_H2_SH(t0, t1, t0, t1, mask0, mask1, t2, t3);
+ ADDSUB2(t2, t3, t0, t1);
+ t0 = SRLI_H(t0, 3);
+ VSHF_H2_SH(t0, t0, t1, t1, mask2, mask3, t3, t2);
+ tmp0 = __msa_hadd_s_w(t3, t3);
+ tmp2 = __msa_hsub_s_w(t3, t3);
+ FILL_W2_SW(1812, 937, tmp1, tmp3);
+ DPADD_SH2_SW(t2, t2, cnst0, cnst1, tmp3, tmp1);
+ SRAI_W2_SW(tmp1, tmp3, 9);
+ PCKEV_H2_SH(tmp1, tmp0, tmp3, tmp2, t0, t1);
+ VSHF_H2_SH(t0, t1, t0, t1, mask0, mask1, t2, t3);
+ ADDSUB2(t2, t3, t0, t1);
+ VSHF_H2_SH(t0, t0, t1, t1, mask2, mask3, t3, t2);
+ tmp0 = __msa_hadd_s_w(t3, t3);
+ tmp2 = __msa_hsub_s_w(t3, t3);
+ ADDVI_W2_SW(tmp0, 7, tmp2, 7, tmp0, tmp2);
+ SRAI_W2_SW(tmp0, tmp2, 4);
+ FILL_W2_SW(12000, 51000, tmp1, tmp3);
+ DPADD_SH2_SW(t2, t2, cnst0, cnst1, tmp3, tmp1);
+ SRAI_W2_SW(tmp1, tmp3, 16);
+ UNPCK_R_SH_SW(t1, tmp4);
+ tmp5 = __msa_ceqi_w(tmp4, 0);
+ tmp4 = (v4i32)__msa_nor_v((v16u8)tmp5, (v16u8)tmp5);
+ tmp5 = __msa_fill_w(1);
+ tmp5 = (v4i32)__msa_and_v((v16u8)tmp5, (v16u8)tmp4);
+ tmp1 += tmp5;
+ PCKEV_H2_SH(tmp1, tmp0, tmp3, tmp2, t0, t1);
+ out0 = __msa_copy_s_d((v2i64)t0, 0);
+ out1 = __msa_copy_s_d((v2i64)t0, 1);
+ out2 = __msa_copy_s_d((v2i64)t1, 0);
+ out3 = __msa_copy_s_d((v2i64)t1, 1);
+ SD4(out0, out1, out2, out3, out, 8);
+}
+
+static void FTransformWHT_MSA(const int16_t* in, int16_t* out) {
+ v8i16 in0 = { 0 };
+ v8i16 in1 = { 0 };
+ v8i16 tmp0, tmp1, tmp2, tmp3;
+ v8i16 out0, out1;
+ const v8i16 mask0 = { 0, 1, 2, 3, 8, 9, 10, 11 };
+ const v8i16 mask1 = { 4, 5, 6, 7, 12, 13, 14, 15 };
+ const v8i16 mask2 = { 0, 4, 8, 12, 1, 5, 9, 13 };
+ const v8i16 mask3 = { 3, 7, 11, 15, 2, 6, 10, 14 };
+
+ in0 = __msa_insert_h(in0, 0, in[ 0]);
+ in0 = __msa_insert_h(in0, 1, in[ 64]);
+ in0 = __msa_insert_h(in0, 2, in[128]);
+ in0 = __msa_insert_h(in0, 3, in[192]);
+ in0 = __msa_insert_h(in0, 4, in[ 16]);
+ in0 = __msa_insert_h(in0, 5, in[ 80]);
+ in0 = __msa_insert_h(in0, 6, in[144]);
+ in0 = __msa_insert_h(in0, 7, in[208]);
+ in1 = __msa_insert_h(in1, 0, in[ 48]);
+ in1 = __msa_insert_h(in1, 1, in[112]);
+ in1 = __msa_insert_h(in1, 2, in[176]);
+ in1 = __msa_insert_h(in1, 3, in[240]);
+ in1 = __msa_insert_h(in1, 4, in[ 32]);
+ in1 = __msa_insert_h(in1, 5, in[ 96]);
+ in1 = __msa_insert_h(in1, 6, in[160]);
+ in1 = __msa_insert_h(in1, 7, in[224]);
+ ADDSUB2(in0, in1, tmp0, tmp1);
+ VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
+ ADDSUB2(tmp2, tmp3, tmp0, tmp1);
+ VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask2, mask3, in0, in1);
+ ADDSUB2(in0, in1, tmp0, tmp1);
+ VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
+ ADDSUB2(tmp2, tmp3, out0, out1);
+ SRAI_H2_SH(out0, out1, 1);
+ ST_SH2(out0, out1, out, 8);
+}
+
+static int TTransform_MSA(const uint8_t* in, const uint16_t* w) {
+ int sum;
+ uint32_t in0_m, in1_m, in2_m, in3_m;
+ v16i8 src0 = { 0 };
+ v8i16 in0, in1, tmp0, tmp1, tmp2, tmp3;
+ v4i32 dst0, dst1;
+ const v16i8 zero = { 0 };
+ const v8i16 mask0 = { 0, 1, 2, 3, 8, 9, 10, 11 };
+ const v8i16 mask1 = { 4, 5, 6, 7, 12, 13, 14, 15 };
+ const v8i16 mask2 = { 0, 4, 8, 12, 1, 5, 9, 13 };
+ const v8i16 mask3 = { 3, 7, 11, 15, 2, 6, 10, 14 };
+
+ LW4(in, BPS, in0_m, in1_m, in2_m, in3_m);
+ INSERT_W4_SB(in0_m, in1_m, in2_m, in3_m, src0);
+ ILVRL_B2_SH(zero, src0, tmp0, tmp1);
+ VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask2, mask3, in0, in1);
+ ADDSUB2(in0, in1, tmp0, tmp1);
+ VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
+ ADDSUB2(tmp2, tmp3, tmp0, tmp1);
+ VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask2, mask3, in0, in1);
+ ADDSUB2(in0, in1, tmp0, tmp1);
+ VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
+ ADDSUB2(tmp2, tmp3, tmp0, tmp1);
+ tmp0 = __msa_add_a_h(tmp0, (v8i16)zero);
+ tmp1 = __msa_add_a_h(tmp1, (v8i16)zero);
+ LD_SH2(w, 8, tmp2, tmp3);
+ DOTP_SH2_SW(tmp0, tmp1, tmp2, tmp3, dst0, dst1);
+ dst0 = dst0 + dst1;
+ sum = HADD_SW_S32(dst0);
+ return sum;
+}
+
+static int Disto4x4_MSA(const uint8_t* const a, const uint8_t* const b,
+ const uint16_t* const w) {
+ const int sum1 = TTransform_MSA(a, w);
+ const int sum2 = TTransform_MSA(b, w);
+ return abs(sum2 - sum1) >> 5;
+}
+
+static int Disto16x16_MSA(const uint8_t* const a, const uint8_t* const b,
+ const uint16_t* const w) {
+ int D = 0;
+ int x, y;
+ for (y = 0; y < 16 * BPS; y += 4 * BPS) {
+ for (x = 0; x < 16; x += 4) {
+ D += Disto4x4_MSA(a + x + y, b + x + y, w);
+ }
+ }
+ return D;
+}
+
+//------------------------------------------------------------------------------
+// Histogram
+
+static void CollectHistogram_MSA(const uint8_t* ref, const uint8_t* pred,
+ int start_block, int end_block,
+ VP8Histogram* const histo) {
+ int j;
+ int distribution[MAX_COEFF_THRESH + 1] = { 0 };
+ for (j = start_block; j < end_block; ++j) {
+ int16_t out[16];
+ VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
+ {
+ int k;
+ v8i16 coeff0, coeff1;
+ const v8i16 zero = { 0 };
+ const v8i16 max_coeff_thr = __msa_ldi_h(MAX_COEFF_THRESH);
+ LD_SH2(&out[0], 8, coeff0, coeff1);
+ coeff0 = __msa_add_a_h(coeff0, zero);
+ coeff1 = __msa_add_a_h(coeff1, zero);
+ SRAI_H2_SH(coeff0, coeff1, 3);
+ coeff0 = __msa_min_s_h(coeff0, max_coeff_thr);
+ coeff1 = __msa_min_s_h(coeff1, max_coeff_thr);
+ ST_SH2(coeff0, coeff1, &out[0], 8);
+ for (k = 0; k < 16; ++k) {
+ ++distribution[out[k]];
+ }
+ }
+ }
+ VP8SetHistogramData(distribution, histo);
+}
+
+//------------------------------------------------------------------------------
+// Intra predictions
+
+// luma 4x4 prediction
+
+#define DST(x, y) dst[(x) + (y) * BPS]
+#define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
+#define AVG2(a, b) (((a) + (b) + 1) >> 1)
+
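+// The vector predictors below (VE4/RD4/LD4) rely on the exact identity
+//
+//   AVG3(a, b, c) == AVG2(((a) + (c)) >> 1, b)
+//
+// so a truncating byte average (__msa_ave_u_b) of A and C followed by a
+// rounding byte average (__msa_aver_u_b) with B computes AVG3 lane by lane
+// without widening.
+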
+static WEBP_INLINE void VE4(uint8_t* dst, const uint8_t* top) { // vertical
+ const v16u8 A1 = { 0 };
+ const uint64_t val_m = LD(top - 1);
+ const v16u8 A = (v16u8)__msa_insert_d((v2i64)A1, 0, val_m);
+ const v16u8 B = SLDI_UB(A, A, 1);
+ const v16u8 C = SLDI_UB(A, A, 2);
+ const v16u8 AC = __msa_ave_u_b(A, C);
+ const v16u8 B2 = __msa_ave_u_b(B, B);
+ const v16u8 R = __msa_aver_u_b(AC, B2);
+ const uint32_t out = __msa_copy_s_w((v4i32)R, 0);
+ SW4(out, out, out, out, dst, BPS);
+}
+
+static WEBP_INLINE void HE4(uint8_t* dst, const uint8_t* top) { // horizontal
+ const int X = top[-1];
+ const int I = top[-2];
+ const int J = top[-3];
+ const int K = top[-4];
+ const int L = top[-5];
+ WebPUint32ToMem(dst + 0 * BPS, 0x01010101U * AVG3(X, I, J));
+ WebPUint32ToMem(dst + 1 * BPS, 0x01010101U * AVG3(I, J, K));
+ WebPUint32ToMem(dst + 2 * BPS, 0x01010101U * AVG3(J, K, L));
+ WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(K, L, L));
+}
+
+static WEBP_INLINE void DC4(uint8_t* dst, const uint8_t* top) {
+ uint32_t dc = 4;
+ int i;
+ for (i = 0; i < 4; ++i) dc += top[i] + top[-5 + i];
+ dc >>= 3;
+ dc = dc | (dc << 8) | (dc << 16) | (dc << 24);
+ SW4(dc, dc, dc, dc, dst, BPS);
+}
+
+static WEBP_INLINE void RD4(uint8_t* dst, const uint8_t* top) {
+ const v16u8 A2 = { 0 };
+ const uint64_t val_m = LD(top - 5);
+ const v16u8 A1 = (v16u8)__msa_insert_d((v2i64)A2, 0, val_m);
+ const v16u8 A = (v16u8)__msa_insert_b((v16i8)A1, 8, top[3]);
+ const v16u8 B = SLDI_UB(A, A, 1);
+ const v16u8 C = SLDI_UB(A, A, 2);
+ const v16u8 AC = __msa_ave_u_b(A, C);
+ const v16u8 B2 = __msa_ave_u_b(B, B);
+ const v16u8 R0 = __msa_aver_u_b(AC, B2);
+ const v16u8 R1 = SLDI_UB(R0, R0, 1);
+ const v16u8 R2 = SLDI_UB(R1, R1, 1);
+ const v16u8 R3 = SLDI_UB(R2, R2, 1);
+ const uint32_t val0 = __msa_copy_s_w((v4i32)R0, 0);
+ const uint32_t val1 = __msa_copy_s_w((v4i32)R1, 0);
+ const uint32_t val2 = __msa_copy_s_w((v4i32)R2, 0);
+ const uint32_t val3 = __msa_copy_s_w((v4i32)R3, 0);
+ SW4(val3, val2, val1, val0, dst, BPS);
+}
+
+static WEBP_INLINE void LD4(uint8_t* dst, const uint8_t* top) {
+ const v16u8 A1 = { 0 };
+ const uint64_t val_m = LD(top);
+ const v16u8 A = (v16u8)__msa_insert_d((v2i64)A1, 0, val_m);
+ const v16u8 B = SLDI_UB(A, A, 1);
+ const v16u8 C1 = SLDI_UB(A, A, 2);
+ const v16u8 C = (v16u8)__msa_insert_b((v16i8)C1, 6, top[7]);
+ const v16u8 AC = __msa_ave_u_b(A, C);
+ const v16u8 B2 = __msa_ave_u_b(B, B);
+ const v16u8 R0 = __msa_aver_u_b(AC, B2);
+ const v16u8 R1 = SLDI_UB(R0, R0, 1);
+ const v16u8 R2 = SLDI_UB(R1, R1, 1);
+ const v16u8 R3 = SLDI_UB(R2, R2, 1);
+ const uint32_t val0 = __msa_copy_s_w((v4i32)R0, 0);
+ const uint32_t val1 = __msa_copy_s_w((v4i32)R1, 0);
+ const uint32_t val2 = __msa_copy_s_w((v4i32)R2, 0);
+ const uint32_t val3 = __msa_copy_s_w((v4i32)R3, 0);
+ SW4(val0, val1, val2, val3, dst, BPS);
+}
+
+static WEBP_INLINE void VR4(uint8_t* dst, const uint8_t* top) {
+ const int X = top[-1];
+ const int I = top[-2];
+ const int J = top[-3];
+ const int K = top[-4];
+ const int A = top[0];
+ const int B = top[1];
+ const int C = top[2];
+ const int D = top[3];
+ DST(0, 0) = DST(1, 2) = AVG2(X, A);
+ DST(1, 0) = DST(2, 2) = AVG2(A, B);
+ DST(2, 0) = DST(3, 2) = AVG2(B, C);
+ DST(3, 0) = AVG2(C, D);
+ DST(0, 3) = AVG3(K, J, I);
+ DST(0, 2) = AVG3(J, I, X);
+ DST(0, 1) = DST(1, 3) = AVG3(I, X, A);
+ DST(1, 1) = DST(2, 3) = AVG3(X, A, B);
+ DST(2, 1) = DST(3, 3) = AVG3(A, B, C);
+ DST(3, 1) = AVG3(B, C, D);
+}
+
+static WEBP_INLINE void VL4(uint8_t* dst, const uint8_t* top) {
+ const int A = top[0];
+ const int B = top[1];
+ const int C = top[2];
+ const int D = top[3];
+ const int E = top[4];
+ const int F = top[5];
+ const int G = top[6];
+ const int H = top[7];
+ DST(0, 0) = AVG2(A, B);
+ DST(1, 0) = DST(0, 2) = AVG2(B, C);
+ DST(2, 0) = DST(1, 2) = AVG2(C, D);
+ DST(3, 0) = DST(2, 2) = AVG2(D, E);
+ DST(0, 1) = AVG3(A, B, C);
+ DST(1, 1) = DST(0, 3) = AVG3(B, C, D);
+ DST(2, 1) = DST(1, 3) = AVG3(C, D, E);
+ DST(3, 1) = DST(2, 3) = AVG3(D, E, F);
+ DST(3, 2) = AVG3(E, F, G);
+ DST(3, 3) = AVG3(F, G, H);
+}
+
+static WEBP_INLINE void HU4(uint8_t* dst, const uint8_t* top) {
+ const int I = top[-2];
+ const int J = top[-3];
+ const int K = top[-4];
+ const int L = top[-5];
+ DST(0, 0) = AVG2(I, J);
+ DST(2, 0) = DST(0, 1) = AVG2(J, K);
+ DST(2, 1) = DST(0, 2) = AVG2(K, L);
+ DST(1, 0) = AVG3(I, J, K);
+ DST(3, 0) = DST(1, 1) = AVG3(J, K, L);
+ DST(3, 1) = DST(1, 2) = AVG3(K, L, L);
+ DST(3, 2) = DST(2, 2) =
+ DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
+}
+
+static WEBP_INLINE void HD4(uint8_t* dst, const uint8_t* top) {
+ const int X = top[-1];
+ const int I = top[-2];
+ const int J = top[-3];
+ const int K = top[-4];
+ const int L = top[-5];
+ const int A = top[0];
+ const int B = top[1];
+ const int C = top[2];
+ DST(0, 0) = DST(2, 1) = AVG2(I, X);
+ DST(0, 1) = DST(2, 2) = AVG2(J, I);
+ DST(0, 2) = DST(2, 3) = AVG2(K, J);
+ DST(0, 3) = AVG2(L, K);
+ DST(3, 0) = AVG3(A, B, C);
+ DST(2, 0) = AVG3(X, A, B);
+ DST(1, 0) = DST(3, 1) = AVG3(I, X, A);
+ DST(1, 1) = DST(3, 2) = AVG3(J, I, X);
+ DST(1, 2) = DST(3, 3) = AVG3(K, J, I);
+ DST(1, 3) = AVG3(L, K, J);
+}
+
+static WEBP_INLINE void TM4(uint8_t* dst, const uint8_t* top) {
+ const v16i8 zero = { 0 };
+ const v8i16 TL = (v8i16)__msa_fill_h(top[-1]);
+ const v8i16 L0 = (v8i16)__msa_fill_h(top[-2]);
+ const v8i16 L1 = (v8i16)__msa_fill_h(top[-3]);
+ const v8i16 L2 = (v8i16)__msa_fill_h(top[-4]);
+ const v8i16 L3 = (v8i16)__msa_fill_h(top[-5]);
+ const v16u8 T1 = LD_UB(top);
+ const v8i16 T = (v8i16)__msa_ilvr_b(zero, (v16i8)T1);
+ const v8i16 d = T - TL;
+ v8i16 r0, r1, r2, r3;
+ ADD4(d, L0, d, L1, d, L2, d, L3, r0, r1, r2, r3);
+ CLIP_SH4_0_255(r0, r1, r2, r3);
+ PCKEV_ST4x4_UB(r0, r1, r2, r3, dst, BPS);
+}
+
+#undef DST
+#undef AVG3
+#undef AVG2
+
+static void Intra4Preds_MSA(uint8_t* dst, const uint8_t* top) {
+ DC4(I4DC4 + dst, top);
+ TM4(I4TM4 + dst, top);
+ VE4(I4VE4 + dst, top);
+ HE4(I4HE4 + dst, top);
+ RD4(I4RD4 + dst, top);
+ VR4(I4VR4 + dst, top);
+ LD4(I4LD4 + dst, top);
+ VL4(I4VL4 + dst, top);
+ HD4(I4HD4 + dst, top);
+ HU4(I4HU4 + dst, top);
+}
+
+// luma 16x16 prediction
+
+#define STORE16x16(out, dst) do { \
+ ST_UB8(out, out, out, out, out, out, out, out, dst + 0 * BPS, BPS); \
+ ST_UB8(out, out, out, out, out, out, out, out, dst + 8 * BPS, BPS); \
+} while (0)
+
+static WEBP_INLINE void VerticalPred16x16(uint8_t* dst, const uint8_t* top) {
+ if (top != NULL) {
+ const v16u8 out = LD_UB(top);
+ STORE16x16(out, dst);
+ } else {
+ const v16u8 out = (v16u8)__msa_fill_b(0x7f);
+ STORE16x16(out, dst);
+ }
+}
+
+static WEBP_INLINE void HorizontalPred16x16(uint8_t* dst,
+ const uint8_t* left) {
+ if (left != NULL) {
+ int j;
+ for (j = 0; j < 16; j += 4) {
+ const v16u8 L0 = (v16u8)__msa_fill_b(left[0]);
+ const v16u8 L1 = (v16u8)__msa_fill_b(left[1]);
+ const v16u8 L2 = (v16u8)__msa_fill_b(left[2]);
+ const v16u8 L3 = (v16u8)__msa_fill_b(left[3]);
+ ST_UB4(L0, L1, L2, L3, dst, BPS);
+ dst += 4 * BPS;
+ left += 4;
+ }
+ } else {
+ const v16u8 out = (v16u8)__msa_fill_b(0x81);
+ STORE16x16(out, dst);
+ }
+}
+
+static WEBP_INLINE void TrueMotion16x16(uint8_t* dst, const uint8_t* left,
+ const uint8_t* top) {
+ if (left != NULL) {
+ if (top != NULL) {
+ int j;
+ v8i16 d1, d2;
+ const v16i8 zero = { 0 };
+ const v8i16 TL = (v8i16)__msa_fill_h(left[-1]);
+ const v16u8 T = LD_UB(top);
+ ILVRL_B2_SH(zero, T, d1, d2);
+ SUB2(d1, TL, d2, TL, d1, d2);
+ for (j = 0; j < 16; j += 4) {
+ v16i8 t0, t1, t2, t3;
+ v8i16 r0, r1, r2, r3, r4, r5, r6, r7;
+ const v8i16 L0 = (v8i16)__msa_fill_h(left[j + 0]);
+ const v8i16 L1 = (v8i16)__msa_fill_h(left[j + 1]);
+ const v8i16 L2 = (v8i16)__msa_fill_h(left[j + 2]);
+ const v8i16 L3 = (v8i16)__msa_fill_h(left[j + 3]);
+ ADD4(d1, L0, d1, L1, d1, L2, d1, L3, r0, r1, r2, r3);
+ ADD4(d2, L0, d2, L1, d2, L2, d2, L3, r4, r5, r6, r7);
+ CLIP_SH4_0_255(r0, r1, r2, r3);
+ CLIP_SH4_0_255(r4, r5, r6, r7);
+ PCKEV_B4_SB(r4, r0, r5, r1, r6, r2, r7, r3, t0, t1, t2, t3);
+ ST_SB4(t0, t1, t2, t3, dst, BPS);
+ dst += 4 * BPS;
+ }
+ } else {
+ HorizontalPred16x16(dst, left);
+ }
+ } else {
+ if (top != NULL) {
+ VerticalPred16x16(dst, top);
+ } else {
+ const v16u8 out = (v16u8)__msa_fill_b(0x81);
+ STORE16x16(out, dst);
+ }
+ }
+}
+
+static WEBP_INLINE void DCMode16x16(uint8_t* dst, const uint8_t* left,
+ const uint8_t* top) {
+ int DC;
+ v16u8 out;
+ if (top != NULL && left != NULL) {
+ const v16u8 rtop = LD_UB(top);
+ const v8u16 dctop = __msa_hadd_u_h(rtop, rtop);
+ const v16u8 rleft = LD_UB(left);
+ const v8u16 dcleft = __msa_hadd_u_h(rleft, rleft);
+ const v8u16 dctemp = dctop + dcleft;
+ DC = HADD_UH_U32(dctemp);
+ DC = (DC + 16) >> 5;
+ } else if (left != NULL) { // left but no top
+ const v16u8 rleft = LD_UB(left);
+ const v8u16 dcleft = __msa_hadd_u_h(rleft, rleft);
+ DC = HADD_UH_U32(dcleft);
+ DC = (DC + DC + 16) >> 5;
+ } else if (top != NULL) { // top but no left
+ const v16u8 rtop = LD_UB(top);
+ const v8u16 dctop = __msa_hadd_u_h(rtop, rtop);
+ DC = HADD_UH_U32(dctop);
+ DC = (DC + DC + 16) >> 5;
+ } else { // no top, no left, nothing.
+ DC = 0x80;
+ }
+ out = (v16u8)__msa_fill_b(DC);
+ STORE16x16(out, dst);
+}
+
+static void Intra16Preds_MSA(uint8_t* dst,
+ const uint8_t* left, const uint8_t* top) {
+ DCMode16x16(I16DC16 + dst, left, top);
+ VerticalPred16x16(I16VE16 + dst, top);
+ HorizontalPred16x16(I16HE16 + dst, left);
+ TrueMotion16x16(I16TM16 + dst, left, top);
+}
+
+// Chroma 8x8 prediction
+
+#define CALC_DC8(in, out) do { \
+ const v8u16 temp0 = __msa_hadd_u_h(in, in); \
+ const v4u32 temp1 = __msa_hadd_u_w(temp0, temp0); \
+ const v2i64 temp2 = (v2i64)__msa_hadd_u_d(temp1, temp1); \
+ const v2i64 temp3 = __msa_splati_d(temp2, 1); \
+ const v2i64 temp4 = temp3 + temp2; \
+ const v16i8 temp5 = (v16i8)__msa_srari_d(temp4, 4); \
+ const v2i64 temp6 = (v2i64)__msa_splati_b(temp5, 0); \
+ out = __msa_copy_s_d(temp6, 0); \
+} while (0)
+
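+// In scalar terms, CALC_DC8 computes the rounded average of the sixteen bytes
+// in 'in' and broadcasts it to all eight output bytes (a sketch):
+//
+//   uint32_t sum = 0;
+//   for (i = 0; i < 16; ++i) sum += in[i];
+//   dc = (sum + 8) >> 4;                  // __msa_srari_d(temp4, 4)
+//   out = dc * 0x0101010101010101ULL;
+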
+#define STORE8x8(out, dst) do { \
+ SD4(out, out, out, out, dst + 0 * BPS, BPS); \
+ SD4(out, out, out, out, dst + 4 * BPS, BPS); \
+} while (0)
+
+static WEBP_INLINE void VerticalPred8x8(uint8_t* dst, const uint8_t* top) {
+ if (top != NULL) {
+ const uint64_t out = LD(top);
+ STORE8x8(out, dst);
+ } else {
+ const uint64_t out = 0x7f7f7f7f7f7f7f7fULL;
+ STORE8x8(out, dst);
+ }
+}
+
+static WEBP_INLINE void HorizontalPred8x8(uint8_t* dst, const uint8_t* left) {
+ if (left != NULL) {
+ int j;
+ for (j = 0; j < 8; j += 4) {
+ const v16u8 L0 = (v16u8)__msa_fill_b(left[0]);
+ const v16u8 L1 = (v16u8)__msa_fill_b(left[1]);
+ const v16u8 L2 = (v16u8)__msa_fill_b(left[2]);
+ const v16u8 L3 = (v16u8)__msa_fill_b(left[3]);
+ const uint64_t out0 = __msa_copy_s_d((v2i64)L0, 0);
+ const uint64_t out1 = __msa_copy_s_d((v2i64)L1, 0);
+ const uint64_t out2 = __msa_copy_s_d((v2i64)L2, 0);
+ const uint64_t out3 = __msa_copy_s_d((v2i64)L3, 0);
+ SD4(out0, out1, out2, out3, dst, BPS);
+ dst += 4 * BPS;
+ left += 4;
+ }
+ } else {
+ const uint64_t out = 0x8181818181818181ULL;
+ STORE8x8(out, dst);
+ }
+}
+
+static WEBP_INLINE void TrueMotion8x8(uint8_t* dst, const uint8_t* left,
+ const uint8_t* top) {
+ if (left != NULL) {
+ if (top != NULL) {
+ int j;
+ const v8i16 TL = (v8i16)__msa_fill_h(left[-1]);
+ const v16u8 T1 = LD_UB(top);
+ const v16i8 zero = { 0 };
+ const v8i16 T = (v8i16)__msa_ilvr_b(zero, (v16i8)T1);
+ const v8i16 d = T - TL;
+ for (j = 0; j < 8; j += 4) {
+ uint64_t out0, out1, out2, out3;
+ v16i8 t0, t1;
+ v8i16 r0 = (v8i16)__msa_fill_h(left[j + 0]);
+ v8i16 r1 = (v8i16)__msa_fill_h(left[j + 1]);
+ v8i16 r2 = (v8i16)__msa_fill_h(left[j + 2]);
+ v8i16 r3 = (v8i16)__msa_fill_h(left[j + 3]);
+ ADD4(d, r0, d, r1, d, r2, d, r3, r0, r1, r2, r3);
+ CLIP_SH4_0_255(r0, r1, r2, r3);
+ PCKEV_B2_SB(r1, r0, r3, r2, t0, t1);
+ out0 = __msa_copy_s_d((v2i64)t0, 0);
+ out1 = __msa_copy_s_d((v2i64)t0, 1);
+ out2 = __msa_copy_s_d((v2i64)t1, 0);
+ out3 = __msa_copy_s_d((v2i64)t1, 1);
+ SD4(out0, out1, out2, out3, dst, BPS);
+ dst += 4 * BPS;
+ }
+ } else {
+ HorizontalPred8x8(dst, left);
+ }
+ } else {
+ if (top != NULL) {
+ VerticalPred8x8(dst, top);
+ } else {
+ const uint64_t out = 0x8181818181818181ULL;
+ STORE8x8(out, dst);
+ }
+ }
+}
+
+static WEBP_INLINE void DCMode8x8(uint8_t* dst, const uint8_t* left,
+ const uint8_t* top) {
+ uint64_t out;
+ v16u8 src = { 0 };
+ if (top != NULL && left != NULL) {
+ const uint64_t left_m = LD(left);
+ const uint64_t top_m = LD(top);
+ INSERT_D2_UB(left_m, top_m, src);
+ CALC_DC8(src, out);
+ } else if (left != NULL) { // left but no top
+ const uint64_t left_m = LD(left);
+ INSERT_D2_UB(left_m, left_m, src);
+ CALC_DC8(src, out);
+ } else if (top != NULL) { // top but no left
+ const uint64_t top_m = LD(top);
+ INSERT_D2_UB(top_m, top_m, src);
+ CALC_DC8(src, out);
+ } else { // no top, no left, nothing.
+ src = (v16u8)__msa_fill_b(0x80);
+ out = __msa_copy_s_d((v2i64)src, 0);
+ }
+ STORE8x8(out, dst);
+}
+
+static void IntraChromaPreds_MSA(uint8_t* dst, const uint8_t* left,
+ const uint8_t* top) {
+ // U block
+ DCMode8x8(C8DC8 + dst, left, top);
+ VerticalPred8x8(C8VE8 + dst, top);
+ HorizontalPred8x8(C8HE8 + dst, left);
+ TrueMotion8x8(C8TM8 + dst, left, top);
+ // V block
+ dst += 8;
+ if (top != NULL) top += 8;
+ if (left != NULL) left += 16;
+ DCMode8x8(C8DC8 + dst, left, top);
+ VerticalPred8x8(C8VE8 + dst, top);
+ HorizontalPred8x8(C8HE8 + dst, left);
+ TrueMotion8x8(C8TM8 + dst, left, top);
+}
+
+//------------------------------------------------------------------------------
+// Metric
+
+#define PACK_DOTP_UB4_SW(in0, in1, in2, in3, out0, out1, out2, out3) do { \
+ v16u8 tmp0, tmp1; \
+ v8i16 tmp2, tmp3; \
+ ILVRL_B2_UB(in0, in1, tmp0, tmp1); \
+ HSUB_UB2_SH(tmp0, tmp1, tmp2, tmp3); \
+ DOTP_SH2_SW(tmp2, tmp3, tmp2, tmp3, out0, out1); \
+ ILVRL_B2_UB(in2, in3, tmp0, tmp1); \
+ HSUB_UB2_SH(tmp0, tmp1, tmp2, tmp3); \
+ DOTP_SH2_SW(tmp2, tmp3, tmp2, tmp3, out2, out3); \
+} while (0)
+
+#define PACK_DPADD_UB4_SW(in0, in1, in2, in3, out0, out1, out2, out3) do { \
+ v16u8 tmp0, tmp1; \
+ v8i16 tmp2, tmp3; \
+ ILVRL_B2_UB(in0, in1, tmp0, tmp1); \
+ HSUB_UB2_SH(tmp0, tmp1, tmp2, tmp3); \
+ DPADD_SH2_SW(tmp2, tmp3, tmp2, tmp3, out0, out1); \
+ ILVRL_B2_UB(in2, in3, tmp0, tmp1); \
+ HSUB_UB2_SH(tmp0, tmp1, tmp2, tmp3); \
+ DPADD_SH2_SW(tmp2, tmp3, tmp2, tmp3, out2, out3); \
+} while (0)
+
+static int SSE16x16_MSA(const uint8_t* a, const uint8_t* b) {
+ uint32_t sum;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
+ v4i32 out0, out1, out2, out3;
+
+ LD_UB8(a, BPS, src0, src1, src2, src3, src4, src5, src6, src7);
+ LD_UB8(b, BPS, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
+ PACK_DOTP_UB4_SW(src0, ref0, src1, ref1, out0, out1, out2, out3);
+ PACK_DPADD_UB4_SW(src2, ref2, src3, ref3, out0, out1, out2, out3);
+ PACK_DPADD_UB4_SW(src4, ref4, src5, ref5, out0, out1, out2, out3);
+ PACK_DPADD_UB4_SW(src6, ref6, src7, ref7, out0, out1, out2, out3);
+ a += 8 * BPS;
+ b += 8 * BPS;
+ LD_UB8(a, BPS, src0, src1, src2, src3, src4, src5, src6, src7);
+ LD_UB8(b, BPS, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
+ PACK_DPADD_UB4_SW(src0, ref0, src1, ref1, out0, out1, out2, out3);
+ PACK_DPADD_UB4_SW(src2, ref2, src3, ref3, out0, out1, out2, out3);
+ PACK_DPADD_UB4_SW(src4, ref4, src5, ref5, out0, out1, out2, out3);
+ PACK_DPADD_UB4_SW(src6, ref6, src7, ref7, out0, out1, out2, out3);
+ out0 += out1;
+ out2 += out3;
+ out0 += out2;
+ sum = HADD_SW_S32(out0);
+ return sum;
+}
+
+static int SSE16x8_MSA(const uint8_t* a, const uint8_t* b) {
+ uint32_t sum;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
+ v4i32 out0, out1, out2, out3;
+
+ LD_UB8(a, BPS, src0, src1, src2, src3, src4, src5, src6, src7);
+ LD_UB8(b, BPS, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
+ PACK_DOTP_UB4_SW(src0, ref0, src1, ref1, out0, out1, out2, out3);
+ PACK_DPADD_UB4_SW(src2, ref2, src3, ref3, out0, out1, out2, out3);
+ PACK_DPADD_UB4_SW(src4, ref4, src5, ref5, out0, out1, out2, out3);
+ PACK_DPADD_UB4_SW(src6, ref6, src7, ref7, out0, out1, out2, out3);
+ out0 += out1;
+ out2 += out3;
+ out0 += out2;
+ sum = HADD_SW_S32(out0);
+ return sum;
+}
+
+static int SSE8x8_MSA(const uint8_t* a, const uint8_t* b) {
+ uint32_t sum;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
+ v16u8 t0, t1, t2, t3;
+ v4i32 out0, out1, out2, out3;
+
+ LD_UB8(a, BPS, src0, src1, src2, src3, src4, src5, src6, src7);
+ LD_UB8(b, BPS, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
+ ILVR_B4_UB(src0, src1, src2, src3, ref0, ref1, ref2, ref3, t0, t1, t2, t3);
+ PACK_DOTP_UB4_SW(t0, t2, t1, t3, out0, out1, out2, out3);
+ ILVR_B4_UB(src4, src5, src6, src7, ref4, ref5, ref6, ref7, t0, t1, t2, t3);
+ PACK_DPADD_UB4_SW(t0, t2, t1, t3, out0, out1, out2, out3);
+ out0 += out1;
+ out2 += out3;
+ out0 += out2;
+ sum = HADD_SW_S32(out0);
+ return sum;
+}
+
+static int SSE4x4_MSA(const uint8_t* a, const uint8_t* b) {
+ uint32_t sum = 0;
+ uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
+ v16u8 src = { 0 }, ref = { 0 }, tmp0, tmp1;
+ v8i16 diff0, diff1;
+ v4i32 out0, out1;
+
+ LW4(a, BPS, src0, src1, src2, src3);
+ LW4(b, BPS, ref0, ref1, ref2, ref3);
+ INSERT_W4_UB(src0, src1, src2, src3, src);
+ INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+ ILVRL_B2_UB(src, ref, tmp0, tmp1);
+ HSUB_UB2_SH(tmp0, tmp1, diff0, diff1);
+ DOTP_SH2_SW(diff0, diff1, diff0, diff1, out0, out1);
+ out0 += out1;
+ sum = HADD_SW_S32(out0);
+ return sum;
+}
+
+//------------------------------------------------------------------------------
+// Quantization
+
+static int QuantizeBlock_MSA(int16_t in[16], int16_t out[16],
+ const VP8Matrix* const mtx) {
+ int sum;
+ v8i16 in0, in1, sh0, sh1, out0, out1;
+ v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, sign0, sign1;
+ v4i32 s0, s1, s2, s3, b0, b1, b2, b3, t0, t1, t2, t3;
+ const v8i16 zero = { 0 };
+ const v8i16 zigzag0 = { 0, 1, 4, 8, 5, 2, 3, 6 };
+ const v8i16 zigzag1 = { 9, 12, 13, 10, 7, 11, 14, 15 };
+ const v8i16 maxlevel = __msa_fill_h(MAX_LEVEL);
+
+ LD_SH2(&in[0], 8, in0, in1);
+ LD_SH2(&mtx->sharpen_[0], 8, sh0, sh1);
+ tmp4 = __msa_add_a_h(in0, zero);
+ tmp5 = __msa_add_a_h(in1, zero);
+ ILVRL_H2_SH(sh0, tmp4, tmp0, tmp1);
+ ILVRL_H2_SH(sh1, tmp5, tmp2, tmp3);
+ HADD_SH4_SW(tmp0, tmp1, tmp2, tmp3, s0, s1, s2, s3);
+ sign0 = (in0 < zero);
+ sign1 = (in1 < zero); // sign
+ LD_SH2(&mtx->iq_[0], 8, tmp0, tmp1); // iq
+ ILVRL_H2_SW(zero, tmp0, t0, t1);
+ ILVRL_H2_SW(zero, tmp1, t2, t3);
+ LD_SW4(&mtx->bias_[0], 4, b0, b1, b2, b3); // bias
+ MUL4(t0, s0, t1, s1, t2, s2, t3, s3, t0, t1, t2, t3);
+ ADD4(b0, t0, b1, t1, b2, t2, b3, t3, b0, b1, b2, b3);
+ SRAI_W4_SW(b0, b1, b2, b3, 17);
+ PCKEV_H2_SH(b1, b0, b3, b2, tmp2, tmp3);
+ tmp0 = (tmp2 > maxlevel);
+ tmp1 = (tmp3 > maxlevel);
+ tmp2 = (v8i16)__msa_bmnz_v((v16u8)tmp2, (v16u8)maxlevel, (v16u8)tmp0);
+ tmp3 = (v8i16)__msa_bmnz_v((v16u8)tmp3, (v16u8)maxlevel, (v16u8)tmp1);
+ SUB2(zero, tmp2, zero, tmp3, tmp0, tmp1);
+ tmp2 = (v8i16)__msa_bmnz_v((v16u8)tmp2, (v16u8)tmp0, (v16u8)sign0);
+ tmp3 = (v8i16)__msa_bmnz_v((v16u8)tmp3, (v16u8)tmp1, (v16u8)sign1);
+ LD_SW4(&mtx->zthresh_[0], 4, t0, t1, t2, t3); // zthresh
+ t0 = (s0 > t0);
+ t1 = (s1 > t1);
+ t2 = (s2 > t2);
+ t3 = (s3 > t3);
+ PCKEV_H2_SH(t1, t0, t3, t2, tmp0, tmp1);
+ tmp4 = (v8i16)__msa_bmnz_v((v16u8)zero, (v16u8)tmp2, (v16u8)tmp0);
+ tmp5 = (v8i16)__msa_bmnz_v((v16u8)zero, (v16u8)tmp3, (v16u8)tmp1);
+ LD_SH2(&mtx->q_[0], 8, tmp0, tmp1);
+ MUL2(tmp4, tmp0, tmp5, tmp1, in0, in1);
+ VSHF_H2_SH(tmp4, tmp5, tmp4, tmp5, zigzag0, zigzag1, out0, out1);
+ ST_SH2(in0, in1, &in[0], 8);
+ ST_SH2(out0, out1, &out[0], 8);
+ out0 = __msa_add_a_h(out0, out1);
+ sum = HADD_SH_S32(out0);
+ return (sum > 0);
+}
+
+static int Quantize2Blocks_MSA(int16_t in[32], int16_t out[32],
+ const VP8Matrix* const mtx) {
+ int nz;
+ nz = VP8EncQuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
+ nz |= VP8EncQuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
+ return nz;
+}
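+
+// Note: the returned 'nz' is a two-bit non-zero mask: bit 0 is set when the
+// first 4x4 block has any non-zero quantized coefficient, bit 1 when the
+// second one does (e.g. nz == 3 means both blocks carry non-zero levels).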
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8EncDspInitMSA(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMSA(void) {
+ VP8ITransform = ITransform_MSA;
+ VP8FTransform = FTransform_MSA;
+ VP8FTransformWHT = FTransformWHT_MSA;
+
+ VP8TDisto4x4 = Disto4x4_MSA;
+ VP8TDisto16x16 = Disto16x16_MSA;
+ VP8CollectHistogram = CollectHistogram_MSA;
+
+ VP8EncPredLuma4 = Intra4Preds_MSA;
+ VP8EncPredLuma16 = Intra16Preds_MSA;
+ VP8EncPredChroma8 = IntraChromaPreds_MSA;
+
+ VP8SSE16x16 = SSE16x16_MSA;
+ VP8SSE16x8 = SSE16x8_MSA;
+ VP8SSE8x8 = SSE8x8_MSA;
+ VP8SSE4x4 = SSE4x4_MSA;
+
+ VP8EncQuantizeBlock = QuantizeBlock_MSA;
+ VP8EncQuantize2Blocks = Quantize2Blocks_MSA;
+ VP8EncQuantizeBlockWHT = QuantizeBlock_MSA;
+}
+
+#else // !WEBP_USE_MSA
+
+WEBP_DSP_INIT_STUB(VP8EncDspInitMSA)
+
+#endif // WEBP_USE_MSA
diff --git a/media/libwebp/dsp/enc_neon.c b/media/libwebp/dsp/enc_neon.c
new file mode 100644
index 0000000000..657be9b21b
--- /dev/null
+++ b/media/libwebp/dsp/enc_neon.c
@@ -0,0 +1,938 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// ARM NEON version of speed-critical encoding functions.
+//
+// adapted from libvpx (https://www.webmproject.org/code/)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_NEON)
+
+#include <assert.h>
+
+#include "../dsp/neon.h"
+#include "../enc/vp8i_enc.h"
+
+//------------------------------------------------------------------------------
+// Transforms (Paragraph 14.4)
+
+// Inverse transform.
+// This code is pretty much the same as TransformOne in dec_neon.c, except
+// for the subtraction to *ref. See the comments there for algorithmic
+// explanations.
+
+static const int16_t kC1 = 20091;
+static const int16_t kC2 = 17734;  // half of the true kC2 (= 35468); see the comment above.
+
+// This code works but is *slower* than the inlined-asm version below
+// (with gcc-4.6), so it is only used when the WEBP_USE_INTRINSICS define
+// is set. With gcc-4.8, it's a little faster than the inlined assembly.
+#if defined(WEBP_USE_INTRINSICS)
+
+// Treats 'v' as a uint8x8_t and zero-extends it to an int16x8_t.
+static WEBP_INLINE int16x8_t ConvertU8ToS16_NEON(uint32x2_t v) {
+ return vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(v)));
+}
+
+// Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result
+// to the corresponding rows of 'dst'.
+static WEBP_INLINE void SaturateAndStore4x4_NEON(uint8_t* const dst,
+ const int16x8_t dst01,
+ const int16x8_t dst23) {
+ // Unsigned saturate to 8b.
+ const uint8x8_t dst01_u8 = vqmovun_s16(dst01);
+ const uint8x8_t dst23_u8 = vqmovun_s16(dst23);
+
+ // Store the results.
+ vst1_lane_u32((uint32_t*)(dst + 0 * BPS), vreinterpret_u32_u8(dst01_u8), 0);
+ vst1_lane_u32((uint32_t*)(dst + 1 * BPS), vreinterpret_u32_u8(dst01_u8), 1);
+ vst1_lane_u32((uint32_t*)(dst + 2 * BPS), vreinterpret_u32_u8(dst23_u8), 0);
+ vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1);
+}
+
+static WEBP_INLINE void Add4x4_NEON(const int16x8_t row01,
+ const int16x8_t row23,
+ const uint8_t* const ref,
+ uint8_t* const dst) {
+ uint32x2_t dst01 = vdup_n_u32(0);
+ uint32x2_t dst23 = vdup_n_u32(0);
+
+ // Load the source pixels.
+ dst01 = vld1_lane_u32((uint32_t*)(ref + 0 * BPS), dst01, 0);
+ dst23 = vld1_lane_u32((uint32_t*)(ref + 2 * BPS), dst23, 0);
+ dst01 = vld1_lane_u32((uint32_t*)(ref + 1 * BPS), dst01, 1);
+ dst23 = vld1_lane_u32((uint32_t*)(ref + 3 * BPS), dst23, 1);
+
+ {
+ // Convert to 16b.
+ const int16x8_t dst01_s16 = ConvertU8ToS16_NEON(dst01);
+ const int16x8_t dst23_s16 = ConvertU8ToS16_NEON(dst23);
+
+ // Descale with rounding.
+ const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
+ const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3);
+ // Add the inverse transform.
+ SaturateAndStore4x4_NEON(dst, out01, out23);
+ }
+}
+
+static WEBP_INLINE void Transpose8x2_NEON(const int16x8_t in0,
+ const int16x8_t in1,
+ int16x8x2_t* const out) {
+ // a0 a1 a2 a3 | b0 b1 b2 b3 => a0 b0 c0 d0 | a1 b1 c1 d1
+ // c0 c1 c2 c3 | d0 d1 d2 d3 a2 b2 c2 d2 | a3 b3 c3 d3
+ const int16x8x2_t tmp0 = vzipq_s16(in0, in1); // a0 c0 a1 c1 a2 c2 ...
+ // b0 d0 b1 d1 b2 d2 ...
+ *out = vzipq_s16(tmp0.val[0], tmp0.val[1]);
+}
+
+static WEBP_INLINE void TransformPass_NEON(int16x8x2_t* const rows) {
+ // {rows} = in0 | in4
+ // in8 | in12
+ // B1 = in4 | in12
+ const int16x8_t B1 =
+ vcombine_s16(vget_high_s16(rows->val[0]), vget_high_s16(rows->val[1]));
+ // C0 = kC1 * in4 | kC1 * in12
+ // C1 = kC2 * in4 | kC2 * in12
+ const int16x8_t C0 = vsraq_n_s16(B1, vqdmulhq_n_s16(B1, kC1), 1);
+ const int16x8_t C1 = vqdmulhq_n_s16(B1, kC2);
+ const int16x4_t a = vqadd_s16(vget_low_s16(rows->val[0]),
+ vget_low_s16(rows->val[1])); // in0 + in8
+ const int16x4_t b = vqsub_s16(vget_low_s16(rows->val[0]),
+ vget_low_s16(rows->val[1])); // in0 - in8
+ // c = kC2 * in4 - kC1 * in12
+ // d = kC1 * in4 + kC2 * in12
+ const int16x4_t c = vqsub_s16(vget_low_s16(C1), vget_high_s16(C0));
+ const int16x4_t d = vqadd_s16(vget_low_s16(C0), vget_high_s16(C1));
+ const int16x8_t D0 = vcombine_s16(a, b); // D0 = a | b
+ const int16x8_t D1 = vcombine_s16(d, c); // D1 = d | c
+ const int16x8_t E0 = vqaddq_s16(D0, D1); // a+d | b+c
+ const int16x8_t E_tmp = vqsubq_s16(D0, D1); // a-d | b-c
+ const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));
+ Transpose8x2_NEON(E0, E1, rows);
+}
+
+static void ITransformOne_NEON(const uint8_t* ref,
+ const int16_t* in, uint8_t* dst) {
+ int16x8x2_t rows;
+ INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
+ TransformPass_NEON(&rows);
+ TransformPass_NEON(&rows);
+ Add4x4_NEON(rows.val[0], rows.val[1], ref, dst);
+}
+
+#else
+
+static void ITransformOne_NEON(const uint8_t* ref,
+ const int16_t* in, uint8_t* dst) {
+ const int kBPS = BPS;
+ const int16_t kC1C2[] = { kC1, kC2, 0, 0 };
+
+ __asm__ volatile (
+ "vld1.16 {q1, q2}, [%[in]] \n"
+ "vld1.16 {d0}, [%[kC1C2]] \n"
+
+ // d2: in[0]
+ // d3: in[8]
+ // d4: in[4]
+ // d5: in[12]
+ "vswp d3, d4 \n"
+
+ // q8 = {in[4], in[12]} * kC1 * 2 >> 16
+ // q9 = {in[4], in[12]} * kC2 >> 16
+ "vqdmulh.s16 q8, q2, d0[0] \n"
+ "vqdmulh.s16 q9, q2, d0[1] \n"
+
+ // d22 = a = in[0] + in[8]
+ // d23 = b = in[0] - in[8]
+ "vqadd.s16 d22, d2, d3 \n"
+ "vqsub.s16 d23, d2, d3 \n"
+
+ // q8 = in[4]/[12] * kC1 >> 16
+ "vshr.s16 q8, q8, #1 \n"
+
+ // Add {in[4], in[12]} back after the multiplication.
+ "vqadd.s16 q8, q2, q8 \n"
+
+ // d20 = c = in[4]*kC2 - in[12]*kC1
+ // d21 = d = in[4]*kC1 + in[12]*kC2
+ "vqsub.s16 d20, d18, d17 \n"
+ "vqadd.s16 d21, d19, d16 \n"
+
+ // d2 = tmp[0] = a + d
+ // d3 = tmp[1] = b + c
+ // d4 = tmp[2] = b - c
+ // d5 = tmp[3] = a - d
+ "vqadd.s16 d2, d22, d21 \n"
+ "vqadd.s16 d3, d23, d20 \n"
+ "vqsub.s16 d4, d23, d20 \n"
+ "vqsub.s16 d5, d22, d21 \n"
+
+ "vzip.16 q1, q2 \n"
+ "vzip.16 q1, q2 \n"
+
+ "vswp d3, d4 \n"
+
+ // q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16
+ // q9 = {tmp[4], tmp[12]} * kC2 >> 16
+ "vqdmulh.s16 q8, q2, d0[0] \n"
+ "vqdmulh.s16 q9, q2, d0[1] \n"
+
+ // d22 = a = tmp[0] + tmp[8]
+ // d23 = b = tmp[0] - tmp[8]
+ "vqadd.s16 d22, d2, d3 \n"
+ "vqsub.s16 d23, d2, d3 \n"
+
+ "vshr.s16 q8, q8, #1 \n"
+ "vqadd.s16 q8, q2, q8 \n"
+
+    // d20 = c = tmp[4]*kC2 - tmp[12]*kC1
+    // d21 = d = tmp[4]*kC1 + tmp[12]*kC2
+ "vqsub.s16 d20, d18, d17 \n"
+ "vqadd.s16 d21, d19, d16 \n"
+
+ // d2 = tmp[0] = a + d
+ // d3 = tmp[1] = b + c
+ // d4 = tmp[2] = b - c
+ // d5 = tmp[3] = a - d
+ "vqadd.s16 d2, d22, d21 \n"
+ "vqadd.s16 d3, d23, d20 \n"
+ "vqsub.s16 d4, d23, d20 \n"
+ "vqsub.s16 d5, d22, d21 \n"
+
+ "vld1.32 d6[0], [%[ref]], %[kBPS] \n"
+ "vld1.32 d6[1], [%[ref]], %[kBPS] \n"
+ "vld1.32 d7[0], [%[ref]], %[kBPS] \n"
+ "vld1.32 d7[1], [%[ref]], %[kBPS] \n"
+
+ "sub %[ref], %[ref], %[kBPS], lsl #2 \n"
+
+    // descale: (val + 4) >> 3
+ "vrshr.s16 d2, d2, #3 \n"
+ "vrshr.s16 d3, d3, #3 \n"
+ "vrshr.s16 d4, d4, #3 \n"
+ "vrshr.s16 d5, d5, #3 \n"
+
+ "vzip.16 q1, q2 \n"
+ "vzip.16 q1, q2 \n"
+
+ // Must accumulate before saturating
+ "vmovl.u8 q8, d6 \n"
+ "vmovl.u8 q9, d7 \n"
+
+ "vqadd.s16 q1, q1, q8 \n"
+ "vqadd.s16 q2, q2, q9 \n"
+
+ "vqmovun.s16 d0, q1 \n"
+ "vqmovun.s16 d1, q2 \n"
+
+ "vst1.32 d0[0], [%[dst]], %[kBPS] \n"
+ "vst1.32 d0[1], [%[dst]], %[kBPS] \n"
+ "vst1.32 d1[0], [%[dst]], %[kBPS] \n"
+ "vst1.32 d1[1], [%[dst]] \n"
+
+ : [in] "+r"(in), [dst] "+r"(dst) // modified registers
+ : [kBPS] "r"(kBPS), [kC1C2] "r"(kC1C2), [ref] "r"(ref) // constants
+ : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11" // clobbered
+ );
+}
+
+#endif // WEBP_USE_INTRINSICS
+
+static void ITransform_NEON(const uint8_t* ref,
+ const int16_t* in, uint8_t* dst, int do_two) {
+ ITransformOne_NEON(ref, in, dst);
+ if (do_two) {
+ ITransformOne_NEON(ref + 4, in + 16, dst + 4);
+ }
+}
+
+// Load all 4x4 pixels into a single uint8x16_t variable.
+static uint8x16_t Load4x4_NEON(const uint8_t* src) {
+ uint32x4_t out = vdupq_n_u32(0);
+ out = vld1q_lane_u32((const uint32_t*)(src + 0 * BPS), out, 0);
+ out = vld1q_lane_u32((const uint32_t*)(src + 1 * BPS), out, 1);
+ out = vld1q_lane_u32((const uint32_t*)(src + 2 * BPS), out, 2);
+ out = vld1q_lane_u32((const uint32_t*)(src + 3 * BPS), out, 3);
+ return vreinterpretq_u8_u32(out);
+}
+
+// Forward transform.
+
+#if defined(WEBP_USE_INTRINSICS)
+
+static WEBP_INLINE void Transpose4x4_S16_NEON(const int16x4_t A,
+ const int16x4_t B,
+ const int16x4_t C,
+ const int16x4_t D,
+ int16x8_t* const out01,
+ int16x8_t* const out32) {
+ const int16x4x2_t AB = vtrn_s16(A, B);
+ const int16x4x2_t CD = vtrn_s16(C, D);
+ const int32x2x2_t tmp02 = vtrn_s32(vreinterpret_s32_s16(AB.val[0]),
+ vreinterpret_s32_s16(CD.val[0]));
+ const int32x2x2_t tmp13 = vtrn_s32(vreinterpret_s32_s16(AB.val[1]),
+ vreinterpret_s32_s16(CD.val[1]));
+ *out01 = vreinterpretq_s16_s64(
+ vcombine_s64(vreinterpret_s64_s32(tmp02.val[0]),
+ vreinterpret_s64_s32(tmp13.val[0])));
+ *out32 = vreinterpretq_s16_s64(
+ vcombine_s64(vreinterpret_s64_s32(tmp13.val[1]),
+ vreinterpret_s64_s32(tmp02.val[1])));
+}
+
+static WEBP_INLINE int16x8_t DiffU8ToS16_NEON(const uint8x8_t a,
+ const uint8x8_t b) {
+ return vreinterpretq_s16_u16(vsubl_u8(a, b));
+}
+
+static void FTransform_NEON(const uint8_t* src, const uint8_t* ref,
+ int16_t* out) {
+ int16x8_t d0d1, d3d2; // working 4x4 int16 variables
+ {
+ const uint8x16_t S0 = Load4x4_NEON(src);
+ const uint8x16_t R0 = Load4x4_NEON(ref);
+ const int16x8_t D0D1 = DiffU8ToS16_NEON(vget_low_u8(S0), vget_low_u8(R0));
+ const int16x8_t D2D3 = DiffU8ToS16_NEON(vget_high_u8(S0), vget_high_u8(R0));
+ const int16x4_t D0 = vget_low_s16(D0D1);
+ const int16x4_t D1 = vget_high_s16(D0D1);
+ const int16x4_t D2 = vget_low_s16(D2D3);
+ const int16x4_t D3 = vget_high_s16(D2D3);
+ Transpose4x4_S16_NEON(D0, D1, D2, D3, &d0d1, &d3d2);
+ }
+  { // 1st pass
+ const int32x4_t kCst937 = vdupq_n_s32(937);
+ const int32x4_t kCst1812 = vdupq_n_s32(1812);
+ const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2); // d0+d3 | d1+d2 (=a0|a1)
+ const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2); // d0-d3 | d1-d2 (=a3|a2)
+ const int16x8_t a0a1_2 = vshlq_n_s16(a0a1, 3);
+ const int16x4_t tmp0 = vadd_s16(vget_low_s16(a0a1_2),
+ vget_high_s16(a0a1_2));
+ const int16x4_t tmp2 = vsub_s16(vget_low_s16(a0a1_2),
+ vget_high_s16(a0a1_2));
+ const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217);
+ const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217);
+ const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352);
+ const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
+ const int16x4_t tmp1 = vshrn_n_s32(vaddq_s32(a2_p_a3, kCst1812), 9);
+ const int16x4_t tmp3 = vshrn_n_s32(vaddq_s32(a3_m_a2, kCst937), 9);
+ Transpose4x4_S16_NEON(tmp0, tmp1, tmp2, tmp3, &d0d1, &d3d2);
+ }
+ { // 2nd pass
+    // the (1<<16) addition implements the '+1' in: (a3 != 0) == 1 - (a3 == 0)
+ const int32x4_t kCst12000 = vdupq_n_s32(12000 + (1 << 16));
+ const int32x4_t kCst51000 = vdupq_n_s32(51000);
+ const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2); // d0+d3 | d1+d2 (=a0|a1)
+ const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2); // d0-d3 | d1-d2 (=a3|a2)
+ const int16x4_t a0_k7 = vadd_s16(vget_low_s16(a0a1), vdup_n_s16(7));
+ const int16x4_t out0 = vshr_n_s16(vadd_s16(a0_k7, vget_high_s16(a0a1)), 4);
+ const int16x4_t out2 = vshr_n_s16(vsub_s16(a0_k7, vget_high_s16(a0a1)), 4);
+ const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217);
+ const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217);
+ const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352);
+ const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
+ const int16x4_t tmp1 = vaddhn_s32(a2_p_a3, kCst12000);
+ const int16x4_t out3 = vaddhn_s32(a3_m_a2, kCst51000);
+ const int16x4_t a3_eq_0 =
+ vreinterpret_s16_u16(vceq_s16(vget_low_s16(a3a2), vdup_n_s16(0)));
+ const int16x4_t out1 = vadd_s16(tmp1, a3_eq_0);
+ vst1_s16(out + 0, out0);
+ vst1_s16(out + 4, out1);
+ vst1_s16(out + 8, out2);
+ vst1_s16(out + 12, out3);
+ }
+}
+
+#else
+
+// adapted from vp8/encoder/arm/neon/shortfdct_neon.asm
+static const int16_t kCoeff16[] = {
+ 5352, 5352, 5352, 5352, 2217, 2217, 2217, 2217
+};
+static const int32_t kCoeff32[] = {
+ 1812, 1812, 1812, 1812,
+ 937, 937, 937, 937,
+ 12000, 12000, 12000, 12000,
+ 51000, 51000, 51000, 51000
+};
+
+static void FTransform_NEON(const uint8_t* src, const uint8_t* ref,
+ int16_t* out) {
+ const int kBPS = BPS;
+ const uint8_t* src_ptr = src;
+ const uint8_t* ref_ptr = ref;
+ const int16_t* coeff16 = kCoeff16;
+ const int32_t* coeff32 = kCoeff32;
+
+ __asm__ volatile (
+ // load src into q4, q5 in high half
+ "vld1.8 {d8}, [%[src_ptr]], %[kBPS] \n"
+ "vld1.8 {d10}, [%[src_ptr]], %[kBPS] \n"
+ "vld1.8 {d9}, [%[src_ptr]], %[kBPS] \n"
+ "vld1.8 {d11}, [%[src_ptr]] \n"
+
+ // load ref into q6, q7 in high half
+ "vld1.8 {d12}, [%[ref_ptr]], %[kBPS] \n"
+ "vld1.8 {d14}, [%[ref_ptr]], %[kBPS] \n"
+ "vld1.8 {d13}, [%[ref_ptr]], %[kBPS] \n"
+ "vld1.8 {d15}, [%[ref_ptr]] \n"
+
+    // Pack the high values into q4 and q6
+ "vtrn.32 q4, q5 \n"
+ "vtrn.32 q6, q7 \n"
+
+ // d[0-3] = src - ref
+ "vsubl.u8 q0, d8, d12 \n"
+ "vsubl.u8 q1, d9, d13 \n"
+
+ // load coeff16 into q8(d16=5352, d17=2217)
+ "vld1.16 {q8}, [%[coeff16]] \n"
+
+ // load coeff32 high half into q9 = 1812, q10 = 937
+ "vld1.32 {q9, q10}, [%[coeff32]]! \n"
+
+ // load coeff32 low half into q11=12000, q12=51000
+ "vld1.32 {q11,q12}, [%[coeff32]] \n"
+
+ // part 1
+ // Transpose. Register dN is the same as dN in C
+ "vtrn.32 d0, d2 \n"
+ "vtrn.32 d1, d3 \n"
+ "vtrn.16 d0, d1 \n"
+ "vtrn.16 d2, d3 \n"
+
+ "vadd.s16 d4, d0, d3 \n" // a0 = d0 + d3
+ "vadd.s16 d5, d1, d2 \n" // a1 = d1 + d2
+ "vsub.s16 d6, d1, d2 \n" // a2 = d1 - d2
+ "vsub.s16 d7, d0, d3 \n" // a3 = d0 - d3
+
+ "vadd.s16 d0, d4, d5 \n" // a0 + a1
+ "vshl.s16 d0, d0, #3 \n" // temp[0+i*4] = (a0+a1) << 3
+ "vsub.s16 d2, d4, d5 \n" // a0 - a1
+    "vshl.s16        d2, d2, #3            \n"   // temp[2+i*4] = (a0-a1) << 3
+
+ "vmlal.s16 q9, d7, d16 \n" // a3*5352 + 1812
+ "vmlal.s16 q10, d7, d17 \n" // a3*2217 + 937
+ "vmlal.s16 q9, d6, d17 \n" // a2*2217 + a3*5352 + 1812
+ "vmlsl.s16 q10, d6, d16 \n" // a3*2217 + 937 - a2*5352
+
+ // temp[1+i*4] = (d2*2217 + d3*5352 + 1812) >> 9
+ // temp[3+i*4] = (d3*2217 + 937 - d2*5352) >> 9
+ "vshrn.s32 d1, q9, #9 \n"
+ "vshrn.s32 d3, q10, #9 \n"
+
+ // part 2
+ // transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
+ "vtrn.32 d0, d2 \n"
+ "vtrn.32 d1, d3 \n"
+ "vtrn.16 d0, d1 \n"
+ "vtrn.16 d2, d3 \n"
+
+ "vmov.s16 d26, #7 \n"
+
+ "vadd.s16 d4, d0, d3 \n" // a1 = ip[0] + ip[12]
+ "vadd.s16 d5, d1, d2 \n" // b1 = ip[4] + ip[8]
+ "vsub.s16 d6, d1, d2 \n" // c1 = ip[4] - ip[8]
+ "vadd.s16 d4, d4, d26 \n" // a1 + 7
+ "vsub.s16 d7, d0, d3 \n" // d1 = ip[0] - ip[12]
+
+ "vadd.s16 d0, d4, d5 \n" // op[0] = a1 + b1 + 7
+ "vsub.s16 d2, d4, d5 \n" // op[8] = a1 - b1 + 7
+
+ "vmlal.s16 q11, d7, d16 \n" // d1*5352 + 12000
+ "vmlal.s16 q12, d7, d17 \n" // d1*2217 + 51000
+
+ "vceq.s16 d4, d7, #0 \n"
+
+ "vshr.s16 d0, d0, #4 \n"
+ "vshr.s16 d2, d2, #4 \n"
+
+ "vmlal.s16 q11, d6, d17 \n" // c1*2217 + d1*5352 + 12000
+ "vmlsl.s16 q12, d6, d16 \n" // d1*2217 - c1*5352 + 51000
+
+ "vmvn d4, d4 \n" // !(d1 == 0)
+ // op[4] = (c1*2217 + d1*5352 + 12000)>>16
+ "vshrn.s32 d1, q11, #16 \n"
+ // op[4] += (d1!=0)
+ "vsub.s16 d1, d1, d4 \n"
+ // op[12]= (d1*2217 - c1*5352 + 51000)>>16
+ "vshrn.s32 d3, q12, #16 \n"
+
+ // set result to out array
+ "vst1.16 {q0, q1}, [%[out]] \n"
+ : [src_ptr] "+r"(src_ptr), [ref_ptr] "+r"(ref_ptr),
+ [coeff32] "+r"(coeff32) // modified registers
+ : [kBPS] "r"(kBPS), [coeff16] "r"(coeff16),
+ [out] "r"(out) // constants
+ : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
+ "q10", "q11", "q12", "q13" // clobbered
+ );
+}
+
+#endif
+
+#define LOAD_LANE_16b(VALUE, LANE) do { \
+ (VALUE) = vld1_lane_s16(src, (VALUE), (LANE)); \
+ src += stride; \
+} while (0)
+
+static void FTransformWHT_NEON(const int16_t* src, int16_t* out) {
+ const int stride = 16;
+ const int16x4_t zero = vdup_n_s16(0);
+ int32x4x4_t tmp0;
+ int16x4x4_t in;
+ INIT_VECTOR4(in, zero, zero, zero, zero);
+ LOAD_LANE_16b(in.val[0], 0);
+ LOAD_LANE_16b(in.val[1], 0);
+ LOAD_LANE_16b(in.val[2], 0);
+ LOAD_LANE_16b(in.val[3], 0);
+ LOAD_LANE_16b(in.val[0], 1);
+ LOAD_LANE_16b(in.val[1], 1);
+ LOAD_LANE_16b(in.val[2], 1);
+ LOAD_LANE_16b(in.val[3], 1);
+ LOAD_LANE_16b(in.val[0], 2);
+ LOAD_LANE_16b(in.val[1], 2);
+ LOAD_LANE_16b(in.val[2], 2);
+ LOAD_LANE_16b(in.val[3], 2);
+ LOAD_LANE_16b(in.val[0], 3);
+ LOAD_LANE_16b(in.val[1], 3);
+ LOAD_LANE_16b(in.val[2], 3);
+ LOAD_LANE_16b(in.val[3], 3);
+
+ {
+ // a0 = in[0 * 16] + in[2 * 16]
+ // a1 = in[1 * 16] + in[3 * 16]
+ // a2 = in[1 * 16] - in[3 * 16]
+ // a3 = in[0 * 16] - in[2 * 16]
+ const int32x4_t a0 = vaddl_s16(in.val[0], in.val[2]);
+ const int32x4_t a1 = vaddl_s16(in.val[1], in.val[3]);
+ const int32x4_t a2 = vsubl_s16(in.val[1], in.val[3]);
+ const int32x4_t a3 = vsubl_s16(in.val[0], in.val[2]);
+ tmp0.val[0] = vaddq_s32(a0, a1);
+ tmp0.val[1] = vaddq_s32(a3, a2);
+ tmp0.val[2] = vsubq_s32(a3, a2);
+ tmp0.val[3] = vsubq_s32(a0, a1);
+ }
+ {
+ const int32x4x4_t tmp1 = Transpose4x4_NEON(tmp0);
+ // a0 = tmp[0 + i] + tmp[ 8 + i]
+ // a1 = tmp[4 + i] + tmp[12 + i]
+ // a2 = tmp[4 + i] - tmp[12 + i]
+ // a3 = tmp[0 + i] - tmp[ 8 + i]
+ const int32x4_t a0 = vaddq_s32(tmp1.val[0], tmp1.val[2]);
+ const int32x4_t a1 = vaddq_s32(tmp1.val[1], tmp1.val[3]);
+ const int32x4_t a2 = vsubq_s32(tmp1.val[1], tmp1.val[3]);
+ const int32x4_t a3 = vsubq_s32(tmp1.val[0], tmp1.val[2]);
+ const int32x4_t b0 = vhaddq_s32(a0, a1); // (a0 + a1) >> 1
+ const int32x4_t b1 = vhaddq_s32(a3, a2); // (a3 + a2) >> 1
+ const int32x4_t b2 = vhsubq_s32(a3, a2); // (a3 - a2) >> 1
+ const int32x4_t b3 = vhsubq_s32(a0, a1); // (a0 - a1) >> 1
+ const int16x4_t out0 = vmovn_s32(b0);
+ const int16x4_t out1 = vmovn_s32(b1);
+ const int16x4_t out2 = vmovn_s32(b2);
+ const int16x4_t out3 = vmovn_s32(b3);
+
+ vst1_s16(out + 0, out0);
+ vst1_s16(out + 4, out1);
+ vst1_s16(out + 8, out2);
+ vst1_s16(out + 12, out3);
+ }
+}
+#undef LOAD_LANE_16b
+
+//------------------------------------------------------------------------------
+// Texture distortion
+//
+// We try to match the spectral content (weighted) between source and
+// reconstructed samples.
+
+// a 0123, b 0123
+// a 4567, b 4567
+// a 89ab, b 89ab
+// a cdef, b cdef
+//
+// transpose
+//
+// a 048c, b 048c
+// a 159d, b 159d
+// a 26ae, b 26ae
+// a 37bf, b 37bf
+//
+static WEBP_INLINE int16x8x4_t DistoTranspose4x4S16_NEON(int16x8x4_t q4_in) {
+ const int16x8x2_t q2_tmp0 = vtrnq_s16(q4_in.val[0], q4_in.val[1]);
+ const int16x8x2_t q2_tmp1 = vtrnq_s16(q4_in.val[2], q4_in.val[3]);
+ const int32x4x2_t q2_tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[0]),
+ vreinterpretq_s32_s16(q2_tmp1.val[0]));
+ const int32x4x2_t q2_tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[1]),
+ vreinterpretq_s32_s16(q2_tmp1.val[1]));
+ q4_in.val[0] = vreinterpretq_s16_s32(q2_tmp2.val[0]);
+ q4_in.val[2] = vreinterpretq_s16_s32(q2_tmp2.val[1]);
+ q4_in.val[1] = vreinterpretq_s16_s32(q2_tmp3.val[0]);
+ q4_in.val[3] = vreinterpretq_s16_s32(q2_tmp3.val[1]);
+ return q4_in;
+}
+
+static WEBP_INLINE int16x8x4_t DistoHorizontalPass_NEON(
+ const int16x8x4_t q4_in) {
+ // {a0, a1} = {in[0] + in[2], in[1] + in[3]}
+ // {a3, a2} = {in[0] - in[2], in[1] - in[3]}
+ const int16x8_t q_a0 = vaddq_s16(q4_in.val[0], q4_in.val[2]);
+ const int16x8_t q_a1 = vaddq_s16(q4_in.val[1], q4_in.val[3]);
+ const int16x8_t q_a3 = vsubq_s16(q4_in.val[0], q4_in.val[2]);
+ const int16x8_t q_a2 = vsubq_s16(q4_in.val[1], q4_in.val[3]);
+ int16x8x4_t q4_out;
+ // tmp[0] = a0 + a1
+ // tmp[1] = a3 + a2
+ // tmp[2] = a3 - a2
+ // tmp[3] = a0 - a1
+ INIT_VECTOR4(q4_out,
+ vabsq_s16(vaddq_s16(q_a0, q_a1)),
+ vabsq_s16(vaddq_s16(q_a3, q_a2)),
+ vabdq_s16(q_a3, q_a2), vabdq_s16(q_a0, q_a1));
+ return q4_out;
+}
+
+static WEBP_INLINE int16x8x4_t DistoVerticalPass_NEON(const uint8x8x4_t q4_in) {
+ const int16x8_t q_a0 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[0],
+ q4_in.val[2]));
+ const int16x8_t q_a1 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[1],
+ q4_in.val[3]));
+ const int16x8_t q_a2 = vreinterpretq_s16_u16(vsubl_u8(q4_in.val[1],
+ q4_in.val[3]));
+ const int16x8_t q_a3 = vreinterpretq_s16_u16(vsubl_u8(q4_in.val[0],
+ q4_in.val[2]));
+ int16x8x4_t q4_out;
+
+ INIT_VECTOR4(q4_out,
+ vaddq_s16(q_a0, q_a1), vaddq_s16(q_a3, q_a2),
+ vsubq_s16(q_a3, q_a2), vsubq_s16(q_a0, q_a1));
+ return q4_out;
+}
+
+static WEBP_INLINE int16x4x4_t DistoLoadW_NEON(const uint16_t* w) {
+ const uint16x8_t q_w07 = vld1q_u16(&w[0]);
+ const uint16x8_t q_w8f = vld1q_u16(&w[8]);
+ int16x4x4_t d4_w;
+ INIT_VECTOR4(d4_w,
+ vget_low_s16(vreinterpretq_s16_u16(q_w07)),
+ vget_high_s16(vreinterpretq_s16_u16(q_w07)),
+ vget_low_s16(vreinterpretq_s16_u16(q_w8f)),
+ vget_high_s16(vreinterpretq_s16_u16(q_w8f)));
+ return d4_w;
+}
+
+static WEBP_INLINE int32x2_t DistoSum_NEON(const int16x8x4_t q4_in,
+ const int16x4x4_t d4_w) {
+ int32x2_t d_sum;
+ // sum += w[ 0] * abs(b0);
+ // sum += w[ 4] * abs(b1);
+ // sum += w[ 8] * abs(b2);
+ // sum += w[12] * abs(b3);
+ int32x4_t q_sum0 = vmull_s16(d4_w.val[0], vget_low_s16(q4_in.val[0]));
+ int32x4_t q_sum1 = vmull_s16(d4_w.val[1], vget_low_s16(q4_in.val[1]));
+ int32x4_t q_sum2 = vmull_s16(d4_w.val[2], vget_low_s16(q4_in.val[2]));
+ int32x4_t q_sum3 = vmull_s16(d4_w.val[3], vget_low_s16(q4_in.val[3]));
+ q_sum0 = vmlsl_s16(q_sum0, d4_w.val[0], vget_high_s16(q4_in.val[0]));
+ q_sum1 = vmlsl_s16(q_sum1, d4_w.val[1], vget_high_s16(q4_in.val[1]));
+ q_sum2 = vmlsl_s16(q_sum2, d4_w.val[2], vget_high_s16(q4_in.val[2]));
+ q_sum3 = vmlsl_s16(q_sum3, d4_w.val[3], vget_high_s16(q4_in.val[3]));
+
+ q_sum0 = vaddq_s32(q_sum0, q_sum1);
+ q_sum2 = vaddq_s32(q_sum2, q_sum3);
+ q_sum2 = vaddq_s32(q_sum0, q_sum2);
+ d_sum = vpadd_s32(vget_low_s32(q_sum2), vget_high_s32(q_sum2));
+ d_sum = vpadd_s32(d_sum, d_sum);
+ return d_sum;
+}
+
+#define LOAD_LANE_32b(src, VALUE, LANE) \
+ (VALUE) = vld1_lane_u32((const uint32_t*)(src), (VALUE), (LANE))
+
+// Hadamard transform
+// Returns the weighted sum of the absolute value of transformed coefficients.
+// w[] contains a row-major 4 by 4 symmetric matrix.
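+// A rough scalar equivalent of what the vector code below computes
+// (illustrative sketch only; 'Hadamard4x4' is a hypothetical helper, not an
+// upstream function):
+//   int sum = 0;
+//   for (i = 0; i < 16; ++i) {
+//     sum += w[i] * (abs(Hadamard4x4(a)[i]) - abs(Hadamard4x4(b)[i]));
+//   }
+//   return abs(sum) >> 5;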
+static int Disto4x4_NEON(const uint8_t* const a, const uint8_t* const b,
+ const uint16_t* const w) {
+ uint32x2_t d_in_ab_0123 = vdup_n_u32(0);
+ uint32x2_t d_in_ab_4567 = vdup_n_u32(0);
+ uint32x2_t d_in_ab_89ab = vdup_n_u32(0);
+ uint32x2_t d_in_ab_cdef = vdup_n_u32(0);
+ uint8x8x4_t d4_in;
+
+ // load data a, b
+ LOAD_LANE_32b(a + 0 * BPS, d_in_ab_0123, 0);
+ LOAD_LANE_32b(a + 1 * BPS, d_in_ab_4567, 0);
+ LOAD_LANE_32b(a + 2 * BPS, d_in_ab_89ab, 0);
+ LOAD_LANE_32b(a + 3 * BPS, d_in_ab_cdef, 0);
+ LOAD_LANE_32b(b + 0 * BPS, d_in_ab_0123, 1);
+ LOAD_LANE_32b(b + 1 * BPS, d_in_ab_4567, 1);
+ LOAD_LANE_32b(b + 2 * BPS, d_in_ab_89ab, 1);
+ LOAD_LANE_32b(b + 3 * BPS, d_in_ab_cdef, 1);
+ INIT_VECTOR4(d4_in,
+ vreinterpret_u8_u32(d_in_ab_0123),
+ vreinterpret_u8_u32(d_in_ab_4567),
+ vreinterpret_u8_u32(d_in_ab_89ab),
+ vreinterpret_u8_u32(d_in_ab_cdef));
+
+ {
+    // Run the vertical pass first: since w/kWeightY is symmetric, the
+    // vertical and horizontal passes commute, which saves an initial
+    // transpose and the transpose that would undo it.
+ const int16x8x4_t q4_v = DistoVerticalPass_NEON(d4_in);
+ const int16x4x4_t d4_w = DistoLoadW_NEON(w);
+ // horizontal pass
+ const int16x8x4_t q4_t = DistoTranspose4x4S16_NEON(q4_v);
+ const int16x8x4_t q4_h = DistoHorizontalPass_NEON(q4_t);
+ int32x2_t d_sum = DistoSum_NEON(q4_h, d4_w);
+
+ // abs(sum2 - sum1) >> 5
+ d_sum = vabs_s32(d_sum);
+ d_sum = vshr_n_s32(d_sum, 5);
+ return vget_lane_s32(d_sum, 0);
+ }
+}
+#undef LOAD_LANE_32b
+
+static int Disto16x16_NEON(const uint8_t* const a, const uint8_t* const b,
+ const uint16_t* const w) {
+ int D = 0;
+ int x, y;
+ for (y = 0; y < 16 * BPS; y += 4 * BPS) {
+ for (x = 0; x < 16; x += 4) {
+ D += Disto4x4_NEON(a + x + y, b + x + y, w);
+ }
+ }
+ return D;
+}
+
+//------------------------------------------------------------------------------
+
+static void CollectHistogram_NEON(const uint8_t* ref, const uint8_t* pred,
+ int start_block, int end_block,
+ VP8Histogram* const histo) {
+ const uint16x8_t max_coeff_thresh = vdupq_n_u16(MAX_COEFF_THRESH);
+ int j;
+ int distribution[MAX_COEFF_THRESH + 1] = { 0 };
+ for (j = start_block; j < end_block; ++j) {
+ int16_t out[16];
+ FTransform_NEON(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
+ {
+ int k;
+ const int16x8_t a0 = vld1q_s16(out + 0);
+ const int16x8_t b0 = vld1q_s16(out + 8);
+ const uint16x8_t a1 = vreinterpretq_u16_s16(vabsq_s16(a0));
+ const uint16x8_t b1 = vreinterpretq_u16_s16(vabsq_s16(b0));
+ const uint16x8_t a2 = vshrq_n_u16(a1, 3);
+ const uint16x8_t b2 = vshrq_n_u16(b1, 3);
+ const uint16x8_t a3 = vminq_u16(a2, max_coeff_thresh);
+ const uint16x8_t b3 = vminq_u16(b2, max_coeff_thresh);
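+      // i.e. per lane: bin = min(abs(coeff) >> 3, MAX_COEFF_THRESH).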
+ vst1q_s16(out + 0, vreinterpretq_s16_u16(a3));
+ vst1q_s16(out + 8, vreinterpretq_s16_u16(b3));
+ // Convert coefficients to bin.
+ for (k = 0; k < 16; ++k) {
+ ++distribution[out[k]];
+ }
+ }
+ }
+ VP8SetHistogramData(distribution, histo);
+}
+
+//------------------------------------------------------------------------------
+
+static WEBP_INLINE void AccumulateSSE16_NEON(const uint8_t* const a,
+ const uint8_t* const b,
+ uint32x4_t* const sum) {
+ const uint8x16_t a0 = vld1q_u8(a);
+ const uint8x16_t b0 = vld1q_u8(b);
+ const uint8x16_t abs_diff = vabdq_u8(a0, b0);
+ const uint16x8_t prod1 = vmull_u8(vget_low_u8(abs_diff),
+ vget_low_u8(abs_diff));
+ const uint16x8_t prod2 = vmull_u8(vget_high_u8(abs_diff),
+ vget_high_u8(abs_diff));
+ /* pair-wise adds and widen */
+ const uint32x4_t sum1 = vpaddlq_u16(prod1);
+ const uint32x4_t sum2 = vpaddlq_u16(prod2);
+ *sum = vaddq_u32(*sum, vaddq_u32(sum1, sum2));
+}
+
+// Horizontal sum of all four uint32_t values in 'sum'.
+static int SumToInt_NEON(uint32x4_t sum) {
+ const uint64x2_t sum2 = vpaddlq_u32(sum);
+ const uint64_t sum3 = vgetq_lane_u64(sum2, 0) + vgetq_lane_u64(sum2, 1);
+ return (int)sum3;
+}
+
+static int SSE16x16_NEON(const uint8_t* a, const uint8_t* b) {
+ uint32x4_t sum = vdupq_n_u32(0);
+ int y;
+ for (y = 0; y < 16; ++y) {
+ AccumulateSSE16_NEON(a + y * BPS, b + y * BPS, &sum);
+ }
+ return SumToInt_NEON(sum);
+}
+
+static int SSE16x8_NEON(const uint8_t* a, const uint8_t* b) {
+ uint32x4_t sum = vdupq_n_u32(0);
+ int y;
+ for (y = 0; y < 8; ++y) {
+ AccumulateSSE16_NEON(a + y * BPS, b + y * BPS, &sum);
+ }
+ return SumToInt_NEON(sum);
+}
+
+static int SSE8x8_NEON(const uint8_t* a, const uint8_t* b) {
+ uint32x4_t sum = vdupq_n_u32(0);
+ int y;
+ for (y = 0; y < 8; ++y) {
+ const uint8x8_t a0 = vld1_u8(a + y * BPS);
+ const uint8x8_t b0 = vld1_u8(b + y * BPS);
+ const uint8x8_t abs_diff = vabd_u8(a0, b0);
+ const uint16x8_t prod = vmull_u8(abs_diff, abs_diff);
+ sum = vpadalq_u16(sum, prod);
+ }
+ return SumToInt_NEON(sum);
+}
+
+static int SSE4x4_NEON(const uint8_t* a, const uint8_t* b) {
+ const uint8x16_t a0 = Load4x4_NEON(a);
+ const uint8x16_t b0 = Load4x4_NEON(b);
+ const uint8x16_t abs_diff = vabdq_u8(a0, b0);
+ const uint16x8_t prod1 = vmull_u8(vget_low_u8(abs_diff),
+ vget_low_u8(abs_diff));
+ const uint16x8_t prod2 = vmull_u8(vget_high_u8(abs_diff),
+ vget_high_u8(abs_diff));
+ /* pair-wise adds and widen */
+ const uint32x4_t sum1 = vpaddlq_u16(prod1);
+ const uint32x4_t sum2 = vpaddlq_u16(prod2);
+ return SumToInt_NEON(vaddq_u32(sum1, sum2));
+}
+
+//------------------------------------------------------------------------------
+
+// Compilation with gcc-4.6.x is problematic for now.
+#if !defined(WORK_AROUND_GCC)
+
+static int16x8_t Quantize_NEON(int16_t* const in,
+ const VP8Matrix* const mtx, int offset) {
+ const uint16x8_t sharp = vld1q_u16(&mtx->sharpen_[offset]);
+ const uint16x8_t q = vld1q_u16(&mtx->q_[offset]);
+ const uint16x8_t iq = vld1q_u16(&mtx->iq_[offset]);
+ const uint32x4_t bias0 = vld1q_u32(&mtx->bias_[offset + 0]);
+ const uint32x4_t bias1 = vld1q_u32(&mtx->bias_[offset + 4]);
+
+ const int16x8_t a = vld1q_s16(in + offset); // in
+ const uint16x8_t b = vreinterpretq_u16_s16(vabsq_s16(a)); // coeff = abs(in)
+ const int16x8_t sign = vshrq_n_s16(a, 15); // sign
+ const uint16x8_t c = vaddq_u16(b, sharp); // + sharpen
+ const uint32x4_t m0 = vmull_u16(vget_low_u16(c), vget_low_u16(iq));
+ const uint32x4_t m1 = vmull_u16(vget_high_u16(c), vget_high_u16(iq));
+ const uint32x4_t m2 = vhaddq_u32(m0, bias0);
+ const uint32x4_t m3 = vhaddq_u32(m1, bias1); // (coeff * iQ + bias) >> 1
+ const uint16x8_t c0 = vcombine_u16(vshrn_n_u32(m2, 16),
+ vshrn_n_u32(m3, 16)); // QFIX=17 = 16+1
+ const uint16x8_t c1 = vminq_u16(c0, vdupq_n_u16(MAX_LEVEL));
+ const int16x8_t c2 = veorq_s16(vreinterpretq_s16_u16(c1), sign);
+ const int16x8_t c3 = vsubq_s16(c2, sign); // restore sign
+ const int16x8_t c4 = vmulq_s16(c3, vreinterpretq_s16_u16(q));
+ vst1q_s16(in + offset, c4);
+ assert(QFIX == 17); // this function can't work as is if QFIX != 16+1
+ return c3;
+}
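+
+// Per-lane scalar view of Quantize_NEON above (illustrative sketch only,
+// not part of the upstream sources):
+//   coeff = abs(in[n]) + mtx->sharpen_[n];
+//   level = min((coeff * mtx->iq_[n] + mtx->bias_[n]) >> QFIX, MAX_LEVEL);
+//   ret   = (in[n] < 0) ? -level : level;   // value returned in lane n
+//   in[n] = ret * mtx->q_[n];               // dequantized value, stored back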
+
+static const uint8_t kShuffles[4][8] = {
+ { 0, 1, 2, 3, 8, 9, 16, 17 },
+ { 10, 11, 4, 5, 6, 7, 12, 13 },
+ { 18, 19, 24, 25, 26, 27, 20, 21 },
+ { 14, 15, 22, 23, 28, 29, 30, 31 }
+};
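+
+// Each kShuffles[] entry lists byte indices into the pair (out0, out1):
+// bytes 0..15 pick from out0 (coefficients 0..7) and bytes 16..31 from out1
+// (coefficients 8..15). E.g. kShuffles[0] = {0,1, 2,3, 8,9, 16,17} emits
+// coefficients 0, 1, 4, 8 -- the first four entries of the VP8 zigzag scan.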
+
+static int QuantizeBlock_NEON(int16_t in[16], int16_t out[16],
+ const VP8Matrix* const mtx) {
+ const int16x8_t out0 = Quantize_NEON(in, mtx, 0);
+ const int16x8_t out1 = Quantize_NEON(in, mtx, 8);
+ uint8x8x4_t shuffles;
+  // vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3; use
+  // non-standard versions there.
+#if defined(__APPLE__) && defined(__aarch64__) && \
+    defined(__apple_build_version__) && (__apple_build_version__ < 6020037)
+ uint8x16x2_t all_out;
+ INIT_VECTOR2(all_out, vreinterpretq_u8_s16(out0), vreinterpretq_u8_s16(out1));
+ INIT_VECTOR4(shuffles,
+ vtbl2q_u8(all_out, vld1_u8(kShuffles[0])),
+ vtbl2q_u8(all_out, vld1_u8(kShuffles[1])),
+ vtbl2q_u8(all_out, vld1_u8(kShuffles[2])),
+ vtbl2q_u8(all_out, vld1_u8(kShuffles[3])));
+#else
+ uint8x8x4_t all_out;
+ INIT_VECTOR4(all_out,
+ vreinterpret_u8_s16(vget_low_s16(out0)),
+ vreinterpret_u8_s16(vget_high_s16(out0)),
+ vreinterpret_u8_s16(vget_low_s16(out1)),
+ vreinterpret_u8_s16(vget_high_s16(out1)));
+ INIT_VECTOR4(shuffles,
+ vtbl4_u8(all_out, vld1_u8(kShuffles[0])),
+ vtbl4_u8(all_out, vld1_u8(kShuffles[1])),
+ vtbl4_u8(all_out, vld1_u8(kShuffles[2])),
+ vtbl4_u8(all_out, vld1_u8(kShuffles[3])));
+#endif
+ // Zigzag reordering
+ vst1_u8((uint8_t*)(out + 0), shuffles.val[0]);
+ vst1_u8((uint8_t*)(out + 4), shuffles.val[1]);
+ vst1_u8((uint8_t*)(out + 8), shuffles.val[2]);
+ vst1_u8((uint8_t*)(out + 12), shuffles.val[3]);
+ // test zeros
+ if (*(uint64_t*)(out + 0) != 0) return 1;
+ if (*(uint64_t*)(out + 4) != 0) return 1;
+ if (*(uint64_t*)(out + 8) != 0) return 1;
+ if (*(uint64_t*)(out + 12) != 0) return 1;
+ return 0;
+}
+
+static int Quantize2Blocks_NEON(int16_t in[32], int16_t out[32],
+ const VP8Matrix* const mtx) {
+ int nz;
+ nz = QuantizeBlock_NEON(in + 0 * 16, out + 0 * 16, mtx) << 0;
+ nz |= QuantizeBlock_NEON(in + 1 * 16, out + 1 * 16, mtx) << 1;
+ return nz;
+}
+
+#endif // !WORK_AROUND_GCC
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8EncDspInitNEON(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitNEON(void) {
+ VP8ITransform = ITransform_NEON;
+ VP8FTransform = FTransform_NEON;
+
+ VP8FTransformWHT = FTransformWHT_NEON;
+
+ VP8TDisto4x4 = Disto4x4_NEON;
+ VP8TDisto16x16 = Disto16x16_NEON;
+ VP8CollectHistogram = CollectHistogram_NEON;
+
+ VP8SSE16x16 = SSE16x16_NEON;
+ VP8SSE16x8 = SSE16x8_NEON;
+ VP8SSE8x8 = SSE8x8_NEON;
+ VP8SSE4x4 = SSE4x4_NEON;
+
+#if !defined(WORK_AROUND_GCC)
+ VP8EncQuantizeBlock = QuantizeBlock_NEON;
+ VP8EncQuantize2Blocks = Quantize2Blocks_NEON;
+#endif
+}
+
+#else // !WEBP_USE_NEON
+
+WEBP_DSP_INIT_STUB(VP8EncDspInitNEON)
+
+#endif // WEBP_USE_NEON
diff --git a/media/libwebp/dsp/enc_sse2.c b/media/libwebp/dsp/enc_sse2.c
new file mode 100644
index 0000000000..ff78755111
--- /dev/null
+++ b/media/libwebp/dsp/enc_sse2.c
@@ -0,0 +1,1381 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// SSE2 version of speed-critical encoding functions.
+//
+// Author: Christian Duvivier (cduvivier@google.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_SSE2)
+#include <assert.h>
+#include <stdlib.h> // for abs()
+#include <emmintrin.h>
+
+#include "../dsp/common_sse2.h"
+#include "../enc/cost_enc.h"
+#include "../enc/vp8i_enc.h"
+
+//------------------------------------------------------------------------------
+// Transforms (Paragraph 14.4)
+
+// Does one or two inverse transforms.
+static void ITransform_SSE2(const uint8_t* ref, const int16_t* in, uint8_t* dst,
+ int do_two) {
+ // This implementation makes use of 16-bit fixed point versions of two
+ // multiply constants:
+ // K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
+ // K2 = sqrt(2) * sin (pi/8) ~= 35468 / 2^16
+ //
+ // To be able to use signed 16-bit integers, we use the following trick to
+ // have constants within range:
+ // - Associated constants are obtained by subtracting the 16-bit fixed point
+ // version of one:
+ // k = K - (1 << 16) => K = k + (1 << 16)
+  //      K1 = 85627 => k1 = 20091
+ // K2 = 35468 => k2 = -30068
+  //  - The multiplication of a variable by a constant becomes the sum of the
+ // variable and the multiplication of that variable by the associated
+ // constant:
+ // (x * K) >> 16 = (x * (k + (1 << 16))) >> 16 = ((x * k ) >> 16) + x
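+  // Numeric check of the trick (illustrative only): with x = 1000,
+  // (1000 * K1) >> 16 = (1000 * 85627) >> 16 = 1306, while
+  // ((1000 * k1) >> 16) + 1000 = 306 + 1000 = 1306, as expected.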
+ const __m128i k1 = _mm_set1_epi16(20091);
+ const __m128i k2 = _mm_set1_epi16(-30068);
+ __m128i T0, T1, T2, T3;
+
+ // Load and concatenate the transform coefficients (we'll do two inverse
+ // transforms in parallel). In the case of only one inverse transform, the
+  // second half of the vectors will just contain random values we'll never
+  // use or store.
+ __m128i in0, in1, in2, in3;
+ {
+ in0 = _mm_loadl_epi64((const __m128i*)&in[0]);
+ in1 = _mm_loadl_epi64((const __m128i*)&in[4]);
+ in2 = _mm_loadl_epi64((const __m128i*)&in[8]);
+ in3 = _mm_loadl_epi64((const __m128i*)&in[12]);
+ // a00 a10 a20 a30 x x x x
+ // a01 a11 a21 a31 x x x x
+ // a02 a12 a22 a32 x x x x
+ // a03 a13 a23 a33 x x x x
+ if (do_two) {
+ const __m128i inB0 = _mm_loadl_epi64((const __m128i*)&in[16]);
+ const __m128i inB1 = _mm_loadl_epi64((const __m128i*)&in[20]);
+ const __m128i inB2 = _mm_loadl_epi64((const __m128i*)&in[24]);
+ const __m128i inB3 = _mm_loadl_epi64((const __m128i*)&in[28]);
+ in0 = _mm_unpacklo_epi64(in0, inB0);
+ in1 = _mm_unpacklo_epi64(in1, inB1);
+ in2 = _mm_unpacklo_epi64(in2, inB2);
+ in3 = _mm_unpacklo_epi64(in3, inB3);
+ // a00 a10 a20 a30 b00 b10 b20 b30
+ // a01 a11 a21 a31 b01 b11 b21 b31
+ // a02 a12 a22 a32 b02 b12 b22 b32
+ // a03 a13 a23 a33 b03 b13 b23 b33
+ }
+ }
+
+ // Vertical pass and subsequent transpose.
+ {
+ // First pass, c and d calculations are longer because of the "trick"
+ // multiplications.
+ const __m128i a = _mm_add_epi16(in0, in2);
+ const __m128i b = _mm_sub_epi16(in0, in2);
+ // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3
+ const __m128i c1 = _mm_mulhi_epi16(in1, k2);
+ const __m128i c2 = _mm_mulhi_epi16(in3, k1);
+ const __m128i c3 = _mm_sub_epi16(in1, in3);
+ const __m128i c4 = _mm_sub_epi16(c1, c2);
+ const __m128i c = _mm_add_epi16(c3, c4);
+ // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3
+ const __m128i d1 = _mm_mulhi_epi16(in1, k1);
+ const __m128i d2 = _mm_mulhi_epi16(in3, k2);
+ const __m128i d3 = _mm_add_epi16(in1, in3);
+ const __m128i d4 = _mm_add_epi16(d1, d2);
+ const __m128i d = _mm_add_epi16(d3, d4);
+
+ // Second pass.
+ const __m128i tmp0 = _mm_add_epi16(a, d);
+ const __m128i tmp1 = _mm_add_epi16(b, c);
+ const __m128i tmp2 = _mm_sub_epi16(b, c);
+ const __m128i tmp3 = _mm_sub_epi16(a, d);
+
+ // Transpose the two 4x4.
+ VP8Transpose_2_4x4_16b(&tmp0, &tmp1, &tmp2, &tmp3, &T0, &T1, &T2, &T3);
+ }
+
+ // Horizontal pass and subsequent transpose.
+ {
+ // First pass, c and d calculations are longer because of the "trick"
+ // multiplications.
+ const __m128i four = _mm_set1_epi16(4);
+ const __m128i dc = _mm_add_epi16(T0, four);
+ const __m128i a = _mm_add_epi16(dc, T2);
+ const __m128i b = _mm_sub_epi16(dc, T2);
+ // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3
+ const __m128i c1 = _mm_mulhi_epi16(T1, k2);
+ const __m128i c2 = _mm_mulhi_epi16(T3, k1);
+ const __m128i c3 = _mm_sub_epi16(T1, T3);
+ const __m128i c4 = _mm_sub_epi16(c1, c2);
+ const __m128i c = _mm_add_epi16(c3, c4);
+ // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3
+ const __m128i d1 = _mm_mulhi_epi16(T1, k1);
+ const __m128i d2 = _mm_mulhi_epi16(T3, k2);
+ const __m128i d3 = _mm_add_epi16(T1, T3);
+ const __m128i d4 = _mm_add_epi16(d1, d2);
+ const __m128i d = _mm_add_epi16(d3, d4);
+
+ // Second pass.
+ const __m128i tmp0 = _mm_add_epi16(a, d);
+ const __m128i tmp1 = _mm_add_epi16(b, c);
+ const __m128i tmp2 = _mm_sub_epi16(b, c);
+ const __m128i tmp3 = _mm_sub_epi16(a, d);
+ const __m128i shifted0 = _mm_srai_epi16(tmp0, 3);
+ const __m128i shifted1 = _mm_srai_epi16(tmp1, 3);
+ const __m128i shifted2 = _mm_srai_epi16(tmp2, 3);
+ const __m128i shifted3 = _mm_srai_epi16(tmp3, 3);
+
+ // Transpose the two 4x4.
+ VP8Transpose_2_4x4_16b(&shifted0, &shifted1, &shifted2, &shifted3, &T0, &T1,
+ &T2, &T3);
+ }
+
+ // Add inverse transform to 'ref' and store.
+ {
+ const __m128i zero = _mm_setzero_si128();
+ // Load the reference(s).
+ __m128i ref0, ref1, ref2, ref3;
+ if (do_two) {
+ // Load eight bytes/pixels per line.
+ ref0 = _mm_loadl_epi64((const __m128i*)&ref[0 * BPS]);
+ ref1 = _mm_loadl_epi64((const __m128i*)&ref[1 * BPS]);
+ ref2 = _mm_loadl_epi64((const __m128i*)&ref[2 * BPS]);
+ ref3 = _mm_loadl_epi64((const __m128i*)&ref[3 * BPS]);
+ } else {
+ // Load four bytes/pixels per line.
+ ref0 = _mm_cvtsi32_si128(WebPMemToUint32(&ref[0 * BPS]));
+ ref1 = _mm_cvtsi32_si128(WebPMemToUint32(&ref[1 * BPS]));
+ ref2 = _mm_cvtsi32_si128(WebPMemToUint32(&ref[2 * BPS]));
+ ref3 = _mm_cvtsi32_si128(WebPMemToUint32(&ref[3 * BPS]));
+ }
+ // Convert to 16b.
+ ref0 = _mm_unpacklo_epi8(ref0, zero);
+ ref1 = _mm_unpacklo_epi8(ref1, zero);
+ ref2 = _mm_unpacklo_epi8(ref2, zero);
+ ref3 = _mm_unpacklo_epi8(ref3, zero);
+ // Add the inverse transform(s).
+ ref0 = _mm_add_epi16(ref0, T0);
+ ref1 = _mm_add_epi16(ref1, T1);
+ ref2 = _mm_add_epi16(ref2, T2);
+ ref3 = _mm_add_epi16(ref3, T3);
+ // Unsigned saturate to 8b.
+ ref0 = _mm_packus_epi16(ref0, ref0);
+ ref1 = _mm_packus_epi16(ref1, ref1);
+ ref2 = _mm_packus_epi16(ref2, ref2);
+ ref3 = _mm_packus_epi16(ref3, ref3);
+ // Store the results.
+ if (do_two) {
+ // Store eight bytes/pixels per line.
+ _mm_storel_epi64((__m128i*)&dst[0 * BPS], ref0);
+ _mm_storel_epi64((__m128i*)&dst[1 * BPS], ref1);
+ _mm_storel_epi64((__m128i*)&dst[2 * BPS], ref2);
+ _mm_storel_epi64((__m128i*)&dst[3 * BPS], ref3);
+ } else {
+ // Store four bytes/pixels per line.
+ WebPUint32ToMem(&dst[0 * BPS], _mm_cvtsi128_si32(ref0));
+ WebPUint32ToMem(&dst[1 * BPS], _mm_cvtsi128_si32(ref1));
+ WebPUint32ToMem(&dst[2 * BPS], _mm_cvtsi128_si32(ref2));
+ WebPUint32ToMem(&dst[3 * BPS], _mm_cvtsi128_si32(ref3));
+ }
+ }
+}
+
+static void FTransformPass1_SSE2(const __m128i* const in01,
+ const __m128i* const in23,
+ __m128i* const out01,
+ __m128i* const out32) {
+ const __m128i k937 = _mm_set1_epi32(937);
+ const __m128i k1812 = _mm_set1_epi32(1812);
+
+ const __m128i k88p = _mm_set_epi16(8, 8, 8, 8, 8, 8, 8, 8);
+ const __m128i k88m = _mm_set_epi16(-8, 8, -8, 8, -8, 8, -8, 8);
+ const __m128i k5352_2217p = _mm_set_epi16(2217, 5352, 2217, 5352,
+ 2217, 5352, 2217, 5352);
+ const __m128i k5352_2217m = _mm_set_epi16(-5352, 2217, -5352, 2217,
+ -5352, 2217, -5352, 2217);
+
+ // *in01 = 00 01 10 11 02 03 12 13
+ // *in23 = 20 21 30 31 22 23 32 33
+ const __m128i shuf01_p = _mm_shufflehi_epi16(*in01, _MM_SHUFFLE(2, 3, 0, 1));
+ const __m128i shuf23_p = _mm_shufflehi_epi16(*in23, _MM_SHUFFLE(2, 3, 0, 1));
+ // 00 01 10 11 03 02 13 12
+ // 20 21 30 31 23 22 33 32
+ const __m128i s01 = _mm_unpacklo_epi64(shuf01_p, shuf23_p);
+ const __m128i s32 = _mm_unpackhi_epi64(shuf01_p, shuf23_p);
+ // 00 01 10 11 20 21 30 31
+ // 03 02 13 12 23 22 33 32
+ const __m128i a01 = _mm_add_epi16(s01, s32);
+ const __m128i a32 = _mm_sub_epi16(s01, s32);
+ // [d0 + d3 | d1 + d2 | ...] = [a0 a1 | a0' a1' | ... ]
+ // [d0 - d3 | d1 - d2 | ...] = [a3 a2 | a3' a2' | ... ]
+
+ const __m128i tmp0 = _mm_madd_epi16(a01, k88p); // [ (a0 + a1) << 3, ... ]
+ const __m128i tmp2 = _mm_madd_epi16(a01, k88m); // [ (a0 - a1) << 3, ... ]
+ const __m128i tmp1_1 = _mm_madd_epi16(a32, k5352_2217p);
+ const __m128i tmp3_1 = _mm_madd_epi16(a32, k5352_2217m);
+ const __m128i tmp1_2 = _mm_add_epi32(tmp1_1, k1812);
+ const __m128i tmp3_2 = _mm_add_epi32(tmp3_1, k937);
+ const __m128i tmp1 = _mm_srai_epi32(tmp1_2, 9);
+ const __m128i tmp3 = _mm_srai_epi32(tmp3_2, 9);
+ const __m128i s03 = _mm_packs_epi32(tmp0, tmp2);
+ const __m128i s12 = _mm_packs_epi32(tmp1, tmp3);
+ const __m128i s_lo = _mm_unpacklo_epi16(s03, s12); // 0 1 0 1 0 1...
+ const __m128i s_hi = _mm_unpackhi_epi16(s03, s12); // 2 3 2 3 2 3
+ const __m128i v23 = _mm_unpackhi_epi32(s_lo, s_hi);
+ *out01 = _mm_unpacklo_epi32(s_lo, s_hi);
+ *out32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2)); // 3 2 3 2 3 2..
+}
+
+static void FTransformPass2_SSE2(const __m128i* const v01,
+ const __m128i* const v32,
+ int16_t* out) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i seven = _mm_set1_epi16(7);
+ const __m128i k5352_2217 = _mm_set_epi16(5352, 2217, 5352, 2217,
+ 5352, 2217, 5352, 2217);
+ const __m128i k2217_5352 = _mm_set_epi16(2217, -5352, 2217, -5352,
+ 2217, -5352, 2217, -5352);
+ const __m128i k12000_plus_one = _mm_set1_epi32(12000 + (1 << 16));
+ const __m128i k51000 = _mm_set1_epi32(51000);
+
+ // Same operations are done on the (0,3) and (1,2) pairs.
+ // a3 = v0 - v3
+ // a2 = v1 - v2
+ const __m128i a32 = _mm_sub_epi16(*v01, *v32);
+ const __m128i a22 = _mm_unpackhi_epi64(a32, a32);
+
+ const __m128i b23 = _mm_unpacklo_epi16(a22, a32);
+ const __m128i c1 = _mm_madd_epi16(b23, k5352_2217);
+ const __m128i c3 = _mm_madd_epi16(b23, k2217_5352);
+ const __m128i d1 = _mm_add_epi32(c1, k12000_plus_one);
+ const __m128i d3 = _mm_add_epi32(c3, k51000);
+ const __m128i e1 = _mm_srai_epi32(d1, 16);
+ const __m128i e3 = _mm_srai_epi32(d3, 16);
+ // f1 = ((b3 * 5352 + b2 * 2217 + 12000) >> 16)
+ // f3 = ((b3 * 2217 - b2 * 5352 + 51000) >> 16)
+ const __m128i f1 = _mm_packs_epi32(e1, e1);
+ const __m128i f3 = _mm_packs_epi32(e3, e3);
+ // g1 = f1 + (a3 != 0);
+ // The compare will return (0xffff, 0) for (==0, !=0). To turn that into the
+ // desired (0, 1), we add one earlier through k12000_plus_one.
+ // -> g1 = f1 + 1 - (a3 == 0)
+ const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero));
+
+ // a0 = v0 + v3
+ // a1 = v1 + v2
+ const __m128i a01 = _mm_add_epi16(*v01, *v32);
+ const __m128i a01_plus_7 = _mm_add_epi16(a01, seven);
+ const __m128i a11 = _mm_unpackhi_epi64(a01, a01);
+ const __m128i c0 = _mm_add_epi16(a01_plus_7, a11);
+ const __m128i c2 = _mm_sub_epi16(a01_plus_7, a11);
+ // d0 = (a0 + a1 + 7) >> 4;
+ // d2 = (a0 - a1 + 7) >> 4;
+ const __m128i d0 = _mm_srai_epi16(c0, 4);
+ const __m128i d2 = _mm_srai_epi16(c2, 4);
+
+ const __m128i d0_g1 = _mm_unpacklo_epi64(d0, g1);
+ const __m128i d2_f3 = _mm_unpacklo_epi64(d2, f3);
+ _mm_storeu_si128((__m128i*)&out[0], d0_g1);
+ _mm_storeu_si128((__m128i*)&out[8], d2_f3);
+}
+
+static void FTransform_SSE2(const uint8_t* src, const uint8_t* ref,
+ int16_t* out) {
+ const __m128i zero = _mm_setzero_si128();
+ // Load src.
+ const __m128i src0 = _mm_loadl_epi64((const __m128i*)&src[0 * BPS]);
+ const __m128i src1 = _mm_loadl_epi64((const __m128i*)&src[1 * BPS]);
+ const __m128i src2 = _mm_loadl_epi64((const __m128i*)&src[2 * BPS]);
+ const __m128i src3 = _mm_loadl_epi64((const __m128i*)&src[3 * BPS]);
+ // 00 01 02 03 *
+ // 10 11 12 13 *
+ // 20 21 22 23 *
+ // 30 31 32 33 *
+ // Shuffle.
+ const __m128i src_0 = _mm_unpacklo_epi16(src0, src1);
+ const __m128i src_1 = _mm_unpacklo_epi16(src2, src3);
+ // 00 01 10 11 02 03 12 13 * * ...
+  // 20 21 30 31 22 23 32 33 * * ...
+
+ // Load ref.
+ const __m128i ref0 = _mm_loadl_epi64((const __m128i*)&ref[0 * BPS]);
+ const __m128i ref1 = _mm_loadl_epi64((const __m128i*)&ref[1 * BPS]);
+ const __m128i ref2 = _mm_loadl_epi64((const __m128i*)&ref[2 * BPS]);
+ const __m128i ref3 = _mm_loadl_epi64((const __m128i*)&ref[3 * BPS]);
+ const __m128i ref_0 = _mm_unpacklo_epi16(ref0, ref1);
+ const __m128i ref_1 = _mm_unpacklo_epi16(ref2, ref3);
+
+ // Convert both to 16 bit.
+ const __m128i src_0_16b = _mm_unpacklo_epi8(src_0, zero);
+ const __m128i src_1_16b = _mm_unpacklo_epi8(src_1, zero);
+ const __m128i ref_0_16b = _mm_unpacklo_epi8(ref_0, zero);
+ const __m128i ref_1_16b = _mm_unpacklo_epi8(ref_1, zero);
+
+ // Compute the difference.
+ const __m128i row01 = _mm_sub_epi16(src_0_16b, ref_0_16b);
+ const __m128i row23 = _mm_sub_epi16(src_1_16b, ref_1_16b);
+ __m128i v01, v32;
+
+ // First pass
+ FTransformPass1_SSE2(&row01, &row23, &v01, &v32);
+
+ // Second pass
+ FTransformPass2_SSE2(&v01, &v32, out);
+}
+
+static void FTransform2_SSE2(const uint8_t* src, const uint8_t* ref,
+ int16_t* out) {
+ const __m128i zero = _mm_setzero_si128();
+
+ // Load src and convert to 16b.
+ const __m128i src0 = _mm_loadl_epi64((const __m128i*)&src[0 * BPS]);
+ const __m128i src1 = _mm_loadl_epi64((const __m128i*)&src[1 * BPS]);
+ const __m128i src2 = _mm_loadl_epi64((const __m128i*)&src[2 * BPS]);
+ const __m128i src3 = _mm_loadl_epi64((const __m128i*)&src[3 * BPS]);
+ const __m128i src_0 = _mm_unpacklo_epi8(src0, zero);
+ const __m128i src_1 = _mm_unpacklo_epi8(src1, zero);
+ const __m128i src_2 = _mm_unpacklo_epi8(src2, zero);
+ const __m128i src_3 = _mm_unpacklo_epi8(src3, zero);
+ // Load ref and convert to 16b.
+ const __m128i ref0 = _mm_loadl_epi64((const __m128i*)&ref[0 * BPS]);
+ const __m128i ref1 = _mm_loadl_epi64((const __m128i*)&ref[1 * BPS]);
+ const __m128i ref2 = _mm_loadl_epi64((const __m128i*)&ref[2 * BPS]);
+ const __m128i ref3 = _mm_loadl_epi64((const __m128i*)&ref[3 * BPS]);
+ const __m128i ref_0 = _mm_unpacklo_epi8(ref0, zero);
+ const __m128i ref_1 = _mm_unpacklo_epi8(ref1, zero);
+ const __m128i ref_2 = _mm_unpacklo_epi8(ref2, zero);
+ const __m128i ref_3 = _mm_unpacklo_epi8(ref3, zero);
+ // Compute difference. -> 00 01 02 03 00' 01' 02' 03'
+ const __m128i diff0 = _mm_sub_epi16(src_0, ref_0);
+ const __m128i diff1 = _mm_sub_epi16(src_1, ref_1);
+ const __m128i diff2 = _mm_sub_epi16(src_2, ref_2);
+ const __m128i diff3 = _mm_sub_epi16(src_3, ref_3);
+
+ // Unpack and shuffle
+ // 00 01 02 03 0 0 0 0
+ // 10 11 12 13 0 0 0 0
+ // 20 21 22 23 0 0 0 0
+ // 30 31 32 33 0 0 0 0
+ const __m128i shuf01l = _mm_unpacklo_epi32(diff0, diff1);
+ const __m128i shuf23l = _mm_unpacklo_epi32(diff2, diff3);
+ const __m128i shuf01h = _mm_unpackhi_epi32(diff0, diff1);
+ const __m128i shuf23h = _mm_unpackhi_epi32(diff2, diff3);
+ __m128i v01l, v32l;
+ __m128i v01h, v32h;
+
+ // First pass
+ FTransformPass1_SSE2(&shuf01l, &shuf23l, &v01l, &v32l);
+ FTransformPass1_SSE2(&shuf01h, &shuf23h, &v01h, &v32h);
+
+ // Second pass
+ FTransformPass2_SSE2(&v01l, &v32l, out + 0);
+ FTransformPass2_SSE2(&v01h, &v32h, out + 16);
+}
+
+static void FTransformWHTRow_SSE2(const int16_t* const in, __m128i* const out) {
+ const __m128i kMult = _mm_set_epi16(-1, 1, -1, 1, 1, 1, 1, 1);
+ const __m128i src0 = _mm_loadl_epi64((__m128i*)&in[0 * 16]);
+ const __m128i src1 = _mm_loadl_epi64((__m128i*)&in[1 * 16]);
+ const __m128i src2 = _mm_loadl_epi64((__m128i*)&in[2 * 16]);
+ const __m128i src3 = _mm_loadl_epi64((__m128i*)&in[3 * 16]);
+ const __m128i A01 = _mm_unpacklo_epi16(src0, src1); // A0 A1 | ...
+ const __m128i A23 = _mm_unpacklo_epi16(src2, src3); // A2 A3 | ...
+ const __m128i B0 = _mm_adds_epi16(A01, A23); // a0 | a1 | ...
+ const __m128i B1 = _mm_subs_epi16(A01, A23); // a3 | a2 | ...
+ const __m128i C0 = _mm_unpacklo_epi32(B0, B1); // a0 | a1 | a3 | a2 | ...
+ const __m128i C1 = _mm_unpacklo_epi32(B1, B0); // a3 | a2 | a0 | a1 | ...
+ const __m128i D = _mm_unpacklo_epi64(C0, C1); // a0 a1 a3 a2 a3 a2 a0 a1
+ *out = _mm_madd_epi16(D, kMult);
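+  // i.e. *out = { a0+a1, a3+a2, a3-a2, a0-a1 } as four 32-bit lanes.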
+}
+
+static void FTransformWHT_SSE2(const int16_t* in, int16_t* out) {
+ // Input is 12b signed.
+ __m128i row0, row1, row2, row3;
+ // Rows are 14b signed.
+ FTransformWHTRow_SSE2(in + 0 * 64, &row0);
+ FTransformWHTRow_SSE2(in + 1 * 64, &row1);
+ FTransformWHTRow_SSE2(in + 2 * 64, &row2);
+ FTransformWHTRow_SSE2(in + 3 * 64, &row3);
+
+ {
+ // The a* are 15b signed.
+ const __m128i a0 = _mm_add_epi32(row0, row2);
+ const __m128i a1 = _mm_add_epi32(row1, row3);
+ const __m128i a2 = _mm_sub_epi32(row1, row3);
+ const __m128i a3 = _mm_sub_epi32(row0, row2);
+ const __m128i a0a3 = _mm_packs_epi32(a0, a3);
+ const __m128i a1a2 = _mm_packs_epi32(a1, a2);
+
+ // The b* are 16b signed.
+ const __m128i b0b1 = _mm_add_epi16(a0a3, a1a2);
+ const __m128i b3b2 = _mm_sub_epi16(a0a3, a1a2);
+ const __m128i tmp_b2b3 = _mm_unpackhi_epi64(b3b2, b3b2);
+ const __m128i b2b3 = _mm_unpacklo_epi64(tmp_b2b3, b3b2);
+
+ _mm_storeu_si128((__m128i*)&out[0], _mm_srai_epi16(b0b1, 1));
+ _mm_storeu_si128((__m128i*)&out[8], _mm_srai_epi16(b2b3, 1));
+ }
+}
+
+//------------------------------------------------------------------------------
+// Compute susceptibility based on DCT-coeff histograms:
+// the higher, the "easier" the macroblock is to compress.
+
+static void CollectHistogram_SSE2(const uint8_t* ref, const uint8_t* pred,
+ int start_block, int end_block,
+ VP8Histogram* const histo) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);
+ int j;
+ int distribution[MAX_COEFF_THRESH + 1] = { 0 };
+ for (j = start_block; j < end_block; ++j) {
+ int16_t out[16];
+ int k;
+
+ FTransform_SSE2(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
+
+ // Convert coefficients to bin (within out[]).
+ {
+ // Load.
+ const __m128i out0 = _mm_loadu_si128((__m128i*)&out[0]);
+ const __m128i out1 = _mm_loadu_si128((__m128i*)&out[8]);
+ const __m128i d0 = _mm_sub_epi16(zero, out0);
+ const __m128i d1 = _mm_sub_epi16(zero, out1);
+ const __m128i abs0 = _mm_max_epi16(out0, d0); // abs(v), 16b
+ const __m128i abs1 = _mm_max_epi16(out1, d1);
+ // v = abs(out) >> 3
+ const __m128i v0 = _mm_srai_epi16(abs0, 3);
+ const __m128i v1 = _mm_srai_epi16(abs1, 3);
+ // bin = min(v, MAX_COEFF_THRESH)
+ const __m128i bin0 = _mm_min_epi16(v0, max_coeff_thresh);
+ const __m128i bin1 = _mm_min_epi16(v1, max_coeff_thresh);
+ // Store.
+ _mm_storeu_si128((__m128i*)&out[0], bin0);
+ _mm_storeu_si128((__m128i*)&out[8], bin1);
+ }
+
+ // Convert coefficients to bin.
+ for (k = 0; k < 16; ++k) {
+ ++distribution[out[k]];
+ }
+ }
+ VP8SetHistogramData(distribution, histo);
+}
+
+//------------------------------------------------------------------------------
+// Intra predictions
+
+// helper for chroma-DC predictions
+static WEBP_INLINE void Put8x8uv_SSE2(uint8_t v, uint8_t* dst) {
+ int j;
+ const __m128i values = _mm_set1_epi8(v);
+ for (j = 0; j < 8; ++j) {
+ _mm_storel_epi64((__m128i*)(dst + j * BPS), values);
+ }
+}
+
+static WEBP_INLINE void Put16_SSE2(uint8_t v, uint8_t* dst) {
+ int j;
+ const __m128i values = _mm_set1_epi8(v);
+ for (j = 0; j < 16; ++j) {
+ _mm_store_si128((__m128i*)(dst + j * BPS), values);
+ }
+}
+
+static WEBP_INLINE void Fill_SSE2(uint8_t* dst, int value, int size) {
+ if (size == 4) {
+ int j;
+ for (j = 0; j < 4; ++j) {
+ memset(dst + j * BPS, value, 4);
+ }
+ } else if (size == 8) {
+ Put8x8uv_SSE2(value, dst);
+ } else {
+ Put16_SSE2(value, dst);
+ }
+}
+
+static WEBP_INLINE void VE8uv_SSE2(uint8_t* dst, const uint8_t* top) {
+ int j;
+ const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
+ for (j = 0; j < 8; ++j) {
+ _mm_storel_epi64((__m128i*)(dst + j * BPS), top_values);
+ }
+}
+
+static WEBP_INLINE void VE16_SSE2(uint8_t* dst, const uint8_t* top) {
+ const __m128i top_values = _mm_load_si128((const __m128i*)top);
+ int j;
+ for (j = 0; j < 16; ++j) {
+ _mm_store_si128((__m128i*)(dst + j * BPS), top_values);
+ }
+}
+
+static WEBP_INLINE void VerticalPred_SSE2(uint8_t* dst,
+ const uint8_t* top, int size) {
+ if (top != NULL) {
+ if (size == 8) {
+ VE8uv_SSE2(dst, top);
+ } else {
+ VE16_SSE2(dst, top);
+ }
+ } else {
+ Fill_SSE2(dst, 127, size);
+ }
+}
+
+static WEBP_INLINE void HE8uv_SSE2(uint8_t* dst, const uint8_t* left) {
+ int j;
+ for (j = 0; j < 8; ++j) {
+ const __m128i values = _mm_set1_epi8(left[j]);
+ _mm_storel_epi64((__m128i*)dst, values);
+ dst += BPS;
+ }
+}
+
+static WEBP_INLINE void HE16_SSE2(uint8_t* dst, const uint8_t* left) {
+ int j;
+ for (j = 0; j < 16; ++j) {
+ const __m128i values = _mm_set1_epi8(left[j]);
+ _mm_store_si128((__m128i*)dst, values);
+ dst += BPS;
+ }
+}
+
+static WEBP_INLINE void HorizontalPred_SSE2(uint8_t* dst,
+ const uint8_t* left, int size) {
+ if (left != NULL) {
+ if (size == 8) {
+ HE8uv_SSE2(dst, left);
+ } else {
+ HE16_SSE2(dst, left);
+ }
+ } else {
+ Fill_SSE2(dst, 129, size);
+ }
+}
+
+static WEBP_INLINE void TM_SSE2(uint8_t* dst, const uint8_t* left,
+ const uint8_t* top, int size) {
+ const __m128i zero = _mm_setzero_si128();
+ int y;
+ if (size == 8) {
+ const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
+ const __m128i top_base = _mm_unpacklo_epi8(top_values, zero);
+ for (y = 0; y < 8; ++y, dst += BPS) {
+ const int val = left[y] - left[-1];
+ const __m128i base = _mm_set1_epi16(val);
+ const __m128i out = _mm_packus_epi16(_mm_add_epi16(base, top_base), zero);
+ _mm_storel_epi64((__m128i*)dst, out);
+ }
+ } else {
+ const __m128i top_values = _mm_load_si128((const __m128i*)top);
+ const __m128i top_base_0 = _mm_unpacklo_epi8(top_values, zero);
+ const __m128i top_base_1 = _mm_unpackhi_epi8(top_values, zero);
+ for (y = 0; y < 16; ++y, dst += BPS) {
+ const int val = left[y] - left[-1];
+ const __m128i base = _mm_set1_epi16(val);
+ const __m128i out_0 = _mm_add_epi16(base, top_base_0);
+ const __m128i out_1 = _mm_add_epi16(base, top_base_1);
+ const __m128i out = _mm_packus_epi16(out_0, out_1);
+ _mm_store_si128((__m128i*)dst, out);
+ }
+ }
+}
+
+static WEBP_INLINE void TrueMotion_SSE2(uint8_t* dst, const uint8_t* left,
+ const uint8_t* top, int size) {
+ if (left != NULL) {
+ if (top != NULL) {
+ TM_SSE2(dst, left, top, size);
+ } else {
+ HorizontalPred_SSE2(dst, left, size);
+ }
+ } else {
+ // true motion without left samples (hence: with default 129 value)
+ // is equivalent to VE prediction where you just copy the top samples.
+ // Note that if top samples are not available, the default value is
+ // then 129, and not 127 as in the VerticalPred case.
+ if (top != NULL) {
+ VerticalPred_SSE2(dst, top, size);
+ } else {
+ Fill_SSE2(dst, 129, size);
+ }
+ }
+}
+
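The dispatch above implements VP8's TM ("TrueMotion") rule: each predicted pixel is clip(top[x] + left[y] - top_left). A scalar sketch for reference, assuming BPS is the row stride from dsp.h; the clipping mirrors the unsigned 8-bit saturation of _mm_packus_epi16 in TM_SSE2:

```c
/* Scalar sketch of the TM predictor vectorized in TM_SSE2 above.
 * left[-1] holds the top-left sample. Illustration only. */
static void TrueMotion_C(uint8_t* dst, const uint8_t* left,
                         const uint8_t* top, int size) {
  int x, y;
  for (y = 0; y < size; ++y, dst += BPS) {
    const int delta = left[y] - left[-1];
    for (x = 0; x < size; ++x) {
      const int v = top[x] + delta;
      dst[x] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);  /* clamp [0,255] */
    }
  }
}
```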
+static WEBP_INLINE void DC8uv_SSE2(uint8_t* dst, const uint8_t* left,
+ const uint8_t* top) {
+ const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
+ const __m128i left_values = _mm_loadl_epi64((const __m128i*)left);
+ const __m128i combined = _mm_unpacklo_epi64(top_values, left_values);
+ const int DC = VP8HorizontalAdd8b(&combined) + 8;
+ Put8x8uv_SSE2(DC >> 4, dst);
+}
+
+static WEBP_INLINE void DC8uvNoLeft_SSE2(uint8_t* dst, const uint8_t* top) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
+ const __m128i sum = _mm_sad_epu8(top_values, zero);
+ const int DC = _mm_cvtsi128_si32(sum) + 4;
+ Put8x8uv_SSE2(DC >> 3, dst);
+}
+
+static WEBP_INLINE void DC8uvNoTop_SSE2(uint8_t* dst, const uint8_t* left) {
+ // 'left' is contiguous so we can reuse the top summation.
+ DC8uvNoLeft_SSE2(dst, left);
+}
+
+static WEBP_INLINE void DC8uvNoTopLeft_SSE2(uint8_t* dst) {
+ Put8x8uv_SSE2(0x80, dst);
+}
+
+static WEBP_INLINE void DC8uvMode_SSE2(uint8_t* dst, const uint8_t* left,
+ const uint8_t* top) {
+ if (top != NULL) {
+ if (left != NULL) { // top and left present
+ DC8uv_SSE2(dst, left, top);
+ } else { // top, but no left
+ DC8uvNoLeft_SSE2(dst, top);
+ }
+ } else if (left != NULL) { // left but no top
+ DC8uvNoTop_SSE2(dst, left);
+ } else { // no top, no left, nothing.
+ DC8uvNoTopLeft_SSE2(dst);
+ }
+}
+
+static WEBP_INLINE void DC16_SSE2(uint8_t* dst, const uint8_t* left,
+ const uint8_t* top) {
+ const __m128i top_row = _mm_load_si128((const __m128i*)top);
+ const __m128i left_row = _mm_load_si128((const __m128i*)left);
+ const int DC =
+ VP8HorizontalAdd8b(&top_row) + VP8HorizontalAdd8b(&left_row) + 16;
+ Put16_SSE2(DC >> 5, dst);
+}
+
+static WEBP_INLINE void DC16NoLeft_SSE2(uint8_t* dst, const uint8_t* top) {
+ const __m128i top_row = _mm_load_si128((const __m128i*)top);
+ const int DC = VP8HorizontalAdd8b(&top_row) + 8;
+ Put16_SSE2(DC >> 4, dst);
+}
+
+static WEBP_INLINE void DC16NoTop_SSE2(uint8_t* dst, const uint8_t* left) {
+ // 'left' is contiguous so we can reuse the top summation.
+ DC16NoLeft_SSE2(dst, left);
+}
+
+static WEBP_INLINE void DC16NoTopLeft_SSE2(uint8_t* dst) {
+ Put16_SSE2(0x80, dst);
+}
+
+static WEBP_INLINE void DC16Mode_SSE2(uint8_t* dst, const uint8_t* left,
+ const uint8_t* top) {
+ if (top != NULL) {
+ if (left != NULL) { // top and left present
+ DC16_SSE2(dst, left, top);
+ } else { // top, but no left
+ DC16NoLeft_SSE2(dst, top);
+ }
+ } else if (left != NULL) { // left but no top
+ DC16NoTop_SSE2(dst, left);
+ } else { // no top, no left, nothing.
+ DC16NoTopLeft_SSE2(dst);
+ }
+}
+
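All four DC16 cases reduce to one scalar rule: take the rounded average of whichever borders are available, or fall back to the mid-value 0x80 when neither exists. A sketch of the combined logic (BPS is the row stride from dsp.h; illustration only):

```c
/* Scalar sketch of the DC16 dispatch above. */
#include <string.h>  /* memset() */

static void DC16_C_sketch(uint8_t* dst, const uint8_t* left,
                          const uint8_t* top) {
  int dc = 0, i;
  if (top != NULL && left != NULL) {
    for (i = 0; i < 16; ++i) dc += top[i] + left[i];
    dc = (dc + 16) >> 5;        /* rounded average of 32 samples */
  } else if (top != NULL || left != NULL) {
    const uint8_t* const edge = (top != NULL) ? top : left;
    for (i = 0; i < 16; ++i) dc += edge[i];
    dc = (dc + 8) >> 4;         /* rounded average of 16 samples */
  } else {
    dc = 0x80;                  /* no context at all */
  }
  for (i = 0; i < 16; ++i) memset(dst + i * BPS, dc, 16);
}
```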
+//------------------------------------------------------------------------------
+// 4x4 predictions
+
+#define DST(x, y) dst[(x) + (y) * BPS]
+#define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
+#define AVG2(a, b) (((a) + (b) + 1) >> 1)
+
+// We use the following 8b-arithmetic tricks:
+// (a + 2 * b + c + 2) >> 2 = (AC + b + 1) >> 1
+// where: AC = (a + c) >> 1 = [(a + c + 1) >> 1] - [(a^c) & 1]
+// and:
+// (a + 2 * b + c + 2) >> 2 = ((AB + BC + 1) >> 1) - ((ab|bc) & lsb)
+// where: AB = (a + b + 1) >> 1, BC = (b + c + 1) >> 1
+// and ab = a ^ b, bc = b ^ c, lsb = (AB^BC) & 1
+
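These identities are what let the 4x4 predictors below emulate AVG3 with _mm_avg_epu8 (a rounding byte average) plus a one-bit correction. As a sanity check, the first identity can be verified exhaustively with a tiny standalone program (illustration only):

```c
/* Exhaustive check of the first identity above. avg2() is the rounding
 * byte average that _mm_avg_epu8 computes per lane. */
#include <assert.h>

static int avg2(int x, int y) { return (x + y + 1) >> 1; }

int main(void) {
  int a, b, c;
  for (a = 0; a < 256; ++a) {
    for (b = 0; b < 256; ++b) {
      for (c = 0; c < 256; ++c) {
        const int avg3 = (a + 2 * b + c + 2) >> 2;
        const int ac = avg2(a, c) - ((a ^ c) & 1);   /* == (a + c) >> 1 */
        assert(avg2(ac, b) == avg3);
      }
    }
  }
  return 0;
}
```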
+static WEBP_INLINE void VE4_SSE2(uint8_t* dst,
+ const uint8_t* top) { // vertical
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(top - 1));
+ const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1);
+ const __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 2);
+ const __m128i a = _mm_avg_epu8(ABCDEFGH, CDEFGH00);
+ const __m128i lsb = _mm_and_si128(_mm_xor_si128(ABCDEFGH, CDEFGH00), one);
+ const __m128i b = _mm_subs_epu8(a, lsb);
+ const __m128i avg = _mm_avg_epu8(b, BCDEFGH0);
+ const uint32_t vals = _mm_cvtsi128_si32(avg);
+ int i;
+ for (i = 0; i < 4; ++i) {
+ WebPUint32ToMem(dst + i * BPS, vals);
+ }
+}
+
+static WEBP_INLINE void HE4_SSE2(uint8_t* dst,
+ const uint8_t* top) { // horizontal
+ const int X = top[-1];
+ const int I = top[-2];
+ const int J = top[-3];
+ const int K = top[-4];
+ const int L = top[-5];
+ WebPUint32ToMem(dst + 0 * BPS, 0x01010101U * AVG3(X, I, J));
+ WebPUint32ToMem(dst + 1 * BPS, 0x01010101U * AVG3(I, J, K));
+ WebPUint32ToMem(dst + 2 * BPS, 0x01010101U * AVG3(J, K, L));
+ WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(K, L, L));
+}
+
+static WEBP_INLINE void DC4_SSE2(uint8_t* dst, const uint8_t* top) {
+ uint32_t dc = 4;
+ int i;
+ for (i = 0; i < 4; ++i) dc += top[i] + top[-5 + i];
+ Fill_SSE2(dst, dc >> 3, 4);
+}
+
+static WEBP_INLINE void LD4_SSE2(uint8_t* dst,
+ const uint8_t* top) { // Down-Left
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i ABCDEFGH = _mm_loadl_epi64((const __m128i*)top);
+ const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1);
+ const __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 2);
+ const __m128i CDEFGHH0 = _mm_insert_epi16(CDEFGH00, top[7], 3);
+ const __m128i avg1 = _mm_avg_epu8(ABCDEFGH, CDEFGHH0);
+ const __m128i lsb = _mm_and_si128(_mm_xor_si128(ABCDEFGH, CDEFGHH0), one);
+ const __m128i avg2 = _mm_subs_epu8(avg1, lsb);
+ const __m128i abcdefg = _mm_avg_epu8(avg2, BCDEFGH0);
+ WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32( abcdefg ));
+ WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1)));
+ WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2)));
+ WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)));
+}
+
+static WEBP_INLINE void VR4_SSE2(uint8_t* dst,
+ const uint8_t* top) { // Vertical-Right
+ const __m128i one = _mm_set1_epi8(1);
+ const int I = top[-2];
+ const int J = top[-3];
+ const int K = top[-4];
+ const int X = top[-1];
+ const __m128i XABCD = _mm_loadl_epi64((const __m128i*)(top - 1));
+ const __m128i ABCD0 = _mm_srli_si128(XABCD, 1);
+ const __m128i abcd = _mm_avg_epu8(XABCD, ABCD0);
+ const __m128i _XABCD = _mm_slli_si128(XABCD, 1);
+ const __m128i IXABCD = _mm_insert_epi16(_XABCD, (short)(I | (X << 8)), 0);
+ const __m128i avg1 = _mm_avg_epu8(IXABCD, ABCD0);
+ const __m128i lsb = _mm_and_si128(_mm_xor_si128(IXABCD, ABCD0), one);
+ const __m128i avg2 = _mm_subs_epu8(avg1, lsb);
+ const __m128i efgh = _mm_avg_epu8(avg2, XABCD);
+ WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32( abcd ));
+ WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32( efgh ));
+ WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_slli_si128(abcd, 1)));
+ WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_slli_si128(efgh, 1)));
+
+ // these two are hard to implement in SSE2, so we keep the C-version:
+ DST(0, 2) = AVG3(J, I, X);
+ DST(0, 3) = AVG3(K, J, I);
+}
+
+static WEBP_INLINE void VL4_SSE2(uint8_t* dst,
+ const uint8_t* top) { // Vertical-Left
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i ABCDEFGH = _mm_loadl_epi64((const __m128i*)top);
+ const __m128i BCDEFGH_ = _mm_srli_si128(ABCDEFGH, 1);
+ const __m128i CDEFGH__ = _mm_srli_si128(ABCDEFGH, 2);
+ const __m128i avg1 = _mm_avg_epu8(ABCDEFGH, BCDEFGH_);
+ const __m128i avg2 = _mm_avg_epu8(CDEFGH__, BCDEFGH_);
+ const __m128i avg3 = _mm_avg_epu8(avg1, avg2);
+ const __m128i lsb1 = _mm_and_si128(_mm_xor_si128(avg1, avg2), one);
+ const __m128i ab = _mm_xor_si128(ABCDEFGH, BCDEFGH_);
+ const __m128i bc = _mm_xor_si128(CDEFGH__, BCDEFGH_);
+ const __m128i abbc = _mm_or_si128(ab, bc);
+ const __m128i lsb2 = _mm_and_si128(abbc, lsb1);
+ const __m128i avg4 = _mm_subs_epu8(avg3, lsb2);
+ const uint32_t extra_out = _mm_cvtsi128_si32(_mm_srli_si128(avg4, 4));
+ WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32( avg1 ));
+ WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32( avg4 ));
+ WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(avg1, 1)));
+ WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(avg4, 1)));
+
+  // these last two pixels don't fit the shuffle pattern; extract them
+  // separately
+ DST(3, 2) = (extra_out >> 0) & 0xff;
+ DST(3, 3) = (extra_out >> 8) & 0xff;
+}
+
+static WEBP_INLINE void RD4_SSE2(uint8_t* dst,
+ const uint8_t* top) { // Down-right
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i LKJIXABC = _mm_loadl_epi64((const __m128i*)(top - 5));
+ const __m128i LKJIXABCD = _mm_insert_epi16(LKJIXABC, top[3], 4);
+ const __m128i KJIXABCD_ = _mm_srli_si128(LKJIXABCD, 1);
+ const __m128i JIXABCD__ = _mm_srli_si128(LKJIXABCD, 2);
+ const __m128i avg1 = _mm_avg_epu8(JIXABCD__, LKJIXABCD);
+ const __m128i lsb = _mm_and_si128(_mm_xor_si128(JIXABCD__, LKJIXABCD), one);
+ const __m128i avg2 = _mm_subs_epu8(avg1, lsb);
+ const __m128i abcdefg = _mm_avg_epu8(avg2, KJIXABCD_);
+ WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32( abcdefg ));
+ WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1)));
+ WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2)));
+ WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)));
+}
+
+static WEBP_INLINE void HU4_SSE2(uint8_t* dst, const uint8_t* top) {
+ const int I = top[-2];
+ const int J = top[-3];
+ const int K = top[-4];
+ const int L = top[-5];
+ DST(0, 0) = AVG2(I, J);
+ DST(2, 0) = DST(0, 1) = AVG2(J, K);
+ DST(2, 1) = DST(0, 2) = AVG2(K, L);
+ DST(1, 0) = AVG3(I, J, K);
+ DST(3, 0) = DST(1, 1) = AVG3(J, K, L);
+ DST(3, 1) = DST(1, 2) = AVG3(K, L, L);
+ DST(3, 2) = DST(2, 2) =
+ DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
+}
+
+static WEBP_INLINE void HD4_SSE2(uint8_t* dst, const uint8_t* top) {
+ const int X = top[-1];
+ const int I = top[-2];
+ const int J = top[-3];
+ const int K = top[-4];
+ const int L = top[-5];
+ const int A = top[0];
+ const int B = top[1];
+ const int C = top[2];
+
+ DST(0, 0) = DST(2, 1) = AVG2(I, X);
+ DST(0, 1) = DST(2, 2) = AVG2(J, I);
+ DST(0, 2) = DST(2, 3) = AVG2(K, J);
+ DST(0, 3) = AVG2(L, K);
+
+ DST(3, 0) = AVG3(A, B, C);
+ DST(2, 0) = AVG3(X, A, B);
+ DST(1, 0) = DST(3, 1) = AVG3(I, X, A);
+ DST(1, 1) = DST(3, 2) = AVG3(J, I, X);
+ DST(1, 2) = DST(3, 3) = AVG3(K, J, I);
+ DST(1, 3) = AVG3(L, K, J);
+}
+
+static WEBP_INLINE void TM4_SSE2(uint8_t* dst, const uint8_t* top) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i top_values = _mm_cvtsi32_si128(WebPMemToUint32(top));
+ const __m128i top_base = _mm_unpacklo_epi8(top_values, zero);
+ int y;
+ for (y = 0; y < 4; ++y, dst += BPS) {
+ const int val = top[-2 - y] - top[-1];
+ const __m128i base = _mm_set1_epi16(val);
+ const __m128i out = _mm_packus_epi16(_mm_add_epi16(base, top_base), zero);
+ WebPUint32ToMem(dst, _mm_cvtsi128_si32(out));
+ }
+}
+
+#undef DST
+#undef AVG3
+#undef AVG2
+
+//------------------------------------------------------------------------------
+// luma 4x4 prediction
+
+// Left samples are top[-5 .. -2], top_left is top[-1], top are
+// located at top[0..3], and top right is top[4..7]
+static void Intra4Preds_SSE2(uint8_t* dst, const uint8_t* top) {
+ DC4_SSE2(I4DC4 + dst, top);
+ TM4_SSE2(I4TM4 + dst, top);
+ VE4_SSE2(I4VE4 + dst, top);
+ HE4_SSE2(I4HE4 + dst, top);
+ RD4_SSE2(I4RD4 + dst, top);
+ VR4_SSE2(I4VR4 + dst, top);
+ LD4_SSE2(I4LD4 + dst, top);
+ VL4_SSE2(I4VL4 + dst, top);
+ HD4_SSE2(I4HD4 + dst, top);
+ HU4_SSE2(I4HU4 + dst, top);
+}
+
+//------------------------------------------------------------------------------
+// Chroma 8x8 prediction (paragraph 12.2)
+
+static void IntraChromaPreds_SSE2(uint8_t* dst, const uint8_t* left,
+ const uint8_t* top) {
+ // U block
+ DC8uvMode_SSE2(C8DC8 + dst, left, top);
+ VerticalPred_SSE2(C8VE8 + dst, top, 8);
+ HorizontalPred_SSE2(C8HE8 + dst, left, 8);
+ TrueMotion_SSE2(C8TM8 + dst, left, top, 8);
+ // V block
+ dst += 8;
+ if (top != NULL) top += 8;
+ if (left != NULL) left += 16;
+ DC8uvMode_SSE2(C8DC8 + dst, left, top);
+ VerticalPred_SSE2(C8VE8 + dst, top, 8);
+ HorizontalPred_SSE2(C8HE8 + dst, left, 8);
+ TrueMotion_SSE2(C8TM8 + dst, left, top, 8);
+}
+
+//------------------------------------------------------------------------------
+// luma 16x16 prediction (paragraph 12.3)
+
+static void Intra16Preds_SSE2(uint8_t* dst,
+ const uint8_t* left, const uint8_t* top) {
+ DC16Mode_SSE2(I16DC16 + dst, left, top);
+ VerticalPred_SSE2(I16VE16 + dst, top, 16);
+ HorizontalPred_SSE2(I16HE16 + dst, left, 16);
+ TrueMotion_SSE2(I16TM16 + dst, left, top, 16);
+}
+
+//------------------------------------------------------------------------------
+// Metric
+
+static WEBP_INLINE void SubtractAndAccumulate_SSE2(const __m128i a,
+ const __m128i b,
+ __m128i* const sum) {
+ // take abs(a-b) in 8b
+ const __m128i a_b = _mm_subs_epu8(a, b);
+ const __m128i b_a = _mm_subs_epu8(b, a);
+ const __m128i abs_a_b = _mm_or_si128(a_b, b_a);
+ // zero-extend to 16b
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i C0 = _mm_unpacklo_epi8(abs_a_b, zero);
+ const __m128i C1 = _mm_unpackhi_epi8(abs_a_b, zero);
+ // multiply with self
+ const __m128i sum1 = _mm_madd_epi16(C0, C0);
+ const __m128i sum2 = _mm_madd_epi16(C1, C1);
+ *sum = _mm_add_epi32(sum1, sum2);
+}
+
+static WEBP_INLINE int SSE_16xN_SSE2(const uint8_t* a, const uint8_t* b,
+ int num_pairs) {
+ __m128i sum = _mm_setzero_si128();
+ int32_t tmp[4];
+ int i;
+
+ for (i = 0; i < num_pairs; ++i) {
+ const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[BPS * 0]);
+ const __m128i b0 = _mm_loadu_si128((const __m128i*)&b[BPS * 0]);
+ const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[BPS * 1]);
+ const __m128i b1 = _mm_loadu_si128((const __m128i*)&b[BPS * 1]);
+ __m128i sum1, sum2;
+ SubtractAndAccumulate_SSE2(a0, b0, &sum1);
+ SubtractAndAccumulate_SSE2(a1, b1, &sum2);
+ sum = _mm_add_epi32(sum, _mm_add_epi32(sum1, sum2));
+ a += 2 * BPS;
+ b += 2 * BPS;
+ }
+ _mm_storeu_si128((__m128i*)tmp, sum);
+ return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
+}
+
+static int SSE16x16_SSE2(const uint8_t* a, const uint8_t* b) {
+ return SSE_16xN_SSE2(a, b, 8);
+}
+
+static int SSE16x8_SSE2(const uint8_t* a, const uint8_t* b) {
+ return SSE_16xN_SSE2(a, b, 4);
+}
+
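The kernels in this section all compute the plain sum of squared differences over a block whose rows are BPS bytes apart. A scalar reference, for orientation (illustration only):

```c
/* Scalar reference for the sum-of-squared-errors kernels in this section
 * (BPS is the fixed row stride from dsp.h). */
static int SSE_WxH_C(const uint8_t* a, const uint8_t* b, int w, int h) {
  int x, y, sum = 0;
  for (y = 0; y < h; ++y, a += BPS, b += BPS) {
    for (x = 0; x < w; ++x) {
      const int d = a[x] - b[x];
      sum += d * d;
    }
  }
  return sum;
}
```

Under this sketch, SSE16x16_SSE2 corresponds to SSE_WxH_C(a, b, 16, 16), SSE8x8_SSE2 to (8, 8), and so on.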
+#define LOAD_8x16b(ptr) \
+ _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(ptr)), zero)
+
+static int SSE8x8_SSE2(const uint8_t* a, const uint8_t* b) {
+ const __m128i zero = _mm_setzero_si128();
+ int num_pairs = 4;
+ __m128i sum = zero;
+ int32_t tmp[4];
+ while (num_pairs-- > 0) {
+ const __m128i a0 = LOAD_8x16b(&a[BPS * 0]);
+ const __m128i a1 = LOAD_8x16b(&a[BPS * 1]);
+ const __m128i b0 = LOAD_8x16b(&b[BPS * 0]);
+ const __m128i b1 = LOAD_8x16b(&b[BPS * 1]);
+ // subtract
+ const __m128i c0 = _mm_subs_epi16(a0, b0);
+ const __m128i c1 = _mm_subs_epi16(a1, b1);
+ // multiply/accumulate with self
+ const __m128i d0 = _mm_madd_epi16(c0, c0);
+ const __m128i d1 = _mm_madd_epi16(c1, c1);
+ // collect
+ const __m128i sum01 = _mm_add_epi32(d0, d1);
+ sum = _mm_add_epi32(sum, sum01);
+ a += 2 * BPS;
+ b += 2 * BPS;
+ }
+ _mm_storeu_si128((__m128i*)tmp, sum);
+ return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
+}
+#undef LOAD_8x16b
+
+static int SSE4x4_SSE2(const uint8_t* a, const uint8_t* b) {
+ const __m128i zero = _mm_setzero_si128();
+
+ // Load values. Note that we read 8 pixels instead of 4,
+ // but the a/b buffers are over-allocated to that effect.
+ const __m128i a0 = _mm_loadl_epi64((const __m128i*)&a[BPS * 0]);
+ const __m128i a1 = _mm_loadl_epi64((const __m128i*)&a[BPS * 1]);
+ const __m128i a2 = _mm_loadl_epi64((const __m128i*)&a[BPS * 2]);
+ const __m128i a3 = _mm_loadl_epi64((const __m128i*)&a[BPS * 3]);
+ const __m128i b0 = _mm_loadl_epi64((const __m128i*)&b[BPS * 0]);
+ const __m128i b1 = _mm_loadl_epi64((const __m128i*)&b[BPS * 1]);
+ const __m128i b2 = _mm_loadl_epi64((const __m128i*)&b[BPS * 2]);
+ const __m128i b3 = _mm_loadl_epi64((const __m128i*)&b[BPS * 3]);
+ // Combine pair of lines.
+ const __m128i a01 = _mm_unpacklo_epi32(a0, a1);
+ const __m128i a23 = _mm_unpacklo_epi32(a2, a3);
+ const __m128i b01 = _mm_unpacklo_epi32(b0, b1);
+ const __m128i b23 = _mm_unpacklo_epi32(b2, b3);
+ // Convert to 16b.
+ const __m128i a01s = _mm_unpacklo_epi8(a01, zero);
+ const __m128i a23s = _mm_unpacklo_epi8(a23, zero);
+ const __m128i b01s = _mm_unpacklo_epi8(b01, zero);
+ const __m128i b23s = _mm_unpacklo_epi8(b23, zero);
+ // subtract, square and accumulate
+ const __m128i d0 = _mm_subs_epi16(a01s, b01s);
+ const __m128i d1 = _mm_subs_epi16(a23s, b23s);
+ const __m128i e0 = _mm_madd_epi16(d0, d0);
+ const __m128i e1 = _mm_madd_epi16(d1, d1);
+ const __m128i sum = _mm_add_epi32(e0, e1);
+
+ int32_t tmp[4];
+ _mm_storeu_si128((__m128i*)tmp, sum);
+ return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
+}
+
+//------------------------------------------------------------------------------
+
+static void Mean16x4_SSE2(const uint8_t* ref, uint32_t dc[4]) {
+ const __m128i mask = _mm_set1_epi16(0x00ff);
+ const __m128i a0 = _mm_loadu_si128((const __m128i*)&ref[BPS * 0]);
+ const __m128i a1 = _mm_loadu_si128((const __m128i*)&ref[BPS * 1]);
+ const __m128i a2 = _mm_loadu_si128((const __m128i*)&ref[BPS * 2]);
+ const __m128i a3 = _mm_loadu_si128((const __m128i*)&ref[BPS * 3]);
+ const __m128i b0 = _mm_srli_epi16(a0, 8); // hi byte
+ const __m128i b1 = _mm_srli_epi16(a1, 8);
+ const __m128i b2 = _mm_srli_epi16(a2, 8);
+ const __m128i b3 = _mm_srli_epi16(a3, 8);
+ const __m128i c0 = _mm_and_si128(a0, mask); // lo byte
+ const __m128i c1 = _mm_and_si128(a1, mask);
+ const __m128i c2 = _mm_and_si128(a2, mask);
+ const __m128i c3 = _mm_and_si128(a3, mask);
+ const __m128i d0 = _mm_add_epi32(b0, c0);
+ const __m128i d1 = _mm_add_epi32(b1, c1);
+ const __m128i d2 = _mm_add_epi32(b2, c2);
+ const __m128i d3 = _mm_add_epi32(b3, c3);
+ const __m128i e0 = _mm_add_epi32(d0, d1);
+ const __m128i e1 = _mm_add_epi32(d2, d3);
+ const __m128i f0 = _mm_add_epi32(e0, e1);
+ uint16_t tmp[8];
+ _mm_storeu_si128((__m128i*)tmp, f0);
+ dc[0] = tmp[0] + tmp[1];
+ dc[1] = tmp[2] + tmp[3];
+ dc[2] = tmp[4] + tmp[5];
+ dc[3] = tmp[6] + tmp[7];
+}
+
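Despite the name, Mean16x4 stores the raw sums (not means) of the four 4x4 sub-blocks of a 16-wide, 4-row strip; the caller derives averages from them. A scalar sketch (illustration only):

```c
/* Scalar sketch of Mean16x4: dc[k] receives the sum of the k-th 4x4
 * sub-block of a 16x4 strip with row stride BPS. */
static void Mean16x4_C(const uint8_t* ref, uint32_t dc[4]) {
  int k, x, y;
  for (k = 0; k < 4; ++k) {
    uint32_t sum = 0;
    for (y = 0; y < 4; ++y) {
      for (x = 0; x < 4; ++x) sum += ref[x + 4 * k + y * BPS];
    }
    dc[k] = sum;
  }
}
```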
+//------------------------------------------------------------------------------
+// Texture distortion
+//
+// We try to match the spectral content (weighted) between source and
+// reconstructed samples.
+
+// Hadamard transform
+// Returns the difference between the weighted sums of the absolute values
+// of the transformed coefficients of inA and inB.
+// w[] contains a row-major 4 by 4 symmetric matrix.
+static int TTransform_SSE2(const uint8_t* inA, const uint8_t* inB,
+ const uint16_t* const w) {
+ int32_t sum[4];
+ __m128i tmp_0, tmp_1, tmp_2, tmp_3;
+ const __m128i zero = _mm_setzero_si128();
+
+ // Load and combine inputs.
+ {
+ const __m128i inA_0 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 0]);
+ const __m128i inA_1 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 1]);
+ const __m128i inA_2 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 2]);
+ const __m128i inA_3 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 3]);
+ const __m128i inB_0 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 0]);
+ const __m128i inB_1 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 1]);
+ const __m128i inB_2 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 2]);
+ const __m128i inB_3 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 3]);
+
+ // Combine inA and inB (we'll do two transforms in parallel).
+ const __m128i inAB_0 = _mm_unpacklo_epi32(inA_0, inB_0);
+ const __m128i inAB_1 = _mm_unpacklo_epi32(inA_1, inB_1);
+ const __m128i inAB_2 = _mm_unpacklo_epi32(inA_2, inB_2);
+ const __m128i inAB_3 = _mm_unpacklo_epi32(inA_3, inB_3);
+ tmp_0 = _mm_unpacklo_epi8(inAB_0, zero);
+ tmp_1 = _mm_unpacklo_epi8(inAB_1, zero);
+ tmp_2 = _mm_unpacklo_epi8(inAB_2, zero);
+ tmp_3 = _mm_unpacklo_epi8(inAB_3, zero);
+ // a00 a01 a02 a03 b00 b01 b02 b03
+ // a10 a11 a12 a13 b10 b11 b12 b13
+ // a20 a21 a22 a23 b20 b21 b22 b23
+ // a30 a31 a32 a33 b30 b31 b32 b33
+ }
+
+  // Vertical pass first: the vertical and horizontal passes commute because
+  // w/kWeightY is symmetric, so a single mid-way transpose suffices.
+ {
+ // Calculate a and b (two 4x4 at once).
+ const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
+ const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);
+ const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);
+ const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);
+ const __m128i b0 = _mm_add_epi16(a0, a1);
+ const __m128i b1 = _mm_add_epi16(a3, a2);
+ const __m128i b2 = _mm_sub_epi16(a3, a2);
+ const __m128i b3 = _mm_sub_epi16(a0, a1);
+ // a00 a01 a02 a03 b00 b01 b02 b03
+ // a10 a11 a12 a13 b10 b11 b12 b13
+ // a20 a21 a22 a23 b20 b21 b22 b23
+ // a30 a31 a32 a33 b30 b31 b32 b33
+
+ // Transpose the two 4x4.
+ VP8Transpose_2_4x4_16b(&b0, &b1, &b2, &b3, &tmp_0, &tmp_1, &tmp_2, &tmp_3);
+ }
+
+ // Horizontal pass and difference of weighted sums.
+ {
+ // Load all inputs.
+ const __m128i w_0 = _mm_loadu_si128((const __m128i*)&w[0]);
+ const __m128i w_8 = _mm_loadu_si128((const __m128i*)&w[8]);
+
+ // Calculate a and b (two 4x4 at once).
+ const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
+ const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);
+ const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);
+ const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);
+ const __m128i b0 = _mm_add_epi16(a0, a1);
+ const __m128i b1 = _mm_add_epi16(a3, a2);
+ const __m128i b2 = _mm_sub_epi16(a3, a2);
+ const __m128i b3 = _mm_sub_epi16(a0, a1);
+
+ // Separate the transforms of inA and inB.
+ __m128i A_b0 = _mm_unpacklo_epi64(b0, b1);
+ __m128i A_b2 = _mm_unpacklo_epi64(b2, b3);
+ __m128i B_b0 = _mm_unpackhi_epi64(b0, b1);
+ __m128i B_b2 = _mm_unpackhi_epi64(b2, b3);
+
+ {
+ const __m128i d0 = _mm_sub_epi16(zero, A_b0);
+ const __m128i d1 = _mm_sub_epi16(zero, A_b2);
+ const __m128i d2 = _mm_sub_epi16(zero, B_b0);
+ const __m128i d3 = _mm_sub_epi16(zero, B_b2);
+ A_b0 = _mm_max_epi16(A_b0, d0); // abs(v), 16b
+ A_b2 = _mm_max_epi16(A_b2, d1);
+ B_b0 = _mm_max_epi16(B_b0, d2);
+ B_b2 = _mm_max_epi16(B_b2, d3);
+ }
+
+ // weighted sums
+ A_b0 = _mm_madd_epi16(A_b0, w_0);
+ A_b2 = _mm_madd_epi16(A_b2, w_8);
+ B_b0 = _mm_madd_epi16(B_b0, w_0);
+ B_b2 = _mm_madd_epi16(B_b2, w_8);
+ A_b0 = _mm_add_epi32(A_b0, A_b2);
+ B_b0 = _mm_add_epi32(B_b0, B_b2);
+
+ // difference of weighted sums
+ A_b0 = _mm_sub_epi32(A_b0, B_b0);
+ _mm_storeu_si128((__m128i*)&sum[0], A_b0);
+ }
+ return sum[0] + sum[1] + sum[2] + sum[3];
+}
+
+static int Disto4x4_SSE2(const uint8_t* const a, const uint8_t* const b,
+ const uint16_t* const w) {
+ const int diff_sum = TTransform_SSE2(a, b, w);
+ return abs(diff_sum) >> 5;
+}
+
+static int Disto16x16_SSE2(const uint8_t* const a, const uint8_t* const b,
+ const uint16_t* const w) {
+ int D = 0;
+ int x, y;
+ for (y = 0; y < 16 * BPS; y += 4 * BPS) {
+ for (x = 0; x < 16; x += 4) {
+ D += Disto4x4_SSE2(a + x + y, b + x + y, w);
+ }
+ }
+ return D;
+}
+
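A scalar rendition of this metric, in the spirit of libwebp's portable C path (a sketch; names are illustrative): transform each 4x4 block with the Walsh-Hadamard-style butterflies above, then take the difference of the weighted sums of absolute coefficients, normalized by >> 5 as in Disto4x4_SSE2.

```c
/* Scalar sketch of the texture-distortion metric. */
#include <stdlib.h>  /* abs() */

static int TTransform_C_sketch(const uint8_t* in, const uint16_t* w) {
  int i, sum = 0;
  int tmp[16];
  for (i = 0; i < 4; ++i, in += BPS) {   /* horizontal pass */
    const int a0 = in[0] + in[2], a1 = in[1] + in[3];
    const int a2 = in[1] - in[3], a3 = in[0] - in[2];
    tmp[0 + i * 4] = a0 + a1;
    tmp[1 + i * 4] = a3 + a2;
    tmp[2 + i * 4] = a3 - a2;
    tmp[3 + i * 4] = a0 - a1;
  }
  for (i = 0; i < 4; ++i, ++w) {         /* vertical pass + weighting */
    const int a0 = tmp[0 + i] + tmp[8 + i], a1 = tmp[4 + i] + tmp[12 + i];
    const int a2 = tmp[4 + i] - tmp[12 + i], a3 = tmp[0 + i] - tmp[8 + i];
    sum += w[0] * abs(a0 + a1) + w[4] * abs(a3 + a2)
         + w[8] * abs(a3 - a2) + w[12] * abs(a0 - a1);
  }
  return sum;
}

static int Disto4x4_C_sketch(const uint8_t* a, const uint8_t* b,
                             const uint16_t* w) {
  return abs(TTransform_C_sketch(a, w) - TTransform_C_sketch(b, w)) >> 5;
}
```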
+//------------------------------------------------------------------------------
+// Quantization
+//
+
+static WEBP_INLINE int DoQuantizeBlock_SSE2(int16_t in[16], int16_t out[16],
+ const uint16_t* const sharpen,
+ const VP8Matrix* const mtx) {
+ const __m128i max_coeff_2047 = _mm_set1_epi16(MAX_LEVEL);
+ const __m128i zero = _mm_setzero_si128();
+ __m128i coeff0, coeff8;
+ __m128i out0, out8;
+ __m128i packed_out;
+
+ // Load all inputs.
+ __m128i in0 = _mm_loadu_si128((__m128i*)&in[0]);
+ __m128i in8 = _mm_loadu_si128((__m128i*)&in[8]);
+ const __m128i iq0 = _mm_loadu_si128((const __m128i*)&mtx->iq_[0]);
+ const __m128i iq8 = _mm_loadu_si128((const __m128i*)&mtx->iq_[8]);
+ const __m128i q0 = _mm_loadu_si128((const __m128i*)&mtx->q_[0]);
+ const __m128i q8 = _mm_loadu_si128((const __m128i*)&mtx->q_[8]);
+
+ // extract sign(in) (0x0000 if positive, 0xffff if negative)
+ const __m128i sign0 = _mm_cmpgt_epi16(zero, in0);
+ const __m128i sign8 = _mm_cmpgt_epi16(zero, in8);
+
+ // coeff = abs(in) = (in ^ sign) - sign
+ coeff0 = _mm_xor_si128(in0, sign0);
+ coeff8 = _mm_xor_si128(in8, sign8);
+ coeff0 = _mm_sub_epi16(coeff0, sign0);
+ coeff8 = _mm_sub_epi16(coeff8, sign8);
+
+ // coeff = abs(in) + sharpen
+ if (sharpen != NULL) {
+ const __m128i sharpen0 = _mm_loadu_si128((const __m128i*)&sharpen[0]);
+ const __m128i sharpen8 = _mm_loadu_si128((const __m128i*)&sharpen[8]);
+ coeff0 = _mm_add_epi16(coeff0, sharpen0);
+ coeff8 = _mm_add_epi16(coeff8, sharpen8);
+ }
+
+ // out = (coeff * iQ + B) >> QFIX
+ {
+ // doing calculations with 32b precision (QFIX=17)
+ // out = (coeff * iQ)
+ const __m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0);
+ const __m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0);
+ const __m128i coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8);
+ const __m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8);
+ __m128i out_00 = _mm_unpacklo_epi16(coeff_iQ0L, coeff_iQ0H);
+ __m128i out_04 = _mm_unpackhi_epi16(coeff_iQ0L, coeff_iQ0H);
+ __m128i out_08 = _mm_unpacklo_epi16(coeff_iQ8L, coeff_iQ8H);
+ __m128i out_12 = _mm_unpackhi_epi16(coeff_iQ8L, coeff_iQ8H);
+ // out = (coeff * iQ + B)
+ const __m128i bias_00 = _mm_loadu_si128((const __m128i*)&mtx->bias_[0]);
+ const __m128i bias_04 = _mm_loadu_si128((const __m128i*)&mtx->bias_[4]);
+ const __m128i bias_08 = _mm_loadu_si128((const __m128i*)&mtx->bias_[8]);
+ const __m128i bias_12 = _mm_loadu_si128((const __m128i*)&mtx->bias_[12]);
+ out_00 = _mm_add_epi32(out_00, bias_00);
+ out_04 = _mm_add_epi32(out_04, bias_04);
+ out_08 = _mm_add_epi32(out_08, bias_08);
+ out_12 = _mm_add_epi32(out_12, bias_12);
+ // out = QUANTDIV(coeff, iQ, B, QFIX)
+ out_00 = _mm_srai_epi32(out_00, QFIX);
+ out_04 = _mm_srai_epi32(out_04, QFIX);
+ out_08 = _mm_srai_epi32(out_08, QFIX);
+ out_12 = _mm_srai_epi32(out_12, QFIX);
+
+ // pack result as 16b
+ out0 = _mm_packs_epi32(out_00, out_04);
+ out8 = _mm_packs_epi32(out_08, out_12);
+
+ // if (coeff > 2047) coeff = 2047
+ out0 = _mm_min_epi16(out0, max_coeff_2047);
+ out8 = _mm_min_epi16(out8, max_coeff_2047);
+ }
+
+ // get sign back (if (sign[j]) out_n = -out_n)
+ out0 = _mm_xor_si128(out0, sign0);
+ out8 = _mm_xor_si128(out8, sign8);
+ out0 = _mm_sub_epi16(out0, sign0);
+ out8 = _mm_sub_epi16(out8, sign8);
+
+ // in = out * Q
+ in0 = _mm_mullo_epi16(out0, q0);
+ in8 = _mm_mullo_epi16(out8, q8);
+
+ _mm_storeu_si128((__m128i*)&in[0], in0);
+ _mm_storeu_si128((__m128i*)&in[8], in8);
+
+ // zigzag the output before storing it.
+ //
+  // The zigzag pattern can almost be reproduced with a small sequence of
+  // shuffles. Afterwards, only the 7th value (which lands in position 3
+  // instead of 12) and the 8th value still need to be swapped.
+ {
+ __m128i outZ0, outZ8;
+ outZ0 = _mm_shufflehi_epi16(out0, _MM_SHUFFLE(2, 1, 3, 0));
+ outZ0 = _mm_shuffle_epi32 (outZ0, _MM_SHUFFLE(3, 1, 2, 0));
+ outZ0 = _mm_shufflehi_epi16(outZ0, _MM_SHUFFLE(3, 1, 0, 2));
+ outZ8 = _mm_shufflelo_epi16(out8, _MM_SHUFFLE(3, 0, 2, 1));
+ outZ8 = _mm_shuffle_epi32 (outZ8, _MM_SHUFFLE(3, 1, 2, 0));
+ outZ8 = _mm_shufflelo_epi16(outZ8, _MM_SHUFFLE(1, 3, 2, 0));
+ _mm_storeu_si128((__m128i*)&out[0], outZ0);
+ _mm_storeu_si128((__m128i*)&out[8], outZ8);
+ packed_out = _mm_packs_epi16(outZ0, outZ8);
+ }
+ {
+ const int16_t outZ_12 = out[12];
+ const int16_t outZ_3 = out[3];
+ out[3] = outZ_12;
+ out[12] = outZ_3;
+ }
+
+ // detect if all 'out' values are zeroes or not
+ return (_mm_movemask_epi8(_mm_cmpeq_epi8(packed_out, zero)) != 0xffff);
+}
+
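Per coefficient, the vector code above implements level = min(((|in| + sharpen) * iq + bias) >> QFIX, MAX_LEVEL), restores the sign, dequantizes back into in[], and emits levels in zigzag order. A scalar sketch under those assumptions (QFIX = 17, MAX_LEVEL = 2047, and VP8Matrix come from the encoder headers; illustration only):

```c
/* Scalar sketch of DoQuantizeBlock. kZigzag is the standard 4x4 scan. */
static const int kZigzag[16] = {
  0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
};

static int QuantizeBlock_C_sketch(int16_t in[16], int16_t out[16],
                                  const uint16_t* sharpen,
                                  const VP8Matrix* mtx) {
  int n, nz = 0;
  for (n = 0; n < 16; ++n) {
    const int j = kZigzag[n];
    const int sign = (in[j] < 0);
    const uint32_t coeff = (sign ? -in[j] : in[j])
                         + (sharpen != NULL ? sharpen[j] : 0);
    int level = (int)((coeff * mtx->iq_[j] + mtx->bias_[j]) >> QFIX);
    if (level > MAX_LEVEL) level = MAX_LEVEL;  /* clamp to 2047 */
    if (sign) level = -level;
    in[j] = (int16_t)(level * mtx->q_[j]);     /* dequantized value */
    out[n] = (int16_t)level;                   /* zigzag-ordered level */
    nz |= (level != 0);
  }
  return nz;
}
```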
+static int QuantizeBlock_SSE2(int16_t in[16], int16_t out[16],
+ const VP8Matrix* const mtx) {
+ return DoQuantizeBlock_SSE2(in, out, &mtx->sharpen_[0], mtx);
+}
+
+static int QuantizeBlockWHT_SSE2(int16_t in[16], int16_t out[16],
+ const VP8Matrix* const mtx) {
+ return DoQuantizeBlock_SSE2(in, out, NULL, mtx);
+}
+
+static int Quantize2Blocks_SSE2(int16_t in[32], int16_t out[32],
+ const VP8Matrix* const mtx) {
+ int nz;
+ const uint16_t* const sharpen = &mtx->sharpen_[0];
+ nz = DoQuantizeBlock_SSE2(in + 0 * 16, out + 0 * 16, sharpen, mtx) << 0;
+ nz |= DoQuantizeBlock_SSE2(in + 1 * 16, out + 1 * 16, sharpen, mtx) << 1;
+ return nz;
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8EncDspInitSSE2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitSSE2(void) {
+ VP8CollectHistogram = CollectHistogram_SSE2;
+ VP8EncPredLuma16 = Intra16Preds_SSE2;
+ VP8EncPredChroma8 = IntraChromaPreds_SSE2;
+ VP8EncPredLuma4 = Intra4Preds_SSE2;
+ VP8EncQuantizeBlock = QuantizeBlock_SSE2;
+ VP8EncQuantize2Blocks = Quantize2Blocks_SSE2;
+ VP8EncQuantizeBlockWHT = QuantizeBlockWHT_SSE2;
+ VP8ITransform = ITransform_SSE2;
+ VP8FTransform = FTransform_SSE2;
+ VP8FTransform2 = FTransform2_SSE2;
+ VP8FTransformWHT = FTransformWHT_SSE2;
+ VP8SSE16x16 = SSE16x16_SSE2;
+ VP8SSE16x8 = SSE16x8_SSE2;
+ VP8SSE8x8 = SSE8x8_SSE2;
+ VP8SSE4x4 = SSE4x4_SSE2;
+ VP8TDisto4x4 = Disto4x4_SSE2;
+ VP8TDisto16x16 = Disto16x16_SSE2;
+ VP8Mean16x4 = Mean16x4_SSE2;
+}
+
+#else // !WEBP_USE_SSE2
+
+WEBP_DSP_INIT_STUB(VP8EncDspInitSSE2)
+
+#endif // WEBP_USE_SSE2
diff --git a/media/libwebp/dsp/enc_sse41.c b/media/libwebp/dsp/enc_sse41.c
new file mode 100644
index 0000000000..09ea29361d
--- /dev/null
+++ b/media/libwebp/dsp/enc_sse41.c
@@ -0,0 +1,339 @@
+// Copyright 2015 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// SSE4 version of some encoding functions.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_SSE41)
+#include <smmintrin.h>
+#include <stdlib.h> // for abs()
+
+#include "../dsp/common_sse2.h"
+#include "../enc/vp8i_enc.h"
+
+//------------------------------------------------------------------------------
+// Compute susceptibility based on DCT-coeff histograms.
+
+static void CollectHistogram_SSE41(const uint8_t* ref, const uint8_t* pred,
+ int start_block, int end_block,
+ VP8Histogram* const histo) {
+ const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);
+ int j;
+ int distribution[MAX_COEFF_THRESH + 1] = { 0 };
+ for (j = start_block; j < end_block; ++j) {
+ int16_t out[16];
+ int k;
+
+ VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
+
+ // Convert coefficients to bin (within out[]).
+ {
+ // Load.
+ const __m128i out0 = _mm_loadu_si128((__m128i*)&out[0]);
+ const __m128i out1 = _mm_loadu_si128((__m128i*)&out[8]);
+ // v = abs(out) >> 3
+ const __m128i abs0 = _mm_abs_epi16(out0);
+ const __m128i abs1 = _mm_abs_epi16(out1);
+ const __m128i v0 = _mm_srai_epi16(abs0, 3);
+ const __m128i v1 = _mm_srai_epi16(abs1, 3);
+ // bin = min(v, MAX_COEFF_THRESH)
+ const __m128i bin0 = _mm_min_epi16(v0, max_coeff_thresh);
+ const __m128i bin1 = _mm_min_epi16(v1, max_coeff_thresh);
+ // Store.
+ _mm_storeu_si128((__m128i*)&out[0], bin0);
+ _mm_storeu_si128((__m128i*)&out[8], bin1);
+ }
+
+    // Accumulate the bins into the histogram distribution.
+ for (k = 0; k < 16; ++k) {
+ ++distribution[out[k]];
+ }
+ }
+ VP8SetHistogramData(distribution, histo);
+}
+
+//------------------------------------------------------------------------------
+// Texture distortion
+//
+// We try to match the spectral content (weighted) between source and
+// reconstructed samples.
+
+// Hadamard transform
+// Returns the difference between the weighted sums of the absolute values
+// of the transformed coefficients of inA and inB.
+// w[] contains a row-major 4 by 4 symmetric matrix.
+static int TTransform_SSE41(const uint8_t* inA, const uint8_t* inB,
+ const uint16_t* const w) {
+ int32_t sum[4];
+ __m128i tmp_0, tmp_1, tmp_2, tmp_3;
+
+ // Load and combine inputs.
+ {
+ const __m128i inA_0 = _mm_loadu_si128((const __m128i*)&inA[BPS * 0]);
+ const __m128i inA_1 = _mm_loadu_si128((const __m128i*)&inA[BPS * 1]);
+ const __m128i inA_2 = _mm_loadu_si128((const __m128i*)&inA[BPS * 2]);
+    // With SSE4.1 and gcc 4.8 at least (possibly other versions),
+    // _mm_loadu_si128 is faster than _mm_loadl_epi64. For the last row
+    // of inA and inB, however, _mm_loadl_epi64 is still used to avoid an
+    // out-of-bounds read.
+ const __m128i inA_3 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 3]);
+ const __m128i inB_0 = _mm_loadu_si128((const __m128i*)&inB[BPS * 0]);
+ const __m128i inB_1 = _mm_loadu_si128((const __m128i*)&inB[BPS * 1]);
+ const __m128i inB_2 = _mm_loadu_si128((const __m128i*)&inB[BPS * 2]);
+ const __m128i inB_3 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 3]);
+
+ // Combine inA and inB (we'll do two transforms in parallel).
+ const __m128i inAB_0 = _mm_unpacklo_epi32(inA_0, inB_0);
+ const __m128i inAB_1 = _mm_unpacklo_epi32(inA_1, inB_1);
+ const __m128i inAB_2 = _mm_unpacklo_epi32(inA_2, inB_2);
+ const __m128i inAB_3 = _mm_unpacklo_epi32(inA_3, inB_3);
+ tmp_0 = _mm_cvtepu8_epi16(inAB_0);
+ tmp_1 = _mm_cvtepu8_epi16(inAB_1);
+ tmp_2 = _mm_cvtepu8_epi16(inAB_2);
+ tmp_3 = _mm_cvtepu8_epi16(inAB_3);
+ // a00 a01 a02 a03 b00 b01 b02 b03
+ // a10 a11 a12 a13 b10 b11 b12 b13
+ // a20 a21 a22 a23 b20 b21 b22 b23
+ // a30 a31 a32 a33 b30 b31 b32 b33
+ }
+
+  // Vertical pass first: the vertical and horizontal passes commute because
+  // w/kWeightY is symmetric, so a single mid-way transpose suffices.
+ {
+ // Calculate a and b (two 4x4 at once).
+ const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
+ const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);
+ const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);
+ const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);
+ const __m128i b0 = _mm_add_epi16(a0, a1);
+ const __m128i b1 = _mm_add_epi16(a3, a2);
+ const __m128i b2 = _mm_sub_epi16(a3, a2);
+ const __m128i b3 = _mm_sub_epi16(a0, a1);
+ // a00 a01 a02 a03 b00 b01 b02 b03
+ // a10 a11 a12 a13 b10 b11 b12 b13
+ // a20 a21 a22 a23 b20 b21 b22 b23
+ // a30 a31 a32 a33 b30 b31 b32 b33
+
+ // Transpose the two 4x4.
+ VP8Transpose_2_4x4_16b(&b0, &b1, &b2, &b3, &tmp_0, &tmp_1, &tmp_2, &tmp_3);
+ }
+
+ // Horizontal pass and difference of weighted sums.
+ {
+ // Load all inputs.
+ const __m128i w_0 = _mm_loadu_si128((const __m128i*)&w[0]);
+ const __m128i w_8 = _mm_loadu_si128((const __m128i*)&w[8]);
+
+ // Calculate a and b (two 4x4 at once).
+ const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
+ const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);
+ const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);
+ const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);
+ const __m128i b0 = _mm_add_epi16(a0, a1);
+ const __m128i b1 = _mm_add_epi16(a3, a2);
+ const __m128i b2 = _mm_sub_epi16(a3, a2);
+ const __m128i b3 = _mm_sub_epi16(a0, a1);
+
+ // Separate the transforms of inA and inB.
+ __m128i A_b0 = _mm_unpacklo_epi64(b0, b1);
+ __m128i A_b2 = _mm_unpacklo_epi64(b2, b3);
+ __m128i B_b0 = _mm_unpackhi_epi64(b0, b1);
+ __m128i B_b2 = _mm_unpackhi_epi64(b2, b3);
+
+ A_b0 = _mm_abs_epi16(A_b0);
+ A_b2 = _mm_abs_epi16(A_b2);
+ B_b0 = _mm_abs_epi16(B_b0);
+ B_b2 = _mm_abs_epi16(B_b2);
+
+ // weighted sums
+ A_b0 = _mm_madd_epi16(A_b0, w_0);
+ A_b2 = _mm_madd_epi16(A_b2, w_8);
+ B_b0 = _mm_madd_epi16(B_b0, w_0);
+ B_b2 = _mm_madd_epi16(B_b2, w_8);
+ A_b0 = _mm_add_epi32(A_b0, A_b2);
+ B_b0 = _mm_add_epi32(B_b0, B_b2);
+
+ // difference of weighted sums
+ A_b2 = _mm_sub_epi32(A_b0, B_b0);
+ _mm_storeu_si128((__m128i*)&sum[0], A_b2);
+ }
+ return sum[0] + sum[1] + sum[2] + sum[3];
+}
+
+static int Disto4x4_SSE41(const uint8_t* const a, const uint8_t* const b,
+ const uint16_t* const w) {
+ const int diff_sum = TTransform_SSE41(a, b, w);
+ return abs(diff_sum) >> 5;
+}
+
+static int Disto16x16_SSE41(const uint8_t* const a, const uint8_t* const b,
+ const uint16_t* const w) {
+ int D = 0;
+ int x, y;
+ for (y = 0; y < 16 * BPS; y += 4 * BPS) {
+ for (x = 0; x < 16; x += 4) {
+ D += Disto4x4_SSE41(a + x + y, b + x + y, w);
+ }
+ }
+ return D;
+}
+
+//------------------------------------------------------------------------------
+// Quantization
+//
+
+// Generates a pshufb constant for shuffling 16b words.
+#define PSHUFB_CST(A,B,C,D,E,F,G,H) \
+ _mm_set_epi8(2 * (H) + 1, 2 * (H) + 0, 2 * (G) + 1, 2 * (G) + 0, \
+ 2 * (F) + 1, 2 * (F) + 0, 2 * (E) + 1, 2 * (E) + 0, \
+ 2 * (D) + 1, 2 * (D) + 0, 2 * (C) + 1, 2 * (C) + 0, \
+ 2 * (B) + 1, 2 * (B) + 0, 2 * (A) + 1, 2 * (A) + 0)
+
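PSHUFB_CST expands each 16-bit word index into the two control bytes _mm_shuffle_epi8 expects; a negative index produces bytes 0xFE/0xFF, whose set high bit makes pshufb write zero to that lane. A hypothetical standalone illustration, reusing the macro defined above:

```c
/* Demo of PSHUFB_CST: shuffle eight 16-bit words, zeroing lane 3. */
#include <stdio.h>
#include <tmmintrin.h>  /* SSSE3 _mm_shuffle_epi8 */

int main(void) {
  const __m128i words = _mm_setr_epi16(10, 11, 12, 13, 14, 15, 16, 17);
  const __m128i ctrl = PSHUFB_CST(0, 1, 4, -1, 5, 2, 3, 6);
  short v[8];
  _mm_storeu_si128((__m128i*)v, _mm_shuffle_epi8(words, ctrl));
  printf("%d %d %d %d %d %d %d %d\n",  /* prints: 10 11 14 0 15 12 13 16 */
         v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
  return 0;
}
```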
+static WEBP_INLINE int DoQuantizeBlock_SSE41(int16_t in[16], int16_t out[16],
+ const uint16_t* const sharpen,
+ const VP8Matrix* const mtx) {
+ const __m128i max_coeff_2047 = _mm_set1_epi16(MAX_LEVEL);
+ const __m128i zero = _mm_setzero_si128();
+ __m128i out0, out8;
+ __m128i packed_out;
+
+ // Load all inputs.
+ __m128i in0 = _mm_loadu_si128((__m128i*)&in[0]);
+ __m128i in8 = _mm_loadu_si128((__m128i*)&in[8]);
+ const __m128i iq0 = _mm_loadu_si128((const __m128i*)&mtx->iq_[0]);
+ const __m128i iq8 = _mm_loadu_si128((const __m128i*)&mtx->iq_[8]);
+ const __m128i q0 = _mm_loadu_si128((const __m128i*)&mtx->q_[0]);
+ const __m128i q8 = _mm_loadu_si128((const __m128i*)&mtx->q_[8]);
+
+ // coeff = abs(in)
+ __m128i coeff0 = _mm_abs_epi16(in0);
+ __m128i coeff8 = _mm_abs_epi16(in8);
+
+ // coeff = abs(in) + sharpen
+ if (sharpen != NULL) {
+ const __m128i sharpen0 = _mm_loadu_si128((const __m128i*)&sharpen[0]);
+ const __m128i sharpen8 = _mm_loadu_si128((const __m128i*)&sharpen[8]);
+ coeff0 = _mm_add_epi16(coeff0, sharpen0);
+ coeff8 = _mm_add_epi16(coeff8, sharpen8);
+ }
+
+ // out = (coeff * iQ + B) >> QFIX
+ {
+ // doing calculations with 32b precision (QFIX=17)
+ // out = (coeff * iQ)
+ const __m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0);
+ const __m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0);
+ const __m128i coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8);
+ const __m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8);
+ __m128i out_00 = _mm_unpacklo_epi16(coeff_iQ0L, coeff_iQ0H);
+ __m128i out_04 = _mm_unpackhi_epi16(coeff_iQ0L, coeff_iQ0H);
+ __m128i out_08 = _mm_unpacklo_epi16(coeff_iQ8L, coeff_iQ8H);
+ __m128i out_12 = _mm_unpackhi_epi16(coeff_iQ8L, coeff_iQ8H);
+ // out = (coeff * iQ + B)
+ const __m128i bias_00 = _mm_loadu_si128((const __m128i*)&mtx->bias_[0]);
+ const __m128i bias_04 = _mm_loadu_si128((const __m128i*)&mtx->bias_[4]);
+ const __m128i bias_08 = _mm_loadu_si128((const __m128i*)&mtx->bias_[8]);
+ const __m128i bias_12 = _mm_loadu_si128((const __m128i*)&mtx->bias_[12]);
+ out_00 = _mm_add_epi32(out_00, bias_00);
+ out_04 = _mm_add_epi32(out_04, bias_04);
+ out_08 = _mm_add_epi32(out_08, bias_08);
+ out_12 = _mm_add_epi32(out_12, bias_12);
+ // out = QUANTDIV(coeff, iQ, B, QFIX)
+ out_00 = _mm_srai_epi32(out_00, QFIX);
+ out_04 = _mm_srai_epi32(out_04, QFIX);
+ out_08 = _mm_srai_epi32(out_08, QFIX);
+ out_12 = _mm_srai_epi32(out_12, QFIX);
+
+ // pack result as 16b
+ out0 = _mm_packs_epi32(out_00, out_04);
+ out8 = _mm_packs_epi32(out_08, out_12);
+
+ // if (coeff > 2047) coeff = 2047
+ out0 = _mm_min_epi16(out0, max_coeff_2047);
+ out8 = _mm_min_epi16(out8, max_coeff_2047);
+ }
+
+ // put sign back
+ out0 = _mm_sign_epi16(out0, in0);
+ out8 = _mm_sign_epi16(out8, in8);
+
+ // in = out * Q
+ in0 = _mm_mullo_epi16(out0, q0);
+ in8 = _mm_mullo_epi16(out8, q8);
+
+ _mm_storeu_si128((__m128i*)&in[0], in0);
+ _mm_storeu_si128((__m128i*)&in[8], in8);
+
+ // zigzag the output before storing it. The re-ordering is:
+ // 0 1 2 3 4 5 6 7 | 8 9 10 11 12 13 14 15
+ // -> 0 1 4[8]5 2 3 6 | 9 12 13 10 [7]11 14 15
+  // There are only two misplaced entries ([8] and [7]), and they cross the
+  // register boundary.
+ // We use pshufb instead of pshuflo/pshufhi.
+ {
+ const __m128i kCst_lo = PSHUFB_CST(0, 1, 4, -1, 5, 2, 3, 6);
+ const __m128i kCst_7 = PSHUFB_CST(-1, -1, -1, -1, 7, -1, -1, -1);
+ const __m128i tmp_lo = _mm_shuffle_epi8(out0, kCst_lo);
+ const __m128i tmp_7 = _mm_shuffle_epi8(out0, kCst_7); // extract #7
+ const __m128i kCst_hi = PSHUFB_CST(1, 4, 5, 2, -1, 3, 6, 7);
+ const __m128i kCst_8 = PSHUFB_CST(-1, -1, -1, 0, -1, -1, -1, -1);
+ const __m128i tmp_hi = _mm_shuffle_epi8(out8, kCst_hi);
+ const __m128i tmp_8 = _mm_shuffle_epi8(out8, kCst_8); // extract #8
+ const __m128i out_z0 = _mm_or_si128(tmp_lo, tmp_8);
+ const __m128i out_z8 = _mm_or_si128(tmp_hi, tmp_7);
+ _mm_storeu_si128((__m128i*)&out[0], out_z0);
+ _mm_storeu_si128((__m128i*)&out[8], out_z8);
+ packed_out = _mm_packs_epi16(out_z0, out_z8);
+ }
+
+ // detect if all 'out' values are zeroes or not
+ return (_mm_movemask_epi8(_mm_cmpeq_epi8(packed_out, zero)) != 0xffff);
+}
+
+#undef PSHUFB_CST
+
+static int QuantizeBlock_SSE41(int16_t in[16], int16_t out[16],
+ const VP8Matrix* const mtx) {
+ return DoQuantizeBlock_SSE41(in, out, &mtx->sharpen_[0], mtx);
+}
+
+static int QuantizeBlockWHT_SSE41(int16_t in[16], int16_t out[16],
+ const VP8Matrix* const mtx) {
+ return DoQuantizeBlock_SSE41(in, out, NULL, mtx);
+}
+
+static int Quantize2Blocks_SSE41(int16_t in[32], int16_t out[32],
+ const VP8Matrix* const mtx) {
+ int nz;
+ const uint16_t* const sharpen = &mtx->sharpen_[0];
+ nz = DoQuantizeBlock_SSE41(in + 0 * 16, out + 0 * 16, sharpen, mtx) << 0;
+ nz |= DoQuantizeBlock_SSE41(in + 1 * 16, out + 1 * 16, sharpen, mtx) << 1;
+ return nz;
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8EncDspInitSSE41(void);
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitSSE41(void) {
+ VP8CollectHistogram = CollectHistogram_SSE41;
+ VP8EncQuantizeBlock = QuantizeBlock_SSE41;
+ VP8EncQuantize2Blocks = Quantize2Blocks_SSE41;
+ VP8EncQuantizeBlockWHT = QuantizeBlockWHT_SSE41;
+ VP8TDisto4x4 = Disto4x4_SSE41;
+ VP8TDisto16x16 = Disto16x16_SSE41;
+}
+
+#else // !WEBP_USE_SSE41
+
+WEBP_DSP_INIT_STUB(VP8EncDspInitSSE41)
+
+#endif // WEBP_USE_SSE41
diff --git a/media/libwebp/dsp/filters.c b/media/libwebp/dsp/filters.c
index dea3eb4101..b0c659478f 100644
--- a/media/libwebp/dsp/filters.c
+++ b/media/libwebp/dsp/filters.c
@@ -33,9 +33,9 @@ static WEBP_INLINE void PredictLine_C(const uint8_t* src, const uint8_t* pred,
uint8_t* dst, int length, int inverse) {
int i;
if (inverse) {
- for (i = 0; i < length; ++i) dst[i] = src[i] + pred[i];
+ for (i = 0; i < length; ++i) dst[i] = (uint8_t)(src[i] + pred[i]);
} else {
- for (i = 0; i < length; ++i) dst[i] = src[i] - pred[i];
+ for (i = 0; i < length; ++i) dst[i] = (uint8_t)(src[i] - pred[i]);
}
}
@@ -155,7 +155,7 @@ static WEBP_INLINE void DoGradientFilter_C(const uint8_t* in,
const int pred = GradientPredictor_C(preds[w - 1],
preds[w - stride],
preds[w - stride - 1]);
- out[w] = in[w] + (inverse ? pred : -pred);
+ out[w] = (uint8_t)(in[w] + (inverse ? pred : -pred));
}
++row;
preds += stride;
@@ -194,7 +194,7 @@ static void HorizontalUnfilter_C(const uint8_t* prev, const uint8_t* in,
uint8_t pred = (prev == NULL) ? 0 : prev[0];
int i;
for (i = 0; i < width; ++i) {
- out[i] = pred + in[i];
+ out[i] = (uint8_t)(pred + in[i]);
pred = out[i];
}
}
@@ -206,7 +206,7 @@ static void VerticalUnfilter_C(const uint8_t* prev, const uint8_t* in,
HorizontalUnfilter_C(NULL, in, out, width);
} else {
int i;
- for (i = 0; i < width; ++i) out[i] = prev[i] + in[i];
+ for (i = 0; i < width; ++i) out[i] = (uint8_t)(prev[i] + in[i]);
}
}
#endif // !WEBP_NEON_OMIT_C_CODE
@@ -220,7 +220,7 @@ static void GradientUnfilter_C(const uint8_t* prev, const uint8_t* in,
int i;
for (i = 0; i < width; ++i) {
top = prev[i]; // need to read this first, in case prev==out
- left = in[i] + GradientPredictor_C(left, top, top_left);
+ left = (uint8_t)(in[i] + GradientPredictor_C(left, top, top_left));
top_left = top;
out[i] = left;
}
@@ -254,7 +254,7 @@ WEBP_DSP_INIT_FUNC(VP8FiltersInit) {
#endif
if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
+#if defined(WEBP_HAVE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
VP8FiltersInitSSE2();
}
@@ -271,7 +271,7 @@ WEBP_DSP_INIT_FUNC(VP8FiltersInit) {
#endif
}
-#if defined(WEBP_USE_NEON)
+#if defined(WEBP_HAVE_NEON)
if (WEBP_NEON_OMIT_C_CODE ||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
VP8FiltersInitNEON();
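The new (uint8_t) casts in the hunks above do not change behavior: the byte operands are promoted to int, and storing the result back into a uint8_t already wraps modulo 256. The casts make that intent explicit and silence implicit-conversion warnings. A minimal demonstration:

```c
#include <stdint.h>
#include <stdio.h>

int main(void) {
  const uint8_t a = 200, b = 100;        /* promoted to int: a + b == 300 */
  const uint8_t sum = (uint8_t)(a + b);  /* wraps mod 256 -> 44 */
  printf("%u\n", sum);
  return 0;
}
```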
diff --git a/media/libwebp/dsp/filters_mips_dsp_r2.c b/media/libwebp/dsp/filters_mips_dsp_r2.c
new file mode 100644
index 0000000000..edb1eaac26
--- /dev/null
+++ b/media/libwebp/dsp/filters_mips_dsp_r2.c
@@ -0,0 +1,402 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Spatial prediction using various filters
+//
+// Author(s): Branimir Vasic (branimir.vasic@imgtec.com)
+// Djordje Pesut (djordje.pesut@imgtec.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_MIPS_DSP_R2)
+
+#include "../dsp/dsp.h"
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+//------------------------------------------------------------------------------
+// Helpful macro.
+
+#define SANITY_CHECK(in, out) \
+ assert(in != NULL); \
+ assert(out != NULL); \
+ assert(width > 0); \
+ assert(height > 0); \
+ assert(stride >= width); \
+ assert(row >= 0 && num_rows > 0 && row + num_rows <= height); \
+ (void)height; // Silence unused warning.
+
+#define DO_PREDICT_LINE(SRC, DST, LENGTH, INVERSE) do { \
+ const uint8_t* psrc = (uint8_t*)(SRC); \
+ uint8_t* pdst = (uint8_t*)(DST); \
+ const int ilength = (int)(LENGTH); \
+ int temp0, temp1, temp2, temp3, temp4, temp5, temp6; \
+ __asm__ volatile ( \
+ ".set push \n\t" \
+ ".set noreorder \n\t" \
+ "srl %[temp0], %[length], 2 \n\t" \
+ "beqz %[temp0], 4f \n\t" \
+ " andi %[temp6], %[length], 3 \n\t" \
+ ".if " #INVERSE " \n\t" \
+ "1: \n\t" \
+ "lbu %[temp1], -1(%[dst]) \n\t" \
+ "lbu %[temp2], 0(%[src]) \n\t" \
+ "lbu %[temp3], 1(%[src]) \n\t" \
+ "lbu %[temp4], 2(%[src]) \n\t" \
+ "lbu %[temp5], 3(%[src]) \n\t" \
+ "addu %[temp1], %[temp1], %[temp2] \n\t" \
+ "addu %[temp2], %[temp1], %[temp3] \n\t" \
+ "addu %[temp3], %[temp2], %[temp4] \n\t" \
+ "addu %[temp4], %[temp3], %[temp5] \n\t" \
+ "sb %[temp1], 0(%[dst]) \n\t" \
+ "sb %[temp2], 1(%[dst]) \n\t" \
+ "sb %[temp3], 2(%[dst]) \n\t" \
+ "sb %[temp4], 3(%[dst]) \n\t" \
+ "addiu %[src], %[src], 4 \n\t" \
+ "addiu %[temp0], %[temp0], -1 \n\t" \
+ "bnez %[temp0], 1b \n\t" \
+ " addiu %[dst], %[dst], 4 \n\t" \
+ ".else \n\t" \
+ "1: \n\t" \
+ "ulw %[temp1], -1(%[src]) \n\t" \
+ "ulw %[temp2], 0(%[src]) \n\t" \
+ "addiu %[src], %[src], 4 \n\t" \
+ "addiu %[temp0], %[temp0], -1 \n\t" \
+ "subu.qb %[temp3], %[temp2], %[temp1] \n\t" \
+ "usw %[temp3], 0(%[dst]) \n\t" \
+ "bnez %[temp0], 1b \n\t" \
+ " addiu %[dst], %[dst], 4 \n\t" \
+ ".endif \n\t" \
+ "4: \n\t" \
+ "beqz %[temp6], 3f \n\t" \
+ " nop \n\t" \
+ "2: \n\t" \
+ "lbu %[temp2], 0(%[src]) \n\t" \
+ ".if " #INVERSE " \n\t" \
+ "lbu %[temp1], -1(%[dst]) \n\t" \
+ "addu %[temp3], %[temp1], %[temp2] \n\t" \
+ ".else \n\t" \
+ "lbu %[temp1], -1(%[src]) \n\t" \
+ "subu %[temp3], %[temp1], %[temp2] \n\t" \
+ ".endif \n\t" \
+ "addiu %[src], %[src], 1 \n\t" \
+ "sb %[temp3], 0(%[dst]) \n\t" \
+ "addiu %[temp6], %[temp6], -1 \n\t" \
+ "bnez %[temp6], 2b \n\t" \
+ " addiu %[dst], %[dst], 1 \n\t" \
+ "3: \n\t" \
+ ".set pop \n\t" \
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), \
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), \
+ [temp6]"=&r"(temp6), [dst]"+&r"(pdst), [src]"+&r"(psrc) \
+ : [length]"r"(ilength) \
+ : "memory" \
+ ); \
+ } while (0)
+
+static WEBP_INLINE void PredictLine_MIPSdspR2(const uint8_t* src, uint8_t* dst,
+ int length) {
+ DO_PREDICT_LINE(src, dst, length, 0);
+}
+
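DO_PREDICT_LINE processes four pixels per iteration plus a scalar tail; in portable C, the two INVERSE variants reduce to the following (a sketch for readability, not part of the patch):

```c
/* Portable C equivalent of DO_PREDICT_LINE. The forward variant
 * left-predicts src into dst; the inverse variant accumulates against
 * previously reconstructed dst bytes. Callers pass src + 1 / dst + 1,
 * so the index -1 reads are the seed pixels. */
static void PredictLine_C_sketch(const uint8_t* src, uint8_t* dst,
                                 int length, int inverse) {
  int i;
  if (inverse) {
    for (i = 0; i < length; ++i) {
      dst[i] = (uint8_t)(dst[i - 1] + src[i]);
    }
  } else {
    for (i = 0; i < length; ++i) {
      dst[i] = (uint8_t)(src[i] - src[i - 1]);
    }
  }
}
```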
+#define DO_PREDICT_LINE_VERTICAL(SRC, PRED, DST, LENGTH, INVERSE) do { \
+ const uint8_t* psrc = (uint8_t*)(SRC); \
+ const uint8_t* ppred = (uint8_t*)(PRED); \
+ uint8_t* pdst = (uint8_t*)(DST); \
+ const int ilength = (int)(LENGTH); \
+ int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; \
+ __asm__ volatile ( \
+ ".set push \n\t" \
+ ".set noreorder \n\t" \
+ "srl %[temp0], %[length], 0x3 \n\t" \
+ "beqz %[temp0], 4f \n\t" \
+ " andi %[temp7], %[length], 0x7 \n\t" \
+ "1: \n\t" \
+ "ulw %[temp1], 0(%[src]) \n\t" \
+ "ulw %[temp2], 0(%[pred]) \n\t" \
+ "ulw %[temp3], 4(%[src]) \n\t" \
+ "ulw %[temp4], 4(%[pred]) \n\t" \
+ "addiu %[src], %[src], 8 \n\t" \
+ ".if " #INVERSE " \n\t" \
+ "addu.qb %[temp5], %[temp1], %[temp2] \n\t" \
+ "addu.qb %[temp6], %[temp3], %[temp4] \n\t" \
+ ".else \n\t" \
+ "subu.qb %[temp5], %[temp1], %[temp2] \n\t" \
+ "subu.qb %[temp6], %[temp3], %[temp4] \n\t" \
+ ".endif \n\t" \
+ "addiu %[pred], %[pred], 8 \n\t" \
+ "usw %[temp5], 0(%[dst]) \n\t" \
+ "usw %[temp6], 4(%[dst]) \n\t" \
+ "addiu %[temp0], %[temp0], -1 \n\t" \
+ "bnez %[temp0], 1b \n\t" \
+ " addiu %[dst], %[dst], 8 \n\t" \
+ "4: \n\t" \
+ "beqz %[temp7], 3f \n\t" \
+ " nop \n\t" \
+ "2: \n\t" \
+ "lbu %[temp1], 0(%[src]) \n\t" \
+ "lbu %[temp2], 0(%[pred]) \n\t" \
+ "addiu %[src], %[src], 1 \n\t" \
+ "addiu %[pred], %[pred], 1 \n\t" \
+ ".if " #INVERSE " \n\t" \
+ "addu %[temp3], %[temp1], %[temp2] \n\t" \
+ ".else \n\t" \
+ "subu %[temp3], %[temp1], %[temp2] \n\t" \
+ ".endif \n\t" \
+ "sb %[temp3], 0(%[dst]) \n\t" \
+ "addiu %[temp7], %[temp7], -1 \n\t" \
+ "bnez %[temp7], 2b \n\t" \
+ " addiu %[dst], %[dst], 1 \n\t" \
+ "3: \n\t" \
+ ".set pop \n\t" \
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), \
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), \
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [pred]"+&r"(ppred), \
+ [dst]"+&r"(pdst), [src]"+&r"(psrc) \
+ : [length]"r"(ilength) \
+ : "memory" \
+ ); \
+ } while (0)
+
+#define PREDICT_LINE_ONE_PASS(SRC, PRED, DST) do { \
+ int temp1, temp2, temp3; \
+ __asm__ volatile ( \
+ "lbu %[temp1], 0(%[src]) \n\t" \
+ "lbu %[temp2], 0(%[pred]) \n\t" \
+ "subu %[temp3], %[temp1], %[temp2] \n\t" \
+ "sb %[temp3], 0(%[dst]) \n\t" \
+ : [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), [temp3]"=&r"(temp3) \
+ : [pred]"r"((PRED)), [dst]"r"((DST)), [src]"r"((SRC)) \
+ : "memory" \
+ ); \
+ } while (0)
+
+//------------------------------------------------------------------------------
+// Horizontal filter.
+
+#define FILTER_LINE_BY_LINE do { \
+ while (row < last_row) { \
+ PREDICT_LINE_ONE_PASS(in, preds - stride, out); \
+ DO_PREDICT_LINE(in + 1, out + 1, width - 1, 0); \
+ ++row; \
+ preds += stride; \
+ in += stride; \
+ out += stride; \
+ } \
+ } while (0)
+
+static WEBP_INLINE void DoHorizontalFilter_MIPSdspR2(const uint8_t* in,
+ int width, int height,
+ int stride,
+ int row, int num_rows,
+ uint8_t* out) {
+ const uint8_t* preds;
+ const size_t start_offset = row * stride;
+ const int last_row = row + num_rows;
+ SANITY_CHECK(in, out);
+ in += start_offset;
+ out += start_offset;
+ preds = in;
+
+ if (row == 0) {
+ // Leftmost pixel is the same as input for topmost scanline.
+ out[0] = in[0];
+ PredictLine_MIPSdspR2(in + 1, out + 1, width - 1);
+ row = 1;
+ preds += stride;
+ in += stride;
+ out += stride;
+ }
+
+ // Filter line-by-line.
+ FILTER_LINE_BY_LINE;
+}
+#undef FILTER_LINE_BY_LINE
+
+static void HorizontalFilter_MIPSdspR2(const uint8_t* data,
+ int width, int height,
+ int stride, uint8_t* filtered_data) {
+ DoHorizontalFilter_MIPSdspR2(data, width, height, stride, 0, height,
+ filtered_data);
+}
+
+//------------------------------------------------------------------------------
+// Vertical filter.
+
+#define FILTER_LINE_BY_LINE do { \
+ while (row < last_row) { \
+ DO_PREDICT_LINE_VERTICAL(in, preds, out, width, 0); \
+ ++row; \
+ preds += stride; \
+ in += stride; \
+ out += stride; \
+ } \
+ } while (0)
+
+static WEBP_INLINE void DoVerticalFilter_MIPSdspR2(const uint8_t* in,
+ int width, int height,
+ int stride,
+ int row, int num_rows,
+ uint8_t* out) {
+ const uint8_t* preds;
+ const size_t start_offset = row * stride;
+ const int last_row = row + num_rows;
+ SANITY_CHECK(in, out);
+ in += start_offset;
+ out += start_offset;
+ preds = in;
+
+ if (row == 0) {
+ // Very first top-left pixel is copied.
+ out[0] = in[0];
+ // Rest of top scan-line is left-predicted.
+ PredictLine_MIPSdspR2(in + 1, out + 1, width - 1);
+ row = 1;
+ in += stride;
+ out += stride;
+ } else {
+ // We are starting from in-between. Make sure 'preds' points to prev row.
+ preds -= stride;
+ }
+
+ // Filter line-by-line.
+ FILTER_LINE_BY_LINE;
+}
+#undef FILTER_LINE_BY_LINE
+
+static void VerticalFilter_MIPSdspR2(const uint8_t* data, int width, int height,
+ int stride, uint8_t* filtered_data) {
+ DoVerticalFilter_MIPSdspR2(data, width, height, stride, 0, height,
+ filtered_data);
+}
+
+//------------------------------------------------------------------------------
+// Gradient filter.
+
+static int GradientPredictor_MIPSdspR2(uint8_t a, uint8_t b, uint8_t c) {
+ int temp0;
+ __asm__ volatile (
+ "addu %[temp0], %[a], %[b] \n\t"
+ "subu %[temp0], %[temp0], %[c] \n\t"
+ "shll_s.w %[temp0], %[temp0], 23 \n\t"
+ "precrqu_s.qb.ph %[temp0], %[temp0], $zero \n\t"
+ "srl %[temp0], %[temp0], 24 \n\t"
+ : [temp0]"=&r"(temp0)
+ : [a]"r"(a),[b]"r"(b),[c]"r"(c)
+ );
+ return temp0;
+}
+
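The shll_s.w / precrqu_s.qb.ph pair saturates the 32-bit gradient to [0, 255] without branches; in plain C the function is simply the clamped gradient predictor:

```c
/* Scalar equivalent of the DSP-ASE sequence above. */
static int GradientPredictor_C_sketch(uint8_t a, uint8_t b, uint8_t c) {
  const int g = a + b - c;
  return (g < 0) ? 0 : (g > 255) ? 255 : g;
}
```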
+#define FILTER_LINE_BY_LINE(PREDS, OPERATION) do { \
+ while (row < last_row) { \
+ int w; \
+ PREDICT_LINE_ONE_PASS(in, PREDS - stride, out); \
+ for (w = 1; w < width; ++w) { \
+ const int pred = GradientPredictor_MIPSdspR2(PREDS[w - 1], \
+ PREDS[w - stride], \
+ PREDS[w - stride - 1]); \
+ out[w] = in[w] OPERATION pred; \
+ } \
+ ++row; \
+ in += stride; \
+ out += stride; \
+ } \
+ } while (0)
+
+static void DoGradientFilter_MIPSdspR2(const uint8_t* in,
+ int width, int height, int stride,
+ int row, int num_rows, uint8_t* out) {
+ const uint8_t* preds;
+ const size_t start_offset = row * stride;
+ const int last_row = row + num_rows;
+ SANITY_CHECK(in, out);
+ in += start_offset;
+ out += start_offset;
+ preds = in;
+
+ // left prediction for top scan-line
+ if (row == 0) {
+ out[0] = in[0];
+ PredictLine_MIPSdspR2(in + 1, out + 1, width - 1);
+ row = 1;
+ preds += stride;
+ in += stride;
+ out += stride;
+ }
+
+ // Filter line-by-line.
+ FILTER_LINE_BY_LINE(in, -);
+}
+#undef FILTER_LINE_BY_LINE
+
+static void GradientFilter_MIPSdspR2(const uint8_t* data, int width, int height,
+ int stride, uint8_t* filtered_data) {
+ DoGradientFilter_MIPSdspR2(data, width, height, stride, 0, height,
+ filtered_data);
+}
+
+//------------------------------------------------------------------------------
+
+static void HorizontalUnfilter_MIPSdspR2(const uint8_t* prev, const uint8_t* in,
+ uint8_t* out, int width) {
+ out[0] = in[0] + (prev == NULL ? 0 : prev[0]);
+ DO_PREDICT_LINE(in + 1, out + 1, width - 1, 1);
+}
+
+static void VerticalUnfilter_MIPSdspR2(const uint8_t* prev, const uint8_t* in,
+ uint8_t* out, int width) {
+ if (prev == NULL) {
+ HorizontalUnfilter_MIPSdspR2(NULL, in, out, width);
+ } else {
+ DO_PREDICT_LINE_VERTICAL(in, prev, out, width, 1);
+ }
+}
+
+static void GradientUnfilter_MIPSdspR2(const uint8_t* prev, const uint8_t* in,
+ uint8_t* out, int width) {
+ if (prev == NULL) {
+ HorizontalUnfilter_MIPSdspR2(NULL, in, out, width);
+ } else {
+ uint8_t top = prev[0], top_left = top, left = top;
+ int i;
+ for (i = 0; i < width; ++i) {
+ top = prev[i]; // need to read this first, in case prev==dst
+ left = in[i] + GradientPredictor_MIPSdspR2(left, top, top_left);
+ top_left = top;
+ out[i] = left;
+ }
+ }
+}
+
+#undef DO_PREDICT_LINE_VERTICAL
+#undef PREDICT_LINE_ONE_PASS
+#undef DO_PREDICT_LINE
+#undef SANITY_CHECK
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8FiltersInitMIPSdspR2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitMIPSdspR2(void) {
+ WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter_MIPSdspR2;
+ WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter_MIPSdspR2;
+ WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter_MIPSdspR2;
+
+ WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter_MIPSdspR2;
+ WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter_MIPSdspR2;
+ WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter_MIPSdspR2;
+}
+
+#else // !WEBP_USE_MIPS_DSP_R2
+
+WEBP_DSP_INIT_STUB(VP8FiltersInitMIPSdspR2)
+
+#endif // WEBP_USE_MIPS_DSP_R2
diff --git a/media/libwebp/dsp/filters_msa.c b/media/libwebp/dsp/filters_msa.c
new file mode 100644
index 0000000000..cd32cdabaf
--- /dev/null
+++ b/media/libwebp/dsp/filters_msa.c
@@ -0,0 +1,202 @@
+// Copyright 2016 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MSA variant of alpha filters
+//
+// Author: Prashant Patil (prashant.patil@imgtec.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_MSA)
+
+#include "../dsp/msa_macro.h"
+
+#include <assert.h>
+
+static WEBP_INLINE void PredictLineInverse0(const uint8_t* src,
+ const uint8_t* pred,
+ uint8_t* dst, int length) {
+ v16u8 src0, pred0, dst0;
+ assert(length >= 0);
+ while (length >= 32) {
+ v16u8 src1, pred1, dst1;
+ LD_UB2(src, 16, src0, src1);
+ LD_UB2(pred, 16, pred0, pred1);
+ SUB2(src0, pred0, src1, pred1, dst0, dst1);
+ ST_UB2(dst0, dst1, dst, 16);
+ src += 32;
+ pred += 32;
+ dst += 32;
+ length -= 32;
+ }
+ if (length > 0) {
+ int i;
+ if (length >= 16) {
+ src0 = LD_UB(src);
+ pred0 = LD_UB(pred);
+ dst0 = src0 - pred0;
+ ST_UB(dst0, dst);
+ src += 16;
+ pred += 16;
+ dst += 16;
+ length -= 16;
+ }
+ for (i = 0; i < length; i++) {
+ dst[i] = src[i] - pred[i];
+ }
+ }
+}
+
+//------------------------------------------------------------------------------
+// Helpful macro.
+
+#define SANITY_CHECK(in, out) \
+ assert(in != NULL); \
+ assert(out != NULL); \
+ assert(width > 0); \
+ assert(height > 0); \
+ assert(stride >= width);
+
+//------------------------------------------------------------------------------
+// Horizontal filter
+
+static void HorizontalFilter_MSA(const uint8_t* data, int width, int height,
+ int stride, uint8_t* filtered_data) {
+ const uint8_t* preds = data;
+ const uint8_t* in = data;
+ uint8_t* out = filtered_data;
+ int row = 1;
+ SANITY_CHECK(in, out);
+
+ // Leftmost pixel is the same as input for topmost scanline.
+ out[0] = in[0];
+ PredictLineInverse0(in + 1, preds, out + 1, width - 1);
+ preds += stride;
+ in += stride;
+ out += stride;
+ // Filter line-by-line.
+ while (row < height) {
+ // Leftmost pixel is predicted from above.
+ PredictLineInverse0(in, preds - stride, out, 1);
+ PredictLineInverse0(in + 1, preds, out + 1, width - 1);
+ ++row;
+ preds += stride;
+ in += stride;
+ out += stride;
+ }
+}
+
+//------------------------------------------------------------------------------
+// Gradient filter
+
+static WEBP_INLINE void PredictLineGradient(const uint8_t* pinput,
+ const uint8_t* ppred,
+ uint8_t* poutput, int stride,
+ int size) {
+ int w;
+ const v16i8 zero = { 0 };
+ while (size >= 16) {
+ v16u8 pred0, dst0;
+ v8i16 a0, a1, b0, b1, c0, c1;
+ const v16u8 tmp0 = LD_UB(ppred - 1);
+ const v16u8 tmp1 = LD_UB(ppred - stride);
+ const v16u8 tmp2 = LD_UB(ppred - stride - 1);
+ const v16u8 src0 = LD_UB(pinput);
+ ILVRL_B2_SH(zero, tmp0, a0, a1);
+ ILVRL_B2_SH(zero, tmp1, b0, b1);
+ ILVRL_B2_SH(zero, tmp2, c0, c1);
+ ADD2(a0, b0, a1, b1, a0, a1);
+ SUB2(a0, c0, a1, c1, a0, a1);
+ CLIP_SH2_0_255(a0, a1);
+ pred0 = (v16u8)__msa_pckev_b((v16i8)a1, (v16i8)a0);
+ dst0 = src0 - pred0;
+ ST_UB(dst0, poutput);
+ ppred += 16;
+ pinput += 16;
+ poutput += 16;
+ size -= 16;
+ }
+ for (w = 0; w < size; ++w) {
+ const int pred = ppred[w - 1] + ppred[w - stride] - ppred[w - stride - 1];
+ poutput[w] = pinput[w] - (pred < 0 ? 0 : pred > 255 ? 255 : pred);
+ }
+}
+
+
+static void GradientFilter_MSA(const uint8_t* data, int width, int height,
+ int stride, uint8_t* filtered_data) {
+ const uint8_t* in = data;
+ const uint8_t* preds = data;
+ uint8_t* out = filtered_data;
+ int row = 1;
+ SANITY_CHECK(in, out);
+
+ // left prediction for top scan-line
+ out[0] = in[0];
+ PredictLineInverse0(in + 1, preds, out + 1, width - 1);
+ preds += stride;
+ in += stride;
+ out += stride;
+ // Filter line-by-line.
+ while (row < height) {
+    out[0] = in[0] - preds[-stride];
+ PredictLineGradient(preds + 1, in + 1, out + 1, stride, width - 1);
+ ++row;
+ preds += stride;
+ in += stride;
+ out += stride;
+ }
+}
+
+//------------------------------------------------------------------------------
+// Vertical filter
+
+static void VerticalFilter_MSA(const uint8_t* data, int width, int height,
+ int stride, uint8_t* filtered_data) {
+ const uint8_t* in = data;
+ const uint8_t* preds = data;
+ uint8_t* out = filtered_data;
+ int row = 1;
+ SANITY_CHECK(in, out);
+
+ // Very first top-left pixel is copied.
+ out[0] = in[0];
+ // Rest of top scan-line is left-predicted.
+ PredictLineInverse0(in + 1, preds, out + 1, width - 1);
+ in += stride;
+ out += stride;
+
+ // Filter line-by-line.
+ while (row < height) {
+ PredictLineInverse0(in, preds, out, width);
+ ++row;
+ preds += stride;
+ in += stride;
+ out += stride;
+ }
+}
+
+#undef SANITY_CHECK
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8FiltersInitMSA(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitMSA(void) {
+ WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter_MSA;
+ WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter_MSA;
+ WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter_MSA;
+}
+
+#else // !WEBP_USE_MSA
+
+WEBP_DSP_INIT_STUB(VP8FiltersInitMSA)
+
+#endif // WEBP_USE_MSA
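PredictLineInverse0 above is a vectorized byte-wise subtraction: two 16-byte MSA loads per 32-byte iteration, a single 16-byte step, then a scalar tail. A minimal scalar sketch (hypothetical helper) of what every path computes:

static void PredictLineInverse0Scalar(const uint8_t* src, const uint8_t* pred,
                                      uint8_t* dst, int length) {
  int i;
  for (i = 0; i < length; ++i) dst[i] = (uint8_t)(src[i] - pred[i]);
}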
diff --git a/media/libwebp/dsp/filters_sse2.c b/media/libwebp/dsp/filters_sse2.c
index 2cc9bb9766..9b91ab680f 100644
--- a/media/libwebp/dsp/filters_sse2.c
+++ b/media/libwebp/dsp/filters_sse2.c
@@ -163,7 +163,8 @@ static void GradientPredictDirect_SSE2(const uint8_t* const row,
_mm_storel_epi64((__m128i*)(out + i), H);
}
for (; i < length; ++i) {
- out[i] = row[i] - GradientPredictor_SSE2(row[i - 1], top[i], top[i - 1]);
+ const int delta = GradientPredictor_SSE2(row[i - 1], top[i], top[i - 1]);
+ out[i] = (uint8_t)(row[i] - delta);
}
}
@@ -188,7 +189,7 @@ static WEBP_INLINE void DoGradientFilter_SSE2(const uint8_t* in,
// Filter line-by-line.
while (row < last_row) {
- out[0] = in[0] - in[-stride];
+ out[0] = (uint8_t)(in[0] - in[-stride]);
GradientPredictDirect_SSE2(in + 1, in + 1 - stride, out + 1, width - 1);
++row;
in += stride;
@@ -223,7 +224,7 @@ static void HorizontalUnfilter_SSE2(const uint8_t* prev, const uint8_t* in,
uint8_t* out, int width) {
int i;
__m128i last;
- out[0] = in[0] + (prev == NULL ? 0 : prev[0]);
+ out[0] = (uint8_t)(in[0] + (prev == NULL ? 0 : prev[0]));
if (width <= 1) return;
last = _mm_set_epi32(0, 0, 0, out[0]);
for (i = 1; i + 8 <= width; i += 8) {
@@ -238,7 +239,7 @@ static void HorizontalUnfilter_SSE2(const uint8_t* prev, const uint8_t* in,
_mm_storel_epi64((__m128i*)(out + i), A7);
last = _mm_srli_epi64(A7, 56);
}
- for (; i < width; ++i) out[i] = in[i] + out[i - 1];
+ for (; i < width; ++i) out[i] = (uint8_t)(in[i] + out[i - 1]);
}
static void VerticalUnfilter_SSE2(const uint8_t* prev, const uint8_t* in,
@@ -259,7 +260,7 @@ static void VerticalUnfilter_SSE2(const uint8_t* prev, const uint8_t* in,
_mm_storeu_si128((__m128i*)&out[i + 0], C0);
_mm_storeu_si128((__m128i*)&out[i + 16], C1);
}
- for (; i < width; ++i) out[i] = in[i] + prev[i];
+ for (; i < width; ++i) out[i] = (uint8_t)(in[i] + prev[i]);
}
}
@@ -296,7 +297,8 @@ static void GradientPredictInverse_SSE2(const uint8_t* const in,
_mm_storel_epi64((__m128i*)&row[i], out);
}
for (; i < length; ++i) {
- row[i] = in[i] + GradientPredictor_SSE2(row[i - 1], top[i], top[i - 1]);
+ const int delta = GradientPredictor_SSE2(row[i - 1], top[i], top[i - 1]);
+ row[i] = (uint8_t)(in[i] + delta);
}
}
}
@@ -306,7 +308,7 @@ static void GradientUnfilter_SSE2(const uint8_t* prev, const uint8_t* in,
if (prev == NULL) {
HorizontalUnfilter_SSE2(NULL, in, out, width);
} else {
- out[0] = in[0] + prev[0]; // predict from above
+ out[0] = (uint8_t)(in[0] + prev[0]); // predict from above
GradientPredictInverse_SSE2(in + 1, prev + 1, out + 1, width - 1);
}
}
@@ -318,7 +320,12 @@ extern void VP8FiltersInitSSE2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitSSE2(void) {
WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter_SSE2;
+#if defined(CHROMIUM)
+ // TODO(crbug.com/654974)
+ (void)VerticalUnfilter_SSE2;
+#else
WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter_SSE2;
+#endif
WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter_SSE2;
WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter_SSE2;
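The filters_sse2.c hunks above are purely about integer promotion: in C, `uint8_t` operands widen to `int` before arithmetic, so expressions like `in[i] + out[i - 1]` may exceed 255, and the added `(uint8_t)` casts make the intended modulo-256 wrap explicit (and typically quiet implicit-conversion diagnostics). A tiny demonstration with hypothetical values:

#include <stdint.h>

static void PromotionDemo(void) {
  const uint8_t a = 200, b = 100;
  const int wide = a + b;                   // 300: both operands promote to int
  const uint8_t narrow = (uint8_t)(a + b);  // 44: the intended wrap, now explicit
  (void)wide;
  (void)narrow;
}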
diff --git a/media/libwebp/dsp/lossless.c b/media/libwebp/dsp/lossless.c
index 1a1523d221..763c425ff8 100644
--- a/media/libwebp/dsp/lossless.c
+++ b/media/libwebp/dsp/lossless.c
@@ -81,7 +81,7 @@ static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1,
// gcc <= 4.9 on ARM generates incorrect code in Select() when Sub3() is
// inlined.
-#if defined(__arm__) && LOCAL_GCC_VERSION <= 0x409
+#if defined(__arm__) && defined(__GNUC__) && LOCAL_GCC_VERSION <= 0x409
# define LOCAL_INLINE __attribute__ ((noinline))
#else
# define LOCAL_INLINE WEBP_INLINE
@@ -107,88 +107,107 @@ static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
//------------------------------------------------------------------------------
// Predictors
-static uint32_t Predictor0_C(uint32_t left, const uint32_t* const top) {
+uint32_t VP8LPredictor0_C(const uint32_t* const left,
+ const uint32_t* const top) {
(void)top;
(void)left;
return ARGB_BLACK;
}
-static uint32_t Predictor1_C(uint32_t left, const uint32_t* const top) {
+uint32_t VP8LPredictor1_C(const uint32_t* const left,
+ const uint32_t* const top) {
(void)top;
- return left;
+ return *left;
}
-static uint32_t Predictor2_C(uint32_t left, const uint32_t* const top) {
+uint32_t VP8LPredictor2_C(const uint32_t* const left,
+ const uint32_t* const top) {
(void)left;
return top[0];
}
-static uint32_t Predictor3_C(uint32_t left, const uint32_t* const top) {
+uint32_t VP8LPredictor3_C(const uint32_t* const left,
+ const uint32_t* const top) {
(void)left;
return top[1];
}
-static uint32_t Predictor4_C(uint32_t left, const uint32_t* const top) {
+uint32_t VP8LPredictor4_C(const uint32_t* const left,
+ const uint32_t* const top) {
(void)left;
return top[-1];
}
-static uint32_t Predictor5_C(uint32_t left, const uint32_t* const top) {
- const uint32_t pred = Average3(left, top[0], top[1]);
+uint32_t VP8LPredictor5_C(const uint32_t* const left,
+ const uint32_t* const top) {
+ const uint32_t pred = Average3(*left, top[0], top[1]);
return pred;
}
-static uint32_t Predictor6_C(uint32_t left, const uint32_t* const top) {
- const uint32_t pred = Average2(left, top[-1]);
+uint32_t VP8LPredictor6_C(const uint32_t* const left,
+ const uint32_t* const top) {
+ const uint32_t pred = Average2(*left, top[-1]);
return pred;
}
-static uint32_t Predictor7_C(uint32_t left, const uint32_t* const top) {
- const uint32_t pred = Average2(left, top[0]);
+uint32_t VP8LPredictor7_C(const uint32_t* const left,
+ const uint32_t* const top) {
+ const uint32_t pred = Average2(*left, top[0]);
return pred;
}
-static uint32_t Predictor8_C(uint32_t left, const uint32_t* const top) {
+uint32_t VP8LPredictor8_C(const uint32_t* const left,
+ const uint32_t* const top) {
const uint32_t pred = Average2(top[-1], top[0]);
(void)left;
return pred;
}
-static uint32_t Predictor9_C(uint32_t left, const uint32_t* const top) {
+uint32_t VP8LPredictor9_C(const uint32_t* const left,
+ const uint32_t* const top) {
const uint32_t pred = Average2(top[0], top[1]);
(void)left;
return pred;
}
-static uint32_t Predictor10_C(uint32_t left, const uint32_t* const top) {
- const uint32_t pred = Average4(left, top[-1], top[0], top[1]);
+uint32_t VP8LPredictor10_C(const uint32_t* const left,
+ const uint32_t* const top) {
+ const uint32_t pred = Average4(*left, top[-1], top[0], top[1]);
return pred;
}
-static uint32_t Predictor11_C(uint32_t left, const uint32_t* const top) {
- const uint32_t pred = Select(top[0], left, top[-1]);
+uint32_t VP8LPredictor11_C(const uint32_t* const left,
+ const uint32_t* const top) {
+ const uint32_t pred = Select(top[0], *left, top[-1]);
return pred;
}
-static uint32_t Predictor12_C(uint32_t left, const uint32_t* const top) {
- const uint32_t pred = ClampedAddSubtractFull(left, top[0], top[-1]);
+uint32_t VP8LPredictor12_C(const uint32_t* const left,
+ const uint32_t* const top) {
+ const uint32_t pred = ClampedAddSubtractFull(*left, top[0], top[-1]);
return pred;
}
-static uint32_t Predictor13_C(uint32_t left, const uint32_t* const top) {
- const uint32_t pred = ClampedAddSubtractHalf(left, top[0], top[-1]);
+uint32_t VP8LPredictor13_C(const uint32_t* const left,
+ const uint32_t* const top) {
+ const uint32_t pred = ClampedAddSubtractHalf(*left, top[0], top[-1]);
return pred;
}
-GENERATE_PREDICTOR_ADD(Predictor0_C, PredictorAdd0_C)
+static void PredictorAdd0_C(const uint32_t* in, const uint32_t* upper,
+ int num_pixels, uint32_t* out) {
+ int x;
+ (void)upper;
+ for (x = 0; x < num_pixels; ++x) out[x] = VP8LAddPixels(in[x], ARGB_BLACK);
+}
static void PredictorAdd1_C(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
int i;
uint32_t left = out[-1];
+ (void)upper;
for (i = 0; i < num_pixels; ++i) {
out[i] = left = VP8LAddPixels(in[i], left);
}
- (void)upper;
}
-GENERATE_PREDICTOR_ADD(Predictor2_C, PredictorAdd2_C)
-GENERATE_PREDICTOR_ADD(Predictor3_C, PredictorAdd3_C)
-GENERATE_PREDICTOR_ADD(Predictor4_C, PredictorAdd4_C)
-GENERATE_PREDICTOR_ADD(Predictor5_C, PredictorAdd5_C)
-GENERATE_PREDICTOR_ADD(Predictor6_C, PredictorAdd6_C)
-GENERATE_PREDICTOR_ADD(Predictor7_C, PredictorAdd7_C)
-GENERATE_PREDICTOR_ADD(Predictor8_C, PredictorAdd8_C)
-GENERATE_PREDICTOR_ADD(Predictor9_C, PredictorAdd9_C)
-GENERATE_PREDICTOR_ADD(Predictor10_C, PredictorAdd10_C)
-GENERATE_PREDICTOR_ADD(Predictor11_C, PredictorAdd11_C)
-GENERATE_PREDICTOR_ADD(Predictor12_C, PredictorAdd12_C)
-GENERATE_PREDICTOR_ADD(Predictor13_C, PredictorAdd13_C)
+GENERATE_PREDICTOR_ADD(VP8LPredictor2_C, PredictorAdd2_C)
+GENERATE_PREDICTOR_ADD(VP8LPredictor3_C, PredictorAdd3_C)
+GENERATE_PREDICTOR_ADD(VP8LPredictor4_C, PredictorAdd4_C)
+GENERATE_PREDICTOR_ADD(VP8LPredictor5_C, PredictorAdd5_C)
+GENERATE_PREDICTOR_ADD(VP8LPredictor6_C, PredictorAdd6_C)
+GENERATE_PREDICTOR_ADD(VP8LPredictor7_C, PredictorAdd7_C)
+GENERATE_PREDICTOR_ADD(VP8LPredictor8_C, PredictorAdd8_C)
+GENERATE_PREDICTOR_ADD(VP8LPredictor9_C, PredictorAdd9_C)
+GENERATE_PREDICTOR_ADD(VP8LPredictor10_C, PredictorAdd10_C)
+GENERATE_PREDICTOR_ADD(VP8LPredictor11_C, PredictorAdd11_C)
+GENERATE_PREDICTOR_ADD(VP8LPredictor12_C, PredictorAdd12_C)
+GENERATE_PREDICTOR_ADD(VP8LPredictor13_C, PredictorAdd13_C)
//------------------------------------------------------------------------------
@@ -270,14 +289,14 @@ void VP8LTransformColorInverse_C(const VP8LMultipliers* const m,
int i;
for (i = 0; i < num_pixels; ++i) {
const uint32_t argb = src[i];
- const uint32_t green = argb >> 8;
+ const int8_t green = (int8_t)(argb >> 8);
const uint32_t red = argb >> 16;
int new_red = red & 0xff;
int new_blue = argb & 0xff;
new_red += ColorTransformDelta(m->green_to_red_, green);
new_red &= 0xff;
new_blue += ColorTransformDelta(m->green_to_blue_, green);
- new_blue += ColorTransformDelta(m->red_to_blue_, new_red);
+ new_blue += ColorTransformDelta(m->red_to_blue_, (int8_t)new_red);
new_blue &= 0xff;
dst[i] = (argb & 0xff00ff00u) | (new_red << 16) | (new_blue);
}
@@ -557,7 +576,6 @@ VP8LPredictorFunc VP8LPredictors[16];
// exposed plain-C implementations
VP8LPredictorAddSubFunc VP8LPredictorsAdd_C[16];
-VP8LPredictorFunc VP8LPredictors_C[16];
VP8LTransformColorInverseFunc VP8LTransformColorInverse;
@@ -571,6 +589,7 @@ VP8LMapARGBFunc VP8LMapColor32b;
VP8LMapAlphaFunc VP8LMapColor8b;
extern void VP8LDspInitSSE2(void);
+extern void VP8LDspInitSSE41(void);
extern void VP8LDspInitNEON(void);
extern void VP8LDspInitMIPSdspR2(void);
extern void VP8LDspInitMSA(void);
@@ -595,8 +614,7 @@ extern void VP8LDspInitMSA(void);
} while (0);
WEBP_DSP_INIT_FUNC(VP8LDspInit) {
- COPY_PREDICTOR_ARRAY(Predictor, VP8LPredictors)
- COPY_PREDICTOR_ARRAY(Predictor, VP8LPredictors_C)
+ COPY_PREDICTOR_ARRAY(VP8LPredictor, VP8LPredictors)
COPY_PREDICTOR_ARRAY(PredictorAdd, VP8LPredictorsAdd)
COPY_PREDICTOR_ARRAY(PredictorAdd, VP8LPredictorsAdd_C)
@@ -618,9 +636,14 @@ WEBP_DSP_INIT_FUNC(VP8LDspInit) {
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
+#if defined(WEBP_HAVE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
VP8LDspInitSSE2();
+#if defined(WEBP_HAVE_SSE41)
+ if (VP8GetCPUInfo(kSSE4_1)) {
+ VP8LDspInitSSE41();
+ }
+#endif
}
#endif
#if defined(WEBP_USE_MIPS_DSP_R2)
@@ -635,7 +658,7 @@ WEBP_DSP_INIT_FUNC(VP8LDspInit) {
#endif
}
-#if defined(WEBP_USE_NEON)
+#if defined(WEBP_HAVE_NEON)
if (WEBP_NEON_OMIT_C_CODE ||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
VP8LDspInitNEON();
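The other substantive fix in lossless.c is typing `green` (and casting `new_red`) as `int8_t` in VP8LTransformColorInverse_C: ColorTransformDelta multiplies two *signed* 8-bit values, so channel bytes at or above 0x80 must be reinterpreted as negative before the multiply. A worked check, using the ColorTransformDelta helper exactly as it appears later in this patch:

static WEBP_INLINE int ColorTransformDelta(int8_t color_pred, int8_t color) {
  return ((int)color_pred * color) >> 5;
}
// (int8_t)0x90 is -112, not 144, so:
//   ColorTransformDelta(10, (int8_t)0x90) == (10 * -112) >> 5 == -35
// With an unsigned green, the same call would yield (10 * 144) >> 5 == 45.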
diff --git a/media/libwebp/dsp/lossless.h b/media/libwebp/dsp/lossless.h
index 6db5fafc13..0c129d2860 100644
--- a/media/libwebp/dsp/lossless.h
+++ b/media/libwebp/dsp/lossless.h
@@ -28,9 +28,39 @@ extern "C" {
//------------------------------------------------------------------------------
// Decoding
-typedef uint32_t (*VP8LPredictorFunc)(uint32_t left, const uint32_t* const top);
+typedef uint32_t (*VP8LPredictorFunc)(const uint32_t* const left,
+ const uint32_t* const top);
extern VP8LPredictorFunc VP8LPredictors[16];
-extern VP8LPredictorFunc VP8LPredictors_C[16];
+
+uint32_t VP8LPredictor0_C(const uint32_t* const left,
+ const uint32_t* const top);
+uint32_t VP8LPredictor1_C(const uint32_t* const left,
+ const uint32_t* const top);
+uint32_t VP8LPredictor2_C(const uint32_t* const left,
+ const uint32_t* const top);
+uint32_t VP8LPredictor3_C(const uint32_t* const left,
+ const uint32_t* const top);
+uint32_t VP8LPredictor4_C(const uint32_t* const left,
+ const uint32_t* const top);
+uint32_t VP8LPredictor5_C(const uint32_t* const left,
+ const uint32_t* const top);
+uint32_t VP8LPredictor6_C(const uint32_t* const left,
+ const uint32_t* const top);
+uint32_t VP8LPredictor7_C(const uint32_t* const left,
+ const uint32_t* const top);
+uint32_t VP8LPredictor8_C(const uint32_t* const left,
+ const uint32_t* const top);
+uint32_t VP8LPredictor9_C(const uint32_t* const left,
+ const uint32_t* const top);
+uint32_t VP8LPredictor10_C(const uint32_t* const left,
+ const uint32_t* const top);
+uint32_t VP8LPredictor11_C(const uint32_t* const left,
+ const uint32_t* const top);
+uint32_t VP8LPredictor12_C(const uint32_t* const left,
+ const uint32_t* const top);
+uint32_t VP8LPredictor13_C(const uint32_t* const left,
+ const uint32_t* const top);
+
+// These Add/Sub functions expect upper[-1] and out[-1] to be readable.
typedef void (*VP8LPredictorAddSubFunc)(const uint32_t* in,
const uint32_t* upper, int num_pixels,
diff --git a/media/libwebp/dsp/lossless_common.h b/media/libwebp/dsp/lossless_common.h
index dd2e4f247e..2b20637a28 100644
--- a/media/libwebp/dsp/lossless_common.h
+++ b/media/libwebp/dsp/lossless_common.h
@@ -177,24 +177,13 @@ uint32_t VP8LSubPixels(uint32_t a, uint32_t b) {
static void PREDICTOR_ADD(const uint32_t* in, const uint32_t* upper, \
int num_pixels, uint32_t* out) { \
int x; \
+ assert(upper != NULL); \
for (x = 0; x < num_pixels; ++x) { \
- const uint32_t pred = (PREDICTOR)(out[x - 1], upper + x); \
+ const uint32_t pred = (PREDICTOR)(&out[x - 1], upper + x); \
out[x] = VP8LAddPixels(in[x], pred); \
} \
}
-// It subtracts the prediction from the input pixel and stores the residual
-// in the output pixel.
-#define GENERATE_PREDICTOR_SUB(PREDICTOR, PREDICTOR_SUB) \
-static void PREDICTOR_SUB(const uint32_t* in, const uint32_t* upper, \
- int num_pixels, uint32_t* out) { \
- int x; \
- for (x = 0; x < num_pixels; ++x) { \
- const uint32_t pred = (PREDICTOR)(in[x - 1], upper + x); \
- out[x] = VP8LSubPixels(in[x], pred); \
- } \
-}
-
#ifdef __cplusplus
} // extern "C"
#endif
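With predictors now taking the left pixel by address, GENERATE_PREDICTOR_ADD hands each one `&out[x - 1]`, the previously reconstructed pixel. As a sketch of the expansion (not new code), GENERATE_PREDICTOR_ADD(VP8LPredictor2_C, PredictorAdd2_C) produces roughly:

static void PredictorAdd2_C(const uint32_t* in, const uint32_t* upper,
                            int num_pixels, uint32_t* out) {
  int x;
  assert(upper != NULL);  // predictor 2 reads the row above
  for (x = 0; x < num_pixels; ++x) {
    const uint32_t pred = VP8LPredictor2_C(&out[x - 1], upper + x);
    out[x] = VP8LAddPixels(in[x], pred);
  }
}

The removed GENERATE_PREDICTOR_SUB counterpart reappears in lossless_enc.c below, rewritten to name the predictor by index.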
diff --git a/media/libwebp/dsp/lossless_enc.c b/media/libwebp/dsp/lossless_enc.c
new file mode 100644
index 0000000000..dca5e26be8
--- /dev/null
+++ b/media/libwebp/dsp/lossless_enc.c
@@ -0,0 +1,948 @@
+// Copyright 2015 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Image transform methods for lossless encoder.
+//
+// Authors: Vikas Arora (vikaas.arora@gmail.com)
+// Jyrki Alakuijala (jyrki@google.com)
+// Urvang Joshi (urvang@google.com)
+
+#include "../dsp/dsp.h"
+
+#include <assert.h>
+#include <math.h>
+#include <stdlib.h>
+#include "../dec/vp8li_dec.h"
+#include "../utils/endian_inl_utils.h"
+#include "../dsp/lossless.h"
+#include "../dsp/lossless_common.h"
+#include "../dsp/yuv.h"
+
+// lookup table for small values of log2(int)
+const float kLog2Table[LOG_LOOKUP_IDX_MAX] = {
+ 0.0000000000000000f, 0.0000000000000000f,
+ 1.0000000000000000f, 1.5849625007211560f,
+ 2.0000000000000000f, 2.3219280948873621f,
+ 2.5849625007211560f, 2.8073549220576041f,
+ 3.0000000000000000f, 3.1699250014423121f,
+ 3.3219280948873621f, 3.4594316186372973f,
+ 3.5849625007211560f, 3.7004397181410921f,
+ 3.8073549220576041f, 3.9068905956085187f,
+ 4.0000000000000000f, 4.0874628412503390f,
+ 4.1699250014423121f, 4.2479275134435852f,
+ 4.3219280948873626f, 4.3923174227787606f,
+ 4.4594316186372973f, 4.5235619560570130f,
+ 4.5849625007211560f, 4.6438561897747243f,
+ 4.7004397181410917f, 4.7548875021634682f,
+ 4.8073549220576037f, 4.8579809951275718f,
+ 4.9068905956085187f, 4.9541963103868749f,
+ 5.0000000000000000f, 5.0443941193584533f,
+ 5.0874628412503390f, 5.1292830169449663f,
+ 5.1699250014423121f, 5.2094533656289501f,
+ 5.2479275134435852f, 5.2854022188622487f,
+ 5.3219280948873626f, 5.3575520046180837f,
+ 5.3923174227787606f, 5.4262647547020979f,
+ 5.4594316186372973f, 5.4918530963296747f,
+ 5.5235619560570130f, 5.5545888516776376f,
+ 5.5849625007211560f, 5.6147098441152083f,
+ 5.6438561897747243f, 5.6724253419714951f,
+ 5.7004397181410917f, 5.7279204545631987f,
+ 5.7548875021634682f, 5.7813597135246599f,
+ 5.8073549220576037f, 5.8328900141647412f,
+ 5.8579809951275718f, 5.8826430493618415f,
+ 5.9068905956085187f, 5.9307373375628866f,
+ 5.9541963103868749f, 5.9772799234999167f,
+ 6.0000000000000000f, 6.0223678130284543f,
+ 6.0443941193584533f, 6.0660891904577720f,
+ 6.0874628412503390f, 6.1085244567781691f,
+ 6.1292830169449663f, 6.1497471195046822f,
+ 6.1699250014423121f, 6.1898245588800175f,
+ 6.2094533656289501f, 6.2288186904958804f,
+ 6.2479275134435852f, 6.2667865406949010f,
+ 6.2854022188622487f, 6.3037807481771030f,
+ 6.3219280948873626f, 6.3398500028846243f,
+ 6.3575520046180837f, 6.3750394313469245f,
+ 6.3923174227787606f, 6.4093909361377017f,
+ 6.4262647547020979f, 6.4429434958487279f,
+ 6.4594316186372973f, 6.4757334309663976f,
+ 6.4918530963296747f, 6.5077946401986963f,
+ 6.5235619560570130f, 6.5391588111080309f,
+ 6.5545888516776376f, 6.5698556083309478f,
+ 6.5849625007211560f, 6.5999128421871278f,
+ 6.6147098441152083f, 6.6293566200796094f,
+ 6.6438561897747243f, 6.6582114827517946f,
+ 6.6724253419714951f, 6.6865005271832185f,
+ 6.7004397181410917f, 6.7142455176661224f,
+ 6.7279204545631987f, 6.7414669864011464f,
+ 6.7548875021634682f, 6.7681843247769259f,
+ 6.7813597135246599f, 6.7944158663501061f,
+ 6.8073549220576037f, 6.8201789624151878f,
+ 6.8328900141647412f, 6.8454900509443747f,
+ 6.8579809951275718f, 6.8703647195834047f,
+ 6.8826430493618415f, 6.8948177633079437f,
+ 6.9068905956085187f, 6.9188632372745946f,
+ 6.9307373375628866f, 6.9425145053392398f,
+ 6.9541963103868749f, 6.9657842846620869f,
+ 6.9772799234999167f, 6.9886846867721654f,
+ 7.0000000000000000f, 7.0112272554232539f,
+ 7.0223678130284543f, 7.0334230015374501f,
+ 7.0443941193584533f, 7.0552824355011898f,
+ 7.0660891904577720f, 7.0768155970508308f,
+ 7.0874628412503390f, 7.0980320829605263f,
+ 7.1085244567781691f, 7.1189410727235076f,
+ 7.1292830169449663f, 7.1395513523987936f,
+ 7.1497471195046822f, 7.1598713367783890f,
+ 7.1699250014423121f, 7.1799090900149344f,
+ 7.1898245588800175f, 7.1996723448363644f,
+ 7.2094533656289501f, 7.2191685204621611f,
+ 7.2288186904958804f, 7.2384047393250785f,
+ 7.2479275134435852f, 7.2573878426926521f,
+ 7.2667865406949010f, 7.2761244052742375f,
+ 7.2854022188622487f, 7.2946207488916270f,
+ 7.3037807481771030f, 7.3128829552843557f,
+ 7.3219280948873626f, 7.3309168781146167f,
+ 7.3398500028846243f, 7.3487281542310771f,
+ 7.3575520046180837f, 7.3663222142458160f,
+ 7.3750394313469245f, 7.3837042924740519f,
+ 7.3923174227787606f, 7.4008794362821843f,
+ 7.4093909361377017f, 7.4178525148858982f,
+ 7.4262647547020979f, 7.4346282276367245f,
+ 7.4429434958487279f, 7.4512111118323289f,
+ 7.4594316186372973f, 7.4676055500829976f,
+ 7.4757334309663976f, 7.4838157772642563f,
+ 7.4918530963296747f, 7.4998458870832056f,
+ 7.5077946401986963f, 7.5156998382840427f,
+ 7.5235619560570130f, 7.5313814605163118f,
+ 7.5391588111080309f, 7.5468944598876364f,
+ 7.5545888516776376f, 7.5622424242210728f,
+ 7.5698556083309478f, 7.5774288280357486f,
+ 7.5849625007211560f, 7.5924570372680806f,
+ 7.5999128421871278f, 7.6073303137496104f,
+ 7.6147098441152083f, 7.6220518194563764f,
+ 7.6293566200796094f, 7.6366246205436487f,
+ 7.6438561897747243f, 7.6510516911789281f,
+ 7.6582114827517946f, 7.6653359171851764f,
+ 7.6724253419714951f, 7.6794800995054464f,
+ 7.6865005271832185f, 7.6934869574993252f,
+ 7.7004397181410917f, 7.7073591320808825f,
+ 7.7142455176661224f, 7.7210991887071855f,
+ 7.7279204545631987f, 7.7347096202258383f,
+ 7.7414669864011464f, 7.7481928495894605f,
+ 7.7548875021634682f, 7.7615512324444795f,
+ 7.7681843247769259f, 7.7747870596011736f,
+ 7.7813597135246599f, 7.7879025593914317f,
+ 7.7944158663501061f, 7.8008998999203047f,
+ 7.8073549220576037f, 7.8137811912170374f,
+ 7.8201789624151878f, 7.8265484872909150f,
+ 7.8328900141647412f, 7.8392037880969436f,
+ 7.8454900509443747f, 7.8517490414160571f,
+ 7.8579809951275718f, 7.8641861446542797f,
+ 7.8703647195834047f, 7.8765169465649993f,
+ 7.8826430493618415f, 7.8887432488982591f,
+ 7.8948177633079437f, 7.9008668079807486f,
+ 7.9068905956085187f, 7.9128893362299619f,
+ 7.9188632372745946f, 7.9248125036057812f,
+ 7.9307373375628866f, 7.9366379390025709f,
+ 7.9425145053392398f, 7.9483672315846778f,
+ 7.9541963103868749f, 7.9600019320680805f,
+ 7.9657842846620869f, 7.9715435539507719f,
+ 7.9772799234999167f, 7.9829935746943103f,
+ 7.9886846867721654f, 7.9943534368588577f
+};
+
+const float kSLog2Table[LOG_LOOKUP_IDX_MAX] = {
+ 0.00000000f, 0.00000000f, 2.00000000f, 4.75488750f,
+ 8.00000000f, 11.60964047f, 15.50977500f, 19.65148445f,
+ 24.00000000f, 28.52932501f, 33.21928095f, 38.05374781f,
+ 43.01955001f, 48.10571634f, 53.30296891f, 58.60335893f,
+ 64.00000000f, 69.48686830f, 75.05865003f, 80.71062276f,
+ 86.43856190f, 92.23866588f, 98.10749561f, 104.04192499f,
+ 110.03910002f, 116.09640474f, 122.21143267f, 128.38196256f,
+ 134.60593782f, 140.88144886f, 147.20671787f, 153.58008562f,
+ 160.00000000f, 166.46500594f, 172.97373660f, 179.52490559f,
+ 186.11730005f, 192.74977453f, 199.42124551f, 206.13068654f,
+ 212.87712380f, 219.65963219f, 226.47733176f, 233.32938445f,
+ 240.21499122f, 247.13338933f, 254.08384998f, 261.06567603f,
+ 268.07820003f, 275.12078236f, 282.19280949f, 289.29369244f,
+ 296.42286534f, 303.57978409f, 310.76392512f, 317.97478424f,
+ 325.21187564f, 332.47473081f, 339.76289772f, 347.07593991f,
+ 354.41343574f, 361.77497759f, 369.16017124f, 376.56863518f,
+ 384.00000000f, 391.45390785f, 398.93001188f, 406.42797576f,
+ 413.94747321f, 421.48818752f, 429.04981119f, 436.63204548f,
+ 444.23460010f, 451.85719280f, 459.49954906f, 467.16140179f,
+ 474.84249102f, 482.54256363f, 490.26137307f, 497.99867911f,
+ 505.75424759f, 513.52785023f, 521.31926438f, 529.12827280f,
+ 536.95466351f, 544.79822957f, 552.65876890f, 560.53608414f,
+ 568.42998244f, 576.34027536f, 584.26677867f, 592.20931226f,
+ 600.16769996f, 608.14176943f, 616.13135206f, 624.13628279f,
+ 632.15640007f, 640.19154569f, 648.24156472f, 656.30630539f,
+ 664.38561898f, 672.47935976f, 680.58738488f, 688.70955430f,
+ 696.84573069f, 704.99577935f, 713.15956818f, 721.33696754f,
+ 729.52785023f, 737.73209140f, 745.94956849f, 754.18016116f,
+ 762.42375127f, 770.68022275f, 778.94946161f, 787.23135586f,
+ 795.52579543f, 803.83267219f, 812.15187982f, 820.48331383f,
+ 828.82687147f, 837.18245171f, 845.54995518f, 853.92928416f,
+ 862.32034249f, 870.72303558f, 879.13727036f, 887.56295522f,
+ 896.00000000f, 904.44831595f, 912.90781569f, 921.37841320f,
+ 929.86002376f, 938.35256392f, 946.85595152f, 955.37010560f,
+ 963.89494641f, 972.43039537f, 980.97637504f, 989.53280911f,
+ 998.09962237f, 1006.67674069f, 1015.26409097f, 1023.86160116f,
+ 1032.46920021f, 1041.08681805f, 1049.71438560f, 1058.35183469f,
+ 1066.99909811f, 1075.65610955f, 1084.32280357f, 1092.99911564f,
+ 1101.68498204f, 1110.38033993f, 1119.08512727f, 1127.79928282f,
+ 1136.52274614f, 1145.25545758f, 1153.99735821f, 1162.74838989f,
+ 1171.50849518f, 1180.27761738f, 1189.05570047f, 1197.84268914f,
+ 1206.63852876f, 1215.44316535f, 1224.25654560f, 1233.07861684f,
+ 1241.90932703f, 1250.74862473f, 1259.59645914f, 1268.45278005f,
+ 1277.31753781f, 1286.19068338f, 1295.07216828f, 1303.96194457f,
+ 1312.85996488f, 1321.76618236f, 1330.68055071f, 1339.60302413f,
+ 1348.53355734f, 1357.47210556f, 1366.41862452f, 1375.37307041f,
+ 1384.33539991f, 1393.30557020f, 1402.28353887f, 1411.26926400f,
+ 1420.26270412f, 1429.26381818f, 1438.27256558f, 1447.28890615f,
+ 1456.31280014f, 1465.34420819f, 1474.38309138f, 1483.42941118f,
+ 1492.48312945f, 1501.54420843f, 1510.61261078f, 1519.68829949f,
+ 1528.77123795f, 1537.86138993f, 1546.95871952f, 1556.06319119f,
+ 1565.17476976f, 1574.29342040f, 1583.41910860f, 1592.55180020f,
+ 1601.69146137f, 1610.83805860f, 1619.99155871f, 1629.15192882f,
+ 1638.31913637f, 1647.49314911f, 1656.67393509f, 1665.86146266f,
+ 1675.05570047f, 1684.25661744f, 1693.46418280f, 1702.67836605f,
+ 1711.89913698f, 1721.12646563f, 1730.36032233f, 1739.60067768f,
+ 1748.84750254f, 1758.10076802f, 1767.36044551f, 1776.62650662f,
+ 1785.89892323f, 1795.17766747f, 1804.46271172f, 1813.75402857f,
+ 1823.05159087f, 1832.35537170f, 1841.66534438f, 1850.98148244f,
+ 1860.30375965f, 1869.63214999f, 1878.96662767f, 1888.30716711f,
+ 1897.65374295f, 1907.00633003f, 1916.36490342f, 1925.72943838f,
+ 1935.09991037f, 1944.47629506f, 1953.85856831f, 1963.24670620f,
+ 1972.64068498f, 1982.04048108f, 1991.44607117f, 2000.85743204f,
+ 2010.27454072f, 2019.69737440f, 2029.12591044f, 2038.56012640f
+};
+
+const VP8LPrefixCode kPrefixEncodeCode[PREFIX_LOOKUP_IDX_MAX] = {
+ { 0, 0}, { 0, 0}, { 1, 0}, { 2, 0}, { 3, 0}, { 4, 1}, { 4, 1}, { 5, 1},
+ { 5, 1}, { 6, 2}, { 6, 2}, { 6, 2}, { 6, 2}, { 7, 2}, { 7, 2}, { 7, 2},
+ { 7, 2}, { 8, 3}, { 8, 3}, { 8, 3}, { 8, 3}, { 8, 3}, { 8, 3}, { 8, 3},
+ { 8, 3}, { 9, 3}, { 9, 3}, { 9, 3}, { 9, 3}, { 9, 3}, { 9, 3}, { 9, 3},
+ { 9, 3}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4},
+ {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4},
+ {10, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4},
+ {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4},
+ {11, 4}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5},
+ {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5},
+ {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5},
+ {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5},
+ {12, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5},
+ {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5},
+ {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5},
+ {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5},
+ {13, 5}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
+ {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
+ {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
+ {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
+ {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
+ {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
+ {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
+ {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
+ {14, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
+ {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
+ {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
+ {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
+ {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
+ {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
+ {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
+ {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
+ {15, 6}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+ {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+ {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+ {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+ {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+ {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+ {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+ {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+ {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+ {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+ {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+ {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+ {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+ {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+ {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+ {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+ {16, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+ {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+ {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+ {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+ {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+ {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+ {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+ {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+ {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+ {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+ {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+ {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+ {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+ {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+ {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+ {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+};
+
+const uint8_t kPrefixEncodeExtraBitsValue[PREFIX_LOOKUP_IDX_MAX] = {
+ 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3,
+ 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+ 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
+ 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
+ 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
+ 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126,
+ 127,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+ 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
+ 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
+ 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
+ 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126
+};
+
+static float FastSLog2Slow_C(uint32_t v) {
+ assert(v >= LOG_LOOKUP_IDX_MAX);
+ if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
+#if !defined(WEBP_HAVE_SLOW_CLZ_CTZ)
+ // use clz if available
+ const int log_cnt = BitsLog2Floor(v) - 7;
+ const uint32_t y = 1 << log_cnt;
+ int correction = 0;
+ const float v_f = (float)v;
+ const uint32_t orig_v = v;
+ v >>= log_cnt;
+#else
+ int log_cnt = 0;
+ uint32_t y = 1;
+ int correction = 0;
+ const float v_f = (float)v;
+ const uint32_t orig_v = v;
+ do {
+ ++log_cnt;
+ v = v >> 1;
+ y = y << 1;
+ } while (v >= LOG_LOOKUP_IDX_MAX);
+#endif
+ // vf = (2^log_cnt) * Xf; where y = 2^log_cnt and Xf < 256
+ // Xf = floor(Xf) * (1 + (v % y) / v)
+ // log2(Xf) = log2(floor(Xf)) + log2(1 + (v % y) / v)
+ // The correction factor: log(1 + d) ~ d; for very small d values, so
+ // log2(1 + (v % y) / v) ~ LOG_2_RECIPROCAL * (v % y)/v
+ // LOG_2_RECIPROCAL ~ 23/16
+ correction = (23 * (orig_v & (y - 1))) >> 4;
+ return v_f * (kLog2Table[v] + log_cnt) + correction;
+ } else {
+ return (float)(LOG_2_RECIPROCAL * v * log((double)v));
+ }
+}
+
+static float FastLog2Slow_C(uint32_t v) {
+ assert(v >= LOG_LOOKUP_IDX_MAX);
+ if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
+#if !defined(WEBP_HAVE_SLOW_CLZ_CTZ)
+ // use clz if available
+ const int log_cnt = BitsLog2Floor(v) - 7;
+ const uint32_t y = 1 << log_cnt;
+ const uint32_t orig_v = v;
+ double log_2;
+ v >>= log_cnt;
+#else
+ int log_cnt = 0;
+ uint32_t y = 1;
+ const uint32_t orig_v = v;
+ double log_2;
+ do {
+ ++log_cnt;
+ v = v >> 1;
+ y = y << 1;
+ } while (v >= LOG_LOOKUP_IDX_MAX);
+#endif
+ log_2 = kLog2Table[v] + log_cnt;
+ if (orig_v >= APPROX_LOG_MAX) {
+ // Since the division is still expensive, add this correction factor only
+ // for large values of 'v'.
+ const int correction = (23 * (orig_v & (y - 1))) >> 4;
+ log_2 += (double)correction / orig_v;
+ }
+ return (float)log_2;
+ } else {
+ return (float)(LOG_2_RECIPROCAL * log((double)v));
+ }
+}
+
+//------------------------------------------------------------------------------
+// Methods to calculate Entropy (Shannon).
+
+// Compute the combined Shannon entropy for the distributions {X} and {X+Y}.
+static float CombinedShannonEntropy_C(const int X[256], const int Y[256]) {
+ int i;
+ double retval = 0.;
+ int sumX = 0, sumXY = 0;
+ for (i = 0; i < 256; ++i) {
+ const int x = X[i];
+ if (x != 0) {
+ const int xy = x + Y[i];
+ sumX += x;
+ retval -= VP8LFastSLog2(x);
+ sumXY += xy;
+ retval -= VP8LFastSLog2(xy);
+ } else if (Y[i] != 0) {
+ sumXY += Y[i];
+ retval -= VP8LFastSLog2(Y[i]);
+ }
+ }
+ retval += VP8LFastSLog2(sumX) + VP8LFastSLog2(sumXY);
+ return (float)retval;
+}
+
+void VP8LBitEntropyInit(VP8LBitEntropy* const entropy) {
+ entropy->entropy = 0.;
+ entropy->sum = 0;
+ entropy->nonzeros = 0;
+ entropy->max_val = 0;
+ entropy->nonzero_code = VP8L_NON_TRIVIAL_SYM;
+}
+
+void VP8LBitsEntropyUnrefined(const uint32_t* const array, int n,
+ VP8LBitEntropy* const entropy) {
+ int i;
+
+ VP8LBitEntropyInit(entropy);
+
+ for (i = 0; i < n; ++i) {
+ if (array[i] != 0) {
+ entropy->sum += array[i];
+ entropy->nonzero_code = i;
+ ++entropy->nonzeros;
+ entropy->entropy -= VP8LFastSLog2(array[i]);
+ if (entropy->max_val < array[i]) {
+ entropy->max_val = array[i];
+ }
+ }
+ }
+ entropy->entropy += VP8LFastSLog2(entropy->sum);
+}
+
+static WEBP_INLINE void GetEntropyUnrefinedHelper(
+ uint32_t val, int i, uint32_t* const val_prev, int* const i_prev,
+ VP8LBitEntropy* const bit_entropy, VP8LStreaks* const stats) {
+ const int streak = i - *i_prev;
+
+ // Gather info for the bit entropy.
+ if (*val_prev != 0) {
+ bit_entropy->sum += (*val_prev) * streak;
+ bit_entropy->nonzeros += streak;
+ bit_entropy->nonzero_code = *i_prev;
+ bit_entropy->entropy -= VP8LFastSLog2(*val_prev) * streak;
+ if (bit_entropy->max_val < *val_prev) {
+ bit_entropy->max_val = *val_prev;
+ }
+ }
+
+ // Gather info for the Huffman cost.
+ stats->counts[*val_prev != 0] += (streak > 3);
+ stats->streaks[*val_prev != 0][(streak > 3)] += streak;
+
+ *val_prev = val;
+ *i_prev = i;
+}
+
+static void GetEntropyUnrefined_C(const uint32_t X[], int length,
+ VP8LBitEntropy* const bit_entropy,
+ VP8LStreaks* const stats) {
+ int i;
+ int i_prev = 0;
+ uint32_t x_prev = X[0];
+
+ memset(stats, 0, sizeof(*stats));
+ VP8LBitEntropyInit(bit_entropy);
+
+ for (i = 1; i < length; ++i) {
+ const uint32_t x = X[i];
+ if (x != x_prev) {
+ GetEntropyUnrefinedHelper(x, i, &x_prev, &i_prev, bit_entropy, stats);
+ }
+ }
+ GetEntropyUnrefinedHelper(0, i, &x_prev, &i_prev, bit_entropy, stats);
+
+ bit_entropy->entropy += VP8LFastSLog2(bit_entropy->sum);
+}
+
+static void GetCombinedEntropyUnrefined_C(const uint32_t X[],
+ const uint32_t Y[],
+ int length,
+ VP8LBitEntropy* const bit_entropy,
+ VP8LStreaks* const stats) {
+ int i = 1;
+ int i_prev = 0;
+ uint32_t xy_prev = X[0] + Y[0];
+
+ memset(stats, 0, sizeof(*stats));
+ VP8LBitEntropyInit(bit_entropy);
+
+ for (i = 1; i < length; ++i) {
+ const uint32_t xy = X[i] + Y[i];
+ if (xy != xy_prev) {
+ GetEntropyUnrefinedHelper(xy, i, &xy_prev, &i_prev, bit_entropy, stats);
+ }
+ }
+ GetEntropyUnrefinedHelper(0, i, &xy_prev, &i_prev, bit_entropy, stats);
+
+ bit_entropy->entropy += VP8LFastSLog2(bit_entropy->sum);
+}
+
+//------------------------------------------------------------------------------
+
+void VP8LSubtractGreenFromBlueAndRed_C(uint32_t* argb_data, int num_pixels) {
+ int i;
+ for (i = 0; i < num_pixels; ++i) {
+ const int argb = argb_data[i];
+ const int green = (argb >> 8) & 0xff;
+ const uint32_t new_r = (((argb >> 16) & 0xff) - green) & 0xff;
+ const uint32_t new_b = (((argb >> 0) & 0xff) - green) & 0xff;
+ argb_data[i] = (argb & 0xff00ff00u) | (new_r << 16) | new_b;
+ }
+}
+
+static WEBP_INLINE int ColorTransformDelta(int8_t color_pred, int8_t color) {
+ return ((int)color_pred * color) >> 5;
+}
+
+static WEBP_INLINE int8_t U32ToS8(uint32_t v) {
+ return (int8_t)(v & 0xff);
+}
+
+void VP8LTransformColor_C(const VP8LMultipliers* const m, uint32_t* data,
+ int num_pixels) {
+ int i;
+ for (i = 0; i < num_pixels; ++i) {
+ const uint32_t argb = data[i];
+ const int8_t green = U32ToS8(argb >> 8);
+ const int8_t red = U32ToS8(argb >> 16);
+ int new_red = red & 0xff;
+ int new_blue = argb & 0xff;
+ new_red -= ColorTransformDelta(m->green_to_red_, green);
+ new_red &= 0xff;
+ new_blue -= ColorTransformDelta(m->green_to_blue_, green);
+ new_blue -= ColorTransformDelta(m->red_to_blue_, red);
+ new_blue &= 0xff;
+ data[i] = (argb & 0xff00ff00u) | (new_red << 16) | (new_blue);
+ }
+}
+
+static WEBP_INLINE uint8_t TransformColorRed(uint8_t green_to_red,
+ uint32_t argb) {
+ const int8_t green = U32ToS8(argb >> 8);
+ int new_red = argb >> 16;
+ new_red -= ColorTransformDelta(green_to_red, green);
+ return (new_red & 0xff);
+}
+
+static WEBP_INLINE uint8_t TransformColorBlue(uint8_t green_to_blue,
+ uint8_t red_to_blue,
+ uint32_t argb) {
+ const int8_t green = U32ToS8(argb >> 8);
+ const int8_t red = U32ToS8(argb >> 16);
+ uint8_t new_blue = argb & 0xff;
+ new_blue -= ColorTransformDelta(green_to_blue, green);
+ new_blue -= ColorTransformDelta(red_to_blue, red);
+ return (new_blue & 0xff);
+}
+
+void VP8LCollectColorRedTransforms_C(const uint32_t* argb, int stride,
+ int tile_width, int tile_height,
+ int green_to_red, int histo[]) {
+ while (tile_height-- > 0) {
+ int x;
+ for (x = 0; x < tile_width; ++x) {
+ ++histo[TransformColorRed((uint8_t)green_to_red, argb[x])];
+ }
+ argb += stride;
+ }
+}
+
+void VP8LCollectColorBlueTransforms_C(const uint32_t* argb, int stride,
+ int tile_width, int tile_height,
+ int green_to_blue, int red_to_blue,
+ int histo[]) {
+ while (tile_height-- > 0) {
+ int x;
+ for (x = 0; x < tile_width; ++x) {
+ ++histo[TransformColorBlue((uint8_t)green_to_blue, (uint8_t)red_to_blue,
+ argb[x])];
+ }
+ argb += stride;
+ }
+}
+
+//------------------------------------------------------------------------------
+
+static int VectorMismatch_C(const uint32_t* const array1,
+ const uint32_t* const array2, int length) {
+ int match_len = 0;
+
+ while (match_len < length && array1[match_len] == array2[match_len]) {
+ ++match_len;
+ }
+ return match_len;
+}
+
+// Bundles multiple (1, 2, 4 or 8) pixels into a single pixel.
+void VP8LBundleColorMap_C(const uint8_t* const row, int width, int xbits,
+ uint32_t* dst) {
+ int x;
+ if (xbits > 0) {
+ const int bit_depth = 1 << (3 - xbits);
+ const int mask = (1 << xbits) - 1;
+ uint32_t code = 0xff000000;
+ for (x = 0; x < width; ++x) {
+ const int xsub = x & mask;
+ if (xsub == 0) {
+ code = 0xff000000;
+ }
+ code |= row[x] << (8 + bit_depth * xsub);
+ dst[x >> xbits] = code;
+ }
+ } else {
+ for (x = 0; x < width; ++x) dst[x] = 0xff000000 | (row[x] << 8);
+ }
+}
+
+//------------------------------------------------------------------------------
+
+static double ExtraCost_C(const uint32_t* population, int length) {
+ int i;
+ double cost = 0.;
+ for (i = 2; i < length - 2; ++i) cost += (i >> 1) * population[i + 2];
+ return cost;
+}
+
+static double ExtraCostCombined_C(const uint32_t* X, const uint32_t* Y,
+ int length) {
+ int i;
+ double cost = 0.;
+ for (i = 2; i < length - 2; ++i) {
+ const int xy = X[i + 2] + Y[i + 2];
+ cost += (i >> 1) * xy;
+ }
+ return cost;
+}
+
+//------------------------------------------------------------------------------
+
+static void AddVector_C(const uint32_t* a, const uint32_t* b, uint32_t* out,
+ int size) {
+ int i;
+ for (i = 0; i < size; ++i) out[i] = a[i] + b[i];
+}
+
+static void AddVectorEq_C(const uint32_t* a, uint32_t* out, int size) {
+ int i;
+ for (i = 0; i < size; ++i) out[i] += a[i];
+}
+
+#define ADD(X, ARG, LEN) do { \
+ if (a->is_used_[X]) { \
+ if (b->is_used_[X]) { \
+ VP8LAddVector(a->ARG, b->ARG, out->ARG, (LEN)); \
+ } else { \
+ memcpy(&out->ARG[0], &a->ARG[0], (LEN) * sizeof(out->ARG[0])); \
+ } \
+ } else if (b->is_used_[X]) { \
+ memcpy(&out->ARG[0], &b->ARG[0], (LEN) * sizeof(out->ARG[0])); \
+ } else { \
+ memset(&out->ARG[0], 0, (LEN) * sizeof(out->ARG[0])); \
+ } \
+} while (0)
+
+#define ADD_EQ(X, ARG, LEN) do { \
+ if (a->is_used_[X]) { \
+ if (out->is_used_[X]) { \
+ VP8LAddVectorEq(a->ARG, out->ARG, (LEN)); \
+ } else { \
+ memcpy(&out->ARG[0], &a->ARG[0], (LEN) * sizeof(out->ARG[0])); \
+ } \
+ } \
+} while (0)
+
+void VP8LHistogramAdd(const VP8LHistogram* const a,
+ const VP8LHistogram* const b, VP8LHistogram* const out) {
+ int i;
+ const int literal_size = VP8LHistogramNumCodes(a->palette_code_bits_);
+ assert(a->palette_code_bits_ == b->palette_code_bits_);
+
+ if (b != out) {
+ ADD(0, literal_, literal_size);
+ ADD(1, red_, NUM_LITERAL_CODES);
+ ADD(2, blue_, NUM_LITERAL_CODES);
+ ADD(3, alpha_, NUM_LITERAL_CODES);
+ ADD(4, distance_, NUM_DISTANCE_CODES);
+ for (i = 0; i < 5; ++i) {
+ out->is_used_[i] = (a->is_used_[i] | b->is_used_[i]);
+ }
+ } else {
+ ADD_EQ(0, literal_, literal_size);
+ ADD_EQ(1, red_, NUM_LITERAL_CODES);
+ ADD_EQ(2, blue_, NUM_LITERAL_CODES);
+ ADD_EQ(3, alpha_, NUM_LITERAL_CODES);
+ ADD_EQ(4, distance_, NUM_DISTANCE_CODES);
+ for (i = 0; i < 5; ++i) out->is_used_[i] |= a->is_used_[i];
+ }
+}
+#undef ADD
+#undef ADD_EQ
+
+//------------------------------------------------------------------------------
+// Image transforms.
+
+static void PredictorSub0_C(const uint32_t* in, const uint32_t* upper,
+ int num_pixels, uint32_t* out) {
+ int i;
+ for (i = 0; i < num_pixels; ++i) out[i] = VP8LSubPixels(in[i], ARGB_BLACK);
+ (void)upper;
+}
+
+static void PredictorSub1_C(const uint32_t* in, const uint32_t* upper,
+ int num_pixels, uint32_t* out) {
+ int i;
+ for (i = 0; i < num_pixels; ++i) out[i] = VP8LSubPixels(in[i], in[i - 1]);
+ (void)upper;
+}
+
+// It subtracts the prediction from the input pixel and stores the residual
+// in the output pixel.
+#define GENERATE_PREDICTOR_SUB(PREDICTOR_I) \
+static void PredictorSub##PREDICTOR_I##_C(const uint32_t* in, \
+ const uint32_t* upper, \
+ int num_pixels, uint32_t* out) { \
+ int x; \
+ assert(upper != NULL); \
+ for (x = 0; x < num_pixels; ++x) { \
+ const uint32_t pred = \
+ VP8LPredictor##PREDICTOR_I##_C(&in[x - 1], upper + x); \
+ out[x] = VP8LSubPixels(in[x], pred); \
+ } \
+}
+
+GENERATE_PREDICTOR_SUB(2)
+GENERATE_PREDICTOR_SUB(3)
+GENERATE_PREDICTOR_SUB(4)
+GENERATE_PREDICTOR_SUB(5)
+GENERATE_PREDICTOR_SUB(6)
+GENERATE_PREDICTOR_SUB(7)
+GENERATE_PREDICTOR_SUB(8)
+GENERATE_PREDICTOR_SUB(9)
+GENERATE_PREDICTOR_SUB(10)
+GENERATE_PREDICTOR_SUB(11)
+GENERATE_PREDICTOR_SUB(12)
+GENERATE_PREDICTOR_SUB(13)
+
+//------------------------------------------------------------------------------
+
+VP8LProcessEncBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed;
+
+VP8LTransformColorFunc VP8LTransformColor;
+
+VP8LCollectColorBlueTransformsFunc VP8LCollectColorBlueTransforms;
+VP8LCollectColorRedTransformsFunc VP8LCollectColorRedTransforms;
+
+VP8LFastLog2SlowFunc VP8LFastLog2Slow;
+VP8LFastLog2SlowFunc VP8LFastSLog2Slow;
+
+VP8LCostFunc VP8LExtraCost;
+VP8LCostCombinedFunc VP8LExtraCostCombined;
+VP8LCombinedShannonEntropyFunc VP8LCombinedShannonEntropy;
+
+VP8LGetEntropyUnrefinedFunc VP8LGetEntropyUnrefined;
+VP8LGetCombinedEntropyUnrefinedFunc VP8LGetCombinedEntropyUnrefined;
+
+VP8LAddVectorFunc VP8LAddVector;
+VP8LAddVectorEqFunc VP8LAddVectorEq;
+
+VP8LVectorMismatchFunc VP8LVectorMismatch;
+VP8LBundleColorMapFunc VP8LBundleColorMap;
+
+VP8LPredictorAddSubFunc VP8LPredictorsSub[16];
+VP8LPredictorAddSubFunc VP8LPredictorsSub_C[16];
+
+extern void VP8LEncDspInitSSE2(void);
+extern void VP8LEncDspInitSSE41(void);
+extern void VP8LEncDspInitNEON(void);
+extern void VP8LEncDspInitMIPS32(void);
+extern void VP8LEncDspInitMIPSdspR2(void);
+extern void VP8LEncDspInitMSA(void);
+
+WEBP_DSP_INIT_FUNC(VP8LEncDspInit) {
+ VP8LDspInit();
+
+#if !WEBP_NEON_OMIT_C_CODE
+ VP8LSubtractGreenFromBlueAndRed = VP8LSubtractGreenFromBlueAndRed_C;
+
+ VP8LTransformColor = VP8LTransformColor_C;
+#endif
+
+ VP8LCollectColorBlueTransforms = VP8LCollectColorBlueTransforms_C;
+ VP8LCollectColorRedTransforms = VP8LCollectColorRedTransforms_C;
+
+ VP8LFastLog2Slow = FastLog2Slow_C;
+ VP8LFastSLog2Slow = FastSLog2Slow_C;
+
+ VP8LExtraCost = ExtraCost_C;
+ VP8LExtraCostCombined = ExtraCostCombined_C;
+ VP8LCombinedShannonEntropy = CombinedShannonEntropy_C;
+
+ VP8LGetEntropyUnrefined = GetEntropyUnrefined_C;
+ VP8LGetCombinedEntropyUnrefined = GetCombinedEntropyUnrefined_C;
+
+ VP8LAddVector = AddVector_C;
+ VP8LAddVectorEq = AddVectorEq_C;
+
+ VP8LVectorMismatch = VectorMismatch_C;
+ VP8LBundleColorMap = VP8LBundleColorMap_C;
+
+ VP8LPredictorsSub[0] = PredictorSub0_C;
+ VP8LPredictorsSub[1] = PredictorSub1_C;
+ VP8LPredictorsSub[2] = PredictorSub2_C;
+ VP8LPredictorsSub[3] = PredictorSub3_C;
+ VP8LPredictorsSub[4] = PredictorSub4_C;
+ VP8LPredictorsSub[5] = PredictorSub5_C;
+ VP8LPredictorsSub[6] = PredictorSub6_C;
+ VP8LPredictorsSub[7] = PredictorSub7_C;
+ VP8LPredictorsSub[8] = PredictorSub8_C;
+ VP8LPredictorsSub[9] = PredictorSub9_C;
+ VP8LPredictorsSub[10] = PredictorSub10_C;
+ VP8LPredictorsSub[11] = PredictorSub11_C;
+ VP8LPredictorsSub[12] = PredictorSub12_C;
+ VP8LPredictorsSub[13] = PredictorSub13_C;
+ VP8LPredictorsSub[14] = PredictorSub0_C; // <- padding security sentinels
+ VP8LPredictorsSub[15] = PredictorSub0_C;
+
+ VP8LPredictorsSub_C[0] = PredictorSub0_C;
+ VP8LPredictorsSub_C[1] = PredictorSub1_C;
+ VP8LPredictorsSub_C[2] = PredictorSub2_C;
+ VP8LPredictorsSub_C[3] = PredictorSub3_C;
+ VP8LPredictorsSub_C[4] = PredictorSub4_C;
+ VP8LPredictorsSub_C[5] = PredictorSub5_C;
+ VP8LPredictorsSub_C[6] = PredictorSub6_C;
+ VP8LPredictorsSub_C[7] = PredictorSub7_C;
+ VP8LPredictorsSub_C[8] = PredictorSub8_C;
+ VP8LPredictorsSub_C[9] = PredictorSub9_C;
+ VP8LPredictorsSub_C[10] = PredictorSub10_C;
+ VP8LPredictorsSub_C[11] = PredictorSub11_C;
+ VP8LPredictorsSub_C[12] = PredictorSub12_C;
+ VP8LPredictorsSub_C[13] = PredictorSub13_C;
+ VP8LPredictorsSub_C[14] = PredictorSub0_C; // <- padding security sentinels
+ VP8LPredictorsSub_C[15] = PredictorSub0_C;
+
+ // If defined, use CPUInfo() to overwrite some pointers with faster versions.
+ if (VP8GetCPUInfo != NULL) {
+#if defined(WEBP_HAVE_SSE2)
+ if (VP8GetCPUInfo(kSSE2)) {
+ VP8LEncDspInitSSE2();
+#if defined(WEBP_HAVE_SSE41)
+ if (VP8GetCPUInfo(kSSE4_1)) {
+ VP8LEncDspInitSSE41();
+ }
+#endif
+ }
+#endif
+#if defined(WEBP_USE_MIPS32)
+ if (VP8GetCPUInfo(kMIPS32)) {
+ VP8LEncDspInitMIPS32();
+ }
+#endif
+#if defined(WEBP_USE_MIPS_DSP_R2)
+ if (VP8GetCPUInfo(kMIPSdspR2)) {
+ VP8LEncDspInitMIPSdspR2();
+ }
+#endif
+#if defined(WEBP_USE_MSA)
+ if (VP8GetCPUInfo(kMSA)) {
+ VP8LEncDspInitMSA();
+ }
+#endif
+ }
+
+#if defined(WEBP_HAVE_NEON)
+ if (WEBP_NEON_OMIT_C_CODE ||
+ (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
+ VP8LEncDspInitNEON();
+ }
+#endif
+
+ assert(VP8LSubtractGreenFromBlueAndRed != NULL);
+ assert(VP8LTransformColor != NULL);
+ assert(VP8LCollectColorBlueTransforms != NULL);
+ assert(VP8LCollectColorRedTransforms != NULL);
+ assert(VP8LFastLog2Slow != NULL);
+ assert(VP8LFastSLog2Slow != NULL);
+ assert(VP8LExtraCost != NULL);
+ assert(VP8LExtraCostCombined != NULL);
+ assert(VP8LCombinedShannonEntropy != NULL);
+ assert(VP8LGetEntropyUnrefined != NULL);
+ assert(VP8LGetCombinedEntropyUnrefined != NULL);
+ assert(VP8LAddVector != NULL);
+ assert(VP8LAddVectorEq != NULL);
+ assert(VP8LVectorMismatch != NULL);
+ assert(VP8LBundleColorMap != NULL);
+ assert(VP8LPredictorsSub[0] != NULL);
+ assert(VP8LPredictorsSub[1] != NULL);
+ assert(VP8LPredictorsSub[2] != NULL);
+ assert(VP8LPredictorsSub[3] != NULL);
+ assert(VP8LPredictorsSub[4] != NULL);
+ assert(VP8LPredictorsSub[5] != NULL);
+ assert(VP8LPredictorsSub[6] != NULL);
+ assert(VP8LPredictorsSub[7] != NULL);
+ assert(VP8LPredictorsSub[8] != NULL);
+ assert(VP8LPredictorsSub[9] != NULL);
+ assert(VP8LPredictorsSub[10] != NULL);
+ assert(VP8LPredictorsSub[11] != NULL);
+ assert(VP8LPredictorsSub[12] != NULL);
+ assert(VP8LPredictorsSub[13] != NULL);
+ assert(VP8LPredictorsSub[14] != NULL);
+ assert(VP8LPredictorsSub[15] != NULL);
+ assert(VP8LPredictorsSub_C[0] != NULL);
+ assert(VP8LPredictorsSub_C[1] != NULL);
+ assert(VP8LPredictorsSub_C[2] != NULL);
+ assert(VP8LPredictorsSub_C[3] != NULL);
+ assert(VP8LPredictorsSub_C[4] != NULL);
+ assert(VP8LPredictorsSub_C[5] != NULL);
+ assert(VP8LPredictorsSub_C[6] != NULL);
+ assert(VP8LPredictorsSub_C[7] != NULL);
+ assert(VP8LPredictorsSub_C[8] != NULL);
+ assert(VP8LPredictorsSub_C[9] != NULL);
+ assert(VP8LPredictorsSub_C[10] != NULL);
+ assert(VP8LPredictorsSub_C[11] != NULL);
+ assert(VP8LPredictorsSub_C[12] != NULL);
+ assert(VP8LPredictorsSub_C[13] != NULL);
+ assert(VP8LPredictorsSub_C[14] != NULL);
+ assert(VP8LPredictorsSub_C[15] != NULL);
+}
+
+//------------------------------------------------------------------------------
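Of the encoder transforms above, subtract-green is the easiest to verify by hand: green is subtracted from red and blue modulo 256, while alpha and green pass through unchanged. A worked single-pixel example (hypothetical pixel value, in a sketch helper):

static void SubtractGreenDemo(void) {
  uint32_t px = 0xff405060u;                 // a=ff r=40 g=50 b=60
  VP8LSubtractGreenFromBlueAndRed_C(&px, 1);
  // r' = (0x40 - 0x50) & 0xff == 0xf0, b' = (0x60 - 0x50) & 0xff == 0x10,
  // so px is now 0xfff05010u.
}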
diff --git a/media/libwebp/dsp/lossless_enc_mips32.c b/media/libwebp/dsp/lossless_enc_mips32.c
new file mode 100644
index 0000000000..088e608b44
--- /dev/null
+++ b/media/libwebp/dsp/lossless_enc_mips32.c
@@ -0,0 +1,397 @@
+// Copyright 2015 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MIPS version of lossless functions
+//
+// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
+// Jovan Zelincevic (jovan.zelincevic@imgtec.com)
+
+#include "../dsp/dsp.h"
+#include "../dsp/lossless.h"
+#include "../dsp/lossless_common.h"
+
+#if defined(WEBP_USE_MIPS32)
+
+#include <assert.h>
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+static float FastSLog2Slow_MIPS32(uint32_t v) {
+ assert(v >= LOG_LOOKUP_IDX_MAX);
+ if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
+ uint32_t log_cnt, y, correction;
+ const int c24 = 24;
+ const float v_f = (float)v;
+ uint32_t temp;
+
+ // Xf = 256 = 2^8
+ // log_cnt is index of leading one in upper 24 bits
+ __asm__ volatile(
+ "clz %[log_cnt], %[v] \n\t"
+ "addiu %[y], $zero, 1 \n\t"
+ "subu %[log_cnt], %[c24], %[log_cnt] \n\t"
+ "sllv %[y], %[y], %[log_cnt] \n\t"
+ "srlv %[temp], %[v], %[log_cnt] \n\t"
+ : [log_cnt]"=&r"(log_cnt), [y]"=&r"(y),
+ [temp]"=r"(temp)
+ : [c24]"r"(c24), [v]"r"(v)
+ );
+
+ // vf = (2^log_cnt) * Xf; where y = 2^log_cnt and Xf < 256
+ // Xf = floor(Xf) * (1 + (v % y) / v)
+ // log2(Xf) = log2(floor(Xf)) + log2(1 + (v % y) / v)
+ // The correction factor: log(1 + d) ~ d for very small d values, so
+ // log2(1 + (v % y) / v) ~ LOG_2_RECIPROCAL * (v % y)/v
+ // LOG_2_RECIPROCAL ~ 23/16
+
+ // (v % y) = (v % 2^log_cnt) = v & (2^log_cnt - 1)
+ correction = (23 * (v & (y - 1))) >> 4;
+ return v_f * (kLog2Table[temp] + log_cnt) + correction;
+ } else {
+ return (float)(LOG_2_RECIPROCAL * v * log((double)v));
+ }
+}
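
The asm block only replaces the search for the leading one (clz) and the split of v into a table index plus a remainder; the surrounding algorithm is the portable one. A C sketch of that decomposition, assuming this file's includes and the kLog2Table / threshold constants from lossless_common.h:

    static float FastSLog2Slow_sketch(uint32_t v) {
      assert(v >= LOG_LOOKUP_IDX_MAX);
      if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
        int log_cnt = 0;
        uint32_t y = 1;                       /* y == 2^log_cnt */
        const uint32_t orig_v = v;
        const float v_f = (float)v;
        float correction;
        do {                                  /* what clz/sllv/srlv compute */
          ++log_cnt;
          v = v >> 1;
          y = y << 1;
        } while (v >= LOG_LOOKUP_IDX_MAX);
        /* linear correction term, ~ LOG_2_RECIPROCAL * (orig_v % y) */
        correction = (float)((23 * (orig_v & (y - 1))) >> 4);
        return v_f * (kLog2Table[v] + log_cnt) + correction;
      } else {
        return (float)(LOG_2_RECIPROCAL * v * log((double)v));
      }
    }
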
+
+static float FastLog2Slow_MIPS32(uint32_t v) {
+ assert(v >= LOG_LOOKUP_IDX_MAX);
+ if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
+ uint32_t log_cnt, y;
+ const int c24 = 24;
+ double log_2;
+ uint32_t temp;
+
+ __asm__ volatile(
+ "clz %[log_cnt], %[v] \n\t"
+ "addiu %[y], $zero, 1 \n\t"
+ "subu %[log_cnt], %[c24], %[log_cnt] \n\t"
+ "sllv %[y], %[y], %[log_cnt] \n\t"
+ "srlv %[temp], %[v], %[log_cnt] \n\t"
+ : [log_cnt]"=&r"(log_cnt), [y]"=&r"(y),
+ [temp]"=r"(temp)
+ : [c24]"r"(c24), [v]"r"(v)
+ );
+
+ log_2 = kLog2Table[temp] + log_cnt;
+ if (v >= APPROX_LOG_MAX) {
+ // Since the division is still expensive, add this correction factor only
+ // for large values of 'v'.
+
+ const uint32_t correction = (23 * (v & (y - 1))) >> 4;
+ log_2 += (double)correction / v;
+ }
+ return (float)log_2;
+ } else {
+ return (float)(LOG_2_RECIPROCAL * log((double)v));
+ }
+}
+
+// C version of this function:
+// int i = 0;
+// int64_t cost = 0;
+// const uint32_t* pop = &population[4];
+// const uint32_t* LoopEnd = &population[length];
+// while (pop != LoopEnd) {
+// ++i;
+// cost += i * *pop;
+// cost += i * *(pop + 1);
+// pop += 2;
+// }
+// return (double)cost;
+static double ExtraCost_MIPS32(const uint32_t* const population, int length) {
+ int i, temp0, temp1;
+ const uint32_t* pop = &population[4];
+ const uint32_t* const LoopEnd = &population[length];
+
+ __asm__ volatile(
+ "mult $zero, $zero \n\t"
+ "xor %[i], %[i], %[i] \n\t"
+ "beq %[pop], %[LoopEnd], 2f \n\t"
+ "1: \n\t"
+ "lw %[temp0], 0(%[pop]) \n\t"
+ "lw %[temp1], 4(%[pop]) \n\t"
+ "addiu %[i], %[i], 1 \n\t"
+ "addiu %[pop], %[pop], 8 \n\t"
+ "madd %[i], %[temp0] \n\t"
+ "madd %[i], %[temp1] \n\t"
+ "bne %[pop], %[LoopEnd], 1b \n\t"
+ "2: \n\t"
+ "mfhi %[temp0] \n\t"
+ "mflo %[temp1] \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+ [i]"=&r"(i), [pop]"+r"(pop)
+ : [LoopEnd]"r"(LoopEnd)
+ : "memory", "hi", "lo"
+ );
+
+ return (double)((int64_t)temp0 << 32 | temp1);
+}
+
+// C version of this function:
+// int i = 0;
+// int64_t cost = 0;
+// const uint32_t* pX = &X[4];
+// const uint32_t* pY = &Y[4];
+// const uint32_t* LoopEnd = &X[length];
+// while (pX != LoopEnd) {
+// const uint32_t xy0 = *pX + *pY;
+// const uint32_t xy1 = *(pX + 1) + *(pY + 1);
+// ++i;
+// cost += i * xy0;
+// cost += i * xy1;
+// pX += 2;
+// pY += 2;
+// }
+// return (double)cost;
+static double ExtraCostCombined_MIPS32(const uint32_t* const X,
+ const uint32_t* const Y, int length) {
+ int i, temp0, temp1, temp2, temp3;
+ const uint32_t* pX = &X[4];
+ const uint32_t* pY = &Y[4];
+ const uint32_t* const LoopEnd = &X[length];
+
+ __asm__ volatile(
+ "mult $zero, $zero \n\t"
+ "xor %[i], %[i], %[i] \n\t"
+ "beq %[pX], %[LoopEnd], 2f \n\t"
+ "1: \n\t"
+ "lw %[temp0], 0(%[pX]) \n\t"
+ "lw %[temp1], 0(%[pY]) \n\t"
+ "lw %[temp2], 4(%[pX]) \n\t"
+ "lw %[temp3], 4(%[pY]) \n\t"
+ "addiu %[i], %[i], 1 \n\t"
+ "addu %[temp0], %[temp0], %[temp1] \n\t"
+ "addu %[temp2], %[temp2], %[temp3] \n\t"
+ "addiu %[pX], %[pX], 8 \n\t"
+ "addiu %[pY], %[pY], 8 \n\t"
+ "madd %[i], %[temp0] \n\t"
+ "madd %[i], %[temp2] \n\t"
+ "bne %[pX], %[LoopEnd], 1b \n\t"
+ "2: \n\t"
+ "mfhi %[temp0] \n\t"
+ "mflo %[temp1] \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+ [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+ [i]"=&r"(i), [pX]"+r"(pX), [pY]"+r"(pY)
+ : [LoopEnd]"r"(LoopEnd)
+ : "memory", "hi", "lo"
+ );
+
+ return (double)((int64_t)temp0 << 32 | temp1);
+}
+
+#define HUFFMAN_COST_PASS \
+ __asm__ volatile( \
+ "sll %[temp1], %[temp0], 3 \n\t" \
+ "addiu %[temp3], %[streak], -3 \n\t" \
+ "addu %[temp2], %[pstreaks], %[temp1] \n\t" \
+ "blez %[temp3], 1f \n\t" \
+ "srl %[temp1], %[temp1], 1 \n\t" \
+ "addu %[temp3], %[pcnts], %[temp1] \n\t" \
+ "lw %[temp0], 4(%[temp2]) \n\t" \
+ "lw %[temp1], 0(%[temp3]) \n\t" \
+ "addu %[temp0], %[temp0], %[streak] \n\t" \
+ "addiu %[temp1], %[temp1], 1 \n\t" \
+ "sw %[temp0], 4(%[temp2]) \n\t" \
+ "sw %[temp1], 0(%[temp3]) \n\t" \
+ "b 2f \n\t" \
+ "1: \n\t" \
+ "lw %[temp0], 0(%[temp2]) \n\t" \
+ "addu %[temp0], %[temp0], %[streak] \n\t" \
+ "sw %[temp0], 0(%[temp2]) \n\t" \
+ "2: \n\t" \
+ : [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), \
+ [temp3]"=&r"(temp3), [temp0]"+r"(temp0) \
+ : [pstreaks]"r"(pstreaks), [pcnts]"r"(pcnts), \
+ [streak]"r"(streak) \
+ : "memory" \
+ );
+
+// Returns the various RLE counts
+static WEBP_INLINE void GetEntropyUnrefinedHelper(
+ uint32_t val, int i, uint32_t* const val_prev, int* const i_prev,
+ VP8LBitEntropy* const bit_entropy, VP8LStreaks* const stats) {
+ int* const pstreaks = &stats->streaks[0][0];
+ int* const pcnts = &stats->counts[0];
+ int temp0, temp1, temp2, temp3;
+ const int streak = i - *i_prev;
+
+ // Gather info for the bit entropy.
+ if (*val_prev != 0) {
+ bit_entropy->sum += (*val_prev) * streak;
+ bit_entropy->nonzeros += streak;
+ bit_entropy->nonzero_code = *i_prev;
+ bit_entropy->entropy -= VP8LFastSLog2(*val_prev) * streak;
+ if (bit_entropy->max_val < *val_prev) {
+ bit_entropy->max_val = *val_prev;
+ }
+ }
+
+ // Gather info for the Huffman cost.
+ temp0 = (*val_prev != 0);
+ HUFFMAN_COST_PASS
+
+ *val_prev = val;
+ *i_prev = i;
+}
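
The HUFFMAN_COST_PASS block is the asm form of two small table updates, keyed on whether the previous value was nonzero (temp0) and whether the streak is longer than three. In plain C the same bookkeeping reads roughly as below (a sketch; VP8LStreaks comes from the dsp headers already included here):

    static void HuffmanCostPassScalar(int val_prev_nonzero, int streak,
                                      VP8LStreaks* const stats) {
      /* row = nonzero flag, column / counter bump = long-streak flag */
      stats->counts[val_prev_nonzero] += (streak > 3);
      stats->streaks[val_prev_nonzero][(streak > 3)] += streak;
    }
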
+
+static void GetEntropyUnrefined_MIPS32(const uint32_t X[], int length,
+ VP8LBitEntropy* const bit_entropy,
+ VP8LStreaks* const stats) {
+ int i;
+ int i_prev = 0;
+ uint32_t x_prev = X[0];
+
+ memset(stats, 0, sizeof(*stats));
+ VP8LBitEntropyInit(bit_entropy);
+
+ for (i = 1; i < length; ++i) {
+ const uint32_t x = X[i];
+ if (x != x_prev) {
+ GetEntropyUnrefinedHelper(x, i, &x_prev, &i_prev, bit_entropy, stats);
+ }
+ }
+ GetEntropyUnrefinedHelper(0, i, &x_prev, &i_prev, bit_entropy, stats);
+
+ bit_entropy->entropy += VP8LFastSLog2(bit_entropy->sum);
+}
+
+static void GetCombinedEntropyUnrefined_MIPS32(const uint32_t X[],
+ const uint32_t Y[],
+ int length,
+ VP8LBitEntropy* const entropy,
+ VP8LStreaks* const stats) {
+ int i = 1;
+ int i_prev = 0;
+ uint32_t xy_prev = X[0] + Y[0];
+
+ memset(stats, 0, sizeof(*stats));
+ VP8LBitEntropyInit(entropy);
+
+ for (i = 1; i < length; ++i) {
+ const uint32_t xy = X[i] + Y[i];
+ if (xy != xy_prev) {
+ GetEntropyUnrefinedHelper(xy, i, &xy_prev, &i_prev, entropy, stats);
+ }
+ }
+ GetEntropyUnrefinedHelper(0, i, &xy_prev, &i_prev, entropy, stats);
+
+ entropy->entropy += VP8LFastSLog2(entropy->sum);
+}
+
+#define ASM_START \
+ __asm__ volatile( \
+ ".set push \n\t" \
+ ".set at \n\t" \
+ ".set macro \n\t" \
+ "1: \n\t"
+
+// P2 = P0 + P1
+// A..D - offsets
+// E - temp variable to tell macro
+// if pointer should be incremented
+// literal_ and successive histograms could be unaligned
+// so we must use ulw and usw
+#define ADD_TO_OUT(A, B, C, D, E, P0, P1, P2) \
+ "ulw %[temp0], " #A "(%[" #P0 "]) \n\t" \
+ "ulw %[temp1], " #B "(%[" #P0 "]) \n\t" \
+ "ulw %[temp2], " #C "(%[" #P0 "]) \n\t" \
+ "ulw %[temp3], " #D "(%[" #P0 "]) \n\t" \
+ "ulw %[temp4], " #A "(%[" #P1 "]) \n\t" \
+ "ulw %[temp5], " #B "(%[" #P1 "]) \n\t" \
+ "ulw %[temp6], " #C "(%[" #P1 "]) \n\t" \
+ "ulw %[temp7], " #D "(%[" #P1 "]) \n\t" \
+ "addu %[temp4], %[temp4], %[temp0] \n\t" \
+ "addu %[temp5], %[temp5], %[temp1] \n\t" \
+ "addu %[temp6], %[temp6], %[temp2] \n\t" \
+ "addu %[temp7], %[temp7], %[temp3] \n\t" \
+ "addiu %[" #P0 "], %[" #P0 "], 16 \n\t" \
+ ".if " #E " == 1 \n\t" \
+ "addiu %[" #P1 "], %[" #P1 "], 16 \n\t" \
+ ".endif \n\t" \
+ "usw %[temp4], " #A "(%[" #P2 "]) \n\t" \
+ "usw %[temp5], " #B "(%[" #P2 "]) \n\t" \
+ "usw %[temp6], " #C "(%[" #P2 "]) \n\t" \
+ "usw %[temp7], " #D "(%[" #P2 "]) \n\t" \
+ "addiu %[" #P2 "], %[" #P2 "], 16 \n\t" \
+ "bne %[" #P0 "], %[LoopEnd], 1b \n\t" \
+ ".set pop \n\t" \
+
+#define ASM_END_COMMON_0 \
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), \
+ [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), \
+ [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), \
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), \
+ [pa]"+r"(pa), [pout]"+r"(pout)
+
+#define ASM_END_COMMON_1 \
+ : [LoopEnd]"r"(LoopEnd) \
+ : "memory", "at" \
+ );
+
+#define ASM_END_0 \
+ ASM_END_COMMON_0 \
+ , [pb]"+r"(pb) \
+ ASM_END_COMMON_1
+
+#define ASM_END_1 \
+ ASM_END_COMMON_0 \
+ ASM_END_COMMON_1
+
+static void AddVector_MIPS32(const uint32_t* pa, const uint32_t* pb,
+ uint32_t* pout, int size) {
+ uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+ const uint32_t end = ((size) / 4) * 4;
+ const uint32_t* const LoopEnd = pa + end;
+ int i;
+ ASM_START
+ ADD_TO_OUT(0, 4, 8, 12, 1, pa, pb, pout)
+ ASM_END_0
+ for (i = end; i < size; ++i) pout[i] = pa[i] + pb[i];
+}
+
+static void AddVectorEq_MIPS32(const uint32_t* pa, uint32_t* pout, int size) {
+ uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+ const uint32_t end = ((size) / 4) * 4;
+ const uint32_t* const LoopEnd = pa + end;
+ int i;
+ ASM_START
+ ADD_TO_OUT(0, 4, 8, 12, 0, pa, pout, pout)
+ ASM_END_1
+ for (i = end; i < size; ++i) pout[i] += pa[i];
+}
+
+#undef ASM_END_1
+#undef ASM_END_0
+#undef ASM_END_COMMON_1
+#undef ASM_END_COMMON_0
+#undef ADD_TO_OUT
+#undef ASM_START
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8LEncDspInitMIPS32(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitMIPS32(void) {
+ VP8LFastSLog2Slow = FastSLog2Slow_MIPS32;
+ VP8LFastLog2Slow = FastLog2Slow_MIPS32;
+ VP8LExtraCost = ExtraCost_MIPS32;
+ VP8LExtraCostCombined = ExtraCostCombined_MIPS32;
+ VP8LGetEntropyUnrefined = GetEntropyUnrefined_MIPS32;
+ VP8LGetCombinedEntropyUnrefined = GetCombinedEntropyUnrefined_MIPS32;
+ VP8LAddVector = AddVector_MIPS32;
+ VP8LAddVectorEq = AddVectorEq_MIPS32;
+}
+
+#else // !WEBP_USE_MIPS32
+
+WEBP_DSP_INIT_STUB(VP8LEncDspInitMIPS32)
+
+#endif // WEBP_USE_MIPS32
diff --git a/media/libwebp/dsp/lossless_enc_mips_dsp_r2.c b/media/libwebp/dsp/lossless_enc_mips_dsp_r2.c
new file mode 100644
index 0000000000..157dfc2e01
--- /dev/null
+++ b/media/libwebp/dsp/lossless_enc_mips_dsp_r2.c
@@ -0,0 +1,281 @@
+// Copyright 2015 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Image transform methods for lossless encoder.
+//
+// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
+// Jovan Zelincevic (jovan.zelincevic@imgtec.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_MIPS_DSP_R2)
+
+#include "../dsp/lossless.h"
+
+static void SubtractGreenFromBlueAndRed_MIPSdspR2(uint32_t* argb_data,
+ int num_pixels) {
+ uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+ uint32_t* const p_loop1_end = argb_data + (num_pixels & ~3);
+ uint32_t* const p_loop2_end = p_loop1_end + (num_pixels & 3);
+ __asm__ volatile (
+ ".set push \n\t"
+ ".set noreorder \n\t"
+ "beq %[argb_data], %[p_loop1_end], 3f \n\t"
+ " nop \n\t"
+ "0: \n\t"
+ "lw %[temp0], 0(%[argb_data]) \n\t"
+ "lw %[temp1], 4(%[argb_data]) \n\t"
+ "lw %[temp2], 8(%[argb_data]) \n\t"
+ "lw %[temp3], 12(%[argb_data]) \n\t"
+ "ext %[temp4], %[temp0], 8, 8 \n\t"
+ "ext %[temp5], %[temp1], 8, 8 \n\t"
+ "ext %[temp6], %[temp2], 8, 8 \n\t"
+ "ext %[temp7], %[temp3], 8, 8 \n\t"
+ "addiu %[argb_data], %[argb_data], 16 \n\t"
+ "replv.ph %[temp4], %[temp4] \n\t"
+ "replv.ph %[temp5], %[temp5] \n\t"
+ "replv.ph %[temp6], %[temp6] \n\t"
+ "replv.ph %[temp7], %[temp7] \n\t"
+ "subu.qb %[temp0], %[temp0], %[temp4] \n\t"
+ "subu.qb %[temp1], %[temp1], %[temp5] \n\t"
+ "subu.qb %[temp2], %[temp2], %[temp6] \n\t"
+ "subu.qb %[temp3], %[temp3], %[temp7] \n\t"
+ "sw %[temp0], -16(%[argb_data]) \n\t"
+ "sw %[temp1], -12(%[argb_data]) \n\t"
+ "sw %[temp2], -8(%[argb_data]) \n\t"
+ "bne %[argb_data], %[p_loop1_end], 0b \n\t"
+ " sw %[temp3], -4(%[argb_data]) \n\t"
+ "3: \n\t"
+ "beq %[argb_data], %[p_loop2_end], 2f \n\t"
+ " nop \n\t"
+ "1: \n\t"
+ "lw %[temp0], 0(%[argb_data]) \n\t"
+ "addiu %[argb_data], %[argb_data], 4 \n\t"
+ "ext %[temp4], %[temp0], 8, 8 \n\t"
+ "replv.ph %[temp4], %[temp4] \n\t"
+ "subu.qb %[temp0], %[temp0], %[temp4] \n\t"
+ "bne %[argb_data], %[p_loop2_end], 1b \n\t"
+ " sw %[temp0], -4(%[argb_data]) \n\t"
+ "2: \n\t"
+ ".set pop \n\t"
+ : [argb_data]"+&r"(argb_data), [temp0]"=&r"(temp0),
+ [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+ [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [temp6]"=&r"(temp6),
+ [temp7]"=&r"(temp7)
+ : [p_loop1_end]"r"(p_loop1_end), [p_loop2_end]"r"(p_loop2_end)
+ : "memory"
+ );
+}
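
In scalar form the transform is simply green subtracted, modulo 256, from both red and blue; a sketch matching the portable fallback this routine replaces:

    static void SubtractGreenScalar(uint32_t* argb_data, int num_pixels) {
      int i;
      for (i = 0; i < num_pixels; ++i) {
        const uint32_t argb = argb_data[i];
        const uint32_t green = (argb >> 8) & 0xff;
        const uint32_t new_r = (((argb >> 16) & 0xff) - green) & 0xff;
        const uint32_t new_b = (((argb >> 0) & 0xff) - green) & 0xff;
        argb_data[i] = (argb & 0xff00ff00u) | (new_r << 16) | new_b;
      }
    }
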
+
+static WEBP_INLINE uint32_t ColorTransformDelta(int8_t color_pred,
+ int8_t color) {
+ return (uint32_t)((int)(color_pred) * color) >> 5;
+}
+
+static void TransformColor_MIPSdspR2(const VP8LMultipliers* const m,
+ uint32_t* data, int num_pixels) {
+ int temp0, temp1, temp2, temp3, temp4, temp5;
+ uint32_t argb, argb1, new_red, new_red1;
+ const uint32_t G_to_R = m->green_to_red_;
+ const uint32_t G_to_B = m->green_to_blue_;
+ const uint32_t R_to_B = m->red_to_blue_;
+ uint32_t* const p_loop_end = data + (num_pixels & ~1);
+ __asm__ volatile (
+ ".set push \n\t"
+ ".set noreorder \n\t"
+ "beq %[data], %[p_loop_end], 1f \n\t"
+ " nop \n\t"
+ "replv.ph %[temp0], %[G_to_R] \n\t"
+ "replv.ph %[temp1], %[G_to_B] \n\t"
+ "replv.ph %[temp2], %[R_to_B] \n\t"
+ "shll.ph %[temp0], %[temp0], 8 \n\t"
+ "shll.ph %[temp1], %[temp1], 8 \n\t"
+ "shll.ph %[temp2], %[temp2], 8 \n\t"
+ "shra.ph %[temp0], %[temp0], 8 \n\t"
+ "shra.ph %[temp1], %[temp1], 8 \n\t"
+ "shra.ph %[temp2], %[temp2], 8 \n\t"
+ "0: \n\t"
+ "lw %[argb], 0(%[data]) \n\t"
+ "lw %[argb1], 4(%[data]) \n\t"
+ "lhu %[new_red], 2(%[data]) \n\t"
+ "lhu %[new_red1], 6(%[data]) \n\t"
+ "precrq.qb.ph %[temp3], %[argb], %[argb1] \n\t"
+ "precr.qb.ph %[temp4], %[argb], %[argb1] \n\t"
+ "preceu.ph.qbra %[temp3], %[temp3] \n\t"
+ "preceu.ph.qbla %[temp4], %[temp4] \n\t"
+ "shll.ph %[temp3], %[temp3], 8 \n\t"
+ "shll.ph %[temp4], %[temp4], 8 \n\t"
+ "shra.ph %[temp3], %[temp3], 8 \n\t"
+ "shra.ph %[temp4], %[temp4], 8 \n\t"
+ "mul.ph %[temp5], %[temp3], %[temp0] \n\t"
+ "mul.ph %[temp3], %[temp3], %[temp1] \n\t"
+ "mul.ph %[temp4], %[temp4], %[temp2] \n\t"
+ "addiu %[data], %[data], 8 \n\t"
+ "ins %[new_red1], %[new_red], 16, 16 \n\t"
+ "ins %[argb1], %[argb], 16, 16 \n\t"
+ "shra.ph %[temp5], %[temp5], 5 \n\t"
+ "shra.ph %[temp3], %[temp3], 5 \n\t"
+ "shra.ph %[temp4], %[temp4], 5 \n\t"
+ "subu.ph %[new_red1], %[new_red1], %[temp5] \n\t"
+ "subu.ph %[argb1], %[argb1], %[temp3] \n\t"
+ "preceu.ph.qbra %[temp5], %[new_red1] \n\t"
+ "subu.ph %[argb1], %[argb1], %[temp4] \n\t"
+ "preceu.ph.qbra %[temp3], %[argb1] \n\t"
+ "sb %[temp5], -2(%[data]) \n\t"
+ "sb %[temp3], -4(%[data]) \n\t"
+ "sra %[temp5], %[temp5], 16 \n\t"
+ "sra %[temp3], %[temp3], 16 \n\t"
+ "sb %[temp5], -6(%[data]) \n\t"
+ "bne %[data], %[p_loop_end], 0b \n\t"
+ " sb %[temp3], -8(%[data]) \n\t"
+ "1: \n\t"
+ ".set pop \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [new_red1]"=&r"(new_red1), [new_red]"=&r"(new_red),
+ [argb]"=&r"(argb), [argb1]"=&r"(argb1), [data]"+&r"(data)
+ : [G_to_R]"r"(G_to_R), [R_to_B]"r"(R_to_B),
+ [G_to_B]"r"(G_to_B), [p_loop_end]"r"(p_loop_end)
+ : "memory", "hi", "lo"
+ );
+
+ if (num_pixels & 1) {
+ const uint32_t argb_ = data[0];
+ const uint32_t green = argb_ >> 8;
+ const uint32_t red = argb_ >> 16;
+ uint32_t new_blue = argb_;
+ new_red = red;
+ new_red -= ColorTransformDelta(m->green_to_red_, green);
+ new_red &= 0xff;
+ new_blue -= ColorTransformDelta(m->green_to_blue_, green);
+ new_blue -= ColorTransformDelta(m->red_to_blue_, red);
+ new_blue &= 0xff;
+ data[0] = (argb_ & 0xff00ff00u) | (new_red << 16) | (new_blue);
+ }
+}
+
+static WEBP_INLINE uint8_t TransformColorBlue(uint8_t green_to_blue,
+ uint8_t red_to_blue,
+ uint32_t argb) {
+ const uint32_t green = argb >> 8;
+ const uint32_t red = argb >> 16;
+ uint8_t new_blue = argb;
+ new_blue -= ColorTransformDelta(green_to_blue, green);
+ new_blue -= ColorTransformDelta(red_to_blue, red);
+ return (new_blue & 0xff);
+}
+
+static void CollectColorBlueTransforms_MIPSdspR2(const uint32_t* argb,
+ int stride,
+ int tile_width,
+ int tile_height,
+ int green_to_blue,
+ int red_to_blue,
+ int histo[]) {
+ const int rtb = (red_to_blue << 16) | (red_to_blue & 0xffff);
+ const int gtb = (green_to_blue << 16) | (green_to_blue & 0xffff);
+ const uint32_t mask = 0xff00ffu;
+ while (tile_height-- > 0) {
+ int x;
+ const uint32_t* p_argb = argb;
+ argb += stride;
+ for (x = 0; x < (tile_width >> 1); ++x) {
+ int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
+ __asm__ volatile (
+ "lw %[temp0], 0(%[p_argb]) \n\t"
+ "lw %[temp1], 4(%[p_argb]) \n\t"
+ "precr.qb.ph %[temp2], %[temp0], %[temp1] \n\t"
+ "ins %[temp1], %[temp0], 16, 16 \n\t"
+ "shra.ph %[temp2], %[temp2], 8 \n\t"
+ "shra.ph %[temp3], %[temp1], 8 \n\t"
+ "mul.ph %[temp5], %[temp2], %[rtb] \n\t"
+ "mul.ph %[temp6], %[temp3], %[gtb] \n\t"
+ "and %[temp4], %[temp1], %[mask] \n\t"
+ "addiu %[p_argb], %[p_argb], 8 \n\t"
+ "shra.ph %[temp5], %[temp5], 5 \n\t"
+ "shra.ph %[temp6], %[temp6], 5 \n\t"
+ "subu.qb %[temp2], %[temp4], %[temp5] \n\t"
+ "subu.qb %[temp2], %[temp2], %[temp6] \n\t"
+ : [p_argb]"+&r"(p_argb), [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+ [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [temp4]"=&r"(temp4),
+ [temp5]"=&r"(temp5), [temp6]"=&r"(temp6)
+ : [rtb]"r"(rtb), [gtb]"r"(gtb), [mask]"r"(mask)
+ : "memory", "hi", "lo"
+ );
+ ++histo[(uint8_t)(temp2 >> 16)];
+ ++histo[(uint8_t)temp2];
+ }
+ if (tile_width & 1) {
+ ++histo[TransformColorBlue(green_to_blue, red_to_blue, *p_argb)];
+ }
+ }
+}
+
+static WEBP_INLINE uint8_t TransformColorRed(uint8_t green_to_red,
+ uint32_t argb) {
+ const uint32_t green = argb >> 8;
+ uint32_t new_red = argb >> 16;
+ new_red -= ColorTransformDelta(green_to_red, green);
+ return (new_red & 0xff);
+}
+
+static void CollectColorRedTransforms_MIPSdspR2(const uint32_t* argb,
+ int stride,
+ int tile_width,
+ int tile_height,
+ int green_to_red,
+ int histo[]) {
+ const int gtr = (green_to_red << 16) | (green_to_red & 0xffff);
+ while (tile_height-- > 0) {
+ int x;
+ const uint32_t* p_argb = argb;
+ argb += stride;
+ for (x = 0; x < (tile_width >> 1); ++x) {
+ int temp0, temp1, temp2, temp3, temp4;
+ __asm__ volatile (
+ "lw %[temp0], 0(%[p_argb]) \n\t"
+ "lw %[temp1], 4(%[p_argb]) \n\t"
+ "precrq.ph.w %[temp4], %[temp0], %[temp1] \n\t"
+ "ins %[temp1], %[temp0], 16, 16 \n\t"
+ "shra.ph %[temp3], %[temp1], 8 \n\t"
+ "mul.ph %[temp2], %[temp3], %[gtr] \n\t"
+ "addiu %[p_argb], %[p_argb], 8 \n\t"
+ "shra.ph %[temp2], %[temp2], 5 \n\t"
+ "subu.qb %[temp2], %[temp4], %[temp2] \n\t"
+ : [p_argb]"+&r"(p_argb), [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+ [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [temp4]"=&r"(temp4)
+ : [gtr]"r"(gtr)
+ : "memory", "hi", "lo"
+ );
+ ++histo[(uint8_t)(temp2 >> 16)];
+ ++histo[(uint8_t)temp2];
+ }
+ if (tile_width & 1) {
+ ++histo[TransformColorRed(green_to_red, *p_argb)];
+ }
+ }
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8LEncDspInitMIPSdspR2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitMIPSdspR2(void) {
+ VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_MIPSdspR2;
+ VP8LTransformColor = TransformColor_MIPSdspR2;
+ VP8LCollectColorBlueTransforms = CollectColorBlueTransforms_MIPSdspR2;
+ VP8LCollectColorRedTransforms = CollectColorRedTransforms_MIPSdspR2;
+}
+
+#else // !WEBP_USE_MIPS_DSP_R2
+
+WEBP_DSP_INIT_STUB(VP8LEncDspInitMIPSdspR2)
+
+#endif // WEBP_USE_MIPS_DSP_R2
diff --git a/media/libwebp/dsp/lossless_enc_msa.c b/media/libwebp/dsp/lossless_enc_msa.c
new file mode 100644
index 0000000000..f8a5f8c56f
--- /dev/null
+++ b/media/libwebp/dsp/lossless_enc_msa.c
@@ -0,0 +1,148 @@
+// Copyright 2016 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MSA variant of Image transform methods for lossless encoder.
+//
+// Authors: Prashant Patil (Prashant.Patil@imgtec.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_MSA)
+
+#include "../dsp/lossless.h"
+#include "../dsp/msa_macro.h"
+
+#define TRANSFORM_COLOR_8(src0, src1, dst0, dst1, c0, c1, mask0, mask1) do { \
+ v8i16 g0, g1, t0, t1, t2, t3; \
+ v4i32 t4, t5; \
+ VSHF_B2_SH(src0, src0, src1, src1, mask0, mask0, g0, g1); \
+ DOTP_SB2_SH(g0, g1, c0, c0, t0, t1); \
+ SRAI_H2_SH(t0, t1, 5); \
+ t0 = __msa_subv_h((v8i16)src0, t0); \
+ t1 = __msa_subv_h((v8i16)src1, t1); \
+ t4 = __msa_srli_w((v4i32)src0, 16); \
+ t5 = __msa_srli_w((v4i32)src1, 16); \
+ DOTP_SB2_SH(t4, t5, c1, c1, t2, t3); \
+ SRAI_H2_SH(t2, t3, 5); \
+ SUB2(t0, t2, t1, t3, t0, t1); \
+ VSHF_B2_UB(src0, t0, src1, t1, mask1, mask1, dst0, dst1); \
+} while (0)
+
+#define TRANSFORM_COLOR_4(src, dst, c0, c1, mask0, mask1) do { \
+ const v16i8 g0 = VSHF_SB(src, src, mask0); \
+ v8i16 t0 = __msa_dotp_s_h(c0, g0); \
+ v8i16 t1; \
+ v4i32 t2; \
+ t0 = SRAI_H(t0, 5); \
+ t0 = __msa_subv_h((v8i16)src, t0); \
+ t2 = __msa_srli_w((v4i32)src, 16); \
+ t1 = __msa_dotp_s_h(c1, (v16i8)t2); \
+ t1 = SRAI_H(t1, 5); \
+ t0 = t0 - t1; \
+ dst = VSHF_UB(src, t0, mask1); \
+} while (0)
+
+static void TransformColor_MSA(const VP8LMultipliers* const m, uint32_t* data,
+ int num_pixels) {
+ v16u8 src0, dst0;
+ const v16i8 g2br = (v16i8)__msa_fill_w(m->green_to_blue_ |
+ (m->green_to_red_ << 16));
+ const v16i8 r2b = (v16i8)__msa_fill_w(m->red_to_blue_);
+ const v16u8 mask0 = { 1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255,
+ 13, 255, 13, 255 };
+ const v16u8 mask1 = { 16, 1, 18, 3, 20, 5, 22, 7, 24, 9, 26, 11,
+ 28, 13, 30, 15 };
+
+ while (num_pixels >= 8) {
+ v16u8 src1, dst1;
+ LD_UB2(data, 4, src0, src1);
+ TRANSFORM_COLOR_8(src0, src1, dst0, dst1, g2br, r2b, mask0, mask1);
+ ST_UB2(dst0, dst1, data, 4);
+ data += 8;
+ num_pixels -= 8;
+ }
+ if (num_pixels > 0) {
+ if (num_pixels >= 4) {
+ src0 = LD_UB(data);
+ TRANSFORM_COLOR_4(src0, dst0, g2br, r2b, mask0, mask1);
+ ST_UB(dst0, data);
+ data += 4;
+ num_pixels -= 4;
+ }
+ if (num_pixels > 0) {
+ src0 = LD_UB(data);
+ TRANSFORM_COLOR_4(src0, dst0, g2br, r2b, mask0, mask1);
+ if (num_pixels == 3) {
+ const uint64_t pix_d = __msa_copy_s_d((v2i64)dst0, 0);
+ const uint32_t pix_w = __msa_copy_s_w((v4i32)dst0, 2);
+ SD(pix_d, data + 0);
+ SW(pix_w, data + 2);
+ } else if (num_pixels == 2) {
+ const uint64_t pix_d = __msa_copy_s_d((v2i64)dst0, 0);
+ SD(pix_d, data);
+ } else {
+ const uint32_t pix_w = __msa_copy_s_w((v4i32)dst0, 0);
+ SW(pix_w, data);
+ }
+ }
+ }
+}
+
+static void SubtractGreenFromBlueAndRed_MSA(uint32_t* argb_data,
+ int num_pixels) {
+ int i;
+ uint8_t* ptemp_data = (uint8_t*)argb_data;
+ v16u8 src0, dst0, tmp0;
+ const v16u8 mask = { 1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255,
+ 13, 255, 13, 255 };
+
+ while (num_pixels >= 8) {
+ v16u8 src1, dst1, tmp1;
+ LD_UB2(ptemp_data, 16, src0, src1);
+ VSHF_B2_UB(src0, src1, src1, src0, mask, mask, tmp0, tmp1);
+ SUB2(src0, tmp0, src1, tmp1, dst0, dst1);
+ ST_UB2(dst0, dst1, ptemp_data, 16);
+ ptemp_data += 8 * 4;
+ num_pixels -= 8;
+ }
+ if (num_pixels > 0) {
+ if (num_pixels >= 4) {
+ src0 = LD_UB(ptemp_data);
+ tmp0 = VSHF_UB(src0, src0, mask);
+ dst0 = src0 - tmp0;
+ ST_UB(dst0, ptemp_data);
+ ptemp_data += 4 * 4;
+ num_pixels -= 4;
+ }
+ for (i = 0; i < num_pixels; i++) {
+ const uint8_t b = ptemp_data[0];
+ const uint8_t g = ptemp_data[1];
+ const uint8_t r = ptemp_data[2];
+ ptemp_data[0] = (b - g) & 0xff;
+ ptemp_data[2] = (r - g) & 0xff;
+ ptemp_data += 4;
+ }
+ }
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8LEncDspInitMSA(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitMSA(void) {
+ VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_MSA;
+ VP8LTransformColor = TransformColor_MSA;
+}
+
+#else // !WEBP_USE_MSA
+
+WEBP_DSP_INIT_STUB(VP8LEncDspInitMSA)
+
+#endif // WEBP_USE_MSA
diff --git a/media/libwebp/dsp/lossless_enc_neon.c b/media/libwebp/dsp/lossless_enc_neon.c
new file mode 100644
index 0000000000..89d5439e49
--- /dev/null
+++ b/media/libwebp/dsp/lossless_enc_neon.c
@@ -0,0 +1,144 @@
+// Copyright 2015 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// NEON variant of methods for lossless encoder
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_NEON)
+
+#include <arm_neon.h>
+
+#include "../dsp/lossless.h"
+#include "../dsp/neon.h"
+
+//------------------------------------------------------------------------------
+// Subtract-Green Transform
+
+// vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use
+// non-standard versions there.
+#if defined(__APPLE__) && defined(__aarch64__) && \
+ defined(__apple_build_version__) && (__apple_build_version__ < 6020037)
+#define USE_VTBLQ
+#endif
+
+#ifdef USE_VTBLQ
+// 255 = byte will be zeroed
+static const uint8_t kGreenShuffle[16] = {
+ 1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255
+};
+
+static WEBP_INLINE uint8x16_t DoGreenShuffle_NEON(const uint8x16_t argb,
+ const uint8x16_t shuffle) {
+ return vcombine_u8(vtbl1q_u8(argb, vget_low_u8(shuffle)),
+ vtbl1q_u8(argb, vget_high_u8(shuffle)));
+}
+#else // !USE_VTBLQ
+// 255 = byte will be zeroed
+static const uint8_t kGreenShuffle[8] = { 1, 255, 1, 255, 5, 255, 5, 255 };
+
+static WEBP_INLINE uint8x16_t DoGreenShuffle_NEON(const uint8x16_t argb,
+ const uint8x8_t shuffle) {
+ return vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle),
+ vtbl1_u8(vget_high_u8(argb), shuffle));
+}
+#endif // USE_VTBLQ
+
+static void SubtractGreenFromBlueAndRed_NEON(uint32_t* argb_data,
+ int num_pixels) {
+ const uint32_t* const end = argb_data + (num_pixels & ~3);
+#ifdef USE_VTBLQ
+ const uint8x16_t shuffle = vld1q_u8(kGreenShuffle);
+#else
+ const uint8x8_t shuffle = vld1_u8(kGreenShuffle);
+#endif
+ for (; argb_data < end; argb_data += 4) {
+ const uint8x16_t argb = vld1q_u8((uint8_t*)argb_data);
+ const uint8x16_t greens = DoGreenShuffle_NEON(argb, shuffle);
+ vst1q_u8((uint8_t*)argb_data, vsubq_u8(argb, greens));
+ }
+ // fallthrough and finish off with plain-C
+ VP8LSubtractGreenFromBlueAndRed_C(argb_data, num_pixels & 3);
+}
+
+//------------------------------------------------------------------------------
+// Color Transform
+
+static void TransformColor_NEON(const VP8LMultipliers* const m,
+ uint32_t* argb_data, int num_pixels) {
+ // sign-extended multiplying constants, pre-shifted by 6.
+#define CST(X) (((int16_t)(m->X << 8)) >> 6)
+ const int16_t rb[8] = {
+ CST(green_to_blue_), CST(green_to_red_),
+ CST(green_to_blue_), CST(green_to_red_),
+ CST(green_to_blue_), CST(green_to_red_),
+ CST(green_to_blue_), CST(green_to_red_)
+ };
+ const int16x8_t mults_rb = vld1q_s16(rb);
+ const int16_t b2[8] = {
+ 0, CST(red_to_blue_), 0, CST(red_to_blue_),
+ 0, CST(red_to_blue_), 0, CST(red_to_blue_),
+ };
+ const int16x8_t mults_b2 = vld1q_s16(b2);
+#undef CST
+#ifdef USE_VTBLQ
+ static const uint8_t kg0g0[16] = {
+ 255, 1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13
+ };
+ const uint8x16_t shuffle = vld1q_u8(kg0g0);
+#else
+ static const uint8_t k0g0g[8] = { 255, 1, 255, 1, 255, 5, 255, 5 };
+ const uint8x8_t shuffle = vld1_u8(k0g0g);
+#endif
+ const uint32x4_t mask_rb = vdupq_n_u32(0x00ff00ffu); // red-blue masks
+ int i;
+ for (i = 0; i + 4 <= num_pixels; i += 4) {
+ const uint8x16_t in = vld1q_u8((uint8_t*)(argb_data + i));
+ // 0 g 0 g
+ const uint8x16_t greens = DoGreenShuffle_NEON(in, shuffle);
+ // x dr x db1
+ const int16x8_t A = vqdmulhq_s16(vreinterpretq_s16_u8(greens), mults_rb);
+ // r 0 b 0
+ const int16x8_t B = vshlq_n_s16(vreinterpretq_s16_u8(in), 8);
+ // x db2 0 0
+ const int16x8_t C = vqdmulhq_s16(B, mults_b2);
+ // 0 0 x db2
+ const uint32x4_t D = vshrq_n_u32(vreinterpretq_u32_s16(C), 16);
+ // x dr x db
+ const int8x16_t E = vaddq_s8(vreinterpretq_s8_u32(D),
+ vreinterpretq_s8_s16(A));
+ // 0 dr 0 db
+ const uint32x4_t F = vandq_u32(vreinterpretq_u32_s8(E), mask_rb);
+ const int8x16_t out = vsubq_s8(vreinterpretq_s8_u8(in),
+ vreinterpretq_s8_u32(F));
+ vst1q_s8((int8_t*)(argb_data + i), out);
+ }
+ // fallthrough and finish off with plain-C
+ VP8LTransformColor_C(m, argb_data + i, num_pixels - i);
+}
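
The pre-shift by 6 (instead of 5) compensates for the doubling in vqdmulhq_s16, which returns (a * b * 2) >> 16: with the green sample in the high byte of a 16-bit lane (g << 8) and the constant scaled to s * 4, a lane comes out as ((g << 8) * 4s * 2) >> 16 == (g * s) >> 5, i.e. the spec's ColorTransformDelta on sign-extended 8-bit values. A standalone scalar check of that equality, with arbitrary sample values:

    #include <assert.h>
    #include <stdint.h>

    static int16_t Cst(uint8_t m) {                 /* mirrors CST(X) above */
      return (int16_t)((int16_t)(m << 8) >> 6);
    }
    static int16_t QDMulH(int16_t a, int16_t b) {   /* vqdmulhq_s16, one lane */
      return (int16_t)(((int32_t)a * b * 2) >> 16);
    }
    static int ColorDelta(int8_t pred, int8_t color) {  /* spec definition */
      return ((int)pred * color) >> 5;
    }

    int main(void) {
      const uint8_t m = 0xef, g = 200;              /* arbitrary inputs */
      assert(QDMulH((int16_t)((int8_t)g << 8), Cst(m)) ==
             ColorDelta((int8_t)m, (int8_t)g));
      return 0;
    }
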
+
+#undef USE_VTBLQ
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8LEncDspInitNEON(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitNEON(void) {
+ VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_NEON;
+ VP8LTransformColor = TransformColor_NEON;
+}
+
+#else // !WEBP_USE_NEON
+
+WEBP_DSP_INIT_STUB(VP8LEncDspInitNEON)
+
+#endif // WEBP_USE_NEON
diff --git a/media/libwebp/dsp/lossless_enc_sse2.c b/media/libwebp/dsp/lossless_enc_sse2.c
new file mode 100644
index 0000000000..665ceb669c
--- /dev/null
+++ b/media/libwebp/dsp/lossless_enc_sse2.c
@@ -0,0 +1,669 @@
+// Copyright 2015 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// SSE2 variant of methods for lossless encoder
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_SSE2)
+#include <assert.h>
+#include <emmintrin.h>
+#include "../dsp/lossless.h"
+#include "../dsp/common_sse2.h"
+#include "../dsp/lossless_common.h"
+
+// For sign-extended multiplying constants, pre-shifted by 5:
+#define CST_5b(X) (((int16_t)((uint16_t)(X) << 8)) >> 5)
+
+//------------------------------------------------------------------------------
+// Subtract-Green Transform
+
+static void SubtractGreenFromBlueAndRed_SSE2(uint32_t* argb_data,
+ int num_pixels) {
+ int i;
+ for (i = 0; i + 4 <= num_pixels; i += 4) {
+ const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]); // argb
+ const __m128i A = _mm_srli_epi16(in, 8); // 0 a 0 g
+ const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
+ const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0)); // 0g0g
+ const __m128i out = _mm_sub_epi8(in, C);
+ _mm_storeu_si128((__m128i*)&argb_data[i], out);
+ }
+ // fallthrough and finish off with plain-C
+ if (i != num_pixels) {
+ VP8LSubtractGreenFromBlueAndRed_C(argb_data + i, num_pixels - i);
+ }
+}
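
The shufflelo/shufflehi pair is what broadcasts each pixel's green byte into both 16-bit lanes of that pixel, so a single byte-wise subtraction updates red and blue at once. A small standalone check on four identical pixels, with an arbitrary sample value:

    #include <assert.h>
    #include <emmintrin.h>
    #include <stdint.h>

    int main(void) {
      const __m128i in = _mm_set1_epi32((int)0xff804020u);  /* a r g b */
      const __m128i A = _mm_srli_epi16(in, 8);              /* 0 a 0 g */
      const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
      const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0));
      const __m128i out = _mm_sub_epi8(in, C);
      uint32_t res[4];
      _mm_storeu_si128((__m128i*)res, out);
      /* green untouched; red and blue reduced by green, mod 256 */
      assert(res[0] == 0xff4040e0u);
      return 0;
    }
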
+
+//------------------------------------------------------------------------------
+// Color Transform
+
+#define MK_CST_16(HI, LO) \
+ _mm_set1_epi32((int)(((uint32_t)(HI) << 16) | ((LO) & 0xffff)))
+
+static void TransformColor_SSE2(const VP8LMultipliers* const m,
+ uint32_t* argb_data, int num_pixels) {
+ const __m128i mults_rb = MK_CST_16(CST_5b(m->green_to_red_),
+ CST_5b(m->green_to_blue_));
+ const __m128i mults_b2 = MK_CST_16(CST_5b(m->red_to_blue_), 0);
+ const __m128i mask_ag = _mm_set1_epi32(0xff00ff00); // alpha-green masks
+ const __m128i mask_rb = _mm_set1_epi32(0x00ff00ff); // red-blue masks
+ int i;
+ for (i = 0; i + 4 <= num_pixels; i += 4) {
+ const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]); // argb
+ const __m128i A = _mm_and_si128(in, mask_ag); // a 0 g 0
+ const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
+ const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0)); // g0g0
+ const __m128i D = _mm_mulhi_epi16(C, mults_rb); // x dr x db1
+ const __m128i E = _mm_slli_epi16(in, 8); // r 0 b 0
+ const __m128i F = _mm_mulhi_epi16(E, mults_b2); // x db2 0 0
+ const __m128i G = _mm_srli_epi32(F, 16); // 0 0 x db2
+ const __m128i H = _mm_add_epi8(G, D); // x dr x db
+ const __m128i I = _mm_and_si128(H, mask_rb); // 0 dr 0 db
+ const __m128i out = _mm_sub_epi8(in, I);
+ _mm_storeu_si128((__m128i*)&argb_data[i], out);
+ }
+ // fallthrough and finish off with plain-C
+ if (i != num_pixels) {
+ VP8LTransformColor_C(m, argb_data + i, num_pixels - i);
+ }
+}
+
+//------------------------------------------------------------------------------
+#define SPAN 8
+static void CollectColorBlueTransforms_SSE2(const uint32_t* argb, int stride,
+ int tile_width, int tile_height,
+ int green_to_blue, int red_to_blue,
+ int histo[]) {
+ const __m128i mults_r = MK_CST_16(CST_5b(red_to_blue), 0);
+ const __m128i mults_g = MK_CST_16(0, CST_5b(green_to_blue));
+ const __m128i mask_g = _mm_set1_epi32(0x00ff00); // green mask
+ const __m128i mask_b = _mm_set1_epi32(0x0000ff); // blue mask
+ int y;
+ for (y = 0; y < tile_height; ++y) {
+ const uint32_t* const src = argb + y * stride;
+ int i, x;
+ for (x = 0; x + SPAN <= tile_width; x += SPAN) {
+ uint16_t values[SPAN];
+ const __m128i in0 = _mm_loadu_si128((__m128i*)&src[x + 0]);
+ const __m128i in1 = _mm_loadu_si128((__m128i*)&src[x + SPAN / 2]);
+ const __m128i A0 = _mm_slli_epi16(in0, 8); // r 0 | b 0
+ const __m128i A1 = _mm_slli_epi16(in1, 8);
+ const __m128i B0 = _mm_and_si128(in0, mask_g); // 0 0 | g 0
+ const __m128i B1 = _mm_and_si128(in1, mask_g);
+ const __m128i C0 = _mm_mulhi_epi16(A0, mults_r); // x db | 0 0
+ const __m128i C1 = _mm_mulhi_epi16(A1, mults_r);
+ const __m128i D0 = _mm_mulhi_epi16(B0, mults_g); // 0 0 | x db
+ const __m128i D1 = _mm_mulhi_epi16(B1, mults_g);
+ const __m128i E0 = _mm_sub_epi8(in0, D0); // x x | x b'
+ const __m128i E1 = _mm_sub_epi8(in1, D1);
+ const __m128i F0 = _mm_srli_epi32(C0, 16); // 0 0 | x db
+ const __m128i F1 = _mm_srli_epi32(C1, 16);
+ const __m128i G0 = _mm_sub_epi8(E0, F0); // 0 0 | x b'
+ const __m128i G1 = _mm_sub_epi8(E1, F1);
+ const __m128i H0 = _mm_and_si128(G0, mask_b); // 0 0 | 0 b
+ const __m128i H1 = _mm_and_si128(G1, mask_b);
+ const __m128i I = _mm_packs_epi32(H0, H1); // 0 b' | 0 b'
+ _mm_storeu_si128((__m128i*)values, I);
+ for (i = 0; i < SPAN; ++i) ++histo[values[i]];
+ }
+ }
+ {
+ const int left_over = tile_width & (SPAN - 1);
+ if (left_over > 0) {
+ VP8LCollectColorBlueTransforms_C(argb + tile_width - left_over, stride,
+ left_over, tile_height,
+ green_to_blue, red_to_blue, histo);
+ }
+ }
+}
+
+static void CollectColorRedTransforms_SSE2(const uint32_t* argb, int stride,
+ int tile_width, int tile_height,
+ int green_to_red, int histo[]) {
+ const __m128i mults_g = MK_CST_16(0, CST_5b(green_to_red));
+ const __m128i mask_g = _mm_set1_epi32(0x00ff00); // green mask
+ const __m128i mask = _mm_set1_epi32(0xff);
+
+ int y;
+ for (y = 0; y < tile_height; ++y) {
+ const uint32_t* const src = argb + y * stride;
+ int i, x;
+ for (x = 0; x + SPAN <= tile_width; x += SPAN) {
+ uint16_t values[SPAN];
+ const __m128i in0 = _mm_loadu_si128((__m128i*)&src[x + 0]);
+ const __m128i in1 = _mm_loadu_si128((__m128i*)&src[x + SPAN / 2]);
+ const __m128i A0 = _mm_and_si128(in0, mask_g); // 0 0 | g 0
+ const __m128i A1 = _mm_and_si128(in1, mask_g);
+ const __m128i B0 = _mm_srli_epi32(in0, 16); // 0 0 | x r
+ const __m128i B1 = _mm_srli_epi32(in1, 16);
+ const __m128i C0 = _mm_mulhi_epi16(A0, mults_g); // 0 0 | x dr
+ const __m128i C1 = _mm_mulhi_epi16(A1, mults_g);
+ const __m128i E0 = _mm_sub_epi8(B0, C0); // x x | x r'
+ const __m128i E1 = _mm_sub_epi8(B1, C1);
+ const __m128i F0 = _mm_and_si128(E0, mask); // 0 0 | 0 r'
+ const __m128i F1 = _mm_and_si128(E1, mask);
+ const __m128i I = _mm_packs_epi32(F0, F1);
+ _mm_storeu_si128((__m128i*)values, I);
+ for (i = 0; i < SPAN; ++i) ++histo[values[i]];
+ }
+ }
+ {
+ const int left_over = tile_width & (SPAN - 1);
+ if (left_over > 0) {
+ VP8LCollectColorRedTransforms_C(argb + tile_width - left_over, stride,
+ left_over, tile_height,
+ green_to_red, histo);
+ }
+ }
+}
+#undef SPAN
+#undef MK_CST_16
+
+//------------------------------------------------------------------------------
+
+// Note we are adding uint32_t's as *signed* int32's (using _mm_add_epi32). But
+// that's ok since the histogram values are less than 1<<28 (max picture size).
+#define LINE_SIZE 16 // 8 or 16
+static void AddVector_SSE2(const uint32_t* a, const uint32_t* b, uint32_t* out,
+ int size) {
+ int i;
+ for (i = 0; i + LINE_SIZE <= size; i += LINE_SIZE) {
+ const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i + 0]);
+ const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i + 4]);
+#if (LINE_SIZE == 16)
+ const __m128i a2 = _mm_loadu_si128((const __m128i*)&a[i + 8]);
+ const __m128i a3 = _mm_loadu_si128((const __m128i*)&a[i + 12]);
+#endif
+ const __m128i b0 = _mm_loadu_si128((const __m128i*)&b[i + 0]);
+ const __m128i b1 = _mm_loadu_si128((const __m128i*)&b[i + 4]);
+#if (LINE_SIZE == 16)
+ const __m128i b2 = _mm_loadu_si128((const __m128i*)&b[i + 8]);
+ const __m128i b3 = _mm_loadu_si128((const __m128i*)&b[i + 12]);
+#endif
+ _mm_storeu_si128((__m128i*)&out[i + 0], _mm_add_epi32(a0, b0));
+ _mm_storeu_si128((__m128i*)&out[i + 4], _mm_add_epi32(a1, b1));
+#if (LINE_SIZE == 16)
+ _mm_storeu_si128((__m128i*)&out[i + 8], _mm_add_epi32(a2, b2));
+ _mm_storeu_si128((__m128i*)&out[i + 12], _mm_add_epi32(a3, b3));
+#endif
+ }
+ for (; i < size; ++i) {
+ out[i] = a[i] + b[i];
+ }
+}
+
+static void AddVectorEq_SSE2(const uint32_t* a, uint32_t* out, int size) {
+ int i;
+ for (i = 0; i + LINE_SIZE <= size; i += LINE_SIZE) {
+ const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i + 0]);
+ const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i + 4]);
+#if (LINE_SIZE == 16)
+ const __m128i a2 = _mm_loadu_si128((const __m128i*)&a[i + 8]);
+ const __m128i a3 = _mm_loadu_si128((const __m128i*)&a[i + 12]);
+#endif
+ const __m128i b0 = _mm_loadu_si128((const __m128i*)&out[i + 0]);
+ const __m128i b1 = _mm_loadu_si128((const __m128i*)&out[i + 4]);
+#if (LINE_SIZE == 16)
+ const __m128i b2 = _mm_loadu_si128((const __m128i*)&out[i + 8]);
+ const __m128i b3 = _mm_loadu_si128((const __m128i*)&out[i + 12]);
+#endif
+ _mm_storeu_si128((__m128i*)&out[i + 0], _mm_add_epi32(a0, b0));
+ _mm_storeu_si128((__m128i*)&out[i + 4], _mm_add_epi32(a1, b1));
+#if (LINE_SIZE == 16)
+ _mm_storeu_si128((__m128i*)&out[i + 8], _mm_add_epi32(a2, b2));
+ _mm_storeu_si128((__m128i*)&out[i + 12], _mm_add_epi32(a3, b3));
+#endif
+ }
+ for (; i < size; ++i) {
+ out[i] += a[i];
+ }
+}
+#undef LINE_SIZE
+
+//------------------------------------------------------------------------------
+// Entropy
+
+// TODO(https://crbug.com/webp/499): this function produces different results
+// from the C code due to use of double/float resulting in output differences
+// when compared to -noasm.
+#if !(defined(WEBP_HAVE_SLOW_CLZ_CTZ) || defined(__i386__) || defined(_M_IX86))
+
+static float CombinedShannonEntropy_SSE2(const int X[256], const int Y[256]) {
+ int i;
+ double retval = 0.;
+ int sumX = 0, sumXY = 0;
+ const __m128i zero = _mm_setzero_si128();
+
+ for (i = 0; i < 256; i += 16) {
+ const __m128i x0 = _mm_loadu_si128((const __m128i*)(X + i + 0));
+ const __m128i y0 = _mm_loadu_si128((const __m128i*)(Y + i + 0));
+ const __m128i x1 = _mm_loadu_si128((const __m128i*)(X + i + 4));
+ const __m128i y1 = _mm_loadu_si128((const __m128i*)(Y + i + 4));
+ const __m128i x2 = _mm_loadu_si128((const __m128i*)(X + i + 8));
+ const __m128i y2 = _mm_loadu_si128((const __m128i*)(Y + i + 8));
+ const __m128i x3 = _mm_loadu_si128((const __m128i*)(X + i + 12));
+ const __m128i y3 = _mm_loadu_si128((const __m128i*)(Y + i + 12));
+ const __m128i x4 = _mm_packs_epi16(_mm_packs_epi32(x0, x1),
+ _mm_packs_epi32(x2, x3));
+ const __m128i y4 = _mm_packs_epi16(_mm_packs_epi32(y0, y1),
+ _mm_packs_epi32(y2, y3));
+ const int32_t mx = _mm_movemask_epi8(_mm_cmpgt_epi8(x4, zero));
+ int32_t my = _mm_movemask_epi8(_mm_cmpgt_epi8(y4, zero)) | mx;
+ while (my) {
+ const int32_t j = BitsCtz(my);
+ int xy;
+ if ((mx >> j) & 1) {
+ const int x = X[i + j];
+ sumXY += x;
+ retval -= VP8LFastSLog2(x);
+ }
+ xy = X[i + j] + Y[i + j];
+ sumX += xy;
+ retval -= VP8LFastSLog2(xy);
+ my &= my - 1;
+ }
+ }
+ retval += VP8LFastSLog2(sumX) + VP8LFastSLog2(sumXY);
+ return (float)retval;
+}
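
In one pass the loop evaluates the bit-cost form of two entropies, one for the X histogram and one for the combined X+Y histogram, with S(t) = t * log2(t):

    retval = S(sum_i x_i) - sum_i S(x_i)
           + S(sum_i (x_i + y_i)) - sum_i S(x_i + y_i)

The movemask loop only skips zero bins, which contribute nothing since S(0) = 0. (Note the local names read swapped relative to this formula: sumX accumulates the combined bins and sumXY the X bins.)
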
+
+#else
+
+#define DONT_USE_COMBINED_SHANNON_ENTROPY_SSE2_FUNC // won't be faster
+
+#endif
+
+//------------------------------------------------------------------------------
+
+static int VectorMismatch_SSE2(const uint32_t* const array1,
+ const uint32_t* const array2, int length) {
+ int match_len;
+
+ if (length >= 12) {
+ __m128i A0 = _mm_loadu_si128((const __m128i*)&array1[0]);
+ __m128i A1 = _mm_loadu_si128((const __m128i*)&array2[0]);
+ match_len = 0;
+ do {
+ // Loop unrolling and early load both provide a speedup of 10% for the
+ // current function. Also, max_limit can be MAX_LENGTH=4096 at most.
+ const __m128i cmpA = _mm_cmpeq_epi32(A0, A1);
+ const __m128i B0 =
+ _mm_loadu_si128((const __m128i*)&array1[match_len + 4]);
+ const __m128i B1 =
+ _mm_loadu_si128((const __m128i*)&array2[match_len + 4]);
+ if (_mm_movemask_epi8(cmpA) != 0xffff) break;
+ match_len += 4;
+
+ {
+ const __m128i cmpB = _mm_cmpeq_epi32(B0, B1);
+ A0 = _mm_loadu_si128((const __m128i*)&array1[match_len + 4]);
+ A1 = _mm_loadu_si128((const __m128i*)&array2[match_len + 4]);
+ if (_mm_movemask_epi8(cmpB) != 0xffff) break;
+ match_len += 4;
+ }
+ } while (match_len + 12 < length);
+ } else {
+ match_len = 0;
+ // Unroll the potential first two loops.
+ if (length >= 4 &&
+ _mm_movemask_epi8(_mm_cmpeq_epi32(
+ _mm_loadu_si128((const __m128i*)&array1[0]),
+ _mm_loadu_si128((const __m128i*)&array2[0]))) == 0xffff) {
+ match_len = 4;
+ if (length >= 8 &&
+ _mm_movemask_epi8(_mm_cmpeq_epi32(
+ _mm_loadu_si128((const __m128i*)&array1[4]),
+ _mm_loadu_si128((const __m128i*)&array2[4]))) == 0xffff) {
+ match_len = 8;
+ }
+ }
+ }
+
+ while (match_len < length && array1[match_len] == array2[match_len]) {
+ ++match_len;
+ }
+ return match_len;
+}
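
The 0xffff tests rely on _mm_cmpeq_epi32 producing all-ones bytes in equal lanes and _mm_movemask_epi8 collecting the top bit of each of the 16 bytes: the mask is 0xffff exactly when all four 32-bit lanes match. A minimal check with made-up values:

    #include <assert.h>
    #include <emmintrin.h>

    int main(void) {
      const __m128i a = _mm_setr_epi32(1, 2, 3, 4);
      const __m128i b = _mm_setr_epi32(1, 2, 99, 4);
      const int mask = _mm_movemask_epi8(_mm_cmpeq_epi32(a, b));
      assert(mask != 0xffff);             /* lane 2 differs */
      assert((mask & 0x00ff) == 0x00ff);  /* lanes 0 and 1 match */
      return 0;
    }
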
+
+// Bundles multiple (1, 2, 4 or 8) pixels into a single pixel.
+static void BundleColorMap_SSE2(const uint8_t* const row, int width, int xbits,
+ uint32_t* dst) {
+ int x;
+ assert(xbits >= 0);
+ assert(xbits <= 3);
+ switch (xbits) {
+ case 0: {
+ const __m128i ff = _mm_set1_epi16((short)0xff00);
+ const __m128i zero = _mm_setzero_si128();
+ // Store 0xff000000 | (row[x] << 8).
+ for (x = 0; x + 16 <= width; x += 16, dst += 16) {
+ const __m128i in = _mm_loadu_si128((const __m128i*)&row[x]);
+ const __m128i in_lo = _mm_unpacklo_epi8(zero, in);
+ const __m128i dst0 = _mm_unpacklo_epi16(in_lo, ff);
+ const __m128i dst1 = _mm_unpackhi_epi16(in_lo, ff);
+ const __m128i in_hi = _mm_unpackhi_epi8(zero, in);
+ const __m128i dst2 = _mm_unpacklo_epi16(in_hi, ff);
+ const __m128i dst3 = _mm_unpackhi_epi16(in_hi, ff);
+ _mm_storeu_si128((__m128i*)&dst[0], dst0);
+ _mm_storeu_si128((__m128i*)&dst[4], dst1);
+ _mm_storeu_si128((__m128i*)&dst[8], dst2);
+ _mm_storeu_si128((__m128i*)&dst[12], dst3);
+ }
+ break;
+ }
+ case 1: {
+ const __m128i ff = _mm_set1_epi16((short)0xff00);
+ const __m128i mul = _mm_set1_epi16(0x110);
+ for (x = 0; x + 16 <= width; x += 16, dst += 8) {
+ // 0a0b | (where a/b are 4 bits).
+ const __m128i in = _mm_loadu_si128((const __m128i*)&row[x]);
+ const __m128i tmp = _mm_mullo_epi16(in, mul); // aba0
+ const __m128i pack = _mm_and_si128(tmp, ff); // ab00
+ const __m128i dst0 = _mm_unpacklo_epi16(pack, ff);
+ const __m128i dst1 = _mm_unpackhi_epi16(pack, ff);
+ _mm_storeu_si128((__m128i*)&dst[0], dst0);
+ _mm_storeu_si128((__m128i*)&dst[4], dst1);
+ }
+ break;
+ }
+ case 2: {
+ const __m128i mask_or = _mm_set1_epi32(0xff000000);
+ const __m128i mul_cst = _mm_set1_epi16(0x0104);
+ const __m128i mask_mul = _mm_set1_epi16(0x0f00);
+ for (x = 0; x + 16 <= width; x += 16, dst += 4) {
+ // 000a000b000c000d | (where a/b/c/d are 2 bits).
+ const __m128i in = _mm_loadu_si128((const __m128i*)&row[x]);
+ const __m128i mul = _mm_mullo_epi16(in, mul_cst); // 00ab00b000cd00d0
+ const __m128i tmp = _mm_and_si128(mul, mask_mul); // 00ab000000cd0000
+ const __m128i shift = _mm_srli_epi32(tmp, 12); // 00000000ab000000
+ const __m128i pack = _mm_or_si128(shift, tmp); // 00000000abcd0000
+ // Convert to 0xff00**00.
+ const __m128i res = _mm_or_si128(pack, mask_or);
+ _mm_storeu_si128((__m128i*)dst, res);
+ }
+ break;
+ }
+ default: {
+ assert(xbits == 3);
+ for (x = 0; x + 16 <= width; x += 16, dst += 2) {
+ // 0000000a00000000b... | (where a/b are 1 bit).
+ const __m128i in = _mm_loadu_si128((const __m128i*)&row[x]);
+ const __m128i shift = _mm_slli_epi64(in, 7);
+ const uint32_t move = _mm_movemask_epi8(shift);
+ dst[0] = 0xff000000 | ((move & 0xff) << 8);
+ dst[1] = 0xff000000 | (move & 0xff00);
+ }
+ break;
+ }
+ }
+ if (x != width) {
+ VP8LBundleColorMap_C(row + x, width - x, xbits, dst);
+ }
+}
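
Each case packs 2^xbits consecutive palette indices into the green byte(s) of one output pixel, with alpha forced to 0xff. A scalar sketch of the same packing, following the structure of the portable fallback invoked above for the leftover columns:

    static void BundleColorMapScalar(const uint8_t* const row, int width,
                                     int xbits, uint32_t* dst) {
      int x;
      if (xbits > 0) {
        const int bit_depth = 1 << (3 - xbits);  /* bits per packed index */
        const int mask = (1 << xbits) - 1;       /* indices per pixel, minus 1 */
        uint32_t code = 0xff000000u;
        for (x = 0; x < width; ++x) {
          const int xsub = x & mask;
          if (xsub == 0) code = 0xff000000u;     /* start a new output pixel */
          code |= (uint32_t)row[x] << (8 + bit_depth * xsub);
          dst[x >> xbits] = code;
        }
      } else {
        for (x = 0; x < width; ++x) {
          dst[x] = 0xff000000u | ((uint32_t)row[x] << 8);
        }
      }
    }
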
+
+//------------------------------------------------------------------------------
+// Batch version of Predictor Transform subtraction
+
+static WEBP_INLINE void Average2_m128i(const __m128i* const a0,
+ const __m128i* const a1,
+ __m128i* const avg) {
+ // (a + b) >> 1 = ((a + b + 1) >> 1) - ((a ^ b) & 1)
+ const __m128i ones = _mm_set1_epi8(1);
+ const __m128i avg1 = _mm_avg_epu8(*a0, *a1);
+ const __m128i one = _mm_and_si128(_mm_xor_si128(*a0, *a1), ones);
+ *avg = _mm_sub_epi8(avg1, one);
+}
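
_mm_avg_epu8 rounds up, computing (a + b + 1) >> 1 per byte, while the predictors are specified with truncating averages; the identity in the comment corrects for that, since the two differ by one exactly when a + b is odd, i.e. when the low bits of a and b differ. An exhaustive scalar check over all byte pairs:

    #include <assert.h>

    int main(void) {
      unsigned a, b;
      for (a = 0; a < 256; ++a) {
        for (b = 0; b < 256; ++b) {
          const unsigned rounded_up = (a + b + 1) >> 1;  /* _mm_avg_epu8 */
          assert(((a + b) >> 1) == rounded_up - ((a ^ b) & 1));
        }
      }
      return 0;
    }
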
+
+// Predictor0: ARGB_BLACK.
+static void PredictorSub0_SSE2(const uint32_t* in, const uint32_t* upper,
+ int num_pixels, uint32_t* out) {
+ int i;
+ const __m128i black = _mm_set1_epi32(ARGB_BLACK);
+ for (i = 0; i + 4 <= num_pixels; i += 4) {
+ const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
+ const __m128i res = _mm_sub_epi8(src, black);
+ _mm_storeu_si128((__m128i*)&out[i], res);
+ }
+ if (i != num_pixels) {
+ VP8LPredictorsSub_C[0](in + i, NULL, num_pixels - i, out + i);
+ }
+ (void)upper;
+}
+
+#define GENERATE_PREDICTOR_1(X, IN) \
+ static void PredictorSub##X##_SSE2(const uint32_t* const in, \
+ const uint32_t* const upper, \
+ int num_pixels, uint32_t* const out) { \
+ int i; \
+ for (i = 0; i + 4 <= num_pixels; i += 4) { \
+ const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); \
+ const __m128i pred = _mm_loadu_si128((const __m128i*)&(IN)); \
+ const __m128i res = _mm_sub_epi8(src, pred); \
+ _mm_storeu_si128((__m128i*)&out[i], res); \
+ } \
+ if (i != num_pixels) { \
+ VP8LPredictorsSub_C[(X)](in + i, WEBP_OFFSET_PTR(upper, i), \
+ num_pixels - i, out + i); \
+ } \
+ }
+
+GENERATE_PREDICTOR_1(1, in[i - 1]) // Predictor1: L
+GENERATE_PREDICTOR_1(2, upper[i]) // Predictor2: T
+GENERATE_PREDICTOR_1(3, upper[i + 1]) // Predictor3: TR
+GENERATE_PREDICTOR_1(4, upper[i - 1]) // Predictor4: TL
+#undef GENERATE_PREDICTOR_1
+
+// Predictor5: avg2(avg2(L, TR), T)
+static void PredictorSub5_SSE2(const uint32_t* in, const uint32_t* upper,
+ int num_pixels, uint32_t* out) {
+ int i;
+ for (i = 0; i + 4 <= num_pixels; i += 4) {
+ const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]);
+ const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
+ const __m128i TR = _mm_loadu_si128((const __m128i*)&upper[i + 1]);
+ const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
+ __m128i avg, pred, res;
+ Average2_m128i(&L, &TR, &avg);
+ Average2_m128i(&avg, &T, &pred);
+ res = _mm_sub_epi8(src, pred);
+ _mm_storeu_si128((__m128i*)&out[i], res);
+ }
+ if (i != num_pixels) {
+ VP8LPredictorsSub_C[5](in + i, upper + i, num_pixels - i, out + i);
+ }
+}
+
+#define GENERATE_PREDICTOR_2(X, A, B) \
+static void PredictorSub##X##_SSE2(const uint32_t* in, const uint32_t* upper, \
+ int num_pixels, uint32_t* out) { \
+ int i; \
+ for (i = 0; i + 4 <= num_pixels; i += 4) { \
+ const __m128i tA = _mm_loadu_si128((const __m128i*)&(A)); \
+ const __m128i tB = _mm_loadu_si128((const __m128i*)&(B)); \
+ const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); \
+ __m128i pred, res; \
+ Average2_m128i(&tA, &tB, &pred); \
+ res = _mm_sub_epi8(src, pred); \
+ _mm_storeu_si128((__m128i*)&out[i], res); \
+ } \
+ if (i != num_pixels) { \
+ VP8LPredictorsSub_C[(X)](in + i, upper + i, num_pixels - i, out + i); \
+ } \
+}
+
+GENERATE_PREDICTOR_2(6, in[i - 1], upper[i - 1]) // Predictor6: avg(L, TL)
+GENERATE_PREDICTOR_2(7, in[i - 1], upper[i]) // Predictor7: avg(L, T)
+GENERATE_PREDICTOR_2(8, upper[i - 1], upper[i]) // Predictor8: avg(TL, T)
+GENERATE_PREDICTOR_2(9, upper[i], upper[i + 1]) // Predictor9: average(T, TR)
+#undef GENERATE_PREDICTOR_2
+
+// Predictor10: avg(avg(L,TL), avg(T, TR)).
+static void PredictorSub10_SSE2(const uint32_t* in, const uint32_t* upper,
+ int num_pixels, uint32_t* out) {
+ int i;
+ for (i = 0; i + 4 <= num_pixels; i += 4) {
+ const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]);
+ const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
+ const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
+ const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
+ const __m128i TR = _mm_loadu_si128((const __m128i*)&upper[i + 1]);
+ __m128i avgTTR, avgLTL, avg, res;
+ Average2_m128i(&T, &TR, &avgTTR);
+ Average2_m128i(&L, &TL, &avgLTL);
+ Average2_m128i(&avgTTR, &avgLTL, &avg);
+ res = _mm_sub_epi8(src, avg);
+ _mm_storeu_si128((__m128i*)&out[i], res);
+ }
+ if (i != num_pixels) {
+ VP8LPredictorsSub_C[10](in + i, upper + i, num_pixels - i, out + i);
+ }
+}
+
+// Predictor11: select.
+static void GetSumAbsDiff32_SSE2(const __m128i* const A, const __m128i* const B,
+ __m128i* const out) {
+ // We can unpack with any value on the upper 32 bits, provided it's the same
+ // on both operands (so that their sum of abs diff is zero). Here we use *A.
+ const __m128i A_lo = _mm_unpacklo_epi32(*A, *A);
+ const __m128i B_lo = _mm_unpacklo_epi32(*B, *A);
+ const __m128i A_hi = _mm_unpackhi_epi32(*A, *A);
+ const __m128i B_hi = _mm_unpackhi_epi32(*B, *A);
+ const __m128i s_lo = _mm_sad_epu8(A_lo, B_lo);
+ const __m128i s_hi = _mm_sad_epu8(A_hi, B_hi);
+ *out = _mm_packs_epi32(s_lo, s_hi);
+}
+
+static void PredictorSub11_SSE2(const uint32_t* in, const uint32_t* upper,
+ int num_pixels, uint32_t* out) {
+ int i;
+ for (i = 0; i + 4 <= num_pixels; i += 4) {
+ const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]);
+ const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
+ const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
+ const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
+ __m128i pa, pb;
+ GetSumAbsDiff32_SSE2(&T, &TL, &pa); // pa = sum |T-TL|
+ GetSumAbsDiff32_SSE2(&L, &TL, &pb); // pb = sum |L-TL|
+ {
+ const __m128i mask = _mm_cmpgt_epi32(pb, pa);
+ const __m128i A = _mm_and_si128(mask, L);
+ const __m128i B = _mm_andnot_si128(mask, T);
+ const __m128i pred = _mm_or_si128(A, B); // pred = (pb > pa) ? L : T
+ const __m128i res = _mm_sub_epi8(src, pred);
+ _mm_storeu_si128((__m128i*)&out[i], res);
+ }
+ }
+ if (i != num_pixels) {
+ VP8LPredictorsSub_C[11](in + i, upper + i, num_pixels - i, out + i);
+ }
+}
+
+// Predictor12: ClampedSubSubtractFull.
+static void PredictorSub12_SSE2(const uint32_t* in, const uint32_t* upper,
+ int num_pixels, uint32_t* out) {
+ int i;
+ const __m128i zero = _mm_setzero_si128();
+ for (i = 0; i + 4 <= num_pixels; i += 4) {
+ const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
+ const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]);
+ const __m128i L_lo = _mm_unpacklo_epi8(L, zero);
+ const __m128i L_hi = _mm_unpackhi_epi8(L, zero);
+ const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
+ const __m128i T_lo = _mm_unpacklo_epi8(T, zero);
+ const __m128i T_hi = _mm_unpackhi_epi8(T, zero);
+ const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
+ const __m128i TL_lo = _mm_unpacklo_epi8(TL, zero);
+ const __m128i TL_hi = _mm_unpackhi_epi8(TL, zero);
+ const __m128i diff_lo = _mm_sub_epi16(T_lo, TL_lo);
+ const __m128i diff_hi = _mm_sub_epi16(T_hi, TL_hi);
+ const __m128i pred_lo = _mm_add_epi16(L_lo, diff_lo);
+ const __m128i pred_hi = _mm_add_epi16(L_hi, diff_hi);
+ const __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
+ const __m128i res = _mm_sub_epi8(src, pred);
+ _mm_storeu_si128((__m128i*)&out[i], res);
+ }
+ if (i != num_pixels) {
+ VP8LPredictorsSub_C[12](in + i, upper + i, num_pixels - i, out + i);
+ }
+}
+
+// Predictor13: ClampedAddSubtractHalf
+static void PredictorSub13_SSE2(const uint32_t* in, const uint32_t* upper,
+ int num_pixels, uint32_t* out) {
+ int i;
+ const __m128i zero = _mm_setzero_si128();
+ for (i = 0; i + 2 <= num_pixels; i += 2) {
+ // we can only process two pixels at a time
+ const __m128i L = _mm_loadl_epi64((const __m128i*)&in[i - 1]);
+ const __m128i src = _mm_loadl_epi64((const __m128i*)&in[i]);
+ const __m128i T = _mm_loadl_epi64((const __m128i*)&upper[i]);
+ const __m128i TL = _mm_loadl_epi64((const __m128i*)&upper[i - 1]);
+ const __m128i L_lo = _mm_unpacklo_epi8(L, zero);
+ const __m128i T_lo = _mm_unpacklo_epi8(T, zero);
+ const __m128i TL_lo = _mm_unpacklo_epi8(TL, zero);
+ const __m128i sum = _mm_add_epi16(T_lo, L_lo);
+ const __m128i avg = _mm_srli_epi16(sum, 1);
+ const __m128i A1 = _mm_sub_epi16(avg, TL_lo);
+ const __m128i bit_fix = _mm_cmpgt_epi16(TL_lo, avg);
+ const __m128i A2 = _mm_sub_epi16(A1, bit_fix);
+ const __m128i A3 = _mm_srai_epi16(A2, 1);
+ const __m128i A4 = _mm_add_epi16(avg, A3);
+ const __m128i pred = _mm_packus_epi16(A4, A4);
+ const __m128i res = _mm_sub_epi8(src, pred);
+ _mm_storel_epi64((__m128i*)&out[i], res);
+ }
+ if (i != num_pixels) {
+ VP8LPredictorsSub_C[13](in + i, upper + i, num_pixels - i, out + i);
+ }
+}
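
Per channel this predictor is pred = clip(avg + (avg - TL) / 2) with avg = (L + T) >> 1; the bit_fix term turns the flooring arithmetic shift into the truncating division the scalar definition uses when avg - TL is negative. As a scalar sketch:

    static int Clip255(int v) { return (v < 0) ? 0 : (v > 255) ? 255 : v; }

    static int PredictChannel13(int l, int t, int tl) {
      const int avg = (l + t) >> 1;          /* sum + srli above */
      return Clip255(avg + (avg - tl) / 2);  /* A1..A4 + packus above */
    }
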
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8LEncDspInitSSE2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE2(void) {
+ VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_SSE2;
+ VP8LTransformColor = TransformColor_SSE2;
+ VP8LCollectColorBlueTransforms = CollectColorBlueTransforms_SSE2;
+ VP8LCollectColorRedTransforms = CollectColorRedTransforms_SSE2;
+ VP8LAddVector = AddVector_SSE2;
+ VP8LAddVectorEq = AddVectorEq_SSE2;
+#if !defined(DONT_USE_COMBINED_SHANNON_ENTROPY_SSE2_FUNC)
+ VP8LCombinedShannonEntropy = CombinedShannonEntropy_SSE2;
+#endif
+ VP8LVectorMismatch = VectorMismatch_SSE2;
+ VP8LBundleColorMap = BundleColorMap_SSE2;
+
+ VP8LPredictorsSub[0] = PredictorSub0_SSE2;
+ VP8LPredictorsSub[1] = PredictorSub1_SSE2;
+ VP8LPredictorsSub[2] = PredictorSub2_SSE2;
+ VP8LPredictorsSub[3] = PredictorSub3_SSE2;
+ VP8LPredictorsSub[4] = PredictorSub4_SSE2;
+ VP8LPredictorsSub[5] = PredictorSub5_SSE2;
+ VP8LPredictorsSub[6] = PredictorSub6_SSE2;
+ VP8LPredictorsSub[7] = PredictorSub7_SSE2;
+ VP8LPredictorsSub[8] = PredictorSub8_SSE2;
+ VP8LPredictorsSub[9] = PredictorSub9_SSE2;
+ VP8LPredictorsSub[10] = PredictorSub10_SSE2;
+ VP8LPredictorsSub[11] = PredictorSub11_SSE2;
+ VP8LPredictorsSub[12] = PredictorSub12_SSE2;
+ VP8LPredictorsSub[13] = PredictorSub13_SSE2;
+ VP8LPredictorsSub[14] = PredictorSub0_SSE2; // <- padding security sentinels
+ VP8LPredictorsSub[15] = PredictorSub0_SSE2;
+}
+
+#else // !WEBP_USE_SSE2
+
+WEBP_DSP_INIT_STUB(VP8LEncDspInitSSE2)
+
+#endif // WEBP_USE_SSE2
diff --git a/media/libwebp/dsp/lossless_enc_sse41.c b/media/libwebp/dsp/lossless_enc_sse41.c
new file mode 100644
index 0000000000..2c6bc5bb00
--- /dev/null
+++ b/media/libwebp/dsp/lossless_enc_sse41.c
@@ -0,0 +1,155 @@
+// Copyright 2015 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// SSE4.1 variant of methods for lossless encoder
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_SSE41)
+#include <assert.h>
+#include <smmintrin.h>
+#include "../dsp/lossless.h"
+
+// For sign-extended multiplying constants, pre-shifted by 5:
+#define CST_5b(X) (((int16_t)((uint16_t)(X) << 8)) >> 5)
+
+//------------------------------------------------------------------------------
+// Subtract-Green Transform
+
+static void SubtractGreenFromBlueAndRed_SSE41(uint32_t* argb_data,
+ int num_pixels) {
+ int i;
+ const __m128i kCstShuffle = _mm_set_epi8(-1, 13, -1, 13, -1, 9, -1, 9,
+ -1, 5, -1, 5, -1, 1, -1, 1);
+ for (i = 0; i + 4 <= num_pixels; i += 4) {
+ const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]);
+ const __m128i in_0g0g = _mm_shuffle_epi8(in, kCstShuffle);
+ const __m128i out = _mm_sub_epi8(in, in_0g0g);
+ _mm_storeu_si128((__m128i*)&argb_data[i], out);
+ }
+ // fallthrough and finish off with plain-C
+ if (i != num_pixels) {
+ VP8LSubtractGreenFromBlueAndRed_C(argb_data + i, num_pixels - i);
+ }
+}
+
+//------------------------------------------------------------------------------
+// Color Transform
+
+#define MK_CST_16(HI, LO) \
+ _mm_set1_epi32((int)(((uint32_t)(HI) << 16) | ((LO) & 0xffff)))
+
+static void CollectColorBlueTransforms_SSE41(const uint32_t* argb, int stride,
+ int tile_width, int tile_height,
+ int green_to_blue, int red_to_blue,
+ int histo[]) {
+ const __m128i mult =
+ MK_CST_16(CST_5b(red_to_blue) + 256, CST_5b(green_to_blue));
+ const __m128i perm =
+ _mm_setr_epi8(-1, 1, -1, 2, -1, 5, -1, 6, -1, 9, -1, 10, -1, 13, -1, 14);
+ if (tile_width >= 4) {
+ int y;
+ for (y = 0; y < tile_height; ++y) {
+ const uint32_t* const src = argb + y * stride;
+ const __m128i A1 = _mm_loadu_si128((const __m128i*)src);
+ const __m128i B1 = _mm_shuffle_epi8(A1, perm);
+ const __m128i C1 = _mm_mulhi_epi16(B1, mult);
+ const __m128i D1 = _mm_sub_epi16(A1, C1);
+ __m128i E = _mm_add_epi16(_mm_srli_epi32(D1, 16), D1);
+ int x;
+ for (x = 4; x + 4 <= tile_width; x += 4) {
+ const __m128i A2 = _mm_loadu_si128((const __m128i*)(src + x));
+ __m128i B2, C2, D2;
+ ++histo[_mm_extract_epi8(E, 0)];
+ B2 = _mm_shuffle_epi8(A2, perm);
+ ++histo[_mm_extract_epi8(E, 4)];
+ C2 = _mm_mulhi_epi16(B2, mult);
+ ++histo[_mm_extract_epi8(E, 8)];
+ D2 = _mm_sub_epi16(A2, C2);
+ ++histo[_mm_extract_epi8(E, 12)];
+ E = _mm_add_epi16(_mm_srli_epi32(D2, 16), D2);
+ }
+ ++histo[_mm_extract_epi8(E, 0)];
+ ++histo[_mm_extract_epi8(E, 4)];
+ ++histo[_mm_extract_epi8(E, 8)];
+ ++histo[_mm_extract_epi8(E, 12)];
+ }
+ }
+ {
+ const int left_over = tile_width & 3;
+ if (left_over > 0) {
+ VP8LCollectColorBlueTransforms_C(argb + tile_width - left_over, stride,
+ left_over, tile_height,
+ green_to_blue, red_to_blue, histo);
+ }
+ }
+}
+
+static void CollectColorRedTransforms_SSE41(const uint32_t* argb, int stride,
+ int tile_width, int tile_height,
+ int green_to_red, int histo[]) {
+ const __m128i mult = MK_CST_16(0, CST_5b(green_to_red));
+ const __m128i mask_g = _mm_set1_epi32(0x0000ff00);
+ if (tile_width >= 4) {
+ int y;
+ for (y = 0; y < tile_height; ++y) {
+ const uint32_t* const src = argb + y * stride;
+ const __m128i A1 = _mm_loadu_si128((const __m128i*)src);
+ const __m128i B1 = _mm_and_si128(A1, mask_g);
+ const __m128i C1 = _mm_madd_epi16(B1, mult);
+ __m128i D = _mm_sub_epi16(A1, C1);
+ int x;
+ for (x = 4; x + 4 <= tile_width; x += 4) {
+ const __m128i A2 = _mm_loadu_si128((const __m128i*)(src + x));
+ __m128i B2, C2;
+ ++histo[_mm_extract_epi8(D, 2)];
+ B2 = _mm_and_si128(A2, mask_g);
+ ++histo[_mm_extract_epi8(D, 6)];
+ C2 = _mm_madd_epi16(B2, mult);
+ ++histo[_mm_extract_epi8(D, 10)];
+ ++histo[_mm_extract_epi8(D, 14)];
+ D = _mm_sub_epi16(A2, C2);
+ }
+ ++histo[_mm_extract_epi8(D, 2)];
+ ++histo[_mm_extract_epi8(D, 6)];
+ ++histo[_mm_extract_epi8(D, 10)];
+ ++histo[_mm_extract_epi8(D, 14)];
+ }
+ }
+ {
+ const int left_over = tile_width & 3;
+ if (left_over > 0) {
+ VP8LCollectColorRedTransforms_C(argb + tile_width - left_over, stride,
+ left_over, tile_height, green_to_red,
+ histo);
+ }
+ }
+}
+
+#undef MK_CST_16
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8LEncDspInitSSE41(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE41(void) {
+ VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_SSE41;
+ VP8LCollectColorBlueTransforms = CollectColorBlueTransforms_SSE41;
+ VP8LCollectColorRedTransforms = CollectColorRedTransforms_SSE41;
+}
+
+#else // !WEBP_USE_SSE41
+
+WEBP_DSP_INIT_STUB(VP8LEncDspInitSSE41)
+
+#endif // WEBP_USE_SSE41
diff --git a/media/libwebp/dsp/lossless_mips_dsp_r2.c b/media/libwebp/dsp/lossless_mips_dsp_r2.c
new file mode 100644
index 0000000000..ec98834f84
--- /dev/null
+++ b/media/libwebp/dsp/lossless_mips_dsp_r2.c
@@ -0,0 +1,701 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Image transforms and color space conversion methods for lossless decoder.
+//
+// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
+// Jovan Zelincevic (jovan.zelincevic@imgtec.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_MIPS_DSP_R2)
+
+#include "../dsp/lossless.h"
+#include "../dsp/lossless_common.h"
+
+#define MAP_COLOR_FUNCS(FUNC_NAME, TYPE, GET_INDEX, GET_VALUE) \
+static void FUNC_NAME(const TYPE* src, \
+ const uint32_t* const color_map, \
+ TYPE* dst, int y_start, int y_end, \
+ int width) { \
+ int y; \
+ for (y = y_start; y < y_end; ++y) { \
+ int x; \
+ for (x = 0; x < (width >> 2); ++x) { \
+ int tmp1, tmp2, tmp3, tmp4; \
+ __asm__ volatile ( \
+ ".ifc " #TYPE ", uint8_t \n\t" \
+ "lbu %[tmp1], 0(%[src]) \n\t" \
+ "lbu %[tmp2], 1(%[src]) \n\t" \
+ "lbu %[tmp3], 2(%[src]) \n\t" \
+ "lbu %[tmp4], 3(%[src]) \n\t" \
+ "addiu %[src], %[src], 4 \n\t" \
+ ".endif \n\t" \
+ ".ifc " #TYPE ", uint32_t \n\t" \
+ "lw %[tmp1], 0(%[src]) \n\t" \
+ "lw %[tmp2], 4(%[src]) \n\t" \
+ "lw %[tmp3], 8(%[src]) \n\t" \
+ "lw %[tmp4], 12(%[src]) \n\t" \
+ "ext %[tmp1], %[tmp1], 8, 8 \n\t" \
+ "ext %[tmp2], %[tmp2], 8, 8 \n\t" \
+ "ext %[tmp3], %[tmp3], 8, 8 \n\t" \
+ "ext %[tmp4], %[tmp4], 8, 8 \n\t" \
+ "addiu %[src], %[src], 16 \n\t" \
+ ".endif \n\t" \
+ "sll %[tmp1], %[tmp1], 2 \n\t" \
+ "sll %[tmp2], %[tmp2], 2 \n\t" \
+ "sll %[tmp3], %[tmp3], 2 \n\t" \
+ "sll %[tmp4], %[tmp4], 2 \n\t" \
+ "lwx %[tmp1], %[tmp1](%[color_map]) \n\t" \
+ "lwx %[tmp2], %[tmp2](%[color_map]) \n\t" \
+ "lwx %[tmp3], %[tmp3](%[color_map]) \n\t" \
+ "lwx %[tmp4], %[tmp4](%[color_map]) \n\t" \
+ ".ifc " #TYPE ", uint8_t \n\t" \
+ "ext %[tmp1], %[tmp1], 8, 8 \n\t" \
+ "ext %[tmp2], %[tmp2], 8, 8 \n\t" \
+ "ext %[tmp3], %[tmp3], 8, 8 \n\t" \
+ "ext %[tmp4], %[tmp4], 8, 8 \n\t" \
+ "sb %[tmp1], 0(%[dst]) \n\t" \
+ "sb %[tmp2], 1(%[dst]) \n\t" \
+ "sb %[tmp3], 2(%[dst]) \n\t" \
+ "sb %[tmp4], 3(%[dst]) \n\t" \
+ "addiu %[dst], %[dst], 4 \n\t" \
+ ".endif \n\t" \
+ ".ifc " #TYPE ", uint32_t \n\t" \
+ "sw %[tmp1], 0(%[dst]) \n\t" \
+ "sw %[tmp2], 4(%[dst]) \n\t" \
+ "sw %[tmp3], 8(%[dst]) \n\t" \
+ "sw %[tmp4], 12(%[dst]) \n\t" \
+ "addiu %[dst], %[dst], 16 \n\t" \
+ ".endif \n\t" \
+ : [tmp1]"=&r"(tmp1), [tmp2]"=&r"(tmp2), [tmp3]"=&r"(tmp3), \
+ [tmp4]"=&r"(tmp4), [src]"+&r"(src), [dst]"+r"(dst) \
+ : [color_map]"r"(color_map) \
+ : "memory" \
+ ); \
+ } \
+ for (x = 0; x < (width & 3); ++x) { \
+ *dst++ = GET_VALUE(color_map[GET_INDEX(*src++)]); \
+ } \
+ } \
+}
+
+MAP_COLOR_FUNCS(MapARGB_MIPSdspR2, uint32_t, VP8GetARGBIndex, VP8GetARGBValue)
+MAP_COLOR_FUNCS(MapAlpha_MIPSdspR2, uint8_t, VP8GetAlphaIndex, VP8GetAlphaValue)
+
+#undef MAP_COLOR_FUNCS
+
+static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1,
+ uint32_t c2) {
+ int temp0, temp1, temp2, temp3, temp4, temp5;
+ __asm__ volatile (
+ "preceu.ph.qbr %[temp1], %[c0] \n\t"
+ "preceu.ph.qbl %[temp2], %[c0] \n\t"
+ "preceu.ph.qbr %[temp3], %[c1] \n\t"
+ "preceu.ph.qbl %[temp4], %[c1] \n\t"
+ "preceu.ph.qbr %[temp5], %[c2] \n\t"
+ "preceu.ph.qbl %[temp0], %[c2] \n\t"
+ "subq.ph %[temp3], %[temp3], %[temp5] \n\t"
+ "subq.ph %[temp4], %[temp4], %[temp0] \n\t"
+ "addq.ph %[temp1], %[temp1], %[temp3] \n\t"
+ "addq.ph %[temp2], %[temp2], %[temp4] \n\t"
+ "shll_s.ph %[temp1], %[temp1], 7 \n\t"
+ "shll_s.ph %[temp2], %[temp2], 7 \n\t"
+ "precrqu_s.qb.ph %[temp2], %[temp2], %[temp1] \n\t"
+ : [temp0]"=r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5)
+ : [c0]"r"(c0), [c1]"r"(c1), [c2]"r"(c2)
+ : "memory"
+ );
+ return temp2;
+}
+
+static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1,
+ uint32_t c2) {
+ int temp0, temp1, temp2, temp3, temp4, temp5;
+ __asm__ volatile (
+ "adduh.qb %[temp5], %[c0], %[c1] \n\t"
+ "preceu.ph.qbr %[temp3], %[c2] \n\t"
+ "preceu.ph.qbr %[temp1], %[temp5] \n\t"
+ "preceu.ph.qbl %[temp2], %[temp5] \n\t"
+ "preceu.ph.qbl %[temp4], %[c2] \n\t"
+ "subq.ph %[temp3], %[temp1], %[temp3] \n\t"
+ "subq.ph %[temp4], %[temp2], %[temp4] \n\t"
+ "shrl.ph %[temp5], %[temp3], 15 \n\t"
+ "shrl.ph %[temp0], %[temp4], 15 \n\t"
+ "addq.ph %[temp3], %[temp3], %[temp5] \n\t"
+ "addq.ph %[temp4], %[temp0], %[temp4] \n\t"
+ "shra.ph %[temp3], %[temp3], 1 \n\t"
+ "shra.ph %[temp4], %[temp4], 1 \n\t"
+ "addq.ph %[temp1], %[temp1], %[temp3] \n\t"
+ "addq.ph %[temp2], %[temp2], %[temp4] \n\t"
+ "shll_s.ph %[temp1], %[temp1], 7 \n\t"
+ "shll_s.ph %[temp2], %[temp2], 7 \n\t"
+ "precrqu_s.qb.ph %[temp1], %[temp2], %[temp1] \n\t"
+ : [temp0]"=r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=r"(temp4), [temp5]"=&r"(temp5)
+ : [c0]"r"(c0), [c1]"r"(c1), [c2]"r"(c2)
+ : "memory"
+ );
+ return temp1;
+}
+
+static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
+ int temp0, temp1, temp2, temp3, temp4, temp5;
+ __asm__ volatile (
+ "cmpgdu.lt.qb %[temp1], %[c], %[b] \n\t"
+ "pick.qb %[temp1], %[b], %[c] \n\t"
+ "pick.qb %[temp2], %[c], %[b] \n\t"
+ "cmpgdu.lt.qb %[temp4], %[c], %[a] \n\t"
+ "pick.qb %[temp4], %[a], %[c] \n\t"
+ "pick.qb %[temp5], %[c], %[a] \n\t"
+ "subu.qb %[temp3], %[temp1], %[temp2] \n\t"
+ "subu.qb %[temp0], %[temp4], %[temp5] \n\t"
+ "raddu.w.qb %[temp3], %[temp3] \n\t"
+ "raddu.w.qb %[temp0], %[temp0] \n\t"
+ "subu %[temp3], %[temp3], %[temp0] \n\t"
+ "slti %[temp0], %[temp3], 0x1 \n\t"
+ "movz %[a], %[b], %[temp0] \n\t"
+ : [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+ [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [temp0]"=&r"(temp0),
+ [a]"+&r"(a)
+ : [b]"r"(b), [c]"r"(c)
+ );
+ return a;
+}
+
+static WEBP_INLINE uint32_t Average2(uint32_t a0, uint32_t a1) {
+ __asm__ volatile (
+ "adduh.qb %[a0], %[a0], %[a1] \n\t"
+ : [a0]"+r"(a0)
+ : [a1]"r"(a1)
+ );
+ return a0;
+}
+
+static WEBP_INLINE uint32_t Average3(uint32_t a0, uint32_t a1, uint32_t a2) {
+ return Average2(Average2(a0, a2), a1);
+}
+
+static WEBP_INLINE uint32_t Average4(uint32_t a0, uint32_t a1,
+ uint32_t a2, uint32_t a3) {
+ return Average2(Average2(a0, a1), Average2(a2, a3));
+}
+
+static uint32_t Predictor5_MIPSdspR2(const uint32_t* const left,
+ const uint32_t* const top) {
+ return Average3(*left, top[0], top[1]);
+}
+
+static uint32_t Predictor6_MIPSdspR2(const uint32_t* const left,
+ const uint32_t* const top) {
+ return Average2(*left, top[-1]);
+}
+
+static uint32_t Predictor7_MIPSdspR2(const uint32_t* const left,
+ const uint32_t* const top) {
+ return Average2(*left, top[0]);
+}
+
+static uint32_t Predictor8_MIPSdspR2(const uint32_t* const left,
+ const uint32_t* const top) {
+ (void)left;
+ return Average2(top[-1], top[0]);
+}
+
+static uint32_t Predictor9_MIPSdspR2(const uint32_t* const left,
+ const uint32_t* const top) {
+ (void)left;
+ return Average2(top[0], top[1]);
+}
+
+static uint32_t Predictor10_MIPSdspR2(const uint32_t* const left,
+ const uint32_t* const top) {
+ return Average4(*left, top[-1], top[0], top[1]);
+}
+
+static uint32_t Predictor11_MIPSdspR2(const uint32_t* const left,
+ const uint32_t* const top) {
+ return Select(top[0], *left, top[-1]);
+}
+
+static uint32_t Predictor12_MIPSdspR2(const uint32_t* const left,
+ const uint32_t* const top) {
+ return ClampedAddSubtractFull(*left, top[0], top[-1]);
+}
+
+static uint32_t Predictor13_MIPSdspR2(const uint32_t* const left,
+ const uint32_t* const top) {
+ return ClampedAddSubtractHalf(*left, top[0], top[-1]);
+}
+
+// Add green to blue and red channels (i.e. perform the inverse transform of
+// 'subtract green').
+static void AddGreenToBlueAndRed_MIPSdspR2(const uint32_t* src, int num_pixels,
+ uint32_t* dst) {
+ uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+ const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
+ const uint32_t* const p_loop2_end = src + num_pixels;
+ __asm__ volatile (
+ ".set push \n\t"
+ ".set noreorder \n\t"
+ "beq %[src], %[p_loop1_end], 3f \n\t"
+ " nop \n\t"
+ "0: \n\t"
+ "lw %[temp0], 0(%[src]) \n\t"
+ "lw %[temp1], 4(%[src]) \n\t"
+ "lw %[temp2], 8(%[src]) \n\t"
+ "lw %[temp3], 12(%[src]) \n\t"
+ "ext %[temp4], %[temp0], 8, 8 \n\t"
+ "ext %[temp5], %[temp1], 8, 8 \n\t"
+ "ext %[temp6], %[temp2], 8, 8 \n\t"
+ "ext %[temp7], %[temp3], 8, 8 \n\t"
+ "addiu %[src], %[src], 16 \n\t"
+ "addiu %[dst], %[dst], 16 \n\t"
+ "replv.ph %[temp4], %[temp4] \n\t"
+ "replv.ph %[temp5], %[temp5] \n\t"
+ "replv.ph %[temp6], %[temp6] \n\t"
+ "replv.ph %[temp7], %[temp7] \n\t"
+ "addu.qb %[temp0], %[temp0], %[temp4] \n\t"
+ "addu.qb %[temp1], %[temp1], %[temp5] \n\t"
+ "addu.qb %[temp2], %[temp2], %[temp6] \n\t"
+ "addu.qb %[temp3], %[temp3], %[temp7] \n\t"
+ "sw %[temp0], -16(%[dst]) \n\t"
+ "sw %[temp1], -12(%[dst]) \n\t"
+ "sw %[temp2], -8(%[dst]) \n\t"
+ "bne %[src], %[p_loop1_end], 0b \n\t"
+ " sw %[temp3], -4(%[dst]) \n\t"
+ "3: \n\t"
+ "beq %[src], %[p_loop2_end], 2f \n\t"
+ " nop \n\t"
+ "1: \n\t"
+ "lw %[temp0], 0(%[src]) \n\t"
+ "addiu %[src], %[src], 4 \n\t"
+ "addiu %[dst], %[dst], 4 \n\t"
+ "ext %[temp4], %[temp0], 8, 8 \n\t"
+ "replv.ph %[temp4], %[temp4] \n\t"
+ "addu.qb %[temp0], %[temp0], %[temp4] \n\t"
+ "bne %[src], %[p_loop2_end], 1b \n\t"
+ " sw %[temp0], -4(%[dst]) \n\t"
+ "2: \n\t"
+ ".set pop \n\t"
+ : [dst]"+&r"(dst), [src]"+&r"(src), [temp0]"=&r"(temp0),
+ [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+ [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [temp6]"=&r"(temp6),
+ [temp7]"=&r"(temp7)
+ : [p_loop1_end]"r"(p_loop1_end), [p_loop2_end]"r"(p_loop2_end)
+ : "memory"
+ );
+}
+
+static void TransformColorInverse_MIPSdspR2(const VP8LMultipliers* const m,
+ const uint32_t* src, int num_pixels,
+ uint32_t* dst) {
+ int temp0, temp1, temp2, temp3, temp4, temp5;
+ uint32_t argb, argb1, new_red;
+ const uint32_t G_to_R = m->green_to_red_;
+ const uint32_t G_to_B = m->green_to_blue_;
+ const uint32_t R_to_B = m->red_to_blue_;
+ const uint32_t* const p_loop_end = src + (num_pixels & ~1);
+ __asm__ volatile (
+ ".set push \n\t"
+ ".set noreorder \n\t"
+ "beq %[src], %[p_loop_end], 1f \n\t"
+ " nop \n\t"
+ "replv.ph %[temp0], %[G_to_R] \n\t"
+ "replv.ph %[temp1], %[G_to_B] \n\t"
+ "replv.ph %[temp2], %[R_to_B] \n\t"
+ "shll.ph %[temp0], %[temp0], 8 \n\t"
+ "shll.ph %[temp1], %[temp1], 8 \n\t"
+ "shll.ph %[temp2], %[temp2], 8 \n\t"
+ "shra.ph %[temp0], %[temp0], 8 \n\t"
+ "shra.ph %[temp1], %[temp1], 8 \n\t"
+ "shra.ph %[temp2], %[temp2], 8 \n\t"
+ "0: \n\t"
+ "lw %[argb], 0(%[src]) \n\t"
+ "lw %[argb1], 4(%[src]) \n\t"
+ "sw %[argb], 0(%[dst]) \n\t"
+ "sw %[argb1], 4(%[dst]) \n\t"
+ "addiu %[src], %[src], 8 \n\t"
+ "addiu %[dst], %[dst], 8 \n\t"
+ "precrq.qb.ph %[temp3], %[argb], %[argb1] \n\t"
+ "preceu.ph.qbra %[temp3], %[temp3] \n\t"
+ "shll.ph %[temp3], %[temp3], 8 \n\t"
+ "shra.ph %[temp3], %[temp3], 8 \n\t"
+ "mul.ph %[temp5], %[temp3], %[temp0] \n\t"
+ "mul.ph %[temp3], %[temp3], %[temp1] \n\t"
+ "precrq.ph.w %[new_red], %[argb], %[argb1] \n\t"
+ "ins %[argb1], %[argb], 16, 16 \n\t"
+ "shra.ph %[temp5], %[temp5], 5 \n\t"
+ "shra.ph %[temp3], %[temp3], 5 \n\t"
+ "addu.ph %[new_red], %[new_red], %[temp5] \n\t"
+ "addu.ph %[argb1], %[argb1], %[temp3] \n\t"
+ "preceu.ph.qbra %[temp5], %[new_red] \n\t"
+ "shll.ph %[temp4], %[temp5], 8 \n\t"
+ "shra.ph %[temp4], %[temp4], 8 \n\t"
+ "mul.ph %[temp4], %[temp4], %[temp2] \n\t"
+ "sb %[temp5], -2(%[dst]) \n\t"
+ "sra %[temp5], %[temp5], 16 \n\t"
+ "shra.ph %[temp4], %[temp4], 5 \n\t"
+ "addu.ph %[argb1], %[argb1], %[temp4] \n\t"
+ "preceu.ph.qbra %[temp3], %[argb1] \n\t"
+ "sb %[temp5], -6(%[dst]) \n\t"
+ "sb %[temp3], -4(%[dst]) \n\t"
+ "sra %[temp3], %[temp3], 16 \n\t"
+ "bne %[src], %[p_loop_end], 0b \n\t"
+ " sb %[temp3], -8(%[dst]) \n\t"
+ "1: \n\t"
+ ".set pop \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [new_red]"=&r"(new_red), [argb]"=&r"(argb),
+ [argb1]"=&r"(argb1), [dst]"+&r"(dst), [src]"+&r"(src)
+ : [G_to_R]"r"(G_to_R), [R_to_B]"r"(R_to_B),
+ [G_to_B]"r"(G_to_B), [p_loop_end]"r"(p_loop_end)
+ : "memory", "hi", "lo"
+ );
+
+ // Fall-back to C-version for left-overs.
+ if (num_pixels & 1) VP8LTransformColorInverse_C(m, src, 1, dst);
+}
+
+static void ConvertBGRAToRGB_MIPSdspR2(const uint32_t* src,
+ int num_pixels, uint8_t* dst) {
+ int temp0, temp1, temp2, temp3;
+ const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
+ const uint32_t* const p_loop2_end = src + num_pixels;
+ __asm__ volatile (
+ ".set push \n\t"
+ ".set noreorder \n\t"
+ "beq %[src], %[p_loop1_end], 3f \n\t"
+ " nop \n\t"
+ "0: \n\t"
+ "lw %[temp3], 12(%[src]) \n\t"
+ "lw %[temp2], 8(%[src]) \n\t"
+ "lw %[temp1], 4(%[src]) \n\t"
+ "lw %[temp0], 0(%[src]) \n\t"
+ "ins %[temp3], %[temp2], 24, 8 \n\t"
+ "sll %[temp2], %[temp2], 8 \n\t"
+ "rotr %[temp3], %[temp3], 16 \n\t"
+ "ins %[temp2], %[temp1], 0, 16 \n\t"
+ "sll %[temp1], %[temp1], 8 \n\t"
+ "wsbh %[temp3], %[temp3] \n\t"
+ "balign %[temp0], %[temp1], 1 \n\t"
+ "wsbh %[temp2], %[temp2] \n\t"
+ "wsbh %[temp0], %[temp0] \n\t"
+ "usw %[temp3], 8(%[dst]) \n\t"
+ "rotr %[temp0], %[temp0], 16 \n\t"
+ "usw %[temp2], 4(%[dst]) \n\t"
+ "addiu %[src], %[src], 16 \n\t"
+ "usw %[temp0], 0(%[dst]) \n\t"
+ "bne %[src], %[p_loop1_end], 0b \n\t"
+ " addiu %[dst], %[dst], 12 \n\t"
+ "3: \n\t"
+ "beq %[src], %[p_loop2_end], 2f \n\t"
+ " nop \n\t"
+ "1: \n\t"
+ "lw %[temp0], 0(%[src]) \n\t"
+ "addiu %[src], %[src], 4 \n\t"
+ "wsbh %[temp1], %[temp0] \n\t"
+ "addiu %[dst], %[dst], 3 \n\t"
+ "ush %[temp1], -2(%[dst]) \n\t"
+ "sra %[temp0], %[temp0], 16 \n\t"
+ "bne %[src], %[p_loop2_end], 1b \n\t"
+ " sb %[temp0], -3(%[dst]) \n\t"
+ "2: \n\t"
+ ".set pop \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [dst]"+&r"(dst), [src]"+&r"(src)
+ : [p_loop1_end]"r"(p_loop1_end), [p_loop2_end]"r"(p_loop2_end)
+ : "memory"
+ );
+}
+
+static void ConvertBGRAToRGBA_MIPSdspR2(const uint32_t* src,
+ int num_pixels, uint8_t* dst) {
+ int temp0, temp1, temp2, temp3;
+ const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
+ const uint32_t* const p_loop2_end = src + num_pixels;
+ __asm__ volatile (
+ ".set push \n\t"
+ ".set noreorder \n\t"
+ "beq %[src], %[p_loop1_end], 3f \n\t"
+ " nop \n\t"
+ "0: \n\t"
+ "lw %[temp0], 0(%[src]) \n\t"
+ "lw %[temp1], 4(%[src]) \n\t"
+ "lw %[temp2], 8(%[src]) \n\t"
+ "lw %[temp3], 12(%[src]) \n\t"
+ "wsbh %[temp0], %[temp0] \n\t"
+ "wsbh %[temp1], %[temp1] \n\t"
+ "wsbh %[temp2], %[temp2] \n\t"
+ "wsbh %[temp3], %[temp3] \n\t"
+ "addiu %[src], %[src], 16 \n\t"
+ "balign %[temp0], %[temp0], 1 \n\t"
+ "balign %[temp1], %[temp1], 1 \n\t"
+ "balign %[temp2], %[temp2], 1 \n\t"
+ "balign %[temp3], %[temp3], 1 \n\t"
+ "usw %[temp0], 0(%[dst]) \n\t"
+ "usw %[temp1], 4(%[dst]) \n\t"
+ "usw %[temp2], 8(%[dst]) \n\t"
+ "usw %[temp3], 12(%[dst]) \n\t"
+ "bne %[src], %[p_loop1_end], 0b \n\t"
+ " addiu %[dst], %[dst], 16 \n\t"
+ "3: \n\t"
+ "beq %[src], %[p_loop2_end], 2f \n\t"
+ " nop \n\t"
+ "1: \n\t"
+ "lw %[temp0], 0(%[src]) \n\t"
+ "wsbh %[temp0], %[temp0] \n\t"
+ "addiu %[src], %[src], 4 \n\t"
+ "balign %[temp0], %[temp0], 1 \n\t"
+ "usw %[temp0], 0(%[dst]) \n\t"
+ "bne %[src], %[p_loop2_end], 1b \n\t"
+ " addiu %[dst], %[dst], 4 \n\t"
+ "2: \n\t"
+ ".set pop \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [dst]"+&r"(dst), [src]"+&r"(src)
+ : [p_loop1_end]"r"(p_loop1_end), [p_loop2_end]"r"(p_loop2_end)
+ : "memory"
+ );
+}
+
+static void ConvertBGRAToRGBA4444_MIPSdspR2(const uint32_t* src,
+ int num_pixels, uint8_t* dst) {
+ int temp0, temp1, temp2, temp3, temp4, temp5;
+ const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
+ const uint32_t* const p_loop2_end = src + num_pixels;
+ __asm__ volatile (
+ ".set push \n\t"
+ ".set noreorder \n\t"
+ "beq %[src], %[p_loop1_end], 3f \n\t"
+ " nop \n\t"
+ "0: \n\t"
+ "lw %[temp0], 0(%[src]) \n\t"
+ "lw %[temp1], 4(%[src]) \n\t"
+ "lw %[temp2], 8(%[src]) \n\t"
+ "lw %[temp3], 12(%[src]) \n\t"
+ "ext %[temp4], %[temp0], 28, 4 \n\t"
+ "ext %[temp5], %[temp0], 12, 4 \n\t"
+ "ins %[temp0], %[temp4], 0, 4 \n\t"
+ "ext %[temp4], %[temp1], 28, 4 \n\t"
+ "ins %[temp0], %[temp5], 16, 4 \n\t"
+ "ext %[temp5], %[temp1], 12, 4 \n\t"
+ "ins %[temp1], %[temp4], 0, 4 \n\t"
+ "ext %[temp4], %[temp2], 28, 4 \n\t"
+ "ins %[temp1], %[temp5], 16, 4 \n\t"
+ "ext %[temp5], %[temp2], 12, 4 \n\t"
+ "ins %[temp2], %[temp4], 0, 4 \n\t"
+ "ext %[temp4], %[temp3], 28, 4 \n\t"
+ "ins %[temp2], %[temp5], 16, 4 \n\t"
+ "ext %[temp5], %[temp3], 12, 4 \n\t"
+ "ins %[temp3], %[temp4], 0, 4 \n\t"
+ "precr.qb.ph %[temp1], %[temp1], %[temp0] \n\t"
+ "ins %[temp3], %[temp5], 16, 4 \n\t"
+ "addiu %[src], %[src], 16 \n\t"
+ "precr.qb.ph %[temp3], %[temp3], %[temp2] \n\t"
+#if (WEBP_SWAP_16BIT_CSP == 1)
+ "usw %[temp1], 0(%[dst]) \n\t"
+ "usw %[temp3], 4(%[dst]) \n\t"
+#else
+ "wsbh %[temp1], %[temp1] \n\t"
+ "wsbh %[temp3], %[temp3] \n\t"
+ "usw %[temp1], 0(%[dst]) \n\t"
+ "usw %[temp3], 4(%[dst]) \n\t"
+#endif
+ "bne %[src], %[p_loop1_end], 0b \n\t"
+ " addiu %[dst], %[dst], 8 \n\t"
+ "3: \n\t"
+ "beq %[src], %[p_loop2_end], 2f \n\t"
+ " nop \n\t"
+ "1: \n\t"
+ "lw %[temp0], 0(%[src]) \n\t"
+ "ext %[temp4], %[temp0], 28, 4 \n\t"
+ "ext %[temp5], %[temp0], 12, 4 \n\t"
+ "ins %[temp0], %[temp4], 0, 4 \n\t"
+ "ins %[temp0], %[temp5], 16, 4 \n\t"
+ "addiu %[src], %[src], 4 \n\t"
+ "precr.qb.ph %[temp0], %[temp0], %[temp0] \n\t"
+#if (WEBP_SWAP_16BIT_CSP == 1)
+ "ush %[temp0], 0(%[dst]) \n\t"
+#else
+ "wsbh %[temp0], %[temp0] \n\t"
+ "ush %[temp0], 0(%[dst]) \n\t"
+#endif
+ "bne %[src], %[p_loop2_end], 1b \n\t"
+ " addiu %[dst], %[dst], 2 \n\t"
+ "2: \n\t"
+ ".set pop \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [dst]"+&r"(dst), [src]"+&r"(src)
+ : [p_loop1_end]"r"(p_loop1_end), [p_loop2_end]"r"(p_loop2_end)
+ : "memory"
+ );
+}
+
+static void ConvertBGRAToRGB565_MIPSdspR2(const uint32_t* src,
+ int num_pixels, uint8_t* dst) {
+ int temp0, temp1, temp2, temp3, temp4, temp5;
+ const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
+ const uint32_t* const p_loop2_end = src + num_pixels;
+ __asm__ volatile (
+ ".set push \n\t"
+ ".set noreorder \n\t"
+ "beq %[src], %[p_loop1_end], 3f \n\t"
+ " nop \n\t"
+ "0: \n\t"
+ "lw %[temp0], 0(%[src]) \n\t"
+ "lw %[temp1], 4(%[src]) \n\t"
+ "lw %[temp2], 8(%[src]) \n\t"
+ "lw %[temp3], 12(%[src]) \n\t"
+ "ext %[temp4], %[temp0], 8, 16 \n\t"
+ "ext %[temp5], %[temp0], 5, 11 \n\t"
+ "ext %[temp0], %[temp0], 3, 5 \n\t"
+ "ins %[temp4], %[temp5], 0, 11 \n\t"
+ "ext %[temp5], %[temp1], 5, 11 \n\t"
+ "ins %[temp4], %[temp0], 0, 5 \n\t"
+ "ext %[temp0], %[temp1], 8, 16 \n\t"
+ "ext %[temp1], %[temp1], 3, 5 \n\t"
+ "ins %[temp0], %[temp5], 0, 11 \n\t"
+ "ext %[temp5], %[temp2], 5, 11 \n\t"
+ "ins %[temp0], %[temp1], 0, 5 \n\t"
+ "ext %[temp1], %[temp2], 8, 16 \n\t"
+ "ext %[temp2], %[temp2], 3, 5 \n\t"
+ "ins %[temp1], %[temp5], 0, 11 \n\t"
+ "ext %[temp5], %[temp3], 5, 11 \n\t"
+ "ins %[temp1], %[temp2], 0, 5 \n\t"
+ "ext %[temp2], %[temp3], 8, 16 \n\t"
+ "ext %[temp3], %[temp3], 3, 5 \n\t"
+ "ins %[temp2], %[temp5], 0, 11 \n\t"
+ "append %[temp0], %[temp4], 16 \n\t"
+ "ins %[temp2], %[temp3], 0, 5 \n\t"
+ "addiu %[src], %[src], 16 \n\t"
+ "append %[temp2], %[temp1], 16 \n\t"
+#if (WEBP_SWAP_16BIT_CSP == 1)
+ "usw %[temp0], 0(%[dst]) \n\t"
+ "usw %[temp2], 4(%[dst]) \n\t"
+#else
+ "wsbh %[temp0], %[temp0] \n\t"
+ "wsbh %[temp2], %[temp2] \n\t"
+ "usw %[temp0], 0(%[dst]) \n\t"
+ "usw %[temp2], 4(%[dst]) \n\t"
+#endif
+ "bne %[src], %[p_loop1_end], 0b \n\t"
+ " addiu %[dst], %[dst], 8 \n\t"
+ "3: \n\t"
+ "beq %[src], %[p_loop2_end], 2f \n\t"
+ " nop \n\t"
+ "1: \n\t"
+ "lw %[temp0], 0(%[src]) \n\t"
+ "ext %[temp4], %[temp0], 8, 16 \n\t"
+ "ext %[temp5], %[temp0], 5, 11 \n\t"
+ "ext %[temp0], %[temp0], 3, 5 \n\t"
+ "ins %[temp4], %[temp5], 0, 11 \n\t"
+ "addiu %[src], %[src], 4 \n\t"
+ "ins %[temp4], %[temp0], 0, 5 \n\t"
+#if (WEBP_SWAP_16BIT_CSP == 1)
+ "ush %[temp4], 0(%[dst]) \n\t"
+#else
+ "wsbh %[temp4], %[temp4] \n\t"
+ "ush %[temp4], 0(%[dst]) \n\t"
+#endif
+ "bne %[src], %[p_loop2_end], 1b \n\t"
+ " addiu %[dst], %[dst], 2 \n\t"
+ "2: \n\t"
+ ".set pop \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [dst]"+&r"(dst), [src]"+&r"(src)
+ : [p_loop1_end]"r"(p_loop1_end), [p_loop2_end]"r"(p_loop2_end)
+ : "memory"
+ );
+}
+
+static void ConvertBGRAToBGR_MIPSdspR2(const uint32_t* src,
+ int num_pixels, uint8_t* dst) {
+ int temp0, temp1, temp2, temp3;
+ const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
+ const uint32_t* const p_loop2_end = src + num_pixels;
+ __asm__ volatile (
+ ".set push \n\t"
+ ".set noreorder \n\t"
+ "beq %[src], %[p_loop1_end], 3f \n\t"
+ " nop \n\t"
+ "0: \n\t"
+ "lw %[temp0], 0(%[src]) \n\t"
+ "lw %[temp1], 4(%[src]) \n\t"
+ "lw %[temp2], 8(%[src]) \n\t"
+ "lw %[temp3], 12(%[src]) \n\t"
+ "ins %[temp0], %[temp1], 24, 8 \n\t"
+ "sra %[temp1], %[temp1], 8 \n\t"
+ "ins %[temp1], %[temp2], 16, 16 \n\t"
+ "sll %[temp2], %[temp2], 8 \n\t"
+ "balign %[temp3], %[temp2], 1 \n\t"
+ "addiu %[src], %[src], 16 \n\t"
+ "usw %[temp0], 0(%[dst]) \n\t"
+ "usw %[temp1], 4(%[dst]) \n\t"
+ "usw %[temp3], 8(%[dst]) \n\t"
+ "bne %[src], %[p_loop1_end], 0b \n\t"
+ " addiu %[dst], %[dst], 12 \n\t"
+ "3: \n\t"
+ "beq %[src], %[p_loop2_end], 2f \n\t"
+ " nop \n\t"
+ "1: \n\t"
+ "lw %[temp0], 0(%[src]) \n\t"
+ "addiu %[src], %[src], 4 \n\t"
+ "addiu %[dst], %[dst], 3 \n\t"
+ "ush %[temp0], -3(%[dst]) \n\t"
+ "sra %[temp0], %[temp0], 16 \n\t"
+ "bne %[src], %[p_loop2_end], 1b \n\t"
+ " sb %[temp0], -1(%[dst]) \n\t"
+ "2: \n\t"
+ ".set pop \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [dst]"+&r"(dst), [src]"+&r"(src)
+ : [p_loop1_end]"r"(p_loop1_end), [p_loop2_end]"r"(p_loop2_end)
+ : "memory"
+ );
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8LDspInitMIPSdspR2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitMIPSdspR2(void) {
+ VP8LMapColor32b = MapARGB_MIPSdspR2;
+ VP8LMapColor8b = MapAlpha_MIPSdspR2;
+
+ VP8LPredictors[5] = Predictor5_MIPSdspR2;
+ VP8LPredictors[6] = Predictor6_MIPSdspR2;
+ VP8LPredictors[7] = Predictor7_MIPSdspR2;
+ VP8LPredictors[8] = Predictor8_MIPSdspR2;
+ VP8LPredictors[9] = Predictor9_MIPSdspR2;
+ VP8LPredictors[10] = Predictor10_MIPSdspR2;
+ VP8LPredictors[11] = Predictor11_MIPSdspR2;
+ VP8LPredictors[12] = Predictor12_MIPSdspR2;
+ VP8LPredictors[13] = Predictor13_MIPSdspR2;
+
+ VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed_MIPSdspR2;
+ VP8LTransformColorInverse = TransformColorInverse_MIPSdspR2;
+
+ VP8LConvertBGRAToRGB = ConvertBGRAToRGB_MIPSdspR2;
+ VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA_MIPSdspR2;
+ VP8LConvertBGRAToRGBA4444 = ConvertBGRAToRGBA4444_MIPSdspR2;
+ VP8LConvertBGRAToRGB565 = ConvertBGRAToRGB565_MIPSdspR2;
+ VP8LConvertBGRAToBGR = ConvertBGRAToBGR_MIPSdspR2;
+}
+
+#else // !WEBP_USE_MIPS_DSP_R2
+
+WEBP_DSP_INIT_STUB(VP8LDspInitMIPSdspR2)
+
+#endif // WEBP_USE_MIPS_DSP_R2
diff --git a/media/libwebp/dsp/lossless_msa.c b/media/libwebp/dsp/lossless_msa.c
new file mode 100644
index 0000000000..16256ab57f
--- /dev/null
+++ b/media/libwebp/dsp/lossless_msa.c
@@ -0,0 +1,356 @@
+// Copyright 2016 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MSA variant of methods for lossless decoder
+//
+// Author: Prashant Patil (prashant.patil@imgtec.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_MSA)
+
+#include "../dsp/lossless.h"
+#include "../dsp/msa_macro.h"
+
+//------------------------------------------------------------------------------
+// Colorspace conversion functions
+
+#define CONVERT16_BGRA_XXX(psrc, pdst, m0, m1, m2) do { \
+ v16u8 src0, src1, src2, src3, dst0, dst1, dst2; \
+ LD_UB4(psrc, 16, src0, src1, src2, src3); \
+ VSHF_B2_UB(src0, src1, src1, src2, m0, m1, dst0, dst1); \
+ dst2 = VSHF_UB(src2, src3, m2); \
+ ST_UB2(dst0, dst1, pdst, 16); \
+ ST_UB(dst2, pdst + 32); \
+} while (0)
+
+#define CONVERT12_BGRA_XXX(psrc, pdst, m0, m1, m2) do { \
+ uint32_t pix_w; \
+ v16u8 src0, src1, src2, dst0, dst1, dst2; \
+ LD_UB3(psrc, 16, src0, src1, src2); \
+ VSHF_B2_UB(src0, src1, src1, src2, m0, m1, dst0, dst1); \
+ dst2 = VSHF_UB(src2, src2, m2); \
+ ST_UB2(dst0, dst1, pdst, 16); \
+ pix_w = __msa_copy_s_w((v4i32)dst2, 0); \
+ SW(pix_w, pdst + 32); \
+} while (0)
+
+#define CONVERT8_BGRA_XXX(psrc, pdst, m0, m1) do { \
+ uint64_t pix_d; \
+ v16u8 src0, src1, src2 = { 0 }, dst0, dst1; \
+ LD_UB2(psrc, 16, src0, src1); \
+ VSHF_B2_UB(src0, src1, src1, src2, m0, m1, dst0, dst1); \
+ ST_UB(dst0, pdst); \
+ pix_d = __msa_copy_s_d((v2i64)dst1, 0); \
+ SD(pix_d, pdst + 16); \
+} while (0)
+
+#define CONVERT4_BGRA_XXX(psrc, pdst, m) do { \
+ const v16u8 src0 = LD_UB(psrc); \
+ const v16u8 dst0 = VSHF_UB(src0, src0, m); \
+ uint64_t pix_d = __msa_copy_s_d((v2i64)dst0, 0); \
+ uint32_t pix_w = __msa_copy_s_w((v4i32)dst0, 2); \
+ SD(pix_d, pdst + 0); \
+ SW(pix_w, pdst + 8); \
+} while (0)
+
+#define CONVERT1_BGRA_BGR(psrc, pdst) do { \
+ const int32_t b = (psrc)[0]; \
+ const int32_t g = (psrc)[1]; \
+ const int32_t r = (psrc)[2]; \
+ (pdst)[0] = b; \
+ (pdst)[1] = g; \
+ (pdst)[2] = r; \
+} while (0)
+
+#define CONVERT1_BGRA_RGB(psrc, pdst) do { \
+ const int32_t b = (psrc)[0]; \
+ const int32_t g = (psrc)[1]; \
+ const int32_t r = (psrc)[2]; \
+ (pdst)[0] = r; \
+ (pdst)[1] = g; \
+ (pdst)[2] = b; \
+} while (0)
+
+#define TRANSFORM_COLOR_INVERSE_8(src0, src1, dst0, dst1, \
+ c0, c1, mask0, mask1) do { \
+ v8i16 g0, g1, t0, t1, t2, t3; \
+ v4i32 t4, t5; \
+ VSHF_B2_SH(src0, src0, src1, src1, mask0, mask0, g0, g1); \
+ DOTP_SB2_SH(g0, g1, c0, c0, t0, t1); \
+ SRAI_H2_SH(t0, t1, 5); \
+ t0 = __msa_addv_h(t0, (v8i16)src0); \
+ t1 = __msa_addv_h(t1, (v8i16)src1); \
+ t4 = __msa_srli_w((v4i32)t0, 16); \
+ t5 = __msa_srli_w((v4i32)t1, 16); \
+ DOTP_SB2_SH(t4, t5, c1, c1, t2, t3); \
+ SRAI_H2_SH(t2, t3, 5); \
+ ADD2(t0, t2, t1, t3, t0, t1); \
+ VSHF_B2_UB(src0, t0, src1, t1, mask1, mask1, dst0, dst1); \
+} while (0)
+
+#define TRANSFORM_COLOR_INVERSE_4(src, dst, c0, c1, mask0, mask1) do { \
+ const v16i8 g0 = VSHF_SB(src, src, mask0); \
+ v8i16 t0 = __msa_dotp_s_h(c0, g0); \
+ v8i16 t1; \
+ v4i32 t2; \
+ t0 = SRAI_H(t0, 5); \
+ t0 = __msa_addv_h(t0, (v8i16)src); \
+ t2 = __msa_srli_w((v4i32)t0, 16); \
+ t1 = __msa_dotp_s_h(c1, (v16i8)t2); \
+ t1 = SRAI_H(t1, 5); \
+ t0 = t0 + t1; \
+ dst = VSHF_UB(src, t0, mask1); \
+} while (0)
+
+static void ConvertBGRAToRGBA_MSA(const uint32_t* src,
+ int num_pixels, uint8_t* dst) {
+ int i;
+ const uint8_t* ptemp_src = (const uint8_t*)src;
+ uint8_t* ptemp_dst = (uint8_t*)dst;
+ v16u8 src0, dst0;
+ const v16u8 mask = { 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15 };
+
+ while (num_pixels >= 8) {
+ v16u8 src1, dst1;
+ LD_UB2(ptemp_src, 16, src0, src1);
+ VSHF_B2_UB(src0, src0, src1, src1, mask, mask, dst0, dst1);
+ ST_UB2(dst0, dst1, ptemp_dst, 16);
+ ptemp_src += 32;
+ ptemp_dst += 32;
+ num_pixels -= 8;
+ }
+ if (num_pixels > 0) {
+ if (num_pixels >= 4) {
+ src0 = LD_UB(ptemp_src);
+ dst0 = VSHF_UB(src0, src0, mask);
+ ST_UB(dst0, ptemp_dst);
+ ptemp_src += 16;
+ ptemp_dst += 16;
+ num_pixels -= 4;
+ }
+ for (i = 0; i < num_pixels; i++) {
+ const uint8_t b = ptemp_src[0];
+ const uint8_t g = ptemp_src[1];
+ const uint8_t r = ptemp_src[2];
+ const uint8_t a = ptemp_src[3];
+ ptemp_dst[0] = r;
+ ptemp_dst[1] = g;
+ ptemp_dst[2] = b;
+ ptemp_dst[3] = a;
+ ptemp_src += 4;
+ ptemp_dst += 4;
+ }
+ }
+}
+
+static void ConvertBGRAToBGR_MSA(const uint32_t* src,
+ int num_pixels, uint8_t* dst) {
+ const uint8_t* ptemp_src = (const uint8_t*)src;
+ uint8_t* ptemp_dst = (uint8_t*)dst;
+ const v16u8 mask0 = { 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14,
+ 16, 17, 18, 20 };
+ const v16u8 mask1 = { 5, 6, 8, 9, 10, 12, 13, 14, 16, 17, 18, 20,
+ 21, 22, 24, 25 };
+ const v16u8 mask2 = { 10, 12, 13, 14, 16, 17, 18, 20, 21, 22, 24, 25,
+ 26, 28, 29, 30 };
+
+ while (num_pixels >= 16) {
+ CONVERT16_BGRA_XXX(ptemp_src, ptemp_dst, mask0, mask1, mask2);
+ ptemp_src += 64;
+ ptemp_dst += 48;
+ num_pixels -= 16;
+ }
+ if (num_pixels > 0) {
+ if (num_pixels >= 12) {
+ CONVERT12_BGRA_XXX(ptemp_src, ptemp_dst, mask0, mask1, mask2);
+ ptemp_src += 48;
+ ptemp_dst += 36;
+ num_pixels -= 12;
+ } else if (num_pixels >= 8) {
+ CONVERT8_BGRA_XXX(ptemp_src, ptemp_dst, mask0, mask1);
+ ptemp_src += 32;
+ ptemp_dst += 24;
+ num_pixels -= 8;
+ } else if (num_pixels >= 4) {
+ CONVERT4_BGRA_XXX(ptemp_src, ptemp_dst, mask0);
+ ptemp_src += 16;
+ ptemp_dst += 12;
+ num_pixels -= 4;
+ }
+ if (num_pixels == 3) {
+ CONVERT1_BGRA_BGR(ptemp_src + 0, ptemp_dst + 0);
+ CONVERT1_BGRA_BGR(ptemp_src + 4, ptemp_dst + 3);
+ CONVERT1_BGRA_BGR(ptemp_src + 8, ptemp_dst + 6);
+ } else if (num_pixels == 2) {
+ CONVERT1_BGRA_BGR(ptemp_src + 0, ptemp_dst + 0);
+ CONVERT1_BGRA_BGR(ptemp_src + 4, ptemp_dst + 3);
+ } else if (num_pixels == 1) {
+ CONVERT1_BGRA_BGR(ptemp_src, ptemp_dst);
+ }
+ }
+}
+
+static void ConvertBGRAToRGB_MSA(const uint32_t* src,
+ int num_pixels, uint8_t* dst) {
+ const uint8_t* ptemp_src = (const uint8_t*)src;
+ uint8_t* ptemp_dst = (uint8_t*)dst;
+ const v16u8 mask0 = { 2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12,
+ 18, 17, 16, 22 };
+ const v16u8 mask1 = { 5, 4, 10, 9, 8, 14, 13, 12, 18, 17, 16, 22,
+ 21, 20, 26, 25 };
+ const v16u8 mask2 = { 8, 14, 13, 12, 18, 17, 16, 22, 21, 20, 26, 25,
+ 24, 30, 29, 28 };
+
+ while (num_pixels >= 16) {
+ CONVERT16_BGRA_XXX(ptemp_src, ptemp_dst, mask0, mask1, mask2);
+ ptemp_src += 64;
+ ptemp_dst += 48;
+ num_pixels -= 16;
+ }
+ if (num_pixels) {
+ if (num_pixels >= 12) {
+ CONVERT12_BGRA_XXX(ptemp_src, ptemp_dst, mask0, mask1, mask2);
+ ptemp_src += 48;
+ ptemp_dst += 36;
+ num_pixels -= 12;
+ } else if (num_pixels >= 8) {
+ CONVERT8_BGRA_XXX(ptemp_src, ptemp_dst, mask0, mask1);
+ ptemp_src += 32;
+ ptemp_dst += 24;
+ num_pixels -= 8;
+ } else if (num_pixels >= 4) {
+ CONVERT4_BGRA_XXX(ptemp_src, ptemp_dst, mask0);
+ ptemp_src += 16;
+ ptemp_dst += 12;
+ num_pixels -= 4;
+ }
+ if (num_pixels == 3) {
+ CONVERT1_BGRA_RGB(ptemp_src + 0, ptemp_dst + 0);
+ CONVERT1_BGRA_RGB(ptemp_src + 4, ptemp_dst + 3);
+ CONVERT1_BGRA_RGB(ptemp_src + 8, ptemp_dst + 6);
+ } else if (num_pixels == 2) {
+ CONVERT1_BGRA_RGB(ptemp_src + 0, ptemp_dst + 0);
+ CONVERT1_BGRA_RGB(ptemp_src + 4, ptemp_dst + 3);
+ } else if (num_pixels == 1) {
+ CONVERT1_BGRA_RGB(ptemp_src, ptemp_dst);
+ }
+ }
+}
+
+static void AddGreenToBlueAndRed_MSA(const uint32_t* const src, int num_pixels,
+ uint32_t* dst) {
+ int i;
+ const uint8_t* in = (const uint8_t*)src;
+ uint8_t* out = (uint8_t*)dst;
+ v16u8 src0, dst0, tmp0;
+ const v16u8 mask = { 1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255,
+ 13, 255, 13, 255 };
+
+ while (num_pixels >= 8) {
+ v16u8 src1, dst1, tmp1;
+ LD_UB2(in, 16, src0, src1);
+ VSHF_B2_UB(src0, src1, src1, src0, mask, mask, tmp0, tmp1);
+ ADD2(src0, tmp0, src1, tmp1, dst0, dst1);
+ ST_UB2(dst0, dst1, out, 16);
+ in += 32;
+ out += 32;
+ num_pixels -= 8;
+ }
+ if (num_pixels > 0) {
+ if (num_pixels >= 4) {
+ src0 = LD_UB(in);
+ tmp0 = VSHF_UB(src0, src0, mask);
+ dst0 = src0 + tmp0;
+ ST_UB(dst0, out);
+ in += 16;
+ out += 16;
+ num_pixels -= 4;
+ }
+ for (i = 0; i < num_pixels; i++) {
+ const uint8_t b = in[0];
+ const uint8_t g = in[1];
+ const uint8_t r = in[2];
+ out[0] = (b + g) & 0xff;
+ out[1] = g;
+ out[2] = (r + g) & 0xff;
+ out[3] = in[3]; // alpha is unchanged
+ in += 4;
+ out += 4;
+ }
+ }
+}
+
+static void TransformColorInverse_MSA(const VP8LMultipliers* const m,
+ const uint32_t* src, int num_pixels,
+ uint32_t* dst) {
+ v16u8 src0, dst0;
+ const v16i8 g2br = (v16i8)__msa_fill_w(m->green_to_blue_ |
+ (m->green_to_red_ << 16));
+ const v16i8 r2b = (v16i8)__msa_fill_w(m->red_to_blue_);
+ const v16u8 mask0 = { 1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255,
+ 13, 255, 13, 255 };
+ const v16u8 mask1 = { 16, 1, 18, 3, 20, 5, 22, 7, 24, 9, 26, 11,
+ 28, 13, 30, 15 };
+
+ while (num_pixels >= 8) {
+ v16u8 src1, dst1;
+ LD_UB2(src, 4, src0, src1);
+ TRANSFORM_COLOR_INVERSE_8(src0, src1, dst0, dst1, g2br, r2b, mask0, mask1);
+ ST_UB2(dst0, dst1, dst, 4);
+ src += 8;
+ dst += 8;
+ num_pixels -= 8;
+ }
+ if (num_pixels > 0) {
+ if (num_pixels >= 4) {
+ src0 = LD_UB(src);
+ TRANSFORM_COLOR_INVERSE_4(src0, dst0, g2br, r2b, mask0, mask1);
+ ST_UB(dst0, dst);
+ src += 4;
+ dst += 4;
+ num_pixels -= 4;
+ }
+ if (num_pixels > 0) {
+ src0 = LD_UB(src);
+ TRANSFORM_COLOR_INVERSE_4(src0, dst0, g2br, r2b, mask0, mask1);
+ if (num_pixels == 3) {
+ const uint64_t pix_d = __msa_copy_s_d((v2i64)dst0, 0);
+ const uint32_t pix_w = __msa_copy_s_w((v4i32)dst0, 2);
+ SD(pix_d, dst + 0);
+ SW(pix_w, dst + 2);
+ } else if (num_pixels == 2) {
+ const uint64_t pix_d = __msa_copy_s_d((v2i64)dst0, 0);
+ SD(pix_d, dst);
+ } else {
+ const uint32_t pix_w = __msa_copy_s_w((v4i32)dst0, 0);
+ SW(pix_w, dst);
+ }
+ }
+ }
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8LDspInitMSA(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitMSA(void) {
+ VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA_MSA;
+ VP8LConvertBGRAToBGR = ConvertBGRAToBGR_MSA;
+ VP8LConvertBGRAToRGB = ConvertBGRAToRGB_MSA;
+
+ VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed_MSA;
+ VP8LTransformColorInverse = TransformColorInverse_MSA;
+}
+
+#else // !WEBP_USE_MSA
+
+WEBP_DSP_INIT_STUB(VP8LDspInitMSA)
+
+#endif // WEBP_USE_MSA
diff --git a/media/libwebp/dsp/lossless_neon.c b/media/libwebp/dsp/lossless_neon.c
index a7bf47f3c4..2122e46f7a 100644
--- a/media/libwebp/dsp/lossless_neon.c
+++ b/media/libwebp/dsp/lossless_neon.c
@@ -188,17 +188,21 @@ static WEBP_INLINE uint32_t Average3_NEON(uint32_t a0, uint32_t a1,
return avg;
}
-static uint32_t Predictor5_NEON(uint32_t left, const uint32_t* const top) {
- return Average3_NEON(left, top[0], top[1]);
+static uint32_t Predictor5_NEON(const uint32_t* const left,
+ const uint32_t* const top) {
+ return Average3_NEON(*left, top[0], top[1]);
}
-static uint32_t Predictor6_NEON(uint32_t left, const uint32_t* const top) {
- return Average2_NEON(left, top[-1]);
+static uint32_t Predictor6_NEON(const uint32_t* const left,
+ const uint32_t* const top) {
+ return Average2_NEON(*left, top[-1]);
}
-static uint32_t Predictor7_NEON(uint32_t left, const uint32_t* const top) {
- return Average2_NEON(left, top[0]);
+static uint32_t Predictor7_NEON(const uint32_t* const left,
+ const uint32_t* const top) {
+ return Average2_NEON(*left, top[0]);
}
-static uint32_t Predictor13_NEON(uint32_t left, const uint32_t* const top) {
- return ClampedAddSubtractHalf_NEON(left, top[0], top[-1]);
+static uint32_t Predictor13_NEON(const uint32_t* const left,
+ const uint32_t* const top) {
+ return ClampedAddSubtractHalf_NEON(*left, top[0], top[-1]);
}
// Batch versions of those functions.
diff --git a/media/libwebp/dsp/lossless_sse2.c b/media/libwebp/dsp/lossless_sse2.c
index c40fcfb769..03796493de 100644
--- a/media/libwebp/dsp/lossless_sse2.c
+++ b/media/libwebp/dsp/lossless_sse2.c
@@ -18,7 +18,6 @@
#include "../dsp/common_sse2.h"
#include "../dsp/lossless.h"
#include "../dsp/lossless_common.h"
-#include <assert.h>
#include <emmintrin.h>
//------------------------------------------------------------------------------
@@ -139,42 +138,51 @@ static WEBP_INLINE uint32_t Average4_SSE2(uint32_t a0, uint32_t a1,
return output;
}
-static uint32_t Predictor5_SSE2(uint32_t left, const uint32_t* const top) {
- const uint32_t pred = Average3_SSE2(left, top[0], top[1]);
+static uint32_t Predictor5_SSE2(const uint32_t* const left,
+ const uint32_t* const top) {
+ const uint32_t pred = Average3_SSE2(*left, top[0], top[1]);
return pred;
}
-static uint32_t Predictor6_SSE2(uint32_t left, const uint32_t* const top) {
- const uint32_t pred = Average2_SSE2(left, top[-1]);
+static uint32_t Predictor6_SSE2(const uint32_t* const left,
+ const uint32_t* const top) {
+ const uint32_t pred = Average2_SSE2(*left, top[-1]);
return pred;
}
-static uint32_t Predictor7_SSE2(uint32_t left, const uint32_t* const top) {
- const uint32_t pred = Average2_SSE2(left, top[0]);
+static uint32_t Predictor7_SSE2(const uint32_t* const left,
+ const uint32_t* const top) {
+ const uint32_t pred = Average2_SSE2(*left, top[0]);
return pred;
}
-static uint32_t Predictor8_SSE2(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor8_SSE2(const uint32_t* const left,
+ const uint32_t* const top) {
const uint32_t pred = Average2_SSE2(top[-1], top[0]);
(void)left;
return pred;
}
-static uint32_t Predictor9_SSE2(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor9_SSE2(const uint32_t* const left,
+ const uint32_t* const top) {
const uint32_t pred = Average2_SSE2(top[0], top[1]);
(void)left;
return pred;
}
-static uint32_t Predictor10_SSE2(uint32_t left, const uint32_t* const top) {
- const uint32_t pred = Average4_SSE2(left, top[-1], top[0], top[1]);
+static uint32_t Predictor10_SSE2(const uint32_t* const left,
+ const uint32_t* const top) {
+ const uint32_t pred = Average4_SSE2(*left, top[-1], top[0], top[1]);
return pred;
}
-static uint32_t Predictor11_SSE2(uint32_t left, const uint32_t* const top) {
- const uint32_t pred = Select_SSE2(top[0], left, top[-1]);
+static uint32_t Predictor11_SSE2(const uint32_t* const left,
+ const uint32_t* const top) {
+ const uint32_t pred = Select_SSE2(top[0], *left, top[-1]);
return pred;
}
-static uint32_t Predictor12_SSE2(uint32_t left, const uint32_t* const top) {
- const uint32_t pred = ClampedAddSubtractFull_SSE2(left, top[0], top[-1]);
+static uint32_t Predictor12_SSE2(const uint32_t* const left,
+ const uint32_t* const top) {
+ const uint32_t pred = ClampedAddSubtractFull_SSE2(*left, top[0], top[-1]);
return pred;
}
-static uint32_t Predictor13_SSE2(uint32_t left, const uint32_t* const top) {
- const uint32_t pred = ClampedAddSubtractHalf_SSE2(left, top[0], top[-1]);
+static uint32_t Predictor13_SSE2(const uint32_t* const left,
+ const uint32_t* const top) {
+ const uint32_t pred = ClampedAddSubtractHalf_SSE2(*left, top[0], top[-1]);
return pred;
}
@@ -191,8 +199,9 @@ static void PredictorAdd0_SSE2(const uint32_t* in, const uint32_t* upper,
_mm_storeu_si128((__m128i*)&out[i], res);
}
if (i != num_pixels) {
- VP8LPredictorsAdd_C[0](in + i, upper + i, num_pixels - i, out + i);
+ VP8LPredictorsAdd_C[0](in + i, NULL, num_pixels - i, out + i);
}
+ (void)upper;
}
// Predictor1: left.
diff --git a/media/libwebp/dsp/lossless_sse41.c b/media/libwebp/dsp/lossless_sse41.c
new file mode 100644
index 0000000000..3308ac31ee
--- /dev/null
+++ b/media/libwebp/dsp/lossless_sse41.c
@@ -0,0 +1,132 @@
+// Copyright 2021 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// SSE41 variant of methods for lossless decoder
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_SSE41)
+
+#include "../dsp/common_sse41.h"
+#include "../dsp/lossless.h"
+#include "../dsp/lossless_common.h"
+
+//------------------------------------------------------------------------------
+// Color-space conversion functions
+
+static void TransformColorInverse_SSE41(const VP8LMultipliers* const m,
+ const uint32_t* const src,
+ int num_pixels, uint32_t* dst) {
+// sign-extended multiplying constants, pre-shifted by 5.
+#define CST(X) (((int16_t)(m->X << 8)) >> 5) // sign-extend
+ const __m128i mults_rb = _mm_set1_epi32((uint32_t)CST(green_to_red_) << 16 |
+ (CST(green_to_blue_) & 0xffff));
+ const __m128i mults_b2 = _mm_set1_epi32(CST(red_to_blue_));
+#undef CST
+ const __m128i mask_ag = _mm_set1_epi32(0xff00ff00);
+ const __m128i perm1 = _mm_setr_epi8(-1, 1, -1, 1, -1, 5, -1, 5,
+ -1, 9, -1, 9, -1, 13, -1, 13);
+ const __m128i perm2 = _mm_setr_epi8(-1, 2, -1, -1, -1, 6, -1, -1,
+ -1, 10, -1, -1, -1, 14, -1, -1);
+ int i;
+ for (i = 0; i + 4 <= num_pixels; i += 4) {
+ const __m128i A = _mm_loadu_si128((const __m128i*)(src + i));
+ const __m128i B = _mm_shuffle_epi8(A, perm1); // argb -> g0g0
+ const __m128i C = _mm_mulhi_epi16(B, mults_rb);
+ const __m128i D = _mm_add_epi8(A, C);
+ const __m128i E = _mm_shuffle_epi8(D, perm2);
+ const __m128i F = _mm_mulhi_epi16(E, mults_b2);
+ const __m128i G = _mm_add_epi8(D, F);
+ const __m128i out = _mm_blendv_epi8(G, A, mask_ag);
+ _mm_storeu_si128((__m128i*)&dst[i], out);
+ }
+ // Fall-back to C-version for left-overs.
+ if (i != num_pixels) {
+ VP8LTransformColorInverse_C(m, src + i, num_pixels - i, dst + i);
+ }
+}
+
+//------------------------------------------------------------------------------
+
+#define ARGB_TO_RGB_SSE41 do { \
+ while (num_pixels >= 16) { \
+ const __m128i in0 = _mm_loadu_si128(in + 0); \
+ const __m128i in1 = _mm_loadu_si128(in + 1); \
+ const __m128i in2 = _mm_loadu_si128(in + 2); \
+ const __m128i in3 = _mm_loadu_si128(in + 3); \
+ const __m128i a0 = _mm_shuffle_epi8(in0, perm0); \
+ const __m128i a1 = _mm_shuffle_epi8(in1, perm1); \
+ const __m128i a2 = _mm_shuffle_epi8(in2, perm2); \
+ const __m128i a3 = _mm_shuffle_epi8(in3, perm3); \
+ const __m128i b0 = _mm_blend_epi16(a0, a1, 0xc0); \
+ const __m128i b1 = _mm_blend_epi16(a1, a2, 0xf0); \
+ const __m128i b2 = _mm_blend_epi16(a2, a3, 0xfc); \
+ _mm_storeu_si128(out + 0, b0); \
+ _mm_storeu_si128(out + 1, b1); \
+ _mm_storeu_si128(out + 2, b2); \
+ in += 4; \
+ out += 3; \
+ num_pixels -= 16; \
+ } \
+} while (0)
+
+static void ConvertBGRAToRGB_SSE41(const uint32_t* src, int num_pixels,
+ uint8_t* dst) {
+ const __m128i* in = (const __m128i*)src;
+ __m128i* out = (__m128i*)dst;
+ const __m128i perm0 = _mm_setr_epi8(2, 1, 0, 6, 5, 4, 10, 9,
+ 8, 14, 13, 12, -1, -1, -1, -1);
+ const __m128i perm1 = _mm_shuffle_epi32(perm0, 0x39);
+ const __m128i perm2 = _mm_shuffle_epi32(perm0, 0x4e);
+ const __m128i perm3 = _mm_shuffle_epi32(perm0, 0x93);
+
+ ARGB_TO_RGB_SSE41;
+
+ // left-overs
+ if (num_pixels > 0) {
+ VP8LConvertBGRAToRGB_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
+ }
+}
+
+static void ConvertBGRAToBGR_SSE41(const uint32_t* src,
+ int num_pixels, uint8_t* dst) {
+ const __m128i* in = (const __m128i*)src;
+ __m128i* out = (__m128i*)dst;
+ const __m128i perm0 = _mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 10,
+ 12, 13, 14, -1, -1, -1, -1);
+ const __m128i perm1 = _mm_shuffle_epi32(perm0, 0x39);
+ const __m128i perm2 = _mm_shuffle_epi32(perm0, 0x4e);
+ const __m128i perm3 = _mm_shuffle_epi32(perm0, 0x93);
+
+ ARGB_TO_RGB_SSE41;
+
+ // left-overs
+ if (num_pixels > 0) {
+ VP8LConvertBGRAToBGR_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
+ }
+}
+
+#undef ARGB_TO_RGB_SSE41
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8LDspInitSSE41(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitSSE41(void) {
+ VP8LTransformColorInverse = TransformColorInverse_SSE41;
+ VP8LConvertBGRAToRGB = ConvertBGRAToRGB_SSE41;
+ VP8LConvertBGRAToBGR = ConvertBGRAToBGR_SSE41;
+}
+
+#else // !WEBP_USE_SSE41
+
+WEBP_DSP_INIT_STUB(VP8LDspInitSSE41)
+
+#endif // WEBP_USE_SSE41
diff --git a/media/libwebp/dsp/moz.build b/media/libwebp/dsp/moz.build
index c00d5e1401..6e2c9deb2c 100644
--- a/media/libwebp/dsp/moz.build
+++ b/media/libwebp/dsp/moz.build
@@ -1,4 +1,5 @@
# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*-
+# vim: set filetype=python:
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
@@ -8,53 +9,103 @@ with Files('**'):
SOURCES += [
'alpha_processing.c',
- 'alpha_processing_neon.c',
- 'alpha_processing_sse2.c',
- 'alpha_processing_sse41.c',
+ 'cost.c',
'dec.c',
'dec_clip_tables.c',
- 'dec_neon.c',
- 'dec_sse2.c',
- 'dec_sse41.c',
+ 'enc.c',
'filters.c',
- 'filters_neon.c',
- 'filters_sse2.c',
'lossless.c',
- 'lossless_neon.c',
- 'lossless_sse2.c',
+ 'lossless_enc.c',
'rescaler.c',
- 'rescaler_neon.c',
- 'rescaler_sse2.c',
+ 'ssim.c',
'upsampling.c',
- 'upsampling_neon.c',
- 'upsampling_sse2.c',
- 'upsampling_sse41.c',
'yuv.c',
- 'yuv_neon.c',
- 'yuv_sse2.c',
- 'yuv_sse41.c',
]
if CONFIG['CPU_ARCH'] == 'arm' and CONFIG['BUILD_ARM_NEON']:
- SOURCES['alpha_processing_neon.c'].flags += CONFIG['NEON_FLAGS']
- SOURCES['dec_neon.c'].flags += CONFIG['NEON_FLAGS']
- SOURCES['filters_neon.c'].flags += CONFIG['NEON_FLAGS']
- SOURCES['lossless_neon.c'].flags += CONFIG['NEON_FLAGS']
- SOURCES['rescaler_neon.c'].flags += CONFIG['NEON_FLAGS']
- SOURCES['upsampling_neon.c'].flags += CONFIG['NEON_FLAGS']
- SOURCES['yuv_neon.c'].flags += CONFIG['NEON_FLAGS']
+ SOURCES += [
+ 'alpha_processing_neon.c',
+ 'cost_neon.c',
+ 'dec_neon.c',
+ 'enc_neon.c',
+ 'filters_neon.c',
+ 'lossless_enc_neon.c',
+ 'lossless_neon.c',
+ 'rescaler_neon.c',
+ 'upsampling_neon.c',
+ 'yuv_neon.c',
+ ]
+ DEFINES['WEBP_HAVE_NEON'] = 1
+ for f in SOURCES:
+ if f.endswith('neon.c'):
+ SOURCES[f].flags += CONFIG['NEON_FLAGS']
+elif CONFIG['CPU_ARCH'] == 'aarch64':
+ SOURCES += [
+ 'alpha_processing_neon.c',
+ 'cost_neon.c',
+ 'dec_neon.c',
+ 'enc_neon.c',
+ 'filters_neon.c',
+ 'lossless_enc_neon.c',
+ 'lossless_neon.c',
+ 'rescaler_neon.c',
+ 'upsampling_neon.c',
+ 'yuv_neon.c',
+ ]
+ DEFINES['WEBP_HAVE_NEON'] = 1
elif CONFIG['INTEL_ARCHITECTURE']:
- SOURCES['alpha_processing_sse2.c'].flags += CONFIG['SSE2_FLAGS']
- SOURCES['alpha_processing_sse41.c'].flags += CONFIG['SSE2_FLAGS']
- SOURCES['dec_sse2.c'].flags += CONFIG['SSE2_FLAGS']
- SOURCES['dec_sse41.c'].flags += CONFIG['SSE2_FLAGS']
- SOURCES['filters_sse2.c'].flags += CONFIG['SSE2_FLAGS']
- SOURCES['lossless_sse2.c'].flags += CONFIG['SSE2_FLAGS']
- SOURCES['rescaler_sse2.c'].flags += CONFIG['SSE2_FLAGS']
- SOURCES['upsampling_sse2.c'].flags += CONFIG['SSE2_FLAGS']
- SOURCES['upsampling_sse41.c'].flags += CONFIG['SSE2_FLAGS']
- SOURCES['yuv_sse2.c'].flags += CONFIG['SSE2_FLAGS']
- SOURCES['yuv_sse41.c'].flags += CONFIG['SSE2_FLAGS']
+ SOURCES += [
+ 'alpha_processing_sse2.c',
+ 'alpha_processing_sse41.c',
+ 'cost_sse2.c',
+ 'dec_sse2.c',
+ 'dec_sse41.c',
+ 'enc_sse2.c',
+ 'enc_sse41.c',
+ 'filters_sse2.c',
+ 'lossless_enc_sse2.c',
+ 'lossless_enc_sse41.c',
+ 'lossless_sse2.c',
+ 'lossless_sse41.c',
+ 'rescaler_sse2.c',
+ 'ssim_sse2.c',
+ 'upsampling_sse2.c',
+ 'upsampling_sse41.c',
+ 'yuv_sse2.c',
+ 'yuv_sse41.c',
+ ]
+ DEFINES['WEBP_HAVE_SSE2'] = 1
+ DEFINES['WEBP_HAVE_SSE41'] = 1
+ for f in SOURCES:
+ if f.endswith('sse2.c'):
+ SOURCES[f].flags += CONFIG['SSE2_FLAGS']
+ elif f.endswith('sse41.c'):
+ SOURCES[f].flags += ['-msse4.1']
+elif CONFIG['CPU_ARCH'].startswith('mips'):
+ SOURCES += [
+ 'alpha_processing_mips_dsp_r2.c',
+ 'cost_mips32.c',
+ 'cost_mips_dsp_r2.c',
+ 'dec_mips32.c',
+ 'dec_mips_dsp_r2.c',
+ 'enc_mips32.c',
+ 'enc_mips_dsp_r2.c',
+ 'filters_mips_dsp_r2.c',
+ 'lossless_enc_mips32.c',
+ 'lossless_enc_mips_dsp_r2.c',
+ 'lossless_mips_dsp_r2.c',
+ 'lossless_msa.c',
+ 'rescaler_mips32.c',
+ 'rescaler_mips_dsp_r2.c',
+ 'rescaler_msa.c',
+ 'upsampling_mips_dsp_r2.c',
+ 'upsampling_msa.c',
+ 'yuv_mips32.c',
+ 'yuv_mips_dsp_r2.c',
+ ]
+
+if CONFIG['CC_TYPE'] in ('clang', 'clang-cl'):
+ CFLAGS += ['-Wno-unreachable-code']
FINAL_LIBRARY = 'gkmedias'
diff --git a/media/libwebp/dsp/msa_macro.h b/media/libwebp/dsp/msa_macro.h
index de026a1d9e..717e3b7b9f 100644
--- a/media/libwebp/dsp/msa_macro.h
+++ b/media/libwebp/dsp/msa_macro.h
@@ -14,6 +14,10 @@
#ifndef WEBP_DSP_MSA_MACRO_H_
#define WEBP_DSP_MSA_MACRO_H_
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_MSA)
+
#include <stdint.h>
#include <msa.h>
@@ -1389,4 +1393,5 @@ static WEBP_INLINE uint32_t func_hadd_uh_u32(v8u16 in) {
} while (0)
#define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__)
+#endif // WEBP_USE_MSA
#endif // WEBP_DSP_MSA_MACRO_H_
diff --git a/media/libwebp/dsp/neon.h b/media/libwebp/dsp/neon.h
index 63c27a2901..9a0f630de0 100644
--- a/media/libwebp/dsp/neon.h
+++ b/media/libwebp/dsp/neon.h
@@ -12,10 +12,12 @@
#ifndef WEBP_DSP_NEON_H_
#define WEBP_DSP_NEON_H_
-#include <arm_neon.h>
-
#include "../dsp/dsp.h"
+#if defined(WEBP_USE_NEON)
+
+#include <arm_neon.h>
+
// Right now, some intrinsics functions seem slower, so we disable them
// everywhere except newer clang/gcc or aarch64 where the inline assembly is
// incompatible.
@@ -98,4 +100,5 @@ static WEBP_INLINE int32x4x4_t Transpose4x4_NEON(const int32x4x4_t rows) {
} while (0)
#endif
+#endif // WEBP_USE_NEON
#endif // WEBP_DSP_NEON_H_
diff --git a/media/libwebp/dsp/quant.h b/media/libwebp/dsp/quant.h
index b82e728a53..14d8613431 100644
--- a/media/libwebp/dsp/quant.h
+++ b/media/libwebp/dsp/quant.h
@@ -10,6 +10,8 @@
#ifndef WEBP_DSP_QUANT_H_
#define WEBP_DSP_QUANT_H_
+#include <string.h>
+
#include "../dsp/dsp.h"
#include "../webp/types.h"
@@ -67,4 +69,17 @@ static WEBP_INLINE int IsFlat(const int16_t* levels, int num_blocks,
#endif // defined(WEBP_USE_NEON) && !defined(WEBP_ANDROID_NEON) &&
// !defined(WEBP_HAVE_NEON_RTCD)
+static WEBP_INLINE int IsFlatSource16(const uint8_t* src) {
+ const uint32_t v = src[0] * 0x01010101u;
+ int i;
+ for (i = 0; i < 16; ++i) {
+ if (memcmp(src + 0, &v, 4) || memcmp(src + 4, &v, 4) ||
+ memcmp(src + 8, &v, 4) || memcmp(src + 12, &v, 4)) {
+ return 0;
+ }
+ src += BPS;
+ }
+ return 1;
+}
+
#endif // WEBP_DSP_QUANT_H_
diff --git a/media/libwebp/dsp/rescaler.c b/media/libwebp/dsp/rescaler.c
index 6bf387f8e0..4bbd281b1c 100644
--- a/media/libwebp/dsp/rescaler.c
+++ b/media/libwebp/dsp/rescaler.c
@@ -38,8 +38,9 @@ void WebPRescalerImportRowExpand_C(WebPRescaler* const wrk,
int x_out = channel;
// simple bilinear interpolation
int accum = wrk->x_add;
- int left = src[x_in];
- int right = (wrk->src_width > 1) ? src[x_in + x_stride] : left;
+ rescaler_t left = (rescaler_t)src[x_in];
+ rescaler_t right =
+ (wrk->src_width > 1) ? (rescaler_t)src[x_in + x_stride] : left;
x_in += x_stride;
while (1) {
wrk->frow[x_out] = right * wrk->x_add + (left - right) * accum;
@@ -50,7 +51,7 @@ void WebPRescalerImportRowExpand_C(WebPRescaler* const wrk,
left = right;
x_in += x_stride;
assert(x_in < wrk->src_width * x_stride);
- right = src[x_in];
+ right = (rescaler_t)src[x_in];
accum += wrk->x_add;
}
}
@@ -109,8 +110,7 @@ void WebPRescalerExportRowExpand_C(WebPRescaler* const wrk) {
for (x_out = 0; x_out < x_out_max; ++x_out) {
const uint32_t J = frow[x_out];
const int v = (int)MULT_FIX(J, wrk->fy_scale);
- assert(v >= 0 && v <= 255);
- dst[x_out] = v;
+ dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
}
} else {
const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub);
@@ -120,8 +120,7 @@ void WebPRescalerExportRowExpand_C(WebPRescaler* const wrk) {
+ (uint64_t)B * irow[x_out];
const uint32_t J = (uint32_t)((I + ROUNDER) >> WEBP_RESCALER_RFIX);
const int v = (int)MULT_FIX(J, wrk->fy_scale);
- assert(v >= 0 && v <= 255);
- dst[x_out] = v;
+ dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
}
}
}
@@ -138,17 +137,15 @@ void WebPRescalerExportRowShrink_C(WebPRescaler* const wrk) {
assert(!wrk->y_expand);
if (yscale) {
for (x_out = 0; x_out < x_out_max; ++x_out) {
- const uint32_t frac = (uint32_t)MULT_FIX(frow[x_out], yscale);
- const int v = (int)MULT_FIX_FLOOR(irow[x_out] - frac, wrk->fxy_scale);
- assert(v >= 0 && v <= 255);
- dst[x_out] = v;
+ const uint32_t frac = (uint32_t)MULT_FIX_FLOOR(frow[x_out], yscale);
+ const int v = (int)MULT_FIX(irow[x_out] - frac, wrk->fxy_scale);
+ dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
irow[x_out] = frac; // new fractional start
}
} else {
for (x_out = 0; x_out < x_out_max; ++x_out) {
const int v = (int)MULT_FIX(irow[x_out], wrk->fxy_scale);
- assert(v >= 0 && v <= 255);
- dst[x_out] = v;
+ dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
irow[x_out] = 0;
}
}
@@ -217,7 +214,7 @@ WEBP_DSP_INIT_FUNC(WebPRescalerDspInit) {
WebPRescalerImportRowShrink = WebPRescalerImportRowShrink_C;
if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
+#if defined(WEBP_HAVE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
WebPRescalerDspInitSSE2();
}
@@ -239,7 +236,7 @@ WEBP_DSP_INIT_FUNC(WebPRescalerDspInit) {
#endif
}
-#if defined(WEBP_USE_NEON)
+#if defined(WEBP_HAVE_NEON)
if (WEBP_NEON_OMIT_C_CODE ||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
WebPRescalerDspInitNEON();
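Every rescaler variant in this patch makes the same two changes: the debug-only assert(v >= 0 && v <= 255) becomes an explicit clamp on the store, and in the shrink path the rounding (MULT_FIX) and truncating (MULT_FIX_FLOOR) steps swap places so that the fractional carry written back to irow matches what is later subtracted from it. A minimal sketch of the two fixed-point helpers and the clamp, assuming WEBP_RESCALER_RFIX == 32 as in rescaler_utils.h:

    #include <stdint.h>
    #include <stdio.h>

    #define RFIX 32                        /* assumption: matches WEBP_RESCALER_RFIX */
    #define ONE ((uint64_t)1 << RFIX)
    #define ROUNDER (ONE >> 1)
    #define MULT_FIX(x, y)       (((uint64_t)(x) * (y) + ROUNDER) >> RFIX)
    #define MULT_FIX_FLOOR(x, y) (((uint64_t)(x) * (y)) >> RFIX)

    int main(void) {
      /* Rounding can push v one step past 255; the old assert() only
         caught that in debug builds, the new code clamps instead. */
      const int v = 256;                            /* worst-case overshoot */
      const uint8_t pixel = (v > 255) ? 255u : (uint8_t)v;
      printf("clamped: %u\n", pixel);               /* 255 */

      /* MULT_FIX rounds to nearest; MULT_FIX_FLOOR truncates. */
      const uint32_t x = 3;
      const uint32_t y = (uint32_t)(ONE / 2);       /* 0.5 in fixed point */
      printf("round: %llu floor: %llu\n",
             (unsigned long long)MULT_FIX(x, y),        /* 2 (1.5 rounds up) */
             (unsigned long long)MULT_FIX_FLOOR(x, y)); /* 1 (1.5 truncates) */
      return 0;
    }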
diff --git a/media/libwebp/dsp/rescaler_mips32.c b/media/libwebp/dsp/rescaler_mips32.c
new file mode 100644
index 0000000000..44fad3fbe7
--- /dev/null
+++ b/media/libwebp/dsp/rescaler_mips32.c
@@ -0,0 +1,295 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MIPS version of rescaling functions
+//
+// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_MIPS32) && !defined(WEBP_REDUCE_SIZE)
+
+#include <assert.h>
+#include "../utils/rescaler_utils.h"
+
+//------------------------------------------------------------------------------
+// Row import
+
+static void ImportRowShrink_MIPS32(WebPRescaler* const wrk,
+ const uint8_t* src) {
+ const int x_stride = wrk->num_channels;
+ const int x_out_max = wrk->dst_width * wrk->num_channels;
+ const int fx_scale = wrk->fx_scale;
+ const int x_add = wrk->x_add;
+ const int x_sub = wrk->x_sub;
+ const int x_stride1 = x_stride << 2;
+ int channel;
+ assert(!wrk->x_expand);
+ assert(!WebPRescalerInputDone(wrk));
+
+ for (channel = 0; channel < x_stride; ++channel) {
+ const uint8_t* src1 = src + channel;
+ rescaler_t* frow = wrk->frow + channel;
+ int temp1, temp2, temp3;
+ int base, frac, sum;
+ int accum, accum1;
+ int loop_c = x_out_max - channel;
+
+ __asm__ volatile (
+ "li %[temp1], 0x8000 \n\t"
+ "li %[temp2], 0x10000 \n\t"
+ "li %[sum], 0 \n\t"
+ "li %[accum], 0 \n\t"
+ "1: \n\t"
+ "addu %[accum], %[accum], %[x_add] \n\t"
+ "li %[base], 0 \n\t"
+ "blez %[accum], 3f \n\t"
+ "2: \n\t"
+ "lbu %[base], 0(%[src1]) \n\t"
+ "subu %[accum], %[accum], %[x_sub] \n\t"
+ "addu %[src1], %[src1], %[x_stride] \n\t"
+ "addu %[sum], %[sum], %[base] \n\t"
+ "bgtz %[accum], 2b \n\t"
+ "3: \n\t"
+ "negu %[accum1], %[accum] \n\t"
+ "mul %[frac], %[base], %[accum1] \n\t"
+ "mul %[temp3], %[sum], %[x_sub] \n\t"
+ "subu %[loop_c], %[loop_c], %[x_stride] \n\t"
+ "mult %[temp1], %[temp2] \n\t"
+ "maddu %[frac], %[fx_scale] \n\t"
+ "mfhi %[sum] \n\t"
+ "subu %[temp3], %[temp3], %[frac] \n\t"
+ "sw %[temp3], 0(%[frow]) \n\t"
+ "addu %[frow], %[frow], %[x_stride1] \n\t"
+ "bgtz %[loop_c], 1b \n\t"
+ : [accum]"=&r"(accum), [src1]"+r"(src1), [temp3]"=&r"(temp3),
+ [sum]"=&r"(sum), [base]"=&r"(base), [frac]"=&r"(frac),
+ [frow]"+r"(frow), [accum1]"=&r"(accum1),
+ [temp2]"=&r"(temp2), [temp1]"=&r"(temp1)
+ : [x_stride]"r"(x_stride), [fx_scale]"r"(fx_scale),
+ [x_sub]"r"(x_sub), [x_add]"r"(x_add),
+ [loop_c]"r"(loop_c), [x_stride1]"r"(x_stride1)
+ : "memory", "hi", "lo"
+ );
+ assert(accum == 0);
+ }
+}
+
+static void ImportRowExpand_MIPS32(WebPRescaler* const wrk,
+ const uint8_t* src) {
+ const int x_stride = wrk->num_channels;
+ const int x_out_max = wrk->dst_width * wrk->num_channels;
+ const int x_add = wrk->x_add;
+ const int x_sub = wrk->x_sub;
+ const int src_width = wrk->src_width;
+ const int x_stride1 = x_stride << 2;
+ int channel;
+ assert(wrk->x_expand);
+ assert(!WebPRescalerInputDone(wrk));
+
+ for (channel = 0; channel < x_stride; ++channel) {
+ const uint8_t* src1 = src + channel;
+ rescaler_t* frow = wrk->frow + channel;
+ int temp1, temp2, temp3, temp4;
+ int frac;
+ int accum;
+ int x_out = channel;
+
+ __asm__ volatile (
+ "addiu %[temp3], %[src_width], -1 \n\t"
+ "lbu %[temp2], 0(%[src1]) \n\t"
+ "addu %[src1], %[src1], %[x_stride] \n\t"
+ "bgtz %[temp3], 0f \n\t"
+ "addiu %[temp1], %[temp2], 0 \n\t"
+ "b 3f \n\t"
+ "0: \n\t"
+ "lbu %[temp1], 0(%[src1]) \n\t"
+ "3: \n\t"
+ "addiu %[accum], %[x_add], 0 \n\t"
+ "1: \n\t"
+ "subu %[temp3], %[temp2], %[temp1] \n\t"
+ "mul %[temp3], %[temp3], %[accum] \n\t"
+ "mul %[temp4], %[temp1], %[x_add] \n\t"
+ "addu %[temp3], %[temp4], %[temp3] \n\t"
+ "sw %[temp3], 0(%[frow]) \n\t"
+ "addu %[frow], %[frow], %[x_stride1] \n\t"
+ "addu %[x_out], %[x_out], %[x_stride] \n\t"
+ "subu %[temp3], %[x_out], %[x_out_max] \n\t"
+ "bgez %[temp3], 2f \n\t"
+ "subu %[accum], %[accum], %[x_sub] \n\t"
+ "bgez %[accum], 4f \n\t"
+ "addiu %[temp2], %[temp1], 0 \n\t"
+ "addu %[src1], %[src1], %[x_stride] \n\t"
+ "lbu %[temp1], 0(%[src1]) \n\t"
+ "addu %[accum], %[accum], %[x_add] \n\t"
+ "4: \n\t"
+ "b 1b \n\t"
+ "2: \n\t"
+ : [src1]"+r"(src1), [accum]"=&r"(accum), [temp1]"=&r"(temp1),
+ [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [temp4]"=&r"(temp4),
+ [x_out]"+r"(x_out), [frac]"=&r"(frac), [frow]"+r"(frow)
+ : [x_stride]"r"(x_stride), [x_add]"r"(x_add), [x_sub]"r"(x_sub),
+ [x_stride1]"r"(x_stride1), [src_width]"r"(src_width),
+ [x_out_max]"r"(x_out_max)
+ : "memory", "hi", "lo"
+ );
+ assert(wrk->x_sub == 0 /* <- special case for src_width=1 */ || accum == 0);
+ }
+}
+
+//------------------------------------------------------------------------------
+// Row export
+
+static void ExportRowExpand_MIPS32(WebPRescaler* const wrk) {
+ uint8_t* dst = wrk->dst;
+ rescaler_t* irow = wrk->irow;
+ const int x_out_max = wrk->dst_width * wrk->num_channels;
+ const rescaler_t* frow = wrk->frow;
+ int temp0, temp1, temp3, temp4, temp5, loop_end;
+ const int temp2 = (int)wrk->fy_scale;
+ const int temp6 = x_out_max << 2;
+ assert(!WebPRescalerOutputDone(wrk));
+ assert(wrk->y_accum <= 0);
+ assert(wrk->y_expand);
+ assert(wrk->y_sub != 0);
+ if (wrk->y_accum == 0) {
+ __asm__ volatile (
+ "li %[temp3], 0x10000 \n\t"
+ "li %[temp4], 0x8000 \n\t"
+ "addu %[loop_end], %[frow], %[temp6] \n\t"
+ "1: \n\t"
+ "lw %[temp0], 0(%[frow]) \n\t"
+ "addiu %[dst], %[dst], 1 \n\t"
+ "addiu %[frow], %[frow], 4 \n\t"
+ "mult %[temp3], %[temp4] \n\t"
+ "maddu %[temp0], %[temp2] \n\t"
+ "mfhi %[temp5] \n\t"
+ "sb %[temp5], -1(%[dst]) \n\t"
+ "bne %[frow], %[loop_end], 1b \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
+ [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [frow]"+r"(frow),
+ [dst]"+r"(dst), [loop_end]"=&r"(loop_end)
+ : [temp2]"r"(temp2), [temp6]"r"(temp6)
+ : "memory", "hi", "lo"
+ );
+ } else {
+ const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub);
+ const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B);
+ __asm__ volatile (
+ "li %[temp3], 0x10000 \n\t"
+ "li %[temp4], 0x8000 \n\t"
+ "addu %[loop_end], %[frow], %[temp6] \n\t"
+ "1: \n\t"
+ "lw %[temp0], 0(%[frow]) \n\t"
+ "lw %[temp1], 0(%[irow]) \n\t"
+ "addiu %[dst], %[dst], 1 \n\t"
+ "mult %[temp3], %[temp4] \n\t"
+ "maddu %[A], %[temp0] \n\t"
+ "maddu %[B], %[temp1] \n\t"
+ "addiu %[frow], %[frow], 4 \n\t"
+ "addiu %[irow], %[irow], 4 \n\t"
+ "mfhi %[temp5] \n\t"
+ "mult %[temp3], %[temp4] \n\t"
+ "maddu %[temp5], %[temp2] \n\t"
+ "mfhi %[temp5] \n\t"
+ "sb %[temp5], -1(%[dst]) \n\t"
+ "bne %[frow], %[loop_end], 1b \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
+ [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [frow]"+r"(frow),
+ [irow]"+r"(irow), [dst]"+r"(dst), [loop_end]"=&r"(loop_end)
+ : [temp2]"r"(temp2), [temp6]"r"(temp6), [A]"r"(A), [B]"r"(B)
+ : "memory", "hi", "lo"
+ );
+ }
+}
+
+#if 0 // disabled for now. TODO(skal): make match the C-code
+static void ExportRowShrink_MIPS32(WebPRescaler* const wrk) {
+ const int x_out_max = wrk->dst_width * wrk->num_channels;
+ uint8_t* dst = wrk->dst;
+ rescaler_t* irow = wrk->irow;
+ const rescaler_t* frow = wrk->frow;
+ const int yscale = wrk->fy_scale * (-wrk->y_accum);
+ int temp0, temp1, temp3, temp4, temp5, loop_end;
+ const int temp2 = (int)wrk->fxy_scale;
+ const int temp6 = x_out_max << 2;
+
+ assert(!WebPRescalerOutputDone(wrk));
+ assert(wrk->y_accum <= 0);
+ assert(!wrk->y_expand);
+ assert(wrk->fxy_scale != 0);
+ if (yscale) {
+ __asm__ volatile (
+ "li %[temp3], 0x10000 \n\t"
+ "li %[temp4], 0x8000 \n\t"
+ "addu %[loop_end], %[frow], %[temp6] \n\t"
+ "1: \n\t"
+ "lw %[temp0], 0(%[frow]) \n\t"
+ "mult %[temp3], %[temp4] \n\t"
+ "addiu %[frow], %[frow], 4 \n\t"
+ "maddu %[temp0], %[yscale] \n\t"
+ "mfhi %[temp1] \n\t"
+ "lw %[temp0], 0(%[irow]) \n\t"
+ "addiu %[dst], %[dst], 1 \n\t"
+ "addiu %[irow], %[irow], 4 \n\t"
+ "subu %[temp0], %[temp0], %[temp1] \n\t"
+ "mult %[temp3], %[temp4] \n\t"
+ "maddu %[temp0], %[temp2] \n\t"
+ "mfhi %[temp5] \n\t"
+ "sw %[temp1], -4(%[irow]) \n\t"
+ "sb %[temp5], -1(%[dst]) \n\t"
+ "bne %[frow], %[loop_end], 1b \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
+ [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [frow]"+r"(frow),
+ [irow]"+r"(irow), [dst]"+r"(dst), [loop_end]"=&r"(loop_end)
+ : [temp2]"r"(temp2), [yscale]"r"(yscale), [temp6]"r"(temp6)
+ : "memory", "hi", "lo"
+ );
+ } else {
+ __asm__ volatile (
+ "li %[temp3], 0x10000 \n\t"
+ "li %[temp4], 0x8000 \n\t"
+ "addu %[loop_end], %[irow], %[temp6] \n\t"
+ "1: \n\t"
+ "lw %[temp0], 0(%[irow]) \n\t"
+ "addiu %[dst], %[dst], 1 \n\t"
+ "addiu %[irow], %[irow], 4 \n\t"
+ "mult %[temp3], %[temp4] \n\t"
+ "maddu %[temp0], %[temp2] \n\t"
+ "mfhi %[temp5] \n\t"
+ "sw $zero, -4(%[irow]) \n\t"
+ "sb %[temp5], -1(%[dst]) \n\t"
+ "bne %[irow], %[loop_end], 1b \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
+ [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [irow]"+r"(irow),
+ [dst]"+r"(dst), [loop_end]"=&r"(loop_end)
+ : [temp2]"r"(temp2), [temp6]"r"(temp6)
+ : "memory", "hi", "lo"
+ );
+ }
+}
+#endif // 0
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void WebPRescalerDspInitMIPS32(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitMIPS32(void) {
+ WebPRescalerImportRowExpand = ImportRowExpand_MIPS32;
+ WebPRescalerImportRowShrink = ImportRowShrink_MIPS32;
+ WebPRescalerExportRowExpand = ExportRowExpand_MIPS32;
+// WebPRescalerExportRowShrink = ExportRowShrink_MIPS32;
+}
+
+#else // !WEBP_USE_MIPS32
+
+WEBP_DSP_INIT_STUB(WebPRescalerDspInitMIPS32)
+
+#endif // WEBP_USE_MIPS32
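The recurring mult/maddu/mfhi triplet in these loops is the fixed-point rounding idiom: "mult" with 0x10000 and 0x8000 preloads the 64-bit hi:lo accumulator with 1 << 31 (the ROUNDER for a 32-bit fixed-point shift), "maddu" adds the unsigned 32x32 product on top, and "mfhi" reads back the top 32 bits. In scalar C the whole sequence is simply:

    #include <stdint.h>
    #include <stdio.h>

    /* What "mult 0x10000,0x8000; maddu x,y; mfhi r" computes on MIPS32. */
    static uint32_t MultFix(uint32_t x, uint32_t y) {
      const uint64_t rounder = (uint64_t)0x10000 * 0x8000;  /* == 1u << 31 */
      return (uint32_t)(((uint64_t)x * y + rounder) >> 32);
    }

    int main(void) {
      /* 0.75 in 32-bit fixed point times 200 -> 150, rounded to nearest. */
      const uint32_t three_quarters = 3u << 30;
      printf("%u\n", MultFix(200, three_quarters));  /* 150 */
      return 0;
    }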
diff --git a/media/libwebp/dsp/rescaler_mips_dsp_r2.c b/media/libwebp/dsp/rescaler_mips_dsp_r2.c
new file mode 100644
index 0000000000..d6f2996578
--- /dev/null
+++ b/media/libwebp/dsp/rescaler_mips_dsp_r2.c
@@ -0,0 +1,314 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MIPS version of rescaling functions
+//
+// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_MIPS_DSP_R2) && !defined(WEBP_REDUCE_SIZE)
+
+#include <assert.h>
+#include "../utils/rescaler_utils.h"
+
+#define ROUNDER (WEBP_RESCALER_ONE >> 1)
+#define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
+#define MULT_FIX_FLOOR(x, y) (((uint64_t)(x) * (y)) >> WEBP_RESCALER_RFIX)
+
+//------------------------------------------------------------------------------
+// Row export
+
+#if 0 // disabled for now. TODO(skal): make match the C-code
+static void ExportRowShrink_MIPSdspR2(WebPRescaler* const wrk) {
+ int i;
+ const int x_out_max = wrk->dst_width * wrk->num_channels;
+ uint8_t* dst = wrk->dst;
+ rescaler_t* irow = wrk->irow;
+ const rescaler_t* frow = wrk->frow;
+ const int yscale = wrk->fy_scale * (-wrk->y_accum);
+ int temp0, temp1, temp2, temp3, temp4, temp5, loop_end;
+ const int temp7 = (int)wrk->fxy_scale;
+ const int temp6 = (x_out_max & ~0x3) << 2;
+ assert(!WebPRescalerOutputDone(wrk));
+ assert(wrk->y_accum <= 0);
+ assert(!wrk->y_expand);
+ assert(wrk->fxy_scale != 0);
+ if (yscale) {
+ if (x_out_max >= 4) {
+ int temp8, temp9, temp10, temp11;
+ __asm__ volatile (
+ "li %[temp3], 0x10000 \n\t"
+ "li %[temp4], 0x8000 \n\t"
+ "addu %[loop_end], %[frow], %[temp6] \n\t"
+ "1: \n\t"
+ "lw %[temp0], 0(%[frow]) \n\t"
+ "lw %[temp1], 4(%[frow]) \n\t"
+ "lw %[temp2], 8(%[frow]) \n\t"
+ "lw %[temp5], 12(%[frow]) \n\t"
+ "mult $ac0, %[temp3], %[temp4] \n\t"
+ "maddu $ac0, %[temp0], %[yscale] \n\t"
+ "mult $ac1, %[temp3], %[temp4] \n\t"
+ "maddu $ac1, %[temp1], %[yscale] \n\t"
+ "mult $ac2, %[temp3], %[temp4] \n\t"
+ "maddu $ac2, %[temp2], %[yscale] \n\t"
+ "mult $ac3, %[temp3], %[temp4] \n\t"
+ "maddu $ac3, %[temp5], %[yscale] \n\t"
+ "addiu %[frow], %[frow], 16 \n\t"
+ "mfhi %[temp0], $ac0 \n\t"
+ "mfhi %[temp1], $ac1 \n\t"
+ "mfhi %[temp2], $ac2 \n\t"
+ "mfhi %[temp5], $ac3 \n\t"
+ "lw %[temp8], 0(%[irow]) \n\t"
+ "lw %[temp9], 4(%[irow]) \n\t"
+ "lw %[temp10], 8(%[irow]) \n\t"
+ "lw %[temp11], 12(%[irow]) \n\t"
+ "addiu %[dst], %[dst], 4 \n\t"
+ "addiu %[irow], %[irow], 16 \n\t"
+ "subu %[temp8], %[temp8], %[temp0] \n\t"
+ "subu %[temp9], %[temp9], %[temp1] \n\t"
+ "subu %[temp10], %[temp10], %[temp2] \n\t"
+ "subu %[temp11], %[temp11], %[temp5] \n\t"
+ "mult $ac0, %[temp3], %[temp4] \n\t"
+ "maddu $ac0, %[temp8], %[temp7] \n\t"
+ "mult $ac1, %[temp3], %[temp4] \n\t"
+ "maddu $ac1, %[temp9], %[temp7] \n\t"
+ "mult $ac2, %[temp3], %[temp4] \n\t"
+ "maddu $ac2, %[temp10], %[temp7] \n\t"
+ "mult $ac3, %[temp3], %[temp4] \n\t"
+ "maddu $ac3, %[temp11], %[temp7] \n\t"
+ "mfhi %[temp8], $ac0 \n\t"
+ "mfhi %[temp9], $ac1 \n\t"
+ "mfhi %[temp10], $ac2 \n\t"
+ "mfhi %[temp11], $ac3 \n\t"
+ "sw %[temp0], -16(%[irow]) \n\t"
+ "sw %[temp1], -12(%[irow]) \n\t"
+ "sw %[temp2], -8(%[irow]) \n\t"
+ "sw %[temp5], -4(%[irow]) \n\t"
+ "sb %[temp8], -4(%[dst]) \n\t"
+ "sb %[temp9], -3(%[dst]) \n\t"
+ "sb %[temp10], -2(%[dst]) \n\t"
+ "sb %[temp11], -1(%[dst]) \n\t"
+ "bne %[frow], %[loop_end], 1b \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
+ [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [frow]"+r"(frow),
+ [irow]"+r"(irow), [dst]"+r"(dst), [loop_end]"=&r"(loop_end),
+ [temp8]"=&r"(temp8), [temp9]"=&r"(temp9), [temp10]"=&r"(temp10),
+ [temp11]"=&r"(temp11), [temp2]"=&r"(temp2)
+ : [temp7]"r"(temp7), [yscale]"r"(yscale), [temp6]"r"(temp6)
+ : "memory", "hi", "lo", "$ac1hi", "$ac1lo",
+ "$ac2hi", "$ac2lo", "$ac3hi", "$ac3lo"
+ );
+ }
+ for (i = 0; i < (x_out_max & 0x3); ++i) {
+ const uint32_t frac = (uint32_t)MULT_FIX_FLOOR(*frow++, yscale);
+ const int v = (int)MULT_FIX(*irow - frac, wrk->fxy_scale);
+ *dst++ = (v > 255) ? 255u : (uint8_t)v;
+ *irow++ = frac; // new fractional start
+ }
+ } else {
+ if (x_out_max >= 4) {
+ __asm__ volatile (
+ "li %[temp3], 0x10000 \n\t"
+ "li %[temp4], 0x8000 \n\t"
+ "addu %[loop_end], %[irow], %[temp6] \n\t"
+ "1: \n\t"
+ "lw %[temp0], 0(%[irow]) \n\t"
+ "lw %[temp1], 4(%[irow]) \n\t"
+ "lw %[temp2], 8(%[irow]) \n\t"
+ "lw %[temp5], 12(%[irow]) \n\t"
+ "addiu %[dst], %[dst], 4 \n\t"
+ "addiu %[irow], %[irow], 16 \n\t"
+ "mult $ac0, %[temp3], %[temp4] \n\t"
+ "maddu $ac0, %[temp0], %[temp7] \n\t"
+ "mult $ac1, %[temp3], %[temp4] \n\t"
+ "maddu $ac1, %[temp1], %[temp7] \n\t"
+ "mult $ac2, %[temp3], %[temp4] \n\t"
+ "maddu $ac2, %[temp2], %[temp7] \n\t"
+ "mult $ac3, %[temp3], %[temp4] \n\t"
+ "maddu $ac3, %[temp5], %[temp7] \n\t"
+ "mfhi %[temp0], $ac0 \n\t"
+ "mfhi %[temp1], $ac1 \n\t"
+ "mfhi %[temp2], $ac2 \n\t"
+ "mfhi %[temp5], $ac3 \n\t"
+ "sw $zero, -16(%[irow]) \n\t"
+ "sw $zero, -12(%[irow]) \n\t"
+ "sw $zero, -8(%[irow]) \n\t"
+ "sw $zero, -4(%[irow]) \n\t"
+ "sb %[temp0], -4(%[dst]) \n\t"
+ "sb %[temp1], -3(%[dst]) \n\t"
+ "sb %[temp2], -2(%[dst]) \n\t"
+ "sb %[temp5], -1(%[dst]) \n\t"
+ "bne %[irow], %[loop_end], 1b \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
+ [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [irow]"+r"(irow),
+ [dst]"+r"(dst), [loop_end]"=&r"(loop_end), [temp2]"=&r"(temp2)
+ : [temp7]"r"(temp7), [temp6]"r"(temp6)
+ : "memory", "hi", "lo", "$ac1hi", "$ac1lo",
+ "$ac2hi", "$ac2lo", "$ac3hi", "$ac3lo"
+ );
+ }
+ for (i = 0; i < (x_out_max & 0x3); ++i) {
+ const int v = (int)MULT_FIX_FLOOR(*irow, wrk->fxy_scale);
+ *dst++ = (v > 255) ? 255u : (uint8_t)v;
+ *irow++ = 0;
+ }
+ }
+}
+#endif // 0
+
+static void ExportRowExpand_MIPSdspR2(WebPRescaler* const wrk) {
+ int i;
+ uint8_t* dst = wrk->dst;
+ rescaler_t* irow = wrk->irow;
+ const int x_out_max = wrk->dst_width * wrk->num_channels;
+ const rescaler_t* frow = wrk->frow;
+ int temp0, temp1, temp2, temp3, temp4, temp5, loop_end;
+ const int temp6 = (x_out_max & ~0x3) << 2;
+ const int temp7 = (int)wrk->fy_scale;
+ assert(!WebPRescalerOutputDone(wrk));
+ assert(wrk->y_accum <= 0);
+ assert(wrk->y_expand);
+ assert(wrk->y_sub != 0);
+ if (wrk->y_accum == 0) {
+ if (x_out_max >= 4) {
+ __asm__ volatile (
+ "li %[temp4], 0x10000 \n\t"
+ "li %[temp5], 0x8000 \n\t"
+ "addu %[loop_end], %[frow], %[temp6] \n\t"
+ "1: \n\t"
+ "lw %[temp0], 0(%[frow]) \n\t"
+ "lw %[temp1], 4(%[frow]) \n\t"
+ "lw %[temp2], 8(%[frow]) \n\t"
+ "lw %[temp3], 12(%[frow]) \n\t"
+ "addiu %[dst], %[dst], 4 \n\t"
+ "addiu %[frow], %[frow], 16 \n\t"
+ "mult $ac0, %[temp4], %[temp5] \n\t"
+ "maddu $ac0, %[temp0], %[temp7] \n\t"
+ "mult $ac1, %[temp4], %[temp5] \n\t"
+ "maddu $ac1, %[temp1], %[temp7] \n\t"
+ "mult $ac2, %[temp4], %[temp5] \n\t"
+ "maddu $ac2, %[temp2], %[temp7] \n\t"
+ "mult $ac3, %[temp4], %[temp5] \n\t"
+ "maddu $ac3, %[temp3], %[temp7] \n\t"
+ "mfhi %[temp0], $ac0 \n\t"
+ "mfhi %[temp1], $ac1 \n\t"
+ "mfhi %[temp2], $ac2 \n\t"
+ "mfhi %[temp3], $ac3 \n\t"
+ "sb %[temp0], -4(%[dst]) \n\t"
+ "sb %[temp1], -3(%[dst]) \n\t"
+ "sb %[temp2], -2(%[dst]) \n\t"
+ "sb %[temp3], -1(%[dst]) \n\t"
+ "bne %[frow], %[loop_end], 1b \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
+ [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [frow]"+r"(frow),
+ [dst]"+r"(dst), [loop_end]"=&r"(loop_end), [temp2]"=&r"(temp2)
+ : [temp7]"r"(temp7), [temp6]"r"(temp6)
+ : "memory", "hi", "lo", "$ac1hi", "$ac1lo",
+ "$ac2hi", "$ac2lo", "$ac3hi", "$ac3lo"
+ );
+ }
+ for (i = 0; i < (x_out_max & 0x3); ++i) {
+ const uint32_t J = *frow++;
+ const int v = (int)MULT_FIX(J, wrk->fy_scale);
+ *dst++ = (v > 255) ? 255u : (uint8_t)v;
+ }
+ } else {
+ const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub);
+ const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B);
+ if (x_out_max >= 4) {
+ int temp8, temp9, temp10, temp11;
+ __asm__ volatile (
+ "li %[temp8], 0x10000 \n\t"
+ "li %[temp9], 0x8000 \n\t"
+ "addu %[loop_end], %[frow], %[temp6] \n\t"
+ "1: \n\t"
+ "lw %[temp0], 0(%[frow]) \n\t"
+ "lw %[temp1], 4(%[frow]) \n\t"
+ "lw %[temp2], 8(%[frow]) \n\t"
+ "lw %[temp3], 12(%[frow]) \n\t"
+ "lw %[temp4], 0(%[irow]) \n\t"
+ "lw %[temp5], 4(%[irow]) \n\t"
+ "lw %[temp10], 8(%[irow]) \n\t"
+ "lw %[temp11], 12(%[irow]) \n\t"
+ "addiu %[dst], %[dst], 4 \n\t"
+ "mult $ac0, %[temp8], %[temp9] \n\t"
+ "maddu $ac0, %[A], %[temp0] \n\t"
+ "maddu $ac0, %[B], %[temp4] \n\t"
+ "mult $ac1, %[temp8], %[temp9] \n\t"
+ "maddu $ac1, %[A], %[temp1] \n\t"
+ "maddu $ac1, %[B], %[temp5] \n\t"
+ "mult $ac2, %[temp8], %[temp9] \n\t"
+ "maddu $ac2, %[A], %[temp2] \n\t"
+ "maddu $ac2, %[B], %[temp10] \n\t"
+ "mult $ac3, %[temp8], %[temp9] \n\t"
+ "maddu $ac3, %[A], %[temp3] \n\t"
+ "maddu $ac3, %[B], %[temp11] \n\t"
+ "addiu %[frow], %[frow], 16 \n\t"
+ "addiu %[irow], %[irow], 16 \n\t"
+ "mfhi %[temp0], $ac0 \n\t"
+ "mfhi %[temp1], $ac1 \n\t"
+ "mfhi %[temp2], $ac2 \n\t"
+ "mfhi %[temp3], $ac3 \n\t"
+ "mult $ac0, %[temp8], %[temp9] \n\t"
+ "maddu $ac0, %[temp0], %[temp7] \n\t"
+ "mult $ac1, %[temp8], %[temp9] \n\t"
+ "maddu $ac1, %[temp1], %[temp7] \n\t"
+ "mult $ac2, %[temp8], %[temp9] \n\t"
+ "maddu $ac2, %[temp2], %[temp7] \n\t"
+ "mult $ac3, %[temp8], %[temp9] \n\t"
+ "maddu $ac3, %[temp3], %[temp7] \n\t"
+ "mfhi %[temp0], $ac0 \n\t"
+ "mfhi %[temp1], $ac1 \n\t"
+ "mfhi %[temp2], $ac2 \n\t"
+ "mfhi %[temp3], $ac3 \n\t"
+ "sb %[temp0], -4(%[dst]) \n\t"
+ "sb %[temp1], -3(%[dst]) \n\t"
+ "sb %[temp2], -2(%[dst]) \n\t"
+ "sb %[temp3], -1(%[dst]) \n\t"
+ "bne %[frow], %[loop_end], 1b \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
+ [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [frow]"+r"(frow),
+ [irow]"+r"(irow), [dst]"+r"(dst), [loop_end]"=&r"(loop_end),
+ [temp8]"=&r"(temp8), [temp9]"=&r"(temp9), [temp10]"=&r"(temp10),
+ [temp11]"=&r"(temp11), [temp2]"=&r"(temp2)
+ : [temp7]"r"(temp7), [temp6]"r"(temp6), [A]"r"(A), [B]"r"(B)
+ : "memory", "hi", "lo", "$ac1hi", "$ac1lo",
+ "$ac2hi", "$ac2lo", "$ac3hi", "$ac3lo"
+ );
+ }
+ for (i = 0; i < (x_out_max & 0x3); ++i) {
+ const uint64_t I = (uint64_t)A * *frow++
+ + (uint64_t)B * *irow++;
+ const uint32_t J = (uint32_t)((I + ROUNDER) >> WEBP_RESCALER_RFIX);
+ const int v = (int)MULT_FIX(J, wrk->fy_scale);
+ *dst++ = (v > 255) ? 255u : (uint8_t)v;
+ }
+ }
+}
+
+#undef MULT_FIX_FLOOR
+#undef MULT_FIX
+#undef ROUNDER
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void WebPRescalerDspInitMIPSdspR2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitMIPSdspR2(void) {
+ WebPRescalerExportRowExpand = ExportRowExpand_MIPSdspR2;
+// WebPRescalerExportRowShrink = ExportRowShrink_MIPSdspR2;
+}
+
+#else // !WEBP_USE_MIPS_DSP_R2
+
+WEBP_DSP_INIT_STUB(WebPRescalerDspInitMIPSdspR2)
+
+#endif // WEBP_USE_MIPS_DSP_R2
diff --git a/media/libwebp/dsp/rescaler_msa.c b/media/libwebp/dsp/rescaler_msa.c
new file mode 100644
index 0000000000..3366b6d637
--- /dev/null
+++ b/media/libwebp/dsp/rescaler_msa.c
@@ -0,0 +1,443 @@
+// Copyright 2016 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MSA version of rescaling functions
+//
+// Author: Prashant Patil (prashant.patil@imgtec.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_MSA) && !defined(WEBP_REDUCE_SIZE)
+
+#include <assert.h>
+
+#include "../utils/rescaler_utils.h"
+#include "../dsp/msa_macro.h"
+
+#define ROUNDER (WEBP_RESCALER_ONE >> 1)
+#define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
+#define MULT_FIX_FLOOR(x, y) (((uint64_t)(x) * (y)) >> WEBP_RESCALER_RFIX)
+
+#define CALC_MULT_FIX_16(in0, in1, in2, in3, scale, shift, dst) do { \
+ v4u32 tmp0, tmp1, tmp2, tmp3; \
+ v16u8 t0, t1, t2, t3, t4, t5; \
+ v2u64 out0, out1, out2, out3; \
+ ILVRL_W2_UW(zero, in0, tmp0, tmp1); \
+ ILVRL_W2_UW(zero, in1, tmp2, tmp3); \
+ DOTP_UW2_UD(tmp0, tmp1, scale, scale, out0, out1); \
+ DOTP_UW2_UD(tmp2, tmp3, scale, scale, out2, out3); \
+ SRAR_D4_UD(out0, out1, out2, out3, shift); \
+ PCKEV_B2_UB(out1, out0, out3, out2, t0, t1); \
+ ILVRL_W2_UW(zero, in2, tmp0, tmp1); \
+ ILVRL_W2_UW(zero, in3, tmp2, tmp3); \
+ DOTP_UW2_UD(tmp0, tmp1, scale, scale, out0, out1); \
+ DOTP_UW2_UD(tmp2, tmp3, scale, scale, out2, out3); \
+ SRAR_D4_UD(out0, out1, out2, out3, shift); \
+ PCKEV_B2_UB(out1, out0, out3, out2, t2, t3); \
+ PCKEV_B2_UB(t1, t0, t3, t2, t4, t5); \
+ dst = (v16u8)__msa_pckev_b((v16i8)t5, (v16i8)t4); \
+} while (0)
+
+#define CALC_MULT_FIX_4(in0, scale, shift, dst) do { \
+ v4u32 tmp0, tmp1; \
+ v16i8 t0, t1; \
+ v2u64 out0, out1; \
+ ILVRL_W2_UW(zero, in0, tmp0, tmp1); \
+ DOTP_UW2_UD(tmp0, tmp1, scale, scale, out0, out1); \
+ SRAR_D2_UD(out0, out1, shift); \
+ t0 = __msa_pckev_b((v16i8)out1, (v16i8)out0); \
+ t1 = __msa_pckev_b(t0, t0); \
+ t0 = __msa_pckev_b(t1, t1); \
+ dst = __msa_copy_s_w((v4i32)t0, 0); \
+} while (0)
+
+#define CALC_MULT_FIX1_16(in0, in1, in2, in3, fyscale, shift, \
+ dst0, dst1, dst2, dst3) do { \
+ v4u32 tmp0, tmp1, tmp2, tmp3; \
+ v2u64 out0, out1, out2, out3; \
+ ILVRL_W2_UW(zero, in0, tmp0, tmp1); \
+ ILVRL_W2_UW(zero, in1, tmp2, tmp3); \
+ DOTP_UW2_UD(tmp0, tmp1, fyscale, fyscale, out0, out1); \
+ DOTP_UW2_UD(tmp2, tmp3, fyscale, fyscale, out2, out3); \
+ SRAR_D4_UD(out0, out1, out2, out3, shift); \
+ PCKEV_W2_UW(out1, out0, out3, out2, dst0, dst1); \
+ ILVRL_W2_UW(zero, in2, tmp0, tmp1); \
+ ILVRL_W2_UW(zero, in3, tmp2, tmp3); \
+ DOTP_UW2_UD(tmp0, tmp1, fyscale, fyscale, out0, out1); \
+ DOTP_UW2_UD(tmp2, tmp3, fyscale, fyscale, out2, out3); \
+ SRAR_D4_UD(out0, out1, out2, out3, shift); \
+ PCKEV_W2_UW(out1, out0, out3, out2, dst2, dst3); \
+} while (0)
+
+#define CALC_MULT_FIX1_4(in0, scale, shift, dst) do { \
+ v4u32 tmp0, tmp1; \
+ v2u64 out0, out1; \
+ ILVRL_W2_UW(zero, in0, tmp0, tmp1); \
+ DOTP_UW2_UD(tmp0, tmp1, scale, scale, out0, out1); \
+ SRAR_D2_UD(out0, out1, shift); \
+ dst = (v4u32)__msa_pckev_w((v4i32)out1, (v4i32)out0); \
+} while (0)
+
+#define CALC_MULT_FIX2_16(in0, in1, in2, in3, mult, scale, shift, \
+ dst0, dst1) do { \
+ v4u32 tmp0, tmp1, tmp2, tmp3; \
+ v2u64 out0, out1, out2, out3; \
+ ILVRL_W2_UW(in0, in2, tmp0, tmp1); \
+ ILVRL_W2_UW(in1, in3, tmp2, tmp3); \
+ DOTP_UW2_UD(tmp0, tmp1, mult, mult, out0, out1); \
+ DOTP_UW2_UD(tmp2, tmp3, mult, mult, out2, out3); \
+ SRAR_D4_UD(out0, out1, out2, out3, shift); \
+ DOTP_UW2_UD(out0, out1, scale, scale, out0, out1); \
+ DOTP_UW2_UD(out2, out3, scale, scale, out2, out3); \
+ SRAR_D4_UD(out0, out1, out2, out3, shift); \
+ PCKEV_B2_UB(out1, out0, out3, out2, dst0, dst1); \
+} while (0)
+
+#define CALC_MULT_FIX2_4(in0, in1, mult, scale, shift, dst) do { \
+ v4u32 tmp0, tmp1; \
+ v2u64 out0, out1; \
+ v16i8 t0, t1; \
+ ILVRL_W2_UW(in0, in1, tmp0, tmp1); \
+ DOTP_UW2_UD(tmp0, tmp1, mult, mult, out0, out1); \
+ SRAR_D2_UD(out0, out1, shift); \
+ DOTP_UW2_UD(out0, out1, scale, scale, out0, out1); \
+ SRAR_D2_UD(out0, out1, shift); \
+ t0 = __msa_pckev_b((v16i8)out1, (v16i8)out0); \
+ t1 = __msa_pckev_b(t0, t0); \
+ t0 = __msa_pckev_b(t1, t1); \
+ dst = __msa_copy_s_w((v4i32)t0, 0); \
+} while (0)
+
+static WEBP_INLINE void ExportRowExpand_0(const uint32_t* frow, uint8_t* dst,
+ int length,
+ WebPRescaler* const wrk) {
+ const v4u32 scale = (v4u32)__msa_fill_w(wrk->fy_scale);
+ const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
+ const v4i32 zero = { 0 };
+
+ while (length >= 16) {
+ v4u32 src0, src1, src2, src3;
+ v16u8 out;
+ LD_UW4(frow, 4, src0, src1, src2, src3);
+ CALC_MULT_FIX_16(src0, src1, src2, src3, scale, shift, out);
+ ST_UB(out, dst);
+ length -= 16;
+ frow += 16;
+ dst += 16;
+ }
+ if (length > 0) {
+ int x_out;
+ if (length >= 12) {
+ uint32_t val0_m, val1_m, val2_m;
+ v4u32 src0, src1, src2;
+ LD_UW3(frow, 4, src0, src1, src2);
+ CALC_MULT_FIX_4(src0, scale, shift, val0_m);
+ CALC_MULT_FIX_4(src1, scale, shift, val1_m);
+ CALC_MULT_FIX_4(src2, scale, shift, val2_m);
+ SW3(val0_m, val1_m, val2_m, dst, 4);
+ length -= 12;
+ frow += 12;
+ dst += 12;
+ } else if (length >= 8) {
+ uint32_t val0_m, val1_m;
+ v4u32 src0, src1;
+ LD_UW2(frow, 4, src0, src1);
+ CALC_MULT_FIX_4(src0, scale, shift, val0_m);
+ CALC_MULT_FIX_4(src1, scale, shift, val1_m);
+ SW2(val0_m, val1_m, dst, 4);
+ length -= 8;
+ frow += 8;
+ dst += 8;
+ } else if (length >= 4) {
+ uint32_t val0_m;
+ const v4u32 src0 = LD_UW(frow);
+ CALC_MULT_FIX_4(src0, scale, shift, val0_m);
+ SW(val0_m, dst);
+ length -= 4;
+ frow += 4;
+ dst += 4;
+ }
+ for (x_out = 0; x_out < length; ++x_out) {
+ const uint32_t J = frow[x_out];
+ const int v = (int)MULT_FIX(J, wrk->fy_scale);
+ dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
+ }
+ }
+}
+
+static WEBP_INLINE void ExportRowExpand_1(const uint32_t* frow, uint32_t* irow,
+ uint8_t* dst, int length,
+ WebPRescaler* const wrk) {
+ const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub);
+ const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B);
+ const v4i32 B1 = __msa_fill_w(B);
+ const v4i32 A1 = __msa_fill_w(A);
+ const v4i32 AB = __msa_ilvr_w(A1, B1);
+ const v4u32 scale = (v4u32)__msa_fill_w(wrk->fy_scale);
+ const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
+
+ while (length >= 16) {
+ v4u32 frow0, frow1, frow2, frow3, irow0, irow1, irow2, irow3;
+ v16u8 t0, t1, t2, t3, t4, t5;
+ LD_UW4(frow, 4, frow0, frow1, frow2, frow3);
+ LD_UW4(irow, 4, irow0, irow1, irow2, irow3);
+ CALC_MULT_FIX2_16(frow0, frow1, irow0, irow1, AB, scale, shift, t0, t1);
+ CALC_MULT_FIX2_16(frow2, frow3, irow2, irow3, AB, scale, shift, t2, t3);
+ PCKEV_B2_UB(t1, t0, t3, t2, t4, t5);
+ t0 = (v16u8)__msa_pckev_b((v16i8)t5, (v16i8)t4);
+ ST_UB(t0, dst);
+ frow += 16;
+ irow += 16;
+ dst += 16;
+ length -= 16;
+ }
+ if (length > 0) {
+ int x_out;
+ if (length >= 12) {
+ uint32_t val0_m, val1_m, val2_m;
+ v4u32 frow0, frow1, frow2, irow0, irow1, irow2;
+ LD_UW3(frow, 4, frow0, frow1, frow2);
+ LD_UW3(irow, 4, irow0, irow1, irow2);
+ CALC_MULT_FIX2_4(frow0, irow0, AB, scale, shift, val0_m);
+ CALC_MULT_FIX2_4(frow1, irow1, AB, scale, shift, val1_m);
+ CALC_MULT_FIX2_4(frow2, irow2, AB, scale, shift, val2_m);
+ SW3(val0_m, val1_m, val2_m, dst, 4);
+ frow += 12;
+ irow += 12;
+ dst += 12;
+ length -= 12;
+ } else if (length >= 8) {
+ uint32_t val0_m, val1_m;
+ v4u32 frow0, frow1, irow0, irow1;
+ LD_UW2(frow, 4, frow0, frow1);
+ LD_UW2(irow, 4, irow0, irow1);
+ CALC_MULT_FIX2_4(frow0, irow0, AB, scale, shift, val0_m);
+ CALC_MULT_FIX2_4(frow1, irow1, AB, scale, shift, val1_m);
+ SW2(val0_m, val1_m, dst, 4);
+ frow += 8;
+ irow += 8;
+ dst += 8;
+ length -= 8;
+ } else if (length >= 4) {
+ uint32_t val0_m;
+ const v4u32 frow0 = LD_UW(frow + 0);
+ const v4u32 irow0 = LD_UW(irow + 0);
+ CALC_MULT_FIX2_4(frow0, irow0, AB, scale, shift, val0_m);
+ SW(val0_m, dst);
+ frow += 4;
+ irow += 4;
+ dst += 4;
+ length -= 4;
+ }
+ for (x_out = 0; x_out < length; ++x_out) {
+ const uint64_t I = (uint64_t)A * frow[x_out]
+ + (uint64_t)B * irow[x_out];
+ const uint32_t J = (uint32_t)((I + ROUNDER) >> WEBP_RESCALER_RFIX);
+ const int v = (int)MULT_FIX(J, wrk->fy_scale);
+ dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
+ }
+ }
+}
+
+static void RescalerExportRowExpand_MSA(WebPRescaler* const wrk) {
+ uint8_t* dst = wrk->dst;
+ rescaler_t* irow = wrk->irow;
+ const int x_out_max = wrk->dst_width * wrk->num_channels;
+ const rescaler_t* frow = wrk->frow;
+ assert(!WebPRescalerOutputDone(wrk));
+ assert(wrk->y_accum <= 0);
+ assert(wrk->y_expand);
+ assert(wrk->y_sub != 0);
+ if (wrk->y_accum == 0) {
+ ExportRowExpand_0(frow, dst, x_out_max, wrk);
+ } else {
+ ExportRowExpand_1(frow, irow, dst, x_out_max, wrk);
+ }
+}
+
+#if 0 // disabled for now. TODO(skal): make match the C-code
+static WEBP_INLINE void ExportRowShrink_0(const uint32_t* frow, uint32_t* irow,
+ uint8_t* dst, int length,
+ const uint32_t yscale,
+ WebPRescaler* const wrk) {
+ const v4u32 y_scale = (v4u32)__msa_fill_w(yscale);
+ const v4u32 fxyscale = (v4u32)__msa_fill_w(wrk->fxy_scale);
+ const v4u32 shiftval = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
+ const v4i32 zero = { 0 };
+
+ while (length >= 16) {
+ v4u32 src0, src1, src2, src3, frac0, frac1, frac2, frac3;
+ v16u8 out;
+ LD_UW4(frow, 4, src0, src1, src2, src3);
+ CALC_MULT_FIX1_16(src0, src1, src2, src3, y_scale, shiftval,
+ frac0, frac1, frac2, frac3);
+ LD_UW4(irow, 4, src0, src1, src2, src3);
+ SUB4(src0, frac0, src1, frac1, src2, frac2, src3, frac3,
+ src0, src1, src2, src3);
+ CALC_MULT_FIX_16(src0, src1, src2, src3, fxyscale, shiftval, out);
+ ST_UB(out, dst);
+ ST_UW4(frac0, frac1, frac2, frac3, irow, 4);
+ frow += 16;
+ irow += 16;
+ dst += 16;
+ length -= 16;
+ }
+ if (length > 0) {
+ int x_out;
+ if (length >= 12) {
+ uint32_t val0_m, val1_m, val2_m;
+ v4u32 src0, src1, src2, frac0, frac1, frac2;
+ LD_UW3(frow, 4, src0, src1, src2);
+ CALC_MULT_FIX1_4(src0, y_scale, shiftval, frac0);
+ CALC_MULT_FIX1_4(src1, y_scale, shiftval, frac1);
+ CALC_MULT_FIX1_4(src2, y_scale, shiftval, frac2);
+ LD_UW3(irow, 4, src0, src1, src2);
+ SUB3(src0, frac0, src1, frac1, src2, frac2, src0, src1, src2);
+ CALC_MULT_FIX_4(src0, fxyscale, shiftval, val0_m);
+ CALC_MULT_FIX_4(src1, fxyscale, shiftval, val1_m);
+ CALC_MULT_FIX_4(src2, fxyscale, shiftval, val2_m);
+ SW3(val0_m, val1_m, val2_m, dst, 4);
+ ST_UW3(frac0, frac1, frac2, irow, 4);
+ frow += 12;
+ irow += 12;
+ dst += 12;
+ length -= 12;
+ } else if (length >= 8) {
+ uint32_t val0_m, val1_m;
+ v4u32 src0, src1, frac0, frac1;
+ LD_UW2(frow, 4, src0, src1);
+ CALC_MULT_FIX1_4(src0, y_scale, shiftval, frac0);
+ CALC_MULT_FIX1_4(src1, y_scale, shiftval, frac1);
+ LD_UW2(irow, 4, src0, src1);
+ SUB2(src0, frac0, src1, frac1, src0, src1);
+ CALC_MULT_FIX_4(src0, fxyscale, shiftval, val0_m);
+ CALC_MULT_FIX_4(src1, fxyscale, shiftval, val1_m);
+ SW2(val0_m, val1_m, dst, 4);
+ ST_UW2(frac0, frac1, irow, 4);
+ frow += 8;
+ irow += 8;
+ dst += 8;
+ length -= 8;
+ } else if (length >= 4) {
+ uint32_t val0_m;
+ v4u32 frac0;
+ v4u32 src0 = LD_UW(frow);
+ CALC_MULT_FIX1_4(src0, y_scale, shiftval, frac0);
+ src0 = LD_UW(irow);
+ src0 = src0 - frac0;
+ CALC_MULT_FIX_4(src0, fxyscale, shiftval, val0_m);
+ SW(val0_m, dst);
+ ST_UW(frac0, irow);
+ frow += 4;
+ irow += 4;
+ dst += 4;
+ length -= 4;
+ }
+ for (x_out = 0; x_out < length; ++x_out) {
+ const uint32_t frac = (uint32_t)MULT_FIX_FLOOR(frow[x_out], yscale);
+ const int v = (int)MULT_FIX(irow[x_out] - frac, wrk->fxy_scale);
+ dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
+ irow[x_out] = frac;
+ }
+ }
+}
+
+static WEBP_INLINE void ExportRowShrink_1(uint32_t* irow, uint8_t* dst,
+ int length,
+ WebPRescaler* const wrk) {
+ const v4u32 scale = (v4u32)__msa_fill_w(wrk->fxy_scale);
+ const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
+ const v4i32 zero = { 0 };
+
+ while (length >= 16) {
+ v4u32 src0, src1, src2, src3;
+ v16u8 dst0;
+ LD_UW4(irow, 4, src0, src1, src2, src3);
+ CALC_MULT_FIX_16(src0, src1, src2, src3, scale, shift, dst0);
+ ST_UB(dst0, dst);
+ ST_SW4(zero, zero, zero, zero, irow, 4);
+ length -= 16;
+ irow += 16;
+ dst += 16;
+ }
+ if (length > 0) {
+ int x_out;
+ if (length >= 12) {
+ uint32_t val0_m, val1_m, val2_m;
+ v4u32 src0, src1, src2;
+ LD_UW3(irow, 4, src0, src1, src2);
+ CALC_MULT_FIX_4(src0, scale, shift, val0_m);
+ CALC_MULT_FIX_4(src1, scale, shift, val1_m);
+ CALC_MULT_FIX_4(src2, scale, shift, val2_m);
+ SW3(val0_m, val1_m, val2_m, dst, 4);
+ ST_SW3(zero, zero, zero, irow, 4);
+ length -= 12;
+ irow += 12;
+ dst += 12;
+ } else if (length >= 8) {
+ uint32_t val0_m, val1_m;
+ v4u32 src0, src1;
+ LD_UW2(irow, 4, src0, src1);
+ CALC_MULT_FIX_4(src0, scale, shift, val0_m);
+ CALC_MULT_FIX_4(src1, scale, shift, val1_m);
+ SW2(val0_m, val1_m, dst, 4);
+ ST_SW2(zero, zero, irow, 4);
+ length -= 8;
+ irow += 8;
+ dst += 8;
+ } else if (length >= 4) {
+ uint32_t val0_m;
+ const v4u32 src0 = LD_UW(irow + 0);
+ CALC_MULT_FIX_4(src0, scale, shift, val0_m);
+ SW(val0_m, dst);
+ ST_SW(zero, irow);
+ length -= 4;
+ irow += 4;
+ dst += 4;
+ }
+ for (x_out = 0; x_out < length; ++x_out) {
+ const int v = (int)MULT_FIX(irow[x_out], wrk->fxy_scale);
+ dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
+ irow[x_out] = 0;
+ }
+ }
+}
+
+static void RescalerExportRowShrink_MSA(WebPRescaler* const wrk) {
+ uint8_t* dst = wrk->dst;
+ rescaler_t* irow = wrk->irow;
+ const int x_out_max = wrk->dst_width * wrk->num_channels;
+ const rescaler_t* frow = wrk->frow;
+ const uint32_t yscale = wrk->fy_scale * (-wrk->y_accum);
+ assert(!WebPRescalerOutputDone(wrk));
+ assert(wrk->y_accum <= 0);
+ assert(!wrk->y_expand);
+ if (yscale) {
+ ExportRowShrink_0(frow, irow, dst, x_out_max, yscale, wrk);
+ } else {
+ ExportRowShrink_1(irow, dst, x_out_max, wrk);
+ }
+}
+#endif // 0
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void WebPRescalerDspInitMSA(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitMSA(void) {
+ WebPRescalerExportRowExpand = RescalerExportRowExpand_MSA;
+// WebPRescalerExportRowShrink = RescalerExportRowShrink_MSA;
+}
+
+#else // !WEBP_USE_MSA
+
+WEBP_DSP_INIT_STUB(WebPRescalerDspInitMSA)
+
+#endif // WEBP_USE_MSA
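The MSA tails step down through 16-, 12-, 8- and 4-pixel chunks before a final scalar loop; each branch must advance the frow/irow/dst pointers and decrement length by exactly the number of pixels it stored, or the scalar tail re-reads pixels the vector path already emitted. The bookkeeping invariant in miniature (toy code, not the real kernels):

    #include <assert.h>
    #include <stdio.h>

    /* Each branch stands in for one vector store of that many pixels. */
    static int ProcessedOnce(int length) {
      int done = 0;
      while (length >= 16) { done += 16; length -= 16; }
      if (length >= 12)      { done += 12; length -= 12; }
      else if (length >= 8)  { done += 8;  length -= 8;  }
      else if (length >= 4)  { done += 4;  length -= 4;  }
      done += length;                       /* scalar tail */
      return done;
    }

    int main(void) {
      int n;
      for (n = 0; n < 64; ++n) {
        assert(ProcessedOnce(n) == n);      /* every pixel exactly once */
      }
      printf("ok\n");
      return 0;
    }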
diff --git a/media/libwebp/dsp/rescaler_neon.c b/media/libwebp/dsp/rescaler_neon.c
index b560d0cdcc..62bef72113 100644
--- a/media/libwebp/dsp/rescaler_neon.c
+++ b/media/libwebp/dsp/rescaler_neon.c
@@ -81,14 +81,13 @@ static void RescalerExportRowExpand_NEON(WebPRescaler* const wrk) {
const uint32x4_t B1 = MULT_FIX(A1, fy_scale_half);
const uint16x4_t C0 = vmovn_u32(B0);
const uint16x4_t C1 = vmovn_u32(B1);
- const uint8x8_t D = vmovn_u16(vcombine_u16(C0, C1));
+ const uint8x8_t D = vqmovn_u16(vcombine_u16(C0, C1));
vst1_u8(dst + x_out, D);
}
for (; x_out < x_out_max; ++x_out) {
const uint32_t J = frow[x_out];
const int v = (int)MULT_FIX_C(J, fy_scale);
- assert(v >= 0 && v <= 255);
- dst[x_out] = v;
+ dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
}
} else {
const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub);
@@ -102,7 +101,7 @@ static void RescalerExportRowExpand_NEON(WebPRescaler* const wrk) {
const uint32x4_t D1 = MULT_FIX(C1, fy_scale_half);
const uint16x4_t E0 = vmovn_u32(D0);
const uint16x4_t E1 = vmovn_u32(D1);
- const uint8x8_t F = vmovn_u16(vcombine_u16(E0, E1));
+ const uint8x8_t F = vqmovn_u16(vcombine_u16(E0, E1));
vst1_u8(dst + x_out, F);
}
for (; x_out < x_out_max; ++x_out) {
@@ -110,8 +109,7 @@ static void RescalerExportRowExpand_NEON(WebPRescaler* const wrk) {
+ (uint64_t)B * irow[x_out];
const uint32_t J = (uint32_t)((I + ROUNDER) >> WEBP_RESCALER_RFIX);
const int v = (int)MULT_FIX_C(J, fy_scale);
- assert(v >= 0 && v <= 255);
- dst[x_out] = v;
+ dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
}
}
}
@@ -135,23 +133,22 @@ static void RescalerExportRowShrink_NEON(WebPRescaler* const wrk) {
for (x_out = 0; x_out < max_span; x_out += 8) {
LOAD_32x8(frow + x_out, in0, in1);
LOAD_32x8(irow + x_out, in2, in3);
- const uint32x4_t A0 = MULT_FIX(in0, yscale_half);
- const uint32x4_t A1 = MULT_FIX(in1, yscale_half);
+ const uint32x4_t A0 = MULT_FIX_FLOOR(in0, yscale_half);
+ const uint32x4_t A1 = MULT_FIX_FLOOR(in1, yscale_half);
const uint32x4_t B0 = vqsubq_u32(in2, A0);
const uint32x4_t B1 = vqsubq_u32(in3, A1);
- const uint32x4_t C0 = MULT_FIX_FLOOR(B0, fxy_scale_half);
- const uint32x4_t C1 = MULT_FIX_FLOOR(B1, fxy_scale_half);
+ const uint32x4_t C0 = MULT_FIX(B0, fxy_scale_half);
+ const uint32x4_t C1 = MULT_FIX(B1, fxy_scale_half);
const uint16x4_t D0 = vmovn_u32(C0);
const uint16x4_t D1 = vmovn_u32(C1);
- const uint8x8_t E = vmovn_u16(vcombine_u16(D0, D1));
+ const uint8x8_t E = vqmovn_u16(vcombine_u16(D0, D1));
vst1_u8(dst + x_out, E);
STORE_32x8(A0, A1, irow + x_out);
}
for (; x_out < x_out_max; ++x_out) {
- const uint32_t frac = (uint32_t)MULT_FIX_C(frow[x_out], yscale);
- const int v = (int)MULT_FIX_FLOOR_C(irow[x_out] - frac, fxy_scale);
- assert(v >= 0 && v <= 255);
- dst[x_out] = v;
+ const uint32_t frac = (uint32_t)MULT_FIX_FLOOR_C(frow[x_out], yscale);
+ const int v = (int)MULT_FIX_C(irow[x_out] - frac, fxy_scale);
+ dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
irow[x_out] = frac; // new fractional start
}
} else {
@@ -161,14 +158,13 @@ static void RescalerExportRowShrink_NEON(WebPRescaler* const wrk) {
const uint32x4_t A1 = MULT_FIX(in1, fxy_scale_half);
const uint16x4_t B0 = vmovn_u32(A0);
const uint16x4_t B1 = vmovn_u32(A1);
- const uint8x8_t C = vmovn_u16(vcombine_u16(B0, B1));
+ const uint8x8_t C = vqmovn_u16(vcombine_u16(B0, B1));
vst1_u8(dst + x_out, C);
STORE_32x8(zero, zero, irow + x_out);
}
for (; x_out < x_out_max; ++x_out) {
const int v = (int)MULT_FIX_C(irow[x_out], fxy_scale);
- assert(v >= 0 && v <= 255);
- dst[x_out] = v;
+ dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
irow[x_out] = 0;
}
}
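The NEON change swaps vmovn_u16 (truncating narrow: keeps only the low 8 bits) for vqmovn_u16 (saturating narrow: clamps to 0..255), mirroring the scalar clamp added alongside it; without saturation a value of 256 wraps to 0 and shows up as a black pixel. The difference in scalar form:

    #include <stdint.h>
    #include <stdio.h>

    static uint8_t NarrowTrunc(uint16_t v) {          /* vmovn_u16  */
      return (uint8_t)v;
    }
    static uint8_t NarrowSat(uint16_t v) {            /* vqmovn_u16 */
      return (v > 255) ? 255u : (uint8_t)v;
    }

    int main(void) {
      const uint16_t v = 256;  /* one past the 8-bit range */
      printf("trunc: %u  sat: %u\n", NarrowTrunc(v), NarrowSat(v)); /* 0 vs 255 */
      return 0;
    }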
diff --git a/media/libwebp/dsp/rescaler_sse2.c b/media/libwebp/dsp/rescaler_sse2.c
index 2d35f76ab0..237997c808 100644
--- a/media/libwebp/dsp/rescaler_sse2.c
+++ b/media/libwebp/dsp/rescaler_sse2.c
@@ -225,35 +225,6 @@ static WEBP_INLINE void ProcessRow_SSE2(const __m128i* const A0,
_mm_storel_epi64((__m128i*)dst, G);
}
-static WEBP_INLINE void ProcessRow_Floor_SSE2(const __m128i* const A0,
- const __m128i* const A1,
- const __m128i* const A2,
- const __m128i* const A3,
- const __m128i* const mult,
- uint8_t* const dst) {
- const __m128i mask = _mm_set_epi32(0xffffffffu, 0, 0xffffffffu, 0);
- const __m128i B0 = _mm_mul_epu32(*A0, *mult);
- const __m128i B1 = _mm_mul_epu32(*A1, *mult);
- const __m128i B2 = _mm_mul_epu32(*A2, *mult);
- const __m128i B3 = _mm_mul_epu32(*A3, *mult);
- const __m128i D0 = _mm_srli_epi64(B0, WEBP_RESCALER_RFIX);
- const __m128i D1 = _mm_srli_epi64(B1, WEBP_RESCALER_RFIX);
-#if (WEBP_RESCALER_RFIX < 32)
- const __m128i D2 =
- _mm_and_si128(_mm_slli_epi64(B2, 32 - WEBP_RESCALER_RFIX), mask);
- const __m128i D3 =
- _mm_and_si128(_mm_slli_epi64(B3, 32 - WEBP_RESCALER_RFIX), mask);
-#else
- const __m128i D2 = _mm_and_si128(B2, mask);
- const __m128i D3 = _mm_and_si128(B3, mask);
-#endif
- const __m128i E0 = _mm_or_si128(D0, D2);
- const __m128i E1 = _mm_or_si128(D1, D3);
- const __m128i F = _mm_packs_epi32(E0, E1);
- const __m128i G = _mm_packus_epi16(F, F);
- _mm_storel_epi64((__m128i*)dst, G);
-}
-
static void RescalerExportRowExpand_SSE2(WebPRescaler* const wrk) {
int x_out;
uint8_t* const dst = wrk->dst;
@@ -274,8 +245,7 @@ static void RescalerExportRowExpand_SSE2(WebPRescaler* const wrk) {
for (; x_out < x_out_max; ++x_out) {
const uint32_t J = frow[x_out];
const int v = (int)MULT_FIX(J, wrk->fy_scale);
- assert(v >= 0 && v <= 255);
- dst[x_out] = v;
+ dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
}
} else {
const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub);
@@ -308,8 +278,7 @@ static void RescalerExportRowExpand_SSE2(WebPRescaler* const wrk) {
+ (uint64_t)B * irow[x_out];
const uint32_t J = (uint32_t)((I + ROUNDER) >> WEBP_RESCALER_RFIX);
const int v = (int)MULT_FIX(J, wrk->fy_scale);
- assert(v >= 0 && v <= 255);
- dst[x_out] = v;
+ dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
}
}
}
@@ -328,20 +297,15 @@ static void RescalerExportRowShrink_SSE2(WebPRescaler* const wrk) {
const int scale_xy = wrk->fxy_scale;
const __m128i mult_xy = _mm_set_epi32(0, scale_xy, 0, scale_xy);
const __m128i mult_y = _mm_set_epi32(0, yscale, 0, yscale);
- const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER);
for (x_out = 0; x_out + 8 <= x_out_max; x_out += 8) {
__m128i A0, A1, A2, A3, B0, B1, B2, B3;
LoadDispatchAndMult_SSE2(irow + x_out, NULL, &A0, &A1, &A2, &A3);
LoadDispatchAndMult_SSE2(frow + x_out, &mult_y, &B0, &B1, &B2, &B3);
{
- const __m128i C0 = _mm_add_epi64(B0, rounder);
- const __m128i C1 = _mm_add_epi64(B1, rounder);
- const __m128i C2 = _mm_add_epi64(B2, rounder);
- const __m128i C3 = _mm_add_epi64(B3, rounder);
- const __m128i D0 = _mm_srli_epi64(C0, WEBP_RESCALER_RFIX); // = frac
- const __m128i D1 = _mm_srli_epi64(C1, WEBP_RESCALER_RFIX);
- const __m128i D2 = _mm_srli_epi64(C2, WEBP_RESCALER_RFIX);
- const __m128i D3 = _mm_srli_epi64(C3, WEBP_RESCALER_RFIX);
+ const __m128i D0 = _mm_srli_epi64(B0, WEBP_RESCALER_RFIX); // = frac
+ const __m128i D1 = _mm_srli_epi64(B1, WEBP_RESCALER_RFIX);
+ const __m128i D2 = _mm_srli_epi64(B2, WEBP_RESCALER_RFIX);
+ const __m128i D3 = _mm_srli_epi64(B3, WEBP_RESCALER_RFIX);
const __m128i E0 = _mm_sub_epi64(A0, D0); // irow[x] - frac
const __m128i E1 = _mm_sub_epi64(A1, D1);
const __m128i E2 = _mm_sub_epi64(A2, D2);
@@ -352,14 +316,13 @@ static void RescalerExportRowShrink_SSE2(WebPRescaler* const wrk) {
const __m128i G1 = _mm_or_si128(D1, F3);
_mm_storeu_si128((__m128i*)(irow + x_out + 0), G0);
_mm_storeu_si128((__m128i*)(irow + x_out + 4), G1);
- ProcessRow_Floor_SSE2(&E0, &E1, &E2, &E3, &mult_xy, dst + x_out);
+ ProcessRow_SSE2(&E0, &E1, &E2, &E3, &mult_xy, dst + x_out);
}
}
for (; x_out < x_out_max; ++x_out) {
- const uint32_t frac = (int)MULT_FIX(frow[x_out], yscale);
- const int v = (int)MULT_FIX_FLOOR(irow[x_out] - frac, wrk->fxy_scale);
- assert(v >= 0 && v <= 255);
- dst[x_out] = v;
+ const uint32_t frac = (uint32_t)MULT_FIX_FLOOR(frow[x_out], yscale);
+ const int v = (int)MULT_FIX(irow[x_out] - frac, wrk->fxy_scale);
+ dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
irow[x_out] = frac; // new fractional start
}
} else {
@@ -375,8 +338,7 @@ static void RescalerExportRowShrink_SSE2(WebPRescaler* const wrk) {
}
for (; x_out < x_out_max; ++x_out) {
const int v = (int)MULT_FIX(irow[x_out], scale);
- assert(v >= 0 && v <= 255);
- dst[x_out] = v;
+ dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
irow[x_out] = 0;
}
}
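ProcessRow_Floor_SSE2 can be deleted because, after the MULT_FIX/MULT_FIX_FLOOR swap, both call sites want the rounding variant, and the final _mm_packs_epi32/_mm_packus_epi16 pair in the surviving ProcessRow_SSE2 already saturates each lane to 0..255 in hardware, which is also what makes dropping the asserts safe. A standalone demonstration of that saturation (SSE2 intrinsics, x86 only):

    #include <emmintrin.h>
    #include <stdio.h>

    int main(void) {
      /* signed 16-bit lanes, some outside the 8-bit range */
      const __m128i v = _mm_setr_epi16(-5, 0, 128, 255, 256, 300, 1000, 42);
      const __m128i packed = _mm_packus_epi16(v, v);  /* saturating 16->8 pack */
      unsigned char out[16];
      int i;
      _mm_storeu_si128((__m128i*)out, packed);
      for (i = 0; i < 8; ++i) printf("%u ", out[i]);  /* 0 0 128 255 255 255 255 42 */
      printf("\n");
      return 0;
    }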
diff --git a/media/libwebp/dsp/ssim.c b/media/libwebp/dsp/ssim.c
new file mode 100644
index 0000000000..4141e32fd8
--- /dev/null
+++ b/media/libwebp/dsp/ssim.c
@@ -0,0 +1,159 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// distortion calculation
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <assert.h>
+#include <stdlib.h> // for abs()
+
+#include "../dsp/dsp.h"
+
+#if !defined(WEBP_REDUCE_SIZE)
+
+//------------------------------------------------------------------------------
+// SSIM / PSNR
+
+// hat-shaped filter. Sum of coefficients is equal to 16.
+static const uint32_t kWeight[2 * VP8_SSIM_KERNEL + 1] = {
+ 1, 2, 3, 4, 3, 2, 1
+};
+static const uint32_t kWeightSum = 16 * 16; // sum{kWeight}^2
+
+static WEBP_INLINE double SSIMCalculation(
+ const VP8DistoStats* const stats, uint32_t N /*num samples*/) {
+ const uint32_t w2 = N * N;
+ const uint32_t C1 = 20 * w2;
+ const uint32_t C2 = 60 * w2;
+ const uint32_t C3 = 8 * 8 * w2; // 'dark' limit ~= 6
+ const uint64_t xmxm = (uint64_t)stats->xm * stats->xm;
+ const uint64_t ymym = (uint64_t)stats->ym * stats->ym;
+ if (xmxm + ymym >= C3) {
+ const int64_t xmym = (int64_t)stats->xm * stats->ym;
+ const int64_t sxy = (int64_t)stats->xym * N - xmym; // can be negative
+ const uint64_t sxx = (uint64_t)stats->xxm * N - xmxm;
+ const uint64_t syy = (uint64_t)stats->yym * N - ymym;
+ // we descale by 8 to prevent overflow during the fnum/fden multiply.
+ const uint64_t num_S = (2 * (uint64_t)(sxy < 0 ? 0 : sxy) + C2) >> 8;
+ const uint64_t den_S = (sxx + syy + C2) >> 8;
+ const uint64_t fnum = (2 * xmym + C1) * num_S;
+ const uint64_t fden = (xmxm + ymym + C1) * den_S;
+ const double r = (double)fnum / fden;
+ assert(r >= 0. && r <= 1.0);
+ return r;
+ }
+ return 1.; // area is too dark to contribute meaningfully
+}
+
+double VP8SSIMFromStats(const VP8DistoStats* const stats) {
+ return SSIMCalculation(stats, kWeightSum);
+}
+
+double VP8SSIMFromStatsClipped(const VP8DistoStats* const stats) {
+ return SSIMCalculation(stats, stats->w);
+}
+
+static double SSIMGetClipped_C(const uint8_t* src1, int stride1,
+ const uint8_t* src2, int stride2,
+ int xo, int yo, int W, int H) {
+ VP8DistoStats stats = { 0, 0, 0, 0, 0, 0 };
+ const int ymin = (yo - VP8_SSIM_KERNEL < 0) ? 0 : yo - VP8_SSIM_KERNEL;
+ const int ymax = (yo + VP8_SSIM_KERNEL > H - 1) ? H - 1
+ : yo + VP8_SSIM_KERNEL;
+ const int xmin = (xo - VP8_SSIM_KERNEL < 0) ? 0 : xo - VP8_SSIM_KERNEL;
+ const int xmax = (xo + VP8_SSIM_KERNEL > W - 1) ? W - 1
+ : xo + VP8_SSIM_KERNEL;
+ int x, y;
+ src1 += ymin * stride1;
+ src2 += ymin * stride2;
+ for (y = ymin; y <= ymax; ++y, src1 += stride1, src2 += stride2) {
+ for (x = xmin; x <= xmax; ++x) {
+ const uint32_t w = kWeight[VP8_SSIM_KERNEL + x - xo]
+ * kWeight[VP8_SSIM_KERNEL + y - yo];
+ const uint32_t s1 = src1[x];
+ const uint32_t s2 = src2[x];
+ stats.w += w;
+ stats.xm += w * s1;
+ stats.ym += w * s2;
+ stats.xxm += w * s1 * s1;
+ stats.xym += w * s1 * s2;
+ stats.yym += w * s2 * s2;
+ }
+ }
+ return VP8SSIMFromStatsClipped(&stats);
+}
+
+static double SSIMGet_C(const uint8_t* src1, int stride1,
+ const uint8_t* src2, int stride2) {
+ VP8DistoStats stats = { 0, 0, 0, 0, 0, 0 };
+ int x, y;
+ for (y = 0; y <= 2 * VP8_SSIM_KERNEL; ++y, src1 += stride1, src2 += stride2) {
+ for (x = 0; x <= 2 * VP8_SSIM_KERNEL; ++x) {
+ const uint32_t w = kWeight[x] * kWeight[y];
+ const uint32_t s1 = src1[x];
+ const uint32_t s2 = src2[x];
+ stats.xm += w * s1;
+ stats.ym += w * s2;
+ stats.xxm += w * s1 * s1;
+ stats.xym += w * s1 * s2;
+ stats.yym += w * s2 * s2;
+ }
+ }
+ return VP8SSIMFromStats(&stats);
+}
+
+#endif // !defined(WEBP_REDUCE_SIZE)
+
+//------------------------------------------------------------------------------
+
+#if !defined(WEBP_DISABLE_STATS)
+static uint32_t AccumulateSSE_C(const uint8_t* src1,
+ const uint8_t* src2, int len) {
+ int i;
+ uint32_t sse2 = 0;
+ assert(len <= 65535); // to ensure that accumulation fits within uint32_t
+ for (i = 0; i < len; ++i) {
+ const int32_t diff = src1[i] - src2[i];
+ sse2 += diff * diff;
+ }
+ return sse2;
+}
+#endif
+
+//------------------------------------------------------------------------------
+
+#if !defined(WEBP_REDUCE_SIZE)
+VP8SSIMGetFunc VP8SSIMGet;
+VP8SSIMGetClippedFunc VP8SSIMGetClipped;
+#endif
+#if !defined(WEBP_DISABLE_STATS)
+VP8AccumulateSSEFunc VP8AccumulateSSE;
+#endif
+
+extern void VP8SSIMDspInitSSE2(void);
+
+WEBP_DSP_INIT_FUNC(VP8SSIMDspInit) {
+#if !defined(WEBP_REDUCE_SIZE)
+ VP8SSIMGetClipped = SSIMGetClipped_C;
+ VP8SSIMGet = SSIMGet_C;
+#endif
+
+#if !defined(WEBP_DISABLE_STATS)
+ VP8AccumulateSSE = AccumulateSSE_C;
+#endif
+
+ if (VP8GetCPUInfo != NULL) {
+#if defined(WEBP_HAVE_SSE2)
+ if (VP8GetCPUInfo(kSSE2)) {
+ VP8SSIMDspInitSSE2();
+ }
+#endif
+ }
+}
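SSIMCalculation evaluates the standard SSIM ratio on weighted sums rather than means: with N the total weight, sxy = xym*N - xm*ym is N^2 times the covariance, and the result is (2*xm*ym + C1)(2*sxy + C2) / ((xm^2 + ym^2 + C1)(sxx + syy + C2)); the >> 8 descaling only keeps the 64-bit multiplies from overflowing and cancels in the ratio. A floating-point cross-check of the same computation (illustrative numbers):

    #include <stdio.h>

    /* stats hold weighted sums, exactly as in VP8DistoStats. */
    typedef struct { double w, xm, ym, xxm, xym, yym; } Stats;

    static double SSIM(const Stats* s) {
      const double N = s->w;
      const double C1 = 20. * N * N, C2 = 60. * N * N;
      const double sxy = s->xym * N - s->xm * s->ym;  /* N^2 * covariance  */
      const double sxx = s->xxm * N - s->xm * s->xm;  /* N^2 * variance(x) */
      const double syy = s->yym * N - s->ym * s->ym;
      return (2. * s->xm * s->ym + C1) * (2. * (sxy < 0. ? 0. : sxy) + C2)
           / ((s->xm * s->xm + s->ym * s->ym + C1) * (sxx + syy + C2));
    }

    int main(void) {
      /* identical signals (weight 16, constant value 100) -> SSIM == 1 */
      const Stats same = { 16., 1600., 1600., 160000., 160000., 160000. };
      printf("%f\n", SSIM(&same));   /* 1.000000 */
      return 0;
    }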
diff --git a/media/libwebp/dsp/ssim_sse2.c b/media/libwebp/dsp/ssim_sse2.c
new file mode 100644
index 0000000000..89810a77c8
--- /dev/null
+++ b/media/libwebp/dsp/ssim_sse2.c
@@ -0,0 +1,165 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// SSE2 version of distortion calculation
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_SSE2)
+
+#include <assert.h>
+#include <emmintrin.h>
+
+#include "../dsp/common_sse2.h"
+
+#if !defined(WEBP_DISABLE_STATS)
+
+// Helper function
+static WEBP_INLINE void SubtractAndSquare_SSE2(const __m128i a, const __m128i b,
+ __m128i* const sum) {
+ // take abs(a-b) in 8b
+ const __m128i a_b = _mm_subs_epu8(a, b);
+ const __m128i b_a = _mm_subs_epu8(b, a);
+ const __m128i abs_a_b = _mm_or_si128(a_b, b_a);
+ // zero-extend to 16b
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i C0 = _mm_unpacklo_epi8(abs_a_b, zero);
+ const __m128i C1 = _mm_unpackhi_epi8(abs_a_b, zero);
+ // multiply with self
+ const __m128i sum1 = _mm_madd_epi16(C0, C0);
+ const __m128i sum2 = _mm_madd_epi16(C1, C1);
+ *sum = _mm_add_epi32(sum1, sum2);
+}
+
+//------------------------------------------------------------------------------
+// SSIM / PSNR entry point
+
+static uint32_t AccumulateSSE_SSE2(const uint8_t* src1,
+ const uint8_t* src2, int len) {
+ int i = 0;
+ uint32_t sse2 = 0;
+ if (len >= 16) {
+ const int limit = len - 32;
+ int32_t tmp[4];
+ __m128i sum1;
+ __m128i sum = _mm_setzero_si128();
+ __m128i a0 = _mm_loadu_si128((const __m128i*)&src1[i]);
+ __m128i b0 = _mm_loadu_si128((const __m128i*)&src2[i]);
+ i += 16;
+ while (i <= limit) {
+ const __m128i a1 = _mm_loadu_si128((const __m128i*)&src1[i]);
+ const __m128i b1 = _mm_loadu_si128((const __m128i*)&src2[i]);
+ __m128i sum2;
+ i += 16;
+ SubtractAndSquare_SSE2(a0, b0, &sum1);
+ sum = _mm_add_epi32(sum, sum1);
+ a0 = _mm_loadu_si128((const __m128i*)&src1[i]);
+ b0 = _mm_loadu_si128((const __m128i*)&src2[i]);
+ i += 16;
+ SubtractAndSquare_SSE2(a1, b1, &sum2);
+ sum = _mm_add_epi32(sum, sum2);
+ }
+ SubtractAndSquare_SSE2(a0, b0, &sum1);
+ sum = _mm_add_epi32(sum, sum1);
+ _mm_storeu_si128((__m128i*)tmp, sum);
+ sse2 += (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
+ }
+
+ for (; i < len; ++i) {
+ const int32_t diff = src1[i] - src2[i];
+ sse2 += diff * diff;
+ }
+ return sse2;
+}
+#endif // !defined(WEBP_DISABLE_STATS)
+
+#if !defined(WEBP_REDUCE_SIZE)
+
+static uint32_t HorizontalAdd16b_SSE2(const __m128i* const m) {
+ uint16_t tmp[8];
+ const __m128i a = _mm_srli_si128(*m, 8);
+ const __m128i b = _mm_add_epi16(*m, a);
+ _mm_storeu_si128((__m128i*)tmp, b);
+ return (uint32_t)tmp[3] + tmp[2] + tmp[1] + tmp[0];
+}
+
+static uint32_t HorizontalAdd32b_SSE2(const __m128i* const m) {
+ const __m128i a = _mm_srli_si128(*m, 8);
+ const __m128i b = _mm_add_epi32(*m, a);
+ const __m128i c = _mm_add_epi32(b, _mm_srli_si128(b, 4));
+ return (uint32_t)_mm_cvtsi128_si32(c);
+}
+
+static const uint16_t kWeight[] = { 1, 2, 3, 4, 3, 2, 1, 0 };
+
+#define ACCUMULATE_ROW(WEIGHT) do { \
+ /* compute row weight (Wx * Wy) */ \
+ const __m128i Wy = _mm_set1_epi16((WEIGHT)); \
+ const __m128i W = _mm_mullo_epi16(Wx, Wy); \
+ /* process 8 bytes at a time (7 bytes, actually) */ \
+ const __m128i a0 = _mm_loadl_epi64((const __m128i*)src1); \
+ const __m128i b0 = _mm_loadl_epi64((const __m128i*)src2); \
+ /* convert to 16b and multiply by weight */ \
+ const __m128i a1 = _mm_unpacklo_epi8(a0, zero); \
+ const __m128i b1 = _mm_unpacklo_epi8(b0, zero); \
+ const __m128i wa1 = _mm_mullo_epi16(a1, W); \
+ const __m128i wb1 = _mm_mullo_epi16(b1, W); \
+ /* accumulate */ \
+ xm = _mm_add_epi16(xm, wa1); \
+ ym = _mm_add_epi16(ym, wb1); \
+ xxm = _mm_add_epi32(xxm, _mm_madd_epi16(a1, wa1)); \
+ xym = _mm_add_epi32(xym, _mm_madd_epi16(a1, wb1)); \
+ yym = _mm_add_epi32(yym, _mm_madd_epi16(b1, wb1)); \
+ src1 += stride1; \
+ src2 += stride2; \
+} while (0)
+
+static double SSIMGet_SSE2(const uint8_t* src1, int stride1,
+ const uint8_t* src2, int stride2) {
+ VP8DistoStats stats;
+ const __m128i zero = _mm_setzero_si128();
+ __m128i xm = zero, ym = zero; // 16b accums
+ __m128i xxm = zero, yym = zero, xym = zero; // 32b accum
+ const __m128i Wx = _mm_loadu_si128((const __m128i*)kWeight);
+ assert(2 * VP8_SSIM_KERNEL + 1 == 7);
+ ACCUMULATE_ROW(1);
+ ACCUMULATE_ROW(2);
+ ACCUMULATE_ROW(3);
+ ACCUMULATE_ROW(4);
+ ACCUMULATE_ROW(3);
+ ACCUMULATE_ROW(2);
+ ACCUMULATE_ROW(1);
+ stats.xm = HorizontalAdd16b_SSE2(&xm);
+ stats.ym = HorizontalAdd16b_SSE2(&ym);
+ stats.xxm = HorizontalAdd32b_SSE2(&xxm);
+ stats.xym = HorizontalAdd32b_SSE2(&xym);
+ stats.yym = HorizontalAdd32b_SSE2(&yym);
+ return VP8SSIMFromStats(&stats);
+}
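
SSIMGet_SSE2 gathers the five weighted sums that VP8SSIMFromStats consumes (sum of w*x, w*y, w*x^2, w*x*y, w*y^2) over a 7x7 window, with the per-pixel weight Wx*Wy taken from the triangular kWeight kernel. A scalar sketch of the same gathering, assuming the VP8DistoStats fields used above and a zero-initialized struct (SSIMAccumulate_C is a hypothetical name):

/* Scalar sketch of the moment gathering done by SSIMGet_SSE2.
 * Assumes '*stats' was zero-initialized by the caller. */
static void SSIMAccumulate_C(const uint8_t* src1, int stride1,
                             const uint8_t* src2, int stride2,
                             VP8DistoStats* const stats) {
  static const int kW[7] = { 1, 2, 3, 4, 3, 2, 1 };   // same as kWeight
  int x, y;
  for (y = 0; y < 7; ++y) {
    for (x = 0; x < 7; ++x) {
      const int w = kW[x] * kW[y];        // Wx * Wy, as in ACCUMULATE_ROW
      const int s1 = src1[x + y * stride1];
      const int s2 = src2[x + y * stride2];
      stats->xm  += w * s1;
      stats->ym  += w * s2;
      stats->xxm += w * s1 * s1;
      stats->xym += w * s1 * s2;
      stats->yym += w * s2 * s2;
    }
  }
}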
+
+#endif // !defined(WEBP_REDUCE_SIZE)
+
+extern void VP8SSIMDspInitSSE2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInitSSE2(void) {
+#if !defined(WEBP_DISABLE_STATS)
+ VP8AccumulateSSE = AccumulateSSE_SSE2;
+#endif
+#if !defined(WEBP_REDUCE_SIZE)
+ VP8SSIMGet = SSIMGet_SSE2;
+#endif
+}
+
+#else // !WEBP_USE_SSE2
+
+WEBP_DSP_INIT_STUB(VP8SSIMDspInitSSE2)
+
+#endif // WEBP_USE_SSE2
diff --git a/media/libwebp/dsp/upsampling.c b/media/libwebp/dsp/upsampling.c
index b76483a3a6..aa1c807e89 100644
--- a/media/libwebp/dsp/upsampling.c
+++ b/media/libwebp/dsp/upsampling.c
@@ -233,12 +233,12 @@ WEBP_DSP_INIT_FUNC(WebPInitYUV444Converters) {
WebPYUV444Converters[MODE_rgbA_4444] = WebPYuv444ToRgba4444_C;
if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
+#if defined(WEBP_HAVE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
WebPInitYUV444ConvertersSSE2();
}
#endif
-#if defined(WEBP_USE_SSE41)
+#if defined(WEBP_HAVE_SSE41)
if (VP8GetCPUInfo(kSSE4_1)) {
WebPInitYUV444ConvertersSSE41();
}
@@ -278,12 +278,12 @@ WEBP_DSP_INIT_FUNC(WebPInitUpsamplers) {
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
+#if defined(WEBP_HAVE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
WebPInitUpsamplersSSE2();
}
#endif
-#if defined(WEBP_USE_SSE41)
+#if defined(WEBP_HAVE_SSE41)
if (VP8GetCPUInfo(kSSE4_1)) {
WebPInitUpsamplersSSE41();
}
@@ -300,7 +300,7 @@ WEBP_DSP_INIT_FUNC(WebPInitUpsamplers) {
#endif
}
-#if defined(WEBP_USE_NEON)
+#if defined(WEBP_HAVE_NEON)
if (WEBP_NEON_OMIT_C_CODE ||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
WebPInitUpsamplersNEON();
diff --git a/media/libwebp/dsp/upsampling_mips_dsp_r2.c b/media/libwebp/dsp/upsampling_mips_dsp_r2.c
new file mode 100644
index 0000000000..2789c29e02
--- /dev/null
+++ b/media/libwebp/dsp/upsampling_mips_dsp_r2.c
@@ -0,0 +1,291 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// YUV to RGB upsampling functions.
+//
+// Author(s): Branimir Vasic (branimir.vasic@imgtec.com)
+// Djordje Pesut (djordje.pesut@imgtec.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_MIPS_DSP_R2)
+
+#include <assert.h>
+#include "../dsp/yuv.h"
+
+#define YUV_TO_RGB(Y, U, V, R, G, B) do { \
+ const int t1 = MultHi(Y, 19077); \
+ const int t2 = MultHi(V, 13320); \
+ R = MultHi(V, 26149); \
+ G = MultHi(U, 6419); \
+ B = MultHi(U, 33050); \
+ R = t1 + R; \
+ G = t1 - G; \
+ B = t1 + B; \
+ R = R - 14234; \
+ G = G - t2 + 8708; \
+ B = B - 17685; \
+ __asm__ volatile ( \
+ "shll_s.w %[" #R "], %[" #R "], 17 \n\t" \
+ "shll_s.w %[" #G "], %[" #G "], 17 \n\t" \
+ "shll_s.w %[" #B "], %[" #B "], 17 \n\t" \
+ "precrqu_s.qb.ph %[" #R "], %[" #R "], $zero \n\t" \
+ "precrqu_s.qb.ph %[" #G "], %[" #G "], $zero \n\t" \
+ "precrqu_s.qb.ph %[" #B "], %[" #B "], $zero \n\t" \
+ "srl %[" #R "], %[" #R "], 24 \n\t" \
+ "srl %[" #G "], %[" #G "], 24 \n\t" \
+ "srl %[" #B "], %[" #B "], 24 \n\t" \
+ : [R]"+r"(R), [G]"+r"(G), [B]"+r"(B) \
+ : \
+ ); \
+ } while (0)
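
The saturating shift/pack sequence at the end of this macro implements the final "shift right by 6 and clamp to [0, 255]" step of the usual fixed-point YUV-to-RGB formula. For comparison, a plain-C sketch of the same conversion with identical constants (MultHi comes from yuv.h; ClipTo8 and YuvToRgbRef are hypothetical names — the MSA file later in this patch carries the equivalent Clip8/YuvToRgb pair):

/* Plain-C counterpart of the YUV_TO_RGB macro above (sketch only). */
static int ClipTo8(int v) { return (v < 0) ? 0 : (v > 255) ? 255 : v; }

static void YuvToRgbRef(int y, int u, int v, uint8_t* const rgb) {
  const int y1 = MultHi(y, 19077);
  rgb[0] = ClipTo8((y1 + MultHi(v, 26149) - 14234) >> 6);                   // R
  rgb[1] = ClipTo8((y1 - MultHi(u, 6419) - MultHi(v, 13320) + 8708) >> 6);  // G
  rgb[2] = ClipTo8((y1 + MultHi(u, 33050) - 17685) >> 6);                   // B
}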
+
+#if !defined(WEBP_REDUCE_CSP)
+static WEBP_INLINE void YuvToRgb(int y, int u, int v, uint8_t* const rgb) {
+ int r, g, b;
+ YUV_TO_RGB(y, u, v, r, g, b);
+ rgb[0] = r;
+ rgb[1] = g;
+ rgb[2] = b;
+}
+static WEBP_INLINE void YuvToBgr(int y, int u, int v, uint8_t* const bgr) {
+ int r, g, b;
+ YUV_TO_RGB(y, u, v, r, g, b);
+ bgr[0] = b;
+ bgr[1] = g;
+ bgr[2] = r;
+}
+static WEBP_INLINE void YuvToRgb565(int y, int u, int v, uint8_t* const rgb) {
+ int r, g, b;
+ YUV_TO_RGB(y, u, v, r, g, b);
+ {
+ const int rg = (r & 0xf8) | (g >> 5);
+ const int gb = ((g << 3) & 0xe0) | (b >> 3);
+#if (WEBP_SWAP_16BIT_CSP == 1)
+ rgb[0] = gb;
+ rgb[1] = rg;
+#else
+ rgb[0] = rg;
+ rgb[1] = gb;
+#endif
+ }
+}
+static WEBP_INLINE void YuvToRgba4444(int y, int u, int v,
+ uint8_t* const argb) {
+ int r, g, b;
+ YUV_TO_RGB(y, u, v, r, g, b);
+ {
+ const int rg = (r & 0xf0) | (g >> 4);
+ const int ba = (b & 0xf0) | 0x0f; // overwrite the lower 4 bits
+#if (WEBP_SWAP_16BIT_CSP == 1)
+ argb[0] = ba;
+ argb[1] = rg;
+#else
+ argb[0] = rg;
+ argb[1] = ba;
+#endif
+ }
+}
+#endif // WEBP_REDUCE_CSP
+
+//-----------------------------------------------------------------------------
+// Alpha handling variants
+
+#if !defined(WEBP_REDUCE_CSP)
+static WEBP_INLINE void YuvToArgb(uint8_t y, uint8_t u, uint8_t v,
+ uint8_t* const argb) {
+ int r, g, b;
+ YUV_TO_RGB(y, u, v, r, g, b);
+ argb[0] = 0xff;
+ argb[1] = r;
+ argb[2] = g;
+ argb[3] = b;
+}
+#endif // WEBP_REDUCE_CSP
+static WEBP_INLINE void YuvToBgra(uint8_t y, uint8_t u, uint8_t v,
+ uint8_t* const bgra) {
+ int r, g, b;
+ YUV_TO_RGB(y, u, v, r, g, b);
+ bgra[0] = b;
+ bgra[1] = g;
+ bgra[2] = r;
+ bgra[3] = 0xff;
+}
+static WEBP_INLINE void YuvToRgba(uint8_t y, uint8_t u, uint8_t v,
+ uint8_t* const rgba) {
+ int r, g, b;
+ YUV_TO_RGB(y, u, v, r, g, b);
+ rgba[0] = r;
+ rgba[1] = g;
+ rgba[2] = b;
+ rgba[3] = 0xff;
+}
+
+//------------------------------------------------------------------------------
+// Fancy upsampler
+
+#ifdef FANCY_UPSAMPLING
+
+// Given samples laid out in a square as:
+// [a b]
+// [c d]
+// we interpolate u/v as:
+//   ([9*a + 3*b + 3*c +   d    3*a + 9*b +   c + 3*d] + [8 8]) / 16
+//   ([3*a +   b + 9*c + 3*d      a + 3*b + 3*c + 9*d] + [8 8]) / 16
+
+// We process u and v together stashed into 32bit (16bit each).
+#define LOAD_UV(u, v) ((u) | ((v) << 16))
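
As a concrete check of the interpolation weights (illustrative values only): for a 2x2 chroma block with a = 100, b = 120, c = 140, d = 160, the two top outputs are

  (9*100 + 3*120 + 3*140 +   160 + 8) / 16 = 1848 / 16 = 115
  (3*100 + 9*120 +   140 + 3*160 + 8) / 16 = 2008 / 16 = 125

using integer division; the bottom pair follows symmetrically from the second row of weights.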
+
+#define UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \
+static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \
+ const uint8_t* top_u, const uint8_t* top_v, \
+ const uint8_t* cur_u, const uint8_t* cur_v, \
+ uint8_t* top_dst, uint8_t* bottom_dst, int len) { \
+ int x; \
+ const int last_pixel_pair = (len - 1) >> 1; \
+ uint32_t tl_uv = LOAD_UV(top_u[0], top_v[0]); /* top-left sample */ \
+  uint32_t l_uv = LOAD_UV(cur_u[0], cur_v[0]);     /* left sample */          \
+ assert(top_y != NULL); \
+ { \
+ const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2; \
+ FUNC(top_y[0], uv0 & 0xff, (uv0 >> 16), top_dst); \
+ } \
+ if (bottom_y != NULL) { \
+ const uint32_t uv0 = (3 * l_uv + tl_uv + 0x00020002u) >> 2; \
+ FUNC(bottom_y[0], uv0 & 0xff, (uv0 >> 16), bottom_dst); \
+ } \
+ for (x = 1; x <= last_pixel_pair; ++x) { \
+ const uint32_t t_uv = LOAD_UV(top_u[x], top_v[x]); /* top sample */ \
+ const uint32_t uv = LOAD_UV(cur_u[x], cur_v[x]); /* sample */ \
+    /* precompute invariant values associated with first and second diagonals */ \
+ const uint32_t avg = tl_uv + t_uv + l_uv + uv + 0x00080008u; \
+ const uint32_t diag_12 = (avg + 2 * (t_uv + l_uv)) >> 3; \
+ const uint32_t diag_03 = (avg + 2 * (tl_uv + uv)) >> 3; \
+ { \
+ const uint32_t uv0 = (diag_12 + tl_uv) >> 1; \
+ const uint32_t uv1 = (diag_03 + t_uv) >> 1; \
+ FUNC(top_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16), \
+ top_dst + (2 * x - 1) * XSTEP); \
+ FUNC(top_y[2 * x - 0], uv1 & 0xff, (uv1 >> 16), \
+ top_dst + (2 * x - 0) * XSTEP); \
+ } \
+ if (bottom_y != NULL) { \
+ const uint32_t uv0 = (diag_03 + l_uv) >> 1; \
+ const uint32_t uv1 = (diag_12 + uv) >> 1; \
+ FUNC(bottom_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16), \
+ bottom_dst + (2 * x - 1) * XSTEP); \
+ FUNC(bottom_y[2 * x + 0], uv1 & 0xff, (uv1 >> 16), \
+ bottom_dst + (2 * x + 0) * XSTEP); \
+ } \
+ tl_uv = t_uv; \
+ l_uv = uv; \
+ } \
+ if (!(len & 1)) { \
+ { \
+ const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2; \
+ FUNC(top_y[len - 1], uv0 & 0xff, (uv0 >> 16), \
+ top_dst + (len - 1) * XSTEP); \
+ } \
+ if (bottom_y != NULL) { \
+ const uint32_t uv0 = (3 * l_uv + tl_uv + 0x00020002u) >> 2; \
+ FUNC(bottom_y[len - 1], uv0 & 0xff, (uv0 >> 16), \
+ bottom_dst + (len - 1) * XSTEP); \
+ } \
+ } \
+}
+
+// All variants implemented.
+UPSAMPLE_FUNC(UpsampleRgbaLinePair, YuvToRgba, 4)
+UPSAMPLE_FUNC(UpsampleBgraLinePair, YuvToBgra, 4)
+#if !defined(WEBP_REDUCE_CSP)
+UPSAMPLE_FUNC(UpsampleRgbLinePair, YuvToRgb, 3)
+UPSAMPLE_FUNC(UpsampleBgrLinePair, YuvToBgr, 3)
+UPSAMPLE_FUNC(UpsampleArgbLinePair, YuvToArgb, 4)
+UPSAMPLE_FUNC(UpsampleRgba4444LinePair, YuvToRgba4444, 2)
+UPSAMPLE_FUNC(UpsampleRgb565LinePair, YuvToRgb565, 2)
+#endif // WEBP_REDUCE_CSP
+
+#undef LOAD_UV
+#undef UPSAMPLE_FUNC
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void WebPInitUpsamplersMIPSdspR2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersMIPSdspR2(void) {
+ WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair;
+ WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair;
+ WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair;
+ WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair;
+#if !defined(WEBP_REDUCE_CSP)
+ WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair;
+ WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair;
+ WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair;
+ WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair;
+ WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair;
+ WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair;
+ WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair;
+#endif // WEBP_REDUCE_CSP
+}
+
+#endif // FANCY_UPSAMPLING
+
+//------------------------------------------------------------------------------
+// YUV444 converter
+
+#define YUV444_FUNC(FUNC_NAME, FUNC, XSTEP) \
+static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \
+ uint8_t* dst, int len) { \
+ int i; \
+ for (i = 0; i < len; ++i) FUNC(y[i], u[i], v[i], &dst[i * XSTEP]); \
+}
+
+YUV444_FUNC(Yuv444ToRgba, YuvToRgba, 4)
+YUV444_FUNC(Yuv444ToBgra, YuvToBgra, 4)
+#if !defined(WEBP_REDUCE_CSP)
+YUV444_FUNC(Yuv444ToRgb, YuvToRgb, 3)
+YUV444_FUNC(Yuv444ToBgr, YuvToBgr, 3)
+YUV444_FUNC(Yuv444ToArgb, YuvToArgb, 4)
+YUV444_FUNC(Yuv444ToRgba4444, YuvToRgba4444, 2)
+YUV444_FUNC(Yuv444ToRgb565, YuvToRgb565, 2)
+#endif // WEBP_REDUCE_CSP
+
+#undef YUV444_FUNC
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void WebPInitYUV444ConvertersMIPSdspR2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitYUV444ConvertersMIPSdspR2(void) {
+ WebPYUV444Converters[MODE_RGBA] = Yuv444ToRgba;
+ WebPYUV444Converters[MODE_BGRA] = Yuv444ToBgra;
+ WebPYUV444Converters[MODE_rgbA] = Yuv444ToRgba;
+ WebPYUV444Converters[MODE_bgrA] = Yuv444ToBgra;
+#if !defined(WEBP_REDUCE_CSP)
+ WebPYUV444Converters[MODE_RGB] = Yuv444ToRgb;
+ WebPYUV444Converters[MODE_BGR] = Yuv444ToBgr;
+ WebPYUV444Converters[MODE_ARGB] = Yuv444ToArgb;
+ WebPYUV444Converters[MODE_RGBA_4444] = Yuv444ToRgba4444;
+ WebPYUV444Converters[MODE_RGB_565] = Yuv444ToRgb565;
+ WebPYUV444Converters[MODE_Argb] = Yuv444ToArgb;
+ WebPYUV444Converters[MODE_rgbA_4444] = Yuv444ToRgba4444;
+#endif // WEBP_REDUCE_CSP
+}
+
+#else // !WEBP_USE_MIPS_DSP_R2
+
+WEBP_DSP_INIT_STUB(WebPInitYUV444ConvertersMIPSdspR2)
+
+#endif // WEBP_USE_MIPS_DSP_R2
+
+#if !(defined(FANCY_UPSAMPLING) && defined(WEBP_USE_MIPS_DSP_R2))
+WEBP_DSP_INIT_STUB(WebPInitUpsamplersMIPSdspR2)
+#endif
diff --git a/media/libwebp/dsp/upsampling_msa.c b/media/libwebp/dsp/upsampling_msa.c
new file mode 100644
index 0000000000..d8ef6feb58
--- /dev/null
+++ b/media/libwebp/dsp/upsampling_msa.c
@@ -0,0 +1,688 @@
+// Copyright 2016 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MSA version of YUV to RGB upsampling functions.
+//
+// Author: Prashant Patil (prashant.patil@imgtec.com)
+
+#include <string.h>
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_MSA)
+
+#include "../dsp/msa_macro.h"
+#include "../dsp/yuv.h"
+
+#ifdef FANCY_UPSAMPLING
+
+#define ILVR_UW2(in, out0, out1) do { \
+ const v8i16 t0 = (v8i16)__msa_ilvr_b((v16i8)zero, (v16i8)in); \
+ out0 = (v4u32)__msa_ilvr_h((v8i16)zero, t0); \
+ out1 = (v4u32)__msa_ilvl_h((v8i16)zero, t0); \
+} while (0)
+
+#define ILVRL_UW4(in, out0, out1, out2, out3) do { \
+ v16u8 t0, t1; \
+ ILVRL_B2_UB(zero, in, t0, t1); \
+ ILVRL_H2_UW(zero, t0, out0, out1); \
+ ILVRL_H2_UW(zero, t1, out2, out3); \
+} while (0)
+
+#define MULTHI_16(in0, in1, in2, in3, cnst, out0, out1) do { \
+ const v4i32 const0 = (v4i32)__msa_fill_w(cnst * 256); \
+ v4u32 temp0, temp1, temp2, temp3; \
+ MUL4(in0, const0, in1, const0, in2, const0, in3, const0, \
+ temp0, temp1, temp2, temp3); \
+ PCKOD_H2_UH(temp1, temp0, temp3, temp2, out0, out1); \
+} while (0)
+
+#define MULTHI_8(in0, in1, cnst, out0) do { \
+ const v4i32 const0 = (v4i32)__msa_fill_w(cnst * 256); \
+ v4u32 temp0, temp1; \
+ MUL2(in0, const0, in1, const0, temp0, temp1); \
+ out0 = (v8u16)__msa_pckod_h((v8i16)temp1, (v8i16)temp0); \
+} while (0)
+
+#define CALC_R16(y0, y1, v0, v1, dst) do { \
+ const v8i16 const_a = (v8i16)__msa_fill_h(14234); \
+ const v8i16 a0 = __msa_adds_s_h((v8i16)y0, (v8i16)v0); \
+ const v8i16 a1 = __msa_adds_s_h((v8i16)y1, (v8i16)v1); \
+ v8i16 b0 = __msa_subs_s_h(a0, const_a); \
+ v8i16 b1 = __msa_subs_s_h(a1, const_a); \
+ SRAI_H2_SH(b0, b1, 6); \
+ CLIP_SH2_0_255(b0, b1); \
+ dst = (v16u8)__msa_pckev_b((v16i8)b1, (v16i8)b0); \
+} while (0)
+
+#define CALC_R8(y0, v0, dst) do { \
+ const v8i16 const_a = (v8i16)__msa_fill_h(14234); \
+ const v8i16 a0 = __msa_adds_s_h((v8i16)y0, (v8i16)v0); \
+ v8i16 b0 = __msa_subs_s_h(a0, const_a); \
+ b0 = SRAI_H(b0, 6); \
+ CLIP_SH_0_255(b0); \
+ dst = (v16u8)__msa_pckev_b((v16i8)b0, (v16i8)b0); \
+} while (0)
+
+#define CALC_G16(y0, y1, u0, u1, v0, v1, dst) do { \
+ const v8i16 const_a = (v8i16)__msa_fill_h(8708); \
+ v8i16 a0 = __msa_subs_s_h((v8i16)y0, (v8i16)u0); \
+ v8i16 a1 = __msa_subs_s_h((v8i16)y1, (v8i16)u1); \
+ const v8i16 b0 = __msa_subs_s_h(a0, (v8i16)v0); \
+ const v8i16 b1 = __msa_subs_s_h(a1, (v8i16)v1); \
+ a0 = __msa_adds_s_h(b0, const_a); \
+ a1 = __msa_adds_s_h(b1, const_a); \
+ SRAI_H2_SH(a0, a1, 6); \
+ CLIP_SH2_0_255(a0, a1); \
+ dst = (v16u8)__msa_pckev_b((v16i8)a1, (v16i8)a0); \
+} while (0)
+
+#define CALC_G8(y0, u0, v0, dst) do { \
+ const v8i16 const_a = (v8i16)__msa_fill_h(8708); \
+ v8i16 a0 = __msa_subs_s_h((v8i16)y0, (v8i16)u0); \
+ const v8i16 b0 = __msa_subs_s_h(a0, (v8i16)v0); \
+ a0 = __msa_adds_s_h(b0, const_a); \
+ a0 = SRAI_H(a0, 6); \
+ CLIP_SH_0_255(a0); \
+ dst = (v16u8)__msa_pckev_b((v16i8)a0, (v16i8)a0); \
+} while (0)
+
+#define CALC_B16(y0, y1, u0, u1, dst) do { \
+ const v8u16 const_a = (v8u16)__msa_fill_h(17685); \
+ const v8u16 a0 = __msa_adds_u_h((v8u16)y0, u0); \
+ const v8u16 a1 = __msa_adds_u_h((v8u16)y1, u1); \
+ v8u16 b0 = __msa_subs_u_h(a0, const_a); \
+ v8u16 b1 = __msa_subs_u_h(a1, const_a); \
+ SRAI_H2_UH(b0, b1, 6); \
+ CLIP_UH2_0_255(b0, b1); \
+ dst = (v16u8)__msa_pckev_b((v16i8)b1, (v16i8)b0); \
+} while (0)
+
+#define CALC_B8(y0, u0, dst) do { \
+ const v8u16 const_a = (v8u16)__msa_fill_h(17685); \
+ const v8u16 a0 = __msa_adds_u_h((v8u16)y0, u0); \
+ v8u16 b0 = __msa_subs_u_h(a0, const_a); \
+ b0 = SRAI_H(b0, 6); \
+ CLIP_UH_0_255(b0); \
+ dst = (v16u8)__msa_pckev_b((v16i8)b0, (v16i8)b0); \
+} while (0)
+
+#define CALC_RGB16(y, u, v, R, G, B) do { \
+ const v16u8 zero = { 0 }; \
+ v8u16 y0, y1, u0, u1, v0, v1; \
+ v4u32 p0, p1, p2, p3; \
+ const v16u8 in_y = LD_UB(y); \
+ const v16u8 in_u = LD_UB(u); \
+ const v16u8 in_v = LD_UB(v); \
+ ILVRL_UW4(in_y, p0, p1, p2, p3); \
+ MULTHI_16(p0, p1, p2, p3, 19077, y0, y1); \
+ ILVRL_UW4(in_v, p0, p1, p2, p3); \
+ MULTHI_16(p0, p1, p2, p3, 26149, v0, v1); \
+ CALC_R16(y0, y1, v0, v1, R); \
+ MULTHI_16(p0, p1, p2, p3, 13320, v0, v1); \
+ ILVRL_UW4(in_u, p0, p1, p2, p3); \
+ MULTHI_16(p0, p1, p2, p3, 6419, u0, u1); \
+ CALC_G16(y0, y1, u0, u1, v0, v1, G); \
+ MULTHI_16(p0, p1, p2, p3, 33050, u0, u1); \
+ CALC_B16(y0, y1, u0, u1, B); \
+} while (0)
+
+#define CALC_RGB8(y, u, v, R, G, B) do { \
+ const v16u8 zero = { 0 }; \
+ v8u16 y0, u0, v0; \
+ v4u32 p0, p1; \
+ const v16u8 in_y = LD_UB(y); \
+ const v16u8 in_u = LD_UB(u); \
+ const v16u8 in_v = LD_UB(v); \
+ ILVR_UW2(in_y, p0, p1); \
+ MULTHI_8(p0, p1, 19077, y0); \
+ ILVR_UW2(in_v, p0, p1); \
+ MULTHI_8(p0, p1, 26149, v0); \
+ CALC_R8(y0, v0, R); \
+ MULTHI_8(p0, p1, 13320, v0); \
+ ILVR_UW2(in_u, p0, p1); \
+ MULTHI_8(p0, p1, 6419, u0); \
+ CALC_G8(y0, u0, v0, G); \
+ MULTHI_8(p0, p1, 33050, u0); \
+ CALC_B8(y0, u0, B); \
+} while (0)
+
+#define STORE16_3(a0, a1, a2, dst) do { \
+ const v16u8 mask0 = { 0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19, \
+ 8, 9, 20, 10 }; \
+ const v16u8 mask1 = { 0, 21, 1, 2, 22, 3, 4, 23, 5, 6, 24, 7, \
+ 8, 25, 9, 10 }; \
+ const v16u8 mask2 = { 26, 0, 1, 27, 2, 3, 28, 4, 5, 29, 6, 7, \
+ 30, 8, 9, 31 }; \
+ v16u8 out0, out1, out2, tmp0, tmp1, tmp2; \
+ ILVRL_B2_UB(a1, a0, tmp0, tmp1); \
+ out0 = VSHF_UB(tmp0, a2, mask0); \
+ tmp2 = SLDI_UB(tmp1, tmp0, 11); \
+ out1 = VSHF_UB(tmp2, a2, mask1); \
+ tmp2 = SLDI_UB(tmp1, tmp1, 6); \
+ out2 = VSHF_UB(tmp2, a2, mask2); \
+ ST_UB(out0, dst + 0); \
+ ST_UB(out1, dst + 16); \
+ ST_UB(out2, dst + 32); \
+} while (0)
+
+#define STORE8_3(a0, a1, a2, dst) do { \
+ int64_t out_m; \
+ const v16u8 mask0 = { 0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19, \
+ 8, 9, 20, 10 }; \
+ const v16u8 mask1 = { 11, 21, 12, 13, 22, 14, 15, 23, \
+ 255, 255, 255, 255, 255, 255, 255, 255 }; \
+ const v16u8 tmp0 = (v16u8)__msa_ilvr_b((v16i8)a1, (v16i8)a0); \
+ v16u8 out0, out1; \
+ VSHF_B2_UB(tmp0, a2, tmp0, a2, mask0, mask1, out0, out1); \
+ ST_UB(out0, dst); \
+ out_m = __msa_copy_s_d((v2i64)out1, 0); \
+ SD(out_m, dst + 16); \
+} while (0)
+
+#define STORE16_4(a0, a1, a2, a3, dst) do { \
+ v16u8 tmp0, tmp1, tmp2, tmp3; \
+ v16u8 out0, out1, out2, out3; \
+ ILVRL_B2_UB(a1, a0, tmp0, tmp1); \
+ ILVRL_B2_UB(a3, a2, tmp2, tmp3); \
+ ILVRL_H2_UB(tmp2, tmp0, out0, out1); \
+ ILVRL_H2_UB(tmp3, tmp1, out2, out3); \
+ ST_UB(out0, dst + 0); \
+ ST_UB(out1, dst + 16); \
+ ST_UB(out2, dst + 32); \
+ ST_UB(out3, dst + 48); \
+} while (0)
+
+#define STORE8_4(a0, a1, a2, a3, dst) do { \
+ v16u8 tmp0, tmp1, tmp2, tmp3; \
+ ILVR_B2_UB(a1, a0, a3, a2, tmp0, tmp1); \
+ ILVRL_H2_UB(tmp1, tmp0, tmp2, tmp3); \
+ ST_UB(tmp2, dst + 0); \
+ ST_UB(tmp3, dst + 16); \
+} while (0)
+
+#define STORE2_16(a0, a1, dst) do { \
+ v16u8 out0, out1; \
+ ILVRL_B2_UB(a1, a0, out0, out1); \
+ ST_UB(out0, dst + 0); \
+ ST_UB(out1, dst + 16); \
+} while (0)
+
+#define STORE2_8(a0, a1, dst) do { \
+ const v16u8 out0 = (v16u8)__msa_ilvr_b((v16i8)a1, (v16i8)a0); \
+ ST_UB(out0, dst); \
+} while (0)
+
+#define CALC_RGBA4444(y, u, v, out0, out1, N, dst) do { \
+ CALC_RGB##N(y, u, v, R, G, B); \
+ tmp0 = ANDI_B(R, 0xf0); \
+ tmp1 = SRAI_B(G, 4); \
+ RG = tmp0 | tmp1; \
+ tmp0 = ANDI_B(B, 0xf0); \
+ BA = ORI_B(tmp0, 0x0f); \
+ STORE2_##N(out0, out1, dst); \
+} while (0)
+
+#define CALC_RGB565(y, u, v, out0, out1, N, dst) do { \
+ CALC_RGB##N(y, u, v, R, G, B); \
+ tmp0 = ANDI_B(R, 0xf8); \
+ tmp1 = SRAI_B(G, 5); \
+ RG = tmp0 | tmp1; \
+ tmp0 = SLLI_B(G, 3); \
+ tmp1 = ANDI_B(tmp0, 0xe0); \
+ tmp0 = SRAI_B(B, 3); \
+ GB = tmp0 | tmp1; \
+ STORE2_##N(out0, out1, dst); \
+} while (0)
+
+static WEBP_INLINE int Clip8(int v) {
+ return v < 0 ? 0 : v > 255 ? 255 : v;
+}
+
+static void YuvToRgb(int y, int u, int v, uint8_t* const rgb) {
+ const int y1 = MultHi(y, 19077);
+ const int r1 = y1 + MultHi(v, 26149) - 14234;
+ const int g1 = y1 - MultHi(u, 6419) - MultHi(v, 13320) + 8708;
+ const int b1 = y1 + MultHi(u, 33050) - 17685;
+ rgb[0] = Clip8(r1 >> 6);
+ rgb[1] = Clip8(g1 >> 6);
+ rgb[2] = Clip8(b1 >> 6);
+}
+
+static void YuvToBgr(int y, int u, int v, uint8_t* const bgr) {
+ const int y1 = MultHi(y, 19077);
+ const int r1 = y1 + MultHi(v, 26149) - 14234;
+ const int g1 = y1 - MultHi(u, 6419) - MultHi(v, 13320) + 8708;
+ const int b1 = y1 + MultHi(u, 33050) - 17685;
+ bgr[0] = Clip8(b1 >> 6);
+ bgr[1] = Clip8(g1 >> 6);
+ bgr[2] = Clip8(r1 >> 6);
+}
+
+#if !defined(WEBP_REDUCE_CSP)
+static void YuvToRgb565(int y, int u, int v, uint8_t* const rgb) {
+ const int y1 = MultHi(y, 19077);
+ const int r1 = y1 + MultHi(v, 26149) - 14234;
+ const int g1 = y1 - MultHi(u, 6419) - MultHi(v, 13320) + 8708;
+ const int b1 = y1 + MultHi(u, 33050) - 17685;
+ const int r = Clip8(r1 >> 6);
+ const int g = Clip8(g1 >> 6);
+ const int b = Clip8(b1 >> 6);
+ const int rg = (r & 0xf8) | (g >> 5);
+ const int gb = ((g << 3) & 0xe0) | (b >> 3);
+#if (WEBP_SWAP_16BIT_CSP == 1)
+ rgb[0] = gb;
+ rgb[1] = rg;
+#else
+ rgb[0] = rg;
+ rgb[1] = gb;
+#endif
+}
+
+static void YuvToRgba4444(int y, int u, int v, uint8_t* const argb) {
+ const int y1 = MultHi(y, 19077);
+ const int r1 = y1 + MultHi(v, 26149) - 14234;
+ const int g1 = y1 - MultHi(u, 6419) - MultHi(v, 13320) + 8708;
+ const int b1 = y1 + MultHi(u, 33050) - 17685;
+ const int r = Clip8(r1 >> 6);
+ const int g = Clip8(g1 >> 6);
+ const int b = Clip8(b1 >> 6);
+ const int rg = (r & 0xf0) | (g >> 4);
+ const int ba = (b & 0xf0) | 0x0f; // overwrite the lower 4 bits
+#if (WEBP_SWAP_16BIT_CSP == 1)
+ argb[0] = ba;
+ argb[1] = rg;
+#else
+ argb[0] = rg;
+ argb[1] = ba;
+#endif
+}
+
+static void YuvToArgb(uint8_t y, uint8_t u, uint8_t v, uint8_t* const argb) {
+ argb[0] = 0xff;
+ YuvToRgb(y, u, v, argb + 1);
+}
+#endif // WEBP_REDUCE_CSP
+
+static void YuvToBgra(uint8_t y, uint8_t u, uint8_t v, uint8_t* const bgra) {
+ YuvToBgr(y, u, v, bgra);
+ bgra[3] = 0xff;
+}
+
+static void YuvToRgba(uint8_t y, uint8_t u, uint8_t v, uint8_t* const rgba) {
+ YuvToRgb(y, u, v, rgba);
+ rgba[3] = 0xff;
+}
+
+#if !defined(WEBP_REDUCE_CSP)
+static void YuvToRgbLine(const uint8_t* y, const uint8_t* u,
+ const uint8_t* v, uint8_t* dst, int length) {
+ v16u8 R, G, B;
+ while (length >= 16) {
+ CALC_RGB16(y, u, v, R, G, B);
+ STORE16_3(R, G, B, dst);
+ y += 16;
+ u += 16;
+ v += 16;
+ dst += 16 * 3;
+ length -= 16;
+ }
+ if (length > 8) {
+ uint8_t temp[3 * 16] = { 0 };
+ memcpy(temp, y, length * sizeof(*temp));
+ CALC_RGB16(temp, u, v, R, G, B);
+ STORE16_3(R, G, B, temp);
+ memcpy(dst, temp, length * 3 * sizeof(*dst));
+ } else if (length > 0) {
+ uint8_t temp[3 * 8] = { 0 };
+ memcpy(temp, y, length * sizeof(*temp));
+ CALC_RGB8(temp, u, v, R, G, B);
+ STORE8_3(R, G, B, temp);
+ memcpy(dst, temp, length * 3 * sizeof(*dst));
+ }
+}
+
+static void YuvToBgrLine(const uint8_t* y, const uint8_t* u,
+ const uint8_t* v, uint8_t* dst, int length) {
+ v16u8 R, G, B;
+ while (length >= 16) {
+ CALC_RGB16(y, u, v, R, G, B);
+ STORE16_3(B, G, R, dst);
+ y += 16;
+ u += 16;
+ v += 16;
+ dst += 16 * 3;
+ length -= 16;
+ }
+ if (length > 8) {
+ uint8_t temp[3 * 16] = { 0 };
+ memcpy(temp, y, length * sizeof(*temp));
+ CALC_RGB16(temp, u, v, R, G, B);
+ STORE16_3(B, G, R, temp);
+ memcpy(dst, temp, length * 3 * sizeof(*dst));
+ } else if (length > 0) {
+ uint8_t temp[3 * 8] = { 0 };
+ memcpy(temp, y, length * sizeof(*temp));
+ CALC_RGB8(temp, u, v, R, G, B);
+ STORE8_3(B, G, R, temp);
+ memcpy(dst, temp, length * 3 * sizeof(*dst));
+ }
+}
+#endif // WEBP_REDUCE_CSP
+
+static void YuvToRgbaLine(const uint8_t* y, const uint8_t* u,
+ const uint8_t* v, uint8_t* dst, int length) {
+ v16u8 R, G, B;
+ const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL);
+ while (length >= 16) {
+ CALC_RGB16(y, u, v, R, G, B);
+ STORE16_4(R, G, B, A, dst);
+ y += 16;
+ u += 16;
+ v += 16;
+ dst += 16 * 4;
+ length -= 16;
+ }
+ if (length > 8) {
+ uint8_t temp[4 * 16] = { 0 };
+ memcpy(temp, y, length * sizeof(*temp));
+ CALC_RGB16(&temp[0], u, v, R, G, B);
+ STORE16_4(R, G, B, A, temp);
+ memcpy(dst, temp, length * 4 * sizeof(*dst));
+ } else if (length > 0) {
+ uint8_t temp[4 * 8] = { 0 };
+ memcpy(temp, y, length * sizeof(*temp));
+ CALC_RGB8(temp, u, v, R, G, B);
+ STORE8_4(R, G, B, A, temp);
+ memcpy(dst, temp, length * 4 * sizeof(*dst));
+ }
+}
+
+static void YuvToBgraLine(const uint8_t* y, const uint8_t* u,
+ const uint8_t* v, uint8_t* dst, int length) {
+ v16u8 R, G, B;
+ const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL);
+ while (length >= 16) {
+ CALC_RGB16(y, u, v, R, G, B);
+ STORE16_4(B, G, R, A, dst);
+ y += 16;
+ u += 16;
+ v += 16;
+ dst += 16 * 4;
+ length -= 16;
+ }
+ if (length > 8) {
+ uint8_t temp[4 * 16] = { 0 };
+ memcpy(temp, y, length * sizeof(*temp));
+ CALC_RGB16(temp, u, v, R, G, B);
+ STORE16_4(B, G, R, A, temp);
+ memcpy(dst, temp, length * 4 * sizeof(*dst));
+ } else if (length > 0) {
+ uint8_t temp[4 * 8] = { 0 };
+ memcpy(temp, y, length * sizeof(*temp));
+ CALC_RGB8(temp, u, v, R, G, B);
+ STORE8_4(B, G, R, A, temp);
+ memcpy(dst, temp, length * 4 * sizeof(*dst));
+ }
+}
+
+#if !defined(WEBP_REDUCE_CSP)
+static void YuvToArgbLine(const uint8_t* y, const uint8_t* u,
+ const uint8_t* v, uint8_t* dst, int length) {
+ v16u8 R, G, B;
+ const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL);
+ while (length >= 16) {
+ CALC_RGB16(y, u, v, R, G, B);
+ STORE16_4(A, R, G, B, dst);
+ y += 16;
+ u += 16;
+ v += 16;
+ dst += 16 * 4;
+ length -= 16;
+ }
+ if (length > 8) {
+ uint8_t temp[4 * 16] = { 0 };
+ memcpy(temp, y, length * sizeof(*temp));
+ CALC_RGB16(temp, u, v, R, G, B);
+ STORE16_4(A, R, G, B, temp);
+ memcpy(dst, temp, length * 4 * sizeof(*dst));
+ } else if (length > 0) {
+ uint8_t temp[4 * 8] = { 0 };
+ memcpy(temp, y, length * sizeof(*temp));
+ CALC_RGB8(temp, u, v, R, G, B);
+ STORE8_4(A, R, G, B, temp);
+ memcpy(dst, temp, length * 4 * sizeof(*dst));
+ }
+}
+
+static void YuvToRgba4444Line(const uint8_t* y, const uint8_t* u,
+ const uint8_t* v, uint8_t* dst, int length) {
+ v16u8 R, G, B, RG, BA, tmp0, tmp1;
+ while (length >= 16) {
+#if (WEBP_SWAP_16BIT_CSP == 1)
+ CALC_RGBA4444(y, u, v, BA, RG, 16, dst);
+#else
+ CALC_RGBA4444(y, u, v, RG, BA, 16, dst);
+#endif
+ y += 16;
+ u += 16;
+ v += 16;
+ dst += 16 * 2;
+ length -= 16;
+ }
+ if (length > 8) {
+ uint8_t temp[2 * 16] = { 0 };
+ memcpy(temp, y, length * sizeof(*temp));
+#if (WEBP_SWAP_16BIT_CSP == 1)
+ CALC_RGBA4444(temp, u, v, BA, RG, 16, temp);
+#else
+ CALC_RGBA4444(temp, u, v, RG, BA, 16, temp);
+#endif
+ memcpy(dst, temp, length * 2 * sizeof(*dst));
+ } else if (length > 0) {
+ uint8_t temp[2 * 8] = { 0 };
+ memcpy(temp, y, length * sizeof(*temp));
+#if (WEBP_SWAP_16BIT_CSP == 1)
+ CALC_RGBA4444(temp, u, v, BA, RG, 8, temp);
+#else
+ CALC_RGBA4444(temp, u, v, RG, BA, 8, temp);
+#endif
+ memcpy(dst, temp, length * 2 * sizeof(*dst));
+ }
+}
+
+static void YuvToRgb565Line(const uint8_t* y, const uint8_t* u,
+ const uint8_t* v, uint8_t* dst, int length) {
+ v16u8 R, G, B, RG, GB, tmp0, tmp1;
+ while (length >= 16) {
+#if (WEBP_SWAP_16BIT_CSP == 1)
+ CALC_RGB565(y, u, v, GB, RG, 16, dst);
+#else
+ CALC_RGB565(y, u, v, RG, GB, 16, dst);
+#endif
+ y += 16;
+ u += 16;
+ v += 16;
+ dst += 16 * 2;
+ length -= 16;
+ }
+ if (length > 8) {
+ uint8_t temp[2 * 16] = { 0 };
+ memcpy(temp, y, length * sizeof(*temp));
+#if (WEBP_SWAP_16BIT_CSP == 1)
+ CALC_RGB565(temp, u, v, GB, RG, 16, temp);
+#else
+ CALC_RGB565(temp, u, v, RG, GB, 16, temp);
+#endif
+ memcpy(dst, temp, length * 2 * sizeof(*dst));
+ } else if (length > 0) {
+ uint8_t temp[2 * 8] = { 0 };
+ memcpy(temp, y, length * sizeof(*temp));
+#if (WEBP_SWAP_16BIT_CSP == 1)
+ CALC_RGB565(temp, u, v, GB, RG, 8, temp);
+#else
+ CALC_RGB565(temp, u, v, RG, GB, 8, temp);
+#endif
+ memcpy(dst, temp, length * 2 * sizeof(*dst));
+ }
+}
+#endif // WEBP_REDUCE_CSP
+
+#define UPSAMPLE_32PIXELS(a, b, c, d) do { \
+ v16u8 s = __msa_aver_u_b(a, d); \
+ v16u8 t = __msa_aver_u_b(b, c); \
+ const v16u8 st = s ^ t; \
+ v16u8 ad = a ^ d; \
+ v16u8 bc = b ^ c; \
+ v16u8 t0 = ad | bc; \
+ v16u8 t1 = t0 | st; \
+ v16u8 t2 = ANDI_B(t1, 1); \
+ v16u8 t3 = __msa_aver_u_b(s, t); \
+ const v16u8 k = t3 - t2; \
+ v16u8 diag1, diag2; \
+ AVER_UB2_UB(t, k, s, k, t0, t1); \
+ bc = bc & st; \
+ ad = ad & st; \
+ t = t ^ k; \
+ s = s ^ k; \
+ t2 = bc | t; \
+ t3 = ad | s; \
+ t2 = ANDI_B(t2, 1); \
+ t3 = ANDI_B(t3, 1); \
+ SUB2(t0, t2, t1, t3, diag1, diag2); \
+ AVER_UB2_UB(a, diag1, b, diag2, t0, t1); \
+ ILVRL_B2_UB(t1, t0, a, b); \
+ if (pbot_y != NULL) { \
+ AVER_UB2_UB(c, diag2, d, diag1, t0, t1); \
+ ILVRL_B2_UB(t1, t0, c, d); \
+ } \
+} while (0)
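
Although it works only with byte-wide averages, UPSAMPLE_32PIXELS reproduces, with exact rounding, the same (9, 3, 3, 1)/16 kernel described for the MIPS fancy upsampler earlier in this patch: s = avg(a, d) and t = avg(b, c) combine into k, roughly (a + b + c + d) / 4; the XOR terms supply the 1-bit carry corrections that make the rounding exact; and the final averages against a and b (and against c and d for the bottom row) complete the kernel. Per pixel pair, the top-row result is equivalent to this scalar sketch (UpsamplePair_C is a hypothetical name):

/* What UPSAMPLE_32PIXELS computes for one 2x2 neighborhood (top row). */
static void UpsamplePair_C(int a, int b, int c, int d,
                           int* const out0, int* const out1) {
  *out0 = (9 * a + 3 * b + 3 * c +     d + 8) >> 4;
  *out1 = (3 * a + 9 * b +     c + 3 * d + 8) >> 4;
}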
+
+#define UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \
+static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bot_y, \
+ const uint8_t* top_u, const uint8_t* top_v, \
+ const uint8_t* cur_u, const uint8_t* cur_v, \
+ uint8_t* top_dst, uint8_t* bot_dst, int len) \
+{ \
+ int size = (len - 1) >> 1; \
+ uint8_t temp_u[64]; \
+ uint8_t temp_v[64]; \
+ const uint32_t tl_uv = ((top_u[0]) | ((top_v[0]) << 16)); \
+ const uint32_t l_uv = ((cur_u[0]) | ((cur_v[0]) << 16)); \
+ const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2; \
+ const uint8_t* ptop_y = &top_y[1]; \
+ uint8_t* ptop_dst = top_dst + XSTEP; \
+ const uint8_t* pbot_y = &bot_y[1]; \
+ uint8_t* pbot_dst = bot_dst + XSTEP; \
+ \
+ FUNC(top_y[0], uv0 & 0xff, (uv0 >> 16), top_dst); \
+ if (bot_y != NULL) { \
+ const uint32_t uv1 = (3 * l_uv + tl_uv + 0x00020002u) >> 2; \
+ FUNC(bot_y[0], uv1 & 0xff, (uv1 >> 16), bot_dst); \
+ } \
+ while (size >= 16) { \
+ v16u8 tu0, tu1, tv0, tv1, cu0, cu1, cv0, cv1; \
+ LD_UB2(top_u, 1, tu0, tu1); \
+ LD_UB2(cur_u, 1, cu0, cu1); \
+ LD_UB2(top_v, 1, tv0, tv1); \
+ LD_UB2(cur_v, 1, cv0, cv1); \
+ UPSAMPLE_32PIXELS(tu0, tu1, cu0, cu1); \
+ UPSAMPLE_32PIXELS(tv0, tv1, cv0, cv1); \
+ ST_UB4(tu0, tu1, cu0, cu1, &temp_u[0], 16); \
+ ST_UB4(tv0, tv1, cv0, cv1, &temp_v[0], 16); \
+ FUNC##Line(ptop_y, &temp_u[ 0], &temp_v[0], ptop_dst, 32); \
+ if (bot_y != NULL) { \
+ FUNC##Line(pbot_y, &temp_u[32], &temp_v[32], pbot_dst, 32); \
+ } \
+ ptop_y += 32; \
+ pbot_y += 32; \
+ ptop_dst += XSTEP * 32; \
+ pbot_dst += XSTEP * 32; \
+ top_u += 16; \
+ top_v += 16; \
+ cur_u += 16; \
+ cur_v += 16; \
+ size -= 16; \
+ } \
+ if (size > 0) { \
+ v16u8 tu0, tu1, tv0, tv1, cu0, cu1, cv0, cv1; \
+ memcpy(&temp_u[ 0], top_u, 17 * sizeof(uint8_t)); \
+ memcpy(&temp_u[32], cur_u, 17 * sizeof(uint8_t)); \
+ memcpy(&temp_v[ 0], top_v, 17 * sizeof(uint8_t)); \
+ memcpy(&temp_v[32], cur_v, 17 * sizeof(uint8_t)); \
+ LD_UB2(&temp_u[ 0], 1, tu0, tu1); \
+ LD_UB2(&temp_u[32], 1, cu0, cu1); \
+ LD_UB2(&temp_v[ 0], 1, tv0, tv1); \
+ LD_UB2(&temp_v[32], 1, cv0, cv1); \
+ UPSAMPLE_32PIXELS(tu0, tu1, cu0, cu1); \
+ UPSAMPLE_32PIXELS(tv0, tv1, cv0, cv1); \
+ ST_UB4(tu0, tu1, cu0, cu1, &temp_u[0], 16); \
+ ST_UB4(tv0, tv1, cv0, cv1, &temp_v[0], 16); \
+ FUNC##Line(ptop_y, &temp_u[ 0], &temp_v[0], ptop_dst, size * 2); \
+ if (bot_y != NULL) { \
+ FUNC##Line(pbot_y, &temp_u[32], &temp_v[32], pbot_dst, size * 2); \
+ } \
+ top_u += size; \
+ top_v += size; \
+ cur_u += size; \
+ cur_v += size; \
+ } \
+ if (!(len & 1)) { \
+ const uint32_t t0 = ((top_u[0]) | ((top_v[0]) << 16)); \
+ const uint32_t c0 = ((cur_u[0]) | ((cur_v[0]) << 16)); \
+ const uint32_t tmp0 = (3 * t0 + c0 + 0x00020002u) >> 2; \
+ FUNC(top_y[len - 1], tmp0 & 0xff, (tmp0 >> 16), \
+ top_dst + (len - 1) * XSTEP); \
+ if (bot_y != NULL) { \
+ const uint32_t tmp1 = (3 * c0 + t0 + 0x00020002u) >> 2; \
+ FUNC(bot_y[len - 1], tmp1 & 0xff, (tmp1 >> 16), \
+ bot_dst + (len - 1) * XSTEP); \
+ } \
+ } \
+}
+
+UPSAMPLE_FUNC(UpsampleRgbaLinePair, YuvToRgba, 4)
+UPSAMPLE_FUNC(UpsampleBgraLinePair, YuvToBgra, 4)
+#if !defined(WEBP_REDUCE_CSP)
+UPSAMPLE_FUNC(UpsampleRgbLinePair, YuvToRgb, 3)
+UPSAMPLE_FUNC(UpsampleBgrLinePair, YuvToBgr, 3)
+UPSAMPLE_FUNC(UpsampleArgbLinePair, YuvToArgb, 4)
+UPSAMPLE_FUNC(UpsampleRgba4444LinePair, YuvToRgba4444, 2)
+UPSAMPLE_FUNC(UpsampleRgb565LinePair, YuvToRgb565, 2)
+#endif // WEBP_REDUCE_CSP
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
+
+extern void WebPInitUpsamplersMSA(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersMSA(void) {
+ WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair;
+ WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair;
+ WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair;
+ WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair;
+#if !defined(WEBP_REDUCE_CSP)
+ WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair;
+ WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair;
+ WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair;
+ WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair;
+ WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair;
+ WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair;
+ WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair;
+#endif // WEBP_REDUCE_CSP
+}
+
+#endif // FANCY_UPSAMPLING
+
+#endif // WEBP_USE_MSA
+
+#if !(defined(FANCY_UPSAMPLING) && defined(WEBP_USE_MSA))
+WEBP_DSP_INIT_STUB(WebPInitUpsamplersMSA)
+#endif
diff --git a/media/libwebp/dsp/upsampling_neon.c b/media/libwebp/dsp/upsampling_neon.c
index c847d70d46..41cb44b03f 100644
--- a/media/libwebp/dsp/upsampling_neon.c
+++ b/media/libwebp/dsp/upsampling_neon.c
@@ -58,8 +58,8 @@
} while (0)
// Turn the macro into a function to reduce code size where speed is non-critical
-static void Upsample16Pixels_NEON(const uint8_t *r1, const uint8_t *r2,
- uint8_t *out) {
+static void Upsample16Pixels_NEON(const uint8_t* r1, const uint8_t* r2,
+ uint8_t* out) {
UPSAMPLE_16PIXELS(r1, r2, out);
}
@@ -190,14 +190,14 @@ static const int16_t kCoeffs1[4] = { 19077, 26149, 6419, 13320 };
}
#define NEON_UPSAMPLE_FUNC(FUNC_NAME, FMT, XSTEP) \
-static void FUNC_NAME(const uint8_t *top_y, const uint8_t *bottom_y, \
- const uint8_t *top_u, const uint8_t *top_v, \
- const uint8_t *cur_u, const uint8_t *cur_v, \
- uint8_t *top_dst, uint8_t *bottom_dst, int len) { \
+static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \
+ const uint8_t* top_u, const uint8_t* top_v, \
+ const uint8_t* cur_u, const uint8_t* cur_v, \
+ uint8_t* top_dst, uint8_t* bottom_dst, int len) { \
int block; \
/* 16 byte aligned array to cache reconstructed u and v */ \
uint8_t uv_buf[2 * 32 + 15]; \
- uint8_t *const r_uv = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15); \
+ uint8_t* const r_uv = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15); \
const int uv_len = (len + 1) >> 1; \
  /* 9 pixels must be readable for each block */                              \
const int num_blocks = (uv_len - 1) >> 3; \
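
The pointer rounding in this hunk, (uintptr_t)(uv_buf + 15) & ~15, is the standard align-up idiom: adding (alignment - 1) and masking with ~(alignment - 1) rounds an address up to the next 16-byte boundary. A generic sketch (AlignUp16 is a hypothetical name):

#include <stdint.h>

/* Round 'p' up to the next 16-byte boundary. */
static uint8_t* AlignUp16(uint8_t* const p) {
  return (uint8_t*)(((uintptr_t)p + 15) & ~(uintptr_t)15);
}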
diff --git a/media/libwebp/dsp/yuv.c b/media/libwebp/dsp/yuv.c
index 12c04ca426..bd9db04149 100644
--- a/media/libwebp/dsp/yuv.c
+++ b/media/libwebp/dsp/yuv.c
@@ -90,16 +90,16 @@ WEBP_DSP_INIT_FUNC(WebPInitSamplers) {
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
+#if defined(WEBP_HAVE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
WebPInitSamplersSSE2();
}
-#endif // WEBP_USE_SSE2
-#if defined(WEBP_USE_SSE41)
+#endif // WEBP_HAVE_SSE2
+#if defined(WEBP_HAVE_SSE41)
if (VP8GetCPUInfo(kSSE4_1)) {
WebPInitSamplersSSE41();
}
-#endif // WEBP_USE_SSE41
+#endif // WEBP_HAVE_SSE41
#if defined(WEBP_USE_MIPS32)
if (VP8GetCPUInfo(kMIPS32)) {
WebPInitSamplersMIPS32();
@@ -276,26 +276,26 @@ WEBP_DSP_INIT_FUNC(WebPInitConvertARGBToYUV) {
#endif
if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
+#if defined(WEBP_HAVE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
WebPInitConvertARGBToYUVSSE2();
WebPInitSharpYUVSSE2();
}
-#endif // WEBP_USE_SSE2
-#if defined(WEBP_USE_SSE41)
+#endif // WEBP_HAVE_SSE2
+#if defined(WEBP_HAVE_SSE41)
if (VP8GetCPUInfo(kSSE4_1)) {
WebPInitConvertARGBToYUVSSE41();
}
-#endif // WEBP_USE_SSE41
+#endif // WEBP_HAVE_SSE41
}
-#if defined(WEBP_USE_NEON)
+#if defined(WEBP_HAVE_NEON)
if (WEBP_NEON_OMIT_C_CODE ||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
WebPInitConvertARGBToYUVNEON();
WebPInitSharpYUVNEON();
}
-#endif // WEBP_USE_NEON
+#endif // WEBP_HAVE_NEON
assert(WebPConvertARGBToY != NULL);
assert(WebPConvertARGBToUV != NULL);
diff --git a/media/libwebp/dsp/yuv.h b/media/libwebp/dsp/yuv.h
index 947b89e13c..28524ec422 100644
--- a/media/libwebp/dsp/yuv.h
+++ b/media/libwebp/dsp/yuv.h
@@ -10,7 +10,7 @@
// inline YUV<->RGB conversion function
//
// The exact naming is Y'CbCr, following the ITU-R BT.601 standard.
-// More information at: http://en.wikipedia.org/wiki/YCbCr
+// More information at: https://en.wikipedia.org/wiki/YCbCr
// Y = 0.2569 * R + 0.5044 * G + 0.0979 * B + 16
// U = -0.1483 * R - 0.2911 * G + 0.4394 * B + 128
// V = 0.4394 * R - 0.3679 * G - 0.0715 * B + 128
diff --git a/media/libwebp/dsp/yuv_mips32.c b/media/libwebp/dsp/yuv_mips32.c
new file mode 100644
index 0000000000..fc7c2cda0e
--- /dev/null
+++ b/media/libwebp/dsp/yuv_mips32.c
@@ -0,0 +1,103 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MIPS version of YUV to RGB upsampling functions.
+//
+// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
+// Jovan Zelincevic (jovan.zelincevic@imgtec.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_MIPS32)
+
+#include "../dsp/yuv.h"
+
+//------------------------------------------------------------------------------
+// simple point-sampling
+
+#define ROW_FUNC(FUNC_NAME, XSTEP, R, G, B, A) \
+static void FUNC_NAME(const uint8_t* y, \
+ const uint8_t* u, const uint8_t* v, \
+ uint8_t* dst, int len) { \
+ int i, r, g, b; \
+ int temp0, temp1, temp2, temp3, temp4; \
+ for (i = 0; i < (len >> 1); i++) { \
+ temp1 = MultHi(v[0], 26149); \
+ temp3 = MultHi(v[0], 13320); \
+ temp2 = MultHi(u[0], 6419); \
+ temp4 = MultHi(u[0], 33050); \
+ temp0 = MultHi(y[0], 19077); \
+ temp1 -= 14234; \
+ temp3 -= 8708; \
+ temp2 += temp3; \
+ temp4 -= 17685; \
+ r = VP8Clip8(temp0 + temp1); \
+ g = VP8Clip8(temp0 - temp2); \
+ b = VP8Clip8(temp0 + temp4); \
+ temp0 = MultHi(y[1], 19077); \
+ dst[R] = r; \
+ dst[G] = g; \
+ dst[B] = b; \
+ if (A) dst[A] = 0xff; \
+ r = VP8Clip8(temp0 + temp1); \
+ g = VP8Clip8(temp0 - temp2); \
+ b = VP8Clip8(temp0 + temp4); \
+ dst[R + XSTEP] = r; \
+ dst[G + XSTEP] = g; \
+ dst[B + XSTEP] = b; \
+ if (A) dst[A + XSTEP] = 0xff; \
+ y += 2; \
+ ++u; \
+ ++v; \
+ dst += 2 * XSTEP; \
+ } \
+ if (len & 1) { \
+ temp1 = MultHi(v[0], 26149); \
+ temp3 = MultHi(v[0], 13320); \
+ temp2 = MultHi(u[0], 6419); \
+ temp4 = MultHi(u[0], 33050); \
+ temp0 = MultHi(y[0], 19077); \
+ temp1 -= 14234; \
+ temp3 -= 8708; \
+ temp2 += temp3; \
+ temp4 -= 17685; \
+ r = VP8Clip8(temp0 + temp1); \
+ g = VP8Clip8(temp0 - temp2); \
+ b = VP8Clip8(temp0 + temp4); \
+ dst[R] = r; \
+ dst[G] = g; \
+ dst[B] = b; \
+ if (A) dst[A] = 0xff; \
+ } \
+}
+
+ROW_FUNC(YuvToRgbRow_MIPS32, 3, 0, 1, 2, 0)
+ROW_FUNC(YuvToRgbaRow_MIPS32, 4, 0, 1, 2, 3)
+ROW_FUNC(YuvToBgrRow_MIPS32, 3, 2, 1, 0, 0)
+ROW_FUNC(YuvToBgraRow_MIPS32, 4, 2, 1, 0, 3)
+
+#undef ROW_FUNC
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void WebPInitSamplersMIPS32(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersMIPS32(void) {
+ WebPSamplers[MODE_RGB] = YuvToRgbRow_MIPS32;
+ WebPSamplers[MODE_RGBA] = YuvToRgbaRow_MIPS32;
+ WebPSamplers[MODE_BGR] = YuvToBgrRow_MIPS32;
+ WebPSamplers[MODE_BGRA] = YuvToBgraRow_MIPS32;
+}
+
+#else // !WEBP_USE_MIPS32
+
+WEBP_DSP_INIT_STUB(WebPInitSamplersMIPS32)
+
+#endif // WEBP_USE_MIPS32
diff --git a/media/libwebp/dsp/yuv_mips_dsp_r2.c b/media/libwebp/dsp/yuv_mips_dsp_r2.c
new file mode 100644
index 0000000000..1418a9fba1
--- /dev/null
+++ b/media/libwebp/dsp/yuv_mips_dsp_r2.c
@@ -0,0 +1,134 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MIPS DSPr2 version of YUV to RGB upsampling functions.
+//
+// Author(s): Branimir Vasic (branimir.vasic@imgtec.com)
+// Djordje Pesut (djordje.pesut@imgtec.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_MIPS_DSP_R2)
+
+#include "../dsp/yuv.h"
+
+//------------------------------------------------------------------------------
+// simple point-sampling
+
+#define ROW_FUNC_PART_1() \
+ "lbu %[temp3], 0(%[v]) \n\t" \
+ "lbu %[temp4], 0(%[u]) \n\t" \
+ "lbu %[temp0], 0(%[y]) \n\t" \
+ "mul %[temp1], %[t_con_1], %[temp3] \n\t" \
+ "mul %[temp3], %[t_con_2], %[temp3] \n\t" \
+ "mul %[temp2], %[t_con_3], %[temp4] \n\t" \
+ "mul %[temp4], %[t_con_4], %[temp4] \n\t" \
+ "mul %[temp0], %[t_con_5], %[temp0] \n\t" \
+ "subu %[temp1], %[temp1], %[t_con_6] \n\t" \
+ "subu %[temp3], %[temp3], %[t_con_7] \n\t" \
+ "addu %[temp2], %[temp2], %[temp3] \n\t" \
+ "subu %[temp4], %[temp4], %[t_con_8] \n\t" \
+
+#define ROW_FUNC_PART_2(R, G, B, K) \
+ "addu %[temp5], %[temp0], %[temp1] \n\t" \
+ "subu %[temp6], %[temp0], %[temp2] \n\t" \
+ "addu %[temp7], %[temp0], %[temp4] \n\t" \
+".if " #K " \n\t" \
+ "lbu %[temp0], 1(%[y]) \n\t" \
+".endif \n\t" \
+ "shll_s.w %[temp5], %[temp5], 17 \n\t" \
+ "shll_s.w %[temp6], %[temp6], 17 \n\t" \
+".if " #K " \n\t" \
+ "mul %[temp0], %[t_con_5], %[temp0] \n\t" \
+".endif \n\t" \
+ "shll_s.w %[temp7], %[temp7], 17 \n\t" \
+ "precrqu_s.qb.ph %[temp5], %[temp5], $zero \n\t" \
+ "precrqu_s.qb.ph %[temp6], %[temp6], $zero \n\t" \
+ "precrqu_s.qb.ph %[temp7], %[temp7], $zero \n\t" \
+ "srl %[temp5], %[temp5], 24 \n\t" \
+ "srl %[temp6], %[temp6], 24 \n\t" \
+ "srl %[temp7], %[temp7], 24 \n\t" \
+ "sb %[temp5], " #R "(%[dst]) \n\t" \
+ "sb %[temp6], " #G "(%[dst]) \n\t" \
+ "sb %[temp7], " #B "(%[dst]) \n\t" \
+
+#define ASM_CLOBBER_LIST() \
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), \
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), \
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7) \
+ : [t_con_1]"r"(t_con_1), [t_con_2]"r"(t_con_2), [t_con_3]"r"(t_con_3), \
+ [t_con_4]"r"(t_con_4), [t_con_5]"r"(t_con_5), [t_con_6]"r"(t_con_6), \
+ [u]"r"(u), [v]"r"(v), [y]"r"(y), [dst]"r"(dst), \
+ [t_con_7]"r"(t_con_7), [t_con_8]"r"(t_con_8) \
+ : "memory", "hi", "lo" \
+
+#define ROW_FUNC(FUNC_NAME, XSTEP, R, G, B, A) \
+static void FUNC_NAME(const uint8_t* y, \
+ const uint8_t* u, const uint8_t* v, \
+ uint8_t* dst, int len) { \
+ int i; \
+ uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; \
+ const int t_con_1 = 26149; \
+ const int t_con_2 = 13320; \
+ const int t_con_3 = 6419; \
+ const int t_con_4 = 33050; \
+ const int t_con_5 = 19077; \
+ const int t_con_6 = 14234; \
+ const int t_con_7 = 8708; \
+ const int t_con_8 = 17685; \
+ for (i = 0; i < (len >> 1); i++) { \
+ __asm__ volatile ( \
+ ROW_FUNC_PART_1() \
+ ROW_FUNC_PART_2(R, G, B, 1) \
+ ROW_FUNC_PART_2(R + XSTEP, G + XSTEP, B + XSTEP, 0) \
+ ASM_CLOBBER_LIST() \
+ ); \
+ if (A) dst[A] = dst[A + XSTEP] = 0xff; \
+ y += 2; \
+ ++u; \
+ ++v; \
+ dst += 2 * XSTEP; \
+ } \
+ if (len & 1) { \
+ __asm__ volatile ( \
+ ROW_FUNC_PART_1() \
+ ROW_FUNC_PART_2(R, G, B, 0) \
+ ASM_CLOBBER_LIST() \
+ ); \
+ if (A) dst[A] = 0xff; \
+ } \
+}
+
+ROW_FUNC(YuvToRgbRow_MIPSdspR2, 3, 0, 1, 2, 0)
+ROW_FUNC(YuvToRgbaRow_MIPSdspR2, 4, 0, 1, 2, 3)
+ROW_FUNC(YuvToBgrRow_MIPSdspR2, 3, 2, 1, 0, 0)
+ROW_FUNC(YuvToBgraRow_MIPSdspR2, 4, 2, 1, 0, 3)
+
+#undef ROW_FUNC
+#undef ASM_CLOBBER_LIST
+#undef ROW_FUNC_PART_2
+#undef ROW_FUNC_PART_1
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void WebPInitSamplersMIPSdspR2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersMIPSdspR2(void) {
+ WebPSamplers[MODE_RGB] = YuvToRgbRow_MIPSdspR2;
+ WebPSamplers[MODE_RGBA] = YuvToRgbaRow_MIPSdspR2;
+ WebPSamplers[MODE_BGR] = YuvToBgrRow_MIPSdspR2;
+ WebPSamplers[MODE_BGRA] = YuvToBgraRow_MIPSdspR2;
+}
+
+#else // !WEBP_USE_MIPS_DSP_R2
+
+WEBP_DSP_INIT_STUB(WebPInitSamplersMIPSdspR2)
+
+#endif // WEBP_USE_MIPS_DSP_R2
diff --git a/media/libwebp/enc/alpha_enc.c b/media/libwebp/enc/alpha_enc.c
new file mode 100644
index 0000000000..edfe95ec94
--- /dev/null
+++ b/media/libwebp/enc/alpha_enc.c
@@ -0,0 +1,443 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Alpha-plane compression.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <assert.h>
+#include <stdlib.h>
+
+#include "../enc/vp8i_enc.h"
+#include "../dsp/dsp.h"
+#include "../utils/filters_utils.h"
+#include "../utils/quant_levels_utils.h"
+#include "../utils/utils.h"
+#include "../webp/format_constants.h"
+
+// -----------------------------------------------------------------------------
+// Encodes the given alpha data using the specified compression method
+// 'method'. Pre-processing (quantization) is performed if 'quality' is less
+// than 100; in that case, the encoding is lossy. The valid range is [0, 100]
+// for 'quality' and [0, 1] for 'method':
+//   'method = 0' - no compression;
+//   'method = 1' - use the lossless coder on the alpha plane only.
+// 'filter' values [0, 4] correspond to the prediction modes none, horizontal,
+// vertical & gradient; mode 4 tries all the prediction modes 0 to 3 and
+// picks the best one.
+// 'effort_level' specifies how much effort to spend on reducing the
+// compressed output size, in the range 0 (quick) to 6 (slow).
+//
+// 'output' receives the buffer containing the compressed alpha data; it is
+// allocated here, and the caller should call WebPSafeFree(*output) when done.
+// 'output_size' receives the size of this compressed alpha buffer.
+//
+// Returns 1 on success, or 0 if the quality or method is invalid, or if
+// memory allocation for the compressed data fails.
+
+#include "../enc/vp8li_enc.h"
+
+static int EncodeLossless(const uint8_t* const data, int width, int height,
+ int effort_level, // in [0..6] range
+ int use_quality_100, VP8LBitWriter* const bw,
+ WebPAuxStats* const stats) {
+ int ok = 0;
+ WebPConfig config;
+ WebPPicture picture;
+
+ WebPPictureInit(&picture);
+ picture.width = width;
+ picture.height = height;
+ picture.use_argb = 1;
+ picture.stats = stats;
+ if (!WebPPictureAlloc(&picture)) return 0;
+
+ // Transfer the alpha values to the green channel.
+ WebPDispatchAlphaToGreen(data, width, picture.width, picture.height,
+ picture.argb, picture.argb_stride);
+
+ WebPConfigInit(&config);
+ config.lossless = 1;
+  // Enable 'exact', otherwise the encoder could alter the RGB values under
+  // transparent alpha. That is normally acceptable, but not here: we are not
+  // encoding the input image but an internal, encoding-related image whose
+  // RGB channels carry exact information that must be preserved.
+ config.exact = 1;
+ config.method = effort_level; // impact is very small
+  // Set a low default quality for encoding alpha. Ensure that the alpha
+  // quality at lower methods (3 and below) stays below the threshold that
+  // triggers the costly 'BackwardReferencesTraceBackwards'.
+  // If the alpha quality is set to 100 and the method to 6, allow a high
+  // lossless quality to trigger the cruncher.
+ config.quality =
+ (use_quality_100 && effort_level == 6) ? 100 : 8.f * effort_level;
+ assert(config.quality >= 0 && config.quality <= 100.f);
+
+ // TODO(urvang): Temporary fix to avoid generating images that trigger
+ // a decoder bug related to alpha with color cache.
+ // See: https://code.google.com/p/webp/issues/detail?id=239
+ // Need to re-enable this later.
+ ok = (VP8LEncodeStream(&config, &picture, bw, 0 /*use_cache*/) == VP8_ENC_OK);
+ WebPPictureFree(&picture);
+ ok = ok && !bw->error_;
+ if (!ok) {
+ VP8LBitWriterWipeOut(bw);
+ return 0;
+ }
+ return 1;
+}
+
+// -----------------------------------------------------------------------------
+
+// Small struct to hold the result of a filter mode compression attempt.
+typedef struct {
+ size_t score;
+ VP8BitWriter bw;
+ WebPAuxStats stats;
+} FilterTrial;
+
+// This function always returns an initialized 'bw' object, even upon error.
+static int EncodeAlphaInternal(const uint8_t* const data, int width, int height,
+ int method, int filter, int reduce_levels,
+ int effort_level, // in [0..6] range
+ uint8_t* const tmp_alpha,
+ FilterTrial* result) {
+ int ok = 0;
+ const uint8_t* alpha_src;
+ WebPFilterFunc filter_func;
+ uint8_t header;
+ const size_t data_size = width * height;
+ const uint8_t* output = NULL;
+ size_t output_size = 0;
+ VP8LBitWriter tmp_bw;
+
+ assert((uint64_t)data_size == (uint64_t)width * height); // as per spec
+ assert(filter >= 0 && filter < WEBP_FILTER_LAST);
+ assert(method >= ALPHA_NO_COMPRESSION);
+ assert(method <= ALPHA_LOSSLESS_COMPRESSION);
+ assert(sizeof(header) == ALPHA_HEADER_LEN);
+
+ filter_func = WebPFilters[filter];
+ if (filter_func != NULL) {
+ filter_func(data, width, height, width, tmp_alpha);
+ alpha_src = tmp_alpha;
+ } else {
+ alpha_src = data;
+ }
+
+ if (method != ALPHA_NO_COMPRESSION) {
+ ok = VP8LBitWriterInit(&tmp_bw, data_size >> 3);
+ ok = ok && EncodeLossless(alpha_src, width, height, effort_level,
+ !reduce_levels, &tmp_bw, &result->stats);
+ if (ok) {
+ output = VP8LBitWriterFinish(&tmp_bw);
+ output_size = VP8LBitWriterNumBytes(&tmp_bw);
+ if (output_size > data_size) {
+ // compressed size is larger than source! Revert to uncompressed mode.
+ method = ALPHA_NO_COMPRESSION;
+ VP8LBitWriterWipeOut(&tmp_bw);
+ }
+ } else {
+ VP8LBitWriterWipeOut(&tmp_bw);
+ return 0;
+ }
+ }
+
+ if (method == ALPHA_NO_COMPRESSION) {
+ output = alpha_src;
+ output_size = data_size;
+ ok = 1;
+ }
+
+ // Emit final result.
+ header = method | (filter << 2);
+ if (reduce_levels) header |= ALPHA_PREPROCESSED_LEVELS << 4;
+
+ VP8BitWriterInit(&result->bw, ALPHA_HEADER_LEN + output_size);
+ ok = ok && VP8BitWriterAppend(&result->bw, &header, ALPHA_HEADER_LEN);
+ ok = ok && VP8BitWriterAppend(&result->bw, output, output_size);
+
+ if (method != ALPHA_NO_COMPRESSION) {
+ VP8LBitWriterWipeOut(&tmp_bw);
+ }
+ ok = ok && !result->bw.error_;
+ result->score = VP8BitWriterSize(&result->bw);
+ return ok;
+}
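
The one-byte header assembled above packs three 2-bit fields; a decoder-side sketch of how they unpack (ParseAlphaHeader is a hypothetical helper, with the layout read directly off the 'header = method | (filter << 2)' construction and the ALPHA_PREPROCESSED_LEVELS << 4 flag):

/* Unpack the ALPHA_HEADER_LEN (1-byte) header written by EncodeAlphaInternal. */
static void ParseAlphaHeader(uint8_t header, int* const method,
                             int* const filter, int* const pre_processing) {
  *method         = header & 0x03;         // ALPHA_NO/LOSSLESS_COMPRESSION
  *filter         = (header >> 2) & 0x03;  // WEBP_FILTER_NONE .. GRADIENT
  *pre_processing = (header >> 4) & 0x03;  // ALPHA_PREPROCESSED_LEVELS if set
}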
+
+// -----------------------------------------------------------------------------
+
+static int GetNumColors(const uint8_t* data, int width, int height,
+ int stride) {
+ int j;
+ int colors = 0;
+ uint8_t color[256] = { 0 };
+
+ for (j = 0; j < height; ++j) {
+ int i;
+ const uint8_t* const p = data + j * stride;
+ for (i = 0; i < width; ++i) {
+ color[p[i]] = 1;
+ }
+ }
+ for (j = 0; j < 256; ++j) {
+ if (color[j] > 0) ++colors;
+ }
+ return colors;
+}
+
+#define FILTER_TRY_NONE (1 << WEBP_FILTER_NONE)
+#define FILTER_TRY_ALL ((1 << WEBP_FILTER_LAST) - 1)
+
+// Given the input 'filter' option, return an OR'd bit-set of filters to try.
+static uint32_t GetFilterMap(const uint8_t* alpha, int width, int height,
+ int filter, int effort_level) {
+ uint32_t bit_map = 0U;
+ if (filter == WEBP_FILTER_FAST) {
+ // Quick estimate of the best candidate.
+ int try_filter_none = (effort_level > 3);
+ const int kMinColorsForFilterNone = 16;
+ const int kMaxColorsForFilterNone = 192;
+ const int num_colors = GetNumColors(alpha, width, height, width);
+ // For low number of colors, NONE yields better compression.
+ filter = (num_colors <= kMinColorsForFilterNone)
+ ? WEBP_FILTER_NONE
+ : WebPEstimateBestFilter(alpha, width, height, width);
+ bit_map |= 1 << filter;
+ // For large number of colors, try FILTER_NONE in addition to the best
+ // filter as well.
+ if (try_filter_none || num_colors > kMaxColorsForFilterNone) {
+ bit_map |= FILTER_TRY_NONE;
+ }
+ } else if (filter == WEBP_FILTER_NONE) {
+ bit_map = FILTER_TRY_NONE;
+ } else { // WEBP_FILTER_BEST -> try all
+ bit_map = FILTER_TRY_ALL;
+ }
+ return bit_map;
+}
+
+static void InitFilterTrial(FilterTrial* const score) {
+ score->score = (size_t)~0U;
+ VP8BitWriterInit(&score->bw, 0);
+}
+
+static int ApplyFiltersAndEncode(const uint8_t* alpha, int width, int height,
+ size_t data_size, int method, int filter,
+ int reduce_levels, int effort_level,
+ uint8_t** const output,
+ size_t* const output_size,
+ WebPAuxStats* const stats) {
+ int ok = 1;
+ FilterTrial best;
+ uint32_t try_map =
+ GetFilterMap(alpha, width, height, filter, effort_level);
+ InitFilterTrial(&best);
+
+ if (try_map != FILTER_TRY_NONE) {
+ uint8_t* filtered_alpha = (uint8_t*)WebPSafeMalloc(1ULL, data_size);
+ if (filtered_alpha == NULL) return 0;
+
+ for (filter = WEBP_FILTER_NONE; ok && try_map; ++filter, try_map >>= 1) {
+ if (try_map & 1) {
+ FilterTrial trial;
+ ok = EncodeAlphaInternal(alpha, width, height, method, filter,
+ reduce_levels, effort_level, filtered_alpha,
+ &trial);
+ if (ok && trial.score < best.score) {
+ VP8BitWriterWipeOut(&best.bw);
+ best = trial;
+ } else {
+ VP8BitWriterWipeOut(&trial.bw);
+ }
+ }
+ }
+ WebPSafeFree(filtered_alpha);
+ } else {
+ ok = EncodeAlphaInternal(alpha, width, height, method, WEBP_FILTER_NONE,
+ reduce_levels, effort_level, NULL, &best);
+ }
+ if (ok) {
+#if !defined(WEBP_DISABLE_STATS)
+ if (stats != NULL) {
+ stats->lossless_features = best.stats.lossless_features;
+ stats->histogram_bits = best.stats.histogram_bits;
+ stats->transform_bits = best.stats.transform_bits;
+ stats->cache_bits = best.stats.cache_bits;
+ stats->palette_size = best.stats.palette_size;
+ stats->lossless_size = best.stats.lossless_size;
+ stats->lossless_hdr_size = best.stats.lossless_hdr_size;
+ stats->lossless_data_size = best.stats.lossless_data_size;
+ }
+#else
+ (void)stats;
+#endif
+ *output_size = VP8BitWriterSize(&best.bw);
+ *output = VP8BitWriterBuf(&best.bw);
+ } else {
+ VP8BitWriterWipeOut(&best.bw);
+ }
+ return ok;
+}
+
+static int EncodeAlpha(VP8Encoder* const enc,
+ int quality, int method, int filter,
+ int effort_level,
+ uint8_t** const output, size_t* const output_size) {
+ const WebPPicture* const pic = enc->pic_;
+ const int width = pic->width;
+ const int height = pic->height;
+
+ uint8_t* quant_alpha = NULL;
+ const size_t data_size = width * height;
+ uint64_t sse = 0;
+ int ok = 1;
+ const int reduce_levels = (quality < 100);
+
+ // quick correctness checks
+ assert((uint64_t)data_size == (uint64_t)width * height); // as per spec
+ assert(enc != NULL && pic != NULL && pic->a != NULL);
+ assert(output != NULL && output_size != NULL);
+ assert(width > 0 && height > 0);
+ assert(pic->a_stride >= width);
+ assert(filter >= WEBP_FILTER_NONE && filter <= WEBP_FILTER_FAST);
+
+ if (quality < 0 || quality > 100) {
+ return 0;
+ }
+
+ if (method < ALPHA_NO_COMPRESSION || method > ALPHA_LOSSLESS_COMPRESSION) {
+ return 0;
+ }
+
+ if (method == ALPHA_NO_COMPRESSION) {
+    // Don't filter, as filtering will have no impact on the compressed size.
+ filter = WEBP_FILTER_NONE;
+ }
+
+ quant_alpha = (uint8_t*)WebPSafeMalloc(1ULL, data_size);
+ if (quant_alpha == NULL) {
+ return 0;
+ }
+
+ // Extract alpha data (width x height) from raw_data (stride x height).
+ WebPCopyPlane(pic->a, pic->a_stride, quant_alpha, width, width, height);
+
+  if (reduce_levels) {  // No quantization required for 'quality = 100'.
+    // 16 alpha levels gives quite a low MSE w.r.t. the original alpha plane,
+    // and is mapped to the moderate quality 70. Hence Quality [0, 70] maps to
+    // Levels [2, 16], and Quality ]70, 100] maps to Levels ]16, 256].
+ const int alpha_levels = (quality <= 70) ? (2 + quality / 5)
+ : (16 + (quality - 70) * 8);
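+    // e.g. (illustrative): quality 50 -> 2 + 50 / 5 = 12 levels;
+    //      quality 90 -> 16 + (90 - 70) * 8 = 176 levels.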
+ ok = QuantizeLevels(quant_alpha, width, height, alpha_levels, &sse);
+ }
+
+ if (ok) {
+ VP8FiltersInit();
+ ok = ApplyFiltersAndEncode(quant_alpha, width, height, data_size, method,
+ filter, reduce_levels, effort_level, output,
+ output_size, pic->stats);
+#if !defined(WEBP_DISABLE_STATS)
+ if (pic->stats != NULL) { // need stats?
+ pic->stats->coded_size += (int)(*output_size);
+ enc->sse_[3] = sse;
+ }
+#endif
+ }
+
+ WebPSafeFree(quant_alpha);
+ return ok;
+}
+
+//------------------------------------------------------------------------------
+// Main calls
+
+static int CompressAlphaJob(void* arg1, void* unused) {
+ VP8Encoder* const enc = (VP8Encoder*)arg1;
+ const WebPConfig* config = enc->config_;
+ uint8_t* alpha_data = NULL;
+ size_t alpha_size = 0;
+ const int effort_level = config->method; // maps to [0..6]
+ const WEBP_FILTER_TYPE filter =
+ (config->alpha_filtering == 0) ? WEBP_FILTER_NONE :
+ (config->alpha_filtering == 1) ? WEBP_FILTER_FAST :
+ WEBP_FILTER_BEST;
+ if (!EncodeAlpha(enc, config->alpha_quality, config->alpha_compression,
+ filter, effort_level, &alpha_data, &alpha_size)) {
+ return 0;
+ }
+ if (alpha_size != (uint32_t)alpha_size) { // Soundness check.
+ WebPSafeFree(alpha_data);
+ return 0;
+ }
+ enc->alpha_data_size_ = (uint32_t)alpha_size;
+ enc->alpha_data_ = alpha_data;
+ (void)unused;
+ return 1;
+}
+
+void VP8EncInitAlpha(VP8Encoder* const enc) {
+ WebPInitAlphaProcessing();
+ enc->has_alpha_ = WebPPictureHasTransparency(enc->pic_);
+ enc->alpha_data_ = NULL;
+ enc->alpha_data_size_ = 0;
+ if (enc->thread_level_ > 0) {
+ WebPWorker* const worker = &enc->alpha_worker_;
+ WebPGetWorkerInterface()->Init(worker);
+ worker->data1 = enc;
+ worker->data2 = NULL;
+ worker->hook = CompressAlphaJob;
+ }
+}
+
+int VP8EncStartAlpha(VP8Encoder* const enc) {
+ if (enc->has_alpha_) {
+ if (enc->thread_level_ > 0) {
+ WebPWorker* const worker = &enc->alpha_worker_;
+ // Makes sure worker is good to go.
+ if (!WebPGetWorkerInterface()->Reset(worker)) {
+ return 0;
+ }
+ WebPGetWorkerInterface()->Launch(worker);
+ return 1;
+ } else {
+ return CompressAlphaJob(enc, NULL); // just do the job right away
+ }
+ }
+ return 1;
+}
+
+int VP8EncFinishAlpha(VP8Encoder* const enc) {
+ if (enc->has_alpha_) {
+ if (enc->thread_level_ > 0) {
+ WebPWorker* const worker = &enc->alpha_worker_;
+ if (!WebPGetWorkerInterface()->Sync(worker)) return 0; // error
+ }
+ }
+ return WebPReportProgress(enc->pic_, enc->percent_ + 20, &enc->percent_);
+}
+
+int VP8EncDeleteAlpha(VP8Encoder* const enc) {
+ int ok = 1;
+ if (enc->thread_level_ > 0) {
+ WebPWorker* const worker = &enc->alpha_worker_;
+ // finish anything left in flight
+ ok = WebPGetWorkerInterface()->Sync(worker);
+ // still need to end the worker, even if !ok
+ WebPGetWorkerInterface()->End(worker);
+ }
+ WebPSafeFree(enc->alpha_data_);
+ enc->alpha_data_ = NULL;
+ enc->alpha_data_size_ = 0;
+ enc->has_alpha_ = 0;
+ return ok;
+}
diff --git a/media/libwebp/enc/analysis_enc.c b/media/libwebp/enc/analysis_enc.c
new file mode 100644
index 0000000000..489434b3d1
--- /dev/null
+++ b/media/libwebp/enc/analysis_enc.c
@@ -0,0 +1,475 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Macroblock analysis
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include "../enc/vp8i_enc.h"
+#include "../enc/cost_enc.h"
+#include "../utils/utils.h"
+
+#define MAX_ITERS_K_MEANS 6
+
+//------------------------------------------------------------------------------
+// Smooth the segment map by replacing each isolated block with the majority
+// segment of its neighbours.
+
+static void SmoothSegmentMap(VP8Encoder* const enc) {
+ int n, x, y;
+ const int w = enc->mb_w_;
+ const int h = enc->mb_h_;
+ const int majority_cnt_3_x_3_grid = 5;
+ uint8_t* const tmp = (uint8_t*)WebPSafeMalloc(w * h, sizeof(*tmp));
+ assert((uint64_t)(w * h) == (uint64_t)w * h); // no overflow, as per spec
+
+ if (tmp == NULL) return;
+ for (y = 1; y < h - 1; ++y) {
+ for (x = 1; x < w - 1; ++x) {
+ int cnt[NUM_MB_SEGMENTS] = { 0 };
+ const VP8MBInfo* const mb = &enc->mb_info_[x + w * y];
+ int majority_seg = mb->segment_;
+ // Check the 8 neighbouring segment values.
+ cnt[mb[-w - 1].segment_]++; // top-left
+ cnt[mb[-w + 0].segment_]++; // top
+ cnt[mb[-w + 1].segment_]++; // top-right
+ cnt[mb[ - 1].segment_]++; // left
+ cnt[mb[ + 1].segment_]++; // right
+ cnt[mb[ w - 1].segment_]++; // bottom-left
+ cnt[mb[ w + 0].segment_]++; // bottom
+ cnt[mb[ w + 1].segment_]++; // bottom-right
+ for (n = 0; n < NUM_MB_SEGMENTS; ++n) {
+ if (cnt[n] >= majority_cnt_3_x_3_grid) {
+ majority_seg = n;
+ break;
+ }
+ }
+ tmp[x + y * w] = majority_seg;
+ }
+ }
+ for (y = 1; y < h - 1; ++y) {
+ for (x = 1; x < w - 1; ++x) {
+ VP8MBInfo* const mb = &enc->mb_info_[x + w * y];
+ mb->segment_ = tmp[x + y * w];
+ }
+ }
+ WebPSafeFree(tmp);
+}
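+
+// Example: a block assigned to segment 0 whose eight neighbours include five
+// or more blocks of segment 2 is re-assigned to segment 2; ties and smaller
+// counts leave the block's segment untouched.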
+
+//------------------------------------------------------------------------------
+// set segment susceptibility alpha_ / beta_
+
+static WEBP_INLINE int clip(int v, int m, int M) {
+ return (v < m) ? m : (v > M) ? M : v;
+}
+
+static void SetSegmentAlphas(VP8Encoder* const enc,
+ const int centers[NUM_MB_SEGMENTS],
+ int mid) {
+ const int nb = enc->segment_hdr_.num_segments_;
+ int min = centers[0], max = centers[0];
+ int n;
+
+ if (nb > 1) {
+ for (n = 0; n < nb; ++n) {
+ if (min > centers[n]) min = centers[n];
+ if (max < centers[n]) max = centers[n];
+ }
+ }
+ if (max == min) max = min + 1;
+ assert(mid <= max && mid >= min);
+ for (n = 0; n < nb; ++n) {
+ const int alpha = 255 * (centers[n] - mid) / (max - min);
+ const int beta = 255 * (centers[n] - min) / (max - min);
+ enc->dqm_[n].alpha_ = clip(alpha, -127, 127);
+ enc->dqm_[n].beta_ = clip(beta, 0, 255);
+ }
+}
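+
+// Worked example: with centers {10, 50, 90} and mid == 50, the loop above
+// produces alpha_ == {-127, 0, 127} and beta_ == {0, 127, 255} (integer
+// division, then clipping).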
+
+//------------------------------------------------------------------------------
+// Compute susceptibility based on DCT-coeff histograms:
+// the higher, the "easier" the macroblock is to compress.
+
+#define MAX_ALPHA 255 // 8b of precision for susceptibilities.
+#define ALPHA_SCALE (2 * MAX_ALPHA) // scaling factor for alpha.
+#define DEFAULT_ALPHA (-1)
+#define IS_BETTER_ALPHA(alpha, best_alpha) ((alpha) > (best_alpha))
+
+static int FinalAlphaValue(int alpha) {
+ alpha = MAX_ALPHA - alpha;
+ return clip(alpha, 0, MAX_ALPHA);
+}
+
+static int GetAlpha(const VP8Histogram* const histo) {
+  // 'alpha' will later be clipped to the [0..MAX_ALPHA] range, clamping
+  // outer values which happen to be mostly noise. This leaves the maximum
+  // precision for handling the useful small values which contribute most.
+ const int max_value = histo->max_value;
+ const int last_non_zero = histo->last_non_zero;
+ const int alpha =
+ (max_value > 1) ? ALPHA_SCALE * last_non_zero / max_value : 0;
+ return alpha;
+}
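+
+// For instance, a histogram with last_non_zero == 10 and max_value == 4
+// yields alpha == 510 * 10 / 4 == 1275; values beyond MAX_ALPHA are later
+// clamped once FinalAlphaValue() is applied to the mixed result.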
+
+static void InitHistogram(VP8Histogram* const histo) {
+ histo->max_value = 0;
+ histo->last_non_zero = 1;
+}
+
+//------------------------------------------------------------------------------
+// Simplified k-Means, to assign Nb segments based on alpha-histogram
+
+static void AssignSegments(VP8Encoder* const enc,
+ const int alphas[MAX_ALPHA + 1]) {
+  // 'num_segments_' is previously validated and <= NUM_MB_SEGMENTS, but an
+  // explicit check is needed to avoid a spurious warning about 'n + 1'
+  // exceeding the array bounds of 'centers' with some compilers (noticed
+  // with gcc-4.9).
+ const int nb = (enc->segment_hdr_.num_segments_ < NUM_MB_SEGMENTS) ?
+ enc->segment_hdr_.num_segments_ : NUM_MB_SEGMENTS;
+ int centers[NUM_MB_SEGMENTS];
+ int weighted_average = 0;
+ int map[MAX_ALPHA + 1];
+ int a, n, k;
+ int min_a = 0, max_a = MAX_ALPHA, range_a;
+ // 'int' type is ok for histo, and won't overflow
+ int accum[NUM_MB_SEGMENTS], dist_accum[NUM_MB_SEGMENTS];
+
+ assert(nb >= 1);
+ assert(nb <= NUM_MB_SEGMENTS);
+
+ // bracket the input
+ for (n = 0; n <= MAX_ALPHA && alphas[n] == 0; ++n) {}
+ min_a = n;
+ for (n = MAX_ALPHA; n > min_a && alphas[n] == 0; --n) {}
+ max_a = n;
+ range_a = max_a - min_a;
+
+ // Spread initial centers evenly
+ for (k = 0, n = 1; k < nb; ++k, n += 2) {
+ assert(n < 2 * nb);
+ centers[k] = min_a + (n * range_a) / (2 * nb);
+ }
+
+ for (k = 0; k < MAX_ITERS_K_MEANS; ++k) { // few iters are enough
+ int total_weight;
+ int displaced;
+ // Reset stats
+ for (n = 0; n < nb; ++n) {
+ accum[n] = 0;
+ dist_accum[n] = 0;
+ }
+ // Assign nearest center for each 'a'
+ n = 0; // track the nearest center for current 'a'
+ for (a = min_a; a <= max_a; ++a) {
+ if (alphas[a]) {
+ while (n + 1 < nb && abs(a - centers[n + 1]) < abs(a - centers[n])) {
+ n++;
+ }
+ map[a] = n;
+ // accumulate contribution into best centroid
+ dist_accum[n] += a * alphas[a];
+ accum[n] += alphas[a];
+ }
+ }
+    // All points are classified. Move each centroid to the center of its
+    // respective cloud.
+ displaced = 0;
+ weighted_average = 0;
+ total_weight = 0;
+ for (n = 0; n < nb; ++n) {
+ if (accum[n]) {
+ const int new_center = (dist_accum[n] + accum[n] / 2) / accum[n];
+ displaced += abs(centers[n] - new_center);
+ centers[n] = new_center;
+ weighted_average += new_center * accum[n];
+ total_weight += accum[n];
+ }
+ }
+ weighted_average = (weighted_average + total_weight / 2) / total_weight;
+ if (displaced < 5) break; // no need to keep on looping...
+ }
+
+ // Map each original value to the closest centroid
+ for (n = 0; n < enc->mb_w_ * enc->mb_h_; ++n) {
+ VP8MBInfo* const mb = &enc->mb_info_[n];
+ const int alpha = mb->alpha_;
+ mb->segment_ = map[alpha];
+ mb->alpha_ = centers[map[alpha]]; // for the record.
+ }
+
+ if (nb > 1) {
+ const int smooth = (enc->config_->preprocessing & 1);
+ if (smooth) SmoothSegmentMap(enc);
+ }
+
+ SetSegmentAlphas(enc, centers, weighted_average); // pick some alphas.
+}
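+
+// Example of the initial spread above: for nb == 2 over the full [0, 255]
+// alpha range, the centers start at 255 / 4 == 63 and 3 * 255 / 4 == 191,
+// and the k-means iterations then refine them.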
+
+//------------------------------------------------------------------------------
+// Macroblock analysis: collect histogram for each mode, deduce the maximal
+// susceptibility and set best modes for this macroblock.
+// Segment assignment is done later.
+
+// Number of modes to inspect for alpha_ evaluation. We don't need to test all
+// the possible modes during the analysis phase: we risk falling into a local
+// optimum or being subject to boundary effects.
+#define MAX_INTRA16_MODE 2
+#define MAX_INTRA4_MODE 2
+#define MAX_UV_MODE 2
+
+static int MBAnalyzeBestIntra16Mode(VP8EncIterator* const it) {
+ const int max_mode = MAX_INTRA16_MODE;
+ int mode;
+ int best_alpha = DEFAULT_ALPHA;
+ int best_mode = 0;
+
+ VP8MakeLuma16Preds(it);
+ for (mode = 0; mode < max_mode; ++mode) {
+ VP8Histogram histo;
+ int alpha;
+
+ InitHistogram(&histo);
+ VP8CollectHistogram(it->yuv_in_ + Y_OFF_ENC,
+ it->yuv_p_ + VP8I16ModeOffsets[mode],
+ 0, 16, &histo);
+ alpha = GetAlpha(&histo);
+ if (IS_BETTER_ALPHA(alpha, best_alpha)) {
+ best_alpha = alpha;
+ best_mode = mode;
+ }
+ }
+ VP8SetIntra16Mode(it, best_mode);
+ return best_alpha;
+}
+
+static int FastMBAnalyze(VP8EncIterator* const it) {
+  // Empirical cut-off value, which should be around 16 (~= block size). We
+  // use the [8-17] range and favor intra4 at high quality, intra16 at low
+  // quality.
+ const int q = (int)it->enc_->config_->quality;
+ const uint32_t kThreshold = 8 + (17 - 8) * q / 100;
+ int k;
+ uint32_t dc[16], m, m2;
+ for (k = 0; k < 16; k += 4) {
+ VP8Mean16x4(it->yuv_in_ + Y_OFF_ENC + k * BPS, &dc[k]);
+ }
+ for (m = 0, m2 = 0, k = 0; k < 16; ++k) {
+ m += dc[k];
+ m2 += dc[k] * dc[k];
+ }
+ if (kThreshold * m2 < m * m) {
+ VP8SetIntra16Mode(it, 0); // DC16
+ } else {
+ const uint8_t modes[16] = { 0 }; // DC4
+ VP8SetIntra4Mode(it, modes);
+ }
+ return 0;
+}
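+
+// Rough numbers for the cut-off above: quality 75 gives kThreshold ==
+// 8 + (17 - 8) * 75 / 100 == 14 (integer math). A perfectly flat block has
+// m * m == 16 * m2, so it picks DC16 for any threshold below 16, while
+// textured blocks fall through to the intra4 DC modes.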
+
+static int MBAnalyzeBestUVMode(VP8EncIterator* const it) {
+ int best_alpha = DEFAULT_ALPHA;
+ int smallest_alpha = 0;
+ int best_mode = 0;
+ const int max_mode = MAX_UV_MODE;
+ int mode;
+
+ VP8MakeChroma8Preds(it);
+ for (mode = 0; mode < max_mode; ++mode) {
+ VP8Histogram histo;
+ int alpha;
+ InitHistogram(&histo);
+ VP8CollectHistogram(it->yuv_in_ + U_OFF_ENC,
+ it->yuv_p_ + VP8UVModeOffsets[mode],
+ 16, 16 + 4 + 4, &histo);
+ alpha = GetAlpha(&histo);
+ if (IS_BETTER_ALPHA(alpha, best_alpha)) {
+ best_alpha = alpha;
+ }
+ // The best prediction mode tends to be the one with the smallest alpha.
+ if (mode == 0 || alpha < smallest_alpha) {
+ smallest_alpha = alpha;
+ best_mode = mode;
+ }
+ }
+ VP8SetIntraUVMode(it, best_mode);
+ return best_alpha;
+}
+
+static void MBAnalyze(VP8EncIterator* const it,
+ int alphas[MAX_ALPHA + 1],
+ int* const alpha, int* const uv_alpha) {
+ const VP8Encoder* const enc = it->enc_;
+ int best_alpha, best_uv_alpha;
+
+ VP8SetIntra16Mode(it, 0); // default: Intra16, DC_PRED
+ VP8SetSkip(it, 0); // not skipped
+ VP8SetSegment(it, 0); // default segment, spec-wise.
+
+ if (enc->method_ <= 1) {
+ best_alpha = FastMBAnalyze(it);
+ } else {
+ best_alpha = MBAnalyzeBestIntra16Mode(it);
+ }
+ best_uv_alpha = MBAnalyzeBestUVMode(it);
+
+ // Final susceptibility mix
+ best_alpha = (3 * best_alpha + best_uv_alpha + 2) >> 2;
+ best_alpha = FinalAlphaValue(best_alpha);
+ alphas[best_alpha]++;
+ it->mb_->alpha_ = best_alpha; // for later remapping.
+
+ // Accumulate for later complexity analysis.
+ *alpha += best_alpha; // mixed susceptibility (not just luma)
+ *uv_alpha += best_uv_alpha;
+}
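+
+// Sketch of the mix above: luma alpha 100 and UV alpha 60 combine to
+// (3 * 100 + 60 + 2) >> 2 == 90, which FinalAlphaValue() maps to
+// 255 - 90 == 165.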
+
+static void DefaultMBInfo(VP8MBInfo* const mb) {
+ mb->type_ = 1; // I16x16
+ mb->uv_mode_ = 0;
+ mb->skip_ = 0; // not skipped
+ mb->segment_ = 0; // default segment
+ mb->alpha_ = 0;
+}
+
+//------------------------------------------------------------------------------
+// Main analysis loop:
+// Collect all susceptibilities for each macroblock and record their
+// distribution in alphas[]. Segments are assigned a posteriori, based on
+// this histogram.
+// We also pick an intra16 prediction mode, which shouldn't be considered
+// final except for fast-encode settings. We can also pick some intra4 modes
+// and decide intra4/intra16, but that's almost always a bad choice at this
+// stage.
+
+static void ResetAllMBInfo(VP8Encoder* const enc) {
+ int n;
+ for (n = 0; n < enc->mb_w_ * enc->mb_h_; ++n) {
+ DefaultMBInfo(&enc->mb_info_[n]);
+ }
+ // Default susceptibilities.
+ enc->dqm_[0].alpha_ = 0;
+ enc->dqm_[0].beta_ = 0;
+ // Note: we can't compute this alpha_ / uv_alpha_ -> set to default value.
+ enc->alpha_ = 0;
+ enc->uv_alpha_ = 0;
+ WebPReportProgress(enc->pic_, enc->percent_ + 20, &enc->percent_);
+}
+
+// struct used to collect job result
+typedef struct {
+ WebPWorker worker;
+ int alphas[MAX_ALPHA + 1];
+ int alpha, uv_alpha;
+ VP8EncIterator it;
+ int delta_progress;
+} SegmentJob;
+
+// main work call
+static int DoSegmentsJob(void* arg1, void* arg2) {
+ SegmentJob* const job = (SegmentJob*)arg1;
+ VP8EncIterator* const it = (VP8EncIterator*)arg2;
+ int ok = 1;
+ if (!VP8IteratorIsDone(it)) {
+ uint8_t tmp[32 + WEBP_ALIGN_CST];
+ uint8_t* const scratch = (uint8_t*)WEBP_ALIGN(tmp);
+ do {
+ // Let's pretend we have perfect lossless reconstruction.
+ VP8IteratorImport(it, scratch);
+ MBAnalyze(it, job->alphas, &job->alpha, &job->uv_alpha);
+ ok = VP8IteratorProgress(it, job->delta_progress);
+ } while (ok && VP8IteratorNext(it));
+ }
+ return ok;
+}
+
+static void MergeJobs(const SegmentJob* const src, SegmentJob* const dst) {
+ int i;
+ for (i = 0; i <= MAX_ALPHA; ++i) dst->alphas[i] += src->alphas[i];
+ dst->alpha += src->alpha;
+ dst->uv_alpha += src->uv_alpha;
+}
+
+// initialize the job struct with some tasks to perform
+static void InitSegmentJob(VP8Encoder* const enc, SegmentJob* const job,
+ int start_row, int end_row) {
+ WebPGetWorkerInterface()->Init(&job->worker);
+ job->worker.data1 = job;
+ job->worker.data2 = &job->it;
+ job->worker.hook = DoSegmentsJob;
+ VP8IteratorInit(enc, &job->it);
+ VP8IteratorSetRow(&job->it, start_row);
+ VP8IteratorSetCountDown(&job->it, (end_row - start_row) * enc->mb_w_);
+ memset(job->alphas, 0, sizeof(job->alphas));
+ job->alpha = 0;
+ job->uv_alpha = 0;
+  // only one of the two jobs can record the progress, since we don't
+  // expect the user's hook to be thread-safe
+ job->delta_progress = (start_row == 0) ? 20 : 0;
+}
+
+// main entry point
+int VP8EncAnalyze(VP8Encoder* const enc) {
+ int ok = 1;
+ const int do_segments =
+ enc->config_->emulate_jpeg_size || // We need the complexity evaluation.
+ (enc->segment_hdr_.num_segments_ > 1) ||
+ (enc->method_ <= 1); // for method 0 - 1, we need preds_[] to be filled.
+ if (do_segments) {
+ const int last_row = enc->mb_h_;
+    // We give a little more than half the work to the main thread.
+ const int split_row = (9 * last_row + 15) >> 4;
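+    // (E.g. last_row == 30 gives split_row == (9 * 30 + 15) >> 4 == 17, so
+    // the main thread analyzes rows [0, 17) and the side job rows [17, 30).)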
+ const int total_mb = last_row * enc->mb_w_;
+#ifdef WEBP_USE_THREAD
+ const int kMinSplitRow = 2; // minimal rows needed for mt to be worth it
+ const int do_mt = (enc->thread_level_ > 0) && (split_row >= kMinSplitRow);
+#else
+ const int do_mt = 0;
+#endif
+ const WebPWorkerInterface* const worker_interface =
+ WebPGetWorkerInterface();
+ SegmentJob main_job;
+ if (do_mt) {
+ SegmentJob side_job;
+      // Note the use of '&=' instead of '&&' below, because the functions
+      // must be called no matter what.
+ InitSegmentJob(enc, &main_job, 0, split_row);
+ InitSegmentJob(enc, &side_job, split_row, last_row);
+ // we don't need to call Reset() on main_job.worker, since we're calling
+ // WebPWorkerExecute() on it
+ ok &= worker_interface->Reset(&side_job.worker);
+ // launch the two jobs in parallel
+ if (ok) {
+ worker_interface->Launch(&side_job.worker);
+ worker_interface->Execute(&main_job.worker);
+ ok &= worker_interface->Sync(&side_job.worker);
+ ok &= worker_interface->Sync(&main_job.worker);
+ }
+ worker_interface->End(&side_job.worker);
+ if (ok) MergeJobs(&side_job, &main_job); // merge results together
+ } else {
+ // Even for single-thread case, we use the generic Worker tools.
+ InitSegmentJob(enc, &main_job, 0, last_row);
+ worker_interface->Execute(&main_job.worker);
+ ok &= worker_interface->Sync(&main_job.worker);
+ }
+ worker_interface->End(&main_job.worker);
+ if (ok) {
+ enc->alpha_ = main_job.alpha / total_mb;
+ enc->uv_alpha_ = main_job.uv_alpha / total_mb;
+ AssignSegments(enc, main_job.alphas);
+ }
+ } else { // Use only one default segment.
+ ResetAllMBInfo(enc);
+ }
+ return ok;
+}
+
diff --git a/media/libwebp/enc/backward_references_cost_enc.c b/media/libwebp/enc/backward_references_cost_enc.c
new file mode 100644
index 0000000000..59e2c0f611
--- /dev/null
+++ b/media/libwebp/enc/backward_references_cost_enc.c
@@ -0,0 +1,790 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Improves a given set of backward references by analyzing its bit cost.
+// The algorithm is similar to the Zopfli compression algorithm but tailored to
+// images.
+//
+// Author: Vincent Rabaud (vrabaud@google.com)
+//
+
+#include <assert.h>
+
+#include "../enc/backward_references_enc.h"
+#include "../enc/histogram_enc.h"
+#include "../dsp/lossless_common.h"
+#include "../utils/color_cache_utils.h"
+#include "../utils/utils.h"
+
+#define VALUES_IN_BYTE 256
+
+extern void VP8LClearBackwardRefs(VP8LBackwardRefs* const refs);
+extern int VP8LDistanceToPlaneCode(int xsize, int dist);
+extern void VP8LBackwardRefsCursorAdd(VP8LBackwardRefs* const refs,
+ const PixOrCopy v);
+
+typedef struct {
+ double alpha_[VALUES_IN_BYTE];
+ double red_[VALUES_IN_BYTE];
+ double blue_[VALUES_IN_BYTE];
+ double distance_[NUM_DISTANCE_CODES];
+ double* literal_;
+} CostModel;
+
+static void ConvertPopulationCountTableToBitEstimates(
+ int num_symbols, const uint32_t population_counts[], double output[]) {
+ uint32_t sum = 0;
+ int nonzeros = 0;
+ int i;
+ for (i = 0; i < num_symbols; ++i) {
+ sum += population_counts[i];
+ if (population_counts[i] > 0) {
+ ++nonzeros;
+ }
+ }
+ if (nonzeros <= 1) {
+ memset(output, 0, num_symbols * sizeof(*output));
+ } else {
+ const double logsum = VP8LFastLog2(sum);
+ for (i = 0; i < num_symbols; ++i) {
+ output[i] = logsum - VP8LFastLog2(population_counts[i]);
+ }
+ }
+}
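+
+// The estimate above is the ideal Shannon code length: output[i] ==
+// log2(sum) - log2(population_counts[i]) == -log2(p_i) bits for a symbol of
+// probability p_i, computed with the fast log2 approximation.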
+
+static int CostModelBuild(CostModel* const m, int xsize, int cache_bits,
+ const VP8LBackwardRefs* const refs) {
+ int ok = 0;
+ VP8LRefsCursor c = VP8LRefsCursorInit(refs);
+ VP8LHistogram* const histo = VP8LAllocateHistogram(cache_bits);
+ if (histo == NULL) goto Error;
+
+ // The following code is similar to VP8LHistogramCreate but converts the
+ // distance to plane code.
+ VP8LHistogramInit(histo, cache_bits, /*init_arrays=*/ 1);
+ while (VP8LRefsCursorOk(&c)) {
+ VP8LHistogramAddSinglePixOrCopy(histo, c.cur_pos, VP8LDistanceToPlaneCode,
+ xsize);
+ VP8LRefsCursorNext(&c);
+ }
+
+ ConvertPopulationCountTableToBitEstimates(
+ VP8LHistogramNumCodes(histo->palette_code_bits_),
+ histo->literal_, m->literal_);
+ ConvertPopulationCountTableToBitEstimates(
+ VALUES_IN_BYTE, histo->red_, m->red_);
+ ConvertPopulationCountTableToBitEstimates(
+ VALUES_IN_BYTE, histo->blue_, m->blue_);
+ ConvertPopulationCountTableToBitEstimates(
+ VALUES_IN_BYTE, histo->alpha_, m->alpha_);
+ ConvertPopulationCountTableToBitEstimates(
+ NUM_DISTANCE_CODES, histo->distance_, m->distance_);
+ ok = 1;
+
+ Error:
+ VP8LFreeHistogram(histo);
+ return ok;
+}
+
+static WEBP_INLINE double GetLiteralCost(const CostModel* const m, uint32_t v) {
+ return m->alpha_[v >> 24] +
+ m->red_[(v >> 16) & 0xff] +
+ m->literal_[(v >> 8) & 0xff] +
+ m->blue_[v & 0xff];
+}
+
+static WEBP_INLINE double GetCacheCost(const CostModel* const m, uint32_t idx) {
+ const int literal_idx = VALUES_IN_BYTE + NUM_LENGTH_CODES + idx;
+ return m->literal_[literal_idx];
+}
+
+static WEBP_INLINE double GetLengthCost(const CostModel* const m,
+ uint32_t length) {
+ int code, extra_bits;
+ VP8LPrefixEncodeBits(length, &code, &extra_bits);
+ return m->literal_[VALUES_IN_BYTE + code] + extra_bits;
+}
+
+static WEBP_INLINE double GetDistanceCost(const CostModel* const m,
+ uint32_t distance) {
+ int code, extra_bits;
+ VP8LPrefixEncodeBits(distance, &code, &extra_bits);
+ return m->distance_[code] + extra_bits;
+}
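+
+// In both helpers above, a value that prefix-encodes to code 'c' with 'n'
+// extra bits is modeled as costing the entropy estimate for 'c' plus 'n'
+// bits, the extra bits being assumed to be stored verbatim.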
+
+static WEBP_INLINE void AddSingleLiteralWithCostModel(
+ const uint32_t* const argb, VP8LColorCache* const hashers,
+ const CostModel* const cost_model, int idx, int use_color_cache,
+ float prev_cost, float* const cost, uint16_t* const dist_array) {
+ double cost_val = prev_cost;
+ const uint32_t color = argb[idx];
+ const int ix = use_color_cache ? VP8LColorCacheContains(hashers, color) : -1;
+ if (ix >= 0) {
+ // use_color_cache is true and hashers contains color
+ const double mul0 = 0.68;
+ cost_val += GetCacheCost(cost_model, ix) * mul0;
+ } else {
+ const double mul1 = 0.82;
+ if (use_color_cache) VP8LColorCacheInsert(hashers, color);
+ cost_val += GetLiteralCost(cost_model, color) * mul1;
+ }
+ if (cost[idx] > cost_val) {
+ cost[idx] = (float)cost_val;
+ dist_array[idx] = 1; // only one is inserted.
+ }
+}
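+
+// The 0.68 and 0.82 multipliers above are, presumably, empirically tuned
+// discounts that bias this Zopfli-like pass toward cache hits and literals
+// respectively; they do not reflect exact bit costs.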
+
+// -----------------------------------------------------------------------------
+// CostManager and interval handling
+
+// Empirical value that avoids high memory consumption while staying fast.
+#define COST_CACHE_INTERVAL_SIZE_MAX 500
+
+// To perform a backward reference search, every pixel at index 'index_' is
+// considered and the cost for the MAX_LENGTH following pixels is computed.
+// Those following pixels at index index_ + k (k from 0 to MAX_LENGTH - 1)
+// have a cost of:
+// cost_ = distance cost at index + GetLengthCost(cost_model, k)
+// and the minimum value is kept. GetLengthCost(cost_model, k) is cached in an
+// array of size MAX_LENGTH.
+// Instead of performing MAX_LENGTH comparisons per pixel, we keep track of the
+// minimal values using intervals of constant cost.
+// An interval is defined by the index_ of the pixel that generated it and
+// is only useful in a range of indices from start_ to end_ (exclusive), i.e.
+// it contains the minimum value for pixels between start_ and end_.
+// Intervals are stored in a linked list and ordered by start_. When a new
+// interval has a better value, old intervals are split or removed. There are
+// therefore no overlapping intervals.
+typedef struct CostInterval CostInterval;
+struct CostInterval {
+ float cost_;
+ int start_;
+ int end_;
+ int index_;
+ CostInterval* previous_;
+ CostInterval* next_;
+};
+
+// The GetLengthCost(cost_model, k) are cached in a CostCacheInterval.
+typedef struct {
+ double cost_;
+ int start_;
+ int end_; // Exclusive.
+} CostCacheInterval;
+
+// This structure is in charge of managing intervals and costs.
+// It caches the CostCacheInterval's, the GetLengthCost(cost_model, k) values
+// in cost_cache_, and the CostInterval's (whose count_ is limited by
+// COST_CACHE_INTERVAL_SIZE_MAX).
+#define COST_MANAGER_MAX_FREE_LIST 10
+typedef struct {
+ CostInterval* head_;
+ int count_; // The number of stored intervals.
+ CostCacheInterval* cache_intervals_;
+ size_t cache_intervals_size_;
+ double cost_cache_[MAX_LENGTH]; // Contains the GetLengthCost(cost_model, k).
+ float* costs_;
+ uint16_t* dist_array_;
+  // Most of the time, we only need a few intervals -> use a free-list, to
+  // avoid fragmentation with small allocs in the most common cases.
+ CostInterval intervals_[COST_MANAGER_MAX_FREE_LIST];
+ CostInterval* free_intervals_;
+  // These are regularly malloc'd remains. Note that this list can't grow
+  // larger than COST_CACHE_INTERVAL_SIZE_MAX - COST_MANAGER_MAX_FREE_LIST.
+ CostInterval* recycled_intervals_;
+} CostManager;
+
+static void CostIntervalAddToFreeList(CostManager* const manager,
+ CostInterval* const interval) {
+ interval->next_ = manager->free_intervals_;
+ manager->free_intervals_ = interval;
+}
+
+static int CostIntervalIsInFreeList(const CostManager* const manager,
+ const CostInterval* const interval) {
+ return (interval >= &manager->intervals_[0] &&
+ interval <= &manager->intervals_[COST_MANAGER_MAX_FREE_LIST - 1]);
+}
+
+static void CostManagerInitFreeList(CostManager* const manager) {
+ int i;
+ manager->free_intervals_ = NULL;
+ for (i = 0; i < COST_MANAGER_MAX_FREE_LIST; ++i) {
+ CostIntervalAddToFreeList(manager, &manager->intervals_[i]);
+ }
+}
+
+static void DeleteIntervalList(CostManager* const manager,
+ const CostInterval* interval) {
+ while (interval != NULL) {
+ const CostInterval* const next = interval->next_;
+ if (!CostIntervalIsInFreeList(manager, interval)) {
+ WebPSafeFree((void*)interval);
+ } // else: do nothing
+ interval = next;
+ }
+}
+
+static void CostManagerClear(CostManager* const manager) {
+ if (manager == NULL) return;
+
+ WebPSafeFree(manager->costs_);
+ WebPSafeFree(manager->cache_intervals_);
+
+ // Clear the interval lists.
+ DeleteIntervalList(manager, manager->head_);
+ manager->head_ = NULL;
+ DeleteIntervalList(manager, manager->recycled_intervals_);
+ manager->recycled_intervals_ = NULL;
+
+ // Reset pointers, count_ and cache_intervals_size_.
+ memset(manager, 0, sizeof(*manager));
+ CostManagerInitFreeList(manager);
+}
+
+static int CostManagerInit(CostManager* const manager,
+ uint16_t* const dist_array, int pix_count,
+ const CostModel* const cost_model) {
+ int i;
+ const int cost_cache_size = (pix_count > MAX_LENGTH) ? MAX_LENGTH : pix_count;
+
+ manager->costs_ = NULL;
+ manager->cache_intervals_ = NULL;
+ manager->head_ = NULL;
+ manager->recycled_intervals_ = NULL;
+ manager->count_ = 0;
+ manager->dist_array_ = dist_array;
+ CostManagerInitFreeList(manager);
+
+ // Fill in the cost_cache_.
+ manager->cache_intervals_size_ = 1;
+ manager->cost_cache_[0] = GetLengthCost(cost_model, 0);
+ for (i = 1; i < cost_cache_size; ++i) {
+ manager->cost_cache_[i] = GetLengthCost(cost_model, i);
+ // Get the number of bound intervals.
+ if (manager->cost_cache_[i] != manager->cost_cache_[i - 1]) {
+ ++manager->cache_intervals_size_;
+ }
+ }
+
+  // With the current cost model, we usually have fewer than 20 intervals.
+  // The worst-case scenario would be every length having a different cost,
+  // hence MAX_LENGTH intervals, but that is impossible with the current
+  // implementation, which spirals around a pixel.
+ assert(manager->cache_intervals_size_ <= MAX_LENGTH);
+ manager->cache_intervals_ = (CostCacheInterval*)WebPSafeMalloc(
+ manager->cache_intervals_size_, sizeof(*manager->cache_intervals_));
+ if (manager->cache_intervals_ == NULL) {
+ CostManagerClear(manager);
+ return 0;
+ }
+
+ // Fill in the cache_intervals_.
+ {
+ CostCacheInterval* cur = manager->cache_intervals_;
+
+    // Consecutive values in cost_cache_ are compared and, whenever a
+    // difference is found, a new interval is created and bounded.
+ cur->start_ = 0;
+ cur->end_ = 1;
+ cur->cost_ = manager->cost_cache_[0];
+ for (i = 1; i < cost_cache_size; ++i) {
+ const double cost_val = manager->cost_cache_[i];
+ if (cost_val != cur->cost_) {
+ ++cur;
+ // Initialize an interval.
+ cur->start_ = i;
+ cur->cost_ = cost_val;
+ }
+ cur->end_ = i + 1;
+ }
+ }
+
+ manager->costs_ = (float*)WebPSafeMalloc(pix_count, sizeof(*manager->costs_));
+ if (manager->costs_ == NULL) {
+ CostManagerClear(manager);
+ return 0;
+ }
+ // Set the initial costs_ high for every pixel as we will keep the minimum.
+ for (i = 0; i < pix_count; ++i) manager->costs_[i] = 1e38f;
+
+ return 1;
+}
+
+// Given the cost and the position that define an interval, update the cost at
+// pixel 'i' if it is smaller than the previously computed value.
+static WEBP_INLINE void UpdateCost(CostManager* const manager, int i,
+ int position, float cost) {
+ const int k = i - position;
+ assert(k >= 0 && k < MAX_LENGTH);
+
+ if (manager->costs_[i] > cost) {
+ manager->costs_[i] = cost;
+ manager->dist_array_[i] = k + 1;
+ }
+}
+
+// Given the cost and the position that define an interval, update the cost for
+// all the pixels between 'start' and 'end' excluded.
+static WEBP_INLINE void UpdateCostPerInterval(CostManager* const manager,
+ int start, int end, int position,
+ float cost) {
+ int i;
+ for (i = start; i < end; ++i) UpdateCost(manager, i, position, cost);
+}
+
+// Given two intervals, make 'prev' be the previous one of 'next' in 'manager'.
+static WEBP_INLINE void ConnectIntervals(CostManager* const manager,
+ CostInterval* const prev,
+ CostInterval* const next) {
+ if (prev != NULL) {
+ prev->next_ = next;
+ } else {
+ manager->head_ = next;
+ }
+
+ if (next != NULL) next->previous_ = prev;
+}
+
+// Pop an interval in the manager.
+static WEBP_INLINE void PopInterval(CostManager* const manager,
+ CostInterval* const interval) {
+ if (interval == NULL) return;
+
+ ConnectIntervals(manager, interval->previous_, interval->next_);
+ if (CostIntervalIsInFreeList(manager, interval)) {
+ CostIntervalAddToFreeList(manager, interval);
+ } else { // recycle regularly malloc'd intervals too
+ interval->next_ = manager->recycled_intervals_;
+ manager->recycled_intervals_ = interval;
+ }
+ --manager->count_;
+ assert(manager->count_ >= 0);
+}
+
+// Update the cost at index i by going over all the stored intervals that
+// overlap with i.
+// If 'do_clean_intervals' is non-zero, intervals that end before 'i' will be
+// popped.
+static WEBP_INLINE void UpdateCostAtIndex(CostManager* const manager, int i,
+ int do_clean_intervals) {
+ CostInterval* current = manager->head_;
+
+ while (current != NULL && current->start_ <= i) {
+ CostInterval* const next = current->next_;
+ if (current->end_ <= i) {
+ if (do_clean_intervals) {
+ // We have an outdated interval, remove it.
+ PopInterval(manager, current);
+ }
+ } else {
+ UpdateCost(manager, i, current->index_, current->cost_);
+ }
+ current = next;
+ }
+}
+
+// Given a currently orphaned interval and the interval that preceded it
+// before it was orphaned (which can be NULL), place it back at the right
+// spot in the list of intervals, using the start_ ordering and the previous
+// interval as a hint.
+static WEBP_INLINE void PositionOrphanInterval(CostManager* const manager,
+ CostInterval* const current,
+ CostInterval* previous) {
+ assert(current != NULL);
+
+ if (previous == NULL) previous = manager->head_;
+ while (previous != NULL && current->start_ < previous->start_) {
+ previous = previous->previous_;
+ }
+ while (previous != NULL && previous->next_ != NULL &&
+ previous->next_->start_ < current->start_) {
+ previous = previous->next_;
+ }
+
+ if (previous != NULL) {
+ ConnectIntervals(manager, current, previous->next_);
+ } else {
+ ConnectIntervals(manager, current, manager->head_);
+ }
+ ConnectIntervals(manager, previous, current);
+}
+
+// Insert an interval in the list contained in the manager, using interval_in
+// as a search hint. The intervals are sorted by start_ value.
+static WEBP_INLINE void InsertInterval(CostManager* const manager,
+ CostInterval* const interval_in,
+ float cost, int position, int start,
+ int end) {
+ CostInterval* interval_new;
+
+ if (start >= end) return;
+ if (manager->count_ >= COST_CACHE_INTERVAL_SIZE_MAX) {
+ // Serialize the interval if we cannot store it.
+ UpdateCostPerInterval(manager, start, end, position, cost);
+ return;
+ }
+ if (manager->free_intervals_ != NULL) {
+ interval_new = manager->free_intervals_;
+ manager->free_intervals_ = interval_new->next_;
+ } else if (manager->recycled_intervals_ != NULL) {
+ interval_new = manager->recycled_intervals_;
+ manager->recycled_intervals_ = interval_new->next_;
+ } else { // malloc for good
+ interval_new = (CostInterval*)WebPSafeMalloc(1, sizeof(*interval_new));
+ if (interval_new == NULL) {
+ // Write down the interval if we cannot create it.
+ UpdateCostPerInterval(manager, start, end, position, cost);
+ return;
+ }
+ }
+
+ interval_new->cost_ = cost;
+ interval_new->index_ = position;
+ interval_new->start_ = start;
+ interval_new->end_ = end;
+ PositionOrphanInterval(manager, interval_new, interval_in);
+
+ ++manager->count_;
+}
+
+// Given a new cost interval defined by its start at position, its length value
+// and distance_cost, add its contributions to the previous intervals and costs.
+// If handling the interval or one of its subintervals becomes too heavy, its
+// contribution is added to the costs right away.
+static WEBP_INLINE void PushInterval(CostManager* const manager,
+ double distance_cost, int position,
+ int len) {
+ size_t i;
+ CostInterval* interval = manager->head_;
+ CostInterval* interval_next;
+ const CostCacheInterval* const cost_cache_intervals =
+ manager->cache_intervals_;
+ // If the interval is small enough, no need to deal with the heavy
+ // interval logic, just serialize it right away. This constant is empirical.
+ const int kSkipDistance = 10;
+
+ if (len < kSkipDistance) {
+ int j;
+ for (j = position; j < position + len; ++j) {
+ const int k = j - position;
+ float cost_tmp;
+ assert(k >= 0 && k < MAX_LENGTH);
+ cost_tmp = (float)(distance_cost + manager->cost_cache_[k]);
+
+ if (manager->costs_[j] > cost_tmp) {
+ manager->costs_[j] = cost_tmp;
+ manager->dist_array_[j] = k + 1;
+ }
+ }
+ return;
+ }
+
+ for (i = 0; i < manager->cache_intervals_size_ &&
+ cost_cache_intervals[i].start_ < len;
+ ++i) {
+ // Define the intersection of the ith interval with the new one.
+ int start = position + cost_cache_intervals[i].start_;
+ const int end = position + (cost_cache_intervals[i].end_ > len
+ ? len
+ : cost_cache_intervals[i].end_);
+ const float cost = (float)(distance_cost + cost_cache_intervals[i].cost_);
+
+ for (; interval != NULL && interval->start_ < end;
+ interval = interval_next) {
+ interval_next = interval->next_;
+
+ // Make sure we have some overlap
+ if (start >= interval->end_) continue;
+
+ if (cost >= interval->cost_) {
+        // In the diagrams below, the lower an interval is drawn, the better
+        // (cheaper) it is.
+ // [**********************************************************[
+ // start end
+ // [----------------------------------[
+ // interval->start_ interval->end_
+ // If we are worse than what we already have, add whatever we have so
+ // far up to interval.
+ const int start_new = interval->end_;
+ InsertInterval(manager, interval, cost, position, start,
+ interval->start_);
+ start = start_new;
+ if (start >= end) break;
+ continue;
+ }
+
+ if (start <= interval->start_) {
+ if (interval->end_ <= end) {
+ // [----------------------------------[
+ // interval->start_ interval->end_
+ // [**************************************************************[
+ // start end
+ // We can safely remove the old interval as it is fully included.
+ PopInterval(manager, interval);
+ } else {
+ // [------------------------------------[
+ // interval->start_ interval->end_
+ // [*****************************[
+ // start end
+ interval->start_ = end;
+ break;
+ }
+ } else {
+ if (end < interval->end_) {
+ // [--------------------------------------------------------------[
+ // interval->start_ interval->end_
+ // [*****************************[
+ // start end
+ // We have to split the old interval as it fully contains the new one.
+ const int end_original = interval->end_;
+ interval->end_ = start;
+ InsertInterval(manager, interval, interval->cost_, interval->index_,
+ end, end_original);
+ interval = interval->next_;
+ break;
+ } else {
+ // [------------------------------------[
+ // interval->start_ interval->end_
+ // [*****************************[
+ // start end
+ interval->end_ = start;
+ }
+ }
+ }
+ // Insert the remaining interval from start to end.
+ InsertInterval(manager, interval, cost, position, start, end);
+ }
+}
+
+static int BackwardReferencesHashChainDistanceOnly(
+ int xsize, int ysize, const uint32_t* const argb, int cache_bits,
+ const VP8LHashChain* const hash_chain, const VP8LBackwardRefs* const refs,
+ uint16_t* const dist_array) {
+ int i;
+ int ok = 0;
+ int cc_init = 0;
+ const int pix_count = xsize * ysize;
+ const int use_color_cache = (cache_bits > 0);
+ const size_t literal_array_size =
+ sizeof(double) * (NUM_LITERAL_CODES + NUM_LENGTH_CODES +
+ ((cache_bits > 0) ? (1 << cache_bits) : 0));
+ const size_t cost_model_size = sizeof(CostModel) + literal_array_size;
+ CostModel* const cost_model =
+ (CostModel*)WebPSafeCalloc(1ULL, cost_model_size);
+ VP8LColorCache hashers;
+ CostManager* cost_manager =
+ (CostManager*)WebPSafeMalloc(1ULL, sizeof(*cost_manager));
+ int offset_prev = -1, len_prev = -1;
+ double offset_cost = -1;
+ int first_offset_is_constant = -1; // initialized with 'impossible' value
+ int reach = 0;
+
+ if (cost_model == NULL || cost_manager == NULL) goto Error;
+
+ cost_model->literal_ = (double*)(cost_model + 1);
+ if (use_color_cache) {
+ cc_init = VP8LColorCacheInit(&hashers, cache_bits);
+ if (!cc_init) goto Error;
+ }
+
+ if (!CostModelBuild(cost_model, xsize, cache_bits, refs)) {
+ goto Error;
+ }
+
+ if (!CostManagerInit(cost_manager, dist_array, pix_count, cost_model)) {
+ goto Error;
+ }
+
+  // We loop one pixel at a time, but store the currently best points for
+  // all the not-yet-processed locations reachable from this point.
+ dist_array[0] = 0;
+ // Add first pixel as literal.
+ AddSingleLiteralWithCostModel(argb, &hashers, cost_model, 0, use_color_cache,
+ 0.f, cost_manager->costs_, dist_array);
+
+ for (i = 1; i < pix_count; ++i) {
+ const float prev_cost = cost_manager->costs_[i - 1];
+ int offset, len;
+ VP8LHashChainFindCopy(hash_chain, i, &offset, &len);
+
+ // Try adding the pixel as a literal.
+ AddSingleLiteralWithCostModel(argb, &hashers, cost_model, i,
+ use_color_cache, prev_cost,
+ cost_manager->costs_, dist_array);
+
+ // If we are dealing with a non-literal.
+ if (len >= 2) {
+ if (offset != offset_prev) {
+ const int code = VP8LDistanceToPlaneCode(xsize, offset);
+ offset_cost = GetDistanceCost(cost_model, code);
+ first_offset_is_constant = 1;
+ PushInterval(cost_manager, prev_cost + offset_cost, i, len);
+ } else {
+ assert(offset_cost >= 0);
+ assert(len_prev >= 0);
+ assert(first_offset_is_constant == 0 || first_offset_is_constant == 1);
+ // Instead of considering all contributions from a pixel i by calling:
+ // PushInterval(cost_manager, prev_cost + offset_cost, i, len);
+ // we optimize these contributions in case offset_cost stays the same
+ // for consecutive pixels. This describes a set of pixels similar to a
+ // previous set (e.g. constant color regions).
+ if (first_offset_is_constant) {
+ reach = i - 1 + len_prev - 1;
+ first_offset_is_constant = 0;
+ }
+
+ if (i + len - 1 > reach) {
+        // We can only go further with the same offset if the previous
+ // length was maxed, hence len_prev == len == MAX_LENGTH.
+ // TODO(vrabaud), bump i to the end right away (insert cache and
+ // update cost).
+ // TODO(vrabaud), check if one of the points in between does not have
+ // a lower cost.
+ // Already consider the pixel at "reach" to add intervals that are
+ // better than whatever we add.
+ int offset_j, len_j = 0;
+ int j;
+ assert(len == MAX_LENGTH || len == pix_count - i);
+ // Figure out the last consecutive pixel within [i, reach + 1] with
+ // the same offset.
+ for (j = i; j <= reach; ++j) {
+ VP8LHashChainFindCopy(hash_chain, j + 1, &offset_j, &len_j);
+ if (offset_j != offset) {
+ VP8LHashChainFindCopy(hash_chain, j, &offset_j, &len_j);
+ break;
+ }
+ }
+ // Update the cost at j - 1 and j.
+ UpdateCostAtIndex(cost_manager, j - 1, 0);
+ UpdateCostAtIndex(cost_manager, j, 0);
+
+ PushInterval(cost_manager, cost_manager->costs_[j - 1] + offset_cost,
+ j, len_j);
+ reach = j + len_j - 1;
+ }
+ }
+ }
+
+ UpdateCostAtIndex(cost_manager, i, 1);
+ offset_prev = offset;
+ len_prev = len;
+ }
+
+ ok = !refs->error_;
+Error:
+ if (cc_init) VP8LColorCacheClear(&hashers);
+ CostManagerClear(cost_manager);
+ WebPSafeFree(cost_model);
+ WebPSafeFree(cost_manager);
+ return ok;
+}
+
+// We pack the path at the end of *dist_array and return
+// a pointer to this part of the array. Example:
+// dist_array = [1x2xx3x2] => packed [1x2x1232], chosen_path = [1232]
+static void TraceBackwards(uint16_t* const dist_array,
+ int dist_array_size,
+ uint16_t** const chosen_path,
+ int* const chosen_path_size) {
+ uint16_t* path = dist_array + dist_array_size;
+ uint16_t* cur = dist_array + dist_array_size - 1;
+ while (cur >= dist_array) {
+ const int k = *cur;
+ --path;
+ *path = k;
+ cur -= k;
+ }
+ *chosen_path = path;
+ *chosen_path_size = (int)(dist_array + dist_array_size - path);
+}
+
+static int BackwardReferencesHashChainFollowChosenPath(
+ const uint32_t* const argb, int cache_bits,
+ const uint16_t* const chosen_path, int chosen_path_size,
+ const VP8LHashChain* const hash_chain, VP8LBackwardRefs* const refs) {
+ const int use_color_cache = (cache_bits > 0);
+ int ix;
+ int i = 0;
+ int ok = 0;
+ int cc_init = 0;
+ VP8LColorCache hashers;
+
+ if (use_color_cache) {
+ cc_init = VP8LColorCacheInit(&hashers, cache_bits);
+ if (!cc_init) goto Error;
+ }
+
+ VP8LClearBackwardRefs(refs);
+ for (ix = 0; ix < chosen_path_size; ++ix) {
+ const int len = chosen_path[ix];
+ if (len != 1) {
+ int k;
+ const int offset = VP8LHashChainFindOffset(hash_chain, i);
+ VP8LBackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(offset, len));
+ if (use_color_cache) {
+ for (k = 0; k < len; ++k) {
+ VP8LColorCacheInsert(&hashers, argb[i + k]);
+ }
+ }
+ i += len;
+ } else {
+ PixOrCopy v;
+ const int idx =
+ use_color_cache ? VP8LColorCacheContains(&hashers, argb[i]) : -1;
+ if (idx >= 0) {
+ // use_color_cache is true and hashers contains argb[i]
+ // push pixel as a color cache index
+ v = PixOrCopyCreateCacheIdx(idx);
+ } else {
+ if (use_color_cache) VP8LColorCacheInsert(&hashers, argb[i]);
+ v = PixOrCopyCreateLiteral(argb[i]);
+ }
+ VP8LBackwardRefsCursorAdd(refs, v);
+ ++i;
+ }
+ }
+ ok = !refs->error_;
+ Error:
+ if (cc_init) VP8LColorCacheClear(&hashers);
+ return ok;
+}
+
+// Returns 1 on success.
+extern int VP8LBackwardReferencesTraceBackwards(
+ int xsize, int ysize, const uint32_t* const argb, int cache_bits,
+ const VP8LHashChain* const hash_chain,
+ const VP8LBackwardRefs* const refs_src, VP8LBackwardRefs* const refs_dst);
+int VP8LBackwardReferencesTraceBackwards(int xsize, int ysize,
+ const uint32_t* const argb,
+ int cache_bits,
+ const VP8LHashChain* const hash_chain,
+ const VP8LBackwardRefs* const refs_src,
+ VP8LBackwardRefs* const refs_dst) {
+ int ok = 0;
+ const int dist_array_size = xsize * ysize;
+ uint16_t* chosen_path = NULL;
+ int chosen_path_size = 0;
+ uint16_t* dist_array =
+ (uint16_t*)WebPSafeMalloc(dist_array_size, sizeof(*dist_array));
+
+ if (dist_array == NULL) goto Error;
+
+ if (!BackwardReferencesHashChainDistanceOnly(
+ xsize, ysize, argb, cache_bits, hash_chain, refs_src, dist_array)) {
+ goto Error;
+ }
+ TraceBackwards(dist_array, dist_array_size, &chosen_path, &chosen_path_size);
+ if (!BackwardReferencesHashChainFollowChosenPath(
+ argb, cache_bits, chosen_path, chosen_path_size, hash_chain,
+ refs_dst)) {
+ goto Error;
+ }
+ ok = 1;
+ Error:
+ WebPSafeFree(dist_array);
+ return ok;
+}
diff --git a/media/libwebp/enc/backward_references_enc.c b/media/libwebp/enc/backward_references_enc.c
new file mode 100644
index 0000000000..b78610565a
--- /dev/null
+++ b/media/libwebp/enc/backward_references_enc.c
@@ -0,0 +1,1030 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Author: Jyrki Alakuijala (jyrki@google.com)
+//
+
+#include <assert.h>
+#include <float.h>
+#include <math.h>
+
+#include "../dsp/dsp.h"
+#include "../dsp/lossless.h"
+#include "../dsp/lossless_common.h"
+#include "../enc/backward_references_enc.h"
+#include "../enc/histogram_enc.h"
+#include "../utils/color_cache_utils.h"
+#include "../utils/utils.h"
+
+#define MIN_BLOCK_SIZE 256 // minimum block size for backward references
+
+#define MAX_ENTROPY (1e30f)
+
+// 1M window (4M bytes) minus 120 special codes for short distances.
+#define WINDOW_SIZE ((1 << WINDOW_SIZE_BITS) - 120)
+
+// Minimum number of pixels for which it is cheaper to encode a
+// distance + length instead of each pixel as a literal.
+#define MIN_LENGTH 4
+
+// -----------------------------------------------------------------------------
+
+static const uint8_t plane_to_code_lut[128] = {
+ 96, 73, 55, 39, 23, 13, 5, 1, 255, 255, 255, 255, 255, 255, 255, 255,
+ 101, 78, 58, 42, 26, 16, 8, 2, 0, 3, 9, 17, 27, 43, 59, 79,
+ 102, 86, 62, 46, 32, 20, 10, 6, 4, 7, 11, 21, 33, 47, 63, 87,
+ 105, 90, 70, 52, 37, 28, 18, 14, 12, 15, 19, 29, 38, 53, 71, 91,
+ 110, 99, 82, 66, 48, 35, 30, 24, 22, 25, 31, 36, 49, 67, 83, 100,
+ 115, 108, 94, 76, 64, 50, 44, 40, 34, 41, 45, 51, 65, 77, 95, 109,
+ 118, 113, 103, 92, 80, 68, 60, 56, 54, 57, 61, 69, 81, 93, 104, 114,
+ 119, 116, 111, 106, 97, 88, 84, 74, 72, 75, 85, 89, 98, 107, 112, 117
+};
+
+extern int VP8LDistanceToPlaneCode(int xsize, int dist);
+int VP8LDistanceToPlaneCode(int xsize, int dist) {
+ const int yoffset = dist / xsize;
+ const int xoffset = dist - yoffset * xsize;
+ if (xoffset <= 8 && yoffset < 8) {
+ return plane_to_code_lut[yoffset * 16 + 8 - xoffset] + 1;
+ } else if (xoffset > xsize - 8 && yoffset < 7) {
+ return plane_to_code_lut[(yoffset + 1) * 16 + 8 + (xsize - xoffset)] + 1;
+ }
+ return dist + 120;
+}
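+
+// Example: dist == xsize (the pixel directly above) gives yoffset == 1 and
+// xoffset == 0, so plane_to_code_lut[24] == 0 and the function returns the
+// smallest code, 1. Distances outside the near neighbourhood fall through
+// to dist + 120.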
+
+// Returns the exact index where array1 and array2 are different. For an index
+// less than or equal to best_len_match, the return value just has to be
+// strictly less than best_len_match. The current behavior is to return 0 if
+// this index is best_len_match, and the index itself otherwise.
+// If the arrays are identical up to max_limit, it returns max_limit.
+static WEBP_INLINE int FindMatchLength(const uint32_t* const array1,
+ const uint32_t* const array2,
+ int best_len_match, int max_limit) {
+ // Before 'expensive' linear match, check if the two arrays match at the
+ // current best length index.
+ if (array1[best_len_match] != array2[best_len_match]) return 0;
+
+ return VP8LVectorMismatch(array1, array2, max_limit);
+}
+
+// -----------------------------------------------------------------------------
+// VP8LBackwardRefs
+
+struct PixOrCopyBlock {
+ PixOrCopyBlock* next_; // next block (or NULL)
+ PixOrCopy* start_; // data start
+ int size_; // currently used size
+};
+
+extern void VP8LClearBackwardRefs(VP8LBackwardRefs* const refs);
+void VP8LClearBackwardRefs(VP8LBackwardRefs* const refs) {
+ assert(refs != NULL);
+ if (refs->tail_ != NULL) {
+ *refs->tail_ = refs->free_blocks_; // recycle all blocks at once
+ }
+ refs->free_blocks_ = refs->refs_;
+ refs->tail_ = &refs->refs_;
+ refs->last_block_ = NULL;
+ refs->refs_ = NULL;
+}
+
+void VP8LBackwardRefsClear(VP8LBackwardRefs* const refs) {
+ assert(refs != NULL);
+ VP8LClearBackwardRefs(refs);
+ while (refs->free_blocks_ != NULL) {
+ PixOrCopyBlock* const next = refs->free_blocks_->next_;
+ WebPSafeFree(refs->free_blocks_);
+ refs->free_blocks_ = next;
+ }
+}
+
+// Swaps the content of two VP8LBackwardRefs.
+static void BackwardRefsSwap(VP8LBackwardRefs* const refs1,
+ VP8LBackwardRefs* const refs2) {
+ const int point_to_refs1 =
+ (refs1->tail_ != NULL && refs1->tail_ == &refs1->refs_);
+ const int point_to_refs2 =
+ (refs2->tail_ != NULL && refs2->tail_ == &refs2->refs_);
+ const VP8LBackwardRefs tmp = *refs1;
+ *refs1 = *refs2;
+ *refs2 = tmp;
+ if (point_to_refs2) refs1->tail_ = &refs1->refs_;
+ if (point_to_refs1) refs2->tail_ = &refs2->refs_;
+}
+
+void VP8LBackwardRefsInit(VP8LBackwardRefs* const refs, int block_size) {
+ assert(refs != NULL);
+ memset(refs, 0, sizeof(*refs));
+ refs->tail_ = &refs->refs_;
+ refs->block_size_ =
+ (block_size < MIN_BLOCK_SIZE) ? MIN_BLOCK_SIZE : block_size;
+}
+
+VP8LRefsCursor VP8LRefsCursorInit(const VP8LBackwardRefs* const refs) {
+ VP8LRefsCursor c;
+ c.cur_block_ = refs->refs_;
+ if (refs->refs_ != NULL) {
+ c.cur_pos = c.cur_block_->start_;
+ c.last_pos_ = c.cur_pos + c.cur_block_->size_;
+ } else {
+ c.cur_pos = NULL;
+ c.last_pos_ = NULL;
+ }
+ return c;
+}
+
+void VP8LRefsCursorNextBlock(VP8LRefsCursor* const c) {
+ PixOrCopyBlock* const b = c->cur_block_->next_;
+ c->cur_pos = (b == NULL) ? NULL : b->start_;
+ c->last_pos_ = (b == NULL) ? NULL : b->start_ + b->size_;
+ c->cur_block_ = b;
+}
+
+// Create a new block, either from the free list or freshly allocated.
+static PixOrCopyBlock* BackwardRefsNewBlock(VP8LBackwardRefs* const refs) {
+ PixOrCopyBlock* b = refs->free_blocks_;
+ if (b == NULL) { // allocate new memory chunk
+ const size_t total_size =
+ sizeof(*b) + refs->block_size_ * sizeof(*b->start_);
+ b = (PixOrCopyBlock*)WebPSafeMalloc(1ULL, total_size);
+ if (b == NULL) {
+ refs->error_ |= 1;
+ return NULL;
+ }
+ b->start_ = (PixOrCopy*)((uint8_t*)b + sizeof(*b)); // not always aligned
+ } else { // recycle from free-list
+ refs->free_blocks_ = b->next_;
+ }
+ *refs->tail_ = b;
+ refs->tail_ = &b->next_;
+ refs->last_block_ = b;
+ b->next_ = NULL;
+ b->size_ = 0;
+ return b;
+}
+
+// Return 1 on success, 0 on error.
+static int BackwardRefsClone(const VP8LBackwardRefs* const from,
+ VP8LBackwardRefs* const to) {
+ const PixOrCopyBlock* block_from = from->refs_;
+ VP8LClearBackwardRefs(to);
+ while (block_from != NULL) {
+ PixOrCopyBlock* const block_to = BackwardRefsNewBlock(to);
+ if (block_to == NULL) return 0;
+ memcpy(block_to->start_, block_from->start_,
+ block_from->size_ * sizeof(PixOrCopy));
+ block_to->size_ = block_from->size_;
+ block_from = block_from->next_;
+ }
+ return 1;
+}
+
+extern void VP8LBackwardRefsCursorAdd(VP8LBackwardRefs* const refs,
+ const PixOrCopy v);
+void VP8LBackwardRefsCursorAdd(VP8LBackwardRefs* const refs,
+ const PixOrCopy v) {
+ PixOrCopyBlock* b = refs->last_block_;
+ if (b == NULL || b->size_ == refs->block_size_) {
+ b = BackwardRefsNewBlock(refs);
+ if (b == NULL) return; // refs->error_ is set
+ }
+ b->start_[b->size_++] = v;
+}
+
+// -----------------------------------------------------------------------------
+// Hash chains
+
+int VP8LHashChainInit(VP8LHashChain* const p, int size) {
+ assert(p->size_ == 0);
+ assert(p->offset_length_ == NULL);
+ assert(size > 0);
+ p->offset_length_ =
+ (uint32_t*)WebPSafeMalloc(size, sizeof(*p->offset_length_));
+ if (p->offset_length_ == NULL) return 0;
+ p->size_ = size;
+
+ return 1;
+}
+
+void VP8LHashChainClear(VP8LHashChain* const p) {
+ assert(p != NULL);
+ WebPSafeFree(p->offset_length_);
+
+ p->size_ = 0;
+ p->offset_length_ = NULL;
+}
+
+// -----------------------------------------------------------------------------
+
+static const uint32_t kHashMultiplierHi = 0xc6a4a793u;
+static const uint32_t kHashMultiplierLo = 0x5bd1e996u;
+
+static WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW WEBP_INLINE
+uint32_t GetPixPairHash64(const uint32_t* const argb) {
+ uint32_t key;
+ key = argb[1] * kHashMultiplierHi;
+ key += argb[0] * kHashMultiplierLo;
+ key = key >> (32 - HASH_BITS);
+ return key;
+}
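+
+// The two multipliers appear to be MurmurHash-style mixing constants; only
+// the top HASH_BITS bits of the mixed 32-bit key are kept.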
+
+// Returns the maximum number of hash chain lookups to do for a
+// given compression quality. Return value in range [8, 86].
+static int GetMaxItersForQuality(int quality) {
+ return 8 + (quality * quality) / 128;
+}
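+
+// E.g. quality 0 gives 8 lookups and quality 100 gives 8 + 10000 / 128 == 86,
+// matching the [8, 86] range documented above.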
+
+static int GetWindowSizeForHashChain(int quality, int xsize) {
+ const int max_window_size = (quality > 75) ? WINDOW_SIZE
+ : (quality > 50) ? (xsize << 8)
+ : (quality > 25) ? (xsize << 6)
+ : (xsize << 4);
+ assert(xsize > 0);
+ return (max_window_size > WINDOW_SIZE) ? WINDOW_SIZE : max_window_size;
+}
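+
+// For example, a 1000-pixel-wide image at quality 60 gets a window of
+// 1000 << 8 == 256000 pixels (further capped at WINDOW_SIZE); lower
+// qualities shrink the window to speed up the search.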
+
+static WEBP_INLINE int MaxFindCopyLength(int len) {
+ return (len < MAX_LENGTH) ? len : MAX_LENGTH;
+}
+
+int VP8LHashChainFill(VP8LHashChain* const p, int quality,
+ const uint32_t* const argb, int xsize, int ysize,
+ int low_effort) {
+ const int size = xsize * ysize;
+ const int iter_max = GetMaxItersForQuality(quality);
+ const uint32_t window_size = GetWindowSizeForHashChain(quality, xsize);
+ int pos;
+ int argb_comp;
+ uint32_t base_position;
+ int32_t* hash_to_first_index;
+ // Temporarily use the p->offset_length_ as a hash chain.
+ int32_t* chain = (int32_t*)p->offset_length_;
+ assert(size > 0);
+ assert(p->size_ != 0);
+ assert(p->offset_length_ != NULL);
+
+ if (size <= 2) {
+ p->offset_length_[0] = p->offset_length_[size - 1] = 0;
+ return 1;
+ }
+
+ hash_to_first_index =
+ (int32_t*)WebPSafeMalloc(HASH_SIZE, sizeof(*hash_to_first_index));
+ if (hash_to_first_index == NULL) return 0;
+
+ // Set the int32_t array to -1.
+ memset(hash_to_first_index, 0xff, HASH_SIZE * sizeof(*hash_to_first_index));
+ // Fill the chain linking pixels with the same hash.
+ argb_comp = (argb[0] == argb[1]);
+ for (pos = 0; pos < size - 2;) {
+ uint32_t hash_code;
+ const int argb_comp_next = (argb[pos + 1] == argb[pos + 2]);
+ if (argb_comp && argb_comp_next) {
+ // Consecutive pixels with the same color will share the same hash.
+ // We therefore use a different hash: the color and its repetition
+ // length.
+ uint32_t tmp[2];
+ uint32_t len = 1;
+ tmp[0] = argb[pos];
+      // Figure out how long the run of identical pixels is.
+      // The last pixel of the run has a different 64-bit hash, as its next
+      // pixel does not have the same color, so we just need to reach the
+      // last pixel that is equal to its follower.
+ while (pos + (int)len + 2 < size && argb[pos + len + 2] == argb[pos]) {
+ ++len;
+ }
+ if (len > MAX_LENGTH) {
+ // Skip the pixels that match for distance=1 and length>MAX_LENGTH
+ // because they are linked to their predecessor and we automatically
+ // check that in the main for loop below. Skipping means setting no
+ // predecessor in the chain, hence -1.
+ memset(chain + pos, 0xff, (len - MAX_LENGTH) * sizeof(*chain));
+ pos += len - MAX_LENGTH;
+ len = MAX_LENGTH;
+ }
+ // Process the rest of the hash chain.
+ while (len) {
+ tmp[1] = len--;
+ hash_code = GetPixPairHash64(tmp);
+ chain[pos] = hash_to_first_index[hash_code];
+ hash_to_first_index[hash_code] = pos++;
+ }
+ argb_comp = 0;
+ } else {
+ // Just move one pixel forward.
+ hash_code = GetPixPairHash64(argb + pos);
+ chain[pos] = hash_to_first_index[hash_code];
+ hash_to_first_index[hash_code] = pos++;
+ argb_comp = argb_comp_next;
+ }
+ }
+ // Process the penultimate pixel.
+ chain[pos] = hash_to_first_index[GetPixPairHash64(argb + pos)];
+
+ WebPSafeFree(hash_to_first_index);
+
+ // Find the best match interval at each pixel, defined by an offset to the
+ // pixel and a length. The right-most pixel cannot match anything to the right
+ // (hence a best length of 0) and the left-most pixel nothing to the left
+ // (hence an offset of 0).
+ assert(size > 2);
+ p->offset_length_[0] = p->offset_length_[size - 1] = 0;
+ for (base_position = size - 2; base_position > 0;) {
+ const int max_len = MaxFindCopyLength(size - 1 - base_position);
+ const uint32_t* const argb_start = argb + base_position;
+ int iter = iter_max;
+ int best_length = 0;
+ uint32_t best_distance = 0;
+ uint32_t best_argb;
+ const int min_pos =
+ (base_position > window_size) ? base_position - window_size : 0;
+ const int length_max = (max_len < 256) ? max_len : 256;
+ uint32_t max_base_position;
+
+ pos = chain[base_position];
+ if (!low_effort) {
+ int curr_length;
+ // Heuristic: use the comparison with the above line as an initialization.
+ if (base_position >= (uint32_t)xsize) {
+ curr_length = FindMatchLength(argb_start - xsize, argb_start,
+ best_length, max_len);
+ if (curr_length > best_length) {
+ best_length = curr_length;
+ best_distance = xsize;
+ }
+ --iter;
+ }
+ // Heuristic: compare to the previous pixel.
+ curr_length =
+ FindMatchLength(argb_start - 1, argb_start, best_length, max_len);
+ if (curr_length > best_length) {
+ best_length = curr_length;
+ best_distance = 1;
+ }
+ --iter;
+ // Skip the for loop if we already have the maximum.
+ if (best_length == MAX_LENGTH) pos = min_pos - 1;
+ }
+ best_argb = argb_start[best_length];
+
+ for (; pos >= min_pos && --iter; pos = chain[pos]) {
+ int curr_length;
+ assert(base_position > (uint32_t)pos);
+
+ if (argb[pos + best_length] != best_argb) continue;
+
+ curr_length = VP8LVectorMismatch(argb + pos, argb_start, max_len);
+ if (best_length < curr_length) {
+ best_length = curr_length;
+ best_distance = base_position - pos;
+ best_argb = argb_start[best_length];
+ // Stop if we have reached a good enough length.
+ if (best_length >= length_max) break;
+ }
+ }
+ // We have the best match at base_position; if the two matching intervals
+ // keep matching to the left, the same match, extended, is also the best
+ // one for the pixels to the left.
+ max_base_position = base_position;
+ while (1) {
+ assert(best_length <= MAX_LENGTH);
+ assert(best_distance <= WINDOW_SIZE);
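+ // Store the match packed as (distance << MAX_LENGTH_BITS) | length, so a
+ // single uint32_t holds both values.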
+ p->offset_length_[base_position] =
+ (best_distance << MAX_LENGTH_BITS) | (uint32_t)best_length;
+ --base_position;
+ // Stop if we don't have a match or if we are out of bounds.
+ if (best_distance == 0 || base_position == 0) break;
+ // Stop if we cannot extend the matching intervals to the left.
+ if (base_position < best_distance ||
+ argb[base_position - best_distance] != argb[base_position]) {
+ break;
+ }
+ // Stop if the match has reached its maximum length: a closer matching
+ // interval with the same maximum length could exist. However, if the
+ // matching interval is already as close as possible (best_distance == 1),
+ // we will never find anything better, so keep going.
+ if (best_length == MAX_LENGTH && best_distance != 1 &&
+ base_position + MAX_LENGTH < max_base_position) {
+ break;
+ }
+ if (best_length < MAX_LENGTH) {
+ ++best_length;
+ max_base_position = base_position;
+ }
+ }
+ }
+ return 1;
+}
+
+static WEBP_INLINE void AddSingleLiteral(uint32_t pixel, int use_color_cache,
+ VP8LColorCache* const hashers,
+ VP8LBackwardRefs* const refs) {
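+ // On a cache hit the pixel is coded as a cache index; on a miss it is
+ // coded as a literal and inserted into the cache.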
+ PixOrCopy v;
+ if (use_color_cache) {
+ const uint32_t key = VP8LColorCacheGetIndex(hashers, pixel);
+ if (VP8LColorCacheLookup(hashers, key) == pixel) {
+ v = PixOrCopyCreateCacheIdx(key);
+ } else {
+ v = PixOrCopyCreateLiteral(pixel);
+ VP8LColorCacheSet(hashers, key, pixel);
+ }
+ } else {
+ v = PixOrCopyCreateLiteral(pixel);
+ }
+ VP8LBackwardRefsCursorAdd(refs, v);
+}
+
+static int BackwardReferencesRle(int xsize, int ysize,
+ const uint32_t* const argb,
+ int cache_bits, VP8LBackwardRefs* const refs) {
+ const int pix_count = xsize * ysize;
+ int i, k;
+ const int use_color_cache = (cache_bits > 0);
+ VP8LColorCache hashers;
+
+ if (use_color_cache && !VP8LColorCacheInit(&hashers, cache_bits)) {
+ return 0;
+ }
+ VP8LClearBackwardRefs(refs);
+ // Add first pixel as literal.
+ AddSingleLiteral(argb[0], use_color_cache, &hashers, refs);
+ i = 1;
+ while (i < pix_count) {
+ const int max_len = MaxFindCopyLength(pix_count - i);
+ const int rle_len = FindMatchLength(argb + i, argb + i - 1, 0, max_len);
+ const int prev_row_len = (i < xsize) ? 0 :
+ FindMatchLength(argb + i, argb + i - xsize, 0, max_len);
+ if (rle_len >= prev_row_len && rle_len >= MIN_LENGTH) {
+ VP8LBackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(1, rle_len));
+ // We don't need to update the color cache here since it is always the
+ // same pixel being copied, and that does not change the color cache
+ // state.
+ i += rle_len;
+ } else if (prev_row_len >= MIN_LENGTH) {
+ VP8LBackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(xsize, prev_row_len));
+ if (use_color_cache) {
+ for (k = 0; k < prev_row_len; ++k) {
+ VP8LColorCacheInsert(&hashers, argb[i + k]);
+ }
+ }
+ i += prev_row_len;
+ } else {
+ AddSingleLiteral(argb[i], use_color_cache, &hashers, refs);
+ i++;
+ }
+ }
+ if (use_color_cache) VP8LColorCacheClear(&hashers);
+ return !refs->error_;
+}
+
+static int BackwardReferencesLz77(int xsize, int ysize,
+ const uint32_t* const argb, int cache_bits,
+ const VP8LHashChain* const hash_chain,
+ VP8LBackwardRefs* const refs) {
+ int i;
+ int i_last_check = -1;
+ int ok = 0;
+ int cc_init = 0;
+ const int use_color_cache = (cache_bits > 0);
+ const int pix_count = xsize * ysize;
+ VP8LColorCache hashers;
+
+ if (use_color_cache) {
+ cc_init = VP8LColorCacheInit(&hashers, cache_bits);
+ if (!cc_init) goto Error;
+ }
+ VP8LClearBackwardRefs(refs);
+ for (i = 0; i < pix_count;) {
+ // Code the pixels starting at 'i' using a backward reference.
+ int offset = 0;
+ int len = 0;
+ int j;
+ VP8LHashChainFindCopy(hash_chain, i, &offset, &len);
+ if (len >= MIN_LENGTH) {
+ const int len_ini = len;
+ int max_reach = 0;
+ const int j_max =
+ (i + len_ini >= pix_count) ? pix_count - 1 : i + len_ini;
+ // Only start from what we have not checked already.
+ i_last_check = (i > i_last_check) ? i : i_last_check;
+ // We know the best match for the current pixel but we try to find the
+ // best matches for the current pixel AND the next one combined.
+ // The naive method would use the intervals:
+ // [i,i+len) + [i+len, length of best match at i+len)
+ // while we check if we can use:
+ // [i,j) (where j<=i+len) + [j, length of best match at j)
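+ // 'reach' is the first pixel not covered when the current match is cut
+ // at j and followed by the best match (or a single literal) starting at j.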
+ for (j = i_last_check + 1; j <= j_max; ++j) {
+ const int len_j = VP8LHashChainFindLength(hash_chain, j);
+ const int reach =
+ j + (len_j >= MIN_LENGTH ? len_j : 1); // 1 for single literal.
+ if (reach > max_reach) {
+ len = j - i;
+ max_reach = reach;
+ if (max_reach >= pix_count) break;
+ }
+ }
+ } else {
+ len = 1;
+ }
+ // Go with literal or backward reference.
+ assert(len > 0);
+ if (len == 1) {
+ AddSingleLiteral(argb[i], use_color_cache, &hashers, refs);
+ } else {
+ VP8LBackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(offset, len));
+ if (use_color_cache) {
+ for (j = i; j < i + len; ++j) VP8LColorCacheInsert(&hashers, argb[j]);
+ }
+ }
+ i += len;
+ }
+
+ ok = !refs->error_;
+ Error:
+ if (cc_init) VP8LColorCacheClear(&hashers);
+ return ok;
+}
+
+// Compute an LZ77 by forcing matches to happen within a given distance cost.
+// We therefore limit the algorithm to the lowest 32 values in the PlaneCode
+// definition.
+#define WINDOW_OFFSETS_SIZE_MAX 32
+static int BackwardReferencesLz77Box(int xsize, int ysize,
+ const uint32_t* const argb, int cache_bits,
+ const VP8LHashChain* const hash_chain_best,
+ VP8LHashChain* hash_chain,
+ VP8LBackwardRefs* const refs) {
+ int i;
+ const int pix_count = xsize * ysize;
+ uint16_t* counts;
+ int window_offsets[WINDOW_OFFSETS_SIZE_MAX] = {0};
+ int window_offsets_new[WINDOW_OFFSETS_SIZE_MAX] = {0};
+ int window_offsets_size = 0;
+ int window_offsets_new_size = 0;
+ uint16_t* const counts_ini =
+ (uint16_t*)WebPSafeMalloc(xsize * ysize, sizeof(*counts_ini));
+ int best_offset_prev = -1, best_length_prev = -1;
+ if (counts_ini == NULL) return 0;
+
+ // counts[i] counts how many times a pixel is repeated starting at position i.
+ i = pix_count - 2;
+ counts = counts_ini + i;
+ counts[1] = 1;
+ for (; i >= 0; --i, --counts) {
+ if (argb[i] == argb[i + 1]) {
+ // Max out the counts to MAX_LENGTH.
+ counts[0] = counts[1] + (counts[1] != MAX_LENGTH);
+ } else {
+ counts[0] = 1;
+ }
+ }
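+ // e.g. for the pixels {A, A, A, B}, counts_ini = {3, 2, 1, 1}, with each
+ // entry capped at MAX_LENGTH.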
+
+ // Figure out the window offsets around a pixel. They are stored in a
+ // spiraling order around the pixel as defined by VP8LDistanceToPlaneCode.
+ {
+ int x, y;
+ for (y = 0; y <= 6; ++y) {
+ for (x = -6; x <= 6; ++x) {
+ const int offset = y * xsize + x;
+ int plane_code;
+ // Ignore offsets that do not point strictly before the pixel.
+ if (offset <= 0) continue;
+ plane_code = VP8LDistanceToPlaneCode(xsize, offset) - 1;
+ if (plane_code >= WINDOW_OFFSETS_SIZE_MAX) continue;
+ window_offsets[plane_code] = offset;
+ }
+ }
+ // For narrow images, not all plane codes are reached, so remove those.
+ for (i = 0; i < WINDOW_OFFSETS_SIZE_MAX; ++i) {
+ if (window_offsets[i] == 0) continue;
+ window_offsets[window_offsets_size++] = window_offsets[i];
+ }
+ // Given a pixel P, find the offsets that reach pixels unreachable from P-1
+ // with any of the offsets in window_offsets[].
+ for (i = 0; i < window_offsets_size; ++i) {
+ int j;
+ int is_reachable = 0;
+ for (j = 0; j < window_offsets_size && !is_reachable; ++j) {
+ is_reachable |= (window_offsets[i] == window_offsets[j] + 1);
+ }
+ if (!is_reachable) {
+ window_offsets_new[window_offsets_new_size] = window_offsets[i];
+ ++window_offsets_new_size;
+ }
+ }
+ }
+
+ hash_chain->offset_length_[0] = 0;
+ for (i = 1; i < pix_count; ++i) {
+ int ind;
+ int best_length = VP8LHashChainFindLength(hash_chain_best, i);
+ int best_offset;
+ int do_compute = 1;
+
+ if (best_length >= MAX_LENGTH) {
+ // Do not recompute the best match if we already have a maximal one in the
+ // window.
+ best_offset = VP8LHashChainFindOffset(hash_chain_best, i);
+ for (ind = 0; ind < window_offsets_size; ++ind) {
+ if (best_offset == window_offsets[ind]) {
+ do_compute = 0;
+ break;
+ }
+ }
+ }
+ if (do_compute) {
+ // Figure out if we should use the offset/length from the previous pixel
+ // as an initial guess and therefore only inspect the offsets in
+ // window_offsets_new[].
+ const int use_prev =
+ (best_length_prev > 1) && (best_length_prev < MAX_LENGTH);
+ const int num_ind =
+ use_prev ? window_offsets_new_size : window_offsets_size;
+ best_length = use_prev ? best_length_prev - 1 : 0;
+ best_offset = use_prev ? best_offset_prev : 0;
+ // Find the longest match in a window around the pixel.
+ for (ind = 0; ind < num_ind; ++ind) {
+ int curr_length = 0;
+ int j = i;
+ int j_offset =
+ use_prev ? i - window_offsets_new[ind] : i - window_offsets[ind];
+ if (j_offset < 0 || argb[j_offset] != argb[i]) continue;
+ // The longest match is the sum of how many times each pixel is
+ // repeated.
+ do {
+ const int counts_j_offset = counts_ini[j_offset];
+ const int counts_j = counts_ini[j];
+ if (counts_j_offset != counts_j) {
+ curr_length +=
+ (counts_j_offset < counts_j) ? counts_j_offset : counts_j;
+ break;
+ }
+ // The same color is repeated counts_j_offset times at j_offset and j.
+ curr_length += counts_j_offset;
+ j_offset += counts_j_offset;
+ j += counts_j_offset;
+ } while (curr_length <= MAX_LENGTH && j < pix_count &&
+ argb[j_offset] == argb[j]);
+ if (best_length < curr_length) {
+ best_offset =
+ use_prev ? window_offsets_new[ind] : window_offsets[ind];
+ if (curr_length >= MAX_LENGTH) {
+ best_length = MAX_LENGTH;
+ break;
+ } else {
+ best_length = curr_length;
+ }
+ }
+ }
+ }
+
+ assert(i + best_length <= pix_count);
+ assert(best_length <= MAX_LENGTH);
+ if (best_length <= MIN_LENGTH) {
+ hash_chain->offset_length_[i] = 0;
+ best_offset_prev = 0;
+ best_length_prev = 0;
+ } else {
+ hash_chain->offset_length_[i] =
+ (best_offset << MAX_LENGTH_BITS) | (uint32_t)best_length;
+ best_offset_prev = best_offset;
+ best_length_prev = best_length;
+ }
+ }
+ hash_chain->offset_length_[0] = 0;
+ WebPSafeFree(counts_ini);
+
+ return BackwardReferencesLz77(xsize, ysize, argb, cache_bits, hash_chain,
+ refs);
+}
+
+// -----------------------------------------------------------------------------
+
+static void BackwardReferences2DLocality(int xsize,
+ const VP8LBackwardRefs* const refs) {
+ VP8LRefsCursor c = VP8LRefsCursorInit(refs);
+ while (VP8LRefsCursorOk(&c)) {
+ if (PixOrCopyIsCopy(c.cur_pos)) {
+ const int dist = c.cur_pos->argb_or_distance;
+ const int transformed_dist = VP8LDistanceToPlaneCode(xsize, dist);
+ c.cur_pos->argb_or_distance = transformed_dist;
+ }
+ VP8LRefsCursorNext(&c);
+ }
+}
+
+// Evaluate optimal cache bits for the local color cache.
+// The input *best_cache_bits sets the maximum cache bits to use (passing 0
+// implies disabling the local color cache). The local color cache is also
+// disabled for low qualities (<= 25).
+// Returns 0 in case of memory error.
+static int CalculateBestCacheSize(const uint32_t* argb, int quality,
+ const VP8LBackwardRefs* const refs,
+ int* const best_cache_bits) {
+ int i;
+ const int cache_bits_max = (quality <= 25) ? 0 : *best_cache_bits;
+ double entropy_min = MAX_ENTROPY;
+ int cc_init[MAX_COLOR_CACHE_BITS + 1] = { 0 };
+ VP8LColorCache hashers[MAX_COLOR_CACHE_BITS + 1];
+ VP8LRefsCursor c = VP8LRefsCursorInit(refs);
+ VP8LHistogram* histos[MAX_COLOR_CACHE_BITS + 1] = { NULL };
+ int ok = 0;
+
+ assert(cache_bits_max >= 0 && cache_bits_max <= MAX_COLOR_CACHE_BITS);
+
+ if (cache_bits_max == 0) {
+ *best_cache_bits = 0;
+ // Local color cache is disabled.
+ return 1;
+ }
+
+ // Allocate data.
+ for (i = 0; i <= cache_bits_max; ++i) {
+ histos[i] = VP8LAllocateHistogram(i);
+ if (histos[i] == NULL) goto Error;
+ VP8LHistogramInit(histos[i], i, /*init_arrays=*/ 1);
+ if (i == 0) continue;
+ cc_init[i] = VP8LColorCacheInit(&hashers[i], i);
+ if (!cc_init[i]) goto Error;
+ }
+
+ // Find the cache_bits giving the lowest entropy. The search is done in a
+ // brute-force way as the function (entropy w.r.t cache_bits) can be
+ // anything in practice.
+ while (VP8LRefsCursorOk(&c)) {
+ const PixOrCopy* const v = c.cur_pos;
+ if (PixOrCopyIsLiteral(v)) {
+ const uint32_t pix = *argb++;
+ const uint32_t a = (pix >> 24) & 0xff;
+ const uint32_t r = (pix >> 16) & 0xff;
+ const uint32_t g = (pix >> 8) & 0xff;
+ const uint32_t b = (pix >> 0) & 0xff;
+ // The keys of the caches can be derived from the longest one.
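+ // The key for an i-bit cache is the (i+1)-bit key shifted right by one
+ // bit (see the loop below).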
+ int key = VP8LHashPix(pix, 32 - cache_bits_max);
+ // Do not use the color cache for cache_bits = 0.
+ ++histos[0]->blue_[b];
+ ++histos[0]->literal_[g];
+ ++histos[0]->red_[r];
+ ++histos[0]->alpha_[a];
+ // Deal with cache_bits > 0.
+ for (i = cache_bits_max; i >= 1; --i, key >>= 1) {
+ if (VP8LColorCacheLookup(&hashers[i], key) == pix) {
+ ++histos[i]->literal_[NUM_LITERAL_CODES + NUM_LENGTH_CODES + key];
+ } else {
+ VP8LColorCacheSet(&hashers[i], key, pix);
+ ++histos[i]->blue_[b];
+ ++histos[i]->literal_[g];
+ ++histos[i]->red_[r];
+ ++histos[i]->alpha_[a];
+ }
+ }
+ } else {
+ int code, extra_bits, extra_bits_value;
+ // We should compute the contribution of the (distance, length)
+ // histograms, but those are the same regardless of the cache size.
+ // As those constant contributions are in the end added to the other
+ // histogram contributions, we can ignore them, except for the length
+ // prefix that is part of the literal_ histogram.
+ int len = PixOrCopyLength(v);
+ uint32_t argb_prev = *argb ^ 0xffffffffu;
+ VP8LPrefixEncode(len, &code, &extra_bits, &extra_bits_value);
+ for (i = 0; i <= cache_bits_max; ++i) {
+ ++histos[i]->literal_[NUM_LITERAL_CODES + code];
+ }
+ // Update the color caches.
+ do {
+ if (*argb != argb_prev) {
+ // Efficiency: insert only if the color changes.
+ int key = VP8LHashPix(*argb, 32 - cache_bits_max);
+ for (i = cache_bits_max; i >= 1; --i, key >>= 1) {
+ hashers[i].colors_[key] = *argb;
+ }
+ argb_prev = *argb;
+ }
+ argb++;
+ } while (--len != 0);
+ }
+ VP8LRefsCursorNext(&c);
+ }
+
+ for (i = 0; i <= cache_bits_max; ++i) {
+ const double entropy = VP8LHistogramEstimateBits(histos[i]);
+ if (i == 0 || entropy < entropy_min) {
+ entropy_min = entropy;
+ *best_cache_bits = i;
+ }
+ }
+ ok = 1;
+Error:
+ for (i = 0; i <= cache_bits_max; ++i) {
+ if (cc_init[i]) VP8LColorCacheClear(&hashers[i]);
+ VP8LFreeHistogram(histos[i]);
+ }
+ return ok;
+}
+
+// Update (in-place) backward references for specified cache_bits.
+static int BackwardRefsWithLocalCache(const uint32_t* const argb,
+ int cache_bits,
+ VP8LBackwardRefs* const refs) {
+ int pixel_index = 0;
+ VP8LColorCache hashers;
+ VP8LRefsCursor c = VP8LRefsCursorInit(refs);
+ if (!VP8LColorCacheInit(&hashers, cache_bits)) return 0;
+
+ while (VP8LRefsCursorOk(&c)) {
+ PixOrCopy* const v = c.cur_pos;
+ if (PixOrCopyIsLiteral(v)) {
+ const uint32_t argb_literal = v->argb_or_distance;
+ const int ix = VP8LColorCacheContains(&hashers, argb_literal);
+ if (ix >= 0) {
+ // hashers contains argb_literal
+ *v = PixOrCopyCreateCacheIdx(ix);
+ } else {
+ VP8LColorCacheInsert(&hashers, argb_literal);
+ }
+ ++pixel_index;
+ } else {
+ // refs was created without a local cache, so it cannot have cache indexes.
+ int k;
+ assert(PixOrCopyIsCopy(v));
+ for (k = 0; k < v->len; ++k) {
+ VP8LColorCacheInsert(&hashers, argb[pixel_index++]);
+ }
+ }
+ VP8LRefsCursorNext(&c);
+ }
+ VP8LColorCacheClear(&hashers);
+ return 1;
+}
+
+static VP8LBackwardRefs* GetBackwardReferencesLowEffort(
+ int width, int height, const uint32_t* const argb,
+ int* const cache_bits, const VP8LHashChain* const hash_chain,
+ VP8LBackwardRefs* const refs_lz77) {
+ *cache_bits = 0;
+ if (!BackwardReferencesLz77(width, height, argb, 0, hash_chain, refs_lz77)) {
+ return NULL;
+ }
+ BackwardReferences2DLocality(width, refs_lz77);
+ return refs_lz77;
+}
+
+extern int VP8LBackwardReferencesTraceBackwards(
+ int xsize, int ysize, const uint32_t* const argb, int cache_bits,
+ const VP8LHashChain* const hash_chain,
+ const VP8LBackwardRefs* const refs_src, VP8LBackwardRefs* const refs_dst);
+static int GetBackwardReferences(int width, int height,
+ const uint32_t* const argb, int quality,
+ int lz77_types_to_try, int cache_bits_max,
+ int do_no_cache,
+ const VP8LHashChain* const hash_chain,
+ VP8LBackwardRefs* const refs,
+ int* const cache_bits_best) {
+ VP8LHistogram* histo = NULL;
+ int i, lz77_type;
+ // Index 0 is for a color cache, index 1 for no cache (if needed).
+ int lz77_types_best[2] = {0, 0};
+ double bit_costs_best[2] = {DBL_MAX, DBL_MAX};
+ VP8LHashChain hash_chain_box;
+ VP8LBackwardRefs* const refs_tmp = &refs[do_no_cache ? 2 : 1];
+ int status = 0;
+ memset(&hash_chain_box, 0, sizeof(hash_chain_box));
+
+ histo = VP8LAllocateHistogram(MAX_COLOR_CACHE_BITS);
+ if (histo == NULL) goto Error;
+
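+ // Iterate over the set bits of lz77_types_to_try, clearing each bit once
+ // the corresponding LZ77 variant has been evaluated.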
+ for (lz77_type = 1; lz77_types_to_try;
+ lz77_types_to_try &= ~lz77_type, lz77_type <<= 1) {
+ int res = 0;
+ double bit_cost = 0.;
+ if ((lz77_types_to_try & lz77_type) == 0) continue;
+ switch (lz77_type) {
+ case kLZ77RLE:
+ res = BackwardReferencesRle(width, height, argb, 0, refs_tmp);
+ break;
+ case kLZ77Standard:
+ // Compute LZ77 with no cache (0 bits), as the ideal LZ77 with a color
+ // cache is not that different in practice.
+ res = BackwardReferencesLz77(width, height, argb, 0, hash_chain,
+ refs_tmp);
+ break;
+ case kLZ77Box:
+ if (!VP8LHashChainInit(&hash_chain_box, width * height)) goto Error;
+ res = BackwardReferencesLz77Box(width, height, argb, 0, hash_chain,
+ &hash_chain_box, refs_tmp);
+ break;
+ default:
+ assert(0);
+ }
+ if (!res) goto Error;
+
+ // Start with the no color cache case.
+ for (i = 1; i >= 0; --i) {
+ int cache_bits = (i == 1) ? 0 : cache_bits_max;
+
+ if (i == 1 && !do_no_cache) continue;
+
+ if (i == 0) {
+ // Try with a color cache.
+ if (!CalculateBestCacheSize(argb, quality, refs_tmp, &cache_bits)) {
+ goto Error;
+ }
+ if (cache_bits > 0) {
+ if (!BackwardRefsWithLocalCache(argb, cache_bits, refs_tmp)) {
+ goto Error;
+ }
+ }
+ }
+
+ if (i == 0 && do_no_cache && cache_bits == 0) {
+ // No need to re-compute bit_cost as it was computed at i == 1.
+ } else {
+ VP8LHistogramCreate(histo, refs_tmp, cache_bits);
+ bit_cost = VP8LHistogramEstimateBits(histo);
+ }
+
+ if (bit_cost < bit_costs_best[i]) {
+ if (i == 1) {
+ // Do not swap as the full cache analysis would have the wrong
+ // VP8LBackwardRefs to start with.
+ if (!BackwardRefsClone(refs_tmp, &refs[1])) goto Error;
+ } else {
+ BackwardRefsSwap(refs_tmp, &refs[0]);
+ }
+ bit_costs_best[i] = bit_cost;
+ lz77_types_best[i] = lz77_type;
+ if (i == 0) *cache_bits_best = cache_bits;
+ }
+ }
+ }
+ assert(lz77_types_best[0] > 0);
+ assert(!do_no_cache || lz77_types_best[1] > 0);
+
+ // Improve on simple LZ77 but only for high quality (TraceBackwards is
+ // costly).
+ for (i = 1; i >= 0; --i) {
+ if (i == 1 && !do_no_cache) continue;
+ if ((lz77_types_best[i] == kLZ77Standard ||
+ lz77_types_best[i] == kLZ77Box) &&
+ quality >= 25) {
+ const VP8LHashChain* const hash_chain_tmp =
+ (lz77_types_best[i] == kLZ77Standard) ? hash_chain : &hash_chain_box;
+ const int cache_bits = (i == 1) ? 0 : *cache_bits_best;
+ if (VP8LBackwardReferencesTraceBackwards(width, height, argb, cache_bits,
+ hash_chain_tmp, &refs[i],
+ refs_tmp)) {
+ double bit_cost_trace;
+ VP8LHistogramCreate(histo, refs_tmp, cache_bits);
+ bit_cost_trace = VP8LHistogramEstimateBits(histo);
+ if (bit_cost_trace < bit_costs_best[i]) {
+ BackwardRefsSwap(refs_tmp, &refs[i]);
+ }
+ }
+ }
+
+ BackwardReferences2DLocality(width, &refs[i]);
+
+ if (i == 1 && lz77_types_best[0] == lz77_types_best[1] &&
+ *cache_bits_best == 0) {
+ // If the best cache size is 0 and we have the same best LZ77, just copy
+ // the data over and stop here.
+ if (!BackwardRefsClone(&refs[1], &refs[0])) goto Error;
+ break;
+ }
+ }
+ status = 1;
+
+Error:
+ VP8LHashChainClear(&hash_chain_box);
+ VP8LFreeHistogram(histo);
+ return status;
+}
+
+WebPEncodingError VP8LGetBackwardReferences(
+ int width, int height, const uint32_t* const argb, int quality,
+ int low_effort, int lz77_types_to_try, int cache_bits_max, int do_no_cache,
+ const VP8LHashChain* const hash_chain, VP8LBackwardRefs* const refs,
+ int* const cache_bits_best) {
+ if (low_effort) {
+ VP8LBackwardRefs* refs_best;
+ *cache_bits_best = cache_bits_max;
+ refs_best = GetBackwardReferencesLowEffort(
+ width, height, argb, cache_bits_best, hash_chain, refs);
+ if (refs_best == NULL) return VP8_ENC_ERROR_OUT_OF_MEMORY;
+ // Store it in the first position.
+ BackwardRefsSwap(refs_best, &refs[0]);
+ } else {
+ if (!GetBackwardReferences(width, height, argb, quality, lz77_types_to_try,
+ cache_bits_max, do_no_cache, hash_chain, refs,
+ cache_bits_best)) {
+ return VP8_ENC_ERROR_OUT_OF_MEMORY;
+ }
+ }
+ return VP8_ENC_OK;
+}
diff --git a/media/libwebp/enc/backward_references_enc.h b/media/libwebp/enc/backward_references_enc.h
index 539e991cfc..292c630e5e 100644
--- a/media/libwebp/enc/backward_references_enc.h
+++ b/media/libwebp/enc/backward_references_enc.h
@@ -16,6 +16,7 @@
#include <assert.h>
#include <stdlib.h>
#include "../webp/types.h"
+#include "../webp/encode.h"
#include "../webp/format_constants.h"
#ifdef __cplusplus
@@ -218,14 +219,19 @@ enum VP8LLZ77Type {
// Evaluates best possible backward references for specified quality.
// The input cache_bits to 'VP8LGetBackwardReferences' sets the maximum cache
// bits to use (passing 0 implies disabling the local color cache).
-// The optimal cache bits is evaluated and set for the *cache_bits parameter.
-// The return value is the pointer to the best of the two backward refs viz,
-// refs[0] or refs[1].
-VP8LBackwardRefs* VP8LGetBackwardReferences(
+// The optimal number of cache bits is evaluated and set in the
+// *cache_bits_best parameter, together with the matching refs_best.
+// If do_no_cache == 0, refs is an array of 2 values and the best
+// VP8LBackwardRefs is put in the first element.
+// If do_no_cache != 0, refs is an array of 3 values and the best
+// VP8LBackwardRefs is put in the first element, the best value with no-cache in
+// the second element.
+// In both cases, the last element is used internally as a temporary.
+WebPEncodingError VP8LGetBackwardReferences(
int width, int height, const uint32_t* const argb, int quality,
- int low_effort, int lz77_types_to_try, int* const cache_bits,
- const VP8LHashChain* const hash_chain, VP8LBackwardRefs* const refs_tmp1,
- VP8LBackwardRefs* const refs_tmp2);
+ int low_effort, int lz77_types_to_try, int cache_bits_max, int do_no_cache,
+ const VP8LHashChain* const hash_chain, VP8LBackwardRefs* const refs,
+ int* const cache_bits_best);
#ifdef __cplusplus
}
diff --git a/media/libwebp/enc/config_enc.c b/media/libwebp/enc/config_enc.c
new file mode 100644
index 0000000000..97df32d0d4
--- /dev/null
+++ b/media/libwebp/enc/config_enc.c
@@ -0,0 +1,157 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Coding tools configuration
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#ifdef HAVE_CONFIG_H
+#include "../webp/config.h"
+#endif
+
+#include "../webp/encode.h"
+
+//------------------------------------------------------------------------------
+// WebPConfig
+//------------------------------------------------------------------------------
+
+int WebPConfigInitInternal(WebPConfig* config,
+ WebPPreset preset, float quality, int version) {
+ if (WEBP_ABI_IS_INCOMPATIBLE(version, WEBP_ENCODER_ABI_VERSION)) {
+ return 0; // caller/system version mismatch!
+ }
+ if (config == NULL) return 0;
+
+ config->quality = quality;
+ config->target_size = 0;
+ config->target_PSNR = 0.;
+ config->method = 4;
+ config->sns_strength = 50;
+ config->filter_strength = 60; // mid-filtering
+ config->filter_sharpness = 0;
+ config->filter_type = 1; // default: strong (so U/V is filtered too)
+ config->partitions = 0;
+ config->segments = 4;
+ config->pass = 1;
+ config->qmin = 0;
+ config->qmax = 100;
+ config->show_compressed = 0;
+ config->preprocessing = 0;
+ config->autofilter = 0;
+ config->partition_limit = 0;
+ config->alpha_compression = 1;
+ config->alpha_filtering = 1;
+ config->alpha_quality = 100;
+ config->lossless = 0;
+ config->exact = 0;
+ config->image_hint = WEBP_HINT_DEFAULT;
+ config->emulate_jpeg_size = 0;
+ config->thread_level = 0;
+ config->low_memory = 0;
+ config->near_lossless = 100;
+ config->use_delta_palette = 0;
+ config->use_sharp_yuv = 0;
+
+ // TODO(skal): tune.
+ switch (preset) {
+ case WEBP_PRESET_PICTURE:
+ config->sns_strength = 80;
+ config->filter_sharpness = 4;
+ config->filter_strength = 35;
+ config->preprocessing &= ~2; // no dithering
+ break;
+ case WEBP_PRESET_PHOTO:
+ config->sns_strength = 80;
+ config->filter_sharpness = 3;
+ config->filter_strength = 30;
+ config->preprocessing |= 2;
+ break;
+ case WEBP_PRESET_DRAWING:
+ config->sns_strength = 25;
+ config->filter_sharpness = 6;
+ config->filter_strength = 10;
+ break;
+ case WEBP_PRESET_ICON:
+ config->sns_strength = 0;
+ config->filter_strength = 0; // disable filtering to retain sharpness
+ config->preprocessing &= ~2; // no dithering
+ break;
+ case WEBP_PRESET_TEXT:
+ config->sns_strength = 0;
+ config->filter_strength = 0; // disable filtering to retain sharpness
+ config->preprocessing &= ~2; // no dithering
+ config->segments = 2;
+ break;
+ case WEBP_PRESET_DEFAULT:
+ default:
+ break;
+ }
+ return WebPValidateConfig(config);
+}
+
+int WebPValidateConfig(const WebPConfig* config) {
+ if (config == NULL) return 0;
+ if (config->quality < 0 || config->quality > 100) return 0;
+ if (config->target_size < 0) return 0;
+ if (config->target_PSNR < 0) return 0;
+ if (config->method < 0 || config->method > 6) return 0;
+ if (config->segments < 1 || config->segments > 4) return 0;
+ if (config->sns_strength < 0 || config->sns_strength > 100) return 0;
+ if (config->filter_strength < 0 || config->filter_strength > 100) return 0;
+ if (config->filter_sharpness < 0 || config->filter_sharpness > 7) return 0;
+ if (config->filter_type < 0 || config->filter_type > 1) return 0;
+ if (config->autofilter < 0 || config->autofilter > 1) return 0;
+ if (config->pass < 1 || config->pass > 10) return 0;
+ if (config->qmin < 0 || config->qmax > 100 || config->qmin > config->qmax) {
+ return 0;
+ }
+ if (config->show_compressed < 0 || config->show_compressed > 1) return 0;
+ if (config->preprocessing < 0 || config->preprocessing > 7) return 0;
+ if (config->partitions < 0 || config->partitions > 3) return 0;
+ if (config->partition_limit < 0 || config->partition_limit > 100) return 0;
+ if (config->alpha_compression < 0) return 0;
+ if (config->alpha_filtering < 0) return 0;
+ if (config->alpha_quality < 0 || config->alpha_quality > 100) return 0;
+ if (config->lossless < 0 || config->lossless > 1) return 0;
+ if (config->near_lossless < 0 || config->near_lossless > 100) return 0;
+ if (config->image_hint >= WEBP_HINT_LAST) return 0;
+ if (config->emulate_jpeg_size < 0 || config->emulate_jpeg_size > 1) return 0;
+ if (config->thread_level < 0 || config->thread_level > 1) return 0;
+ if (config->low_memory < 0 || config->low_memory > 1) return 0;
+ if (config->exact < 0 || config->exact > 1) return 0;
+ if (config->use_delta_palette < 0 || config->use_delta_palette > 1) {
+ return 0;
+ }
+ if (config->use_sharp_yuv < 0 || config->use_sharp_yuv > 1) return 0;
+
+ return 1;
+}
+
+//------------------------------------------------------------------------------
+
+#define MAX_LEVEL 9
+
+// Mapping between -z level and -m / -q parameter settings.
+static const struct {
+ uint8_t method_;
+ uint8_t quality_;
+} kLosslessPresets[MAX_LEVEL + 1] = {
+ { 0, 0 }, { 1, 20 }, { 2, 25 }, { 3, 30 }, { 3, 50 },
+ { 4, 50 }, { 4, 75 }, { 4, 90 }, { 5, 90 }, { 6, 100 }
+};
+
+int WebPConfigLosslessPreset(WebPConfig* config, int level) {
+ if (config == NULL || level < 0 || level > MAX_LEVEL) return 0;
+ config->lossless = 1;
+ config->method = kLosslessPresets[level].method_;
+ config->quality = kLosslessPresets[level].quality_;
+ return 1;
+}
+
+//------------------------------------------------------------------------------
diff --git a/media/libwebp/enc/cost_enc.c b/media/libwebp/enc/cost_enc.c
new file mode 100644
index 0000000000..bb7fe64fa2
--- /dev/null
+++ b/media/libwebp/enc/cost_enc.c
@@ -0,0 +1,342 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Cost tables for level and modes
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "../enc/cost_enc.h"
+
+//------------------------------------------------------------------------------
+// Level cost tables
+
+// For each given level, the following table gives the pattern of contexts to
+// use for coding it (in [][0]) as well as the bit value to use for each
+// context (in [][1]).
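+// e.g. level 2 uses {0x007, 0x001}: probas #2, #3 and #4 are used, coding
+// bit values 1, 0 and 0 respectively.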
+const uint16_t VP8LevelCodes[MAX_VARIABLE_LEVEL][2] = {
+ {0x001, 0x000}, {0x007, 0x001}, {0x00f, 0x005},
+ {0x00f, 0x00d}, {0x033, 0x003}, {0x033, 0x003}, {0x033, 0x023},
+ {0x033, 0x023}, {0x033, 0x023}, {0x033, 0x023}, {0x0d3, 0x013},
+ {0x0d3, 0x013}, {0x0d3, 0x013}, {0x0d3, 0x013}, {0x0d3, 0x013},
+ {0x0d3, 0x013}, {0x0d3, 0x013}, {0x0d3, 0x013}, {0x0d3, 0x093},
+ {0x0d3, 0x093}, {0x0d3, 0x093}, {0x0d3, 0x093}, {0x0d3, 0x093},
+ {0x0d3, 0x093}, {0x0d3, 0x093}, {0x0d3, 0x093}, {0x0d3, 0x093},
+ {0x0d3, 0x093}, {0x0d3, 0x093}, {0x0d3, 0x093}, {0x0d3, 0x093},
+ {0x0d3, 0x093}, {0x0d3, 0x093}, {0x0d3, 0x093}, {0x153, 0x053},
+ {0x153, 0x053}, {0x153, 0x053}, {0x153, 0x053}, {0x153, 0x053},
+ {0x153, 0x053}, {0x153, 0x053}, {0x153, 0x053}, {0x153, 0x053},
+ {0x153, 0x053}, {0x153, 0x053}, {0x153, 0x053}, {0x153, 0x053},
+ {0x153, 0x053}, {0x153, 0x053}, {0x153, 0x053}, {0x153, 0x053},
+ {0x153, 0x053}, {0x153, 0x053}, {0x153, 0x053}, {0x153, 0x053},
+ {0x153, 0x053}, {0x153, 0x053}, {0x153, 0x053}, {0x153, 0x053},
+ {0x153, 0x053}, {0x153, 0x053}, {0x153, 0x053}, {0x153, 0x053},
+ {0x153, 0x053}, {0x153, 0x053}, {0x153, 0x053}, {0x153, 0x153}
+};
+
+static int VariableLevelCost(int level, const uint8_t probas[NUM_PROBAS]) {
+ int pattern = VP8LevelCodes[level - 1][0];
+ int bits = VP8LevelCodes[level - 1][1];
+ int cost = 0;
+ int i;
+ for (i = 2; pattern; ++i) {
+ if (pattern & 1) {
+ cost += VP8BitCost(bits & 1, probas[i]);
+ }
+ bits >>= 1;
+ pattern >>= 1;
+ }
+ return cost;
+}
+
+//------------------------------------------------------------------------------
+// Pre-calc level costs once for all
+
+void VP8CalculateLevelCosts(VP8EncProba* const proba) {
+ int ctype, band, ctx;
+
+ if (!proba->dirty_) return; // nothing to do.
+
+ for (ctype = 0; ctype < NUM_TYPES; ++ctype) {
+ int n;
+ for (band = 0; band < NUM_BANDS; ++band) {
+ for (ctx = 0; ctx < NUM_CTX; ++ctx) {
+ const uint8_t* const p = proba->coeffs_[ctype][band][ctx];
+ uint16_t* const table = proba->level_cost_[ctype][band][ctx];
+ const int cost0 = (ctx > 0) ? VP8BitCost(1, p[0]) : 0;
+ const int cost_base = VP8BitCost(1, p[1]) + cost0;
+ int v;
+ table[0] = VP8BitCost(0, p[1]) + cost0;
+ for (v = 1; v <= MAX_VARIABLE_LEVEL; ++v) {
+ table[v] = cost_base + VariableLevelCost(v, p);
+ }
+ // From level 67 upward, the variable part of the cost is actually
+ // constant.
+ }
+ }
+ for (n = 0; n < 16; ++n) { // replicate bands; no sentinel is needed.
+ for (ctx = 0; ctx < NUM_CTX; ++ctx) {
+ proba->remapped_costs_[ctype][n][ctx] =
+ proba->level_cost_[ctype][VP8EncBands[n]][ctx];
+ }
+ }
+ }
+ proba->dirty_ = 0;
+}
+
+//------------------------------------------------------------------------------
+// Mode cost tables.
+
+// These are the fixed probabilities (in the coding trees) turned into bit-cost
+// by calling VP8BitCost().
+const uint16_t VP8FixedCostsUV[4] = { 302, 984, 439, 642 };
+// note: these values include the fixed VP8BitCost(1, 145) mode selection cost.
+const uint16_t VP8FixedCostsI16[4] = { 663, 919, 872, 919 };
+const uint16_t VP8FixedCostsI4[NUM_BMODES][NUM_BMODES][NUM_BMODES] = {
+ { { 40, 1151, 1723, 1874, 2103, 2019, 1628, 1777, 2226, 2137 },
+ { 192, 469, 1296, 1308, 1849, 1794, 1781, 1703, 1713, 1522 },
+ { 142, 910, 762, 1684, 1849, 1576, 1460, 1305, 1801, 1657 },
+ { 559, 641, 1370, 421, 1182, 1569, 1612, 1725, 863, 1007 },
+ { 299, 1059, 1256, 1108, 636, 1068, 1581, 1883, 869, 1142 },
+ { 277, 1111, 707, 1362, 1089, 672, 1603, 1541, 1545, 1291 },
+ { 214, 781, 1609, 1303, 1632, 2229, 726, 1560, 1713, 918 },
+ { 152, 1037, 1046, 1759, 1983, 2174, 1358, 742, 1740, 1390 },
+ { 512, 1046, 1420, 753, 752, 1297, 1486, 1613, 460, 1207 },
+ { 424, 827, 1362, 719, 1462, 1202, 1199, 1476, 1199, 538 } },
+ { { 240, 402, 1134, 1491, 1659, 1505, 1517, 1555, 1979, 2099 },
+ { 467, 242, 960, 1232, 1714, 1620, 1834, 1570, 1676, 1391 },
+ { 500, 455, 463, 1507, 1699, 1282, 1564, 982, 2114, 2114 },
+ { 672, 643, 1372, 331, 1589, 1667, 1453, 1938, 996, 876 },
+ { 458, 783, 1037, 911, 738, 968, 1165, 1518, 859, 1033 },
+ { 504, 815, 504, 1139, 1219, 719, 1506, 1085, 1268, 1268 },
+ { 333, 630, 1445, 1239, 1883, 3672, 799, 1548, 1865, 598 },
+ { 399, 644, 746, 1342, 1856, 1350, 1493, 613, 1855, 1015 },
+ { 622, 749, 1205, 608, 1066, 1408, 1290, 1406, 546, 971 },
+ { 500, 753, 1041, 668, 1230, 1617, 1297, 1425, 1383, 523 } },
+ { { 394, 553, 523, 1502, 1536, 981, 1608, 1142, 1666, 2181 },
+ { 655, 430, 375, 1411, 1861, 1220, 1677, 1135, 1978, 1553 },
+ { 690, 640, 245, 1954, 2070, 1194, 1528, 982, 1972, 2232 },
+ { 559, 834, 741, 867, 1131, 980, 1225, 852, 1092, 784 },
+ { 690, 875, 516, 959, 673, 894, 1056, 1190, 1528, 1126 },
+ { 740, 951, 384, 1277, 1177, 492, 1579, 1155, 1846, 1513 },
+ { 323, 775, 1062, 1776, 3062, 1274, 813, 1188, 1372, 655 },
+ { 488, 971, 484, 1767, 1515, 1775, 1115, 503, 1539, 1461 },
+ { 740, 1006, 998, 709, 851, 1230, 1337, 788, 741, 721 },
+ { 522, 1073, 573, 1045, 1346, 887, 1046, 1146, 1203, 697 } },
+ { { 105, 864, 1442, 1009, 1934, 1840, 1519, 1920, 1673, 1579 },
+ { 534, 305, 1193, 683, 1388, 2164, 1802, 1894, 1264, 1170 },
+ { 305, 518, 877, 1108, 1426, 3215, 1425, 1064, 1320, 1242 },
+ { 683, 732, 1927, 257, 1493, 2048, 1858, 1552, 1055, 947 },
+ { 394, 814, 1024, 660, 959, 1556, 1282, 1289, 893, 1047 },
+ { 528, 615, 996, 940, 1201, 635, 1094, 2515, 803, 1358 },
+ { 347, 614, 1609, 1187, 3133, 1345, 1007, 1339, 1017, 667 },
+ { 218, 740, 878, 1605, 3650, 3650, 1345, 758, 1357, 1617 },
+ { 672, 750, 1541, 558, 1257, 1599, 1870, 2135, 402, 1087 },
+ { 592, 684, 1161, 430, 1092, 1497, 1475, 1489, 1095, 822 } },
+ { { 228, 1056, 1059, 1368, 752, 982, 1512, 1518, 987, 1782 },
+ { 494, 514, 818, 942, 965, 892, 1610, 1356, 1048, 1363 },
+ { 512, 648, 591, 1042, 761, 991, 1196, 1454, 1309, 1463 },
+ { 683, 749, 1043, 676, 841, 1396, 1133, 1138, 654, 939 },
+ { 622, 1101, 1126, 994, 361, 1077, 1203, 1318, 877, 1219 },
+ { 631, 1068, 857, 1650, 651, 477, 1650, 1419, 828, 1170 },
+ { 555, 727, 1068, 1335, 3127, 1339, 820, 1331, 1077, 429 },
+ { 504, 879, 624, 1398, 889, 889, 1392, 808, 891, 1406 },
+ { 683, 1602, 1289, 977, 578, 983, 1280, 1708, 406, 1122 },
+ { 399, 865, 1433, 1070, 1072, 764, 968, 1477, 1223, 678 } },
+ { { 333, 760, 935, 1638, 1010, 529, 1646, 1410, 1472, 2219 },
+ { 512, 494, 750, 1160, 1215, 610, 1870, 1868, 1628, 1169 },
+ { 572, 646, 492, 1934, 1208, 603, 1580, 1099, 1398, 1995 },
+ { 786, 789, 942, 581, 1018, 951, 1599, 1207, 731, 768 },
+ { 690, 1015, 672, 1078, 582, 504, 1693, 1438, 1108, 2897 },
+ { 768, 1267, 571, 2005, 1243, 244, 2881, 1380, 1786, 1453 },
+ { 452, 899, 1293, 903, 1311, 3100, 465, 1311, 1319, 813 },
+ { 394, 927, 942, 1103, 1358, 1104, 946, 593, 1363, 1109 },
+ { 559, 1005, 1007, 1016, 658, 1173, 1021, 1164, 623, 1028 },
+ { 564, 796, 632, 1005, 1014, 863, 2316, 1268, 938, 764 } },
+ { { 266, 606, 1098, 1228, 1497, 1243, 948, 1030, 1734, 1461 },
+ { 366, 585, 901, 1060, 1407, 1247, 876, 1134, 1620, 1054 },
+ { 452, 565, 542, 1729, 1479, 1479, 1016, 886, 2938, 1150 },
+ { 555, 1088, 1533, 950, 1354, 895, 834, 1019, 1021, 496 },
+ { 704, 815, 1193, 971, 973, 640, 1217, 2214, 832, 578 },
+ { 672, 1245, 579, 871, 875, 774, 872, 1273, 1027, 949 },
+ { 296, 1134, 2050, 1784, 1636, 3425, 442, 1550, 2076, 722 },
+ { 342, 982, 1259, 1846, 1848, 1848, 622, 568, 1847, 1052 },
+ { 555, 1064, 1304, 828, 746, 1343, 1075, 1329, 1078, 494 },
+ { 288, 1167, 1285, 1174, 1639, 1639, 833, 2254, 1304, 509 } },
+ { { 342, 719, 767, 1866, 1757, 1270, 1246, 550, 1746, 2151 },
+ { 483, 653, 694, 1509, 1459, 1410, 1218, 507, 1914, 1266 },
+ { 488, 757, 447, 2979, 1813, 1268, 1654, 539, 1849, 2109 },
+ { 522, 1097, 1085, 851, 1365, 1111, 851, 901, 961, 605 },
+ { 709, 716, 841, 728, 736, 945, 941, 862, 2845, 1057 },
+ { 512, 1323, 500, 1336, 1083, 681, 1342, 717, 1604, 1350 },
+ { 452, 1155, 1372, 1900, 1501, 3290, 311, 944, 1919, 922 },
+ { 403, 1520, 977, 2132, 1733, 3522, 1076, 276, 3335, 1547 },
+ { 559, 1374, 1101, 615, 673, 2462, 974, 795, 984, 984 },
+ { 547, 1122, 1062, 812, 1410, 951, 1140, 622, 1268, 651 } },
+ { { 165, 982, 1235, 938, 1334, 1366, 1659, 1578, 964, 1612 },
+ { 592, 422, 925, 847, 1139, 1112, 1387, 2036, 861, 1041 },
+ { 403, 837, 732, 770, 941, 1658, 1250, 809, 1407, 1407 },
+ { 896, 874, 1071, 381, 1568, 1722, 1437, 2192, 480, 1035 },
+ { 640, 1098, 1012, 1032, 684, 1382, 1581, 2106, 416, 865 },
+ { 559, 1005, 819, 914, 710, 770, 1418, 920, 838, 1435 },
+ { 415, 1258, 1245, 870, 1278, 3067, 770, 1021, 1287, 522 },
+ { 406, 990, 601, 1009, 1265, 1265, 1267, 759, 1017, 1277 },
+ { 968, 1182, 1329, 788, 1032, 1292, 1705, 1714, 203, 1403 },
+ { 732, 877, 1279, 471, 901, 1161, 1545, 1294, 755, 755 } },
+ { { 111, 931, 1378, 1185, 1933, 1648, 1148, 1714, 1873, 1307 },
+ { 406, 414, 1030, 1023, 1910, 1404, 1313, 1647, 1509, 793 },
+ { 342, 640, 575, 1088, 1241, 1349, 1161, 1350, 1756, 1502 },
+ { 559, 766, 1185, 357, 1682, 1428, 1329, 1897, 1219, 802 },
+ { 473, 909, 1164, 771, 719, 2508, 1427, 1432, 722, 782 },
+ { 342, 892, 785, 1145, 1150, 794, 1296, 1550, 973, 1057 },
+ { 208, 1036, 1326, 1343, 1606, 3395, 815, 1455, 1618, 712 },
+ { 228, 928, 890, 1046, 3499, 1711, 994, 829, 1720, 1318 },
+ { 768, 724, 1058, 636, 991, 1075, 1319, 1324, 616, 825 },
+ { 305, 1167, 1358, 899, 1587, 1587, 987, 1988, 1332, 501 } }
+};
+
+//------------------------------------------------------------------------------
+// helper functions for residuals struct VP8Residual.
+
+void VP8InitResidual(int first, int coeff_type,
+ VP8Encoder* const enc, VP8Residual* const res) {
+ res->coeff_type = coeff_type;
+ res->prob = enc->proba_.coeffs_[coeff_type];
+ res->stats = enc->proba_.stats_[coeff_type];
+ res->costs = enc->proba_.remapped_costs_[coeff_type];
+ res->first = first;
+}
+
+//------------------------------------------------------------------------------
+// Mode costs
+
+int VP8GetCostLuma4(VP8EncIterator* const it, const int16_t levels[16]) {
+ const int x = (it->i4_ & 3), y = (it->i4_ >> 2);
+ VP8Residual res;
+ VP8Encoder* const enc = it->enc_;
+ int R = 0;
+ int ctx;
+
+ VP8InitResidual(0, 3, enc, &res);
+ ctx = it->top_nz_[x] + it->left_nz_[y];
+ VP8SetResidualCoeffs(levels, &res);
+ R += VP8GetResidualCost(ctx, &res);
+ return R;
+}
+
+int VP8GetCostLuma16(VP8EncIterator* const it, const VP8ModeScore* const rd) {
+ VP8Residual res;
+ VP8Encoder* const enc = it->enc_;
+ int x, y;
+ int R = 0;
+
+ VP8IteratorNzToBytes(it); // re-import the non-zero context
+
+ // DC
+ VP8InitResidual(0, 1, enc, &res);
+ VP8SetResidualCoeffs(rd->y_dc_levels, &res);
+ R += VP8GetResidualCost(it->top_nz_[8] + it->left_nz_[8], &res);
+
+ // AC
+ VP8InitResidual(1, 0, enc, &res);
+ for (y = 0; y < 4; ++y) {
+ for (x = 0; x < 4; ++x) {
+ const int ctx = it->top_nz_[x] + it->left_nz_[y];
+ VP8SetResidualCoeffs(rd->y_ac_levels[x + y * 4], &res);
+ R += VP8GetResidualCost(ctx, &res);
+ it->top_nz_[x] = it->left_nz_[y] = (res.last >= 0);
+ }
+ }
+ return R;
+}
+
+int VP8GetCostUV(VP8EncIterator* const it, const VP8ModeScore* const rd) {
+ VP8Residual res;
+ VP8Encoder* const enc = it->enc_;
+ int ch, x, y;
+ int R = 0;
+
+ VP8IteratorNzToBytes(it); // re-import the non-zero context
+
+ VP8InitResidual(0, 2, enc, &res);
+ for (ch = 0; ch <= 2; ch += 2) {
+ for (y = 0; y < 2; ++y) {
+ for (x = 0; x < 2; ++x) {
+ const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
+ VP8SetResidualCoeffs(rd->uv_levels[ch * 2 + x + y * 2], &res);
+ R += VP8GetResidualCost(ctx, &res);
+ it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] = (res.last >= 0);
+ }
+ }
+ }
+ return R;
+}
+
+
+//------------------------------------------------------------------------------
+// Recording of token probabilities.
+
+// We keep the table-free variant around for reference, just in case.
+#define USE_LEVEL_CODE_TABLE
+
+// Simulate block coding, but only record statistics.
+// Note: no need to record the fixed probas.
+int VP8RecordCoeffs(int ctx, const VP8Residual* const res) {
+ int n = res->first;
+ // should be stats[VP8EncBands[n]], but it's equivalent for n=0 or 1
+ proba_t* s = res->stats[n][ctx];
+ if (res->last < 0) {
+ VP8RecordStats(0, s + 0);
+ return 0;
+ }
+ while (n <= res->last) {
+ int v;
+ VP8RecordStats(1, s + 0); // order of record doesn't matter
+ while ((v = res->coeffs[n++]) == 0) {
+ VP8RecordStats(0, s + 1);
+ s = res->stats[VP8EncBands[n]][0];
+ }
+ VP8RecordStats(1, s + 1);
+ if (!VP8RecordStats(2u < (unsigned int)(v + 1), s + 2)) { // v = -1 or 1
+ s = res->stats[VP8EncBands[n]][1];
+ } else {
+ v = abs(v);
+#if !defined(USE_LEVEL_CODE_TABLE)
+ if (!VP8RecordStats(v > 4, s + 3)) {
+ if (VP8RecordStats(v != 2, s + 4))
+ VP8RecordStats(v == 4, s + 5);
+ } else if (!VP8RecordStats(v > 10, s + 6)) {
+ VP8RecordStats(v > 6, s + 7);
+ } else if (!VP8RecordStats((v >= 3 + (8 << 2)), s + 8)) {
+ VP8RecordStats((v >= 3 + (8 << 1)), s + 9);
+ } else {
+ VP8RecordStats((v >= 3 + (8 << 3)), s + 10);
+ }
+#else
+ if (v > MAX_VARIABLE_LEVEL) {
+ v = MAX_VARIABLE_LEVEL;
+ }
+
+ {
+ const int bits = VP8LevelCodes[v - 1][1];
+ int pattern = VP8LevelCodes[v - 1][0];
+ int i;
+ for (i = 0; (pattern >>= 1) != 0; ++i) {
+ const int mask = 2 << i;
+ if (pattern & 1) VP8RecordStats(!!(bits & mask), s + 3 + i);
+ }
+ }
+#endif
+ s = res->stats[VP8EncBands[n]][2];
+ }
+ }
+ if (n < 16) VP8RecordStats(0, s + 0);
+ return 1;
+}
+
+//------------------------------------------------------------------------------
diff --git a/media/libwebp/enc/delta_palettization_enc.h b/media/libwebp/enc/delta_palettization_enc.h
deleted file mode 100644
index 63048ec6e8..0000000000
--- a/media/libwebp/enc/delta_palettization_enc.h
+++ /dev/null
@@ -1,25 +0,0 @@
-// Copyright 2015 Google Inc. All Rights Reserved.
-//
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
-// -----------------------------------------------------------------------------
-//
-// Author: Mislav Bradac (mislavm@google.com)
-//
-
-#ifndef WEBP_ENC_DELTA_PALETTIZATION_H_
-#define WEBP_ENC_DELTA_PALETTIZATION_H_
-
-#include "../webp/encode.h"
-#include "../enc/vp8li_enc.h"
-
-// Replaces enc->argb_[] input by a palettizable approximation of it,
-// and generates optimal enc->palette_[].
-// This function can revert enc->use_palette_ / enc->use_predict_ flag
-// if delta-palettization is not producing expected saving.
-WebPEncodingError WebPSearchOptimalDeltaPalette(VP8LEncoder* const enc);
-
-#endif // WEBP_ENC_DELTA_PALETTIZATION_H_
diff --git a/media/libwebp/enc/filter_enc.c b/media/libwebp/enc/filter_enc.c
new file mode 100644
index 0000000000..5ffc232626
--- /dev/null
+++ b/media/libwebp/enc/filter_enc.c
@@ -0,0 +1,235 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Selecting filter level
+//
+// Author: somnath@google.com (Somnath Banerjee)
+
+#include <assert.h>
+#include "../enc/vp8i_enc.h"
+#include "../dsp/dsp.h"
+
+// This table gives, for a given sharpness, the filtering strength to be
+// used (at least) in order to filter a given edge step delta.
+// This is constructed by brute force inspection: for all delta, we iterate
+// over all possible filtering strength / thresh until needs_filter() returns
+// true.
+#define MAX_DELTA_SIZE 64
+static const uint8_t kLevelsFromDelta[8][MAX_DELTA_SIZE] = {
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63 },
+ { 0, 1, 2, 3, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 17, 18,
+ 20, 21, 23, 24, 26, 27, 29, 30, 32, 33, 35, 36, 38, 39, 41, 42,
+ 44, 45, 47, 48, 50, 51, 53, 54, 56, 57, 59, 60, 62, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63 },
+ { 0, 1, 2, 3, 5, 6, 7, 8, 9, 11, 12, 13, 14, 16, 17, 19,
+ 20, 22, 23, 25, 26, 28, 29, 31, 32, 34, 35, 37, 38, 40, 41, 43,
+ 44, 46, 47, 49, 50, 52, 53, 55, 56, 58, 59, 61, 62, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63 },
+ { 0, 1, 2, 3, 5, 6, 7, 8, 9, 11, 12, 13, 15, 16, 18, 19,
+ 21, 22, 24, 25, 27, 28, 30, 31, 33, 34, 36, 37, 39, 40, 42, 43,
+ 45, 46, 48, 49, 51, 52, 54, 55, 57, 58, 60, 61, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63 },
+ { 0, 1, 2, 3, 5, 6, 7, 8, 9, 11, 12, 14, 15, 17, 18, 20,
+ 21, 23, 24, 26, 27, 29, 30, 32, 33, 35, 36, 38, 39, 41, 42, 44,
+ 45, 47, 48, 50, 51, 53, 54, 56, 57, 59, 60, 62, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63 },
+ { 0, 1, 2, 4, 5, 7, 8, 9, 11, 12, 13, 15, 16, 17, 19, 20,
+ 22, 23, 25, 26, 28, 29, 31, 32, 34, 35, 37, 38, 40, 41, 43, 44,
+ 46, 47, 49, 50, 52, 53, 55, 56, 58, 59, 61, 62, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63 },
+ { 0, 1, 2, 4, 5, 7, 8, 9, 11, 12, 13, 15, 16, 18, 19, 21,
+ 22, 24, 25, 27, 28, 30, 31, 33, 34, 36, 37, 39, 40, 42, 43, 45,
+ 46, 48, 49, 51, 52, 54, 55, 57, 58, 60, 61, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63 },
+ { 0, 1, 2, 4, 5, 7, 8, 9, 11, 12, 14, 15, 17, 18, 20, 21,
+ 23, 24, 26, 27, 29, 30, 32, 33, 35, 36, 38, 39, 41, 42, 44, 45,
+ 47, 48, 50, 51, 53, 54, 56, 57, 59, 60, 62, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63 }
+};
+
+int VP8FilterStrengthFromDelta(int sharpness, int delta) {
+ const int pos = (delta < MAX_DELTA_SIZE) ? delta : MAX_DELTA_SIZE - 1;
+ assert(sharpness >= 0 && sharpness <= 7);
+ return kLevelsFromDelta[sharpness][pos];
+}
+
+//------------------------------------------------------------------------------
+// Paragraph 15.4: compute the inner-edge filtering strength
+
+#if !defined(WEBP_REDUCE_SIZE)
+
+static int GetILevel(int sharpness, int level) {
+ if (sharpness > 0) {
+ if (sharpness > 4) {
+ level >>= 2;
+ } else {
+ level >>= 1;
+ }
+ if (level > 9 - sharpness) {
+ level = 9 - sharpness;
+ }
+ }
+ if (level < 1) level = 1;
+ return level;
+}
+
+static void DoFilter(const VP8EncIterator* const it, int level) {
+ const VP8Encoder* const enc = it->enc_;
+ const int ilevel = GetILevel(enc->config_->filter_sharpness, level);
+ const int limit = 2 * level + ilevel;
+
+ uint8_t* const y_dst = it->yuv_out2_ + Y_OFF_ENC;
+ uint8_t* const u_dst = it->yuv_out2_ + U_OFF_ENC;
+ uint8_t* const v_dst = it->yuv_out2_ + V_OFF_ENC;
+
+ // copy current block to yuv_out2_
+ memcpy(y_dst, it->yuv_out_, YUV_SIZE_ENC * sizeof(uint8_t));
+
+ if (enc->filter_hdr_.simple_ == 1) { // simple
+ VP8SimpleHFilter16i(y_dst, BPS, limit);
+ VP8SimpleVFilter16i(y_dst, BPS, limit);
+ } else { // complex
+ const int hev_thresh = (level >= 40) ? 2 : (level >= 15) ? 1 : 0;
+ VP8HFilter16i(y_dst, BPS, limit, ilevel, hev_thresh);
+ VP8HFilter8i(u_dst, v_dst, BPS, limit, ilevel, hev_thresh);
+ VP8VFilter16i(y_dst, BPS, limit, ilevel, hev_thresh);
+ VP8VFilter8i(u_dst, v_dst, BPS, limit, ilevel, hev_thresh);
+ }
+}
+
+//------------------------------------------------------------------------------
+// SSIM metric for one macroblock
+
+static double GetMBSSIM(const uint8_t* yuv1, const uint8_t* yuv2) {
+ int x, y;
+ double sum = 0.;
+
+ // compute SSIM in a 10 x 10 window
+ for (y = VP8_SSIM_KERNEL; y < 16 - VP8_SSIM_KERNEL; y++) {
+ for (x = VP8_SSIM_KERNEL; x < 16 - VP8_SSIM_KERNEL; x++) {
+ sum += VP8SSIMGetClipped(yuv1 + Y_OFF_ENC, BPS, yuv2 + Y_OFF_ENC, BPS,
+ x, y, 16, 16);
+ }
+ }
+ for (x = 1; x < 7; x++) {
+ for (y = 1; y < 7; y++) {
+ sum += VP8SSIMGetClipped(yuv1 + U_OFF_ENC, BPS, yuv2 + U_OFF_ENC, BPS,
+ x, y, 8, 8);
+ sum += VP8SSIMGetClipped(yuv1 + V_OFF_ENC, BPS, yuv2 + V_OFF_ENC, BPS,
+ x, y, 8, 8);
+ }
+ }
+ return sum;
+}
+
+#endif // !defined(WEBP_REDUCE_SIZE)
+
+//------------------------------------------------------------------------------
+// Exposed APIs: the encoder should call the following three functions to
+// adjust the loop filter strength.
+
+void VP8InitFilter(VP8EncIterator* const it) {
+#if !defined(WEBP_REDUCE_SIZE)
+ if (it->lf_stats_ != NULL) {
+ int s, i;
+ for (s = 0; s < NUM_MB_SEGMENTS; s++) {
+ for (i = 0; i < MAX_LF_LEVELS; i++) {
+ (*it->lf_stats_)[s][i] = 0;
+ }
+ }
+ VP8SSIMDspInit();
+ }
+#else
+ (void)it;
+#endif
+}
+
+void VP8StoreFilterStats(VP8EncIterator* const it) {
+#if !defined(WEBP_REDUCE_SIZE)
+ int d;
+ VP8Encoder* const enc = it->enc_;
+ const int s = it->mb_->segment_;
+ const int level0 = enc->dqm_[s].fstrength_;
+
+ // explore +/-quant range of values around level0
+ const int delta_min = -enc->dqm_[s].quant_;
+ const int delta_max = enc->dqm_[s].quant_;
+ const int step_size = (delta_max - delta_min >= 4) ? 4 : 1;
+
+ if (it->lf_stats_ == NULL) return;
+
+ // NOTE: currently we apply the filter only across the sub-block edges,
+ // for two reasons:
+ // 1. Filtering across macroblock edges would change pixels in the left
+ // and top macroblocks, which would be hard to restore.
+ // 2. Macroblocks to the bottom and right are not yet compressed, so we
+ // cannot filter across the right and bottom macroblock edges.
+ if (it->mb_->type_ == 1 && it->mb_->skip_) return;
+
+ // Always try filter level zero
+ (*it->lf_stats_)[s][0] += GetMBSSIM(it->yuv_in_, it->yuv_out_);
+
+ for (d = delta_min; d <= delta_max; d += step_size) {
+ const int level = level0 + d;
+ if (level <= 0 || level >= MAX_LF_LEVELS) {
+ continue;
+ }
+ DoFilter(it, level);
+ (*it->lf_stats_)[s][level] += GetMBSSIM(it->yuv_in_, it->yuv_out2_);
+ }
+#else // defined(WEBP_REDUCE_SIZE)
+ (void)it;
+#endif // !defined(WEBP_REDUCE_SIZE)
+}
+
+void VP8AdjustFilterStrength(VP8EncIterator* const it) {
+ VP8Encoder* const enc = it->enc_;
+#if !defined(WEBP_REDUCE_SIZE)
+ if (it->lf_stats_ != NULL) {
+ int s;
+ for (s = 0; s < NUM_MB_SEGMENTS; s++) {
+ int i, best_level = 0;
+ // Improvement over filter level 0 should be at least 1e-5 (relatively)
+ double best_v = 1.00001 * (*it->lf_stats_)[s][0];
+ for (i = 1; i < MAX_LF_LEVELS; i++) {
+ const double v = (*it->lf_stats_)[s][i];
+ if (v > best_v) {
+ best_v = v;
+ best_level = i;
+ }
+ }
+ enc->dqm_[s].fstrength_ = best_level;
+ }
+ return;
+ }
+#endif // !defined(WEBP_REDUCE_SIZE)
+ if (enc->config_->filter_strength > 0) {
+ int max_level = 0;
+ int s;
+ for (s = 0; s < NUM_MB_SEGMENTS; s++) {
+ VP8SegmentInfo* const dqm = &enc->dqm_[s];
+ // this '>> 3' accounts for some inverse WHT scaling
+ const int delta = (dqm->max_edge_ * dqm->y2_.q_[1]) >> 3;
+ const int level =
+ VP8FilterStrengthFromDelta(enc->filter_hdr_.sharpness_, delta);
+ if (level > dqm->fstrength_) {
+ dqm->fstrength_ = level;
+ }
+ if (max_level < dqm->fstrength_) {
+ max_level = dqm->fstrength_;
+ }
+ }
+ enc->filter_hdr_.level_ = max_level;
+ }
+}
+
+// -----------------------------------------------------------------------------
diff --git a/media/libwebp/enc/frame_enc.c b/media/libwebp/enc/frame_enc.c
new file mode 100644
index 0000000000..c8698ca5b5
--- /dev/null
+++ b/media/libwebp/enc/frame_enc.c
@@ -0,0 +1,899 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// frame coding and analysis
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <string.h>
+#include <math.h>
+
+#include "../enc/cost_enc.h"
+#include "../enc/vp8i_enc.h"
+#include "../dsp/dsp.h"
+#include "../webp/format_constants.h" // RIFF constants
+
+#define SEGMENT_VISU 0
+#define DEBUG_SEARCH 0 // useful to track search convergence
+
+//------------------------------------------------------------------------------
+// multi-pass convergence
+
+#define HEADER_SIZE_ESTIMATE (RIFF_HEADER_SIZE + CHUNK_HEADER_SIZE + \
+ VP8_FRAME_HEADER_SIZE)
+#define DQ_LIMIT 0.4 // convergence is considered reached if dq < DQ_LIMIT
+// We allow 2k of extra headroom in the PARTITION0 limit.
+#define PARTITION0_SIZE_LIMIT ((VP8_MAX_PARTITION0_SIZE - 2048ULL) << 11)
+
+static float Clamp(float v, float min, float max) {
+ return (v < min) ? min : (v > max) ? max : v;
+}
+
+typedef struct { // struct for organizing convergence in either size or PSNR
+ int is_first;
+ float dq;
+ float q, last_q;
+ float qmin, qmax;
+ double value, last_value; // PSNR or size
+ double target;
+ int do_size_search;
+} PassStats;
+
+static int InitPassStats(const VP8Encoder* const enc, PassStats* const s) {
+ const uint64_t target_size = (uint64_t)enc->config_->target_size;
+ const int do_size_search = (target_size != 0);
+ const float target_PSNR = enc->config_->target_PSNR;
+
+ s->is_first = 1;
+ s->dq = 10.f;
+ s->qmin = 1.f * enc->config_->qmin;
+ s->qmax = 1.f * enc->config_->qmax;
+ s->q = s->last_q = Clamp(enc->config_->quality, s->qmin, s->qmax);
+ s->target = do_size_search ? (double)target_size
+ : (target_PSNR > 0.) ? target_PSNR
+ : 40.; // default, just in case
+ s->value = s->last_value = 0.;
+ s->do_size_search = do_size_search;
+ return do_size_search;
+}
+
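+// Secant-style update: estimate the next q from the last two (q, value)
+// pairs so that 'value' converges towards 'target'.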
+static float ComputeNextQ(PassStats* const s) {
+ float dq;
+ if (s->is_first) {
+ dq = (s->value > s->target) ? -s->dq : s->dq;
+ s->is_first = 0;
+ } else if (s->value != s->last_value) {
+ const double slope = (s->target - s->value) / (s->last_value - s->value);
+ dq = (float)(slope * (s->last_q - s->q));
+ } else {
+ dq = 0.; // we're done?!
+ }
+ // Limit variable to avoid large swings.
+ s->dq = Clamp(dq, -30.f, 30.f);
+ s->last_q = s->q;
+ s->last_value = s->value;
+ s->q = Clamp(s->q + s->dq, s->qmin, s->qmax);
+ return s->q;
+}
+
+//------------------------------------------------------------------------------
+// Tables for level coding
+
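+// Fixed probabilities for coding the extra bits of large coefficient levels
+// (token categories 3..6 of the VP8 coefficient tree).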
+const uint8_t VP8Cat3[] = { 173, 148, 140 };
+const uint8_t VP8Cat4[] = { 176, 155, 140, 135 };
+const uint8_t VP8Cat5[] = { 180, 157, 141, 134, 130 };
+const uint8_t VP8Cat6[] =
+ { 254, 254, 243, 230, 196, 177, 153, 140, 133, 130, 129 };
+
+//------------------------------------------------------------------------------
+// Reset the statistics (number of skips, token probabilities, level costs, ...)
+
+static void ResetStats(VP8Encoder* const enc) {
+ VP8EncProba* const proba = &enc->proba_;
+ VP8CalculateLevelCosts(proba);
+ proba->nb_skip_ = 0;
+}
+
+//------------------------------------------------------------------------------
+// Skip decision probability
+
+#define SKIP_PROBA_THRESHOLD 250 // value below which using skip_proba is OK.
+
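+// Returns the probability, in [0, 255], that a macroblock is not skipped,
+// given 'nb' skips observed out of 'total' macroblocks.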
+static int CalcSkipProba(uint64_t nb, uint64_t total) {
+ return (int)(total ? (total - nb) * 255 / total : 255);
+}
+
+// Returns the bit-cost for coding the skip probability.
+static int FinalizeSkipProba(VP8Encoder* const enc) {
+ VP8EncProba* const proba = &enc->proba_;
+ const int nb_mbs = enc->mb_w_ * enc->mb_h_;
+ const int nb_events = proba->nb_skip_;
+ int size;
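+  // Note: the costs below are expressed in 1/256-bit units (256 is one bit,
+  // 8 * 256 one byte).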
+ proba->skip_proba_ = CalcSkipProba(nb_events, nb_mbs);
+ proba->use_skip_proba_ = (proba->skip_proba_ < SKIP_PROBA_THRESHOLD);
+ size = 256; // 'use_skip_proba' bit
+ if (proba->use_skip_proba_) {
+ size += nb_events * VP8BitCost(1, proba->skip_proba_)
+ + (nb_mbs - nb_events) * VP8BitCost(0, proba->skip_proba_);
+ size += 8 * 256; // cost of signaling the skip_proba_ itself.
+ }
+ return size;
+}
+
+// Returns the probability of a '0' bit given 'nb' ones observed out of
+// 'total' events, scaled to [0, 255].
+static int CalcTokenProba(int nb, int total) {
+ assert(nb <= total);
+ return nb ? (255 - nb * 255 / total) : 255;
+}
+
+// Cost of coding 'nb' 1's and 'total-nb' 0's using 'proba' probability.
+static int BranchCost(int nb, int total, int proba) {
+ return nb * VP8BitCost(1, proba) + (total - nb) * VP8BitCost(0, proba);
+}
+
+static void ResetTokenStats(VP8Encoder* const enc) {
+ VP8EncProba* const proba = &enc->proba_;
+ memset(proba->stats_, 0, sizeof(proba->stats_));
+}
+
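+// Collect statistics and deduce probabilities for the next coding pass.
+// For each token probability, the cost of keeping the default value is
+// compared against the cost of transmitting a new 8-bit probability (plus
+// the per-proba update flag), and the cheaper option is kept.
+// Returns the total bit-cost for coding the probability updates.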
+static int FinalizeTokenProbas(VP8EncProba* const proba) {
+ int has_changed = 0;
+ int size = 0;
+ int t, b, c, p;
+ for (t = 0; t < NUM_TYPES; ++t) {
+ for (b = 0; b < NUM_BANDS; ++b) {
+ for (c = 0; c < NUM_CTX; ++c) {
+ for (p = 0; p < NUM_PROBAS; ++p) {
+ const proba_t stats = proba->stats_[t][b][c][p];
+ const int nb = (stats >> 0) & 0xffff;
+ const int total = (stats >> 16) & 0xffff;
+ const int update_proba = VP8CoeffsUpdateProba[t][b][c][p];
+ const int old_p = VP8CoeffsProba0[t][b][c][p];
+ const int new_p = CalcTokenProba(nb, total);
+ const int old_cost = BranchCost(nb, total, old_p)
+ + VP8BitCost(0, update_proba);
+ const int new_cost = BranchCost(nb, total, new_p)
+ + VP8BitCost(1, update_proba)
+ + 8 * 256;
+ const int use_new_p = (old_cost > new_cost);
+ size += VP8BitCost(use_new_p, update_proba);
+ if (use_new_p) { // only use proba that seem meaningful enough.
+ proba->coeffs_[t][b][c][p] = new_p;
+ has_changed |= (new_p != old_p);
+ size += 8 * 256;
+ } else {
+ proba->coeffs_[t][b][c][p] = old_p;
+ }
+ }
+ }
+ }
+ }
+ proba->dirty_ = has_changed;
+ return size;
+}
+
+//------------------------------------------------------------------------------
+// Finalize Segment probability based on the coding tree
+
+static int GetProba(int a, int b) {
+ const int total = a + b;
+ return (total == 0) ? 255 // that's the default probability.
+ : (255 * a + total / 2) / total; // rounded proba
+}
+
+static void ResetSegments(VP8Encoder* const enc) {
+ int n;
+ for (n = 0; n < enc->mb_w_ * enc->mb_h_; ++n) {
+ enc->mb_info_[n].segment_ = 0;
+ }
+}
+
+static void SetSegmentProbas(VP8Encoder* const enc) {
+ int p[NUM_MB_SEGMENTS] = { 0 };
+ int n;
+
+ for (n = 0; n < enc->mb_w_ * enc->mb_h_; ++n) {
+ const VP8MBInfo* const mb = &enc->mb_info_[n];
+ ++p[mb->segment_];
+ }
+#if !defined(WEBP_DISABLE_STATS)
+ if (enc->pic_->stats != NULL) {
+ for (n = 0; n < NUM_MB_SEGMENTS; ++n) {
+ enc->pic_->stats->segment_size[n] = p[n];
+ }
+ }
+#endif
+ if (enc->segment_hdr_.num_segments_ > 1) {
+ uint8_t* const probas = enc->proba_.segments_;
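+    // The segment id is coded as a 2-level binary tree: probas[0] splits
+    // segments {0,1} from {2,3}, probas[1] splits 0 from 1 and probas[2]
+    // splits 2 from 3.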
+ probas[0] = GetProba(p[0] + p[1], p[2] + p[3]);
+ probas[1] = GetProba(p[0], p[1]);
+ probas[2] = GetProba(p[2], p[3]);
+
+ enc->segment_hdr_.update_map_ =
+ (probas[0] != 255) || (probas[1] != 255) || (probas[2] != 255);
+ if (!enc->segment_hdr_.update_map_) ResetSegments(enc);
+ enc->segment_hdr_.size_ =
+ p[0] * (VP8BitCost(0, probas[0]) + VP8BitCost(0, probas[1])) +
+ p[1] * (VP8BitCost(0, probas[0]) + VP8BitCost(1, probas[1])) +
+ p[2] * (VP8BitCost(1, probas[0]) + VP8BitCost(0, probas[2])) +
+ p[3] * (VP8BitCost(1, probas[0]) + VP8BitCost(1, probas[2]));
+ } else {
+ enc->segment_hdr_.update_map_ = 0;
+ enc->segment_hdr_.size_ = 0;
+ }
+}
+
+//------------------------------------------------------------------------------
+// Coefficient coding
+
+static int PutCoeffs(VP8BitWriter* const bw, int ctx, const VP8Residual* res) {
+ int n = res->first;
+ // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
+ const uint8_t* p = res->prob[n][ctx];
+ if (!VP8PutBit(bw, res->last >= 0, p[0])) {
+ return 0;
+ }
+
+ while (n < 16) {
+ const int c = res->coeffs[n++];
+ const int sign = c < 0;
+ int v = sign ? -c : c;
+ if (!VP8PutBit(bw, v != 0, p[1])) {
+ p = res->prob[VP8EncBands[n]][0];
+ continue;
+ }
+ if (!VP8PutBit(bw, v > 1, p[2])) {
+ p = res->prob[VP8EncBands[n]][1];
+ } else {
+ if (!VP8PutBit(bw, v > 4, p[3])) {
+ if (VP8PutBit(bw, v != 2, p[4])) {
+ VP8PutBit(bw, v == 4, p[5]);
+ }
+ } else if (!VP8PutBit(bw, v > 10, p[6])) {
+ if (!VP8PutBit(bw, v > 6, p[7])) {
+ VP8PutBit(bw, v == 6, 159);
+ } else {
+ VP8PutBit(bw, v >= 9, 165);
+ VP8PutBit(bw, !(v & 1), 145);
+ }
+ } else {
+ int mask;
+ const uint8_t* tab;
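+      // Large values are coded as one of four categories with a fixed base
+      // (11, 19, 35 or 67) plus 3, 4, 5 or 11 extra bits respectively.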
+ if (v < 3 + (8 << 1)) { // VP8Cat3 (3b)
+ VP8PutBit(bw, 0, p[8]);
+ VP8PutBit(bw, 0, p[9]);
+ v -= 3 + (8 << 0);
+ mask = 1 << 2;
+ tab = VP8Cat3;
+ } else if (v < 3 + (8 << 2)) { // VP8Cat4 (4b)
+ VP8PutBit(bw, 0, p[8]);
+ VP8PutBit(bw, 1, p[9]);
+ v -= 3 + (8 << 1);
+ mask = 1 << 3;
+ tab = VP8Cat4;
+ } else if (v < 3 + (8 << 3)) { // VP8Cat5 (5b)
+ VP8PutBit(bw, 1, p[8]);
+ VP8PutBit(bw, 0, p[10]);
+ v -= 3 + (8 << 2);
+ mask = 1 << 4;
+ tab = VP8Cat5;
+ } else { // VP8Cat6 (11b)
+ VP8PutBit(bw, 1, p[8]);
+ VP8PutBit(bw, 1, p[10]);
+ v -= 3 + (8 << 3);
+ mask = 1 << 10;
+ tab = VP8Cat6;
+ }
+ while (mask) {
+ VP8PutBit(bw, !!(v & mask), *tab++);
+ mask >>= 1;
+ }
+ }
+ p = res->prob[VP8EncBands[n]][2];
+ }
+ VP8PutBitUniform(bw, sign);
+ if (n == 16 || !VP8PutBit(bw, n <= res->last, p[0])) {
+ return 1; // EOB
+ }
+ }
+ return 1;
+}
+
+static void CodeResiduals(VP8BitWriter* const bw, VP8EncIterator* const it,
+ const VP8ModeScore* const rd) {
+ int x, y, ch;
+ VP8Residual res;
+ uint64_t pos1, pos2, pos3;
+ const int i16 = (it->mb_->type_ == 1);
+ const int segment = it->mb_->segment_;
+ VP8Encoder* const enc = it->enc_;
+
+ VP8IteratorNzToBytes(it);
+
+ pos1 = VP8BitWriterPos(bw);
+ if (i16) {
+ VP8InitResidual(0, 1, enc, &res);
+ VP8SetResidualCoeffs(rd->y_dc_levels, &res);
+ it->top_nz_[8] = it->left_nz_[8] =
+ PutCoeffs(bw, it->top_nz_[8] + it->left_nz_[8], &res);
+ VP8InitResidual(1, 0, enc, &res);
+ } else {
+ VP8InitResidual(0, 3, enc, &res);
+ }
+
+ // luma-AC
+ for (y = 0; y < 4; ++y) {
+ for (x = 0; x < 4; ++x) {
+ const int ctx = it->top_nz_[x] + it->left_nz_[y];
+ VP8SetResidualCoeffs(rd->y_ac_levels[x + y * 4], &res);
+ it->top_nz_[x] = it->left_nz_[y] = PutCoeffs(bw, ctx, &res);
+ }
+ }
+ pos2 = VP8BitWriterPos(bw);
+
+ // U/V
+ VP8InitResidual(0, 2, enc, &res);
+ for (ch = 0; ch <= 2; ch += 2) {
+ for (y = 0; y < 2; ++y) {
+ for (x = 0; x < 2; ++x) {
+ const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
+ VP8SetResidualCoeffs(rd->uv_levels[ch * 2 + x + y * 2], &res);
+ it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] =
+ PutCoeffs(bw, ctx, &res);
+ }
+ }
+ }
+ pos3 = VP8BitWriterPos(bw);
+ it->luma_bits_ = pos2 - pos1;
+ it->uv_bits_ = pos3 - pos2;
+ it->bit_count_[segment][i16] += it->luma_bits_;
+ it->bit_count_[segment][2] += it->uv_bits_;
+ VP8IteratorBytesToNz(it);
+}
+
+// Same as CodeResiduals, but doesn't actually write anything.
+// Instead, it just records the event distribution.
+static void RecordResiduals(VP8EncIterator* const it,
+ const VP8ModeScore* const rd) {
+ int x, y, ch;
+ VP8Residual res;
+ VP8Encoder* const enc = it->enc_;
+
+ VP8IteratorNzToBytes(it);
+
+ if (it->mb_->type_ == 1) { // i16x16
+ VP8InitResidual(0, 1, enc, &res);
+ VP8SetResidualCoeffs(rd->y_dc_levels, &res);
+ it->top_nz_[8] = it->left_nz_[8] =
+ VP8RecordCoeffs(it->top_nz_[8] + it->left_nz_[8], &res);
+ VP8InitResidual(1, 0, enc, &res);
+ } else {
+ VP8InitResidual(0, 3, enc, &res);
+ }
+
+ // luma-AC
+ for (y = 0; y < 4; ++y) {
+ for (x = 0; x < 4; ++x) {
+ const int ctx = it->top_nz_[x] + it->left_nz_[y];
+ VP8SetResidualCoeffs(rd->y_ac_levels[x + y * 4], &res);
+ it->top_nz_[x] = it->left_nz_[y] = VP8RecordCoeffs(ctx, &res);
+ }
+ }
+
+ // U/V
+ VP8InitResidual(0, 2, enc, &res);
+ for (ch = 0; ch <= 2; ch += 2) {
+ for (y = 0; y < 2; ++y) {
+ for (x = 0; x < 2; ++x) {
+ const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
+ VP8SetResidualCoeffs(rd->uv_levels[ch * 2 + x + y * 2], &res);
+ it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] =
+ VP8RecordCoeffs(ctx, &res);
+ }
+ }
+ }
+
+ VP8IteratorBytesToNz(it);
+}
+
+//------------------------------------------------------------------------------
+// Token buffer
+
+#if !defined(DISABLE_TOKEN_BUFFER)
+
+static int RecordTokens(VP8EncIterator* const it, const VP8ModeScore* const rd,
+ VP8TBuffer* const tokens) {
+ int x, y, ch;
+ VP8Residual res;
+ VP8Encoder* const enc = it->enc_;
+
+ VP8IteratorNzToBytes(it);
+ if (it->mb_->type_ == 1) { // i16x16
+ const int ctx = it->top_nz_[8] + it->left_nz_[8];
+ VP8InitResidual(0, 1, enc, &res);
+ VP8SetResidualCoeffs(rd->y_dc_levels, &res);
+ it->top_nz_[8] = it->left_nz_[8] =
+ VP8RecordCoeffTokens(ctx, &res, tokens);
+ VP8InitResidual(1, 0, enc, &res);
+ } else {
+ VP8InitResidual(0, 3, enc, &res);
+ }
+
+ // luma-AC
+ for (y = 0; y < 4; ++y) {
+ for (x = 0; x < 4; ++x) {
+ const int ctx = it->top_nz_[x] + it->left_nz_[y];
+ VP8SetResidualCoeffs(rd->y_ac_levels[x + y * 4], &res);
+ it->top_nz_[x] = it->left_nz_[y] =
+ VP8RecordCoeffTokens(ctx, &res, tokens);
+ }
+ }
+
+ // U/V
+ VP8InitResidual(0, 2, enc, &res);
+ for (ch = 0; ch <= 2; ch += 2) {
+ for (y = 0; y < 2; ++y) {
+ for (x = 0; x < 2; ++x) {
+ const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
+ VP8SetResidualCoeffs(rd->uv_levels[ch * 2 + x + y * 2], &res);
+ it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] =
+ VP8RecordCoeffTokens(ctx, &res, tokens);
+ }
+ }
+ }
+ VP8IteratorBytesToNz(it);
+ return !tokens->error_;
+}
+
+#endif // !DISABLE_TOKEN_BUFFER
+
+//------------------------------------------------------------------------------
+// ExtraInfo map / Debug function
+
+#if !defined(WEBP_DISABLE_STATS)
+
+#if SEGMENT_VISU
+static void SetBlock(uint8_t* p, int value, int size) {
+ int y;
+ for (y = 0; y < size; ++y) {
+ memset(p, value, size);
+ p += BPS;
+ }
+}
+#endif
+
+static void ResetSSE(VP8Encoder* const enc) {
+ enc->sse_[0] = 0;
+ enc->sse_[1] = 0;
+ enc->sse_[2] = 0;
+ // Note: enc->sse_[3] is managed by alpha.c
+ enc->sse_count_ = 0;
+}
+
+static void StoreSSE(const VP8EncIterator* const it) {
+ VP8Encoder* const enc = it->enc_;
+ const uint8_t* const in = it->yuv_in_;
+ const uint8_t* const out = it->yuv_out_;
+  // Note: not totally accurate at the boundary, and doesn't include the
+  // in-loop filter.
+ enc->sse_[0] += VP8SSE16x16(in + Y_OFF_ENC, out + Y_OFF_ENC);
+ enc->sse_[1] += VP8SSE8x8(in + U_OFF_ENC, out + U_OFF_ENC);
+ enc->sse_[2] += VP8SSE8x8(in + V_OFF_ENC, out + V_OFF_ENC);
+ enc->sse_count_ += 16 * 16;
+}
+
+static void StoreSideInfo(const VP8EncIterator* const it) {
+ VP8Encoder* const enc = it->enc_;
+ const VP8MBInfo* const mb = it->mb_;
+ WebPPicture* const pic = enc->pic_;
+
+ if (pic->stats != NULL) {
+ StoreSSE(it);
+ enc->block_count_[0] += (mb->type_ == 0);
+ enc->block_count_[1] += (mb->type_ == 1);
+ enc->block_count_[2] += (mb->skip_ != 0);
+ }
+
+ if (pic->extra_info != NULL) {
+ uint8_t* const info = &pic->extra_info[it->x_ + it->y_ * enc->mb_w_];
+ switch (pic->extra_info_type) {
+ case 1: *info = mb->type_; break;
+ case 2: *info = mb->segment_; break;
+ case 3: *info = enc->dqm_[mb->segment_].quant_; break;
+ case 4: *info = (mb->type_ == 1) ? it->preds_[0] : 0xff; break;
+ case 5: *info = mb->uv_mode_; break;
+ case 6: {
+ const int b = (int)((it->luma_bits_ + it->uv_bits_ + 7) >> 3);
+ *info = (b > 255) ? 255 : b; break;
+ }
+ case 7: *info = mb->alpha_; break;
+ default: *info = 0; break;
+ }
+ }
+#if SEGMENT_VISU // visualize segments and prediction modes
+ SetBlock(it->yuv_out_ + Y_OFF_ENC, mb->segment_ * 64, 16);
+ SetBlock(it->yuv_out_ + U_OFF_ENC, it->preds_[0] * 64, 8);
+ SetBlock(it->yuv_out_ + V_OFF_ENC, mb->uv_mode_ * 64, 8);
+#endif
+}
+
+static void ResetSideInfo(const VP8EncIterator* const it) {
+ VP8Encoder* const enc = it->enc_;
+ WebPPicture* const pic = enc->pic_;
+ if (pic->stats != NULL) {
+ memset(enc->block_count_, 0, sizeof(enc->block_count_));
+ }
+ ResetSSE(enc);
+}
+#else // defined(WEBP_DISABLE_STATS)
+static void ResetSSE(VP8Encoder* const enc) {
+ (void)enc;
+}
+static void StoreSideInfo(const VP8EncIterator* const it) {
+ VP8Encoder* const enc = it->enc_;
+ WebPPicture* const pic = enc->pic_;
+ if (pic->extra_info != NULL) {
+ if (it->x_ == 0 && it->y_ == 0) { // only do it once, at start
+ memset(pic->extra_info, 0,
+ enc->mb_w_ * enc->mb_h_ * sizeof(*pic->extra_info));
+ }
+ }
+}
+
+static void ResetSideInfo(const VP8EncIterator* const it) {
+ (void)it;
+}
+#endif // !defined(WEBP_DISABLE_STATS)
+
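+// PSNR = 10 * log10(255^2 / MSE), with MSE = mse / size.
+// Returns 99dB when either input is zero.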
+static double GetPSNR(uint64_t mse, uint64_t size) {
+ return (mse > 0 && size > 0) ? 10. * log10(255. * 255. * size / mse) : 99;
+}
+
+//------------------------------------------------------------------------------
+// StatLoop(): only collects statistics (number of skips, token usage, ...).
+// These are used for deciding optimal probabilities. It also modifies the
+// quantizer value if a target (size, PSNR) was specified.
+
+static void SetLoopParams(VP8Encoder* const enc, float q) {
+ // Make sure the quality parameter is inside valid bounds
+ q = Clamp(q, 0.f, 100.f);
+
+ VP8SetSegmentParams(enc, q); // setup segment quantizations and filters
+ SetSegmentProbas(enc); // compute segment probabilities
+
+ ResetStats(enc);
+ ResetSSE(enc);
+}
+
+static uint64_t OneStatPass(VP8Encoder* const enc, VP8RDLevel rd_opt,
+ int nb_mbs, int percent_delta,
+ PassStats* const s) {
+ VP8EncIterator it;
+ uint64_t size = 0;
+ uint64_t size_p0 = 0;
+ uint64_t distortion = 0;
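+  // 384 = 16 * 16 luma + 2 * 8 * 8 chroma samples per macroblock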
+ const uint64_t pixel_count = nb_mbs * 384;
+
+ VP8IteratorInit(enc, &it);
+ SetLoopParams(enc, s->q);
+ do {
+ VP8ModeScore info;
+ VP8IteratorImport(&it, NULL);
+ if (VP8Decimate(&it, &info, rd_opt)) {
+ // Just record the number of skips and act like skip_proba is not used.
+ ++enc->proba_.nb_skip_;
+ }
+ RecordResiduals(&it, &info);
+ size += info.R + info.H;
+ size_p0 += info.H;
+ distortion += info.D;
+ if (percent_delta && !VP8IteratorProgress(&it, percent_delta)) {
+ return 0;
+ }
+ VP8IteratorSaveBoundary(&it);
+ } while (VP8IteratorNext(&it) && --nb_mbs > 0);
+
+ size_p0 += enc->segment_hdr_.size_;
+ if (s->do_size_search) {
+ size += FinalizeSkipProba(enc);
+ size += FinalizeTokenProbas(&enc->proba_);
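+    // convert from 1/256-bit units to bytes, rounding to nearest
+    // ('>> 11' is a division by 2048 = 8 * 256)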
+ size = ((size + size_p0 + 1024) >> 11) + HEADER_SIZE_ESTIMATE;
+ s->value = (double)size;
+ } else {
+ s->value = GetPSNR(distortion, pixel_count);
+ }
+ return size_p0;
+}
+
+static int StatLoop(VP8Encoder* const enc) {
+ const int method = enc->method_;
+ const int do_search = enc->do_search_;
+ const int fast_probe = ((method == 0 || method == 3) && !do_search);
+ int num_pass_left = enc->config_->pass;
+ const int task_percent = 20;
+ const int percent_per_pass =
+ (task_percent + num_pass_left / 2) / num_pass_left;
+ const int final_percent = enc->percent_ + task_percent;
+ const VP8RDLevel rd_opt =
+ (method >= 3 || do_search) ? RD_OPT_BASIC : RD_OPT_NONE;
+ int nb_mbs = enc->mb_w_ * enc->mb_h_;
+ PassStats stats;
+
+ InitPassStats(enc, &stats);
+ ResetTokenStats(enc);
+
+  // Fast mode: quick analysis pass over a few mbs. Better than nothing.
+ if (fast_probe) {
+ if (method == 3) { // we need more stats for method 3 to be reliable.
+ nb_mbs = (nb_mbs > 200) ? nb_mbs >> 1 : 100;
+ } else {
+ nb_mbs = (nb_mbs > 200) ? nb_mbs >> 2 : 50;
+ }
+ }
+
+ while (num_pass_left-- > 0) {
+ const int is_last_pass = (fabs(stats.dq) <= DQ_LIMIT) ||
+ (num_pass_left == 0) ||
+ (enc->max_i4_header_bits_ == 0);
+ const uint64_t size_p0 =
+ OneStatPass(enc, rd_opt, nb_mbs, percent_per_pass, &stats);
+ if (size_p0 == 0) return 0;
+#if (DEBUG_SEARCH > 0)
+ printf("#%d value:%.1lf -> %.1lf q:%.2f -> %.2f\n",
+ num_pass_left, stats.last_value, stats.value, stats.last_q, stats.q);
+#endif
+ if (enc->max_i4_header_bits_ > 0 && size_p0 > PARTITION0_SIZE_LIMIT) {
+ ++num_pass_left;
+ enc->max_i4_header_bits_ >>= 1; // strengthen header bit limitation...
+ continue; // ...and start over
+ }
+ if (is_last_pass) {
+ break;
+ }
+    // If no target size: just do several passes without changing 'q'
+ if (do_search) {
+ ComputeNextQ(&stats);
+ if (fabs(stats.dq) <= DQ_LIMIT) break;
+ }
+ }
+ if (!do_search || !stats.do_size_search) {
+ // Need to finalize probas now, since it wasn't done during the search.
+ FinalizeSkipProba(enc);
+ FinalizeTokenProbas(&enc->proba_);
+ }
+ VP8CalculateLevelCosts(&enc->proba_); // finalize costs
+ return WebPReportProgress(enc->pic_, final_percent, &enc->percent_);
+}
+
+//------------------------------------------------------------------------------
+// Main loops
+//
+
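+// Rough average of compressed bytes per macroblock, indexed by the coarse
+// quantizer value (base_quant_ >> 4). Used to pre-size the partition
+// bit-writers.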
+static const uint8_t kAverageBytesPerMB[8] = { 50, 24, 16, 9, 7, 5, 3, 2 };
+
+static int PreLoopInitialize(VP8Encoder* const enc) {
+ int p;
+ int ok = 1;
+ const int average_bytes_per_MB = kAverageBytesPerMB[enc->base_quant_ >> 4];
+ const int bytes_per_parts =
+ enc->mb_w_ * enc->mb_h_ * average_bytes_per_MB / enc->num_parts_;
+ // Initialize the bit-writers
+ for (p = 0; ok && p < enc->num_parts_; ++p) {
+ ok = VP8BitWriterInit(enc->parts_ + p, bytes_per_parts);
+ }
+ if (!ok) {
+ VP8EncFreeBitWriters(enc); // malloc error occurred
+ WebPEncodingSetError(enc->pic_, VP8_ENC_ERROR_OUT_OF_MEMORY);
+ }
+ return ok;
+}
+
+static int PostLoopFinalize(VP8EncIterator* const it, int ok) {
+ VP8Encoder* const enc = it->enc_;
+ if (ok) { // Finalize the partitions, check for extra errors.
+ int p;
+ for (p = 0; p < enc->num_parts_; ++p) {
+ VP8BitWriterFinish(enc->parts_ + p);
+ ok &= !enc->parts_[p].error_;
+ }
+ }
+
+ if (ok) { // All good. Finish up.
+#if !defined(WEBP_DISABLE_STATS)
+ if (enc->pic_->stats != NULL) { // finalize byte counters...
+ int i, s;
+ for (i = 0; i <= 2; ++i) {
+ for (s = 0; s < NUM_MB_SEGMENTS; ++s) {
+ enc->residual_bytes_[i][s] = (int)((it->bit_count_[s][i] + 7) >> 3);
+ }
+ }
+ }
+#endif
+ VP8AdjustFilterStrength(it); // ...and store filter stats.
+ } else {
+ // Something bad happened -> need to do some memory cleanup.
+ VP8EncFreeBitWriters(enc);
+ }
+ return ok;
+}
+
+//------------------------------------------------------------------------------
+// VP8EncLoop(): does the final bitstream coding.
+
+static void ResetAfterSkip(VP8EncIterator* const it) {
+ if (it->mb_->type_ == 1) {
+ *it->nz_ = 0; // reset all predictors
+ it->left_nz_[8] = 0;
+ } else {
+ *it->nz_ &= (1 << 24); // preserve the dc_nz bit
+ }
+}
+
+int VP8EncLoop(VP8Encoder* const enc) {
+ VP8EncIterator it;
+ int ok = PreLoopInitialize(enc);
+ if (!ok) return 0;
+
+ StatLoop(enc); // stats-collection loop
+
+ VP8IteratorInit(enc, &it);
+ VP8InitFilter(&it);
+ do {
+ VP8ModeScore info;
+ const int dont_use_skip = !enc->proba_.use_skip_proba_;
+ const VP8RDLevel rd_opt = enc->rd_opt_level_;
+
+ VP8IteratorImport(&it, NULL);
+ // Warning! order is important: first call VP8Decimate() and
+ // *then* decide how to code the skip decision if there's one.
+ if (!VP8Decimate(&it, &info, rd_opt) || dont_use_skip) {
+ CodeResiduals(it.bw_, &it, &info);
+ } else { // reset predictors after a skip
+ ResetAfterSkip(&it);
+ }
+ StoreSideInfo(&it);
+ VP8StoreFilterStats(&it);
+ VP8IteratorExport(&it);
+ ok = VP8IteratorProgress(&it, 20);
+ VP8IteratorSaveBoundary(&it);
+ } while (ok && VP8IteratorNext(&it));
+
+ return PostLoopFinalize(&it, ok);
+}
+
+//------------------------------------------------------------------------------
+// Single pass using Token Buffer.
+
+#if !defined(DISABLE_TOKEN_BUFFER)
+
+#define MIN_COUNT 96 // minimum number of macroblocks before updating stats
+
+int VP8EncTokenLoop(VP8Encoder* const enc) {
+ // Roughly refresh the proba eight times per pass
+ int max_count = (enc->mb_w_ * enc->mb_h_) >> 3;
+ int num_pass_left = enc->config_->pass;
+ int remaining_progress = 40; // percents
+ const int do_search = enc->do_search_;
+ VP8EncIterator it;
+ VP8EncProba* const proba = &enc->proba_;
+ const VP8RDLevel rd_opt = enc->rd_opt_level_;
+ const uint64_t pixel_count = enc->mb_w_ * enc->mb_h_ * 384;
+ PassStats stats;
+ int ok;
+
+ InitPassStats(enc, &stats);
+ ok = PreLoopInitialize(enc);
+ if (!ok) return 0;
+
+ if (max_count < MIN_COUNT) max_count = MIN_COUNT;
+
+ assert(enc->num_parts_ == 1);
+ assert(enc->use_tokens_);
+ assert(proba->use_skip_proba_ == 0);
+ assert(rd_opt >= RD_OPT_BASIC); // otherwise, token-buffer won't be useful
+ assert(num_pass_left > 0);
+
+ while (ok && num_pass_left-- > 0) {
+ const int is_last_pass = (fabs(stats.dq) <= DQ_LIMIT) ||
+ (num_pass_left == 0) ||
+ (enc->max_i4_header_bits_ == 0);
+ uint64_t size_p0 = 0;
+ uint64_t distortion = 0;
+ int cnt = max_count;
+ // The final number of passes is not trivial to know in advance.
+ const int pass_progress = remaining_progress / (2 + num_pass_left);
+ remaining_progress -= pass_progress;
+ VP8IteratorInit(enc, &it);
+ SetLoopParams(enc, stats.q);
+ if (is_last_pass) {
+ ResetTokenStats(enc);
+ VP8InitFilter(&it); // don't collect stats until last pass (too costly)
+ }
+ VP8TBufferClear(&enc->tokens_);
+ do {
+ VP8ModeScore info;
+ VP8IteratorImport(&it, NULL);
+ if (--cnt < 0) {
+ FinalizeTokenProbas(proba);
+ VP8CalculateLevelCosts(proba); // refresh cost tables for rd-opt
+ cnt = max_count;
+ }
+ VP8Decimate(&it, &info, rd_opt);
+ ok = RecordTokens(&it, &info, &enc->tokens_);
+ if (!ok) {
+ WebPEncodingSetError(enc->pic_, VP8_ENC_ERROR_OUT_OF_MEMORY);
+ break;
+ }
+ size_p0 += info.H;
+ distortion += info.D;
+ if (is_last_pass) {
+ StoreSideInfo(&it);
+ VP8StoreFilterStats(&it);
+ VP8IteratorExport(&it);
+ ok = VP8IteratorProgress(&it, pass_progress);
+ }
+ VP8IteratorSaveBoundary(&it);
+ } while (ok && VP8IteratorNext(&it));
+ if (!ok) break;
+
+ size_p0 += enc->segment_hdr_.size_;
+ if (stats.do_size_search) {
+ uint64_t size = FinalizeTokenProbas(&enc->proba_);
+ size += VP8EstimateTokenSize(&enc->tokens_,
+ (const uint8_t*)proba->coeffs_);
+ size = (size + size_p0 + 1024) >> 11; // -> size in bytes
+ size += HEADER_SIZE_ESTIMATE;
+ stats.value = (double)size;
+ } else { // compute and store PSNR
+ stats.value = GetPSNR(distortion, pixel_count);
+ }
+
+#if (DEBUG_SEARCH > 0)
+ printf("#%2d metric:%.1lf -> %.1lf last_q=%.2lf q=%.2lf dq=%.2lf "
+ " range:[%.1f, %.1f]\n",
+ num_pass_left, stats.last_value, stats.value,
+ stats.last_q, stats.q, stats.dq, stats.qmin, stats.qmax);
+#endif
+ if (enc->max_i4_header_bits_ > 0 && size_p0 > PARTITION0_SIZE_LIMIT) {
+ ++num_pass_left;
+ enc->max_i4_header_bits_ >>= 1; // strengthen header bit limitation...
+ if (is_last_pass) {
+ ResetSideInfo(&it);
+ }
+ continue; // ...and start over
+ }
+ if (is_last_pass) {
+ break; // done
+ }
+ if (do_search) {
+ ComputeNextQ(&stats); // Adjust q
+ }
+ }
+ if (ok) {
+ if (!stats.do_size_search) {
+ FinalizeTokenProbas(&enc->proba_);
+ }
+ ok = VP8EmitTokens(&enc->tokens_, enc->parts_ + 0,
+ (const uint8_t*)proba->coeffs_, 1);
+ }
+ ok = ok && WebPReportProgress(enc->pic_, enc->percent_ + remaining_progress,
+ &enc->percent_);
+ return PostLoopFinalize(&it, ok);
+}
+
+#else
+
+int VP8EncTokenLoop(VP8Encoder* const enc) {
+ (void)enc;
+ return 0; // we shouldn't be here.
+}
+
+#endif // DISABLE_TOKEN_BUFFER
+
+//------------------------------------------------------------------------------
diff --git a/media/libwebp/enc/histogram_enc.c b/media/libwebp/enc/histogram_enc.c
new file mode 100644
index 0000000000..83f218bcb4
--- /dev/null
+++ b/media/libwebp/enc/histogram_enc.c
@@ -0,0 +1,1252 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Author: Jyrki Alakuijala (jyrki@google.com)
+//
+#ifdef HAVE_CONFIG_H
+#include "../webp/config.h"
+#endif
+
+#include <math.h>
+
+#include "../enc/backward_references_enc.h"
+#include "../enc/histogram_enc.h"
+#include "../dsp/lossless.h"
+#include "../dsp/lossless_common.h"
+#include "../utils/utils.h"
+
+#define MAX_COST 1.e38
+
+// Number of partitions for the three dominant (literal, red and blue) symbol
+// costs.
+#define NUM_PARTITIONS 4
+// The size of the bin-hash corresponding to the three dominant costs.
+#define BIN_SIZE (NUM_PARTITIONS * NUM_PARTITIONS * NUM_PARTITIONS)
+// Maximum number of histograms allowed in greedy combining algorithm.
+#define MAX_HISTO_GREEDY 100
+
+static void HistogramClear(VP8LHistogram* const p) {
+ uint32_t* const literal = p->literal_;
+ const int cache_bits = p->palette_code_bits_;
+ const int histo_size = VP8LGetHistogramSize(cache_bits);
+ memset(p, 0, histo_size);
+ p->palette_code_bits_ = cache_bits;
+ p->literal_ = literal;
+}
+
+// Swap two histogram pointers.
+static void HistogramSwap(VP8LHistogram** const A, VP8LHistogram** const B) {
+ VP8LHistogram* const tmp = *A;
+ *A = *B;
+ *B = tmp;
+}
+
+static void HistogramCopy(const VP8LHistogram* const src,
+ VP8LHistogram* const dst) {
+ uint32_t* const dst_literal = dst->literal_;
+ const int dst_cache_bits = dst->palette_code_bits_;
+ const int literal_size = VP8LHistogramNumCodes(dst_cache_bits);
+ const int histo_size = VP8LGetHistogramSize(dst_cache_bits);
+ assert(src->palette_code_bits_ == dst_cache_bits);
+ memcpy(dst, src, histo_size);
+ dst->literal_ = dst_literal;
+ memcpy(dst->literal_, src->literal_, literal_size * sizeof(*dst->literal_));
+}
+
+int VP8LGetHistogramSize(int cache_bits) {
+ const int literal_size = VP8LHistogramNumCodes(cache_bits);
+ const size_t total_size = sizeof(VP8LHistogram) + sizeof(int) * literal_size;
+ assert(total_size <= (size_t)0x7fffffff);
+ return (int)total_size;
+}
+
+void VP8LFreeHistogram(VP8LHistogram* const histo) {
+ WebPSafeFree(histo);
+}
+
+void VP8LFreeHistogramSet(VP8LHistogramSet* const histo) {
+ WebPSafeFree(histo);
+}
+
+void VP8LHistogramStoreRefs(const VP8LBackwardRefs* const refs,
+ VP8LHistogram* const histo) {
+ VP8LRefsCursor c = VP8LRefsCursorInit(refs);
+ while (VP8LRefsCursorOk(&c)) {
+ VP8LHistogramAddSinglePixOrCopy(histo, c.cur_pos, NULL, 0);
+ VP8LRefsCursorNext(&c);
+ }
+}
+
+void VP8LHistogramCreate(VP8LHistogram* const p,
+ const VP8LBackwardRefs* const refs,
+ int palette_code_bits) {
+ if (palette_code_bits >= 0) {
+ p->palette_code_bits_ = palette_code_bits;
+ }
+ HistogramClear(p);
+ VP8LHistogramStoreRefs(refs, p);
+}
+
+void VP8LHistogramInit(VP8LHistogram* const p, int palette_code_bits,
+ int init_arrays) {
+ p->palette_code_bits_ = palette_code_bits;
+ if (init_arrays) {
+ HistogramClear(p);
+ } else {
+ p->trivial_symbol_ = 0;
+ p->bit_cost_ = 0.;
+ p->literal_cost_ = 0.;
+ p->red_cost_ = 0.;
+ p->blue_cost_ = 0.;
+ memset(p->is_used_, 0, sizeof(p->is_used_));
+ }
+}
+
+VP8LHistogram* VP8LAllocateHistogram(int cache_bits) {
+ VP8LHistogram* histo = NULL;
+ const int total_size = VP8LGetHistogramSize(cache_bits);
+ uint8_t* const memory = (uint8_t*)WebPSafeMalloc(total_size, sizeof(*memory));
+ if (memory == NULL) return NULL;
+ histo = (VP8LHistogram*)memory;
+  // literal_ won't necessarily be aligned.
+ histo->literal_ = (uint32_t*)(memory + sizeof(VP8LHistogram));
+ VP8LHistogramInit(histo, cache_bits, /*init_arrays=*/ 0);
+ return histo;
+}
+
+// Resets the pointers of the histograms to point to the bit buffer in the set.
+static void HistogramSetResetPointers(VP8LHistogramSet* const set,
+ int cache_bits) {
+ int i;
+ const int histo_size = VP8LGetHistogramSize(cache_bits);
+ uint8_t* memory = (uint8_t*) (set->histograms);
+ memory += set->max_size * sizeof(*set->histograms);
+ for (i = 0; i < set->max_size; ++i) {
+ memory = (uint8_t*) WEBP_ALIGN(memory);
+ set->histograms[i] = (VP8LHistogram*) memory;
+    // literal_ won't necessarily be aligned.
+ set->histograms[i]->literal_ = (uint32_t*)(memory + sizeof(VP8LHistogram));
+ memory += histo_size;
+ }
+}
+
+// Returns the total size of the VP8LHistogramSet.
+static size_t HistogramSetTotalSize(int size, int cache_bits) {
+ const int histo_size = VP8LGetHistogramSize(cache_bits);
+ return (sizeof(VP8LHistogramSet) + size * (sizeof(VP8LHistogram*) +
+ histo_size + WEBP_ALIGN_CST));
+}
+
+VP8LHistogramSet* VP8LAllocateHistogramSet(int size, int cache_bits) {
+ int i;
+ VP8LHistogramSet* set;
+ const size_t total_size = HistogramSetTotalSize(size, cache_bits);
+ uint8_t* memory = (uint8_t*)WebPSafeMalloc(total_size, sizeof(*memory));
+ if (memory == NULL) return NULL;
+
+ set = (VP8LHistogramSet*)memory;
+ memory += sizeof(*set);
+ set->histograms = (VP8LHistogram**)memory;
+ set->max_size = size;
+ set->size = size;
+ HistogramSetResetPointers(set, cache_bits);
+ for (i = 0; i < size; ++i) {
+ VP8LHistogramInit(set->histograms[i], cache_bits, /*init_arrays=*/ 0);
+ }
+ return set;
+}
+
+void VP8LHistogramSetClear(VP8LHistogramSet* const set) {
+ int i;
+ const int cache_bits = set->histograms[0]->palette_code_bits_;
+ const int size = set->max_size;
+ const size_t total_size = HistogramSetTotalSize(size, cache_bits);
+ uint8_t* memory = (uint8_t*)set;
+
+ memset(memory, 0, total_size);
+ memory += sizeof(*set);
+ set->histograms = (VP8LHistogram**)memory;
+ set->max_size = size;
+ set->size = size;
+ HistogramSetResetPointers(set, cache_bits);
+ for (i = 0; i < size; ++i) {
+ set->histograms[i]->palette_code_bits_ = cache_bits;
+ }
+}
+
+// Removes the histogram 'i' from 'set' by setting it to NULL.
+static void HistogramSetRemoveHistogram(VP8LHistogramSet* const set, int i,
+ int* const num_used) {
+ assert(set->histograms[i] != NULL);
+ set->histograms[i] = NULL;
+ --*num_used;
+ // If we remove the last valid one, shrink until the next valid one.
+ if (i == set->size - 1) {
+ while (set->size >= 1 && set->histograms[set->size - 1] == NULL) {
+ --set->size;
+ }
+ }
+}
+
+// -----------------------------------------------------------------------------
+
+void VP8LHistogramAddSinglePixOrCopy(VP8LHistogram* const histo,
+ const PixOrCopy* const v,
+ int (*const distance_modifier)(int, int),
+ int distance_modifier_arg0) {
+ if (PixOrCopyIsLiteral(v)) {
+ ++histo->alpha_[PixOrCopyLiteral(v, 3)];
+ ++histo->red_[PixOrCopyLiteral(v, 2)];
+ ++histo->literal_[PixOrCopyLiteral(v, 1)];
+ ++histo->blue_[PixOrCopyLiteral(v, 0)];
+ } else if (PixOrCopyIsCacheIdx(v)) {
+ const int literal_ix =
+ NUM_LITERAL_CODES + NUM_LENGTH_CODES + PixOrCopyCacheIdx(v);
+ assert(histo->palette_code_bits_ != 0);
+ ++histo->literal_[literal_ix];
+ } else {
+ int code, extra_bits;
+ VP8LPrefixEncodeBits(PixOrCopyLength(v), &code, &extra_bits);
+ ++histo->literal_[NUM_LITERAL_CODES + code];
+ if (distance_modifier == NULL) {
+ VP8LPrefixEncodeBits(PixOrCopyDistance(v), &code, &extra_bits);
+ } else {
+ VP8LPrefixEncodeBits(
+ distance_modifier(distance_modifier_arg0, PixOrCopyDistance(v)),
+ &code, &extra_bits);
+ }
+ ++histo->distance_[code];
+ }
+}
+
+// -----------------------------------------------------------------------------
+// Entropy-related functions.
+
+static WEBP_INLINE double BitsEntropyRefine(const VP8LBitEntropy* entropy) {
+ double mix;
+ if (entropy->nonzeros < 5) {
+ if (entropy->nonzeros <= 1) {
+ return 0;
+ }
+ // Two symbols, they will be 0 and 1 in a Huffman code.
+ // Let's mix in a bit of entropy to favor good clustering when
+ // distributions of these are combined.
+ if (entropy->nonzeros == 2) {
+ return 0.99 * entropy->sum + 0.01 * entropy->entropy;
+ }
+ // No matter what the entropy says, we cannot be better than min_limit
+ // with Huffman coding. I am mixing a bit of entropy into the
+ // min_limit since it produces much better (~0.5 %) compression results
+ // perhaps because of better entropy clustering.
+ if (entropy->nonzeros == 3) {
+ mix = 0.95;
+ } else {
+ mix = 0.7; // nonzeros == 4.
+ }
+ } else {
+ mix = 0.627;
+ }
+
+ {
+ double min_limit = 2 * entropy->sum - entropy->max_val;
+ min_limit = mix * min_limit + (1.0 - mix) * entropy->entropy;
+ return (entropy->entropy < min_limit) ? min_limit : entropy->entropy;
+ }
+}
+
+double VP8LBitsEntropy(const uint32_t* const array, int n) {
+ VP8LBitEntropy entropy;
+ VP8LBitsEntropyUnrefined(array, n, &entropy);
+
+ return BitsEntropyRefine(&entropy);
+}
+
+static double InitialHuffmanCost(void) {
+ // Small bias because Huffman code length is typically not stored in
+ // full length.
+ static const int kHuffmanCodeOfHuffmanCodeSize = CODE_LENGTH_CODES * 3;
+ static const double kSmallBias = 9.1;
+ return kHuffmanCodeOfHuffmanCodeSize - kSmallBias;
+}
+
+// Finalize the Huffman cost based on streak numbers and length type (<3 or >=3)
+static double FinalHuffmanCost(const VP8LStreaks* const stats) {
+ // The constants in this function are experimental and got rounded from
+ // their original values in 1/8 when switched to 1/1024.
+ double retval = InitialHuffmanCost();
+  // First coefficient: many zeros in the histogram are covered efficiently
+ // by a run-length encode. Originally 2/8.
+ retval += stats->counts[0] * 1.5625 + 0.234375 * stats->streaks[0][1];
+ // Second coefficient: Constant values are encoded less efficiently, but still
+ // RLE'ed. Originally 6/8.
+ retval += stats->counts[1] * 2.578125 + 0.703125 * stats->streaks[1][1];
+ // 0s are usually encoded more efficiently than non-0s.
+ // Originally 15/8.
+ retval += 1.796875 * stats->streaks[0][0];
+ // Originally 26/8.
+ retval += 3.28125 * stats->streaks[1][0];
+ return retval;
+}
+
+// Get the symbol entropy for the distribution 'population'.
+// Set 'trivial_sym' if there's only one symbol present in the distribution.
+static double PopulationCost(const uint32_t* const population, int length,
+ uint32_t* const trivial_sym,
+ uint8_t* const is_used) {
+ VP8LBitEntropy bit_entropy;
+ VP8LStreaks stats;
+ VP8LGetEntropyUnrefined(population, length, &bit_entropy, &stats);
+ if (trivial_sym != NULL) {
+ *trivial_sym = (bit_entropy.nonzeros == 1) ? bit_entropy.nonzero_code
+ : VP8L_NON_TRIVIAL_SYM;
+ }
+ // The histogram is used if there is at least one non-zero streak.
+ *is_used = (stats.streaks[1][0] != 0 || stats.streaks[1][1] != 0);
+
+ return BitsEntropyRefine(&bit_entropy) + FinalHuffmanCost(&stats);
+}
+
+// trivial_at_end is 1 if each of the two histograms has only one non-zero
+// element: either both at index 0, or both at the last index.
+static WEBP_INLINE double GetCombinedEntropy(const uint32_t* const X,
+ const uint32_t* const Y,
+ int length, int is_X_used,
+ int is_Y_used,
+ int trivial_at_end) {
+ VP8LStreaks stats;
+ if (trivial_at_end) {
+ // This configuration is due to palettization that transforms an indexed
+ // pixel into 0xff000000 | (pixel << 8) in VP8LBundleColorMap.
+ // BitsEntropyRefine is 0 for histograms with only one non-zero value.
+ // Only FinalHuffmanCost needs to be evaluated.
+ memset(&stats, 0, sizeof(stats));
+ // Deal with the non-zero value at index 0 or length-1.
+ stats.streaks[1][0] = 1;
+ // Deal with the following/previous zero streak.
+ stats.counts[0] = 1;
+ stats.streaks[0][1] = length - 1;
+ return FinalHuffmanCost(&stats);
+ } else {
+ VP8LBitEntropy bit_entropy;
+ if (is_X_used) {
+ if (is_Y_used) {
+ VP8LGetCombinedEntropyUnrefined(X, Y, length, &bit_entropy, &stats);
+ } else {
+ VP8LGetEntropyUnrefined(X, length, &bit_entropy, &stats);
+ }
+ } else {
+ if (is_Y_used) {
+ VP8LGetEntropyUnrefined(Y, length, &bit_entropy, &stats);
+ } else {
+ memset(&stats, 0, sizeof(stats));
+ stats.counts[0] = 1;
+ stats.streaks[0][length > 3] = length;
+ VP8LBitEntropyInit(&bit_entropy);
+ }
+ }
+
+ return BitsEntropyRefine(&bit_entropy) + FinalHuffmanCost(&stats);
+ }
+}
+
+// Estimates the Entropy + Huffman + other block overhead size cost.
+double VP8LHistogramEstimateBits(VP8LHistogram* const p) {
+ return
+ PopulationCost(p->literal_, VP8LHistogramNumCodes(p->palette_code_bits_),
+ NULL, &p->is_used_[0])
+ + PopulationCost(p->red_, NUM_LITERAL_CODES, NULL, &p->is_used_[1])
+ + PopulationCost(p->blue_, NUM_LITERAL_CODES, NULL, &p->is_used_[2])
+ + PopulationCost(p->alpha_, NUM_LITERAL_CODES, NULL, &p->is_used_[3])
+ + PopulationCost(p->distance_, NUM_DISTANCE_CODES, NULL, &p->is_used_[4])
+ + VP8LExtraCost(p->literal_ + NUM_LITERAL_CODES, NUM_LENGTH_CODES)
+ + VP8LExtraCost(p->distance_, NUM_DISTANCE_CODES);
+}
+
+// -----------------------------------------------------------------------------
+// Various histogram combine/cost-eval functions
+
+static int GetCombinedHistogramEntropy(const VP8LHistogram* const a,
+ const VP8LHistogram* const b,
+ double cost_threshold,
+ double* cost) {
+ const int palette_code_bits = a->palette_code_bits_;
+ int trivial_at_end = 0;
+ assert(a->palette_code_bits_ == b->palette_code_bits_);
+ *cost += GetCombinedEntropy(a->literal_, b->literal_,
+ VP8LHistogramNumCodes(palette_code_bits),
+ a->is_used_[0], b->is_used_[0], 0);
+ *cost += VP8LExtraCostCombined(a->literal_ + NUM_LITERAL_CODES,
+ b->literal_ + NUM_LITERAL_CODES,
+ NUM_LENGTH_CODES);
+ if (*cost > cost_threshold) return 0;
+
+ if (a->trivial_symbol_ != VP8L_NON_TRIVIAL_SYM &&
+ a->trivial_symbol_ == b->trivial_symbol_) {
+ // A, R and B are all 0 or 0xff.
+ const uint32_t color_a = (a->trivial_symbol_ >> 24) & 0xff;
+ const uint32_t color_r = (a->trivial_symbol_ >> 16) & 0xff;
+ const uint32_t color_b = (a->trivial_symbol_ >> 0) & 0xff;
+ if ((color_a == 0 || color_a == 0xff) &&
+ (color_r == 0 || color_r == 0xff) &&
+ (color_b == 0 || color_b == 0xff)) {
+ trivial_at_end = 1;
+ }
+ }
+
+ *cost +=
+ GetCombinedEntropy(a->red_, b->red_, NUM_LITERAL_CODES, a->is_used_[1],
+ b->is_used_[1], trivial_at_end);
+ if (*cost > cost_threshold) return 0;
+
+ *cost +=
+ GetCombinedEntropy(a->blue_, b->blue_, NUM_LITERAL_CODES, a->is_used_[2],
+ b->is_used_[2], trivial_at_end);
+ if (*cost > cost_threshold) return 0;
+
+ *cost +=
+ GetCombinedEntropy(a->alpha_, b->alpha_, NUM_LITERAL_CODES,
+ a->is_used_[3], b->is_used_[3], trivial_at_end);
+ if (*cost > cost_threshold) return 0;
+
+ *cost +=
+ GetCombinedEntropy(a->distance_, b->distance_, NUM_DISTANCE_CODES,
+ a->is_used_[4], b->is_used_[4], 0);
+ *cost +=
+ VP8LExtraCostCombined(a->distance_, b->distance_, NUM_DISTANCE_CODES);
+ if (*cost > cost_threshold) return 0;
+
+ return 1;
+}
+
+static WEBP_INLINE void HistogramAdd(const VP8LHistogram* const a,
+ const VP8LHistogram* const b,
+ VP8LHistogram* const out) {
+ VP8LHistogramAdd(a, b, out);
+ out->trivial_symbol_ = (a->trivial_symbol_ == b->trivial_symbol_)
+ ? a->trivial_symbol_
+ : VP8L_NON_TRIVIAL_SYM;
+}
+
+// Performs out = a + b, computing the cost C(a+b) - C(a) - C(b) while comparing
+// to the threshold value 'cost_threshold'. The score returned is
+// Score = C(a+b) - C(a) - C(b), where C(a) + C(b) is known and fixed.
+// Since the previous score passed is 'cost_threshold', we only need to compare
+// the partial cost against 'cost_threshold + C(a) + C(b)' to possibly bail-out
+// early.
+static double HistogramAddEval(const VP8LHistogram* const a,
+ const VP8LHistogram* const b,
+ VP8LHistogram* const out,
+ double cost_threshold) {
+ double cost = 0;
+ const double sum_cost = a->bit_cost_ + b->bit_cost_;
+ cost_threshold += sum_cost;
+
+ if (GetCombinedHistogramEntropy(a, b, cost_threshold, &cost)) {
+ HistogramAdd(a, b, out);
+ out->bit_cost_ = cost;
+ out->palette_code_bits_ = a->palette_code_bits_;
+ }
+
+ return cost - sum_cost;
+}
+
+// Same as HistogramAddEval(), except that the resulting histogram
+// is not stored. Only the cost C(a+b) - C(a) is evaluated. We omit
+// the term C(b) which is constant over all the evaluations.
+static double HistogramAddThresh(const VP8LHistogram* const a,
+ const VP8LHistogram* const b,
+ double cost_threshold) {
+ double cost;
+ assert(a != NULL && b != NULL);
+ cost = -a->bit_cost_;
+ GetCombinedHistogramEntropy(a, b, cost_threshold, &cost);
+ return cost;
+}
+
+// -----------------------------------------------------------------------------
+
+// The structure to keep track of cost range for the three dominant entropy
+// symbols.
+// TODO(skal): Evaluate if float can be used here instead of double for
+// representing the entropy costs.
+typedef struct {
+ double literal_max_;
+ double literal_min_;
+ double red_max_;
+ double red_min_;
+ double blue_max_;
+ double blue_min_;
+} DominantCostRange;
+
+static void DominantCostRangeInit(DominantCostRange* const c) {
+ c->literal_max_ = 0.;
+ c->literal_min_ = MAX_COST;
+ c->red_max_ = 0.;
+ c->red_min_ = MAX_COST;
+ c->blue_max_ = 0.;
+ c->blue_min_ = MAX_COST;
+}
+
+static void UpdateDominantCostRange(
+ const VP8LHistogram* const h, DominantCostRange* const c) {
+ if (c->literal_max_ < h->literal_cost_) c->literal_max_ = h->literal_cost_;
+ if (c->literal_min_ > h->literal_cost_) c->literal_min_ = h->literal_cost_;
+ if (c->red_max_ < h->red_cost_) c->red_max_ = h->red_cost_;
+ if (c->red_min_ > h->red_cost_) c->red_min_ = h->red_cost_;
+ if (c->blue_max_ < h->blue_cost_) c->blue_max_ = h->blue_cost_;
+ if (c->blue_min_ > h->blue_cost_) c->blue_min_ = h->blue_cost_;
+}
+
+static void UpdateHistogramCost(VP8LHistogram* const h) {
+ uint32_t alpha_sym, red_sym, blue_sym;
+ const double alpha_cost =
+ PopulationCost(h->alpha_, NUM_LITERAL_CODES, &alpha_sym,
+ &h->is_used_[3]);
+ const double distance_cost =
+ PopulationCost(h->distance_, NUM_DISTANCE_CODES, NULL, &h->is_used_[4]) +
+ VP8LExtraCost(h->distance_, NUM_DISTANCE_CODES);
+ const int num_codes = VP8LHistogramNumCodes(h->palette_code_bits_);
+ h->literal_cost_ =
+ PopulationCost(h->literal_, num_codes, NULL, &h->is_used_[0]) +
+ VP8LExtraCost(h->literal_ + NUM_LITERAL_CODES, NUM_LENGTH_CODES);
+ h->red_cost_ =
+ PopulationCost(h->red_, NUM_LITERAL_CODES, &red_sym, &h->is_used_[1]);
+ h->blue_cost_ =
+ PopulationCost(h->blue_, NUM_LITERAL_CODES, &blue_sym, &h->is_used_[2]);
+ h->bit_cost_ = h->literal_cost_ + h->red_cost_ + h->blue_cost_ +
+ alpha_cost + distance_cost;
+ if ((alpha_sym | red_sym | blue_sym) == VP8L_NON_TRIVIAL_SYM) {
+ h->trivial_symbol_ = VP8L_NON_TRIVIAL_SYM;
+ } else {
+ h->trivial_symbol_ =
+ ((uint32_t)alpha_sym << 24) | (red_sym << 16) | (blue_sym << 0);
+ }
+}
+
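+// Maps 'val' linearly from the [min, max] range to a bin index in
+// [0, NUM_PARTITIONS - 1].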
+static int GetBinIdForEntropy(double min, double max, double val) {
+ const double range = max - min;
+ if (range > 0.) {
+ const double delta = val - min;
+ return (int)((NUM_PARTITIONS - 1e-6) * delta / range);
+ } else {
+ return 0;
+ }
+}
+
+static int GetHistoBinIndex(const VP8LHistogram* const h,
+ const DominantCostRange* const c, int low_effort) {
+ int bin_id = GetBinIdForEntropy(c->literal_min_, c->literal_max_,
+ h->literal_cost_);
+ assert(bin_id < NUM_PARTITIONS);
+ if (!low_effort) {
+ bin_id = bin_id * NUM_PARTITIONS
+ + GetBinIdForEntropy(c->red_min_, c->red_max_, h->red_cost_);
+ bin_id = bin_id * NUM_PARTITIONS
+ + GetBinIdForEntropy(c->blue_min_, c->blue_max_, h->blue_cost_);
+ assert(bin_id < BIN_SIZE);
+ }
+ return bin_id;
+}
+
+// Construct the histograms from backward references.
+static void HistogramBuild(
+ int xsize, int histo_bits, const VP8LBackwardRefs* const backward_refs,
+ VP8LHistogramSet* const image_histo) {
+ int x = 0, y = 0;
+ const int histo_xsize = VP8LSubSampleSize(xsize, histo_bits);
+ VP8LHistogram** const histograms = image_histo->histograms;
+ VP8LRefsCursor c = VP8LRefsCursorInit(backward_refs);
+ assert(histo_bits > 0);
+ VP8LHistogramSetClear(image_histo);
+ while (VP8LRefsCursorOk(&c)) {
+ const PixOrCopy* const v = c.cur_pos;
+ const int ix = (y >> histo_bits) * histo_xsize + (x >> histo_bits);
+ VP8LHistogramAddSinglePixOrCopy(histograms[ix], v, NULL, 0);
+ x += PixOrCopyLength(v);
+ while (x >= xsize) {
+ x -= xsize;
+ ++y;
+ }
+ VP8LRefsCursorNext(&c);
+ }
+}
+
+// Copies the histograms and computes their bit costs.
+static const uint16_t kInvalidHistogramSymbol = (uint16_t)(-1);
+static void HistogramCopyAndAnalyze(VP8LHistogramSet* const orig_histo,
+ VP8LHistogramSet* const image_histo,
+ int* const num_used,
+ uint16_t* const histogram_symbols) {
+ int i, cluster_id;
+ int num_used_orig = *num_used;
+ VP8LHistogram** const orig_histograms = orig_histo->histograms;
+ VP8LHistogram** const histograms = image_histo->histograms;
+ assert(image_histo->max_size == orig_histo->max_size);
+ for (cluster_id = 0, i = 0; i < orig_histo->max_size; ++i) {
+ VP8LHistogram* const histo = orig_histograms[i];
+ UpdateHistogramCost(histo);
+
+ // Skip the histogram if it is completely empty, which can happen for tiles
+ // with no information (when they are skipped because of LZ77).
+ if (!histo->is_used_[0] && !histo->is_used_[1] && !histo->is_used_[2]
+ && !histo->is_used_[3] && !histo->is_used_[4]) {
+      // The first histogram is always used. If a histogram is empty, we set
+ // its id to be the same as the previous one: this will improve
+ // compressibility for later LZ77.
+ assert(i > 0);
+ HistogramSetRemoveHistogram(image_histo, i, num_used);
+ HistogramSetRemoveHistogram(orig_histo, i, &num_used_orig);
+ histogram_symbols[i] = kInvalidHistogramSymbol;
+ } else {
+ // Copy histograms from orig_histo[] to image_histo[].
+ HistogramCopy(histo, histograms[i]);
+ histogram_symbols[i] = cluster_id++;
+ assert(cluster_id <= image_histo->max_size);
+ }
+ }
+}
+
+// Partition histograms to different entropy bins for three dominant (literal,
+// red and blue) symbol costs and compute the histogram aggregate bit_cost.
+static void HistogramAnalyzeEntropyBin(VP8LHistogramSet* const image_histo,
+ uint16_t* const bin_map,
+ int low_effort) {
+ int i;
+ VP8LHistogram** const histograms = image_histo->histograms;
+ const int histo_size = image_histo->size;
+ DominantCostRange cost_range;
+ DominantCostRangeInit(&cost_range);
+
+ // Analyze the dominant (literal, red and blue) entropy costs.
+ for (i = 0; i < histo_size; ++i) {
+ if (histograms[i] == NULL) continue;
+ UpdateDominantCostRange(histograms[i], &cost_range);
+ }
+
+  // bin-hash histograms on the three dominant (literal, red and blue)
+  // symbol costs and store the resulting bin_id for each histogram.
+ for (i = 0; i < histo_size; ++i) {
+    // bin_map[i] is not set to a special value as its use will later be
+    // guarded by another (histograms[i] == NULL) check.
+ if (histograms[i] == NULL) continue;
+ bin_map[i] = GetHistoBinIndex(histograms[i], &cost_range, low_effort);
+ }
+}
+
+// Merges some histograms with the same bin_id together if it's advantageous.
+// Sets the remaining histograms to NULL.
+static void HistogramCombineEntropyBin(VP8LHistogramSet* const image_histo,
+ int* num_used,
+ const uint16_t* const clusters,
+ uint16_t* const cluster_mappings,
+ VP8LHistogram* cur_combo,
+ const uint16_t* const bin_map,
+ int num_bins,
+ double combine_cost_factor,
+ int low_effort) {
+ VP8LHistogram** const histograms = image_histo->histograms;
+ int idx;
+ struct {
+ int16_t first; // position of the histogram that accumulates all
+ // histograms with the same bin_id
+ uint16_t num_combine_failures; // number of combine failures per bin_id
+ } bin_info[BIN_SIZE];
+
+ assert(num_bins <= BIN_SIZE);
+ for (idx = 0; idx < num_bins; ++idx) {
+ bin_info[idx].first = -1;
+ bin_info[idx].num_combine_failures = 0;
+ }
+
+ // By default, a cluster matches itself.
+ for (idx = 0; idx < *num_used; ++idx) cluster_mappings[idx] = idx;
+ for (idx = 0; idx < image_histo->size; ++idx) {
+ int bin_id, first;
+ if (histograms[idx] == NULL) continue;
+ bin_id = bin_map[idx];
+ first = bin_info[bin_id].first;
+ if (first == -1) {
+ bin_info[bin_id].first = idx;
+ } else if (low_effort) {
+ HistogramAdd(histograms[idx], histograms[first], histograms[first]);
+ HistogramSetRemoveHistogram(image_histo, idx, num_used);
+ cluster_mappings[clusters[idx]] = clusters[first];
+ } else {
+ // try to merge #idx into #first (both share the same bin_id)
+ const double bit_cost = histograms[idx]->bit_cost_;
+ const double bit_cost_thresh = -bit_cost * combine_cost_factor;
+ const double curr_cost_diff =
+ HistogramAddEval(histograms[first], histograms[idx],
+ cur_combo, bit_cost_thresh);
+ if (curr_cost_diff < bit_cost_thresh) {
+ // Try to merge two histograms only if the combo is a trivial one or
+ // the two candidate histograms are already non-trivial.
+ // For some images, 'try_combine' turns out to be false for a lot of
+        // histogram pairs. In that case, we fall back to combining
+ // histograms as usual to avoid increasing the header size.
+ const int try_combine =
+ (cur_combo->trivial_symbol_ != VP8L_NON_TRIVIAL_SYM) ||
+ ((histograms[idx]->trivial_symbol_ == VP8L_NON_TRIVIAL_SYM) &&
+ (histograms[first]->trivial_symbol_ == VP8L_NON_TRIVIAL_SYM));
+ const int max_combine_failures = 32;
+ if (try_combine ||
+ bin_info[bin_id].num_combine_failures >= max_combine_failures) {
+ // move the (better) merged histogram to its final slot
+ HistogramSwap(&cur_combo, &histograms[first]);
+ HistogramSetRemoveHistogram(image_histo, idx, num_used);
+ cluster_mappings[clusters[idx]] = clusters[first];
+ } else {
+ ++bin_info[bin_id].num_combine_failures;
+ }
+ }
+ }
+ }
+ if (low_effort) {
+ // for low_effort case, update the final cost when everything is merged
+ for (idx = 0; idx < image_histo->size; ++idx) {
+ if (histograms[idx] == NULL) continue;
+ UpdateHistogramCost(histograms[idx]);
+ }
+ }
+}
+
+// Implement a Lehmer random number generator with a multiplicative constant of
+// 48271 and a modulo constant of 2^31 - 1.
+static uint32_t MyRand(uint32_t* const seed) {
+ *seed = (uint32_t)(((uint64_t)(*seed) * 48271u) % 2147483647u);
+ assert(*seed > 0);
+ return *seed;
+}
+
+// -----------------------------------------------------------------------------
+// Histogram pairs priority queue
+
+// Pair of histograms. Negative idx1 value means that pair is out-of-date.
+typedef struct {
+ int idx1;
+ int idx2;
+ double cost_diff;
+ double cost_combo;
+} HistogramPair;
+
+typedef struct {
+ HistogramPair* queue;
+ int size;
+ int max_size;
+} HistoQueue;
+
+static int HistoQueueInit(HistoQueue* const histo_queue, const int max_size) {
+ histo_queue->size = 0;
+ histo_queue->max_size = max_size;
+ // We allocate max_size + 1 because the last element at index "size" is
+ // used as temporary data (and it could be up to max_size).
+ histo_queue->queue = (HistogramPair*)WebPSafeMalloc(
+ histo_queue->max_size + 1, sizeof(*histo_queue->queue));
+ return histo_queue->queue != NULL;
+}
+
+static void HistoQueueClear(HistoQueue* const histo_queue) {
+ assert(histo_queue != NULL);
+ WebPSafeFree(histo_queue->queue);
+ histo_queue->size = 0;
+ histo_queue->max_size = 0;
+}
+
+// Pop a specific pair in the queue by replacing it with the last one
+// and shrinking the queue.
+static void HistoQueuePopPair(HistoQueue* const histo_queue,
+ HistogramPair* const pair) {
+ assert(pair >= histo_queue->queue &&
+ pair < (histo_queue->queue + histo_queue->size));
+ assert(histo_queue->size > 0);
+ *pair = histo_queue->queue[histo_queue->size - 1];
+ --histo_queue->size;
+}
+
+// Check whether a pair in the queue should be updated as head or not.
+static void HistoQueueUpdateHead(HistoQueue* const histo_queue,
+ HistogramPair* const pair) {
+ assert(pair->cost_diff < 0.);
+ assert(pair >= histo_queue->queue &&
+ pair < (histo_queue->queue + histo_queue->size));
+ assert(histo_queue->size > 0);
+ if (pair->cost_diff < histo_queue->queue[0].cost_diff) {
+ // Replace the best pair.
+ const HistogramPair tmp = histo_queue->queue[0];
+ histo_queue->queue[0] = *pair;
+ *pair = tmp;
+ }
+}
+
+// Update the cost diff and combo of a pair of histograms. This needs to be
+// called when the histograms have been merged with a third one.
+static void HistoQueueUpdatePair(const VP8LHistogram* const h1,
+ const VP8LHistogram* const h2,
+ double threshold,
+ HistogramPair* const pair) {
+ const double sum_cost = h1->bit_cost_ + h2->bit_cost_;
+ pair->cost_combo = 0.;
+ GetCombinedHistogramEntropy(h1, h2, sum_cost + threshold, &pair->cost_combo);
+ pair->cost_diff = pair->cost_combo - sum_cost;
+}
+
+// Create a pair from indices "idx1" and "idx2" provided its cost is below
+// "threshold" (a negative entropy).
+// It returns the cost of the pair, or 0. if it is above the threshold.
+static double HistoQueuePush(HistoQueue* const histo_queue,
+ VP8LHistogram** const histograms, int idx1,
+ int idx2, double threshold) {
+ const VP8LHistogram* h1;
+ const VP8LHistogram* h2;
+ HistogramPair pair;
+
+ // Stop here if the queue is full.
+ if (histo_queue->size == histo_queue->max_size) return 0.;
+ assert(threshold <= 0.);
+ if (idx1 > idx2) {
+ const int tmp = idx2;
+ idx2 = idx1;
+ idx1 = tmp;
+ }
+ pair.idx1 = idx1;
+ pair.idx2 = idx2;
+ h1 = histograms[idx1];
+ h2 = histograms[idx2];
+
+ HistoQueueUpdatePair(h1, h2, threshold, &pair);
+
+ // Do not even consider the pair if it does not improve the entropy.
+ if (pair.cost_diff >= threshold) return 0.;
+
+ histo_queue->queue[histo_queue->size++] = pair;
+ HistoQueueUpdateHead(histo_queue, &histo_queue->queue[histo_queue->size - 1]);
+
+ return pair.cost_diff;
+}
+
+// -----------------------------------------------------------------------------
+
+// Combines histograms by continuously choosing the one with the highest cost
+// reduction.
+static int HistogramCombineGreedy(VP8LHistogramSet* const image_histo,
+ int* const num_used) {
+ int ok = 0;
+ const int image_histo_size = image_histo->size;
+ int i, j;
+ VP8LHistogram** const histograms = image_histo->histograms;
+ // Priority queue of histogram pairs.
+ HistoQueue histo_queue;
+
+ // image_histo_size^2 for the queue size is safe. If you look at
+  // HistogramCombineGreedy, and imagine that HistoQueuePush always pushes
+ // data to the queue, you insert at most:
+ // - image_histo_size*(image_histo_size-1)/2 (the first two for loops)
+ // - image_histo_size - 1 in the last for loop at the first iteration of
+ // the while loop, image_histo_size - 2 at the second iteration ...
+ // therefore image_histo_size*(image_histo_size-1)/2 overall too
+ if (!HistoQueueInit(&histo_queue, image_histo_size * image_histo_size)) {
+ goto End;
+ }
+
+ for (i = 0; i < image_histo_size; ++i) {
+ if (image_histo->histograms[i] == NULL) continue;
+ for (j = i + 1; j < image_histo_size; ++j) {
+ // Initialize queue.
+ if (image_histo->histograms[j] == NULL) continue;
+ HistoQueuePush(&histo_queue, histograms, i, j, 0.);
+ }
+ }
+
+ while (histo_queue.size > 0) {
+ const int idx1 = histo_queue.queue[0].idx1;
+ const int idx2 = histo_queue.queue[0].idx2;
+ HistogramAdd(histograms[idx2], histograms[idx1], histograms[idx1]);
+ histograms[idx1]->bit_cost_ = histo_queue.queue[0].cost_combo;
+
+ // Remove merged histogram.
+ HistogramSetRemoveHistogram(image_histo, idx2, num_used);
+
+ // Remove pairs intersecting the just combined best pair.
+ for (i = 0; i < histo_queue.size;) {
+ HistogramPair* const p = histo_queue.queue + i;
+ if (p->idx1 == idx1 || p->idx2 == idx1 ||
+ p->idx1 == idx2 || p->idx2 == idx2) {
+ HistoQueuePopPair(&histo_queue, p);
+ } else {
+ HistoQueueUpdateHead(&histo_queue, p);
+ ++i;
+ }
+ }
+
+ // Push new pairs formed with combined histogram to the queue.
+ for (i = 0; i < image_histo->size; ++i) {
+ if (i == idx1 || image_histo->histograms[i] == NULL) continue;
+ HistoQueuePush(&histo_queue, image_histo->histograms, idx1, i, 0.);
+ }
+ }
+
+ ok = 1;
+
+ End:
+ HistoQueueClear(&histo_queue);
+ return ok;
+}
+
+static int PairComparison(const void* idx1, const void* idx2) {
+ // To be used with bsearch: <0 when *idx1<*idx2, >0 if >, 0 when ==.
+ return (*(int*) idx1 - *(int*) idx2);
+}
+
+// Perform histogram aggregation using a stochastic approach.
+// 'do_greedy' is set to 1 if a greedy approach needs to be performed
+// afterwards, 0 otherwise.
+static int HistogramCombineStochastic(VP8LHistogramSet* const image_histo,
+ int* const num_used, int min_cluster_size,
+ int* const do_greedy) {
+ int j, iter;
+ uint32_t seed = 1;
+ int tries_with_no_success = 0;
+ const int outer_iters = *num_used;
+ const int num_tries_no_success = outer_iters / 2;
+ VP8LHistogram** const histograms = image_histo->histograms;
+ // Priority queue of histogram pairs. Its size, 'kHistoQueueSize', affects
+ // both compression quality and speed: the smaller it is, the faster the
+ // encoding but the worse the compression.
+ HistoQueue histo_queue;
+ const int kHistoQueueSize = 9;
+ int ok = 0;
+ // Mapping from the compacted indices (those with a non-NULL histogram) to
+ // the corresponding indices in the full image_histo.
+ int* mappings;
+
+ if (*num_used < min_cluster_size) {
+ *do_greedy = 1;
+ return 1;
+ }
+
+ mappings = (int*) WebPSafeMalloc(*num_used, sizeof(*mappings));
+ if (mappings == NULL) return 0;
+ if (!HistoQueueInit(&histo_queue, kHistoQueueSize)) goto End;
+ // Fill the initial mapping.
+ for (j = 0, iter = 0; iter < image_histo->size; ++iter) {
+ if (histograms[iter] == NULL) continue;
+ mappings[j++] = iter;
+ }
+ assert(j == *num_used);
+
+ // Collapse similar histograms in 'image_histo'.
+ for (iter = 0;
+ iter < outer_iters && *num_used >= min_cluster_size &&
+ ++tries_with_no_success < num_tries_no_success;
+ ++iter) {
+ int* mapping_index;
+ double best_cost =
+ (histo_queue.size == 0) ? 0. : histo_queue.queue[0].cost_diff;
+ int best_idx1 = -1, best_idx2 = 1;
+ const uint32_t rand_range = (*num_used - 1) * (*num_used);
+ // (*num_used) / 2 was chosen empirically. Less means faster but worse
+ // compression.
+ const int num_tries = (*num_used) / 2;
+
+ // Pick random samples.
+ for (j = 0; *num_used >= 2 && j < num_tries; ++j) {
+ double curr_cost;
+ // Choose two different histograms at random and try to combine them.
+ const uint32_t tmp = MyRand(&seed) % rand_range;
+ uint32_t idx1 = tmp / (*num_used - 1);
+ uint32_t idx2 = tmp % (*num_used - 1);
+ if (idx2 >= idx1) ++idx2;
+ idx1 = mappings[idx1];
+ idx2 = mappings[idx2];
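+ // Illustrative note (not in the original sources): the mapping above
+ // yields a uniformly random ordered pair of distinct compact indices.
+ // E.g. with *num_used == 4, rand_range == 12; tmp == 7 gives
+ // idx1 = 7 / 3 = 2 and idx2 = 7 % 3 = 1 (kept, since 1 < 2), while
+ // tmp == 8 gives idx1 = 2 and idx2 = 8 % 3 = 2, bumped to 3 so that
+ // idx2 never equals idx1.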
+
+ // Calculate cost reduction on combination.
+ curr_cost =
+ HistoQueuePush(&histo_queue, histograms, idx1, idx2, best_cost);
+ if (curr_cost < 0) { // found a better pair?
+ best_cost = curr_cost;
+ // Stop sampling once the queue has reached full capacity; the
+ // merge step below will shrink it.
+ if (histo_queue.size == histo_queue.max_size) break;
+ }
+ }
+ if (histo_queue.size == 0) continue;
+
+ // Get the best histograms.
+ best_idx1 = histo_queue.queue[0].idx1;
+ best_idx2 = histo_queue.queue[0].idx2;
+ assert(best_idx1 < best_idx2);
+ // Pop best_idx2 from mappings.
+ mapping_index = (int*) bsearch(&best_idx2, mappings, *num_used,
+ sizeof(best_idx2), &PairComparison);
+ assert(mapping_index != NULL);
+ memmove(mapping_index, mapping_index + 1, sizeof(*mapping_index) *
+ ((*num_used) - (mapping_index - mappings) - 1));
+ // Merge the histograms and remove best_idx2 from the queue.
+ HistogramAdd(histograms[best_idx2], histograms[best_idx1],
+ histograms[best_idx1]);
+ histograms[best_idx1]->bit_cost_ = histo_queue.queue[0].cost_combo;
+ HistogramSetRemoveHistogram(image_histo, best_idx2, num_used);
+ // Parse the queue and update each pair that deals with best_idx1 or
+ // best_idx2.
+ for (j = 0; j < histo_queue.size;) {
+ HistogramPair* const p = histo_queue.queue + j;
+ const int is_idx1_best = p->idx1 == best_idx1 || p->idx1 == best_idx2;
+ const int is_idx2_best = p->idx2 == best_idx1 || p->idx2 == best_idx2;
+ int do_eval = 0;
+ // The front pair could have been duplicated by an earlier random pick,
+ // so always check for duplicates regardless.
+ if (is_idx1_best && is_idx2_best) {
+ HistoQueuePopPair(&histo_queue, p);
+ continue;
+ }
+ // Any pair containing one of the two best indices should only refer to
+ // best_idx1. Its cost should also be updated.
+ if (is_idx1_best) {
+ p->idx1 = best_idx1;
+ do_eval = 1;
+ } else if (is_idx2_best) {
+ p->idx2 = best_idx1;
+ do_eval = 1;
+ }
+ // Make sure the index order is respected.
+ if (p->idx1 > p->idx2) {
+ const int tmp = p->idx2;
+ p->idx2 = p->idx1;
+ p->idx1 = tmp;
+ }
+ if (do_eval) {
+ // Re-evaluate the cost of an updated pair.
+ HistoQueueUpdatePair(histograms[p->idx1], histograms[p->idx2], 0., p);
+ if (p->cost_diff >= 0.) {
+ HistoQueuePopPair(&histo_queue, p);
+ continue;
+ }
+ }
+ HistoQueueUpdateHead(&histo_queue, p);
+ ++j;
+ }
+ tries_with_no_success = 0;
+ }
+ *do_greedy = (*num_used <= min_cluster_size);
+ ok = 1;
+
+End:
+ HistoQueueClear(&histo_queue);
+ WebPSafeFree(mappings);
+ return ok;
+}
+
+// -----------------------------------------------------------------------------
+// Histogram refinement
+
+// Find the best 'out' histogram for each of the 'in' histograms.
+// At call-time, 'out' contains the histograms of the clusters.
+// Note: we assume that out[]->bit_cost_ is already up-to-date.
+static void HistogramRemap(const VP8LHistogramSet* const in,
+ VP8LHistogramSet* const out,
+ uint16_t* const symbols) {
+ int i;
+ VP8LHistogram** const in_histo = in->histograms;
+ VP8LHistogram** const out_histo = out->histograms;
+ const int in_size = out->max_size;
+ const int out_size = out->size;
+ if (out_size > 1) {
+ for (i = 0; i < in_size; ++i) {
+ int best_out = 0;
+ double best_bits = MAX_COST;
+ int k;
+ if (in_histo[i] == NULL) {
+ // Arbitrarily set to the previous value if unused to help future LZ77.
+ symbols[i] = symbols[i - 1];
+ continue;
+ }
+ for (k = 0; k < out_size; ++k) {
+ double cur_bits;
+ cur_bits = HistogramAddThresh(out_histo[k], in_histo[i], best_bits);
+ if (k == 0 || cur_bits < best_bits) {
+ best_bits = cur_bits;
+ best_out = k;
+ }
+ }
+ symbols[i] = best_out;
+ }
+ } else {
+ assert(out_size == 1);
+ for (i = 0; i < in_size; ++i) {
+ symbols[i] = 0;
+ }
+ }
+
+ // Recompute each output histogram from the original histograms and the
+ // symbol assignment.
+ VP8LHistogramSetClear(out);
+ out->size = out_size;
+
+ for (i = 0; i < in_size; ++i) {
+ int idx;
+ if (in_histo[i] == NULL) continue;
+ idx = symbols[i];
+ HistogramAdd(in_histo[i], out_histo[idx], out_histo[idx]);
+ }
+}
+
+static double GetCombineCostFactor(int histo_size, int quality) {
+ double combine_cost_factor = 0.16;
+ if (quality < 90) {
+ if (histo_size > 256) combine_cost_factor /= 2.;
+ if (histo_size > 512) combine_cost_factor /= 2.;
+ if (histo_size > 1024) combine_cost_factor /= 2.;
+ if (quality <= 50) combine_cost_factor /= 2.;
+ }
+ return combine_cost_factor;
+}
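+// For example (illustrative values): quality == 40 with histo_size == 600
+// halves the 0.16 default three times (histo_size > 256, histo_size > 512,
+// and quality <= 50), giving a combine_cost_factor of 0.02; at quality >= 90
+// the factor always stays at 0.16.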
+
+// Given a HistogramSet 'set', the cluster mapping 'cluster_mappings' and the
+// current assignment of the cells in 'symbols', merge the clusters and
+// assign the smallest possible cluster values.
+static void OptimizeHistogramSymbols(const VP8LHistogramSet* const set,
+ uint16_t* const cluster_mappings,
+ int num_clusters,
+ uint16_t* const cluster_mappings_tmp,
+ uint16_t* const symbols) {
+ int i, cluster_max;
+ int do_continue = 1;
+ // First, assign the lowest cluster to each pixel.
+ while (do_continue) {
+ do_continue = 0;
+ for (i = 0; i < num_clusters; ++i) {
+ int k;
+ k = cluster_mappings[i];
+ while (k != cluster_mappings[k]) {
+ cluster_mappings[k] = cluster_mappings[cluster_mappings[k]];
+ k = cluster_mappings[k];
+ }
+ if (k != cluster_mappings[i]) {
+ do_continue = 1;
+ cluster_mappings[i] = k;
+ }
+ }
+ }
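+ // Illustrative example of the collapsing loop above: starting from
+ // cluster_mappings == {0, 0, 1, 2}, entry 2 follows 2 -> 1 -> 0 and
+ // entry 3 follows 3 -> 2 -> 0, so after one sweep every entry maps
+ // directly to its root: {0, 0, 0, 0}.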
+ // Create a mapping from a cluster id to its minimal version.
+ cluster_max = 0;
+ memset(cluster_mappings_tmp, 0,
+ set->max_size * sizeof(*cluster_mappings_tmp));
+ assert(cluster_mappings[0] == 0);
+ // Re-map the ids.
+ for (i = 0; i < set->max_size; ++i) {
+ int cluster;
+ if (symbols[i] == kInvalidHistogramSymbol) continue;
+ cluster = cluster_mappings[symbols[i]];
+ assert(symbols[i] < num_clusters);
+ if (cluster > 0 && cluster_mappings_tmp[cluster] == 0) {
+ ++cluster_max;
+ cluster_mappings_tmp[cluster] = cluster_max;
+ }
+ symbols[i] = cluster_mappings_tmp[cluster];
+ }
+
+ // Make sure all cluster values are used.
+ cluster_max = 0;
+ for (i = 0; i < set->max_size; ++i) {
+ if (symbols[i] == kInvalidHistogramSymbol) continue;
+ if (symbols[i] <= cluster_max) continue;
+ ++cluster_max;
+ assert(symbols[i] == cluster_max);
+ }
+}
+
+static void RemoveEmptyHistograms(VP8LHistogramSet* const image_histo) {
+ uint32_t size;
+ int i;
+ for (i = 0, size = 0; i < image_histo->size; ++i) {
+ if (image_histo->histograms[i] == NULL) continue;
+ image_histo->histograms[size++] = image_histo->histograms[i];
+ }
+ image_histo->size = size;
+}
+
+int VP8LGetHistoImageSymbols(int xsize, int ysize,
+ const VP8LBackwardRefs* const refs,
+ int quality, int low_effort,
+ int histogram_bits, int cache_bits,
+ VP8LHistogramSet* const image_histo,
+ VP8LHistogram* const tmp_histo,
+ uint16_t* const histogram_symbols) {
+ int ok = 0;
+ const int histo_xsize =
+ histogram_bits ? VP8LSubSampleSize(xsize, histogram_bits) : 1;
+ const int histo_ysize =
+ histogram_bits ? VP8LSubSampleSize(ysize, histogram_bits) : 1;
+ const int image_histo_raw_size = histo_xsize * histo_ysize;
+ VP8LHistogramSet* const orig_histo =
+ VP8LAllocateHistogramSet(image_histo_raw_size, cache_bits);
+ // Don't attempt linear bin-partition heuristic for
+ // histograms of small sizes (as bin_map will be very sparse) and
+ // maximum quality q==100 (to preserve the compression gains at that level).
+ const int entropy_combine_num_bins = low_effort ? NUM_PARTITIONS : BIN_SIZE;
+ int entropy_combine;
+ uint16_t* const map_tmp =
+ WebPSafeMalloc(2 * image_histo_raw_size, sizeof(*map_tmp));
+ uint16_t* const cluster_mappings = map_tmp + image_histo_raw_size;
+ int num_used = image_histo_raw_size;
+ if (orig_histo == NULL || map_tmp == NULL) goto Error;
+
+ // Construct the histograms from backward references.
+ HistogramBuild(xsize, histogram_bits, refs, orig_histo);
+ // Copy the histograms and compute their bit_cost; 'histogram_symbols' is
+ // updated along the way.
+ HistogramCopyAndAnalyze(orig_histo, image_histo, &num_used,
+ histogram_symbols);
+
+ entropy_combine =
+ (num_used > entropy_combine_num_bins * 2) && (quality < 100);
+
+ if (entropy_combine) {
+ uint16_t* const bin_map = map_tmp;
+ const double combine_cost_factor =
+ GetCombineCostFactor(image_histo_raw_size, quality);
+ const uint32_t num_clusters = num_used;
+
+ HistogramAnalyzeEntropyBin(image_histo, bin_map, low_effort);
+ // Collapse histograms with similar entropy.
+ HistogramCombineEntropyBin(image_histo, &num_used, histogram_symbols,
+ cluster_mappings, tmp_histo, bin_map,
+ entropy_combine_num_bins, combine_cost_factor,
+ low_effort);
+ OptimizeHistogramSymbols(image_histo, cluster_mappings, num_clusters,
+ map_tmp, histogram_symbols);
+ }
+
+ // Don't combine the histograms using stochastic and greedy heuristics for
+ // low-effort compression mode.
+ if (!low_effort || !entropy_combine) {
+ const float x = quality / 100.f;
+ // cubic ramp between 1 and MAX_HISTO_GREEDY:
+ const int threshold_size = (int)(1 + (x * x * x) * (MAX_HISTO_GREEDY - 1));
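+ // E.g., assuming MAX_HISTO_GREEDY == 100 (its usual value in libwebp;
+ // the definition is not shown in this hunk): quality 75 gives x = 0.75,
+ // x*x*x ~= 0.42, so threshold_size = (int)(1 + 0.42 * 99) == 42 clusters
+ // before the greedy pass kicks in.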
+ int do_greedy;
+ if (!HistogramCombineStochastic(image_histo, &num_used, threshold_size,
+ &do_greedy)) {
+ goto Error;
+ }
+ if (do_greedy) {
+ RemoveEmptyHistograms(image_histo);
+ if (!HistogramCombineGreedy(image_histo, &num_used)) {
+ goto Error;
+ }
+ }
+ }
+
+ // Find the optimal map from original histograms to the final ones.
+ RemoveEmptyHistograms(image_histo);
+ HistogramRemap(orig_histo, image_histo, histogram_symbols);
+
+ ok = 1;
+
+ Error:
+ VP8LFreeHistogramSet(orig_histo);
+ WebPSafeFree(map_tmp);
+ return ok;
+}
diff --git a/media/libwebp/enc/histogram_enc.h b/media/libwebp/enc/histogram_enc.h
index ef39b7c6db..bf93ce62b3 100644
--- a/media/libwebp/enc/histogram_enc.h
+++ b/media/libwebp/enc/histogram_enc.h
@@ -64,8 +64,8 @@ void VP8LHistogramCreate(VP8LHistogram* const p,
const VP8LBackwardRefs* const refs,
int palette_code_bits);
-// Return the size of the histogram for a given palette_code_bits.
-int VP8LGetHistogramSize(int palette_code_bits);
+// Return the size of the histogram for a given cache_bits.
+int VP8LGetHistogramSize(int cache_bits);
// Set the palette_code_bits and reset the stats.
// If init_arrays is true, the arrays are also filled with 0's.
@@ -110,7 +110,7 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
const VP8LBackwardRefs* const refs,
int quality, int low_effort,
int histogram_bits, int cache_bits,
- VP8LHistogramSet* const image_in,
+ VP8LHistogramSet* const image_histo,
VP8LHistogram* const tmp_histo,
uint16_t* const histogram_symbols);
diff --git a/media/libwebp/enc/iterator_enc.c b/media/libwebp/enc/iterator_enc.c
new file mode 100644
index 0000000000..c2b137c124
--- /dev/null
+++ b/media/libwebp/enc/iterator_enc.c
@@ -0,0 +1,459 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// VP8Iterator: block iterator
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <string.h>
+
+#include "../enc/vp8i_enc.h"
+
+//------------------------------------------------------------------------------
+// VP8Iterator
+//------------------------------------------------------------------------------
+
+static void InitLeft(VP8EncIterator* const it) {
+ it->y_left_[-1] = it->u_left_[-1] = it->v_left_[-1] =
+ (it->y_ > 0) ? 129 : 127;
+ memset(it->y_left_, 129, 16);
+ memset(it->u_left_, 129, 8);
+ memset(it->v_left_, 129, 8);
+ it->left_nz_[8] = 0;
+ if (it->top_derr_ != NULL) {
+ memset(&it->left_derr_, 0, sizeof(it->left_derr_));
+ }
+}
+
+static void InitTop(VP8EncIterator* const it) {
+ const VP8Encoder* const enc = it->enc_;
+ const size_t top_size = enc->mb_w_ * 16;
+ memset(enc->y_top_, 127, 2 * top_size);
+ memset(enc->nz_, 0, enc->mb_w_ * sizeof(*enc->nz_));
+ if (enc->top_derr_ != NULL) {
+ memset(enc->top_derr_, 0, enc->mb_w_ * sizeof(*enc->top_derr_));
+ }
+}
+
+void VP8IteratorSetRow(VP8EncIterator* const it, int y) {
+ VP8Encoder* const enc = it->enc_;
+ it->x_ = 0;
+ it->y_ = y;
+ it->bw_ = &enc->parts_[y & (enc->num_parts_ - 1)];
+ it->preds_ = enc->preds_ + y * 4 * enc->preds_w_;
+ it->nz_ = enc->nz_;
+ it->mb_ = enc->mb_info_ + y * enc->mb_w_;
+ it->y_top_ = enc->y_top_;
+ it->uv_top_ = enc->uv_top_;
+ InitLeft(it);
+}
+
+void VP8IteratorReset(VP8EncIterator* const it) {
+ VP8Encoder* const enc = it->enc_;
+ VP8IteratorSetRow(it, 0);
+ VP8IteratorSetCountDown(it, enc->mb_w_ * enc->mb_h_); // default
+ InitTop(it);
+ memset(it->bit_count_, 0, sizeof(it->bit_count_));
+ it->do_trellis_ = 0;
+}
+
+void VP8IteratorSetCountDown(VP8EncIterator* const it, int count_down) {
+ it->count_down_ = it->count_down0_ = count_down;
+}
+
+int VP8IteratorIsDone(const VP8EncIterator* const it) {
+ return (it->count_down_ <= 0);
+}
+
+void VP8IteratorInit(VP8Encoder* const enc, VP8EncIterator* const it) {
+ it->enc_ = enc;
+ it->yuv_in_ = (uint8_t*)WEBP_ALIGN(it->yuv_mem_);
+ it->yuv_out_ = it->yuv_in_ + YUV_SIZE_ENC;
+ it->yuv_out2_ = it->yuv_out_ + YUV_SIZE_ENC;
+ it->yuv_p_ = it->yuv_out2_ + YUV_SIZE_ENC;
+ it->lf_stats_ = enc->lf_stats_;
+ it->percent0_ = enc->percent_;
+ it->y_left_ = (uint8_t*)WEBP_ALIGN(it->yuv_left_mem_ + 1);
+ it->u_left_ = it->y_left_ + 16 + 16;
+ it->v_left_ = it->u_left_ + 16;
+ it->top_derr_ = enc->top_derr_;
+ VP8IteratorReset(it);
+}
+
+int VP8IteratorProgress(const VP8EncIterator* const it, int delta) {
+ VP8Encoder* const enc = it->enc_;
+ if (delta && enc->pic_->progress_hook != NULL) {
+ const int done = it->count_down0_ - it->count_down_;
+ const int percent = (it->count_down0_ <= 0)
+ ? it->percent0_
+ : it->percent0_ + delta * done / it->count_down0_;
+ return WebPReportProgress(enc->pic_, percent, &enc->percent_);
+ }
+ return 1;
+}
+
+//------------------------------------------------------------------------------
+// Import the source samples into the cache. Takes care of replicating
+// boundary pixels if necessary.
+
+static WEBP_INLINE int MinSize(int a, int b) { return (a < b) ? a : b; }
+
+static void ImportBlock(const uint8_t* src, int src_stride,
+ uint8_t* dst, int w, int h, int size) {
+ int i;
+ for (i = 0; i < h; ++i) {
+ memcpy(dst, src, w);
+ if (w < size) {
+ memset(dst + w, dst[w - 1], size - w);
+ }
+ dst += BPS;
+ src += src_stride;
+ }
+ for (i = h; i < size; ++i) {
+ memcpy(dst, dst - BPS, size);
+ dst += BPS;
+ }
+}
+
+static void ImportLine(const uint8_t* src, int src_stride,
+ uint8_t* dst, int len, int total_len) {
+ int i;
+ for (i = 0; i < len; ++i, src += src_stride) dst[i] = *src;
+ for (; i < total_len; ++i) dst[i] = dst[len - 1];
+}
+
+void VP8IteratorImport(VP8EncIterator* const it, uint8_t* const tmp_32) {
+ const VP8Encoder* const enc = it->enc_;
+ const int x = it->x_, y = it->y_;
+ const WebPPicture* const pic = enc->pic_;
+ const uint8_t* const ysrc = pic->y + (y * pic->y_stride + x) * 16;
+ const uint8_t* const usrc = pic->u + (y * pic->uv_stride + x) * 8;
+ const uint8_t* const vsrc = pic->v + (y * pic->uv_stride + x) * 8;
+ const int w = MinSize(pic->width - x * 16, 16);
+ const int h = MinSize(pic->height - y * 16, 16);
+ const int uv_w = (w + 1) >> 1;
+ const int uv_h = (h + 1) >> 1;
+
+ ImportBlock(ysrc, pic->y_stride, it->yuv_in_ + Y_OFF_ENC, w, h, 16);
+ ImportBlock(usrc, pic->uv_stride, it->yuv_in_ + U_OFF_ENC, uv_w, uv_h, 8);
+ ImportBlock(vsrc, pic->uv_stride, it->yuv_in_ + V_OFF_ENC, uv_w, uv_h, 8);
+
+ if (tmp_32 == NULL) return;
+
+ // Import source (uncompressed) samples into boundary.
+ if (x == 0) {
+ InitLeft(it);
+ } else {
+ if (y == 0) {
+ it->y_left_[-1] = it->u_left_[-1] = it->v_left_[-1] = 127;
+ } else {
+ it->y_left_[-1] = ysrc[- 1 - pic->y_stride];
+ it->u_left_[-1] = usrc[- 1 - pic->uv_stride];
+ it->v_left_[-1] = vsrc[- 1 - pic->uv_stride];
+ }
+ ImportLine(ysrc - 1, pic->y_stride, it->y_left_, h, 16);
+ ImportLine(usrc - 1, pic->uv_stride, it->u_left_, uv_h, 8);
+ ImportLine(vsrc - 1, pic->uv_stride, it->v_left_, uv_h, 8);
+ }
+
+ it->y_top_ = tmp_32 + 0;
+ it->uv_top_ = tmp_32 + 16;
+ if (y == 0) {
+ memset(tmp_32, 127, 32 * sizeof(*tmp_32));
+ } else {
+ ImportLine(ysrc - pic->y_stride, 1, tmp_32, w, 16);
+ ImportLine(usrc - pic->uv_stride, 1, tmp_32 + 16, uv_w, 8);
+ ImportLine(vsrc - pic->uv_stride, 1, tmp_32 + 16 + 8, uv_w, 8);
+ }
+}
+
+//------------------------------------------------------------------------------
+// Copy back the compressed samples into user space if requested.
+
+static void ExportBlock(const uint8_t* src, uint8_t* dst, int dst_stride,
+ int w, int h) {
+ while (h-- > 0) {
+ memcpy(dst, src, w);
+ dst += dst_stride;
+ src += BPS;
+ }
+}
+
+void VP8IteratorExport(const VP8EncIterator* const it) {
+ const VP8Encoder* const enc = it->enc_;
+ if (enc->config_->show_compressed) {
+ const int x = it->x_, y = it->y_;
+ const uint8_t* const ysrc = it->yuv_out_ + Y_OFF_ENC;
+ const uint8_t* const usrc = it->yuv_out_ + U_OFF_ENC;
+ const uint8_t* const vsrc = it->yuv_out_ + V_OFF_ENC;
+ const WebPPicture* const pic = enc->pic_;
+ uint8_t* const ydst = pic->y + (y * pic->y_stride + x) * 16;
+ uint8_t* const udst = pic->u + (y * pic->uv_stride + x) * 8;
+ uint8_t* const vdst = pic->v + (y * pic->uv_stride + x) * 8;
+ int w = (pic->width - x * 16);
+ int h = (pic->height - y * 16);
+
+ if (w > 16) w = 16;
+ if (h > 16) h = 16;
+
+ // Luma plane
+ ExportBlock(ysrc, ydst, pic->y_stride, w, h);
+
+ { // U/V planes
+ const int uv_w = (w + 1) >> 1;
+ const int uv_h = (h + 1) >> 1;
+ ExportBlock(usrc, udst, pic->uv_stride, uv_w, uv_h);
+ ExportBlock(vsrc, vdst, pic->uv_stride, uv_w, uv_h);
+ }
+ }
+}
+
+//------------------------------------------------------------------------------
+// Non-zero contexts setup/teardown
+
+// Nz bits:
+// 0 1 2 3 Y
+// 4 5 6 7
+// 8 9 10 11
+// 12 13 14 15
+// 16 17 U
+// 18 19
+// 20 21 V
+// 22 23
+// 24 DC-intra16
+
+// Convert packed context to byte array
+#define BIT(nz, n) (!!((nz) & (1 << (n))))
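+// Example (illustrative): with tnz == 0xF000, bits 12..15 are set, so
+// BIT(tnz, 12) .. BIT(tnz, 15) all evaluate to 1 (the bottom Y row of the
+// macroblock above, per the layout diagram) while e.g. BIT(tnz, 18) is 0.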
+
+void VP8IteratorNzToBytes(VP8EncIterator* const it) {
+ const int tnz = it->nz_[0], lnz = it->nz_[-1];
+ int* const top_nz = it->top_nz_;
+ int* const left_nz = it->left_nz_;
+
+ // Top-Y
+ top_nz[0] = BIT(tnz, 12);
+ top_nz[1] = BIT(tnz, 13);
+ top_nz[2] = BIT(tnz, 14);
+ top_nz[3] = BIT(tnz, 15);
+ // Top-U
+ top_nz[4] = BIT(tnz, 18);
+ top_nz[5] = BIT(tnz, 19);
+ // Top-V
+ top_nz[6] = BIT(tnz, 22);
+ top_nz[7] = BIT(tnz, 23);
+ // DC
+ top_nz[8] = BIT(tnz, 24);
+
+ // left-Y
+ left_nz[0] = BIT(lnz, 3);
+ left_nz[1] = BIT(lnz, 7);
+ left_nz[2] = BIT(lnz, 11);
+ left_nz[3] = BIT(lnz, 15);
+ // left-U
+ left_nz[4] = BIT(lnz, 17);
+ left_nz[5] = BIT(lnz, 19);
+ // left-V
+ left_nz[6] = BIT(lnz, 21);
+ left_nz[7] = BIT(lnz, 23);
+ // left-DC is special, iterated separately
+}
+
+void VP8IteratorBytesToNz(VP8EncIterator* const it) {
+ uint32_t nz = 0;
+ const int* const top_nz = it->top_nz_;
+ const int* const left_nz = it->left_nz_;
+ // top
+ nz |= (top_nz[0] << 12) | (top_nz[1] << 13);
+ nz |= (top_nz[2] << 14) | (top_nz[3] << 15);
+ nz |= (top_nz[4] << 18) | (top_nz[5] << 19);
+ nz |= (top_nz[6] << 22) | (top_nz[7] << 23);
+ nz |= (top_nz[8] << 24); // we propagate the _top_ bit, esp. for intra4
+ // left
+ nz |= (left_nz[0] << 3) | (left_nz[1] << 7);
+ nz |= (left_nz[2] << 11);
+ nz |= (left_nz[4] << 17) | (left_nz[6] << 21);
+
+ *it->nz_ = nz;
+}
+
+#undef BIT
+
+//------------------------------------------------------------------------------
+// Advance to the next position, doing the bookkeeping.
+
+void VP8IteratorSaveBoundary(VP8EncIterator* const it) {
+ VP8Encoder* const enc = it->enc_;
+ const int x = it->x_, y = it->y_;
+ const uint8_t* const ysrc = it->yuv_out_ + Y_OFF_ENC;
+ const uint8_t* const uvsrc = it->yuv_out_ + U_OFF_ENC;
+ if (x < enc->mb_w_ - 1) { // left
+ int i;
+ for (i = 0; i < 16; ++i) {
+ it->y_left_[i] = ysrc[15 + i * BPS];
+ }
+ for (i = 0; i < 8; ++i) {
+ it->u_left_[i] = uvsrc[7 + i * BPS];
+ it->v_left_[i] = uvsrc[15 + i * BPS];
+ }
+ // top-left (before 'top'!)
+ it->y_left_[-1] = it->y_top_[15];
+ it->u_left_[-1] = it->uv_top_[0 + 7];
+ it->v_left_[-1] = it->uv_top_[8 + 7];
+ }
+ if (y < enc->mb_h_ - 1) { // top
+ memcpy(it->y_top_, ysrc + 15 * BPS, 16);
+ memcpy(it->uv_top_, uvsrc + 7 * BPS, 8 + 8);
+ }
+}
+
+int VP8IteratorNext(VP8EncIterator* const it) {
+ if (++it->x_ == it->enc_->mb_w_) {
+ VP8IteratorSetRow(it, ++it->y_);
+ } else {
+ it->preds_ += 4;
+ it->mb_ += 1;
+ it->nz_ += 1;
+ it->y_top_ += 16;
+ it->uv_top_ += 16;
+ }
+ return (0 < --it->count_down_);
+}
+
+//------------------------------------------------------------------------------
+// Helper function to set mode properties
+
+void VP8SetIntra16Mode(const VP8EncIterator* const it, int mode) {
+ uint8_t* preds = it->preds_;
+ int y;
+ for (y = 0; y < 4; ++y) {
+ memset(preds, mode, 4);
+ preds += it->enc_->preds_w_;
+ }
+ it->mb_->type_ = 1;
+}
+
+void VP8SetIntra4Mode(const VP8EncIterator* const it, const uint8_t* modes) {
+ uint8_t* preds = it->preds_;
+ int y;
+ for (y = 4; y > 0; --y) {
+ memcpy(preds, modes, 4 * sizeof(*modes));
+ preds += it->enc_->preds_w_;
+ modes += 4;
+ }
+ it->mb_->type_ = 0;
+}
+
+void VP8SetIntraUVMode(const VP8EncIterator* const it, int mode) {
+ it->mb_->uv_mode_ = mode;
+}
+
+void VP8SetSkip(const VP8EncIterator* const it, int skip) {
+ it->mb_->skip_ = skip;
+}
+
+void VP8SetSegment(const VP8EncIterator* const it, int segment) {
+ it->mb_->segment_ = segment;
+}
+
+//------------------------------------------------------------------------------
+// Intra4x4 sub-blocks iteration
+//
+// We store and update the boundary samples in an array of 37 pixels. They
+// are updated as we iterate over and reconstruct each intra4x4 block in
+// turn.
+// The position of the samples has the following snake pattern:
+//
+// 16|17 18 19 20|21 22 23 24|25 26 27 28|29 30 31 32|33 34 35 36 <- Top-right
+// --+-----------+-----------+-----------+-----------+
+// 15| 19| 23| 27| 31|
+// 14| 18| 22| 26| 30|
+// 13| 17| 21| 25| 29|
+// 12|13 14 15 16|17 18 19 20|21 22 23 24|25 26 27 28|
+// --+-----------+-----------+-----------+-----------+
+// 11| 15| 19| 23| 27|
+// 10| 14| 18| 22| 26|
+// 9| 13| 17| 21| 25|
+// 8| 9 10 11 12|13 14 15 16|17 18 19 20|21 22 23 24|
+// --+-----------+-----------+-----------+-----------+
+// 7| 11| 15| 19| 23|
+// 6| 10| 14| 18| 22|
+// 5| 9| 13| 17| 21|
+// 4| 5 6 7 8| 9 10 11 12|13 14 15 16|17 18 19 20|
+// --+-----------+-----------+-----------+-----------+
+// 3| 7| 11| 15| 19|
+// 2| 6| 10| 14| 18|
+// 1| 5| 9| 13| 17|
+// 0| 1 2 3 4| 5 6 7 8| 9 10 11 12|13 14 15 16|
+// --+-----------+-----------+-----------+-----------+
+
+// Array to record the position of the top sample to pass to the prediction
+// functions in dsp.c.
+static const uint8_t VP8TopLeftI4[16] = {
+ 17, 21, 25, 29,
+ 13, 17, 21, 25,
+ 9, 13, 17, 21,
+ 5, 9, 13, 17
+};
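+// Example of how this table pairs with the snake diagram above (a reading
+// note, not upstream documentation): for the first sub-block (it->i4_ == 0),
+// VP8TopLeftI4[0] == 17, so i4_top_ points at i4_boundary_[17] (the first
+// top sample) and i4_top_[-1] is i4_boundary_[16], the top-left corner.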
+
+void VP8IteratorStartI4(VP8EncIterator* const it) {
+ const VP8Encoder* const enc = it->enc_;
+ int i;
+
+ it->i4_ = 0; // first 4x4 sub-block
+ it->i4_top_ = it->i4_boundary_ + VP8TopLeftI4[0];
+
+ // Import the boundary samples
+ for (i = 0; i < 17; ++i) { // left
+ it->i4_boundary_[i] = it->y_left_[15 - i];
+ }
+ for (i = 0; i < 16; ++i) { // top
+ it->i4_boundary_[17 + i] = it->y_top_[i];
+ }
+ // top-right samples have a special case on the far right of the picture
+ if (it->x_ < enc->mb_w_ - 1) {
+ for (i = 16; i < 16 + 4; ++i) {
+ it->i4_boundary_[17 + i] = it->y_top_[i];
+ }
+ } else { // else, replicate the last valid pixel four times
+ for (i = 16; i < 16 + 4; ++i) {
+ it->i4_boundary_[17 + i] = it->i4_boundary_[17 + 15];
+ }
+ }
+ VP8IteratorNzToBytes(it); // import the non-zero context
+}
+
+int VP8IteratorRotateI4(VP8EncIterator* const it,
+ const uint8_t* const yuv_out) {
+ const uint8_t* const blk = yuv_out + VP8Scan[it->i4_];
+ uint8_t* const top = it->i4_top_;
+ int i;
+
+ // Update the cache with 7 fresh samples
+ for (i = 0; i <= 3; ++i) {
+ top[-4 + i] = blk[i + 3 * BPS]; // store future top samples
+ }
+ if ((it->i4_ & 3) != 3) { // if not on the rightmost sub-blocks #3, #7, #11, #15
+ for (i = 0; i <= 2; ++i) { // store future left samples
+ top[i] = blk[3 + (2 - i) * BPS];
+ }
+ } else { // else replicate the top-right samples, as the spec says.
+ for (i = 0; i <= 3; ++i) {
+ top[i] = top[i + 4];
+ }
+ }
+ // move pointers to next sub-block
+ ++it->i4_;
+ if (it->i4_ == 16) { // we're done
+ return 0;
+ }
+
+ it->i4_top_ = it->i4_boundary_ + VP8TopLeftI4[it->i4_];
+ return 1;
+}
+
+//------------------------------------------------------------------------------
diff --git a/media/libwebp/enc/moz.build b/media/libwebp/enc/moz.build
new file mode 100644
index 0000000000..12eaf5a5ed
--- /dev/null
+++ b/media/libwebp/enc/moz.build
@@ -0,0 +1,39 @@
+# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*-
+# vim: set filetype=python:
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+with Files('**'):
+ BUG_COMPONENT = ('Core', 'ImageLib')
+
+SOURCES += [
+ 'alpha_enc.c',
+ 'analysis_enc.c',
+ 'backward_references_cost_enc.c',
+ 'backward_references_enc.c',
+ 'config_enc.c',
+ 'cost_enc.c',
+ 'filter_enc.c',
+ 'frame_enc.c',
+ 'histogram_enc.c',
+ 'iterator_enc.c',
+ 'near_lossless_enc.c',
+ 'picture_csp_enc.c',
+ 'picture_enc.c',
+ 'picture_psnr_enc.c',
+ 'picture_rescale_enc.c',
+ 'picture_tools_enc.c',
+ 'predictor_enc.c',
+ 'quant_enc.c',
+ 'syntax_enc.c',
+ 'token_enc.c',
+ 'tree_enc.c',
+ 'vp8l_enc.c',
+ 'webp_enc.c',
+]
+
+FINAL_LIBRARY = 'gkmedias'
+
+# We allow warnings for third-party code that can be updated from upstream.
+ALLOW_COMPILER_WARNINGS = True
diff --git a/media/libwebp/enc/near_lossless_enc.c b/media/libwebp/enc/near_lossless_enc.c
new file mode 100644
index 0000000000..1fd12a4364
--- /dev/null
+++ b/media/libwebp/enc/near_lossless_enc.c
@@ -0,0 +1,151 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Near-lossless image preprocessing adjusts pixel values to help
+// compressibility with a guarantee of maximum deviation between original and
+// resulting pixel values.
+//
+// Author: Jyrki Alakuijala (jyrki@google.com)
+// Converted to C by Aleksander Kramarz (akramarz@google.com)
+
+#include <assert.h>
+#include <stdlib.h>
+
+#include "../dsp/lossless_common.h"
+#include "../utils/utils.h"
+#include "../enc/vp8li_enc.h"
+
+#if (WEBP_NEAR_LOSSLESS == 1)
+
+#define MIN_DIM_FOR_NEAR_LOSSLESS 64
+#define MAX_LIMIT_BITS 5
+
+// Quantizes the value up or down to a multiple of 1<<bits (or to 255),
+// choosing the closer one, resolving ties using bankers' rounding.
+static uint32_t FindClosestDiscretized(uint32_t a, int bits) {
+ const uint32_t mask = (1u << bits) - 1;
+ const uint32_t biased = a + (mask >> 1) + ((a >> bits) & 1);
+ assert(bits > 0);
+ if (biased > 0xff) return 0xff;
+ return biased & ~mask;
+}
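+// Worked example (illustrative): with bits == 2 (multiples of 4), a == 203
+// gives biased == 203 + 1 + 0 == 204, i.e. it rounds up to 204 (distance 1
+// instead of 3). The tie a == 202 rounds down to 200 (quotient 50, even),
+// while the tie a == 206 rounds up to 208 (quotient 52, even): bankers'
+// rounding, as promised above.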
+
+// Applies FindClosestDiscretized to all channels of pixel.
+static uint32_t ClosestDiscretizedArgb(uint32_t a, int bits) {
+ return
+ (FindClosestDiscretized(a >> 24, bits) << 24) |
+ (FindClosestDiscretized((a >> 16) & 0xff, bits) << 16) |
+ (FindClosestDiscretized((a >> 8) & 0xff, bits) << 8) |
+ (FindClosestDiscretized(a & 0xff, bits));
+}
+
+// Checks if distance between corresponding channel values of pixels a and b
+// is within the given limit.
+static int IsNear(uint32_t a, uint32_t b, int limit) {
+ int k;
+ for (k = 0; k < 4; ++k) {
+ const int delta =
+ (int)((a >> (k * 8)) & 0xff) - (int)((b >> (k * 8)) & 0xff);
+ if (delta >= limit || delta <= -limit) {
+ return 0;
+ }
+ }
+ return 1;
+}
+
+static int IsSmooth(const uint32_t* const prev_row,
+ const uint32_t* const curr_row,
+ const uint32_t* const next_row,
+ int ix, int limit) {
+ // Check that all pixels in 4-connected neighborhood are smooth.
+ return (IsNear(curr_row[ix], curr_row[ix - 1], limit) &&
+ IsNear(curr_row[ix], curr_row[ix + 1], limit) &&
+ IsNear(curr_row[ix], prev_row[ix], limit) &&
+ IsNear(curr_row[ix], next_row[ix], limit));
+}
+
+// Adjusts the pixel values of the image so as to improve compressibility,
+// while staying within the given maximum error.
+static void NearLossless(int xsize, int ysize, const uint32_t* argb_src,
+ int stride, int limit_bits, uint32_t* copy_buffer,
+ uint32_t* argb_dst) {
+ int x, y;
+ const int limit = 1 << limit_bits;
+ uint32_t* prev_row = copy_buffer;
+ uint32_t* curr_row = prev_row + xsize;
+ uint32_t* next_row = curr_row + xsize;
+ memcpy(curr_row, argb_src, xsize * sizeof(argb_src[0]));
+ memcpy(next_row, argb_src + stride, xsize * sizeof(argb_src[0]));
+
+ for (y = 0; y < ysize; ++y, argb_src += stride, argb_dst += xsize) {
+ if (y == 0 || y == ysize - 1) {
+ memcpy(argb_dst, argb_src, xsize * sizeof(argb_src[0]));
+ } else {
+ memcpy(next_row, argb_src + stride, xsize * sizeof(argb_src[0]));
+ argb_dst[0] = argb_src[0];
+ argb_dst[xsize - 1] = argb_src[xsize - 1];
+ for (x = 1; x < xsize - 1; ++x) {
+ if (IsSmooth(prev_row, curr_row, next_row, x, limit)) {
+ argb_dst[x] = curr_row[x];
+ } else {
+ argb_dst[x] = ClosestDiscretizedArgb(curr_row[x], limit_bits);
+ }
+ }
+ }
+ {
+ // Three-way swap.
+ uint32_t* const temp = prev_row;
+ prev_row = curr_row;
+ curr_row = next_row;
+ next_row = temp;
+ }
+ }
+}
+
+int VP8ApplyNearLossless(const WebPPicture* const picture, int quality,
+ uint32_t* const argb_dst) {
+ int i;
+ const int xsize = picture->width;
+ const int ysize = picture->height;
+ const int stride = picture->argb_stride;
+ uint32_t* const copy_buffer =
+ (uint32_t*)WebPSafeMalloc(xsize * 3, sizeof(*copy_buffer));
+ const int limit_bits = VP8LNearLosslessBits(quality);
+ assert(argb_dst != NULL);
+ assert(limit_bits > 0);
+ assert(limit_bits <= MAX_LIMIT_BITS);
+ if (copy_buffer == NULL) {
+ return 0;
+ }
+ // For small icon images, don't attempt to apply near-lossless compression.
+ if ((xsize < MIN_DIM_FOR_NEAR_LOSSLESS &&
+ ysize < MIN_DIM_FOR_NEAR_LOSSLESS) ||
+ ysize < 3) {
+ for (i = 0; i < ysize; ++i) {
+ memcpy(argb_dst + i * xsize, picture->argb + i * picture->argb_stride,
+ xsize * sizeof(*argb_dst));
+ }
+ WebPSafeFree(copy_buffer);
+ return 1;
+ }
+
+ NearLossless(xsize, ysize, picture->argb, stride, limit_bits, copy_buffer,
+ argb_dst);
+ for (i = limit_bits - 1; i != 0; --i) {
+ NearLossless(xsize, ysize, argb_dst, xsize, i, copy_buffer, argb_dst);
+ }
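+ // Note on the passes above (a reading note, not upstream documentation):
+ // with limit_bits == 3, NearLossless() first runs on the source with the
+ // full limit, then re-runs in place on argb_dst with limits 2 and 1, so
+ // the error budget shrinks at each pass.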
+ WebPSafeFree(copy_buffer);
+ return 1;
+}
+#else // (WEBP_NEAR_LOSSLESS == 1)
+
+// Define a stub to suppress compiler warnings.
+extern void VP8LNearLosslessStub(void);
+void VP8LNearLosslessStub(void) {}
+
+#endif // (WEBP_NEAR_LOSSLESS == 1)
diff --git a/media/libwebp/enc/picture_csp_enc.c b/media/libwebp/enc/picture_csp_enc.c
new file mode 100644
index 0000000000..3dd5d380e8
--- /dev/null
+++ b/media/libwebp/enc/picture_csp_enc.c
@@ -0,0 +1,1210 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// WebPPicture utils for colorspace conversion
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <assert.h>
+#include <stdlib.h>
+#include <math.h>
+
+#include "../enc/vp8i_enc.h"
+#include "../utils/random_utils.h"
+#include "../utils/utils.h"
+#include "../dsp/dsp.h"
+#include "../dsp/lossless.h"
+#include "../dsp/yuv.h"
+
+// Comment out the following line to disable gamma-compression during
+// RGB->U/V averaging
+#define USE_GAMMA_COMPRESSION
+
+// If defined, use table to compute x / alpha.
+#define USE_INVERSE_ALPHA_TABLE
+
+#ifdef WORDS_BIGENDIAN
+// uint32_t 0xff000000 is 0xff,00,00,00 in memory
+#define CHANNEL_OFFSET(i) (i)
+#else
+// uint32_t 0xff000000 is 0x00,00,00,ff in memory
+#define CHANNEL_OFFSET(i) (3-(i))
+#endif
+
+#define ALPHA_OFFSET CHANNEL_OFFSET(0)
+
+//------------------------------------------------------------------------------
+// Detection of non-trivial transparency
+
+// Returns true if alpha[] has non-0xff values.
+static int CheckNonOpaque(const uint8_t* alpha, int width, int height,
+ int x_step, int y_step) {
+ if (alpha == NULL) return 0;
+ WebPInitAlphaProcessing();
+ if (x_step == 1) {
+ for (; height-- > 0; alpha += y_step) {
+ if (WebPHasAlpha8b(alpha, width)) return 1;
+ }
+ } else {
+ for (; height-- > 0; alpha += y_step) {
+ if (WebPHasAlpha32b(alpha, width)) return 1;
+ }
+ }
+ return 0;
+}
+
+// Checking for the presence of non-opaque alpha.
+int WebPPictureHasTransparency(const WebPPicture* picture) {
+ if (picture == NULL) return 0;
+ if (picture->use_argb) {
+ const int alpha_offset = ALPHA_OFFSET;
+ return CheckNonOpaque((const uint8_t*)picture->argb + alpha_offset,
+ picture->width, picture->height,
+ 4, picture->argb_stride * sizeof(*picture->argb));
+ }
+ return CheckNonOpaque(picture->a, picture->width, picture->height,
+ 1, picture->a_stride);
+}
+
+//------------------------------------------------------------------------------
+// Code for gamma correction
+
+#if defined(USE_GAMMA_COMPRESSION)
+
+// gamma-compensates loss of resolution during chroma subsampling
+#define kGamma 0.80 // for now we use a different gamma value than kGammaF
+#define kGammaFix 12 // fixed-point precision for linear values
+#define kGammaScale ((1 << kGammaFix) - 1)
+#define kGammaTabFix 7 // fixed-point fractional bits precision
+#define kGammaTabScale (1 << kGammaTabFix)
+#define kGammaTabRounder (kGammaTabScale >> 1)
+#define kGammaTabSize (1 << (kGammaFix - kGammaTabFix))
+
+static int kLinearToGammaTab[kGammaTabSize + 1];
+static uint16_t kGammaToLinearTab[256];
+static volatile int kGammaTablesOk = 0;
+static void InitGammaTables(void);
+
+WEBP_DSP_INIT_FUNC(InitGammaTables) {
+ if (!kGammaTablesOk) {
+ int v;
+ const double scale = (double)(1 << kGammaTabFix) / kGammaScale;
+ const double norm = 1. / 255.;
+ for (v = 0; v <= 255; ++v) {
+ kGammaToLinearTab[v] =
+ (uint16_t)(pow(norm * v, kGamma) * kGammaScale + .5);
+ }
+ for (v = 0; v <= kGammaTabSize; ++v) {
+ kLinearToGammaTab[v] = (int)(255. * pow(scale * v, 1. / kGamma) + .5);
+ }
+ kGammaTablesOk = 1;
+ }
+}
+
+static WEBP_INLINE uint32_t GammaToLinear(uint8_t v) {
+ return kGammaToLinearTab[v];
+}
+
+static WEBP_INLINE int Interpolate(int v) {
+ const int tab_pos = v >> (kGammaTabFix + 2); // integer part
+ const int x = v & ((kGammaTabScale << 2) - 1); // fractional part
+ const int v0 = kLinearToGammaTab[tab_pos];
+ const int v1 = kLinearToGammaTab[tab_pos + 1];
+ const int y = v1 * x + v0 * ((kGammaTabScale << 2) - x); // interpolate
+ assert(tab_pos + 1 < kGammaTabSize + 1);
+ return y;
+}
+
+// Convert a linear value 'v' to YUV_FIX+2 fixed-point precision
+// U/V value, suitable for RGBToU/V calls.
+static WEBP_INLINE int LinearToGamma(uint32_t base_value, int shift) {
+ const int y = Interpolate(base_value << shift); // final uplifted value
+ return (y + kGammaTabRounder) >> kGammaTabFix; // descale
+}
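+// Fixed-point bookkeeping for the two functions above (a reading note, not
+// upstream documentation): callers pass a sum of up to four linear values in
+// [0, kGammaScale], so base_value << shift spans kGammaFix + 2 == 14 bits.
+// Interpolate() splits that into a 5-bit table position (kGammaTabSize == 32
+// entries) and a 9-bit fractional part, and the final >> kGammaTabFix keeps
+// a factor 4, i.e. the YUV_FIX + 2 precision announced in the comment.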
+
+#else
+
+static void InitGammaTables(void) {}
+static WEBP_INLINE uint32_t GammaToLinear(uint8_t v) { return v; }
+static WEBP_INLINE int LinearToGamma(uint32_t base_value, int shift) {
+ return (int)(base_value << shift);
+}
+
+#endif // USE_GAMMA_COMPRESSION
+
+//------------------------------------------------------------------------------
+// RGB -> YUV conversion
+
+static int RGBToY(int r, int g, int b, VP8Random* const rg) {
+ return (rg == NULL) ? VP8RGBToY(r, g, b, YUV_HALF)
+ : VP8RGBToY(r, g, b, VP8RandomBits(rg, YUV_FIX));
+}
+
+static int RGBToU(int r, int g, int b, VP8Random* const rg) {
+ return (rg == NULL) ? VP8RGBToU(r, g, b, YUV_HALF << 2)
+ : VP8RGBToU(r, g, b, VP8RandomBits(rg, YUV_FIX + 2));
+}
+
+static int RGBToV(int r, int g, int b, VP8Random* const rg) {
+ return (rg == NULL) ? VP8RGBToV(r, g, b, YUV_HALF << 2)
+ : VP8RGBToV(r, g, b, VP8RandomBits(rg, YUV_FIX + 2));
+}
+
+//------------------------------------------------------------------------------
+// Sharp RGB->YUV conversion
+
+static const int kNumIterations = 4;
+static const int kMinDimensionIterativeConversion = 4;
+
+// We could use SFIX=0 and only uint8_t for fixed_y_t, but that sometimes
+// produces banding. Better to use extra precision.
+#define SFIX 2 // fixed-point precision of RGB and Y/W
+typedef int16_t fixed_t; // signed type with extra SFIX precision for UV
+typedef uint16_t fixed_y_t; // unsigned type with extra SFIX precision for W
+
+#define SHALF (1 << SFIX >> 1)
+#define MAX_Y_T ((256 << SFIX) - 1)
+#define SROUNDER (1 << (YUV_FIX + SFIX - 1))
+
+#if defined(USE_GAMMA_COMPRESSION)
+
+// We use tables of different size and precision for the Rec709 / BT2020
+// transfer function.
+#define kGammaF (1./0.45)
+static uint32_t kLinearToGammaTabS[kGammaTabSize + 2];
+#define GAMMA_TO_LINEAR_BITS 14
+static uint32_t kGammaToLinearTabS[MAX_Y_T + 1]; // size scales with Y_FIX
+static volatile int kGammaTablesSOk = 0;
+static void InitGammaTablesS(void);
+
+WEBP_DSP_INIT_FUNC(InitGammaTablesS) {
+ assert(2 * GAMMA_TO_LINEAR_BITS < 32); // we use uint32_t intermediate values
+ if (!kGammaTablesSOk) {
+ int v;
+ const double norm = 1. / MAX_Y_T;
+ const double scale = 1. / kGammaTabSize;
+ const double a = 0.09929682680944;
+ const double thresh = 0.018053968510807;
+ const double final_scale = 1 << GAMMA_TO_LINEAR_BITS;
+ for (v = 0; v <= MAX_Y_T; ++v) {
+ const double g = norm * v;
+ double value;
+ if (g <= thresh * 4.5) {
+ value = g / 4.5;
+ } else {
+ const double a_rec = 1. / (1. + a);
+ value = pow(a_rec * (g + a), kGammaF);
+ }
+ kGammaToLinearTabS[v] = (uint32_t)(value * final_scale + .5);
+ }
+ for (v = 0; v <= kGammaTabSize; ++v) {
+ const double g = scale * v;
+ double value;
+ if (g <= thresh) {
+ value = 4.5 * g;
+ } else {
+ value = (1. + a) * pow(g, 1. / kGammaF) - a;
+ }
+ // we already incorporate the 1/2 rounding constant here
+ kLinearToGammaTabS[v] =
+ (uint32_t)(MAX_Y_T * value) + (1 << GAMMA_TO_LINEAR_BITS >> 1);
+ }
+ // to prevent small rounding errors from causing a read overflow:
+ kLinearToGammaTabS[kGammaTabSize + 1] = kLinearToGammaTabS[kGammaTabSize];
+ kGammaTablesSOk = 1;
+ }
+}
+
+// return value has a fixed-point precision of GAMMA_TO_LINEAR_BITS
+static WEBP_INLINE uint32_t GammaToLinearS(int v) {
+ return kGammaToLinearTabS[v];
+}
+
+static WEBP_INLINE uint32_t LinearToGammaS(uint32_t value) {
+ // 'value' is in GAMMA_TO_LINEAR_BITS fractional precision
+ const uint32_t v = value * kGammaTabSize;
+ const uint32_t tab_pos = v >> GAMMA_TO_LINEAR_BITS;
+ // fractional part, in GAMMA_TO_LINEAR_BITS fixed-point precision
+ const uint32_t x = v - (tab_pos << GAMMA_TO_LINEAR_BITS); // fractional part
+ // v0 / v1 are in GAMMA_TO_LINEAR_BITS fixed-point precision (range [0..1])
+ const uint32_t v0 = kLinearToGammaTabS[tab_pos + 0];
+ const uint32_t v1 = kLinearToGammaTabS[tab_pos + 1];
+ // Final interpolation. Note that rounding is already included.
+ const uint32_t v2 = (v1 - v0) * x; // note: v1 >= v0.
+ const uint32_t result = v0 + (v2 >> GAMMA_TO_LINEAR_BITS);
+ return result;
+}
+
+#else
+
+static void InitGammaTablesS(void) {}
+static WEBP_INLINE uint32_t GammaToLinearS(int v) {
+ return (v << GAMMA_TO_LINEAR_BITS) / MAX_Y_T;
+}
+static WEBP_INLINE uint32_t LinearToGammaS(uint32_t value) {
+ return (MAX_Y_T * value) >> GAMMA_TO_LINEAR_BITS;
+}
+
+#endif // USE_GAMMA_COMPRESSION
+
+//------------------------------------------------------------------------------
+
+static uint8_t clip_8b(fixed_t v) {
+ return (!(v & ~0xff)) ? (uint8_t)v : (v < 0) ? 0u : 255u;
+}
+
+static fixed_y_t clip_y(int y) {
+ return (!(y & ~MAX_Y_T)) ? (fixed_y_t)y : (y < 0) ? 0 : MAX_Y_T;
+}
+
+//------------------------------------------------------------------------------
+
+static int RGBToGray(int r, int g, int b) {
+ const int luma = 13933 * r + 46871 * g + 4732 * b + YUV_HALF;
+ return (luma >> YUV_FIX);
+}
+
+static uint32_t ScaleDown(int a, int b, int c, int d) {
+ const uint32_t A = GammaToLinearS(a);
+ const uint32_t B = GammaToLinearS(b);
+ const uint32_t C = GammaToLinearS(c);
+ const uint32_t D = GammaToLinearS(d);
+ return LinearToGammaS((A + B + C + D + 2) >> 2);
+}
+
+static WEBP_INLINE void UpdateW(const fixed_y_t* src, fixed_y_t* dst, int w) {
+ int i;
+ for (i = 0; i < w; ++i) {
+ const uint32_t R = GammaToLinearS(src[0 * w + i]);
+ const uint32_t G = GammaToLinearS(src[1 * w + i]);
+ const uint32_t B = GammaToLinearS(src[2 * w + i]);
+ const uint32_t Y = RGBToGray(R, G, B);
+ dst[i] = (fixed_y_t)LinearToGammaS(Y);
+ }
+}
+
+static void UpdateChroma(const fixed_y_t* src1, const fixed_y_t* src2,
+ fixed_t* dst, int uv_w) {
+ int i;
+ for (i = 0; i < uv_w; ++i) {
+ const int r = ScaleDown(src1[0 * uv_w + 0], src1[0 * uv_w + 1],
+ src2[0 * uv_w + 0], src2[0 * uv_w + 1]);
+ const int g = ScaleDown(src1[2 * uv_w + 0], src1[2 * uv_w + 1],
+ src2[2 * uv_w + 0], src2[2 * uv_w + 1]);
+ const int b = ScaleDown(src1[4 * uv_w + 0], src1[4 * uv_w + 1],
+ src2[4 * uv_w + 0], src2[4 * uv_w + 1]);
+ const int W = RGBToGray(r, g, b);
+ dst[0 * uv_w] = (fixed_t)(r - W);
+ dst[1 * uv_w] = (fixed_t)(g - W);
+ dst[2 * uv_w] = (fixed_t)(b - W);
+ dst += 1;
+ src1 += 2;
+ src2 += 2;
+ }
+}
+
+static void StoreGray(const fixed_y_t* rgb, fixed_y_t* y, int w) {
+ int i;
+ for (i = 0; i < w; ++i) {
+ y[i] = RGBToGray(rgb[0 * w + i], rgb[1 * w + i], rgb[2 * w + i]);
+ }
+}
+
+//------------------------------------------------------------------------------
+
+static WEBP_INLINE fixed_y_t Filter2(int A, int B, int W0) {
+ const int v0 = (A * 3 + B + 2) >> 2;
+ return clip_y(v0 + W0);
+}
+
+//------------------------------------------------------------------------------
+
+static WEBP_INLINE fixed_y_t UpLift(uint8_t a) { // 8bit -> SFIX
+ return ((fixed_y_t)a << SFIX) | SHALF;
+}
+
+static void ImportOneRow(const uint8_t* const r_ptr,
+ const uint8_t* const g_ptr,
+ const uint8_t* const b_ptr,
+ int step,
+ int pic_width,
+ fixed_y_t* const dst) {
+ int i;
+ const int w = (pic_width + 1) & ~1;
+ for (i = 0; i < pic_width; ++i) {
+ const int off = i * step;
+ dst[i + 0 * w] = UpLift(r_ptr[off]);
+ dst[i + 1 * w] = UpLift(g_ptr[off]);
+ dst[i + 2 * w] = UpLift(b_ptr[off]);
+ }
+ if (pic_width & 1) { // replicate rightmost pixel
+ dst[pic_width + 0 * w] = dst[pic_width + 0 * w - 1];
+ dst[pic_width + 1 * w] = dst[pic_width + 1 * w - 1];
+ dst[pic_width + 2 * w] = dst[pic_width + 2 * w - 1];
+ }
+}
+
+static void InterpolateTwoRows(const fixed_y_t* const best_y,
+ const fixed_t* prev_uv,
+ const fixed_t* cur_uv,
+ const fixed_t* next_uv,
+ int w,
+ fixed_y_t* out1,
+ fixed_y_t* out2) {
+ const int uv_w = w >> 1;
+ const int len = (w - 1) >> 1; // length to filter
+ int k = 3;
+ while (k-- > 0) { // process each of the R/G/B segments in turn
+ // special boundary case for i==0
+ out1[0] = Filter2(cur_uv[0], prev_uv[0], best_y[0]);
+ out2[0] = Filter2(cur_uv[0], next_uv[0], best_y[w]);
+
+ WebPSharpYUVFilterRow(cur_uv, prev_uv, len, best_y + 0 + 1, out1 + 1);
+ WebPSharpYUVFilterRow(cur_uv, next_uv, len, best_y + w + 1, out2 + 1);
+
+ // special boundary case for i == w - 1 when w is even
+ if (!(w & 1)) {
+ out1[w - 1] = Filter2(cur_uv[uv_w - 1], prev_uv[uv_w - 1],
+ best_y[w - 1 + 0]);
+ out2[w - 1] = Filter2(cur_uv[uv_w - 1], next_uv[uv_w - 1],
+ best_y[w - 1 + w]);
+ }
+ out1 += w;
+ out2 += w;
+ prev_uv += uv_w;
+ cur_uv += uv_w;
+ next_uv += uv_w;
+ }
+}
+
+static WEBP_INLINE uint8_t ConvertRGBToY(int r, int g, int b) {
+ const int luma = 16839 * r + 33059 * g + 6420 * b + SROUNDER;
+ return clip_8b(16 + (luma >> (YUV_FIX + SFIX)));
+}
+
+static WEBP_INLINE uint8_t ConvertRGBToU(int r, int g, int b) {
+ const int u = -9719 * r - 19081 * g + 28800 * b + SROUNDER;
+ return clip_8b(128 + (u >> (YUV_FIX + SFIX)));
+}
+
+static WEBP_INLINE uint8_t ConvertRGBToV(int r, int g, int b) {
+ const int v = +28800 * r - 24116 * g - 4684 * b + SROUNDER;
+ return clip_8b(128 + (v >> (YUV_FIX + SFIX)));
+}
+
+static int ConvertWRGBToYUV(const fixed_y_t* best_y, const fixed_t* best_uv,
+ WebPPicture* const picture) {
+ int i, j;
+ uint8_t* dst_y = picture->y;
+ uint8_t* dst_u = picture->u;
+ uint8_t* dst_v = picture->v;
+ const fixed_t* const best_uv_base = best_uv;
+ const int w = (picture->width + 1) & ~1;
+ const int h = (picture->height + 1) & ~1;
+ const int uv_w = w >> 1;
+ const int uv_h = h >> 1;
+ for (best_uv = best_uv_base, j = 0; j < picture->height; ++j) {
+ for (i = 0; i < picture->width; ++i) {
+ const int off = (i >> 1);
+ const int W = best_y[i];
+ const int r = best_uv[off + 0 * uv_w] + W;
+ const int g = best_uv[off + 1 * uv_w] + W;
+ const int b = best_uv[off + 2 * uv_w] + W;
+ dst_y[i] = ConvertRGBToY(r, g, b);
+ }
+ best_y += w;
+ best_uv += (j & 1) * 3 * uv_w;
+ dst_y += picture->y_stride;
+ }
+ for (best_uv = best_uv_base, j = 0; j < uv_h; ++j) {
+ for (i = 0; i < uv_w; ++i) {
+ const int off = i;
+ const int r = best_uv[off + 0 * uv_w];
+ const int g = best_uv[off + 1 * uv_w];
+ const int b = best_uv[off + 2 * uv_w];
+ dst_u[i] = ConvertRGBToU(r, g, b);
+ dst_v[i] = ConvertRGBToV(r, g, b);
+ }
+ best_uv += 3 * uv_w;
+ dst_u += picture->uv_stride;
+ dst_v += picture->uv_stride;
+ }
+ return 1;
+}
+
+//------------------------------------------------------------------------------
+// Main function
+
+#define SAFE_ALLOC(W, H, T) ((T*)WebPSafeMalloc((W) * (H), sizeof(T)))
+
+static int PreprocessARGB(const uint8_t* r_ptr,
+ const uint8_t* g_ptr,
+ const uint8_t* b_ptr,
+ int step, int rgb_stride,
+ WebPPicture* const picture) {
+ // we expand the right/bottom border if needed
+ const int w = (picture->width + 1) & ~1;
+ const int h = (picture->height + 1) & ~1;
+ const int uv_w = w >> 1;
+ const int uv_h = h >> 1;
+ uint64_t prev_diff_y_sum = ~0;
+ int j, iter;
+
+ // TODO(skal): allocate one big memory chunk. But for now, it's easier
+ // for valgrind debugging to have several chunks.
+ fixed_y_t* const tmp_buffer = SAFE_ALLOC(w * 3, 2, fixed_y_t); // scratch
+ fixed_y_t* const best_y_base = SAFE_ALLOC(w, h, fixed_y_t);
+ fixed_y_t* const target_y_base = SAFE_ALLOC(w, h, fixed_y_t);
+ fixed_y_t* const best_rgb_y = SAFE_ALLOC(w, 2, fixed_y_t);
+ fixed_t* const best_uv_base = SAFE_ALLOC(uv_w * 3, uv_h, fixed_t);
+ fixed_t* const target_uv_base = SAFE_ALLOC(uv_w * 3, uv_h, fixed_t);
+ fixed_t* const best_rgb_uv = SAFE_ALLOC(uv_w * 3, 1, fixed_t);
+ fixed_y_t* best_y = best_y_base;
+ fixed_y_t* target_y = target_y_base;
+ fixed_t* best_uv = best_uv_base;
+ fixed_t* target_uv = target_uv_base;
+ const uint64_t diff_y_threshold = (uint64_t)(3.0 * w * h);
+ int ok;
+
+ if (best_y_base == NULL || best_uv_base == NULL ||
+ target_y_base == NULL || target_uv_base == NULL ||
+ best_rgb_y == NULL || best_rgb_uv == NULL ||
+ tmp_buffer == NULL) {
+ ok = WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
+ goto End;
+ }
+ assert(picture->width >= kMinDimensionIterativeConversion);
+ assert(picture->height >= kMinDimensionIterativeConversion);
+
+ WebPInitConvertARGBToYUV();
+
+ // Import RGB samples to W/RGB representation.
+ for (j = 0; j < picture->height; j += 2) {
+ const int is_last_row = (j == picture->height - 1);
+ fixed_y_t* const src1 = tmp_buffer + 0 * w;
+ fixed_y_t* const src2 = tmp_buffer + 3 * w;
+
+ // prepare two rows of input
+ ImportOneRow(r_ptr, g_ptr, b_ptr, step, picture->width, src1);
+ if (!is_last_row) {
+ ImportOneRow(r_ptr + rgb_stride, g_ptr + rgb_stride, b_ptr + rgb_stride,
+ step, picture->width, src2);
+ } else {
+ memcpy(src2, src1, 3 * w * sizeof(*src2));
+ }
+ StoreGray(src1, best_y + 0, w);
+ StoreGray(src2, best_y + w, w);
+
+ UpdateW(src1, target_y, w);
+ UpdateW(src2, target_y + w, w);
+ UpdateChroma(src1, src2, target_uv, uv_w);
+ memcpy(best_uv, target_uv, 3 * uv_w * sizeof(*best_uv));
+ best_y += 2 * w;
+ best_uv += 3 * uv_w;
+ target_y += 2 * w;
+ target_uv += 3 * uv_w;
+ r_ptr += 2 * rgb_stride;
+ g_ptr += 2 * rgb_stride;
+ b_ptr += 2 * rgb_stride;
+ }
+
+ // Iterate and resolve clipping conflicts.
+ for (iter = 0; iter < kNumIterations; ++iter) {
+ const fixed_t* cur_uv = best_uv_base;
+ const fixed_t* prev_uv = best_uv_base;
+ uint64_t diff_y_sum = 0;
+
+ best_y = best_y_base;
+ best_uv = best_uv_base;
+ target_y = target_y_base;
+ target_uv = target_uv_base;
+ for (j = 0; j < h; j += 2) {
+ fixed_y_t* const src1 = tmp_buffer + 0 * w;
+ fixed_y_t* const src2 = tmp_buffer + 3 * w;
+ {
+ const fixed_t* const next_uv = cur_uv + ((j < h - 2) ? 3 * uv_w : 0);
+ InterpolateTwoRows(best_y, prev_uv, cur_uv, next_uv, w, src1, src2);
+ prev_uv = cur_uv;
+ cur_uv = next_uv;
+ }
+
+ UpdateW(src1, best_rgb_y + 0 * w, w);
+ UpdateW(src2, best_rgb_y + 1 * w, w);
+ UpdateChroma(src1, src2, best_rgb_uv, uv_w);
+
+ // update two rows of Y and one row of RGB
+ diff_y_sum += WebPSharpYUVUpdateY(target_y, best_rgb_y, best_y, 2 * w);
+ WebPSharpYUVUpdateRGB(target_uv, best_rgb_uv, best_uv, 3 * uv_w);
+
+ best_y += 2 * w;
+ best_uv += 3 * uv_w;
+ target_y += 2 * w;
+ target_uv += 3 * uv_w;
+ }
+ // test exit condition
+ if (iter > 0) {
+ if (diff_y_sum < diff_y_threshold) break;
+ if (diff_y_sum > prev_diff_y_sum) break;
+ }
+ prev_diff_y_sum = diff_y_sum;
+ }
+ // final reconstruction
+ ok = ConvertWRGBToYUV(best_y_base, best_uv_base, picture);
+
+ End:
+ WebPSafeFree(best_y_base);
+ WebPSafeFree(best_uv_base);
+ WebPSafeFree(target_y_base);
+ WebPSafeFree(target_uv_base);
+ WebPSafeFree(best_rgb_y);
+ WebPSafeFree(best_rgb_uv);
+ WebPSafeFree(tmp_buffer);
+ return ok;
+}
+#undef SAFE_ALLOC
+
+//------------------------------------------------------------------------------
+// "Fast" regular RGB->YUV
+
+#define SUM4(ptr, step) LinearToGamma( \
+ GammaToLinear((ptr)[0]) + \
+ GammaToLinear((ptr)[(step)]) + \
+ GammaToLinear((ptr)[rgb_stride]) + \
+ GammaToLinear((ptr)[rgb_stride + (step)]), 0) \
+
+#define SUM2(ptr) \
+ LinearToGamma(GammaToLinear((ptr)[0]) + GammaToLinear((ptr)[rgb_stride]), 1)
+
+#define SUM2ALPHA(ptr) ((ptr)[0] + (ptr)[rgb_stride])
+#define SUM4ALPHA(ptr) (SUM2ALPHA(ptr) + SUM2ALPHA((ptr) + 4))
+
+#if defined(USE_INVERSE_ALPHA_TABLE)
+
+static const int kAlphaFix = 19;
+// The following table is (1 << kAlphaFix) / a. The formula
+// (v * kInvAlpha[a]) >> kAlphaFix is then equal to v / a in most (99.6%)
+// cases. Note that this table
+// and constant are adjusted very tightly to fit 32b arithmetic.
+// In particular, they use the fact that the operands for 'v / a' are actually
+// derived as v = (a0.p0 + a1.p1 + a2.p2 + a3.p3) and a = a0 + a1 + a2 + a3
+// with ai in [0..255] and pi in [0..1<<kGammaFix). The constraint to avoid
+// overflow is: kGammaFix + kAlphaFix <= 31.
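+// For instance (illustrative): kInvAlpha[4] == 131072 == (1 << 19) / 4, so
+// (v * kInvAlpha[4]) >> kAlphaFix computes v / 4 exactly for the ranges
+// described above; DIVIDE_BY_ALPHA below shifts by (kAlphaFix - 2) instead,
+// folding in the factor 4 that LinearToGamma() expects.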
+static const uint32_t kInvAlpha[4 * 0xff + 1] = {
+ 0, /* alpha = 0 */
+ 524288, 262144, 174762, 131072, 104857, 87381, 74898, 65536,
+ 58254, 52428, 47662, 43690, 40329, 37449, 34952, 32768,
+ 30840, 29127, 27594, 26214, 24966, 23831, 22795, 21845,
+ 20971, 20164, 19418, 18724, 18078, 17476, 16912, 16384,
+ 15887, 15420, 14979, 14563, 14169, 13797, 13443, 13107,
+ 12787, 12483, 12192, 11915, 11650, 11397, 11155, 10922,
+ 10699, 10485, 10280, 10082, 9892, 9709, 9532, 9362,
+ 9198, 9039, 8886, 8738, 8594, 8456, 8322, 8192,
+ 8065, 7943, 7825, 7710, 7598, 7489, 7384, 7281,
+ 7182, 7084, 6990, 6898, 6808, 6721, 6636, 6553,
+ 6472, 6393, 6316, 6241, 6168, 6096, 6026, 5957,
+ 5890, 5825, 5761, 5698, 5637, 5577, 5518, 5461,
+ 5405, 5349, 5295, 5242, 5190, 5140, 5090, 5041,
+ 4993, 4946, 4899, 4854, 4809, 4766, 4723, 4681,
+ 4639, 4599, 4559, 4519, 4481, 4443, 4405, 4369,
+ 4332, 4297, 4262, 4228, 4194, 4161, 4128, 4096,
+ 4064, 4032, 4002, 3971, 3942, 3912, 3883, 3855,
+ 3826, 3799, 3771, 3744, 3718, 3692, 3666, 3640,
+ 3615, 3591, 3566, 3542, 3518, 3495, 3472, 3449,
+ 3426, 3404, 3382, 3360, 3339, 3318, 3297, 3276,
+ 3256, 3236, 3216, 3196, 3177, 3158, 3139, 3120,
+ 3102, 3084, 3066, 3048, 3030, 3013, 2995, 2978,
+ 2962, 2945, 2928, 2912, 2896, 2880, 2864, 2849,
+ 2833, 2818, 2803, 2788, 2774, 2759, 2744, 2730,
+ 2716, 2702, 2688, 2674, 2661, 2647, 2634, 2621,
+ 2608, 2595, 2582, 2570, 2557, 2545, 2532, 2520,
+ 2508, 2496, 2484, 2473, 2461, 2449, 2438, 2427,
+ 2416, 2404, 2394, 2383, 2372, 2361, 2351, 2340,
+ 2330, 2319, 2309, 2299, 2289, 2279, 2269, 2259,
+ 2250, 2240, 2231, 2221, 2212, 2202, 2193, 2184,
+ 2175, 2166, 2157, 2148, 2139, 2131, 2122, 2114,
+ 2105, 2097, 2088, 2080, 2072, 2064, 2056, 2048,
+ 2040, 2032, 2024, 2016, 2008, 2001, 1993, 1985,
+ 1978, 1971, 1963, 1956, 1949, 1941, 1934, 1927,
+ 1920, 1913, 1906, 1899, 1892, 1885, 1879, 1872,
+ 1865, 1859, 1852, 1846, 1839, 1833, 1826, 1820,
+ 1814, 1807, 1801, 1795, 1789, 1783, 1777, 1771,
+ 1765, 1759, 1753, 1747, 1741, 1736, 1730, 1724,
+ 1718, 1713, 1707, 1702, 1696, 1691, 1685, 1680,
+ 1675, 1669, 1664, 1659, 1653, 1648, 1643, 1638,
+ 1633, 1628, 1623, 1618, 1613, 1608, 1603, 1598,
+ 1593, 1588, 1583, 1579, 1574, 1569, 1565, 1560,
+ 1555, 1551, 1546, 1542, 1537, 1533, 1528, 1524,
+ 1519, 1515, 1510, 1506, 1502, 1497, 1493, 1489,
+ 1485, 1481, 1476, 1472, 1468, 1464, 1460, 1456,
+ 1452, 1448, 1444, 1440, 1436, 1432, 1428, 1424,
+ 1420, 1416, 1413, 1409, 1405, 1401, 1398, 1394,
+ 1390, 1387, 1383, 1379, 1376, 1372, 1368, 1365,
+ 1361, 1358, 1354, 1351, 1347, 1344, 1340, 1337,
+ 1334, 1330, 1327, 1323, 1320, 1317, 1314, 1310,
+ 1307, 1304, 1300, 1297, 1294, 1291, 1288, 1285,
+ 1281, 1278, 1275, 1272, 1269, 1266, 1263, 1260,
+ 1257, 1254, 1251, 1248, 1245, 1242, 1239, 1236,
+ 1233, 1230, 1227, 1224, 1222, 1219, 1216, 1213,
+ 1210, 1208, 1205, 1202, 1199, 1197, 1194, 1191,
+ 1188, 1186, 1183, 1180, 1178, 1175, 1172, 1170,
+ 1167, 1165, 1162, 1159, 1157, 1154, 1152, 1149,
+ 1147, 1144, 1142, 1139, 1137, 1134, 1132, 1129,
+ 1127, 1125, 1122, 1120, 1117, 1115, 1113, 1110,
+ 1108, 1106, 1103, 1101, 1099, 1096, 1094, 1092,
+ 1089, 1087, 1085, 1083, 1081, 1078, 1076, 1074,
+ 1072, 1069, 1067, 1065, 1063, 1061, 1059, 1057,
+ 1054, 1052, 1050, 1048, 1046, 1044, 1042, 1040,
+ 1038, 1036, 1034, 1032, 1030, 1028, 1026, 1024,
+ 1022, 1020, 1018, 1016, 1014, 1012, 1010, 1008,
+ 1006, 1004, 1002, 1000, 998, 996, 994, 992,
+ 991, 989, 987, 985, 983, 981, 979, 978,
+ 976, 974, 972, 970, 969, 967, 965, 963,
+ 961, 960, 958, 956, 954, 953, 951, 949,
+ 948, 946, 944, 942, 941, 939, 937, 936,
+ 934, 932, 931, 929, 927, 926, 924, 923,
+ 921, 919, 918, 916, 914, 913, 911, 910,
+ 908, 907, 905, 903, 902, 900, 899, 897,
+ 896, 894, 893, 891, 890, 888, 887, 885,
+ 884, 882, 881, 879, 878, 876, 875, 873,
+ 872, 870, 869, 868, 866, 865, 863, 862,
+ 860, 859, 858, 856, 855, 853, 852, 851,
+ 849, 848, 846, 845, 844, 842, 841, 840,
+ 838, 837, 836, 834, 833, 832, 830, 829,
+ 828, 826, 825, 824, 823, 821, 820, 819,
+ 817, 816, 815, 814, 812, 811, 810, 809,
+ 807, 806, 805, 804, 802, 801, 800, 799,
+ 798, 796, 795, 794, 793, 791, 790, 789,
+ 788, 787, 786, 784, 783, 782, 781, 780,
+ 779, 777, 776, 775, 774, 773, 772, 771,
+ 769, 768, 767, 766, 765, 764, 763, 762,
+ 760, 759, 758, 757, 756, 755, 754, 753,
+ 752, 751, 750, 748, 747, 746, 745, 744,
+ 743, 742, 741, 740, 739, 738, 737, 736,
+ 735, 734, 733, 732, 731, 730, 729, 728,
+ 727, 726, 725, 724, 723, 722, 721, 720,
+ 719, 718, 717, 716, 715, 714, 713, 712,
+ 711, 710, 709, 708, 707, 706, 705, 704,
+ 703, 702, 701, 700, 699, 699, 698, 697,
+ 696, 695, 694, 693, 692, 691, 690, 689,
+ 688, 688, 687, 686, 685, 684, 683, 682,
+ 681, 680, 680, 679, 678, 677, 676, 675,
+ 674, 673, 673, 672, 671, 670, 669, 668,
+ 667, 667, 666, 665, 664, 663, 662, 661,
+ 661, 660, 659, 658, 657, 657, 656, 655,
+ 654, 653, 652, 652, 651, 650, 649, 648,
+ 648, 647, 646, 645, 644, 644, 643, 642,
+ 641, 640, 640, 639, 638, 637, 637, 636,
+ 635, 634, 633, 633, 632, 631, 630, 630,
+ 629, 628, 627, 627, 626, 625, 624, 624,
+ 623, 622, 621, 621, 620, 619, 618, 618,
+ 617, 616, 616, 615, 614, 613, 613, 612,
+ 611, 611, 610, 609, 608, 608, 607, 606,
+ 606, 605, 604, 604, 603, 602, 601, 601,
+ 600, 599, 599, 598, 597, 597, 596, 595,
+ 595, 594, 593, 593, 592, 591, 591, 590,
+ 589, 589, 588, 587, 587, 586, 585, 585,
+ 584, 583, 583, 582, 581, 581, 580, 579,
+ 579, 578, 578, 577, 576, 576, 575, 574,
+ 574, 573, 572, 572, 571, 571, 570, 569,
+ 569, 568, 568, 567, 566, 566, 565, 564,
+ 564, 563, 563, 562, 561, 561, 560, 560,
+ 559, 558, 558, 557, 557, 556, 555, 555,
+ 554, 554, 553, 553, 552, 551, 551, 550,
+ 550, 549, 548, 548, 547, 547, 546, 546,
+ 545, 544, 544, 543, 543, 542, 542, 541,
+ 541, 540, 539, 539, 538, 538, 537, 537,
+ 536, 536, 535, 534, 534, 533, 533, 532,
+ 532, 531, 531, 530, 530, 529, 529, 528,
+ 527, 527, 526, 526, 525, 525, 524, 524,
+ 523, 523, 522, 522, 521, 521, 520, 520,
+ 519, 519, 518, 518, 517, 517, 516, 516,
+ 515, 515, 514, 514
+};
+
+// Note that LinearToGamma() expects the values to be premultiplied by 4,
+// so we incorporate this factor of 4 directly inside the DIVIDE_BY_ALPHA macro.
+#define DIVIDE_BY_ALPHA(sum, a) (((sum) * kInvAlpha[(a)]) >> (kAlphaFix - 2))
+
+#else
+
+#define DIVIDE_BY_ALPHA(sum, a) (4 * (sum) / (a))
+
+#endif // USE_INVERSE_ALPHA_TABLE
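+// Both definitions compute the same quantity: shifting by (kAlphaFix - 2)
+// rather than kAlphaFix multiplies by 4, so in either case
+// DIVIDE_BY_ALPHA(sum, a) is (up to the table's rounding) (4 * sum) / a.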
+
+static WEBP_INLINE int LinearToGammaWeighted(const uint8_t* src,
+ const uint8_t* a_ptr,
+ uint32_t total_a, int step,
+ int rgb_stride) {
+ const uint32_t sum =
+ a_ptr[0] * GammaToLinear(src[0]) +
+ a_ptr[step] * GammaToLinear(src[step]) +
+ a_ptr[rgb_stride] * GammaToLinear(src[rgb_stride]) +
+ a_ptr[rgb_stride + step] * GammaToLinear(src[rgb_stride + step]);
+ assert(total_a > 0 && total_a <= 4 * 0xff);
+#if defined(USE_INVERSE_ALPHA_TABLE)
+ assert((uint64_t)sum * kInvAlpha[total_a] < ((uint64_t)1 << 32));
+#endif
+ return LinearToGamma(DIVIDE_BY_ALPHA(sum, total_a), 0);
+}
+
+static WEBP_INLINE void ConvertRowToY(const uint8_t* const r_ptr,
+ const uint8_t* const g_ptr,
+ const uint8_t* const b_ptr,
+ int step,
+ uint8_t* const dst_y,
+ int width,
+ VP8Random* const rg) {
+ int i, j;
+ for (i = 0, j = 0; i < width; i += 1, j += step) {
+ dst_y[i] = RGBToY(r_ptr[j], g_ptr[j], b_ptr[j], rg);
+ }
+}
+
+static WEBP_INLINE void AccumulateRGBA(const uint8_t* const r_ptr,
+ const uint8_t* const g_ptr,
+ const uint8_t* const b_ptr,
+ const uint8_t* const a_ptr,
+ int rgb_stride,
+ uint16_t* dst, int width) {
+ int i, j;
+ // We loop over 2x2 blocks and produce one R/G/B/A value for each.
+ for (i = 0, j = 0; i < (width >> 1); i += 1, j += 2 * 4, dst += 4) {
+ const uint32_t a = SUM4ALPHA(a_ptr + j);
+ int r, g, b;
+ if (a == 4 * 0xff || a == 0) {
+ r = SUM4(r_ptr + j, 4);
+ g = SUM4(g_ptr + j, 4);
+ b = SUM4(b_ptr + j, 4);
+ } else {
+ r = LinearToGammaWeighted(r_ptr + j, a_ptr + j, a, 4, rgb_stride);
+ g = LinearToGammaWeighted(g_ptr + j, a_ptr + j, a, 4, rgb_stride);
+ b = LinearToGammaWeighted(b_ptr + j, a_ptr + j, a, 4, rgb_stride);
+ }
+ dst[0] = r;
+ dst[1] = g;
+ dst[2] = b;
+ dst[3] = a;
+ }
+ if (width & 1) {
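+ // For an odd width, the last column forms only a 2x1 block; doubling the
+ // two-pixel alpha sum keeps 'a' on the same 4-pixel scale as above.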
+ const uint32_t a = 2u * SUM2ALPHA(a_ptr + j);
+ int r, g, b;
+ if (a == 4 * 0xff || a == 0) {
+ r = SUM2(r_ptr + j);
+ g = SUM2(g_ptr + j);
+ b = SUM2(b_ptr + j);
+ } else {
+ r = LinearToGammaWeighted(r_ptr + j, a_ptr + j, a, 0, rgb_stride);
+ g = LinearToGammaWeighted(g_ptr + j, a_ptr + j, a, 0, rgb_stride);
+ b = LinearToGammaWeighted(b_ptr + j, a_ptr + j, a, 0, rgb_stride);
+ }
+ dst[0] = r;
+ dst[1] = g;
+ dst[2] = b;
+ dst[3] = a;
+ }
+}
+
+static WEBP_INLINE void AccumulateRGB(const uint8_t* const r_ptr,
+ const uint8_t* const g_ptr,
+ const uint8_t* const b_ptr,
+ int step, int rgb_stride,
+ uint16_t* dst, int width) {
+ int i, j;
+ for (i = 0, j = 0; i < (width >> 1); i += 1, j += 2 * step, dst += 4) {
+ dst[0] = SUM4(r_ptr + j, step);
+ dst[1] = SUM4(g_ptr + j, step);
+ dst[2] = SUM4(b_ptr + j, step);
+ }
+ if (width & 1) {
+ dst[0] = SUM2(r_ptr + j);
+ dst[1] = SUM2(g_ptr + j);
+ dst[2] = SUM2(b_ptr + j);
+ }
+}
+
+static WEBP_INLINE void ConvertRowsToUV(const uint16_t* rgb,
+ uint8_t* const dst_u,
+ uint8_t* const dst_v,
+ int width,
+ VP8Random* const rg) {
+ int i;
+ for (i = 0; i < width; i += 1, rgb += 4) {
+ const int r = rgb[0], g = rgb[1], b = rgb[2];
+ dst_u[i] = RGBToU(r, g, b, rg);
+ dst_v[i] = RGBToV(r, g, b, rg);
+ }
+}
+
+static int ImportYUVAFromRGBA(const uint8_t* r_ptr,
+ const uint8_t* g_ptr,
+ const uint8_t* b_ptr,
+ const uint8_t* a_ptr,
+ int step, // bytes per pixel
+ int rgb_stride, // bytes per scanline
+ float dithering,
+ int use_iterative_conversion,
+ WebPPicture* const picture) {
+ int y;
+ const int width = picture->width;
+ const int height = picture->height;
+ const int has_alpha = CheckNonOpaque(a_ptr, width, height, step, rgb_stride);
+ const int is_rgb = (r_ptr < b_ptr); // otherwise it's bgr
+
+ picture->colorspace = has_alpha ? WEBP_YUV420A : WEBP_YUV420;
+ picture->use_argb = 0;
+
+ // Disable sharp (iterative) conversion if the source is too small; it
+ // would be overkill.
+ if (width < kMinDimensionIterativeConversion ||
+ height < kMinDimensionIterativeConversion) {
+ use_iterative_conversion = 0;
+ }
+
+ if (!WebPPictureAllocYUVA(picture, width, height)) {
+ return 0;
+ }
+ if (has_alpha) {
+ assert(step == 4);
+#if defined(USE_GAMMA_COMPRESSION) && defined(USE_INVERSE_ALPHA_TABLE)
+ assert(kAlphaFix + kGammaFix <= 31);
+#endif
+ }
+
+ if (use_iterative_conversion) {
+ InitGammaTablesS();
+ if (!PreprocessARGB(r_ptr, g_ptr, b_ptr, step, rgb_stride, picture)) {
+ return 0;
+ }
+ if (has_alpha) {
+ WebPExtractAlpha(a_ptr, rgb_stride, width, height,
+ picture->a, picture->a_stride);
+ }
+ } else {
+ const int uv_width = (width + 1) >> 1;
+ int use_dsp = (step == 3); // use special function in this case
+ // temporary storage for accumulated R/G/B values during conversion to U/V
+ uint16_t* const tmp_rgb =
+ (uint16_t*)WebPSafeMalloc(4 * uv_width, sizeof(*tmp_rgb));
+ uint8_t* dst_y = picture->y;
+ uint8_t* dst_u = picture->u;
+ uint8_t* dst_v = picture->v;
+ uint8_t* dst_a = picture->a;
+
+ VP8Random base_rg;
+ VP8Random* rg = NULL;
+ if (dithering > 0.) {
+ VP8InitRandom(&base_rg, dithering);
+ rg = &base_rg;
+ use_dsp = 0; // can't use dsp in this case
+ }
+ WebPInitConvertARGBToYUV();
+ InitGammaTables();
+
+ if (tmp_rgb == NULL) return 0; // malloc error
+
+ // Downsample Y/U/V planes, two rows at a time
+ for (y = 0; y < (height >> 1); ++y) {
+ int rows_have_alpha = has_alpha;
+ if (use_dsp) {
+ if (is_rgb) {
+ WebPConvertRGB24ToY(r_ptr, dst_y, width);
+ WebPConvertRGB24ToY(r_ptr + rgb_stride,
+ dst_y + picture->y_stride, width);
+ } else {
+ WebPConvertBGR24ToY(b_ptr, dst_y, width);
+ WebPConvertBGR24ToY(b_ptr + rgb_stride,
+ dst_y + picture->y_stride, width);
+ }
+ } else {
+ ConvertRowToY(r_ptr, g_ptr, b_ptr, step, dst_y, width, rg);
+ ConvertRowToY(r_ptr + rgb_stride,
+ g_ptr + rgb_stride,
+ b_ptr + rgb_stride, step,
+ dst_y + picture->y_stride, width, rg);
+ }
+ dst_y += 2 * picture->y_stride;
+ if (has_alpha) {
+ rows_have_alpha &= !WebPExtractAlpha(a_ptr, rgb_stride, width, 2,
+ dst_a, picture->a_stride);
+ dst_a += 2 * picture->a_stride;
+ }
+ // Collect averaged R/G/B(/A)
+ if (!rows_have_alpha) {
+ AccumulateRGB(r_ptr, g_ptr, b_ptr, step, rgb_stride, tmp_rgb, width);
+ } else {
+ AccumulateRGBA(r_ptr, g_ptr, b_ptr, a_ptr, rgb_stride, tmp_rgb, width);
+ }
+ // Convert to U/V
+ if (rg == NULL) {
+ WebPConvertRGBA32ToUV(tmp_rgb, dst_u, dst_v, uv_width);
+ } else {
+ ConvertRowsToUV(tmp_rgb, dst_u, dst_v, uv_width, rg);
+ }
+ dst_u += picture->uv_stride;
+ dst_v += picture->uv_stride;
+ r_ptr += 2 * rgb_stride;
+ b_ptr += 2 * rgb_stride;
+ g_ptr += 2 * rgb_stride;
+ if (has_alpha) a_ptr += 2 * rgb_stride;
+ }
+ if (height & 1) { // extra last row
+ int row_has_alpha = has_alpha;
+ if (use_dsp) {
+ if (is_rgb) {
+ WebPConvertRGB24ToY(r_ptr, dst_y, width);
+ } else {
+ WebPConvertBGR24ToY(b_ptr, dst_y, width);
+ }
+ } else {
+ ConvertRowToY(r_ptr, g_ptr, b_ptr, step, dst_y, width, rg);
+ }
+ if (row_has_alpha) {
+ row_has_alpha &= !WebPExtractAlpha(a_ptr, 0, width, 1, dst_a, 0);
+ }
+ // Collect averaged R/G/B(/A)
+ if (!row_has_alpha) {
+ AccumulateRGB(r_ptr, g_ptr, b_ptr, step, /* rgb_stride = */ 0,
+ tmp_rgb, width);
+ } else {
+ AccumulateRGBA(r_ptr, g_ptr, b_ptr, a_ptr, /* rgb_stride = */ 0,
+ tmp_rgb, width);
+ }
+ if (rg == NULL) {
+ WebPConvertRGBA32ToUV(tmp_rgb, dst_u, dst_v, uv_width);
+ } else {
+ ConvertRowsToUV(tmp_rgb, dst_u, dst_v, uv_width, rg);
+ }
+ }
+ WebPSafeFree(tmp_rgb);
+ }
+ return 1;
+}
+
+#undef SUM4
+#undef SUM2
+#undef SUM4ALPHA
+#undef SUM2ALPHA
+
+//------------------------------------------------------------------------------
+// call for ARGB->YUVA conversion
+
+static int PictureARGBToYUVA(WebPPicture* picture, WebPEncCSP colorspace,
+ float dithering, int use_iterative_conversion) {
+ if (picture == NULL) return 0;
+ if (picture->argb == NULL) {
+ return WebPEncodingSetError(picture, VP8_ENC_ERROR_NULL_PARAMETER);
+ } else if ((colorspace & WEBP_CSP_UV_MASK) != WEBP_YUV420) {
+ return WebPEncodingSetError(picture, VP8_ENC_ERROR_INVALID_CONFIGURATION);
+ } else {
+ const uint8_t* const argb = (const uint8_t*)picture->argb;
+ const uint8_t* const a = argb + CHANNEL_OFFSET(0);
+ const uint8_t* const r = argb + CHANNEL_OFFSET(1);
+ const uint8_t* const g = argb + CHANNEL_OFFSET(2);
+ const uint8_t* const b = argb + CHANNEL_OFFSET(3);
+
+ picture->colorspace = WEBP_YUV420;
+ return ImportYUVAFromRGBA(r, g, b, a, 4, 4 * picture->argb_stride,
+ dithering, use_iterative_conversion, picture);
+ }
+}
+
+int WebPPictureARGBToYUVADithered(WebPPicture* picture, WebPEncCSP colorspace,
+ float dithering) {
+ return PictureARGBToYUVA(picture, colorspace, dithering, 0);
+}
+
+int WebPPictureARGBToYUVA(WebPPicture* picture, WebPEncCSP colorspace) {
+ return PictureARGBToYUVA(picture, colorspace, 0.f, 0);
+}
+
+int WebPPictureSharpARGBToYUVA(WebPPicture* picture) {
+ return PictureARGBToYUVA(picture, WEBP_YUV420, 0.f, 1);
+}
+// for backward compatibility
+int WebPPictureSmartARGBToYUVA(WebPPicture* picture) {
+ return WebPPictureSharpARGBToYUVA(picture);
+}
+
+//------------------------------------------------------------------------------
+// call for YUVA -> ARGB conversion
+
+int WebPPictureYUVAToARGB(WebPPicture* picture) {
+ if (picture == NULL) return 0;
+ if (picture->y == NULL || picture->u == NULL || picture->v == NULL) {
+ return WebPEncodingSetError(picture, VP8_ENC_ERROR_NULL_PARAMETER);
+ }
+ if ((picture->colorspace & WEBP_CSP_ALPHA_BIT) && picture->a == NULL) {
+ return WebPEncodingSetError(picture, VP8_ENC_ERROR_NULL_PARAMETER);
+ }
+ if ((picture->colorspace & WEBP_CSP_UV_MASK) != WEBP_YUV420) {
+ return WebPEncodingSetError(picture, VP8_ENC_ERROR_INVALID_CONFIGURATION);
+ }
+ // Allocate a new argb buffer (discarding the previous one).
+ if (!WebPPictureAllocARGB(picture, picture->width, picture->height)) return 0;
+ picture->use_argb = 1;
+
+ // Convert
+ {
+ int y;
+ const int width = picture->width;
+ const int height = picture->height;
+ const int argb_stride = 4 * picture->argb_stride;
+ uint8_t* dst = (uint8_t*)picture->argb;
+ const uint8_t* cur_u = picture->u, *cur_v = picture->v, *cur_y = picture->y;
+ WebPUpsampleLinePairFunc upsample =
+ WebPGetLinePairConverter(ALPHA_OFFSET > 0);
+
+ // First row, with replicated top samples.
+ upsample(cur_y, NULL, cur_u, cur_v, cur_u, cur_v, dst, NULL, width);
+ cur_y += picture->y_stride;
+ dst += argb_stride;
+ // Center rows.
+ for (y = 1; y + 1 < height; y += 2) {
+ const uint8_t* const top_u = cur_u;
+ const uint8_t* const top_v = cur_v;
+ cur_u += picture->uv_stride;
+ cur_v += picture->uv_stride;
+ upsample(cur_y, cur_y + picture->y_stride, top_u, top_v, cur_u, cur_v,
+ dst, dst + argb_stride, width);
+ cur_y += 2 * picture->y_stride;
+ dst += 2 * argb_stride;
+ }
+ // Last row (if needed), with replicated bottom samples.
+ if (height > 1 && !(height & 1)) {
+ upsample(cur_y, NULL, cur_u, cur_v, cur_u, cur_v, dst, NULL, width);
+ }
+ // Insert alpha values if needed, replacing the default 0xff ones.
+ if (picture->colorspace & WEBP_CSP_ALPHA_BIT) {
+ for (y = 0; y < height; ++y) {
+ uint32_t* const argb_dst = picture->argb + y * picture->argb_stride;
+ const uint8_t* const src = picture->a + y * picture->a_stride;
+ int x;
+ for (x = 0; x < width; ++x) {
+ argb_dst[x] = (argb_dst[x] & 0x00ffffffu) | ((uint32_t)src[x] << 24);
+ }
+ }
+ }
+ }
+ return 1;
+}
+
+//------------------------------------------------------------------------------
+// automatic import / conversion
+
+static int Import(WebPPicture* const picture,
+ const uint8_t* rgb, int rgb_stride,
+ int step, int swap_rb, int import_alpha) {
+ int y;
+ // swap_rb -> b,g,r,a ; !swap_rb -> r,g,b,a
+ const uint8_t* r_ptr = rgb + (swap_rb ? 2 : 0);
+ const uint8_t* g_ptr = rgb + 1;
+ const uint8_t* b_ptr = rgb + (swap_rb ? 0 : 2);
+ const int width = picture->width;
+ const int height = picture->height;
+
+ if (!picture->use_argb) {
+ const uint8_t* a_ptr = import_alpha ? rgb + 3 : NULL;
+ return ImportYUVAFromRGBA(r_ptr, g_ptr, b_ptr, a_ptr, step, rgb_stride,
+ 0.f /* no dithering */, 0, picture);
+ }
+ if (!WebPPictureAlloc(picture)) return 0;
+
+ VP8LDspInit();
+ WebPInitAlphaProcessing();
+
+ if (import_alpha) {
+ // dst[] byte order is {a,r,g,b} for big-endian, {b,g,r,a} for little-endian
+ uint32_t* dst = picture->argb;
+ const int do_copy = (ALPHA_OFFSET == 3) && swap_rb;
+ assert(step == 4);
+ if (do_copy) {
+ for (y = 0; y < height; ++y) {
+ memcpy(dst, rgb, width * 4);
+ rgb += rgb_stride;
+ dst += picture->argb_stride;
+ }
+ } else {
+ for (y = 0; y < height; ++y) {
+#ifdef WORDS_BIGENDIAN
+ // BGRA or RGBA input order.
+ const uint8_t* a_ptr = rgb + 3;
+ WebPPackARGB(a_ptr, r_ptr, g_ptr, b_ptr, width, dst);
+ r_ptr += rgb_stride;
+ g_ptr += rgb_stride;
+ b_ptr += rgb_stride;
+#else
+ // RGBA input order. Need to swap R and B.
+ VP8LConvertBGRAToRGBA((const uint32_t*)rgb, width, (uint8_t*)dst);
+#endif
+ rgb += rgb_stride;
+ dst += picture->argb_stride;
+ }
+ }
+ } else {
+ uint32_t* dst = picture->argb;
+ assert(step >= 3);
+ for (y = 0; y < height; ++y) {
+ WebPPackRGB(r_ptr, g_ptr, b_ptr, width, step, dst);
+ r_ptr += rgb_stride;
+ g_ptr += rgb_stride;
+ b_ptr += rgb_stride;
+ dst += picture->argb_stride;
+ }
+ }
+ return 1;
+}
+
+// Public API
+
+#if !defined(WEBP_REDUCE_CSP)
+
+int WebPPictureImportBGR(WebPPicture* picture,
+ const uint8_t* rgb, int rgb_stride) {
+ return (picture != NULL && rgb != NULL)
+ ? Import(picture, rgb, rgb_stride, 3, 1, 0)
+ : 0;
+}
+
+int WebPPictureImportBGRA(WebPPicture* picture,
+ const uint8_t* rgba, int rgba_stride) {
+ return (picture != NULL && rgba != NULL)
+ ? Import(picture, rgba, rgba_stride, 4, 1, 1)
+ : 0;
+}
+
+int WebPPictureImportBGRX(WebPPicture* picture,
+ const uint8_t* rgba, int rgba_stride) {
+ return (picture != NULL && rgba != NULL)
+ ? Import(picture, rgba, rgba_stride, 4, 1, 0)
+ : 0;
+}
+
+#endif // WEBP_REDUCE_CSP
+
+int WebPPictureImportRGB(WebPPicture* picture,
+ const uint8_t* rgb, int rgb_stride) {
+ return (picture != NULL && rgb != NULL)
+ ? Import(picture, rgb, rgb_stride, 3, 0, 0)
+ : 0;
+}
+
+int WebPPictureImportRGBA(WebPPicture* picture,
+ const uint8_t* rgba, int rgba_stride) {
+ return (picture != NULL && rgba != NULL)
+ ? Import(picture, rgba, rgba_stride, 4, 0, 1)
+ : 0;
+}
+
+int WebPPictureImportRGBX(WebPPicture* picture,
+ const uint8_t* rgba, int rgba_stride) {
+ return (picture != NULL && rgba != NULL)
+ ? Import(picture, rgba, rgba_stride, 4, 0, 0)
+ : 0;
+}
+
+//------------------------------------------------------------------------------
diff --git a/media/libwebp/enc/picture_enc.c b/media/libwebp/enc/picture_enc.c
new file mode 100644
index 0000000000..5275ba9ed2
--- /dev/null
+++ b/media/libwebp/enc/picture_enc.c
@@ -0,0 +1,296 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// WebPPicture class basis
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <assert.h>
+#include <stdlib.h>
+
+#include "../enc/vp8i_enc.h"
+#include "../dsp/dsp.h"
+#include "../utils/utils.h"
+
+//------------------------------------------------------------------------------
+// WebPPicture
+//------------------------------------------------------------------------------
+
+static int DummyWriter(const uint8_t* data, size_t data_size,
+ const WebPPicture* const picture) {
+ // The following are to prevent 'unused variable' error messages.
+ (void)data;
+ (void)data_size;
+ (void)picture;
+ return 1;
+}
+
+int WebPPictureInitInternal(WebPPicture* picture, int version) {
+ if (WEBP_ABI_IS_INCOMPATIBLE(version, WEBP_ENCODER_ABI_VERSION)) {
+ return 0; // caller/system version mismatch!
+ }
+ if (picture != NULL) {
+ memset(picture, 0, sizeof(*picture));
+ picture->writer = DummyWriter;
+ WebPEncodingSetError(picture, VP8_ENC_OK);
+ }
+ return 1;
+}
+
+//------------------------------------------------------------------------------
+
+static void WebPPictureResetBufferARGB(WebPPicture* const picture) {
+ picture->memory_argb_ = NULL;
+ picture->argb = NULL;
+ picture->argb_stride = 0;
+}
+
+static void WebPPictureResetBufferYUVA(WebPPicture* const picture) {
+ picture->memory_ = NULL;
+ picture->y = picture->u = picture->v = picture->a = NULL;
+ picture->y_stride = picture->uv_stride = 0;
+ picture->a_stride = 0;
+}
+
+void WebPPictureResetBuffers(WebPPicture* const picture) {
+ WebPPictureResetBufferARGB(picture);
+ WebPPictureResetBufferYUVA(picture);
+}
+
+int WebPPictureAllocARGB(WebPPicture* const picture, int width, int height) {
+ void* memory;
+ const uint64_t argb_size = (uint64_t)width * height;
+
+ assert(picture != NULL);
+
+ WebPSafeFree(picture->memory_argb_);
+ WebPPictureResetBufferARGB(picture);
+
+ if (width <= 0 || height <= 0) {
+ return WebPEncodingSetError(picture, VP8_ENC_ERROR_BAD_DIMENSION);
+ }
+ // allocate a new buffer.
+ memory = WebPSafeMalloc(argb_size + WEBP_ALIGN_CST, sizeof(*picture->argb));
+ if (memory == NULL) {
+ return WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
+ }
+ picture->memory_argb_ = memory;
+ picture->argb = (uint32_t*)WEBP_ALIGN(memory);
+ picture->argb_stride = width;
+ return 1;
+}
+
+int WebPPictureAllocYUVA(WebPPicture* const picture, int width, int height) {
+ const WebPEncCSP uv_csp =
+ (WebPEncCSP)((int)picture->colorspace & WEBP_CSP_UV_MASK);
+ const int has_alpha = (int)picture->colorspace & WEBP_CSP_ALPHA_BIT;
+ const int y_stride = width;
+ const int uv_width = (int)(((int64_t)width + 1) >> 1);
+ const int uv_height = (int)(((int64_t)height + 1) >> 1);
+ const int uv_stride = uv_width;
+ int a_width, a_stride;
+ uint64_t y_size, uv_size, a_size, total_size;
+ uint8_t* mem;
+
+ assert(picture != NULL);
+
+ WebPSafeFree(picture->memory_);
+ WebPPictureResetBufferYUVA(picture);
+
+ if (uv_csp != WEBP_YUV420) {
+ return WebPEncodingSetError(picture, VP8_ENC_ERROR_INVALID_CONFIGURATION);
+ }
+
+ // alpha
+ a_width = has_alpha ? width : 0;
+ a_stride = a_width;
+ y_size = (uint64_t)y_stride * height;
+ uv_size = (uint64_t)uv_stride * uv_height;
+ a_size = (uint64_t)a_stride * height;
+
+ total_size = y_size + a_size + 2 * uv_size;
+
+ // Security and validation checks
+ if (width <= 0 || height <= 0 || // luma/alpha param error
+ uv_width <= 0 || uv_height <= 0) { // u/v param error
+ return WebPEncodingSetError(picture, VP8_ENC_ERROR_BAD_DIMENSION);
+ }
+ // allocate a new buffer.
+ mem = (uint8_t*)WebPSafeMalloc(total_size, sizeof(*mem));
+ if (mem == NULL) {
+ return WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
+ }
+
+ // From now on, we're in the clear; nothing below can fail.
+ picture->memory_ = (void*)mem;
+ picture->y_stride = y_stride;
+ picture->uv_stride = uv_stride;
+ picture->a_stride = a_stride;
+
+ // TODO(skal): we could align the y/u/v planes and adjust stride.
+ picture->y = mem;
+ mem += y_size;
+
+ picture->u = mem;
+ mem += uv_size;
+ picture->v = mem;
+ mem += uv_size;
+
+ if (a_size > 0) {
+ picture->a = mem;
+ mem += a_size;
+ }
+ (void)mem; // makes the static analyzer happy
+ return 1;
+}
+
+int WebPPictureAlloc(WebPPicture* picture) {
+ if (picture != NULL) {
+ const int width = picture->width;
+ const int height = picture->height;
+
+ WebPPictureFree(picture); // erase previous buffer
+
+ if (!picture->use_argb) {
+ return WebPPictureAllocYUVA(picture, width, height);
+ } else {
+ return WebPPictureAllocARGB(picture, width, height);
+ }
+ }
+ return 1;
+}
+
+void WebPPictureFree(WebPPicture* picture) {
+ if (picture != NULL) {
+ WebPSafeFree(picture->memory_);
+ WebPSafeFree(picture->memory_argb_);
+ WebPPictureResetBuffers(picture);
+ }
+}
+
+//------------------------------------------------------------------------------
+// WebPMemoryWriter: Write-to-memory
+
+void WebPMemoryWriterInit(WebPMemoryWriter* writer) {
+ writer->mem = NULL;
+ writer->size = 0;
+ writer->max_size = 0;
+}
+
+int WebPMemoryWrite(const uint8_t* data, size_t data_size,
+ const WebPPicture* picture) {
+ WebPMemoryWriter* const w = (WebPMemoryWriter*)picture->custom_ptr;
+ uint64_t next_size;
+ if (w == NULL) {
+ return 1;
+ }
+ next_size = (uint64_t)w->size + data_size;
+ if (next_size > w->max_size) {
+ uint8_t* new_mem;
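+ // Grow geometrically (doubling, with an 8 KiB floor) so that repeated
+ // small writes stay amortized O(1) in copying cost.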
+ uint64_t next_max_size = 2ULL * w->max_size;
+ if (next_max_size < next_size) next_max_size = next_size;
+ if (next_max_size < 8192ULL) next_max_size = 8192ULL;
+ new_mem = (uint8_t*)WebPSafeMalloc(next_max_size, 1);
+ if (new_mem == NULL) {
+ return 0;
+ }
+ if (w->size > 0) {
+ memcpy(new_mem, w->mem, w->size);
+ }
+ WebPSafeFree(w->mem);
+ w->mem = new_mem;
+ // down-cast is ok, thanks to the overflow check in WebPSafeMalloc
+ w->max_size = (size_t)next_max_size;
+ }
+ if (data_size > 0) {
+ memcpy(w->mem + w->size, data, data_size);
+ w->size += data_size;
+ }
+ return 1;
+}
+
+void WebPMemoryWriterClear(WebPMemoryWriter* writer) {
+ if (writer != NULL) {
+ WebPSafeFree(writer->mem);
+ writer->mem = NULL;
+ writer->size = 0;
+ writer->max_size = 0;
+ }
+}
+
+//------------------------------------------------------------------------------
+// Simplest high-level calls:
+
+typedef int (*Importer)(WebPPicture* const, const uint8_t* const, int);
+
+static size_t Encode(const uint8_t* rgba, int width, int height, int stride,
+ Importer import, float quality_factor, int lossless,
+ uint8_t** output) {
+ WebPPicture pic;
+ WebPConfig config;
+ WebPMemoryWriter wrt;
+ int ok;
+
+ if (output == NULL) return 0;
+
+ if (!WebPConfigPreset(&config, WEBP_PRESET_DEFAULT, quality_factor) ||
+ !WebPPictureInit(&pic)) {
+ return 0; // shouldn't happen, unless the system installation is broken
+ }
+
+ config.lossless = !!lossless;
+ pic.use_argb = !!lossless;
+ pic.width = width;
+ pic.height = height;
+ pic.writer = WebPMemoryWrite;
+ pic.custom_ptr = &wrt;
+ WebPMemoryWriterInit(&wrt);
+
+ ok = import(&pic, rgba, stride) && WebPEncode(&config, &pic);
+ WebPPictureFree(&pic);
+ if (!ok) {
+ WebPMemoryWriterClear(&wrt);
+ *output = NULL;
+ return 0;
+ }
+ *output = wrt.mem;
+ return wrt.size;
+}
+
+#define ENCODE_FUNC(NAME, IMPORTER) \
+size_t NAME(const uint8_t* in, int w, int h, int bps, float q, \
+ uint8_t** out) { \
+ return Encode(in, w, h, bps, IMPORTER, q, 0, out); \
+}
+
+ENCODE_FUNC(WebPEncodeRGB, WebPPictureImportRGB)
+ENCODE_FUNC(WebPEncodeRGBA, WebPPictureImportRGBA)
+#if !defined(WEBP_REDUCE_CSP)
+ENCODE_FUNC(WebPEncodeBGR, WebPPictureImportBGR)
+ENCODE_FUNC(WebPEncodeBGRA, WebPPictureImportBGRA)
+#endif // WEBP_REDUCE_CSP
+
+#undef ENCODE_FUNC
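+// Illustrative usage (a sketch, not part of this file): encode a tightly
+// packed RGBA buffer at quality 80, then release the output:
+//   uint8_t* out = NULL;
+//   const size_t size = WebPEncodeRGBA(rgba, width, height, width * 4,
+//                                      80.f, &out);
+//   if (size > 0) { /* consume out[0 .. size-1] */ }
+//   WebPFree(out);  // or the matching deallocator for this build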
+
+#define LOSSLESS_DEFAULT_QUALITY 70.
+#define LOSSLESS_ENCODE_FUNC(NAME, IMPORTER) \
+size_t NAME(const uint8_t* in, int w, int h, int bps, uint8_t** out) { \
+ return Encode(in, w, h, bps, IMPORTER, LOSSLESS_DEFAULT_QUALITY, 1, out); \
+}
+
+LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessRGB, WebPPictureImportRGB)
+LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessRGBA, WebPPictureImportRGBA)
+#if !defined(WEBP_REDUCE_CSP)
+LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessBGR, WebPPictureImportBGR)
+LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessBGRA, WebPPictureImportBGRA)
+#endif // WEBP_REDUCE_CSP
+
+#undef LOSSLESS_ENCODE_FUNC
+
+//------------------------------------------------------------------------------
diff --git a/media/libwebp/enc/picture_psnr_enc.c b/media/libwebp/enc/picture_psnr_enc.c
new file mode 100644
index 0000000000..bbd32854c9
--- /dev/null
+++ b/media/libwebp/enc/picture_psnr_enc.c
@@ -0,0 +1,258 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// WebPPicture tools for measuring distortion
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "../webp/encode.h"
+
+#if !(defined(WEBP_DISABLE_STATS) || defined(WEBP_REDUCE_SIZE))
+
+#include <math.h>
+#include <stdlib.h>
+
+#include "../dsp/dsp.h"
+#include "../enc/vp8i_enc.h"
+#include "../utils/utils.h"
+
+typedef double (*AccumulateFunc)(const uint8_t* src, int src_stride,
+ const uint8_t* ref, int ref_stride,
+ int w, int h);
+
+//------------------------------------------------------------------------------
+// local-min distortion
+//
+// For every pixel in the *reference* picture, we search for the local best
+// match in the compressed image. This is not a symmetrical measure.
+
+#define RADIUS 2 // search radius. Shouldn't be too large.
+
+static double AccumulateLSIM(const uint8_t* src, int src_stride,
+ const uint8_t* ref, int ref_stride,
+ int w, int h) {
+ int x, y;
+ double total_sse = 0.;
+ for (y = 0; y < h; ++y) {
+ const int y_0 = (y - RADIUS < 0) ? 0 : y - RADIUS;
+ const int y_1 = (y + RADIUS + 1 >= h) ? h : y + RADIUS + 1;
+ for (x = 0; x < w; ++x) {
+ const int x_0 = (x - RADIUS < 0) ? 0 : x - RADIUS;
+ const int x_1 = (x + RADIUS + 1 >= w) ? w : x + RADIUS + 1;
+ double best_sse = 255. * 255.;
+ const double value = (double)ref[y * ref_stride + x];
+ int i, j;
+ for (j = y_0; j < y_1; ++j) {
+ const uint8_t* const s = src + j * src_stride;
+ for (i = x_0; i < x_1; ++i) {
+ const double diff = s[i] - value;
+ const double sse = diff * diff;
+ if (sse < best_sse) best_sse = sse;
+ }
+ }
+ total_sse += best_sse;
+ }
+ }
+ return total_sse;
+}
+#undef RADIUS
+
+static double AccumulateSSE(const uint8_t* src, int src_stride,
+ const uint8_t* ref, int ref_stride,
+ int w, int h) {
+ int y;
+ double total_sse = 0.;
+ for (y = 0; y < h; ++y) {
+ total_sse += VP8AccumulateSSE(src, ref, w);
+ src += src_stride;
+ ref += ref_stride;
+ }
+ return total_sse;
+}
+
+//------------------------------------------------------------------------------
+
+static double AccumulateSSIM(const uint8_t* src, int src_stride,
+ const uint8_t* ref, int ref_stride,
+ int w, int h) {
+ const int w0 = (w < VP8_SSIM_KERNEL) ? w : VP8_SSIM_KERNEL;
+ const int w1 = w - VP8_SSIM_KERNEL - 1;
+ const int h0 = (h < VP8_SSIM_KERNEL) ? h : VP8_SSIM_KERNEL;
+ const int h1 = h - VP8_SSIM_KERNEL - 1;
+ int x, y;
+ double sum = 0.;
+ for (y = 0; y < h0; ++y) {
+ for (x = 0; x < w; ++x) {
+ sum += VP8SSIMGetClipped(src, src_stride, ref, ref_stride, x, y, w, h);
+ }
+ }
+ for (; y < h1; ++y) {
+ for (x = 0; x < w0; ++x) {
+ sum += VP8SSIMGetClipped(src, src_stride, ref, ref_stride, x, y, w, h);
+ }
+ for (; x < w1; ++x) {
+ const int off1 = x - VP8_SSIM_KERNEL + (y - VP8_SSIM_KERNEL) * src_stride;
+ const int off2 = x - VP8_SSIM_KERNEL + (y - VP8_SSIM_KERNEL) * ref_stride;
+ sum += VP8SSIMGet(src + off1, src_stride, ref + off2, ref_stride);
+ }
+ for (; x < w; ++x) {
+ sum += VP8SSIMGetClipped(src, src_stride, ref, ref_stride, x, y, w, h);
+ }
+ }
+ for (; y < h; ++y) {
+ for (x = 0; x < w; ++x) {
+ sum += VP8SSIMGetClipped(src, src_stride, ref, ref_stride, x, y, w, h);
+ }
+ }
+ return sum;
+}
+
+//------------------------------------------------------------------------------
+// Distortion
+
+// Max value returned in case of exact similarity.
+static const double kMinDistortion_dB = 99.;
+
+static double GetPSNR(double v, double size) {
+ return (v > 0. && size > 0.) ? -4.3429448 * log(v / (size * 255 * 255.))
+ : kMinDistortion_dB;
+}
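+// Note: -4.3429448 is -10 / ln(10), so GetPSNR(v, size) equals
+// 10 * log10(255 * 255 * size / v), i.e. the usual PSNR in dB with mean
+// squared error v / size.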
+
+static double GetLogSSIM(double v, double size) {
+ v = (size > 0.) ? v / size : 1.;
+ return (v < 1.) ? -10.0 * log10(1. - v) : kMinDistortion_dB;
+}
+
+int WebPPlaneDistortion(const uint8_t* src, size_t src_stride,
+ const uint8_t* ref, size_t ref_stride,
+ int width, int height, size_t x_step,
+ int type, float* distortion, float* result) {
+ uint8_t* allocated = NULL;
+ const AccumulateFunc metric = (type == 0) ? AccumulateSSE :
+ (type == 1) ? AccumulateSSIM :
+ AccumulateLSIM;
+ if (src == NULL || ref == NULL ||
+ src_stride < x_step * width || ref_stride < x_step * width ||
+ result == NULL || distortion == NULL) {
+ return 0;
+ }
+
+ VP8SSIMDspInit();
+ if (x_step != 1) { // extract a packed plane if needed
+ int x, y;
+ uint8_t* tmp1;
+ uint8_t* tmp2;
+ allocated =
+ (uint8_t*)WebPSafeMalloc(2ULL * width * height, sizeof(*allocated));
+ if (allocated == NULL) return 0;
+ tmp1 = allocated;
+ tmp2 = tmp1 + (size_t)width * height;
+ for (y = 0; y < height; ++y) {
+ for (x = 0; x < width; ++x) {
+ tmp1[x + y * width] = src[x * x_step + y * src_stride];
+ tmp2[x + y * width] = ref[x * x_step + y * ref_stride];
+ }
+ }
+ src = tmp1;
+ ref = tmp2;
+ }
+ *distortion = (float)metric(src, width, ref, width, width, height);
+ WebPSafeFree(allocated);
+
+ *result = (type == 1) ? (float)GetLogSSIM(*distortion, (double)width * height)
+ : (float)GetPSNR(*distortion, (double)width * height);
+ return 1;
+}
+
+#ifdef WORDS_BIGENDIAN
+#define BLUE_OFFSET 3 // uint32_t 0x000000ff is 0x00,00,00,ff in memory
+#else
+#define BLUE_OFFSET 0 // uint32_t 0x000000ff is 0xff,00,00,00 in memory
+#endif
+
+int WebPPictureDistortion(const WebPPicture* src, const WebPPicture* ref,
+ int type, float results[5]) {
+ int w, h, c;
+ int ok = 0;
+ WebPPicture p0, p1;
+ double total_size = 0., total_distortion = 0.;
+ if (src == NULL || ref == NULL ||
+ src->width != ref->width || src->height != ref->height ||
+ results == NULL) {
+ return 0;
+ }
+
+ VP8SSIMDspInit();
+ if (!WebPPictureInit(&p0) || !WebPPictureInit(&p1)) return 0;
+ w = src->width;
+ h = src->height;
+ if (!WebPPictureView(src, 0, 0, w, h, &p0)) goto Error;
+ if (!WebPPictureView(ref, 0, 0, w, h, &p1)) goto Error;
+
+ // We always measure distortion in ARGB space.
+ if (p0.use_argb == 0 && !WebPPictureYUVAToARGB(&p0)) goto Error;
+ if (p1.use_argb == 0 && !WebPPictureYUVAToARGB(&p1)) goto Error;
+ for (c = 0; c < 4; ++c) {
+ float distortion;
+ const size_t stride0 = 4 * (size_t)p0.argb_stride;
+ const size_t stride1 = 4 * (size_t)p1.argb_stride;
+ // results are reported as BGRA
+ const int offset = c ^ BLUE_OFFSET;
+ if (!WebPPlaneDistortion((const uint8_t*)p0.argb + offset, stride0,
+ (const uint8_t*)p1.argb + offset, stride1,
+ w, h, 4, type, &distortion, results + c)) {
+ goto Error;
+ }
+ total_distortion += distortion;
+ total_size += w * h;
+ }
+
+ results[4] = (type == 1) ? (float)GetLogSSIM(total_distortion, total_size)
+ : (float)GetPSNR(total_distortion, total_size);
+ ok = 1;
+
+ Error:
+ WebPPictureFree(&p0);
+ WebPPictureFree(&p1);
+ return ok;
+}
+
+#undef BLUE_OFFSET
+
+#else // defined(WEBP_DISABLE_STATS) || defined(WEBP_REDUCE_SIZE)
+int WebPPlaneDistortion(const uint8_t* src, size_t src_stride,
+ const uint8_t* ref, size_t ref_stride,
+ int width, int height, size_t x_step,
+ int type, float* distortion, float* result) {
+ (void)src;
+ (void)src_stride;
+ (void)ref;
+ (void)ref_stride;
+ (void)width;
+ (void)height;
+ (void)x_step;
+ (void)type;
+ if (distortion == NULL || result == NULL) return 0;
+ *distortion = 0.f;
+ *result = 0.f;
+ return 1;
+}
+
+int WebPPictureDistortion(const WebPPicture* src, const WebPPicture* ref,
+ int type, float results[5]) {
+ int i;
+ (void)src;
+ (void)ref;
+ (void)type;
+ if (results == NULL) return 0;
+ for (i = 0; i < 5; ++i) results[i] = 0.f;
+ return 1;
+}
+
+#endif // !(defined(WEBP_DISABLE_STATS) || defined(WEBP_REDUCE_SIZE))
diff --git a/media/libwebp/enc/picture_rescale_enc.c b/media/libwebp/enc/picture_rescale_enc.c
new file mode 100644
index 0000000000..22d31363f0
--- /dev/null
+++ b/media/libwebp/enc/picture_rescale_enc.c
@@ -0,0 +1,316 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// WebPPicture tools: copy, crop, rescaling and view.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "../webp/encode.h"
+
+#if !defined(WEBP_REDUCE_SIZE)
+
+#include <assert.h>
+#include <stdlib.h>
+
+#include "../enc/vp8i_enc.h"
+#include "../utils/rescaler_utils.h"
+#include "../utils/utils.h"
+
+#define HALVE(x) (((x) + 1) >> 1)
+
+// Grab the 'specs' (writer, *opaque, width, height...) from 'src' and copy them
+// into 'dst'. Mark 'dst' as not owning any memory.
+static void PictureGrabSpecs(const WebPPicture* const src,
+ WebPPicture* const dst) {
+ assert(src != NULL && dst != NULL);
+ *dst = *src;
+ WebPPictureResetBuffers(dst);
+}
+
+//------------------------------------------------------------------------------
+
+// Adjust top-left corner to chroma sample position.
+static void SnapTopLeftPosition(const WebPPicture* const pic,
+ int* const left, int* const top) {
+ if (!pic->use_argb) {
+ *left &= ~1;
+ *top &= ~1;
+ }
+}
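+// E.g. in YUV420 mode a requested corner (3, 5) snaps to (2, 4), so that the
+// crop or view starts on a chroma sample.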
+
+// Adjust top-left corner and verify that the sub-rectangle is valid.
+static int AdjustAndCheckRectangle(const WebPPicture* const pic,
+ int* const left, int* const top,
+ int width, int height) {
+ SnapTopLeftPosition(pic, left, top);
+ if ((*left) < 0 || (*top) < 0) return 0;
+ if (width <= 0 || height <= 0) return 0;
+ if ((*left) + width > pic->width) return 0;
+ if ((*top) + height > pic->height) return 0;
+ return 1;
+}
+
+int WebPPictureCopy(const WebPPicture* src, WebPPicture* dst) {
+ if (src == NULL || dst == NULL) return 0;
+ if (src == dst) return 1;
+
+ PictureGrabSpecs(src, dst);
+ if (!WebPPictureAlloc(dst)) return 0;
+
+ if (!src->use_argb) {
+ WebPCopyPlane(src->y, src->y_stride,
+ dst->y, dst->y_stride, dst->width, dst->height);
+ WebPCopyPlane(src->u, src->uv_stride, dst->u, dst->uv_stride,
+ HALVE(dst->width), HALVE(dst->height));
+ WebPCopyPlane(src->v, src->uv_stride, dst->v, dst->uv_stride,
+ HALVE(dst->width), HALVE(dst->height));
+ if (dst->a != NULL) {
+ WebPCopyPlane(src->a, src->a_stride,
+ dst->a, dst->a_stride, dst->width, dst->height);
+ }
+ } else {
+ WebPCopyPlane((const uint8_t*)src->argb, 4 * src->argb_stride,
+ (uint8_t*)dst->argb, 4 * dst->argb_stride,
+ 4 * dst->width, dst->height);
+ }
+ return 1;
+}
+
+int WebPPictureIsView(const WebPPicture* picture) {
+ if (picture == NULL) return 0;
+ if (picture->use_argb) {
+ return (picture->memory_argb_ == NULL);
+ }
+ return (picture->memory_ == NULL);
+}
+
+int WebPPictureView(const WebPPicture* src,
+ int left, int top, int width, int height,
+ WebPPicture* dst) {
+ if (src == NULL || dst == NULL) return 0;
+
+ // verify rectangle position.
+ if (!AdjustAndCheckRectangle(src, &left, &top, width, height)) return 0;
+
+ if (src != dst) { // beware of aliasing! We don't want to leak 'memory_'.
+ PictureGrabSpecs(src, dst);
+ }
+ dst->width = width;
+ dst->height = height;
+ if (!src->use_argb) {
+ dst->y = src->y + top * src->y_stride + left;
+ dst->u = src->u + (top >> 1) * src->uv_stride + (left >> 1);
+ dst->v = src->v + (top >> 1) * src->uv_stride + (left >> 1);
+ dst->y_stride = src->y_stride;
+ dst->uv_stride = src->uv_stride;
+ if (src->a != NULL) {
+ dst->a = src->a + top * src->a_stride + left;
+ dst->a_stride = src->a_stride;
+ }
+ } else {
+ dst->argb = src->argb + top * src->argb_stride + left;
+ dst->argb_stride = src->argb_stride;
+ }
+ return 1;
+}
+
+//------------------------------------------------------------------------------
+// Picture cropping
+
+int WebPPictureCrop(WebPPicture* pic,
+ int left, int top, int width, int height) {
+ WebPPicture tmp;
+
+ if (pic == NULL) return 0;
+ if (!AdjustAndCheckRectangle(pic, &left, &top, width, height)) return 0;
+
+ PictureGrabSpecs(pic, &tmp);
+ tmp.width = width;
+ tmp.height = height;
+ if (!WebPPictureAlloc(&tmp)) return 0;
+
+ if (!pic->use_argb) {
+ const int y_offset = top * pic->y_stride + left;
+ const int uv_offset = (top / 2) * pic->uv_stride + left / 2;
+ WebPCopyPlane(pic->y + y_offset, pic->y_stride,
+ tmp.y, tmp.y_stride, width, height);
+ WebPCopyPlane(pic->u + uv_offset, pic->uv_stride,
+ tmp.u, tmp.uv_stride, HALVE(width), HALVE(height));
+ WebPCopyPlane(pic->v + uv_offset, pic->uv_stride,
+ tmp.v, tmp.uv_stride, HALVE(width), HALVE(height));
+
+ if (tmp.a != NULL) {
+ const int a_offset = top * pic->a_stride + left;
+ WebPCopyPlane(pic->a + a_offset, pic->a_stride,
+ tmp.a, tmp.a_stride, width, height);
+ }
+ } else {
+ const uint8_t* const src =
+ (const uint8_t*)(pic->argb + top * pic->argb_stride + left);
+ WebPCopyPlane(src, pic->argb_stride * 4, (uint8_t*)tmp.argb,
+ tmp.argb_stride * 4, width * 4, height);
+ }
+ WebPPictureFree(pic);
+ *pic = tmp;
+ return 1;
+}
+
+//------------------------------------------------------------------------------
+// Simple picture rescaler
+
+static int RescalePlane(const uint8_t* src,
+ int src_width, int src_height, int src_stride,
+ uint8_t* dst,
+ int dst_width, int dst_height, int dst_stride,
+ rescaler_t* const work,
+ int num_channels) {
+ WebPRescaler rescaler;
+ int y = 0;
+ if (!WebPRescalerInit(&rescaler, src_width, src_height,
+ dst, dst_width, dst_height, dst_stride,
+ num_channels, work)) {
+ return 0;
+ }
+ while (y < src_height) {
+ y += WebPRescalerImport(&rescaler, src_height - y,
+ src + y * src_stride, src_stride);
+ WebPRescalerExport(&rescaler);
+ }
+ return 1;
+}
+
+static void AlphaMultiplyARGB(WebPPicture* const pic, int inverse) {
+ assert(pic->argb != NULL);
+ WebPMultARGBRows((uint8_t*)pic->argb, pic->argb_stride * sizeof(*pic->argb),
+ pic->width, pic->height, inverse);
+}
+
+static void AlphaMultiplyY(WebPPicture* const pic, int inverse) {
+ if (pic->a != NULL) {
+ WebPMultRows(pic->y, pic->y_stride, pic->a, pic->a_stride,
+ pic->width, pic->height, inverse);
+ }
+}
+
+int WebPPictureRescale(WebPPicture* pic, int width, int height) {
+ WebPPicture tmp;
+ int prev_width, prev_height;
+ rescaler_t* work;
+
+ if (pic == NULL) return 0;
+ prev_width = pic->width;
+ prev_height = pic->height;
+ if (!WebPRescalerGetScaledDimensions(
+ prev_width, prev_height, &width, &height)) {
+ return 0;
+ }
+
+ PictureGrabSpecs(pic, &tmp);
+ tmp.width = width;
+ tmp.height = height;
+ if (!WebPPictureAlloc(&tmp)) return 0;
+
+ if (!pic->use_argb) {
+ work = (rescaler_t*)WebPSafeMalloc(2ULL * width, sizeof(*work));
+ if (work == NULL) {
+ WebPPictureFree(&tmp);
+ return 0;
+ }
+ // If present, we need to rescale alpha first (for AlphaMultiplyY).
+ if (pic->a != NULL) {
+ WebPInitAlphaProcessing();
+ if (!RescalePlane(pic->a, prev_width, prev_height, pic->a_stride,
+ tmp.a, width, height, tmp.a_stride, work, 1)) {
+ return 0;
+ }
+ }
+
+ // We take transparency into account on the luma plane only. That's not
+ // totally exact blending, but it is still a good approximation.
+ AlphaMultiplyY(pic, 0);
+ if (!RescalePlane(pic->y, prev_width, prev_height, pic->y_stride,
+ tmp.y, width, height, tmp.y_stride, work, 1) ||
+ !RescalePlane(pic->u,
+ HALVE(prev_width), HALVE(prev_height), pic->uv_stride,
+ tmp.u,
+ HALVE(width), HALVE(height), tmp.uv_stride, work, 1) ||
+ !RescalePlane(pic->v,
+ HALVE(prev_width), HALVE(prev_height), pic->uv_stride,
+ tmp.v,
+ HALVE(width), HALVE(height), tmp.uv_stride, work, 1)) {
+ return 0;
+ }
+ AlphaMultiplyY(&tmp, 1);
+ } else {
+ work = (rescaler_t*)WebPSafeMalloc(2ULL * width * 4, sizeof(*work));
+ if (work == NULL) {
+ WebPPictureFree(&tmp);
+ return 0;
+ }
+ // In order to correctly interpolate colors, we need to apply the alpha
+ // weighting first (black-matting), scale the RGB values, and remove
+ // the premultiplication afterward (while preserving the alpha channel).
+ WebPInitAlphaProcessing();
+ AlphaMultiplyARGB(pic, 0);
+ if (!RescalePlane((const uint8_t*)pic->argb, prev_width, prev_height,
+ pic->argb_stride * 4,
+ (uint8_t*)tmp.argb, width, height,
+ tmp.argb_stride * 4, work, 4)) {
+ return 0;
+ }
+ AlphaMultiplyARGB(&tmp, 1);
+ }
+ WebPPictureFree(pic);
+ WebPSafeFree(work);
+ *pic = tmp;
+ return 1;
+}
+
+#else // defined(WEBP_REDUCE_SIZE)
+
+int WebPPictureCopy(const WebPPicture* src, WebPPicture* dst) {
+ (void)src;
+ (void)dst;
+ return 0;
+}
+
+int WebPPictureIsView(const WebPPicture* picture) {
+ (void)picture;
+ return 0;
+}
+
+int WebPPictureView(const WebPPicture* src,
+ int left, int top, int width, int height,
+ WebPPicture* dst) {
+ (void)src;
+ (void)left;
+ (void)top;
+ (void)width;
+ (void)height;
+ (void)dst;
+ return 0;
+}
+
+int WebPPictureCrop(WebPPicture* pic,
+ int left, int top, int width, int height) {
+ (void)pic;
+ (void)left;
+ (void)top;
+ (void)width;
+ (void)height;
+ return 0;
+}
+
+int WebPPictureRescale(WebPPicture* pic, int width, int height) {
+ (void)pic;
+ (void)width;
+ (void)height;
+ return 0;
+}
+#endif // !defined(WEBP_REDUCE_SIZE)
diff --git a/media/libwebp/enc/picture_tools_enc.c b/media/libwebp/enc/picture_tools_enc.c
new file mode 100644
index 0000000000..02d48c5223
--- /dev/null
+++ b/media/libwebp/enc/picture_tools_enc.c
@@ -0,0 +1,273 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// WebPPicture tools: alpha handling, etc.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <assert.h>
+
+#include "../enc/vp8i_enc.h"
+#include "../dsp/yuv.h"
+
+//------------------------------------------------------------------------------
+// Helper: clean up fully transparent area to help compressibility.
+
+#define SIZE 8
+#define SIZE2 (SIZE / 2)
+static int IsTransparentARGBArea(const uint32_t* ptr, int stride, int size) {
+ int y, x;
+ for (y = 0; y < size; ++y) {
+ for (x = 0; x < size; ++x) {
+ if (ptr[x] & 0xff000000u) {
+ return 0;
+ }
+ }
+ ptr += stride;
+ }
+ return 1;
+}
+
+static void Flatten(uint8_t* ptr, int v, int stride, int size) {
+ int y;
+ for (y = 0; y < size; ++y) {
+ memset(ptr, v, size);
+ ptr += stride;
+ }
+}
+
+static void FlattenARGB(uint32_t* ptr, uint32_t v, int stride, int size) {
+ int x, y;
+ for (y = 0; y < size; ++y) {
+ for (x = 0; x < size; ++x) ptr[x] = v;
+ ptr += stride;
+ }
+}
+
+// Smoothen the luma components of transparent pixels. Return true if the whole
+// block is transparent.
+static int SmoothenBlock(const uint8_t* a_ptr, int a_stride, uint8_t* y_ptr,
+ int y_stride, int width, int height) {
+ int sum = 0, count = 0;
+ int x, y;
+ const uint8_t* alpha_ptr = a_ptr;
+ uint8_t* luma_ptr = y_ptr;
+ for (y = 0; y < height; ++y) {
+ for (x = 0; x < width; ++x) {
+ if (alpha_ptr[x] != 0) {
+ ++count;
+ sum += luma_ptr[x];
+ }
+ }
+ alpha_ptr += a_stride;
+ luma_ptr += y_stride;
+ }
+ if (count > 0 && count < width * height) {
+ const uint8_t avg_u8 = (uint8_t)(sum / count);
+ alpha_ptr = a_ptr;
+ luma_ptr = y_ptr;
+ for (y = 0; y < height; ++y) {
+ for (x = 0; x < width; ++x) {
+ if (alpha_ptr[x] == 0) luma_ptr[x] = avg_u8;
+ }
+ alpha_ptr += a_stride;
+ luma_ptr += y_stride;
+ }
+ }
+ return (count == 0);
+}
+
+void WebPReplaceTransparentPixels(WebPPicture* const pic, uint32_t color) {
+ if (pic != NULL && pic->use_argb) {
+ int y = pic->height;
+ uint32_t* argb = pic->argb;
+ color &= 0xffffffu; // force alpha=0
+ WebPInitAlphaProcessing();
+ while (y-- > 0) {
+ WebPAlphaReplace(argb, pic->width, color);
+ argb += pic->argb_stride;
+ }
+ }
+}
+
+void WebPCleanupTransparentArea(WebPPicture* pic) {
+ int x, y, w, h;
+ if (pic == NULL) return;
+ w = pic->width / SIZE;
+ h = pic->height / SIZE;
+
+ // Note: we ignore the left-overs on the right/bottom, except for SmoothenBlock().
+ if (pic->use_argb) {
+ uint32_t argb_value = 0;
+ for (y = 0; y < h; ++y) {
+ int need_reset = 1;
+ for (x = 0; x < w; ++x) {
+ const int off = (y * pic->argb_stride + x) * SIZE;
+ if (IsTransparentARGBArea(pic->argb + off, pic->argb_stride, SIZE)) {
+ if (need_reset) {
+ argb_value = pic->argb[off];
+ need_reset = 0;
+ }
+ FlattenARGB(pic->argb + off, argb_value, pic->argb_stride, SIZE);
+ } else {
+ need_reset = 1;
+ }
+ }
+ }
+ } else {
+ const int width = pic->width;
+ const int height = pic->height;
+ const int y_stride = pic->y_stride;
+ const int uv_stride = pic->uv_stride;
+ const int a_stride = pic->a_stride;
+ uint8_t* y_ptr = pic->y;
+ uint8_t* u_ptr = pic->u;
+ uint8_t* v_ptr = pic->v;
+ const uint8_t* a_ptr = pic->a;
+ int values[3] = { 0 };
+ if (a_ptr == NULL || y_ptr == NULL || u_ptr == NULL || v_ptr == NULL) {
+ return;
+ }
+ for (y = 0; y + SIZE <= height; y += SIZE) {
+ int need_reset = 1;
+ for (x = 0; x + SIZE <= width; x += SIZE) {
+ if (SmoothenBlock(a_ptr + x, a_stride, y_ptr + x, y_stride,
+ SIZE, SIZE)) {
+ if (need_reset) {
+ values[0] = y_ptr[x];
+ values[1] = u_ptr[x >> 1];
+ values[2] = v_ptr[x >> 1];
+ need_reset = 0;
+ }
+ Flatten(y_ptr + x, values[0], y_stride, SIZE);
+ Flatten(u_ptr + (x >> 1), values[1], uv_stride, SIZE2);
+ Flatten(v_ptr + (x >> 1), values[2], uv_stride, SIZE2);
+ } else {
+ need_reset = 1;
+ }
+ }
+ if (x < width) {
+ SmoothenBlock(a_ptr + x, a_stride, y_ptr + x, y_stride,
+ width - x, SIZE);
+ }
+ a_ptr += SIZE * a_stride;
+ y_ptr += SIZE * y_stride;
+ u_ptr += SIZE2 * uv_stride;
+ v_ptr += SIZE2 * uv_stride;
+ }
+ if (y < height) {
+ const int sub_height = height - y;
+ for (x = 0; x + SIZE <= width; x += SIZE) {
+ SmoothenBlock(a_ptr + x, a_stride, y_ptr + x, y_stride,
+ SIZE, sub_height);
+ }
+ if (x < width) {
+ SmoothenBlock(a_ptr + x, a_stride, y_ptr + x, y_stride,
+ width - x, sub_height);
+ }
+ }
+ }
+}
+
+#undef SIZE
+#undef SIZE2
+
+//------------------------------------------------------------------------------
+// Blend color and remove transparency info
+
+#define BLEND(V0, V1, ALPHA) \
+ ((((V0) * (255 - (ALPHA)) + (V1) * (ALPHA)) * 0x101 + 256) >> 16)
+#define BLEND_10BIT(V0, V1, ALPHA) \
+ ((((V0) * (1020 - (ALPHA)) + (V1) * (ALPHA)) * 0x101 + 1024) >> 18)
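+// BLEND approximates (V0 * (255 - ALPHA) + V1 * ALPHA) / 255 without a
+// division: multiplying by 0x101 and shifting right by 16 divides by 255,
+// exactly so whenever the weighted sum is a multiple of 255 (in particular
+// for ALPHA == 0 or 255). E.g. BLEND(0, 255, 128) == 128.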
+
+static WEBP_INLINE uint32_t MakeARGB32(int r, int g, int b) {
+ return (0xff000000u | (r << 16) | (g << 8) | b);
+}
+
+void WebPBlendAlpha(WebPPicture* pic, uint32_t background_rgb) {
+ const int red = (background_rgb >> 16) & 0xff;
+ const int green = (background_rgb >> 8) & 0xff;
+ const int blue = (background_rgb >> 0) & 0xff;
+ int x, y;
+ if (pic == NULL) return;
+ if (!pic->use_argb) {
+ const int uv_width = (pic->width >> 1); // omit last pixel during u/v loop
+ const int Y0 = VP8RGBToY(red, green, blue, YUV_HALF);
+ // VP8RGBToU/V expects the u/v values summed over four pixels
+ const int U0 = VP8RGBToU(4 * red, 4 * green, 4 * blue, 4 * YUV_HALF);
+ const int V0 = VP8RGBToV(4 * red, 4 * green, 4 * blue, 4 * YUV_HALF);
+ const int has_alpha = pic->colorspace & WEBP_CSP_ALPHA_BIT;
+ uint8_t* y_ptr = pic->y;
+ uint8_t* u_ptr = pic->u;
+ uint8_t* v_ptr = pic->v;
+ uint8_t* a_ptr = pic->a;
+ if (!has_alpha || a_ptr == NULL) return; // nothing to do
+ for (y = 0; y < pic->height; ++y) {
+ // Luma blending
+ for (x = 0; x < pic->width; ++x) {
+ const uint8_t alpha = a_ptr[x];
+ if (alpha < 0xff) {
+ y_ptr[x] = BLEND(Y0, y_ptr[x], alpha);
+ }
+ }
+ // Chroma blending every even line
+ if ((y & 1) == 0) {
+ uint8_t* const a_ptr2 =
+ (y + 1 == pic->height) ? a_ptr : a_ptr + pic->a_stride;
+ for (x = 0; x < uv_width; ++x) {
+ // Average four alpha values into a single blending weight.
+ // TODO(skal): might lead to visible contouring. Can we do better?
+ const uint32_t alpha =
+ a_ptr[2 * x + 0] + a_ptr[2 * x + 1] +
+ a_ptr2[2 * x + 0] + a_ptr2[2 * x + 1];
+ u_ptr[x] = BLEND_10BIT(U0, u_ptr[x], alpha);
+ v_ptr[x] = BLEND_10BIT(V0, v_ptr[x], alpha);
+ }
+ if (pic->width & 1) { // rightmost pixel
+ const uint32_t alpha = 2 * (a_ptr[2 * x + 0] + a_ptr2[2 * x + 0]);
+ u_ptr[x] = BLEND_10BIT(U0, u_ptr[x], alpha);
+ v_ptr[x] = BLEND_10BIT(V0, v_ptr[x], alpha);
+ }
+ } else {
+ u_ptr += pic->uv_stride;
+ v_ptr += pic->uv_stride;
+ }
+ memset(a_ptr, 0xff, pic->width); // reset alpha value to opaque
+ a_ptr += pic->a_stride;
+ y_ptr += pic->y_stride;
+ }
+ } else {
+ uint32_t* argb = pic->argb;
+ const uint32_t background = MakeARGB32(red, green, blue);
+ for (y = 0; y < pic->height; ++y) {
+ for (x = 0; x < pic->width; ++x) {
+ const int alpha = (argb[x] >> 24) & 0xff;
+ if (alpha != 0xff) {
+ if (alpha > 0) {
+ int r = (argb[x] >> 16) & 0xff;
+ int g = (argb[x] >> 8) & 0xff;
+ int b = (argb[x] >> 0) & 0xff;
+ r = BLEND(red, r, alpha);
+ g = BLEND(green, g, alpha);
+ b = BLEND(blue, b, alpha);
+ argb[x] = MakeARGB32(r, g, b);
+ } else {
+ argb[x] = background;
+ }
+ }
+ }
+ argb += pic->argb_stride;
+ }
+ }
+}
+
+#undef BLEND
+#undef BLEND_10BIT
+
+//------------------------------------------------------------------------------
diff --git a/media/libwebp/enc/predictor_enc.c b/media/libwebp/enc/predictor_enc.c
new file mode 100644
index 0000000000..794c45cde6
--- /dev/null
+++ b/media/libwebp/enc/predictor_enc.c
@@ -0,0 +1,772 @@
+// Copyright 2016 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Image transform methods for lossless encoder.
+//
+// Authors: Vikas Arora (vikaas.arora@gmail.com)
+// Jyrki Alakuijala (jyrki@google.com)
+// Urvang Joshi (urvang@google.com)
+// Vincent Rabaud (vrabaud@google.com)
+
+#include "../dsp/lossless.h"
+#include "../dsp/lossless_common.h"
+#include "../enc/vp8li_enc.h"
+
+#define MAX_DIFF_COST (1e30f)
+
+static const float kSpatialPredictorBias = 15.f;
+static const int kPredLowEffort = 11;
+static const uint32_t kMaskAlpha = 0xff000000;
+
+// Used mostly to reduce code size and improve readability.
+static WEBP_INLINE int GetMin(int a, int b) { return (a > b) ? b : a; }
+
+//------------------------------------------------------------------------------
+// Methods to calculate Entropy (Shannon).
+
+static float PredictionCostSpatial(const int counts[256], int weight_0,
+ double exp_val) {
+ const int significant_symbols = 256 >> 4;
+ const double exp_decay_factor = 0.6;
+ double bits = weight_0 * counts[0];
+ int i;
+ for (i = 1; i < significant_symbols; ++i) {
+ bits += exp_val * (counts[i] + counts[256 - i]);
+ exp_val *= exp_decay_factor;
+ }
+ return (float)(-0.1 * bits);
+}
+
+static float PredictionCostSpatialHistogram(const int accumulated[4][256],
+ const int tile[4][256]) {
+ int i;
+ double retval = 0;
+ for (i = 0; i < 4; ++i) {
+ const double kExpValue = 0.94;
+ retval += PredictionCostSpatial(tile[i], 1, kExpValue);
+ retval += VP8LCombinedShannonEntropy(tile[i], accumulated[i]);
+ }
+ return (float)retval;
+}
+
+static WEBP_INLINE void UpdateHisto(int histo_argb[4][256], uint32_t argb) {
+ ++histo_argb[0][argb >> 24];
+ ++histo_argb[1][(argb >> 16) & 0xff];
+ ++histo_argb[2][(argb >> 8) & 0xff];
+ ++histo_argb[3][argb & 0xff];
+}
+
+//------------------------------------------------------------------------------
+// Spatial transform functions.
+
+static WEBP_INLINE void PredictBatch(int mode, int x_start, int y,
+ int num_pixels, const uint32_t* current,
+ const uint32_t* upper, uint32_t* out) {
+ if (x_start == 0) {
+ if (y == 0) {
+ // ARGB_BLACK.
+ VP8LPredictorsSub[0](current, NULL, 1, out);
+ } else {
+ // Top one.
+ VP8LPredictorsSub[2](current, upper, 1, out);
+ }
+ ++x_start;
+ ++out;
+ --num_pixels;
+ }
+ if (y == 0) {
+ // Left one.
+ VP8LPredictorsSub[1](current + x_start, NULL, num_pixels, out);
+ } else {
+ VP8LPredictorsSub[mode](current + x_start, upper + x_start, num_pixels,
+ out);
+ }
+}
+
+#if (WEBP_NEAR_LOSSLESS == 1)
+static WEBP_INLINE int GetMax(int a, int b) { return (a < b) ? b : a; }
+
+static int MaxDiffBetweenPixels(uint32_t p1, uint32_t p2) {
+ const int diff_a = abs((int)(p1 >> 24) - (int)(p2 >> 24));
+ const int diff_r = abs((int)((p1 >> 16) & 0xff) - (int)((p2 >> 16) & 0xff));
+ const int diff_g = abs((int)((p1 >> 8) & 0xff) - (int)((p2 >> 8) & 0xff));
+ const int diff_b = abs((int)(p1 & 0xff) - (int)(p2 & 0xff));
+ return GetMax(GetMax(diff_a, diff_r), GetMax(diff_g, diff_b));
+}
+
+static int MaxDiffAroundPixel(uint32_t current, uint32_t up, uint32_t down,
+ uint32_t left, uint32_t right) {
+ const int diff_up = MaxDiffBetweenPixels(current, up);
+ const int diff_down = MaxDiffBetweenPixels(current, down);
+ const int diff_left = MaxDiffBetweenPixels(current, left);
+ const int diff_right = MaxDiffBetweenPixels(current, right);
+ return GetMax(GetMax(diff_up, diff_down), GetMax(diff_left, diff_right));
+}
+
+static uint32_t AddGreenToBlueAndRed(uint32_t argb) {
+ const uint32_t green = (argb >> 8) & 0xff;
+ uint32_t red_blue = argb & 0x00ff00ffu;
+ red_blue += (green << 16) | green;
+ red_blue &= 0x00ff00ffu;
+ return (argb & 0xff00ff00u) | red_blue;
+}
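+// E.g. for argb == 0x00010203 (a=0x00, r=0x01, g=0x02, b=0x03) this returns
+// 0x00030205: the green value 0x02 is added, modulo 256, to red and blue.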
+
+static void MaxDiffsForRow(int width, int stride, const uint32_t* const argb,
+ uint8_t* const max_diffs, int used_subtract_green) {
+ uint32_t current, up, down, left, right;
+ int x;
+ if (width <= 2) return;
+ current = argb[0];
+ right = argb[1];
+ if (used_subtract_green) {
+ current = AddGreenToBlueAndRed(current);
+ right = AddGreenToBlueAndRed(right);
+ }
+ // max_diffs[0] and max_diffs[width - 1] are never used.
+ for (x = 1; x < width - 1; ++x) {
+ up = argb[-stride + x];
+ down = argb[stride + x];
+ left = current;
+ current = right;
+ right = argb[x + 1];
+ if (used_subtract_green) {
+ up = AddGreenToBlueAndRed(up);
+ down = AddGreenToBlueAndRed(down);
+ right = AddGreenToBlueAndRed(right);
+ }
+ max_diffs[x] = MaxDiffAroundPixel(current, up, down, left, right);
+ }
+}
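+
+// max_diffs[x] thus holds, for each interior pixel of the row, the largest
+// per-channel deviation from its four neighbors; NearLossless() below uses
+// it to quantize more aggressively where this local activity can mask the
+// error, and not at all where max_diff <= 2.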
+
+// Quantize the difference between the actual component value and its prediction
+// to a multiple of quantization, working modulo 256, taking care not to cross
+// a boundary (inclusive upper limit).
+static uint8_t NearLosslessComponent(uint8_t value, uint8_t predict,
+ uint8_t boundary, int quantization) {
+ const int residual = (value - predict) & 0xff;
+ const int boundary_residual = (boundary - predict) & 0xff;
+ const int lower = residual & ~(quantization - 1);
+ const int upper = lower + quantization;
+ // Resolve ties towards a value closer to the prediction (i.e. towards lower
+ // if value comes after prediction and towards upper otherwise).
+ const int bias = ((boundary - value) & 0xff) < boundary_residual;
+ if (residual - lower < upper - residual + bias) {
+ // lower is closer to residual than upper.
+ if (residual > boundary_residual && lower <= boundary_residual) {
+ // Halve quantization step to avoid crossing boundary. This midpoint is
+ // on the same side of boundary as residual because midpoint >= residual
+ // (since lower is closer than upper) and residual is above the boundary.
+ return lower + (quantization >> 1);
+ }
+ return lower;
+ } else {
+ // upper is closer to residual than lower.
+ if (residual <= boundary_residual && upper > boundary_residual) {
+ // Halve quantization step to avoid crossing boundary. This midpoint is
+ // on the same side of boundary as residual because midpoint <= residual
+ // (since upper is closer than lower) and residual is below the boundary.
+ return lower + (quantization >> 1);
+ }
+ return upper & 0xff;
+ }
+}
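+
+// Worked example for the rounding above: value == 0x05, predict == 0x02,
+// boundary == 0xff, quantization == 4 gives residual == 3, lower == 0,
+// upper == 4; upper is closer (distance 1 vs 3), no boundary is crossed, so
+// 4 is returned and the component decodes to predict + 4 == 0x06.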
+
+static WEBP_INLINE uint8_t NearLosslessDiff(uint8_t a, uint8_t b) {
+ return (uint8_t)((((int)(a) - (int)(b))) & 0xff);
+}
+
+// Quantize every component of the difference between the actual pixel value
+// and its prediction to a multiple of a quantization step (a power of 2, no
+// larger than max_quantization, itself a power of 2, and halved until it is
+// smaller than max_diff). Take care if value and predict have undergone
+// subtract green, which means that red and blue are represented as offsets
+// from green.
+static uint32_t NearLossless(uint32_t value, uint32_t predict,
+ int max_quantization, int max_diff,
+ int used_subtract_green) {
+ int quantization;
+ uint8_t new_green = 0;
+ uint8_t green_diff = 0;
+ uint8_t a, r, g, b;
+ if (max_diff <= 2) {
+ return VP8LSubPixels(value, predict);
+ }
+ quantization = max_quantization;
+ while (quantization >= max_diff) {
+ quantization >>= 1;
+ }
+ if ((value >> 24) == 0 || (value >> 24) == 0xff) {
+ // Preserve transparency of fully transparent or fully opaque pixels.
+ a = NearLosslessDiff((value >> 24) & 0xff, (predict >> 24) & 0xff);
+ } else {
+ a = NearLosslessComponent(value >> 24, predict >> 24, 0xff, quantization);
+ }
+ g = NearLosslessComponent((value >> 8) & 0xff, (predict >> 8) & 0xff, 0xff,
+ quantization);
+ if (used_subtract_green) {
+ // The green offset will be added to red and blue components during decoding
+ // to obtain the actual red and blue values.
+ new_green = ((predict >> 8) + g) & 0xff;
+ // The amount by which green has been adjusted during quantization. It is
+ // subtracted from red and blue for compensation, to avoid accumulating two
+ // quantization errors in them.
+ green_diff = NearLosslessDiff(new_green, (value >> 8) & 0xff);
+ }
+ r = NearLosslessComponent(NearLosslessDiff((value >> 16) & 0xff, green_diff),
+ (predict >> 16) & 0xff, 0xff - new_green,
+ quantization);
+ b = NearLosslessComponent(NearLosslessDiff(value & 0xff, green_diff),
+ predict & 0xff, 0xff - new_green, quantization);
+ return ((uint32_t)a << 24) | ((uint32_t)r << 16) | ((uint32_t)g << 8) | b;
+}
+#endif // (WEBP_NEAR_LOSSLESS == 1)
+
+// Stores the difference between the pixel and its prediction in "out".
+// In case of a lossy encoding, updates the source image to avoid propagating
+// the deviation further to pixels which depend on the current pixel for their
+// predictions.
+static WEBP_INLINE void GetResidual(
+ int width, int height, uint32_t* const upper_row,
+ uint32_t* const current_row, const uint8_t* const max_diffs, int mode,
+ int x_start, int x_end, int y, int max_quantization, int exact,
+ int used_subtract_green, uint32_t* const out) {
+ if (exact) {
+ PredictBatch(mode, x_start, y, x_end - x_start, current_row, upper_row,
+ out);
+ } else {
+ const VP8LPredictorFunc pred_func = VP8LPredictors[mode];
+ int x;
+ for (x = x_start; x < x_end; ++x) {
+ uint32_t predict;
+ uint32_t residual;
+ if (y == 0) {
+ predict = (x == 0) ? ARGB_BLACK : current_row[x - 1]; // Left.
+ } else if (x == 0) {
+ predict = upper_row[x]; // Top.
+ } else {
+ predict = pred_func(&current_row[x - 1], upper_row + x);
+ }
+#if (WEBP_NEAR_LOSSLESS == 1)
+ if (max_quantization == 1 || mode == 0 || y == 0 || y == height - 1 ||
+ x == 0 || x == width - 1) {
+ residual = VP8LSubPixels(current_row[x], predict);
+ } else {
+ residual = NearLossless(current_row[x], predict, max_quantization,
+ max_diffs[x], used_subtract_green);
+ // Update the source image.
+ current_row[x] = VP8LAddPixels(predict, residual);
+ // x is never 0 here so we do not need to update upper_row like below.
+ }
+#else
+ (void)max_diffs;
+ (void)height;
+ (void)max_quantization;
+ (void)used_subtract_green;
+ residual = VP8LSubPixels(current_row[x], predict);
+#endif
+ if ((current_row[x] & kMaskAlpha) == 0) {
+        // If alpha is 0, clean up RGB. We can choose the RGB values of the
+        // residual for best compression. The prediction of alpha itself can
+        // be non-zero and must be kept though. We choose the RGB of the
+        // residual to be 0.
+ residual &= kMaskAlpha;
+ // Update the source image.
+ current_row[x] = predict & ~kMaskAlpha;
+        // The prediction for the rightmost pixel in a row uses the leftmost
+        // pixel in that row as its top-right context pixel. Hence if we
+        // change the leftmost pixel of current_row, the corresponding change
+        // must be applied to upper_row as well, where the top-right context
+        // is read from.
+ if (x == 0 && y != 0) upper_row[width] = current_row[0];
+ }
+ out[x - x_start] = residual;
+ }
+ }
+}
+
+// Returns best predictor and updates the accumulated histogram.
+// If max_quantization > 1, assumes that near lossless processing will be
+// applied, quantizing residuals to multiples of quantization levels up to
+// max_quantization (the actual quantization level depends on smoothness near
+// the given pixel).
+static int GetBestPredictorForTile(int width, int height,
+ int tile_x, int tile_y, int bits,
+ int accumulated[4][256],
+ uint32_t* const argb_scratch,
+ const uint32_t* const argb,
+ int max_quantization,
+ int exact, int used_subtract_green,
+ const uint32_t* const modes) {
+ const int kNumPredModes = 14;
+ const int start_x = tile_x << bits;
+ const int start_y = tile_y << bits;
+ const int tile_size = 1 << bits;
+ const int max_y = GetMin(tile_size, height - start_y);
+ const int max_x = GetMin(tile_size, width - start_x);
+ // Whether there exist columns just outside the tile.
+ const int have_left = (start_x > 0);
+ // Position and size of the strip covering the tile and adjacent columns if
+ // they exist.
+ const int context_start_x = start_x - have_left;
+#if (WEBP_NEAR_LOSSLESS == 1)
+ const int context_width = max_x + have_left + (max_x < width - start_x);
+#endif
+ const int tiles_per_row = VP8LSubSampleSize(width, bits);
+ // Prediction modes of the left and above neighbor tiles.
+ const int left_mode = (tile_x > 0) ?
+ (modes[tile_y * tiles_per_row + tile_x - 1] >> 8) & 0xff : 0xff;
+ const int above_mode = (tile_y > 0) ?
+ (modes[(tile_y - 1) * tiles_per_row + tile_x] >> 8) & 0xff : 0xff;
+ // The width of upper_row and current_row is one pixel larger than image width
+ // to allow the top right pixel to point to the leftmost pixel of the next row
+ // when at the right edge.
+ uint32_t* upper_row = argb_scratch;
+ uint32_t* current_row = upper_row + width + 1;
+ uint8_t* const max_diffs = (uint8_t*)(current_row + width + 1);
+ float best_diff = MAX_DIFF_COST;
+ int best_mode = 0;
+ int mode;
+ int histo_stack_1[4][256];
+ int histo_stack_2[4][256];
+ // Need pointers to be able to swap arrays.
+ int (*histo_argb)[256] = histo_stack_1;
+ int (*best_histo)[256] = histo_stack_2;
+ int i, j;
+ uint32_t residuals[1 << MAX_TRANSFORM_BITS];
+ assert(bits <= MAX_TRANSFORM_BITS);
+ assert(max_x <= (1 << MAX_TRANSFORM_BITS));
+
+ for (mode = 0; mode < kNumPredModes; ++mode) {
+ float cur_diff;
+ int relative_y;
+ memset(histo_argb, 0, sizeof(histo_stack_1));
+ if (start_y > 0) {
+ // Read the row above the tile which will become the first upper_row.
+ // Include a pixel to the left if it exists; include a pixel to the right
+ // in all cases (wrapping to the leftmost pixel of the next row if it does
+ // not exist).
+ memcpy(current_row + context_start_x,
+ argb + (start_y - 1) * width + context_start_x,
+ sizeof(*argb) * (max_x + have_left + 1));
+ }
+ for (relative_y = 0; relative_y < max_y; ++relative_y) {
+ const int y = start_y + relative_y;
+ int relative_x;
+ uint32_t* tmp = upper_row;
+ upper_row = current_row;
+ current_row = tmp;
+ // Read current_row. Include a pixel to the left if it exists; include a
+ // pixel to the right in all cases except at the bottom right corner of
+ // the image (wrapping to the leftmost pixel of the next row if it does
+ // not exist in the current row).
+ memcpy(current_row + context_start_x,
+ argb + y * width + context_start_x,
+ sizeof(*argb) * (max_x + have_left + (y + 1 < height)));
+#if (WEBP_NEAR_LOSSLESS == 1)
+ if (max_quantization > 1 && y >= 1 && y + 1 < height) {
+ MaxDiffsForRow(context_width, width, argb + y * width + context_start_x,
+ max_diffs + context_start_x, used_subtract_green);
+ }
+#endif
+
+ GetResidual(width, height, upper_row, current_row, max_diffs, mode,
+ start_x, start_x + max_x, y, max_quantization, exact,
+ used_subtract_green, residuals);
+ for (relative_x = 0; relative_x < max_x; ++relative_x) {
+ UpdateHisto(histo_argb, residuals[relative_x]);
+ }
+ }
+ cur_diff = PredictionCostSpatialHistogram(
+ (const int (*)[256])accumulated, (const int (*)[256])histo_argb);
+ // Favor keeping the areas locally similar.
+ if (mode == left_mode) cur_diff -= kSpatialPredictorBias;
+ if (mode == above_mode) cur_diff -= kSpatialPredictorBias;
+
+ if (cur_diff < best_diff) {
+ int (*tmp)[256] = histo_argb;
+ histo_argb = best_histo;
+ best_histo = tmp;
+ best_diff = cur_diff;
+ best_mode = mode;
+ }
+ }
+
+ for (i = 0; i < 4; i++) {
+ for (j = 0; j < 256; j++) {
+ accumulated[i][j] += best_histo[i][j];
+ }
+ }
+
+ return best_mode;
+}
+
+// Converts pixels of the image to residuals with respect to predictions.
+// If max_quantization > 1, applies near lossless processing, quantizing
+// residuals to multiples of quantization levels up to max_quantization
+// (the actual quantization level depends on smoothness near the given pixel).
+static void CopyImageWithPrediction(int width, int height,
+ int bits, uint32_t* const modes,
+ uint32_t* const argb_scratch,
+ uint32_t* const argb,
+ int low_effort, int max_quantization,
+ int exact, int used_subtract_green) {
+ const int tiles_per_row = VP8LSubSampleSize(width, bits);
+ // The width of upper_row and current_row is one pixel larger than image width
+ // to allow the top right pixel to point to the leftmost pixel of the next row
+ // when at the right edge.
+ uint32_t* upper_row = argb_scratch;
+ uint32_t* current_row = upper_row + width + 1;
+ uint8_t* current_max_diffs = (uint8_t*)(current_row + width + 1);
+#if (WEBP_NEAR_LOSSLESS == 1)
+ uint8_t* lower_max_diffs = current_max_diffs + width;
+#endif
+ int y;
+
+ for (y = 0; y < height; ++y) {
+ int x;
+ uint32_t* const tmp32 = upper_row;
+ upper_row = current_row;
+ current_row = tmp32;
+ memcpy(current_row, argb + y * width,
+ sizeof(*argb) * (width + (y + 1 < height)));
+
+ if (low_effort) {
+ PredictBatch(kPredLowEffort, 0, y, width, current_row, upper_row,
+ argb + y * width);
+ } else {
+#if (WEBP_NEAR_LOSSLESS == 1)
+ if (max_quantization > 1) {
+ // Compute max_diffs for the lower row now, because that needs the
+ // contents of argb for the current row, which we will overwrite with
+ // residuals before proceeding with the next row.
+ uint8_t* const tmp8 = current_max_diffs;
+ current_max_diffs = lower_max_diffs;
+ lower_max_diffs = tmp8;
+ if (y + 2 < height) {
+ MaxDiffsForRow(width, width, argb + (y + 1) * width, lower_max_diffs,
+ used_subtract_green);
+ }
+ }
+#endif
+ for (x = 0; x < width;) {
+ const int mode =
+ (modes[(y >> bits) * tiles_per_row + (x >> bits)] >> 8) & 0xff;
+ int x_end = x + (1 << bits);
+ if (x_end > width) x_end = width;
+ GetResidual(width, height, upper_row, current_row, current_max_diffs,
+ mode, x, x_end, y, max_quantization, exact,
+ used_subtract_green, argb + y * width + x);
+ x = x_end;
+ }
+ }
+ }
+}
+
+// Finds the best predictor for each tile, and converts the image to residuals
+// with respect to predictions. If near_lossless_quality < 100, applies
+// near lossless processing, shaving off more bits of residuals for lower
+// qualities.
+void VP8LResidualImage(int width, int height, int bits, int low_effort,
+ uint32_t* const argb, uint32_t* const argb_scratch,
+ uint32_t* const image, int near_lossless_quality,
+ int exact, int used_subtract_green) {
+ const int tiles_per_row = VP8LSubSampleSize(width, bits);
+ const int tiles_per_col = VP8LSubSampleSize(height, bits);
+ int tile_y;
+ int histo[4][256];
+ const int max_quantization = 1 << VP8LNearLosslessBits(near_lossless_quality);
+ if (low_effort) {
+ int i;
+ for (i = 0; i < tiles_per_row * tiles_per_col; ++i) {
+ image[i] = ARGB_BLACK | (kPredLowEffort << 8);
+ }
+ } else {
+ memset(histo, 0, sizeof(histo));
+ for (tile_y = 0; tile_y < tiles_per_col; ++tile_y) {
+ int tile_x;
+ for (tile_x = 0; tile_x < tiles_per_row; ++tile_x) {
+ const int pred = GetBestPredictorForTile(width, height, tile_x, tile_y,
+ bits, histo, argb_scratch, argb, max_quantization, exact,
+ used_subtract_green, image);
+ image[tile_y * tiles_per_row + tile_x] = ARGB_BLACK | (pred << 8);
+ }
+ }
+ }
+
+ CopyImageWithPrediction(width, height, bits, image, argb_scratch, argb,
+ low_effort, max_quantization, exact,
+ used_subtract_green);
+}
+
+//------------------------------------------------------------------------------
+// Color transform functions.
+
+static WEBP_INLINE void MultipliersClear(VP8LMultipliers* const m) {
+ m->green_to_red_ = 0;
+ m->green_to_blue_ = 0;
+ m->red_to_blue_ = 0;
+}
+
+static WEBP_INLINE void ColorCodeToMultipliers(uint32_t color_code,
+ VP8LMultipliers* const m) {
+ m->green_to_red_ = (color_code >> 0) & 0xff;
+ m->green_to_blue_ = (color_code >> 8) & 0xff;
+ m->red_to_blue_ = (color_code >> 16) & 0xff;
+}
+
+static WEBP_INLINE uint32_t MultipliersToColorCode(
+ const VP8LMultipliers* const m) {
+ return 0xff000000u |
+ ((uint32_t)(m->red_to_blue_) << 16) |
+ ((uint32_t)(m->green_to_blue_) << 8) |
+ m->green_to_red_;
+}
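+
+// For illustration: {green_to_red_ == 0x01, green_to_blue_ == 0x02,
+// red_to_blue_ == 0x03} packs to the color code 0xff030201;
+// ColorCodeToMultipliers() above is its exact inverse.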
+
+static float PredictionCostCrossColor(const int accumulated[256],
+ const int counts[256]) {
+  // Favor low entropy, locally and globally.
+  // Favor small absolute values for PredictionCostSpatial.
+ static const double kExpValue = 2.4;
+ return VP8LCombinedShannonEntropy(counts, accumulated) +
+ PredictionCostSpatial(counts, 3, kExpValue);
+}
+
+static float GetPredictionCostCrossColorRed(
+ const uint32_t* argb, int stride, int tile_width, int tile_height,
+ VP8LMultipliers prev_x, VP8LMultipliers prev_y, int green_to_red,
+ const int accumulated_red_histo[256]) {
+ int histo[256] = { 0 };
+ float cur_diff;
+
+ VP8LCollectColorRedTransforms(argb, stride, tile_width, tile_height,
+ green_to_red, histo);
+
+ cur_diff = PredictionCostCrossColor(accumulated_red_histo, histo);
+ if ((uint8_t)green_to_red == prev_x.green_to_red_) {
+ cur_diff -= 3; // favor keeping the areas locally similar
+ }
+ if ((uint8_t)green_to_red == prev_y.green_to_red_) {
+ cur_diff -= 3; // favor keeping the areas locally similar
+ }
+ if (green_to_red == 0) {
+ cur_diff -= 3;
+ }
+ return cur_diff;
+}
+
+static void GetBestGreenToRed(
+ const uint32_t* argb, int stride, int tile_width, int tile_height,
+ VP8LMultipliers prev_x, VP8LMultipliers prev_y, int quality,
+ const int accumulated_red_histo[256], VP8LMultipliers* const best_tx) {
+ const int kMaxIters = 4 + ((7 * quality) >> 8); // in range [4..6]
+ int green_to_red_best = 0;
+ int iter, offset;
+ float best_diff = GetPredictionCostCrossColorRed(
+ argb, stride, tile_width, tile_height, prev_x, prev_y,
+ green_to_red_best, accumulated_red_histo);
+ for (iter = 0; iter < kMaxIters; ++iter) {
+    // ColorTransformDelta is a 3.5 fixed-point value (5 fractional bits), so
+    // 32 equals one in the color computation. An initial delta of 32 (i.e.
+    // one) is sufficient to explore the range of (-2, 2).
+ const int delta = 32 >> iter;
+ // Try a negative and a positive delta from the best known value.
+ for (offset = -delta; offset <= delta; offset += 2 * delta) {
+ const int green_to_red_cur = offset + green_to_red_best;
+ const float cur_diff = GetPredictionCostCrossColorRed(
+ argb, stride, tile_width, tile_height, prev_x, prev_y,
+ green_to_red_cur, accumulated_red_histo);
+ if (cur_diff < best_diff) {
+ best_diff = cur_diff;
+ green_to_red_best = green_to_red_cur;
+ }
+ }
+ }
+ best_tx->green_to_red_ = (green_to_red_best & 0xff);
+}
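+
+// A note on the search above: at the maximum kMaxIters == 6 the probed
+// deltas are 32, 16, 8, 4, 2, 1, so starting from zero every multiplier in
+// [-63, 63] is reachable in principle, i.e. roughly the (-2, 2) range in
+// 3.5 fixed point.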
+
+static float GetPredictionCostCrossColorBlue(
+ const uint32_t* argb, int stride, int tile_width, int tile_height,
+ VP8LMultipliers prev_x, VP8LMultipliers prev_y,
+ int green_to_blue, int red_to_blue, const int accumulated_blue_histo[256]) {
+ int histo[256] = { 0 };
+ float cur_diff;
+
+ VP8LCollectColorBlueTransforms(argb, stride, tile_width, tile_height,
+ green_to_blue, red_to_blue, histo);
+
+ cur_diff = PredictionCostCrossColor(accumulated_blue_histo, histo);
+ if ((uint8_t)green_to_blue == prev_x.green_to_blue_) {
+ cur_diff -= 3; // favor keeping the areas locally similar
+ }
+ if ((uint8_t)green_to_blue == prev_y.green_to_blue_) {
+ cur_diff -= 3; // favor keeping the areas locally similar
+ }
+ if ((uint8_t)red_to_blue == prev_x.red_to_blue_) {
+ cur_diff -= 3; // favor keeping the areas locally similar
+ }
+ if ((uint8_t)red_to_blue == prev_y.red_to_blue_) {
+ cur_diff -= 3; // favor keeping the areas locally similar
+ }
+ if (green_to_blue == 0) {
+ cur_diff -= 3;
+ }
+ if (red_to_blue == 0) {
+ cur_diff -= 3;
+ }
+ return cur_diff;
+}
+
+#define kGreenRedToBlueNumAxis 8
+#define kGreenRedToBlueMaxIters 7
+static void GetBestGreenRedToBlue(
+ const uint32_t* argb, int stride, int tile_width, int tile_height,
+ VP8LMultipliers prev_x, VP8LMultipliers prev_y, int quality,
+ const int accumulated_blue_histo[256],
+ VP8LMultipliers* const best_tx) {
+ const int8_t offset[kGreenRedToBlueNumAxis][2] =
+ {{0, -1}, {0, 1}, {-1, 0}, {1, 0}, {-1, -1}, {-1, 1}, {1, -1}, {1, 1}};
+ const int8_t delta_lut[kGreenRedToBlueMaxIters] = { 16, 16, 8, 4, 2, 2, 2 };
+ const int iters =
+ (quality < 25) ? 1 : (quality > 50) ? kGreenRedToBlueMaxIters : 4;
+ int green_to_blue_best = 0;
+ int red_to_blue_best = 0;
+ int iter;
+ // Initial value at origin:
+ float best_diff = GetPredictionCostCrossColorBlue(
+ argb, stride, tile_width, tile_height, prev_x, prev_y,
+ green_to_blue_best, red_to_blue_best, accumulated_blue_histo);
+ for (iter = 0; iter < iters; ++iter) {
+ const int delta = delta_lut[iter];
+ int axis;
+ for (axis = 0; axis < kGreenRedToBlueNumAxis; ++axis) {
+ const int green_to_blue_cur =
+ offset[axis][0] * delta + green_to_blue_best;
+ const int red_to_blue_cur = offset[axis][1] * delta + red_to_blue_best;
+ const float cur_diff = GetPredictionCostCrossColorBlue(
+ argb, stride, tile_width, tile_height, prev_x, prev_y,
+ green_to_blue_cur, red_to_blue_cur, accumulated_blue_histo);
+ if (cur_diff < best_diff) {
+ best_diff = cur_diff;
+ green_to_blue_best = green_to_blue_cur;
+ red_to_blue_best = red_to_blue_cur;
+ }
+ if (quality < 25 && iter == 4) {
+ // Only axis aligned diffs for lower quality.
+ break; // next iter.
+ }
+ }
+ if (delta == 2 && green_to_blue_best == 0 && red_to_blue_best == 0) {
+ // Further iterations would not help.
+ break; // out of iter-loop.
+ }
+ }
+ best_tx->green_to_blue_ = green_to_blue_best & 0xff;
+ best_tx->red_to_blue_ = red_to_blue_best & 0xff;
+}
+#undef kGreenRedToBlueMaxIters
+#undef kGreenRedToBlueNumAxis
+
+static VP8LMultipliers GetBestColorTransformForTile(
+ int tile_x, int tile_y, int bits,
+ VP8LMultipliers prev_x,
+ VP8LMultipliers prev_y,
+ int quality, int xsize, int ysize,
+ const int accumulated_red_histo[256],
+ const int accumulated_blue_histo[256],
+ const uint32_t* const argb) {
+ const int max_tile_size = 1 << bits;
+ const int tile_y_offset = tile_y * max_tile_size;
+ const int tile_x_offset = tile_x * max_tile_size;
+ const int all_x_max = GetMin(tile_x_offset + max_tile_size, xsize);
+ const int all_y_max = GetMin(tile_y_offset + max_tile_size, ysize);
+ const int tile_width = all_x_max - tile_x_offset;
+ const int tile_height = all_y_max - tile_y_offset;
+ const uint32_t* const tile_argb = argb + tile_y_offset * xsize
+ + tile_x_offset;
+ VP8LMultipliers best_tx;
+ MultipliersClear(&best_tx);
+
+ GetBestGreenToRed(tile_argb, xsize, tile_width, tile_height,
+ prev_x, prev_y, quality, accumulated_red_histo, &best_tx);
+ GetBestGreenRedToBlue(tile_argb, xsize, tile_width, tile_height,
+ prev_x, prev_y, quality, accumulated_blue_histo,
+ &best_tx);
+ return best_tx;
+}
+
+static void CopyTileWithColorTransform(int xsize, int ysize,
+ int tile_x, int tile_y,
+ int max_tile_size,
+ VP8LMultipliers color_transform,
+ uint32_t* argb) {
+ const int xscan = GetMin(max_tile_size, xsize - tile_x);
+ int yscan = GetMin(max_tile_size, ysize - tile_y);
+ argb += tile_y * xsize + tile_x;
+ while (yscan-- > 0) {
+ VP8LTransformColor(&color_transform, argb, xscan);
+ argb += xsize;
+ }
+}
+
+void VP8LColorSpaceTransform(int width, int height, int bits, int quality,
+ uint32_t* const argb, uint32_t* image) {
+ const int max_tile_size = 1 << bits;
+ const int tile_xsize = VP8LSubSampleSize(width, bits);
+ const int tile_ysize = VP8LSubSampleSize(height, bits);
+ int accumulated_red_histo[256] = { 0 };
+ int accumulated_blue_histo[256] = { 0 };
+ int tile_x, tile_y;
+ VP8LMultipliers prev_x, prev_y;
+ MultipliersClear(&prev_y);
+ MultipliersClear(&prev_x);
+ for (tile_y = 0; tile_y < tile_ysize; ++tile_y) {
+ for (tile_x = 0; tile_x < tile_xsize; ++tile_x) {
+ int y;
+ const int tile_x_offset = tile_x * max_tile_size;
+ const int tile_y_offset = tile_y * max_tile_size;
+ const int all_x_max = GetMin(tile_x_offset + max_tile_size, width);
+ const int all_y_max = GetMin(tile_y_offset + max_tile_size, height);
+ const int offset = tile_y * tile_xsize + tile_x;
+ if (tile_y != 0) {
+ ColorCodeToMultipliers(image[offset - tile_xsize], &prev_y);
+ }
+ prev_x = GetBestColorTransformForTile(tile_x, tile_y, bits,
+ prev_x, prev_y,
+ quality, width, height,
+ accumulated_red_histo,
+ accumulated_blue_histo,
+ argb);
+ image[offset] = MultipliersToColorCode(&prev_x);
+ CopyTileWithColorTransform(width, height, tile_x_offset, tile_y_offset,
+ max_tile_size, prev_x, argb);
+
+ // Gather accumulated histogram data.
+ for (y = tile_y_offset; y < all_y_max; ++y) {
+ int ix = y * width + tile_x_offset;
+ const int ix_end = ix + all_x_max - tile_x_offset;
+ for (; ix < ix_end; ++ix) {
+ const uint32_t pix = argb[ix];
+ if (ix >= 2 &&
+ pix == argb[ix - 2] &&
+ pix == argb[ix - 1]) {
+ continue; // repeated pixels are handled by backward references
+ }
+ if (ix >= width + 2 &&
+ argb[ix - 2] == argb[ix - width - 2] &&
+ argb[ix - 1] == argb[ix - width - 1] &&
+ pix == argb[ix - width]) {
+ continue; // repeated pixels are handled by backward references
+ }
+ ++accumulated_red_histo[(pix >> 16) & 0xff];
+ ++accumulated_blue_histo[(pix >> 0) & 0xff];
+ }
+ }
+ }
+ }
+}
diff --git a/media/libwebp/enc/quant_enc.c b/media/libwebp/enc/quant_enc.c
new file mode 100644
index 0000000000..029d62ca05
--- /dev/null
+++ b/media/libwebp/enc/quant_enc.c
@@ -0,0 +1,1388 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Quantization
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <assert.h>
+#include <math.h>
+#include <stdlib.h> // for abs()
+
+#include "../dsp/quant.h"
+#include "../enc/vp8i_enc.h"
+#include "../enc/cost_enc.h"
+
+#define DO_TRELLIS_I4 1
+#define DO_TRELLIS_I16 1 // not a huge gain, but ok at low bitrate.
+#define DO_TRELLIS_UV 0 // disable trellis for UV. Risky. Not worth it.
+#define USE_TDISTO 1
+
+#define MID_ALPHA 64 // neutral value for susceptibility
+#define MIN_ALPHA 30 // lowest usable value for susceptibility
+#define MAX_ALPHA 100 // highest meaningful value for susceptibility
+
+#define SNS_TO_DQ 0.9 // Scaling constant between the sns value and the QP
+ // power-law modulation. Must be strictly less than 1.
+
+// number of non-zero coeffs below which we consider the block very flat
+// (and apply a penalty to complex predictions)
+#define FLATNESS_LIMIT_I16 0 // I16 mode (special case)
+#define FLATNESS_LIMIT_I4 3 // I4 mode
+#define FLATNESS_LIMIT_UV 2 // UV mode
+#define FLATNESS_PENALTY 140 // roughly 1 bit per block
+
+#define MULT_8B(a, b) (((a) * (b) + 128) >> 8)
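+// (MULT_8B(a, b) computes (a * b) / 256 rounded to nearest; e.g.
+// MULT_8B(1, 200) is 1 where plain truncation would give 0.)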
+
+#define RD_DISTO_MULT 256 // distortion multiplier (equivalent of lambda)
+
+// #define DEBUG_BLOCK
+
+//------------------------------------------------------------------------------
+
+#if defined(DEBUG_BLOCK)
+
+#include <stdio.h>
+#include <stdlib.h>
+
+static void PrintBlockInfo(const VP8EncIterator* const it,
+ const VP8ModeScore* const rd) {
+ int i, j;
+ const int is_i16 = (it->mb_->type_ == 1);
+ const uint8_t* const y_in = it->yuv_in_ + Y_OFF_ENC;
+ const uint8_t* const y_out = it->yuv_out_ + Y_OFF_ENC;
+ const uint8_t* const uv_in = it->yuv_in_ + U_OFF_ENC;
+ const uint8_t* const uv_out = it->yuv_out_ + U_OFF_ENC;
+ printf("SOURCE / OUTPUT / ABS DELTA\n");
+ for (j = 0; j < 16; ++j) {
+ for (i = 0; i < 16; ++i) printf("%3d ", y_in[i + j * BPS]);
+ printf(" ");
+ for (i = 0; i < 16; ++i) printf("%3d ", y_out[i + j * BPS]);
+ printf(" ");
+ for (i = 0; i < 16; ++i) {
+ printf("%1d ", abs(y_in[i + j * BPS] - y_out[i + j * BPS]));
+ }
+ printf("\n");
+ }
+ printf("\n"); // newline before the U/V block
+ for (j = 0; j < 8; ++j) {
+ for (i = 0; i < 8; ++i) printf("%3d ", uv_in[i + j * BPS]);
+ printf(" ");
+ for (i = 8; i < 16; ++i) printf("%3d ", uv_in[i + j * BPS]);
+ printf(" ");
+ for (i = 0; i < 8; ++i) printf("%3d ", uv_out[i + j * BPS]);
+ printf(" ");
+ for (i = 8; i < 16; ++i) printf("%3d ", uv_out[i + j * BPS]);
+ printf(" ");
+ for (i = 0; i < 8; ++i) {
+ printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
+ }
+ printf(" ");
+ for (i = 8; i < 16; ++i) {
+ printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
+ }
+ printf("\n");
+ }
+ printf("\nD:%d SD:%d R:%d H:%d nz:0x%x score:%d\n",
+ (int)rd->D, (int)rd->SD, (int)rd->R, (int)rd->H, (int)rd->nz,
+ (int)rd->score);
+ if (is_i16) {
+ printf("Mode: %d\n", rd->mode_i16);
+ printf("y_dc_levels:");
+ for (i = 0; i < 16; ++i) printf("%3d ", rd->y_dc_levels[i]);
+ printf("\n");
+ } else {
+ printf("Modes[16]: ");
+ for (i = 0; i < 16; ++i) printf("%d ", rd->modes_i4[i]);
+ printf("\n");
+ }
+ printf("y_ac_levels:\n");
+ for (j = 0; j < 16; ++j) {
+ for (i = is_i16 ? 1 : 0; i < 16; ++i) {
+ printf("%4d ", rd->y_ac_levels[j][i]);
+ }
+ printf("\n");
+ }
+ printf("\n");
+ printf("uv_levels (mode=%d):\n", rd->mode_uv);
+ for (j = 0; j < 8; ++j) {
+ for (i = 0; i < 16; ++i) {
+ printf("%4d ", rd->uv_levels[j][i]);
+ }
+ printf("\n");
+ }
+}
+
+#endif // DEBUG_BLOCK
+
+//------------------------------------------------------------------------------
+
+static WEBP_INLINE int clip(int v, int m, int M) {
+ return v < m ? m : v > M ? M : v;
+}
+
+static const uint8_t kZigzag[16] = {
+ 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
+};
+
+static const uint8_t kDcTable[128] = {
+ 4, 5, 6, 7, 8, 9, 10, 10,
+ 11, 12, 13, 14, 15, 16, 17, 17,
+ 18, 19, 20, 20, 21, 21, 22, 22,
+ 23, 23, 24, 25, 25, 26, 27, 28,
+ 29, 30, 31, 32, 33, 34, 35, 36,
+ 37, 37, 38, 39, 40, 41, 42, 43,
+ 44, 45, 46, 46, 47, 48, 49, 50,
+ 51, 52, 53, 54, 55, 56, 57, 58,
+ 59, 60, 61, 62, 63, 64, 65, 66,
+ 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 76, 77, 78, 79, 80, 81,
+ 82, 83, 84, 85, 86, 87, 88, 89,
+ 91, 93, 95, 96, 98, 100, 101, 102,
+ 104, 106, 108, 110, 112, 114, 116, 118,
+ 122, 124, 126, 128, 130, 132, 134, 136,
+ 138, 140, 143, 145, 148, 151, 154, 157
+};
+
+static const uint16_t kAcTable[128] = {
+ 4, 5, 6, 7, 8, 9, 10, 11,
+ 12, 13, 14, 15, 16, 17, 18, 19,
+ 20, 21, 22, 23, 24, 25, 26, 27,
+ 28, 29, 30, 31, 32, 33, 34, 35,
+ 36, 37, 38, 39, 40, 41, 42, 43,
+ 44, 45, 46, 47, 48, 49, 50, 51,
+ 52, 53, 54, 55, 56, 57, 58, 60,
+ 62, 64, 66, 68, 70, 72, 74, 76,
+ 78, 80, 82, 84, 86, 88, 90, 92,
+ 94, 96, 98, 100, 102, 104, 106, 108,
+ 110, 112, 114, 116, 119, 122, 125, 128,
+ 131, 134, 137, 140, 143, 146, 149, 152,
+ 155, 158, 161, 164, 167, 170, 173, 177,
+ 181, 185, 189, 193, 197, 201, 205, 209,
+ 213, 217, 221, 225, 229, 234, 239, 245,
+ 249, 254, 259, 264, 269, 274, 279, 284
+};
+
+static const uint16_t kAcTable2[128] = {
+ 8, 8, 9, 10, 12, 13, 15, 17,
+ 18, 20, 21, 23, 24, 26, 27, 29,
+ 31, 32, 34, 35, 37, 38, 40, 41,
+ 43, 44, 46, 48, 49, 51, 52, 54,
+ 55, 57, 58, 60, 62, 63, 65, 66,
+ 68, 69, 71, 72, 74, 75, 77, 79,
+ 80, 82, 83, 85, 86, 88, 89, 93,
+ 96, 99, 102, 105, 108, 111, 114, 117,
+ 120, 124, 127, 130, 133, 136, 139, 142,
+ 145, 148, 151, 155, 158, 161, 164, 167,
+ 170, 173, 176, 179, 184, 189, 193, 198,
+ 203, 207, 212, 217, 221, 226, 230, 235,
+ 240, 244, 249, 254, 258, 263, 268, 274,
+ 280, 286, 292, 299, 305, 311, 317, 323,
+ 330, 336, 342, 348, 354, 362, 370, 379,
+ 385, 393, 401, 409, 416, 424, 432, 440
+};
+
+static const uint8_t kBiasMatrices[3][2] = { // [luma-ac,luma-dc,chroma][dc,ac]
+ { 96, 110 }, { 96, 108 }, { 110, 115 }
+};
+
+// Sharpening by (slightly) raising the hi-frequency coeffs.
+// Hack-ish but helpful for mid-bitrate range. Use with care.
+#define SHARPEN_BITS 11 // number of descaling bits for sharpening bias
+static const uint8_t kFreqSharpening[16] = {
+ 0, 30, 60, 90,
+ 30, 60, 90, 90,
+ 60, 90, 90, 90,
+ 90, 90, 90, 90
+};
+
+//------------------------------------------------------------------------------
+// Initialize quantization parameters in VP8Matrix
+
+// Returns the average quantizer
+static int ExpandMatrix(VP8Matrix* const m, int type) {
+ int i, sum;
+ for (i = 0; i < 2; ++i) {
+ const int is_ac_coeff = (i > 0);
+ const int bias = kBiasMatrices[type][is_ac_coeff];
+ m->iq_[i] = (1 << QFIX) / m->q_[i];
+ m->bias_[i] = BIAS(bias);
+ // zthresh_ is the exact value such that QUANTDIV(coeff, iQ, B) is:
+ // * zero if coeff <= zthresh
+ // * non-zero if coeff > zthresh
+ m->zthresh_[i] = ((1 << QFIX) - 1 - m->bias_[i]) / m->iq_[i];
+ }
+ for (i = 2; i < 16; ++i) {
+ m->q_[i] = m->q_[1];
+ m->iq_[i] = m->iq_[1];
+ m->bias_[i] = m->bias_[1];
+ m->zthresh_[i] = m->zthresh_[1];
+ }
+ for (sum = 0, i = 0; i < 16; ++i) {
+ if (type == 0) { // we only use sharpening for AC luma coeffs
+ m->sharpen_[i] = (kFreqSharpening[i] * m->q_[i]) >> SHARPEN_BITS;
+ } else {
+ m->sharpen_[i] = 0;
+ }
+ sum += m->q_[i];
+ }
+ return (sum + 8) >> 4;
+}
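+
+// A sketch of the zthresh_ relation above, assuming the usual definitions
+// from vp8i_enc.h (QFIX == 17, BIAS(b) == (b) << (QFIX - 8) and
+// QUANTDIV(n, iQ, B) == ((n) * (iQ) + (B)) >> QFIX): the quotient is zero
+// exactly while n * iQ + B < (1 << QFIX), i.e. while n <= zthresh_.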
+
+static void CheckLambdaValue(int* const v) { if (*v < 1) *v = 1; }
+
+static void SetupMatrices(VP8Encoder* enc) {
+ int i;
+ const int tlambda_scale =
+ (enc->method_ >= 4) ? enc->config_->sns_strength
+ : 0;
+ const int num_segments = enc->segment_hdr_.num_segments_;
+ for (i = 0; i < num_segments; ++i) {
+ VP8SegmentInfo* const m = &enc->dqm_[i];
+ const int q = m->quant_;
+ int q_i4, q_i16, q_uv;
+ m->y1_.q_[0] = kDcTable[clip(q + enc->dq_y1_dc_, 0, 127)];
+ m->y1_.q_[1] = kAcTable[clip(q, 0, 127)];
+
+ m->y2_.q_[0] = kDcTable[ clip(q + enc->dq_y2_dc_, 0, 127)] * 2;
+ m->y2_.q_[1] = kAcTable2[clip(q + enc->dq_y2_ac_, 0, 127)];
+
+ m->uv_.q_[0] = kDcTable[clip(q + enc->dq_uv_dc_, 0, 117)];
+ m->uv_.q_[1] = kAcTable[clip(q + enc->dq_uv_ac_, 0, 127)];
+
+ q_i4 = ExpandMatrix(&m->y1_, 0);
+ q_i16 = ExpandMatrix(&m->y2_, 1);
+ q_uv = ExpandMatrix(&m->uv_, 2);
+
+ m->lambda_i4_ = (3 * q_i4 * q_i4) >> 7;
+ m->lambda_i16_ = (3 * q_i16 * q_i16);
+ m->lambda_uv_ = (3 * q_uv * q_uv) >> 6;
+ m->lambda_mode_ = (1 * q_i4 * q_i4) >> 7;
+ m->lambda_trellis_i4_ = (7 * q_i4 * q_i4) >> 3;
+ m->lambda_trellis_i16_ = (q_i16 * q_i16) >> 2;
+ m->lambda_trellis_uv_ = (q_uv * q_uv) << 1;
+ m->tlambda_ = (tlambda_scale * q_i4) >> 5;
+
+ // none of these constants should be < 1
+ CheckLambdaValue(&m->lambda_i4_);
+ CheckLambdaValue(&m->lambda_i16_);
+ CheckLambdaValue(&m->lambda_uv_);
+ CheckLambdaValue(&m->lambda_mode_);
+ CheckLambdaValue(&m->lambda_trellis_i4_);
+ CheckLambdaValue(&m->lambda_trellis_i16_);
+ CheckLambdaValue(&m->lambda_trellis_uv_);
+ CheckLambdaValue(&m->tlambda_);
+
+ m->min_disto_ = 20 * m->y1_.q_[0]; // quantization-aware min disto
+ m->max_edge_ = 0;
+
+ m->i4_penalty_ = 1000 * q_i4 * q_i4;
+ }
+}
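+
+// Note that the lambda_* values above grow roughly as q^2 (tlambda_ only
+// linearly), tracking the squared-error distortion terms in the RD scores
+// so that the rate/distortion balance stays comparable across quantizers.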
+
+//------------------------------------------------------------------------------
+// Initialize filtering parameters
+
+// Very small filter-strength values have close to no visual effect. So we can
+// save a little decoding CPU by turning filtering off for these.
+#define FSTRENGTH_CUTOFF 2
+
+static void SetupFilterStrength(VP8Encoder* const enc) {
+ int i;
+ // level0 is in [0..500]. Using '-f 50' as filter_strength is mid-filtering.
+ const int level0 = 5 * enc->config_->filter_strength;
+ for (i = 0; i < NUM_MB_SEGMENTS; ++i) {
+ VP8SegmentInfo* const m = &enc->dqm_[i];
+ // We focus on the quantization of AC coeffs.
+ const int qstep = kAcTable[clip(m->quant_, 0, 127)] >> 2;
+ const int base_strength =
+ VP8FilterStrengthFromDelta(enc->filter_hdr_.sharpness_, qstep);
+ // Segments with lower complexity ('beta') will be less filtered.
+ const int f = base_strength * level0 / (256 + m->beta_);
+ m->fstrength_ = (f < FSTRENGTH_CUTOFF) ? 0 : (f > 63) ? 63 : f;
+ }
+ // We record the initial strength (mainly for the case of 1-segment only).
+ enc->filter_hdr_.level_ = enc->dqm_[0].fstrength_;
+ enc->filter_hdr_.simple_ = (enc->config_->filter_type == 0);
+ enc->filter_hdr_.sharpness_ = enc->config_->filter_sharpness;
+}
+
+//------------------------------------------------------------------------------
+
+// Note: if you change the values below, remember that the max range
+// allowed by the syntax for DQ_UV is [-16,16].
+#define MAX_DQ_UV (6)
+#define MIN_DQ_UV (-4)
+
+// We want to emulate jpeg-like behaviour where the expected "good" quality
+// is around q=75. Internally, our "good" middle is around c=50. So we
+// map accordingly using a piece-wise linear function.
+static double QualityToCompression(double c) {
+ const double linear_c = (c < 0.75) ? c * (2. / 3.) : 2. * c - 1.;
+ // The file size roughly scales as pow(quantizer, 3.). Actually, the
+ // exponent is somewhere between 2.8 and 3.2, but we're mostly interested
+ // in the mid-quant range. So we scale the compressibility inversely to
+ // this power-law: quant ~= compression ^ 1/3. This law holds well for
+ // low quant. Finer modeling for high-quant would make use of kAcTable[]
+ // more explicitly.
+ const double v = pow(linear_c, 1 / 3.);
+ return v;
+}
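+
+// For illustration: c == 0.75 maps to linear_c == 0.5 and a compression
+// factor of 0.5^(1/3) ~= 0.794, i.e. q ~= 127 * (1 - 0.794) ~= 26 before the
+// per-segment modulation applied in VP8SetSegmentParams() below.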
+
+static double QualityToJPEGCompression(double c, double alpha) {
+ // We map the complexity 'alpha' and quality setting 'c' to a compression
+ // exponent empirically matched to the compression curve of libjpeg6b.
+ // On average, the WebP output size will be roughly similar to that of a
+ // JPEG file compressed with same quality factor.
+ const double amin = 0.30;
+ const double amax = 0.85;
+ const double exp_min = 0.4;
+ const double exp_max = 0.9;
+ const double slope = (exp_min - exp_max) / (amax - amin);
+ // Linearly interpolate 'expn' from exp_min to exp_max
+ // in the [amin, amax] range.
+ const double expn = (alpha > amax) ? exp_min
+ : (alpha < amin) ? exp_max
+ : exp_max + slope * (alpha - amin);
+ const double v = pow(c, expn);
+ return v;
+}
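+
+// For illustration: a mid-complexity source with alpha == 0.5 yields
+// expn ~= 0.9 - 0.909 * 0.2 ~= 0.72, so c == 0.5 maps to 0.5^0.72 ~= 0.61.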
+
+static int SegmentsAreEquivalent(const VP8SegmentInfo* const S1,
+ const VP8SegmentInfo* const S2) {
+ return (S1->quant_ == S2->quant_) && (S1->fstrength_ == S2->fstrength_);
+}
+
+static void SimplifySegments(VP8Encoder* const enc) {
+ int map[NUM_MB_SEGMENTS] = { 0, 1, 2, 3 };
+ // 'num_segments_' is previously validated and <= NUM_MB_SEGMENTS, but an
+ // explicit check is needed to avoid a spurious warning about 'i' exceeding
+ // array bounds of 'dqm_' with some compilers (noticed with gcc-4.9).
+ const int num_segments = (enc->segment_hdr_.num_segments_ < NUM_MB_SEGMENTS)
+ ? enc->segment_hdr_.num_segments_
+ : NUM_MB_SEGMENTS;
+ int num_final_segments = 1;
+ int s1, s2;
+ for (s1 = 1; s1 < num_segments; ++s1) { // find similar segments
+ const VP8SegmentInfo* const S1 = &enc->dqm_[s1];
+ int found = 0;
+ // check if we already have similar segment
+ for (s2 = 0; s2 < num_final_segments; ++s2) {
+ const VP8SegmentInfo* const S2 = &enc->dqm_[s2];
+ if (SegmentsAreEquivalent(S1, S2)) {
+ found = 1;
+ break;
+ }
+ }
+ map[s1] = s2;
+ if (!found) {
+ if (num_final_segments != s1) {
+ enc->dqm_[num_final_segments] = enc->dqm_[s1];
+ }
+ ++num_final_segments;
+ }
+ }
+ if (num_final_segments < num_segments) { // Remap
+ int i = enc->mb_w_ * enc->mb_h_;
+ while (i-- > 0) enc->mb_info_[i].segment_ = map[enc->mb_info_[i].segment_];
+ enc->segment_hdr_.num_segments_ = num_final_segments;
+ // Replicate the trailing segment infos (it's mostly cosmetics)
+ for (i = num_final_segments; i < num_segments; ++i) {
+ enc->dqm_[i] = enc->dqm_[num_final_segments - 1];
+ }
+ }
+}
+
+void VP8SetSegmentParams(VP8Encoder* const enc, float quality) {
+ int i;
+ int dq_uv_ac, dq_uv_dc;
+ const int num_segments = enc->segment_hdr_.num_segments_;
+ const double amp = SNS_TO_DQ * enc->config_->sns_strength / 100. / 128.;
+ const double Q = quality / 100.;
+ const double c_base = enc->config_->emulate_jpeg_size ?
+ QualityToJPEGCompression(Q, enc->alpha_ / 255.) :
+ QualityToCompression(Q);
+ for (i = 0; i < num_segments; ++i) {
+    // We modulate the base coefficient to account for the quantization
+    // susceptibility and allow denser segments to be quantized more.
+ const double expn = 1. - amp * enc->dqm_[i].alpha_;
+ const double c = pow(c_base, expn);
+ const int q = (int)(127. * (1. - c));
+ assert(expn > 0.);
+ enc->dqm_[i].quant_ = clip(q, 0, 127);
+ }
+
+ // purely indicative in the bitstream (except for the 1-segment case)
+ enc->base_quant_ = enc->dqm_[0].quant_;
+
+ // fill-in values for the unused segments (required by the syntax)
+ for (i = num_segments; i < NUM_MB_SEGMENTS; ++i) {
+ enc->dqm_[i].quant_ = enc->base_quant_;
+ }
+
+ // uv_alpha_ is normally spread around ~60. The useful range is
+ // typically ~30 (quite bad) to ~100 (ok to decimate UV more).
+ // We map it to the safe maximal range of MAX/MIN_DQ_UV for dq_uv.
+ dq_uv_ac = (enc->uv_alpha_ - MID_ALPHA) * (MAX_DQ_UV - MIN_DQ_UV)
+ / (MAX_ALPHA - MIN_ALPHA);
+ // we rescale by the user-defined strength of adaptation
+ dq_uv_ac = dq_uv_ac * enc->config_->sns_strength / 100;
+ // and make it safe.
+ dq_uv_ac = clip(dq_uv_ac, MIN_DQ_UV, MAX_DQ_UV);
+  // We also boost the dc-uv-quant a little, based on sns-strength, since
+  // U/V channels are much more reactive to high quants (flat DC-blocks
+  // tend to appear, and are unpleasant).
+ dq_uv_dc = -4 * enc->config_->sns_strength / 100;
+ dq_uv_dc = clip(dq_uv_dc, -15, 15); // 4bit-signed max allowed
+
+ enc->dq_y1_dc_ = 0; // TODO(skal): dq-lum
+ enc->dq_y2_dc_ = 0;
+ enc->dq_y2_ac_ = 0;
+ enc->dq_uv_dc_ = dq_uv_dc;
+ enc->dq_uv_ac_ = dq_uv_ac;
+
+  SetupFilterStrength(enc); // initialize the segments' filtering, if needed
+
+ if (num_segments > 1) SimplifySegments(enc);
+
+ SetupMatrices(enc); // finalize quantization matrices
+}
+
+//------------------------------------------------------------------------------
+// Form the predictions in cache
+
+// Must be ordered using {DC_PRED, TM_PRED, V_PRED, H_PRED} as index
+const uint16_t VP8I16ModeOffsets[4] = { I16DC16, I16TM16, I16VE16, I16HE16 };
+const uint16_t VP8UVModeOffsets[4] = { C8DC8, C8TM8, C8VE8, C8HE8 };
+
+// Must be indexed using {B_DC_PRED .. B_HU_PRED} as index
+const uint16_t VP8I4ModeOffsets[NUM_BMODES] = {
+ I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4
+};
+
+void VP8MakeLuma16Preds(const VP8EncIterator* const it) {
+ const uint8_t* const left = it->x_ ? it->y_left_ : NULL;
+ const uint8_t* const top = it->y_ ? it->y_top_ : NULL;
+ VP8EncPredLuma16(it->yuv_p_, left, top);
+}
+
+void VP8MakeChroma8Preds(const VP8EncIterator* const it) {
+ const uint8_t* const left = it->x_ ? it->u_left_ : NULL;
+ const uint8_t* const top = it->y_ ? it->uv_top_ : NULL;
+ VP8EncPredChroma8(it->yuv_p_, left, top);
+}
+
+void VP8MakeIntra4Preds(const VP8EncIterator* const it) {
+ VP8EncPredLuma4(it->yuv_p_, it->i4_top_);
+}
+
+//------------------------------------------------------------------------------
+// Quantize
+
+// Layout:
+// +----+----+
+// |YYYY|UUVV| 0
+// |YYYY|UUVV| 4
+// |YYYY|....| 8
+// |YYYY|....| 12
+// +----+----+
+
+const uint16_t VP8Scan[16] = { // Luma
+ 0 + 0 * BPS, 4 + 0 * BPS, 8 + 0 * BPS, 12 + 0 * BPS,
+ 0 + 4 * BPS, 4 + 4 * BPS, 8 + 4 * BPS, 12 + 4 * BPS,
+ 0 + 8 * BPS, 4 + 8 * BPS, 8 + 8 * BPS, 12 + 8 * BPS,
+ 0 + 12 * BPS, 4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS,
+};
+
+static const uint16_t VP8ScanUV[4 + 4] = {
+ 0 + 0 * BPS, 4 + 0 * BPS, 0 + 4 * BPS, 4 + 4 * BPS, // U
+ 8 + 0 * BPS, 12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS // V
+};
+
+//------------------------------------------------------------------------------
+// Distortion measurement
+
+static const uint16_t kWeightY[16] = {
+ 38, 32, 20, 9, 32, 28, 17, 7, 20, 17, 10, 4, 9, 7, 4, 2
+};
+
+static const uint16_t kWeightTrellis[16] = {
+#if USE_TDISTO == 0
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
+#else
+ 30, 27, 19, 11,
+ 27, 24, 17, 10,
+ 19, 17, 12, 8,
+ 11, 10, 8, 6
+#endif
+};
+
+// Init/Copy the common fields in score.
+static void InitScore(VP8ModeScore* const rd) {
+ rd->D = 0;
+ rd->SD = 0;
+ rd->R = 0;
+ rd->H = 0;
+ rd->nz = 0;
+ rd->score = MAX_COST;
+}
+
+static void CopyScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
+ dst->D = src->D;
+ dst->SD = src->SD;
+ dst->R = src->R;
+ dst->H = src->H;
+ dst->nz = src->nz; // note that nz is not accumulated, but just copied.
+ dst->score = src->score;
+}
+
+static void AddScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
+ dst->D += src->D;
+ dst->SD += src->SD;
+ dst->R += src->R;
+ dst->H += src->H;
+ dst->nz |= src->nz; // here, new nz bits are accumulated.
+ dst->score += src->score;
+}
+
+//------------------------------------------------------------------------------
+// Performs trellis-optimized quantization.
+
+// Trellis node
+typedef struct {
+ int8_t prev; // best previous node
+ int8_t sign; // sign of coeff_i
+ int16_t level; // level
+} Node;
+
+// Score state
+typedef struct {
+ score_t score; // partial RD score
+ const uint16_t* costs; // shortcut to cost tables
+} ScoreState;
+
+// If a coefficient was quantized to a value Q (using a neutral bias),
+// we test all alternate possibilities between [Q-MIN_DELTA, Q+MAX_DELTA].
+// We don't test negative values though.
+#define MIN_DELTA 0 // how much lower level to try
+#define MAX_DELTA 1 // how much higher
+#define NUM_NODES (MIN_DELTA + 1 + MAX_DELTA)
+#define NODE(n, l) (nodes[(n)][(l) + MIN_DELTA])
+#define SCORE_STATE(n, l) (score_states[n][(l) + MIN_DELTA])
+
+static WEBP_INLINE void SetRDScore(int lambda, VP8ModeScore* const rd) {
+ rd->score = (rd->R + rd->H) * lambda + RD_DISTO_MULT * (rd->D + rd->SD);
+}
+
+static WEBP_INLINE score_t RDScoreTrellis(int lambda, score_t rate,
+ score_t distortion) {
+ return rate * lambda + RD_DISTO_MULT * distortion;
+}
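+
+// In both scores above, lambda sets the rate/distortion trade-off: with
+// RD_DISTO_MULT == 256, one unit of rate is worth lambda / 256 units of
+// (weighted) squared-error distortion.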
+
+// Coefficient type.
+enum { TYPE_I16_AC = 0, TYPE_I16_DC = 1, TYPE_CHROMA_A = 2, TYPE_I4_AC = 3 };
+
+static int TrellisQuantizeBlock(const VP8Encoder* const enc,
+ int16_t in[16], int16_t out[16],
+ int ctx0, int coeff_type,
+ const VP8Matrix* const mtx,
+ int lambda) {
+ const ProbaArray* const probas = enc->proba_.coeffs_[coeff_type];
+ CostArrayPtr const costs =
+ (CostArrayPtr)enc->proba_.remapped_costs_[coeff_type];
+ const int first = (coeff_type == TYPE_I16_AC) ? 1 : 0;
+ Node nodes[16][NUM_NODES];
+ ScoreState score_states[2][NUM_NODES];
+ ScoreState* ss_cur = &SCORE_STATE(0, MIN_DELTA);
+ ScoreState* ss_prev = &SCORE_STATE(1, MIN_DELTA);
+ int best_path[3] = {-1, -1, -1}; // store best-last/best-level/best-previous
+ score_t best_score;
+ int n, m, p, last;
+
+ {
+ score_t cost;
+ const int thresh = mtx->q_[1] * mtx->q_[1] / 4;
+ const int last_proba = probas[VP8EncBands[first]][ctx0][0];
+
+ // compute the position of the last interesting coefficient
+ last = first - 1;
+ for (n = 15; n >= first; --n) {
+ const int j = kZigzag[n];
+ const int err = in[j] * in[j];
+ if (err > thresh) {
+ last = n;
+ break;
+ }
+ }
+    // we don't need to inspect coefficients all the way up to n = 16. We can
+    // just go up to last + 1 (inclusive) without losing much.
+ if (last < 15) ++last;
+
+ // compute 'skip' score. This is the max score one can do.
+ cost = VP8BitCost(0, last_proba);
+ best_score = RDScoreTrellis(lambda, cost, 0);
+
+ // initialize source node.
+ for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
+ const score_t rate = (ctx0 == 0) ? VP8BitCost(1, last_proba) : 0;
+ ss_cur[m].score = RDScoreTrellis(lambda, rate, 0);
+ ss_cur[m].costs = costs[first][ctx0];
+ }
+ }
+
+ // traverse trellis.
+ for (n = first; n <= last; ++n) {
+ const int j = kZigzag[n];
+ const uint32_t Q = mtx->q_[j];
+ const uint32_t iQ = mtx->iq_[j];
+ const uint32_t B = BIAS(0x00); // neutral bias
+ // note: it's important to take sign of the _original_ coeff,
+ // so we don't have to consider level < 0 afterward.
+ const int sign = (in[j] < 0);
+ const uint32_t coeff0 = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
+ int level0 = QUANTDIV(coeff0, iQ, B);
+ int thresh_level = QUANTDIV(coeff0, iQ, BIAS(0x80));
+ if (thresh_level > MAX_LEVEL) thresh_level = MAX_LEVEL;
+ if (level0 > MAX_LEVEL) level0 = MAX_LEVEL;
+
+ { // Swap current and previous score states
+ ScoreState* const tmp = ss_cur;
+ ss_cur = ss_prev;
+ ss_prev = tmp;
+ }
+
+ // test all alternate level values around level0.
+ for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
+ Node* const cur = &NODE(n, m);
+ const int level = level0 + m;
+ const int ctx = (level > 2) ? 2 : level;
+ const int band = VP8EncBands[n + 1];
+ score_t base_score;
+ score_t best_cur_score;
+ int best_prev;
+ score_t cost, score;
+
+ ss_cur[m].costs = costs[n + 1][ctx];
+ if (level < 0 || level > thresh_level) {
+ ss_cur[m].score = MAX_COST;
+ // Node is dead.
+ continue;
+ }
+
+ {
+        // Compute delta_error = how much coding this level will
+        // subtract from max_error as distortion.
+        // Here, distortion = sum of (|coeff_i| - level_i * Q_i)^2
+ const int new_error = coeff0 - level * Q;
+ const int delta_error =
+ kWeightTrellis[j] * (new_error * new_error - coeff0 * coeff0);
+ base_score = RDScoreTrellis(lambda, 0, delta_error);
+ }
+
+      // Inspect all possible non-dead predecessors. Retain only the best one.
+      // Since base_score is common to all candidates, it is only added to the
+      // final value after the loop.
+ cost = VP8LevelCost(ss_prev[-MIN_DELTA].costs, level);
+ best_cur_score =
+ ss_prev[-MIN_DELTA].score + RDScoreTrellis(lambda, cost, 0);
+ best_prev = -MIN_DELTA;
+ for (p = -MIN_DELTA + 1; p <= MAX_DELTA; ++p) {
+ // Dead nodes (with ss_prev[p].score >= MAX_COST) are automatically
+ // eliminated since their score can't be better than the current best.
+ cost = VP8LevelCost(ss_prev[p].costs, level);
+ // Examine node assuming it's a non-terminal one.
+ score = ss_prev[p].score + RDScoreTrellis(lambda, cost, 0);
+ if (score < best_cur_score) {
+ best_cur_score = score;
+ best_prev = p;
+ }
+ }
+ best_cur_score += base_score;
+ // Store best finding in current node.
+ cur->sign = sign;
+ cur->level = level;
+ cur->prev = best_prev;
+ ss_cur[m].score = best_cur_score;
+
+ // Now, record best terminal node (and thus best entry in the graph).
+ if (level != 0 && best_cur_score < best_score) {
+ const score_t last_pos_cost =
+ (n < 15) ? VP8BitCost(0, probas[band][ctx][0]) : 0;
+ const score_t last_pos_score = RDScoreTrellis(lambda, last_pos_cost, 0);
+ score = best_cur_score + last_pos_score;
+ if (score < best_score) {
+ best_score = score;
+ best_path[0] = n; // best eob position
+ best_path[1] = m; // best node index
+ best_path[2] = best_prev; // best predecessor
+ }
+ }
+ }
+ }
+
+ // Fresh start
+  // Beware! We must preserve the in[0]/out[0] values for the TYPE_I16_AC case.
+ if (coeff_type == TYPE_I16_AC) {
+ memset(in + 1, 0, 15 * sizeof(*in));
+ memset(out + 1, 0, 15 * sizeof(*out));
+ } else {
+ memset(in, 0, 16 * sizeof(*in));
+ memset(out, 0, 16 * sizeof(*out));
+ }
+ if (best_path[0] == -1) {
+ return 0; // skip!
+ }
+
+ {
+ // Unwind the best path.
+ // Note: best-prev on terminal node is not necessarily equal to the
+ // best_prev for non-terminal. So we patch best_path[2] in.
+ int nz = 0;
+ int best_node = best_path[1];
+ n = best_path[0];
+ NODE(n, best_node).prev = best_path[2]; // force best-prev for terminal
+
+ for (; n >= first; --n) {
+ const Node* const node = &NODE(n, best_node);
+ const int j = kZigzag[n];
+ out[n] = node->sign ? -node->level : node->level;
+ nz |= node->level;
+ in[j] = out[n] * mtx->q_[j];
+ best_node = node->prev;
+ }
+ return (nz != 0);
+ }
+}
+
+#undef NODE
+
+//------------------------------------------------------------------------------
+// Performs: difference, transform, quantize, back-transform, add
+// all at once. Output is the reconstructed block in *yuv_out, and the
+// quantized levels in *levels.
+
+static int ReconstructIntra16(VP8EncIterator* const it,
+ VP8ModeScore* const rd,
+ uint8_t* const yuv_out,
+ int mode) {
+ const VP8Encoder* const enc = it->enc_;
+ const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
+ const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
+ const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
+ int nz = 0;
+ int n;
+ int16_t tmp[16][16], dc_tmp[16];
+
+ for (n = 0; n < 16; n += 2) {
+ VP8FTransform2(src + VP8Scan[n], ref + VP8Scan[n], tmp[n]);
+ }
+ VP8FTransformWHT(tmp[0], dc_tmp);
+ nz |= VP8EncQuantizeBlockWHT(dc_tmp, rd->y_dc_levels, &dqm->y2_) << 24;
+
+ if (DO_TRELLIS_I16 && it->do_trellis_) {
+ int x, y;
+ VP8IteratorNzToBytes(it);
+ for (y = 0, n = 0; y < 4; ++y) {
+ for (x = 0; x < 4; ++x, ++n) {
+ const int ctx = it->top_nz_[x] + it->left_nz_[y];
+ const int non_zero = TrellisQuantizeBlock(
+ enc, tmp[n], rd->y_ac_levels[n], ctx, TYPE_I16_AC, &dqm->y1_,
+ dqm->lambda_trellis_i16_);
+ it->top_nz_[x] = it->left_nz_[y] = non_zero;
+ rd->y_ac_levels[n][0] = 0;
+ nz |= non_zero << n;
+ }
+ }
+ } else {
+ for (n = 0; n < 16; n += 2) {
+ // Zero-out the first coeff, so that: a) nz is correct below, and
+ // b) finding 'last' non-zero coeffs in SetResidualCoeffs() is simplified.
+ tmp[n][0] = tmp[n + 1][0] = 0;
+ nz |= VP8EncQuantize2Blocks(tmp[n], rd->y_ac_levels[n], &dqm->y1_) << n;
+ assert(rd->y_ac_levels[n + 0][0] == 0);
+ assert(rd->y_ac_levels[n + 1][0] == 0);
+ }
+ }
+
+ // Transform back
+ VP8TransformWHT(dc_tmp, tmp[0]);
+ for (n = 0; n < 16; n += 2) {
+ VP8ITransform(ref + VP8Scan[n], tmp[n], yuv_out + VP8Scan[n], 1);
+ }
+
+ return nz;
+}
+
+static int ReconstructIntra4(VP8EncIterator* const it,
+ int16_t levels[16],
+ const uint8_t* const src,
+ uint8_t* const yuv_out,
+ int mode) {
+ const VP8Encoder* const enc = it->enc_;
+ const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
+ const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
+ int nz = 0;
+ int16_t tmp[16];
+
+ VP8FTransform(src, ref, tmp);
+ if (DO_TRELLIS_I4 && it->do_trellis_) {
+ const int x = it->i4_ & 3, y = it->i4_ >> 2;
+ const int ctx = it->top_nz_[x] + it->left_nz_[y];
+ nz = TrellisQuantizeBlock(enc, tmp, levels, ctx, TYPE_I4_AC, &dqm->y1_,
+ dqm->lambda_trellis_i4_);
+ } else {
+ nz = VP8EncQuantizeBlock(tmp, levels, &dqm->y1_);
+ }
+ VP8ITransform(ref, tmp, yuv_out, 0);
+ return nz;
+}
+
+//------------------------------------------------------------------------------
+// DC-error diffusion
+
+// Diffusion weights. We under-correct a bit (15/16th of the error is actually
+// diffused) to avoid a 'rainbow' chessboard pattern of blocks at q~=0.
+#define C1 7 // fraction of error sent to the 4x4 block below
+#define C2 8 // fraction of error sent to the 4x4 block on the right
+#define DSHIFT 4
+#define DSCALE 1 // storage descaling, needed to make the error fit an int8_t
+
+// Quantize as usual, but also compute and return the quantization error.
+// The returned error is pre-scaled by DSCALE bits; the remaining descaling
+// by DSHIFT happens when the error is diffused in CorrectDCValues() below.
+static int QuantizeSingle(int16_t* const v, const VP8Matrix* const mtx) {
+ int V = *v;
+ const int sign = (V < 0);
+ if (sign) V = -V;
+ if (V > (int)mtx->zthresh_[0]) {
+ const int qV = QUANTDIV(V, mtx->iq_[0], mtx->bias_[0]) * mtx->q_[0];
+ const int err = (V - qV);
+ *v = sign ? -qV : qV;
+ return (sign ? -err : err) >> DSCALE;
+ }
+ *v = 0;
+ return (sign ? -V : V) >> DSCALE;
+}
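+
+// The returned residual is bounded by mtx->q_[0] before scaling, carries the
+// sign of the original coefficient, and is what CorrectDCValues() below
+// diffuses into the DC coefficients of the neighboring 4x4 blocks.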
+
+static void CorrectDCValues(const VP8EncIterator* const it,
+ const VP8Matrix* const mtx,
+ int16_t tmp[][16], VP8ModeScore* const rd) {
+ // | top[0] | top[1]
+ // --------+--------+---------
+ // left[0] | tmp[0] tmp[1] <-> err0 err1
+ // left[1] | tmp[2] tmp[3] err2 err3
+ //
+ // Final errors {err1,err2,err3} are preserved and later restored
+ // as top[]/left[] on the next block.
+ int ch;
+ for (ch = 0; ch <= 1; ++ch) {
+ const int8_t* const top = it->top_derr_[it->x_][ch];
+ const int8_t* const left = it->left_derr_[ch];
+ int16_t (* const c)[16] = &tmp[ch * 4];
+ int err0, err1, err2, err3;
+ c[0][0] += (C1 * top[0] + C2 * left[0]) >> (DSHIFT - DSCALE);
+ err0 = QuantizeSingle(&c[0][0], mtx);
+ c[1][0] += (C1 * top[1] + C2 * err0) >> (DSHIFT - DSCALE);
+ err1 = QuantizeSingle(&c[1][0], mtx);
+ c[2][0] += (C1 * err0 + C2 * left[1]) >> (DSHIFT - DSCALE);
+ err2 = QuantizeSingle(&c[2][0], mtx);
+ c[3][0] += (C1 * err1 + C2 * err2) >> (DSHIFT - DSCALE);
+ err3 = QuantizeSingle(&c[3][0], mtx);
+ // error 'err' is bounded by mtx->q_[0] which is 132 at max. Hence
+ // err >> DSCALE will fit in an int8_t type if DSCALE>=1.
+ assert(abs(err1) <= 127 && abs(err2) <= 127 && abs(err3) <= 127);
+ rd->derr[ch][0] = (int8_t)err1;
+ rd->derr[ch][1] = (int8_t)err2;
+ rd->derr[ch][2] = (int8_t)err3;
+ }
+}
+
+static void StoreDiffusionErrors(VP8EncIterator* const it,
+ const VP8ModeScore* const rd) {
+ int ch;
+ for (ch = 0; ch <= 1; ++ch) {
+ int8_t* const top = it->top_derr_[it->x_][ch];
+ int8_t* const left = it->left_derr_[ch];
+ left[0] = rd->derr[ch][0]; // restore err1
+ left[1] = 3 * rd->derr[ch][2] >> 2; // ... 3/4th of err3
+ top[0] = rd->derr[ch][1]; // ... err2
+ top[1] = rd->derr[ch][2] - left[1]; // ... 1/4th of err3.
+ }
+}
+
+#undef C1
+#undef C2
+#undef DSHIFT
+#undef DSCALE
+
+//------------------------------------------------------------------------------
+
+static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
+ uint8_t* const yuv_out, int mode) {
+ const VP8Encoder* const enc = it->enc_;
+ const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
+ const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
+ const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
+ int nz = 0;
+ int n;
+ int16_t tmp[8][16];
+
+ for (n = 0; n < 8; n += 2) {
+ VP8FTransform2(src + VP8ScanUV[n], ref + VP8ScanUV[n], tmp[n]);
+ }
+ if (it->top_derr_ != NULL) CorrectDCValues(it, &dqm->uv_, tmp, rd);
+
+ if (DO_TRELLIS_UV && it->do_trellis_) {
+ int ch, x, y;
+ for (ch = 0, n = 0; ch <= 2; ch += 2) {
+ for (y = 0; y < 2; ++y) {
+ for (x = 0; x < 2; ++x, ++n) {
+ const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
+ const int non_zero = TrellisQuantizeBlock(
+ enc, tmp[n], rd->uv_levels[n], ctx, TYPE_CHROMA_A, &dqm->uv_,
+ dqm->lambda_trellis_uv_);
+ it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] = non_zero;
+ nz |= non_zero << n;
+ }
+ }
+ }
+ } else {
+ for (n = 0; n < 8; n += 2) {
+ nz |= VP8EncQuantize2Blocks(tmp[n], rd->uv_levels[n], &dqm->uv_) << n;
+ }
+ }
+
+ for (n = 0; n < 8; n += 2) {
+ VP8ITransform(ref + VP8ScanUV[n], tmp[n], yuv_out + VP8ScanUV[n], 1);
+ }
+ return (nz << 16);
+}
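+
+// The chroma nz flags returned above occupy bits 16..23 of the macroblock's
+// nz bitfield, above the sixteen per-block luma bits (bit 24 is used for
+// the i16 luma DC block).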
+
+//------------------------------------------------------------------------------
+// RD-opt decision. Reconstruct each mode, evaluate distortion and bit-cost.
+// Pick the mode with the lowest RD-cost = Rate + lambda * Distortion.
+
+static void StoreMaxDelta(VP8SegmentInfo* const dqm, const int16_t DCs[16]) {
+ // We look at the first three AC coefficients to determine the average
+ // delta between each sub-4x4 block.
+ const int v0 = abs(DCs[1]);
+ const int v1 = abs(DCs[2]);
+ const int v2 = abs(DCs[4]);
+ int max_v = (v1 > v0) ? v1 : v0;
+ max_v = (v2 > max_v) ? v2 : max_v;
+ if (max_v > dqm->max_edge_) dqm->max_edge_ = max_v;
+}
+
+static void SwapModeScore(VP8ModeScore** a, VP8ModeScore** b) {
+ VP8ModeScore* const tmp = *a;
+ *a = *b;
+ *b = tmp;
+}
+
+static void SwapPtr(uint8_t** a, uint8_t** b) {
+ uint8_t* const tmp = *a;
+ *a = *b;
+ *b = tmp;
+}
+
+static void SwapOut(VP8EncIterator* const it) {
+ SwapPtr(&it->yuv_out_, &it->yuv_out2_);
+}
+
+static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* rd) {
+ const int kNumBlocks = 16;
+ VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
+ const int lambda = dqm->lambda_i16_;
+ const int tlambda = dqm->tlambda_;
+ const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
+ VP8ModeScore rd_tmp;
+ VP8ModeScore* rd_cur = &rd_tmp;
+ VP8ModeScore* rd_best = rd;
+ int mode;
+ int is_flat = IsFlatSource16(it->yuv_in_ + Y_OFF_ENC);
+
+ rd->mode_i16 = -1;
+ for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
+ uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF_ENC; // scratch buffer
+ rd_cur->mode_i16 = mode;
+
+ // Reconstruct
+ rd_cur->nz = ReconstructIntra16(it, rd_cur, tmp_dst, mode);
+
+ // Measure RD-score
+ rd_cur->D = VP8SSE16x16(src, tmp_dst);
+ rd_cur->SD =
+ tlambda ? MULT_8B(tlambda, VP8TDisto16x16(src, tmp_dst, kWeightY)) : 0;
+ rd_cur->H = VP8FixedCostsI16[mode];
+ rd_cur->R = VP8GetCostLuma16(it, rd_cur);
+ if (is_flat) {
+ // refine the first impression (which was in pixel space)
+ is_flat = IsFlat(rd_cur->y_ac_levels[0], kNumBlocks, FLATNESS_LIMIT_I16);
+ if (is_flat) {
+ // Block is very flat. We put emphasis on the distortion being very low!
+ rd_cur->D *= 2;
+ rd_cur->SD *= 2;
+ }
+ }
+
+ // Since we always examine Intra16 first, we can overwrite *rd directly.
+ SetRDScore(lambda, rd_cur);
+ if (mode == 0 || rd_cur->score < rd_best->score) {
+ SwapModeScore(&rd_cur, &rd_best);
+ SwapOut(it);
+ }
+ }
+ if (rd_best != rd) {
+ memcpy(rd, rd_best, sizeof(*rd));
+ }
+ SetRDScore(dqm->lambda_mode_, rd); // finalize score for mode decision.
+ VP8SetIntra16Mode(it, rd->mode_i16);
+
+ // If we have a blocky macroblock (only DCs are non-zero) with fairly high
+ // distortion, record the max delta so we can later adjust the minimal
+ // filtering strength needed to smooth these blocks out.
+ if ((rd->nz & 0x100ffff) == 0x1000000 && rd->D > dqm->min_disto_) {
+ StoreMaxDelta(dqm, rd->y_dc_levels);
+ }
+}
+
+//------------------------------------------------------------------------------
+
+// return the cost array corresponding to the surrounding prediction modes.
+static const uint16_t* GetCostModeI4(VP8EncIterator* const it,
+ const uint8_t modes[16]) {
+ const int preds_w = it->enc_->preds_w_;
+ const int x = (it->i4_ & 3), y = it->i4_ >> 2;
+ const int left = (x == 0) ? it->preds_[y * preds_w - 1] : modes[it->i4_ - 1];
+ const int top = (y == 0) ? it->preds_[-preds_w + x] : modes[it->i4_ - 4];
+ return VP8FixedCostsI4[top][left];
+}
+
+static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
+ const VP8Encoder* const enc = it->enc_;
+ const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
+ const int lambda = dqm->lambda_i4_;
+ const int tlambda = dqm->tlambda_;
+ const uint8_t* const src0 = it->yuv_in_ + Y_OFF_ENC;
+ uint8_t* const best_blocks = it->yuv_out2_ + Y_OFF_ENC;
+ int total_header_bits = 0;
+ VP8ModeScore rd_best;
+
+ if (enc->max_i4_header_bits_ == 0) {
+ return 0;
+ }
+
+ InitScore(&rd_best);
+ rd_best.H = 211; // '211' is the value of VP8BitCost(0, 145)
+ SetRDScore(dqm->lambda_mode_, &rd_best);
+ VP8IteratorStartI4(it);
+ do {
+ const int kNumBlocks = 1;
+ VP8ModeScore rd_i4;
+ int mode;
+ int best_mode = -1;
+ const uint8_t* const src = src0 + VP8Scan[it->i4_];
+ const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);
+ uint8_t* best_block = best_blocks + VP8Scan[it->i4_];
+ uint8_t* tmp_dst = it->yuv_p_ + I4TMP; // scratch buffer.
+
+ InitScore(&rd_i4);
+ VP8MakeIntra4Preds(it);
+ for (mode = 0; mode < NUM_BMODES; ++mode) {
+ VP8ModeScore rd_tmp;
+ int16_t tmp_levels[16];
+
+ // Reconstruct
+ rd_tmp.nz =
+ ReconstructIntra4(it, tmp_levels, src, tmp_dst, mode) << it->i4_;
+
+ // Compute RD-score
+ rd_tmp.D = VP8SSE4x4(src, tmp_dst);
+ rd_tmp.SD =
+ tlambda ? MULT_8B(tlambda, VP8TDisto4x4(src, tmp_dst, kWeightY))
+ : 0;
+ rd_tmp.H = mode_costs[mode];
+
+ // Add a flatness penalty, to avoid flat areas being mispredicted
+ // by a complex mode.
+ if (mode > 0 && IsFlat(tmp_levels, kNumBlocks, FLATNESS_LIMIT_I4)) {
+ rd_tmp.R = FLATNESS_PENALTY * kNumBlocks;
+ } else {
+ rd_tmp.R = 0;
+ }
+
+ // early-out check
+ SetRDScore(lambda, &rd_tmp);
+ if (best_mode >= 0 && rd_tmp.score >= rd_i4.score) continue;
+
+ // finish computing score
+ rd_tmp.R += VP8GetCostLuma4(it, tmp_levels);
+ SetRDScore(lambda, &rd_tmp);
+
+ if (best_mode < 0 || rd_tmp.score < rd_i4.score) {
+ CopyScore(&rd_i4, &rd_tmp);
+ best_mode = mode;
+ SwapPtr(&tmp_dst, &best_block);
+ memcpy(rd_best.y_ac_levels[it->i4_], tmp_levels,
+ sizeof(rd_best.y_ac_levels[it->i4_]));
+ }
+ }
+ SetRDScore(dqm->lambda_mode_, &rd_i4);
+ AddScore(&rd_best, &rd_i4);
+ if (rd_best.score >= rd->score) {
+ return 0;
+ }
+ total_header_bits += (int)rd_i4.H; // <- equal to mode_costs[best_mode];
+ if (total_header_bits > enc->max_i4_header_bits_) {
+ return 0;
+ }
+ // Copy selected samples if not in the right place already.
+ if (best_block != best_blocks + VP8Scan[it->i4_]) {
+ VP8Copy4x4(best_block, best_blocks + VP8Scan[it->i4_]);
+ }
+ rd->modes_i4[it->i4_] = best_mode;
+ it->top_nz_[it->i4_ & 3] = it->left_nz_[it->i4_ >> 2] = (rd_i4.nz ? 1 : 0);
+ } while (VP8IteratorRotateI4(it, best_blocks));
+
+ // finalize state
+ CopyScore(rd, &rd_best);
+ VP8SetIntra4Mode(it, rd->modes_i4);
+ SwapOut(it);
+ memcpy(rd->y_ac_levels, rd_best.y_ac_levels, sizeof(rd->y_ac_levels));
+ return 1; // select intra4x4 over intra16x16
+}
+
+//------------------------------------------------------------------------------
+
+static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
+ const int kNumBlocks = 8;
+ const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
+ const int lambda = dqm->lambda_uv_;
+ const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
+ uint8_t* tmp_dst = it->yuv_out2_ + U_OFF_ENC; // scratch buffer
+ uint8_t* dst0 = it->yuv_out_ + U_OFF_ENC;
+ uint8_t* dst = dst0;
+ VP8ModeScore rd_best;
+ int mode;
+
+ rd->mode_uv = -1;
+ InitScore(&rd_best);
+ for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
+ VP8ModeScore rd_uv;
+
+ // Reconstruct
+ rd_uv.nz = ReconstructUV(it, &rd_uv, tmp_dst, mode);
+
+ // Compute RD-score
+ rd_uv.D = VP8SSE16x8(src, tmp_dst);
+ rd_uv.SD = 0; // not calling TDisto here: it tends to flatten areas.
+ rd_uv.H = VP8FixedCostsUV[mode];
+ rd_uv.R = VP8GetCostUV(it, &rd_uv);
+ if (mode > 0 && IsFlat(rd_uv.uv_levels[0], kNumBlocks, FLATNESS_LIMIT_UV)) {
+ rd_uv.R += FLATNESS_PENALTY * kNumBlocks;
+ }
+
+ SetRDScore(lambda, &rd_uv);
+ if (mode == 0 || rd_uv.score < rd_best.score) {
+ CopyScore(&rd_best, &rd_uv);
+ rd->mode_uv = mode;
+ memcpy(rd->uv_levels, rd_uv.uv_levels, sizeof(rd->uv_levels));
+ if (it->top_derr_ != NULL) {
+ memcpy(rd->derr, rd_uv.derr, sizeof(rd_uv.derr));
+ }
+ SwapPtr(&dst, &tmp_dst);
+ }
+ }
+ VP8SetIntraUVMode(it, rd->mode_uv);
+ AddScore(rd, &rd_best);
+ if (dst != dst0) { // copy 16x8 block if needed
+ VP8Copy16x8(dst, dst0);
+ }
+ if (it->top_derr_ != NULL) { // store diffusion errors for next block
+ StoreDiffusionErrors(it, rd);
+ }
+}
+
+//------------------------------------------------------------------------------
+// Final reconstruction and quantization.
+
+static void SimpleQuantize(VP8EncIterator* const it, VP8ModeScore* const rd) {
+ const VP8Encoder* const enc = it->enc_;
+ const int is_i16 = (it->mb_->type_ == 1);
+ int nz = 0;
+
+ if (is_i16) {
+ nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF_ENC, it->preds_[0]);
+ } else {
+ VP8IteratorStartI4(it);
+ do {
+ const int mode =
+ it->preds_[(it->i4_ & 3) + (it->i4_ >> 2) * enc->preds_w_];
+ const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
+ uint8_t* const dst = it->yuv_out_ + Y_OFF_ENC + VP8Scan[it->i4_];
+ VP8MakeIntra4Preds(it);
+ nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
+ src, dst, mode) << it->i4_;
+ } while (VP8IteratorRotateI4(it, it->yuv_out_ + Y_OFF_ENC));
+ }
+
+ nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF_ENC, it->mb_->uv_mode_);
+ rd->nz = nz;
+}
+
+// Refine intra16/intra4 sub-modes based on distortion only (not rate).
+static void RefineUsingDistortion(VP8EncIterator* const it,
+ int try_both_modes, int refine_uv_mode,
+ VP8ModeScore* const rd) {
+ score_t best_score = MAX_COST;
+ int nz = 0;
+ int mode;
+ int is_i16 = try_both_modes || (it->mb_->type_ == 1);
+
+ const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
+ // Some empirical constants of roughly the right order of magnitude.
+ const int lambda_d_i16 = 106;
+ const int lambda_d_i4 = 11;
+ const int lambda_d_uv = 120;
+ score_t score_i4 = dqm->i4_penalty_;
+ score_t i4_bit_sum = 0;
+ const score_t bit_limit = try_both_modes ? it->enc_->mb_header_limit_
+ : MAX_COST; // no early-out allowed
+
+ if (is_i16) { // First, evaluate Intra16 distortion
+ int best_mode = -1;
+ const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
+ for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
+ const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
+ const score_t score = (score_t)VP8SSE16x16(src, ref) * RD_DISTO_MULT
+ + VP8FixedCostsI16[mode] * lambda_d_i16;
+ if (mode > 0 && VP8FixedCostsI16[mode] > bit_limit) {
+ continue;
+ }
+
+ if (score < best_score) {
+ best_mode = mode;
+ best_score = score;
+ }
+ }
+ if (it->x_ == 0 || it->y_ == 0) {
+ // avoid starting a checkerboard resonance from the border. See bug #432.
+ if (IsFlatSource16(src)) {
+ best_mode = (it->x_ == 0) ? 0 : 2;
+ try_both_modes = 0; // stick to i16
+ }
+ }
+ VP8SetIntra16Mode(it, best_mode);
+ // we'll reconstruct later, if i16 mode actually gets selected
+ }
+
+ // Next, evaluate Intra4
+ if (try_both_modes || !is_i16) {
+ // We don't evaluate the rate here, but just account for it through a
+ // constant penalty (i4 mode usually needs more bits compared to i16).
+ is_i16 = 0;
+ VP8IteratorStartI4(it);
+ do {
+ int best_i4_mode = -1;
+ score_t best_i4_score = MAX_COST;
+ const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
+ const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);
+
+ VP8MakeIntra4Preds(it);
+ for (mode = 0; mode < NUM_BMODES; ++mode) {
+ const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
+ const score_t score = VP8SSE4x4(src, ref) * RD_DISTO_MULT
+ + mode_costs[mode] * lambda_d_i4;
+ if (score < best_i4_score) {
+ best_i4_mode = mode;
+ best_i4_score = score;
+ }
+ }
+ i4_bit_sum += mode_costs[best_i4_mode];
+ rd->modes_i4[it->i4_] = best_i4_mode;
+ score_i4 += best_i4_score;
+ if (score_i4 >= best_score || i4_bit_sum > bit_limit) {
+ // Intra4 won't be better than Intra16. Bail out and pick Intra16.
+ is_i16 = 1;
+ break;
+ } else { // reconstruct partial block inside yuv_out2_ buffer
+ uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF_ENC + VP8Scan[it->i4_];
+ nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
+ src, tmp_dst, best_i4_mode) << it->i4_;
+ }
+ } while (VP8IteratorRotateI4(it, it->yuv_out2_ + Y_OFF_ENC));
+ }
+
+ // Final reconstruction, depending on which mode is selected.
+ if (!is_i16) {
+ VP8SetIntra4Mode(it, rd->modes_i4);
+ SwapOut(it);
+ best_score = score_i4;
+ } else {
+ nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF_ENC, it->preds_[0]);
+ }
+
+ // ... and UV!
+ if (refine_uv_mode) {
+ int best_mode = -1;
+ score_t best_uv_score = MAX_COST;
+ const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
+ for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
+ const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
+ const score_t score = VP8SSE16x8(src, ref) * RD_DISTO_MULT
+ + VP8FixedCostsUV[mode] * lambda_d_uv;
+ if (score < best_uv_score) {
+ best_mode = mode;
+ best_uv_score = score;
+ }
+ }
+ VP8SetIntraUVMode(it, best_mode);
+ }
+ nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF_ENC, it->mb_->uv_mode_);
+
+ rd->nz = nz;
+ rd->score = best_score;
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd,
+ VP8RDLevel rd_opt) {
+ int is_skipped;
+ const int method = it->enc_->method_;
+
+ InitScore(rd);
+
+ // We can perform predictions for Luma16x16 and Chroma8x8 already.
+ // Luma4x4 predictions need to be done as we go.
+ VP8MakeLuma16Preds(it);
+ VP8MakeChroma8Preds(it);
+
+ if (rd_opt > RD_OPT_NONE) {
+ it->do_trellis_ = (rd_opt >= RD_OPT_TRELLIS_ALL);
+ PickBestIntra16(it, rd);
+ if (method >= 2) {
+ PickBestIntra4(it, rd);
+ }
+ PickBestUV(it, rd);
+ if (rd_opt == RD_OPT_TRELLIS) { // finish off with trellis-optim now
+ it->do_trellis_ = 1;
+ SimpleQuantize(it, rd);
+ }
+ } else {
+ // At this point we have heuristically decided intra16 / intra4.
+ // For method >= 2, pick the best intra4/intra16 based on SSE (a tad slower).
+ // For method <= 1, we don't re-examine the decision but just go ahead with
+ // quantization/reconstruction.
+ RefineUsingDistortion(it, (method >= 2), (method >= 1), rd);
+ }
+ is_skipped = (rd->nz == 0);
+ VP8SetSkip(it, is_skipped);
+ return is_skipped;
+}
diff --git a/media/libwebp/enc/syntax_enc.c b/media/libwebp/enc/syntax_enc.c
new file mode 100644
index 0000000000..28fd1f1ee0
--- /dev/null
+++ b/media/libwebp/enc/syntax_enc.c
@@ -0,0 +1,388 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Header syntax writing
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <assert.h>
+
+#include "../utils/utils.h"
+#include "../webp/format_constants.h" // RIFF constants
+#include "../webp/mux_types.h" // ALPHA_FLAG
+#include "../enc/vp8i_enc.h"
+
+//------------------------------------------------------------------------------
+// Helper functions
+
+static int IsVP8XNeeded(const VP8Encoder* const enc) {
+ return !!enc->has_alpha_; // Currently the only case when VP8X is needed.
+ // This could change in the future.
+}
+
+static int PutPaddingByte(const WebPPicture* const pic) {
+ const uint8_t pad_byte[1] = { 0 };
+ return !!pic->writer(pad_byte, 1, pic);
+}
+
+//------------------------------------------------------------------------------
+// Writers for header's various pieces (in order of appearance)
+
+static WebPEncodingError PutRIFFHeader(const VP8Encoder* const enc,
+ size_t riff_size) {
+ const WebPPicture* const pic = enc->pic_;
+ uint8_t riff[RIFF_HEADER_SIZE] = {
+ 'R', 'I', 'F', 'F', 0, 0, 0, 0, 'W', 'E', 'B', 'P'
+ };
+ assert(riff_size == (uint32_t)riff_size);
+ PutLE32(riff + TAG_SIZE, (uint32_t)riff_size);
+ if (!pic->writer(riff, sizeof(riff), pic)) {
+ return VP8_ENC_ERROR_BAD_WRITE;
+ }
+ return VP8_ENC_OK;
+}
+
+static WebPEncodingError PutVP8XHeader(const VP8Encoder* const enc) {
+ const WebPPicture* const pic = enc->pic_;
+ uint8_t vp8x[CHUNK_HEADER_SIZE + VP8X_CHUNK_SIZE] = {
+ 'V', 'P', '8', 'X'
+ };
+ uint32_t flags = 0;
+
+ assert(IsVP8XNeeded(enc));
+ assert(pic->width >= 1 && pic->height >= 1);
+ assert(pic->width <= MAX_CANVAS_SIZE && pic->height <= MAX_CANVAS_SIZE);
+
+ if (enc->has_alpha_) {
+ flags |= ALPHA_FLAG;
+ }
+
+ PutLE32(vp8x + TAG_SIZE, VP8X_CHUNK_SIZE);
+ PutLE32(vp8x + CHUNK_HEADER_SIZE, flags);
+ PutLE24(vp8x + CHUNK_HEADER_SIZE + 4, pic->width - 1);
+ PutLE24(vp8x + CHUNK_HEADER_SIZE + 7, pic->height - 1);
+ if (!pic->writer(vp8x, sizeof(vp8x), pic)) {
+ return VP8_ENC_ERROR_BAD_WRITE;
+ }
+ return VP8_ENC_OK;
+}
+
+static WebPEncodingError PutAlphaChunk(const VP8Encoder* const enc) {
+ const WebPPicture* const pic = enc->pic_;
+ uint8_t alpha_chunk_hdr[CHUNK_HEADER_SIZE] = {
+ 'A', 'L', 'P', 'H'
+ };
+
+ assert(enc->has_alpha_);
+
+ // Alpha chunk header.
+ PutLE32(alpha_chunk_hdr + TAG_SIZE, enc->alpha_data_size_);
+ if (!pic->writer(alpha_chunk_hdr, sizeof(alpha_chunk_hdr), pic)) {
+ return VP8_ENC_ERROR_BAD_WRITE;
+ }
+
+ // Alpha chunk data.
+ if (!pic->writer(enc->alpha_data_, enc->alpha_data_size_, pic)) {
+ return VP8_ENC_ERROR_BAD_WRITE;
+ }
+
+ // Padding.
+ if ((enc->alpha_data_size_ & 1) && !PutPaddingByte(pic)) {
+ return VP8_ENC_ERROR_BAD_WRITE;
+ }
+ return VP8_ENC_OK;
+}
+
+static WebPEncodingError PutVP8Header(const WebPPicture* const pic,
+ size_t vp8_size) {
+ uint8_t vp8_chunk_hdr[CHUNK_HEADER_SIZE] = {
+ 'V', 'P', '8', ' '
+ };
+ assert(vp8_size == (uint32_t)vp8_size);
+ PutLE32(vp8_chunk_hdr + TAG_SIZE, (uint32_t)vp8_size);
+ if (!pic->writer(vp8_chunk_hdr, sizeof(vp8_chunk_hdr), pic)) {
+ return VP8_ENC_ERROR_BAD_WRITE;
+ }
+ return VP8_ENC_OK;
+}
+
+static WebPEncodingError PutVP8FrameHeader(const WebPPicture* const pic,
+ int profile, size_t size0) {
+ uint8_t vp8_frm_hdr[VP8_FRAME_HEADER_SIZE];
+ uint32_t bits;
+
+ if (size0 >= VP8_MAX_PARTITION0_SIZE) { // partition #0 is too big to fit
+ return VP8_ENC_ERROR_PARTITION0_OVERFLOW;
+ }
+
+ // Paragraph 9.1.
+ bits = 0 // keyframe (1b)
+ | (profile << 1) // profile (3b)
+ | (1 << 4) // visible (1b)
+ | ((uint32_t)size0 << 5); // partition length (19b)
+ vp8_frm_hdr[0] = (bits >> 0) & 0xff;
+ vp8_frm_hdr[1] = (bits >> 8) & 0xff;
+ vp8_frm_hdr[2] = (bits >> 16) & 0xff;
+ // signature
+ vp8_frm_hdr[3] = (VP8_SIGNATURE >> 16) & 0xff;
+ vp8_frm_hdr[4] = (VP8_SIGNATURE >> 8) & 0xff;
+ vp8_frm_hdr[5] = (VP8_SIGNATURE >> 0) & 0xff;
+ // dimensions
+ vp8_frm_hdr[6] = pic->width & 0xff;
+ vp8_frm_hdr[7] = pic->width >> 8;
+ vp8_frm_hdr[8] = pic->height & 0xff;
+ vp8_frm_hdr[9] = pic->height >> 8;
+
+ if (!pic->writer(vp8_frm_hdr, sizeof(vp8_frm_hdr), pic)) {
+ return VP8_ENC_ERROR_BAD_WRITE;
+ }
+ return VP8_ENC_OK;
+}
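+
+// Worked sketch of the paragraph-9.1 packing above, with made-up numbers:
+// profile 0 and size0 = 1000 give bits = (1 << 4) | (1000 << 5) = 0x7d10,
+// stored little-endian as bytes 0x10 0x7d 0x00, followed by the signature
+// bytes 0x9d 0x01 0x2a.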
+
+// WebP Headers.
+static int PutWebPHeaders(const VP8Encoder* const enc, size_t size0,
+ size_t vp8_size, size_t riff_size) {
+ WebPPicture* const pic = enc->pic_;
+ WebPEncodingError err = VP8_ENC_OK;
+
+ // RIFF header.
+ err = PutRIFFHeader(enc, riff_size);
+ if (err != VP8_ENC_OK) goto Error;
+
+ // VP8X.
+ if (IsVP8XNeeded(enc)) {
+ err = PutVP8XHeader(enc);
+ if (err != VP8_ENC_OK) goto Error;
+ }
+
+ // Alpha.
+ if (enc->has_alpha_) {
+ err = PutAlphaChunk(enc);
+ if (err != VP8_ENC_OK) goto Error;
+ }
+
+ // VP8 header.
+ err = PutVP8Header(pic, vp8_size);
+ if (err != VP8_ENC_OK) goto Error;
+
+ // VP8 frame header.
+ err = PutVP8FrameHeader(pic, enc->profile_, size0);
+ if (err != VP8_ENC_OK) goto Error;
+
+ // All OK.
+ return 1;
+
+ // Error.
+ Error:
+ return WebPEncodingSetError(pic, err);
+}
+
+// Segmentation header
+static void PutSegmentHeader(VP8BitWriter* const bw,
+ const VP8Encoder* const enc) {
+ const VP8EncSegmentHeader* const hdr = &enc->segment_hdr_;
+ const VP8EncProba* const proba = &enc->proba_;
+ if (VP8PutBitUniform(bw, (hdr->num_segments_ > 1))) {
+ // We always 'update' the quant and filter strength values
+ const int update_data = 1;
+ int s;
+ VP8PutBitUniform(bw, hdr->update_map_);
+ if (VP8PutBitUniform(bw, update_data)) {
+ // we always use absolute values, not relative ones
+ VP8PutBitUniform(bw, 1); // (segment_feature_mode = 1. Paragraph 9.3.)
+ for (s = 0; s < NUM_MB_SEGMENTS; ++s) {
+ VP8PutSignedBits(bw, enc->dqm_[s].quant_, 7);
+ }
+ for (s = 0; s < NUM_MB_SEGMENTS; ++s) {
+ VP8PutSignedBits(bw, enc->dqm_[s].fstrength_, 6);
+ }
+ }
+ if (hdr->update_map_) {
+ for (s = 0; s < 3; ++s) {
+ if (VP8PutBitUniform(bw, (proba->segments_[s] != 255u))) {
+ VP8PutBits(bw, proba->segments_[s], 8);
+ }
+ }
+ }
+ }
+}
+
+// Filtering parameters header
+static void PutFilterHeader(VP8BitWriter* const bw,
+ const VP8EncFilterHeader* const hdr) {
+ const int use_lf_delta = (hdr->i4x4_lf_delta_ != 0);
+ VP8PutBitUniform(bw, hdr->simple_);
+ VP8PutBits(bw, hdr->level_, 6);
+ VP8PutBits(bw, hdr->sharpness_, 3);
+ if (VP8PutBitUniform(bw, use_lf_delta)) {
+ // '0' is the default value for i4x4_lf_delta_ at frame #0.
+ const int need_update = (hdr->i4x4_lf_delta_ != 0);
+ if (VP8PutBitUniform(bw, need_update)) {
+ // we don't use ref_lf_delta => emit four 0 bits
+ VP8PutBits(bw, 0, 4);
+ // we use mode_lf_delta for i4x4
+ VP8PutSignedBits(bw, hdr->i4x4_lf_delta_, 6);
+ VP8PutBits(bw, 0, 3); // all others unused
+ }
+ }
+}
+
+// Nominal quantization parameters
+static void PutQuant(VP8BitWriter* const bw,
+ const VP8Encoder* const enc) {
+ VP8PutBits(bw, enc->base_quant_, 7);
+ VP8PutSignedBits(bw, enc->dq_y1_dc_, 4);
+ VP8PutSignedBits(bw, enc->dq_y2_dc_, 4);
+ VP8PutSignedBits(bw, enc->dq_y2_ac_, 4);
+ VP8PutSignedBits(bw, enc->dq_uv_dc_, 4);
+ VP8PutSignedBits(bw, enc->dq_uv_ac_, 4);
+}
+
+// Partition sizes
+static int EmitPartitionsSize(const VP8Encoder* const enc,
+ WebPPicture* const pic) {
+ uint8_t buf[3 * (MAX_NUM_PARTITIONS - 1)];
+ int p;
+ for (p = 0; p < enc->num_parts_ - 1; ++p) {
+ const size_t part_size = VP8BitWriterSize(enc->parts_ + p);
+ if (part_size >= VP8_MAX_PARTITION_SIZE) {
+ return WebPEncodingSetError(pic, VP8_ENC_ERROR_PARTITION_OVERFLOW);
+ }
+ buf[3 * p + 0] = (part_size >> 0) & 0xff;
+ buf[3 * p + 1] = (part_size >> 8) & 0xff;
+ buf[3 * p + 2] = (part_size >> 16) & 0xff;
+ }
+ return p ? pic->writer(buf, 3 * p, pic) : 1;
+}
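+
+// The size of the last partition is never written: the decoder infers it
+// from the bytes that remain. As a sketch, a part_size of 70000 (0x11170)
+// is stored as the little-endian triple 0x70 0x11 0x01.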
+
+//------------------------------------------------------------------------------
+
+static int GeneratePartition0(VP8Encoder* const enc) {
+ VP8BitWriter* const bw = &enc->bw_;
+ const int mb_size = enc->mb_w_ * enc->mb_h_;
+ uint64_t pos1, pos2, pos3;
+
+ pos1 = VP8BitWriterPos(bw);
+ if (!VP8BitWriterInit(bw, mb_size * 7 / 8)) { // ~7 bits per macroblock
+ return WebPEncodingSetError(enc->pic_, VP8_ENC_ERROR_OUT_OF_MEMORY);
+ }
+ VP8PutBitUniform(bw, 0); // colorspace
+ VP8PutBitUniform(bw, 0); // clamp type
+
+ PutSegmentHeader(bw, enc);
+ PutFilterHeader(bw, &enc->filter_hdr_);
+ VP8PutBits(bw, enc->num_parts_ == 8 ? 3 :
+ enc->num_parts_ == 4 ? 2 :
+ enc->num_parts_ == 2 ? 1 : 0, 2);
+ PutQuant(bw, enc);
+ VP8PutBitUniform(bw, 0); // no proba update
+ VP8WriteProbas(bw, &enc->proba_);
+ pos2 = VP8BitWriterPos(bw);
+ VP8CodeIntraModes(enc);
+ VP8BitWriterFinish(bw);
+
+ pos3 = VP8BitWriterPos(bw);
+
+#if !defined(WEBP_DISABLE_STATS)
+ if (enc->pic_->stats) {
+ enc->pic_->stats->header_bytes[0] = (int)((pos2 - pos1 + 7) >> 3);
+ enc->pic_->stats->header_bytes[1] = (int)((pos3 - pos2 + 7) >> 3);
+ enc->pic_->stats->alpha_data_size = (int)enc->alpha_data_size_;
+ }
+#else
+ (void)pos1;
+ (void)pos2;
+ (void)pos3;
+#endif
+ if (bw->error_) {
+ return WebPEncodingSetError(enc->pic_, VP8_ENC_ERROR_OUT_OF_MEMORY);
+ }
+ return 1;
+}
+
+void VP8EncFreeBitWriters(VP8Encoder* const enc) {
+ int p;
+ VP8BitWriterWipeOut(&enc->bw_);
+ for (p = 0; p < enc->num_parts_; ++p) {
+ VP8BitWriterWipeOut(enc->parts_ + p);
+ }
+}
+
+int VP8EncWrite(VP8Encoder* const enc) {
+ WebPPicture* const pic = enc->pic_;
+ VP8BitWriter* const bw = &enc->bw_;
+ const int task_percent = 19;
+ const int percent_per_part = task_percent / enc->num_parts_;
+ const int final_percent = enc->percent_ + task_percent;
+ int ok = 0;
+ size_t vp8_size, pad, riff_size;
+ int p;
+
+ // Partition #0 with header and partition sizes
+ ok = GeneratePartition0(enc);
+ if (!ok) return 0;
+
+ // Compute VP8 size
+ vp8_size = VP8_FRAME_HEADER_SIZE +
+ VP8BitWriterSize(bw) +
+ 3 * (enc->num_parts_ - 1);
+ for (p = 0; p < enc->num_parts_; ++p) {
+ vp8_size += VP8BitWriterSize(enc->parts_ + p);
+ }
+ pad = vp8_size & 1;
+ vp8_size += pad;
+
+ // Compute RIFF size
+ // At a minimum it is: "WEBPVP8 nnnn" + VP8 data size.
+ riff_size = TAG_SIZE + CHUNK_HEADER_SIZE + vp8_size;
+ if (IsVP8XNeeded(enc)) { // Add size for: VP8X header + data.
+ riff_size += CHUNK_HEADER_SIZE + VP8X_CHUNK_SIZE;
+ }
+ if (enc->has_alpha_) { // Add size for: ALPH header + data.
+ const uint32_t padded_alpha_size = enc->alpha_data_size_ +
+ (enc->alpha_data_size_ & 1);
+ riff_size += CHUNK_HEADER_SIZE + padded_alpha_size;
+ }
+ // RIFF size should fit in 32-bits.
+ if (riff_size > 0xfffffffeU) {
+ return WebPEncodingSetError(pic, VP8_ENC_ERROR_FILE_TOO_BIG);
+ }
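+ // As a sketch with a made-up number: a bare lossy file with vp8_size =
+ // 1000 has riff_size = TAG_SIZE + CHUNK_HEADER_SIZE + 1000 = 1012 and a
+ // total file size of CHUNK_HEADER_SIZE + riff_size = 1020 bytes.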
+
+ // Emit headers and partition #0
+ {
+ const uint8_t* const part0 = VP8BitWriterBuf(bw);
+ const size_t size0 = VP8BitWriterSize(bw);
+ ok = ok && PutWebPHeaders(enc, size0, vp8_size, riff_size)
+ && pic->writer(part0, size0, pic)
+ && EmitPartitionsSize(enc, pic);
+ VP8BitWriterWipeOut(bw); // will free the internal buffer.
+ }
+
+ // Token partitions
+ for (p = 0; p < enc->num_parts_; ++p) {
+ const uint8_t* const buf = VP8BitWriterBuf(enc->parts_ + p);
+ const size_t size = VP8BitWriterSize(enc->parts_ + p);
+ if (size) ok = ok && pic->writer(buf, size, pic);
+ VP8BitWriterWipeOut(enc->parts_ + p); // will free the internal buffer.
+ ok = ok && WebPReportProgress(pic, enc->percent_ + percent_per_part,
+ &enc->percent_);
+ }
+
+ // Padding byte
+ if (ok && pad) {
+ ok = PutPaddingByte(pic);
+ }
+
+ enc->coded_size_ = (int)(CHUNK_HEADER_SIZE + riff_size);
+ ok = ok && WebPReportProgress(pic, final_percent, &enc->percent_);
+ return ok;
+}
+
+//------------------------------------------------------------------------------
+
diff --git a/media/libwebp/enc/token_enc.c b/media/libwebp/enc/token_enc.c
new file mode 100644
index 0000000000..52711eb782
--- /dev/null
+++ b/media/libwebp/enc/token_enc.c
@@ -0,0 +1,262 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Paginated token buffer
+//
+// A 'token' is a bit value associated with a probability, either fixed
+// or one determined later, after statistics have been collected.
+// For a dynamic probability, we just record the slot id (idx) of the
+// probability value in the final probability array (uint8_t* probas in
+// VP8EmitTokens).
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "../enc/cost_enc.h"
+#include "../enc/vp8i_enc.h"
+#include "../utils/utils.h"
+
+#if !defined(DISABLE_TOKEN_BUFFER)
+
+// we use pages to reduce the number of memcpy() calls
+#define MIN_PAGE_SIZE 8192   // minimum number of tokens per page
+#define FIXED_PROBA_BIT (1u << 14)
+
+typedef uint16_t token_t; // bit #15: bit value
+ // bit #14: flags for constant proba or idx
+ // bits #0..13: slot or constant proba
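+ // Example (illustrative values): a '1' bit with dynamic
+ // proba slot 330 is (1u << 15) | 330 = 0x814a; the same
+ // bit with fixed proba 128 is
+ // (1u << 15) | FIXED_PROBA_BIT | 128 = 0xc080.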
+struct VP8Tokens {
+ VP8Tokens* next_; // pointer to next page
+};
+// Token data is located in memory just after the next_ field.
+// This macro returns its address and hides the trick.
+#define TOKEN_DATA(p) ((const token_t*)&(p)[1])
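+// In other words, each page is a single allocation laid out as
+// [ VP8Tokens header | page_size_ tokens ], and &(p)[1] is the address of
+// the first token just past the header (see TBufferNewPage() below).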
+
+//------------------------------------------------------------------------------
+
+void VP8TBufferInit(VP8TBuffer* const b, int page_size) {
+ b->tokens_ = NULL;
+ b->pages_ = NULL;
+ b->last_page_ = &b->pages_;
+ b->left_ = 0;
+ b->page_size_ = (page_size < MIN_PAGE_SIZE) ? MIN_PAGE_SIZE : page_size;
+ b->error_ = 0;
+}
+
+void VP8TBufferClear(VP8TBuffer* const b) {
+ if (b != NULL) {
+ VP8Tokens* p = b->pages_;
+ while (p != NULL) {
+ VP8Tokens* const next = p->next_;
+ WebPSafeFree(p);
+ p = next;
+ }
+ VP8TBufferInit(b, b->page_size_);
+ }
+}
+
+static int TBufferNewPage(VP8TBuffer* const b) {
+ VP8Tokens* page = NULL;
+ if (!b->error_) {
+ const size_t size = sizeof(*page) + b->page_size_ * sizeof(token_t);
+ page = (VP8Tokens*)WebPSafeMalloc(1ULL, size);
+ }
+ if (page == NULL) {
+ b->error_ = 1;
+ return 0;
+ }
+ page->next_ = NULL;
+
+ *b->last_page_ = page;
+ b->last_page_ = &page->next_;
+ b->left_ = b->page_size_;
+ b->tokens_ = (token_t*)TOKEN_DATA(page);
+ return 1;
+}
+
+//------------------------------------------------------------------------------
+
+#define TOKEN_ID(t, b, ctx) \
+ (NUM_PROBAS * ((ctx) + NUM_CTX * ((b) + NUM_BANDS * (t))))
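+
+// TOKEN_ID flattens a (type, band, ctx) triple into the base offset of its
+// proba run inside the [NUM_TYPES][NUM_BANDS][NUM_CTX][NUM_PROBAS] array
+// viewed as a flat uint8_t[]. Sketch, assuming the usual VP8 constants
+// (NUM_PROBAS = 11, NUM_CTX = 3, NUM_BANDS = 8):
+//   TOKEN_ID(1, 2, 0) = 11 * (0 + 3 * (2 + 8 * 1)) = 330.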
+
+static WEBP_INLINE uint32_t AddToken(VP8TBuffer* const b, uint32_t bit,
+ uint32_t proba_idx,
+ proba_t* const stats) {
+ assert(proba_idx < FIXED_PROBA_BIT);
+ assert(bit <= 1);
+ if (b->left_ > 0 || TBufferNewPage(b)) {
+ const int slot = --b->left_;
+ b->tokens_[slot] = (bit << 15) | proba_idx;
+ }
+ VP8RecordStats(bit, stats);
+ return bit;
+}
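+
+// Note: if TBufferNewPage() fails, the token is silently dropped, but
+// b->error_ is set so the whole buffer is treated as invalid afterwards.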
+
+static WEBP_INLINE void AddConstantToken(VP8TBuffer* const b,
+ uint32_t bit, uint32_t proba) {
+ assert(proba < 256);
+ assert(bit <= 1);
+ if (b->left_ > 0 || TBufferNewPage(b)) {
+ const int slot = --b->left_;
+ b->tokens_[slot] = (bit << 15) | FIXED_PROBA_BIT | proba;
+ }
+}
+
+int VP8RecordCoeffTokens(int ctx, const struct VP8Residual* const res,
+ VP8TBuffer* const tokens) {
+ const int16_t* const coeffs = res->coeffs;
+ const int coeff_type = res->coeff_type;
+ const int last = res->last;
+ int n = res->first;
+ uint32_t base_id = TOKEN_ID(coeff_type, n, ctx);
+ // should be stats[VP8EncBands[n]], but it's equivalent for n=0 or 1
+ proba_t* s = res->stats[n][ctx];
+ if (!AddToken(tokens, last >= 0, base_id + 0, s + 0)) {
+ return 0;
+ }
+
+ while (n < 16) {
+ const int c = coeffs[n++];
+ const int sign = c < 0;
+ const uint32_t v = sign ? -c : c;
+ if (!AddToken(tokens, v != 0, base_id + 1, s + 1)) {
+ base_id = TOKEN_ID(coeff_type, VP8EncBands[n], 0); // ctx=0
+ s = res->stats[VP8EncBands[n]][0];
+ continue;
+ }
+ if (!AddToken(tokens, v > 1, base_id + 2, s + 2)) {
+ base_id = TOKEN_ID(coeff_type, VP8EncBands[n], 1); // ctx=1
+ s = res->stats[VP8EncBands[n]][1];
+ } else {
+ if (!AddToken(tokens, v > 4, base_id + 3, s + 3)) {
+ if (AddToken(tokens, v != 2, base_id + 4, s + 4)) {
+ AddToken(tokens, v == 4, base_id + 5, s + 5);
+ }
+ } else if (!AddToken(tokens, v > 10, base_id + 6, s + 6)) {
+ if (!AddToken(tokens, v > 6, base_id + 7, s + 7)) {
+ AddConstantToken(tokens, v == 6, 159);
+ } else {
+ AddConstantToken(tokens, v >= 9, 165);
+ AddConstantToken(tokens, !(v & 1), 145);
+ }
+ } else {
+ int mask;
+ const uint8_t* tab;
+ uint32_t residue = v - 3;
+ if (residue < (8 << 1)) { // VP8Cat3 (3b)
+ AddToken(tokens, 0, base_id + 8, s + 8);
+ AddToken(tokens, 0, base_id + 9, s + 9);
+ residue -= (8 << 0);
+ mask = 1 << 2;
+ tab = VP8Cat3;
+ } else if (residue < (8 << 2)) { // VP8Cat4 (4b)
+ AddToken(tokens, 0, base_id + 8, s + 8);
+ AddToken(tokens, 1, base_id + 9, s + 9);
+ residue -= (8 << 1);
+ mask = 1 << 3;
+ tab = VP8Cat4;
+ } else if (residue < (8 << 3)) { // VP8Cat5 (5b)
+ AddToken(tokens, 1, base_id + 8, s + 8);
+ AddToken(tokens, 0, base_id + 10, s + 9);
+ residue -= (8 << 2);
+ mask = 1 << 4;
+ tab = VP8Cat5;
+ } else { // VP8Cat6 (11b)
+ AddToken(tokens, 1, base_id + 8, s + 8);
+ AddToken(tokens, 1, base_id + 10, s + 9);
+ residue -= (8 << 3);
+ mask = 1 << 10;
+ tab = VP8Cat6;
+ }
+ while (mask) {
+ AddConstantToken(tokens, !!(residue & mask), *tab++);
+ mask >>= 1;
+ }
+ }
+ base_id = TOKEN_ID(coeff_type, VP8EncBands[n], 2); // ctx=2
+ s = res->stats[VP8EncBands[n]][2];
+ }
+ AddConstantToken(tokens, sign, 128);
+ if (n == 16 || !AddToken(tokens, n <= last, base_id + 0, s + 0)) {
+ return 1; // EOB
+ }
+ }
+ return 1;
+}
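+
+// Worked sketch: a coefficient of +12 takes the v != 0, v > 1, v > 4 and
+// v > 10 branches and lands in VP8Cat3 (values 11..18): residue =
+// 12 - 3 - 8 = 1, emitted as the three extra bits 0, 0, 1 against the fixed
+// VP8Cat3 probabilities, followed by a sign token at proba 128.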
+
+#undef TOKEN_ID
+
+//------------------------------------------------------------------------------
+// Final coding pass, with known probabilities
+
+int VP8EmitTokens(VP8TBuffer* const b, VP8BitWriter* const bw,
+ const uint8_t* const probas, int final_pass) {
+ const VP8Tokens* p = b->pages_;
+ assert(!b->error_);
+ while (p != NULL) {
+ const VP8Tokens* const next = p->next_;
+ const int N = (next == NULL) ? b->left_ : 0;
+ int n = b->page_size_;
+ const token_t* const tokens = TOKEN_DATA(p);
+ while (n-- > N) {
+ const token_t token = tokens[n];
+ const int bit = (token >> 15) & 1;
+ if (token & FIXED_PROBA_BIT) {
+ VP8PutBit(bw, bit, token & 0xffu); // constant proba
+ } else {
+ VP8PutBit(bw, bit, probas[token & 0x3fffu]);
+ }
+ }
+ if (final_pass) WebPSafeFree((void*)p);
+ p = next;
+ }
+ if (final_pass) b->pages_ = NULL;
+ return 1;
+}
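+
+// Tokens are stored from the top of each page downwards (slot = --left_),
+// so scanning n from page_size_ - 1 down to N replays them in the order in
+// which they were recorded.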
+
+// Size estimation
+size_t VP8EstimateTokenSize(VP8TBuffer* const b, const uint8_t* const probas) {
+ size_t size = 0;
+ const VP8Tokens* p = b->pages_;
+ assert(!b->error_);
+ while (p != NULL) {
+ const VP8Tokens* const next = p->next_;
+ const int N = (next == NULL) ? b->left_ : 0;
+ int n = b->page_size_;
+ const token_t* const tokens = TOKEN_DATA(p);
+ while (n-- > N) {
+ const token_t token = tokens[n];
+ const int bit = token & (1 << 15);
+ if (token & FIXED_PROBA_BIT) {
+ size += VP8BitCost(bit, token & 0xffu);
+ } else {
+ size += VP8BitCost(bit, probas[token & 0x3fffu]);
+ }
+ }
+ p = next;
+ }
+ return size;
+}
+
+//------------------------------------------------------------------------------
+
+#else // DISABLE_TOKEN_BUFFER
+
+void VP8TBufferInit(VP8TBuffer* const b, int page_size) {
+ (void)b;
+ (void)page_size;
+}
+void VP8TBufferClear(VP8TBuffer* const b) {
+ (void)b;
+}
+
+#endif // !DISABLE_TOKEN_BUFFER
+
diff --git a/media/libwebp/enc/tree_enc.c b/media/libwebp/enc/tree_enc.c
new file mode 100644
index 0000000000..7bf9b47d08
--- /dev/null
+++ b/media/libwebp/enc/tree_enc.c
@@ -0,0 +1,504 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Coding of token probabilities, intra modes and segments.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "../enc/vp8i_enc.h"
+
+//------------------------------------------------------------------------------
+// Default probabilities
+
+// Paragraph 13.5
+const uint8_t
+ VP8CoeffsProba0[NUM_TYPES][NUM_BANDS][NUM_CTX][NUM_PROBAS] = {
+ { { { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+ },
+ { { 253, 136, 254, 255, 228, 219, 128, 128, 128, 128, 128 },
+ { 189, 129, 242, 255, 227, 213, 255, 219, 128, 128, 128 },
+ { 106, 126, 227, 252, 214, 209, 255, 255, 128, 128, 128 }
+ },
+ { { 1, 98, 248, 255, 236, 226, 255, 255, 128, 128, 128 },
+ { 181, 133, 238, 254, 221, 234, 255, 154, 128, 128, 128 },
+ { 78, 134, 202, 247, 198, 180, 255, 219, 128, 128, 128 },
+ },
+ { { 1, 185, 249, 255, 243, 255, 128, 128, 128, 128, 128 },
+ { 184, 150, 247, 255, 236, 224, 128, 128, 128, 128, 128 },
+ { 77, 110, 216, 255, 236, 230, 128, 128, 128, 128, 128 },
+ },
+ { { 1, 101, 251, 255, 241, 255, 128, 128, 128, 128, 128 },
+ { 170, 139, 241, 252, 236, 209, 255, 255, 128, 128, 128 },
+ { 37, 116, 196, 243, 228, 255, 255, 255, 128, 128, 128 }
+ },
+ { { 1, 204, 254, 255, 245, 255, 128, 128, 128, 128, 128 },
+ { 207, 160, 250, 255, 238, 128, 128, 128, 128, 128, 128 },
+ { 102, 103, 231, 255, 211, 171, 128, 128, 128, 128, 128 }
+ },
+ { { 1, 152, 252, 255, 240, 255, 128, 128, 128, 128, 128 },
+ { 177, 135, 243, 255, 234, 225, 128, 128, 128, 128, 128 },
+ { 80, 129, 211, 255, 194, 224, 128, 128, 128, 128, 128 }
+ },
+ { { 1, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 246, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 255, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+ }
+ },
+ { { { 198, 35, 237, 223, 193, 187, 162, 160, 145, 155, 62 },
+ { 131, 45, 198, 221, 172, 176, 220, 157, 252, 221, 1 },
+ { 68, 47, 146, 208, 149, 167, 221, 162, 255, 223, 128 }
+ },
+ { { 1, 149, 241, 255, 221, 224, 255, 255, 128, 128, 128 },
+ { 184, 141, 234, 253, 222, 220, 255, 199, 128, 128, 128 },
+ { 81, 99, 181, 242, 176, 190, 249, 202, 255, 255, 128 }
+ },
+ { { 1, 129, 232, 253, 214, 197, 242, 196, 255, 255, 128 },
+ { 99, 121, 210, 250, 201, 198, 255, 202, 128, 128, 128 },
+ { 23, 91, 163, 242, 170, 187, 247, 210, 255, 255, 128 }
+ },
+ { { 1, 200, 246, 255, 234, 255, 128, 128, 128, 128, 128 },
+ { 109, 178, 241, 255, 231, 245, 255, 255, 128, 128, 128 },
+ { 44, 130, 201, 253, 205, 192, 255, 255, 128, 128, 128 }
+ },
+ { { 1, 132, 239, 251, 219, 209, 255, 165, 128, 128, 128 },
+ { 94, 136, 225, 251, 218, 190, 255, 255, 128, 128, 128 },
+ { 22, 100, 174, 245, 186, 161, 255, 199, 128, 128, 128 }
+ },
+ { { 1, 182, 249, 255, 232, 235, 128, 128, 128, 128, 128 },
+ { 124, 143, 241, 255, 227, 234, 128, 128, 128, 128, 128 },
+ { 35, 77, 181, 251, 193, 211, 255, 205, 128, 128, 128 }
+ },
+ { { 1, 157, 247, 255, 236, 231, 255, 255, 128, 128, 128 },
+ { 121, 141, 235, 255, 225, 227, 255, 255, 128, 128, 128 },
+ { 45, 99, 188, 251, 195, 217, 255, 224, 128, 128, 128 }
+ },
+ { { 1, 1, 251, 255, 213, 255, 128, 128, 128, 128, 128 },
+ { 203, 1, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
+ { 137, 1, 177, 255, 224, 255, 128, 128, 128, 128, 128 }
+ }
+ },
+ { { { 253, 9, 248, 251, 207, 208, 255, 192, 128, 128, 128 },
+ { 175, 13, 224, 243, 193, 185, 249, 198, 255, 255, 128 },
+ { 73, 17, 171, 221, 161, 179, 236, 167, 255, 234, 128 }
+ },
+ { { 1, 95, 247, 253, 212, 183, 255, 255, 128, 128, 128 },
+ { 239, 90, 244, 250, 211, 209, 255, 255, 128, 128, 128 },
+ { 155, 77, 195, 248, 188, 195, 255, 255, 128, 128, 128 }
+ },
+ { { 1, 24, 239, 251, 218, 219, 255, 205, 128, 128, 128 },
+ { 201, 51, 219, 255, 196, 186, 128, 128, 128, 128, 128 },
+ { 69, 46, 190, 239, 201, 218, 255, 228, 128, 128, 128 }
+ },
+ { { 1, 191, 251, 255, 255, 128, 128, 128, 128, 128, 128 },
+ { 223, 165, 249, 255, 213, 255, 128, 128, 128, 128, 128 },
+ { 141, 124, 248, 255, 255, 128, 128, 128, 128, 128, 128 }
+ },
+ { { 1, 16, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
+ { 190, 36, 230, 255, 236, 255, 128, 128, 128, 128, 128 },
+ { 149, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }
+ },
+ { { 1, 226, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 247, 192, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 240, 128, 255, 128, 128, 128, 128, 128, 128, 128, 128 }
+ },
+ { { 1, 134, 252, 255, 255, 128, 128, 128, 128, 128, 128 },
+ { 213, 62, 250, 255, 255, 128, 128, 128, 128, 128, 128 },
+ { 55, 93, 255, 128, 128, 128, 128, 128, 128, 128, 128 }
+ },
+ { { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+ }
+ },
+ { { { 202, 24, 213, 235, 186, 191, 220, 160, 240, 175, 255 },
+ { 126, 38, 182, 232, 169, 184, 228, 174, 255, 187, 128 },
+ { 61, 46, 138, 219, 151, 178, 240, 170, 255, 216, 128 }
+ },
+ { { 1, 112, 230, 250, 199, 191, 247, 159, 255, 255, 128 },
+ { 166, 109, 228, 252, 211, 215, 255, 174, 128, 128, 128 },
+ { 39, 77, 162, 232, 172, 180, 245, 178, 255, 255, 128 }
+ },
+ { { 1, 52, 220, 246, 198, 199, 249, 220, 255, 255, 128 },
+ { 124, 74, 191, 243, 183, 193, 250, 221, 255, 255, 128 },
+ { 24, 71, 130, 219, 154, 170, 243, 182, 255, 255, 128 }
+ },
+ { { 1, 182, 225, 249, 219, 240, 255, 224, 128, 128, 128 },
+ { 149, 150, 226, 252, 216, 205, 255, 171, 128, 128, 128 },
+ { 28, 108, 170, 242, 183, 194, 254, 223, 255, 255, 128 }
+ },
+ { { 1, 81, 230, 252, 204, 203, 255, 192, 128, 128, 128 },
+ { 123, 102, 209, 247, 188, 196, 255, 233, 128, 128, 128 },
+ { 20, 95, 153, 243, 164, 173, 255, 203, 128, 128, 128 }
+ },
+ { { 1, 222, 248, 255, 216, 213, 128, 128, 128, 128, 128 },
+ { 168, 175, 246, 252, 235, 205, 255, 255, 128, 128, 128 },
+ { 47, 116, 215, 255, 211, 212, 255, 255, 128, 128, 128 }
+ },
+ { { 1, 121, 236, 253, 212, 214, 255, 255, 128, 128, 128 },
+ { 141, 84, 213, 252, 201, 202, 255, 219, 128, 128, 128 },
+ { 42, 80, 160, 240, 162, 185, 255, 205, 128, 128, 128 }
+ },
+ { { 1, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 244, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 238, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }
+ }
+ }
+};
+
+void VP8DefaultProbas(VP8Encoder* const enc) {
+ VP8EncProba* const probas = &enc->proba_;
+ probas->use_skip_proba_ = 0;
+ memset(probas->segments_, 255u, sizeof(probas->segments_));
+ memcpy(probas->coeffs_, VP8CoeffsProba0, sizeof(VP8CoeffsProba0));
+ // Note: we could hard-code the level_costs_ corresponding to VP8CoeffsProba0,
+ // but that's ~11k of static data. Better to call VP8CalculateLevelCosts() later.
+ probas->dirty_ = 1;
+}
+
+// Paragraph 11.5. 900 bytes.
+static const uint8_t kBModesProba[NUM_BMODES][NUM_BMODES][NUM_BMODES - 1] = {
+ { { 231, 120, 48, 89, 115, 113, 120, 152, 112 },
+ { 152, 179, 64, 126, 170, 118, 46, 70, 95 },
+ { 175, 69, 143, 80, 85, 82, 72, 155, 103 },
+ { 56, 58, 10, 171, 218, 189, 17, 13, 152 },
+ { 114, 26, 17, 163, 44, 195, 21, 10, 173 },
+ { 121, 24, 80, 195, 26, 62, 44, 64, 85 },
+ { 144, 71, 10, 38, 171, 213, 144, 34, 26 },
+ { 170, 46, 55, 19, 136, 160, 33, 206, 71 },
+ { 63, 20, 8, 114, 114, 208, 12, 9, 226 },
+ { 81, 40, 11, 96, 182, 84, 29, 16, 36 } },
+ { { 134, 183, 89, 137, 98, 101, 106, 165, 148 },
+ { 72, 187, 100, 130, 157, 111, 32, 75, 80 },
+ { 66, 102, 167, 99, 74, 62, 40, 234, 128 },
+ { 41, 53, 9, 178, 241, 141, 26, 8, 107 },
+ { 74, 43, 26, 146, 73, 166, 49, 23, 157 },
+ { 65, 38, 105, 160, 51, 52, 31, 115, 128 },
+ { 104, 79, 12, 27, 217, 255, 87, 17, 7 },
+ { 87, 68, 71, 44, 114, 51, 15, 186, 23 },
+ { 47, 41, 14, 110, 182, 183, 21, 17, 194 },
+ { 66, 45, 25, 102, 197, 189, 23, 18, 22 } },
+ { { 88, 88, 147, 150, 42, 46, 45, 196, 205 },
+ { 43, 97, 183, 117, 85, 38, 35, 179, 61 },
+ { 39, 53, 200, 87, 26, 21, 43, 232, 171 },
+ { 56, 34, 51, 104, 114, 102, 29, 93, 77 },
+ { 39, 28, 85, 171, 58, 165, 90, 98, 64 },
+ { 34, 22, 116, 206, 23, 34, 43, 166, 73 },
+ { 107, 54, 32, 26, 51, 1, 81, 43, 31 },
+ { 68, 25, 106, 22, 64, 171, 36, 225, 114 },
+ { 34, 19, 21, 102, 132, 188, 16, 76, 124 },
+ { 62, 18, 78, 95, 85, 57, 50, 48, 51 } },
+ { { 193, 101, 35, 159, 215, 111, 89, 46, 111 },
+ { 60, 148, 31, 172, 219, 228, 21, 18, 111 },
+ { 112, 113, 77, 85, 179, 255, 38, 120, 114 },
+ { 40, 42, 1, 196, 245, 209, 10, 25, 109 },
+ { 88, 43, 29, 140, 166, 213, 37, 43, 154 },
+ { 61, 63, 30, 155, 67, 45, 68, 1, 209 },
+ { 100, 80, 8, 43, 154, 1, 51, 26, 71 },
+ { 142, 78, 78, 16, 255, 128, 34, 197, 171 },
+ { 41, 40, 5, 102, 211, 183, 4, 1, 221 },
+ { 51, 50, 17, 168, 209, 192, 23, 25, 82 } },
+ { { 138, 31, 36, 171, 27, 166, 38, 44, 229 },
+ { 67, 87, 58, 169, 82, 115, 26, 59, 179 },
+ { 63, 59, 90, 180, 59, 166, 93, 73, 154 },
+ { 40, 40, 21, 116, 143, 209, 34, 39, 175 },
+ { 47, 15, 16, 183, 34, 223, 49, 45, 183 },
+ { 46, 17, 33, 183, 6, 98, 15, 32, 183 },
+ { 57, 46, 22, 24, 128, 1, 54, 17, 37 },
+ { 65, 32, 73, 115, 28, 128, 23, 128, 205 },
+ { 40, 3, 9, 115, 51, 192, 18, 6, 223 },
+ { 87, 37, 9, 115, 59, 77, 64, 21, 47 } },
+ { { 104, 55, 44, 218, 9, 54, 53, 130, 226 },
+ { 64, 90, 70, 205, 40, 41, 23, 26, 57 },
+ { 54, 57, 112, 184, 5, 41, 38, 166, 213 },
+ { 30, 34, 26, 133, 152, 116, 10, 32, 134 },
+ { 39, 19, 53, 221, 26, 114, 32, 73, 255 },
+ { 31, 9, 65, 234, 2, 15, 1, 118, 73 },
+ { 75, 32, 12, 51, 192, 255, 160, 43, 51 },
+ { 88, 31, 35, 67, 102, 85, 55, 186, 85 },
+ { 56, 21, 23, 111, 59, 205, 45, 37, 192 },
+ { 55, 38, 70, 124, 73, 102, 1, 34, 98 } },
+ { { 125, 98, 42, 88, 104, 85, 117, 175, 82 },
+ { 95, 84, 53, 89, 128, 100, 113, 101, 45 },
+ { 75, 79, 123, 47, 51, 128, 81, 171, 1 },
+ { 57, 17, 5, 71, 102, 57, 53, 41, 49 },
+ { 38, 33, 13, 121, 57, 73, 26, 1, 85 },
+ { 41, 10, 67, 138, 77, 110, 90, 47, 114 },
+ { 115, 21, 2, 10, 102, 255, 166, 23, 6 },
+ { 101, 29, 16, 10, 85, 128, 101, 196, 26 },
+ { 57, 18, 10, 102, 102, 213, 34, 20, 43 },
+ { 117, 20, 15, 36, 163, 128, 68, 1, 26 } },
+ { { 102, 61, 71, 37, 34, 53, 31, 243, 192 },
+ { 69, 60, 71, 38, 73, 119, 28, 222, 37 },
+ { 68, 45, 128, 34, 1, 47, 11, 245, 171 },
+ { 62, 17, 19, 70, 146, 85, 55, 62, 70 },
+ { 37, 43, 37, 154, 100, 163, 85, 160, 1 },
+ { 63, 9, 92, 136, 28, 64, 32, 201, 85 },
+ { 75, 15, 9, 9, 64, 255, 184, 119, 16 },
+ { 86, 6, 28, 5, 64, 255, 25, 248, 1 },
+ { 56, 8, 17, 132, 137, 255, 55, 116, 128 },
+ { 58, 15, 20, 82, 135, 57, 26, 121, 40 } },
+ { { 164, 50, 31, 137, 154, 133, 25, 35, 218 },
+ { 51, 103, 44, 131, 131, 123, 31, 6, 158 },
+ { 86, 40, 64, 135, 148, 224, 45, 183, 128 },
+ { 22, 26, 17, 131, 240, 154, 14, 1, 209 },
+ { 45, 16, 21, 91, 64, 222, 7, 1, 197 },
+ { 56, 21, 39, 155, 60, 138, 23, 102, 213 },
+ { 83, 12, 13, 54, 192, 255, 68, 47, 28 },
+ { 85, 26, 85, 85, 128, 128, 32, 146, 171 },
+ { 18, 11, 7, 63, 144, 171, 4, 4, 246 },
+ { 35, 27, 10, 146, 174, 171, 12, 26, 128 } },
+ { { 190, 80, 35, 99, 180, 80, 126, 54, 45 },
+ { 85, 126, 47, 87, 176, 51, 41, 20, 32 },
+ { 101, 75, 128, 139, 118, 146, 116, 128, 85 },
+ { 56, 41, 15, 176, 236, 85, 37, 9, 62 },
+ { 71, 30, 17, 119, 118, 255, 17, 18, 138 },
+ { 101, 38, 60, 138, 55, 70, 43, 26, 142 },
+ { 146, 36, 19, 30, 171, 255, 97, 27, 20 },
+ { 138, 45, 61, 62, 219, 1, 81, 188, 64 },
+ { 32, 41, 20, 117, 151, 142, 20, 21, 163 },
+ { 112, 19, 12, 61, 195, 128, 48, 4, 24 } }
+};
+
+static int PutI4Mode(VP8BitWriter* const bw, int mode,
+ const uint8_t* const prob) {
+ if (VP8PutBit(bw, mode != B_DC_PRED, prob[0])) {
+ if (VP8PutBit(bw, mode != B_TM_PRED, prob[1])) {
+ if (VP8PutBit(bw, mode != B_VE_PRED, prob[2])) {
+ if (!VP8PutBit(bw, mode >= B_LD_PRED, prob[3])) {
+ if (VP8PutBit(bw, mode != B_HE_PRED, prob[4])) {
+ VP8PutBit(bw, mode != B_RD_PRED, prob[5]);
+ }
+ } else {
+ if (VP8PutBit(bw, mode != B_LD_PRED, prob[6])) {
+ if (VP8PutBit(bw, mode != B_VL_PRED, prob[7])) {
+ VP8PutBit(bw, mode != B_HD_PRED, prob[8]);
+ }
+ }
+ }
+ }
+ }
+ }
+ return mode;
+}
+
+static void PutI16Mode(VP8BitWriter* const bw, int mode) {
+ if (VP8PutBit(bw, (mode == TM_PRED || mode == H_PRED), 156)) {
+ VP8PutBit(bw, mode == TM_PRED, 128); // TM or HE
+ } else {
+ VP8PutBit(bw, mode == V_PRED, 163); // VE or DC
+ }
+}
+
+static void PutUVMode(VP8BitWriter* const bw, int uv_mode) {
+ if (VP8PutBit(bw, uv_mode != DC_PRED, 142)) {
+ if (VP8PutBit(bw, uv_mode != V_PRED, 114)) {
+ VP8PutBit(bw, uv_mode != H_PRED, 183); // else: TM_PRED
+ }
+ }
+}
+
+static void PutSegment(VP8BitWriter* const bw, int s, const uint8_t* p) {
+ if (VP8PutBit(bw, s >= 2, p[0])) p += 1;
+ VP8PutBit(bw, s & 1, p[1]);
+}
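+
+// A two-level tree: segments 0..1 are coded as (0, s) against p[0]/p[1],
+// segments 2..3 as (1, s & 1) against p[0]/p[2] (p was advanced by one).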
+
+void VP8CodeIntraModes(VP8Encoder* const enc) {
+ VP8BitWriter* const bw = &enc->bw_;
+ VP8EncIterator it;
+ VP8IteratorInit(enc, &it);
+ do {
+ const VP8MBInfo* const mb = it.mb_;
+ const uint8_t* preds = it.preds_;
+ if (enc->segment_hdr_.update_map_) {
+ PutSegment(bw, mb->segment_, enc->proba_.segments_);
+ }
+ if (enc->proba_.use_skip_proba_) {
+ VP8PutBit(bw, mb->skip_, enc->proba_.skip_proba_);
+ }
+ if (VP8PutBit(bw, (mb->type_ != 0), 145)) { // i16x16
+ PutI16Mode(bw, preds[0]);
+ } else {
+ const int preds_w = enc->preds_w_;
+ const uint8_t* top_pred = preds - preds_w;
+ int x, y;
+ for (y = 0; y < 4; ++y) {
+ int left = preds[-1];
+ for (x = 0; x < 4; ++x) {
+ const uint8_t* const probas = kBModesProba[top_pred[x]][left];
+ left = PutI4Mode(bw, preds[x], probas);
+ }
+ top_pred = preds;
+ preds += preds_w;
+ }
+ }
+ PutUVMode(bw, mb->uv_mode_);
+ } while (VP8IteratorNext(&it));
+}
+
+//------------------------------------------------------------------------------
+// Paragraph 13
+
+const uint8_t
+ VP8CoeffsUpdateProba[NUM_TYPES][NUM_BANDS][NUM_CTX][NUM_PROBAS] = {
+ { { { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+ },
+ { { 176, 246, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 223, 241, 252, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 249, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255 }
+ },
+ { { 255, 244, 252, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 234, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+ },
+ { { 255, 246, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 239, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255 }
+ },
+ { { 255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 251, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+ },
+ { { 255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 251, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255 }
+ },
+ { { 255, 254, 253, 255, 254, 255, 255, 255, 255, 255, 255 },
+ { 250, 255, 254, 255, 254, 255, 255, 255, 255, 255, 255 },
+ { 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+ },
+ { { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+ }
+ },
+ { { { 217, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 225, 252, 241, 253, 255, 255, 254, 255, 255, 255, 255 },
+ { 234, 250, 241, 250, 253, 255, 253, 254, 255, 255, 255 }
+ },
+ { { 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 223, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 238, 253, 254, 254, 255, 255, 255, 255, 255, 255, 255 }
+ },
+ { { 255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 249, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+ },
+ { { 255, 253, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 247, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+ },
+ { { 255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 252, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+ },
+ { { 255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+ },
+ { { 255, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+ },
+ { { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+ }
+ },
+ { { { 186, 251, 250, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 234, 251, 244, 254, 255, 255, 255, 255, 255, 255, 255 },
+ { 251, 251, 243, 253, 254, 255, 254, 255, 255, 255, 255 }
+ },
+ { { 255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 236, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 251, 253, 253, 254, 254, 255, 255, 255, 255, 255, 255 }
+ },
+ { { 255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+ },
+ { { 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+ },
+ { { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+ },
+ { { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+ },
+ { { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+ },
+ { { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+ }
+ },
+ { { { 248, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 250, 254, 252, 254, 255, 255, 255, 255, 255, 255, 255 },
+ { 248, 254, 249, 253, 255, 255, 255, 255, 255, 255, 255 }
+ },
+ { { 255, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 246, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 252, 254, 251, 254, 254, 255, 255, 255, 255, 255, 255 }
+ },
+ { { 255, 254, 252, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 248, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 253, 255, 254, 254, 255, 255, 255, 255, 255, 255, 255 }
+ },
+ { { 255, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 245, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 253, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 }
+ },
+ { { 255, 251, 253, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 252, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+ },
+ { { 255, 252, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 249, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 255, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255 }
+ },
+ { { 255, 255, 253, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+ },
+ { { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+ }
+ }
+};
+
+void VP8WriteProbas(VP8BitWriter* const bw, const VP8EncProba* const probas) {
+ int t, b, c, p;
+ for (t = 0; t < NUM_TYPES; ++t) {
+ for (b = 0; b < NUM_BANDS; ++b) {
+ for (c = 0; c < NUM_CTX; ++c) {
+ for (p = 0; p < NUM_PROBAS; ++p) {
+ const uint8_t p0 = probas->coeffs_[t][b][c][p];
+ const int update = (p0 != VP8CoeffsProba0[t][b][c][p]);
+ if (VP8PutBit(bw, update, VP8CoeffsUpdateProba[t][b][c][p])) {
+ VP8PutBits(bw, p0, 8);
+ }
+ }
+ }
+ }
+ }
+ if (VP8PutBitUniform(bw, probas->use_skip_proba_)) {
+ VP8PutBits(bw, probas->skip_proba_, 8);
+ }
+}
+
diff --git a/media/libwebp/enc/vp8i_enc.h b/media/libwebp/enc/vp8i_enc.h
index 009ccf2239..9bca205e01 100644
--- a/media/libwebp/enc/vp8i_enc.h
+++ b/media/libwebp/enc/vp8i_enc.h
@@ -31,7 +31,7 @@ extern "C" {
// version numbers
#define ENC_MAJ_VERSION 1
-#define ENC_MIN_VERSION 0
+#define ENC_MIN_VERSION 2
#define ENC_REV_VERSION 2
enum { MAX_LF_LEVELS = 64, // Maximum loop filter level
@@ -249,7 +249,7 @@ typedef struct {
int percent0_; // saved initial progress percent
DError left_derr_; // left error diffusion (u/v)
- DError *top_derr_; // top diffusion error - NULL if disabled
+ DError* top_derr_; // top diffusion error - NULL if disabled
uint8_t* y_left_; // left luma samples (addressable from index -1 to 15).
uint8_t* u_left_; // left u samples (addressable from index -1 to 7)
@@ -286,8 +286,7 @@ int VP8IteratorNext(VP8EncIterator* const it);
// save the yuv_out_ boundary values to top_/left_ arrays for next iterations.
void VP8IteratorSaveBoundary(VP8EncIterator* const it);
// Report progression based on macroblock rows. Return 0 for user-abort request.
-int VP8IteratorProgress(const VP8EncIterator* const it,
- int final_delta_percent);
+int VP8IteratorProgress(const VP8EncIterator* const it, int delta);
// Intra4x4 iterations
void VP8IteratorStartI4(VP8EncIterator* const it);
// returns true if not done.
@@ -505,9 +504,9 @@ int WebPPictureAllocARGB(WebPPicture* const picture, int width, int height);
// Returns false in case of error (invalid param, out-of-memory).
int WebPPictureAllocYUVA(WebPPicture* const picture, int width, int height);
-// Clean-up the RGB samples under fully transparent area, to help lossless
-// compressibility (no guarantee, though). Assumes that pic->use_argb is true.
-void WebPCleanupTransparentAreaLossless(WebPPicture* const pic);
+// Replace samples that are fully transparent by 'color' to help compressibility
+// (no guarantee, though). Assumes pic->use_argb is true.
+void WebPReplaceTransparentPixels(WebPPicture* const pic, uint32_t color);
//------------------------------------------------------------------------------
diff --git a/media/libwebp/enc/vp8l_enc.c b/media/libwebp/enc/vp8l_enc.c
new file mode 100644
index 0000000000..4aed5d8e32
--- /dev/null
+++ b/media/libwebp/enc/vp8l_enc.c
@@ -0,0 +1,2138 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// main entry for the lossless encoder.
+//
+// Author: Vikas Arora (vikaas.arora@gmail.com)
+//
+
+#include <assert.h>
+#include <stdlib.h>
+
+#include "../enc/backward_references_enc.h"
+#include "../enc/histogram_enc.h"
+#include "../enc/vp8i_enc.h"
+#include "../enc/vp8li_enc.h"
+#include "../dsp/lossless.h"
+#include "../dsp/lossless_common.h"
+#include "../utils/bit_writer_utils.h"
+#include "../utils/huffman_encode_utils.h"
+#include "../utils/utils.h"
+#include "../webp/format_constants.h"
+
+// Maximum number of histogram images (sub-blocks).
+#define MAX_HUFF_IMAGE_SIZE 2600
+
+// Palette reordering for smaller sum of deltas (and for smaller storage).
+
+static int PaletteCompareColorsForQsort(const void* p1, const void* p2) {
+ const uint32_t a = WebPMemToUint32((uint8_t*)p1);
+ const uint32_t b = WebPMemToUint32((uint8_t*)p2);
+ assert(a != b);
+ return (a < b) ? -1 : 1;
+}
+
+static WEBP_INLINE uint32_t PaletteComponentDistance(uint32_t v) {
+ return (v <= 128) ? v : (256 - v);
+}
+
+// Computes a value that is related to the entropy created by the
+// palette entry diff.
+//
+// Note that the last & 0xff is a no-operation in the next statement; most
+// compilers remove it, and it is kept only for regularity of the code.
+static WEBP_INLINE uint32_t PaletteColorDistance(uint32_t col1, uint32_t col2) {
+ const uint32_t diff = VP8LSubPixels(col1, col2);
+ const int kMoreWeightForRGBThanForAlpha = 9;
+ uint32_t score;
+ score = PaletteComponentDistance((diff >> 0) & 0xff);
+ score += PaletteComponentDistance((diff >> 8) & 0xff);
+ score += PaletteComponentDistance((diff >> 16) & 0xff);
+ score *= kMoreWeightForRGBThanForAlpha;
+ score += PaletteComponentDistance((diff >> 24) & 0xff);
+ return score;
+}
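A worked standalone example of the distance above, with VP8LSubPixels replaced by an equivalent per-channel wrap-around subtraction (sample colors, nothing canonical):

#include <stdint.h>
#include <stdio.h>

/* Channel-wise (a - b) mod 256, equivalent in effect to VP8LSubPixels
 * for this purpose. */
static uint32_t SubPixels(uint32_t a, uint32_t b) {
  uint32_t r = 0;
  int shift;
  for (shift = 0; shift < 32; shift += 8) {
    r |= (((a >> shift) - (b >> shift)) & 0xff) << shift;
  }
  return r;
}

static uint32_t ComponentDistance(uint32_t v) {
  return (v <= 128) ? v : (256 - v);
}

int main(void) {
  /* Two nearby opaque colors: the score is dominated by the 9x RGB weight. */
  const uint32_t c1 = 0xff204060u, c2 = 0xff1f4263u;
  const uint32_t diff = SubPixels(c1, c2);   /* 0x0001fefd */
  uint32_t score = ComponentDistance((diff >> 0) & 0xff) +
                   ComponentDistance((diff >> 8) & 0xff) +
                   ComponentDistance((diff >> 16) & 0xff);
  score = score * 9 + ComponentDistance((diff >> 24) & 0xff);
  printf("score = %u\n", score);  /* (3+2+1)*9 + 0 = 54 */
  return 0;
}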
+
+static WEBP_INLINE void SwapColor(uint32_t* const col1, uint32_t* const col2) {
+ const uint32_t tmp = *col1;
+ *col1 = *col2;
+ *col2 = tmp;
+}
+
+static WEBP_INLINE int SearchColorNoIdx(const uint32_t sorted[], uint32_t color,
+ int num_colors) {
+ int low = 0, hi = num_colors;
+ if (sorted[low] == color) return low; // loop invariant: sorted[low] != color
+ while (1) {
+ const int mid = (low + hi) >> 1;
+ if (sorted[mid] == color) {
+ return mid;
+ } else if (sorted[mid] < color) {
+ low = mid;
+ } else {
+ hi = mid;
+ }
+ }
+ assert(0);
+ return 0;
+}
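The search above assumes the color is present in the array; the initial low-check is what lets the loop run without a termination test. A standalone check of that contract (toy values):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Same contract as SearchColorNoIdx: 'color' must be present in 'sorted'.
 * The early return establishes sorted[low] != color inside the loop, and
 * the presence guarantee keeps the interval [low, hi) shrinking onto a
 * match, so no explicit loop exit condition is needed. */
static int Search(const uint32_t sorted[], uint32_t color, int n) {
  int low = 0, hi = n;
  if (sorted[low] == color) return low;
  while (1) {
    const int mid = (low + hi) >> 1;
    if (sorted[mid] == color) return mid;
    if (sorted[mid] < color) low = mid; else hi = mid;
  }
}

int main(void) {
  const uint32_t sorted[5] = { 1, 4, 9, 16, 25 };
  assert(Search(sorted, 16, 5) == 3);
  assert(Search(sorted, 1, 5) == 0);
  printf("ok\n");
  return 0;
}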
+
+// The palette has been sorted by alpha. This function checks whether the
+// other components of the palette develop monotonically with the position
+// in the palette. If they all do, there is no benefit to re-organizing
+// them greedily. Monotonic development is typical of green-only situations
+// (like lossy alpha) or gray-scale images.
+static int PaletteHasNonMonotonousDeltas(const uint32_t* const palette,
+ int num_colors) {
+ uint32_t predict = 0x000000;
+ int i;
+ uint8_t sign_found = 0x00;
+ for (i = 0; i < num_colors; ++i) {
+ const uint32_t diff = VP8LSubPixels(palette[i], predict);
+ const uint8_t rd = (diff >> 16) & 0xff;
+ const uint8_t gd = (diff >> 8) & 0xff;
+ const uint8_t bd = (diff >> 0) & 0xff;
+ if (rd != 0x00) {
+ sign_found |= (rd < 0x80) ? 1 : 2;
+ }
+ if (gd != 0x00) {
+ sign_found |= (gd < 0x80) ? 8 : 16;
+ }
+ if (bd != 0x00) {
+ sign_found |= (bd < 0x80) ? 64 : 128;
+ }
+ predict = palette[i];
+ }
+  return (sign_found & (sign_found << 1)) != 0;  // two consecutive signs.
+}
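The final test works because each channel owns an adjacent (negative, positive) bit pair, separated by unused gap bits, so shifting left by one can only overlap the two bits of a single channel. A standalone sketch:

#include <stdio.h>

/* Bit pairs as used above: red 1|2, green 8|16, blue 64|128, with gap
 * bits in between, so (mask & (mask << 1)) can only be non-zero when
 * both signs of one channel were seen. */
static int HasBothSigns(unsigned mask) { return (mask & (mask << 1)) != 0; }

int main(void) {
  printf("%d\n", HasBothSigns(1 | 2));   /* 1: red moved both ways       */
  printf("%d\n", HasBothSigns(1 | 16));  /* 0: different channels        */
  printf("%d\n", HasBothSigns(2 | 8));   /* 0: 2<<1 lands on the gap bit */
  return 0;
}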
+
+static void PaletteSortMinimizeDeltas(const uint32_t* const palette_sorted,
+ int num_colors, uint32_t* const palette) {
+ uint32_t predict = 0x00000000;
+ int i, k;
+ memcpy(palette, palette_sorted, num_colors * sizeof(*palette));
+ if (!PaletteHasNonMonotonousDeltas(palette_sorted, num_colors)) return;
+  // Greedily pick the color closest to the predicted one, to minimize the
+  // deltas in the palette. This reduces storage needs, since the palette
+  // is stored with delta encoding.
+ for (i = 0; i < num_colors; ++i) {
+ int best_ix = i;
+ uint32_t best_score = ~0U;
+ for (k = i; k < num_colors; ++k) {
+ const uint32_t cur_score = PaletteColorDistance(palette[k], predict);
+ if (best_score > cur_score) {
+ best_score = cur_score;
+ best_ix = k;
+ }
+ }
+ SwapColor(&palette[best_ix], &palette[i]);
+ predict = palette[i];
+ }
+}
+
+// Sort palette in increasing order and prepare an inverse mapping array.
+static void PrepareMapToPalette(const uint32_t palette[], uint32_t num_colors,
+ uint32_t sorted[], uint32_t idx_map[]) {
+ uint32_t i;
+ memcpy(sorted, palette, num_colors * sizeof(*sorted));
+ qsort(sorted, num_colors, sizeof(*sorted), PaletteCompareColorsForQsort);
+ for (i = 0; i < num_colors; ++i) {
+ idx_map[SearchColorNoIdx(sorted, palette[i], num_colors)] = i;
+ }
+}
+
+// -----------------------------------------------------------------------------
+// Modified Zeng method from "A Survey on Palette Reordering
+// Methods for Improving the Compression of Color-Indexed Images" by Armando J.
+// Pinho and Antonio J. R. Neves.
+
+// Finds the biggest cooccurrence in the matrix.
+static void CoOccurrenceFindMax(const uint32_t* const cooccurrence,
+ uint32_t num_colors, uint8_t* const c1,
+ uint8_t* const c2) {
+ // Find the index that is most frequently located adjacent to other
+ // (different) indexes.
+ uint32_t best_sum = 0u;
+ uint32_t i, j, best_cooccurrence;
+ *c1 = 0u;
+ for (i = 0; i < num_colors; ++i) {
+ uint32_t sum = 0;
+ for (j = 0; j < num_colors; ++j) sum += cooccurrence[i * num_colors + j];
+ if (sum > best_sum) {
+ best_sum = sum;
+ *c1 = i;
+ }
+ }
+ // Find the index that is most frequently found adjacent to *c1.
+ *c2 = 0u;
+ best_cooccurrence = 0u;
+ for (i = 0; i < num_colors; ++i) {
+ if (cooccurrence[*c1 * num_colors + i] > best_cooccurrence) {
+ best_cooccurrence = cooccurrence[*c1 * num_colors + i];
+ *c2 = i;
+ }
+ }
+ assert(*c1 != *c2);
+}
+
+// Builds the cooccurrence matrix
+static WebPEncodingError CoOccurrenceBuild(const WebPPicture* const pic,
+ const uint32_t* const palette,
+ uint32_t num_colors,
+ uint32_t* cooccurrence) {
+ uint32_t *lines, *line_top, *line_current, *line_tmp;
+ int x, y;
+ const uint32_t* src = pic->argb;
+ uint32_t prev_pix = ~src[0];
+ uint32_t prev_idx = 0u;
+ uint32_t idx_map[MAX_PALETTE_SIZE] = {0};
+ uint32_t palette_sorted[MAX_PALETTE_SIZE];
+ lines = (uint32_t*)WebPSafeMalloc(2 * pic->width, sizeof(*lines));
+ if (lines == NULL) return VP8_ENC_ERROR_OUT_OF_MEMORY;
+ line_top = &lines[0];
+ line_current = &lines[pic->width];
+ PrepareMapToPalette(palette, num_colors, palette_sorted, idx_map);
+ for (y = 0; y < pic->height; ++y) {
+ for (x = 0; x < pic->width; ++x) {
+ const uint32_t pix = src[x];
+ if (pix != prev_pix) {
+ prev_idx = idx_map[SearchColorNoIdx(palette_sorted, pix, num_colors)];
+ prev_pix = pix;
+ }
+ line_current[x] = prev_idx;
+ // 4-connectivity is what works best as mentioned in "On the relation
+ // between Memon's and the modified Zeng's palette reordering methods".
+ if (x > 0 && prev_idx != line_current[x - 1]) {
+ const uint32_t left_idx = line_current[x - 1];
+ ++cooccurrence[prev_idx * num_colors + left_idx];
+ ++cooccurrence[left_idx * num_colors + prev_idx];
+ }
+ if (y > 0 && prev_idx != line_top[x]) {
+ const uint32_t top_idx = line_top[x];
+ ++cooccurrence[prev_idx * num_colors + top_idx];
+ ++cooccurrence[top_idx * num_colors + prev_idx];
+ }
+ }
+ line_tmp = line_top;
+ line_top = line_current;
+ line_current = line_tmp;
+ src += pic->argb_stride;
+ }
+ WebPSafeFree(lines);
+ return VP8_ENC_OK;
+}
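A minimal standalone sketch of the 4-connectivity counting above, on a hypothetical 2x2 index map (the pixel-to-index cache and palette lookup are elided):

#include <stdio.h>

/* Index map:
 *   0 1
 *   1 1
 * Each horizontally or vertically adjacent pair of *different* indices
 * bumps both symmetric cells of the co-occurrence matrix. */
int main(void) {
  const int idx[2][2] = { { 0, 1 }, { 1, 1 } };
  unsigned cooc[2][2] = { { 0 } };
  int x, y;
  for (y = 0; y < 2; ++y) {
    for (x = 0; x < 2; ++x) {
      if (x > 0 && idx[y][x] != idx[y][x - 1]) {
        ++cooc[idx[y][x]][idx[y][x - 1]];
        ++cooc[idx[y][x - 1]][idx[y][x]];
      }
      if (y > 0 && idx[y][x] != idx[y - 1][x]) {
        ++cooc[idx[y][x]][idx[y - 1][x]];
        ++cooc[idx[y - 1][x]][idx[y][x]];
      }
    }
  }
  printf("cooc[0][1]=%u cooc[1][0]=%u\n", cooc[0][1], cooc[1][0]); /* 2 2 */
  return 0;
}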
+
+struct Sum {
+ uint8_t index;
+ uint32_t sum;
+};
+
+// Implements the modified Zeng method from "A Survey on Palette Reordering
+// Methods for Improving the Compression of Color-Indexed Images" by Armando J.
+// Pinho and Antonio J. R. Neves.
+static WebPEncodingError PaletteSortModifiedZeng(
+ const WebPPicture* const pic, const uint32_t* const palette_sorted,
+ uint32_t num_colors, uint32_t* const palette) {
+ uint32_t i, j, ind;
+ uint8_t remapping[MAX_PALETTE_SIZE];
+ uint32_t* cooccurrence;
+ struct Sum sums[MAX_PALETTE_SIZE];
+ uint32_t first, last;
+ uint32_t num_sums;
+ // TODO(vrabaud) check whether one color images should use palette or not.
+ if (num_colors <= 1) return VP8_ENC_OK;
+ // Build the co-occurrence matrix.
+ cooccurrence =
+ (uint32_t*)WebPSafeCalloc(num_colors * num_colors, sizeof(*cooccurrence));
+ if (cooccurrence == NULL) return VP8_ENC_ERROR_OUT_OF_MEMORY;
+ if (CoOccurrenceBuild(pic, palette_sorted, num_colors, cooccurrence) !=
+ VP8_ENC_OK) {
+ WebPSafeFree(cooccurrence);
+ return VP8_ENC_ERROR_OUT_OF_MEMORY;
+ }
+
+ // Initialize the mapping list with the two best indices.
+ CoOccurrenceFindMax(cooccurrence, num_colors, &remapping[0], &remapping[1]);
+
+  // We need to both append to and prepend to the remapping list. To this
+  // end, we track the start/end of the list as indices into a vector (with
+  // a wrap-around when the end is reached).
+ first = 0;
+ last = 1;
+ num_sums = num_colors - 2; // -2 because we know the first two values
+ if (num_sums > 0) {
+ // Initialize the sums with the first two remappings and find the best one
+ struct Sum* best_sum = &sums[0];
+ best_sum->index = 0u;
+ best_sum->sum = 0u;
+ for (i = 0, j = 0; i < num_colors; ++i) {
+ if (i == remapping[0] || i == remapping[1]) continue;
+ sums[j].index = i;
+ sums[j].sum = cooccurrence[i * num_colors + remapping[0]] +
+ cooccurrence[i * num_colors + remapping[1]];
+ if (sums[j].sum > best_sum->sum) best_sum = &sums[j];
+ ++j;
+ }
+
+ while (num_sums > 0) {
+ const uint8_t best_index = best_sum->index;
+ // Compute delta to know if we need to prepend or append the best index.
+ int32_t delta = 0;
+ const int32_t n = num_colors - num_sums;
+ for (ind = first, j = 0; (ind + j) % num_colors != last + 1; ++j) {
+ const uint16_t l_j = remapping[(ind + j) % num_colors];
+ delta += (n - 1 - 2 * (int32_t)j) *
+ (int32_t)cooccurrence[best_index * num_colors + l_j];
+ }
+ if (delta > 0) {
+ first = (first == 0) ? num_colors - 1 : first - 1;
+ remapping[first] = best_index;
+ } else {
+ ++last;
+ remapping[last] = best_index;
+ }
+ // Remove best_sum from sums.
+ *best_sum = sums[num_sums - 1];
+ --num_sums;
+ // Update all the sums and find the best one.
+ best_sum = &sums[0];
+ for (i = 0; i < num_sums; ++i) {
+ sums[i].sum += cooccurrence[best_index * num_colors + sums[i].index];
+ if (sums[i].sum > best_sum->sum) best_sum = &sums[i];
+ }
+ }
+ }
+ assert((last + 1) % num_colors == first);
+ WebPSafeFree(cooccurrence);
+
+ // Re-map the palette.
+ for (i = 0; i < num_colors; ++i) {
+ palette[i] = palette_sorted[remapping[(first + i) % num_colors]];
+ }
+ return VP8_ENC_OK;
+}
+
+// -----------------------------------------------------------------------------
+// Palette
+
+// These six modes are evaluated and their respective entropy is computed.
+typedef enum {
+ kDirect = 0,
+ kSpatial = 1,
+ kSubGreen = 2,
+ kSpatialSubGreen = 3,
+ kPalette = 4,
+ kPaletteAndSpatial = 5,
+ kNumEntropyIx = 6
+} EntropyIx;
+
+typedef enum {
+ kSortedDefault = 0,
+ kMinimizeDelta = 1,
+ kModifiedZeng = 2,
+ kUnusedPalette = 3,
+} PaletteSorting;
+
+typedef enum {
+ kHistoAlpha = 0,
+ kHistoAlphaPred,
+ kHistoGreen,
+ kHistoGreenPred,
+ kHistoRed,
+ kHistoRedPred,
+ kHistoBlue,
+ kHistoBluePred,
+ kHistoRedSubGreen,
+ kHistoRedPredSubGreen,
+ kHistoBlueSubGreen,
+ kHistoBluePredSubGreen,
+ kHistoPalette,
+ kHistoTotal // Must be last.
+} HistoIx;
+
+static void AddSingleSubGreen(int p, uint32_t* const r, uint32_t* const b) {
+ const int green = p >> 8; // The upper bits are masked away later.
+ ++r[((p >> 16) - green) & 0xff];
+ ++b[((p >> 0) - green) & 0xff];
+}
+
+static void AddSingle(uint32_t p,
+ uint32_t* const a, uint32_t* const r,
+ uint32_t* const g, uint32_t* const b) {
+ ++a[(p >> 24) & 0xff];
+ ++r[(p >> 16) & 0xff];
+ ++g[(p >> 8) & 0xff];
+ ++b[(p >> 0) & 0xff];
+}
+
+static WEBP_INLINE uint32_t HashPix(uint32_t pix) {
+ // Note that masking with 0xffffffffu is for preventing an
+ // 'unsigned int overflow' warning. Doesn't impact the compiled code.
+ return ((((uint64_t)pix + (pix >> 19)) * 0x39c5fba7ull) & 0xffffffffu) >> 24;
+}
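The hash above can be evaluated standalone; the multiply-and-shift mixing tends to send even near-identical pixel values to different 8-bit buckets (sample values only):

#include <stdint.h>
#include <stdio.h>

/* Same mixing as HashPix: 64-bit multiply, keep the low 32 bits, then
 * the top 8 of those select one of 256 buckets. */
static uint32_t HashPix(uint32_t pix) {
  return (uint32_t)((((uint64_t)pix + (pix >> 19)) * 0x39c5fba7ull) &
                    0xffffffffu) >> 24;
}

int main(void) {
  /* Consecutive pixel values land in different buckets because the low
   * 32 bits of the products differ by the full multiplier. */
  printf("%u %u %u\n", HashPix(0xff000000u), HashPix(0xff000001u),
         HashPix(0xff010101u));
  return 0;
}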
+
+static int AnalyzeEntropy(const uint32_t* argb,
+ int width, int height, int argb_stride,
+ int use_palette,
+ int palette_size, int transform_bits,
+ EntropyIx* const min_entropy_ix,
+ int* const red_and_blue_always_zero) {
+ // Allocate histogram set with cache_bits = 0.
+ uint32_t* histo;
+
+ if (use_palette && palette_size <= 16) {
+ // In the case of small palettes, we pack 2, 4 or 8 pixels together. In
+ // practice, small palettes are better than any other transform.
+ *min_entropy_ix = kPalette;
+ *red_and_blue_always_zero = 1;
+ return 1;
+ }
+ histo = (uint32_t*)WebPSafeCalloc(kHistoTotal, sizeof(*histo) * 256);
+ if (histo != NULL) {
+ int i, x, y;
+ const uint32_t* prev_row = NULL;
+ const uint32_t* curr_row = argb;
+ uint32_t pix_prev = argb[0]; // Skip the first pixel.
+ for (y = 0; y < height; ++y) {
+ for (x = 0; x < width; ++x) {
+ const uint32_t pix = curr_row[x];
+ const uint32_t pix_diff = VP8LSubPixels(pix, pix_prev);
+ pix_prev = pix;
+ if ((pix_diff == 0) || (prev_row != NULL && pix == prev_row[x])) {
+ continue;
+ }
+ AddSingle(pix,
+ &histo[kHistoAlpha * 256],
+ &histo[kHistoRed * 256],
+ &histo[kHistoGreen * 256],
+ &histo[kHistoBlue * 256]);
+ AddSingle(pix_diff,
+ &histo[kHistoAlphaPred * 256],
+ &histo[kHistoRedPred * 256],
+ &histo[kHistoGreenPred * 256],
+ &histo[kHistoBluePred * 256]);
+ AddSingleSubGreen(pix,
+ &histo[kHistoRedSubGreen * 256],
+ &histo[kHistoBlueSubGreen * 256]);
+ AddSingleSubGreen(pix_diff,
+ &histo[kHistoRedPredSubGreen * 256],
+ &histo[kHistoBluePredSubGreen * 256]);
+ {
+ // Approximate the palette by the entropy of the multiplicative hash.
+ const uint32_t hash = HashPix(pix);
+ ++histo[kHistoPalette * 256 + hash];
+ }
+ }
+ prev_row = curr_row;
+ curr_row += argb_stride;
+ }
+ {
+ double entropy_comp[kHistoTotal];
+ double entropy[kNumEntropyIx];
+ int k;
+ int last_mode_to_analyze = use_palette ? kPalette : kSpatialSubGreen;
+ int j;
+      // Let's add one zero to the predicted histograms. The zeros are
+      // removed too efficiently by the pix_diff == 0 comparison, yet at
+      // least one zero is likely to exist.
+ ++histo[kHistoRedPredSubGreen * 256];
+ ++histo[kHistoBluePredSubGreen * 256];
+ ++histo[kHistoRedPred * 256];
+ ++histo[kHistoGreenPred * 256];
+ ++histo[kHistoBluePred * 256];
+ ++histo[kHistoAlphaPred * 256];
+
+ for (j = 0; j < kHistoTotal; ++j) {
+ entropy_comp[j] = VP8LBitsEntropy(&histo[j * 256], 256);
+ }
+ entropy[kDirect] = entropy_comp[kHistoAlpha] +
+ entropy_comp[kHistoRed] +
+ entropy_comp[kHistoGreen] +
+ entropy_comp[kHistoBlue];
+ entropy[kSpatial] = entropy_comp[kHistoAlphaPred] +
+ entropy_comp[kHistoRedPred] +
+ entropy_comp[kHistoGreenPred] +
+ entropy_comp[kHistoBluePred];
+ entropy[kSubGreen] = entropy_comp[kHistoAlpha] +
+ entropy_comp[kHistoRedSubGreen] +
+ entropy_comp[kHistoGreen] +
+ entropy_comp[kHistoBlueSubGreen];
+ entropy[kSpatialSubGreen] = entropy_comp[kHistoAlphaPred] +
+ entropy_comp[kHistoRedPredSubGreen] +
+ entropy_comp[kHistoGreenPred] +
+ entropy_comp[kHistoBluePredSubGreen];
+ entropy[kPalette] = entropy_comp[kHistoPalette];
+
+ // When including transforms, there is an overhead in bits from
+ // storing them. This overhead is small but matters for small images.
+ // For spatial, there are 14 transformations.
+ entropy[kSpatial] += VP8LSubSampleSize(width, transform_bits) *
+ VP8LSubSampleSize(height, transform_bits) *
+ VP8LFastLog2(14);
+ // For color transforms: 24 as only 3 channels are considered in a
+ // ColorTransformElement.
+ entropy[kSpatialSubGreen] += VP8LSubSampleSize(width, transform_bits) *
+ VP8LSubSampleSize(height, transform_bits) *
+ VP8LFastLog2(24);
+ // For palettes, add the cost of storing the palette.
+ // We empirically estimate the cost of a compressed entry as 8 bits.
+ // The palette is differential-coded when compressed hence a much
+ // lower cost than sizeof(uint32_t)*8.
+ entropy[kPalette] += palette_size * 8;
+
+ *min_entropy_ix = kDirect;
+ for (k = kDirect + 1; k <= last_mode_to_analyze; ++k) {
+ if (entropy[*min_entropy_ix] > entropy[k]) {
+ *min_entropy_ix = (EntropyIx)k;
+ }
+ }
+ assert((int)*min_entropy_ix <= last_mode_to_analyze);
+ *red_and_blue_always_zero = 1;
+ // Let's check if the histogram of the chosen entropy mode has
+ // non-zero red and blue values. If all are zero, we can later skip
+ // the cross color optimization.
+ {
+ static const uint8_t kHistoPairs[5][2] = {
+ { kHistoRed, kHistoBlue },
+ { kHistoRedPred, kHistoBluePred },
+ { kHistoRedSubGreen, kHistoBlueSubGreen },
+ { kHistoRedPredSubGreen, kHistoBluePredSubGreen },
+ { kHistoRed, kHistoBlue }
+ };
+ const uint32_t* const red_histo =
+ &histo[256 * kHistoPairs[*min_entropy_ix][0]];
+ const uint32_t* const blue_histo =
+ &histo[256 * kHistoPairs[*min_entropy_ix][1]];
+ for (i = 1; i < 256; ++i) {
+ if ((red_histo[i] | blue_histo[i]) != 0) {
+ *red_and_blue_always_zero = 0;
+ break;
+ }
+ }
+ }
+ }
+ WebPSafeFree(histo);
+ return 1;
+ } else {
+ return 0;
+ }
+}
+
+static int GetHistoBits(int method, int use_palette, int width, int height) {
+ // Make tile size a function of encoding method (Range: 0 to 6).
+ int histo_bits = (use_palette ? 9 : 7) - method;
+ while (1) {
+ const int huff_image_size = VP8LSubSampleSize(width, histo_bits) *
+ VP8LSubSampleSize(height, histo_bits);
+ if (huff_image_size <= MAX_HUFF_IMAGE_SIZE) break;
+ ++histo_bits;
+ }
+ return (histo_bits < MIN_HUFFMAN_BITS) ? MIN_HUFFMAN_BITS :
+ (histo_bits > MAX_HUFFMAN_BITS) ? MAX_HUFFMAN_BITS : histo_bits;
+}
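A worked standalone example of the tile-size search above, with VP8LSubSampleSize re-implemented inline and the clamp bounds written as 2 and 9 (assumed values of MIN_HUFFMAN_BITS/MAX_HUFFMAN_BITS):

#include <stdio.h>

/* ceil(size / 2^bits), as VP8LSubSampleSize computes it. */
static int SubSampleSize(int size, int bits) {
  return (size + (1 << bits) - 1) >> bits;
}

static int HistoBits(int method, int use_palette, int w, int h) {
  int bits = (use_palette ? 9 : 7) - method;
  while (SubSampleSize(w, bits) * SubSampleSize(h, bits) > 2600) ++bits;
  return (bits < 2) ? 2 : (bits > 9) ? 9 : bits;  /* assumed clamp bounds */
}

int main(void) {
  /* 1024x1024, method 4, no palette: 3 (16384 tiles) -> 4 (4096) ->
   * 5 (1024 tiles of 32x32, under the 2600 cap). */
  printf("histo_bits = %d\n", HistoBits(4, 0, 1024, 1024));  /* 5 */
  return 0;
}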
+
+static int GetTransformBits(int method, int histo_bits) {
+ const int max_transform_bits = (method < 4) ? 6 : (method > 4) ? 4 : 5;
+ const int res =
+ (histo_bits > max_transform_bits) ? max_transform_bits : histo_bits;
+ assert(res <= MAX_TRANSFORM_BITS);
+ return res;
+}
+
+// Set of parameters to be used in each iteration of the cruncher.
+#define CRUNCH_SUBCONFIGS_MAX 2
+typedef struct {
+ int lz77_;
+ int do_no_cache_;
+} CrunchSubConfig;
+typedef struct {
+ int entropy_idx_;
+ PaletteSorting palette_sorting_type_;
+ CrunchSubConfig sub_configs_[CRUNCH_SUBCONFIGS_MAX];
+ int sub_configs_size_;
+} CrunchConfig;
+
+// +2 because we add a palette sorting configuration for kPalette and
+// kPaletteAndSpatial.
+#define CRUNCH_CONFIGS_MAX (kNumEntropyIx + 2)
+
+static int EncoderAnalyze(VP8LEncoder* const enc,
+ CrunchConfig crunch_configs[CRUNCH_CONFIGS_MAX],
+ int* const crunch_configs_size,
+ int* const red_and_blue_always_zero) {
+ const WebPPicture* const pic = enc->pic_;
+ const int width = pic->width;
+ const int height = pic->height;
+ const WebPConfig* const config = enc->config_;
+ const int method = config->method;
+ const int low_effort = (config->method == 0);
+ int i;
+ int use_palette;
+ int n_lz77s;
+  // If set to 0, analyze only with the computed cache value. If set to 1,
+  // also analyze with no cache.
+ int do_no_cache = 0;
+ assert(pic != NULL && pic->argb != NULL);
+
+ // Check whether a palette is possible.
+ enc->palette_size_ = WebPGetColorPalette(pic, enc->palette_sorted_);
+ use_palette = (enc->palette_size_ <= MAX_PALETTE_SIZE);
+ if (!use_palette) {
+ enc->palette_size_ = 0;
+ } else {
+ qsort(enc->palette_sorted_, enc->palette_size_,
+ sizeof(*enc->palette_sorted_), PaletteCompareColorsForQsort);
+ }
+
+ // Empirical bit sizes.
+ enc->histo_bits_ = GetHistoBits(method, use_palette,
+ pic->width, pic->height);
+ enc->transform_bits_ = GetTransformBits(method, enc->histo_bits_);
+
+ if (low_effort) {
+ // AnalyzeEntropy is somewhat slow.
+ crunch_configs[0].entropy_idx_ = use_palette ? kPalette : kSpatialSubGreen;
+ crunch_configs[0].palette_sorting_type_ =
+ use_palette ? kSortedDefault : kUnusedPalette;
+ n_lz77s = 1;
+ *crunch_configs_size = 1;
+ } else {
+ EntropyIx min_entropy_ix;
+ // Try out multiple LZ77 on images with few colors.
+ n_lz77s = (enc->palette_size_ > 0 && enc->palette_size_ <= 16) ? 2 : 1;
+ if (!AnalyzeEntropy(pic->argb, width, height, pic->argb_stride, use_palette,
+ enc->palette_size_, enc->transform_bits_,
+ &min_entropy_ix, red_and_blue_always_zero)) {
+ return 0;
+ }
+ if (method == 6 && config->quality == 100) {
+ do_no_cache = 1;
+ // Go brute force on all transforms.
+ *crunch_configs_size = 0;
+ for (i = 0; i < kNumEntropyIx; ++i) {
+ // We can only apply kPalette or kPaletteAndSpatial if we can indeed use
+ // a palette.
+ if ((i != kPalette && i != kPaletteAndSpatial) || use_palette) {
+ assert(*crunch_configs_size < CRUNCH_CONFIGS_MAX);
+ crunch_configs[(*crunch_configs_size)].entropy_idx_ = i;
+ if (use_palette && (i == kPalette || i == kPaletteAndSpatial)) {
+ crunch_configs[(*crunch_configs_size)].palette_sorting_type_ =
+ kMinimizeDelta;
+ ++*crunch_configs_size;
+ // Also add modified Zeng's method.
+ crunch_configs[(*crunch_configs_size)].entropy_idx_ = i;
+ crunch_configs[(*crunch_configs_size)].palette_sorting_type_ =
+ kModifiedZeng;
+ } else {
+ crunch_configs[(*crunch_configs_size)].palette_sorting_type_ =
+ kUnusedPalette;
+ }
+ ++*crunch_configs_size;
+ }
+ }
+ } else {
+ // Only choose the guessed best transform.
+ *crunch_configs_size = 1;
+ crunch_configs[0].entropy_idx_ = min_entropy_ix;
+ crunch_configs[0].palette_sorting_type_ =
+ use_palette ? kMinimizeDelta : kUnusedPalette;
+ if (config->quality >= 75 && method == 5) {
+ // Test with and without color cache.
+ do_no_cache = 1;
+ // If we have a palette, also check in combination with spatial.
+ if (min_entropy_ix == kPalette) {
+ *crunch_configs_size = 2;
+ crunch_configs[1].entropy_idx_ = kPaletteAndSpatial;
+ crunch_configs[1].palette_sorting_type_ = kMinimizeDelta;
+ }
+ }
+ }
+ }
+ // Fill in the different LZ77s.
+ assert(n_lz77s <= CRUNCH_SUBCONFIGS_MAX);
+ for (i = 0; i < *crunch_configs_size; ++i) {
+ int j;
+ for (j = 0; j < n_lz77s; ++j) {
+ assert(j < CRUNCH_SUBCONFIGS_MAX);
+ crunch_configs[i].sub_configs_[j].lz77_ =
+ (j == 0) ? kLZ77Standard | kLZ77RLE : kLZ77Box;
+ crunch_configs[i].sub_configs_[j].do_no_cache_ = do_no_cache;
+ }
+ crunch_configs[i].sub_configs_size_ = n_lz77s;
+ }
+ return 1;
+}
+
+static int EncoderInit(VP8LEncoder* const enc) {
+ const WebPPicture* const pic = enc->pic_;
+ const int width = pic->width;
+ const int height = pic->height;
+ const int pix_cnt = width * height;
+ // we round the block size up, so we're guaranteed to have
+ // at most MAX_REFS_BLOCK_PER_IMAGE blocks used:
+ const int refs_block_size = (pix_cnt - 1) / MAX_REFS_BLOCK_PER_IMAGE + 1;
+ int i;
+ if (!VP8LHashChainInit(&enc->hash_chain_, pix_cnt)) return 0;
+
+ for (i = 0; i < 4; ++i) VP8LBackwardRefsInit(&enc->refs_[i], refs_block_size);
+
+ return 1;
+}
+
+// Returns false in case of memory error.
+static int GetHuffBitLengthsAndCodes(
+ const VP8LHistogramSet* const histogram_image,
+ HuffmanTreeCode* const huffman_codes) {
+ int i, k;
+ int ok = 0;
+ uint64_t total_length_size = 0;
+ uint8_t* mem_buf = NULL;
+ const int histogram_image_size = histogram_image->size;
+ int max_num_symbols = 0;
+ uint8_t* buf_rle = NULL;
+ HuffmanTree* huff_tree = NULL;
+
+ // Iterate over all histograms and get the aggregate number of codes used.
+ for (i = 0; i < histogram_image_size; ++i) {
+ const VP8LHistogram* const histo = histogram_image->histograms[i];
+ HuffmanTreeCode* const codes = &huffman_codes[5 * i];
+ assert(histo != NULL);
+ for (k = 0; k < 5; ++k) {
+ const int num_symbols =
+ (k == 0) ? VP8LHistogramNumCodes(histo->palette_code_bits_) :
+ (k == 4) ? NUM_DISTANCE_CODES : 256;
+ codes[k].num_symbols = num_symbols;
+ total_length_size += num_symbols;
+ }
+ }
+
+  // Allocate and set Huffman codes.
+ {
+ uint16_t* codes;
+ uint8_t* lengths;
+ mem_buf = (uint8_t*)WebPSafeCalloc(total_length_size,
+ sizeof(*lengths) + sizeof(*codes));
+ if (mem_buf == NULL) goto End;
+
+ codes = (uint16_t*)mem_buf;
+ lengths = (uint8_t*)&codes[total_length_size];
+ for (i = 0; i < 5 * histogram_image_size; ++i) {
+ const int bit_length = huffman_codes[i].num_symbols;
+ huffman_codes[i].codes = codes;
+ huffman_codes[i].code_lengths = lengths;
+ codes += bit_length;
+ lengths += bit_length;
+ if (max_num_symbols < bit_length) {
+ max_num_symbols = bit_length;
+ }
+ }
+ }
+
+ buf_rle = (uint8_t*)WebPSafeMalloc(1ULL, max_num_symbols);
+ huff_tree = (HuffmanTree*)WebPSafeMalloc(3ULL * max_num_symbols,
+ sizeof(*huff_tree));
+ if (buf_rle == NULL || huff_tree == NULL) goto End;
+
+ // Create Huffman trees.
+ for (i = 0; i < histogram_image_size; ++i) {
+ HuffmanTreeCode* const codes = &huffman_codes[5 * i];
+ VP8LHistogram* const histo = histogram_image->histograms[i];
+ VP8LCreateHuffmanTree(histo->literal_, 15, buf_rle, huff_tree, codes + 0);
+ VP8LCreateHuffmanTree(histo->red_, 15, buf_rle, huff_tree, codes + 1);
+ VP8LCreateHuffmanTree(histo->blue_, 15, buf_rle, huff_tree, codes + 2);
+ VP8LCreateHuffmanTree(histo->alpha_, 15, buf_rle, huff_tree, codes + 3);
+ VP8LCreateHuffmanTree(histo->distance_, 15, buf_rle, huff_tree, codes + 4);
+ }
+ ok = 1;
+ End:
+ WebPSafeFree(huff_tree);
+ WebPSafeFree(buf_rle);
+ if (!ok) {
+ WebPSafeFree(mem_buf);
+ memset(huffman_codes, 0, 5 * histogram_image_size * sizeof(*huffman_codes));
+ }
+ return ok;
+}
+
+static void StoreHuffmanTreeOfHuffmanTreeToBitMask(
+ VP8LBitWriter* const bw, const uint8_t* code_length_bitdepth) {
+ // RFC 1951 will calm you down if you are worried about this funny sequence.
+ // This sequence is tuned from that, but more weighted for lower symbol count,
+ // and more spiking histograms.
+ static const uint8_t kStorageOrder[CODE_LENGTH_CODES] = {
+ 17, 18, 0, 1, 2, 3, 4, 5, 16, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ };
+ int i;
+ // Throw away trailing zeros:
+ int codes_to_store = CODE_LENGTH_CODES;
+ for (; codes_to_store > 4; --codes_to_store) {
+ if (code_length_bitdepth[kStorageOrder[codes_to_store - 1]] != 0) {
+ break;
+ }
+ }
+ VP8LPutBits(bw, codes_to_store - 4, 4);
+ for (i = 0; i < codes_to_store; ++i) {
+ VP8LPutBits(bw, code_length_bitdepth[kStorageOrder[i]], 3);
+ }
+}
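A standalone sketch of the trailing-zero trimming above, using toy code-length depths:

#include <stdio.h>

/* Depths are emitted in kStorageOrder; trailing zero entries are dropped,
 * down to a floor of 4 stored codes. Toy depths, not real data. */
int main(void) {
  static const unsigned char kOrder[19] = {
    17, 18, 0, 1, 2, 3, 4, 5, 16, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
  };
  unsigned char depth[19] = { 0 };
  int codes_to_store = 19;
  depth[0] = 3; depth[1] = 3; depth[2] = 2;  /* only symbols 0..2 used */
  for (; codes_to_store > 4; --codes_to_store) {
    if (depth[kOrder[codes_to_store - 1]] != 0) break;
  }
  /* 5: the stored order entries are 17, 18, 0, 1, 2. */
  printf("codes_to_store = %d\n", codes_to_store);
  return 0;
}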
+
+static void ClearHuffmanTreeIfOnlyOneSymbol(
+ HuffmanTreeCode* const huffman_code) {
+ int k;
+ int count = 0;
+ for (k = 0; k < huffman_code->num_symbols; ++k) {
+ if (huffman_code->code_lengths[k] != 0) {
+ ++count;
+ if (count > 1) return;
+ }
+ }
+ for (k = 0; k < huffman_code->num_symbols; ++k) {
+ huffman_code->code_lengths[k] = 0;
+ huffman_code->codes[k] = 0;
+ }
+}
+
+static void StoreHuffmanTreeToBitMask(
+ VP8LBitWriter* const bw,
+ const HuffmanTreeToken* const tokens, const int num_tokens,
+ const HuffmanTreeCode* const huffman_code) {
+ int i;
+ for (i = 0; i < num_tokens; ++i) {
+ const int ix = tokens[i].code;
+ const int extra_bits = tokens[i].extra_bits;
+ VP8LPutBits(bw, huffman_code->codes[ix], huffman_code->code_lengths[ix]);
+ switch (ix) {
+ case 16:
+ VP8LPutBits(bw, extra_bits, 2);
+ break;
+ case 17:
+ VP8LPutBits(bw, extra_bits, 3);
+ break;
+ case 18:
+ VP8LPutBits(bw, extra_bits, 7);
+ break;
+ }
+ }
+}
+
+// 'huff_tree' and 'tokens' are pre-allocated buffers.
+static void StoreFullHuffmanCode(VP8LBitWriter* const bw,
+ HuffmanTree* const huff_tree,
+ HuffmanTreeToken* const tokens,
+ const HuffmanTreeCode* const tree) {
+ uint8_t code_length_bitdepth[CODE_LENGTH_CODES] = { 0 };
+ uint16_t code_length_bitdepth_symbols[CODE_LENGTH_CODES] = { 0 };
+ const int max_tokens = tree->num_symbols;
+ int num_tokens;
+ HuffmanTreeCode huffman_code;
+ huffman_code.num_symbols = CODE_LENGTH_CODES;
+ huffman_code.code_lengths = code_length_bitdepth;
+ huffman_code.codes = code_length_bitdepth_symbols;
+
+ VP8LPutBits(bw, 0, 1);
+ num_tokens = VP8LCreateCompressedHuffmanTree(tree, tokens, max_tokens);
+ {
+ uint32_t histogram[CODE_LENGTH_CODES] = { 0 };
+ uint8_t buf_rle[CODE_LENGTH_CODES] = { 0 };
+ int i;
+ for (i = 0; i < num_tokens; ++i) {
+ ++histogram[tokens[i].code];
+ }
+
+ VP8LCreateHuffmanTree(histogram, 7, buf_rle, huff_tree, &huffman_code);
+ }
+
+ StoreHuffmanTreeOfHuffmanTreeToBitMask(bw, code_length_bitdepth);
+ ClearHuffmanTreeIfOnlyOneSymbol(&huffman_code);
+ {
+ int trailing_zero_bits = 0;
+ int trimmed_length = num_tokens;
+ int write_trimmed_length;
+ int length;
+ int i = num_tokens;
+ while (i-- > 0) {
+ const int ix = tokens[i].code;
+ if (ix == 0 || ix == 17 || ix == 18) {
+ --trimmed_length; // discount trailing zeros
+ trailing_zero_bits += code_length_bitdepth[ix];
+ if (ix == 17) {
+ trailing_zero_bits += 3;
+ } else if (ix == 18) {
+ trailing_zero_bits += 7;
+ }
+ } else {
+ break;
+ }
+ }
+ write_trimmed_length = (trimmed_length > 1 && trailing_zero_bits > 12);
+ length = write_trimmed_length ? trimmed_length : num_tokens;
+ VP8LPutBits(bw, write_trimmed_length, 1);
+ if (write_trimmed_length) {
+ if (trimmed_length == 2) {
+ VP8LPutBits(bw, 0, 3 + 2); // nbitpairs=1, trimmed_length=2
+ } else {
+ const int nbits = BitsLog2Floor(trimmed_length - 2);
+ const int nbitpairs = nbits / 2 + 1;
+ assert(trimmed_length > 2);
+ assert(nbitpairs - 1 < 8);
+ VP8LPutBits(bw, nbitpairs - 1, 3);
+ VP8LPutBits(bw, trimmed_length - 2, nbitpairs * 2);
+ }
+ }
+ StoreHuffmanTreeToBitMask(bw, tokens, length, &huffman_code);
+ }
+}
+
+// 'huff_tree' and 'tokens' are pre-allocated buffers.
+static void StoreHuffmanCode(VP8LBitWriter* const bw,
+ HuffmanTree* const huff_tree,
+ HuffmanTreeToken* const tokens,
+ const HuffmanTreeCode* const huffman_code) {
+ int i;
+ int count = 0;
+ int symbols[2] = { 0, 0 };
+ const int kMaxBits = 8;
+ const int kMaxSymbol = 1 << kMaxBits;
+
+ // Check whether it's a small tree.
+ for (i = 0; i < huffman_code->num_symbols && count < 3; ++i) {
+ if (huffman_code->code_lengths[i] != 0) {
+ if (count < 2) symbols[count] = i;
+ ++count;
+ }
+ }
+
+ if (count == 0) { // emit minimal tree for empty cases
+ // bits: small tree marker: 1, count-1: 0, large 8-bit code: 0, code: 0
+ VP8LPutBits(bw, 0x01, 4);
+ } else if (count <= 2 && symbols[0] < kMaxSymbol && symbols[1] < kMaxSymbol) {
+ VP8LPutBits(bw, 1, 1); // Small tree marker to encode 1 or 2 symbols.
+ VP8LPutBits(bw, count - 1, 1);
+ if (symbols[0] <= 1) {
+ VP8LPutBits(bw, 0, 1); // Code bit for small (1 bit) symbol value.
+ VP8LPutBits(bw, symbols[0], 1);
+ } else {
+ VP8LPutBits(bw, 1, 1);
+ VP8LPutBits(bw, symbols[0], 8);
+ }
+ if (count == 2) {
+ VP8LPutBits(bw, symbols[1], 8);
+ }
+ } else {
+ StoreFullHuffmanCode(bw, huff_tree, tokens, huffman_code);
+ }
+}
+
+static WEBP_INLINE void WriteHuffmanCode(VP8LBitWriter* const bw,
+ const HuffmanTreeCode* const code,
+ int code_index) {
+ const int depth = code->code_lengths[code_index];
+ const int symbol = code->codes[code_index];
+ VP8LPutBits(bw, symbol, depth);
+}
+
+static WEBP_INLINE void WriteHuffmanCodeWithExtraBits(
+ VP8LBitWriter* const bw,
+ const HuffmanTreeCode* const code,
+ int code_index,
+ int bits,
+ int n_bits) {
+ const int depth = code->code_lengths[code_index];
+ const int symbol = code->codes[code_index];
+ VP8LPutBits(bw, (bits << depth) | symbol, depth + n_bits);
+}
+
+static WebPEncodingError StoreImageToBitMask(
+ VP8LBitWriter* const bw, int width, int histo_bits,
+ const VP8LBackwardRefs* const refs,
+ const uint16_t* histogram_symbols,
+ const HuffmanTreeCode* const huffman_codes) {
+ const int histo_xsize = histo_bits ? VP8LSubSampleSize(width, histo_bits) : 1;
+ const int tile_mask = (histo_bits == 0) ? 0 : -(1 << histo_bits);
+ // x and y trace the position in the image.
+ int x = 0;
+ int y = 0;
+ int tile_x = x & tile_mask;
+ int tile_y = y & tile_mask;
+ int histogram_ix = histogram_symbols[0];
+ const HuffmanTreeCode* codes = huffman_codes + 5 * histogram_ix;
+ VP8LRefsCursor c = VP8LRefsCursorInit(refs);
+ while (VP8LRefsCursorOk(&c)) {
+ const PixOrCopy* const v = c.cur_pos;
+ if ((tile_x != (x & tile_mask)) || (tile_y != (y & tile_mask))) {
+ tile_x = x & tile_mask;
+ tile_y = y & tile_mask;
+ histogram_ix = histogram_symbols[(y >> histo_bits) * histo_xsize +
+ (x >> histo_bits)];
+ codes = huffman_codes + 5 * histogram_ix;
+ }
+ if (PixOrCopyIsLiteral(v)) {
+ static const uint8_t order[] = { 1, 2, 0, 3 };
+ int k;
+ for (k = 0; k < 4; ++k) {
+ const int code = PixOrCopyLiteral(v, order[k]);
+ WriteHuffmanCode(bw, codes + k, code);
+ }
+ } else if (PixOrCopyIsCacheIdx(v)) {
+ const int code = PixOrCopyCacheIdx(v);
+ const int literal_ix = 256 + NUM_LENGTH_CODES + code;
+ WriteHuffmanCode(bw, codes, literal_ix);
+ } else {
+ int bits, n_bits;
+ int code;
+
+ const int distance = PixOrCopyDistance(v);
+ VP8LPrefixEncode(v->len, &code, &n_bits, &bits);
+ WriteHuffmanCodeWithExtraBits(bw, codes, 256 + code, bits, n_bits);
+
+      // Don't write the distance with the extra-bits code, since the
+      // distance can take up to 18 extra bits and the prefix up to 15 bits,
+      // totaling 33, while our PutBits only supports up to 32 bits.
+ VP8LPrefixEncode(distance, &code, &n_bits, &bits);
+ WriteHuffmanCode(bw, codes + 4, code);
+ VP8LPutBits(bw, bits, n_bits);
+ }
+ x += PixOrCopyLength(v);
+ while (x >= width) {
+ x -= width;
+ ++y;
+ }
+ VP8LRefsCursorNext(&c);
+ }
+ return bw->error_ ? VP8_ENC_ERROR_OUT_OF_MEMORY : VP8_ENC_OK;
+}
+
+// Special case of EncodeImageInternal() for cache-bits=0, histo_bits=31
+static WebPEncodingError EncodeImageNoHuffman(
+ VP8LBitWriter* const bw, const uint32_t* const argb,
+ VP8LHashChain* const hash_chain, VP8LBackwardRefs* const refs_array,
+ int width, int height, int quality, int low_effort) {
+ int i;
+ int max_tokens = 0;
+ WebPEncodingError err = VP8_ENC_OK;
+ VP8LBackwardRefs* refs;
+ HuffmanTreeToken* tokens = NULL;
+ HuffmanTreeCode huffman_codes[5] = { { 0, NULL, NULL } };
+ const uint16_t histogram_symbols[1] = { 0 }; // only one tree, one symbol
+ int cache_bits = 0;
+ VP8LHistogramSet* histogram_image = NULL;
+ HuffmanTree* const huff_tree = (HuffmanTree*)WebPSafeMalloc(
+ 3ULL * CODE_LENGTH_CODES, sizeof(*huff_tree));
+ if (huff_tree == NULL) {
+ err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+ goto Error;
+ }
+
+ // Calculate backward references from ARGB image.
+ if (!VP8LHashChainFill(hash_chain, quality, argb, width, height,
+ low_effort)) {
+ err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+ goto Error;
+ }
+ err = VP8LGetBackwardReferences(
+ width, height, argb, quality, /*low_effort=*/0, kLZ77Standard | kLZ77RLE,
+ cache_bits, /*do_no_cache=*/0, hash_chain, refs_array, &cache_bits);
+ if (err != VP8_ENC_OK) goto Error;
+ refs = &refs_array[0];
+ histogram_image = VP8LAllocateHistogramSet(1, cache_bits);
+ if (histogram_image == NULL) {
+ err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+ goto Error;
+ }
+ VP8LHistogramSetClear(histogram_image);
+
+ // Build histogram image and symbols from backward references.
+ VP8LHistogramStoreRefs(refs, histogram_image->histograms[0]);
+
+ // Create Huffman bit lengths and codes for each histogram image.
+ assert(histogram_image->size == 1);
+ if (!GetHuffBitLengthsAndCodes(histogram_image, huffman_codes)) {
+ err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+ goto Error;
+ }
+
+ // No color cache, no Huffman image.
+ VP8LPutBits(bw, 0, 1);
+
+ // Find maximum number of symbols for the huffman tree-set.
+ for (i = 0; i < 5; ++i) {
+ HuffmanTreeCode* const codes = &huffman_codes[i];
+ if (max_tokens < codes->num_symbols) {
+ max_tokens = codes->num_symbols;
+ }
+ }
+
+ tokens = (HuffmanTreeToken*)WebPSafeMalloc(max_tokens, sizeof(*tokens));
+ if (tokens == NULL) {
+ err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+ goto Error;
+ }
+
+ // Store Huffman codes.
+ for (i = 0; i < 5; ++i) {
+ HuffmanTreeCode* const codes = &huffman_codes[i];
+ StoreHuffmanCode(bw, huff_tree, tokens, codes);
+ ClearHuffmanTreeIfOnlyOneSymbol(codes);
+ }
+
+ // Store actual literals.
+ err = StoreImageToBitMask(bw, width, 0, refs, histogram_symbols,
+ huffman_codes);
+
+ Error:
+ WebPSafeFree(tokens);
+ WebPSafeFree(huff_tree);
+ VP8LFreeHistogramSet(histogram_image);
+ WebPSafeFree(huffman_codes[0].codes);
+ return err;
+}
+
+static WebPEncodingError EncodeImageInternal(
+ VP8LBitWriter* const bw, const uint32_t* const argb,
+ VP8LHashChain* const hash_chain, VP8LBackwardRefs refs_array[4], int width,
+ int height, int quality, int low_effort, int use_cache,
+ const CrunchConfig* const config, int* cache_bits, int histogram_bits,
+ size_t init_byte_position, int* const hdr_size, int* const data_size) {
+ WebPEncodingError err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+ const uint32_t histogram_image_xysize =
+ VP8LSubSampleSize(width, histogram_bits) *
+ VP8LSubSampleSize(height, histogram_bits);
+ VP8LHistogramSet* histogram_image = NULL;
+ VP8LHistogram* tmp_histo = NULL;
+ int histogram_image_size = 0;
+ size_t bit_array_size = 0;
+ HuffmanTree* const huff_tree = (HuffmanTree*)WebPSafeMalloc(
+ 3ULL * CODE_LENGTH_CODES, sizeof(*huff_tree));
+ HuffmanTreeToken* tokens = NULL;
+ HuffmanTreeCode* huffman_codes = NULL;
+ uint16_t* const histogram_symbols =
+ (uint16_t*)WebPSafeMalloc(histogram_image_xysize,
+ sizeof(*histogram_symbols));
+ int sub_configs_idx;
+ int cache_bits_init, write_histogram_image;
+ VP8LBitWriter bw_init = *bw, bw_best;
+ int hdr_size_tmp;
+ VP8LHashChain hash_chain_histogram; // histogram image hash chain
+ size_t bw_size_best = ~(size_t)0;
+ assert(histogram_bits >= MIN_HUFFMAN_BITS);
+ assert(histogram_bits <= MAX_HUFFMAN_BITS);
+ assert(hdr_size != NULL);
+ assert(data_size != NULL);
+
+ // Make sure we can allocate the different objects.
+ memset(&hash_chain_histogram, 0, sizeof(hash_chain_histogram));
+ if (huff_tree == NULL || histogram_symbols == NULL ||
+ !VP8LHashChainInit(&hash_chain_histogram, histogram_image_xysize) ||
+ !VP8LHashChainFill(hash_chain, quality, argb, width, height,
+ low_effort)) {
+ goto Error;
+ }
+ if (use_cache) {
+ // If the value is different from zero, it has been set during the
+ // palette analysis.
+ cache_bits_init = (*cache_bits == 0) ? MAX_COLOR_CACHE_BITS : *cache_bits;
+ } else {
+ cache_bits_init = 0;
+ }
+ // If several iterations will happen, clone into bw_best.
+ if (!VP8LBitWriterInit(&bw_best, 0) ||
+ ((config->sub_configs_size_ > 1 ||
+ config->sub_configs_[0].do_no_cache_) &&
+ !VP8LBitWriterClone(bw, &bw_best))) {
+ goto Error;
+ }
+ for (sub_configs_idx = 0; sub_configs_idx < config->sub_configs_size_;
+ ++sub_configs_idx) {
+ const CrunchSubConfig* const sub_config =
+ &config->sub_configs_[sub_configs_idx];
+ int cache_bits_best, i_cache;
+ err = VP8LGetBackwardReferences(width, height, argb, quality, low_effort,
+ sub_config->lz77_, cache_bits_init,
+ sub_config->do_no_cache_, hash_chain,
+ &refs_array[0], &cache_bits_best);
+ if (err != VP8_ENC_OK) goto Error;
+
+ for (i_cache = 0; i_cache < (sub_config->do_no_cache_ ? 2 : 1); ++i_cache) {
+ const int cache_bits_tmp = (i_cache == 0) ? cache_bits_best : 0;
+ // Speed-up: no need to study the no-cache case if it was already studied
+ // in i_cache == 0.
+ if (i_cache == 1 && cache_bits_best == 0) break;
+
+ // Reset the bit writer for this iteration.
+ VP8LBitWriterReset(&bw_init, bw);
+
+ // Build histogram image and symbols from backward references.
+ histogram_image =
+ VP8LAllocateHistogramSet(histogram_image_xysize, cache_bits_tmp);
+ tmp_histo = VP8LAllocateHistogram(cache_bits_tmp);
+ if (histogram_image == NULL || tmp_histo == NULL ||
+ !VP8LGetHistoImageSymbols(width, height, &refs_array[i_cache],
+ quality, low_effort, histogram_bits,
+ cache_bits_tmp, histogram_image, tmp_histo,
+ histogram_symbols)) {
+ goto Error;
+ }
+ // Create Huffman bit lengths and codes for each histogram image.
+ histogram_image_size = histogram_image->size;
+ bit_array_size = 5 * histogram_image_size;
+ huffman_codes = (HuffmanTreeCode*)WebPSafeCalloc(bit_array_size,
+ sizeof(*huffman_codes));
+      // Note: some histogram_image entries may point to tmp_histo, so the
+      // latter needs to outlive the following call to
+      // GetHuffBitLengthsAndCodes().
+ if (huffman_codes == NULL ||
+ !GetHuffBitLengthsAndCodes(histogram_image, huffman_codes)) {
+ goto Error;
+ }
+ // Free combined histograms.
+ VP8LFreeHistogramSet(histogram_image);
+ histogram_image = NULL;
+
+ // Free scratch histograms.
+ VP8LFreeHistogram(tmp_histo);
+ tmp_histo = NULL;
+
+ // Color Cache parameters.
+ if (cache_bits_tmp > 0) {
+ VP8LPutBits(bw, 1, 1);
+ VP8LPutBits(bw, cache_bits_tmp, 4);
+ } else {
+ VP8LPutBits(bw, 0, 1);
+ }
+
+ // Huffman image + meta huffman.
+ write_histogram_image = (histogram_image_size > 1);
+ VP8LPutBits(bw, write_histogram_image, 1);
+ if (write_histogram_image) {
+ uint32_t* const histogram_argb =
+ (uint32_t*)WebPSafeMalloc(histogram_image_xysize,
+ sizeof(*histogram_argb));
+ int max_index = 0;
+ uint32_t i;
+ if (histogram_argb == NULL) goto Error;
+ for (i = 0; i < histogram_image_xysize; ++i) {
+ const int symbol_index = histogram_symbols[i] & 0xffff;
+ histogram_argb[i] = (symbol_index << 8);
+ if (symbol_index >= max_index) {
+ max_index = symbol_index + 1;
+ }
+ }
+ histogram_image_size = max_index;
+
+ VP8LPutBits(bw, histogram_bits - 2, 3);
+ err = EncodeImageNoHuffman(
+ bw, histogram_argb, &hash_chain_histogram, &refs_array[2],
+ VP8LSubSampleSize(width, histogram_bits),
+ VP8LSubSampleSize(height, histogram_bits), quality, low_effort);
+ WebPSafeFree(histogram_argb);
+ if (err != VP8_ENC_OK) goto Error;
+ }
+
+ // Store Huffman codes.
+ {
+ int i;
+ int max_tokens = 0;
+ // Find maximum number of symbols for the huffman tree-set.
+ for (i = 0; i < 5 * histogram_image_size; ++i) {
+ HuffmanTreeCode* const codes = &huffman_codes[i];
+ if (max_tokens < codes->num_symbols) {
+ max_tokens = codes->num_symbols;
+ }
+ }
+ tokens = (HuffmanTreeToken*)WebPSafeMalloc(max_tokens, sizeof(*tokens));
+ if (tokens == NULL) goto Error;
+ for (i = 0; i < 5 * histogram_image_size; ++i) {
+ HuffmanTreeCode* const codes = &huffman_codes[i];
+ StoreHuffmanCode(bw, huff_tree, tokens, codes);
+ ClearHuffmanTreeIfOnlyOneSymbol(codes);
+ }
+ }
+ // Store actual literals.
+ hdr_size_tmp = (int)(VP8LBitWriterNumBytes(bw) - init_byte_position);
+ err = StoreImageToBitMask(bw, width, histogram_bits, &refs_array[i_cache],
+ histogram_symbols, huffman_codes);
+ if (err != VP8_ENC_OK) goto Error;
+ // Keep track of the smallest image so far.
+ if (VP8LBitWriterNumBytes(bw) < bw_size_best) {
+ bw_size_best = VP8LBitWriterNumBytes(bw);
+ *cache_bits = cache_bits_tmp;
+ *hdr_size = hdr_size_tmp;
+ *data_size =
+ (int)(VP8LBitWriterNumBytes(bw) - init_byte_position - *hdr_size);
+ VP8LBitWriterSwap(bw, &bw_best);
+ }
+ WebPSafeFree(tokens);
+ tokens = NULL;
+ if (huffman_codes != NULL) {
+ WebPSafeFree(huffman_codes->codes);
+ WebPSafeFree(huffman_codes);
+ huffman_codes = NULL;
+ }
+ }
+ }
+ VP8LBitWriterSwap(bw, &bw_best);
+ err = VP8_ENC_OK;
+
+ Error:
+ WebPSafeFree(tokens);
+ WebPSafeFree(huff_tree);
+ VP8LFreeHistogramSet(histogram_image);
+ VP8LFreeHistogram(tmp_histo);
+ VP8LHashChainClear(&hash_chain_histogram);
+ if (huffman_codes != NULL) {
+ WebPSafeFree(huffman_codes->codes);
+ WebPSafeFree(huffman_codes);
+ }
+ WebPSafeFree(histogram_symbols);
+ VP8LBitWriterWipeOut(&bw_best);
+ return err;
+}
+
+// -----------------------------------------------------------------------------
+// Transforms
+
+static void ApplySubtractGreen(VP8LEncoder* const enc, int width, int height,
+ VP8LBitWriter* const bw) {
+ VP8LPutBits(bw, TRANSFORM_PRESENT, 1);
+ VP8LPutBits(bw, SUBTRACT_GREEN, 2);
+ VP8LSubtractGreenFromBlueAndRed(enc->argb_, width * height);
+}
+
+static WebPEncodingError ApplyPredictFilter(const VP8LEncoder* const enc,
+ int width, int height,
+ int quality, int low_effort,
+ int used_subtract_green,
+ VP8LBitWriter* const bw) {
+ const int pred_bits = enc->transform_bits_;
+ const int transform_width = VP8LSubSampleSize(width, pred_bits);
+ const int transform_height = VP8LSubSampleSize(height, pred_bits);
+ // we disable near-lossless quantization if palette is used.
+ const int near_lossless_strength = enc->use_palette_ ? 100
+ : enc->config_->near_lossless;
+
+ VP8LResidualImage(width, height, pred_bits, low_effort, enc->argb_,
+ enc->argb_scratch_, enc->transform_data_,
+ near_lossless_strength, enc->config_->exact,
+ used_subtract_green);
+ VP8LPutBits(bw, TRANSFORM_PRESENT, 1);
+ VP8LPutBits(bw, PREDICTOR_TRANSFORM, 2);
+ assert(pred_bits >= 2);
+ VP8LPutBits(bw, pred_bits - 2, 3);
+ return EncodeImageNoHuffman(
+ bw, enc->transform_data_, (VP8LHashChain*)&enc->hash_chain_,
+ (VP8LBackwardRefs*)&enc->refs_[0], transform_width, transform_height,
+ quality, low_effort);
+}
+
+static WebPEncodingError ApplyCrossColorFilter(const VP8LEncoder* const enc,
+ int width, int height,
+ int quality, int low_effort,
+ VP8LBitWriter* const bw) {
+ const int ccolor_transform_bits = enc->transform_bits_;
+ const int transform_width = VP8LSubSampleSize(width, ccolor_transform_bits);
+ const int transform_height = VP8LSubSampleSize(height, ccolor_transform_bits);
+
+ VP8LColorSpaceTransform(width, height, ccolor_transform_bits, quality,
+ enc->argb_, enc->transform_data_);
+ VP8LPutBits(bw, TRANSFORM_PRESENT, 1);
+ VP8LPutBits(bw, CROSS_COLOR_TRANSFORM, 2);
+ assert(ccolor_transform_bits >= 2);
+ VP8LPutBits(bw, ccolor_transform_bits - 2, 3);
+ return EncodeImageNoHuffman(
+ bw, enc->transform_data_, (VP8LHashChain*)&enc->hash_chain_,
+ (VP8LBackwardRefs*)&enc->refs_[0], transform_width, transform_height,
+ quality, low_effort);
+}
+
+// -----------------------------------------------------------------------------
+
+static WebPEncodingError WriteRiffHeader(const WebPPicture* const pic,
+ size_t riff_size, size_t vp8l_size) {
+ uint8_t riff[RIFF_HEADER_SIZE + CHUNK_HEADER_SIZE + VP8L_SIGNATURE_SIZE] = {
+ 'R', 'I', 'F', 'F', 0, 0, 0, 0, 'W', 'E', 'B', 'P',
+ 'V', 'P', '8', 'L', 0, 0, 0, 0, VP8L_MAGIC_BYTE,
+ };
+ PutLE32(riff + TAG_SIZE, (uint32_t)riff_size);
+ PutLE32(riff + RIFF_HEADER_SIZE + TAG_SIZE, (uint32_t)vp8l_size);
+ if (!pic->writer(riff, sizeof(riff), pic)) {
+ return VP8_ENC_ERROR_BAD_WRITE;
+ }
+ return VP8_ENC_OK;
+}
+
+static int WriteImageSize(const WebPPicture* const pic,
+ VP8LBitWriter* const bw) {
+ const int width = pic->width - 1;
+ const int height = pic->height - 1;
+ assert(width < WEBP_MAX_DIMENSION && height < WEBP_MAX_DIMENSION);
+
+ VP8LPutBits(bw, width, VP8L_IMAGE_SIZE_BITS);
+ VP8LPutBits(bw, height, VP8L_IMAGE_SIZE_BITS);
+ return !bw->error_;
+}
+
+static int WriteRealAlphaAndVersion(VP8LBitWriter* const bw, int has_alpha) {
+ VP8LPutBits(bw, has_alpha, 1);
+ VP8LPutBits(bw, VP8L_VERSION, VP8L_VERSION_BITS);
+ return !bw->error_;
+}
+
+static WebPEncodingError WriteImage(const WebPPicture* const pic,
+ VP8LBitWriter* const bw,
+ size_t* const coded_size) {
+ WebPEncodingError err = VP8_ENC_OK;
+ const uint8_t* const webpll_data = VP8LBitWriterFinish(bw);
+ const size_t webpll_size = VP8LBitWriterNumBytes(bw);
+ const size_t vp8l_size = VP8L_SIGNATURE_SIZE + webpll_size;
+ const size_t pad = vp8l_size & 1;
+ const size_t riff_size = TAG_SIZE + CHUNK_HEADER_SIZE + vp8l_size + pad;
+
+ err = WriteRiffHeader(pic, riff_size, vp8l_size);
+ if (err != VP8_ENC_OK) goto Error;
+
+ if (!pic->writer(webpll_data, webpll_size, pic)) {
+ err = VP8_ENC_ERROR_BAD_WRITE;
+ goto Error;
+ }
+
+ if (pad) {
+ const uint8_t pad_byte[1] = { 0 };
+ if (!pic->writer(pad_byte, 1, pic)) {
+ err = VP8_ENC_ERROR_BAD_WRITE;
+ goto Error;
+ }
+ }
+ *coded_size = CHUNK_HEADER_SIZE + riff_size;
+ return VP8_ENC_OK;
+
+ Error:
+ return err;
+}
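The size fields above follow the usual RIFF accounting; a standalone sketch with the header constants written out literally (TAG_SIZE 4, CHUNK_HEADER_SIZE 8, VP8L_SIGNATURE_SIZE 1, assumed values) and a hypothetical payload size:

#include <stddef.h>
#include <stdio.h>

int main(void) {
  const size_t webpll_size = 12344;          /* hypothetical payload */
  const size_t vp8l_size = 1 + webpll_size;  /* + one signature byte */
  const size_t pad = vp8l_size & 1;          /* chunks are 2-byte aligned */
  const size_t riff_size = 4 + 8 + vp8l_size + pad;
  /* coded size adds the outer 'RIFF' tag + length field (8 bytes). */
  printf("vp8l=%zu pad=%zu riff=%zu coded=%zu\n",
         vp8l_size, pad, riff_size, 8 + riff_size);
  return 0;
}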
+
+// -----------------------------------------------------------------------------
+
+static void ClearTransformBuffer(VP8LEncoder* const enc) {
+ WebPSafeFree(enc->transform_mem_);
+ enc->transform_mem_ = NULL;
+ enc->transform_mem_size_ = 0;
+}
+
+// Allocates the memory for argb (W x H) buffer, 2 rows of context for
+// prediction and transform data.
+// Flags influencing the memory allocated:
+// enc->transform_bits_
+// enc->use_predict_, enc->use_cross_color_
+static WebPEncodingError AllocateTransformBuffer(VP8LEncoder* const enc,
+ int width, int height) {
+ WebPEncodingError err = VP8_ENC_OK;
+ const uint64_t image_size = width * height;
+ // VP8LResidualImage needs room for 2 scanlines of uint32 pixels with an extra
+ // pixel in each, plus 2 regular scanlines of bytes.
+ // TODO(skal): Clean up by using arithmetic in bytes instead of words.
+ const uint64_t argb_scratch_size =
+ enc->use_predict_
+ ? (width + 1) * 2 +
+ (width * 2 + sizeof(uint32_t) - 1) / sizeof(uint32_t)
+ : 0;
+ const uint64_t transform_data_size =
+ (enc->use_predict_ || enc->use_cross_color_)
+ ? VP8LSubSampleSize(width, enc->transform_bits_) *
+ VP8LSubSampleSize(height, enc->transform_bits_)
+ : 0;
+ const uint64_t max_alignment_in_words =
+ (WEBP_ALIGN_CST + sizeof(uint32_t) - 1) / sizeof(uint32_t);
+ const uint64_t mem_size =
+ image_size + max_alignment_in_words +
+ argb_scratch_size + max_alignment_in_words +
+ transform_data_size;
+ uint32_t* mem = enc->transform_mem_;
+ if (mem == NULL || mem_size > enc->transform_mem_size_) {
+ ClearTransformBuffer(enc);
+ mem = (uint32_t*)WebPSafeMalloc(mem_size, sizeof(*mem));
+ if (mem == NULL) {
+ err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+ goto Error;
+ }
+ enc->transform_mem_ = mem;
+ enc->transform_mem_size_ = (size_t)mem_size;
+ enc->argb_content_ = kEncoderNone;
+ }
+ enc->argb_ = mem;
+ mem = (uint32_t*)WEBP_ALIGN(mem + image_size);
+ enc->argb_scratch_ = mem;
+ mem = (uint32_t*)WEBP_ALIGN(mem + argb_scratch_size);
+ enc->transform_data_ = mem;
+
+ enc->current_width_ = width;
+ Error:
+ return err;
+}
+
+static WebPEncodingError MakeInputImageCopy(VP8LEncoder* const enc) {
+ WebPEncodingError err = VP8_ENC_OK;
+ const WebPPicture* const picture = enc->pic_;
+ const int width = picture->width;
+ const int height = picture->height;
+
+ err = AllocateTransformBuffer(enc, width, height);
+ if (err != VP8_ENC_OK) return err;
+ if (enc->argb_content_ == kEncoderARGB) return VP8_ENC_OK;
+
+ {
+ uint32_t* dst = enc->argb_;
+ const uint32_t* src = picture->argb;
+ int y;
+ for (y = 0; y < height; ++y) {
+ memcpy(dst, src, width * sizeof(*dst));
+ dst += width;
+ src += picture->argb_stride;
+ }
+ }
+ enc->argb_content_ = kEncoderARGB;
+ assert(enc->current_width_ == width);
+ return VP8_ENC_OK;
+}
+
+// -----------------------------------------------------------------------------
+
+#define APPLY_PALETTE_GREEDY_MAX 4
+
+static WEBP_INLINE uint32_t SearchColorGreedy(const uint32_t palette[],
+ int palette_size,
+ uint32_t color) {
+ (void)palette_size;
+ assert(palette_size < APPLY_PALETTE_GREEDY_MAX);
+ assert(3 == APPLY_PALETTE_GREEDY_MAX - 1);
+ if (color == palette[0]) return 0;
+ if (color == palette[1]) return 1;
+ if (color == palette[2]) return 2;
+ return 3;
+}
+
+static WEBP_INLINE uint32_t ApplyPaletteHash0(uint32_t color) {
+ // Focus on the green color.
+ return (color >> 8) & 0xff;
+}
+
+#define PALETTE_INV_SIZE_BITS 11
+#define PALETTE_INV_SIZE (1 << PALETTE_INV_SIZE_BITS)
+
+static WEBP_INLINE uint32_t ApplyPaletteHash1(uint32_t color) {
+ // Forget about alpha.
+ return ((uint32_t)((color & 0x00ffffffu) * 4222244071ull)) >>
+ (32 - PALETTE_INV_SIZE_BITS);
+}
+
+static WEBP_INLINE uint32_t ApplyPaletteHash2(uint32_t color) {
+ // Forget about alpha.
+ return ((uint32_t)((color & 0x00ffffffu) * ((1ull << 31) - 1))) >>
+ (32 - PALETTE_INV_SIZE_BITS);
+}
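A standalone sketch of the perfect-hash probing used in ApplyPalette below: try a hash on every palette entry and fall back if two entries collide. This uses the same green-byte hash as ApplyPaletteHash0; the palette values are hypothetical:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t Hash0(uint32_t color) { return (color >> 8) & 0xff; }

int main(void) {
  const uint32_t palette[3] = { 0xff001000u, 0xff002000u, 0xff003000u };
  uint16_t buffer[256];
  int j, use_LUT = 1;
  memset(buffer, 0xff, sizeof(buffer));  /* 0xffff marks an empty slot */
  for (j = 0; j < 3; ++j) {
    const uint32_t ind = Hash0(palette[j]);
    if (buffer[ind] != 0xffffu) { use_LUT = 0; break; }  /* collision */
    buffer[ind] = (uint16_t)j;
  }
  /* Distinct green bytes 0x10, 0x20, 0x30: no collision here. */
  printf("hash0 %s\n", use_LUT ? "is collision-free" : "collides");
  return 0;
}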
+
+// Use 1 pixel cache for ARGB pixels.
+#define APPLY_PALETTE_FOR(COLOR_INDEX) do { \
+ uint32_t prev_pix = palette[0]; \
+ uint32_t prev_idx = 0; \
+ for (y = 0; y < height; ++y) { \
+ for (x = 0; x < width; ++x) { \
+ const uint32_t pix = src[x]; \
+ if (pix != prev_pix) { \
+ prev_idx = COLOR_INDEX; \
+ prev_pix = pix; \
+ } \
+ tmp_row[x] = prev_idx; \
+ } \
+ VP8LBundleColorMap(tmp_row, width, xbits, dst); \
+ src += src_stride; \
+ dst += dst_stride; \
+ } \
+} while (0)
+
+// Remap argb values in src[] to packed palette entries in dst[],
+// using 'tmp_row' as a temporary buffer of size 'width'.
+// We assume that all src[] values have a corresponding entry in the palette.
+// Note: src[] can be the same as dst[].
+static WebPEncodingError ApplyPalette(const uint32_t* src, uint32_t src_stride,
+ uint32_t* dst, uint32_t dst_stride,
+ const uint32_t* palette, int palette_size,
+ int width, int height, int xbits) {
+ // TODO(skal): this tmp buffer is not needed if VP8LBundleColorMap() can be
+ // made to work in-place.
+ uint8_t* const tmp_row = (uint8_t*)WebPSafeMalloc(width, sizeof(*tmp_row));
+ int x, y;
+
+ if (tmp_row == NULL) return VP8_ENC_ERROR_OUT_OF_MEMORY;
+
+ if (palette_size < APPLY_PALETTE_GREEDY_MAX) {
+ APPLY_PALETTE_FOR(SearchColorGreedy(palette, palette_size, pix));
+ } else {
+ int i, j;
+ uint16_t buffer[PALETTE_INV_SIZE];
+ uint32_t (*const hash_functions[])(uint32_t) = {
+ ApplyPaletteHash0, ApplyPaletteHash1, ApplyPaletteHash2
+ };
+
+ // Try to find a perfect hash function able to go from a color to an index
+ // within 1 << PALETTE_INV_SIZE_BITS in order to build a hash map to go
+ // from color to index in palette.
+ for (i = 0; i < 3; ++i) {
+ int use_LUT = 1;
+ // Set each element in buffer to max uint16_t.
+ memset(buffer, 0xff, sizeof(buffer));
+ for (j = 0; j < palette_size; ++j) {
+ const uint32_t ind = hash_functions[i](palette[j]);
+ if (buffer[ind] != 0xffffu) {
+ use_LUT = 0;
+ break;
+ } else {
+ buffer[ind] = j;
+ }
+ }
+ if (use_LUT) break;
+ }
+
+ if (i == 0) {
+ APPLY_PALETTE_FOR(buffer[ApplyPaletteHash0(pix)]);
+ } else if (i == 1) {
+ APPLY_PALETTE_FOR(buffer[ApplyPaletteHash1(pix)]);
+ } else if (i == 2) {
+ APPLY_PALETTE_FOR(buffer[ApplyPaletteHash2(pix)]);
+ } else {
+ uint32_t idx_map[MAX_PALETTE_SIZE];
+ uint32_t palette_sorted[MAX_PALETTE_SIZE];
+ PrepareMapToPalette(palette, palette_size, palette_sorted, idx_map);
+ APPLY_PALETTE_FOR(
+ idx_map[SearchColorNoIdx(palette_sorted, pix, palette_size)]);
+ }
+ }
+ WebPSafeFree(tmp_row);
+ return VP8_ENC_OK;
+}
+#undef APPLY_PALETTE_FOR
+#undef PALETTE_INV_SIZE_BITS
+#undef PALETTE_INV_SIZE
+#undef APPLY_PALETTE_GREEDY_MAX
+
+// Note: Expects "enc->palette_" to be set properly.
+static WebPEncodingError MapImageFromPalette(VP8LEncoder* const enc,
+ int in_place) {
+ WebPEncodingError err = VP8_ENC_OK;
+ const WebPPicture* const pic = enc->pic_;
+ const int width = pic->width;
+ const int height = pic->height;
+ const uint32_t* const palette = enc->palette_;
+ const uint32_t* src = in_place ? enc->argb_ : pic->argb;
+ const int src_stride = in_place ? enc->current_width_ : pic->argb_stride;
+ const int palette_size = enc->palette_size_;
+ int xbits;
+
+ // Replace each input pixel by corresponding palette index.
+ // This is done line by line.
+ if (palette_size <= 4) {
+ xbits = (palette_size <= 2) ? 3 : 2;
+ } else {
+ xbits = (palette_size <= 16) ? 1 : 0;
+ }
+
+ err = AllocateTransformBuffer(enc, VP8LSubSampleSize(width, xbits), height);
+ if (err != VP8_ENC_OK) return err;
+
+ err = ApplyPalette(src, src_stride,
+ enc->argb_, enc->current_width_,
+ palette, palette_size, width, height, xbits);
+ enc->argb_content_ = kEncoderPalette;
+ return err;
+}
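A standalone sketch of the index-packing arithmetic above: xbits is the log2 of how many pixels get bundled per output pixel, and the packed width follows from VP8LSubSampleSize, re-implemented inline here:

#include <stdio.h>

static int SubSampleSize(int size, int bits) {
  return (size + (1 << bits) - 1) >> bits;
}

int main(void) {
  const int sizes[4] = { 2, 4, 16, 256 };  /* sample palette sizes */
  int i;
  for (i = 0; i < 4; ++i) {
    const int ps = sizes[i];
    /* Same mapping as MapImageFromPalette: <=2 colors pack 8 pixels,
     * <=4 pack 4, <=16 pack 2, larger palettes stay unpacked. */
    const int xbits = (ps <= 2) ? 3 : (ps <= 4) ? 2 : (ps <= 16) ? 1 : 0;
    printf("palette=%3d xbits=%d packed width of 100 -> %d\n",
           ps, xbits, SubSampleSize(100, xbits));
  }
  return 0;
}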
+
+// Save palette_[] to bitstream.
+static WebPEncodingError EncodePalette(VP8LBitWriter* const bw, int low_effort,
+ VP8LEncoder* const enc) {
+ int i;
+ uint32_t tmp_palette[MAX_PALETTE_SIZE];
+ const int palette_size = enc->palette_size_;
+ const uint32_t* const palette = enc->palette_;
+ VP8LPutBits(bw, TRANSFORM_PRESENT, 1);
+ VP8LPutBits(bw, COLOR_INDEXING_TRANSFORM, 2);
+ assert(palette_size >= 1 && palette_size <= MAX_PALETTE_SIZE);
+ VP8LPutBits(bw, palette_size - 1, 8);
+ for (i = palette_size - 1; i >= 1; --i) {
+ tmp_palette[i] = VP8LSubPixels(palette[i], palette[i - 1]);
+ }
+ tmp_palette[0] = palette[0];
+ return EncodeImageNoHuffman(bw, tmp_palette, &enc->hash_chain_,
+ &enc->refs_[0], palette_size, 1, /*quality=*/20,
+ low_effort);
+}
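A standalone sketch of the palette delta-coding above: each entry is replaced by its channel-wise difference from the previous one, which is what makes the palette cheap to compress (toy near-grayscale palette):

#include <stdint.h>
#include <stdio.h>

/* Channel-wise (a - b) mod 256, standing in for VP8LSubPixels. */
static uint32_t SubPixels(uint32_t a, uint32_t b) {
  uint32_t r = 0;
  int s;
  for (s = 0; s < 32; s += 8) r |= (((a >> s) - (b >> s)) & 0xff) << s;
  return r;
}

int main(void) {
  const uint32_t palette[3] = { 0xff101010u, 0xff202020u, 0xff303030u };
  uint32_t tmp[3];
  int i;
  tmp[0] = palette[0];
  for (i = 2; i >= 1; --i) tmp[i] = SubPixels(palette[i], palette[i - 1]);
  /* 0xff101010, 0x00101010, 0x00101010: repeated deltas code well. */
  for (i = 0; i < 3; ++i) printf("0x%08x\n", tmp[i]);
  return 0;
}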
+
+// -----------------------------------------------------------------------------
+// VP8LEncoder
+
+static VP8LEncoder* VP8LEncoderNew(const WebPConfig* const config,
+ const WebPPicture* const picture) {
+ VP8LEncoder* const enc = (VP8LEncoder*)WebPSafeCalloc(1ULL, sizeof(*enc));
+ if (enc == NULL) {
+ WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
+ return NULL;
+ }
+ enc->config_ = config;
+ enc->pic_ = picture;
+ enc->argb_content_ = kEncoderNone;
+
+ VP8LEncDspInit();
+
+ return enc;
+}
+
+static void VP8LEncoderDelete(VP8LEncoder* enc) {
+ if (enc != NULL) {
+ int i;
+ VP8LHashChainClear(&enc->hash_chain_);
+ for (i = 0; i < 4; ++i) VP8LBackwardRefsClear(&enc->refs_[i]);
+ ClearTransformBuffer(enc);
+ WebPSafeFree(enc);
+ }
+}
+
+// -----------------------------------------------------------------------------
+// Main call
+
+typedef struct {
+ const WebPConfig* config_;
+ const WebPPicture* picture_;
+ VP8LBitWriter* bw_;
+ VP8LEncoder* enc_;
+ int use_cache_;
+ CrunchConfig crunch_configs_[CRUNCH_CONFIGS_MAX];
+ int num_crunch_configs_;
+ int red_and_blue_always_zero_;
+ WebPEncodingError err_;
+ WebPAuxStats* stats_;
+} StreamEncodeContext;
+
+static int EncodeStreamHook(void* input, void* data2) {
+ StreamEncodeContext* const params = (StreamEncodeContext*)input;
+ const WebPConfig* const config = params->config_;
+ const WebPPicture* const picture = params->picture_;
+ VP8LBitWriter* const bw = params->bw_;
+ VP8LEncoder* const enc = params->enc_;
+ const int use_cache = params->use_cache_;
+ const CrunchConfig* const crunch_configs = params->crunch_configs_;
+ const int num_crunch_configs = params->num_crunch_configs_;
+ const int red_and_blue_always_zero = params->red_and_blue_always_zero_;
+#if !defined(WEBP_DISABLE_STATS)
+ WebPAuxStats* const stats = params->stats_;
+#endif
+ WebPEncodingError err = VP8_ENC_OK;
+ const int quality = (int)config->quality;
+ const int low_effort = (config->method == 0);
+#if (WEBP_NEAR_LOSSLESS == 1)
+ const int width = picture->width;
+#endif
+ const int height = picture->height;
+ const size_t byte_position = VP8LBitWriterNumBytes(bw);
+#if (WEBP_NEAR_LOSSLESS == 1)
+ int use_near_lossless = 0;
+#endif
+ int hdr_size = 0;
+ int data_size = 0;
+ int use_delta_palette = 0;
+ int idx;
+ size_t best_size = ~(size_t)0;
+ VP8LBitWriter bw_init = *bw, bw_best;
+ (void)data2;
+
+ if (!VP8LBitWriterInit(&bw_best, 0) ||
+ (num_crunch_configs > 1 && !VP8LBitWriterClone(bw, &bw_best))) {
+ err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+ goto Error;
+ }
+
+ for (idx = 0; idx < num_crunch_configs; ++idx) {
+ const int entropy_idx = crunch_configs[idx].entropy_idx_;
+ enc->use_palette_ =
+ (entropy_idx == kPalette) || (entropy_idx == kPaletteAndSpatial);
+ enc->use_subtract_green_ =
+ (entropy_idx == kSubGreen) || (entropy_idx == kSpatialSubGreen);
+ enc->use_predict_ = (entropy_idx == kSpatial) ||
+ (entropy_idx == kSpatialSubGreen) ||
+ (entropy_idx == kPaletteAndSpatial);
+ // When using a palette, R/B==0, hence no need to test for cross-color.
+ if (low_effort || enc->use_palette_) {
+ enc->use_cross_color_ = 0;
+ } else {
+ enc->use_cross_color_ = red_and_blue_always_zero ? 0 : enc->use_predict_;
+ }
+ // Reset any parameter in the encoder that was set in a previous iteration.
+ enc->cache_bits_ = 0;
+ VP8LBackwardRefsClear(&enc->refs_[0]);
+ VP8LBackwardRefsClear(&enc->refs_[1]);
+
+#if (WEBP_NEAR_LOSSLESS == 1)
+ // Apply near-lossless preprocessing.
+ use_near_lossless = (config->near_lossless < 100) && !enc->use_palette_ &&
+ !enc->use_predict_;
+ if (use_near_lossless) {
+ err = AllocateTransformBuffer(enc, width, height);
+ if (err != VP8_ENC_OK) goto Error;
+ if ((enc->argb_content_ != kEncoderNearLossless) &&
+ !VP8ApplyNearLossless(picture, config->near_lossless, enc->argb_)) {
+ err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+ goto Error;
+ }
+ enc->argb_content_ = kEncoderNearLossless;
+ } else {
+ enc->argb_content_ = kEncoderNone;
+ }
+#else
+ enc->argb_content_ = kEncoderNone;
+#endif
+
+ // Encode palette
+ if (enc->use_palette_) {
+ if (crunch_configs[idx].palette_sorting_type_ == kSortedDefault) {
+ // Nothing to do, we have already sorted the palette.
+ memcpy(enc->palette_, enc->palette_sorted_,
+ enc->palette_size_ * sizeof(*enc->palette_));
+ } else if (crunch_configs[idx].palette_sorting_type_ == kMinimizeDelta) {
+ PaletteSortMinimizeDeltas(enc->palette_sorted_, enc->palette_size_,
+ enc->palette_);
+ } else {
+ assert(crunch_configs[idx].palette_sorting_type_ == kModifiedZeng);
+ err = PaletteSortModifiedZeng(enc->pic_, enc->palette_sorted_,
+ enc->palette_size_, enc->palette_);
+ if (err != VP8_ENC_OK) goto Error;
+ }
+ err = EncodePalette(bw, low_effort, enc);
+ if (err != VP8_ENC_OK) goto Error;
+ err = MapImageFromPalette(enc, use_delta_palette);
+ if (err != VP8_ENC_OK) goto Error;
+ // If using a color cache, do not make it bigger than the number of
+ // colors.
+ if (use_cache && enc->palette_size_ < (1 << MAX_COLOR_CACHE_BITS)) {
+ enc->cache_bits_ = BitsLog2Floor(enc->palette_size_) + 1;
+ }
+ }
+ if (!use_delta_palette) {
+ // In case the image is not packed.
+ if (enc->argb_content_ != kEncoderNearLossless &&
+ enc->argb_content_ != kEncoderPalette) {
+ err = MakeInputImageCopy(enc);
+ if (err != VP8_ENC_OK) goto Error;
+ }
+
+ // -----------------------------------------------------------------------
+ // Apply transforms and write transform data.
+
+ if (enc->use_subtract_green_) {
+ ApplySubtractGreen(enc, enc->current_width_, height, bw);
+ }
+
+ if (enc->use_predict_) {
+ err = ApplyPredictFilter(enc, enc->current_width_, height, quality,
+ low_effort, enc->use_subtract_green_, bw);
+ if (err != VP8_ENC_OK) goto Error;
+ }
+
+ if (enc->use_cross_color_) {
+ err = ApplyCrossColorFilter(enc, enc->current_width_, height, quality,
+ low_effort, bw);
+ if (err != VP8_ENC_OK) goto Error;
+ }
+ }
+
+ VP8LPutBits(bw, !TRANSFORM_PRESENT, 1); // No more transforms.
+
+ // -------------------------------------------------------------------------
+ // Encode and write the transformed image.
+ err = EncodeImageInternal(bw, enc->argb_, &enc->hash_chain_, enc->refs_,
+ enc->current_width_, height, quality, low_effort,
+ use_cache, &crunch_configs[idx],
+ &enc->cache_bits_, enc->histo_bits_,
+ byte_position, &hdr_size, &data_size);
+ if (err != VP8_ENC_OK) goto Error;
+
+ // If this iteration's output is smaller than the best so far, keep it.
+ if (VP8LBitWriterNumBytes(bw) < best_size) {
+ best_size = VP8LBitWriterNumBytes(bw);
+ // Store the BitWriter.
+ VP8LBitWriterSwap(bw, &bw_best);
+#if !defined(WEBP_DISABLE_STATS)
+ // Update the stats.
+ if (stats != NULL) {
+ stats->lossless_features = 0;
+ if (enc->use_predict_) stats->lossless_features |= 1;
+ if (enc->use_cross_color_) stats->lossless_features |= 2;
+ if (enc->use_subtract_green_) stats->lossless_features |= 4;
+ if (enc->use_palette_) stats->lossless_features |= 8;
+ stats->histogram_bits = enc->histo_bits_;
+ stats->transform_bits = enc->transform_bits_;
+ stats->cache_bits = enc->cache_bits_;
+ stats->palette_size = enc->palette_size_;
+ stats->lossless_size = (int)(best_size - byte_position);
+ stats->lossless_hdr_size = hdr_size;
+ stats->lossless_data_size = data_size;
+ }
+#endif
+ }
+ // Reset the bit writer for the following iteration if any.
+ if (num_crunch_configs > 1) VP8LBitWriterReset(&bw_init, bw);
+ }
+ VP8LBitWriterSwap(&bw_best, bw);
+
+Error:
+ VP8LBitWriterWipeOut(&bw_best);
+ params->err_ = err;
+ // The hook should return false in case of error.
+ return (err == VP8_ENC_OK);
+}
+
+WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
+ const WebPPicture* const picture,
+ VP8LBitWriter* const bw_main,
+ int use_cache) {
+ WebPEncodingError err = VP8_ENC_OK;
+ VP8LEncoder* const enc_main = VP8LEncoderNew(config, picture);
+ VP8LEncoder* enc_side = NULL;
+ CrunchConfig crunch_configs[CRUNCH_CONFIGS_MAX];
+ int num_crunch_configs_main, num_crunch_configs_side = 0;
+ int idx;
+ int red_and_blue_always_zero = 0;
+ WebPWorker worker_main, worker_side;
+ StreamEncodeContext params_main, params_side;
+ // The main thread uses picture->stats, the side thread uses stats_side.
+ WebPAuxStats stats_side;
+ VP8LBitWriter bw_side;
+ const WebPWorkerInterface* const worker_interface = WebPGetWorkerInterface();
+ int ok_main;
+
+ // Analyze the image (entropy, num_palettes, etc.)
+ if (enc_main == NULL ||
+ !EncoderAnalyze(enc_main, crunch_configs, &num_crunch_configs_main,
+ &red_and_blue_always_zero) ||
+ !EncoderInit(enc_main) || !VP8LBitWriterInit(&bw_side, 0)) {
+ err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+ goto Error;
+ }
+
+ // Split the configs between the main and side threads (if any).
+ if (config->thread_level > 0) {
+ num_crunch_configs_side = num_crunch_configs_main / 2;
+ for (idx = 0; idx < num_crunch_configs_side; ++idx) {
+ params_side.crunch_configs_[idx] =
+ crunch_configs[num_crunch_configs_main - num_crunch_configs_side +
+ idx];
+ }
+ params_side.num_crunch_configs_ = num_crunch_configs_side;
+ }
+ num_crunch_configs_main -= num_crunch_configs_side;
+ for (idx = 0; idx < num_crunch_configs_main; ++idx) {
+ params_main.crunch_configs_[idx] = crunch_configs[idx];
+ }
+ params_main.num_crunch_configs_ = num_crunch_configs_main;
+
+ // Fill in the parameters for the thread workers.
+ {
+ const int params_size = (num_crunch_configs_side > 0) ? 2 : 1;
+ for (idx = 0; idx < params_size; ++idx) {
+ // Create the parameters for each worker.
+ WebPWorker* const worker = (idx == 0) ? &worker_main : &worker_side;
+ StreamEncodeContext* const param =
+ (idx == 0) ? &params_main : &params_side;
+ param->config_ = config;
+ param->picture_ = picture;
+ param->use_cache_ = use_cache;
+ param->red_and_blue_always_zero_ = red_and_blue_always_zero;
+ if (idx == 0) {
+ param->stats_ = picture->stats;
+ param->bw_ = bw_main;
+ param->enc_ = enc_main;
+ } else {
+ param->stats_ = (picture->stats == NULL) ? NULL : &stats_side;
+ // Create a side bit writer.
+ if (!VP8LBitWriterClone(bw_main, &bw_side)) {
+ err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+ goto Error;
+ }
+ param->bw_ = &bw_side;
+ // Create a side encoder.
+ enc_side = VP8LEncoderNew(config, picture);
+ if (enc_side == NULL || !EncoderInit(enc_side)) {
+ err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+ goto Error;
+ }
+ // Copy the values that were computed for the main encoder.
+ enc_side->histo_bits_ = enc_main->histo_bits_;
+ enc_side->transform_bits_ = enc_main->transform_bits_;
+ enc_side->palette_size_ = enc_main->palette_size_;
+ memcpy(enc_side->palette_, enc_main->palette_,
+ sizeof(enc_main->palette_));
+ memcpy(enc_side->palette_sorted_, enc_main->palette_sorted_,
+ sizeof(enc_main->palette_sorted_));
+ param->enc_ = enc_side;
+ }
+ // Create the workers.
+ worker_interface->Init(worker);
+ worker->data1 = param;
+ worker->data2 = NULL;
+ worker->hook = EncodeStreamHook;
+ }
+ }
+
+ // Start the second thread if needed.
+ if (num_crunch_configs_side != 0) {
+ if (!worker_interface->Reset(&worker_side)) {
+ err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+ goto Error;
+ }
+#if !defined(WEBP_DISABLE_STATS)
+ // This line is here and not in the param initialization above to remove a
+ // Clang static analyzer warning.
+ if (picture->stats != NULL) {
+ memcpy(&stats_side, picture->stats, sizeof(stats_side));
+ }
+#endif
+ // This line is only useful to remove a Clang static analyzer warning.
+ params_side.err_ = VP8_ENC_OK;
+ worker_interface->Launch(&worker_side);
+ }
+ // Execute the main thread.
+ worker_interface->Execute(&worker_main);
+ ok_main = worker_interface->Sync(&worker_main);
+ worker_interface->End(&worker_main);
+ if (num_crunch_configs_side != 0) {
+ // Wait for the second thread.
+ const int ok_side = worker_interface->Sync(&worker_side);
+ worker_interface->End(&worker_side);
+ if (!ok_main || !ok_side) {
+ err = ok_main ? params_side.err_ : params_main.err_;
+ goto Error;
+ }
+ if (VP8LBitWriterNumBytes(&bw_side) < VP8LBitWriterNumBytes(bw_main)) {
+ VP8LBitWriterSwap(bw_main, &bw_side);
+#if !defined(WEBP_DISABLE_STATS)
+ if (picture->stats != NULL) {
+ memcpy(picture->stats, &stats_side, sizeof(*picture->stats));
+ }
+#endif
+ }
+ } else {
+ if (!ok_main) {
+ err = params_main.err_;
+ goto Error;
+ }
+ }
+
+Error:
+ VP8LBitWriterWipeOut(&bw_side);
+ VP8LEncoderDelete(enc_main);
+ VP8LEncoderDelete(enc_side);
+ return err;
+}
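
A sketch of the work split above: with thread_level > 0 the side worker takes the second half (rounded down) of the crunch configs and the main worker keeps the first part. Names and signature here are illustrative only:

    static void SplitConfigs(int total, int thread_level,
                             int* main_count, int* side_count) {
      *side_count = (thread_level > 0) ? total / 2 : 0;
      *main_count = total - *side_count;  /* e.g. 5 configs -> main 3, side 2 */
    }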
+
+#undef CRUNCH_CONFIGS_MAX
+#undef CRUNCH_SUBCONFIGS_MAX
+
+int VP8LEncodeImage(const WebPConfig* const config,
+ const WebPPicture* const picture) {
+ int width, height;
+ int has_alpha;
+ size_t coded_size;
+ int percent = 0;
+ int initial_size;
+ WebPEncodingError err = VP8_ENC_OK;
+ VP8LBitWriter bw;
+
+ if (picture == NULL) return 0;
+
+ if (config == NULL || picture->argb == NULL) {
+ err = VP8_ENC_ERROR_NULL_PARAMETER;
+ WebPEncodingSetError(picture, err);
+ return 0;
+ }
+
+ width = picture->width;
+ height = picture->height;
+ // Initialize the BitWriter with a size corresponding to 16 bpp for photo
+ // images and 8 bpp for graphical images.
+ initial_size = (config->image_hint == WEBP_HINT_GRAPH) ?
+ width * height : width * height * 2;
+ if (!VP8LBitWriterInit(&bw, initial_size)) {
+ err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+ goto Error;
+ }
+
+ if (!WebPReportProgress(picture, 1, &percent)) {
+ UserAbort:
+ err = VP8_ENC_ERROR_USER_ABORT;
+ goto Error;
+ }
+ // Reset stats (for pure lossless coding)
+ if (picture->stats != NULL) {
+ WebPAuxStats* const stats = picture->stats;
+ memset(stats, 0, sizeof(*stats));
+ stats->PSNR[0] = 99.f;
+ stats->PSNR[1] = 99.f;
+ stats->PSNR[2] = 99.f;
+ stats->PSNR[3] = 99.f;
+ stats->PSNR[4] = 99.f;
+ }
+
+ // Write image size.
+ if (!WriteImageSize(picture, &bw)) {
+ err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+ goto Error;
+ }
+
+ has_alpha = WebPPictureHasTransparency(picture);
+ // Write the non-trivial Alpha flag and lossless version.
+ if (!WriteRealAlphaAndVersion(&bw, has_alpha)) {
+ err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+ goto Error;
+ }
+
+ if (!WebPReportProgress(picture, 5, &percent)) goto UserAbort;
+
+ // Encode main image stream.
+ err = VP8LEncodeStream(config, picture, &bw, 1 /*use_cache*/);
+ if (err != VP8_ENC_OK) goto Error;
+
+ if (!WebPReportProgress(picture, 90, &percent)) goto UserAbort;
+
+ // Finish the RIFF chunk.
+ err = WriteImage(picture, &bw, &coded_size);
+ if (err != VP8_ENC_OK) goto Error;
+
+ if (!WebPReportProgress(picture, 100, &percent)) goto UserAbort;
+
+#if !defined(WEBP_DISABLE_STATS)
+ // Save size.
+ if (picture->stats != NULL) {
+ picture->stats->coded_size += (int)coded_size;
+ picture->stats->lossless_size = (int)coded_size;
+ }
+#endif
+
+ if (picture->extra_info != NULL) {
+ const int mb_w = (width + 15) >> 4;
+ const int mb_h = (height + 15) >> 4;
+ memset(picture->extra_info, 0, mb_w * mb_h * sizeof(*picture->extra_info));
+ }
+
+ Error:
+ if (bw.error_) err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+ VP8LBitWriterWipeOut(&bw);
+ if (err != VP8_ENC_OK) {
+ WebPEncodingSetError(picture, err);
+ return 0;
+ }
+ return 1;
+}
+
+//------------------------------------------------------------------------------
diff --git a/media/libwebp/enc/vp8li_enc.h b/media/libwebp/enc/vp8li_enc.h
index 1e259eda77..f6f8cf6403 100644
--- a/media/libwebp/enc/vp8li_enc.h
+++ b/media/libwebp/enc/vp8li_enc.h
@@ -69,9 +69,11 @@ typedef struct {
int use_palette_;
int palette_size_;
uint32_t palette_[MAX_PALETTE_SIZE];
+ // Sorted version of palette_ for cache purposes.
+ uint32_t palette_sorted_[MAX_PALETTE_SIZE];
// Some 'scratch' (potentially large) objects.
- struct VP8LBackwardRefs refs_[3]; // Backward Refs array for temporaries.
+ struct VP8LBackwardRefs refs_[4]; // Backward Refs array for temporaries.
VP8LHashChain hash_chain_; // HashChain data for constructing
// backward references.
} VP8LEncoder;
diff --git a/media/libwebp/enc/webp_enc.c b/media/libwebp/enc/webp_enc.c
new file mode 100644
index 0000000000..47ba405f5d
--- /dev/null
+++ b/media/libwebp/enc/webp_enc.c
@@ -0,0 +1,410 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// WebP encoder: main entry point
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+
+#include "../enc/cost_enc.h"
+#include "../enc/vp8i_enc.h"
+#include "../enc/vp8li_enc.h"
+#include "../utils/utils.h"
+
+// #define PRINT_MEMORY_INFO
+
+#ifdef PRINT_MEMORY_INFO
+#include <stdio.h>
+#endif
+
+//------------------------------------------------------------------------------
+
+int WebPGetEncoderVersion(void) {
+ return (ENC_MAJ_VERSION << 16) | (ENC_MIN_VERSION << 8) | ENC_REV_VERSION;
+}
+
+//------------------------------------------------------------------------------
+// VP8Encoder
+//------------------------------------------------------------------------------
+
+static void ResetSegmentHeader(VP8Encoder* const enc) {
+ VP8EncSegmentHeader* const hdr = &enc->segment_hdr_;
+ hdr->num_segments_ = enc->config_->segments;
+ hdr->update_map_ = (hdr->num_segments_ > 1);
+ hdr->size_ = 0;
+}
+
+static void ResetFilterHeader(VP8Encoder* const enc) {
+ VP8EncFilterHeader* const hdr = &enc->filter_hdr_;
+ hdr->simple_ = 1;
+ hdr->level_ = 0;
+ hdr->sharpness_ = 0;
+ hdr->i4x4_lf_delta_ = 0;
+}
+
+static void ResetBoundaryPredictions(VP8Encoder* const enc) {
+ // init boundary values once and for all
+ // Note: actually, initializing the preds_[] is only needed for intra4.
+ int i;
+ uint8_t* const top = enc->preds_ - enc->preds_w_;
+ uint8_t* const left = enc->preds_ - 1;
+ for (i = -1; i < 4 * enc->mb_w_; ++i) {
+ top[i] = B_DC_PRED;
+ }
+ for (i = 0; i < 4 * enc->mb_h_; ++i) {
+ left[i * enc->preds_w_] = B_DC_PRED;
+ }
+ enc->nz_[-1] = 0; // constant
+}
+
+// Mapping from config->method_ to coding tools used.
+//-------------------+---+---+---+---+---+---+---+
+// Method | 0 | 1 | 2 | 3 |(4)| 5 | 6 |
+//-------------------+---+---+---+---+---+---+---+
+// fast probe | x | | | x | | | |
+//-------------------+---+---+---+---+---+---+---+
+// dynamic proba | ~ | x | x | x | x | x | x |
+//-------------------+---+---+---+---+---+---+---+
+// fast mode analysis|[x]|[x]| | | x | x | x |
+//-------------------+---+---+---+---+---+---+---+
+// basic rd-opt | | | | x | x | x | x |
+//-------------------+---+---+---+---+---+---+---+
+// disto-refine i4/16| x | x | x | | | | |
+//-------------------+---+---+---+---+---+---+---+
+// disto-refine uv | | x | x | | | | |
+//-------------------+---+---+---+---+---+---+---+
+// rd-opt i4/16 | | | ~ | x | x | x | x |
+//-------------------+---+---+---+---+---+---+---+
+// token buffer (opt)| | | | x | x | x | x |
+//-------------------+---+---+---+---+---+---+---+
+// Trellis | | | | | | x |Ful|
+//-------------------+---+---+---+---+---+---+---+
+// full-SNS | | | | | x | x | x |
+//-------------------+---+---+---+---+---+---+---+
+
+static void MapConfigToTools(VP8Encoder* const enc) {
+ const WebPConfig* const config = enc->config_;
+ const int method = config->method;
+ const int limit = 100 - config->partition_limit;
+ enc->method_ = method;
+ enc->rd_opt_level_ = (method >= 6) ? RD_OPT_TRELLIS_ALL
+ : (method >= 5) ? RD_OPT_TRELLIS
+ : (method >= 3) ? RD_OPT_BASIC
+ : RD_OPT_NONE;
+ enc->max_i4_header_bits_ =
+ 256 * 16 * 16 * // upper bound: up to 16bit per 4x4 block
+ (limit * limit) / (100 * 100); // ... modulated with a quadratic curve.
+
+ // partition0 = 512k max.
+ enc->mb_header_limit_ =
+ (score_t)256 * 510 * 8 * 1024 / (enc->mb_w_ * enc->mb_h_);
+
+ enc->thread_level_ = config->thread_level;
+
+ enc->do_search_ = (config->target_size > 0 || config->target_PSNR > 0);
+ if (!config->low_memory) {
+#if !defined(DISABLE_TOKEN_BUFFER)
+ enc->use_tokens_ = (enc->rd_opt_level_ >= RD_OPT_BASIC); // need rd stats
+#endif
+ if (enc->use_tokens_) {
+ enc->num_parts_ = 1; // doesn't work with multi-partition
+ }
+ }
+}
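
The quadratic modulation of max_i4_header_bits_ above, in numbers; a sketch whose constants come straight from the code, not from tuning data:

    static int MaxI4HeaderBits(int partition_limit) {
      const int limit = 100 - partition_limit;
      return 256 * 16 * 16 * (limit * limit) / (100 * 100);
    }
    /* MaxI4HeaderBits(0) == 65536, MaxI4HeaderBits(50) == 16384,
     * MaxI4HeaderBits(100) == 0: a higher partition_limit squeezes the
     * intra4 mode-header budget quadratically. */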
+
+// Memory scaling with dimensions:
+// memory (bytes) ~= 2.25 * w + 0.0625 * w * h
+//
+// Typical memory footprint (614x440 picture)
+// encoder: 22111
+// info: 4368
+// preds: 17741
+// top samples: 1263
+// non-zero: 175
+// lf-stats: 0
+// total: 45658
+// Transient object sizes:
+// VP8EncIterator: 3360
+// VP8ModeScore: 872
+// VP8SegmentInfo: 732
+// VP8EncProba: 18352
+// LFStats: 2048
+// Picture size (yuv): 419328
+
+static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
+ WebPPicture* const picture) {
+ VP8Encoder* enc;
+ const int use_filter =
+ (config->filter_strength > 0) || (config->autofilter > 0);
+ const int mb_w = (picture->width + 15) >> 4;
+ const int mb_h = (picture->height + 15) >> 4;
+ const int preds_w = 4 * mb_w + 1;
+ const int preds_h = 4 * mb_h + 1;
+ const size_t preds_size = preds_w * preds_h * sizeof(*enc->preds_);
+ const int top_stride = mb_w * 16;
+ const size_t nz_size = (mb_w + 1) * sizeof(*enc->nz_) + WEBP_ALIGN_CST;
+ const size_t info_size = mb_w * mb_h * sizeof(*enc->mb_info_);
+ const size_t samples_size =
+ 2 * top_stride * sizeof(*enc->y_top_) // top-luma/u/v
+ + WEBP_ALIGN_CST; // align all
+ const size_t lf_stats_size =
+ config->autofilter ? sizeof(*enc->lf_stats_) + WEBP_ALIGN_CST : 0;
+ const size_t top_derr_size =
+ (config->quality <= ERROR_DIFFUSION_QUALITY || config->pass > 1) ?
+ mb_w * sizeof(*enc->top_derr_) : 0;
+ uint8_t* mem;
+ const uint64_t size = (uint64_t)sizeof(*enc) // main struct
+ + WEBP_ALIGN_CST // cache alignment
+ + info_size // modes info
+ + preds_size // prediction modes
+ + samples_size // top/left samples
+ + top_derr_size // top diffusion error
+ + nz_size // coeff context bits
+ + lf_stats_size; // autofilter stats
+
+#ifdef PRINT_MEMORY_INFO
+ printf("===================================\n");
+ printf("Memory used:\n"
+ " encoder: %ld\n"
+ " info: %ld\n"
+ " preds: %ld\n"
+ " top samples: %ld\n"
+ " top diffusion: %ld\n"
+ " non-zero: %ld\n"
+ " lf-stats: %ld\n"
+ " total: %ld\n",
+ sizeof(*enc) + WEBP_ALIGN_CST, info_size,
+ preds_size, samples_size, top_derr_size, nz_size, lf_stats_size, size);
+ printf("Transient object sizes:\n"
+ " VP8EncIterator: %ld\n"
+ " VP8ModeScore: %ld\n"
+ " VP8SegmentInfo: %ld\n"
+ " VP8EncProba: %ld\n"
+ " LFStats: %ld\n",
+ sizeof(VP8EncIterator), sizeof(VP8ModeScore),
+ sizeof(VP8SegmentInfo), sizeof(VP8EncProba),
+ sizeof(LFStats));
+ printf("Picture size (yuv): %ld\n",
+ mb_w * mb_h * 384 * sizeof(uint8_t));
+ printf("===================================\n");
+#endif
+ mem = (uint8_t*)WebPSafeMalloc(size, sizeof(*mem));
+ if (mem == NULL) {
+ WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
+ return NULL;
+ }
+ enc = (VP8Encoder*)mem;
+ mem = (uint8_t*)WEBP_ALIGN(mem + sizeof(*enc));
+ memset(enc, 0, sizeof(*enc));
+ enc->num_parts_ = 1 << config->partitions;
+ enc->mb_w_ = mb_w;
+ enc->mb_h_ = mb_h;
+ enc->preds_w_ = preds_w;
+ enc->mb_info_ = (VP8MBInfo*)mem;
+ mem += info_size;
+ enc->preds_ = mem + 1 + enc->preds_w_;
+ mem += preds_size;
+ enc->nz_ = 1 + (uint32_t*)WEBP_ALIGN(mem);
+ mem += nz_size;
+ enc->lf_stats_ = lf_stats_size ? (LFStats*)WEBP_ALIGN(mem) : NULL;
+ mem += lf_stats_size;
+
+ // top samples (all 16-aligned)
+ mem = (uint8_t*)WEBP_ALIGN(mem);
+ enc->y_top_ = mem;
+ enc->uv_top_ = enc->y_top_ + top_stride;
+ mem += 2 * top_stride;
+ enc->top_derr_ = top_derr_size ? (DError*)mem : NULL;
+ mem += top_derr_size;
+ assert(mem <= (uint8_t*)enc + size);
+
+ enc->config_ = config;
+ enc->profile_ = use_filter ? ((config->filter_type == 1) ? 0 : 1) : 2;
+ enc->pic_ = picture;
+ enc->percent_ = 0;
+
+ MapConfigToTools(enc);
+ VP8EncDspInit();
+ VP8DefaultProbas(enc);
+ ResetSegmentHeader(enc);
+ ResetFilterHeader(enc);
+ ResetBoundaryPredictions(enc);
+ VP8EncDspCostInit();
+ VP8EncInitAlpha(enc);
+
+ // Lower quality means smaller output -> we modulate the page size a little
+ // based on quality. This is just a crude first-order prediction.
+ {
+ const float scale = 1.f + config->quality * 5.f / 100.f; // in [1,6]
+ VP8TBufferInit(&enc->tokens_, (int)(mb_w * mb_h * 4 * scale));
+ }
+ return enc;
+}
+
+static int DeleteVP8Encoder(VP8Encoder* enc) {
+ int ok = 1;
+ if (enc != NULL) {
+ ok = VP8EncDeleteAlpha(enc);
+ VP8TBufferClear(&enc->tokens_);
+ WebPSafeFree(enc);
+ }
+ return ok;
+}
+
+//------------------------------------------------------------------------------
+
+#if !defined(WEBP_DISABLE_STATS)
+static double GetPSNR(uint64_t err, uint64_t size) {
+ return (err > 0 && size > 0) ? 10. * log10(255. * 255. * size / err) : 99.;
+}
+
+static void FinalizePSNR(const VP8Encoder* const enc) {
+ WebPAuxStats* stats = enc->pic_->stats;
+ const uint64_t size = enc->sse_count_;
+ const uint64_t* const sse = enc->sse_;
+ stats->PSNR[0] = (float)GetPSNR(sse[0], size);
+ stats->PSNR[1] = (float)GetPSNR(sse[1], size / 4);
+ stats->PSNR[2] = (float)GetPSNR(sse[2], size / 4);
+ stats->PSNR[3] = (float)GetPSNR(sse[0] + sse[1] + sse[2], size * 3 / 2);
+ stats->PSNR[4] = (float)GetPSNR(sse[3], size);
+}
+#endif // !defined(WEBP_DISABLE_STATS)
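
GetPSNR() above is the standard PSNR = 10 * log10(MAX^2 / MSE) with MAX = 255 and MSE = err / size. A self-contained check with made-up numbers:

    #include <math.h>
    #include <stdint.h>

    static double PsnrCheck(void) {
      const uint64_t err = 650250, size = 1000;      /* MSE = 650.25 = 255^2/100 */
      return 10. * log10(255. * 255. * size / err); /* exactly 20 dB */
    }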
+
+static void StoreStats(VP8Encoder* const enc) {
+#if !defined(WEBP_DISABLE_STATS)
+ WebPAuxStats* const stats = enc->pic_->stats;
+ if (stats != NULL) {
+ int i, s;
+ for (i = 0; i < NUM_MB_SEGMENTS; ++i) {
+ stats->segment_level[i] = enc->dqm_[i].fstrength_;
+ stats->segment_quant[i] = enc->dqm_[i].quant_;
+ for (s = 0; s <= 2; ++s) {
+ stats->residual_bytes[s][i] = enc->residual_bytes_[s][i];
+ }
+ }
+ FinalizePSNR(enc);
+ stats->coded_size = enc->coded_size_;
+ for (i = 0; i < 3; ++i) {
+ stats->block_count[i] = enc->block_count_[i];
+ }
+ }
+#else // defined(WEBP_DISABLE_STATS)
+ WebPReportProgress(enc->pic_, 100, &enc->percent_); // done!
+#endif // !defined(WEBP_DISABLE_STATS)
+}
+
+int WebPEncodingSetError(const WebPPicture* const pic,
+ WebPEncodingError error) {
+ assert((int)error < VP8_ENC_ERROR_LAST);
+ assert((int)error >= VP8_ENC_OK);
+ ((WebPPicture*)pic)->error_code = error;
+ return 0;
+}
+
+int WebPReportProgress(const WebPPicture* const pic,
+ int percent, int* const percent_store) {
+ if (percent_store != NULL && percent != *percent_store) {
+ *percent_store = percent;
+ if (pic->progress_hook && !pic->progress_hook(percent, pic)) {
+ // user abort requested
+ WebPEncodingSetError(pic, VP8_ENC_ERROR_USER_ABORT);
+ return 0;
+ }
+ }
+ return 1; // ok
+}
+//------------------------------------------------------------------------------
+
+int WebPEncode(const WebPConfig* config, WebPPicture* pic) {
+ int ok = 0;
+ if (pic == NULL) return 0;
+
+ WebPEncodingSetError(pic, VP8_ENC_OK); // all ok so far
+ if (config == NULL) { // bad params
+ return WebPEncodingSetError(pic, VP8_ENC_ERROR_NULL_PARAMETER);
+ }
+ if (!WebPValidateConfig(config)) {
+ return WebPEncodingSetError(pic, VP8_ENC_ERROR_INVALID_CONFIGURATION);
+ }
+ if (pic->width <= 0 || pic->height <= 0) {
+ return WebPEncodingSetError(pic, VP8_ENC_ERROR_BAD_DIMENSION);
+ }
+ if (pic->width > WEBP_MAX_DIMENSION || pic->height > WEBP_MAX_DIMENSION) {
+ return WebPEncodingSetError(pic, VP8_ENC_ERROR_BAD_DIMENSION);
+ }
+
+ if (pic->stats != NULL) memset(pic->stats, 0, sizeof(*pic->stats));
+
+ if (!config->lossless) {
+ VP8Encoder* enc = NULL;
+
+ if (pic->use_argb || pic->y == NULL || pic->u == NULL || pic->v == NULL) {
+ // Make sure we have YUVA samples.
+ if (config->use_sharp_yuv || (config->preprocessing & 4)) {
+ if (!WebPPictureSharpARGBToYUVA(pic)) {
+ return 0;
+ }
+ } else {
+ float dithering = 0.f;
+ if (config->preprocessing & 2) {
+ const float x = config->quality / 100.f;
+ const float x2 = x * x;
+ // slowly decreasing from max dithering at low quality (q->0)
+ // to 0.5 dithering amplitude at high quality (q->100)
+ dithering = 1.0f + (0.5f - 1.0f) * x2 * x2;
+ }
+ if (!WebPPictureARGBToYUVADithered(pic, WEBP_YUV420, dithering)) {
+ return 0;
+ }
+ }
+ }
+
+ if (!config->exact) {
+ WebPCleanupTransparentArea(pic);
+ }
+
+ enc = InitVP8Encoder(config, pic);
+ if (enc == NULL) return 0; // pic->error is already set.
+ // Note: each of the tasks below accounts for 20% in the progress report.
+ ok = VP8EncAnalyze(enc);
+
+ // Analysis is done, proceed to actual coding.
+ ok = ok && VP8EncStartAlpha(enc); // possibly done in parallel
+ if (!enc->use_tokens_) {
+ ok = ok && VP8EncLoop(enc);
+ } else {
+ ok = ok && VP8EncTokenLoop(enc);
+ }
+ ok = ok && VP8EncFinishAlpha(enc);
+
+ ok = ok && VP8EncWrite(enc);
+ StoreStats(enc);
+ if (!ok) {
+ VP8EncFreeBitWriters(enc);
+ }
+ ok &= DeleteVP8Encoder(enc); // must always be called, even if !ok
+ } else {
+ // Make sure we have ARGB samples.
+ if (pic->argb == NULL && !WebPPictureYUVAToARGB(pic)) {
+ return 0;
+ }
+
+ if (!config->exact) {
+ WebPReplaceTransparentPixels(pic, 0x000000);
+ }
+
+ ok = VP8LEncodeImage(config, pic); // Sets pic->error in case of problem.
+ }
+
+ return ok;
+}
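
The dithering ramp in WebPEncode() evaluates to amplitude 1 - 0.5 * x^4 with x = quality / 100: full dithering at q = 0, about 0.97 at q = 50, down to 0.5 at q = 100. Factored out as a sketch:

    static float DitherAmplitude(float quality) {
      const float x = quality / 100.f;
      const float x2 = x * x;
      return 1.0f + (0.5f - 1.0f) * x2 * x2;  /* == 1 - 0.5 * x^4 */
    }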
diff --git a/media/libwebp/moz.build b/media/libwebp/moz.build
index 5450e2b47c..5580b9a3dc 100644
--- a/media/libwebp/moz.build
+++ b/media/libwebp/moz.build
@@ -9,6 +9,8 @@ with Files('**'):
EXPORTS.webp += [
'webp/decode.h',
'webp/demux.h',
+ 'webp/encode.h',
+ 'webp/format_constants.h',
'webp/mux_types.h',
'webp/types.h',
]
@@ -17,6 +19,7 @@ DIRS += [
'dec',
'demux',
'dsp',
+ 'enc',
'moz',
'utils',
]
diff --git a/media/libwebp/moz/cpu.cpp b/media/libwebp/moz/cpu.cpp
index 39b9f3500e..5def5c2b25 100644
--- a/media/libwebp/moz/cpu.cpp
+++ b/media/libwebp/moz/cpu.cpp
@@ -4,7 +4,7 @@
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
/* This file replaces the CPU info methods originally implemented in
- * src/dsp/cpu.c, due to missing dependencies for Andriod builds. It
+ * src/dsp/cpu.c, due to missing dependencies for Android builds. It
* controls if NEON/SSE/etc is used. */
#include "../dsp/dsp.h"
diff --git a/media/libwebp/update.sh b/media/libwebp/update.sh
index 4fff43d694..9201add982 100644..100755
--- a/media/libwebp/update.sh
+++ b/media/libwebp/update.sh
@@ -21,63 +21,21 @@ cp $1/src/webp/*.h webp
mkdir -p dec
cp $1/src/dec/*.h dec
-cp $1/src/dec/alpha_dec.c dec
-cp $1/src/dec/buffer_dec.c dec
-cp $1/src/dec/frame_dec.c dec
-cp $1/src/dec/idec_dec.c dec
-cp $1/src/dec/io_dec.c dec
-cp $1/src/dec/quant_dec.c dec
-cp $1/src/dec/tree_dec.c dec
-cp $1/src/dec/vp8_dec.c dec
-cp $1/src/dec/vp8l_dec.c dec
-cp $1/src/dec/webp_dec.c dec
+cp $1/src/dec/*.c dec
mkdir -p demux
cp $1/src/demux/demux.c demux
mkdir -p dsp
cp $1/src/dsp/*.h dsp
-cp $1/src/dsp/alpha_processing.c dsp
-cp $1/src/dsp/alpha_processing_neon.c dsp
-cp $1/src/dsp/alpha_processing_sse2.c dsp
-cp $1/src/dsp/alpha_processing_sse41.c dsp
-cp $1/src/dsp/dec.c dsp
-cp $1/src/dsp/dec_clip_tables.c dsp
-cp $1/src/dsp/dec_neon.c dsp
-cp $1/src/dsp/dec_sse2.c dsp
-cp $1/src/dsp/dec_sse41.c dsp
-cp $1/src/dsp/filters.c dsp
-cp $1/src/dsp/filters_neon.c dsp
-cp $1/src/dsp/filters_sse2.c dsp
-cp $1/src/dsp/lossless.c dsp
-cp $1/src/dsp/lossless_neon.c dsp
-cp $1/src/dsp/lossless_sse2.c dsp
-cp $1/src/dsp/rescaler.c dsp
-cp $1/src/dsp/rescaler_neon.c dsp
-cp $1/src/dsp/rescaler_sse2.c dsp
-cp $1/src/dsp/upsampling.c dsp
-cp $1/src/dsp/upsampling_neon.c dsp
-cp $1/src/dsp/upsampling_sse2.c dsp
-cp $1/src/dsp/upsampling_sse41.c dsp
-cp $1/src/dsp/yuv.c dsp
-cp $1/src/dsp/yuv_neon.c dsp
-cp $1/src/dsp/yuv_sse2.c dsp
-cp $1/src/dsp/yuv_sse41.c dsp
+cp $1/src/dsp/*.c dsp
mkdir -p enc
cp $1/src/enc/*.h enc
+cp $1/src/enc/*.c enc
mkdir -p utils
cp $1/src/utils/*.h utils
-cp $1/src/utils/bit_reader_utils.c utils
-cp $1/src/utils/color_cache_utils.c utils
-cp $1/src/utils/filters_utils.c utils
-cp $1/src/utils/huffman_utils.c utils
-cp $1/src/utils/quant_levels_dec_utils.c utils
-cp $1/src/utils/quant_levels_utils.c utils
-cp $1/src/utils/random_utils.c utils
-cp $1/src/utils/rescaler_utils.c utils
-cp $1/src/utils/thread_utils.c utils
-cp $1/src/utils/utils.c utils
+cp $1/src/utils/*.c utils
find . \( -name "*.c" -o -name "*.h" \) -exec sed -i 's/#include "src\//#include "..\//g' {} \;
diff --git a/media/libwebp/utils/bit_reader_inl_utils.h b/media/libwebp/utils/bit_reader_inl_utils.h
index 8d1249ef97..78804ed8d2 100644
--- a/media/libwebp/utils/bit_reader_inl_utils.h
+++ b/media/libwebp/utils/bit_reader_inl_utils.h
@@ -55,7 +55,7 @@ void VP8LoadFinalBytes(VP8BitReader* const br);
// makes sure br->value_ has at least BITS bits worth of data
static WEBP_UBSAN_IGNORE_UNDEF WEBP_INLINE
-void VP8LoadNewBytes(VP8BitReader* const br) {
+void VP8LoadNewBytes(VP8BitReader* WEBP_RESTRICT const br) {
assert(br != NULL && br->buf_ != NULL);
// Read 'BITS' bits at a time if possible.
if (br->buf_ < br->buf_max_) {
@@ -104,7 +104,8 @@ void VP8LoadNewBytes(VP8BitReader* const br) {
}
// Read a bit with proba 'prob'. Speed-critical function!
-static WEBP_INLINE int VP8GetBit(VP8BitReader* const br, int prob) {
+static WEBP_INLINE int VP8GetBit(VP8BitReader* WEBP_RESTRICT const br,
+ int prob, const char label[]) {
// Don't move this declaration! It makes a big speed difference to store
// 'range' *before* calling VP8LoadNewBytes(), even if this function doesn't
// alter br->range_ value.
@@ -129,13 +130,15 @@ static WEBP_INLINE int VP8GetBit(VP8BitReader* const br, int prob) {
br->bits_ -= shift;
}
br->range_ = range - 1;
+ BT_TRACK(br);
return bit;
}
}
// simplified version of VP8GetBit() for prob=0x80 (note shift is always 1 here)
static WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW WEBP_INLINE
-int VP8GetSigned(VP8BitReader* const br, int v) {
+int VP8GetSigned(VP8BitReader* WEBP_RESTRICT const br, int v,
+ const char label[]) {
if (br->bits_ < 0) {
VP8LoadNewBytes(br);
}
@@ -148,11 +151,13 @@ int VP8GetSigned(VP8BitReader* const br, int v) {
br->range_ += mask;
br->range_ |= 1;
br->value_ -= (bit_t)((split + 1) & mask) << pos;
+ BT_TRACK(br);
return (v ^ mask) - mask;
}
}
-static WEBP_INLINE int VP8GetBitAlt(VP8BitReader* const br, int prob) {
+static WEBP_INLINE int VP8GetBitAlt(VP8BitReader* WEBP_RESTRICT const br,
+ int prob, const char label[]) {
// Don't move this declaration! It makes a big speed difference to store
// 'range' *before* calling VP8LoadNewBytes(), even if this function doesn't
// alter br->range_ value.
@@ -179,6 +184,7 @@ static WEBP_INLINE int VP8GetBitAlt(VP8BitReader* const br, int prob) {
br->bits_ -= shift;
}
br->range_ = range;
+ BT_TRACK(br);
return bit;
}
}
diff --git a/media/libwebp/utils/bit_reader_utils.c b/media/libwebp/utils/bit_reader_utils.c
index a7cb193bde..1001ec445d 100644
--- a/media/libwebp/utils/bit_reader_utils.c
+++ b/media/libwebp/utils/bit_reader_utils.c
@@ -102,17 +102,18 @@ void VP8LoadFinalBytes(VP8BitReader* const br) {
//------------------------------------------------------------------------------
// Higher-level calls
-uint32_t VP8GetValue(VP8BitReader* const br, int bits) {
+uint32_t VP8GetValue(VP8BitReader* const br, int bits, const char label[]) {
uint32_t v = 0;
while (bits-- > 0) {
- v |= VP8GetBit(br, 0x80) << bits;
+ v |= VP8GetBit(br, 0x80, label) << bits;
}
return v;
}
-int32_t VP8GetSignedValue(VP8BitReader* const br, int bits) {
- const int value = VP8GetValue(br, bits);
- return VP8Get(br) ? -value : value;
+int32_t VP8GetSignedValue(VP8BitReader* const br, int bits,
+ const char label[]) {
+ const int value = VP8GetValue(br, bits, label);
+ return VP8Get(br, label) ? -value : value;
}
//------------------------------------------------------------------------------
@@ -220,3 +221,78 @@ uint32_t VP8LReadBits(VP8LBitReader* const br, int n_bits) {
}
//------------------------------------------------------------------------------
+// Bit-tracing tool
+
+#if (BITTRACE > 0)
+
+#include <stdlib.h> // for atexit()
+#include <stdio.h>
+#include <string.h>
+
+#define MAX_NUM_LABELS 32
+static struct {
+ const char* label;
+ int size;
+ int count;
+} kLabels[MAX_NUM_LABELS];
+
+static int last_label = 0;
+static int last_pos = 0;
+static const uint8_t* buf_start = NULL;
+static int init_done = 0;
+
+static void PrintBitTraces(void) {
+ int i;
+ int scale = 1;
+ int total = 0;
+ const char* units = "bits";
+#if (BITTRACE == 2)
+ scale = 8;
+ units = "bytes";
+#endif
+ for (i = 0; i < last_label; ++i) total += kLabels[i].size;
+ if (total < 1) total = 1; // avoid rounding errors
+ printf("=== Bit traces ===\n");
+ for (i = 0; i < last_label; ++i) {
+ const int skip = 16 - (int)strlen(kLabels[i].label);
+ const int value = (kLabels[i].size + scale - 1) / scale;
+ assert(skip > 0);
+ printf("%s \%*s: %6d %s \t[%5.2f%%] [count: %7d]\n",
+ kLabels[i].label, skip, "", value, units,
+ 100.f * kLabels[i].size / total,
+ kLabels[i].count);
+ }
+ total = (total + scale - 1) / scale;
+ printf("Total: %d %s\n", total, units);
+}
+
+void BitTrace(const struct VP8BitReader* const br, const char label[]) {
+ int i, pos;
+ if (!init_done) {
+ memset(kLabels, 0, sizeof(kLabels));
+ atexit(PrintBitTraces);
+ buf_start = br->buf_;
+ init_done = 1;
+ }
+ pos = (int)(br->buf_ - buf_start) * 8 - br->bits_;
+ // If the jump is too large, we've changed partitions -> reset the counter.
+ if (abs(pos - last_pos) > 32) {
+ buf_start = br->buf_;
+ pos = 0;
+ last_pos = 0;
+ }
+ if (br->range_ >= 0x7f) pos += kVP8Log2Range[br->range_ - 0x7f];
+ for (i = 0; i < last_label; ++i) {
+ if (!strcmp(label, kLabels[i].label)) break;
+ }
+ if (i == MAX_NUM_LABELS) abort(); // overflow!
+ kLabels[i].label = label;
+ kLabels[i].size += pos - last_pos;
+ kLabels[i].count += 1;
+ if (i == last_label) ++last_label;
+ last_pos = pos;
+}
+
+#endif // BITTRACE > 0
+
+//------------------------------------------------------------------------------
diff --git a/media/libwebp/utils/bit_reader_utils.h b/media/libwebp/utils/bit_reader_utils.h
index 377a7821ad..2df9c417cf 100644
--- a/media/libwebp/utils/bit_reader_utils.h
+++ b/media/libwebp/utils/bit_reader_utils.h
@@ -21,6 +21,27 @@
#endif
#include "../webp/types.h"
+// Warning! This macro triggers quite a bit of macro wizardry around the
+// function signatures!
+#if !defined(BITTRACE)
+#define BITTRACE 0 // 0 = off, 1 = print bits, 2 = print bytes
+#endif
+
+#if (BITTRACE > 0)
+struct VP8BitReader;
+extern void BitTrace(const struct VP8BitReader* const br, const char label[]);
+#define BT_TRACK(br) BitTrace(br, label)
+#define VP8Get(BR, L) VP8GetValue(BR, 1, L)
+#else
+#define BT_TRACK(br)
+// We'll REMOVE the 'const char label[]' from all signatures and calls (!!):
+#define VP8GetValue(BR, N, L) VP8GetValue(BR, N)
+#define VP8Get(BR, L) VP8GetValue(BR, 1, L)
+#define VP8GetSignedValue(BR, N, L) VP8GetSignedValue(BR, N)
+#define VP8GetBit(BR, P, L) VP8GetBit(BR, P)
+#define VP8GetBitAlt(BR, P, L) VP8GetBitAlt(BR, P)
+#define VP8GetSigned(BR, V, L) VP8GetSigned(BR, V)
+#endif
+
#ifdef __cplusplus
extern "C" {
#endif
@@ -92,17 +113,15 @@ void VP8BitReaderSetBuffer(VP8BitReader* const br,
void VP8RemapBitReader(VP8BitReader* const br, ptrdiff_t offset);
// return the next value made of 'num_bits' bits
-uint32_t VP8GetValue(VP8BitReader* const br, int num_bits);
-static WEBP_INLINE uint32_t VP8Get(VP8BitReader* const br) {
- return VP8GetValue(br, 1);
-}
+uint32_t VP8GetValue(VP8BitReader* const br, int num_bits, const char label[]);
// return the next value with sign-extension.
-int32_t VP8GetSignedValue(VP8BitReader* const br, int num_bits);
+int32_t VP8GetSignedValue(VP8BitReader* const br, int num_bits,
+ const char label[]);
// bit_reader_inl.h will implement the following methods:
-// static WEBP_INLINE int VP8GetBit(VP8BitReader* const br, int prob)
-// static WEBP_INLINE int VP8GetSigned(VP8BitReader* const br, int v)
+// static WEBP_INLINE int VP8GetBit(VP8BitReader* const br, int prob, ...)
+// static WEBP_INLINE int VP8GetSigned(VP8BitReader* const br, int v, ...)
// and should be included by the .c files that actually need them.
// This is to avoid recompiling the whole library whenever this file is touched,
// and also allowing platform-specific ad-hoc hacks.
diff --git a/media/libwebp/utils/bit_writer_utils.c b/media/libwebp/utils/bit_writer_utils.c
new file mode 100644
index 0000000000..37a63946c4
--- /dev/null
+++ b/media/libwebp/utils/bit_writer_utils.c
@@ -0,0 +1,347 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Bit writing and boolean coder
+//
+// Author: Skal (pascal.massimino@gmail.com)
+// Vikas Arora (vikaas.arora@gmail.com)
+
+#include <assert.h>
+#include <string.h> // for memcpy()
+#include <stdlib.h>
+
+#include "../utils/bit_writer_utils.h"
+#include "../utils/endian_inl_utils.h"
+#include "../utils/utils.h"
+
+//------------------------------------------------------------------------------
+// VP8BitWriter
+
+static int BitWriterResize(VP8BitWriter* const bw, size_t extra_size) {
+ uint8_t* new_buf;
+ size_t new_size;
+ const uint64_t needed_size_64b = (uint64_t)bw->pos_ + extra_size;
+ const size_t needed_size = (size_t)needed_size_64b;
+ if (needed_size_64b != needed_size) {
+ bw->error_ = 1;
+ return 0;
+ }
+ if (needed_size <= bw->max_pos_) return 1;
+ // If the following line wraps over 32bit, the test just after will catch it.
+ new_size = 2 * bw->max_pos_;
+ if (new_size < needed_size) new_size = needed_size;
+ if (new_size < 1024) new_size = 1024;
+ new_buf = (uint8_t*)WebPSafeMalloc(1ULL, new_size);
+ if (new_buf == NULL) {
+ bw->error_ = 1;
+ return 0;
+ }
+ if (bw->pos_ > 0) {
+ assert(bw->buf_ != NULL);
+ memcpy(new_buf, bw->buf_, bw->pos_);
+ }
+ WebPSafeFree(bw->buf_);
+ bw->buf_ = new_buf;
+ bw->max_pos_ = new_size;
+ return 1;
+}
+
+static void Flush(VP8BitWriter* const bw) {
+ const int s = 8 + bw->nb_bits_;
+ const int32_t bits = bw->value_ >> s;
+ assert(bw->nb_bits_ >= 0);
+ bw->value_ -= bits << s;
+ bw->nb_bits_ -= 8;
+ if ((bits & 0xff) != 0xff) {
+ size_t pos = bw->pos_;
+ if (!BitWriterResize(bw, bw->run_ + 1)) {
+ return;
+ }
+ if (bits & 0x100) { // overflow -> propagate carry over pending 0xff's
+ if (pos > 0) bw->buf_[pos - 1]++;
+ }
+ if (bw->run_ > 0) {
+ const int value = (bits & 0x100) ? 0x00 : 0xff;
+ for (; bw->run_ > 0; --bw->run_) bw->buf_[pos++] = value;
+ }
+ bw->buf_[pos++] = bits & 0xff;
+ bw->pos_ = pos;
+ } else {
+ bw->run_++; // delay writing the 0xff bytes until any carry is resolved.
+ }
+}
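
Why Flush() buffers 0xff bytes: a later carry can still change them. A worked example with illustrative bytes, not from a real stream:

    /*
     * Suppose 0x41 has been written and two 0xff bytes are pending (run_ == 2).
     * If the next flushed byte carries (bits & 0x100), the output becomes
     *   0x42 0x00 0x00 <new byte>   (the carry bumps 0x41; the pending 0xff's
     *                                become 0x00)
     * and without a carry it becomes
     *   0x41 0xff 0xff <new byte>.
     */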
+
+//------------------------------------------------------------------------------
+// renormalization
+
+static const uint8_t kNorm[128] = { // renorm_sizes[i] = 8 - log2(i)
+ 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 0
+};
+
+// kNewRange[range] = ((range + 1) << kNorm[range]) - 1
+static const uint8_t kNewRange[128] = {
+ 127, 127, 191, 127, 159, 191, 223, 127, 143, 159, 175, 191, 207, 223, 239,
+ 127, 135, 143, 151, 159, 167, 175, 183, 191, 199, 207, 215, 223, 231, 239,
+ 247, 127, 131, 135, 139, 143, 147, 151, 155, 159, 163, 167, 171, 175, 179,
+ 183, 187, 191, 195, 199, 203, 207, 211, 215, 219, 223, 227, 231, 235, 239,
+ 243, 247, 251, 127, 129, 131, 133, 135, 137, 139, 141, 143, 145, 147, 149,
+ 151, 153, 155, 157, 159, 161, 163, 165, 167, 169, 171, 173, 175, 177, 179,
+ 181, 183, 185, 187, 189, 191, 193, 195, 197, 199, 201, 203, 205, 207, 209,
+ 211, 213, 215, 217, 219, 221, 223, 225, 227, 229, 231, 233, 235, 237, 239,
+ 241, 243, 245, 247, 249, 251, 253, 127
+};
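
A sanity-check sketch tying the two tables together: the renormalized range must satisfy the identity stated above for every range value the boolean coder can produce:

    #include <assert.h>

    static void CheckRenormTables(void) {
      int r;
      for (r = 0; r < 128; ++r) {
        assert(kNewRange[r] == (((r + 1) << kNorm[r]) - 1));
      }
    }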
+
+int VP8PutBit(VP8BitWriter* const bw, int bit, int prob) {
+ const int split = (bw->range_ * prob) >> 8;
+ if (bit) {
+ bw->value_ += split + 1;
+ bw->range_ -= split + 1;
+ } else {
+ bw->range_ = split;
+ }
+ if (bw->range_ < 127) { // emit 'shift' bits out and renormalize
+ const int shift = kNorm[bw->range_];
+ bw->range_ = kNewRange[bw->range_];
+ bw->value_ <<= shift;
+ bw->nb_bits_ += shift;
+ if (bw->nb_bits_ > 0) Flush(bw);
+ }
+ return bit;
+}
+
+int VP8PutBitUniform(VP8BitWriter* const bw, int bit) {
+ const int split = bw->range_ >> 1;
+ if (bit) {
+ bw->value_ += split + 1;
+ bw->range_ -= split + 1;
+ } else {
+ bw->range_ = split;
+ }
+ if (bw->range_ < 127) {
+ bw->range_ = kNewRange[bw->range_];
+ bw->value_ <<= 1;
+ bw->nb_bits_ += 1;
+ if (bw->nb_bits_ > 0) Flush(bw);
+ }
+ return bit;
+}
+
+void VP8PutBits(VP8BitWriter* const bw, uint32_t value, int nb_bits) {
+ uint32_t mask;
+ assert(nb_bits > 0 && nb_bits < 32);
+ for (mask = 1u << (nb_bits - 1); mask; mask >>= 1) {
+ VP8PutBitUniform(bw, value & mask);
+ }
+}
+
+void VP8PutSignedBits(VP8BitWriter* const bw, int value, int nb_bits) {
+ if (!VP8PutBitUniform(bw, value != 0)) return;
+ if (value < 0) {
+ VP8PutBits(bw, ((-value) << 1) | 1, nb_bits + 1);
+ } else {
+ VP8PutBits(bw, value << 1, nb_bits + 1);
+ }
+}
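
VP8PutSignedBits() above writes one uniform "non-zero" flag, then the magnitude followed by a trailing sign bit, in nb_bits + 1 bits. A sketch of the inverse mapping on that (nb_bits + 1)-bit word, for illustration only (not the real decoder):

    #include <stdint.h>

    static int DecodeSignedSketch(uint32_t coded, int nb_bits) {
      const int sign = (int)(coded & 1);  /* trailing bit is the sign */
      const int magnitude = (int)((coded >> 1) & ((1u << nb_bits) - 1));
      return sign ? -magnitude : magnitude;
    }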
+
+//------------------------------------------------------------------------------
+
+int VP8BitWriterInit(VP8BitWriter* const bw, size_t expected_size) {
+ bw->range_ = 255 - 1;
+ bw->value_ = 0;
+ bw->run_ = 0;
+ bw->nb_bits_ = -8;
+ bw->pos_ = 0;
+ bw->max_pos_ = 0;
+ bw->error_ = 0;
+ bw->buf_ = NULL;
+ return (expected_size > 0) ? BitWriterResize(bw, expected_size) : 1;
+}
+
+uint8_t* VP8BitWriterFinish(VP8BitWriter* const bw) {
+ VP8PutBits(bw, 0, 9 - bw->nb_bits_);
+ bw->nb_bits_ = 0; // pad with zeroes
+ Flush(bw);
+ return bw->buf_;
+}
+
+int VP8BitWriterAppend(VP8BitWriter* const bw,
+ const uint8_t* data, size_t size) {
+ assert(data != NULL);
+ if (bw->nb_bits_ != -8) return 0; // Flush() must have been called
+ if (!BitWriterResize(bw, size)) return 0;
+ memcpy(bw->buf_ + bw->pos_, data, size);
+ bw->pos_ += size;
+ return 1;
+}
+
+void VP8BitWriterWipeOut(VP8BitWriter* const bw) {
+ if (bw != NULL) {
+ WebPSafeFree(bw->buf_);
+ memset(bw, 0, sizeof(*bw));
+ }
+}
+
+//------------------------------------------------------------------------------
+// VP8LBitWriter
+
+// This is the minimum amount by which the memory buffer is guaranteed to
+// grow when extra space is needed.
+#define MIN_EXTRA_SIZE (32768ULL)
+
+// Returns 1 on success.
+static int VP8LBitWriterResize(VP8LBitWriter* const bw, size_t extra_size) {
+ uint8_t* allocated_buf;
+ size_t allocated_size;
+ const size_t max_bytes = bw->end_ - bw->buf_;
+ const size_t current_size = bw->cur_ - bw->buf_;
+ const uint64_t size_required_64b = (uint64_t)current_size + extra_size;
+ const size_t size_required = (size_t)size_required_64b;
+ if (size_required != size_required_64b) {
+ bw->error_ = 1;
+ return 0;
+ }
+ if (max_bytes > 0 && size_required <= max_bytes) return 1;
+ allocated_size = (3 * max_bytes) >> 1;
+ if (allocated_size < size_required) allocated_size = size_required;
+ // make the allocated size a multiple of 1k
+ allocated_size = (((allocated_size >> 10) + 1) << 10);
+ allocated_buf = (uint8_t*)WebPSafeMalloc(1ULL, allocated_size);
+ if (allocated_buf == NULL) {
+ bw->error_ = 1;
+ return 0;
+ }
+ if (current_size > 0) {
+ memcpy(allocated_buf, bw->buf_, current_size);
+ }
+ WebPSafeFree(bw->buf_);
+ bw->buf_ = allocated_buf;
+ bw->cur_ = bw->buf_ + current_size;
+ bw->end_ = bw->buf_ + allocated_size;
+ return 1;
+}
+
+int VP8LBitWriterInit(VP8LBitWriter* const bw, size_t expected_size) {
+ memset(bw, 0, sizeof(*bw));
+ return VP8LBitWriterResize(bw, expected_size);
+}
+
+int VP8LBitWriterClone(const VP8LBitWriter* const src,
+ VP8LBitWriter* const dst) {
+ const size_t current_size = src->cur_ - src->buf_;
+ assert(src->cur_ >= src->buf_ && src->cur_ <= src->end_);
+ if (!VP8LBitWriterResize(dst, current_size)) return 0;
+ memcpy(dst->buf_, src->buf_, current_size);
+ dst->bits_ = src->bits_;
+ dst->used_ = src->used_;
+ dst->error_ = src->error_;
+ dst->cur_ = dst->buf_ + current_size;
+ return 1;
+}
+
+void VP8LBitWriterWipeOut(VP8LBitWriter* const bw) {
+ if (bw != NULL) {
+ WebPSafeFree(bw->buf_);
+ memset(bw, 0, sizeof(*bw));
+ }
+}
+
+void VP8LBitWriterReset(const VP8LBitWriter* const bw_init,
+ VP8LBitWriter* const bw) {
+ bw->bits_ = bw_init->bits_;
+ bw->used_ = bw_init->used_;
+ bw->cur_ = bw->buf_ + (bw_init->cur_ - bw_init->buf_);
+ assert(bw->cur_ <= bw->end_);
+ bw->error_ = bw_init->error_;
+}
+
+void VP8LBitWriterSwap(VP8LBitWriter* const src, VP8LBitWriter* const dst) {
+ const VP8LBitWriter tmp = *src;
+ *src = *dst;
+ *dst = tmp;
+}
+
+void VP8LPutBitsFlushBits(VP8LBitWriter* const bw) {
+ // If needed, make some room by flushing some bits out.
+ if (bw->cur_ + VP8L_WRITER_BYTES > bw->end_) {
+ const uint64_t extra_size = (bw->end_ - bw->buf_) + MIN_EXTRA_SIZE;
+ if (!CheckSizeOverflow(extra_size) ||
+ !VP8LBitWriterResize(bw, (size_t)extra_size)) {
+ bw->cur_ = bw->buf_;
+ bw->error_ = 1;
+ return;
+ }
+ }
+ *(vp8l_wtype_t*)bw->cur_ = (vp8l_wtype_t)WSWAP((vp8l_wtype_t)bw->bits_);
+ bw->cur_ += VP8L_WRITER_BYTES;
+ bw->bits_ >>= VP8L_WRITER_BITS;
+ bw->used_ -= VP8L_WRITER_BITS;
+}
+
+void VP8LPutBitsInternal(VP8LBitWriter* const bw, uint32_t bits, int n_bits) {
+ assert(n_bits <= 32);
+ // That's the max we can handle:
+ assert(sizeof(vp8l_wtype_t) == 2);
+ if (n_bits > 0) {
+ vp8l_atype_t lbits = bw->bits_;
+ int used = bw->used_;
+ // Special case of overflow handling for 32bit accumulator (2-step flush).
+#if VP8L_WRITER_BITS == 16
+ if (used + n_bits >= VP8L_WRITER_MAX_BITS) {
+ // Fill up all the VP8L_WRITER_MAX_BITS so it can be flushed out below.
+ const int shift = VP8L_WRITER_MAX_BITS - used;
+ lbits |= (vp8l_atype_t)bits << used;
+ used = VP8L_WRITER_MAX_BITS;
+ n_bits -= shift;
+ bits >>= shift;
+ assert(n_bits <= VP8L_WRITER_MAX_BITS);
+ }
+#endif
+ // If needed, make some room by flushing some bits out.
+ while (used >= VP8L_WRITER_BITS) {
+ if (bw->cur_ + VP8L_WRITER_BYTES > bw->end_) {
+ const uint64_t extra_size = (bw->end_ - bw->buf_) + MIN_EXTRA_SIZE;
+ if (!CheckSizeOverflow(extra_size) ||
+ !VP8LBitWriterResize(bw, (size_t)extra_size)) {
+ bw->cur_ = bw->buf_;
+ bw->error_ = 1;
+ return;
+ }
+ }
+ *(vp8l_wtype_t*)bw->cur_ = (vp8l_wtype_t)WSWAP((vp8l_wtype_t)lbits);
+ bw->cur_ += VP8L_WRITER_BYTES;
+ lbits >>= VP8L_WRITER_BITS;
+ used -= VP8L_WRITER_BITS;
+ }
+ bw->bits_ = lbits | ((vp8l_atype_t)bits << used);
+ bw->used_ = used + n_bits;
+ }
+}
+
+uint8_t* VP8LBitWriterFinish(VP8LBitWriter* const bw) {
+ // flush leftover bits
+ if (VP8LBitWriterResize(bw, (bw->used_ + 7) >> 3)) {
+ while (bw->used_ > 0) {
+ *bw->cur_++ = (uint8_t)bw->bits_;
+ bw->bits_ >>= 8;
+ bw->used_ -= 8;
+ }
+ bw->used_ = 0;
+ }
+ return bw->buf_;
+}
+
+//------------------------------------------------------------------------------
diff --git a/media/libwebp/utils/color_cache_utils.c b/media/libwebp/utils/color_cache_utils.c
index c5eb0d8a90..c9e212df53 100644
--- a/media/libwebp/utils/color_cache_utils.c
+++ b/media/libwebp/utils/color_cache_utils.c
@@ -20,22 +20,22 @@
//------------------------------------------------------------------------------
// VP8LColorCache.
-int VP8LColorCacheInit(VP8LColorCache* const cc, int hash_bits) {
+int VP8LColorCacheInit(VP8LColorCache* const color_cache, int hash_bits) {
const int hash_size = 1 << hash_bits;
- assert(cc != NULL);
+ assert(color_cache != NULL);
assert(hash_bits > 0);
- cc->colors_ = (uint32_t*)WebPSafeCalloc((uint64_t)hash_size,
- sizeof(*cc->colors_));
- if (cc->colors_ == NULL) return 0;
- cc->hash_shift_ = 32 - hash_bits;
- cc->hash_bits_ = hash_bits;
+ color_cache->colors_ = (uint32_t*)WebPSafeCalloc(
+ (uint64_t)hash_size, sizeof(*color_cache->colors_));
+ if (color_cache->colors_ == NULL) return 0;
+ color_cache->hash_shift_ = 32 - hash_bits;
+ color_cache->hash_bits_ = hash_bits;
return 1;
}
-void VP8LColorCacheClear(VP8LColorCache* const cc) {
- if (cc != NULL) {
- WebPSafeFree(cc->colors_);
- cc->colors_ = NULL;
+void VP8LColorCacheClear(VP8LColorCache* const color_cache) {
+ if (color_cache != NULL) {
+ WebPSafeFree(color_cache->colors_);
+ color_cache->colors_ = NULL;
}
}
diff --git a/media/libwebp/utils/color_cache_utils.h b/media/libwebp/utils/color_cache_utils.h
index c46131277d..34e9e68795 100644
--- a/media/libwebp/utils/color_cache_utils.h
+++ b/media/libwebp/utils/color_cache_utils.h
@@ -17,6 +17,7 @@
#include <assert.h>
+#include "../dsp/dsp.h"
#include "../webp/types.h"
#ifdef __cplusplus
@@ -25,15 +26,16 @@ extern "C" {
// Main color cache struct.
typedef struct {
- uint32_t *colors_; // color entries
+ uint32_t* colors_; // color entries
int hash_shift_; // Hash shift: 32 - hash_bits_.
int hash_bits_;
} VP8LColorCache;
-static const uint64_t kHashMul = 0x1e35a7bdull;
+static const uint32_t kHashMul = 0x1e35a7bdu;
-static WEBP_INLINE int VP8LHashPix(uint32_t argb, int shift) {
- return (int)(((argb * kHashMul) & 0xffffffffu) >> shift);
+static WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW WEBP_INLINE
+int VP8LHashPix(uint32_t argb, int shift) {
+ return (int)((argb * kHashMul) >> shift);
}
static WEBP_INLINE uint32_t VP8LColorCacheLookup(
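
The color cache is a direct-mapped table indexed by this multiplicative hash of the ARGB value; the unsigned multiply intentionally wraps, hence the new UBSan annotation above. A toy usage sketch (a hash_bits of 10 is an arbitrary illustrative choice):

    #include <stdint.h>

    static uint32_t toy_colors[1 << 10];  /* hash_bits == 10 */

    static void ToyCacheInsert(uint32_t argb) {
      const int key = (int)((argb * 0x1e35a7bdu) >> (32 - 10));
      toy_colors[key] = argb;  /* a later lookup with the same key returns argb */
    }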
diff --git a/media/libwebp/utils/huffman_encode_utils.c b/media/libwebp/utils/huffman_encode_utils.c
new file mode 100644
index 0000000000..8219cfc168
--- /dev/null
+++ b/media/libwebp/utils/huffman_encode_utils.c
@@ -0,0 +1,416 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Author: Jyrki Alakuijala (jyrki@google.com)
+//
+// Entropy encoding (Huffman) for webp lossless.
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+#include "../utils/huffman_encode_utils.h"
+#include "../utils/utils.h"
+#include "../webp/format_constants.h"
+
+// -----------------------------------------------------------------------------
+// Util function to optimize the symbol map for RLE coding
+
+// Heuristics for selecting the stride ranges to collapse.
+static int ValuesShouldBeCollapsedToStrideAverage(int a, int b) {
+ return abs(a - b) < 4;
+}
+
+// Change the population counts so that the subsequent Huffman tree
+// compression, especially its RLE part, gives smaller output.
+static void OptimizeHuffmanForRle(int length, uint8_t* const good_for_rle,
+ uint32_t* const counts) {
+ // 1) Let's make the Huffman code more compatible with rle encoding.
+ int i;
+ for (; length >= 0; --length) {
+ if (length == 0) {
+ return; // All zeros.
+ }
+ if (counts[length - 1] != 0) {
+ // Now counts[0..length - 1] does not have trailing zeros.
+ break;
+ }
+ }
+ // 2) Let's mark all population counts that already can be encoded
+ // with an rle code.
+ {
+ // Let's not spoil any of the existing good rle codes.
+ // Mark any seq of 0's that is longer than 5 as good_for_rle.
+ // Mark any seq of non-0's that is longer than 7 as good_for_rle.
+ uint32_t symbol = counts[0];
+ int stride = 0;
+ for (i = 0; i < length + 1; ++i) {
+ if (i == length || counts[i] != symbol) {
+ if ((symbol == 0 && stride >= 5) ||
+ (symbol != 0 && stride >= 7)) {
+ int k;
+ for (k = 0; k < stride; ++k) {
+ good_for_rle[i - k - 1] = 1;
+ }
+ }
+ stride = 1;
+ if (i != length) {
+ symbol = counts[i];
+ }
+ } else {
+ ++stride;
+ }
+ }
+ }
+ // 3) Let's replace those population counts that lead to more rle codes.
+ {
+ uint32_t stride = 0;
+ uint32_t limit = counts[0];
+ uint32_t sum = 0;
+ for (i = 0; i < length + 1; ++i) {
+ if (i == length || good_for_rle[i] ||
+ (i != 0 && good_for_rle[i - 1]) ||
+ !ValuesShouldBeCollapsedToStrideAverage(counts[i], limit)) {
+ if (stride >= 4 || (stride >= 3 && sum == 0)) {
+ uint32_t k;
+ // The stride must end, collapse what we have, if we have enough (4).
+ uint32_t count = (sum + stride / 2) / stride;
+ if (count < 1) {
+ count = 1;
+ }
+ if (sum == 0) {
+ // Don't let an all-zeros stride be upgraded to ones.
+ count = 0;
+ }
+ for (k = 0; k < stride; ++k) {
+ // We don't want to change the value at counts[i], which already
+ // belongs to the next stride. Thus the '- 1'.
+ counts[i - k - 1] = count;
+ }
+ }
+ stride = 0;
+ sum = 0;
+ if (i < length - 3) {
+ // All interesting strides have a count of at least 4,
+ // at least when non-zero.
+ limit = (counts[i] + counts[i + 1] +
+ counts[i + 2] + counts[i + 3] + 2) / 4;
+ } else if (i < length) {
+ limit = counts[i];
+ } else {
+ limit = 0;
+ }
+ }
+ ++stride;
+ if (i != length) {
+ sum += counts[i];
+ if (stride >= 4) {
+ limit = (sum + stride / 2) / stride;
+ }
+ }
+ }
+ }
+}
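
Effect of stage 3 above on a toy histogram: a stride of similar counts collapses to its rounded average so the later RLE pass can emit a single repeat code. Illustrative numbers only:

    /*
     *   before: 10 11 9 10 10 | 80    (five similar counts, then a distinct one)
     *   after:  10 10 10 10 10 | 80   (count = (50 + 5/2) / 5 == 10)
     */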
+
+// A comparer function for two Huffman trees: sorts first by 'total count'
+// (more comes first), and then by 'value' (smaller comes first).
+static int CompareHuffmanTrees(const void* ptr1, const void* ptr2) {
+ const HuffmanTree* const t1 = (const HuffmanTree*)ptr1;
+ const HuffmanTree* const t2 = (const HuffmanTree*)ptr2;
+ if (t1->total_count_ > t2->total_count_) {
+ return -1;
+ } else if (t1->total_count_ < t2->total_count_) {
+ return 1;
+ } else {
+ assert(t1->value_ != t2->value_);
+ return (t1->value_ < t2->value_) ? -1 : 1;
+ }
+}
+
+static void SetBitDepths(const HuffmanTree* const tree,
+ const HuffmanTree* const pool,
+ uint8_t* const bit_depths, int level) {
+ if (tree->pool_index_left_ >= 0) {
+ SetBitDepths(&pool[tree->pool_index_left_], pool, bit_depths, level + 1);
+ SetBitDepths(&pool[tree->pool_index_right_], pool, bit_depths, level + 1);
+ } else {
+ bit_depths[tree->value_] = level;
+ }
+}
+
+// Create an optimal Huffman tree.
+//
+// (histogram, histogram_size): population counts.
+// tree_depth_limit: maximum bit depth (inclusive) of the codes.
+// bit_depths[]: how many bits are used for each symbol.
+//
+// The catch here is that the tree cannot be arbitrarily deep.
+//
+// count_min is the value that is faked as the minimum histogram count,
+// and this minimum is raised (doubled) until the resulting tree meets
+// the maximum depth requirement.
+//
+// This algorithm does not perform well for very long data blocks,
+// especially when population counts exceed 2**tree_depth_limit, but
+// we are not planning to use this with extremely long blocks.
+//
+// See https://en.wikipedia.org/wiki/Huffman_coding
+static void GenerateOptimalTree(const uint32_t* const histogram,
+ int histogram_size,
+ HuffmanTree* tree, int tree_depth_limit,
+ uint8_t* const bit_depths) {
+ uint32_t count_min;
+ HuffmanTree* tree_pool;
+ int tree_size_orig = 0;
+ int i;
+
+ for (i = 0; i < histogram_size; ++i) {
+ if (histogram[i] != 0) {
+ ++tree_size_orig;
+ }
+ }
+
+ if (tree_size_orig == 0) { // pretty optimal already!
+ return;
+ }
+
+ tree_pool = tree + tree_size_orig;
+
+ // For block sizes with fewer than 64k symbols we never need to do a
+ // second iteration of this loop.
+ // If we actually start running inside this loop a lot, we would perhaps
+ // be better off with the Katajainen algorithm.
+ assert(tree_size_orig <= (1 << (tree_depth_limit - 1)));
+ for (count_min = 1; ; count_min *= 2) {
+ int tree_size = tree_size_orig;
+ // We need to pack the Huffman tree in tree_depth_limit bits.
+ // So, we try by faking histogram entries to be at least 'count_min'.
+ int idx = 0;
+ int j;
+ for (j = 0; j < histogram_size; ++j) {
+ if (histogram[j] != 0) {
+ const uint32_t count =
+ (histogram[j] < count_min) ? count_min : histogram[j];
+ tree[idx].total_count_ = count;
+ tree[idx].value_ = j;
+ tree[idx].pool_index_left_ = -1;
+ tree[idx].pool_index_right_ = -1;
+ ++idx;
+ }
+ }
+
+ // Build the Huffman tree.
+ qsort(tree, tree_size, sizeof(*tree), CompareHuffmanTrees);
+
+ if (tree_size > 1) { // Normal case.
+ int tree_pool_size = 0;
+ while (tree_size > 1) { // Finish when we have only one root.
+ uint32_t count;
+ tree_pool[tree_pool_size++] = tree[tree_size - 1];
+ tree_pool[tree_pool_size++] = tree[tree_size - 2];
+ count = tree_pool[tree_pool_size - 1].total_count_ +
+ tree_pool[tree_pool_size - 2].total_count_;
+ tree_size -= 2;
+ {
+ // Search for the insertion point.
+ int k;
+ for (k = 0; k < tree_size; ++k) {
+ if (tree[k].total_count_ <= count) {
+ break;
+ }
+ }
+ memmove(tree + (k + 1), tree + k, (tree_size - k) * sizeof(*tree));
+ tree[k].total_count_ = count;
+ tree[k].value_ = -1;
+
+ tree[k].pool_index_left_ = tree_pool_size - 1;
+ tree[k].pool_index_right_ = tree_pool_size - 2;
+ tree_size = tree_size + 1;
+ }
+ }
+ SetBitDepths(&tree[0], tree_pool, bit_depths, 0);
+ } else if (tree_size == 1) { // Trivial case: only one element.
+ bit_depths[tree[0].value_] = 1;
+ }
+
+ {
+ // Test if this Huffman tree satisfies our 'tree_depth_limit' criteria.
+ int max_depth = bit_depths[0];
+ for (j = 1; j < histogram_size; ++j) {
+ if (max_depth < bit_depths[j]) {
+ max_depth = bit_depths[j];
+ }
+ }
+ if (max_depth <= tree_depth_limit) {
+ break;
+ }
+ }
+ }
+}
+
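A worked pass through the count_min loop (hypothetical histogram):

    /* histogram = {1, 1, 2, 4, 8, 16} yields unbounded code depths
     * {5, 5, 4, 3, 2, 1}. With tree_depth_limit = 4 the first pass
     * fails (max depth 5); the retry with count_min = 2 clamps the
     * histogram to {2, 2, 2, 4, 8, 16}, whose tree has depths
     * {4, 4, 4, 4, 2, 1} and satisfies the limit, so the loop stops. */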
+// -----------------------------------------------------------------------------
+// Coding of the Huffman tree values
+
+static HuffmanTreeToken* CodeRepeatedValues(int repetitions,
+ HuffmanTreeToken* tokens,
+ int value, int prev_value) {
+ assert(value <= MAX_ALLOWED_CODE_LENGTH);
+ if (value != prev_value) {
+ tokens->code = value;
+ tokens->extra_bits = 0;
+ ++tokens;
+ --repetitions;
+ }
+ while (repetitions >= 1) {
+ if (repetitions < 3) {
+ int i;
+ for (i = 0; i < repetitions; ++i) {
+ tokens->code = value;
+ tokens->extra_bits = 0;
+ ++tokens;
+ }
+ break;
+ } else if (repetitions < 7) {
+ tokens->code = 16;
+ tokens->extra_bits = repetitions - 3;
+ ++tokens;
+ break;
+ } else {
+ tokens->code = 16;
+ tokens->extra_bits = 3;
+ ++tokens;
+ repetitions -= 6;
+ }
+ }
+ return tokens;
+}
+
+static HuffmanTreeToken* CodeRepeatedZeros(int repetitions,
+ HuffmanTreeToken* tokens) {
+ while (repetitions >= 1) {
+ if (repetitions < 3) {
+ int i;
+ for (i = 0; i < repetitions; ++i) {
+ tokens->code = 0; // 0-value
+ tokens->extra_bits = 0;
+ ++tokens;
+ }
+ break;
+ } else if (repetitions < 11) {
+ tokens->code = 17;
+ tokens->extra_bits = repetitions - 3;
+ ++tokens;
+ break;
+ } else if (repetitions < 139) {
+ tokens->code = 18;
+ tokens->extra_bits = repetitions - 11;
+ ++tokens;
+ break;
+ } else {
+ tokens->code = 18;
+ tokens->extra_bits = 0x7f; // 138 repeated 0s
+ ++tokens;
+ repetitions -= 138;
+ }
+ }
+ return tokens;
+}
+
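Worked examples of the three repeat tokens (codes 16/17/18 follow the WebP lossless code-length alphabet):

    /* Run of 9 sevens, prev_value != 7:
     *   literal 7, code 16 / extra_bits 3 (repeat previous 3+3 = 6 times),
     *   then two literal 7s                       -> 1 + 6 + 2 = 9.
     * Run of 10 zeros:  code 17 / extra_bits 7    (3 + 7 = 10 zeros).
     * Run of 140 zeros: code 18 / extra_bits 0x7f (138 zeros), then
     *   two literal 0s. */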
+int VP8LCreateCompressedHuffmanTree(const HuffmanTreeCode* const tree,
+ HuffmanTreeToken* tokens, int max_tokens) {
+ HuffmanTreeToken* const starting_token = tokens;
+ HuffmanTreeToken* const ending_token = tokens + max_tokens;
+ const int depth_size = tree->num_symbols;
+ int prev_value = 8; // 8 is the initial value for rle.
+ int i = 0;
+ assert(tokens != NULL);
+ while (i < depth_size) {
+ const int value = tree->code_lengths[i];
+ int k = i + 1;
+ int runs;
+ while (k < depth_size && tree->code_lengths[k] == value) ++k;
+ runs = k - i;
+ if (value == 0) {
+ tokens = CodeRepeatedZeros(runs, tokens);
+ } else {
+ tokens = CodeRepeatedValues(runs, tokens, value, prev_value);
+ prev_value = value;
+ }
+ i += runs;
+ assert(tokens <= ending_token);
+ }
+ (void)ending_token; // suppress 'unused variable' warning
+ return (int)(tokens - starting_token);
+}
+
+// -----------------------------------------------------------------------------
+
+// Pre-reversed 4-bit values.
+static const uint8_t kReversedBits[16] = {
+ 0x0, 0x8, 0x4, 0xc, 0x2, 0xa, 0x6, 0xe,
+ 0x1, 0x9, 0x5, 0xd, 0x3, 0xb, 0x7, 0xf
+};
+
+static uint32_t ReverseBits(int num_bits, uint32_t bits) {
+ uint32_t retval = 0;
+ int i = 0;
+ while (i < num_bits) {
+ i += 4;
+ retval |= kReversedBits[bits & 0xf] << (MAX_ALLOWED_CODE_LENGTH + 1 - i);
+ bits >>= 4;
+ }
+ retval >>= (MAX_ALLOWED_CODE_LENGTH + 1 - num_bits);
+ return retval;
+}
+
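A worked evaluation, assuming MAX_ALLOWED_CODE_LENGTH is 15 as in format_constants.h:

    /* ReverseBits(5, 0x16):                      -- 0b10110
     *   nibble 0110 -> kReversedBits[6] = 0x6, shifted to bits [12..15];
     *   nibble 0001 -> kReversedBits[1] = 0x8, shifted to bits [8..11];
     *   retval = 0x6800, and 0x6800 >> (16 - 5) = 0b01101 = 13,
     *   the 5-bit reversal of 0b10110. */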
+// Get the actual bit values for a tree of bit depths.
+static void ConvertBitDepthsToSymbols(HuffmanTreeCode* const tree) {
+ // 0 bit-depth means that the symbol does not exist.
+ int i;
+ int len;
+ uint32_t next_code[MAX_ALLOWED_CODE_LENGTH + 1];
+ int depth_count[MAX_ALLOWED_CODE_LENGTH + 1] = { 0 };
+
+ assert(tree != NULL);
+ len = tree->num_symbols;
+ for (i = 0; i < len; ++i) {
+ const int code_length = tree->code_lengths[i];
+ assert(code_length <= MAX_ALLOWED_CODE_LENGTH);
+ ++depth_count[code_length];
+ }
+ depth_count[0] = 0; // ignore unused symbol
+ next_code[0] = 0;
+ {
+ uint32_t code = 0;
+ for (i = 1; i <= MAX_ALLOWED_CODE_LENGTH; ++i) {
+ code = (code + depth_count[i - 1]) << 1;
+ next_code[i] = code;
+ }
+ }
+ for (i = 0; i < len; ++i) {
+ const int code_length = tree->code_lengths[i];
+ tree->codes[i] = ReverseBits(code_length, next_code[code_length]++);
+ }
+}
+
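A small canonical-code trace (hypothetical lengths):

    /* code_lengths = {2, 1, 3, 3}  =>  depth_count = {0, 1, 1, 2} and
     * next_code = {-, 0, 2, 6} (first code of each length). The MSB-first
     * canonical codes are 10, 0, 110, 111; after ReverseBits(), for
     * LSB-first bit packing: 01, 0, 011, 111. */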
+// -----------------------------------------------------------------------------
+// Main entry point
+
+void VP8LCreateHuffmanTree(uint32_t* const histogram, int tree_depth_limit,
+ uint8_t* const buf_rle, HuffmanTree* const huff_tree,
+ HuffmanTreeCode* const huff_code) {
+ const int num_symbols = huff_code->num_symbols;
+ memset(buf_rle, 0, num_symbols * sizeof(*buf_rle));
+ OptimizeHuffmanForRle(num_symbols, buf_rle, histogram);
+ GenerateOptimalTree(histogram, num_symbols, huff_tree, tree_depth_limit,
+ huff_code->code_lengths);
+ // Create the actual bit codes for the bit lengths.
+ ConvertBitDepthsToSymbols(huff_code);
+}
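A minimal usage sketch of this entry point; the scratch sizing (3 * num_symbols HuffmanTree slots for leaves plus the merge pool) mirrors how the encoder side of libwebp allocates it, and the buffer names and sizes here are illustrative:

    enum { kNum = 256 };
    uint32_t histogram[kNum] = { 0 /* symbol counts; modified in place */ };
    uint8_t buf_rle[kNum];
    HuffmanTree scratch[3 * kNum];
    uint8_t lengths[kNum] = { 0 };
    uint16_t codes[kNum];
    HuffmanTreeCode code = { kNum, lengths, codes };
    VP8LCreateHuffmanTree(histogram, MAX_ALLOWED_CODE_LENGTH,
                          buf_rle, scratch, &code);
    /* code.code_lengths[] / code.codes[] now hold the canonical code. */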
diff --git a/media/libwebp/utils/huffman_encode_utils.h b/media/libwebp/utils/huffman_encode_utils.h
index 236f266e4d..892d514751 100644
--- a/media/libwebp/utils/huffman_encode_utils.h
+++ b/media/libwebp/utils/huffman_encode_utils.h
@@ -51,7 +51,7 @@ int VP8LCreateCompressedHuffmanTree(const HuffmanTreeCode* const tree,
// huffman code tree.
void VP8LCreateHuffmanTree(uint32_t* const histogram, int tree_depth_limit,
uint8_t* const buf_rle, HuffmanTree* const huff_tree,
- HuffmanTreeCode* const tree);
+ HuffmanTreeCode* const huff_code);
#ifdef __cplusplus
}
diff --git a/media/libwebp/utils/huffman_utils.c b/media/libwebp/utils/huffman_utils.c
index a2b4b3f897..8e6954f196 100644
--- a/media/libwebp/utils/huffman_utils.c
+++ b/media/libwebp/utils/huffman_utils.c
@@ -91,7 +91,8 @@ static int BuildHuffmanTable(HuffmanCode* const root_table, int root_bits,
assert(code_lengths_size != 0);
assert(code_lengths != NULL);
- assert(root_table != NULL);
+ assert((root_table != NULL && sorted != NULL) ||
+ (root_table == NULL && sorted == NULL));
assert(root_bits > 0);
// Build histogram of code lengths.
@@ -120,16 +121,22 @@ static int BuildHuffmanTable(HuffmanCode* const root_table, int root_bits,
for (symbol = 0; symbol < code_lengths_size; ++symbol) {
const int symbol_code_length = code_lengths[symbol];
if (code_lengths[symbol] > 0) {
- sorted[offset[symbol_code_length]++] = symbol;
+ if (sorted != NULL) {
+ sorted[offset[symbol_code_length]++] = symbol;
+ } else {
+ offset[symbol_code_length]++;
+ }
}
}
// Special case code with only one value.
if (offset[MAX_ALLOWED_CODE_LENGTH] == 1) {
- HuffmanCode code;
- code.bits = 0;
- code.value = (uint16_t)sorted[0];
- ReplicateValue(table, 1, total_size, code);
+ if (sorted != NULL) {
+ HuffmanCode code;
+ code.bits = 0;
+ code.value = (uint16_t)sorted[0];
+ ReplicateValue(table, 1, total_size, code);
+ }
return total_size;
}
@@ -151,6 +158,7 @@ static int BuildHuffmanTable(HuffmanCode* const root_table, int root_bits,
if (num_open < 0) {
return 0;
}
+ if (root_table == NULL) continue;
for (; count[len] > 0; --count[len]) {
HuffmanCode code;
code.bits = (uint8_t)len;
@@ -169,6 +177,7 @@ static int BuildHuffmanTable(HuffmanCode* const root_table, int root_bits,
if (num_open < 0) {
return 0;
}
+ if (root_table == NULL) continue;
for (; count[len] > 0; --count[len]) {
HuffmanCode code;
if ((key & mask) != low) {
@@ -206,7 +215,10 @@ int VP8LBuildHuffmanTable(HuffmanCode* const root_table, int root_bits,
const int code_lengths[], int code_lengths_size) {
int total_size;
assert(code_lengths_size <= MAX_CODE_LENGTHS_SIZE);
- if (code_lengths_size <= SORTED_SIZE_CUTOFF) {
+ if (root_table == NULL) {
+ total_size = BuildHuffmanTable(NULL, root_bits,
+ code_lengths, code_lengths_size, NULL);
+ } else if (code_lengths_size <= SORTED_SIZE_CUTOFF) {
// use local stack-allocated array.
uint16_t sorted[SORTED_SIZE_CUTOFF];
total_size = BuildHuffmanTable(root_table, root_bits,
diff --git a/media/libwebp/utils/huffman_utils.h b/media/libwebp/utils/huffman_utils.h
index 7f241aab97..4f54691d20 100644
--- a/media/libwebp/utils/huffman_utils.h
+++ b/media/libwebp/utils/huffman_utils.h
@@ -78,6 +78,8 @@ void VP8LHtreeGroupsFree(HTreeGroup* const htree_groups);
// the huffman table.
// Returns built table size or 0 in case of error (invalid tree or
// memory error).
+// If root_table is NULL, this returns 0 if a lookup table cannot be built,
+// and a value > 0 otherwise (but not the table size).
int VP8LBuildHuffmanTable(HuffmanCode* const root_table, int root_bits,
const int code_lengths[], int code_lengths_size);
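A hedged sketch of the NULL-table mode documented above: it lets a decoder validate a set of code lengths without allocating any table memory (the helper name and the root-bits value of 8 are illustrative):

    static int IsCompleteCode(const int lengths[], int size) {
      /* Builds nothing; returns > 0 iff the lengths form a valid code. */
      return VP8LBuildHuffmanTable(NULL, 8, lengths, size) > 0;
    }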
diff --git a/media/libwebp/utils/moz.build b/media/libwebp/utils/moz.build
index 619eaee6df..32431a9f3b 100644
--- a/media/libwebp/utils/moz.build
+++ b/media/libwebp/utils/moz.build
@@ -8,8 +8,10 @@ with Files('**'):
SOURCES += [
'bit_reader_utils.c',
+ 'bit_writer_utils.c',
'color_cache_utils.c',
'filters_utils.c',
+ 'huffman_encode_utils.c',
'huffman_utils.c',
'quant_levels_dec_utils.c',
'quant_levels_utils.c',
diff --git a/media/libwebp/utils/quant_levels_dec_utils.c b/media/libwebp/utils/quant_levels_dec_utils.c
index a60de3444e..f960a8aa83 100644
--- a/media/libwebp/utils/quant_levels_dec_utils.c
+++ b/media/libwebp/utils/quant_levels_dec_utils.c
@@ -30,7 +30,7 @@
#define DFIX 4 // extra precision for ordered dithering
#define DSIZE 4 // dithering size (must be a power of two)
-// cf. http://en.wikipedia.org/wiki/Ordered_dithering
+// cf. https://en.wikipedia.org/wiki/Ordered_dithering
static const uint8_t kOrderedDither[DSIZE][DSIZE] = {
{ 0, 8, 2, 10 }, // coefficients are in DFIX fixed-point precision
{ 12, 4, 14, 6 },
diff --git a/media/libwebp/utils/rescaler_utils.c b/media/libwebp/utils/rescaler_utils.c
index 6e384f5078..3b64e27525 100644
--- a/media/libwebp/utils/rescaler_utils.c
+++ b/media/libwebp/utils/rescaler_utils.c
@@ -12,66 +12,74 @@
// Author: Skal (pascal.massimino@gmail.com)
#include <assert.h>
+#include <limits.h>
#include <stdlib.h>
#include <string.h>
#include "../dsp/dsp.h"
#include "../utils/rescaler_utils.h"
+#include "../utils/utils.h"
//------------------------------------------------------------------------------
-void WebPRescalerInit(WebPRescaler* const wrk, int src_width, int src_height,
- uint8_t* const dst,
- int dst_width, int dst_height, int dst_stride,
- int num_channels, rescaler_t* const work) {
+int WebPRescalerInit(WebPRescaler* const rescaler,
+ int src_width, int src_height,
+ uint8_t* const dst,
+ int dst_width, int dst_height, int dst_stride,
+ int num_channels, rescaler_t* const work) {
const int x_add = src_width, x_sub = dst_width;
const int y_add = src_height, y_sub = dst_height;
- wrk->x_expand = (src_width < dst_width);
- wrk->y_expand = (src_height < dst_height);
- wrk->src_width = src_width;
- wrk->src_height = src_height;
- wrk->dst_width = dst_width;
- wrk->dst_height = dst_height;
- wrk->src_y = 0;
- wrk->dst_y = 0;
- wrk->dst = dst;
- wrk->dst_stride = dst_stride;
- wrk->num_channels = num_channels;
+ const uint64_t total_size = 2ull * dst_width * num_channels * sizeof(*work);
+ if (!CheckSizeOverflow(total_size)) return 0;
+
+ rescaler->x_expand = (src_width < dst_width);
+ rescaler->y_expand = (src_height < dst_height);
+ rescaler->src_width = src_width;
+ rescaler->src_height = src_height;
+ rescaler->dst_width = dst_width;
+ rescaler->dst_height = dst_height;
+ rescaler->src_y = 0;
+ rescaler->dst_y = 0;
+ rescaler->dst = dst;
+ rescaler->dst_stride = dst_stride;
+ rescaler->num_channels = num_channels;
// for 'x_expand', we use bilinear interpolation
- wrk->x_add = wrk->x_expand ? (x_sub - 1) : x_add;
- wrk->x_sub = wrk->x_expand ? (x_add - 1) : x_sub;
- if (!wrk->x_expand) { // fx_scale is not used otherwise
- wrk->fx_scale = WEBP_RESCALER_FRAC(1, wrk->x_sub);
+ rescaler->x_add = rescaler->x_expand ? (x_sub - 1) : x_add;
+ rescaler->x_sub = rescaler->x_expand ? (x_add - 1) : x_sub;
+ if (!rescaler->x_expand) { // fx_scale is not used otherwise
+ rescaler->fx_scale = WEBP_RESCALER_FRAC(1, rescaler->x_sub);
}
// vertical scaling parameters
- wrk->y_add = wrk->y_expand ? y_add - 1 : y_add;
- wrk->y_sub = wrk->y_expand ? y_sub - 1 : y_sub;
- wrk->y_accum = wrk->y_expand ? wrk->y_sub : wrk->y_add;
- if (!wrk->y_expand) {
+ rescaler->y_add = rescaler->y_expand ? y_add - 1 : y_add;
+ rescaler->y_sub = rescaler->y_expand ? y_sub - 1 : y_sub;
+ rescaler->y_accum = rescaler->y_expand ? rescaler->y_sub : rescaler->y_add;
+ if (!rescaler->y_expand) {
// This is WEBP_RESCALER_FRAC(dst_height, x_add * y_add) without the cast.
- // Its value is <= WEBP_RESCALER_ONE, because dst_height <= wrk->y_add, and
- // wrk->x_add >= 1;
- const uint64_t ratio =
- (uint64_t)dst_height * WEBP_RESCALER_ONE / (wrk->x_add * wrk->y_add);
+ // Its value is <= WEBP_RESCALER_ONE, because dst_height <= rescaler->y_add
+ // and rescaler->x_add >= 1;
+ const uint64_t num = (uint64_t)dst_height * WEBP_RESCALER_ONE;
+ const uint64_t den = (uint64_t)rescaler->x_add * rescaler->y_add;
+ const uint64_t ratio = num / den;
if (ratio != (uint32_t)ratio) {
// When ratio == WEBP_RESCALER_ONE, we can't represent the ratio with the
// current fixed-point precision. This happens when src_height ==
- // wrk->y_add (which == src_height), and wrk->x_add == 1.
+ // rescaler->y_add (which == src_height), and rescaler->x_add == 1.
// => We special-case fxy_scale = 0, in WebPRescalerExportRow().
- wrk->fxy_scale = 0;
+ rescaler->fxy_scale = 0;
} else {
- wrk->fxy_scale = (uint32_t)ratio;
+ rescaler->fxy_scale = (uint32_t)ratio;
}
- wrk->fy_scale = WEBP_RESCALER_FRAC(1, wrk->y_sub);
+ rescaler->fy_scale = WEBP_RESCALER_FRAC(1, rescaler->y_sub);
} else {
- wrk->fy_scale = WEBP_RESCALER_FRAC(1, wrk->x_add);
- // wrk->fxy_scale is unused here.
+ rescaler->fy_scale = WEBP_RESCALER_FRAC(1, rescaler->x_add);
+ // rescaler->fxy_scale is unused here.
}
- wrk->irow = work;
- wrk->frow = work + num_channels * dst_width;
- memset(work, 0, 2 * dst_width * num_channels * sizeof(*work));
+ rescaler->irow = work;
+ rescaler->frow = work + num_channels * dst_width;
+ memset(work, 0, (size_t)total_size);
WebPRescalerDspInit();
+ return 1;
}
int WebPRescalerGetScaledDimensions(int src_width, int src_height,
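Since WebPRescalerInit() can now fail (the work-area overflow check), callers are expected to test its return value; a minimal sketch with hypothetical locals:

    rescaler_t* const work = (rescaler_t*)WebPSafeMalloc(
        2ULL * dst_width * num_channels, sizeof(*work));
    if (work == NULL ||
        !WebPRescalerInit(&rescaler, src_w, src_h, dst, dst_width,
                          dst_height, dst_stride, num_channels, work)) {
      WebPSafeFree(work);
      return 0;  /* scratch area would overflow size_t on this platform */
    }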
@@ -82,19 +90,20 @@ int WebPRescalerGetScaledDimensions(int src_width, int src_height,
{
int width = *scaled_width;
int height = *scaled_height;
+ const int max_size = INT_MAX / 2;
// if width is unspecified, scale original proportionally to height ratio.
- if (width == 0) {
+ if (width == 0 && src_height > 0) {
width =
- (int)(((uint64_t)src_width * height + src_height / 2) / src_height);
+ (int)(((uint64_t)src_width * height + src_height - 1) / src_height);
}
// if height is unspecified, scale original proportionally to width ratio.
- if (height == 0) {
+ if (height == 0 && src_width > 0) {
height =
- (int)(((uint64_t)src_height * width + src_width / 2) / src_width);
+ (int)(((uint64_t)src_height * width + src_width - 1) / src_width);
}
// Check if the overall dimensions still make sense.
- if (width <= 0 || height <= 0) {
+ if (width <= 0 || height <= 0 || width > max_size || height > max_size) {
return 0;
}
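The rounding change above moves the derived dimension from round-to-nearest to round-up; a worked case showing why that matters:

    /* Deriving width for src = 1x100 with scaled height = 10:
     *   old: (1*10 + 100/2) / 100 = 60/100  = 0  -> rejected as invalid
     *   new: (1*10 + 100-1) / 100 = 109/100 = 1  -> smallest valid width
     * The ceiling form can never produce 0 for positive inputs. */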
@@ -107,31 +116,34 @@ int WebPRescalerGetScaledDimensions(int src_width, int src_height,
//------------------------------------------------------------------------------
// all-in-one calls
-int WebPRescaleNeededLines(const WebPRescaler* const wrk, int max_num_lines) {
- const int num_lines = (wrk->y_accum + wrk->y_sub - 1) / wrk->y_sub;
+int WebPRescaleNeededLines(const WebPRescaler* const rescaler,
+ int max_num_lines) {
+ const int num_lines =
+ (rescaler->y_accum + rescaler->y_sub - 1) / rescaler->y_sub;
return (num_lines > max_num_lines) ? max_num_lines : num_lines;
}
-int WebPRescalerImport(WebPRescaler* const wrk, int num_lines,
+int WebPRescalerImport(WebPRescaler* const rescaler, int num_lines,
const uint8_t* src, int src_stride) {
int total_imported = 0;
- while (total_imported < num_lines && !WebPRescalerHasPendingOutput(wrk)) {
- if (wrk->y_expand) {
- rescaler_t* const tmp = wrk->irow;
- wrk->irow = wrk->frow;
- wrk->frow = tmp;
+ while (total_imported < num_lines &&
+ !WebPRescalerHasPendingOutput(rescaler)) {
+ if (rescaler->y_expand) {
+ rescaler_t* const tmp = rescaler->irow;
+ rescaler->irow = rescaler->frow;
+ rescaler->frow = tmp;
}
- WebPRescalerImportRow(wrk, src);
- if (!wrk->y_expand) { // Accumulate the contribution of the new row.
+ WebPRescalerImportRow(rescaler, src);
+ if (!rescaler->y_expand) { // Accumulate the contribution of the new row.
int x;
- for (x = 0; x < wrk->num_channels * wrk->dst_width; ++x) {
- wrk->irow[x] += wrk->frow[x];
+ for (x = 0; x < rescaler->num_channels * rescaler->dst_width; ++x) {
+ rescaler->irow[x] += rescaler->frow[x];
}
}
- ++wrk->src_y;
+ ++rescaler->src_y;
src += src_stride;
++total_imported;
- wrk->y_accum -= wrk->y_sub;
+ rescaler->y_accum -= rescaler->y_sub;
}
return total_imported;
}
diff --git a/media/libwebp/utils/rescaler_utils.h b/media/libwebp/utils/rescaler_utils.h
index b5d176ecf2..1c1942ddf5 100644
--- a/media/libwebp/utils/rescaler_utils.h
+++ b/media/libwebp/utils/rescaler_utils.h
@@ -47,12 +47,13 @@ struct WebPRescaler {
};
// Initialize a rescaler given scratch area 'work' and dimensions of src & dst.
-void WebPRescalerInit(WebPRescaler* const rescaler,
- int src_width, int src_height,
- uint8_t* const dst,
- int dst_width, int dst_height, int dst_stride,
- int num_channels,
- rescaler_t* const work);
+// Returns false in case of error.
+int WebPRescalerInit(WebPRescaler* const rescaler,
+ int src_width, int src_height,
+ uint8_t* const dst,
+ int dst_width, int dst_height, int dst_stride,
+ int num_channels,
+ rescaler_t* const work);
// If either 'scaled_width' or 'scaled_height' (but not both) is 0 the value
// will be calculated preserving the aspect ratio, otherwise the values are
diff --git a/media/libwebp/utils/thread_utils.c b/media/libwebp/utils/thread_utils.c
index e87ffbeac4..d61e89d59d 100644
--- a/media/libwebp/utils/thread_utils.c
+++ b/media/libwebp/utils/thread_utils.c
@@ -73,7 +73,7 @@ typedef struct {
#endif
static int pthread_create(pthread_t* const thread, const void* attr,
- unsigned int (__stdcall *start)(void*), void* arg) {
+ unsigned int (__stdcall* start)(void*), void* arg) {
(void)attr;
#ifdef USE_CREATE_THREAD
*thread = CreateThread(NULL, /* lpThreadAttributes */
@@ -217,8 +217,12 @@ static THREADFN ThreadLoop(void* ptr) {
done = 1;
}
// signal to the main thread that we're done (for Sync())
- pthread_cond_signal(&impl->condition_);
+ // Note the associated mutex does not need to be held when signaling the
+ // condition. Unlocking the mutex first may improve performance in some
+ // implementations, avoiding the case where the waiting thread can't
+ // reacquire the mutex when woken.
pthread_mutex_unlock(&impl->mutex_);
+ pthread_cond_signal(&impl->condition_);
}
return THREAD_RETURN(NULL); // Thread is finished
}
@@ -240,7 +244,13 @@ static void ChangeState(WebPWorker* const worker, WebPWorkerStatus new_status) {
// assign new status and release the working thread if needed
if (new_status != OK) {
worker->status_ = new_status;
+ // Note the associated mutex does not need to be held when signaling the
+ // condition. Unlocking the mutex first may improve performance in some
+ // implementations, avoiding the case where the waiting thread can't
+ // reacquire the mutex when woken.
+ pthread_mutex_unlock(&impl->mutex_);
pthread_cond_signal(&impl->condition_);
+ return;
}
}
pthread_mutex_unlock(&impl->mutex_);
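The unlock-then-signal ordering adopted above is valid because POSIX permits signaling a condition variable without holding its mutex, provided the predicate was updated under the lock; a distilled sketch (names hypothetical):

    pthread_mutex_lock(&m);
    done = 1;                  /* the predicate changes under the lock */
    pthread_mutex_unlock(&m);
    pthread_cond_signal(&cv);  /* the waiter can retake 'm' immediately */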
diff --git a/media/libwebp/utils/utils.c b/media/libwebp/utils/utils.c
index 9bda6a7169..97713d2832 100644
--- a/media/libwebp/utils/utils.c
+++ b/media/libwebp/utils/utils.c
@@ -23,7 +23,7 @@
// alloc/free etc) is printed. For debugging/tuning purpose only (it's slow,
// and not multi-thread safe!).
// An interesting alternative is valgrind's 'massif' tool:
-// http://valgrind.org/docs/manual/ms-manual.html
+// https://valgrind.org/docs/manual/ms-manual.html
// Here is an example command line:
/* valgrind --tool=massif --massif-out-file=massif.out \
--stacks=yes --alloc-fn=WebPSafeMalloc --alloc-fn=WebPSafeCalloc
@@ -101,6 +101,9 @@ static void Increment(int* const v) {
#if defined(MALLOC_LIMIT)
{
const char* const malloc_limit_str = getenv("MALLOC_LIMIT");
+#if MALLOC_LIMIT > 1
+ mem_limit = (size_t)MALLOC_LIMIT;
+#endif
if (malloc_limit_str != NULL) {
mem_limit = atoi(malloc_limit_str);
}
@@ -169,16 +172,16 @@ static int CheckSizeArgumentsOverflow(uint64_t nmemb, size_t size) {
const uint64_t total_size = nmemb * size;
if (nmemb == 0) return 1;
if ((uint64_t)size > WEBP_MAX_ALLOCABLE_MEMORY / nmemb) return 0;
- if (total_size != (size_t)total_size) return 0;
+ if (!CheckSizeOverflow(total_size)) return 0;
#if defined(PRINT_MEM_INFO) && defined(MALLOC_FAIL_AT)
if (countdown_to_fail > 0 && --countdown_to_fail == 0) {
return 0; // fake fail!
}
#endif
-#if defined(MALLOC_LIMIT)
+#if defined(PRINT_MEM_INFO) && defined(MALLOC_LIMIT)
if (mem_limit > 0) {
const uint64_t new_total_mem = (uint64_t)total_mem + total_size;
- if (new_total_mem != (size_t)new_total_mem ||
+ if (!CheckSizeOverflow(new_total_mem) ||
new_total_mem > mem_limit) {
return 0; // fake fail!
}
@@ -216,9 +219,14 @@ void WebPSafeFree(void* const ptr) {
free(ptr);
}
-// Public API function.
+// Public API functions.
+
+void* WebPMalloc(size_t size) {
+ return WebPSafeMalloc(1, size);
+}
+
void WebPFree(void* ptr) {
- free(ptr);
+ WebPSafeFree(ptr);
}
//------------------------------------------------------------------------------
@@ -226,7 +234,7 @@ void WebPFree(void* ptr) {
void WebPCopyPlane(const uint8_t* src, int src_stride,
uint8_t* dst, int dst_stride, int width, int height) {
assert(src != NULL && dst != NULL);
- assert(src_stride >= width && dst_stride >= width);
+ assert(abs(src_stride) >= width && abs(dst_stride) >= width);
while (height-- > 0) {
memcpy(dst, src, width);
src += src_stride;
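With WebPFree() now routed through WebPSafeFree(), allocations and releases must come from the same family or the PRINT_MEM_INFO accounting breaks; a minimal pairing sketch ('payload_size' is hypothetical):

    uint8_t* const buf = (uint8_t*)WebPMalloc(payload_size);
    if (buf == NULL) return 0;
    /* ... fill and use buf ... */
    WebPFree(buf);  /* balances the counted allocation */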
diff --git a/media/libwebp/utils/utils.h b/media/libwebp/utils/utils.h
index d22151b0fc..20abf03c69 100644
--- a/media/libwebp/utils/utils.h
+++ b/media/libwebp/utils/utils.h
@@ -42,6 +42,10 @@ extern "C" {
#endif
#endif // WEBP_MAX_ALLOCABLE_MEMORY
+static WEBP_INLINE int CheckSizeOverflow(uint64_t size) {
+ return size == (size_t)size;
+}
+
// size-checking safe malloc/calloc: verify that the requested size is not too
// large, or return NULL. You don't need to call these for constructs like
// malloc(sizeof(foo)), but only if there's picture-dependent size involved
@@ -92,14 +96,14 @@ static WEBP_INLINE uint32_t GetLE32(const uint8_t* const data) {
// Store 16, 24 or 32 bits in little-endian order.
static WEBP_INLINE void PutLE16(uint8_t* const data, int val) {
assert(val < (1 << 16));
- data[0] = (val >> 0);
- data[1] = (val >> 8);
+ data[0] = (val >> 0) & 0xff;
+ data[1] = (val >> 8) & 0xff;
}
static WEBP_INLINE void PutLE24(uint8_t* const data, int val) {
assert(val < (1 << 24));
PutLE16(data, val & 0xffff);
- data[2] = (val >> 16);
+ data[2] = (val >> 16) & 0xff;
}
static WEBP_INLINE void PutLE32(uint8_t* const data, uint32_t val) {
@@ -107,24 +111,33 @@ static WEBP_INLINE void PutLE32(uint8_t* const data, uint32_t val) {
PutLE16(data + 2, (int)(val >> 16));
}
-// Returns (int)floor(log2(n)). n must be > 0.
// use GNU builtins where available.
#if defined(__GNUC__) && \
((__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || __GNUC__ >= 4)
+// Returns (int)floor(log2(n)). n must be > 0.
static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
return 31 ^ __builtin_clz(n);
}
+// Counts the number of trailing zeros.
+static WEBP_INLINE int BitsCtz(uint32_t n) { return __builtin_ctz(n); }
#elif defined(_MSC_VER) && _MSC_VER > 1310 && \
(defined(_M_X64) || defined(_M_IX86))
#include <intrin.h>
#pragma intrinsic(_BitScanReverse)
+#pragma intrinsic(_BitScanForward)
static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
- unsigned long first_set_bit;
+ unsigned long first_set_bit; // NOLINT (runtime/int)
_BitScanReverse(&first_set_bit, n);
return first_set_bit;
}
-#else // default: use the C-version.
+static WEBP_INLINE int BitsCtz(uint32_t n) {
+ unsigned long first_set_bit; // NOLINT (runtime/int)
+ _BitScanForward(&first_set_bit, n);
+ return first_set_bit;
+}
+#else // default: use the (slow) C-version.
+#define WEBP_HAVE_SLOW_CLZ_CTZ // signal that the Clz/Ctz functions are slow
// Returns 31 ^ clz(n) = log2(n). This is the default C-implementation, either
// based on table or not. Can be used as fallback if clz() is not available.
#define WEBP_NEED_LOG_TABLE_8BIT
@@ -139,6 +152,15 @@ static WEBP_INLINE int WebPLog2FloorC(uint32_t n) {
}
static WEBP_INLINE int BitsLog2Floor(uint32_t n) { return WebPLog2FloorC(n); }
+
+static WEBP_INLINE int BitsCtz(uint32_t n) {
+ int i;
+ for (i = 0; i < 32; ++i, n >>= 1) {
+ if (n & 1) return i;
+ }
+ return 32;
+}
+
#endif
//------------------------------------------------------------------------------
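A quick sanity check of the two helpers, straight from their definitions:

    /* n = 0x28 (binary 101000):
     *   BitsLog2Floor(0x28) = 5   (position of the highest set bit)
     *   BitsCtz(0x28)       = 3   (position of the lowest set bit)
     * The builtin/intrinsic variants are undefined for n == 0; the slow
     * C fallback returns BitsCtz(0) == 32. */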
diff --git a/media/libwebp/webp/config.h b/media/libwebp/webp/config.h
index dd31c3cfaa..3496bc2c49 100644
--- a/media/libwebp/webp/config.h
+++ b/media/libwebp/webp/config.h
@@ -13,6 +13,9 @@
/* Set to 1 if __builtin_bswap64 is available */
#define HAVE_BUILTIN_BSWAP64 1
+/* Define to 1 if you have the <cpu-features.h> header file. */
+/* #undef HAVE_CPU_FEATURES_H */
+
/* Define to 1 if you have the <dlfcn.h> header file. */
#define HAVE_DLFCN_H 1
@@ -20,14 +23,11 @@
/* #undef HAVE_GLUT_GLUT_H */
/* Define to 1 if you have the <GL/glut.h> header file. */
-/* #undef HAVE_GL_GLUT_H */
+#define HAVE_GL_GLUT_H 1
/* Define to 1 if you have the <inttypes.h> header file. */
#define HAVE_INTTYPES_H 1
-/* Define to 1 if you have the <memory.h> header file. */
-#define HAVE_MEMORY_H 1
-
/* Define to 1 if you have the <OpenGL/glut.h> header file. */
/* #undef HAVE_OPENGL_GLUT_H */
@@ -40,6 +40,9 @@
/* Define to 1 if you have the <stdint.h> header file. */
#define HAVE_STDINT_H 1
+/* Define to 1 if you have the <stdio.h> header file. */
+#define HAVE_STDIO_H 1
+
/* Define to 1 if you have the <stdlib.h> header file. */
#define HAVE_STDLIB_H 1
@@ -77,38 +80,34 @@
#define PACKAGE_NAME "libwebp"
/* Define to the full name and version of this package. */
-#define PACKAGE_STRING "libwebp 0.5.1"
+#define PACKAGE_STRING "libwebp 1.2.2"
/* Define to the one symbol short name of this package. */
#define PACKAGE_TARNAME "libwebp"
/* Define to the home page for this package. */
-#define PACKAGE_URL "http://developers.google.com/speed/webp"
+#define PACKAGE_URL "https://developers.google.com/speed/webp"
/* Define to the version of this package. */
-#define PACKAGE_VERSION "0.5.1"
+#define PACKAGE_VERSION "1.2.2"
/* Define to necessary symbol if this constant uses a non-standard name on
your system. */
/* #undef PTHREAD_CREATE_JOINABLE */
-/* Define to 1 if you have the ANSI C header files. */
+/* Define to 1 if all of the C90 standard headers exist (not just the ones
+ required in a freestanding environment). This macro is provided for
+ backward compatibility; new code need not use it. */
#define STDC_HEADERS 1
/* Version number of package */
-#define VERSION "0.5.1"
-
-/* Enable experimental code */
-/* #undef WEBP_EXPERIMENTAL_FEATURES */
-
-/* Set to 1 if AVX2 is supported */
-#define WEBP_HAVE_AVX2 1
+#define VERSION "1.2.2"
/* Set to 1 if GIF library is installed */
#define WEBP_HAVE_GIF 1
/* Set to 1 if OpenGL is supported */
-/* #undef WEBP_HAVE_GL */
+#define WEBP_HAVE_GL 1
/* Set to 1 if JPEG library is installed */
#define WEBP_HAVE_JPEG 1
@@ -122,6 +121,9 @@
/* Set to 1 if PNG library is installed */
#define WEBP_HAVE_PNG 1
+/* Set to 1 if SDL library is installed */
+#define WEBP_HAVE_SDL 1
+
/* Set to 1 if SSE2 is supported */
#define WEBP_HAVE_SSE2 1
@@ -131,6 +133,9 @@
/* Set to 1 if TIFF library is installed */
#define WEBP_HAVE_TIFF 1
+/* Enable near lossless encoding */
+#define WEBP_NEAR_LOSSLESS 1
+
/* Undefine this to disable thread support. */
#define WEBP_USE_THREAD 1
diff --git a/media/libwebp/webp/decode.h b/media/libwebp/webp/decode.h
index ae8bfe840e..d98247509a 100644
--- a/media/libwebp/webp/decode.h
+++ b/media/libwebp/webp/decode.h
@@ -20,7 +20,7 @@
extern "C" {
#endif
-#define WEBP_DECODER_ABI_VERSION 0x0208 // MAJOR(8b) + MINOR(8b)
+#define WEBP_DECODER_ABI_VERSION 0x0209 // MAJOR(8b) + MINOR(8b)
// Note: forward declaring enumerations is not allowed in (strict) C and C++,
// the types are left here for reference.
@@ -85,15 +85,12 @@ WEBP_EXTERN uint8_t* WebPDecodeBGR(const uint8_t* data, size_t data_size,
// Upon return, the Y buffer has a stride returned as '*stride', while U and V
// have a common stride returned as '*uv_stride'.
// Return NULL in case of error.
-// (*) Also named Y'CbCr. See: http://en.wikipedia.org/wiki/YCbCr
+// (*) Also named Y'CbCr. See: https://en.wikipedia.org/wiki/YCbCr
WEBP_EXTERN uint8_t* WebPDecodeYUV(const uint8_t* data, size_t data_size,
int* width, int* height,
uint8_t** u, uint8_t** v,
int* stride, int* uv_stride);
-// Releases memory returned by the WebPDecode*() functions above.
-WEBP_EXTERN void WebPFree(void* ptr);
-
// These five functions are variants of the above ones, that decode the image
// directly into a pre-allocated buffer 'output_buffer'. The maximum storage
// available in this buffer is indicated by 'output_buffer_size'. If this
@@ -456,7 +453,7 @@ struct WebPDecoderOptions {
int scaled_width, scaled_height; // final resolution
int use_threads; // if true, use multi-threaded decoding
int dithering_strength; // dithering strength (0=Off, 100=full)
- int flip; // flip output vertically
+ int flip; // if true, flip output vertically
int alpha_dithering_strength; // alpha dithering strength in [0..100]
uint32_t pad[5]; // padding for later use
diff --git a/media/libwebp/webp/encode.h b/media/libwebp/webp/encode.h
index 549cf07730..b4c599df87 100644
--- a/media/libwebp/webp/encode.h
+++ b/media/libwebp/webp/encode.h
@@ -20,7 +20,7 @@
extern "C" {
#endif
-#define WEBP_ENCODER_ABI_VERSION 0x020e // MAJOR(8b) + MINOR(8b)
+#define WEBP_ENCODER_ABI_VERSION 0x020f // MAJOR(8b) + MINOR(8b)
// Note: forward declaring enumerations is not allowed in (strict) C and C++,
// the types are left here for reference.
@@ -62,6 +62,10 @@ WEBP_EXTERN size_t WebPEncodeBGRA(const uint8_t* bgra,
// These functions are the equivalent of the above, but compressing in a
// lossless manner. Files are usually larger than lossy format, but will
// not suffer any compression loss.
+// Note these functions, like the lossy versions, use the library's default
+// settings. For lossless this means 'exact' is disabled. RGB values in
+// transparent areas will be modified to improve compression. To avoid this,
+// use WebPEncode() and set WebPConfig::exact to 1.
WEBP_EXTERN size_t WebPEncodeLosslessRGB(const uint8_t* rgb,
int width, int height, int stride,
uint8_t** output);
@@ -75,9 +79,6 @@ WEBP_EXTERN size_t WebPEncodeLosslessBGRA(const uint8_t* bgra,
int width, int height, int stride,
uint8_t** output);
-// Releases memory returned by the WebPEncode*() functions above.
-WEBP_EXTERN void WebPFree(void* ptr);
-
//------------------------------------------------------------------------------
// Coding parameters
@@ -147,7 +148,8 @@ struct WebPConfig {
int use_delta_palette; // reserved for future lossless feature
int use_sharp_yuv; // if needed, use sharp (and slow) RGB->YUV conversion
- uint32_t pad[2]; // padding for later use
+ int qmin; // minimum permissible quality factor
+ int qmax; // maximum permissible quality factor
};
// Enumerate some predefined settings for WebPConfig, depending on the type
@@ -290,6 +292,11 @@ typedef enum WebPEncodingError {
#define WEBP_MAX_DIMENSION 16383
// Main exchange structure (input samples, output bytes, statistics)
+//
+// Once WebPPictureInit() has been called, it's ok to make all the INPUT fields
+// (use_argb, y/u/v, argb, ...) point to user-owned data, even if
+// WebPPictureAlloc() has been called. Depending on the value of use_argb,
+// it's guaranteed that either *argb or *y/*u/*v content will be kept untouched.
struct WebPPicture {
// INPUT
//////////////
@@ -302,7 +309,7 @@ struct WebPPicture {
// YUV input (mostly used for input to lossy compression)
WebPEncCSP colorspace; // colorspace: should be YUV420 for now (=Y'CbCr).
int width, height; // dimensions (less or equal to WEBP_MAX_DIMENSION)
- uint8_t *y, *u, *v; // pointers to luma/chroma planes.
+ uint8_t* y, *u, *v; // pointers to luma/chroma planes.
int y_stride, uv_stride; // luma/chroma strides.
uint8_t* a; // pointer to the alpha plane
int a_stride; // stride of the alpha plane
@@ -346,7 +353,7 @@ struct WebPPicture {
uint32_t pad3[3]; // padding for later use
// Unused for now
- uint8_t *pad4, *pad5;
+ uint8_t* pad4, *pad5;
uint32_t pad6[8]; // padding for later use
// PRIVATE FIELDS
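The new qmin/qmax fields bound the quality search that runs when a target size (or PSNR) is requested; a hedged sketch using the public config API:

    WebPConfig config;
    if (!WebPConfigInit(&config)) return 0;  /* fills in defaults */
    config.target_size = 30000;  /* bytes: enables the quality search */
    config.qmin = 40;            /* never search below quality 40 */
    config.qmax = 90;            /* never search above quality 90 */
    if (!WebPValidateConfig(&config)) return 0;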
diff --git a/media/libwebp/webp/mux.h b/media/libwebp/webp/mux.h
index 66096a92e0..7d27489a40 100644
--- a/media/libwebp/webp/mux.h
+++ b/media/libwebp/webp/mux.h
@@ -57,7 +57,7 @@ extern "C" {
WebPMuxGetChunk(mux, "ICCP", &icc_profile);
// ... (Consume icc_data).
WebPMuxDelete(mux);
- free(data);
+ WebPFree(data);
*/
// Note: forward declaring enumerations is not allowed in (strict) C and C++,
@@ -245,7 +245,7 @@ WEBP_EXTERN WebPMuxError WebPMuxPushFrame(
WebPMux* mux, const WebPMuxFrameInfo* frame, int copy_data);
// Gets the nth frame from the mux object.
-// The content of 'frame->bitstream' is allocated using malloc(), and NOT
+// The content of 'frame->bitstream' is allocated using WebPMalloc(), and NOT
// owned by the 'mux' object. It MUST be deallocated by the caller by calling
// WebPDataClear().
// nth=0 has a special meaning - last position.
@@ -376,10 +376,10 @@ WEBP_EXTERN WebPMuxError WebPMuxNumChunks(const WebPMux* mux,
// Assembles all chunks in WebP RIFF format and returns in 'assembled_data'.
// This function also validates the mux object.
// Note: The content of 'assembled_data' will be ignored and overwritten.
-// Also, the content of 'assembled_data' is allocated using malloc(), and NOT
-// owned by the 'mux' object. It MUST be deallocated by the caller by calling
-// WebPDataClear(). It's always safe to call WebPDataClear() upon return,
-// even in case of error.
+// Also, the content of 'assembled_data' is allocated using WebPMalloc(), and
+// NOT owned by the 'mux' object. It MUST be deallocated by the caller by
+// calling WebPDataClear(). It's always safe to call WebPDataClear() upon
+// return, even in case of error.
// Parameters:
// mux - (in/out) object whose chunks are to be assembled
// assembled_data - (out) assembled WebP data
diff --git a/media/libwebp/webp/mux_types.h b/media/libwebp/webp/mux_types.h
index ceea77dfc6..2fe8195839 100644
--- a/media/libwebp/webp/mux_types.h
+++ b/media/libwebp/webp/mux_types.h
@@ -14,7 +14,6 @@
#ifndef WEBP_WEBP_MUX_TYPES_H_
#define WEBP_WEBP_MUX_TYPES_H_
-#include <stdlib.h> // free()
#include <string.h> // memset()
#include "./types.h"
@@ -56,6 +55,7 @@ typedef enum WebPMuxAnimBlend {
// Data type used to describe 'raw' data, e.g., chunk data
// (ICC profile, metadata) and WebP compressed image data.
+// 'bytes' memory must be allocated using WebPMalloc() and such.
struct WebPData {
const uint8_t* bytes;
size_t size;
@@ -68,11 +68,11 @@ static WEBP_INLINE void WebPDataInit(WebPData* webp_data) {
}
}
-// Clears the contents of the 'webp_data' object by calling free(). Does not
-// deallocate the object itself.
+// Clears the contents of the 'webp_data' object by calling WebPFree().
+// Does not deallocate the object itself.
static WEBP_INLINE void WebPDataClear(WebPData* webp_data) {
if (webp_data != NULL) {
- free((void*)webp_data->bytes);
+ WebPFree((void*)webp_data->bytes);
WebPDataInit(webp_data);
}
}
@@ -83,7 +83,7 @@ static WEBP_INLINE int WebPDataCopy(const WebPData* src, WebPData* dst) {
if (src == NULL || dst == NULL) return 0;
WebPDataInit(dst);
if (src->bytes != NULL && src->size != 0) {
- dst->bytes = (uint8_t*)malloc(src->size);
+ dst->bytes = (uint8_t*)WebPMalloc(src->size);
if (dst->bytes == NULL) return 0;
memcpy((void*)dst->bytes, src->bytes, src->size);
dst->size = src->size;
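Because WebPDataClear() now releases through WebPFree(), any bytes handed to a WebPData must come from WebPMalloc(); a minimal lifecycle sketch ('n' is hypothetical):

    WebPData data;
    WebPDataInit(&data);
    data.bytes = (uint8_t*)WebPMalloc(n);  /* not plain malloc() */
    data.size = n;
    /* ... */
    WebPDataClear(&data);  /* WebPFree() + re-init */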
diff --git a/media/libwebp/webp/types.h b/media/libwebp/webp/types.h
index 0ce2622e41..47f7f2b007 100644
--- a/media/libwebp/webp/types.h
+++ b/media/libwebp/webp/types.h
@@ -7,7 +7,7 @@
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
-// Common types
+// Common types + memory wrappers
//
// Author: Skal (pascal.massimino@gmail.com)
@@ -49,4 +49,20 @@ typedef long long int int64_t;
// Macro to check ABI compatibility (same major revision number)
#define WEBP_ABI_IS_INCOMPATIBLE(a, b) (((a) >> 8) != ((b) >> 8))
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Allocates 'size' bytes of memory. Returns NULL upon error. Memory
+// must be deallocated by calling WebPFree(). This function is made available
+// by the core 'libwebp' library.
+WEBP_EXTERN void* WebPMalloc(size_t size);
+
+// Releases memory returned by the WebPDecode*() functions (from decode.h).
+WEBP_EXTERN void WebPFree(void* ptr);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
#endif // WEBP_WEBP_TYPES_H_
diff --git a/media/update-libjpeg.sh b/media/update-libjpeg.sh
index 68a8e988ac..6a1b377bec 100755..100644
--- a/media/update-libjpeg.sh
+++ b/media/update-libjpeg.sh
@@ -17,14 +17,12 @@ tag=${2-HEAD}
(cd $repo; git archive --prefix=media/libjpeg/ $tag) | (cd $srcdir/..; tar xf -)
cd $srcdir/libjpeg
-cp win/jsimdcfg.inc simd/
-revert_files="1050342.diff jconfig.h jconfigint.h moz.build MOZCHANGES mozilla.diff simd/jsimdcfg.inc"
+revert_files="jconfig.h jconfigint.h moz.build MOZCHANGES mozilla.diff"
if test -d ${topsrcdir}/.hg; then
hg revert --no-backup $revert_files
-elif test -d ${topsrcdir}/.git; then
+elif test -e ${topsrcdir}/.git; then
git checkout HEAD -- $revert_files
fi
patch -p0 -i mozilla.diff
-patch -p0 -i 1050342.diff
diff --git a/netwerk/cache2/OldWrappers.cpp b/netwerk/cache2/OldWrappers.cpp
index 76a4fa6c19..3ff62244b5 100644
--- a/netwerk/cache2/OldWrappers.cpp
+++ b/netwerk/cache2/OldWrappers.cpp
@@ -692,8 +692,6 @@ nsresult _OldCacheLoad::Start()
{
LOG(("_OldCacheLoad::Start [this=%p, key=%s]", this, mCacheKey.get()));
- mLoadStart = mozilla::TimeStamp::Now();
-
nsresult rv;
// Consumers that can invoke this code as first and off the main thread
diff --git a/netwerk/cache2/OldWrappers.h b/netwerk/cache2/OldWrappers.h
index f85b0741ac..a825f3762b 100644
--- a/netwerk/cache2/OldWrappers.h
+++ b/netwerk/cache2/OldWrappers.h
@@ -204,8 +204,6 @@ private:
nsresult mStatus;
uint32_t mRunCount;
nsCOMPtr<nsIApplicationCache> mAppCache;
-
- mozilla::TimeStamp mLoadStart;
};
diff --git a/netwerk/dns/nsIDNService.cpp b/netwerk/dns/nsIDNService.cpp
index 70e255ed15..1f35fe1dab 100644
--- a/netwerk/dns/nsIDNService.cpp
+++ b/netwerk/dns/nsIDNService.cpp
@@ -727,14 +727,11 @@ bool nsIDNService::isLabelSafe(const nsAString &label)
ch = SURROGATE_TO_UCS4(ch, *current++);
}
- // Check for restricted characters; aspirational scripts are NOT permitted,
- // in anticipation of the category being merged into Limited-Use scripts
- // in the upcoming (Unicode 10.0-based) revision of UAX #31.
IdentifierType idType = GetIdentifierType(ch);
if (idType == IDTYPE_RESTRICTED) {
return false;
}
- MOZ_ASSERT(idType == IDTYPE_ALLOWED || idType == IDTYPE_ASPIRATIONAL);
+ MOZ_ASSERT(idType == IDTYPE_ALLOWED);
// Check for mixed script
Script script = GetScriptCode(ch);