Merge pull request 'Update libwebp to 1.2.2' (#1940) from jobbautista9/UXP:libwebp into master

Reviewed-on: https://repo.palemoon.org/MoonchildProductions/UXP/pulls/1940
author: Moonchild <moonchild@palemoon.org> 2022-06-26 11:47:44 +0000
committer: Moonchild <moonchild@palemoon.org> 2022-06-26 11:47:44 +0000
commit: a08f15245412d0b95eba82c7e9be7b7c64286967 (patch)
tree: fa594344bc140371bc050a7d84f2d1f66c724bf3
parent: 20c80bcbc174c975fe16b1d562879489924b218c (diff)
parent: 76f825276ddc527f86d0a17e803d820ef67fd355 (diff)
download: uxp-a08f15245412d0b95eba82c7e9be7b7c64286967.tar.gz
138 files changed, 34309 insertions, 893 deletions
diff --git a/media/libwebp/AUTHORS b/media/libwebp/AUTHORS
index 0d70b7fb2a..8307c2099d 100644
--- a/media/libwebp/AUTHORS
+++ b/media/libwebp/AUTHORS
@@ -1,9 +1,15 @@
 Contributors:
+- Aidan O'Loan (aidanol at gmail dot com)
 - Alan Browning (browning at google dot com)
 - Charles Munger (clm at google dot com)
+- Cheng Yi (cyi at google dot com)
 - Christian Duvivier (cduvivier at google dot com)
+- Christopher Degawa (ccom at randomderp dot com)
+- Clement Courbet (courbet at google dot com)
 - Djordje Pesut (djordje dot pesut at imgtec dot com)
 - Hui Su (huisu at google dot com)
+- Ilya Kurdyukov (jpegqs at gmail dot com)
+- Ingvar Stepanyan (rreverser at google dot com)
 - James Zern (jzern at google dot com)
 - Jan Engelhardt (jengelh at medozas dot de)
 - Jehan (jehan at girinstud dot io)
@@ -20,11 +26,13 @@ Contributors:
 - Mislav Bradac (mislavm at google dot com)
 - Nico Weber (thakis at chromium dot org)
 - Noel Chromium (noel at chromium dot org)
+- Oliver Wolff (oliver dot wolff at qt dot io)
 - Owen Rodley (orodley at google dot com)
 - Parag Salasakar (img dot mips1 at gmail dot com)
 - Pascal Massimino (pascal dot massimino at gmail dot com)
 - Paweł Hajdan, Jr (phajdan dot jr at chromium dot org)
 - Pierre Joye (pierre dot php at gmail dot com)
+- Roberto Alanis (alanisbaez at google dot com)
 - Sam Clegg (sbc at chromium dot org)
 - Scott Hancher (seh at google dot com)
 - Scott LaVarnway (slavarnway at google dot com)
@@ -38,5 +46,7 @@ Contributors:
 - Vikas Arora (vikasa at google dot com)
 - Vincent Rabaud (vrabaud at google dot com)
 - Vlad Tsyrklevich (vtsyrklevich at chromium dot org)
+- Wan-Teh Chang (wtc at google dot com)
 - Yang Zhang (yang dot zhang at arm dot com)
 - Yannis Guyon (yguyon at google dot com)
+- Zhi An Ng (zhin at chromium dot org)
diff --git a/media/libwebp/NEWS b/media/libwebp/NEWS
index aa393c819f..5b36c5cf30 100644
--- a/media/libwebp/NEWS
+++ b/media/libwebp/NEWS
@@ -1,3 +1,58 @@
+- 1/11/2022: version 1.2.2
+  This is a binary compatible release.
+  * webpmux: add "-set bgcolor A,R,G,B"
+  * add ARM64 NEON support for MSVC builds (#539)
+  * fix duplicate include error in Xcode when using multiple XCFrameworks in a
+    project (#542)
+  * doc updates and bug fixes (#538, #544, #548, #550)
+
+- 7/20/2021: version 1.2.1
+  This is a binary compatible release.
+  * minor lossless encoder improvements and x86 color conversion speed up
+  * add ARM64 simulator support to xcframeworkbuild.sh (#510)
+  * further security related hardening in libwebp & examples
+    (issues: #497, #508, #518)
+    (chromium: #1196480, #1196773, #1196775, #1196777, #1196778, #1196850)
+    (oss-fuzz: #28658, #28978)
+  * toolchain updates and bug fixes (#498, #501, #502, #504, #505, #506, #509,
+                                     #533)
+  * use more inclusive language within the source (#507)
+
+- 12/23/2020: version 1.2.0
+  * API changes:
+    - libwebp:
+      encode.h: add a qmin / qmax range for quality factor (cwebp adds -qrange)
+  * lossless encoder improvements
+  * SIMD support for Wasm builds
+  * add xcframeworkbuild.sh, supports Mac Catalyst builds
+  * import fuzzers from oss-fuzz & chromium (#409)
+  * webpmux: add an '-set loop <value>' option (#494)
+  * toolchain updates and bug fixes (#449, #463, #470, #475, #477, #478, #479,
+    #488, #491)
+
+- 12/18/2019: version 1.1.0
+  * API changes:
+    - libwebp:
+      WebPMalloc (issue #442)
+    - extras:
+      WebPUnmultiplyARGB
+  * alpha decode fix (issue #439)
+  * toolchain updates and bug fixes
+    (chromium: #1026858, #1027136, #1027409, #1028620, #1028716, #995200)
+    (oss-fuzz: #19430, #19447)
+
+- 7/4/2019: version 1.0.3
+  This is a binary compatible release.
+  * resize fixes for Nx1 sizes and the addition of non-opaque alpha values for
+    odd sizes (issues #418, #434)
+  * lossless encode/decode performance improvements
+  * lossy compression performance improvement at low quality levels with flat
+    content (issue #432)
+  * python swig files updated to support python 3
+  Tool updates:
+    vwebp will now preserve the aspect ratio of images that exceed monitor
+    resolution by scaling the image to fit (issue #433)
+
 - 1/14/2019: version 1.0.2
   This is a binary compatible release.
   * (Windows) unicode file support in the tools (linux and mac already had
diff --git a/media/libwebp/README b/media/libwebp/README
index 502a4c1c20..f6eaf2c049 100644
--- a/media/libwebp/README
+++ b/media/libwebp/README
@@ -4,7 +4,7 @@
           \__\__/\____/\_____/__/ ____  ___
                 / _/ /    \    \ /  _ \/ _/
                /  \_/   / /   \ \   __/  \__
-               \____/____/\_____/_____/____/v1.0.2
+               \____/____/\_____/_____/____/v1.2.2
 
 Description:
 ============
@@ -13,13 +13,13 @@ WebP codec: library to encode and decode images in WebP format. This package
 contains the library that can be used in other programs to add WebP support,
 as well as the command line tools 'cwebp' and 'dwebp'.
 
-See http://developers.google.com/speed/webp
+See https://developers.google.com/speed/webp
 
 The latest source tree is available at
 https://chromium.googlesource.com/webm/libwebp
 
 It is released under the same license as the WebM project.
-See http://www.webmproject.org/license/software/ or the
+See https://www.webmproject.org/license/software/ or the
 "COPYING" file for details. An additional intellectual
 property rights grant can be found in the file PATENTS.
 
@@ -113,7 +113,7 @@ make install
 
 CMake:
 ------
-With CMake, you can compile libwebp, cwebp, dwebp, gif2web, img2webp, webpinfo
+With CMake, you can compile libwebp, cwebp, dwebp, gif2webp, img2webp, webpinfo
 and the JS bindings.
 
 Prerequisites:
@@ -225,6 +225,7 @@ Usage:
 
 If input size (-s) for an image is not specified, it is
 assumed to be a PNG, JPEG, TIFF or WebP file.
+Note: Animated PNG and WebP files are not supported.
 
 Options:
   -h / -help ............. short help
@@ -254,6 +255,8 @@ Options:
   -partition_limit <int> . limit quality to fit the 512k limit on
                            the first partition (0=no degradation ... 100=full)
   -pass <int> ............ analysis pass number (1..10)
+  -qrange <min> <max> .... specifies the permissible quality range
+                           (default: 0 100)
   -crop <x> <y> <w> <h> .. crop picture with the given rectangle
   -resize <w> <h> ........ resize picture (after any cropping)
   -mt .................... use multi-threading if available
@@ -294,6 +297,7 @@ Experimental Options:
   -af .................... auto-adjust filter strength
   -pre <int> ............. pre-processing filter
 
+
 The main options you might want to try in order to further tune the
 visual quality are:
  -preset
@@ -341,7 +345,9 @@ The full list of options is available using -h:
 > dwebp -h
 Usage: dwebp in_file [options] [-o out_file]
 
-Decodes the WebP image file to PNG format [Default]
+Decodes the WebP image file to PNG format [Default].
+Note: Animated WebP files are not supported.
+
 Use following options to convert into alternate image formats:
   -pam ......... save the raw RGBA samples as a color PAM
   -ppm ......... save the raw RGB samples as a color PPM
@@ -423,15 +429,15 @@ Prerequisites:
 1) OpenGL & OpenGL Utility Toolkit (GLUT)
   Linux:
     $ sudo apt-get install freeglut3-dev mesa-common-dev
-  Mac + XCode:
+  Mac + Xcode:
     - These libraries should be available in the OpenGL / GLUT frameworks.
   Windows:
     http://freeglut.sourceforge.net/index.php#download
 
 2) (Optional) qcms (Quick Color Management System)
   i. Download qcms from Mozilla / Chromium:
-    http://hg.mozilla.org/mozilla-central/file/0e7639e3bdfb/gfx/qcms
-    http://src.chromium.org/viewvc/chrome/trunk/src/third_party/qcms
+    https://hg.mozilla.org/mozilla-central/file/0e7639e3bdfb/gfx/qcms
+    https://source.chromium.org/chromium/chromium/src/+/main:third_party/qcms/;drc=d4a2f8e1ed461d8fc05ed88d1ae2dc94c9773825
   ii. Build and archive the source files as libqcms.a / qcms.lib
   iii. Update makefile.unix / Makefile.vc
     a) Define WEBP_HAVE_QCMS
@@ -450,7 +456,7 @@ modes, etc.
 
 Usage:
 
-  img2webp [file-level options] [image files...] [per-frame options...]
+  img2webp [file_options] [[frame_options] frame_file]...
 
 File-level options (only used at the start of compression):
  -min_size ............ minimize size
@@ -597,7 +603,7 @@ The encoding flow looks like:
   // Setup a config, starting form a preset and tuning some additional
   // parameters
   WebPConfig config;
-  if (!WebPConfigPreset(&config, WEBP_PRESET_PHOTO, quality_factor))
+  if (!WebPConfigPreset(&config, WEBP_PRESET_PHOTO, quality_factor)) {
     return 0;   // version error
   }
   // ... additional tuning
@@ -613,7 +619,7 @@ The encoding flow looks like:
   pic.width = width;
   pic.height = height;
   // allocated picture of dimension width x height
-  if (!WebPPictureAllocate(&pic)) {
+  if (!WebPPictureAlloc(&pic)) {
     return 0;   // memory error
   }
   // at this point, 'pic' has been initialized as a container,
@@ -780,10 +786,10 @@ Bugs:
 Please report all bugs to the issue tracker:
     https://bugs.chromium.org/p/webp
 Patches welcome! See this page to get started:
-    http://www.webmproject.org/code/contribute/submitting-patches/
+    https://www.webmproject.org/code/contribute/submitting-patches/
 
 Discuss:
 ========
 
 Email: webp-discuss@webmproject.org
-Web: http://groups.google.com/a/webmproject.org/group/webp-discuss
+Web: https://groups.google.com/a/webmproject.org/group/webp-discuss
diff --git a/media/libwebp/README.mux b/media/libwebp/README.mux
index 7e9c3c903b..099d8e061d 100644
--- a/media/libwebp/README.mux
+++ b/media/libwebp/README.mux
@@ -1,7 +1,7 @@
           __   __  ____  ____  ____  __ __  _     __ __
          /  \\/  \/  _ \/  _ \/  _ \/  \  \/ \___/_ / _\
          \       /   __/  _  \   __/      /  /  (_/  /__
-          \__\__/\_____/_____/__/  \__//_/\_____/__/___/v1.0.2
+          \__\__/\_____/_____/__/  \__//_/\_____/__/___/v1.2.2
 
 
 Description:
@@ -43,10 +43,12 @@ GET_OPTIONS:
    frame n   get nth frame
 
 SET_OPTIONS:
- Set color profile/metadata:
-   icc  file.icc     set ICC profile
-   exif file.exif    set EXIF metadata
-   xmp  file.xmp     set XMP metadata
+ Set color profile/metadata/parameters:
+   loop LOOP_COUNT            set the loop count
+   bgcolor BACKGROUND_COLOR   set the animation background color
+   icc  file.icc              set ICC profile
+   exif file.exif             set EXIF metadata
+   xmp  file.xmp              set XMP metadata
    where:    'file.icc' contains the ICC profile to be set,
              'file.exif' contains the EXIF metadata to be set
              'file.xmp' contains the XMP metadata to be set
@@ -247,10 +249,10 @@ Bugs:
 Please report all bugs to the issue tracker:
     https://bugs.chromium.org/p/webp
 Patches welcome! See this page to get started:
-    http://www.webmproject.org/code/contribute/submitting-patches/
+    https://www.webmproject.org/code/contribute/submitting-patches/
 
 Discuss:
 ========
 
 Email: webp-discuss@webmproject.org
-Web: http://groups.google.com/a/webmproject.org/group/webp-discuss
+Web: https://groups.google.com/a/webmproject.org/group/webp-discuss
diff --git a/media/libwebp/UXPCHANGES b/media/libwebp/UXPCHANGES
index 78b7823c8d..bf1fe22d6e 100644
--- a/media/libwebp/UXPCHANGES
+++ b/media/libwebp/UXPCHANGES
@@ -3,3 +3,4 @@ Changes made to pristine libwebp source by Moonchild Productions and mozilla.org
 2017/01/27  -- Synced with libwebp-0.6.0 (BZ #1294490).
 2018/06/29  -- Synced with libwebp-1.0.0 + BUG=webp:381,383,384.
 2019/01/21  -- Synced with libwebp-1.0.2
+2022/06/26  -- Synced with libwebp-1.2.2
diff --git a/media/libwebp/dec/alpha_dec.c b/media/libwebp/dec/alpha_dec.c
index 1ff7c62d8b..52c24037e4 100644
--- a/media/libwebp/dec/alpha_dec.c
+++ b/media/libwebp/dec/alpha_dec.c
@@ -183,7 +183,7 @@ const uint8_t* VP8DecompressAlphaRows(VP8Decoder* const dec,
   assert(dec != NULL && io != NULL);
 
   if (row < 0 || num_rows <= 0 || row + num_rows > height) {
-    return NULL;    // sanity check.
+    return NULL;
   }
 
   if (!dec->is_alpha_decoded_) {
diff --git a/media/libwebp/dec/buffer_dec.c b/media/libwebp/dec/buffer_dec.c
index d72d32b0a9..0f3eed2cfe 100644
--- a/media/libwebp/dec/buffer_dec.c
+++ b/media/libwebp/dec/buffer_dec.c
@@ -102,7 +102,7 @@ static VP8StatusCode AllocateBuffer(WebPDecBuffer* const buffer) {
     int stride;
     uint64_t size;
 
-    if ((uint64_t)w * kModeBpp[mode] >= (1ull << 32)) {
+    if ((uint64_t)w * kModeBpp[mode] >= (1ull << 31)) {
       return VP8_STATUS_INVALID_PARAM;
     }
     stride = w * kModeBpp[mode];
@@ -117,7 +117,6 @@ static VP8StatusCode AllocateBuffer(WebPDecBuffer* const buffer) {
     }
     total_size = size + 2 * uv_size + a_size;
 
-    // Security/sanity checks
     output = (uint8_t*)WebPSafeMalloc(total_size, sizeof(*output));
     if (output == NULL) {
       return VP8_STATUS_OUT_OF_MEMORY;
@@ -156,11 +155,11 @@ VP8StatusCode WebPFlipBuffer(WebPDecBuffer* const buffer) {
   }
   if (WebPIsRGBMode(buffer->colorspace)) {
     WebPRGBABuffer* const buf = &buffer->u.RGBA;
-    buf->rgba += (buffer->height - 1) * buf->stride;
+    buf->rgba += (int64_t)(buffer->height - 1) * buf->stride;
     buf->stride = -buf->stride;
   } else {
     WebPYUVABuffer* const buf = &buffer->u.YUVA;
-    const int H = buffer->height;
+    const int64_t H = buffer->height;
     buf->y += (H - 1) * buf->y_stride;
     buf->y_stride = -buf->y_stride;
     buf->u += ((H - 1) >> 1) * buf->u_stride;
@@ -188,8 +187,7 @@ VP8StatusCode WebPAllocateDecBuffer(int width, int height,
       const int ch = options->crop_height;
       const int x = options->crop_left & ~1;
       const int y = options->crop_top & ~1;
-      if (x < 0 || y < 0 || cw <= 0 || ch <= 0 ||
-          x + cw > width || y + ch > height) {
+      if (!WebPCheckCropDimensions(width, height, x, y, cw, ch)) {
         return VP8_STATUS_INVALID_PARAM;   // out of frame boundary.
       }
       width = cw;
diff --git a/media/libwebp/dec/frame_dec.c b/media/libwebp/dec/frame_dec.c
index 3d1d662746..d4cdc15344 100644
--- a/media/libwebp/dec/frame_dec.c
+++ b/media/libwebp/dec/frame_dec.c
@@ -705,7 +705,7 @@ static int AllocateMemory(VP8Decoder* const dec) {
                         + cache_size + alpha_size + WEBP_ALIGN_CST;
   uint8_t* mem;
 
-  if (needed != (size_t)needed) return 0;  // check for overflow
+  if (!CheckSizeOverflow(needed)) return 0;  // check for overflow
   if (needed > dec->mem_size_) {
     WebPSafeFree(dec->mem_);
     dec->mem_size_ = 0;
@@ -732,7 +732,7 @@ static int AllocateMemory(VP8Decoder* const dec) {
   mem += f_info_size;
   dec->thread_ctx_.id_ = 0;
   dec->thread_ctx_.f_info_ = dec->f_info_;
-  if (dec->mt_method_ > 0) {
+  if (dec->filter_type_ > 0 && dec->mt_method_ > 0) {
     // secondary cache line. The deblocking process need to make use of the
     // filtering strength from previous macroblock row, while the new ones
     // are being decoded in parallel. We'll just swap the pointers.
diff --git a/media/libwebp/dec/idec_dec.c b/media/libwebp/dec/idec_dec.c
index ee0d33eac4..3a592d59ed 100644
--- a/media/libwebp/dec/idec_dec.c
+++ b/media/libwebp/dec/idec_dec.c
@@ -166,9 +166,11 @@ static int AppendToMemBuffer(WebPIDecoder* const idec,
   VP8Decoder* const dec = (VP8Decoder*)idec->dec_;
   MemBuffer* const mem = &idec->mem_;
   const int need_compressed_alpha = NeedCompressedAlpha(idec);
-  const uint8_t* const old_start = mem->buf_ + mem->start_;
+  const uint8_t* const old_start =
+      (mem->buf_ == NULL) ? NULL : mem->buf_ + mem->start_;
   const uint8_t* const old_base =
       need_compressed_alpha ? dec->alpha_data_ : old_start;
+  assert(mem->buf_ != NULL || mem->start_ == 0);
   assert(mem->mode_ == MEM_MODE_APPEND);
   if (data_size > MAX_CHUNK_PAYLOAD) {
     // security safeguard: trying to allocate more than what the format
@@ -184,7 +186,7 @@ static int AppendToMemBuffer(WebPIDecoder* const idec,
     uint8_t* const new_buf =
         (uint8_t*)WebPSafeMalloc(extra_size, sizeof(*new_buf));
     if (new_buf == NULL) return 0;
-    memcpy(new_buf, old_base, current_size);
+    if (old_base != NULL) memcpy(new_buf, old_base, current_size);
     WebPSafeFree(mem->buf_);
     mem->buf_ = new_buf;
     mem->buf_size_ = (size_t)extra_size;
@@ -192,6 +194,7 @@ static int AppendToMemBuffer(WebPIDecoder* const idec,
     mem->end_ = current_size;
   }
 
+  assert(mem->buf_ != NULL);
   memcpy(mem->buf_ + mem->end_, data, data_size);
   mem->end_ += data_size;
   assert(mem->end_ <= mem->buf_size_);
@@ -204,7 +207,9 @@ static int RemapMemBuffer(WebPIDecoder* const idec,
                           const uint8_t* const data, size_t data_size) {
   MemBuffer* const mem = &idec->mem_;
   const uint8_t* const old_buf = mem->buf_;
-  const uint8_t* const old_start = old_buf + mem->start_;
+  const uint8_t* const old_start =
+      (old_buf == NULL) ? NULL : old_buf + mem->start_;
+  assert(old_buf != NULL || mem->start_ == 0);
   assert(mem->mode_ == MEM_MODE_MAP);
 
   if (data_size < mem->buf_size_) return 0;  // can't remap to a shorter buffer!
diff --git a/media/libwebp/dec/io_dec.c b/media/libwebp/dec/io_dec.c
index 0edd9f526e..6124c61393 100644
--- a/media/libwebp/dec/io_dec.c
+++ b/media/libwebp/dec/io_dec.c
@@ -25,21 +25,16 @@
 static int EmitYUV(const VP8Io* const io, WebPDecParams* const p) {
   WebPDecBuffer* output = p->output;
   const WebPYUVABuffer* const buf = &output->u.YUVA;
-  uint8_t* const y_dst = buf->y + io->mb_y * buf->y_stride;
-  uint8_t* const u_dst = buf->u + (io->mb_y >> 1) * buf->u_stride;
-  uint8_t* const v_dst = buf->v + (io->mb_y >> 1) * buf->v_stride;
+  uint8_t* const y_dst = buf->y + (size_t)io->mb_y * buf->y_stride;
+  uint8_t* const u_dst = buf->u + (size_t)(io->mb_y >> 1) * buf->u_stride;
+  uint8_t* const v_dst = buf->v + (size_t)(io->mb_y >> 1) * buf->v_stride;
   const int mb_w = io->mb_w;
   const int mb_h = io->mb_h;
   const int uv_w = (mb_w + 1) / 2;
   const int uv_h = (mb_h + 1) / 2;
-  int j;
-  for (j = 0; j < mb_h; ++j) {
-    memcpy(y_dst + j * buf->y_stride, io->y + j * io->y_stride, mb_w);
-  }
-  for (j = 0; j < uv_h; ++j) {
-    memcpy(u_dst + j * buf->u_stride, io->u + j * io->uv_stride, uv_w);
-    memcpy(v_dst + j * buf->v_stride, io->v + j * io->uv_stride, uv_w);
-  }
+  WebPCopyPlane(io->y, io->y_stride, y_dst, buf->y_stride, mb_w, mb_h);
+  WebPCopyPlane(io->u, io->uv_stride, u_dst, buf->u_stride, uv_w, uv_h);
+  WebPCopyPlane(io->v, io->uv_stride, v_dst, buf->v_stride, uv_w, uv_h);
   return io->mb_h;
 }
 
@@ -47,7 +42,7 @@ static int EmitYUV(const VP8Io* const io, WebPDecParams* const p) {
 static int EmitSampledRGB(const VP8Io* const io, WebPDecParams* const p) {
   WebPDecBuffer* const output = p->output;
   WebPRGBABuffer* const buf = &output->u.RGBA;
-  uint8_t* const dst = buf->rgba + io->mb_y * buf->stride;
+  uint8_t* const dst = buf->rgba + (size_t)io->mb_y * buf->stride;
   WebPSamplerProcessPlane(io->y, io->y_stride,
                           io->u, io->v, io->uv_stride,
                           dst, buf->stride, io->mb_w, io->mb_h,
@@ -62,7 +57,7 @@ static int EmitSampledRGB(const VP8Io* const io, WebPDecParams* const p) {
 static int EmitFancyRGB(const VP8Io* const io, WebPDecParams* const p) {
   int num_lines_out = io->mb_h;   // a priori guess
   const WebPRGBABuffer* const buf = &p->output->u.RGBA;
-  uint8_t* dst = buf->rgba + io->mb_y * buf->stride;
+  uint8_t* dst = buf->rgba + (size_t)io->mb_y * buf->stride;
   WebPUpsampleLinePairFunc upsample = WebPUpsamplers[p->output->colorspace];
   const uint8_t* cur_y = io->y;
   const uint8_t* cur_u = io->u;
@@ -133,7 +128,7 @@ static int EmitAlphaYUV(const VP8Io* const io, WebPDecParams* const p,
   const WebPYUVABuffer* const buf = &p->output->u.YUVA;
   const int mb_w = io->mb_w;
   const int mb_h = io->mb_h;
-  uint8_t* dst = buf->a + io->mb_y * buf->a_stride;
+  uint8_t* dst = buf->a + (size_t)io->mb_y * buf->a_stride;
   int j;
   (void)expected_num_lines_out;
   assert(expected_num_lines_out == mb_h);
@@ -186,7 +181,7 @@ static int EmitAlphaRGB(const VP8Io* const io, WebPDecParams* const p,
         (colorspace == MODE_ARGB || colorspace == MODE_Argb);
     const WebPRGBABuffer* const buf = &p->output->u.RGBA;
     int num_rows;
-    const int start_y = GetAlphaSourceRow(io, &alpha, &num_rows);
+    const size_t start_y = GetAlphaSourceRow(io, &alpha, &num_rows);
     uint8_t* const base_rgba = buf->rgba + start_y * buf->stride;
     uint8_t* const dst = base_rgba + (alpha_first ? 0 : 3);
     const int has_alpha = WebPDispatchAlpha(alpha, io->width, mb_w,
@@ -210,7 +205,7 @@ static int EmitAlphaRGBA4444(const VP8Io* const io, WebPDecParams* const p,
     const WEBP_CSP_MODE colorspace = p->output->colorspace;
     const WebPRGBABuffer* const buf = &p->output->u.RGBA;
     int num_rows;
-    const int start_y = GetAlphaSourceRow(io, &alpha, &num_rows);
+    const size_t start_y = GetAlphaSourceRow(io, &alpha, &num_rows);
     uint8_t* const base_rgba = buf->rgba + start_y * buf->stride;
 #if (WEBP_SWAP_16BIT_CSP == 1)
     uint8_t* alpha_dst = base_rgba;
@@ -276,9 +271,9 @@ static int EmitRescaledYUV(const VP8Io* const io, WebPDecParams* const p) {
 static int EmitRescaledAlphaYUV(const VP8Io* const io, WebPDecParams* const p,
                                 int expected_num_lines_out) {
   const WebPYUVABuffer* const buf = &p->output->u.YUVA;
-  uint8_t* const dst_a = buf->a + p->last_y * buf->a_stride;
+  uint8_t* const dst_a = buf->a + (size_t)p->last_y * buf->a_stride;
   if (io->a != NULL) {
-    uint8_t* const dst_y = buf->y + p->last_y * buf->y_stride;
+    uint8_t* const dst_y = buf->y + (size_t)p->last_y * buf->y_stride;
     const int num_lines_out = Rescale(io->a, io->width, io->mb_h, p->scaler_a);
     assert(expected_num_lines_out == num_lines_out);
     if (num_lines_out > 0) {   // unmultiply the Y
@@ -303,46 +298,57 @@ static int InitYUVRescaler(const VP8Io* const io, WebPDecParams* const p) {
   const int uv_out_height = (out_height + 1) >> 1;
   const int uv_in_width  = (io->mb_w + 1) >> 1;
   const int uv_in_height = (io->mb_h + 1) >> 1;
-  const size_t work_size = 2 * out_width;   // scratch memory for luma rescaler
+  // scratch memory for luma rescaler
+  const size_t work_size = 2 * (size_t)out_width;
   const size_t uv_work_size = 2 * uv_out_width;  // and for each u/v ones
-  size_t tmp_size, rescaler_size;
+  uint64_t total_size;
+  size_t rescaler_size;
   rescaler_t* work;
   WebPRescaler* scalers;
   const int num_rescalers = has_alpha ? 4 : 3;
 
-  tmp_size = (work_size + 2 * uv_work_size) * sizeof(*work);
+  total_size = ((uint64_t)work_size + 2 * uv_work_size) * sizeof(*work);
   if (has_alpha) {
-    tmp_size += work_size * sizeof(*work);
+    total_size += (uint64_t)work_size * sizeof(*work);
   }
   rescaler_size = num_rescalers * sizeof(*p->scaler_y) + WEBP_ALIGN_CST;
+  total_size += rescaler_size;
+  if (!CheckSizeOverflow(total_size)) {
+    return 0;
+  }
 
-  p->memory = WebPSafeMalloc(1ULL, tmp_size + rescaler_size);
+  p->memory = WebPSafeMalloc(1ULL, (size_t)total_size);
   if (p->memory == NULL) {
     return 0;   // memory error
   }
   work = (rescaler_t*)p->memory;
 
-  scalers = (WebPRescaler*)WEBP_ALIGN((const uint8_t*)work + tmp_size);
+  scalers = (WebPRescaler*)WEBP_ALIGN(
+      (const uint8_t*)work + total_size - rescaler_size);
   p->scaler_y = &scalers[0];
   p->scaler_u = &scalers[1];
   p->scaler_v = &scalers[2];
   p->scaler_a = has_alpha ? &scalers[3] : NULL;
 
-  WebPRescalerInit(p->scaler_y, io->mb_w, io->mb_h,
-                   buf->y, out_width, out_height, buf->y_stride, 1,
-                   work);
-  WebPRescalerInit(p->scaler_u, uv_in_width, uv_in_height,
-                   buf->u, uv_out_width, uv_out_height, buf->u_stride, 1,
-                   work + work_size);
-  WebPRescalerInit(p->scaler_v, uv_in_width, uv_in_height,
-                   buf->v, uv_out_width, uv_out_height, buf->v_stride, 1,
-                   work + work_size + uv_work_size);
+  if (!WebPRescalerInit(p->scaler_y, io->mb_w, io->mb_h,
+                        buf->y, out_width, out_height, buf->y_stride, 1,
+                        work) ||
+      !WebPRescalerInit(p->scaler_u, uv_in_width, uv_in_height,
+                        buf->u, uv_out_width, uv_out_height, buf->u_stride, 1,
+                        work + work_size) ||
+      !WebPRescalerInit(p->scaler_v, uv_in_width, uv_in_height,
+                        buf->v, uv_out_width, uv_out_height, buf->v_stride, 1,
+                        work + work_size + uv_work_size)) {
+    return 0;
+  }
   p->emit = EmitRescaledYUV;
 
   if (has_alpha) {
-    WebPRescalerInit(p->scaler_a, io->mb_w, io->mb_h,
-                     buf->a, out_width, out_height, buf->a_stride, 1,
-                     work + work_size + 2 * uv_work_size);
+    if (!WebPRescalerInit(p->scaler_a, io->mb_w, io->mb_h,
+                          buf->a, out_width, out_height, buf->a_stride, 1,
+                          work + work_size + 2 * uv_work_size)) {
+      return 0;
+    }
     p->emit_alpha = EmitRescaledAlphaYUV;
     WebPInitAlphaProcessing();
   }
@@ -356,7 +362,7 @@ static int ExportRGB(WebPDecParams* const p, int y_pos) {
   const WebPYUV444Converter convert =
       WebPYUV444Converters[p->output->colorspace];
   const WebPRGBABuffer* const buf = &p->output->u.RGBA;
-  uint8_t* dst = buf->rgba + y_pos * buf->stride;
+  uint8_t* dst = buf->rgba + (size_t)y_pos * buf->stride;
   int num_lines_out = 0;
   // For RGB rescaling, because of the YUV420, current scan position
   // U/V can be +1/-1 line from the Y one.  Hence the double test.
@@ -383,15 +389,15 @@ static int EmitRescaledRGB(const VP8Io* const io, WebPDecParams* const p) {
   while (j < mb_h) {
     const int y_lines_in =
         WebPRescalerImport(p->scaler_y, mb_h - j,
-                           io->y + j * io->y_stride, io->y_stride);
+                           io->y + (size_t)j * io->y_stride, io->y_stride);
     j += y_lines_in;
     if (WebPRescaleNeededLines(p->scaler_u, uv_mb_h - uv_j)) {
-      const int u_lines_in =
-          WebPRescalerImport(p->scaler_u, uv_mb_h - uv_j,
-                             io->u + uv_j * io->uv_stride, io->uv_stride);
-      const int v_lines_in =
-          WebPRescalerImport(p->scaler_v, uv_mb_h - uv_j,
-                             io->v + uv_j * io->uv_stride, io->uv_stride);
+      const int u_lines_in = WebPRescalerImport(
+          p->scaler_u, uv_mb_h - uv_j, io->u + (size_t)uv_j * io->uv_stride,
+          io->uv_stride);
+      const int v_lines_in = WebPRescalerImport(
+          p->scaler_v, uv_mb_h - uv_j, io->v + (size_t)uv_j * io->uv_stride,
+          io->uv_stride);
       (void)v_lines_in;   // remove a gcc warning
       assert(u_lines_in == v_lines_in);
       uv_j += u_lines_in;
@@ -403,7 +409,7 @@ static int EmitRescaledRGB(const VP8Io* const io, WebPDecParams* const p) {
 
 static int ExportAlpha(WebPDecParams* const p, int y_pos, int max_lines_out) {
   const WebPRGBABuffer* const buf = &p->output->u.RGBA;
-  uint8_t* const base_rgba = buf->rgba + y_pos * buf->stride;
+  uint8_t* const base_rgba = buf->rgba + (size_t)y_pos * buf->stride;
   const WEBP_CSP_MODE colorspace = p->output->colorspace;
   const int alpha_first =
       (colorspace == MODE_ARGB || colorspace == MODE_Argb);
@@ -431,7 +437,7 @@ static int ExportAlpha(WebPDecParams* const p, int y_pos, int max_lines_out) {
 static int ExportAlphaRGBA4444(WebPDecParams* const p, int y_pos,
                                int max_lines_out) {
   const WebPRGBABuffer* const buf = &p->output->u.RGBA;
-  uint8_t* const base_rgba = buf->rgba + y_pos * buf->stride;
+  uint8_t* const base_rgba = buf->rgba + (size_t)y_pos * buf->stride;
 #if (WEBP_SWAP_16BIT_CSP == 1)
   uint8_t* alpha_dst = base_rgba;
 #else
@@ -470,7 +476,7 @@ static int EmitRescaledAlphaRGB(const VP8Io* const io, WebPDecParams* const p,
     int lines_left = expected_num_out_lines;
     const int y_end = p->last_y + lines_left;
     while (lines_left > 0) {
-      const int row_offset = scaler->src_y - io->mb_y;
+      const int64_t row_offset = (int64_t)scaler->src_y - io->mb_y;
       WebPRescalerImport(scaler, io->mb_h + io->mb_y - scaler->src_y,
                          io->a + row_offset * io->width, io->width);
       lines_left -= p->emit_alpha_row(p, y_end - lines_left, lines_left);
@@ -485,51 +491,58 @@ static int InitRGBRescaler(const VP8Io* const io, WebPDecParams* const p) {
   const int out_height = io->scaled_height;
   const int uv_in_width  = (io->mb_w + 1) >> 1;
   const int uv_in_height = (io->mb_h + 1) >> 1;
-  const size_t work_size = 2 * out_width;   // scratch memory for one rescaler
+  // scratch memory for one rescaler
+  const size_t work_size = 2 * (size_t)out_width;
   rescaler_t* work;  // rescalers work area
   uint8_t* tmp;   // tmp storage for scaled YUV444 samples before RGB conversion
-  size_t tmp_size1, tmp_size2, total_size, rescaler_size;
+  uint64_t tmp_size1, tmp_size2, total_size;
+  size_t rescaler_size;
   WebPRescaler* scalers;
   const int num_rescalers = has_alpha ? 4 : 3;
 
-  tmp_size1 = 3 * work_size;
-  tmp_size2 = 3 * out_width;
-  if (has_alpha) {
-    tmp_size1 += work_size;
-    tmp_size2 += out_width;
-  }
+  tmp_size1 = (uint64_t)num_rescalers * work_size;
+  tmp_size2 = (uint64_t)num_rescalers * out_width;
   total_size = tmp_size1 * sizeof(*work) + tmp_size2 * sizeof(*tmp);
   rescaler_size = num_rescalers * sizeof(*p->scaler_y) + WEBP_ALIGN_CST;
+  total_size += rescaler_size;
+  if (!CheckSizeOverflow(total_size)) {
+    return 0;
+  }
 
-  p->memory = WebPSafeMalloc(1ULL, total_size + rescaler_size);
+  p->memory = WebPSafeMalloc(1ULL, (size_t)total_size);
   if (p->memory == NULL) {
     return 0;   // memory error
   }
   work = (rescaler_t*)p->memory;
   tmp = (uint8_t*)(work + tmp_size1);
 
-  scalers = (WebPRescaler*)WEBP_ALIGN((const uint8_t*)work + total_size);
+  scalers = (WebPRescaler*)WEBP_ALIGN(
+      (const uint8_t*)work + total_size - rescaler_size);
   p->scaler_y = &scalers[0];
   p->scaler_u = &scalers[1];
   p->scaler_v = &scalers[2];
   p->scaler_a = has_alpha ? &scalers[3] : NULL;
 
-  WebPRescalerInit(p->scaler_y, io->mb_w, io->mb_h,
-                   tmp + 0 * out_width, out_width, out_height, 0, 1,
-                   work + 0 * work_size);
-  WebPRescalerInit(p->scaler_u, uv_in_width, uv_in_height,
-                   tmp + 1 * out_width, out_width, out_height, 0, 1,
-                   work + 1 * work_size);
-  WebPRescalerInit(p->scaler_v, uv_in_width, uv_in_height,
-                   tmp + 2 * out_width, out_width, out_height, 0, 1,
-                   work + 2 * work_size);
+  if (!WebPRescalerInit(p->scaler_y, io->mb_w, io->mb_h,
+                        tmp + 0 * out_width, out_width, out_height, 0, 1,
+                        work + 0 * work_size) ||
+      !WebPRescalerInit(p->scaler_u, uv_in_width, uv_in_height,
+                        tmp + 1 * out_width, out_width, out_height, 0, 1,
+                        work + 1 * work_size) ||
+      !WebPRescalerInit(p->scaler_v, uv_in_width, uv_in_height,
+                        tmp + 2 * out_width, out_width, out_height, 0, 1,
+                        work + 2 * work_size)) {
+    return 0;
+  }
   p->emit = EmitRescaledRGB;
   WebPInitYUV444Converters();
 
   if (has_alpha) {
-    WebPRescalerInit(p->scaler_a, io->mb_w, io->mb_h,
-                     tmp + 3 * out_width, out_width, out_height, 0, 1,
-                     work + 3 * work_size);
+    if (!WebPRescalerInit(p->scaler_a, io->mb_w, io->mb_h,
+                          tmp + 3 * out_width, out_width, out_height, 0, 1,
+                          work + 3 * work_size)) {
+      return 0;
+    }
     p->emit_alpha = EmitRescaledAlphaRGB;
     if (p->output->colorspace == MODE_RGBA_4444 ||
         p->output->colorspace == MODE_rgbA_4444) {
diff --git a/media/libwebp/dec/quant_dec.c b/media/libwebp/dec/quant_dec.c
index 6ecaf1c453..351da5f561 100644
--- a/media/libwebp/dec/quant_dec.c
+++ b/media/libwebp/dec/quant_dec.c
@@ -61,12 +61,17 @@ static const uint16_t kAcTable[128] = {
 
 void VP8ParseQuant(VP8Decoder* const dec) {
   VP8BitReader* const br = &dec->br_;
-  const int base_q0 = VP8GetValue(br, 7);
-  const int dqy1_dc = VP8Get(br) ? VP8GetSignedValue(br, 4) : 0;
-  const int dqy2_dc = VP8Get(br) ? VP8GetSignedValue(br, 4) : 0;
-  const int dqy2_ac = VP8Get(br) ? VP8GetSignedValue(br, 4) : 0;
-  const int dquv_dc = VP8Get(br) ? VP8GetSignedValue(br, 4) : 0;
-  const int dquv_ac = VP8Get(br) ? VP8GetSignedValue(br, 4) : 0;
+  const int base_q0 = VP8GetValue(br, 7, "global-header");
+  const int dqy1_dc = VP8Get(br, "global-header") ?
+       VP8GetSignedValue(br, 4, "global-header") : 0;
+  const int dqy2_dc = VP8Get(br, "global-header") ?
+       VP8GetSignedValue(br, 4, "global-header") : 0;
+  const int dqy2_ac = VP8Get(br, "global-header") ?
+       VP8GetSignedValue(br, 4, "global-header") : 0;
+  const int dquv_dc = VP8Get(br, "global-header") ?
+       VP8GetSignedValue(br, 4, "global-header") : 0;
+  const int dquv_ac = VP8Get(br, "global-header") ?
+       VP8GetSignedValue(br, 4, "global-header") : 0;
 
   const VP8SegmentHeader* const hdr = &dec->segment_hdr_;
   int i;
diff --git a/media/libwebp/dec/tree_dec.c b/media/libwebp/dec/tree_dec.c
index 5818860254..b219cdd2c9 100644
--- a/media/libwebp/dec/tree_dec.c
+++ b/media/libwebp/dec/tree_dec.c
@@ -296,20 +296,21 @@ static void ParseIntraMode(VP8BitReader* const br,
   // to decode more than 1 keyframe.
   if (dec->segment_hdr_.update_map_) {
     // Hardcoded tree parsing
-    block->segment_ = !VP8GetBit(br, dec->proba_.segments_[0])
-                    ? VP8GetBit(br, dec->proba_.segments_[1])
-                    : 2 + VP8GetBit(br, dec->proba_.segments_[2]);
+    block->segment_ = !VP8GetBit(br, dec->proba_.segments_[0], "segments")
+                    ?  VP8GetBit(br, dec->proba_.segments_[1], "segments")
+                    :  VP8GetBit(br, dec->proba_.segments_[2], "segments") + 2;
   } else {
     block->segment_ = 0;  // default for intra
   }
-  if (dec->use_skip_proba_) block->skip_ = VP8GetBit(br, dec->skip_p_);
+  if (dec->use_skip_proba_) block->skip_ = VP8GetBit(br, dec->skip_p_, "skip");
 
-  block->is_i4x4_ = !VP8GetBit(br, 145);   // decide for B_PRED first
+  block->is_i4x4_ = !VP8GetBit(br, 145, "block-size");
   if (!block->is_i4x4_) {
     // Hardcoded 16x16 intra-mode decision tree.
     const int ymode =
-        VP8GetBit(br, 156) ? (VP8GetBit(br, 128) ? TM_PRED : H_PRED)
-                           : (VP8GetBit(br, 163) ? V_PRED : DC_PRED);
+        VP8GetBit(br, 156, "pred-modes") ?
+            (VP8GetBit(br, 128, "pred-modes") ? TM_PRED : H_PRED) :
+            (VP8GetBit(br, 163, "pred-modes") ? V_PRED : DC_PRED);
     block->imodes_[0] = ymode;
     memset(top, ymode, 4 * sizeof(*top));
     memset(left, ymode, 4 * sizeof(*left));
@@ -323,22 +324,25 @@ static void ParseIntraMode(VP8BitReader* const br,
         const uint8_t* const prob = kBModesProba[top[x]][ymode];
 #if (USE_GENERIC_TREE == 1)
         // Generic tree-parsing
-        int i = kYModesIntra4[VP8GetBit(br, prob[0])];
+        int i = kYModesIntra4[VP8GetBit(br, prob[0], "pred-modes")];
         while (i > 0) {
-          i = kYModesIntra4[2 * i + VP8GetBit(br, prob[i])];
+          i = kYModesIntra4[2 * i + VP8GetBit(br, prob[i], "pred-modes")];
         }
         ymode = -i;
 #else
         // Hardcoded tree parsing
-        ymode = !VP8GetBit(br, prob[0]) ? B_DC_PRED :
-                  !VP8GetBit(br, prob[1]) ? B_TM_PRED :
-                    !VP8GetBit(br, prob[2]) ? B_VE_PRED :
-                      !VP8GetBit(br, prob[3]) ?
-                        (!VP8GetBit(br, prob[4]) ? B_HE_PRED :
-                          (!VP8GetBit(br, prob[5]) ? B_RD_PRED : B_VR_PRED)) :
-                        (!VP8GetBit(br, prob[6]) ? B_LD_PRED :
-                          (!VP8GetBit(br, prob[7]) ? B_VL_PRED :
-                            (!VP8GetBit(br, prob[8]) ? B_HD_PRED : B_HU_PRED)));
+        ymode = !VP8GetBit(br, prob[0], "pred-modes") ? B_DC_PRED :
+                  !VP8GetBit(br, prob[1], "pred-modes") ? B_TM_PRED :
+                    !VP8GetBit(br, prob[2], "pred-modes") ? B_VE_PRED :
+                      !VP8GetBit(br, prob[3], "pred-modes") ?
+                        (!VP8GetBit(br, prob[4], "pred-modes") ? B_HE_PRED :
+                          (!VP8GetBit(br, prob[5], "pred-modes") ? B_RD_PRED
+                                                                 : B_VR_PRED)) :
+                        (!VP8GetBit(br, prob[6], "pred-modes") ? B_LD_PRED :
+                          (!VP8GetBit(br, prob[7], "pred-modes") ? B_VL_PRED :
+                            (!VP8GetBit(br, prob[8], "pred-modes") ? B_HD_PRED
+                                                                   : B_HU_PRED))
+                        );
 #endif  // USE_GENERIC_TREE
         top[x] = ymode;
       }
@@ -348,9 +352,9 @@ static void ParseIntraMode(VP8BitReader* const br,
     }
   }
   // Hardcoded UVMode decision tree
-  block->uvmode_ = !VP8GetBit(br, 142) ? DC_PRED
-                 : !VP8GetBit(br, 114) ? V_PRED
-                 : VP8GetBit(br, 183) ? TM_PRED : H_PRED;
+  block->uvmode_ = !VP8GetBit(br, 142, "pred-modes-uv") ? DC_PRED
+                 : !VP8GetBit(br, 114, "pred-modes-uv") ? V_PRED
+                 : VP8GetBit(br, 183, "pred-modes-uv") ? TM_PRED : H_PRED;
 }
 
 int VP8ParseIntraModeRow(VP8BitReader* const br, VP8Decoder* const dec) {
@@ -514,8 +518,10 @@ void VP8ParseProba(VP8BitReader* const br, VP8Decoder* const dec) {
     for (b = 0; b < NUM_BANDS; ++b) {
       for (c = 0; c < NUM_CTX; ++c) {
         for (p = 0; p < NUM_PROBAS; ++p) {
-          const int v = VP8GetBit(br, CoeffsUpdateProba[t][b][c][p]) ?
-                        VP8GetValue(br, 8) : CoeffsProba0[t][b][c][p];
+          const int v =
+              VP8GetBit(br, CoeffsUpdateProba[t][b][c][p], "global-header") ?
+                        VP8GetValue(br, 8, "global-header") :
+                        CoeffsProba0[t][b][c][p];
           proba->bands_[t][b].probas_[c][p] = v;
         }
       }
@@ -524,9 +530,8 @@ void VP8ParseProba(VP8BitReader* const br, VP8Decoder* const dec) {
       proba->bands_ptr_[t][b] = &proba->bands_[t][kBands[b]];
     }
   }
-  dec->use_skip_proba_ = VP8Get(br);
+  dec->use_skip_proba_ = VP8Get(br, "global-header");
   if (dec->use_skip_proba_) {
-    dec->skip_p_ = VP8GetValue(br, 8);
+    dec->skip_p_ = VP8GetValue(br, 8, "global-header");
   }
 }
-
diff --git a/media/libwebp/dec/vp8_dec.c b/media/libwebp/dec/vp8_dec.c
index e7958be6b0..5f51363e53 100644
--- a/media/libwebp/dec/vp8_dec.c
+++ b/media/libwebp/dec/vp8_dec.c
@@ -161,23 +161,26 @@ static int ParseSegmentHeader(VP8BitReader* br,
                               VP8SegmentHeader* hdr, VP8Proba* proba) {
   assert(br != NULL);
   assert(hdr != NULL);
-  hdr->use_segment_ = VP8Get(br);
+  hdr->use_segment_ = VP8Get(br, "global-header");
   if (hdr->use_segment_) {
-    hdr->update_map_ = VP8Get(br);
-    if (VP8Get(br)) {   // update data
+    hdr->update_map_ = VP8Get(br, "global-header");
+    if (VP8Get(br, "global-header")) {   // update data
       int s;
-      hdr->absolute_delta_ = VP8Get(br);
+      hdr->absolute_delta_ = VP8Get(br, "global-header");
       for (s = 0; s < NUM_MB_SEGMENTS; ++s) {
-        hdr->quantizer_[s] = VP8Get(br) ? VP8GetSignedValue(br, 7) : 0;
+        hdr->quantizer_[s] = VP8Get(br, "global-header") ?
+            VP8GetSignedValue(br, 7, "global-header") : 0;
       }
       for (s = 0; s < NUM_MB_SEGMENTS; ++s) {
-        hdr->filter_strength_[s] = VP8Get(br) ? VP8GetSignedValue(br, 6) : 0;
+        hdr->filter_strength_[s] = VP8Get(br, "global-header") ?
+            VP8GetSignedValue(br, 6, "global-header") : 0;
       }
     }
     if (hdr->update_map_) {
       int s;
       for (s = 0; s < MB_FEATURE_TREE_PROBS; ++s) {
-        proba->segments_[s] = VP8Get(br) ? VP8GetValue(br, 8) : 255u;
+        proba->segments_[s] = VP8Get(br, "global-header") ?
+            VP8GetValue(br, 8, "global-header") : 255u;
       }
     }
   } else {
@@ -205,7 +208,7 @@ static VP8StatusCode ParsePartitions(VP8Decoder* const dec,
   size_t last_part;
   size_t p;
 
-  dec->num_parts_minus_one_ = (1 << VP8GetValue(br, 2)) - 1;
+  dec->num_parts_minus_one_ = (1 << VP8GetValue(br, 2, "global-header")) - 1;
   last_part = dec->num_parts_minus_one_;
   if (size < 3 * last_part) {
     // we can't even read the sizes with sz[]! That's a failure.
@@ -229,21 +232,21 @@ static VP8StatusCode ParsePartitions(VP8Decoder* const dec,
 // Paragraph 9.4
 static int ParseFilterHeader(VP8BitReader* br, VP8Decoder* const dec) {
   VP8FilterHeader* const hdr = &dec->filter_hdr_;
-  hdr->simple_    = VP8Get(br);
-  hdr->level_     = VP8GetValue(br, 6);
-  hdr->sharpness_ = VP8GetValue(br, 3);
-  hdr->use_lf_delta_ = VP8Get(br);
+  hdr->simple_    = VP8Get(br, "global-header");
+  hdr->level_     = VP8GetValue(br, 6, "global-header");
+  hdr->sharpness_ = VP8GetValue(br, 3, "global-header");
+  hdr->use_lf_delta_ = VP8Get(br, "global-header");
   if (hdr->use_lf_delta_) {
-    if (VP8Get(br)) {   // update lf-delta?
+    if (VP8Get(br, "global-header")) {   // update lf-delta?
       int i;
       for (i = 0; i < NUM_REF_LF_DELTAS; ++i) {
-        if (VP8Get(br)) {
-          hdr->ref_lf_delta_[i] = VP8GetSignedValue(br, 6);
+        if (VP8Get(br, "global-header")) {
+          hdr->ref_lf_delta_[i] = VP8GetSignedValue(br, 6, "global-header");
         }
       }
       for (i = 0; i < NUM_MODE_LF_DELTAS; ++i) {
-        if (VP8Get(br)) {
-          hdr->mode_lf_delta_[i] = VP8GetSignedValue(br, 6);
+        if (VP8Get(br, "global-header")) {
+          hdr->mode_lf_delta_[i] = VP8GetSignedValue(br, 6, "global-header");
         }
       }
     }
@@ -332,7 +335,7 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) {
     io->scaled_width = io->width;
     io->scaled_height = io->height;
 
-    io->mb_w = io->width;   // sanity check
+    io->mb_w = io->width;   // for soundness
     io->mb_h = io->height;  // ditto
 
     VP8ResetProba(&dec->proba_);
@@ -352,8 +355,8 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) {
   buf_size -= frm_hdr->partition_length_;
 
   if (frm_hdr->key_frame_) {
-    pic_hdr->colorspace_ = VP8Get(br);
-    pic_hdr->clamp_type_ = VP8Get(br);
+    pic_hdr->colorspace_ = VP8Get(br, "global-header");
+    pic_hdr->clamp_type_ = VP8Get(br, "global-header");
   }
   if (!ParseSegmentHeader(br, &dec->segment_hdr_, &dec->proba_)) {
     return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR,
@@ -378,7 +381,7 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) {
                        "Not a key frame.");
   }
 
-  VP8Get(br);   // ignore the value of update_proba_
+  VP8Get(br, "global-header");   // ignore the value of update_proba_
 
   VP8ParseProba(br, dec);
 
@@ -400,31 +403,31 @@ static const uint8_t kZigzag[16] = {
   0, 1, 4, 8,  5, 2, 3, 6,  9, 12, 13, 10,  7, 11, 14, 15
 };
 
-// See section 13-2: http://tools.ietf.org/html/rfc6386#section-13.2
+// See section 13-2: https://datatracker.ietf.org/doc/html/rfc6386#section-13.2
 static int GetLargeValue(VP8BitReader* const br, const uint8_t* const p) {
   int v;
-  if (!VP8GetBit(br, p[3])) {
-    if (!VP8GetBit(br, p[4])) {
+  if (!VP8GetBit(br, p[3], "coeffs")) {
+    if (!VP8GetBit(br, p[4], "coeffs")) {
       v = 2;
     } else {
-      v = 3 + VP8GetBit(br, p[5]);
+      v = 3 + VP8GetBit(br, p[5], "coeffs");
     }
   } else {
-    if (!VP8GetBit(br, p[6])) {
-      if (!VP8GetBit(br, p[7])) {
-        v = 5 + VP8GetBit(br, 159);
+    if (!VP8GetBit(br, p[6], "coeffs")) {
+      if (!VP8GetBit(br, p[7], "coeffs")) {
+        v = 5 + VP8GetBit(br, 159, "coeffs");
       } else {
-        v = 7 + 2 * VP8GetBit(br, 165);
-        v += VP8GetBit(br, 145);
+        v = 7 + 2 * VP8GetBit(br, 165, "coeffs");
+        v += VP8GetBit(br, 145, "coeffs");
       }
     } else {
       const uint8_t* tab;
-      const int bit1 = VP8GetBit(br, p[8]);
-      const int bit0 = VP8GetBit(br, p[9 + bit1]);
+      const int bit1 = VP8GetBit(br, p[8], "coeffs");
+      const int bit0 = VP8GetBit(br, p[9 + bit1], "coeffs");
       const int cat = 2 * bit1 + bit0;
       v = 0;
       for (tab = kCat3456[cat]; *tab; ++tab) {
-        v += v + VP8GetBit(br, *tab);
+        v += v + VP8GetBit(br, *tab, "coeffs");
       }
       v += 3 + (8 << cat);
     }
@@ -438,24 +441,24 @@ static int GetCoeffsFast(VP8BitReader* const br,
                          int ctx, const quant_t dq, int n, int16_t* out) {
   const uint8_t* p = prob[n]->probas_[ctx];
   for (; n < 16; ++n) {
-    if (!VP8GetBit(br, p[0])) {
+    if (!VP8GetBit(br, p[0], "coeffs")) {
       return n;  // previous coeff was last non-zero coeff
     }
-    while (!VP8GetBit(br, p[1])) {       // sequence of zero coeffs
+    while (!VP8GetBit(br, p[1], "coeffs")) {       // sequence of zero coeffs
       p = prob[++n]->probas_[0];
       if (n == 16) return 16;
     }
     {        // non zero coeff
       const VP8ProbaArray* const p_ctx = &prob[n + 1]->probas_[0];
       int v;
-      if (!VP8GetBit(br, p[2])) {
+      if (!VP8GetBit(br, p[2], "coeffs")) {
         v = 1;
         p = p_ctx[1];
       } else {
         v = GetLargeValue(br, p);
         p = p_ctx[2];
       }
-      out[kZigzag[n]] = VP8GetSigned(br, v) * dq[n > 0];
+      out[kZigzag[n]] = VP8GetSigned(br, v, "coeffs") * dq[n > 0];
     }
   }
   return 16;
@@ -468,36 +471,34 @@ static int GetCoeffsAlt(VP8BitReader* const br,
                         int ctx, const quant_t dq, int n, int16_t* out) {
   const uint8_t* p = prob[n]->probas_[ctx];
   for (; n < 16; ++n) {
-    if (!VP8GetBitAlt(br, p[0])) {
+    if (!VP8GetBitAlt(br, p[0], "coeffs")) {
       return n;  // previous coeff was last non-zero coeff
     }
-    while (!VP8GetBitAlt(br, p[1])) {       // sequence of zero coeffs
+    while (!VP8GetBitAlt(br, p[1], "coeffs")) {       // sequence of zero coeffs
       p = prob[++n]->probas_[0];
       if (n == 16) return 16;
     }
     {        // non zero coeff
       const VP8ProbaArray* const p_ctx = &prob[n + 1]->probas_[0];
       int v;
-      if (!VP8GetBitAlt(br, p[2])) {
+      if (!VP8GetBitAlt(br, p[2], "coeffs")) {
         v = 1;
         p = p_ctx[1];
       } else {
         v = GetLargeValue(br, p);
         p = p_ctx[2];
       }
-      out[kZigzag[n]] = VP8GetSigned(br, v) * dq[n > 0];
+      out[kZigzag[n]] = VP8GetSigned(br, v, "coeffs") * dq[n > 0];
     }
   }
   return 16;
 }
 
-static WEBP_TSAN_IGNORE_FUNCTION void InitGetCoeffs(void) {
-  if (GetCoeffs == NULL) {
-    if (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kSlowSSSE3)) {
-      GetCoeffs = GetCoeffsAlt;
-    } else {
-      GetCoeffs = GetCoeffsFast;
-    }
+WEBP_DSP_INIT_FUNC(InitGetCoeffs) {
+  if (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kSlowSSSE3)) {
+    GetCoeffs = GetCoeffsAlt;
+  } else {
+    GetCoeffs = GetCoeffsFast;
   }
 }
 
diff --git a/media/libwebp/dec/vp8i_dec.h b/media/libwebp/dec/vp8i_dec.h
index fabee44a0b..31d9080ca1 100644
--- a/media/libwebp/dec/vp8i_dec.h
+++ b/media/libwebp/dec/vp8i_dec.h
@@ -31,7 +31,7 @@ extern "C" {
 
 // version numbers
 #define DEC_MAJ_VERSION 1
-#define DEC_MIN_VERSION 0
+#define DEC_MIN_VERSION 2
 #define DEC_REV_VERSION 2
 
 // YUV-cache parameters. Cache is 32-bytes wide (= one cacheline).
diff --git a/media/libwebp/dec/vp8l_dec.c b/media/libwebp/dec/vp8l_dec.c
index 0502cb9a52..32371a67fe 100644
--- a/media/libwebp/dec/vp8l_dec.c
+++ b/media/libwebp/dec/vp8l_dec.c
@@ -84,7 +84,7 @@ static const uint8_t kCodeToPlane[CODE_TO_PLANE_CODES] = {
 // to 256 (green component values) + 24 (length prefix values)
 // + color_cache_size (between 0 and 2048).
 // All values computed for 8-bit first level lookup with Mark Adler's tool:
-// http://www.hdfgroup.org/ftp/lib-external/zlib/zlib-1.2.5/examples/enough.c
+// https://github.com/madler/zlib/blob/v1.2.5/examples/enough.c
 #define FIXED_TABLE_SIZE (630 * 3 + 410)
 static const uint16_t kTableSize[12] = {
   FIXED_TABLE_SIZE + 654,
@@ -362,12 +362,8 @@ static int ReadHuffmanCodes(VP8LDecoder* const dec, int xsize, int ysize,
   VP8LMetadata* const hdr = &dec->hdr_;
   uint32_t* huffman_image = NULL;
   HTreeGroup* htree_groups = NULL;
-  // When reading htrees, some might be unused, as the format allows it.
-  // We will still read them but put them in this htree_group_bogus.
-  HTreeGroup htree_group_bogus;
   HuffmanCode* huffman_tables = NULL;
-  HuffmanCode* huffman_tables_bogus = NULL;
-  HuffmanCode* next = NULL;
+  HuffmanCode* huffman_table = NULL;
   int num_htree_groups = 1;
   int num_htree_groups_max = 1;
   int max_alphabet_size = 0;
@@ -418,12 +414,6 @@ static int ReadHuffmanCodes(VP8LDecoder* const dec, int xsize, int ysize,
         if (*mapped_group == -1) *mapped_group = num_htree_groups++;
         huffman_image[i] = *mapped_group;
       }
-      huffman_tables_bogus = (HuffmanCode*)WebPSafeMalloc(
-          table_size, sizeof(*huffman_tables_bogus));
-      if (huffman_tables_bogus == NULL) {
-        dec->status_ = VP8_STATUS_OUT_OF_MEMORY;
-        goto Error;
-      }
     } else {
       num_htree_groups = num_htree_groups_max;
     }
@@ -453,63 +443,71 @@ static int ReadHuffmanCodes(VP8LDecoder* const dec, int xsize, int ysize,
     goto Error;
   }
 
-  next = huffman_tables;
+  huffman_table = huffman_tables;
   for (i = 0; i < num_htree_groups_max; ++i) {
-    // If the index "i" is unused in the Huffman image, read the coefficients
-    // but store them to a bogus htree_group.
-    const int is_bogus = (mapping != NULL && mapping[i] == -1);
-    HTreeGroup* const htree_group =
-        is_bogus ? &htree_group_bogus :
-        &htree_groups[(mapping == NULL) ? i : mapping[i]];
-    HuffmanCode** const htrees = htree_group->htrees;
-    HuffmanCode* huffman_tables_i = is_bogus ? huffman_tables_bogus : next;
-    int size;
-    int total_size = 0;
-    int is_trivial_literal = 1;
-    int max_bits = 0;
-    for (j = 0; j < HUFFMAN_CODES_PER_META_CODE; ++j) {
-      int alphabet_size = kAlphabetSize[j];
-      htrees[j] = huffman_tables_i;
-      if (j == 0 && color_cache_bits > 0) {
-        alphabet_size += 1 << color_cache_bits;
-      }
-      size =
-          ReadHuffmanCode(alphabet_size, dec, code_lengths, huffman_tables_i);
-      if (size == 0) {
-        goto Error;
-      }
-      if (is_trivial_literal && kLiteralMap[j] == 1) {
-        is_trivial_literal = (huffman_tables_i->bits == 0);
+    // If the index "i" is unused in the Huffman image, just make sure the
+    // coefficients are valid but do not store them.
+    if (mapping != NULL && mapping[i] == -1) {
+      for (j = 0; j < HUFFMAN_CODES_PER_META_CODE; ++j) {
+        int alphabet_size = kAlphabetSize[j];
+        if (j == 0 && color_cache_bits > 0) {
+          alphabet_size += (1 << color_cache_bits);
+        }
+        // Passing in NULL so that nothing gets filled.
+        if (!ReadHuffmanCode(alphabet_size, dec, code_lengths, NULL)) {
+          goto Error;
+        }
       }
-      total_size += huffman_tables_i->bits;
-      huffman_tables_i += size;
-      if (j <= ALPHA) {
-        int local_max_bits = code_lengths[0];
-        int k;
-        for (k = 1; k < alphabet_size; ++k) {
-          if (code_lengths[k] > local_max_bits) {
-            local_max_bits = code_lengths[k];
+    } else {
+      HTreeGroup* const htree_group =
+          &htree_groups[(mapping == NULL) ? i : mapping[i]];
+      HuffmanCode** const htrees = htree_group->htrees;
+      int size;
+      int total_size = 0;
+      int is_trivial_literal = 1;
+      int max_bits = 0;
+      for (j = 0; j < HUFFMAN_CODES_PER_META_CODE; ++j) {
+        int alphabet_size = kAlphabetSize[j];
+        htrees[j] = huffman_table;
+        if (j == 0 && color_cache_bits > 0) {
+          alphabet_size += (1 << color_cache_bits);
+        }
+        size = ReadHuffmanCode(alphabet_size, dec, code_lengths, huffman_table);
+        if (size == 0) {
+          goto Error;
+        }
+        if (is_trivial_literal && kLiteralMap[j] == 1) {
+          is_trivial_literal = (huffman_table->bits == 0);
+        }
+        total_size += huffman_table->bits;
+        huffman_table += size;
+        if (j <= ALPHA) {
+          int local_max_bits = code_lengths[0];
+          int k;
+          for (k = 1; k < alphabet_size; ++k) {
+            if (code_lengths[k] > local_max_bits) {
+              local_max_bits = code_lengths[k];
+            }
           }
+          max_bits += local_max_bits;
         }
-        max_bits += local_max_bits;
       }
-    }
-    if (!is_bogus) next = huffman_tables_i;
-    htree_group->is_trivial_literal = is_trivial_literal;
-    htree_group->is_trivial_code = 0;
-    if (is_trivial_literal) {
-      const int red = htrees[RED][0].value;
-      const int blue = htrees[BLUE][0].value;
-      const int alpha = htrees[ALPHA][0].value;
-      htree_group->literal_arb = ((uint32_t)alpha << 24) | (red << 16) | blue;
-      if (total_size == 0 && htrees[GREEN][0].value < NUM_LITERAL_CODES) {
-        htree_group->is_trivial_code = 1;
-        htree_group->literal_arb |= htrees[GREEN][0].value << 8;
+      htree_group->is_trivial_literal = is_trivial_literal;
+      htree_group->is_trivial_code = 0;
+      if (is_trivial_literal) {
+        const int red = htrees[RED][0].value;
+        const int blue = htrees[BLUE][0].value;
+        const int alpha = htrees[ALPHA][0].value;
+        htree_group->literal_arb = ((uint32_t)alpha << 24) | (red << 16) | blue;
+        if (total_size == 0 && htrees[GREEN][0].value < NUM_LITERAL_CODES) {
+          htree_group->is_trivial_code = 1;
+          htree_group->literal_arb |= htrees[GREEN][0].value << 8;
+        }
       }
+      htree_group->use_packed_table =
+          !htree_group->is_trivial_code && (max_bits < HUFFMAN_PACKED_BITS);
+      if (htree_group->use_packed_table) BuildPackedTable(htree_group);
     }
-    htree_group->use_packed_table =
-        !htree_group->is_trivial_code && (max_bits < HUFFMAN_PACKED_BITS);
-    if (htree_group->use_packed_table) BuildPackedTable(htree_group);
   }
   ok = 1;
 
@@ -521,7 +519,6 @@ static int ReadHuffmanCodes(VP8LDecoder* const dec, int xsize, int ysize,
 
  Error:
   WebPSafeFree(code_lengths);
-  WebPSafeFree(huffman_tables_bogus);
   WebPSafeFree(mapping);
   if (!ok) {
     WebPSafeFree(huffman_image);
@@ -562,8 +559,11 @@ static int AllocateAndInitRescaler(VP8LDecoder* const dec, VP8Io* const io) {
   memory += work_size * sizeof(*work);
   scaled_data = (uint32_t*)memory;
 
-  WebPRescalerInit(dec->rescaler, in_width, in_height, (uint8_t*)scaled_data,
-                   out_width, out_height, 0, num_channels, work);
+  if (!WebPRescalerInit(dec->rescaler, in_width, in_height,
+                        (uint8_t*)scaled_data, out_width, out_height,
+                        0, num_channels, work)) {
+    return 0;
+  }
   return 1;
 }
 #endif   // WEBP_REDUCE_SIZE
@@ -577,13 +577,14 @@ static int AllocateAndInitRescaler(VP8LDecoder* const dec, VP8Io* const io) {
 static int Export(WebPRescaler* const rescaler, WEBP_CSP_MODE colorspace,
                   int rgba_stride, uint8_t* const rgba) {
   uint32_t* const src = (uint32_t*)rescaler->dst;
+  uint8_t* dst = rgba;
   const int dst_width = rescaler->dst_width;
   int num_lines_out = 0;
   while (WebPRescalerHasPendingOutput(rescaler)) {
-    uint8_t* const dst = rgba + num_lines_out * rgba_stride;
     WebPRescalerExportRow(rescaler);
     WebPMultARGBRow(src, dst_width, 1);
     VP8LConvertFromBGRA(src, dst_width, colorspace, dst);
+    dst += rgba_stride;
     ++num_lines_out;
   }
   return num_lines_out;
@@ -597,8 +598,8 @@ static int EmitRescaledRowsRGBA(const VP8LDecoder* const dec,
   int num_lines_in = 0;
   int num_lines_out = 0;
   while (num_lines_in < mb_h) {
-    uint8_t* const row_in = in + num_lines_in * in_stride;
-    uint8_t* const row_out = out + num_lines_out * out_stride;
+    uint8_t* const row_in = in + (uint64_t)num_lines_in * in_stride;
+    uint8_t* const row_out = out + (uint64_t)num_lines_out * out_stride;
     const int lines_left = mb_h - num_lines_in;
     const int needed_lines = WebPRescaleNeededLines(dec->rescaler, lines_left);
     int lines_imported;
@@ -757,11 +758,11 @@ static WEBP_INLINE HTreeGroup* GetHtreeGroupForPos(VP8LMetadata* const hdr,
 
 typedef void (*ProcessRowsFunc)(VP8LDecoder* const dec, int row);
 
-static void ApplyInverseTransforms(VP8LDecoder* const dec, int num_rows,
+static void ApplyInverseTransforms(VP8LDecoder* const dec,
+                                   int start_row, int num_rows,
                                    const uint32_t* const rows) {
   int n = dec->next_transform_;
   const int cache_pixs = dec->width_ * num_rows;
-  const int start_row = dec->last_row_;
   const int end_row = start_row + num_rows;
   const uint32_t* rows_in = rows;
   uint32_t* const rows_out = dec->argb_cache_;
@@ -792,15 +793,15 @@ static void ProcessRows(VP8LDecoder* const dec, int row) {
     VP8Io* const io = dec->io_;
     uint8_t* rows_data = (uint8_t*)dec->argb_cache_;
     const int in_stride = io->width * sizeof(uint32_t);  // in unit of RGBA
-
-    ApplyInverseTransforms(dec, num_rows, rows);
+    ApplyInverseTransforms(dec, dec->last_row_, num_rows, rows);
     if (!SetCropWindow(io, dec->last_row_, row, &rows_data, in_stride)) {
       // Nothing to output (this time).
     } else {
       const WebPDecBuffer* const output = dec->output_;
       if (WebPIsRGBMode(output->colorspace)) {  // convert to RGBA
         const WebPRGBABuffer* const buf = &output->u.RGBA;
-        uint8_t* const rgba = buf->rgba + dec->last_out_row_ * buf->stride;
+        uint8_t* const rgba =
+            buf->rgba + (int64_t)dec->last_out_row_ * buf->stride;
         const int num_rows_out =
 #if !defined(WEBP_REDUCE_SIZE)
          io->use_scaling ?
@@ -951,7 +952,6 @@ static WEBP_INLINE void CopyBlock8b(uint8_t* const dst, int dist, int length) {
         break;
       default:
         goto Copy;
-        break;
     }
     CopySmallPattern8b(src, dst, length, pattern);
     return;
@@ -1196,6 +1196,7 @@ static int DecodeImageData(VP8LDecoder* const dec, uint32_t* const data,
       VP8LFillBitWindow(br);
       dist_code = GetCopyDistance(dist_symbol, br);
       dist = PlaneCodeToDistance(width, dist_code);
+
       if (VP8LIsEndOfStream(br)) break;
       if (src - data < (ptrdiff_t)dist || src_end - src < (ptrdiff_t)length) {
         goto Error;
@@ -1518,7 +1519,7 @@ static int AllocateInternalBuffers32b(VP8LDecoder* const dec, int final_width) {
   assert(dec->width_ <= final_width);
   dec->pixels_ = (uint32_t*)WebPSafeMalloc(total_num_pixels, sizeof(uint32_t));
   if (dec->pixels_ == NULL) {
-    dec->argb_cache_ = NULL;    // for sanity check
+    dec->argb_cache_ = NULL;    // for soundness
     dec->status_ = VP8_STATUS_OUT_OF_MEMORY;
     return 0;
   }
@@ -1528,7 +1529,7 @@ static int AllocateInternalBuffers32b(VP8LDecoder* const dec, int final_width) {
 
 static int AllocateInternalBuffers8b(VP8LDecoder* const dec) {
   const uint64_t total_num_pixels = (uint64_t)dec->width_ * dec->height_;
-  dec->argb_cache_ = NULL;    // for sanity check
+  dec->argb_cache_ = NULL;    // for soundness
   dec->pixels_ = (uint32_t*)WebPSafeMalloc(total_num_pixels, sizeof(uint8_t));
   if (dec->pixels_ == NULL) {
     dec->status_ = VP8_STATUS_OUT_OF_MEMORY;
@@ -1556,7 +1557,7 @@ static void ExtractAlphaRows(VP8LDecoder* const dec, int last_row) {
     const int cache_pixs = width * num_rows_to_process;
     uint8_t* const dst = output + width * cur_row;
     const uint32_t* const src = dec->argb_cache_;
-    ApplyInverseTransforms(dec, num_rows_to_process, in);
+    ApplyInverseTransforms(dec, cur_row, num_rows_to_process, in);
     WebPExtractGreen(src, dst, cache_pixs);
     AlphaApplyFilter(alph_dec,
                      cur_row, cur_row + num_rows_to_process, dst, width);
@@ -1670,7 +1671,6 @@ int VP8LDecodeImage(VP8LDecoder* const dec) {
   VP8Io* io = NULL;
   WebPDecParams* params = NULL;
 
-  // Sanity checks.
   if (dec == NULL) return 0;
 
   assert(dec->hdr_.huffman_tables_ != NULL);
diff --git a/media/libwebp/dec/vp8li_dec.h b/media/libwebp/dec/vp8li_dec.h
index 2b9c95a44b..8df713beb8 100644
--- a/media/libwebp/dec/vp8li_dec.h
+++ b/media/libwebp/dec/vp8li_dec.h
@@ -37,7 +37,7 @@ struct VP8LTransform {
   int                    bits_;   // subsampling bits defining transform window.
   int                    xsize_;  // transform window X index.
   int                    ysize_;  // transform window Y index.
-  uint32_t              *data_;   // transform data.
+  uint32_t*              data_;   // transform data.
 };
 
 typedef struct {
@@ -48,23 +48,23 @@ typedef struct {
   int             huffman_mask_;
   int             huffman_subsample_bits_;
   int             huffman_xsize_;
-  uint32_t       *huffman_image_;
+  uint32_t*       huffman_image_;
   int             num_htree_groups_;
-  HTreeGroup     *htree_groups_;
-  HuffmanCode    *huffman_tables_;
+  HTreeGroup*     htree_groups_;
+  HuffmanCode*    huffman_tables_;
 } VP8LMetadata;
 
 typedef struct VP8LDecoder VP8LDecoder;
 struct VP8LDecoder {
   VP8StatusCode    status_;
   VP8LDecodeState  state_;
-  VP8Io           *io_;
+  VP8Io*           io_;
 
-  const WebPDecBuffer *output_;    // shortcut to io->opaque->output
+  const WebPDecBuffer* output_;    // shortcut to io->opaque->output
 
-  uint32_t        *pixels_;        // Internal data: either uint8_t* for alpha
+  uint32_t*        pixels_;        // Internal data: either uint8_t* for alpha
                                    // or uint32_t* for BGRA.
-  uint32_t        *argb_cache_;    // Scratch buffer for temporary BGRA storage.
+  uint32_t*        argb_cache_;    // Scratch buffer for temporary BGRA storage.
 
   VP8LBitReader    br_;
   int              incremental_;   // if true, incremental decoding is expected
@@ -86,8 +86,8 @@ struct VP8LDecoder {
   // or'd bitset storing the transforms types.
   uint32_t         transforms_seen_;
 
-  uint8_t         *rescaler_memory;  // Working memory for rescaling work.
-  WebPRescaler    *rescaler;         // Common rescaler for all channels.
+  uint8_t*         rescaler_memory;  // Working memory for rescaling work.
+  WebPRescaler*    rescaler;         // Common rescaler for all channels.
 };
 
 //------------------------------------------------------------------------------
diff --git a/media/libwebp/dec/webp_dec.c b/media/libwebp/dec/webp_dec.c
index 89c264d0a0..6857960774 100644
--- a/media/libwebp/dec/webp_dec.c
+++ b/media/libwebp/dec/webp_dec.c
@@ -785,6 +785,13 @@ VP8StatusCode WebPDecode(const uint8_t* data, size_t data_size,
 //------------------------------------------------------------------------------
 // Cropping and rescaling.
 
+int WebPCheckCropDimensions(int image_width, int image_height,
+                            int x, int y, int w, int h) {
+  return !(x < 0 || y < 0 || w <= 0 || h <= 0 ||
+           x >= image_width || w > image_width || w > image_width - x ||
+           y >= image_height || h > image_height || h > image_height - y);
+}
+
 int WebPIoInitFromOptions(const WebPDecoderOptions* const options,
                           VP8Io* const io, WEBP_CSP_MODE src_colorspace) {
   const int W = io->width;
@@ -792,7 +799,7 @@ int WebPIoInitFromOptions(const WebPDecoderOptions* const options,
   int x = 0, y = 0, w = W, h = H;
 
   // Cropping
-  io->use_cropping = (options != NULL) && (options->use_cropping > 0);
+  io->use_cropping = (options != NULL) && options->use_cropping;
   if (io->use_cropping) {
     w = options->crop_width;
     h = options->crop_height;
@@ -802,7 +809,7 @@ int WebPIoInitFromOptions(const WebPDecoderOptions* const options,
       x &= ~1;
       y &= ~1;
     }
-    if (x < 0 || y < 0 || w <= 0 || h <= 0 || x + w > W || y + h > H) {
+    if (!WebPCheckCropDimensions(W, H, x, y, w, h)) {
       return 0;  // out of frame boundary error
     }
   }
@@ -814,7 +821,7 @@ int WebPIoInitFromOptions(const WebPDecoderOptions* const options,
   io->mb_h = h;
 
   // Scaling
-  io->use_scaling = (options != NULL) && (options->use_scaling > 0);
+  io->use_scaling = (options != NULL) && options->use_scaling;
   if (io->use_scaling) {
     int scaled_width = options->scaled_width;
     int scaled_height = options->scaled_height;
@@ -835,8 +842,8 @@ int WebPIoInitFromOptions(const WebPDecoderOptions* const options,
 
   if (io->use_scaling) {
     // disable filter (only for large downscaling ratio).
-    io->bypass_filtering = (io->scaled_width < W * 3 / 4) &&
-                           (io->scaled_height < H * 3 / 4);
+    io->bypass_filtering |= (io->scaled_width < W * 3 / 4) &&
+                            (io->scaled_height < H * 3 / 4);
     io->fancy_upsampling = 0;
   }
   return 1;
diff --git a/media/libwebp/dec/webpi_dec.h b/media/libwebp/dec/webpi_dec.h
index 83d7444e51..a1b7c83fcd 100644
--- a/media/libwebp/dec/webpi_dec.h
+++ b/media/libwebp/dec/webpi_dec.h
@@ -77,6 +77,10 @@ VP8StatusCode WebPParseHeaders(WebPHeaderStructure* const headers);
 //------------------------------------------------------------------------------
 // Misc utils
 
+// Returns true if crop dimensions are within image bounds.
+int WebPCheckCropDimensions(int image_width, int image_height,
+                            int x, int y, int w, int h);
+
 // Initializes VP8Io with custom setup, io and teardown functions. The default
 // hooks will use the supplied 'params' as io->opaque handle.
 void WebPInitCustomIo(WebPDecParams* const params, VP8Io* const io);
diff --git a/media/libwebp/demux/demux.c b/media/libwebp/demux/demux.c
index 2034024d06..13953b1c54 100644
--- a/media/libwebp/demux/demux.c
+++ b/media/libwebp/demux/demux.c
@@ -24,7 +24,7 @@
 #include "../webp/format_constants.h"
 
 #define DMUX_MAJ_VERSION 1
-#define DMUX_MIN_VERSION 0
+#define DMUX_MIN_VERSION 2
 #define DMUX_REV_VERSION 2
 
 typedef struct {
@@ -221,12 +221,16 @@ static ParseStatus StoreFrame(int frame_num, uint32_t min_size,
     const size_t chunk_start_offset = mem->start_;
     const uint32_t fourcc = ReadLE32(mem);
     const uint32_t payload_size = ReadLE32(mem);
-    const uint32_t payload_size_padded = payload_size + (payload_size & 1);
-    const size_t payload_available = (payload_size_padded > MemDataSize(mem))
-                                   ? MemDataSize(mem) : payload_size_padded;
-    const size_t chunk_size = CHUNK_HEADER_SIZE + payload_available;
+    uint32_t payload_size_padded;
+    size_t payload_available;
+    size_t chunk_size;
 
     if (payload_size > MAX_CHUNK_PAYLOAD) return PARSE_ERROR;
+
+    payload_size_padded = payload_size + (payload_size & 1);
+    payload_available = (payload_size_padded > MemDataSize(mem))
+                      ? MemDataSize(mem) : payload_size_padded;
+    chunk_size = CHUNK_HEADER_SIZE + payload_available;
     if (SizeIsInvalid(mem, payload_size_padded)) return PARSE_ERROR;
     if (payload_size_padded > MemDataSize(mem)) status = PARSE_NEED_MORE_DATA;
 
@@ -312,6 +316,7 @@ static ParseStatus ParseAnimationFrame(
   int bits;
   MemBuffer* const mem = &dmux->mem_;
   Frame* frame;
+  size_t start_offset;
   ParseStatus status =
       NewFrame(mem, ANMF_CHUNK_SIZE, frame_chunk_size, &frame);
   if (status != PARSE_OK) return status;
@@ -332,7 +337,11 @@ static ParseStatus ParseAnimationFrame(
 
   // Store a frame only if the animation flag is set there is some data for
   // this frame is available.
+  start_offset = mem->start_;
   status = StoreFrame(dmux->num_frames_ + 1, anmf_payload_size, mem, frame);
+  if (status != PARSE_ERROR && mem->start_ - start_offset > anmf_payload_size) {
+    status = PARSE_ERROR;
+  }
   if (status != PARSE_ERROR && is_animation && frame->frame_num_ > 0) {
     added_frame = AddFrame(dmux, frame);
     if (added_frame) {
@@ -446,9 +455,11 @@ static ParseStatus ParseVP8XChunks(WebPDemuxer* const dmux) {
     const size_t chunk_start_offset = mem->start_;
     const uint32_t fourcc = ReadLE32(mem);
     const uint32_t chunk_size = ReadLE32(mem);
-    const uint32_t chunk_size_padded = chunk_size + (chunk_size & 1);
+    uint32_t chunk_size_padded;
 
     if (chunk_size > MAX_CHUNK_PAYLOAD) return PARSE_ERROR;
+
+    chunk_size_padded = chunk_size + (chunk_size & 1);
     if (SizeIsInvalid(mem, chunk_size_padded)) return PARSE_ERROR;
 
     switch (fourcc) {
diff --git a/media/libwebp/dsp/alpha_processing.c b/media/libwebp/dsp/alpha_processing.c
index 6ff1352ae2..8c5e90210f 100644
--- a/media/libwebp/dsp/alpha_processing.c
+++ b/media/libwebp/dsp/alpha_processing.c
@@ -157,7 +157,8 @@ void WebPMultARGBRow_C(uint32_t* const ptr, int width, int inverse) {
   }
 }
 
-void WebPMultRow_C(uint8_t* const ptr, const uint8_t* const alpha,
+void WebPMultRow_C(uint8_t* WEBP_RESTRICT const ptr,
+                   const uint8_t* WEBP_RESTRICT const alpha,
                    int width, int inverse) {
   int x;
   for (x = 0; x < width; ++x) {
@@ -178,7 +179,8 @@ void WebPMultRow_C(uint8_t* const ptr, const uint8_t* const alpha,
 #undef MFIX
 
 void (*WebPMultARGBRow)(uint32_t* const ptr, int width, int inverse);
-void (*WebPMultRow)(uint8_t* const ptr, const uint8_t* const alpha,
+void (*WebPMultRow)(uint8_t* WEBP_RESTRICT const ptr,
+                    const uint8_t* WEBP_RESTRICT const alpha,
                     int width, int inverse);
 
 //------------------------------------------------------------------------------
@@ -193,8 +195,8 @@ void WebPMultARGBRows(uint8_t* ptr, int stride, int width, int num_rows,
   }
 }
 
-void WebPMultRows(uint8_t* ptr, int stride,
-                  const uint8_t* alpha, int alpha_stride,
+void WebPMultRows(uint8_t* WEBP_RESTRICT ptr, int stride,
+                  const uint8_t* WEBP_RESTRICT alpha, int alpha_stride,
                   int width, int num_rows, int inverse) {
   int n;
   for (n = 0; n < num_rows; ++n) {
@@ -290,9 +292,9 @@ static void ApplyAlphaMultiply_16b_C(uint8_t* rgba4444,
 }
 
 #if !WEBP_NEON_OMIT_C_CODE
-static int DispatchAlpha_C(const uint8_t* alpha, int alpha_stride,
+static int DispatchAlpha_C(const uint8_t* WEBP_RESTRICT alpha, int alpha_stride,
                            int width, int height,
-                           uint8_t* dst, int dst_stride) {
+                           uint8_t* WEBP_RESTRICT dst, int dst_stride) {
   uint32_t alpha_mask = 0xff;
   int i, j;
 
@@ -309,9 +311,10 @@ static int DispatchAlpha_C(const uint8_t* alpha, int alpha_stride,
   return (alpha_mask != 0xff);
 }
 
-static void DispatchAlphaToGreen_C(const uint8_t* alpha, int alpha_stride,
-                                   int width, int height,
-                                   uint32_t* dst, int dst_stride) {
+static void DispatchAlphaToGreen_C(const uint8_t* WEBP_RESTRICT alpha,
+                                   int alpha_stride, int width, int height,
+                                   uint32_t* WEBP_RESTRICT dst,
+                                   int dst_stride) {
   int i, j;
   for (j = 0; j < height; ++j) {
     for (i = 0; i < width; ++i) {
@@ -322,9 +325,9 @@ static void DispatchAlphaToGreen_C(const uint8_t* alpha, int alpha_stride,
   }
 }
 
-static int ExtractAlpha_C(const uint8_t* argb, int argb_stride,
+static int ExtractAlpha_C(const uint8_t* WEBP_RESTRICT argb, int argb_stride,
                           int width, int height,
-                          uint8_t* alpha, int alpha_stride) {
+                          uint8_t* WEBP_RESTRICT alpha, int alpha_stride) {
   uint8_t alpha_mask = 0xff;
   int i, j;
 
@@ -340,7 +343,8 @@ static int ExtractAlpha_C(const uint8_t* argb, int argb_stride,
   return (alpha_mask == 0xff);
 }
 
-static void ExtractGreen_C(const uint32_t* argb, uint8_t* alpha, int size) {
+static void ExtractGreen_C(const uint32_t* WEBP_RESTRICT argb,
+                           uint8_t* WEBP_RESTRICT alpha, int size) {
   int i;
   for (i = 0; i < size; ++i) alpha[i] = argb[i] >> 8;
 }
@@ -359,6 +363,11 @@ static int HasAlpha32b_C(const uint8_t* src, int length) {
   return 0;
 }
 
+static void AlphaReplace_C(uint32_t* src, int length, uint32_t color) {
+  int x;
+  for (x = 0; x < length; ++x) if ((src[x] >> 24) == 0) src[x] = color;
+}
+
 //------------------------------------------------------------------------------
 // Simple channel manipulations.
 
@@ -367,8 +376,11 @@ static WEBP_INLINE uint32_t MakeARGB32(int a, int r, int g, int b) {
 }
 
 #ifdef WORDS_BIGENDIAN
-static void PackARGB_C(const uint8_t* a, const uint8_t* r, const uint8_t* g,
-                       const uint8_t* b, int len, uint32_t* out) {
+static void PackARGB_C(const uint8_t* WEBP_RESTRICT a,
+                       const uint8_t* WEBP_RESTRICT r,
+                       const uint8_t* WEBP_RESTRICT g,
+                       const uint8_t* WEBP_RESTRICT b,
+                       int len, uint32_t* WEBP_RESTRICT out) {
   int i;
   for (i = 0; i < len; ++i) {
     out[i] = MakeARGB32(a[4 * i], r[4 * i], g[4 * i], b[4 * i]);
@@ -376,8 +388,10 @@ static void PackARGB_C(const uint8_t* a, const uint8_t* r, const uint8_t* g,
 }
 #endif
 
-static void PackRGB_C(const uint8_t* r, const uint8_t* g, const uint8_t* b,
-                      int len, int step, uint32_t* out) {
+static void PackRGB_C(const uint8_t* WEBP_RESTRICT r,
+                      const uint8_t* WEBP_RESTRICT g,
+                      const uint8_t* WEBP_RESTRICT b,
+                      int len, int step, uint32_t* WEBP_RESTRICT out) {
   int i, offset = 0;
   for (i = 0; i < len; ++i) {
     out[i] = MakeARGB32(0xff, r[offset], g[offset], b[offset]);
@@ -387,19 +401,26 @@ static void PackRGB_C(const uint8_t* r, const uint8_t* g, const uint8_t* b,
 
 void (*WebPApplyAlphaMultiply)(uint8_t*, int, int, int, int);
 void (*WebPApplyAlphaMultiply4444)(uint8_t*, int, int, int);
-int (*WebPDispatchAlpha)(const uint8_t*, int, int, int, uint8_t*, int);
-void (*WebPDispatchAlphaToGreen)(const uint8_t*, int, int, int, uint32_t*, int);
-int (*WebPExtractAlpha)(const uint8_t*, int, int, int, uint8_t*, int);
-void (*WebPExtractGreen)(const uint32_t* argb, uint8_t* alpha, int size);
+int (*WebPDispatchAlpha)(const uint8_t* WEBP_RESTRICT, int, int, int,
+                         uint8_t* WEBP_RESTRICT, int);
+void (*WebPDispatchAlphaToGreen)(const uint8_t* WEBP_RESTRICT, int, int, int,
+                                 uint32_t* WEBP_RESTRICT, int);
+int (*WebPExtractAlpha)(const uint8_t* WEBP_RESTRICT, int, int, int,
+                        uint8_t* WEBP_RESTRICT, int);
+void (*WebPExtractGreen)(const uint32_t* WEBP_RESTRICT argb,
+                         uint8_t* WEBP_RESTRICT alpha, int size);
 #ifdef WORDS_BIGENDIAN
 void (*WebPPackARGB)(const uint8_t* a, const uint8_t* r, const uint8_t* g,
                      const uint8_t* b, int, uint32_t*);
 #endif
-void (*WebPPackRGB)(const uint8_t* r, const uint8_t* g, const uint8_t* b,
-                    int len, int step, uint32_t* out);
+void (*WebPPackRGB)(const uint8_t* WEBP_RESTRICT r,
+                    const uint8_t* WEBP_RESTRICT g,
+                    const uint8_t* WEBP_RESTRICT b,
+                    int len, int step, uint32_t* WEBP_RESTRICT out);
 
 int (*WebPHasAlpha8b)(const uint8_t* src, int length);
 int (*WebPHasAlpha32b)(const uint8_t* src, int length);
+void (*WebPAlphaReplace)(uint32_t* src, int length, uint32_t color);
 
 //------------------------------------------------------------------------------
 // Init function
@@ -428,13 +449,14 @@ WEBP_DSP_INIT_FUNC(WebPInitAlphaProcessing) {
 
   WebPHasAlpha8b = HasAlpha8b_C;
   WebPHasAlpha32b = HasAlpha32b_C;
+  WebPAlphaReplace = AlphaReplace_C;
 
   // If defined, use CPUInfo() to overwrite some pointers with faster versions.
   if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
+#if defined(WEBP_HAVE_SSE2)
     if (VP8GetCPUInfo(kSSE2)) {
       WebPInitAlphaProcessingSSE2();
-#if defined(WEBP_USE_SSE41)
+#if defined(WEBP_HAVE_SSE41)
       if (VP8GetCPUInfo(kSSE4_1)) {
         WebPInitAlphaProcessingSSE41();
       }
@@ -448,7 +470,7 @@ WEBP_DSP_INIT_FUNC(WebPInitAlphaProcessing) {
 #endif
   }
 
-#if defined(WEBP_USE_NEON)
+#if defined(WEBP_HAVE_NEON)
   if (WEBP_NEON_OMIT_C_CODE ||
       (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
     WebPInitAlphaProcessingNEON();
@@ -469,4 +491,5 @@ WEBP_DSP_INIT_FUNC(WebPInitAlphaProcessing) {
   assert(WebPPackRGB != NULL);
   assert(WebPHasAlpha8b != NULL);
   assert(WebPHasAlpha32b != NULL);
+  assert(WebPAlphaReplace != NULL);
 }
diff --git a/media/libwebp/dsp/alpha_processing_mips_dsp_r2.c b/media/libwebp/dsp/alpha_processing_mips_dsp_r2.c
new file mode 100644
index 0000000000..ab597e68bb
--- /dev/null
+++ b/media/libwebp/dsp/alpha_processing_mips_dsp_r2.c
@@ -0,0 +1,228 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Utilities for processing transparent channel.
+//
+// Author(s): Branimir Vasic (branimir.vasic@imgtec.com)
+//            Djordje Pesut  (djordje.pesut@imgtec.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_MIPS_DSP_R2)
+
+static int DispatchAlpha_MIPSdspR2(const uint8_t* alpha, int alpha_stride,
+                                   int width, int height,
+                                   uint8_t* dst, int dst_stride) {
+  uint32_t alpha_mask = 0xffffffff;
+  int i, j, temp0;
+
+  for (j = 0; j < height; ++j) {
+    uint8_t* pdst = dst;
+    const uint8_t* palpha = alpha;
+    for (i = 0; i < (width >> 2); ++i) {
+      int temp1, temp2, temp3;
+
+      __asm__ volatile (
+        "ulw    %[temp0],      0(%[palpha])                \n\t"
+        "addiu  %[palpha],     %[palpha],     4            \n\t"
+        "addiu  %[pdst],       %[pdst],       16           \n\t"
+        "srl    %[temp1],      %[temp0],      8            \n\t"
+        "srl    %[temp2],      %[temp0],      16           \n\t"
+        "srl    %[temp3],      %[temp0],      24           \n\t"
+        "and    %[alpha_mask], %[alpha_mask], %[temp0]     \n\t"
+        "sb     %[temp0],      -16(%[pdst])                \n\t"
+        "sb     %[temp1],      -12(%[pdst])                \n\t"
+        "sb     %[temp2],      -8(%[pdst])                 \n\t"
+        "sb     %[temp3],      -4(%[pdst])                 \n\t"
+        : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+          [temp3]"=&r"(temp3), [palpha]"+r"(palpha), [pdst]"+r"(pdst),
+          [alpha_mask]"+r"(alpha_mask)
+        :
+        : "memory"
+      );
+    }
+
+    for (i = 0; i < (width & 3); ++i) {
+      __asm__ volatile (
+        "lbu    %[temp0],      0(%[palpha])                \n\t"
+        "addiu  %[palpha],     %[palpha],     1            \n\t"
+        "sb     %[temp0],      0(%[pdst])                  \n\t"
+        "and    %[alpha_mask], %[alpha_mask], %[temp0]     \n\t"
+        "addiu  %[pdst],       %[pdst],       4            \n\t"
+        : [temp0]"=&r"(temp0), [palpha]"+r"(palpha), [pdst]"+r"(pdst),
+          [alpha_mask]"+r"(alpha_mask)
+        :
+        : "memory"
+      );
+    }
+    alpha += alpha_stride;
+    dst += dst_stride;
+  }
+
+  __asm__ volatile (
+    "ext    %[temp0],      %[alpha_mask], 0, 16            \n\t"
+    "srl    %[alpha_mask], %[alpha_mask], 16               \n\t"
+    "and    %[alpha_mask], %[alpha_mask], %[temp0]         \n\t"
+    "ext    %[temp0],      %[alpha_mask], 0, 8             \n\t"
+    "srl    %[alpha_mask], %[alpha_mask], 8                \n\t"
+    "and    %[alpha_mask], %[alpha_mask], %[temp0]         \n\t"
+    : [temp0]"=&r"(temp0), [alpha_mask]"+r"(alpha_mask)
+    :
+  );
+
+  return (alpha_mask != 0xff);
+}
+
+static void MultARGBRow_MIPSdspR2(uint32_t* const ptr, int width,
+                                  int inverse) {
+  int x;
+  const uint32_t c_00ffffff = 0x00ffffffu;
+  const uint32_t c_ff000000 = 0xff000000u;
+  const uint32_t c_8000000  = 0x00800000u;
+  const uint32_t c_8000080  = 0x00800080u;
+  for (x = 0; x < width; ++x) {
+    const uint32_t argb = ptr[x];
+    if (argb < 0xff000000u) {      // alpha < 255
+      if (argb <= 0x00ffffffu) {   // alpha == 0
+        ptr[x] = 0;
+      } else {
+        int temp0, temp1, temp2, temp3, alpha;
+        __asm__ volatile (
+          "srl          %[alpha],   %[argb],       24                \n\t"
+          "replv.qb     %[temp0],   %[alpha]                         \n\t"
+          "and          %[temp0],   %[temp0],      %[c_00ffffff]     \n\t"
+          "beqz         %[inverse], 0f                               \n\t"
+          "divu         $zero,      %[c_ff000000], %[alpha]          \n\t"
+          "mflo         %[temp0]                                     \n\t"
+        "0:                                                          \n\t"
+          "andi         %[temp1],   %[argb],       0xff              \n\t"
+          "ext          %[temp2],   %[argb],       8,             8  \n\t"
+          "ext          %[temp3],   %[argb],       16,            8  \n\t"
+          "mul          %[temp1],   %[temp1],      %[temp0]          \n\t"
+          "mul          %[temp2],   %[temp2],      %[temp0]          \n\t"
+          "mul          %[temp3],   %[temp3],      %[temp0]          \n\t"
+          "precrq.ph.w  %[temp1],   %[temp2],      %[temp1]          \n\t"
+          "addu         %[temp3],   %[temp3],      %[c_8000000]      \n\t"
+          "addu         %[temp1],   %[temp1],      %[c_8000080]      \n\t"
+          "precrq.ph.w  %[temp3],   %[argb],       %[temp3]          \n\t"
+          "precrq.qb.ph %[temp1],   %[temp3],      %[temp1]          \n\t"
+          : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+            [temp3]"=&r"(temp3), [alpha]"=&r"(alpha)
+          : [inverse]"r"(inverse), [c_00ffffff]"r"(c_00ffffff),
+            [c_8000000]"r"(c_8000000), [c_8000080]"r"(c_8000080),
+            [c_ff000000]"r"(c_ff000000), [argb]"r"(argb)
+          : "memory", "hi", "lo"
+        );
+        ptr[x] = temp1;
+      }
+    }
+  }
+}
+
+#ifdef WORDS_BIGENDIAN
+static void PackARGB_MIPSdspR2(const uint8_t* a, const uint8_t* r,
+                               const uint8_t* g, const uint8_t* b, int len,
+                               uint32_t* out) {
+  int temp0, temp1, temp2, temp3, offset;
+  const int rest = len & 1;
+  const uint32_t* const loop_end = out + len - rest;
+  const int step = 4;
+  __asm__ volatile (
+    "xor          %[offset],   %[offset], %[offset]    \n\t"
+    "beq          %[loop_end], %[out],    0f           \n\t"
+  "2:                                                  \n\t"
+    "lbux         %[temp0],    %[offset](%[a])         \n\t"
+    "lbux         %[temp1],    %[offset](%[r])         \n\t"
+    "lbux         %[temp2],    %[offset](%[g])         \n\t"
+    "lbux         %[temp3],    %[offset](%[b])         \n\t"
+    "ins          %[temp1],    %[temp0],  16,     16   \n\t"
+    "ins          %[temp3],    %[temp2],  16,     16   \n\t"
+    "addiu        %[out],      %[out],    4            \n\t"
+    "precr.qb.ph  %[temp0],    %[temp1],  %[temp3]     \n\t"
+    "sw           %[temp0],    -4(%[out])              \n\t"
+    "addu         %[offset],   %[offset], %[step]      \n\t"
+    "bne          %[loop_end], %[out],    2b           \n\t"
+  "0:                                                  \n\t"
+    "beq          %[rest],     $zero,     1f           \n\t"
+    "lbux         %[temp0],    %[offset](%[a])         \n\t"
+    "lbux         %[temp1],    %[offset](%[r])         \n\t"
+    "lbux         %[temp2],    %[offset](%[g])         \n\t"
+    "lbux         %[temp3],    %[offset](%[b])         \n\t"
+    "ins          %[temp1],    %[temp0],  16,     16   \n\t"
+    "ins          %[temp3],    %[temp2],  16,     16   \n\t"
+    "precr.qb.ph  %[temp0],    %[temp1],  %[temp3]     \n\t"
+    "sw           %[temp0],    0(%[out])               \n\t"
+  "1:                                                  \n\t"
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [temp3]"=&r"(temp3), [offset]"=&r"(offset), [out]"+&r"(out)
+    : [a]"r"(a), [r]"r"(r), [g]"r"(g), [b]"r"(b), [step]"r"(step),
+      [loop_end]"r"(loop_end), [rest]"r"(rest)
+    : "memory"
+  );
+}
+#endif  // WORDS_BIGENDIAN
+
+static void PackRGB_MIPSdspR2(const uint8_t* r, const uint8_t* g,
+                              const uint8_t* b, int len, int step,
+                              uint32_t* out) {
+  int temp0, temp1, temp2, offset;
+  const int rest = len & 1;
+  const int a = 0xff;
+  const uint32_t* const loop_end = out + len - rest;
+  __asm__ volatile (
+    "xor          %[offset],   %[offset], %[offset]    \n\t"
+    "beq          %[loop_end], %[out],    0f           \n\t"
+  "2:                                                  \n\t"
+    "lbux         %[temp0],    %[offset](%[r])         \n\t"
+    "lbux         %[temp1],    %[offset](%[g])         \n\t"
+    "lbux         %[temp2],    %[offset](%[b])         \n\t"
+    "ins          %[temp0],    %[a],      16,     16   \n\t"
+    "ins          %[temp2],    %[temp1],  16,     16   \n\t"
+    "addiu        %[out],      %[out],    4            \n\t"
+    "precr.qb.ph  %[temp0],    %[temp0],  %[temp2]     \n\t"
+    "sw           %[temp0],    -4(%[out])              \n\t"
+    "addu         %[offset],   %[offset], %[step]      \n\t"
+    "bne          %[loop_end], %[out],    2b           \n\t"
+  "0:                                                  \n\t"
+    "beq          %[rest],     $zero,     1f           \n\t"
+    "lbux         %[temp0],    %[offset](%[r])         \n\t"
+    "lbux         %[temp1],    %[offset](%[g])         \n\t"
+    "lbux         %[temp2],    %[offset](%[b])         \n\t"
+    "ins          %[temp0],    %[a],      16,     16   \n\t"
+    "ins          %[temp2],    %[temp1],  16,     16   \n\t"
+    "precr.qb.ph  %[temp0],    %[temp0],  %[temp2]     \n\t"
+    "sw           %[temp0],    0(%[out])               \n\t"
+  "1:                                                  \n\t"
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [offset]"=&r"(offset), [out]"+&r"(out)
+    : [a]"r"(a), [r]"r"(r), [g]"r"(g), [b]"r"(b), [step]"r"(step),
+      [loop_end]"r"(loop_end), [rest]"r"(rest)
+    : "memory"
+  );
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void WebPInitAlphaProcessingMIPSdspR2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingMIPSdspR2(void) {
+  WebPDispatchAlpha = DispatchAlpha_MIPSdspR2;
+  WebPMultARGBRow = MultARGBRow_MIPSdspR2;
+#ifdef WORDS_BIGENDIAN
+  WebPPackARGB = PackARGB_MIPSdspR2;
+#endif
+  WebPPackRGB = PackRGB_MIPSdspR2;
+}
+
+#else  // !WEBP_USE_MIPS_DSP_R2
+
+WEBP_DSP_INIT_STUB(WebPInitAlphaProcessingMIPSdspR2)
+
+#endif  // WEBP_USE_MIPS_DSP_R2
diff --git a/media/libwebp/dsp/alpha_processing_neon.c b/media/libwebp/dsp/alpha_processing_neon.c
index 53dfce2b36..c900279a35 100644
--- a/media/libwebp/dsp/alpha_processing_neon.c
+++ b/media/libwebp/dsp/alpha_processing_neon.c
@@ -80,9 +80,9 @@ static void ApplyAlphaMultiply_NEON(uint8_t* rgba, int alpha_first,
 
 //------------------------------------------------------------------------------
 
-static int DispatchAlpha_NEON(const uint8_t* alpha, int alpha_stride,
-                              int width, int height,
-                              uint8_t* dst, int dst_stride) {
+static int DispatchAlpha_NEON(const uint8_t* WEBP_RESTRICT alpha,
+                              int alpha_stride, int width, int height,
+                              uint8_t* WEBP_RESTRICT dst, int dst_stride) {
   uint32_t alpha_mask = 0xffffffffu;
   uint8x8_t mask8 = vdup_n_u8(0xff);
   uint32_t tmp[2];
@@ -112,9 +112,10 @@ static int DispatchAlpha_NEON(const uint8_t* alpha, int alpha_stride,
   return (alpha_mask != 0xffffffffu);
 }
 
-static void DispatchAlphaToGreen_NEON(const uint8_t* alpha, int alpha_stride,
-                                      int width, int height,
-                                      uint32_t* dst, int dst_stride) {
+static void DispatchAlphaToGreen_NEON(const uint8_t* WEBP_RESTRICT alpha,
+                                      int alpha_stride, int width, int height,
+                                      uint32_t* WEBP_RESTRICT dst,
+                                      int dst_stride) {
   int i, j;
   uint8x8x4_t greens;   // leave A/R/B channels zero'd.
   greens.val[0] = vdup_n_u8(0);
@@ -131,9 +132,9 @@ static void DispatchAlphaToGreen_NEON(const uint8_t* alpha, int alpha_stride,
   }
 }
 
-static int ExtractAlpha_NEON(const uint8_t* argb, int argb_stride,
+static int ExtractAlpha_NEON(const uint8_t* WEBP_RESTRICT argb, int argb_stride,
                              int width, int height,
-                             uint8_t* alpha, int alpha_stride) {
+                             uint8_t* WEBP_RESTRICT alpha, int alpha_stride) {
   uint32_t alpha_mask = 0xffffffffu;
   uint8x8_t mask8 = vdup_n_u8(0xff);
   uint32_t tmp[2];
@@ -161,8 +162,8 @@ static int ExtractAlpha_NEON(const uint8_t* argb, int argb_stride,
   return (alpha_mask == 0xffffffffu);
 }
 
-static void ExtractGreen_NEON(const uint32_t* argb,
-                              uint8_t* alpha, int size) {
+static void ExtractGreen_NEON(const uint32_t* WEBP_RESTRICT argb,
+                              uint8_t* WEBP_RESTRICT alpha, int size) {
   int i;
   for (i = 0; i + 16 <= size; i += 16) {
     const uint8x16x4_t rgbX = vld4q_u8((const uint8_t*)(argb + i));
diff --git a/media/libwebp/dsp/alpha_processing_sse2.c b/media/libwebp/dsp/alpha_processing_sse2.c
index 9a3bc4485a..56d9ee5e98 100644
--- a/media/libwebp/dsp/alpha_processing_sse2.c
+++ b/media/libwebp/dsp/alpha_processing_sse2.c
@@ -18,9 +18,9 @@
 
 //------------------------------------------------------------------------------
 
-static int DispatchAlpha_SSE2(const uint8_t* alpha, int alpha_stride,
-                              int width, int height,
-                              uint8_t* dst, int dst_stride) {
+static int DispatchAlpha_SSE2(const uint8_t* WEBP_RESTRICT alpha,
+                              int alpha_stride, int width, int height,
+                              uint8_t* WEBP_RESTRICT dst, int dst_stride) {
   // alpha_and stores an 'and' operation of all the alpha[] values. The final
   // value is not 0xff if any of the alpha[] is not equal to 0xff.
   uint32_t alpha_and = 0xff;
@@ -72,9 +72,10 @@ static int DispatchAlpha_SSE2(const uint8_t* alpha, int alpha_stride,
   return (alpha_and != 0xff);
 }
 
-static void DispatchAlphaToGreen_SSE2(const uint8_t* alpha, int alpha_stride,
-                                      int width, int height,
-                                      uint32_t* dst, int dst_stride) {
+static void DispatchAlphaToGreen_SSE2(const uint8_t* WEBP_RESTRICT alpha,
+                                      int alpha_stride, int width, int height,
+                                      uint32_t* WEBP_RESTRICT dst,
+                                      int dst_stride) {
   int i, j;
   const __m128i zero = _mm_setzero_si128();
   const int limit = width & ~15;
@@ -98,9 +99,9 @@ static void DispatchAlphaToGreen_SSE2(const uint8_t* alpha, int alpha_stride,
   }
 }
 
-static int ExtractAlpha_SSE2(const uint8_t* argb, int argb_stride,
+static int ExtractAlpha_SSE2(const uint8_t* WEBP_RESTRICT argb, int argb_stride,
                              int width, int height,
-                             uint8_t* alpha, int alpha_stride) {
+                             uint8_t* WEBP_RESTRICT alpha, int alpha_stride) {
   // alpha_and stores an 'and' operation of all the alpha[] values. The final
   // value is not 0xff if any of the alpha[] is not equal to 0xff.
   uint32_t alpha_and = 0xff;
@@ -214,7 +215,7 @@ static void ApplyAlphaMultiply_SSE2(uint8_t* rgba, int alpha_first,
 // Alpha detection
 
 static int HasAlpha8b_SSE2(const uint8_t* src, int length) {
-  const __m128i all_0xff = _mm_set1_epi8(0xff);
+  const __m128i all_0xff = _mm_set1_epi8((char)0xff);
   int i = 0;
   for (; i + 16 <= length; i += 16) {
     const __m128i v = _mm_loadu_si128((const __m128i*)(src + i));
@@ -228,7 +229,7 @@ static int HasAlpha8b_SSE2(const uint8_t* src, int length) {
 
 static int HasAlpha32b_SSE2(const uint8_t* src, int length) {
   const __m128i alpha_mask = _mm_set1_epi32(0xff);
-  const __m128i all_0xff = _mm_set1_epi8(0xff);
+  const __m128i all_0xff = _mm_set1_epi8((char)0xff);
   int i = 0;
   // We don't know if we can access the last 3 bytes after the last alpha
   // value 'src[4 * length - 4]' (because we don't know if alpha is the first
@@ -265,6 +266,27 @@ static int HasAlpha32b_SSE2(const uint8_t* src, int length) {
   return 0;
 }
 
+static void AlphaReplace_SSE2(uint32_t* src, int length, uint32_t color) {
+  const __m128i m_color = _mm_set1_epi32(color);
+  const __m128i zero = _mm_setzero_si128();
+  int i = 0;
+  for (; i + 8 <= length; i += 8) {
+    const __m128i a0 = _mm_loadu_si128((const __m128i*)(src + i + 0));
+    const __m128i a1 = _mm_loadu_si128((const __m128i*)(src + i + 4));
+    const __m128i b0 = _mm_srai_epi32(a0, 24);
+    const __m128i b1 = _mm_srai_epi32(a1, 24);
+    const __m128i c0 = _mm_cmpeq_epi32(b0, zero);
+    const __m128i c1 = _mm_cmpeq_epi32(b1, zero);
+    const __m128i d0 = _mm_and_si128(c0, m_color);
+    const __m128i d1 = _mm_and_si128(c1, m_color);
+    const __m128i e0 = _mm_andnot_si128(c0, a0);
+    const __m128i e1 = _mm_andnot_si128(c1, a1);
+    _mm_storeu_si128((__m128i*)(src + i + 0), _mm_or_si128(d0, e0));
+    _mm_storeu_si128((__m128i*)(src + i + 4), _mm_or_si128(d1, e1));
+  }
+  for (; i < length; ++i) if ((src[i] >> 24) == 0) src[i] = color;
+}
+
 // -----------------------------------------------------------------------------
 // Apply alpha value to rows
 
@@ -296,7 +318,8 @@ static void MultARGBRow_SSE2(uint32_t* const ptr, int width, int inverse) {
   if (width > 0) WebPMultARGBRow_C(ptr + x, width, inverse);
 }
 
-static void MultRow_SSE2(uint8_t* const ptr, const uint8_t* const alpha,
+static void MultRow_SSE2(uint8_t* WEBP_RESTRICT const ptr,
+                         const uint8_t* WEBP_RESTRICT const alpha,
                          int width, int inverse) {
   int x = 0;
   if (!inverse) {
@@ -334,6 +357,7 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingSSE2(void) {
 
   WebPHasAlpha8b = HasAlpha8b_SSE2;
   WebPHasAlpha32b = HasAlpha32b_SSE2;
+  WebPAlphaReplace = AlphaReplace_SSE2;
 }
 
 #else  // !WEBP_USE_SSE2
diff --git a/media/libwebp/dsp/alpha_processing_sse41.c b/media/libwebp/dsp/alpha_processing_sse41.c
index e33c1aba4d..307d200f1f 100644
--- a/media/libwebp/dsp/alpha_processing_sse41.c
+++ b/media/libwebp/dsp/alpha_processing_sse41.c
@@ -19,9 +19,9 @@
 
 //------------------------------------------------------------------------------
 
-static int ExtractAlpha_SSE41(const uint8_t* argb, int argb_stride,
-                              int width, int height,
-                              uint8_t* alpha, int alpha_stride) {
+static int ExtractAlpha_SSE41(const uint8_t* WEBP_RESTRICT argb,
+                              int argb_stride, int width, int height,
+                              uint8_t* WEBP_RESTRICT alpha, int alpha_stride) {
   // alpha_and stores an 'and' operation of all the alpha[] values. The final
   // value is not 0xff if any of the alpha[] is not equal to 0xff.
   uint32_t alpha_and = 0xff;
diff --git a/media/libwebp/dsp/cost.c b/media/libwebp/dsp/cost.c
new file mode 100644
index 0000000000..bf112c7f0c
--- /dev/null
+++ b/media/libwebp/dsp/cost.c
@@ -0,0 +1,411 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "../dsp/dsp.h"
+#include "../enc/cost_enc.h"
+
+//------------------------------------------------------------------------------
+// Boolean-cost cost table
+
+const uint16_t VP8EntropyCost[256] = {
+  1792, 1792, 1792, 1536, 1536, 1408, 1366, 1280, 1280, 1216,
+  1178, 1152, 1110, 1076, 1061, 1024, 1024,  992,  968,  951,
+   939,  911,  896,  878,  871,  854,  838,  820,  811,  794,
+   786,  768,  768,  752,  740,  732,  720,  709,  704,  690,
+   683,  672,  666,  655,  647,  640,  631,  622,  615,  607,
+   598,  592,  586,  576,  572,  564,  559,  555,  547,  541,
+   534,  528,  522,  512,  512,  504,  500,  494,  488,  483,
+   477,  473,  467,  461,  458,  452,  448,  443,  438,  434,
+   427,  424,  419,  415,  410,  406,  403,  399,  394,  390,
+   384,  384,  377,  374,  370,  366,  362,  359,  355,  351,
+   347,  342,  342,  336,  333,  330,  326,  323,  320,  316,
+   312,  308,  305,  302,  299,  296,  293,  288,  287,  283,
+   280,  277,  274,  272,  268,  266,  262,  256,  256,  256,
+   251,  248,  245,  242,  240,  237,  234,  232,  228,  226,
+   223,  221,  218,  216,  214,  211,  208,  205,  203,  201,
+   198,  196,  192,  191,  188,  187,  183,  181,  179,  176,
+   175,  171,  171,  168,  165,  163,  160,  159,  156,  154,
+   152,  150,  148,  146,  144,  142,  139,  138,  135,  133,
+   131,  128,  128,  125,  123,  121,  119,  117,  115,  113,
+   111,  110,  107,  105,  103,  102,  100,   98,   96,   94,
+    92,   91,   89,   86,   86,   83,   82,   80,   77,   76,
+    74,   73,   71,   69,   67,   66,   64,   63,   61,   59,
+    57,   55,   54,   52,   51,   49,   47,   46,   44,   43,
+    41,   40,   38,   36,   35,   33,   32,   30,   29,   27,
+    25,   24,   22,   21,   19,   18,   16,   15,   13,   12,
+    10,    9,    7,    6,    4,    3
+};
+
+//------------------------------------------------------------------------------
+// Level cost tables
+
+// fixed costs for coding levels, deduce from the coding tree.
+// This is only the part that doesn't depend on the probability state.
+const uint16_t VP8LevelFixedCosts[MAX_LEVEL + 1] = {
+     0,  256,  256,  256,  256,  432,  618,  630,
+   731,  640,  640,  828,  901,  948, 1021, 1101,
+  1174, 1221, 1294, 1042, 1085, 1115, 1158, 1202,
+  1245, 1275, 1318, 1337, 1380, 1410, 1453, 1497,
+  1540, 1570, 1613, 1280, 1295, 1317, 1332, 1358,
+  1373, 1395, 1410, 1454, 1469, 1491, 1506, 1532,
+  1547, 1569, 1584, 1601, 1616, 1638, 1653, 1679,
+  1694, 1716, 1731, 1775, 1790, 1812, 1827, 1853,
+  1868, 1890, 1905, 1727, 1733, 1742, 1748, 1759,
+  1765, 1774, 1780, 1800, 1806, 1815, 1821, 1832,
+  1838, 1847, 1853, 1878, 1884, 1893, 1899, 1910,
+  1916, 1925, 1931, 1951, 1957, 1966, 1972, 1983,
+  1989, 1998, 2004, 2027, 2033, 2042, 2048, 2059,
+  2065, 2074, 2080, 2100, 2106, 2115, 2121, 2132,
+  2138, 2147, 2153, 2178, 2184, 2193, 2199, 2210,
+  2216, 2225, 2231, 2251, 2257, 2266, 2272, 2283,
+  2289, 2298, 2304, 2168, 2174, 2183, 2189, 2200,
+  2206, 2215, 2221, 2241, 2247, 2256, 2262, 2273,
+  2279, 2288, 2294, 2319, 2325, 2334, 2340, 2351,
+  2357, 2366, 2372, 2392, 2398, 2407, 2413, 2424,
+  2430, 2439, 2445, 2468, 2474, 2483, 2489, 2500,
+  2506, 2515, 2521, 2541, 2547, 2556, 2562, 2573,
+  2579, 2588, 2594, 2619, 2625, 2634, 2640, 2651,
+  2657, 2666, 2672, 2692, 2698, 2707, 2713, 2724,
+  2730, 2739, 2745, 2540, 2546, 2555, 2561, 2572,
+  2578, 2587, 2593, 2613, 2619, 2628, 2634, 2645,
+  2651, 2660, 2666, 2691, 2697, 2706, 2712, 2723,
+  2729, 2738, 2744, 2764, 2770, 2779, 2785, 2796,
+  2802, 2811, 2817, 2840, 2846, 2855, 2861, 2872,
+  2878, 2887, 2893, 2913, 2919, 2928, 2934, 2945,
+  2951, 2960, 2966, 2991, 2997, 3006, 3012, 3023,
+  3029, 3038, 3044, 3064, 3070, 3079, 3085, 3096,
+  3102, 3111, 3117, 2981, 2987, 2996, 3002, 3013,
+  3019, 3028, 3034, 3054, 3060, 3069, 3075, 3086,
+  3092, 3101, 3107, 3132, 3138, 3147, 3153, 3164,
+  3170, 3179, 3185, 3205, 3211, 3220, 3226, 3237,
+  3243, 3252, 3258, 3281, 3287, 3296, 3302, 3313,
+  3319, 3328, 3334, 3354, 3360, 3369, 3375, 3386,
+  3392, 3401, 3407, 3432, 3438, 3447, 3453, 3464,
+  3470, 3479, 3485, 3505, 3511, 3520, 3526, 3537,
+  3543, 3552, 3558, 2816, 2822, 2831, 2837, 2848,
+  2854, 2863, 2869, 2889, 2895, 2904, 2910, 2921,
+  2927, 2936, 2942, 2967, 2973, 2982, 2988, 2999,
+  3005, 3014, 3020, 3040, 3046, 3055, 3061, 3072,
+  3078, 3087, 3093, 3116, 3122, 3131, 3137, 3148,
+  3154, 3163, 3169, 3189, 3195, 3204, 3210, 3221,
+  3227, 3236, 3242, 3267, 3273, 3282, 3288, 3299,
+  3305, 3314, 3320, 3340, 3346, 3355, 3361, 3372,
+  3378, 3387, 3393, 3257, 3263, 3272, 3278, 3289,
+  3295, 3304, 3310, 3330, 3336, 3345, 3351, 3362,
+  3368, 3377, 3383, 3408, 3414, 3423, 3429, 3440,
+  3446, 3455, 3461, 3481, 3487, 3496, 3502, 3513,
+  3519, 3528, 3534, 3557, 3563, 3572, 3578, 3589,
+  3595, 3604, 3610, 3630, 3636, 3645, 3651, 3662,
+  3668, 3677, 3683, 3708, 3714, 3723, 3729, 3740,
+  3746, 3755, 3761, 3781, 3787, 3796, 3802, 3813,
+  3819, 3828, 3834, 3629, 3635, 3644, 3650, 3661,
+  3667, 3676, 3682, 3702, 3708, 3717, 3723, 3734,
+  3740, 3749, 3755, 3780, 3786, 3795, 3801, 3812,
+  3818, 3827, 3833, 3853, 3859, 3868, 3874, 3885,
+  3891, 3900, 3906, 3929, 3935, 3944, 3950, 3961,
+  3967, 3976, 3982, 4002, 4008, 4017, 4023, 4034,
+  4040, 4049, 4055, 4080, 4086, 4095, 4101, 4112,
+  4118, 4127, 4133, 4153, 4159, 4168, 4174, 4185,
+  4191, 4200, 4206, 4070, 4076, 4085, 4091, 4102,
+  4108, 4117, 4123, 4143, 4149, 4158, 4164, 4175,
+  4181, 4190, 4196, 4221, 4227, 4236, 4242, 4253,
+  4259, 4268, 4274, 4294, 4300, 4309, 4315, 4326,
+  4332, 4341, 4347, 4370, 4376, 4385, 4391, 4402,
+  4408, 4417, 4423, 4443, 4449, 4458, 4464, 4475,
+  4481, 4490, 4496, 4521, 4527, 4536, 4542, 4553,
+  4559, 4568, 4574, 4594, 4600, 4609, 4615, 4626,
+  4632, 4641, 4647, 3515, 3521, 3530, 3536, 3547,
+  3553, 3562, 3568, 3588, 3594, 3603, 3609, 3620,
+  3626, 3635, 3641, 3666, 3672, 3681, 3687, 3698,
+  3704, 3713, 3719, 3739, 3745, 3754, 3760, 3771,
+  3777, 3786, 3792, 3815, 3821, 3830, 3836, 3847,
+  3853, 3862, 3868, 3888, 3894, 3903, 3909, 3920,
+  3926, 3935, 3941, 3966, 3972, 3981, 3987, 3998,
+  4004, 4013, 4019, 4039, 4045, 4054, 4060, 4071,
+  4077, 4086, 4092, 3956, 3962, 3971, 3977, 3988,
+  3994, 4003, 4009, 4029, 4035, 4044, 4050, 4061,
+  4067, 4076, 4082, 4107, 4113, 4122, 4128, 4139,
+  4145, 4154, 4160, 4180, 4186, 4195, 4201, 4212,
+  4218, 4227, 4233, 4256, 4262, 4271, 4277, 4288,
+  4294, 4303, 4309, 4329, 4335, 4344, 4350, 4361,
+  4367, 4376, 4382, 4407, 4413, 4422, 4428, 4439,
+  4445, 4454, 4460, 4480, 4486, 4495, 4501, 4512,
+  4518, 4527, 4533, 4328, 4334, 4343, 4349, 4360,
+  4366, 4375, 4381, 4401, 4407, 4416, 4422, 4433,
+  4439, 4448, 4454, 4479, 4485, 4494, 4500, 4511,
+  4517, 4526, 4532, 4552, 4558, 4567, 4573, 4584,
+  4590, 4599, 4605, 4628, 4634, 4643, 4649, 4660,
+  4666, 4675, 4681, 4701, 4707, 4716, 4722, 4733,
+  4739, 4748, 4754, 4779, 4785, 4794, 4800, 4811,
+  4817, 4826, 4832, 4852, 4858, 4867, 4873, 4884,
+  4890, 4899, 4905, 4769, 4775, 4784, 4790, 4801,
+  4807, 4816, 4822, 4842, 4848, 4857, 4863, 4874,
+  4880, 4889, 4895, 4920, 4926, 4935, 4941, 4952,
+  4958, 4967, 4973, 4993, 4999, 5008, 5014, 5025,
+  5031, 5040, 5046, 5069, 5075, 5084, 5090, 5101,
+  5107, 5116, 5122, 5142, 5148, 5157, 5163, 5174,
+  5180, 5189, 5195, 5220, 5226, 5235, 5241, 5252,
+  5258, 5267, 5273, 5293, 5299, 5308, 5314, 5325,
+  5331, 5340, 5346, 4604, 4610, 4619, 4625, 4636,
+  4642, 4651, 4657, 4677, 4683, 4692, 4698, 4709,
+  4715, 4724, 4730, 4755, 4761, 4770, 4776, 4787,
+  4793, 4802, 4808, 4828, 4834, 4843, 4849, 4860,
+  4866, 4875, 4881, 4904, 4910, 4919, 4925, 4936,
+  4942, 4951, 4957, 4977, 4983, 4992, 4998, 5009,
+  5015, 5024, 5030, 5055, 5061, 5070, 5076, 5087,
+  5093, 5102, 5108, 5128, 5134, 5143, 5149, 5160,
+  5166, 5175, 5181, 5045, 5051, 5060, 5066, 5077,
+  5083, 5092, 5098, 5118, 5124, 5133, 5139, 5150,
+  5156, 5165, 5171, 5196, 5202, 5211, 5217, 5228,
+  5234, 5243, 5249, 5269, 5275, 5284, 5290, 5301,
+  5307, 5316, 5322, 5345, 5351, 5360, 5366, 5377,
+  5383, 5392, 5398, 5418, 5424, 5433, 5439, 5450,
+  5456, 5465, 5471, 5496, 5502, 5511, 5517, 5528,
+  5534, 5543, 5549, 5569, 5575, 5584, 5590, 5601,
+  5607, 5616, 5622, 5417, 5423, 5432, 5438, 5449,
+  5455, 5464, 5470, 5490, 5496, 5505, 5511, 5522,
+  5528, 5537, 5543, 5568, 5574, 5583, 5589, 5600,
+  5606, 5615, 5621, 5641, 5647, 5656, 5662, 5673,
+  5679, 5688, 5694, 5717, 5723, 5732, 5738, 5749,
+  5755, 5764, 5770, 5790, 5796, 5805, 5811, 5822,
+  5828, 5837, 5843, 5868, 5874, 5883, 5889, 5900,
+  5906, 5915, 5921, 5941, 5947, 5956, 5962, 5973,
+  5979, 5988, 5994, 5858, 5864, 5873, 5879, 5890,
+  5896, 5905, 5911, 5931, 5937, 5946, 5952, 5963,
+  5969, 5978, 5984, 6009, 6015, 6024, 6030, 6041,
+  6047, 6056, 6062, 6082, 6088, 6097, 6103, 6114,
+  6120, 6129, 6135, 6158, 6164, 6173, 6179, 6190,
+  6196, 6205, 6211, 6231, 6237, 6246, 6252, 6263,
+  6269, 6278, 6284, 6309, 6315, 6324, 6330, 6341,
+  6347, 6356, 6362, 6382, 6388, 6397, 6403, 6414,
+  6420, 6429, 6435, 3515, 3521, 3530, 3536, 3547,
+  3553, 3562, 3568, 3588, 3594, 3603, 3609, 3620,
+  3626, 3635, 3641, 3666, 3672, 3681, 3687, 3698,
+  3704, 3713, 3719, 3739, 3745, 3754, 3760, 3771,
+  3777, 3786, 3792, 3815, 3821, 3830, 3836, 3847,
+  3853, 3862, 3868, 3888, 3894, 3903, 3909, 3920,
+  3926, 3935, 3941, 3966, 3972, 3981, 3987, 3998,
+  4004, 4013, 4019, 4039, 4045, 4054, 4060, 4071,
+  4077, 4086, 4092, 3956, 3962, 3971, 3977, 3988,
+  3994, 4003, 4009, 4029, 4035, 4044, 4050, 4061,
+  4067, 4076, 4082, 4107, 4113, 4122, 4128, 4139,
+  4145, 4154, 4160, 4180, 4186, 4195, 4201, 4212,
+  4218, 4227, 4233, 4256, 4262, 4271, 4277, 4288,
+  4294, 4303, 4309, 4329, 4335, 4344, 4350, 4361,
+  4367, 4376, 4382, 4407, 4413, 4422, 4428, 4439,
+  4445, 4454, 4460, 4480, 4486, 4495, 4501, 4512,
+  4518, 4527, 4533, 4328, 4334, 4343, 4349, 4360,
+  4366, 4375, 4381, 4401, 4407, 4416, 4422, 4433,
+  4439, 4448, 4454, 4479, 4485, 4494, 4500, 4511,
+  4517, 4526, 4532, 4552, 4558, 4567, 4573, 4584,
+  4590, 4599, 4605, 4628, 4634, 4643, 4649, 4660,
+  4666, 4675, 4681, 4701, 4707, 4716, 4722, 4733,
+  4739, 4748, 4754, 4779, 4785, 4794, 4800, 4811,
+  4817, 4826, 4832, 4852, 4858, 4867, 4873, 4884,
+  4890, 4899, 4905, 4769, 4775, 4784, 4790, 4801,
+  4807, 4816, 4822, 4842, 4848, 4857, 4863, 4874,
+  4880, 4889, 4895, 4920, 4926, 4935, 4941, 4952,
+  4958, 4967, 4973, 4993, 4999, 5008, 5014, 5025,
+  5031, 5040, 5046, 5069, 5075, 5084, 5090, 5101,
+  5107, 5116, 5122, 5142, 5148, 5157, 5163, 5174,
+  5180, 5189, 5195, 5220, 5226, 5235, 5241, 5252,
+  5258, 5267, 5273, 5293, 5299, 5308, 5314, 5325,
+  5331, 5340, 5346, 4604, 4610, 4619, 4625, 4636,
+  4642, 4651, 4657, 4677, 4683, 4692, 4698, 4709,
+  4715, 4724, 4730, 4755, 4761, 4770, 4776, 4787,
+  4793, 4802, 4808, 4828, 4834, 4843, 4849, 4860,
+  4866, 4875, 4881, 4904, 4910, 4919, 4925, 4936,
+  4942, 4951, 4957, 4977, 4983, 4992, 4998, 5009,
+  5015, 5024, 5030, 5055, 5061, 5070, 5076, 5087,
+  5093, 5102, 5108, 5128, 5134, 5143, 5149, 5160,
+  5166, 5175, 5181, 5045, 5051, 5060, 5066, 5077,
+  5083, 5092, 5098, 5118, 5124, 5133, 5139, 5150,
+  5156, 5165, 5171, 5196, 5202, 5211, 5217, 5228,
+  5234, 5243, 5249, 5269, 5275, 5284, 5290, 5301,
+  5307, 5316, 5322, 5345, 5351, 5360, 5366, 5377,
+  5383, 5392, 5398, 5418, 5424, 5433, 5439, 5450,
+  5456, 5465, 5471, 5496, 5502, 5511, 5517, 5528,
+  5534, 5543, 5549, 5569, 5575, 5584, 5590, 5601,
+  5607, 5616, 5622, 5417, 5423, 5432, 5438, 5449,
+  5455, 5464, 5470, 5490, 5496, 5505, 5511, 5522,
+  5528, 5537, 5543, 5568, 5574, 5583, 5589, 5600,
+  5606, 5615, 5621, 5641, 5647, 5656, 5662, 5673,
+  5679, 5688, 5694, 5717, 5723, 5732, 5738, 5749,
+  5755, 5764, 5770, 5790, 5796, 5805, 5811, 5822,
+  5828, 5837, 5843, 5868, 5874, 5883, 5889, 5900,
+  5906, 5915, 5921, 5941, 5947, 5956, 5962, 5973,
+  5979, 5988, 5994, 5858, 5864, 5873, 5879, 5890,
+  5896, 5905, 5911, 5931, 5937, 5946, 5952, 5963,
+  5969, 5978, 5984, 6009, 6015, 6024, 6030, 6041,
+  6047, 6056, 6062, 6082, 6088, 6097, 6103, 6114,
+  6120, 6129, 6135, 6158, 6164, 6173, 6179, 6190,
+  6196, 6205, 6211, 6231, 6237, 6246, 6252, 6263,
+  6269, 6278, 6284, 6309, 6315, 6324, 6330, 6341,
+  6347, 6356, 6362, 6382, 6388, 6397, 6403, 6414,
+  6420, 6429, 6435, 5303, 5309, 5318, 5324, 5335,
+  5341, 5350, 5356, 5376, 5382, 5391, 5397, 5408,
+  5414, 5423, 5429, 5454, 5460, 5469, 5475, 5486,
+  5492, 5501, 5507, 5527, 5533, 5542, 5548, 5559,
+  5565, 5574, 5580, 5603, 5609, 5618, 5624, 5635,
+  5641, 5650, 5656, 5676, 5682, 5691, 5697, 5708,
+  5714, 5723, 5729, 5754, 5760, 5769, 5775, 5786,
+  5792, 5801, 5807, 5827, 5833, 5842, 5848, 5859,
+  5865, 5874, 5880, 5744, 5750, 5759, 5765, 5776,
+  5782, 5791, 5797, 5817, 5823, 5832, 5838, 5849,
+  5855, 5864, 5870, 5895, 5901, 5910, 5916, 5927,
+  5933, 5942, 5948, 5968, 5974, 5983, 5989, 6000,
+  6006, 6015, 6021, 6044, 6050, 6059, 6065, 6076,
+  6082, 6091, 6097, 6117, 6123, 6132, 6138, 6149,
+  6155, 6164, 6170, 6195, 6201, 6210, 6216, 6227,
+  6233, 6242, 6248, 6268, 6274, 6283, 6289, 6300,
+  6306, 6315, 6321, 6116, 6122, 6131, 6137, 6148,
+  6154, 6163, 6169, 6189, 6195, 6204, 6210, 6221,
+  6227, 6236, 6242, 6267, 6273, 6282, 6288, 6299,
+  6305, 6314, 6320, 6340, 6346, 6355, 6361, 6372,
+  6378, 6387, 6393, 6416, 6422, 6431, 6437, 6448,
+  6454, 6463, 6469, 6489, 6495, 6504, 6510, 6521,
+  6527, 6536, 6542, 6567, 6573, 6582, 6588, 6599,
+  6605, 6614, 6620, 6640, 6646, 6655, 6661, 6672,
+  6678, 6687, 6693, 6557, 6563, 6572, 6578, 6589,
+  6595, 6604, 6610, 6630, 6636, 6645, 6651, 6662,
+  6668, 6677, 6683, 6708, 6714, 6723, 6729, 6740,
+  6746, 6755, 6761, 6781, 6787, 6796, 6802, 6813,
+  6819, 6828, 6834, 6857, 6863, 6872, 6878, 6889,
+  6895, 6904, 6910, 6930, 6936, 6945, 6951, 6962,
+  6968, 6977, 6983, 7008, 7014, 7023, 7029, 7040,
+  7046, 7055, 7061, 7081, 7087, 7096, 7102, 7113,
+  7119, 7128, 7134, 6392, 6398, 6407, 6413, 6424,
+  6430, 6439, 6445, 6465, 6471, 6480, 6486, 6497,
+  6503, 6512, 6518, 6543, 6549, 6558, 6564, 6575,
+  6581, 6590, 6596, 6616, 6622, 6631, 6637, 6648,
+  6654, 6663, 6669, 6692, 6698, 6707, 6713, 6724,
+  6730, 6739, 6745, 6765, 6771, 6780, 6786, 6797,
+  6803, 6812, 6818, 6843, 6849, 6858, 6864, 6875,
+  6881, 6890, 6896, 6916, 6922, 6931, 6937, 6948,
+  6954, 6963, 6969, 6833, 6839, 6848, 6854, 6865,
+  6871, 6880, 6886, 6906, 6912, 6921, 6927, 6938,
+  6944, 6953, 6959, 6984, 6990, 6999, 7005, 7016,
+  7022, 7031, 7037, 7057, 7063, 7072, 7078, 7089,
+  7095, 7104, 7110, 7133, 7139, 7148, 7154, 7165,
+  7171, 7180, 7186, 7206, 7212, 7221, 7227, 7238,
+  7244, 7253, 7259, 7284, 7290, 7299, 7305, 7316,
+  7322, 7331, 7337, 7357, 7363, 7372, 7378, 7389,
+  7395, 7404, 7410, 7205, 7211, 7220, 7226, 7237,
+  7243, 7252, 7258, 7278, 7284, 7293, 7299, 7310,
+  7316, 7325, 7331, 7356, 7362, 7371, 7377, 7388,
+  7394, 7403, 7409, 7429, 7435, 7444, 7450, 7461,
+  7467, 7476, 7482, 7505, 7511, 7520, 7526, 7537,
+  7543, 7552, 7558, 7578, 7584, 7593, 7599, 7610,
+  7616, 7625, 7631, 7656, 7662, 7671, 7677, 7688,
+  7694, 7703, 7709, 7729, 7735, 7744, 7750, 7761
+};
+
+//------------------------------------------------------------------------------
+// Tables for level coding
+
+const uint8_t VP8EncBands[16 + 1] = {
+  0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7,
+  0  // sentinel
+};
+
+//------------------------------------------------------------------------------
+// Mode costs
+
+static int GetResidualCost_C(int ctx0, const VP8Residual* const res) {
+  int n = res->first;
+  // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
+  const int p0 = res->prob[n][ctx0][0];
+  CostArrayPtr const costs = res->costs;
+  const uint16_t* t = costs[n][ctx0];
+  // bit_cost(1, p0) is already incorporated in t[] tables, but only if ctx != 0
+  // (as required by the syntax). For ctx0 == 0, we need to add it here or it'll
+  // be missing during the loop.
+  int cost = (ctx0 == 0) ? VP8BitCost(1, p0) : 0;
+
+  if (res->last < 0) {
+    return VP8BitCost(0, p0);
+  }
+  for (; n < res->last; ++n) {
+    const int v = abs(res->coeffs[n]);
+    const int ctx = (v >= 2) ? 2 : v;
+    cost += VP8LevelCost(t, v);
+    t = costs[n + 1][ctx];
+  }
+  // Last coefficient is always non-zero
+  {
+    const int v = abs(res->coeffs[n]);
+    assert(v != 0);
+    cost += VP8LevelCost(t, v);
+    if (n < 15) {
+      const int b = VP8EncBands[n + 1];
+      const int ctx = (v == 1) ? 1 : 2;
+      const int last_p0 = res->prob[b][ctx][0];
+      cost += VP8BitCost(0, last_p0);
+    }
+  }
+  return cost;
+}
+
+static void SetResidualCoeffs_C(const int16_t* const coeffs,
+                                VP8Residual* const res) {
+  int n;
+  res->last = -1;
+  assert(res->first == 0 || coeffs[0] == 0);
+  for (n = 15; n >= 0; --n) {
+    if (coeffs[n]) {
+      res->last = n;
+      break;
+    }
+  }
+  res->coeffs = coeffs;
+}
+
+//------------------------------------------------------------------------------
+// init function
+
+VP8GetResidualCostFunc VP8GetResidualCost;
+VP8SetResidualCoeffsFunc VP8SetResidualCoeffs;
+
+extern void VP8EncDspCostInitMIPS32(void);
+extern void VP8EncDspCostInitMIPSdspR2(void);
+extern void VP8EncDspCostInitSSE2(void);
+extern void VP8EncDspCostInitNEON(void);
+
+WEBP_DSP_INIT_FUNC(VP8EncDspCostInit) {
+  VP8GetResidualCost = GetResidualCost_C;
+  VP8SetResidualCoeffs = SetResidualCoeffs_C;
+
+  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
+  if (VP8GetCPUInfo != NULL) {
+#if defined(WEBP_USE_MIPS32)
+    if (VP8GetCPUInfo(kMIPS32)) {
+      VP8EncDspCostInitMIPS32();
+    }
+#endif
+#if defined(WEBP_USE_MIPS_DSP_R2)
+    if (VP8GetCPUInfo(kMIPSdspR2)) {
+      VP8EncDspCostInitMIPSdspR2();
+    }
+#endif
+#if defined(WEBP_HAVE_SSE2)
+    if (VP8GetCPUInfo(kSSE2)) {
+      VP8EncDspCostInitSSE2();
+    }
+#endif
+#if defined(WEBP_HAVE_NEON)
+    if (VP8GetCPUInfo(kNEON)) {
+      VP8EncDspCostInitNEON();
+    }
+#endif
+  }
+}
+
+//------------------------------------------------------------------------------
diff --git a/media/libwebp/dsp/cost_mips32.c b/media/libwebp/dsp/cost_mips32.c
new file mode 100644
index 0000000000..4e97e8a756
--- /dev/null
+++ b/media/libwebp/dsp/cost_mips32.c
@@ -0,0 +1,154 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Author: Djordje Pesut (djordje.pesut@imgtec.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_MIPS32)
+
+#include "../enc/cost_enc.h"
+
+static int GetResidualCost_MIPS32(int ctx0, const VP8Residual* const res) {
+  int temp0, temp1;
+  int v_reg, ctx_reg;
+  int n = res->first;
+  // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
+  int p0 = res->prob[n][ctx0][0];
+  CostArrayPtr const costs = res->costs;
+  const uint16_t* t = costs[n][ctx0];
+  // bit_cost(1, p0) is already incorporated in t[] tables, but only if ctx != 0
+  // (as required by the syntax). For ctx0 == 0, we need to add it here or it'll
+  // be missing during the loop.
+  int cost = (ctx0 == 0) ? VP8BitCost(1, p0) : 0;
+  const int16_t* res_coeffs = res->coeffs;
+  const int res_last = res->last;
+  const int const_max_level = MAX_VARIABLE_LEVEL;
+  const int const_2 = 2;
+  const uint16_t** p_costs = &costs[n][0];
+  const size_t inc_p_costs = NUM_CTX * sizeof(*p_costs);
+
+  if (res->last < 0) {
+    return VP8BitCost(0, p0);
+  }
+
+  __asm__ volatile (
+    ".set      push                                                        \n\t"
+    ".set      noreorder                                                   \n\t"
+    "subu      %[temp1],        %[res_last],        %[n]                   \n\t"
+    "sll       %[temp0],        %[n],               1                      \n\t"
+    "blez      %[temp1],        2f                                         \n\t"
+    " addu     %[res_coeffs],   %[res_coeffs],      %[temp0]               \n\t"
+  "1:                                                                      \n\t"
+    "lh        %[v_reg],        0(%[res_coeffs])                           \n\t"
+    "addiu     %[n],            %[n],               1                      \n\t"
+    "negu      %[temp0],        %[v_reg]                                   \n\t"
+    "slti      %[temp1],        %[v_reg],           0                      \n\t"
+    "movn      %[v_reg],        %[temp0],           %[temp1]               \n\t"
+    "sltiu     %[temp0],        %[v_reg],           2                      \n\t"
+    "move      %[ctx_reg],      %[v_reg]                                   \n\t"
+    "movz      %[ctx_reg],      %[const_2],         %[temp0]               \n\t"
+    "sll       %[temp1],        %[v_reg],           1                      \n\t"
+    "addu      %[temp1],        %[temp1],           %[VP8LevelFixedCosts]  \n\t"
+    "lhu       %[temp1],        0(%[temp1])                                \n\t"
+    "slt       %[temp0],        %[v_reg],           %[const_max_level]     \n\t"
+    "movz      %[v_reg],        %[const_max_level], %[temp0]               \n\t"
+    "addu      %[cost],         %[cost],            %[temp1]               \n\t"
+    "sll       %[v_reg],        %[v_reg],           1                      \n\t"
+    "sll       %[ctx_reg],      %[ctx_reg],         2                      \n\t"
+    "addu      %[v_reg],        %[v_reg],           %[t]                   \n\t"
+    "lhu       %[temp0],        0(%[v_reg])                                \n\t"
+    "addu      %[p_costs],      %[p_costs],         %[inc_p_costs]         \n\t"
+    "addu      %[t],            %[p_costs],         %[ctx_reg]             \n\t"
+    "addu      %[cost],         %[cost],            %[temp0]               \n\t"
+    "addiu     %[res_coeffs],   %[res_coeffs],      2                      \n\t"
+    "bne       %[n],            %[res_last],        1b                     \n\t"
+    " lw       %[t],            0(%[t])                                    \n\t"
+  "2:                                                                      \n\t"
+    ".set      pop                                                         \n\t"
+    : [cost]"+&r"(cost), [t]"+&r"(t), [n]"+&r"(n), [v_reg]"=&r"(v_reg),
+      [ctx_reg]"=&r"(ctx_reg), [p_costs]"+&r"(p_costs), [temp0]"=&r"(temp0),
+      [temp1]"=&r"(temp1), [res_coeffs]"+&r"(res_coeffs)
+    : [const_2]"r"(const_2), [const_max_level]"r"(const_max_level),
+      [VP8LevelFixedCosts]"r"(VP8LevelFixedCosts), [res_last]"r"(res_last),
+      [inc_p_costs]"r"(inc_p_costs)
+    : "memory"
+  );
+
+  // Last coefficient is always non-zero
+  {
+    const int v = abs(res->coeffs[n]);
+    assert(v != 0);
+    cost += VP8LevelCost(t, v);
+    if (n < 15) {
+      const int b = VP8EncBands[n + 1];
+      const int ctx = (v == 1) ? 1 : 2;
+      const int last_p0 = res->prob[b][ctx][0];
+      cost += VP8BitCost(0, last_p0);
+    }
+  }
+  return cost;
+}
+
+static void SetResidualCoeffs_MIPS32(const int16_t* const coeffs,
+                                     VP8Residual* const res) {
+  const int16_t* p_coeffs = (int16_t*)coeffs;
+  int temp0, temp1, temp2, n, n1;
+  assert(res->first == 0 || coeffs[0] == 0);
+
+  __asm__ volatile (
+    ".set     push                                      \n\t"
+    ".set     noreorder                                 \n\t"
+    "addiu    %[p_coeffs],   %[p_coeffs],    28         \n\t"
+    "li       %[n],          15                         \n\t"
+    "li       %[temp2],      -1                         \n\t"
+  "0:                                                   \n\t"
+    "ulw      %[temp0],      0(%[p_coeffs])             \n\t"
+    "beqz     %[temp0],      1f                         \n\t"
+#if defined(WORDS_BIGENDIAN)
+    " sll     %[temp1],      %[temp0],       16         \n\t"
+#else
+    " srl     %[temp1],      %[temp0],       16         \n\t"
+#endif
+    "addiu    %[n1],         %[n],           -1         \n\t"
+    "movz     %[temp0],      %[n1],          %[temp1]   \n\t"
+    "movn     %[temp0],      %[n],           %[temp1]   \n\t"
+    "j        2f                                        \n\t"
+    " addiu   %[temp2],      %[temp0],       0          \n\t"
+  "1:                                                   \n\t"
+    "addiu    %[n],          %[n],           -2         \n\t"
+    "bgtz     %[n],          0b                         \n\t"
+    " addiu   %[p_coeffs],   %[p_coeffs],    -4         \n\t"
+  "2:                                                   \n\t"
+    ".set     pop                                       \n\t"
+    : [p_coeffs]"+&r"(p_coeffs), [temp0]"=&r"(temp0),
+      [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [n]"=&r"(n), [n1]"=&r"(n1)
+    :
+    : "memory"
+  );
+  res->last = temp2;
+  res->coeffs = coeffs;
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8EncDspCostInitMIPS32(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInitMIPS32(void) {
+  VP8GetResidualCost = GetResidualCost_MIPS32;
+  VP8SetResidualCoeffs = SetResidualCoeffs_MIPS32;
+}
+
+#else  // !WEBP_USE_MIPS32
+
+WEBP_DSP_INIT_STUB(VP8EncDspCostInitMIPS32)
+
+#endif  // WEBP_USE_MIPS32
diff --git a/media/libwebp/dsp/cost_mips_dsp_r2.c b/media/libwebp/dsp/cost_mips_dsp_r2.c
new file mode 100644
index 0000000000..e9ee99f6ac
--- /dev/null
+++ b/media/libwebp/dsp/cost_mips_dsp_r2.c
@@ -0,0 +1,107 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Author: Djordje Pesut (djordje.pesut@imgtec.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_MIPS_DSP_R2)
+
+#include "../enc/cost_enc.h"
+
+static int GetResidualCost_MIPSdspR2(int ctx0, const VP8Residual* const res) {
+  int temp0, temp1;
+  int v_reg, ctx_reg;
+  int n = res->first;
+  // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
+  int p0 = res->prob[n][ctx0][0];
+  CostArrayPtr const costs = res->costs;
+  const uint16_t* t = costs[n][ctx0];
+  // bit_cost(1, p0) is already incorporated in t[] tables, but only if ctx != 0
+  // (as required by the syntax). For ctx0 == 0, we need to add it here or it'll
+  // be missing during the loop.
+  int cost = (ctx0 == 0) ? VP8BitCost(1, p0) : 0;
+  const int16_t* res_coeffs = res->coeffs;
+  const int res_last = res->last;
+  const int const_max_level = MAX_VARIABLE_LEVEL;
+  const int const_2 = 2;
+  const uint16_t** p_costs = &costs[n][0];
+  const size_t inc_p_costs = NUM_CTX * sizeof(*p_costs);
+
+  if (res->last < 0) {
+    return VP8BitCost(0, p0);
+  }
+
+  __asm__ volatile (
+    ".set      push                                                     \n\t"
+    ".set      noreorder                                                \n\t"
+    "subu      %[temp1],        %[res_last],        %[n]                \n\t"
+    "blez      %[temp1],        2f                                      \n\t"
+    " nop                                                               \n\t"
+  "1:                                                                   \n\t"
+    "sll       %[temp0],        %[n],               1                   \n\t"
+    "lhx       %[v_reg],        %[temp0](%[res_coeffs])                 \n\t"
+    "addiu     %[n],            %[n],               1                   \n\t"
+    "absq_s.w  %[v_reg],        %[v_reg]                                \n\t"
+    "sltiu     %[temp0],        %[v_reg],           2                   \n\t"
+    "move      %[ctx_reg],      %[v_reg]                                \n\t"
+    "movz      %[ctx_reg],      %[const_2],         %[temp0]            \n\t"
+    "sll       %[temp1],        %[v_reg],           1                   \n\t"
+    "lhx       %[temp1],        %[temp1](%[VP8LevelFixedCosts])         \n\t"
+    "slt       %[temp0],        %[v_reg],           %[const_max_level]  \n\t"
+    "movz      %[v_reg],        %[const_max_level], %[temp0]            \n\t"
+    "addu      %[cost],         %[cost],            %[temp1]            \n\t"
+    "sll       %[v_reg],        %[v_reg],           1                   \n\t"
+    "sll       %[ctx_reg],      %[ctx_reg],         2                   \n\t"
+    "lhx       %[temp0],        %[v_reg](%[t])                          \n\t"
+    "addu      %[p_costs],      %[p_costs],         %[inc_p_costs]      \n\t"
+    "addu      %[t],            %[p_costs],         %[ctx_reg]          \n\t"
+    "addu      %[cost],         %[cost],            %[temp0]            \n\t"
+    "bne       %[n],            %[res_last],        1b                  \n\t"
+    " lw       %[t],            0(%[t])                                 \n\t"
+  "2:                                                                   \n\t"
+    ".set      pop                                                      \n\t"
+    : [cost]"+&r"(cost), [t]"+&r"(t), [n]"+&r"(n), [v_reg]"=&r"(v_reg),
+      [ctx_reg]"=&r"(ctx_reg), [p_costs]"+&r"(p_costs), [temp0]"=&r"(temp0),
+      [temp1]"=&r"(temp1)
+    : [const_2]"r"(const_2), [const_max_level]"r"(const_max_level),
+      [VP8LevelFixedCosts]"r"(VP8LevelFixedCosts), [res_last]"r"(res_last),
+      [res_coeffs]"r"(res_coeffs), [inc_p_costs]"r"(inc_p_costs)
+    : "memory"
+  );
+
+  // Last coefficient is always non-zero
+  {
+    const int v = abs(res->coeffs[n]);
+    assert(v != 0);
+    cost += VP8LevelCost(t, v);
+    if (n < 15) {
+      const int b = VP8EncBands[n + 1];
+      const int ctx = (v == 1) ? 1 : 2;
+      const int last_p0 = res->prob[b][ctx][0];
+      cost += VP8BitCost(0, last_p0);
+    }
+  }
+  return cost;
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8EncDspCostInitMIPSdspR2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInitMIPSdspR2(void) {
+  VP8GetResidualCost = GetResidualCost_MIPSdspR2;
+}
+
+#else  // !WEBP_USE_MIPS_DSP_R2
+
+WEBP_DSP_INIT_STUB(VP8EncDspCostInitMIPSdspR2)
+
+#endif  // WEBP_USE_MIPS_DSP_R2
diff --git a/media/libwebp/dsp/cost_neon.c b/media/libwebp/dsp/cost_neon.c
new file mode 100644
index 0000000000..78f715ff27
--- /dev/null
+++ b/media/libwebp/dsp/cost_neon.c
@@ -0,0 +1,122 @@
+// Copyright 2018 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// ARM NEON version of cost functions
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_NEON)
+
+#include "../dsp/neon.h"
+#include "../enc/cost_enc.h"
+
+static const uint8_t position[16] = { 1, 2,  3,  4,  5,  6,  7,  8,
+                                      9, 10, 11, 12, 13, 14, 15, 16 };
+
+static void SetResidualCoeffs_NEON(const int16_t* const coeffs,
+                                   VP8Residual* const res) {
+  const int16x8_t minus_one = vdupq_n_s16(-1);
+  const int16x8_t coeffs_0 = vld1q_s16(coeffs);
+  const int16x8_t coeffs_1 = vld1q_s16(coeffs + 8);
+  const uint16x8_t eob_0 = vtstq_s16(coeffs_0, minus_one);
+  const uint16x8_t eob_1 = vtstq_s16(coeffs_1, minus_one);
+  const uint8x16_t eob = vcombine_u8(vqmovn_u16(eob_0), vqmovn_u16(eob_1));
+  const uint8x16_t masked = vandq_u8(eob, vld1q_u8(position));
+
+#ifdef __aarch64__
+  res->last = vmaxvq_u8(masked) - 1;
+#else
+  const uint8x8_t eob_8x8 = vmax_u8(vget_low_u8(masked), vget_high_u8(masked));
+  const uint16x8_t eob_16x8 = vmovl_u8(eob_8x8);
+  const uint16x4_t eob_16x4 =
+      vmax_u16(vget_low_u16(eob_16x8), vget_high_u16(eob_16x8));
+  const uint32x4_t eob_32x4 = vmovl_u16(eob_16x4);
+  uint32x2_t eob_32x2 =
+      vmax_u32(vget_low_u32(eob_32x4), vget_high_u32(eob_32x4));
+  eob_32x2 = vpmax_u32(eob_32x2, eob_32x2);
+
+  vst1_lane_s32(&res->last, vreinterpret_s32_u32(eob_32x2), 0);
+  --res->last;
+#endif  // __aarch64__
+
+  res->coeffs = coeffs;
+}
+
+static int GetResidualCost_NEON(int ctx0, const VP8Residual* const res) {
+  uint8_t levels[16], ctxs[16];
+  uint16_t abs_levels[16];
+  int n = res->first;
+  // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
+  const int p0 = res->prob[n][ctx0][0];
+  CostArrayPtr const costs = res->costs;
+  const uint16_t* t = costs[n][ctx0];
+  // bit_cost(1, p0) is already incorporated in t[] tables, but only if ctx != 0
+  // (as required by the syntax). For ctx0 == 0, we need to add it here or it'll
+  // be missing during the loop.
+  int cost = (ctx0 == 0) ? VP8BitCost(1, p0) : 0;
+
+  if (res->last < 0) {
+    return VP8BitCost(0, p0);
+  }
+
+  {   // precompute clamped levels and contexts, packed to 8b.
+    const uint8x16_t kCst2 = vdupq_n_u8(2);
+    const uint8x16_t kCst67 = vdupq_n_u8(MAX_VARIABLE_LEVEL);
+    const int16x8_t c0 = vld1q_s16(res->coeffs);
+    const int16x8_t c1 = vld1q_s16(res->coeffs + 8);
+    const uint16x8_t E0 = vreinterpretq_u16_s16(vabsq_s16(c0));
+    const uint16x8_t E1 = vreinterpretq_u16_s16(vabsq_s16(c1));
+    const uint8x16_t F = vcombine_u8(vqmovn_u16(E0), vqmovn_u16(E1));
+    const uint8x16_t G = vminq_u8(F, kCst2);   // context = 0,1,2
+    const uint8x16_t H = vminq_u8(F, kCst67);  // clamp_level in [0..67]
+
+    vst1q_u8(ctxs, G);
+    vst1q_u8(levels, H);
+
+    vst1q_u16(abs_levels, E0);
+    vst1q_u16(abs_levels + 8, E1);
+  }
+  for (; n < res->last; ++n) {
+    const int ctx = ctxs[n];
+    const int level = levels[n];
+    const int flevel = abs_levels[n];   // full level
+    cost += VP8LevelFixedCosts[flevel] + t[level];  // simplified VP8LevelCost()
+    t = costs[n + 1][ctx];
+  }
+  // Last coefficient is always non-zero
+  {
+    const int level = levels[n];
+    const int flevel = abs_levels[n];
+    assert(flevel != 0);
+    cost += VP8LevelFixedCosts[flevel] + t[level];
+    if (n < 15) {
+      const int b = VP8EncBands[n + 1];
+      const int ctx = ctxs[n];
+      const int last_p0 = res->prob[b][ctx][0];
+      cost += VP8BitCost(0, last_p0);
+    }
+  }
+  return cost;
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8EncDspCostInitNEON(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInitNEON(void) {
+  VP8SetResidualCoeffs = SetResidualCoeffs_NEON;
+  VP8GetResidualCost = GetResidualCost_NEON;
+}
+
+#else  // !WEBP_USE_NEON
+
+WEBP_DSP_INIT_STUB(VP8EncDspCostInitNEON)
+
+#endif  // WEBP_USE_NEON
diff --git a/media/libwebp/dsp/cost_sse2.c b/media/libwebp/dsp/cost_sse2.c
new file mode 100644
index 0000000000..8cfe4e0091
--- /dev/null
+++ b/media/libwebp/dsp/cost_sse2.c
@@ -0,0 +1,119 @@
+// Copyright 2015 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// SSE2 version of cost functions
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_SSE2)
+#include <emmintrin.h>
+
+#include "../enc/cost_enc.h"
+#include "../enc/vp8i_enc.h"
+#include "../utils/utils.h"
+
+//------------------------------------------------------------------------------
+
+static void SetResidualCoeffs_SSE2(const int16_t* const coeffs,
+                                   VP8Residual* const res) {
+  const __m128i c0 = _mm_loadu_si128((const __m128i*)(coeffs + 0));
+  const __m128i c1 = _mm_loadu_si128((const __m128i*)(coeffs + 8));
+  // Use SSE2 to compare 16 values with a single instruction.
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i m0 = _mm_packs_epi16(c0, c1);
+  const __m128i m1 = _mm_cmpeq_epi8(m0, zero);
+  // Get the comparison results as a bitmask into 16bits. Negate the mask to get
+  // the position of entries that are not equal to zero. We don't need to mask
+  // out least significant bits according to res->first, since coeffs[0] is 0
+  // if res->first > 0.
+  const uint32_t mask = 0x0000ffffu ^ (uint32_t)_mm_movemask_epi8(m1);
+  // The position of the most significant non-zero bit indicates the position of
+  // the last non-zero value.
+  assert(res->first == 0 || coeffs[0] == 0);
+  res->last = mask ? BitsLog2Floor(mask) : -1;
+  res->coeffs = coeffs;
+}
+
+static int GetResidualCost_SSE2(int ctx0, const VP8Residual* const res) {
+  uint8_t levels[16], ctxs[16];
+  uint16_t abs_levels[16];
+  int n = res->first;
+  // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
+  const int p0 = res->prob[n][ctx0][0];
+  CostArrayPtr const costs = res->costs;
+  const uint16_t* t = costs[n][ctx0];
+  // bit_cost(1, p0) is already incorporated in t[] tables, but only if ctx != 0
+  // (as required by the syntax). For ctx0 == 0, we need to add it here or it'll
+  // be missing during the loop.
+  int cost = (ctx0 == 0) ? VP8BitCost(1, p0) : 0;
+
+  if (res->last < 0) {
+    return VP8BitCost(0, p0);
+  }
+
+  {   // precompute clamped levels and contexts, packed to 8b.
+    const __m128i zero = _mm_setzero_si128();
+    const __m128i kCst2 = _mm_set1_epi8(2);
+    const __m128i kCst67 = _mm_set1_epi8(MAX_VARIABLE_LEVEL);
+    const __m128i c0 = _mm_loadu_si128((const __m128i*)&res->coeffs[0]);
+    const __m128i c1 = _mm_loadu_si128((const __m128i*)&res->coeffs[8]);
+    const __m128i D0 = _mm_sub_epi16(zero, c0);
+    const __m128i D1 = _mm_sub_epi16(zero, c1);
+    const __m128i E0 = _mm_max_epi16(c0, D0);   // abs(v), 16b
+    const __m128i E1 = _mm_max_epi16(c1, D1);
+    const __m128i F = _mm_packs_epi16(E0, E1);
+    const __m128i G = _mm_min_epu8(F, kCst2);    // context = 0,1,2
+    const __m128i H = _mm_min_epu8(F, kCst67);   // clamp_level in [0..67]
+
+    _mm_storeu_si128((__m128i*)&ctxs[0], G);
+    _mm_storeu_si128((__m128i*)&levels[0], H);
+
+    _mm_storeu_si128((__m128i*)&abs_levels[0], E0);
+    _mm_storeu_si128((__m128i*)&abs_levels[8], E1);
+  }
+  for (; n < res->last; ++n) {
+    const int ctx = ctxs[n];
+    const int level = levels[n];
+    const int flevel = abs_levels[n];   // full level
+    cost += VP8LevelFixedCosts[flevel] + t[level];  // simplified VP8LevelCost()
+    t = costs[n + 1][ctx];
+  }
+  // Last coefficient is always non-zero
+  {
+    const int level = levels[n];
+    const int flevel = abs_levels[n];
+    assert(flevel != 0);
+    cost += VP8LevelFixedCosts[flevel] + t[level];
+    if (n < 15) {
+      const int b = VP8EncBands[n + 1];
+      const int ctx = ctxs[n];
+      const int last_p0 = res->prob[b][ctx][0];
+      cost += VP8BitCost(0, last_p0);
+    }
+  }
+  return cost;
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8EncDspCostInitSSE2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInitSSE2(void) {
+  VP8SetResidualCoeffs = SetResidualCoeffs_SSE2;
+  VP8GetResidualCost = GetResidualCost_SSE2;
+}
+
+#else  // !WEBP_USE_SSE2
+
+WEBP_DSP_INIT_STUB(VP8EncDspCostInitSSE2)
+
+#endif  // WEBP_USE_SSE2
diff --git a/media/libwebp/dsp/cpu.c b/media/libwebp/dsp/cpu.c
new file mode 100644
index 0000000000..ff57d90224
--- /dev/null
+++ b/media/libwebp/dsp/cpu.c
@@ -0,0 +1,253 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// CPU detection
+//
+// Author: Christian Duvivier (cduvivier@google.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_HAVE_NEON_RTCD)
+#include <stdio.h>
+#include <string.h>
+#endif
+
+#if defined(WEBP_ANDROID_NEON)
+#include <cpu-features.h>
+#endif
+
+//------------------------------------------------------------------------------
+// SSE2 detection.
+//
+
+// apple/darwin gcc-4.0.1 defines __PIC__, but not __pic__ with -fPIC.
+#if (defined(__pic__) || defined(__PIC__)) && defined(__i386__)
+static WEBP_INLINE void GetCPUInfo(int cpu_info[4], int info_type) {
+  __asm__ volatile (
+    "mov %%ebx, %%edi\n"
+    "cpuid\n"
+    "xchg %%edi, %%ebx\n"
+    : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
+    : "a"(info_type), "c"(0));
+}
+#elif defined(__x86_64__) && \
+      (defined(__code_model_medium__) || defined(__code_model_large__)) && \
+      defined(__PIC__)
+static WEBP_INLINE void GetCPUInfo(int cpu_info[4], int info_type) {
+  __asm__ volatile (
+    "xchg{q}\t{%%rbx}, %q1\n"
+    "cpuid\n"
+    "xchg{q}\t{%%rbx}, %q1\n"
+    : "=a"(cpu_info[0]), "=&r"(cpu_info[1]), "=c"(cpu_info[2]),
+      "=d"(cpu_info[3])
+    : "a"(info_type), "c"(0));
+}
+#elif defined(__i386__) || defined(__x86_64__)
+static WEBP_INLINE void GetCPUInfo(int cpu_info[4], int info_type) {
+  __asm__ volatile (
+    "cpuid\n"
+    : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
+    : "a"(info_type), "c"(0));
+}
+#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
+
+#if defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 150030729  // >= VS2008 SP1
+#include <intrin.h>
+#define GetCPUInfo(info, type) __cpuidex(info, type, 0)  // set ecx=0
+#define WEBP_HAVE_MSC_CPUID
+#elif _MSC_VER > 1310
+#include <intrin.h>
+#define GetCPUInfo __cpuid
+#define WEBP_HAVE_MSC_CPUID
+#endif
+
+#endif
+
+// NaCl has no support for xgetbv or the raw opcode.
+#if !defined(__native_client__) && (defined(__i386__) || defined(__x86_64__))
+static WEBP_INLINE uint64_t xgetbv(void) {
+  const uint32_t ecx = 0;
+  uint32_t eax, edx;
+  // Use the raw opcode for xgetbv for compatibility with older toolchains.
+  __asm__ volatile (
+    ".byte 0x0f, 0x01, 0xd0\n"
+    : "=a"(eax), "=d"(edx) : "c" (ecx));
+  return ((uint64_t)edx << 32) | eax;
+}
+#elif (defined(_M_X64) || defined(_M_IX86)) && \
+      defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 160040219  // >= VS2010 SP1
+#include <immintrin.h>
+#define xgetbv() _xgetbv(0)
+#elif defined(_MSC_VER) && defined(_M_IX86)
+static WEBP_INLINE uint64_t xgetbv(void) {
+  uint32_t eax_, edx_;
+  __asm {
+    xor ecx, ecx  // ecx = 0
+    // Use the raw opcode for xgetbv for compatibility with older toolchains.
+    __asm _emit 0x0f __asm _emit 0x01 __asm _emit 0xd0
+    mov eax_, eax
+    mov edx_, edx
+  }
+  return ((uint64_t)edx_ << 32) | eax_;
+}
+#else
+#define xgetbv() 0U  // no AVX for older x64 or unrecognized toolchains.
+#endif
+
+#if defined(__i386__) || defined(__x86_64__) || defined(WEBP_HAVE_MSC_CPUID)
+
+// helper function for run-time detection of slow SSSE3 platforms
+static int CheckSlowModel(int info) {
+  // Table listing display models with longer latencies for the bsr instruction
+  // (ie 2 cycles vs 10/16 cycles) and some SSSE3 instructions like pshufb.
+  // Refer to Intel 64 and IA-32 Architectures Optimization Reference Manual.
+  static const uint8_t kSlowModels[] = {
+    0x37, 0x4a, 0x4d,  // Silvermont Microarchitecture
+    0x1c, 0x26, 0x27   // Atom Microarchitecture
+  };
+  const uint32_t model = ((info & 0xf0000) >> 12) | ((info >> 4) & 0xf);
+  const uint32_t family = (info >> 8) & 0xf;
+  if (family == 0x06) {
+    size_t i;
+    for (i = 0; i < sizeof(kSlowModels) / sizeof(kSlowModels[0]); ++i) {
+      if (model == kSlowModels[i]) return 1;
+    }
+  }
+  return 0;
+}
+
+static int x86CPUInfo(CPUFeature feature) {
+  int max_cpuid_value;
+  int cpu_info[4];
+  int is_intel = 0;
+
+  // get the highest feature value cpuid supports
+  GetCPUInfo(cpu_info, 0);
+  max_cpuid_value = cpu_info[0];
+  if (max_cpuid_value < 1) {
+    return 0;
+  } else {
+    const int VENDOR_ID_INTEL_EBX = 0x756e6547;  // uneG
+    const int VENDOR_ID_INTEL_EDX = 0x49656e69;  // Ieni
+    const int VENDOR_ID_INTEL_ECX = 0x6c65746e;  // letn
+    is_intel = (cpu_info[1] == VENDOR_ID_INTEL_EBX &&
+                cpu_info[2] == VENDOR_ID_INTEL_ECX &&
+                cpu_info[3] == VENDOR_ID_INTEL_EDX);    // genuine Intel?
+  }
+
+  GetCPUInfo(cpu_info, 1);
+  if (feature == kSSE2) {
+    return !!(cpu_info[3] & (1 << 26));
+  }
+  if (feature == kSSE3) {
+    return !!(cpu_info[2] & (1 << 0));
+  }
+  if (feature == kSlowSSSE3) {
+    if (is_intel && (cpu_info[2] & (1 << 9))) {   // SSSE3?
+      return CheckSlowModel(cpu_info[0]);
+    }
+    return 0;
+  }
+
+  if (feature == kSSE4_1) {
+    return !!(cpu_info[2] & (1 << 19));
+  }
+  if (feature == kAVX) {
+    // bits 27 (OSXSAVE) & 28 (256-bit AVX)
+    if ((cpu_info[2] & 0x18000000) == 0x18000000) {
+      // XMM state and YMM state enabled by the OS.
+      return (xgetbv() & 0x6) == 0x6;
+    }
+  }
+  if (feature == kAVX2) {
+    if (x86CPUInfo(kAVX) && max_cpuid_value >= 7) {
+      GetCPUInfo(cpu_info, 7);
+      return !!(cpu_info[1] & (1 << 5));
+    }
+  }
+  return 0;
+}
+VP8CPUInfo VP8GetCPUInfo = x86CPUInfo;
+#elif defined(WEBP_ANDROID_NEON)  // NB: needs to be before generic NEON test.
+static int AndroidCPUInfo(CPUFeature feature) {
+  const AndroidCpuFamily cpu_family = android_getCpuFamily();
+  const uint64_t cpu_features = android_getCpuFeatures();
+  if (feature == kNEON) {
+    return cpu_family == ANDROID_CPU_FAMILY_ARM &&
+           (cpu_features & ANDROID_CPU_ARM_FEATURE_NEON) != 0;
+  }
+  return 0;
+}
+VP8CPUInfo VP8GetCPUInfo = AndroidCPUInfo;
+#elif defined(EMSCRIPTEN) // also needs to be before generic NEON test
+// Use compile flags as an indicator of SIMD support instead of a runtime check.
+static int wasmCPUInfo(CPUFeature feature) {
+  switch (feature) {
+#ifdef WEBP_HAVE_SSE2
+    case kSSE2:
+      return 1;
+#endif
+#ifdef WEBP_HAVE_SSE41
+    case kSSE3:
+    case kSlowSSSE3:
+    case kSSE4_1:
+      return 1;
+#endif
+#ifdef WEBP_HAVE_NEON
+    case kNEON:
+      return 1;
+#endif
+    default:
+      break;
+  }
+  return 0;
+}
+VP8CPUInfo VP8GetCPUInfo = wasmCPUInfo;
+#elif defined(WEBP_HAVE_NEON)
+// In most cases this function doesn't check for NEON support (it's assumed by
+// the configuration), but enables turning off NEON at runtime, for testing
+// purposes, by setting VP8DecGetCPUInfo = NULL.
+static int armCPUInfo(CPUFeature feature) {
+  if (feature != kNEON) return 0;
+#if defined(__linux__) && defined(WEBP_HAVE_NEON_RTCD)
+  {
+    int has_neon = 0;
+    char line[200];
+    FILE* const cpuinfo = fopen("/proc/cpuinfo", "r");
+    if (cpuinfo == NULL) return 0;
+    while (fgets(line, sizeof(line), cpuinfo)) {
+      if (!strncmp(line, "Features", 8)) {
+        if (strstr(line, " neon ") != NULL) {
+          has_neon = 1;
+          break;
+        }
+      }
+    }
+    fclose(cpuinfo);
+    return has_neon;
+  }
+#else
+  return 1;
+#endif
+}
+VP8CPUInfo VP8GetCPUInfo = armCPUInfo;
+#elif defined(WEBP_USE_MIPS32) || defined(WEBP_USE_MIPS_DSP_R2) || \
+      defined(WEBP_USE_MSA)
+static int mipsCPUInfo(CPUFeature feature) {
+  if ((feature == kMIPS32) || (feature == kMIPSdspR2) || (feature == kMSA)) {
+    return 1;
+  } else {
+    return 0;
+  }
+
+}
+VP8CPUInfo VP8GetCPUInfo = mipsCPUInfo;
+#else
+VP8CPUInfo VP8GetCPUInfo = NULL;
+#endif
diff --git a/media/libwebp/dsp/dec.c b/media/libwebp/dsp/dec.c
index a599d26bc0..5c94b6d403 100644
--- a/media/libwebp/dsp/dec.c
+++ b/media/libwebp/dsp/dec.c
@@ -807,10 +807,10 @@ WEBP_DSP_INIT_FUNC(VP8DspInit) {
 
   // If defined, use CPUInfo() to overwrite some pointers with faster versions.
   if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
+#if defined(WEBP_HAVE_SSE2)
     if (VP8GetCPUInfo(kSSE2)) {
       VP8DspInitSSE2();
-#if defined(WEBP_USE_SSE41)
+#if defined(WEBP_HAVE_SSE41)
       if (VP8GetCPUInfo(kSSE4_1)) {
         VP8DspInitSSE41();
       }
@@ -834,7 +834,7 @@ WEBP_DSP_INIT_FUNC(VP8DspInit) {
 #endif
   }
 
-#if defined(WEBP_USE_NEON)
+#if defined(WEBP_HAVE_NEON)
   if (WEBP_NEON_OMIT_C_CODE ||
       (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
     VP8DspInitNEON();
diff --git a/media/libwebp/dsp/dec_mips32.c b/media/libwebp/dsp/dec_mips32.c
new file mode 100644
index 0000000000..2d55214faa
--- /dev/null
+++ b/media/libwebp/dsp/dec_mips32.c
@@ -0,0 +1,587 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MIPS version of dsp functions
+//
+// Author(s):  Djordje Pesut    (djordje.pesut@imgtec.com)
+//             Jovan Zelincevic (jovan.zelincevic@imgtec.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_MIPS32)
+
+#include "../dsp/mips_macro.h"
+
+static const int kC1 = 20091 + (1 << 16);
+static const int kC2 = 35468;
+
+static WEBP_INLINE int abs_mips32(int x) {
+  const int sign = x >> 31;
+  return (x ^ sign) - sign;
+}
+
+// 4 pixels in, 2 pixels out
+static WEBP_INLINE void do_filter2(uint8_t* p, int step) {
+  const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
+  const int a = 3 * (q0 - p0) + VP8ksclip1[p1 - q1];
+  const int a1 = VP8ksclip2[(a + 4) >> 3];
+  const int a2 = VP8ksclip2[(a + 3) >> 3];
+  p[-step] = VP8kclip1[p0 + a2];
+  p[    0] = VP8kclip1[q0 - a1];
+}
+
+// 4 pixels in, 4 pixels out
+static WEBP_INLINE void do_filter4(uint8_t* p, int step) {
+  const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
+  const int a = 3 * (q0 - p0);
+  const int a1 = VP8ksclip2[(a + 4) >> 3];
+  const int a2 = VP8ksclip2[(a + 3) >> 3];
+  const int a3 = (a1 + 1) >> 1;
+  p[-2 * step] = VP8kclip1[p1 + a3];
+  p[-    step] = VP8kclip1[p0 + a2];
+  p[        0] = VP8kclip1[q0 - a1];
+  p[     step] = VP8kclip1[q1 - a3];
+}
+
+// 6 pixels in, 6 pixels out
+static WEBP_INLINE void do_filter6(uint8_t* p, int step) {
+  const int p2 = p[-3 * step], p1 = p[-2 * step], p0 = p[-step];
+  const int q0 = p[0], q1 = p[step], q2 = p[2 * step];
+  const int a = VP8ksclip1[3 * (q0 - p0) + VP8ksclip1[p1 - q1]];
+  // a is in [-128,127], a1 in [-27,27], a2 in [-18,18] and a3 in [-9,9]
+  const int a1 = (27 * a + 63) >> 7;  // eq. to ((3 * a + 7) * 9) >> 7
+  const int a2 = (18 * a + 63) >> 7;  // eq. to ((2 * a + 7) * 9) >> 7
+  const int a3 = (9  * a + 63) >> 7;  // eq. to ((1 * a + 7) * 9) >> 7
+  p[-3 * step] = VP8kclip1[p2 + a3];
+  p[-2 * step] = VP8kclip1[p1 + a2];
+  p[-    step] = VP8kclip1[p0 + a1];
+  p[        0] = VP8kclip1[q0 - a1];
+  p[     step] = VP8kclip1[q1 - a2];
+  p[ 2 * step] = VP8kclip1[q2 - a3];
+}
+
+static WEBP_INLINE int hev(const uint8_t* p, int step, int thresh) {
+  const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
+  return (abs_mips32(p1 - p0) > thresh) || (abs_mips32(q1 - q0) > thresh);
+}
+
+static WEBP_INLINE int needs_filter(const uint8_t* p, int step, int t) {
+  const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
+  return ((4 * abs_mips32(p0 - q0) + abs_mips32(p1 - q1)) <= t);
+}
+
+static WEBP_INLINE int needs_filter2(const uint8_t* p,
+                                     int step, int t, int it) {
+  const int p3 = p[-4 * step], p2 = p[-3 * step];
+  const int p1 = p[-2 * step], p0 = p[-step];
+  const int q0 = p[0], q1 = p[step], q2 = p[2 * step], q3 = p[3 * step];
+  if ((4 * abs_mips32(p0 - q0) + abs_mips32(p1 - q1)) > t) {
+    return 0;
+  }
+  return abs_mips32(p3 - p2) <= it && abs_mips32(p2 - p1) <= it &&
+         abs_mips32(p1 - p0) <= it && abs_mips32(q3 - q2) <= it &&
+         abs_mips32(q2 - q1) <= it && abs_mips32(q1 - q0) <= it;
+}
+
+static WEBP_INLINE void FilterLoop26(uint8_t* p,
+                                     int hstride, int vstride, int size,
+                                     int thresh, int ithresh, int hev_thresh) {
+  const int thresh2 = 2 * thresh + 1;
+  while (size-- > 0) {
+    if (needs_filter2(p, hstride, thresh2, ithresh)) {
+      if (hev(p, hstride, hev_thresh)) {
+        do_filter2(p, hstride);
+      } else {
+        do_filter6(p, hstride);
+      }
+    }
+    p += vstride;
+  }
+}
+
+static WEBP_INLINE void FilterLoop24(uint8_t* p,
+                                     int hstride, int vstride, int size,
+                                     int thresh, int ithresh, int hev_thresh) {
+  const int thresh2 = 2 * thresh + 1;
+  while (size-- > 0) {
+    if (needs_filter2(p, hstride, thresh2, ithresh)) {
+      if (hev(p, hstride, hev_thresh)) {
+        do_filter2(p, hstride);
+      } else {
+        do_filter4(p, hstride);
+      }
+    }
+    p += vstride;
+  }
+}
+
+// on macroblock edges
+static void VFilter16(uint8_t* p, int stride,
+                      int thresh, int ithresh, int hev_thresh) {
+  FilterLoop26(p, stride, 1, 16, thresh, ithresh, hev_thresh);
+}
+
+static void HFilter16(uint8_t* p, int stride,
+                      int thresh, int ithresh, int hev_thresh) {
+  FilterLoop26(p, 1, stride, 16, thresh, ithresh, hev_thresh);
+}
+
+// 8-pixels wide variant, for chroma filtering
+static void VFilter8(uint8_t* u, uint8_t* v, int stride,
+                     int thresh, int ithresh, int hev_thresh) {
+  FilterLoop26(u, stride, 1, 8, thresh, ithresh, hev_thresh);
+  FilterLoop26(v, stride, 1, 8, thresh, ithresh, hev_thresh);
+}
+
+static void HFilter8(uint8_t* u, uint8_t* v, int stride,
+                     int thresh, int ithresh, int hev_thresh) {
+  FilterLoop26(u, 1, stride, 8, thresh, ithresh, hev_thresh);
+  FilterLoop26(v, 1, stride, 8, thresh, ithresh, hev_thresh);
+}
+
+static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
+                      int thresh, int ithresh, int hev_thresh) {
+  FilterLoop24(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
+  FilterLoop24(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
+}
+
+static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
+                      int thresh, int ithresh, int hev_thresh) {
+  FilterLoop24(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
+  FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
+}
+
+// on three inner edges
+static void VFilter16i(uint8_t* p, int stride,
+                       int thresh, int ithresh, int hev_thresh) {
+  int k;
+  for (k = 3; k > 0; --k) {
+    p += 4 * stride;
+    FilterLoop24(p, stride, 1, 16, thresh, ithresh, hev_thresh);
+  }
+}
+
+static void HFilter16i(uint8_t* p, int stride,
+                       int thresh, int ithresh, int hev_thresh) {
+  int k;
+  for (k = 3; k > 0; --k) {
+    p += 4;
+    FilterLoop24(p, 1, stride, 16, thresh, ithresh, hev_thresh);
+  }
+}
+
+//------------------------------------------------------------------------------
+// Simple In-loop filtering (Paragraph 15.2)
+
+static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
+  int i;
+  const int thresh2 = 2 * thresh + 1;
+  for (i = 0; i < 16; ++i) {
+    if (needs_filter(p + i, stride, thresh2)) {
+      do_filter2(p + i, stride);
+    }
+  }
+}
+
+static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
+  int i;
+  const int thresh2 = 2 * thresh + 1;
+  for (i = 0; i < 16; ++i) {
+    if (needs_filter(p + i * stride, 1, thresh2)) {
+      do_filter2(p + i * stride, 1);
+    }
+  }
+}
+
+static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
+  int k;
+  for (k = 3; k > 0; --k) {
+    p += 4 * stride;
+    SimpleVFilter16(p, stride, thresh);
+  }
+}
+
+static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
+  int k;
+  for (k = 3; k > 0; --k) {
+    p += 4;
+    SimpleHFilter16(p, stride, thresh);
+  }
+}
+
+static void TransformOne(const int16_t* in, uint8_t* dst) {
+  int temp0, temp1, temp2, temp3, temp4;
+  int temp5, temp6, temp7, temp8, temp9;
+  int temp10, temp11, temp12, temp13, temp14;
+  int temp15, temp16, temp17, temp18;
+  int16_t* p_in = (int16_t*)in;
+
+  // loops unrolled and merged to avoid usage of tmp buffer
+  // and to reduce number of stalls. MUL macro is written
+  // in assembler and inlined
+  __asm__ volatile(
+    "lh       %[temp0],  0(%[in])                      \n\t"
+    "lh       %[temp8],  16(%[in])                     \n\t"
+    "lh       %[temp4],  8(%[in])                      \n\t"
+    "lh       %[temp12], 24(%[in])                     \n\t"
+    "addu     %[temp16], %[temp0],  %[temp8]           \n\t"
+    "subu     %[temp0],  %[temp0],  %[temp8]           \n\t"
+    "mul      %[temp8],  %[temp4],  %[kC2]             \n\t"
+    "mul      %[temp17], %[temp12], %[kC1]             \n\t"
+    "mul      %[temp4],  %[temp4],  %[kC1]             \n\t"
+    "mul      %[temp12], %[temp12], %[kC2]             \n\t"
+    "lh       %[temp1],  2(%[in])                      \n\t"
+    "lh       %[temp5],  10(%[in])                     \n\t"
+    "lh       %[temp9],  18(%[in])                     \n\t"
+    "lh       %[temp13], 26(%[in])                     \n\t"
+    "sra      %[temp8],  %[temp8],  16                 \n\t"
+    "sra      %[temp17], %[temp17], 16                 \n\t"
+    "sra      %[temp4],  %[temp4],  16                 \n\t"
+    "sra      %[temp12], %[temp12], 16                 \n\t"
+    "lh       %[temp2],  4(%[in])                      \n\t"
+    "lh       %[temp6],  12(%[in])                     \n\t"
+    "lh       %[temp10], 20(%[in])                     \n\t"
+    "lh       %[temp14], 28(%[in])                     \n\t"
+    "subu     %[temp17], %[temp8],  %[temp17]          \n\t"
+    "addu     %[temp4],  %[temp4],  %[temp12]          \n\t"
+    "addu     %[temp8],  %[temp16], %[temp4]           \n\t"
+    "subu     %[temp4],  %[temp16], %[temp4]           \n\t"
+    "addu     %[temp16], %[temp1],  %[temp9]           \n\t"
+    "subu     %[temp1],  %[temp1],  %[temp9]           \n\t"
+    "lh       %[temp3],  6(%[in])                      \n\t"
+    "lh       %[temp7],  14(%[in])                     \n\t"
+    "lh       %[temp11], 22(%[in])                     \n\t"
+    "lh       %[temp15], 30(%[in])                     \n\t"
+    "addu     %[temp12], %[temp0],  %[temp17]          \n\t"
+    "subu     %[temp0],  %[temp0],  %[temp17]          \n\t"
+    "mul      %[temp9],  %[temp5],  %[kC2]             \n\t"
+    "mul      %[temp17], %[temp13], %[kC1]             \n\t"
+    "mul      %[temp5],  %[temp5],  %[kC1]             \n\t"
+    "mul      %[temp13], %[temp13], %[kC2]             \n\t"
+    "sra      %[temp9],  %[temp9],  16                 \n\t"
+    "sra      %[temp17], %[temp17], 16                 \n\t"
+    "subu     %[temp17], %[temp9],  %[temp17]          \n\t"
+    "sra      %[temp5],  %[temp5],  16                 \n\t"
+    "sra      %[temp13], %[temp13], 16                 \n\t"
+    "addu     %[temp5],  %[temp5],  %[temp13]          \n\t"
+    "addu     %[temp13], %[temp1],  %[temp17]          \n\t"
+    "subu     %[temp1],  %[temp1],  %[temp17]          \n\t"
+    "mul      %[temp17], %[temp14], %[kC1]             \n\t"
+    "mul      %[temp14], %[temp14], %[kC2]             \n\t"
+    "addu     %[temp9],  %[temp16], %[temp5]           \n\t"
+    "subu     %[temp5],  %[temp16], %[temp5]           \n\t"
+    "addu     %[temp16], %[temp2],  %[temp10]          \n\t"
+    "subu     %[temp2],  %[temp2],  %[temp10]          \n\t"
+    "mul      %[temp10], %[temp6],  %[kC2]             \n\t"
+    "mul      %[temp6],  %[temp6],  %[kC1]             \n\t"
+    "sra      %[temp17], %[temp17], 16                 \n\t"
+    "sra      %[temp14], %[temp14], 16                 \n\t"
+    "sra      %[temp10], %[temp10], 16                 \n\t"
+    "sra      %[temp6],  %[temp6],  16                 \n\t"
+    "subu     %[temp17], %[temp10], %[temp17]          \n\t"
+    "addu     %[temp6],  %[temp6],  %[temp14]          \n\t"
+    "addu     %[temp10], %[temp16], %[temp6]           \n\t"
+    "subu     %[temp6],  %[temp16], %[temp6]           \n\t"
+    "addu     %[temp14], %[temp2],  %[temp17]          \n\t"
+    "subu     %[temp2],  %[temp2],  %[temp17]          \n\t"
+    "mul      %[temp17], %[temp15], %[kC1]             \n\t"
+    "mul      %[temp15], %[temp15], %[kC2]             \n\t"
+    "addu     %[temp16], %[temp3],  %[temp11]          \n\t"
+    "subu     %[temp3],  %[temp3],  %[temp11]          \n\t"
+    "mul      %[temp11], %[temp7],  %[kC2]             \n\t"
+    "mul      %[temp7],  %[temp7],  %[kC1]             \n\t"
+    "addiu    %[temp8],  %[temp8],  4                  \n\t"
+    "addiu    %[temp12], %[temp12], 4                  \n\t"
+    "addiu    %[temp0],  %[temp0],  4                  \n\t"
+    "addiu    %[temp4],  %[temp4],  4                  \n\t"
+    "sra      %[temp17], %[temp17], 16                 \n\t"
+    "sra      %[temp15], %[temp15], 16                 \n\t"
+    "sra      %[temp11], %[temp11], 16                 \n\t"
+    "sra      %[temp7],  %[temp7],  16                 \n\t"
+    "subu     %[temp17], %[temp11], %[temp17]          \n\t"
+    "addu     %[temp7],  %[temp7],  %[temp15]          \n\t"
+    "addu     %[temp15], %[temp3],  %[temp17]          \n\t"
+    "subu     %[temp3],  %[temp3],  %[temp17]          \n\t"
+    "addu     %[temp11], %[temp16], %[temp7]           \n\t"
+    "subu     %[temp7],  %[temp16], %[temp7]           \n\t"
+    "addu     %[temp16], %[temp8],  %[temp10]          \n\t"
+    "subu     %[temp8],  %[temp8],  %[temp10]          \n\t"
+    "mul      %[temp10], %[temp9],  %[kC2]             \n\t"
+    "mul      %[temp17], %[temp11], %[kC1]             \n\t"
+    "mul      %[temp9],  %[temp9],  %[kC1]             \n\t"
+    "mul      %[temp11], %[temp11], %[kC2]             \n\t"
+    "sra      %[temp10], %[temp10], 16                 \n\t"
+    "sra      %[temp17], %[temp17], 16                 \n\t"
+    "sra      %[temp9],  %[temp9],  16                 \n\t"
+    "sra      %[temp11], %[temp11], 16                 \n\t"
+    "subu     %[temp17], %[temp10], %[temp17]          \n\t"
+    "addu     %[temp11], %[temp9],  %[temp11]          \n\t"
+    "addu     %[temp10], %[temp12], %[temp14]          \n\t"
+    "subu     %[temp12], %[temp12], %[temp14]          \n\t"
+    "mul      %[temp14], %[temp13], %[kC2]             \n\t"
+    "mul      %[temp9],  %[temp15], %[kC1]             \n\t"
+    "mul      %[temp13], %[temp13], %[kC1]             \n\t"
+    "mul      %[temp15], %[temp15], %[kC2]             \n\t"
+    "sra      %[temp14], %[temp14], 16                 \n\t"
+    "sra      %[temp9],  %[temp9],  16                 \n\t"
+    "sra      %[temp13], %[temp13], 16                 \n\t"
+    "sra      %[temp15], %[temp15], 16                 \n\t"
+    "subu     %[temp9],  %[temp14], %[temp9]           \n\t"
+    "addu     %[temp15], %[temp13], %[temp15]          \n\t"
+    "addu     %[temp14], %[temp0],  %[temp2]           \n\t"
+    "subu     %[temp0],  %[temp0],  %[temp2]           \n\t"
+    "mul      %[temp2],  %[temp1],  %[kC2]             \n\t"
+    "mul      %[temp13], %[temp3],  %[kC1]             \n\t"
+    "mul      %[temp1],  %[temp1],  %[kC1]             \n\t"
+    "mul      %[temp3],  %[temp3],  %[kC2]             \n\t"
+    "sra      %[temp2],  %[temp2],  16                 \n\t"
+    "sra      %[temp13], %[temp13], 16                 \n\t"
+    "sra      %[temp1],  %[temp1],  16                 \n\t"
+    "sra      %[temp3],  %[temp3],  16                 \n\t"
+    "subu     %[temp13], %[temp2],  %[temp13]          \n\t"
+    "addu     %[temp3],  %[temp1],  %[temp3]           \n\t"
+    "addu     %[temp2],  %[temp4],  %[temp6]           \n\t"
+    "subu     %[temp4],  %[temp4],  %[temp6]           \n\t"
+    "mul      %[temp6],  %[temp5],  %[kC2]             \n\t"
+    "mul      %[temp1],  %[temp7],  %[kC1]             \n\t"
+    "mul      %[temp5],  %[temp5],  %[kC1]             \n\t"
+    "mul      %[temp7],  %[temp7],  %[kC2]             \n\t"
+    "sra      %[temp6],  %[temp6],  16                 \n\t"
+    "sra      %[temp1],  %[temp1],  16                 \n\t"
+    "sra      %[temp5],  %[temp5],  16                 \n\t"
+    "sra      %[temp7],  %[temp7],  16                 \n\t"
+    "subu     %[temp1],  %[temp6],  %[temp1]           \n\t"
+    "addu     %[temp7],  %[temp5],  %[temp7]           \n\t"
+    "addu     %[temp5],  %[temp16], %[temp11]          \n\t"
+    "subu     %[temp16], %[temp16], %[temp11]          \n\t"
+    "addu     %[temp11], %[temp8],  %[temp17]          \n\t"
+    "subu     %[temp8],  %[temp8],  %[temp17]          \n\t"
+    "sra      %[temp5],  %[temp5],  3                  \n\t"
+    "sra      %[temp16], %[temp16], 3                  \n\t"
+    "sra      %[temp11], %[temp11], 3                  \n\t"
+    "sra      %[temp8],  %[temp8],  3                  \n\t"
+    "addu     %[temp17], %[temp10], %[temp15]          \n\t"
+    "subu     %[temp10], %[temp10], %[temp15]          \n\t"
+    "addu     %[temp15], %[temp12], %[temp9]           \n\t"
+    "subu     %[temp12], %[temp12], %[temp9]           \n\t"
+    "sra      %[temp17], %[temp17], 3                  \n\t"
+    "sra      %[temp10], %[temp10], 3                  \n\t"
+    "sra      %[temp15], %[temp15], 3                  \n\t"
+    "sra      %[temp12], %[temp12], 3                  \n\t"
+    "addu     %[temp9],  %[temp14], %[temp3]           \n\t"
+    "subu     %[temp14], %[temp14], %[temp3]           \n\t"
+    "addu     %[temp3],  %[temp0],  %[temp13]          \n\t"
+    "subu     %[temp0],  %[temp0],  %[temp13]          \n\t"
+    "sra      %[temp9],  %[temp9],  3                  \n\t"
+    "sra      %[temp14], %[temp14], 3                  \n\t"
+    "sra      %[temp3],  %[temp3],  3                  \n\t"
+    "sra      %[temp0],  %[temp0],  3                  \n\t"
+    "addu     %[temp13], %[temp2],  %[temp7]           \n\t"
+    "subu     %[temp2],  %[temp2],  %[temp7]           \n\t"
+    "addu     %[temp7],  %[temp4],  %[temp1]           \n\t"
+    "subu     %[temp4],  %[temp4],  %[temp1]           \n\t"
+    "sra      %[temp13], %[temp13], 3                  \n\t"
+    "sra      %[temp2],  %[temp2],  3                  \n\t"
+    "sra      %[temp7],  %[temp7],  3                  \n\t"
+    "sra      %[temp4],  %[temp4],  3                  \n\t"
+    "addiu    %[temp6],  $zero,     255                \n\t"
+    "lbu      %[temp1],  0+0*" XSTR(BPS) "(%[dst])     \n\t"
+    "addu     %[temp1],  %[temp1],  %[temp5]           \n\t"
+    "sra      %[temp5],  %[temp1],  8                  \n\t"
+    "sra      %[temp18], %[temp1],  31                 \n\t"
+    "beqz     %[temp5],  1f                            \n\t"
+    "xor      %[temp1],  %[temp1],  %[temp1]           \n\t"
+    "movz     %[temp1],  %[temp6],  %[temp18]          \n\t"
+  "1:                                                  \n\t"
+    "lbu      %[temp18], 1+0*" XSTR(BPS) "(%[dst])     \n\t"
+    "sb       %[temp1],  0+0*" XSTR(BPS) "(%[dst])     \n\t"
+    "addu     %[temp18], %[temp18], %[temp11]          \n\t"
+    "sra      %[temp11], %[temp18], 8                  \n\t"
+    "sra      %[temp1],  %[temp18], 31                 \n\t"
+    "beqz     %[temp11], 2f                            \n\t"
+    "xor      %[temp18], %[temp18], %[temp18]          \n\t"
+    "movz     %[temp18], %[temp6],  %[temp1]           \n\t"
+  "2:                                                  \n\t"
+    "lbu      %[temp1],  2+0*" XSTR(BPS) "(%[dst])     \n\t"
+    "sb       %[temp18], 1+0*" XSTR(BPS) "(%[dst])     \n\t"
+    "addu     %[temp1],  %[temp1],  %[temp8]           \n\t"
+    "sra      %[temp8],  %[temp1],  8                  \n\t"
+    "sra      %[temp18], %[temp1],  31                 \n\t"
+    "beqz     %[temp8],  3f                            \n\t"
+    "xor      %[temp1],  %[temp1],  %[temp1]           \n\t"
+    "movz     %[temp1],  %[temp6],  %[temp18]          \n\t"
+  "3:                                                  \n\t"
+    "lbu      %[temp18], 3+0*" XSTR(BPS) "(%[dst])     \n\t"
+    "sb       %[temp1],  2+0*" XSTR(BPS) "(%[dst])     \n\t"
+    "addu     %[temp18], %[temp18], %[temp16]          \n\t"
+    "sra      %[temp16], %[temp18], 8                  \n\t"
+    "sra      %[temp1],  %[temp18], 31                 \n\t"
+    "beqz     %[temp16], 4f                            \n\t"
+    "xor      %[temp18], %[temp18], %[temp18]          \n\t"
+    "movz     %[temp18], %[temp6],  %[temp1]           \n\t"
+  "4:                                                  \n\t"
+    "sb       %[temp18], 3+0*" XSTR(BPS) "(%[dst])     \n\t"
+    "lbu      %[temp5],  0+1*" XSTR(BPS) "(%[dst])     \n\t"
+    "lbu      %[temp8],  1+1*" XSTR(BPS) "(%[dst])     \n\t"
+    "lbu      %[temp11], 2+1*" XSTR(BPS) "(%[dst])     \n\t"
+    "lbu      %[temp16], 3+1*" XSTR(BPS) "(%[dst])     \n\t"
+    "addu     %[temp5],  %[temp5],  %[temp17]          \n\t"
+    "addu     %[temp8],  %[temp8],  %[temp15]          \n\t"
+    "addu     %[temp11], %[temp11], %[temp12]          \n\t"
+    "addu     %[temp16], %[temp16], %[temp10]          \n\t"
+    "sra      %[temp18], %[temp5],  8                  \n\t"
+    "sra      %[temp1],  %[temp5],  31                 \n\t"
+    "beqz     %[temp18], 5f                            \n\t"
+    "xor      %[temp5],  %[temp5],  %[temp5]           \n\t"
+    "movz     %[temp5],  %[temp6],  %[temp1]           \n\t"
+  "5:                                                  \n\t"
+    "sra      %[temp18], %[temp8],  8                  \n\t"
+    "sra      %[temp1],  %[temp8],  31                 \n\t"
+    "beqz     %[temp18], 6f                            \n\t"
+    "xor      %[temp8],  %[temp8],  %[temp8]           \n\t"
+    "movz     %[temp8],  %[temp6],  %[temp1]           \n\t"
+  "6:                                                  \n\t"
+    "sra      %[temp18], %[temp11], 8                  \n\t"
+    "sra      %[temp1],  %[temp11], 31                 \n\t"
+    "sra      %[temp17], %[temp16], 8                  \n\t"
+    "sra      %[temp15], %[temp16], 31                 \n\t"
+    "beqz     %[temp18], 7f                            \n\t"
+    "xor      %[temp11], %[temp11], %[temp11]          \n\t"
+    "movz     %[temp11], %[temp6],  %[temp1]           \n\t"
+  "7:                                                  \n\t"
+    "beqz     %[temp17], 8f                            \n\t"
+    "xor      %[temp16], %[temp16], %[temp16]          \n\t"
+    "movz     %[temp16], %[temp6],  %[temp15]          \n\t"
+  "8:                                                  \n\t"
+    "sb       %[temp5],  0+1*" XSTR(BPS) "(%[dst])     \n\t"
+    "sb       %[temp8],  1+1*" XSTR(BPS) "(%[dst])     \n\t"
+    "sb       %[temp11], 2+1*" XSTR(BPS) "(%[dst])     \n\t"
+    "sb       %[temp16], 3+1*" XSTR(BPS) "(%[dst])     \n\t"
+    "lbu      %[temp5],  0+2*" XSTR(BPS) "(%[dst])     \n\t"
+    "lbu      %[temp8],  1+2*" XSTR(BPS) "(%[dst])     \n\t"
+    "lbu      %[temp11], 2+2*" XSTR(BPS) "(%[dst])     \n\t"
+    "lbu      %[temp16], 3+2*" XSTR(BPS) "(%[dst])     \n\t"
+    "addu     %[temp5],  %[temp5],  %[temp9]           \n\t"
+    "addu     %[temp8],  %[temp8],  %[temp3]           \n\t"
+    "addu     %[temp11], %[temp11], %[temp0]           \n\t"
+    "addu     %[temp16], %[temp16], %[temp14]          \n\t"
+    "sra      %[temp18], %[temp5],  8                  \n\t"
+    "sra      %[temp1],  %[temp5],  31                 \n\t"
+    "sra      %[temp17], %[temp8],  8                  \n\t"
+    "sra      %[temp15], %[temp8],  31                 \n\t"
+    "sra      %[temp12], %[temp11], 8                  \n\t"
+    "sra      %[temp10], %[temp11], 31                 \n\t"
+    "sra      %[temp9],  %[temp16], 8                  \n\t"
+    "sra      %[temp3],  %[temp16], 31                 \n\t"
+    "beqz     %[temp18], 9f                            \n\t"
+    "xor      %[temp5],  %[temp5],  %[temp5]           \n\t"
+    "movz     %[temp5],  %[temp6],  %[temp1]           \n\t"
+  "9:                                                  \n\t"
+    "beqz     %[temp17], 10f                           \n\t"
+    "xor      %[temp8],  %[temp8],  %[temp8]           \n\t"
+    "movz     %[temp8],  %[temp6],  %[temp15]          \n\t"
+  "10:                                                 \n\t"
+    "beqz     %[temp12], 11f                           \n\t"
+    "xor      %[temp11], %[temp11], %[temp11]          \n\t"
+    "movz     %[temp11], %[temp6],  %[temp10]          \n\t"
+  "11:                                                 \n\t"
+    "beqz     %[temp9],  12f                           \n\t"
+    "xor      %[temp16], %[temp16], %[temp16]          \n\t"
+    "movz     %[temp16], %[temp6],  %[temp3]           \n\t"
+  "12:                                                 \n\t"
+    "sb       %[temp5],  0+2*" XSTR(BPS) "(%[dst])     \n\t"
+    "sb       %[temp8],  1+2*" XSTR(BPS) "(%[dst])     \n\t"
+    "sb       %[temp11], 2+2*" XSTR(BPS) "(%[dst])     \n\t"
+    "sb       %[temp16], 3+2*" XSTR(BPS) "(%[dst])     \n\t"
+    "lbu      %[temp5],  0+3*" XSTR(BPS) "(%[dst])     \n\t"
+    "lbu      %[temp8],  1+3*" XSTR(BPS) "(%[dst])     \n\t"
+    "lbu      %[temp11], 2+3*" XSTR(BPS) "(%[dst])     \n\t"
+    "lbu      %[temp16], 3+3*" XSTR(BPS) "(%[dst])     \n\t"
+    "addu     %[temp5],  %[temp5],  %[temp13]          \n\t"
+    "addu     %[temp8],  %[temp8],  %[temp7]           \n\t"
+    "addu     %[temp11], %[temp11], %[temp4]           \n\t"
+    "addu     %[temp16], %[temp16], %[temp2]           \n\t"
+    "sra      %[temp18], %[temp5],  8                  \n\t"
+    "sra      %[temp1],  %[temp5],  31                 \n\t"
+    "sra      %[temp17], %[temp8],  8                  \n\t"
+    "sra      %[temp15], %[temp8],  31                 \n\t"
+    "sra      %[temp12], %[temp11], 8                  \n\t"
+    "sra      %[temp10], %[temp11], 31                 \n\t"
+    "sra      %[temp9],  %[temp16], 8                  \n\t"
+    "sra      %[temp3],  %[temp16], 31                 \n\t"
+    "beqz     %[temp18], 13f                           \n\t"
+    "xor      %[temp5],  %[temp5],  %[temp5]           \n\t"
+    "movz     %[temp5],  %[temp6],  %[temp1]           \n\t"
+  "13:                                                 \n\t"
+    "beqz     %[temp17], 14f                           \n\t"
+    "xor      %[temp8],  %[temp8],  %[temp8]           \n\t"
+    "movz     %[temp8],  %[temp6],  %[temp15]          \n\t"
+  "14:                                                 \n\t"
+    "beqz     %[temp12], 15f                           \n\t"
+    "xor      %[temp11], %[temp11], %[temp11]          \n\t"
+    "movz     %[temp11], %[temp6],  %[temp10]          \n\t"
+  "15:                                                 \n\t"
+    "beqz     %[temp9],  16f                           \n\t"
+    "xor      %[temp16], %[temp16], %[temp16]          \n\t"
+    "movz     %[temp16], %[temp6],  %[temp3]           \n\t"
+  "16:                                                 \n\t"
+    "sb       %[temp5],  0+3*" XSTR(BPS) "(%[dst])     \n\t"
+    "sb       %[temp8],  1+3*" XSTR(BPS) "(%[dst])     \n\t"
+    "sb       %[temp11], 2+3*" XSTR(BPS) "(%[dst])     \n\t"
+    "sb       %[temp16], 3+3*" XSTR(BPS) "(%[dst])     \n\t"
+
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
+      [temp9]"=&r"(temp9), [temp10]"=&r"(temp10), [temp11]"=&r"(temp11),
+      [temp12]"=&r"(temp12), [temp13]"=&r"(temp13), [temp14]"=&r"(temp14),
+      [temp15]"=&r"(temp15), [temp16]"=&r"(temp16), [temp17]"=&r"(temp17),
+      [temp18]"=&r"(temp18)
+    : [in]"r"(p_in), [kC1]"r"(kC1), [kC2]"r"(kC2), [dst]"r"(dst)
+    : "memory", "hi", "lo"
+  );
+}
+
+static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
+  TransformOne(in, dst);
+  if (do_two) {
+    TransformOne(in + 16, dst + 4);
+  }
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8DspInitMIPS32(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitMIPS32(void) {
+  VP8InitClipTables();
+
+  VP8Transform = TransformTwo;
+
+  VP8VFilter16 = VFilter16;
+  VP8HFilter16 = HFilter16;
+  VP8VFilter8 = VFilter8;
+  VP8HFilter8 = HFilter8;
+  VP8VFilter16i = VFilter16i;
+  VP8HFilter16i = HFilter16i;
+  VP8VFilter8i = VFilter8i;
+  VP8HFilter8i = HFilter8i;
+
+  VP8SimpleVFilter16 = SimpleVFilter16;
+  VP8SimpleHFilter16 = SimpleHFilter16;
+  VP8SimpleVFilter16i = SimpleVFilter16i;
+  VP8SimpleHFilter16i = SimpleHFilter16i;
+}
+
+#else  // !WEBP_USE_MIPS32
+
+WEBP_DSP_INIT_STUB(VP8DspInitMIPS32)
+
+#endif  // WEBP_USE_MIPS32
diff --git a/media/libwebp/dsp/dec_mips_dsp_r2.c b/media/libwebp/dsp/dec_mips_dsp_r2.c
new file mode 100644
index 0000000000..dcc3041019
--- /dev/null
+++ b/media/libwebp/dsp/dec_mips_dsp_r2.c
@@ -0,0 +1,994 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MIPS version of dsp functions
+//
+// Author(s):  Djordje Pesut    (djordje.pesut@imgtec.com)
+//             Jovan Zelincevic (jovan.zelincevic@imgtec.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_MIPS_DSP_R2)
+
+#include "../dsp/mips_macro.h"
+
+static const int kC1 = 20091 + (1 << 16);
+static const int kC2 = 35468;
+
+#define MUL(a, b) (((a) * (b)) >> 16)
+
+static void TransformDC(const int16_t* in, uint8_t* dst) {
+  int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9, temp10;
+
+  __asm__ volatile (
+    LOAD_WITH_OFFSET_X4(temp1, temp2, temp3, temp4, dst,
+                        0, 0, 0, 0,
+                        0, 1, 2, 3,
+                        BPS)
+    "lh               %[temp5],  0(%[in])               \n\t"
+    "addiu            %[temp5],  %[temp5],  4           \n\t"
+    "ins              %[temp5],  %[temp5],  16, 16      \n\t"
+    "shra.ph          %[temp5],  %[temp5],  3           \n\t"
+    CONVERT_2_BYTES_TO_HALF(temp6, temp7, temp8, temp9, temp10, temp1, temp2,
+                            temp3, temp1, temp2, temp3, temp4)
+    STORE_SAT_SUM_X2(temp6, temp7, temp8, temp9, temp10, temp1, temp2, temp3,
+                     temp5, temp5, temp5, temp5, temp5, temp5, temp5, temp5,
+                     dst, 0, 1, 2, 3, BPS)
+
+    OUTPUT_EARLY_CLOBBER_REGS_10()
+    : [in]"r"(in), [dst]"r"(dst)
+    : "memory"
+  );
+}
+
+static void TransformAC3(const int16_t* in, uint8_t* dst) {
+  const int a = in[0] + 4;
+  int c4 = MUL(in[4], kC2);
+  const int d4 = MUL(in[4], kC1);
+  const int c1 = MUL(in[1], kC2);
+  const int d1 = MUL(in[1], kC1);
+  int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
+  int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18;
+
+  __asm__ volatile (
+    "ins              %[c4],      %[d4],     16,       16    \n\t"
+    "replv.ph         %[temp1],   %[a]                       \n\t"
+    "replv.ph         %[temp4],   %[d1]                      \n\t"
+    ADD_SUB_HALVES(temp2, temp3, temp1, c4)
+    "replv.ph         %[temp5],   %[c1]                      \n\t"
+    SHIFT_R_SUM_X2(temp1, temp6, temp7, temp8, temp2, temp9, temp10, temp4,
+                   temp2, temp2, temp3, temp3, temp4, temp5, temp4, temp5)
+    LOAD_WITH_OFFSET_X4(temp3, temp5, temp11, temp12, dst,
+                        0, 0, 0, 0,
+                        0, 1, 2, 3,
+                        BPS)
+    CONVERT_2_BYTES_TO_HALF(temp13, temp14, temp3, temp15, temp5, temp16,
+                            temp11, temp17, temp3, temp5, temp11, temp12)
+    PACK_2_HALVES_TO_WORD(temp12, temp18, temp7, temp6, temp1, temp8, temp2,
+                          temp4, temp7, temp6, temp10, temp9)
+    STORE_SAT_SUM_X2(temp13, temp14, temp3, temp15, temp5, temp16, temp11,
+                     temp17, temp12, temp18, temp1, temp8, temp2, temp4,
+                     temp7, temp6, dst, 0, 1, 2, 3, BPS)
+
+    OUTPUT_EARLY_CLOBBER_REGS_18(),
+      [c4]"+&r"(c4)
+    : [dst]"r"(dst), [a]"r"(a), [d1]"r"(d1), [d4]"r"(d4), [c1]"r"(c1)
+    : "memory"
+  );
+}
+
+static void TransformOne(const int16_t* in, uint8_t* dst) {
+  int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
+  int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18;
+
+  __asm__ volatile (
+    "ulw              %[temp1],   0(%[in])                 \n\t"
+    "ulw              %[temp2],   16(%[in])                \n\t"
+    LOAD_IN_X2(temp5, temp6, 24, 26)
+    ADD_SUB_HALVES(temp3, temp4, temp1, temp2)
+    LOAD_IN_X2(temp1, temp2, 8, 10)
+    MUL_SHIFT_SUM(temp7, temp8, temp9, temp10, temp11, temp12, temp13, temp14,
+                  temp10, temp8, temp9, temp7, temp1, temp2, temp5, temp6,
+                  temp13, temp11, temp14, temp12)
+    INSERT_HALF_X2(temp8, temp7, temp10, temp9)
+    "ulw              %[temp17],  4(%[in])                 \n\t"
+    "ulw              %[temp18],  20(%[in])                \n\t"
+    ADD_SUB_HALVES(temp1, temp2, temp3, temp8)
+    ADD_SUB_HALVES(temp5, temp6, temp4, temp7)
+    ADD_SUB_HALVES(temp7, temp8, temp17, temp18)
+    LOAD_IN_X2(temp17, temp18, 12, 14)
+    LOAD_IN_X2(temp9, temp10, 28, 30)
+    MUL_SHIFT_SUM(temp11, temp12, temp13, temp14, temp15, temp16, temp4, temp17,
+                  temp12, temp14, temp11, temp13, temp17, temp18, temp9, temp10,
+                  temp15, temp4, temp16, temp17)
+    INSERT_HALF_X2(temp11, temp12, temp13, temp14)
+    ADD_SUB_HALVES(temp17, temp8, temp8, temp11)
+    ADD_SUB_HALVES(temp3, temp4, temp7, temp12)
+
+    // horizontal
+    SRA_16(temp9, temp10, temp11, temp12, temp1, temp2, temp5, temp6)
+    INSERT_HALF_X2(temp1, temp6, temp5, temp2)
+    SRA_16(temp13, temp14, temp15, temp16, temp3, temp4, temp17, temp8)
+    "repl.ph          %[temp2],   0x4                      \n\t"
+    INSERT_HALF_X2(temp3, temp8, temp17, temp4)
+    "addq.ph          %[temp1],   %[temp1],  %[temp2]      \n\t"
+    "addq.ph          %[temp6],   %[temp6],  %[temp2]      \n\t"
+    ADD_SUB_HALVES(temp2, temp4, temp1, temp3)
+    ADD_SUB_HALVES(temp5, temp7, temp6, temp8)
+    MUL_SHIFT_SUM(temp1, temp3, temp6, temp8, temp9, temp13, temp17, temp18,
+                  temp3, temp13, temp1, temp9, temp9, temp13, temp11, temp15,
+                  temp6, temp17, temp8, temp18)
+    MUL_SHIFT_SUM(temp6, temp8, temp18, temp17, temp11, temp15, temp12, temp16,
+                  temp8, temp15, temp6, temp11, temp12, temp16, temp10, temp14,
+                  temp18, temp12, temp17, temp16)
+    INSERT_HALF_X2(temp1, temp3, temp9, temp13)
+    INSERT_HALF_X2(temp6, temp8, temp11, temp15)
+    SHIFT_R_SUM_X2(temp9, temp10, temp11, temp12, temp13, temp14, temp15,
+                   temp16, temp2, temp4, temp5, temp7, temp3, temp1, temp8,
+                   temp6)
+    PACK_2_HALVES_TO_WORD(temp1, temp2, temp3, temp4, temp9, temp12, temp13,
+                          temp16, temp11, temp10, temp15, temp14)
+    LOAD_WITH_OFFSET_X4(temp10, temp11, temp14, temp15, dst,
+                        0, 0, 0, 0,
+                        0, 1, 2, 3,
+                        BPS)
+    CONVERT_2_BYTES_TO_HALF(temp5, temp6, temp7, temp8, temp17, temp18, temp10,
+                            temp11, temp10, temp11, temp14, temp15)
+    STORE_SAT_SUM_X2(temp5, temp6, temp7, temp8, temp17, temp18, temp10, temp11,
+                     temp9, temp12, temp1, temp2, temp13, temp16, temp3, temp4,
+                     dst, 0, 1, 2, 3, BPS)
+
+    OUTPUT_EARLY_CLOBBER_REGS_18()
+    : [dst]"r"(dst), [in]"r"(in), [kC1]"r"(kC1), [kC2]"r"(kC2)
+    : "memory", "hi", "lo"
+  );
+}
+
+static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
+  TransformOne(in, dst);
+  if (do_two) {
+    TransformOne(in + 16, dst + 4);
+  }
+}
+
+static WEBP_INLINE void FilterLoop26(uint8_t* p,
+                                     int hstride, int vstride, int size,
+                                     int thresh, int ithresh, int hev_thresh) {
+  const int thresh2 = 2 * thresh + 1;
+  int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
+  int temp10, temp11, temp12, temp13, temp14, temp15;
+
+  __asm__ volatile (
+    ".set      push                                      \n\t"
+    ".set      noreorder                                 \n\t"
+  "1:                                                    \n\t"
+    "negu      %[temp1],  %[hstride]                     \n\t"
+    "addiu     %[size],   %[size],        -1             \n\t"
+    "sll       %[temp2],  %[hstride],     1              \n\t"
+    "sll       %[temp3],  %[temp1],       1              \n\t"
+    "addu      %[temp4],  %[temp2],       %[hstride]     \n\t"
+    "addu      %[temp5],  %[temp3],       %[temp1]       \n\t"
+    "lbu       %[temp7],  0(%[p])                        \n\t"
+    "sll       %[temp6],  %[temp3],       1              \n\t"
+    "lbux      %[temp8],  %[temp5](%[p])                 \n\t"
+    "lbux      %[temp9],  %[temp3](%[p])                 \n\t"
+    "lbux      %[temp10], %[temp1](%[p])                 \n\t"
+    "lbux      %[temp11], %[temp6](%[p])                 \n\t"
+    "lbux      %[temp12], %[hstride](%[p])               \n\t"
+    "lbux      %[temp13], %[temp2](%[p])                 \n\t"
+    "lbux      %[temp14], %[temp4](%[p])                 \n\t"
+    "subu      %[temp1],  %[temp10],      %[temp7]       \n\t"
+    "subu      %[temp2],  %[temp9],       %[temp12]      \n\t"
+    "absq_s.w  %[temp3],  %[temp1]                       \n\t"
+    "absq_s.w  %[temp4],  %[temp2]                       \n\t"
+    "negu      %[temp1],  %[temp1]                       \n\t"
+    "sll       %[temp3],  %[temp3],       2              \n\t"
+    "addu      %[temp15], %[temp3],       %[temp4]       \n\t"
+    "subu      %[temp3],  %[temp15],      %[thresh2]     \n\t"
+    "sll       %[temp6],  %[temp1],       1              \n\t"
+    "bgtz      %[temp3],  3f                             \n\t"
+    " subu     %[temp4],  %[temp11],      %[temp8]       \n\t"
+    "absq_s.w  %[temp4],  %[temp4]                       \n\t"
+    "shll_s.w  %[temp2],  %[temp2],       24             \n\t"
+    "subu      %[temp4],  %[temp4],       %[ithresh]     \n\t"
+    "bgtz      %[temp4],  3f                             \n\t"
+    " subu     %[temp3],  %[temp8],       %[temp9]       \n\t"
+    "absq_s.w  %[temp3],  %[temp3]                       \n\t"
+    "subu      %[temp3],  %[temp3],       %[ithresh]     \n\t"
+    "bgtz      %[temp3],  3f                             \n\t"
+    " subu     %[temp5],  %[temp9],       %[temp10]      \n\t"
+    "absq_s.w  %[temp3],  %[temp5]                       \n\t"
+    "absq_s.w  %[temp5],  %[temp5]                       \n\t"
+    "subu      %[temp3],  %[temp3],       %[ithresh]     \n\t"
+    "bgtz      %[temp3],  3f                             \n\t"
+    " subu     %[temp3],  %[temp14],      %[temp13]      \n\t"
+    "absq_s.w  %[temp3],  %[temp3]                       \n\t"
+    "slt       %[temp5],  %[hev_thresh],  %[temp5]       \n\t"
+    "subu      %[temp3],  %[temp3],       %[ithresh]     \n\t"
+    "bgtz      %[temp3],  3f                             \n\t"
+    " subu     %[temp3],  %[temp13],      %[temp12]      \n\t"
+    "absq_s.w  %[temp3],  %[temp3]                       \n\t"
+    "sra       %[temp4],  %[temp2],       24             \n\t"
+    "subu      %[temp3],  %[temp3],       %[ithresh]     \n\t"
+    "bgtz      %[temp3],  3f                             \n\t"
+    " subu     %[temp15], %[temp12],      %[temp7]       \n\t"
+    "absq_s.w  %[temp3],  %[temp15]                      \n\t"
+    "absq_s.w  %[temp15], %[temp15]                      \n\t"
+    "subu      %[temp3],  %[temp3],       %[ithresh]     \n\t"
+    "bgtz      %[temp3],  3f                             \n\t"
+    " slt      %[temp15], %[hev_thresh],  %[temp15]      \n\t"
+    "addu      %[temp3],  %[temp6],       %[temp1]       \n\t"
+    "or        %[temp2],  %[temp5],       %[temp15]      \n\t"
+    "addu      %[temp5],  %[temp4],       %[temp3]       \n\t"
+    "beqz      %[temp2],  4f                             \n\t"
+    " shra_r.w %[temp1],  %[temp5],       3              \n\t"
+    "addiu     %[temp2],  %[temp5],       3              \n\t"
+    "sra       %[temp2],  %[temp2],       3              \n\t"
+    "shll_s.w  %[temp1],  %[temp1],       27             \n\t"
+    "shll_s.w  %[temp2],  %[temp2],       27             \n\t"
+    "subu      %[temp3],  %[p],           %[hstride]     \n\t"
+    "sra       %[temp1],  %[temp1],       27             \n\t"
+    "sra       %[temp2],  %[temp2],       27             \n\t"
+    "subu      %[temp1],  %[temp7],       %[temp1]       \n\t"
+    "addu      %[temp2],  %[temp10],      %[temp2]       \n\t"
+    "lbux      %[temp2],  %[temp2](%[VP8kclip1])         \n\t"
+    "lbux      %[temp1],  %[temp1](%[VP8kclip1])         \n\t"
+    "sb        %[temp2],  0(%[temp3])                    \n\t"
+    "j         3f                                        \n\t"
+    " sb       %[temp1],  0(%[p])                        \n\t"
+  "4:                                                    \n\t"
+    "shll_s.w  %[temp5],  %[temp5],       24             \n\t"
+    "subu      %[temp14], %[p],           %[hstride]     \n\t"
+    "subu      %[temp11], %[temp14],      %[hstride]     \n\t"
+    "sra       %[temp6],  %[temp5],       24             \n\t"
+    "sll       %[temp1],  %[temp6],       3              \n\t"
+    "subu      %[temp15], %[temp11],      %[hstride]     \n\t"
+    "addu      %[temp2],  %[temp6],       %[temp1]       \n\t"
+    "sll       %[temp3],  %[temp2],       1              \n\t"
+    "addu      %[temp4],  %[temp3],       %[temp2]       \n\t"
+    "addiu     %[temp2],  %[temp2],       63             \n\t"
+    "addiu     %[temp3],  %[temp3],       63             \n\t"
+    "addiu     %[temp4],  %[temp4],       63             \n\t"
+    "sra       %[temp2],  %[temp2],       7              \n\t"
+    "sra       %[temp3],  %[temp3],       7              \n\t"
+    "sra       %[temp4],  %[temp4],       7              \n\t"
+    "addu      %[temp1],  %[temp8],       %[temp2]       \n\t"
+    "addu      %[temp5],  %[temp9],       %[temp3]       \n\t"
+    "addu      %[temp6],  %[temp10],      %[temp4]       \n\t"
+    "subu      %[temp8],  %[temp7],       %[temp4]       \n\t"
+    "subu      %[temp7],  %[temp12],      %[temp3]       \n\t"
+    "addu      %[temp10], %[p],           %[hstride]     \n\t"
+    "subu      %[temp9],  %[temp13],      %[temp2]       \n\t"
+    "addu      %[temp12], %[temp10],      %[hstride]     \n\t"
+    "lbux      %[temp2],  %[temp1](%[VP8kclip1])         \n\t"
+    "lbux      %[temp3],  %[temp5](%[VP8kclip1])         \n\t"
+    "lbux      %[temp4],  %[temp6](%[VP8kclip1])         \n\t"
+    "lbux      %[temp5],  %[temp8](%[VP8kclip1])         \n\t"
+    "lbux      %[temp6],  %[temp7](%[VP8kclip1])         \n\t"
+    "lbux      %[temp8],  %[temp9](%[VP8kclip1])         \n\t"
+    "sb        %[temp2],  0(%[temp15])                   \n\t"
+    "sb        %[temp3],  0(%[temp11])                   \n\t"
+    "sb        %[temp4],  0(%[temp14])                   \n\t"
+    "sb        %[temp5],  0(%[p])                        \n\t"
+    "sb        %[temp6],  0(%[temp10])                   \n\t"
+    "sb        %[temp8],  0(%[temp12])                   \n\t"
+  "3:                                                    \n\t"
+    "bgtz      %[size],   1b                             \n\t"
+    " addu     %[p],      %[p],           %[vstride]     \n\t"
+    ".set      pop                                       \n\t"
+    : [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),[temp3]"=&r"(temp3),
+      [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [temp6]"=&r"(temp6),
+      [temp7]"=&r"(temp7),[temp8]"=&r"(temp8),[temp9]"=&r"(temp9),
+      [temp10]"=&r"(temp10),[temp11]"=&r"(temp11),[temp12]"=&r"(temp12),
+      [temp13]"=&r"(temp13),[temp14]"=&r"(temp14),[temp15]"=&r"(temp15),
+      [size]"+&r"(size), [p]"+&r"(p)
+    : [hstride]"r"(hstride), [thresh2]"r"(thresh2),
+      [ithresh]"r"(ithresh),[vstride]"r"(vstride), [hev_thresh]"r"(hev_thresh),
+      [VP8kclip1]"r"(VP8kclip1)
+    : "memory"
+  );
+}
+
+static WEBP_INLINE void FilterLoop24(uint8_t* p,
+                                     int hstride, int vstride, int size,
+                                     int thresh, int ithresh, int hev_thresh) {
+  int p0, q0, p1, q1, p2, q2, p3, q3;
+  int step1, step2, temp1, temp2, temp3, temp4;
+  uint8_t* pTemp0;
+  uint8_t* pTemp1;
+  const int thresh2 = 2 * thresh + 1;
+
+  __asm__ volatile (
+    ".set      push                                   \n\t"
+    ".set      noreorder                              \n\t"
+    "bltz      %[size],    3f                         \n\t"
+    " nop                                             \n\t"
+  "2:                                                 \n\t"
+    "negu      %[step1],   %[hstride]                 \n\t"
+    "lbu       %[q0],      0(%[p])                    \n\t"
+    "lbux      %[p0],      %[step1](%[p])             \n\t"
+    "subu      %[step1],   %[step1],      %[hstride]  \n\t"
+    "lbux      %[q1],      %[hstride](%[p])           \n\t"
+    "subu      %[temp1],   %[p0],         %[q0]       \n\t"
+    "lbux      %[p1],      %[step1](%[p])             \n\t"
+    "addu      %[step2],   %[hstride],    %[hstride]  \n\t"
+    "absq_s.w  %[temp2],   %[temp1]                   \n\t"
+    "subu      %[temp3],   %[p1],         %[q1]       \n\t"
+    "absq_s.w  %[temp4],   %[temp3]                   \n\t"
+    "sll       %[temp2],   %[temp2],      2           \n\t"
+    "addu      %[temp2],   %[temp2],      %[temp4]    \n\t"
+    "subu      %[temp4],   %[temp2],      %[thresh2]  \n\t"
+    "subu      %[step1],   %[step1],      %[hstride]  \n\t"
+    "bgtz      %[temp4],   0f                         \n\t"
+    " lbux     %[p2],      %[step1](%[p])             \n\t"
+    "subu      %[step1],   %[step1],      %[hstride]  \n\t"
+    "lbux      %[q2],      %[step2](%[p])             \n\t"
+    "lbux      %[p3],      %[step1](%[p])             \n\t"
+    "subu      %[temp4],   %[p2],         %[p1]       \n\t"
+    "addu      %[step2],   %[step2],      %[hstride]  \n\t"
+    "subu      %[temp2],   %[p3],         %[p2]       \n\t"
+    "absq_s.w  %[temp4],   %[temp4]                   \n\t"
+    "absq_s.w  %[temp2],   %[temp2]                   \n\t"
+    "lbux      %[q3],      %[step2](%[p])             \n\t"
+    "subu      %[temp4],   %[temp4],      %[ithresh]  \n\t"
+    "negu      %[temp1],   %[temp1]                   \n\t"
+    "bgtz      %[temp4],   0f                         \n\t"
+    " subu     %[temp2],   %[temp2],      %[ithresh]  \n\t"
+    "subu      %[p3],      %[p1],         %[p0]       \n\t"
+    "bgtz      %[temp2],   0f                         \n\t"
+    " absq_s.w %[p3],      %[p3]                      \n\t"
+    "subu      %[temp4],   %[q3],         %[q2]       \n\t"
+    "subu      %[pTemp0],  %[p],          %[hstride]  \n\t"
+    "absq_s.w  %[temp4],   %[temp4]                   \n\t"
+    "subu      %[temp2],   %[p3],         %[ithresh]  \n\t"
+    "sll       %[step1],   %[temp1],      1           \n\t"
+    "bgtz      %[temp2],   0f                         \n\t"
+    " subu     %[temp4],   %[temp4],      %[ithresh]  \n\t"
+    "subu      %[temp2],   %[q2],         %[q1]       \n\t"
+    "bgtz      %[temp4],   0f                         \n\t"
+    " absq_s.w %[temp2],   %[temp2]                   \n\t"
+    "subu      %[q3],      %[q1],         %[q0]       \n\t"
+    "absq_s.w  %[q3],      %[q3]                      \n\t"
+    "subu      %[temp2],   %[temp2],      %[ithresh]  \n\t"
+    "addu      %[temp1],   %[temp1],      %[step1]    \n\t"
+    "bgtz      %[temp2],   0f                         \n\t"
+    " subu     %[temp4],   %[q3],         %[ithresh]  \n\t"
+    "slt       %[p3],      %[hev_thresh], %[p3]       \n\t"
+    "bgtz      %[temp4],   0f                         \n\t"
+    " slt      %[q3],      %[hev_thresh], %[q3]       \n\t"
+    "or        %[q3],      %[q3],         %[p3]       \n\t"
+    "bgtz      %[q3],      1f                         \n\t"
+    " shra_r.w %[temp2],   %[temp1],      3           \n\t"
+    "addiu     %[temp1],   %[temp1],      3           \n\t"
+    "sra       %[temp1],   %[temp1],      3           \n\t"
+    "shll_s.w  %[temp2],   %[temp2],      27          \n\t"
+    "shll_s.w  %[temp1],   %[temp1],      27          \n\t"
+    "addu      %[pTemp1],  %[p],          %[hstride]  \n\t"
+    "sra       %[temp2],   %[temp2],      27          \n\t"
+    "sra       %[temp1],   %[temp1],      27          \n\t"
+    "addiu     %[step1],   %[temp2],      1           \n\t"
+    "sra       %[step1],   %[step1],      1           \n\t"
+    "addu      %[p0],      %[p0],         %[temp1]    \n\t"
+    "addu      %[p1],      %[p1],         %[step1]    \n\t"
+    "subu      %[q0],      %[q0],         %[temp2]    \n\t"
+    "subu      %[q1],      %[q1],         %[step1]    \n\t"
+    "lbux      %[temp2],   %[p0](%[VP8kclip1])        \n\t"
+    "lbux      %[temp3],   %[q0](%[VP8kclip1])        \n\t"
+    "lbux      %[temp4],   %[q1](%[VP8kclip1])        \n\t"
+    "sb        %[temp2],   0(%[pTemp0])               \n\t"
+    "lbux      %[temp1],   %[p1](%[VP8kclip1])        \n\t"
+    "subu      %[pTemp0],  %[pTemp0],    %[hstride]   \n\t"
+    "sb        %[temp3],   0(%[p])                    \n\t"
+    "sb        %[temp4],   0(%[pTemp1])               \n\t"
+    "j         0f                                     \n\t"
+    " sb       %[temp1],   0(%[pTemp0])               \n\t"
+  "1:                                                 \n\t"
+    "shll_s.w  %[temp3],   %[temp3],      24          \n\t"
+    "sra       %[temp3],   %[temp3],      24          \n\t"
+    "addu      %[temp1],   %[temp1],      %[temp3]    \n\t"
+    "shra_r.w  %[temp2],   %[temp1],      3           \n\t"
+    "addiu     %[temp1],   %[temp1],      3           \n\t"
+    "shll_s.w  %[temp2],   %[temp2],      27          \n\t"
+    "sra       %[temp1],   %[temp1],      3           \n\t"
+    "shll_s.w  %[temp1],   %[temp1],      27          \n\t"
+    "sra       %[temp2],   %[temp2],      27          \n\t"
+    "sra       %[temp1],   %[temp1],      27          \n\t"
+    "addu      %[p0],      %[p0],         %[temp1]    \n\t"
+    "subu      %[q0],      %[q0],         %[temp2]    \n\t"
+    "lbux      %[temp1],   %[p0](%[VP8kclip1])        \n\t"
+    "lbux      %[temp2],   %[q0](%[VP8kclip1])        \n\t"
+    "sb        %[temp2],   0(%[p])                    \n\t"
+    "sb        %[temp1],   0(%[pTemp0])               \n\t"
+  "0:                                                 \n\t"
+    "subu      %[size],    %[size],       1           \n\t"
+    "bgtz      %[size],    2b                         \n\t"
+    " addu     %[p],       %[p],          %[vstride]  \n\t"
+  "3:                                                 \n\t"
+    ".set      pop                                    \n\t"
+    : [p0]"=&r"(p0), [q0]"=&r"(q0), [p1]"=&r"(p1), [q1]"=&r"(q1),
+      [p2]"=&r"(p2), [q2]"=&r"(q2), [p3]"=&r"(p3), [q3]"=&r"(q3),
+      [step2]"=&r"(step2), [step1]"=&r"(step1), [temp1]"=&r"(temp1),
+      [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [temp4]"=&r"(temp4),
+      [pTemp0]"=&r"(pTemp0), [pTemp1]"=&r"(pTemp1), [p]"+&r"(p),
+      [size]"+&r"(size)
+    : [vstride]"r"(vstride), [ithresh]"r"(ithresh),
+      [hev_thresh]"r"(hev_thresh), [hstride]"r"(hstride),
+      [VP8kclip1]"r"(VP8kclip1), [thresh2]"r"(thresh2)
+    : "memory"
+  );
+}
+
+// on macroblock edges
+static void VFilter16(uint8_t* p, int stride,
+                      int thresh, int ithresh, int hev_thresh) {
+  FilterLoop26(p, stride, 1, 16, thresh, ithresh, hev_thresh);
+}
+
+static void HFilter16(uint8_t* p, int stride,
+                      int thresh, int ithresh, int hev_thresh) {
+  FilterLoop26(p, 1, stride, 16, thresh, ithresh, hev_thresh);
+}
+
+// 8-pixels wide variant, for chroma filtering
+static void VFilter8(uint8_t* u, uint8_t* v, int stride,
+                     int thresh, int ithresh, int hev_thresh) {
+  FilterLoop26(u, stride, 1, 8, thresh, ithresh, hev_thresh);
+  FilterLoop26(v, stride, 1, 8, thresh, ithresh, hev_thresh);
+}
+
+static void HFilter8(uint8_t* u, uint8_t* v, int stride,
+                     int thresh, int ithresh, int hev_thresh) {
+  FilterLoop26(u, 1, stride, 8, thresh, ithresh, hev_thresh);
+  FilterLoop26(v, 1, stride, 8, thresh, ithresh, hev_thresh);
+}
+
+// on three inner edges
+static void VFilter16i(uint8_t* p, int stride,
+                       int thresh, int ithresh, int hev_thresh) {
+  int k;
+  for (k = 3; k > 0; --k) {
+    p += 4 * stride;
+    FilterLoop24(p, stride, 1, 16, thresh, ithresh, hev_thresh);
+  }
+}
+
+static void HFilter16i(uint8_t* p, int stride,
+                       int thresh, int ithresh, int hev_thresh) {
+  int k;
+  for (k = 3; k > 0; --k) {
+    p += 4;
+    FilterLoop24(p, 1, stride, 16, thresh, ithresh, hev_thresh);
+  }
+}
+
+static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
+                      int thresh, int ithresh, int hev_thresh) {
+  FilterLoop24(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
+  FilterLoop24(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
+}
+
+static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
+                      int thresh, int ithresh, int hev_thresh) {
+  FilterLoop24(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
+  FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
+}
+
+#undef MUL
+
+//------------------------------------------------------------------------------
+// Simple In-loop filtering (Paragraph 15.2)
+
+static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
+  int i;
+  const int thresh2 = 2 * thresh + 1;
+  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
+  uint8_t* p1 = p - stride;
+  __asm__ volatile (
+    ".set      push                                      \n\t"
+    ".set      noreorder                                 \n\t"
+    "li        %[i],        16                           \n\t"
+  "0:                                                    \n\t"
+    "negu      %[temp4],    %[stride]                    \n\t"
+    "sll       %[temp5],    %[temp4],       1            \n\t"
+    "lbu       %[temp2],    0(%[p])                      \n\t"
+    "lbux      %[temp3],    %[stride](%[p])              \n\t"
+    "lbux      %[temp1],    %[temp4](%[p])               \n\t"
+    "lbux      %[temp0],    %[temp5](%[p])               \n\t"
+    "subu      %[temp7],    %[temp1],       %[temp2]     \n\t"
+    "subu      %[temp6],    %[temp0],       %[temp3]     \n\t"
+    "absq_s.w  %[temp4],    %[temp7]                     \n\t"
+    "absq_s.w  %[temp5],    %[temp6]                     \n\t"
+    "sll       %[temp4],    %[temp4],       2            \n\t"
+    "subu      %[temp5],    %[temp5],       %[thresh2]   \n\t"
+    "addu      %[temp5],    %[temp4],       %[temp5]     \n\t"
+    "negu      %[temp8],    %[temp7]                     \n\t"
+    "bgtz      %[temp5],    1f                           \n\t"
+    " addiu    %[i],        %[i],           -1           \n\t"
+    "sll       %[temp4],    %[temp8],       1            \n\t"
+    "shll_s.w  %[temp5],    %[temp6],       24           \n\t"
+    "addu      %[temp3],    %[temp4],       %[temp8]     \n\t"
+    "sra       %[temp5],    %[temp5],       24           \n\t"
+    "addu      %[temp3],    %[temp3],       %[temp5]     \n\t"
+    "addiu     %[temp7],    %[temp3],       3            \n\t"
+    "sra       %[temp7],    %[temp7],       3            \n\t"
+    "shra_r.w  %[temp8],    %[temp3],       3            \n\t"
+    "shll_s.w  %[temp0],    %[temp7],       27           \n\t"
+    "shll_s.w  %[temp4],    %[temp8],       27           \n\t"
+    "sra       %[temp0],    %[temp0],       27           \n\t"
+    "sra       %[temp4],    %[temp4],       27           \n\t"
+    "addu      %[temp7],    %[temp1],       %[temp0]     \n\t"
+    "subu      %[temp2],    %[temp2],       %[temp4]     \n\t"
+    "lbux      %[temp3],    %[temp7](%[VP8kclip1])       \n\t"
+    "lbux      %[temp4],    %[temp2](%[VP8kclip1])       \n\t"
+    "sb        %[temp3],    0(%[p1])                     \n\t"
+    "sb        %[temp4],    0(%[p])                      \n\t"
+  "1:                                                    \n\t"
+    "addiu     %[p1],       %[p1],          1            \n\t"
+    "bgtz      %[i],        0b                           \n\t"
+    " addiu    %[p],        %[p],           1            \n\t"
+    " .set     pop                                       \n\t"
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
+      [p]"+&r"(p), [i]"=&r"(i), [p1]"+&r"(p1)
+    : [stride]"r"(stride), [VP8kclip1]"r"(VP8kclip1), [thresh2]"r"(thresh2)
+    : "memory"
+  );
+}
+
+// TEMP0 = SRC[A + A1 * BPS]
+// TEMP1 = SRC[B + B1 * BPS]
+// TEMP2 = SRC[C + C1 * BPS]
+// TEMP3 = SRC[D + D1 * BPS]
+#define LOAD_4_BYTES(TEMP0, TEMP1, TEMP2, TEMP3,                               \
+                     A, A1, B, B1, C, C1, D, D1, SRC)                          \
+  "lbu      %[" #TEMP0 "],   " #A "+" #A1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t"   \
+  "lbu      %[" #TEMP1 "],   " #B "+" #B1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t"   \
+  "lbu      %[" #TEMP2 "],   " #C "+" #C1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t"   \
+  "lbu      %[" #TEMP3 "],   " #D "+" #D1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t"   \
+
+static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
+  int i;
+  const int thresh2 = 2 * thresh + 1;
+  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
+  __asm__ volatile (
+    ".set      push                                     \n\t"
+    ".set      noreorder                                \n\t"
+    "li        %[i],       16                           \n\t"
+  "0:                                                   \n\t"
+    LOAD_4_BYTES(temp0, temp1, temp2, temp3, -2, 0, -1, 0, 0, 0, 1, 0, p)
+    "subu      %[temp7],    %[temp1],       %[temp2]    \n\t"
+    "subu      %[temp6],    %[temp0],       %[temp3]    \n\t"
+    "absq_s.w  %[temp4],    %[temp7]                    \n\t"
+    "absq_s.w  %[temp5],    %[temp6]                    \n\t"
+    "sll       %[temp4],    %[temp4],       2           \n\t"
+    "addu      %[temp5],    %[temp4],       %[temp5]    \n\t"
+    "subu      %[temp5],    %[temp5],       %[thresh2]  \n\t"
+    "negu      %[temp8],    %[temp7]                    \n\t"
+    "bgtz      %[temp5],    1f                          \n\t"
+    " addiu    %[i],        %[i],           -1          \n\t"
+    "sll       %[temp4],    %[temp8],       1           \n\t"
+    "shll_s.w  %[temp5],    %[temp6],       24          \n\t"
+    "addu      %[temp3],    %[temp4],       %[temp8]    \n\t"
+    "sra       %[temp5],    %[temp5],       24          \n\t"
+    "addu      %[temp3],    %[temp3],       %[temp5]    \n\t"
+    "addiu     %[temp7],    %[temp3],       3           \n\t"
+    "sra       %[temp7],    %[temp7],       3           \n\t"
+    "shra_r.w  %[temp8],    %[temp3],       3           \n\t"
+    "shll_s.w  %[temp0],    %[temp7],       27          \n\t"
+    "shll_s.w  %[temp4],    %[temp8],       27          \n\t"
+    "sra       %[temp0],    %[temp0],       27          \n\t"
+    "sra       %[temp4],    %[temp4],       27          \n\t"
+    "addu      %[temp7],    %[temp1],       %[temp0]    \n\t"
+    "subu      %[temp2],    %[temp2],       %[temp4]    \n\t"
+    "lbux      %[temp3],    %[temp7](%[VP8kclip1])      \n\t"
+    "lbux      %[temp4],    %[temp2](%[VP8kclip1])      \n\t"
+    "sb        %[temp3],    -1(%[p])                    \n\t"
+    "sb        %[temp4],    0(%[p])                     \n\t"
+  "1:                                                   \n\t"
+    "bgtz      %[i],        0b                          \n\t"
+    " addu     %[p],        %[p],           %[stride]   \n\t"
+    ".set      pop                                      \n\t"
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
+      [p]"+&r"(p), [i]"=&r"(i)
+    : [stride]"r"(stride), [VP8kclip1]"r"(VP8kclip1), [thresh2]"r"(thresh2)
+    : "memory"
+  );
+}
+
+static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
+  int k;
+  for (k = 3; k > 0; --k) {
+    p += 4 * stride;
+    SimpleVFilter16(p, stride, thresh);
+  }
+}
+
+static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
+  int k;
+  for (k = 3; k > 0; --k) {
+    p += 4;
+    SimpleHFilter16(p, stride, thresh);
+  }
+}
+
+// DST[A * BPS]     = TEMP0
+// DST[B + C * BPS] = TEMP1
+#define STORE_8_BYTES(TEMP0, TEMP1, A, B, C, DST)                              \
+  "usw    %[" #TEMP0 "],   " #A "*" XSTR(BPS) "(%[" #DST "])         \n\t"     \
+  "usw    %[" #TEMP1 "],   " #B "+" #C "*" XSTR(BPS) "(%[" #DST "])  \n\t"
+
+static void VE4(uint8_t* dst) {    // vertical
+  const uint8_t* top = dst - BPS;
+  int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
+  __asm__ volatile (
+    "ulw             %[temp0],   -1(%[top])              \n\t"
+    "ulh             %[temp1],   3(%[top])               \n\t"
+    "preceu.ph.qbr   %[temp2],   %[temp0]                \n\t"
+    "preceu.ph.qbl   %[temp3],   %[temp0]                \n\t"
+    "preceu.ph.qbr   %[temp4],   %[temp1]                \n\t"
+    "packrl.ph       %[temp5],   %[temp3],    %[temp2]   \n\t"
+    "packrl.ph       %[temp6],   %[temp4],    %[temp3]   \n\t"
+    "shll.ph         %[temp5],   %[temp5],    1          \n\t"
+    "shll.ph         %[temp6],   %[temp6],    1          \n\t"
+    "addq.ph         %[temp2],   %[temp5],    %[temp2]   \n\t"
+    "addq.ph         %[temp6],   %[temp6],    %[temp4]   \n\t"
+    "addq.ph         %[temp2],   %[temp2],    %[temp3]   \n\t"
+    "addq.ph         %[temp6],   %[temp6],    %[temp3]   \n\t"
+    "shra_r.ph       %[temp2],   %[temp2],    2          \n\t"
+    "shra_r.ph       %[temp6],   %[temp6],    2          \n\t"
+    "precr.qb.ph     %[temp4],   %[temp6],    %[temp2]   \n\t"
+    STORE_8_BYTES(temp4, temp4, 0, 0, 1, dst)
+    STORE_8_BYTES(temp4, temp4, 2, 0, 3, dst)
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+      [temp6]"=&r"(temp6)
+    : [top]"r"(top), [dst]"r"(dst)
+    : "memory"
+  );
+}
+
+static void DC4(uint8_t* dst) {   // DC
+  int temp0, temp1, temp2, temp3, temp4;
+  __asm__ volatile (
+    "ulw          %[temp0],   -1*" XSTR(BPS) "(%[dst]) \n\t"
+    LOAD_4_BYTES(temp1, temp2, temp3, temp4, -1, 0, -1, 1, -1, 2, -1, 3, dst)
+    "ins          %[temp1],   %[temp2],    8,     8    \n\t"
+    "ins          %[temp1],   %[temp3],    16,    8    \n\t"
+    "ins          %[temp1],   %[temp4],    24,    8    \n\t"
+    "raddu.w.qb   %[temp0],   %[temp0]                 \n\t"
+    "raddu.w.qb   %[temp1],   %[temp1]                 \n\t"
+    "addu         %[temp0],   %[temp0],    %[temp1]    \n\t"
+    "shra_r.w     %[temp0],   %[temp0],    3           \n\t"
+    "replv.qb     %[temp0],   %[temp0]                 \n\t"
+    STORE_8_BYTES(temp0, temp0, 0, 0, 1, dst)
+    STORE_8_BYTES(temp0, temp0, 2, 0, 3, dst)
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4)
+    : [dst]"r"(dst)
+    : "memory"
+  );
+}
+
+static void RD4(uint8_t* dst) {   // Down-right
+  int temp0, temp1, temp2, temp3, temp4;
+  int temp5, temp6, temp7, temp8;
+  __asm__ volatile (
+    LOAD_4_BYTES(temp0, temp1, temp2, temp3, -1, 0, -1, 1, -1, 2, -1, 3, dst)
+    "ulw            %[temp7],   -1-" XSTR(BPS) "(%[dst])       \n\t"
+    "ins            %[temp1],   %[temp0], 16, 16               \n\t"
+    "preceu.ph.qbr  %[temp5],   %[temp7]                       \n\t"
+    "ins            %[temp2],   %[temp1], 16, 16               \n\t"
+    "preceu.ph.qbl  %[temp4],   %[temp7]                       \n\t"
+    "ins            %[temp3],   %[temp2], 16, 16               \n\t"
+    "shll.ph        %[temp2],   %[temp2], 1                    \n\t"
+    "addq.ph        %[temp3],   %[temp3], %[temp1]             \n\t"
+    "packrl.ph      %[temp6],   %[temp5], %[temp1]             \n\t"
+    "addq.ph        %[temp3],   %[temp3], %[temp2]             \n\t"
+    "addq.ph        %[temp1],   %[temp1], %[temp5]             \n\t"
+    "shll.ph        %[temp6],   %[temp6], 1                    \n\t"
+    "addq.ph        %[temp1],   %[temp1], %[temp6]             \n\t"
+    "packrl.ph      %[temp0],   %[temp4], %[temp5]             \n\t"
+    "addq.ph        %[temp8],   %[temp5], %[temp4]             \n\t"
+    "shra_r.ph      %[temp3],   %[temp3], 2                    \n\t"
+    "shll.ph        %[temp0],   %[temp0], 1                    \n\t"
+    "shra_r.ph      %[temp1],   %[temp1], 2                    \n\t"
+    "addq.ph        %[temp8],   %[temp0], %[temp8]             \n\t"
+    "lbu            %[temp5],   3-" XSTR(BPS) "(%[dst])        \n\t"
+    "precrq.ph.w    %[temp7],   %[temp7], %[temp7]             \n\t"
+    "shra_r.ph      %[temp8],   %[temp8], 2                    \n\t"
+    "ins            %[temp7],   %[temp5], 0,  8                \n\t"
+    "precr.qb.ph    %[temp2],   %[temp1], %[temp3]             \n\t"
+    "raddu.w.qb     %[temp4],   %[temp7]                       \n\t"
+    "precr.qb.ph    %[temp6],   %[temp8], %[temp1]             \n\t"
+    "shra_r.w       %[temp4],   %[temp4], 2                    \n\t"
+    STORE_8_BYTES(temp2, temp6, 3, 0, 1, dst)
+    "prepend        %[temp2],   %[temp8], 8                    \n\t"
+    "prepend        %[temp6],   %[temp4], 8                    \n\t"
+    STORE_8_BYTES(temp2, temp6, 2, 0, 0, dst)
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8)
+    : [dst]"r"(dst)
+    : "memory"
+  );
+}
+
+// TEMP0 = SRC[A * BPS]
+// TEMP1 = SRC[B + C * BPS]
+#define LOAD_8_BYTES(TEMP0, TEMP1, A, B, C, SRC)                               \
+  "ulw    %[" #TEMP0 "],   " #A "*" XSTR(BPS) "(%[" #SRC "])         \n\t"     \
+  "ulw    %[" #TEMP1 "],   " #B "+" #C "*" XSTR(BPS) "(%[" #SRC "])  \n\t"
+
+static void LD4(uint8_t* dst) {   // Down-Left
+  int temp0, temp1, temp2, temp3, temp4;
+  int temp5, temp6, temp7, temp8, temp9;
+  __asm__ volatile (
+    LOAD_8_BYTES(temp0, temp1, -1, 4, -1, dst)
+    "preceu.ph.qbl   %[temp2],    %[temp0]                     \n\t"
+    "preceu.ph.qbr   %[temp3],    %[temp0]                     \n\t"
+    "preceu.ph.qbr   %[temp4],    %[temp1]                     \n\t"
+    "preceu.ph.qbl   %[temp5],    %[temp1]                     \n\t"
+    "packrl.ph       %[temp6],    %[temp2],    %[temp3]        \n\t"
+    "packrl.ph       %[temp7],    %[temp4],    %[temp2]        \n\t"
+    "packrl.ph       %[temp8],    %[temp5],    %[temp4]        \n\t"
+    "shll.ph         %[temp6],    %[temp6],    1               \n\t"
+    "addq.ph         %[temp9],    %[temp2],    %[temp6]        \n\t"
+    "shll.ph         %[temp7],    %[temp7],    1               \n\t"
+    "addq.ph         %[temp9],    %[temp9],    %[temp3]        \n\t"
+    "shll.ph         %[temp8],    %[temp8],    1               \n\t"
+    "shra_r.ph       %[temp9],    %[temp9],    2               \n\t"
+    "addq.ph         %[temp3],    %[temp4],    %[temp7]        \n\t"
+    "addq.ph         %[temp0],    %[temp5],    %[temp8]        \n\t"
+    "addq.ph         %[temp3],    %[temp3],    %[temp2]        \n\t"
+    "addq.ph         %[temp0],    %[temp0],    %[temp4]        \n\t"
+    "shra_r.ph       %[temp3],    %[temp3],    2               \n\t"
+    "shra_r.ph       %[temp0],    %[temp0],    2               \n\t"
+    "srl             %[temp1],    %[temp1],    24              \n\t"
+    "sll             %[temp1],    %[temp1],    1               \n\t"
+    "raddu.w.qb      %[temp5],    %[temp5]                     \n\t"
+    "precr.qb.ph     %[temp9],    %[temp3],    %[temp9]        \n\t"
+    "precr.qb.ph     %[temp3],    %[temp0],    %[temp3]        \n\t"
+    "addu            %[temp1],    %[temp1],    %[temp5]        \n\t"
+    "shra_r.w        %[temp1],    %[temp1],    2               \n\t"
+    STORE_8_BYTES(temp9, temp3, 0, 0, 2, dst)
+    "prepend         %[temp9],    %[temp0],    8               \n\t"
+    "prepend         %[temp3],    %[temp1],    8               \n\t"
+    STORE_8_BYTES(temp9, temp3, 1, 0, 3, dst)
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
+      [temp9]"=&r"(temp9)
+    : [dst]"r"(dst)
+    : "memory"
+  );
+}
+
+//------------------------------------------------------------------------------
+// Chroma
+
+static void DC8uv(uint8_t* dst) {     // DC
+  int temp0, temp1, temp2, temp3, temp4;
+  int temp5, temp6, temp7, temp8, temp9;
+  __asm__ volatile (
+    LOAD_8_BYTES(temp0, temp1, -1, 4, -1, dst)
+    LOAD_4_BYTES(temp2, temp3, temp4, temp5, -1, 0, -1, 1, -1, 2, -1, 3, dst)
+    LOAD_4_BYTES(temp6, temp7, temp8, temp9, -1, 4, -1, 5, -1, 6, -1, 7, dst)
+    "raddu.w.qb   %[temp0],   %[temp0]                   \n\t"
+    "raddu.w.qb   %[temp1],   %[temp1]                   \n\t"
+    "addu         %[temp2],   %[temp2],    %[temp3]      \n\t"
+    "addu         %[temp4],   %[temp4],    %[temp5]      \n\t"
+    "addu         %[temp6],   %[temp6],    %[temp7]      \n\t"
+    "addu         %[temp8],   %[temp8],    %[temp9]      \n\t"
+    "addu         %[temp0],   %[temp0],    %[temp1]      \n\t"
+    "addu         %[temp2],   %[temp2],    %[temp4]      \n\t"
+    "addu         %[temp6],   %[temp6],    %[temp8]      \n\t"
+    "addu         %[temp0],   %[temp0],    %[temp2]      \n\t"
+    "addu         %[temp0],   %[temp0],    %[temp6]      \n\t"
+    "shra_r.w     %[temp0],   %[temp0],    4             \n\t"
+    "replv.qb     %[temp0],   %[temp0]                   \n\t"
+    STORE_8_BYTES(temp0, temp0, 0, 4, 0, dst)
+    STORE_8_BYTES(temp0, temp0, 1, 4, 1, dst)
+    STORE_8_BYTES(temp0, temp0, 2, 4, 2, dst)
+    STORE_8_BYTES(temp0, temp0, 3, 4, 3, dst)
+    STORE_8_BYTES(temp0, temp0, 4, 4, 4, dst)
+    STORE_8_BYTES(temp0, temp0, 5, 4, 5, dst)
+    STORE_8_BYTES(temp0, temp0, 6, 4, 6, dst)
+    STORE_8_BYTES(temp0, temp0, 7, 4, 7, dst)
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
+      [temp9]"=&r"(temp9)
+    : [dst]"r"(dst)
+    : "memory"
+  );
+}
+
+static void DC8uvNoLeft(uint8_t* dst) {   // DC with no left samples
+  int temp0, temp1;
+  __asm__ volatile (
+    LOAD_8_BYTES(temp0, temp1, -1, 4, -1, dst)
+    "raddu.w.qb   %[temp0],   %[temp0]                   \n\t"
+    "raddu.w.qb   %[temp1],   %[temp1]                   \n\t"
+    "addu         %[temp0],   %[temp0],    %[temp1]      \n\t"
+    "shra_r.w     %[temp0],   %[temp0],    3             \n\t"
+    "replv.qb     %[temp0],   %[temp0]                   \n\t"
+    STORE_8_BYTES(temp0, temp0, 0, 4, 0, dst)
+    STORE_8_BYTES(temp0, temp0, 1, 4, 1, dst)
+    STORE_8_BYTES(temp0, temp0, 2, 4, 2, dst)
+    STORE_8_BYTES(temp0, temp0, 3, 4, 3, dst)
+    STORE_8_BYTES(temp0, temp0, 4, 4, 4, dst)
+    STORE_8_BYTES(temp0, temp0, 5, 4, 5, dst)
+    STORE_8_BYTES(temp0, temp0, 6, 4, 6, dst)
+    STORE_8_BYTES(temp0, temp0, 7, 4, 7, dst)
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1)
+    : [dst]"r"(dst)
+    : "memory"
+  );
+}
+
+static void DC8uvNoTop(uint8_t* dst) {  // DC with no top samples
+  int temp0, temp1, temp2, temp3, temp4;
+  int temp5, temp6, temp7, temp8;
+  __asm__ volatile (
+    LOAD_4_BYTES(temp2, temp3, temp4, temp5, -1, 0, -1, 1, -1, 2, -1, 3, dst)
+    LOAD_4_BYTES(temp6, temp7, temp8, temp1, -1, 4, -1, 5, -1, 6, -1, 7, dst)
+    "addu         %[temp2],   %[temp2],    %[temp3]      \n\t"
+    "addu         %[temp4],   %[temp4],    %[temp5]      \n\t"
+    "addu         %[temp6],   %[temp6],    %[temp7]      \n\t"
+    "addu         %[temp8],   %[temp8],    %[temp1]      \n\t"
+    "addu         %[temp2],   %[temp2],    %[temp4]      \n\t"
+    "addu         %[temp6],   %[temp6],    %[temp8]      \n\t"
+    "addu         %[temp0],   %[temp6],    %[temp2]      \n\t"
+    "shra_r.w     %[temp0],   %[temp0],    3             \n\t"
+    "replv.qb     %[temp0],   %[temp0]                   \n\t"
+    STORE_8_BYTES(temp0, temp0, 0, 4, 0, dst)
+    STORE_8_BYTES(temp0, temp0, 1, 4, 1, dst)
+    STORE_8_BYTES(temp0, temp0, 2, 4, 2, dst)
+    STORE_8_BYTES(temp0, temp0, 3, 4, 3, dst)
+    STORE_8_BYTES(temp0, temp0, 4, 4, 4, dst)
+    STORE_8_BYTES(temp0, temp0, 5, 4, 5, dst)
+    STORE_8_BYTES(temp0, temp0, 6, 4, 6, dst)
+    STORE_8_BYTES(temp0, temp0, 7, 4, 7, dst)
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8)
+    : [dst]"r"(dst)
+    : "memory"
+  );
+}
+
+#undef LOAD_8_BYTES
+#undef STORE_8_BYTES
+#undef LOAD_4_BYTES
+
+#define CLIPPING(SIZE)                                                         \
+  "preceu.ph.qbl   %[temp2],   %[temp0]                  \n\t"                 \
+  "preceu.ph.qbr   %[temp0],   %[temp0]                  \n\t"                 \
+".if " #SIZE " == 8                                      \n\t"                 \
+  "preceu.ph.qbl   %[temp3],   %[temp1]                  \n\t"                 \
+  "preceu.ph.qbr   %[temp1],   %[temp1]                  \n\t"                 \
+".endif                                                  \n\t"                 \
+  "addu.ph         %[temp2],   %[temp2],   %[dst_1]      \n\t"                 \
+  "addu.ph         %[temp0],   %[temp0],   %[dst_1]      \n\t"                 \
+".if " #SIZE " == 8                                      \n\t"                 \
+  "addu.ph         %[temp3],   %[temp3],   %[dst_1]      \n\t"                 \
+  "addu.ph         %[temp1],   %[temp1],   %[dst_1]      \n\t"                 \
+".endif                                                  \n\t"                 \
+  "shll_s.ph       %[temp2],   %[temp2],   7             \n\t"                 \
+  "shll_s.ph       %[temp0],   %[temp0],   7             \n\t"                 \
+".if " #SIZE " == 8                                      \n\t"                 \
+  "shll_s.ph       %[temp3],   %[temp3],   7             \n\t"                 \
+  "shll_s.ph       %[temp1],   %[temp1],   7             \n\t"                 \
+".endif                                                  \n\t"                 \
+  "precrqu_s.qb.ph %[temp0],   %[temp2],   %[temp0]      \n\t"                 \
+".if " #SIZE " == 8                                      \n\t"                 \
+  "precrqu_s.qb.ph %[temp1],   %[temp3],   %[temp1]      \n\t"                 \
+".endif                                                  \n\t"
+
+
+#define CLIP_8B_TO_DST(DST, TOP, SIZE) do {                                    \
+  int dst_1 = ((int)(DST)[-1] << 16) + (DST)[-1];                              \
+  int temp0, temp1, temp2, temp3;                                              \
+  __asm__ volatile (                                                           \
+  ".if " #SIZE " < 8                                     \n\t"                 \
+    "ulw             %[temp0],   0(%[top])               \n\t"                 \
+    "subu.ph         %[dst_1],   %[dst_1],    %[top_1]   \n\t"                 \
+    CLIPPING(4)                                                                \
+    "usw             %[temp0],   0(%[dst])               \n\t"                 \
+  ".else                                                 \n\t"                 \
+    "ulw             %[temp0],   0(%[top])               \n\t"                 \
+    "ulw             %[temp1],   4(%[top])               \n\t"                 \
+    "subu.ph         %[dst_1],   %[dst_1],    %[top_1]   \n\t"                 \
+    CLIPPING(8)                                                                \
+    "usw             %[temp0],   0(%[dst])               \n\t"                 \
+    "usw             %[temp1],   4(%[dst])               \n\t"                 \
+  ".if " #SIZE " == 16                                   \n\t"                 \
+    "ulw             %[temp0],   8(%[top])               \n\t"                 \
+    "ulw             %[temp1],   12(%[top])              \n\t"                 \
+    CLIPPING(8)                                                                \
+    "usw             %[temp0],   8(%[dst])               \n\t"                 \
+    "usw             %[temp1],   12(%[dst])              \n\t"                 \
+  ".endif                                                \n\t"                 \
+  ".endif                                                \n\t"                 \
+    : [dst_1]"+&r"(dst_1), [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),           \
+      [temp2]"=&r"(temp2), [temp3]"=&r"(temp3)                                 \
+    : [top_1]"r"(top_1), [top]"r"((TOP)), [dst]"r"((DST))                      \
+    : "memory"                                                                 \
+  );                                                                           \
+} while (0)
+
+#define CLIP_TO_DST(DST, SIZE) do {                                            \
+  int y;                                                                       \
+  const uint8_t* top = (DST) - BPS;                                            \
+  const int top_1 = ((int)top[-1] << 16) + top[-1];                            \
+  for (y = 0; y < (SIZE); ++y) {                                               \
+    CLIP_8B_TO_DST((DST), top, (SIZE));                                        \
+    (DST) += BPS;                                                              \
+  }                                                                            \
+} while (0)
+
+#define TRUE_MOTION(DST, SIZE)                                                 \
+static void TrueMotion##SIZE(uint8_t* (DST)) {                                 \
+  CLIP_TO_DST((DST), (SIZE));                                                  \
+}
+
+TRUE_MOTION(dst, 4)
+TRUE_MOTION(dst, 8)
+TRUE_MOTION(dst, 16)
+
+#undef TRUE_MOTION
+#undef CLIP_TO_DST
+#undef CLIP_8B_TO_DST
+#undef CLIPPING
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8DspInitMIPSdspR2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitMIPSdspR2(void) {
+  VP8TransformDC = TransformDC;
+  VP8TransformAC3 = TransformAC3;
+  VP8Transform = TransformTwo;
+
+  VP8VFilter16 = VFilter16;
+  VP8HFilter16 = HFilter16;
+  VP8VFilter8 = VFilter8;
+  VP8HFilter8 = HFilter8;
+  VP8VFilter16i = VFilter16i;
+  VP8HFilter16i = HFilter16i;
+  VP8VFilter8i = VFilter8i;
+  VP8HFilter8i = HFilter8i;
+  VP8SimpleVFilter16 = SimpleVFilter16;
+  VP8SimpleHFilter16 = SimpleHFilter16;
+  VP8SimpleVFilter16i = SimpleVFilter16i;
+  VP8SimpleHFilter16i = SimpleHFilter16i;
+
+  VP8PredLuma4[0] = DC4;
+  VP8PredLuma4[1] = TrueMotion4;
+  VP8PredLuma4[2] = VE4;
+  VP8PredLuma4[4] = RD4;
+  VP8PredLuma4[6] = LD4;
+
+  VP8PredChroma8[0] = DC8uv;
+  VP8PredChroma8[1] = TrueMotion8;
+  VP8PredChroma8[4] = DC8uvNoTop;
+  VP8PredChroma8[5] = DC8uvNoLeft;
+
+  VP8PredLuma16[1] = TrueMotion16;
+}
+
+#else  // !WEBP_USE_MIPS_DSP_R2
+
+WEBP_DSP_INIT_STUB(VP8DspInitMIPSdspR2)
+
+#endif  // WEBP_USE_MIPS_DSP_R2
diff --git a/media/libwebp/dsp/dec_msa.c b/media/libwebp/dsp/dec_msa.c
new file mode 100644
index 0000000000..5b0b14cc93
--- /dev/null
+++ b/media/libwebp/dsp/dec_msa.c
@@ -0,0 +1,1020 @@
+// Copyright 2016 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MSA version of dsp functions
+//
+// Author(s):  Prashant Patil   (prashant.patil@imgtec.com)
+
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_MSA)
+
+#include "../dsp/msa_macro.h"
+
+//------------------------------------------------------------------------------
+// Transforms
+
+#define IDCT_1D_W(in0, in1, in2, in3, out0, out1, out2, out3) {  \
+  v4i32 a1_m, b1_m, c1_m, d1_m;                                  \
+  v4i32 c_tmp1_m, c_tmp2_m, d_tmp1_m, d_tmp2_m;                  \
+  const v4i32 cospi8sqrt2minus1 = __msa_fill_w(20091);           \
+  const v4i32 sinpi8sqrt2 = __msa_fill_w(35468);                 \
+                                                                 \
+  a1_m = in0 + in2;                                              \
+  b1_m = in0 - in2;                                              \
+  c_tmp1_m = (in1 * sinpi8sqrt2) >> 16;                          \
+  c_tmp2_m = in3 + ((in3 * cospi8sqrt2minus1) >> 16);            \
+  c1_m = c_tmp1_m - c_tmp2_m;                                    \
+  d_tmp1_m = in1 + ((in1 * cospi8sqrt2minus1) >> 16);            \
+  d_tmp2_m = (in3 * sinpi8sqrt2) >> 16;                          \
+  d1_m = d_tmp1_m + d_tmp2_m;                                    \
+  BUTTERFLY_4(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3);   \
+}
+#define MULT1(a) ((((a) * 20091) >> 16) + (a))
+#define MULT2(a) (((a) * 35468) >> 16)
+
+static void TransformOne(const int16_t* in, uint8_t* dst) {
+  v8i16 input0, input1;
+  v4i32 in0, in1, in2, in3, hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3;
+  v4i32 res0, res1, res2, res3;
+  const v16i8 zero = { 0 };
+  v16i8 dest0, dest1, dest2, dest3;
+
+  LD_SH2(in, 8, input0, input1);
+  UNPCK_SH_SW(input0, in0, in1);
+  UNPCK_SH_SW(input1, in2, in3);
+  IDCT_1D_W(in0, in1, in2, in3, hz0, hz1, hz2, hz3);
+  TRANSPOSE4x4_SW_SW(hz0, hz1, hz2, hz3, hz0, hz1, hz2, hz3);
+  IDCT_1D_W(hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3);
+  SRARI_W4_SW(vt0, vt1, vt2, vt3, 3);
+  TRANSPOSE4x4_SW_SW(vt0, vt1, vt2, vt3, vt0, vt1, vt2, vt3);
+  LD_SB4(dst, BPS, dest0, dest1, dest2, dest3);
+  ILVR_B4_SW(zero, dest0, zero, dest1, zero, dest2, zero, dest3,
+             res0, res1, res2, res3);
+  ILVR_H4_SW(zero, res0, zero, res1, zero, res2, zero, res3,
+             res0, res1, res2, res3);
+  ADD4(res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0, res1, res2, res3);
+  CLIP_SW4_0_255(res0, res1, res2, res3);
+  PCKEV_B2_SW(res0, res1, res2, res3, vt0, vt1);
+  res0 = (v4i32)__msa_pckev_b((v16i8)vt0, (v16i8)vt1);
+  ST4x4_UB(res0, res0, 3, 2, 1, 0, dst, BPS);
+}
+
+static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
+  TransformOne(in, dst);
+  if (do_two) {
+    TransformOne(in + 16, dst + 4);
+  }
+}
+
+static void TransformWHT(const int16_t* in, int16_t* out) {
+  v8i16 input0, input1;
+  const v8i16 mask0 = { 0, 1, 2, 3, 8, 9, 10, 11 };
+  const v8i16 mask1 = { 4, 5, 6, 7, 12, 13, 14, 15 };
+  const v8i16 mask2 = { 0, 4, 8, 12, 1, 5, 9, 13 };
+  const v8i16 mask3 = { 3, 7, 11, 15, 2, 6, 10, 14 };
+  v8i16 tmp0, tmp1, tmp2, tmp3;
+  v8i16 out0, out1;
+
+  LD_SH2(in, 8, input0, input1);
+  input1 = SLDI_SH(input1, input1, 8);
+  tmp0 = input0 + input1;
+  tmp1 = input0 - input1;
+  VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
+  out0 = tmp2 + tmp3;
+  out1 = tmp2 - tmp3;
+  VSHF_H2_SH(out0, out1, out0, out1, mask2, mask3, input0, input1);
+  tmp0 = input0 + input1;
+  tmp1 = input0 - input1;
+  VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
+  tmp0 = tmp2 + tmp3;
+  tmp1 = tmp2 - tmp3;
+  ADDVI_H2_SH(tmp0, 3, tmp1, 3, out0, out1);
+  SRAI_H2_SH(out0, out1, 3);
+  out[0] = __msa_copy_s_h(out0, 0);
+  out[16] = __msa_copy_s_h(out0, 4);
+  out[32] = __msa_copy_s_h(out1, 0);
+  out[48] = __msa_copy_s_h(out1, 4);
+  out[64] = __msa_copy_s_h(out0, 1);
+  out[80] = __msa_copy_s_h(out0, 5);
+  out[96] = __msa_copy_s_h(out1, 1);
+  out[112] = __msa_copy_s_h(out1, 5);
+  out[128] = __msa_copy_s_h(out0, 2);
+  out[144] = __msa_copy_s_h(out0, 6);
+  out[160] = __msa_copy_s_h(out1, 2);
+  out[176] = __msa_copy_s_h(out1, 6);
+  out[192] = __msa_copy_s_h(out0, 3);
+  out[208] = __msa_copy_s_h(out0, 7);
+  out[224] = __msa_copy_s_h(out1, 3);
+  out[240] = __msa_copy_s_h(out1, 7);
+}
+
+static void TransformDC(const int16_t* in, uint8_t* dst) {
+  const int DC = (in[0] + 4) >> 3;
+  const v8i16 tmp0 = __msa_fill_h(DC);
+  ADDBLK_ST4x4_UB(tmp0, tmp0, tmp0, tmp0, dst, BPS);
+}
+
+static void TransformAC3(const int16_t* in, uint8_t* dst) {
+  const int a = in[0] + 4;
+  const int c4 = MULT2(in[4]);
+  const int d4 = MULT1(in[4]);
+  const int in2 = MULT2(in[1]);
+  const int in3 = MULT1(in[1]);
+  v4i32 tmp0 = { 0 };
+  v4i32 out0 = __msa_fill_w(a + d4);
+  v4i32 out1 = __msa_fill_w(a + c4);
+  v4i32 out2 = __msa_fill_w(a - c4);
+  v4i32 out3 = __msa_fill_w(a - d4);
+  v4i32 res0, res1, res2, res3;
+  const v4i32 zero = { 0 };
+  v16u8 dest0, dest1, dest2, dest3;
+
+  INSERT_W4_SW(in3, in2, -in2, -in3, tmp0);
+  ADD4(out0, tmp0, out1, tmp0, out2, tmp0, out3, tmp0,
+       out0, out1, out2, out3);
+  SRAI_W4_SW(out0, out1, out2, out3, 3);
+  LD_UB4(dst, BPS, dest0, dest1, dest2, dest3);
+  ILVR_B4_SW(zero, dest0, zero, dest1, zero, dest2, zero, dest3,
+             res0, res1, res2, res3);
+  ILVR_H4_SW(zero, res0, zero, res1, zero, res2, zero, res3,
+             res0, res1, res2, res3);
+  ADD4(res0, out0, res1, out1, res2, out2, res3, out3, res0, res1, res2, res3);
+  CLIP_SW4_0_255(res0, res1, res2, res3);
+  PCKEV_B2_SW(res0, res1, res2, res3, out0, out1);
+  res0 = (v4i32)__msa_pckev_b((v16i8)out0, (v16i8)out1);
+  ST4x4_UB(res0, res0, 3, 2, 1, 0, dst, BPS);
+}
+
+//------------------------------------------------------------------------------
+// Edge filtering functions
+
+#define FLIP_SIGN2(in0, in1, out0, out1) {  \
+  out0 = (v16i8)__msa_xori_b(in0, 0x80);    \
+  out1 = (v16i8)__msa_xori_b(in1, 0x80);    \
+}
+
+#define FLIP_SIGN4(in0, in1, in2, in3, out0, out1, out2, out3) {  \
+  FLIP_SIGN2(in0, in1, out0, out1);                               \
+  FLIP_SIGN2(in2, in3, out2, out3);                               \
+}
+
+#define FILT_VAL(q0_m, p0_m, mask, filt) do {  \
+  v16i8 q0_sub_p0;                             \
+  q0_sub_p0 = __msa_subs_s_b(q0_m, p0_m);      \
+  filt = __msa_adds_s_b(filt, q0_sub_p0);      \
+  filt = __msa_adds_s_b(filt, q0_sub_p0);      \
+  filt = __msa_adds_s_b(filt, q0_sub_p0);      \
+  filt = filt & mask;                          \
+} while (0)
+
+#define FILT2(q_m, p_m, q, p) do {            \
+  u_r = SRAI_H(temp1, 7);                     \
+  u_r = __msa_sat_s_h(u_r, 7);                \
+  u_l = SRAI_H(temp3, 7);                     \
+  u_l = __msa_sat_s_h(u_l, 7);                \
+  u = __msa_pckev_b((v16i8)u_l, (v16i8)u_r);  \
+  q_m = __msa_subs_s_b(q_m, u);               \
+  p_m = __msa_adds_s_b(p_m, u);               \
+  q = __msa_xori_b((v16u8)q_m, 0x80);         \
+  p = __msa_xori_b((v16u8)p_m, 0x80);         \
+} while (0)
+
+#define LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev) do {  \
+  v16i8 p1_m, p0_m, q0_m, q1_m;                         \
+  v16i8 filt, t1, t2;                                   \
+  const v16i8 cnst4b = __msa_ldi_b(4);                  \
+  const v16i8 cnst3b = __msa_ldi_b(3);                  \
+                                                        \
+  FLIP_SIGN4(p1, p0, q0, q1, p1_m, p0_m, q0_m, q1_m);   \
+  filt = __msa_subs_s_b(p1_m, q1_m);                    \
+  filt = filt & hev;                                    \
+  FILT_VAL(q0_m, p0_m, mask, filt);                     \
+  t1 = __msa_adds_s_b(filt, cnst4b);                    \
+  t1 = SRAI_B(t1, 3);                                   \
+  t2 = __msa_adds_s_b(filt, cnst3b);                    \
+  t2 = SRAI_B(t2, 3);                                   \
+  q0_m = __msa_subs_s_b(q0_m, t1);                      \
+  q0 = __msa_xori_b((v16u8)q0_m, 0x80);                 \
+  p0_m = __msa_adds_s_b(p0_m, t2);                      \
+  p0 = __msa_xori_b((v16u8)p0_m, 0x80);                 \
+  filt = __msa_srari_b(t1, 1);                          \
+  hev = __msa_xori_b(hev, 0xff);                        \
+  filt = filt & hev;                                    \
+  q1_m = __msa_subs_s_b(q1_m, filt);                    \
+  q1 = __msa_xori_b((v16u8)q1_m, 0x80);                 \
+  p1_m = __msa_adds_s_b(p1_m, filt);                    \
+  p1 = __msa_xori_b((v16u8)p1_m, 0x80);                 \
+} while (0)
+
+#define LPF_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev) do {  \
+  v16i8 p2_m, p1_m, p0_m, q2_m, q1_m, q0_m;                   \
+  v16i8 u, filt, t1, t2, filt_sign;                           \
+  v8i16 filt_r, filt_l, u_r, u_l;                             \
+  v8i16 temp0, temp1, temp2, temp3;                           \
+  const v16i8 cnst4b = __msa_ldi_b(4);                        \
+  const v16i8 cnst3b = __msa_ldi_b(3);                        \
+  const v8i16 cnst9h = __msa_ldi_h(9);                        \
+  const v8i16 cnst63h = __msa_ldi_h(63);                      \
+                                                              \
+  FLIP_SIGN4(p1, p0, q0, q1, p1_m, p0_m, q0_m, q1_m);         \
+  filt = __msa_subs_s_b(p1_m, q1_m);                          \
+  FILT_VAL(q0_m, p0_m, mask, filt);                           \
+  FLIP_SIGN2(p2, q2, p2_m, q2_m);                             \
+  t2 = filt & hev;                                            \
+  /* filt_val &= ~hev */                                      \
+  hev = __msa_xori_b(hev, 0xff);                              \
+  filt = filt & hev;                                          \
+  t1 = __msa_adds_s_b(t2, cnst4b);                            \
+  t1 = SRAI_B(t1, 3);                                         \
+  t2 = __msa_adds_s_b(t2, cnst3b);                            \
+  t2 = SRAI_B(t2, 3);                                         \
+  q0_m = __msa_subs_s_b(q0_m, t1);                            \
+  p0_m = __msa_adds_s_b(p0_m, t2);                            \
+  filt_sign = __msa_clti_s_b(filt, 0);                        \
+  ILVRL_B2_SH(filt_sign, filt, filt_r, filt_l);               \
+  /* update q2/p2 */                                          \
+  temp0 = filt_r * cnst9h;                                    \
+  temp1 = temp0 + cnst63h;                                    \
+  temp2 = filt_l * cnst9h;                                    \
+  temp3 = temp2 + cnst63h;                                    \
+  FILT2(q2_m, p2_m, q2, p2);                                  \
+  /* update q1/p1 */                                          \
+  temp1 = temp1 + temp0;                                      \
+  temp3 = temp3 + temp2;                                      \
+  FILT2(q1_m, p1_m, q1, p1);                                  \
+  /* update q0/p0 */                                          \
+  temp1 = temp1 + temp0;                                      \
+  temp3 = temp3 + temp2;                                      \
+  FILT2(q0_m, p0_m, q0, p0);                                  \
+} while (0)
+
+#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in,                 \
+                     q0_in, q1_in, q2_in, q3_in,                 \
+                     limit_in, b_limit_in, thresh_in,            \
+                     hev_out, mask_out) do {                     \
+  v16u8 p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m;  \
+  v16u8 p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m;  \
+  v16u8 flat_out;                                                \
+                                                                 \
+  /* absolute subtraction of pixel values */                     \
+  p3_asub_p2_m = __msa_asub_u_b(p3_in, p2_in);                   \
+  p2_asub_p1_m = __msa_asub_u_b(p2_in, p1_in);                   \
+  p1_asub_p0_m = __msa_asub_u_b(p1_in, p0_in);                   \
+  q1_asub_q0_m = __msa_asub_u_b(q1_in, q0_in);                   \
+  q2_asub_q1_m = __msa_asub_u_b(q2_in, q1_in);                   \
+  q3_asub_q2_m = __msa_asub_u_b(q3_in, q2_in);                   \
+  p0_asub_q0_m = __msa_asub_u_b(p0_in, q0_in);                   \
+  p1_asub_q1_m = __msa_asub_u_b(p1_in, q1_in);                   \
+  /* calculation of hev */                                       \
+  flat_out = __msa_max_u_b(p1_asub_p0_m, q1_asub_q0_m);          \
+  hev_out = (thresh_in < flat_out);                              \
+  /* calculation of mask */                                      \
+  p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p0_asub_q0_m);     \
+  p1_asub_q1_m = SRAI_B(p1_asub_q1_m, 1);                        \
+  p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p1_asub_q1_m);     \
+  mask_out = (b_limit_in < p0_asub_q0_m);                        \
+  mask_out = __msa_max_u_b(flat_out, mask_out);                  \
+  p3_asub_p2_m = __msa_max_u_b(p3_asub_p2_m, p2_asub_p1_m);      \
+  mask_out = __msa_max_u_b(p3_asub_p2_m, mask_out);              \
+  q2_asub_q1_m = __msa_max_u_b(q2_asub_q1_m, q3_asub_q2_m);      \
+  mask_out = __msa_max_u_b(q2_asub_q1_m, mask_out);              \
+  mask_out = (limit_in < mask_out);                              \
+  mask_out = __msa_xori_b(mask_out, 0xff);                       \
+} while (0)
+
+#define ST6x1_UB(in0, in0_idx, in1, in1_idx, pdst, stride) do { \
+  const uint16_t tmp0_h = __msa_copy_s_h((v8i16)in1, in1_idx);  \
+  const uint32_t tmp0_w = __msa_copy_s_w((v4i32)in0, in0_idx);  \
+  SW(tmp0_w, pdst);                                             \
+  SH(tmp0_h, pdst + stride);                                    \
+} while (0)
+
+#define ST6x4_UB(in0, start_in0_idx, in1, start_in1_idx, pdst, stride) do { \
+  uint8_t* ptmp1 = (uint8_t*)pdst;                                          \
+  ST6x1_UB(in0, start_in0_idx, in1, start_in1_idx, ptmp1, 4);               \
+  ptmp1 += stride;                                                          \
+  ST6x1_UB(in0, start_in0_idx + 1, in1, start_in1_idx + 1, ptmp1, 4);       \
+  ptmp1 += stride;                                                          \
+  ST6x1_UB(in0, start_in0_idx + 2, in1, start_in1_idx + 2, ptmp1, 4);       \
+  ptmp1 += stride;                                                          \
+  ST6x1_UB(in0, start_in0_idx + 3, in1, start_in1_idx + 3, ptmp1, 4);       \
+} while (0)
+
+#define LPF_SIMPLE_FILT(p1_in, p0_in, q0_in, q1_in, mask) do {       \
+    v16i8 p1_m, p0_m, q0_m, q1_m, filt, filt1, filt2;                \
+    const v16i8 cnst4b = __msa_ldi_b(4);                             \
+    const v16i8 cnst3b =  __msa_ldi_b(3);                            \
+                                                                     \
+    FLIP_SIGN4(p1_in, p0_in, q0_in, q1_in, p1_m, p0_m, q0_m, q1_m);  \
+    filt = __msa_subs_s_b(p1_m, q1_m);                               \
+    FILT_VAL(q0_m, p0_m, mask, filt);                                \
+    filt1 = __msa_adds_s_b(filt, cnst4b);                            \
+    filt1 = SRAI_B(filt1, 3);                                        \
+    filt2 = __msa_adds_s_b(filt, cnst3b);                            \
+    filt2 = SRAI_B(filt2, 3);                                        \
+    q0_m = __msa_subs_s_b(q0_m, filt1);                              \
+    p0_m = __msa_adds_s_b(p0_m, filt2);                              \
+    q0_in = __msa_xori_b((v16u8)q0_m, 0x80);                         \
+    p0_in = __msa_xori_b((v16u8)p0_m, 0x80);                         \
+} while (0)
+
+#define LPF_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask) do {    \
+    v16u8 p1_a_sub_q1, p0_a_sub_q0;                            \
+                                                               \
+    p0_a_sub_q0 = __msa_asub_u_b(p0, q0);                      \
+    p1_a_sub_q1 = __msa_asub_u_b(p1, q1);                      \
+    p1_a_sub_q1 = (v16u8)__msa_srli_b((v16i8)p1_a_sub_q1, 1);  \
+    p0_a_sub_q0 = __msa_adds_u_b(p0_a_sub_q0, p0_a_sub_q0);    \
+    mask = __msa_adds_u_b(p0_a_sub_q0, p1_a_sub_q1);           \
+    mask = (mask <= b_limit);                                  \
+} while (0)
+
+static void VFilter16(uint8_t* src, int stride,
+                      int b_limit_in, int limit_in, int thresh_in) {
+  uint8_t* ptemp = src - 4 * stride;
+  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+  v16u8 mask, hev;
+  const v16u8 thresh = (v16u8)__msa_fill_b(thresh_in);
+  const v16u8 limit = (v16u8)__msa_fill_b(limit_in);
+  const v16u8 b_limit = (v16u8)__msa_fill_b(b_limit_in);
+
+  LD_UB8(ptemp, stride, p3, p2, p1, p0, q0, q1, q2, q3);
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+               hev, mask);
+  LPF_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
+  ptemp = src - 3 * stride;
+  ST_UB4(p2, p1, p0, q0, ptemp, stride);
+  ptemp += (4 * stride);
+  ST_UB2(q1, q2, ptemp, stride);
+}
+
+static void HFilter16(uint8_t* src, int stride,
+                      int b_limit_in, int limit_in, int thresh_in) {
+  uint8_t* ptmp  = src - 4;
+  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+  v16u8 mask, hev;
+  v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
+  v16u8 row9, row10, row11, row12, row13, row14, row15;
+  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  const v16u8 b_limit = (v16u8)__msa_fill_b(b_limit_in);
+  const v16u8 limit = (v16u8)__msa_fill_b(limit_in);
+  const v16u8 thresh = (v16u8)__msa_fill_b(thresh_in);
+
+  LD_UB8(ptmp, stride, row0, row1, row2, row3, row4, row5, row6, row7);
+  ptmp += (8 * stride);
+  LD_UB8(ptmp, stride, row8, row9, row10, row11, row12, row13, row14, row15);
+  TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
+                      row8, row9, row10, row11, row12, row13, row14, row15,
+                      p3, p2, p1, p0, q0, q1, q2, q3);
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+               hev, mask);
+  LPF_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
+  ILVR_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
+  ILVRL_H2_SH(tmp1, tmp0, tmp3, tmp4);
+  ILVL_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
+  ILVRL_H2_SH(tmp1, tmp0, tmp6, tmp7);
+  ILVRL_B2_SH(q2, q1, tmp2, tmp5);
+  ptmp = src - 3;
+  ST6x1_UB(tmp3, 0, tmp2, 0, ptmp, 4);
+  ptmp += stride;
+  ST6x1_UB(tmp3, 1, tmp2, 1, ptmp, 4);
+  ptmp += stride;
+  ST6x1_UB(tmp3, 2, tmp2, 2, ptmp, 4);
+  ptmp += stride;
+  ST6x1_UB(tmp3, 3, tmp2, 3, ptmp, 4);
+  ptmp += stride;
+  ST6x1_UB(tmp4, 0, tmp2, 4, ptmp, 4);
+  ptmp += stride;
+  ST6x1_UB(tmp4, 1, tmp2, 5, ptmp, 4);
+  ptmp += stride;
+  ST6x1_UB(tmp4, 2, tmp2, 6, ptmp, 4);
+  ptmp += stride;
+  ST6x1_UB(tmp4, 3, tmp2, 7, ptmp, 4);
+  ptmp += stride;
+  ST6x1_UB(tmp6, 0, tmp5, 0, ptmp, 4);
+  ptmp += stride;
+  ST6x1_UB(tmp6, 1, tmp5, 1, ptmp, 4);
+  ptmp += stride;
+  ST6x1_UB(tmp6, 2, tmp5, 2, ptmp, 4);
+  ptmp += stride;
+  ST6x1_UB(tmp6, 3, tmp5, 3, ptmp, 4);
+  ptmp += stride;
+  ST6x1_UB(tmp7, 0, tmp5, 4, ptmp, 4);
+  ptmp += stride;
+  ST6x1_UB(tmp7, 1, tmp5, 5, ptmp, 4);
+  ptmp += stride;
+  ST6x1_UB(tmp7, 2, tmp5, 6, ptmp, 4);
+  ptmp += stride;
+  ST6x1_UB(tmp7, 3, tmp5, 7, ptmp, 4);
+}
+
+// on three inner edges
+static void VFilterHorEdge16i(uint8_t* src, int stride,
+                              int b_limit, int limit, int thresh) {
+  v16u8 mask, hev;
+  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+  const v16u8 thresh0 = (v16u8)__msa_fill_b(thresh);
+  const v16u8 b_limit0 = (v16u8)__msa_fill_b(b_limit);
+  const v16u8 limit0 = (v16u8)__msa_fill_b(limit);
+
+  LD_UB8((src - 4 * stride), stride, p3, p2, p1, p0, q0, q1, q2, q3);
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
+               hev, mask);
+  LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
+  ST_UB4(p1, p0, q0, q1, (src - 2 * stride), stride);
+}
+
+static void VFilter16i(uint8_t* src_y, int stride,
+                       int b_limit, int limit, int thresh) {
+  VFilterHorEdge16i(src_y +  4 * stride, stride, b_limit, limit, thresh);
+  VFilterHorEdge16i(src_y +  8 * stride, stride, b_limit, limit, thresh);
+  VFilterHorEdge16i(src_y + 12 * stride, stride, b_limit, limit, thresh);
+}
+
+static void HFilterVertEdge16i(uint8_t* src, int stride,
+                               int b_limit, int limit, int thresh) {
+  v16u8 mask, hev;
+  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+  v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
+  v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
+  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+  const v16u8 thresh0 = (v16u8)__msa_fill_b(thresh);
+  const v16u8 b_limit0 = (v16u8)__msa_fill_b(b_limit);
+  const v16u8 limit0 = (v16u8)__msa_fill_b(limit);
+
+  LD_UB8(src - 4, stride, row0, row1, row2, row3, row4, row5, row6, row7);
+  LD_UB8(src - 4 + (8 * stride), stride,
+         row8, row9, row10, row11, row12, row13, row14, row15);
+  TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
+                      row8, row9, row10, row11, row12, row13, row14, row15,
+                      p3, p2, p1, p0, q0, q1, q2, q3);
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
+               hev, mask);
+  LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
+  ILVR_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
+  ILVRL_H2_SH(tmp1, tmp0, tmp2, tmp3);
+  ILVL_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
+  ILVRL_H2_SH(tmp1, tmp0, tmp4, tmp5);
+  src -= 2;
+  ST4x8_UB(tmp2, tmp3, src, stride);
+  src += (8 * stride);
+  ST4x8_UB(tmp4, tmp5, src, stride);
+}
+
+static void HFilter16i(uint8_t* src_y, int stride,
+                       int b_limit, int limit, int thresh) {
+  HFilterVertEdge16i(src_y +  4, stride, b_limit, limit, thresh);
+  HFilterVertEdge16i(src_y +  8, stride, b_limit, limit, thresh);
+  HFilterVertEdge16i(src_y + 12, stride, b_limit, limit, thresh);
+}
+
+// 8-pixels wide variants, for chroma filtering
+static void VFilter8(uint8_t* src_u, uint8_t* src_v, int stride,
+                     int b_limit_in, int limit_in, int thresh_in) {
+  uint8_t* ptmp_src_u = src_u - 4 * stride;
+  uint8_t* ptmp_src_v = src_v - 4 * stride;
+  uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
+  v16u8 p3, p2, p1, p0, q3, q2, q1, q0, mask, hev;
+  v16u8 p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u;
+  v16u8 p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v;
+  const v16u8 b_limit = (v16u8)__msa_fill_b(b_limit_in);
+  const v16u8 limit = (v16u8)__msa_fill_b(limit_in);
+  const v16u8 thresh = (v16u8)__msa_fill_b(thresh_in);
+
+  LD_UB8(ptmp_src_u, stride, p3_u, p2_u, p1_u, p0_u, q0_u, q1_u, q2_u, q3_u);
+  LD_UB8(ptmp_src_v, stride, p3_v, p2_v, p1_v, p0_v, q0_v, q1_v, q2_v, q3_v);
+  ILVR_D4_UB(p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3, p2, p1, p0);
+  ILVR_D4_UB(q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0, q1, q2, q3);
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+               hev, mask);
+  LPF_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
+  p2_d = __msa_copy_s_d((v2i64)p2, 0);
+  p1_d = __msa_copy_s_d((v2i64)p1, 0);
+  p0_d = __msa_copy_s_d((v2i64)p0, 0);
+  q0_d = __msa_copy_s_d((v2i64)q0, 0);
+  q1_d = __msa_copy_s_d((v2i64)q1, 0);
+  q2_d = __msa_copy_s_d((v2i64)q2, 0);
+  ptmp_src_u += stride;
+  SD4(p2_d, p1_d, p0_d, q0_d, ptmp_src_u, stride);
+  ptmp_src_u += (4 * stride);
+  SD(q1_d, ptmp_src_u);
+  ptmp_src_u += stride;
+  SD(q2_d, ptmp_src_u);
+  p2_d = __msa_copy_s_d((v2i64)p2, 1);
+  p1_d = __msa_copy_s_d((v2i64)p1, 1);
+  p0_d = __msa_copy_s_d((v2i64)p0, 1);
+  q0_d = __msa_copy_s_d((v2i64)q0, 1);
+  q1_d = __msa_copy_s_d((v2i64)q1, 1);
+  q2_d = __msa_copy_s_d((v2i64)q2, 1);
+  ptmp_src_v += stride;
+  SD4(p2_d, p1_d, p0_d, q0_d, ptmp_src_v, stride);
+  ptmp_src_v += (4 * stride);
+  SD(q1_d, ptmp_src_v);
+  ptmp_src_v += stride;
+  SD(q2_d, ptmp_src_v);
+}
+
+static void HFilter8(uint8_t* src_u, uint8_t* src_v, int stride,
+                     int b_limit_in, int limit_in, int thresh_in) {
+  uint8_t* ptmp_src_u = src_u - 4;
+  uint8_t* ptmp_src_v = src_v - 4;
+  v16u8 p3, p2, p1, p0, q3, q2, q1, q0, mask, hev;
+  v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
+  v16u8 row9, row10, row11, row12, row13, row14, row15;
+  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  const v16u8 b_limit = (v16u8)__msa_fill_b(b_limit_in);
+  const v16u8 limit = (v16u8)__msa_fill_b(limit_in);
+  const v16u8 thresh = (v16u8)__msa_fill_b(thresh_in);
+
+  LD_UB8(ptmp_src_u, stride, row0, row1, row2, row3, row4, row5, row6, row7);
+  LD_UB8(ptmp_src_v, stride,
+         row8, row9, row10, row11, row12, row13, row14, row15);
+  TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
+                      row8, row9, row10, row11, row12, row13, row14, row15,
+                      p3, p2, p1, p0, q0, q1, q2, q3);
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+               hev, mask);
+  LPF_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
+  ILVR_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
+  ILVRL_H2_SH(tmp1, tmp0, tmp3, tmp4);
+  ILVL_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
+  ILVRL_H2_SH(tmp1, tmp0, tmp6, tmp7);
+  ILVRL_B2_SH(q2, q1, tmp2, tmp5);
+  ptmp_src_u += 1;
+  ST6x4_UB(tmp3, 0, tmp2, 0, ptmp_src_u, stride);
+  ptmp_src_u += 4 * stride;
+  ST6x4_UB(tmp4, 0, tmp2, 4, ptmp_src_u, stride);
+  ptmp_src_v += 1;
+  ST6x4_UB(tmp6, 0, tmp5, 0, ptmp_src_v, stride);
+  ptmp_src_v += 4 * stride;
+  ST6x4_UB(tmp7, 0, tmp5, 4, ptmp_src_v, stride);
+}
+
+static void VFilter8i(uint8_t* src_u, uint8_t* src_v, int stride,
+                      int b_limit_in, int limit_in, int thresh_in) {
+  uint64_t p1_d, p0_d, q0_d, q1_d;
+  v16u8 p3, p2, p1, p0, q3, q2, q1, q0, mask, hev;
+  v16u8 p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u;
+  v16u8 p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v;
+  const v16u8 thresh = (v16u8)__msa_fill_b(thresh_in);
+  const v16u8 limit = (v16u8)__msa_fill_b(limit_in);
+  const v16u8 b_limit = (v16u8)__msa_fill_b(b_limit_in);
+
+  LD_UB8(src_u, stride, p3_u, p2_u, p1_u, p0_u, q0_u, q1_u, q2_u, q3_u);
+  src_u += (5 * stride);
+  LD_UB8(src_v, stride, p3_v, p2_v, p1_v, p0_v, q0_v, q1_v, q2_v, q3_v);
+  src_v += (5 * stride);
+  ILVR_D4_UB(p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3, p2, p1, p0);
+  ILVR_D4_UB(q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0, q1, q2, q3);
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+               hev, mask);
+  LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
+  p1_d = __msa_copy_s_d((v2i64)p1, 0);
+  p0_d = __msa_copy_s_d((v2i64)p0, 0);
+  q0_d = __msa_copy_s_d((v2i64)q0, 0);
+  q1_d = __msa_copy_s_d((v2i64)q1, 0);
+  SD4(q1_d, q0_d, p0_d, p1_d, src_u, -stride);
+  p1_d = __msa_copy_s_d((v2i64)p1, 1);
+  p0_d = __msa_copy_s_d((v2i64)p0, 1);
+  q0_d = __msa_copy_s_d((v2i64)q0, 1);
+  q1_d = __msa_copy_s_d((v2i64)q1, 1);
+  SD4(q1_d, q0_d, p0_d, p1_d, src_v, -stride);
+}
+
+static void HFilter8i(uint8_t* src_u, uint8_t* src_v, int stride,
+                      int b_limit_in, int limit_in, int thresh_in) {
+  v16u8 p3, p2, p1, p0, q3, q2, q1, q0, mask, hev;
+  v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
+  v16u8 row9, row10, row11, row12, row13, row14, row15;
+  v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+  const v16u8 thresh = (v16u8)__msa_fill_b(thresh_in);
+  const v16u8 limit = (v16u8)__msa_fill_b(limit_in);
+  const v16u8 b_limit = (v16u8)__msa_fill_b(b_limit_in);
+
+  LD_UB8(src_u, stride, row0, row1, row2, row3, row4, row5, row6, row7);
+  LD_UB8(src_v, stride,
+         row8, row9, row10, row11, row12, row13, row14, row15);
+  TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
+                      row8, row9, row10, row11, row12, row13, row14, row15,
+                      p3, p2, p1, p0, q0, q1, q2, q3);
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+               hev, mask);
+  LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
+  ILVR_B2_SW(p0, p1, q1, q0, tmp0, tmp1);
+  ILVRL_H2_SW(tmp1, tmp0, tmp2, tmp3);
+  ILVL_B2_SW(p0, p1, q1, q0, tmp0, tmp1);
+  ILVRL_H2_SW(tmp1, tmp0, tmp4, tmp5);
+  src_u += 2;
+  ST4x4_UB(tmp2, tmp2, 0, 1, 2, 3, src_u, stride);
+  src_u += 4 * stride;
+  ST4x4_UB(tmp3, tmp3, 0, 1, 2, 3, src_u, stride);
+  src_v += 2;
+  ST4x4_UB(tmp4, tmp4, 0, 1, 2, 3, src_v, stride);
+  src_v += 4 * stride;
+  ST4x4_UB(tmp5, tmp5, 0, 1, 2, 3, src_v, stride);
+}
+
+static void SimpleVFilter16(uint8_t* src, int stride, int b_limit_in) {
+  v16u8 p1, p0, q1, q0, mask;
+  const v16u8 b_limit = (v16u8)__msa_fill_b(b_limit_in);
+
+  LD_UB4(src - 2 * stride, stride, p1, p0, q0, q1);
+  LPF_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask);
+  LPF_SIMPLE_FILT(p1, p0, q0, q1, mask);
+  ST_UB2(p0, q0, src - stride, stride);
+}
+
+static void SimpleHFilter16(uint8_t* src, int stride, int b_limit_in) {
+  v16u8 p1, p0, q1, q0, mask, row0, row1, row2, row3, row4, row5, row6, row7;
+  v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
+  v8i16 tmp0, tmp1;
+  const v16u8 b_limit = (v16u8)__msa_fill_b(b_limit_in);
+  uint8_t* ptemp_src = src - 2;
+
+  LD_UB8(ptemp_src, stride, row0, row1, row2, row3, row4, row5, row6, row7);
+  LD_UB8(ptemp_src + 8 * stride, stride,
+         row8, row9, row10, row11, row12, row13, row14, row15);
+  TRANSPOSE16x4_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
+                      row8, row9, row10, row11, row12, row13, row14, row15,
+                      p1, p0, q0, q1);
+  LPF_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask);
+  LPF_SIMPLE_FILT(p1, p0, q0, q1, mask);
+  ILVRL_B2_SH(q0, p0, tmp1, tmp0);
+  ptemp_src += 1;
+  ST2x4_UB(tmp1, 0, ptemp_src, stride);
+  ptemp_src += 4 * stride;
+  ST2x4_UB(tmp1, 4, ptemp_src, stride);
+  ptemp_src += 4 * stride;
+  ST2x4_UB(tmp0, 0, ptemp_src, stride);
+  ptemp_src += 4 * stride;
+  ST2x4_UB(tmp0, 4, ptemp_src, stride);
+  ptemp_src += 4 * stride;
+}
+
+static void SimpleVFilter16i(uint8_t* src_y, int stride, int b_limit_in) {
+  SimpleVFilter16(src_y +  4 * stride, stride, b_limit_in);
+  SimpleVFilter16(src_y +  8 * stride, stride, b_limit_in);
+  SimpleVFilter16(src_y + 12 * stride, stride, b_limit_in);
+}
+
+static void SimpleHFilter16i(uint8_t* src_y, int stride, int b_limit_in) {
+  SimpleHFilter16(src_y +  4, stride, b_limit_in);
+  SimpleHFilter16(src_y +  8, stride, b_limit_in);
+  SimpleHFilter16(src_y + 12, stride, b_limit_in);
+}
+
+//------------------------------------------------------------------------------
+// Intra predictions
+//------------------------------------------------------------------------------
+
+// 4x4
+
+static void DC4(uint8_t* dst) {   // DC
+  uint32_t dc = 4;
+  int i;
+  for (i = 0; i < 4; ++i) dc += dst[i - BPS] + dst[-1 + i * BPS];
+  dc >>= 3;
+  dc = dc | (dc << 8) | (dc << 16) | (dc << 24);
+  SW4(dc, dc, dc, dc, dst, BPS);
+}
+
+static void TM4(uint8_t* dst) {
+  const uint8_t* const ptemp = dst - BPS - 1;
+  v8i16 T, d, r0, r1, r2, r3;
+  const v16i8 zero = { 0 };
+  const v8i16 TL = (v8i16)__msa_fill_h(ptemp[0 * BPS]);
+  const v8i16 L0 = (v8i16)__msa_fill_h(ptemp[1 * BPS]);
+  const v8i16 L1 = (v8i16)__msa_fill_h(ptemp[2 * BPS]);
+  const v8i16 L2 = (v8i16)__msa_fill_h(ptemp[3 * BPS]);
+  const v8i16 L3 = (v8i16)__msa_fill_h(ptemp[4 * BPS]);
+  const v16u8 T1 = LD_UB(ptemp + 1);
+
+  T  = (v8i16)__msa_ilvr_b(zero, (v16i8)T1);
+  d = T - TL;
+  ADD4(d, L0, d, L1, d, L2, d, L3, r0, r1, r2, r3);
+  CLIP_SH4_0_255(r0, r1, r2, r3);
+  PCKEV_ST4x4_UB(r0, r1, r2, r3, dst, BPS);
+}
+
+static void VE4(uint8_t* dst) {    // vertical
+  const uint8_t* const ptop = dst - BPS - 1;
+  const uint32_t val0 = LW(ptop + 0);
+  const uint32_t val1 = LW(ptop + 4);
+  uint32_t out;
+  v16u8 A = { 0 }, B, C, AC, B2, R;
+
+  INSERT_W2_UB(val0, val1, A);
+  B = SLDI_UB(A, A, 1);
+  C = SLDI_UB(A, A, 2);
+  AC = __msa_ave_u_b(A, C);
+  B2 = __msa_ave_u_b(B, B);
+  R = __msa_aver_u_b(AC, B2);
+  out = __msa_copy_s_w((v4i32)R, 0);
+  SW4(out, out, out, out, dst, BPS);
+}
+
+static void RD4(uint8_t* dst) {   // Down-right
+  const uint8_t* const ptop = dst - 1 - BPS;
+  uint32_t val0 = LW(ptop + 0);
+  uint32_t val1 = LW(ptop + 4);
+  uint32_t val2, val3;
+  v16u8 A, B, C, AC, B2, R, A1 = { 0 };
+
+  INSERT_W2_UB(val0, val1, A1);
+  A = SLDI_UB(A1, A1, 12);
+  A = (v16u8)__msa_insert_b((v16i8)A, 3, ptop[1 * BPS]);
+  A = (v16u8)__msa_insert_b((v16i8)A, 2, ptop[2 * BPS]);
+  A = (v16u8)__msa_insert_b((v16i8)A, 1, ptop[3 * BPS]);
+  A = (v16u8)__msa_insert_b((v16i8)A, 0, ptop[4 * BPS]);
+  B = SLDI_UB(A, A, 1);
+  C = SLDI_UB(A, A, 2);
+  AC = __msa_ave_u_b(A, C);
+  B2 = __msa_ave_u_b(B, B);
+  R = __msa_aver_u_b(AC, B2);
+  val3 = __msa_copy_s_w((v4i32)R, 0);
+  R = SLDI_UB(R, R, 1);
+  val2 = __msa_copy_s_w((v4i32)R, 0);
+  R = SLDI_UB(R, R, 1);
+  val1 = __msa_copy_s_w((v4i32)R, 0);
+  R = SLDI_UB(R, R, 1);
+  val0 = __msa_copy_s_w((v4i32)R, 0);
+  SW4(val0, val1, val2, val3, dst, BPS);
+}
+
+static void LD4(uint8_t* dst) {   // Down-Left
+  const uint8_t* const ptop = dst - BPS;
+  uint32_t val0 = LW(ptop + 0);
+  uint32_t val1 = LW(ptop + 4);
+  uint32_t val2, val3;
+  v16u8 A = { 0 }, B, C, AC, B2, R;
+
+  INSERT_W2_UB(val0, val1, A);
+  B = SLDI_UB(A, A, 1);
+  C = SLDI_UB(A, A, 2);
+  C = (v16u8)__msa_insert_b((v16i8)C, 6, ptop[7]);
+  AC = __msa_ave_u_b(A, C);
+  B2 = __msa_ave_u_b(B, B);
+  R = __msa_aver_u_b(AC, B2);
+  val0 = __msa_copy_s_w((v4i32)R, 0);
+  R = SLDI_UB(R, R, 1);
+  val1 = __msa_copy_s_w((v4i32)R, 0);
+  R = SLDI_UB(R, R, 1);
+  val2 = __msa_copy_s_w((v4i32)R, 0);
+  R = SLDI_UB(R, R, 1);
+  val3 = __msa_copy_s_w((v4i32)R, 0);
+  SW4(val0, val1, val2, val3, dst, BPS);
+}
+
+// 16x16
+
+static void DC16(uint8_t* dst) {   // DC
+  uint32_t dc = 16;
+  int i;
+  const v16u8 rtop = LD_UB(dst - BPS);
+  const v8u16 dctop = __msa_hadd_u_h(rtop, rtop);
+  v16u8 out;
+
+  for (i = 0; i < 16; ++i) {
+    dc += dst[-1 + i * BPS];
+  }
+  dc += HADD_UH_U32(dctop);
+  out = (v16u8)__msa_fill_b(dc >> 5);
+  ST_UB8(out, out, out, out, out, out, out, out, dst, BPS);
+  ST_UB8(out, out, out, out, out, out, out, out, dst + 8 * BPS, BPS);
+}
+
+static void TM16(uint8_t* dst) {
+  int j;
+  v8i16 d1, d2;
+  const v16i8 zero = { 0 };
+  const v8i16 TL = (v8i16)__msa_fill_h(dst[-1 - BPS]);
+  const v16i8 T = LD_SB(dst - BPS);
+
+  ILVRL_B2_SH(zero, T, d1, d2);
+  SUB2(d1, TL, d2, TL, d1, d2);
+  for (j = 0; j < 16; j += 4) {
+    v16i8 t0, t1, t2, t3;
+    v8i16 r0, r1, r2, r3, r4, r5, r6, r7;
+    const v8i16 L0 = (v8i16)__msa_fill_h(dst[-1 + 0 * BPS]);
+    const v8i16 L1 = (v8i16)__msa_fill_h(dst[-1 + 1 * BPS]);
+    const v8i16 L2 = (v8i16)__msa_fill_h(dst[-1 + 2 * BPS]);
+    const v8i16 L3 = (v8i16)__msa_fill_h(dst[-1 + 3 * BPS]);
+    ADD4(d1, L0, d1, L1, d1, L2, d1, L3, r0, r1, r2, r3);
+    ADD4(d2, L0, d2, L1, d2, L2, d2, L3, r4, r5, r6, r7);
+    CLIP_SH4_0_255(r0, r1, r2, r3);
+    CLIP_SH4_0_255(r4, r5, r6, r7);
+    PCKEV_B4_SB(r4, r0, r5, r1, r6, r2, r7, r3, t0, t1, t2, t3);
+    ST_SB4(t0, t1, t2, t3, dst, BPS);
+    dst += 4 * BPS;
+  }
+}
+
+static void VE16(uint8_t* dst) {   // vertical
+  const v16u8 rtop = LD_UB(dst - BPS);
+  ST_UB8(rtop, rtop, rtop, rtop, rtop, rtop, rtop, rtop, dst, BPS);
+  ST_UB8(rtop, rtop, rtop, rtop, rtop, rtop, rtop, rtop, dst + 8 * BPS, BPS);
+}
+
+static void HE16(uint8_t* dst) {   // horizontal
+  int j;
+  for (j = 16; j > 0; j -= 4) {
+    const v16u8 L0 = (v16u8)__msa_fill_b(dst[-1 + 0 * BPS]);
+    const v16u8 L1 = (v16u8)__msa_fill_b(dst[-1 + 1 * BPS]);
+    const v16u8 L2 = (v16u8)__msa_fill_b(dst[-1 + 2 * BPS]);
+    const v16u8 L3 = (v16u8)__msa_fill_b(dst[-1 + 3 * BPS]);
+    ST_UB4(L0, L1, L2, L3, dst, BPS);
+    dst += 4 * BPS;
+  }
+}
+
+static void DC16NoTop(uint8_t* dst) {   // DC with top samples not available
+  int j;
+  uint32_t dc = 8;
+  v16u8 out;
+
+  for (j = 0; j < 16; ++j) {
+    dc += dst[-1 + j * BPS];
+  }
+  out = (v16u8)__msa_fill_b(dc >> 4);
+  ST_UB8(out, out, out, out, out, out, out, out, dst, BPS);
+  ST_UB8(out, out, out, out, out, out, out, out, dst + 8 * BPS, BPS);
+}
+
+static void DC16NoLeft(uint8_t* dst) {   // DC with left samples not available
+  uint32_t dc = 8;
+  const v16u8 rtop = LD_UB(dst - BPS);
+  const v8u16 dctop = __msa_hadd_u_h(rtop, rtop);
+  v16u8 out;
+
+  dc += HADD_UH_U32(dctop);
+  out = (v16u8)__msa_fill_b(dc >> 4);
+  ST_UB8(out, out, out, out, out, out, out, out, dst, BPS);
+  ST_UB8(out, out, out, out, out, out, out, out, dst + 8 * BPS, BPS);
+}
+
+static void DC16NoTopLeft(uint8_t* dst) {   // DC with nothing
+  const v16u8 out = (v16u8)__msa_fill_b(0x80);
+  ST_UB8(out, out, out, out, out, out, out, out, dst, BPS);
+  ST_UB8(out, out, out, out, out, out, out, out, dst + 8 * BPS, BPS);
+}
+
+// Chroma
+
+#define STORE8x8(out, dst) do {                 \
+  SD4(out, out, out, out, dst + 0 * BPS, BPS);  \
+  SD4(out, out, out, out, dst + 4 * BPS, BPS);  \
+} while (0)
+
+static void DC8uv(uint8_t* dst) {   // DC
+  uint32_t dc = 8;
+  int i;
+  uint64_t out;
+  const v16u8 rtop = LD_UB(dst - BPS);
+  const v8u16 temp0 = __msa_hadd_u_h(rtop, rtop);
+  const v4u32 temp1 = __msa_hadd_u_w(temp0, temp0);
+  const v2u64 temp2 = __msa_hadd_u_d(temp1, temp1);
+  v16u8 dctemp;
+
+  for (i = 0; i < 8; ++i) {
+    dc += dst[-1 + i * BPS];
+  }
+  dc += __msa_copy_s_w((v4i32)temp2, 0);
+  dctemp = (v16u8)__msa_fill_b(dc >> 4);
+  out = __msa_copy_s_d((v2i64)dctemp, 0);
+  STORE8x8(out, dst);
+}
+
+static void TM8uv(uint8_t* dst) {
+  int j;
+  const v16i8 T1 = LD_SB(dst - BPS);
+  const v16i8 zero = { 0 };
+  const v8i16 T  = (v8i16)__msa_ilvr_b(zero, T1);
+  const v8i16 TL = (v8i16)__msa_fill_h(dst[-1 - BPS]);
+  const v8i16 d = T - TL;
+
+  for (j = 0; j < 8; j += 4) {
+    v16i8 t0, t1;
+    v8i16 r0 = (v8i16)__msa_fill_h(dst[-1 + 0 * BPS]);
+    v8i16 r1 = (v8i16)__msa_fill_h(dst[-1 + 1 * BPS]);
+    v8i16 r2 = (v8i16)__msa_fill_h(dst[-1 + 2 * BPS]);
+    v8i16 r3 = (v8i16)__msa_fill_h(dst[-1 + 3 * BPS]);
+    ADD4(d, r0, d, r1, d, r2, d, r3, r0, r1, r2, r3);
+    CLIP_SH4_0_255(r0, r1, r2, r3);
+    PCKEV_B2_SB(r1, r0, r3, r2, t0, t1);
+    ST4x4_UB(t0, t1, 0, 2, 0, 2, dst, BPS);
+    ST4x4_UB(t0, t1, 1, 3, 1, 3, dst + 4, BPS);
+    dst += 4 * BPS;
+  }
+}
+
+static void VE8uv(uint8_t* dst) {   // vertical
+  const v16u8 rtop = LD_UB(dst - BPS);
+  const uint64_t out = __msa_copy_s_d((v2i64)rtop, 0);
+  STORE8x8(out, dst);
+}
+
+static void HE8uv(uint8_t* dst) {   // horizontal
+  int j;
+  for (j = 0; j < 8; j += 4) {
+    const v16u8 L0 = (v16u8)__msa_fill_b(dst[-1 + 0 * BPS]);
+    const v16u8 L1 = (v16u8)__msa_fill_b(dst[-1 + 1 * BPS]);
+    const v16u8 L2 = (v16u8)__msa_fill_b(dst[-1 + 2 * BPS]);
+    const v16u8 L3 = (v16u8)__msa_fill_b(dst[-1 + 3 * BPS]);
+    const uint64_t out0 = __msa_copy_s_d((v2i64)L0, 0);
+    const uint64_t out1 = __msa_copy_s_d((v2i64)L1, 0);
+    const uint64_t out2 = __msa_copy_s_d((v2i64)L2, 0);
+    const uint64_t out3 = __msa_copy_s_d((v2i64)L3, 0);
+    SD4(out0, out1, out2, out3, dst, BPS);
+    dst += 4 * BPS;
+  }
+}
+
+static void DC8uvNoLeft(uint8_t* dst) {   // DC with no left samples
+  const uint32_t dc = 4;
+  const v16u8 rtop = LD_UB(dst - BPS);
+  const v8u16 temp0 = __msa_hadd_u_h(rtop, rtop);
+  const v4u32 temp1 = __msa_hadd_u_w(temp0, temp0);
+  const v2u64 temp2 = __msa_hadd_u_d(temp1, temp1);
+  const uint32_t sum_m = __msa_copy_s_w((v4i32)temp2, 0);
+  const v16u8 dcval = (v16u8)__msa_fill_b((dc + sum_m) >> 3);
+  const uint64_t out = __msa_copy_s_d((v2i64)dcval, 0);
+  STORE8x8(out, dst);
+}
+
+static void DC8uvNoTop(uint8_t* dst) {   // DC with no top samples
+  uint32_t dc = 4;
+  int i;
+  uint64_t out;
+  v16u8 dctemp;
+
+  for (i = 0; i < 8; ++i) {
+    dc += dst[-1 + i * BPS];
+  }
+  dctemp = (v16u8)__msa_fill_b(dc >> 3);
+  out = __msa_copy_s_d((v2i64)dctemp, 0);
+  STORE8x8(out, dst);
+}
+
+static void DC8uvNoTopLeft(uint8_t* dst) {   // DC with nothing
+  const uint64_t out = 0x8080808080808080ULL;
+  STORE8x8(out, dst);
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8DspInitMSA(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitMSA(void) {
+  VP8TransformWHT = TransformWHT;
+  VP8Transform = TransformTwo;
+  VP8TransformDC = TransformDC;
+  VP8TransformAC3 = TransformAC3;
+
+  VP8VFilter16  = VFilter16;
+  VP8HFilter16  = HFilter16;
+  VP8VFilter16i = VFilter16i;
+  VP8HFilter16i = HFilter16i;
+  VP8VFilter8  = VFilter8;
+  VP8HFilter8  = HFilter8;
+  VP8VFilter8i = VFilter8i;
+  VP8HFilter8i = HFilter8i;
+  VP8SimpleVFilter16  = SimpleVFilter16;
+  VP8SimpleHFilter16  = SimpleHFilter16;
+  VP8SimpleVFilter16i = SimpleVFilter16i;
+  VP8SimpleHFilter16i = SimpleHFilter16i;
+
+  VP8PredLuma4[0] = DC4;
+  VP8PredLuma4[1] = TM4;
+  VP8PredLuma4[2] = VE4;
+  VP8PredLuma4[4] = RD4;
+  VP8PredLuma4[6] = LD4;
+  VP8PredLuma16[0] = DC16;
+  VP8PredLuma16[1] = TM16;
+  VP8PredLuma16[2] = VE16;
+  VP8PredLuma16[3] = HE16;
+  VP8PredLuma16[4] = DC16NoTop;
+  VP8PredLuma16[5] = DC16NoLeft;
+  VP8PredLuma16[6] = DC16NoTopLeft;
+  VP8PredChroma8[0] = DC8uv;
+  VP8PredChroma8[1] = TM8uv;
+  VP8PredChroma8[2] = VE8uv;
+  VP8PredChroma8[3] = HE8uv;
+  VP8PredChroma8[4] = DC8uvNoTop;
+  VP8PredChroma8[5] = DC8uvNoLeft;
+  VP8PredChroma8[6] = DC8uvNoTopLeft;
+}
+
+#else  // !WEBP_USE_MSA
+
+WEBP_DSP_INIT_STUB(VP8DspInitMSA)
+
+#endif  // WEBP_USE_MSA
diff --git a/media/libwebp/dsp/dec_neon.c b/media/libwebp/dsp/dec_neon.c
index e8341327e4..ea690646d6 100644
--- a/media/libwebp/dsp/dec_neon.c
+++ b/media/libwebp/dsp/dec_neon.c
@@ -1283,12 +1283,12 @@ static void DC4_NEON(uint8_t* dst) {    // DC
   const uint8x8_t A = vld1_u8(dst - BPS);  // top row
   const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
   const uint16x4_t p1 = vpadd_u16(p0, p0);
-  const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + 0 * BPS - 1));
-  const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + 1 * BPS - 1));
-  const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + 2 * BPS - 1));
-  const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + 3 * BPS - 1));
-  const uint16x8_t s0 = vaddq_u16(L0, L1);
-  const uint16x8_t s1 = vaddq_u16(L2, L3);
+  const uint8x8_t L0 = vld1_u8(dst + 0 * BPS - 1);
+  const uint8x8_t L1 = vld1_u8(dst + 1 * BPS - 1);
+  const uint8x8_t L2 = vld1_u8(dst + 2 * BPS - 1);
+  const uint8x8_t L3 = vld1_u8(dst + 3 * BPS - 1);
+  const uint16x8_t s0 = vaddl_u8(L0, L1);
+  const uint16x8_t s1 = vaddl_u8(L2, L3);
   const uint16x8_t s01 = vaddq_u16(s0, s1);
   const uint16x8_t sum = vaddq_u16(s01, vcombine_u16(p1, p1));
   const uint8x8_t dc0 = vrshrn_n_u16(sum, 3);  // (sum + 4) >> 3
@@ -1361,7 +1361,8 @@ static void RD4_NEON(uint8_t* dst) {   // Down-right
   const uint32_t J = dst[-1 + 1 * BPS];
   const uint32_t K = dst[-1 + 2 * BPS];
   const uint32_t L = dst[-1 + 3 * BPS];
-  const uint64x1_t LKJI____ = vcreate_u64(L | (K << 8) | (J << 16) | (I << 24));
+  const uint64x1_t LKJI____ =
+      vcreate_u64((uint64_t)L | (K << 8) | (J << 16) | (I << 24));
   const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC);
   const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8));
   const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16));
@@ -1427,25 +1428,30 @@ static WEBP_INLINE void DC8_NEON(uint8_t* dst, int do_top, int do_left) {
 
   if (do_top) {
     const uint8x8_t A = vld1_u8(dst - BPS);  // top row
+#if defined(__aarch64__)
+    const uint16_t p2 = vaddlv_u8(A);
+    sum_top = vdupq_n_u16(p2);
+#else
     const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
     const uint16x4_t p1 = vpadd_u16(p0, p0);
     const uint16x4_t p2 = vpadd_u16(p1, p1);
     sum_top = vcombine_u16(p2, p2);
+#endif
   }
 
   if (do_left) {
-    const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + 0 * BPS - 1));
-    const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + 1 * BPS - 1));
-    const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + 2 * BPS - 1));
-    const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + 3 * BPS - 1));
-    const uint16x8_t L4 = vmovl_u8(vld1_u8(dst + 4 * BPS - 1));
-    const uint16x8_t L5 = vmovl_u8(vld1_u8(dst + 5 * BPS - 1));
-    const uint16x8_t L6 = vmovl_u8(vld1_u8(dst + 6 * BPS - 1));
-    const uint16x8_t L7 = vmovl_u8(vld1_u8(dst + 7 * BPS - 1));
-    const uint16x8_t s0 = vaddq_u16(L0, L1);
-    const uint16x8_t s1 = vaddq_u16(L2, L3);
-    const uint16x8_t s2 = vaddq_u16(L4, L5);
-    const uint16x8_t s3 = vaddq_u16(L6, L7);
+    const uint8x8_t L0 = vld1_u8(dst + 0 * BPS - 1);
+    const uint8x8_t L1 = vld1_u8(dst + 1 * BPS - 1);
+    const uint8x8_t L2 = vld1_u8(dst + 2 * BPS - 1);
+    const uint8x8_t L3 = vld1_u8(dst + 3 * BPS - 1);
+    const uint8x8_t L4 = vld1_u8(dst + 4 * BPS - 1);
+    const uint8x8_t L5 = vld1_u8(dst + 5 * BPS - 1);
+    const uint8x8_t L6 = vld1_u8(dst + 6 * BPS - 1);
+    const uint8x8_t L7 = vld1_u8(dst + 7 * BPS - 1);
+    const uint16x8_t s0 = vaddl_u8(L0, L1);
+    const uint16x8_t s1 = vaddl_u8(L2, L3);
+    const uint16x8_t s2 = vaddl_u8(L4, L5);
+    const uint16x8_t s3 = vaddl_u8(L6, L7);
     const uint16x8_t s01 = vaddq_u16(s0, s1);
     const uint16x8_t s23 = vaddq_u16(s2, s3);
     sum_left = vaddq_u16(s01, s23);
@@ -1505,29 +1511,34 @@ static WEBP_INLINE void DC16_NEON(uint8_t* dst, int do_top, int do_left) {
 
   if (do_top) {
     const uint8x16_t A = vld1q_u8(dst - BPS);  // top row
+#if defined(__aarch64__)
+    const uint16_t p3 = vaddlvq_u8(A);
+    sum_top = vdupq_n_u16(p3);
+#else
     const uint16x8_t p0 = vpaddlq_u8(A);  // cascading summation of the top
     const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
     const uint16x4_t p2 = vpadd_u16(p1, p1);
     const uint16x4_t p3 = vpadd_u16(p2, p2);
     sum_top = vcombine_u16(p3, p3);
+#endif
   }
 
   if (do_left) {
     int i;
     sum_left = vdupq_n_u16(0);
     for (i = 0; i < 16; i += 8) {
-      const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + (i + 0) * BPS - 1));
-      const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + (i + 1) * BPS - 1));
-      const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + (i + 2) * BPS - 1));
-      const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + (i + 3) * BPS - 1));
-      const uint16x8_t L4 = vmovl_u8(vld1_u8(dst + (i + 4) * BPS - 1));
-      const uint16x8_t L5 = vmovl_u8(vld1_u8(dst + (i + 5) * BPS - 1));
-      const uint16x8_t L6 = vmovl_u8(vld1_u8(dst + (i + 6) * BPS - 1));
-      const uint16x8_t L7 = vmovl_u8(vld1_u8(dst + (i + 7) * BPS - 1));
-      const uint16x8_t s0 = vaddq_u16(L0, L1);
-      const uint16x8_t s1 = vaddq_u16(L2, L3);
-      const uint16x8_t s2 = vaddq_u16(L4, L5);
-      const uint16x8_t s3 = vaddq_u16(L6, L7);
+      const uint8x8_t L0 = vld1_u8(dst + (i + 0) * BPS - 1);
+      const uint8x8_t L1 = vld1_u8(dst + (i + 1) * BPS - 1);
+      const uint8x8_t L2 = vld1_u8(dst + (i + 2) * BPS - 1);
+      const uint8x8_t L3 = vld1_u8(dst + (i + 3) * BPS - 1);
+      const uint8x8_t L4 = vld1_u8(dst + (i + 4) * BPS - 1);
+      const uint8x8_t L5 = vld1_u8(dst + (i + 5) * BPS - 1);
+      const uint8x8_t L6 = vld1_u8(dst + (i + 6) * BPS - 1);
+      const uint8x8_t L7 = vld1_u8(dst + (i + 7) * BPS - 1);
+      const uint16x8_t s0 = vaddl_u8(L0, L1);
+      const uint16x8_t s1 = vaddl_u8(L2, L3);
+      const uint16x8_t s2 = vaddl_u8(L4, L5);
+      const uint16x8_t s3 = vaddl_u8(L6, L7);
       const uint16x8_t s01 = vaddq_u16(s0, s1);
       const uint16x8_t s23 = vaddq_u16(s2, s3);
       const uint16x8_t sum = vaddq_u16(s01, s23);
diff --git a/media/libwebp/dsp/dec_sse2.c b/media/libwebp/dsp/dec_sse2.c
index f187a5bb48..b90c082793 100644
--- a/media/libwebp/dsp/dec_sse2.c
+++ b/media/libwebp/dsp/dec_sse2.c
@@ -326,7 +326,7 @@ static WEBP_INLINE void Update2Pixels_SSE2(__m128i* const pi, __m128i* const qi,
   const __m128i a1_lo = _mm_srai_epi16(*a0_lo, 7);
   const __m128i a1_hi = _mm_srai_epi16(*a0_hi, 7);
   const __m128i delta = _mm_packs_epi16(a1_lo, a1_hi);
-  const __m128i sign_bit = _mm_set1_epi8(0x80);
+  const __m128i sign_bit = _mm_set1_epi8((char)0x80);
   *pi = _mm_adds_epi8(*pi, delta);
   *qi = _mm_subs_epi8(*qi, delta);
   FLIP_SIGN_BIT2(*pi, *qi);
@@ -338,9 +338,9 @@ static WEBP_INLINE void NeedsFilter_SSE2(const __m128i* const p1,
                                          const __m128i* const q0,
                                          const __m128i* const q1,
                                          int thresh, __m128i* const mask) {
-  const __m128i m_thresh = _mm_set1_epi8(thresh);
+  const __m128i m_thresh = _mm_set1_epi8((char)thresh);
   const __m128i t1 = MM_ABS(*p1, *q1);        // abs(p1 - q1)
-  const __m128i kFE = _mm_set1_epi8(0xFE);
+  const __m128i kFE = _mm_set1_epi8((char)0xFE);
   const __m128i t2 = _mm_and_si128(t1, kFE);  // set lsb of each byte to zero
   const __m128i t3 = _mm_srli_epi16(t2, 1);   // abs(p1 - q1) / 2
 
@@ -360,7 +360,7 @@ static WEBP_INLINE void DoFilter2_SSE2(__m128i* const p1, __m128i* const p0,
                                        __m128i* const q0, __m128i* const q1,
                                        int thresh) {
   __m128i a, mask;
-  const __m128i sign_bit = _mm_set1_epi8(0x80);
+  const __m128i sign_bit = _mm_set1_epi8((char)0x80);
   // convert p1/q1 to int8_t (for GetBaseDelta_SSE2)
   const __m128i p1s = _mm_xor_si128(*p1, sign_bit);
   const __m128i q1s = _mm_xor_si128(*q1, sign_bit);
@@ -380,7 +380,7 @@ static WEBP_INLINE void DoFilter4_SSE2(__m128i* const p1, __m128i* const p0,
                                        const __m128i* const mask,
                                        int hev_thresh) {
   const __m128i zero = _mm_setzero_si128();
-  const __m128i sign_bit = _mm_set1_epi8(0x80);
+  const __m128i sign_bit = _mm_set1_epi8((char)0x80);
   const __m128i k64 = _mm_set1_epi8(64);
   const __m128i k3 = _mm_set1_epi8(3);
   const __m128i k4 = _mm_set1_epi8(4);
@@ -427,7 +427,7 @@ static WEBP_INLINE void DoFilter6_SSE2(__m128i* const p2, __m128i* const p1,
                                        const __m128i* const mask,
                                        int hev_thresh) {
   const __m128i zero = _mm_setzero_si128();
-  const __m128i sign_bit = _mm_set1_epi8(0x80);
+  const __m128i sign_bit = _mm_set1_epi8((char)0x80);
   __m128i a, not_hev;
 
   // compute hev mask
@@ -941,7 +941,7 @@ static void VR4_SSE2(uint8_t* dst) {   // Vertical-Right
   const __m128i ABCD0 = _mm_srli_si128(XABCD, 1);
   const __m128i abcd = _mm_avg_epu8(XABCD, ABCD0);
   const __m128i _XABCD = _mm_slli_si128(XABCD, 1);
-  const __m128i IXABCD = _mm_insert_epi16(_XABCD, I | (X << 8), 0);
+  const __m128i IXABCD = _mm_insert_epi16(_XABCD, (short)(I | (X << 8)), 0);
   const __m128i avg1 = _mm_avg_epu8(IXABCD, ABCD0);
   const __m128i lsb = _mm_and_si128(_mm_xor_si128(IXABCD, ABCD0), one);
   const __m128i avg2 = _mm_subs_epu8(avg1, lsb);
diff --git a/media/libwebp/dsp/dsp.h b/media/libwebp/dsp/dsp.h
index 4e509bd2ca..ce1679ea53 100644
--- a/media/libwebp/dsp/dsp.h
+++ b/media/libwebp/dsp/dsp.h
@@ -27,6 +27,23 @@ extern "C" {
 #define BPS 32   // this is the common stride for enc/dec
 
 //------------------------------------------------------------------------------
+// WEBP_RESTRICT
+
+// Declares a pointer with the restrict type qualifier if available.
+// This allows code to hint to the compiler that only this pointer references a
+// particular object or memory region within the scope of the block in which it
+// is declared. This may allow for improved optimizations due to the lack of
+// pointer aliasing. See also:
+// https://en.cppreference.com/w/c/language/restrict
+#if defined(__GNUC__)
+#define WEBP_RESTRICT __restrict__
+#elif defined(_MSC_VER)
+#define WEBP_RESTRICT __restrict
+#else
+#define WEBP_RESTRICT
+#endif
+
+//------------------------------------------------------------------------------
 // CPU detection
 
 #if defined(__GNUC__)
@@ -51,9 +68,7 @@ extern "C" {
 # define __has_builtin(x) 0
 #endif
 
-// for now, none of the optimizations below are available in emscripten
-#if !defined(EMSCRIPTEN)
-
+#if !defined(HAVE_CONFIG_H)
 #if defined(_MSC_VER) && _MSC_VER > 1310 && \
     (defined(_M_X64) || defined(_M_IX86))
 #define WEBP_MSC_SSE2  // Visual C++ SSE2 targets
@@ -63,23 +78,37 @@ extern "C" {
     (defined(_M_X64) || defined(_M_IX86))
 #define WEBP_MSC_SSE41  // Visual C++ SSE4.1 targets
 #endif
+#endif
 
 // WEBP_HAVE_* are used to indicate the presence of the instruction set in dsp
 // files without intrinsics, allowing the corresponding Init() to be called.
 // Files containing intrinsics will need to be built targeting the instruction
 // set so should succeed on one of the earlier tests.
-#if defined(__SSE2__) || defined(WEBP_MSC_SSE2) || defined(WEBP_HAVE_SSE2)
+#if (defined(__SSE2__) || defined(WEBP_MSC_SSE2)) && \
+    (!defined(HAVE_CONFIG_H) || defined(WEBP_HAVE_SSE2))
 #define WEBP_USE_SSE2
 #endif
 
-#if defined(__SSE4_1__) || defined(WEBP_MSC_SSE41) || defined(WEBP_HAVE_SSE41)
+#if defined(WEBP_USE_SSE2) && !defined(WEBP_HAVE_SSE2)
+#define WEBP_HAVE_SSE2
+#endif
+
+#if (defined(__SSE4_1__) || defined(WEBP_MSC_SSE41)) && \
+    (!defined(HAVE_CONFIG_H) || defined(WEBP_HAVE_SSE41))
 #define WEBP_USE_SSE41
 #endif
 
+#if defined(WEBP_USE_SSE41) && !defined(WEBP_HAVE_SSE41)
+#define WEBP_HAVE_SSE41
+#endif
+
+#undef WEBP_MSC_SSE41
+#undef WEBP_MSC_SSE2
+
 // The intrinsics currently cause compiler errors with arm-nacl-gcc and the
 // inline assembly would need to be modified for use with Native Client.
-#if (defined(__ARM_NEON__) || \
-     defined(__aarch64__) || defined(WEBP_HAVE_NEON)) && \
+#if ((defined(__ARM_NEON__) || defined(__aarch64__)) && \
+     (!defined(HAVE_CONFIG_H) || defined(WEBP_HAVE_NEON))) && \
     !defined(__native_client__)
 #define WEBP_USE_NEON
 #endif
@@ -90,11 +119,20 @@ extern "C" {
 #define WEBP_USE_NEON
 #endif
 
-#if defined(_MSC_VER) && _MSC_VER >= 1700 && defined(_M_ARM)
+// Note: ARM64 is supported in Visual Studio 2017, but requires the direct
+// inclusion of arm64_neon.h; Visual Studio 2019 includes this file in
+// arm_neon.h.
+#if defined(_MSC_VER) && \
+  ((_MSC_VER >= 1700 && defined(_M_ARM)) || \
+   (_MSC_VER >= 1920 && defined(_M_ARM64)))
 #define WEBP_USE_NEON
 #define WEBP_USE_INTRINSICS
 #endif
 
+#if defined(WEBP_USE_NEON) && !defined(WEBP_HAVE_NEON)
+#define WEBP_HAVE_NEON
+#endif
+
 #if defined(__mips__) && !defined(__mips64) && \
     defined(__mips_isa_rev) && (__mips_isa_rev >= 1) && (__mips_isa_rev < 6)
 #define WEBP_USE_MIPS32
@@ -110,13 +148,11 @@ extern "C" {
 #define WEBP_USE_MSA
 #endif
 
-#endif  /* EMSCRIPTEN */
-
 #ifndef WEBP_DSP_OMIT_C_CODE
 #define WEBP_DSP_OMIT_C_CODE 1
 #endif
 
-#if (defined(__aarch64__) || defined(__ARM_NEON__)) && WEBP_DSP_OMIT_C_CODE
+#if defined(WEBP_USE_NEON) && WEBP_DSP_OMIT_C_CODE
 #define WEBP_NEON_OMIT_C_CODE 1
 #else
 #define WEBP_NEON_OMIT_C_CODE 0
@@ -193,6 +229,12 @@ extern "C" {
 #endif
 #endif
 
+// If 'ptr' is NULL, returns NULL. Otherwise returns 'ptr + off'.
+// Prevents undefined behavior sanitizer nullptr-with-nonzero-offset warning.
+#if !defined(WEBP_OFFSET_PTR)
+#define WEBP_OFFSET_PTR(ptr, off) (((ptr) == NULL) ? NULL : ((ptr) + (off)))
+#endif
+
 // Regularize the definition of WEBP_SWAP_16BIT_CSP (backward compatibility)
 #if !defined(WEBP_SWAP_16BIT_CSP)
 #define WEBP_SWAP_16BIT_CSP 0
@@ -246,9 +288,9 @@ extern VP8Fdct VP8FTransform2;   // performs two transforms at a time
 extern VP8WHT VP8FTransformWHT;
 // Predictions
 // *dst is the destination block. *top and *left can be NULL.
-typedef void (*VP8IntraPreds)(uint8_t *dst, const uint8_t* left,
+typedef void (*VP8IntraPreds)(uint8_t* dst, const uint8_t* left,
                               const uint8_t* top);
-typedef void (*VP8Intra4Preds)(uint8_t *dst, const uint8_t* top);
+typedef void (*VP8Intra4Preds)(uint8_t* dst, const uint8_t* top);
 extern VP8Intra4Preds VP8EncPredLuma4;
 extern VP8IntraPreds VP8EncPredLuma16;
 extern VP8IntraPreds VP8EncPredChroma8;
@@ -572,26 +614,29 @@ extern void (*WebPApplyAlphaMultiply4444)(
 
 // Dispatch the values from alpha[] plane to the ARGB destination 'dst'.
 // Returns true if alpha[] plane has non-trivial values different from 0xff.
-extern int (*WebPDispatchAlpha)(const uint8_t* alpha, int alpha_stride,
-                                int width, int height,
-                                uint8_t* dst, int dst_stride);
+extern int (*WebPDispatchAlpha)(const uint8_t* WEBP_RESTRICT alpha,
+                                int alpha_stride, int width, int height,
+                                uint8_t* WEBP_RESTRICT dst, int dst_stride);
 
 // Transfer packed 8b alpha[] values to green channel in dst[], zero'ing the
 // A/R/B values. 'dst_stride' is the stride for dst[] in uint32_t units.
-extern void (*WebPDispatchAlphaToGreen)(const uint8_t* alpha, int alpha_stride,
-                                        int width, int height,
-                                        uint32_t* dst, int dst_stride);
+extern void (*WebPDispatchAlphaToGreen)(const uint8_t* WEBP_RESTRICT alpha,
+                                        int alpha_stride, int width, int height,
+                                        uint32_t* WEBP_RESTRICT dst,
+                                        int dst_stride);
 
 // Extract the alpha values from 32b values in argb[] and pack them into alpha[]
 // (this is the opposite of WebPDispatchAlpha).
 // Returns true if there's only trivial 0xff alpha values.
-extern int (*WebPExtractAlpha)(const uint8_t* argb, int argb_stride,
-                               int width, int height,
-                               uint8_t* alpha, int alpha_stride);
+extern int (*WebPExtractAlpha)(const uint8_t* WEBP_RESTRICT argb,
+                               int argb_stride, int width, int height,
+                               uint8_t* WEBP_RESTRICT alpha,
+                               int alpha_stride);
 
 // Extract the green values from 32b values in argb[] and pack them into alpha[]
 // (this is the opposite of WebPDispatchAlphaToGreen).
-extern void (*WebPExtractGreen)(const uint32_t* argb, uint8_t* alpha, int size);
+extern void (*WebPExtractGreen)(const uint32_t* WEBP_RESTRICT argb,
+                                uint8_t* WEBP_RESTRICT alpha, int size);
 
 // Pre-Multiply operation transforms x into x * A / 255  (where x=Y,R,G or B).
 // Un-Multiply operation transforms x into x * 255 / A.
@@ -604,34 +649,42 @@ void WebPMultARGBRows(uint8_t* ptr, int stride, int width, int num_rows,
                       int inverse);
 
 // Same for a row of single values, with side alpha values.
-extern void (*WebPMultRow)(uint8_t* const ptr, const uint8_t* const alpha,
+extern void (*WebPMultRow)(uint8_t* WEBP_RESTRICT const ptr,
+                           const uint8_t* WEBP_RESTRICT const alpha,
                            int width, int inverse);
 
 // Same a WebPMultRow(), but for several 'num_rows' rows.
-void WebPMultRows(uint8_t* ptr, int stride,
-                  const uint8_t* alpha, int alpha_stride,
+void WebPMultRows(uint8_t* WEBP_RESTRICT ptr, int stride,
+                  const uint8_t* WEBP_RESTRICT alpha, int alpha_stride,
                   int width, int num_rows, int inverse);
 
 // Plain-C versions, used as fallback by some implementations.
-void WebPMultRow_C(uint8_t* const ptr, const uint8_t* const alpha,
+void WebPMultRow_C(uint8_t* WEBP_RESTRICT const ptr,
+                   const uint8_t* WEBP_RESTRICT const alpha,
                    int width, int inverse);
 void WebPMultARGBRow_C(uint32_t* const ptr, int width, int inverse);
 
 #ifdef WORDS_BIGENDIAN
 // ARGB packing function: a/r/g/b input is rgba or bgra order.
-extern void (*WebPPackARGB)(const uint8_t* a, const uint8_t* r,
-                            const uint8_t* g, const uint8_t* b, int len,
-                            uint32_t* out);
+extern void (*WebPPackARGB)(const uint8_t* WEBP_RESTRICT a,
+                            const uint8_t* WEBP_RESTRICT r,
+                            const uint8_t* WEBP_RESTRICT g,
+                            const uint8_t* WEBP_RESTRICT b,
+                            int len, uint32_t* WEBP_RESTRICT out);
 #endif
 
 // RGB packing function. 'step' can be 3 or 4. r/g/b input is rgb or bgr order.
-extern void (*WebPPackRGB)(const uint8_t* r, const uint8_t* g, const uint8_t* b,
-                           int len, int step, uint32_t* out);
+extern void (*WebPPackRGB)(const uint8_t* WEBP_RESTRICT r,
+                           const uint8_t* WEBP_RESTRICT g,
+                           const uint8_t* WEBP_RESTRICT b,
+                           int len, int step, uint32_t* WEBP_RESTRICT out);
 
 // This function returns true if src[i] contains a value different from 0xff.
 extern int (*WebPHasAlpha8b)(const uint8_t* src, int length);
 // This function returns true if src[4*i] contains a value different from 0xff.
 extern int (*WebPHasAlpha32b)(const uint8_t* src, int length);
+// replaces transparent values in src[] by 'color'.
+extern void (*WebPAlphaReplace)(uint32_t* src, int length, uint32_t color);
 
 // To be called first before using the above.
 void WebPInitAlphaProcessing(void);
diff --git a/media/libwebp/dsp/enc.c b/media/libwebp/dsp/enc.c
new file mode 100644
index 0000000000..69a2f5c577
--- /dev/null
+++ b/media/libwebp/dsp/enc.c
@@ -0,0 +1,830 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Speed-critical encoding functions.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <assert.h>
+#include <stdlib.h>  // for abs()
+
+#include "../dsp/dsp.h"
+#include "../enc/vp8i_enc.h"
+
+static WEBP_INLINE uint8_t clip_8b(int v) {
+  return (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255;
+}
+
+#if !WEBP_NEON_OMIT_C_CODE
+static WEBP_INLINE int clip_max(int v, int max) {
+  return (v > max) ? max : v;
+}
+#endif  // !WEBP_NEON_OMIT_C_CODE
+
+//------------------------------------------------------------------------------
+// Compute susceptibility based on DCT-coeff histograms:
+// the higher, the "easier" the macroblock is to compress.
+
+const int VP8DspScan[16 + 4 + 4] = {
+  // Luma
+  0 +  0 * BPS,  4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS,
+  0 +  4 * BPS,  4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS,
+  0 +  8 * BPS,  4 +  8 * BPS, 8 +  8 * BPS, 12 +  8 * BPS,
+  0 + 12 * BPS,  4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS,
+
+  0 + 0 * BPS,   4 + 0 * BPS, 0 + 4 * BPS,  4 + 4 * BPS,    // U
+  8 + 0 * BPS,  12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS     // V
+};
+
+// general-purpose util function
+void VP8SetHistogramData(const int distribution[MAX_COEFF_THRESH + 1],
+                         VP8Histogram* const histo) {
+  int max_value = 0, last_non_zero = 1;
+  int k;
+  for (k = 0; k <= MAX_COEFF_THRESH; ++k) {
+    const int value = distribution[k];
+    if (value > 0) {
+      if (value > max_value) max_value = value;
+      last_non_zero = k;
+    }
+  }
+  histo->max_value = max_value;
+  histo->last_non_zero = last_non_zero;
+}
+
+#if !WEBP_NEON_OMIT_C_CODE
+static void CollectHistogram_C(const uint8_t* ref, const uint8_t* pred,
+                               int start_block, int end_block,
+                               VP8Histogram* const histo) {
+  int j;
+  int distribution[MAX_COEFF_THRESH + 1] = { 0 };
+  for (j = start_block; j < end_block; ++j) {
+    int k;
+    int16_t out[16];
+
+    VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
+
+    // Convert coefficients to bin.
+    for (k = 0; k < 16; ++k) {
+      const int v = abs(out[k]) >> 3;
+      const int clipped_value = clip_max(v, MAX_COEFF_THRESH);
+      ++distribution[clipped_value];
+    }
+  }
+  VP8SetHistogramData(distribution, histo);
+}
+#endif  // !WEBP_NEON_OMIT_C_CODE
+
+//------------------------------------------------------------------------------
+// run-time tables (~4k)
+
+static uint8_t clip1[255 + 510 + 1];    // clips [-255,510] to [0,255]
+
+// We declare this variable 'volatile' to prevent instruction reordering
+// and make sure it's set to true _last_ (so as to be thread-safe)
+static volatile int tables_ok = 0;
+
+static WEBP_TSAN_IGNORE_FUNCTION void InitTables(void) {
+  if (!tables_ok) {
+    int i;
+    for (i = -255; i <= 255 + 255; ++i) {
+      clip1[255 + i] = clip_8b(i);
+    }
+    tables_ok = 1;
+  }
+}
+
+
+//------------------------------------------------------------------------------
+// Transforms (Paragraph 14.4)
+
+#if !WEBP_NEON_OMIT_C_CODE
+
+#define STORE(x, y, v) \
+  dst[(x) + (y) * BPS] = clip_8b(ref[(x) + (y) * BPS] + ((v) >> 3))
+
+static const int kC1 = 20091 + (1 << 16);
+static const int kC2 = 35468;
+#define MUL(a, b) (((a) * (b)) >> 16)
+
+static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
+                                      uint8_t* dst) {
+  int C[4 * 4], *tmp;
+  int i;
+  tmp = C;
+  for (i = 0; i < 4; ++i) {    // vertical pass
+    const int a = in[0] + in[8];
+    const int b = in[0] - in[8];
+    const int c = MUL(in[4], kC2) - MUL(in[12], kC1);
+    const int d = MUL(in[4], kC1) + MUL(in[12], kC2);
+    tmp[0] = a + d;
+    tmp[1] = b + c;
+    tmp[2] = b - c;
+    tmp[3] = a - d;
+    tmp += 4;
+    in++;
+  }
+
+  tmp = C;
+  for (i = 0; i < 4; ++i) {    // horizontal pass
+    const int dc = tmp[0] + 4;
+    const int a =  dc +  tmp[8];
+    const int b =  dc -  tmp[8];
+    const int c = MUL(tmp[4], kC2) - MUL(tmp[12], kC1);
+    const int d = MUL(tmp[4], kC1) + MUL(tmp[12], kC2);
+    STORE(0, i, a + d);
+    STORE(1, i, b + c);
+    STORE(2, i, b - c);
+    STORE(3, i, a - d);
+    tmp++;
+  }
+}
+
+static void ITransform_C(const uint8_t* ref, const int16_t* in, uint8_t* dst,
+                         int do_two) {
+  ITransformOne(ref, in, dst);
+  if (do_two) {
+    ITransformOne(ref + 4, in + 16, dst + 4);
+  }
+}
+
+static void FTransform_C(const uint8_t* src, const uint8_t* ref, int16_t* out) {
+  int i;
+  int tmp[16];
+  for (i = 0; i < 4; ++i, src += BPS, ref += BPS) {
+    const int d0 = src[0] - ref[0];   // 9bit dynamic range ([-255,255])
+    const int d1 = src[1] - ref[1];
+    const int d2 = src[2] - ref[2];
+    const int d3 = src[3] - ref[3];
+    const int a0 = (d0 + d3);         // 10b                      [-510,510]
+    const int a1 = (d1 + d2);
+    const int a2 = (d1 - d2);
+    const int a3 = (d0 - d3);
+    tmp[0 + i * 4] = (a0 + a1) * 8;   // 14b                      [-8160,8160]
+    tmp[1 + i * 4] = (a2 * 2217 + a3 * 5352 + 1812) >> 9;      // [-7536,7542]
+    tmp[2 + i * 4] = (a0 - a1) * 8;
+    tmp[3 + i * 4] = (a3 * 2217 - a2 * 5352 +  937) >> 9;
+  }
+  for (i = 0; i < 4; ++i) {
+    const int a0 = (tmp[0 + i] + tmp[12 + i]);  // 15b
+    const int a1 = (tmp[4 + i] + tmp[ 8 + i]);
+    const int a2 = (tmp[4 + i] - tmp[ 8 + i]);
+    const int a3 = (tmp[0 + i] - tmp[12 + i]);
+    out[0 + i] = (a0 + a1 + 7) >> 4;            // 12b
+    out[4 + i] = ((a2 * 2217 + a3 * 5352 + 12000) >> 16) + (a3 != 0);
+    out[8 + i] = (a0 - a1 + 7) >> 4;
+    out[12+ i] = ((a3 * 2217 - a2 * 5352 + 51000) >> 16);
+  }
+}
+#endif  // !WEBP_NEON_OMIT_C_CODE
+
+static void FTransform2_C(const uint8_t* src, const uint8_t* ref,
+                          int16_t* out) {
+  VP8FTransform(src, ref, out);
+  VP8FTransform(src + 4, ref + 4, out + 16);
+}
+
+#if !WEBP_NEON_OMIT_C_CODE
+static void FTransformWHT_C(const int16_t* in, int16_t* out) {
+  // input is 12b signed
+  int32_t tmp[16];
+  int i;
+  for (i = 0; i < 4; ++i, in += 64) {
+    const int a0 = (in[0 * 16] + in[2 * 16]);  // 13b
+    const int a1 = (in[1 * 16] + in[3 * 16]);
+    const int a2 = (in[1 * 16] - in[3 * 16]);
+    const int a3 = (in[0 * 16] - in[2 * 16]);
+    tmp[0 + i * 4] = a0 + a1;   // 14b
+    tmp[1 + i * 4] = a3 + a2;
+    tmp[2 + i * 4] = a3 - a2;
+    tmp[3 + i * 4] = a0 - a1;
+  }
+  for (i = 0; i < 4; ++i) {
+    const int a0 = (tmp[0 + i] + tmp[8 + i]);  // 15b
+    const int a1 = (tmp[4 + i] + tmp[12+ i]);
+    const int a2 = (tmp[4 + i] - tmp[12+ i]);
+    const int a3 = (tmp[0 + i] - tmp[8 + i]);
+    const int b0 = a0 + a1;    // 16b
+    const int b1 = a3 + a2;
+    const int b2 = a3 - a2;
+    const int b3 = a0 - a1;
+    out[ 0 + i] = b0 >> 1;     // 15b
+    out[ 4 + i] = b1 >> 1;
+    out[ 8 + i] = b2 >> 1;
+    out[12 + i] = b3 >> 1;
+  }
+}
+#endif  // !WEBP_NEON_OMIT_C_CODE
+
+#undef MUL
+#undef STORE
+
+//------------------------------------------------------------------------------
+// Intra predictions
+
+static WEBP_INLINE void Fill(uint8_t* dst, int value, int size) {
+  int j;
+  for (j = 0; j < size; ++j) {
+    memset(dst + j * BPS, value, size);
+  }
+}
+
+static WEBP_INLINE void VerticalPred(uint8_t* dst,
+                                     const uint8_t* top, int size) {
+  int j;
+  if (top != NULL) {
+    for (j = 0; j < size; ++j) memcpy(dst + j * BPS, top, size);
+  } else {
+    Fill(dst, 127, size);
+  }
+}
+
+static WEBP_INLINE void HorizontalPred(uint8_t* dst,
+                                       const uint8_t* left, int size) {
+  if (left != NULL) {
+    int j;
+    for (j = 0; j < size; ++j) {
+      memset(dst + j * BPS, left[j], size);
+    }
+  } else {
+    Fill(dst, 129, size);
+  }
+}
+
+static WEBP_INLINE void TrueMotion(uint8_t* dst, const uint8_t* left,
+                                   const uint8_t* top, int size) {
+  int y;
+  if (left != NULL) {
+    if (top != NULL) {
+      const uint8_t* const clip = clip1 + 255 - left[-1];
+      for (y = 0; y < size; ++y) {
+        const uint8_t* const clip_table = clip + left[y];
+        int x;
+        for (x = 0; x < size; ++x) {
+          dst[x] = clip_table[top[x]];
+        }
+        dst += BPS;
+      }
+    } else {
+      HorizontalPred(dst, left, size);
+    }
+  } else {
+    // true motion without left samples (hence: with default 129 value)
+    // is equivalent to VE prediction where you just copy the top samples.
+    // Note that if top samples are not available, the default value is
+    // then 129, and not 127 as in the VerticalPred case.
+    if (top != NULL) {
+      VerticalPred(dst, top, size);
+    } else {
+      Fill(dst, 129, size);
+    }
+  }
+}
+
+static WEBP_INLINE void DCMode(uint8_t* dst, const uint8_t* left,
+                               const uint8_t* top,
+                               int size, int round, int shift) {
+  int DC = 0;
+  int j;
+  if (top != NULL) {
+    for (j = 0; j < size; ++j) DC += top[j];
+    if (left != NULL) {   // top and left present
+      for (j = 0; j < size; ++j) DC += left[j];
+    } else {      // top, but no left
+      DC += DC;
+    }
+    DC = (DC + round) >> shift;
+  } else if (left != NULL) {   // left but no top
+    for (j = 0; j < size; ++j) DC += left[j];
+    DC += DC;
+    DC = (DC + round) >> shift;
+  } else {   // no top, no left, nothing.
+    DC = 0x80;
+  }
+  Fill(dst, DC, size);
+}
+
+//------------------------------------------------------------------------------
+// Chroma 8x8 prediction (paragraph 12.2)
+
+static void IntraChromaPreds_C(uint8_t* dst, const uint8_t* left,
+                               const uint8_t* top) {
+  // U block
+  DCMode(C8DC8 + dst, left, top, 8, 8, 4);
+  VerticalPred(C8VE8 + dst, top, 8);
+  HorizontalPred(C8HE8 + dst, left, 8);
+  TrueMotion(C8TM8 + dst, left, top, 8);
+  // V block
+  dst += 8;
+  if (top != NULL) top += 8;
+  if (left != NULL) left += 16;
+  DCMode(C8DC8 + dst, left, top, 8, 8, 4);
+  VerticalPred(C8VE8 + dst, top, 8);
+  HorizontalPred(C8HE8 + dst, left, 8);
+  TrueMotion(C8TM8 + dst, left, top, 8);
+}
+
+//------------------------------------------------------------------------------
+// luma 16x16 prediction (paragraph 12.3)
+
+static void Intra16Preds_C(uint8_t* dst,
+                           const uint8_t* left, const uint8_t* top) {
+  DCMode(I16DC16 + dst, left, top, 16, 16, 5);
+  VerticalPred(I16VE16 + dst, top, 16);
+  HorizontalPred(I16HE16 + dst, left, 16);
+  TrueMotion(I16TM16 + dst, left, top, 16);
+}
+
+//------------------------------------------------------------------------------
+// luma 4x4 prediction
+
+#define DST(x, y) dst[(x) + (y) * BPS]
+#define AVG3(a, b, c) ((uint8_t)(((a) + 2 * (b) + (c) + 2) >> 2))
+#define AVG2(a, b) (((a) + (b) + 1) >> 1)
+
+static void VE4(uint8_t* dst, const uint8_t* top) {    // vertical
+  const uint8_t vals[4] = {
+    AVG3(top[-1], top[0], top[1]),
+    AVG3(top[ 0], top[1], top[2]),
+    AVG3(top[ 1], top[2], top[3]),
+    AVG3(top[ 2], top[3], top[4])
+  };
+  int i;
+  for (i = 0; i < 4; ++i) {
+    memcpy(dst + i * BPS, vals, 4);
+  }
+}
+
+static void HE4(uint8_t* dst, const uint8_t* top) {    // horizontal
+  const int X = top[-1];
+  const int I = top[-2];
+  const int J = top[-3];
+  const int K = top[-4];
+  const int L = top[-5];
+  WebPUint32ToMem(dst + 0 * BPS, 0x01010101U * AVG3(X, I, J));
+  WebPUint32ToMem(dst + 1 * BPS, 0x01010101U * AVG3(I, J, K));
+  WebPUint32ToMem(dst + 2 * BPS, 0x01010101U * AVG3(J, K, L));
+  WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(K, L, L));
+}
+
+static void DC4(uint8_t* dst, const uint8_t* top) {
+  uint32_t dc = 4;
+  int i;
+  for (i = 0; i < 4; ++i) dc += top[i] + top[-5 + i];
+  Fill(dst, dc >> 3, 4);
+}
+
+static void RD4(uint8_t* dst, const uint8_t* top) {
+  const int X = top[-1];
+  const int I = top[-2];
+  const int J = top[-3];
+  const int K = top[-4];
+  const int L = top[-5];
+  const int A = top[0];
+  const int B = top[1];
+  const int C = top[2];
+  const int D = top[3];
+  DST(0, 3)                                     = AVG3(J, K, L);
+  DST(0, 2) = DST(1, 3)                         = AVG3(I, J, K);
+  DST(0, 1) = DST(1, 2) = DST(2, 3)             = AVG3(X, I, J);
+  DST(0, 0) = DST(1, 1) = DST(2, 2) = DST(3, 3) = AVG3(A, X, I);
+  DST(1, 0) = DST(2, 1) = DST(3, 2)             = AVG3(B, A, X);
+  DST(2, 0) = DST(3, 1)                         = AVG3(C, B, A);
+  DST(3, 0)                                     = AVG3(D, C, B);
+}
+
+static void LD4(uint8_t* dst, const uint8_t* top) {
+  const int A = top[0];
+  const int B = top[1];
+  const int C = top[2];
+  const int D = top[3];
+  const int E = top[4];
+  const int F = top[5];
+  const int G = top[6];
+  const int H = top[7];
+  DST(0, 0)                                     = AVG3(A, B, C);
+  DST(1, 0) = DST(0, 1)                         = AVG3(B, C, D);
+  DST(2, 0) = DST(1, 1) = DST(0, 2)             = AVG3(C, D, E);
+  DST(3, 0) = DST(2, 1) = DST(1, 2) = DST(0, 3) = AVG3(D, E, F);
+  DST(3, 1) = DST(2, 2) = DST(1, 3)             = AVG3(E, F, G);
+  DST(3, 2) = DST(2, 3)                         = AVG3(F, G, H);
+  DST(3, 3)                                     = AVG3(G, H, H);
+}
+
+static void VR4(uint8_t* dst, const uint8_t* top) {
+  const int X = top[-1];
+  const int I = top[-2];
+  const int J = top[-3];
+  const int K = top[-4];
+  const int A = top[0];
+  const int B = top[1];
+  const int C = top[2];
+  const int D = top[3];
+  DST(0, 0) = DST(1, 2) = AVG2(X, A);
+  DST(1, 0) = DST(2, 2) = AVG2(A, B);
+  DST(2, 0) = DST(3, 2) = AVG2(B, C);
+  DST(3, 0)             = AVG2(C, D);
+
+  DST(0, 3) =             AVG3(K, J, I);
+  DST(0, 2) =             AVG3(J, I, X);
+  DST(0, 1) = DST(1, 3) = AVG3(I, X, A);
+  DST(1, 1) = DST(2, 3) = AVG3(X, A, B);
+  DST(2, 1) = DST(3, 3) = AVG3(A, B, C);
+  DST(3, 1) =             AVG3(B, C, D);
+}
+
+static void VL4(uint8_t* dst, const uint8_t* top) {
+  const int A = top[0];
+  const int B = top[1];
+  const int C = top[2];
+  const int D = top[3];
+  const int E = top[4];
+  const int F = top[5];
+  const int G = top[6];
+  const int H = top[7];
+  DST(0, 0) =             AVG2(A, B);
+  DST(1, 0) = DST(0, 2) = AVG2(B, C);
+  DST(2, 0) = DST(1, 2) = AVG2(C, D);
+  DST(3, 0) = DST(2, 2) = AVG2(D, E);
+
+  DST(0, 1) =             AVG3(A, B, C);
+  DST(1, 1) = DST(0, 3) = AVG3(B, C, D);
+  DST(2, 1) = DST(1, 3) = AVG3(C, D, E);
+  DST(3, 1) = DST(2, 3) = AVG3(D, E, F);
+              DST(3, 2) = AVG3(E, F, G);
+              DST(3, 3) = AVG3(F, G, H);
+}
+
+static void HU4(uint8_t* dst, const uint8_t* top) {
+  const int I = top[-2];
+  const int J = top[-3];
+  const int K = top[-4];
+  const int L = top[-5];
+  DST(0, 0) =             AVG2(I, J);
+  DST(2, 0) = DST(0, 1) = AVG2(J, K);
+  DST(2, 1) = DST(0, 2) = AVG2(K, L);
+  DST(1, 0) =             AVG3(I, J, K);
+  DST(3, 0) = DST(1, 1) = AVG3(J, K, L);
+  DST(3, 1) = DST(1, 2) = AVG3(K, L, L);
+  DST(3, 2) = DST(2, 2) =
+  DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
+}
+
+static void HD4(uint8_t* dst, const uint8_t* top) {
+  const int X = top[-1];
+  const int I = top[-2];
+  const int J = top[-3];
+  const int K = top[-4];
+  const int L = top[-5];
+  const int A = top[0];
+  const int B = top[1];
+  const int C = top[2];
+
+  DST(0, 0) = DST(2, 1) = AVG2(I, X);
+  DST(0, 1) = DST(2, 2) = AVG2(J, I);
+  DST(0, 2) = DST(2, 3) = AVG2(K, J);
+  DST(0, 3)             = AVG2(L, K);
+
+  DST(3, 0)             = AVG3(A, B, C);
+  DST(2, 0)             = AVG3(X, A, B);
+  DST(1, 0) = DST(3, 1) = AVG3(I, X, A);
+  DST(1, 1) = DST(3, 2) = AVG3(J, I, X);
+  DST(1, 2) = DST(3, 3) = AVG3(K, J, I);
+  DST(1, 3)             = AVG3(L, K, J);
+}
+
+static void TM4(uint8_t* dst, const uint8_t* top) {
+  int x, y;
+  const uint8_t* const clip = clip1 + 255 - top[-1];
+  for (y = 0; y < 4; ++y) {
+    const uint8_t* const clip_table = clip + top[-2 - y];
+    for (x = 0; x < 4; ++x) {
+      dst[x] = clip_table[top[x]];
+    }
+    dst += BPS;
+  }
+}
+
+#undef DST
+#undef AVG3
+#undef AVG2
+
+// Left samples are top[-5 .. -2], top_left is top[-1], top are
+// located at top[0..3], and top right is top[4..7]
+static void Intra4Preds_C(uint8_t* dst, const uint8_t* top) {
+  DC4(I4DC4 + dst, top);
+  TM4(I4TM4 + dst, top);
+  VE4(I4VE4 + dst, top);
+  HE4(I4HE4 + dst, top);
+  RD4(I4RD4 + dst, top);
+  VR4(I4VR4 + dst, top);
+  LD4(I4LD4 + dst, top);
+  VL4(I4VL4 + dst, top);
+  HD4(I4HD4 + dst, top);
+  HU4(I4HU4 + dst, top);
+}
+
+//------------------------------------------------------------------------------
+// Metric
+
+#if !WEBP_NEON_OMIT_C_CODE
+static WEBP_INLINE int GetSSE(const uint8_t* a, const uint8_t* b,
+                              int w, int h) {
+  int count = 0;
+  int y, x;
+  for (y = 0; y < h; ++y) {
+    for (x = 0; x < w; ++x) {
+      const int diff = (int)a[x] - b[x];
+      count += diff * diff;
+    }
+    a += BPS;
+    b += BPS;
+  }
+  return count;
+}
+
+static int SSE16x16_C(const uint8_t* a, const uint8_t* b) {
+  return GetSSE(a, b, 16, 16);
+}
+static int SSE16x8_C(const uint8_t* a, const uint8_t* b) {
+  return GetSSE(a, b, 16, 8);
+}
+static int SSE8x8_C(const uint8_t* a, const uint8_t* b) {
+  return GetSSE(a, b, 8, 8);
+}
+static int SSE4x4_C(const uint8_t* a, const uint8_t* b) {
+  return GetSSE(a, b, 4, 4);
+}
+#endif  // !WEBP_NEON_OMIT_C_CODE
+
+static void Mean16x4_C(const uint8_t* ref, uint32_t dc[4]) {
+  int k, x, y;
+  for (k = 0; k < 4; ++k) {
+    uint32_t avg = 0;
+    for (y = 0; y < 4; ++y) {
+      for (x = 0; x < 4; ++x) {
+        avg += ref[x + y * BPS];
+      }
+    }
+    dc[k] = avg;
+    ref += 4;   // go to next 4x4 block.
+  }
+}
+
+//------------------------------------------------------------------------------
+// Texture distortion
+//
+// We try to match the spectral content (weighted) between source and
+// reconstructed samples.
+
+#if !WEBP_NEON_OMIT_C_CODE
+// Hadamard transform
+// Returns the weighted sum of the absolute value of transformed coefficients.
+// w[] contains a row-major 4 by 4 symmetric matrix.
+static int TTransform(const uint8_t* in, const uint16_t* w) {
+  int sum = 0;
+  int tmp[16];
+  int i;
+  // horizontal pass
+  for (i = 0; i < 4; ++i, in += BPS) {
+    const int a0 = in[0] + in[2];
+    const int a1 = in[1] + in[3];
+    const int a2 = in[1] - in[3];
+    const int a3 = in[0] - in[2];
+    tmp[0 + i * 4] = a0 + a1;
+    tmp[1 + i * 4] = a3 + a2;
+    tmp[2 + i * 4] = a3 - a2;
+    tmp[3 + i * 4] = a0 - a1;
+  }
+  // vertical pass
+  for (i = 0; i < 4; ++i, ++w) {
+    const int a0 = tmp[0 + i] + tmp[8 + i];
+    const int a1 = tmp[4 + i] + tmp[12+ i];
+    const int a2 = tmp[4 + i] - tmp[12+ i];
+    const int a3 = tmp[0 + i] - tmp[8 + i];
+    const int b0 = a0 + a1;
+    const int b1 = a3 + a2;
+    const int b2 = a3 - a2;
+    const int b3 = a0 - a1;
+
+    sum += w[ 0] * abs(b0);
+    sum += w[ 4] * abs(b1);
+    sum += w[ 8] * abs(b2);
+    sum += w[12] * abs(b3);
+  }
+  return sum;
+}
+
+static int Disto4x4_C(const uint8_t* const a, const uint8_t* const b,
+                      const uint16_t* const w) {
+  const int sum1 = TTransform(a, w);
+  const int sum2 = TTransform(b, w);
+  return abs(sum2 - sum1) >> 5;
+}
+
+static int Disto16x16_C(const uint8_t* const a, const uint8_t* const b,
+                        const uint16_t* const w) {
+  int D = 0;
+  int x, y;
+  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
+    for (x = 0; x < 16; x += 4) {
+      D += Disto4x4_C(a + x + y, b + x + y, w);
+    }
+  }
+  return D;
+}
+#endif  // !WEBP_NEON_OMIT_C_CODE
+
+//------------------------------------------------------------------------------
+// Quantization
+//
+
+static const uint8_t kZigzag[16] = {
+  0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
+};
+
+// Simple quantization
+static int QuantizeBlock_C(int16_t in[16], int16_t out[16],
+                           const VP8Matrix* const mtx) {
+  int last = -1;
+  int n;
+  for (n = 0; n < 16; ++n) {
+    const int j = kZigzag[n];
+    const int sign = (in[j] < 0);
+    const uint32_t coeff = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
+    if (coeff > mtx->zthresh_[j]) {
+      const uint32_t Q = mtx->q_[j];
+      const uint32_t iQ = mtx->iq_[j];
+      const uint32_t B = mtx->bias_[j];
+      int level = QUANTDIV(coeff, iQ, B);
+      if (level > MAX_LEVEL) level = MAX_LEVEL;
+      if (sign) level = -level;
+      in[j] = level * (int)Q;
+      out[n] = level;
+      if (level) last = n;
+    } else {
+      out[n] = 0;
+      in[j] = 0;
+    }
+  }
+  return (last >= 0);
+}
+
+#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
+static int Quantize2Blocks_C(int16_t in[32], int16_t out[32],
+                             const VP8Matrix* const mtx) {
+  int nz;
+  nz  = VP8EncQuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
+  nz |= VP8EncQuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
+  return nz;
+}
+#endif  // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
+
+//------------------------------------------------------------------------------
+// Block copy
+
+static WEBP_INLINE void Copy(const uint8_t* src, uint8_t* dst, int w, int h) {
+  int y;
+  for (y = 0; y < h; ++y) {
+    memcpy(dst, src, w);
+    src += BPS;
+    dst += BPS;
+  }
+}
+
+static void Copy4x4_C(const uint8_t* src, uint8_t* dst) {
+  Copy(src, dst, 4, 4);
+}
+
+static void Copy16x8_C(const uint8_t* src, uint8_t* dst) {
+  Copy(src, dst, 16, 8);
+}
+
+//------------------------------------------------------------------------------
+// Initialization
+
+// Speed-critical function pointers. We have to initialize them to the default
+// implementations within VP8EncDspInit().
+VP8CHisto VP8CollectHistogram;
+VP8Idct VP8ITransform;
+VP8Fdct VP8FTransform;
+VP8Fdct VP8FTransform2;
+VP8WHT VP8FTransformWHT;
+VP8Intra4Preds VP8EncPredLuma4;
+VP8IntraPreds VP8EncPredLuma16;
+VP8IntraPreds VP8EncPredChroma8;
+VP8Metric VP8SSE16x16;
+VP8Metric VP8SSE8x8;
+VP8Metric VP8SSE16x8;
+VP8Metric VP8SSE4x4;
+VP8WMetric VP8TDisto4x4;
+VP8WMetric VP8TDisto16x16;
+VP8MeanMetric VP8Mean16x4;
+VP8QuantizeBlock VP8EncQuantizeBlock;
+VP8Quantize2Blocks VP8EncQuantize2Blocks;
+VP8QuantizeBlockWHT VP8EncQuantizeBlockWHT;
+VP8BlockCopy VP8Copy4x4;
+VP8BlockCopy VP8Copy16x8;
+
+extern void VP8EncDspInitSSE2(void);
+extern void VP8EncDspInitSSE41(void);
+extern void VP8EncDspInitNEON(void);
+extern void VP8EncDspInitMIPS32(void);
+extern void VP8EncDspInitMIPSdspR2(void);
+extern void VP8EncDspInitMSA(void);
+
+WEBP_DSP_INIT_FUNC(VP8EncDspInit) {
+  VP8DspInit();  // common inverse transforms
+  InitTables();
+
+  // default C implementations
+#if !WEBP_NEON_OMIT_C_CODE
+  VP8ITransform = ITransform_C;
+  VP8FTransform = FTransform_C;
+  VP8FTransformWHT = FTransformWHT_C;
+  VP8TDisto4x4 = Disto4x4_C;
+  VP8TDisto16x16 = Disto16x16_C;
+  VP8CollectHistogram = CollectHistogram_C;
+  VP8SSE16x16 = SSE16x16_C;
+  VP8SSE16x8 = SSE16x8_C;
+  VP8SSE8x8 = SSE8x8_C;
+  VP8SSE4x4 = SSE4x4_C;
+#endif
+
+#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
+  VP8EncQuantizeBlock = QuantizeBlock_C;
+  VP8EncQuantize2Blocks = Quantize2Blocks_C;
+#endif
+
+  VP8FTransform2 = FTransform2_C;
+  VP8EncPredLuma4 = Intra4Preds_C;
+  VP8EncPredLuma16 = Intra16Preds_C;
+  VP8EncPredChroma8 = IntraChromaPreds_C;
+  VP8Mean16x4 = Mean16x4_C;
+  VP8EncQuantizeBlockWHT = QuantizeBlock_C;
+  VP8Copy4x4 = Copy4x4_C;
+  VP8Copy16x8 = Copy16x8_C;
+
+  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
+  if (VP8GetCPUInfo != NULL) {
+#if defined(WEBP_HAVE_SSE2)
+    if (VP8GetCPUInfo(kSSE2)) {
+      VP8EncDspInitSSE2();
+#if defined(WEBP_HAVE_SSE41)
+      if (VP8GetCPUInfo(kSSE4_1)) {
+        VP8EncDspInitSSE41();
+      }
+#endif
+    }
+#endif
+#if defined(WEBP_USE_MIPS32)
+    if (VP8GetCPUInfo(kMIPS32)) {
+      VP8EncDspInitMIPS32();
+    }
+#endif
+#if defined(WEBP_USE_MIPS_DSP_R2)
+    if (VP8GetCPUInfo(kMIPSdspR2)) {
+      VP8EncDspInitMIPSdspR2();
+    }
+#endif
+#if defined(WEBP_USE_MSA)
+    if (VP8GetCPUInfo(kMSA)) {
+      VP8EncDspInitMSA();
+    }
+#endif
+  }
+
+#if defined(WEBP_HAVE_NEON)
+  if (WEBP_NEON_OMIT_C_CODE ||
+      (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
+    VP8EncDspInitNEON();
+  }
+#endif
+
+  assert(VP8ITransform != NULL);
+  assert(VP8FTransform != NULL);
+  assert(VP8FTransformWHT != NULL);
+  assert(VP8TDisto4x4 != NULL);
+  assert(VP8TDisto16x16 != NULL);
+  assert(VP8CollectHistogram != NULL);
+  assert(VP8SSE16x16 != NULL);
+  assert(VP8SSE16x8 != NULL);
+  assert(VP8SSE8x8 != NULL);
+  assert(VP8SSE4x4 != NULL);
+  assert(VP8EncQuantizeBlock != NULL);
+  assert(VP8EncQuantize2Blocks != NULL);
+  assert(VP8FTransform2 != NULL);
+  assert(VP8EncPredLuma4 != NULL);
+  assert(VP8EncPredLuma16 != NULL);
+  assert(VP8EncPredChroma8 != NULL);
+  assert(VP8Mean16x4 != NULL);
+  assert(VP8EncQuantizeBlockWHT != NULL);
+  assert(VP8Copy4x4 != NULL);
+  assert(VP8Copy16x8 != NULL);
+}
diff --git a/media/libwebp/dsp/enc_mips32.c b/media/libwebp/dsp/enc_mips32.c
new file mode 100644
index 0000000000..ee26dfb493
--- /dev/null
+++ b/media/libwebp/dsp/enc_mips32.c
@@ -0,0 +1,677 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MIPS version of speed-critical encoding functions.
+//
+// Author(s): Djordje Pesut    (djordje.pesut@imgtec.com)
+//            Jovan Zelincevic (jovan.zelincevic@imgtec.com)
+//            Slobodan Prijic  (slobodan.prijic@imgtec.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_MIPS32)
+
+#include "../dsp/mips_macro.h"
+#include "../enc/vp8i_enc.h"
+#include "../enc/cost_enc.h"
+
+static const int kC1 = 20091 + (1 << 16);
+static const int kC2 = 35468;
+
+// macro for one vertical pass in ITransformOne
+// MUL macro inlined
+// temp0..temp15 holds tmp[0]..tmp[15]
+// A..D - offsets in bytes to load from in buffer
+// TEMP0..TEMP3 - registers for corresponding tmp elements
+// TEMP4..TEMP5 - temporary registers
+#define VERTICAL_PASS(A, B, C, D, TEMP4, TEMP0, TEMP1, TEMP2, TEMP3)        \
+  "lh      %[temp16],      " #A "(%[temp20])                 \n\t"          \
+  "lh      %[temp18],      " #B "(%[temp20])                 \n\t"          \
+  "lh      %[temp17],      " #C "(%[temp20])                 \n\t"          \
+  "lh      %[temp19],      " #D "(%[temp20])                 \n\t"          \
+  "addu    %[" #TEMP4 "],    %[temp16],      %[temp18]       \n\t"          \
+  "subu    %[temp16],      %[temp16],      %[temp18]         \n\t"          \
+  "mul     %[" #TEMP0 "],    %[temp17],      %[kC2]          \n\t"          \
+  "mul     %[temp18],      %[temp19],      %[kC1]            \n\t"          \
+  "mul     %[temp17],      %[temp17],      %[kC1]            \n\t"          \
+  "mul     %[temp19],      %[temp19],      %[kC2]            \n\t"          \
+  "sra     %[" #TEMP0 "],    %[" #TEMP0 "],    16            \n\n"          \
+  "sra     %[temp18],      %[temp18],      16                \n\n"          \
+  "sra     %[temp17],      %[temp17],      16                \n\n"          \
+  "sra     %[temp19],      %[temp19],      16                \n\n"          \
+  "subu    %[" #TEMP2 "],    %[" #TEMP0 "],    %[temp18]     \n\t"          \
+  "addu    %[" #TEMP3 "],    %[temp17],      %[temp19]       \n\t"          \
+  "addu    %[" #TEMP0 "],    %[" #TEMP4 "],    %[" #TEMP3 "] \n\t"          \
+  "addu    %[" #TEMP1 "],    %[temp16],      %[" #TEMP2 "]   \n\t"          \
+  "subu    %[" #TEMP2 "],    %[temp16],      %[" #TEMP2 "]   \n\t"          \
+  "subu    %[" #TEMP3 "],    %[" #TEMP4 "],    %[" #TEMP3 "] \n\t"
+
+// macro for one horizontal pass in ITransformOne
+// MUL and STORE macros inlined
+// a = clip_8b(a) is replaced with: a = max(a, 0); a = min(a, 255)
+// temp0..temp15 holds tmp[0]..tmp[15]
+// A - offset in bytes to load from ref and store to dst buffer
+// TEMP0, TEMP4, TEMP8 and TEMP12 - registers for corresponding tmp elements
+#define HORIZONTAL_PASS(A, TEMP0, TEMP4, TEMP8, TEMP12)                       \
+  "addiu   %[" #TEMP0 "],    %[" #TEMP0 "],    4               \n\t"          \
+  "addu    %[temp16],      %[" #TEMP0 "],    %[" #TEMP8 "]     \n\t"          \
+  "subu    %[temp17],      %[" #TEMP0 "],    %[" #TEMP8 "]     \n\t"          \
+  "mul     %[" #TEMP0 "],    %[" #TEMP4 "],    %[kC2]          \n\t"          \
+  "mul     %[" #TEMP8 "],    %[" #TEMP12 "],   %[kC1]          \n\t"          \
+  "mul     %[" #TEMP4 "],    %[" #TEMP4 "],    %[kC1]          \n\t"          \
+  "mul     %[" #TEMP12 "],   %[" #TEMP12 "],   %[kC2]          \n\t"          \
+  "sra     %[" #TEMP0 "],    %[" #TEMP0 "],    16              \n\t"          \
+  "sra     %[" #TEMP8 "],    %[" #TEMP8 "],    16              \n\t"          \
+  "sra     %[" #TEMP4 "],    %[" #TEMP4 "],    16              \n\t"          \
+  "sra     %[" #TEMP12 "],   %[" #TEMP12 "],   16              \n\t"          \
+  "subu    %[temp18],      %[" #TEMP0 "],    %[" #TEMP8 "]     \n\t"          \
+  "addu    %[temp19],      %[" #TEMP4 "],    %[" #TEMP12 "]    \n\t"          \
+  "addu    %[" #TEMP0 "],    %[temp16],      %[temp19]         \n\t"          \
+  "addu    %[" #TEMP4 "],    %[temp17],      %[temp18]         \n\t"          \
+  "subu    %[" #TEMP8 "],    %[temp17],      %[temp18]         \n\t"          \
+  "subu    %[" #TEMP12 "],   %[temp16],      %[temp19]         \n\t"          \
+  "lw      %[temp20],      0(%[args])                          \n\t"          \
+  "sra     %[" #TEMP0 "],    %[" #TEMP0 "],    3               \n\t"          \
+  "sra     %[" #TEMP4 "],    %[" #TEMP4 "],    3               \n\t"          \
+  "sra     %[" #TEMP8 "],    %[" #TEMP8 "],    3               \n\t"          \
+  "sra     %[" #TEMP12 "],   %[" #TEMP12 "],   3               \n\t"          \
+  "lbu     %[temp16],      0+" XSTR(BPS) "*" #A "(%[temp20])   \n\t"          \
+  "lbu     %[temp17],      1+" XSTR(BPS) "*" #A "(%[temp20])   \n\t"          \
+  "lbu     %[temp18],      2+" XSTR(BPS) "*" #A "(%[temp20])   \n\t"          \
+  "lbu     %[temp19],      3+" XSTR(BPS) "*" #A "(%[temp20])   \n\t"          \
+  "addu    %[" #TEMP0 "],    %[temp16],      %[" #TEMP0 "]     \n\t"          \
+  "addu    %[" #TEMP4 "],    %[temp17],      %[" #TEMP4 "]     \n\t"          \
+  "addu    %[" #TEMP8 "],    %[temp18],      %[" #TEMP8 "]     \n\t"          \
+  "addu    %[" #TEMP12 "],   %[temp19],      %[" #TEMP12 "]    \n\t"          \
+  "slt     %[temp16],      %[" #TEMP0 "],    $zero             \n\t"          \
+  "slt     %[temp17],      %[" #TEMP4 "],    $zero             \n\t"          \
+  "slt     %[temp18],      %[" #TEMP8 "],    $zero             \n\t"          \
+  "slt     %[temp19],      %[" #TEMP12 "],   $zero             \n\t"          \
+  "movn    %[" #TEMP0 "],    $zero,          %[temp16]         \n\t"          \
+  "movn    %[" #TEMP4 "],    $zero,          %[temp17]         \n\t"          \
+  "movn    %[" #TEMP8 "],    $zero,          %[temp18]         \n\t"          \
+  "movn    %[" #TEMP12 "],   $zero,          %[temp19]         \n\t"          \
+  "addiu   %[temp20],      $zero,          255                 \n\t"          \
+  "slt     %[temp16],      %[" #TEMP0 "],    %[temp20]         \n\t"          \
+  "slt     %[temp17],      %[" #TEMP4 "],    %[temp20]         \n\t"          \
+  "slt     %[temp18],      %[" #TEMP8 "],    %[temp20]         \n\t"          \
+  "slt     %[temp19],      %[" #TEMP12 "],   %[temp20]         \n\t"          \
+  "movz    %[" #TEMP0 "],    %[temp20],      %[temp16]         \n\t"          \
+  "movz    %[" #TEMP4 "],    %[temp20],      %[temp17]         \n\t"          \
+  "lw      %[temp16],      8(%[args])                          \n\t"          \
+  "movz    %[" #TEMP8 "],    %[temp20],      %[temp18]         \n\t"          \
+  "movz    %[" #TEMP12 "],   %[temp20],      %[temp19]         \n\t"          \
+  "sb      %[" #TEMP0 "],    0+" XSTR(BPS) "*" #A "(%[temp16]) \n\t"          \
+  "sb      %[" #TEMP4 "],    1+" XSTR(BPS) "*" #A "(%[temp16]) \n\t"          \
+  "sb      %[" #TEMP8 "],    2+" XSTR(BPS) "*" #A "(%[temp16]) \n\t"          \
+  "sb      %[" #TEMP12 "],   3+" XSTR(BPS) "*" #A "(%[temp16]) \n\t"
+
+// Does one or two inverse transforms.
+static WEBP_INLINE void ITransformOne_MIPS32(const uint8_t* ref,
+                                             const int16_t* in,
+                                             uint8_t* dst) {
+  int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
+  int temp7, temp8, temp9, temp10, temp11, temp12, temp13;
+  int temp14, temp15, temp16, temp17, temp18, temp19, temp20;
+  const int* args[3] = {(const int*)ref, (const int*)in, (const int*)dst};
+
+  __asm__ volatile(
+    "lw      %[temp20],      4(%[args])                      \n\t"
+    VERTICAL_PASS(0, 16,  8, 24, temp4,  temp0,  temp1,  temp2,  temp3)
+    VERTICAL_PASS(2, 18, 10, 26, temp8,  temp4,  temp5,  temp6,  temp7)
+    VERTICAL_PASS(4, 20, 12, 28, temp12, temp8,  temp9,  temp10, temp11)
+    VERTICAL_PASS(6, 22, 14, 30, temp20, temp12, temp13, temp14, temp15)
+
+    HORIZONTAL_PASS(0, temp0, temp4, temp8,  temp12)
+    HORIZONTAL_PASS(1, temp1, temp5, temp9,  temp13)
+    HORIZONTAL_PASS(2, temp2, temp6, temp10, temp14)
+    HORIZONTAL_PASS(3, temp3, temp7, temp11, temp15)
+
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
+      [temp9]"=&r"(temp9), [temp10]"=&r"(temp10), [temp11]"=&r"(temp11),
+      [temp12]"=&r"(temp12), [temp13]"=&r"(temp13), [temp14]"=&r"(temp14),
+      [temp15]"=&r"(temp15), [temp16]"=&r"(temp16), [temp17]"=&r"(temp17),
+      [temp18]"=&r"(temp18), [temp19]"=&r"(temp19), [temp20]"=&r"(temp20)
+    : [args]"r"(args), [kC1]"r"(kC1), [kC2]"r"(kC2)
+    : "memory", "hi", "lo"
+  );
+}
+
+static void ITransform_MIPS32(const uint8_t* ref, const int16_t* in,
+                              uint8_t* dst, int do_two) {
+  ITransformOne_MIPS32(ref, in, dst);
+  if (do_two) {
+    ITransformOne_MIPS32(ref + 4, in + 16, dst + 4);
+  }
+}
+
+#undef VERTICAL_PASS
+#undef HORIZONTAL_PASS
+
+// macro for one pass through for loop in QuantizeBlock
+// QUANTDIV macro inlined
+// J - offset in bytes (kZigzag[n] * 2)
+// K - offset in bytes (kZigzag[n] * 4)
+// N - offset in bytes (n * 2)
+#define QUANTIZE_ONE(J, K, N)                                               \
+  "lh           %[temp0],       " #J "(%[ppin])                     \n\t"   \
+  "lhu          %[temp1],       " #J "(%[ppsharpen])                \n\t"   \
+  "lw           %[temp2],       " #K "(%[ppzthresh])                \n\t"   \
+  "sra          %[sign],        %[temp0],           15              \n\t"   \
+  "xor          %[coeff],       %[temp0],           %[sign]         \n\t"   \
+  "subu         %[coeff],       %[coeff],           %[sign]         \n\t"   \
+  "addu         %[coeff],       %[coeff],           %[temp1]        \n\t"   \
+  "slt          %[temp4],       %[temp2],           %[coeff]        \n\t"   \
+  "addiu        %[temp5],       $zero,              0               \n\t"   \
+  "addiu        %[level],       $zero,              0               \n\t"   \
+  "beqz         %[temp4],       2f                                  \n\t"   \
+  "lhu          %[temp1],       " #J "(%[ppiq])                     \n\t"   \
+  "lw           %[temp2],       " #K "(%[ppbias])                   \n\t"   \
+  "lhu          %[temp3],       " #J "(%[ppq])                      \n\t"   \
+  "mul          %[level],       %[coeff],           %[temp1]        \n\t"   \
+  "addu         %[level],       %[level],           %[temp2]        \n\t"   \
+  "sra          %[level],       %[level],           17              \n\t"   \
+  "slt          %[temp4],       %[max_level],       %[level]        \n\t"   \
+  "movn         %[level],       %[max_level],       %[temp4]        \n\t"   \
+  "xor          %[level],       %[level],           %[sign]         \n\t"   \
+  "subu         %[level],       %[level],           %[sign]         \n\t"   \
+  "mul          %[temp5],       %[level],           %[temp3]        \n\t"   \
+"2:                                                                 \n\t"   \
+  "sh           %[temp5],       " #J "(%[ppin])                     \n\t"   \
+  "sh           %[level],       " #N "(%[pout])                     \n\t"
+
+static int QuantizeBlock_MIPS32(int16_t in[16], int16_t out[16],
+                                const VP8Matrix* const mtx) {
+  int temp0, temp1, temp2, temp3, temp4, temp5;
+  int sign, coeff, level, i;
+  int max_level = MAX_LEVEL;
+
+  int16_t* ppin             = &in[0];
+  int16_t* pout             = &out[0];
+  const uint16_t* ppsharpen = &mtx->sharpen_[0];
+  const uint32_t* ppzthresh = &mtx->zthresh_[0];
+  const uint16_t* ppq       = &mtx->q_[0];
+  const uint16_t* ppiq      = &mtx->iq_[0];
+  const uint32_t* ppbias    = &mtx->bias_[0];
+
+  __asm__ volatile(
+    QUANTIZE_ONE( 0,  0,  0)
+    QUANTIZE_ONE( 2,  4,  2)
+    QUANTIZE_ONE( 8, 16,  4)
+    QUANTIZE_ONE(16, 32,  6)
+    QUANTIZE_ONE(10, 20,  8)
+    QUANTIZE_ONE( 4,  8, 10)
+    QUANTIZE_ONE( 6, 12, 12)
+    QUANTIZE_ONE(12, 24, 14)
+    QUANTIZE_ONE(18, 36, 16)
+    QUANTIZE_ONE(24, 48, 18)
+    QUANTIZE_ONE(26, 52, 20)
+    QUANTIZE_ONE(20, 40, 22)
+    QUANTIZE_ONE(14, 28, 24)
+    QUANTIZE_ONE(22, 44, 26)
+    QUANTIZE_ONE(28, 56, 28)
+    QUANTIZE_ONE(30, 60, 30)
+
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+      [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+      [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+      [sign]"=&r"(sign), [coeff]"=&r"(coeff),
+      [level]"=&r"(level)
+    : [pout]"r"(pout), [ppin]"r"(ppin),
+      [ppiq]"r"(ppiq), [max_level]"r"(max_level),
+      [ppbias]"r"(ppbias), [ppzthresh]"r"(ppzthresh),
+      [ppsharpen]"r"(ppsharpen), [ppq]"r"(ppq)
+    : "memory", "hi", "lo"
+  );
+
+  // moved out from macro to increase possibility for earlier breaking
+  for (i = 15; i >= 0; i--) {
+    if (out[i]) return 1;
+  }
+  return 0;
+}
+
+static int Quantize2Blocks_MIPS32(int16_t in[32], int16_t out[32],
+                                  const VP8Matrix* const mtx) {
+  int nz;
+  nz  = QuantizeBlock_MIPS32(in + 0 * 16, out + 0 * 16, mtx) << 0;
+  nz |= QuantizeBlock_MIPS32(in + 1 * 16, out + 1 * 16, mtx) << 1;
+  return nz;
+}
+
+#undef QUANTIZE_ONE
+
+// macro for one horizontal pass in Disto4x4 (TTransform)
+// two calls of function TTransform are merged into single one
+// A - offset in bytes to load from a and b buffers
+// E..H - offsets in bytes to store first results to tmp buffer
+// E1..H1 - offsets in bytes to store second results to tmp buffer
+#define HORIZONTAL_PASS(A, E, F, G, H, E1, F1, G1, H1)                  \
+  "lbu    %[temp0],  0+" XSTR(BPS) "*" #A "(%[a])  \n\t"                \
+  "lbu    %[temp1],  1+" XSTR(BPS) "*" #A "(%[a])  \n\t"                \
+  "lbu    %[temp2],  2+" XSTR(BPS) "*" #A "(%[a])  \n\t"                \
+  "lbu    %[temp3],  3+" XSTR(BPS) "*" #A "(%[a])  \n\t"                \
+  "lbu    %[temp4],  0+" XSTR(BPS) "*" #A "(%[b])  \n\t"                \
+  "lbu    %[temp5],  1+" XSTR(BPS) "*" #A "(%[b])  \n\t"                \
+  "lbu    %[temp6],  2+" XSTR(BPS) "*" #A "(%[b])  \n\t"                \
+  "lbu    %[temp7],  3+" XSTR(BPS) "*" #A "(%[b])  \n\t"                \
+  "addu   %[temp8],  %[temp0],    %[temp2]         \n\t"                \
+  "subu   %[temp0],  %[temp0],    %[temp2]         \n\t"                \
+  "addu   %[temp2],  %[temp1],    %[temp3]         \n\t"                \
+  "subu   %[temp1],  %[temp1],    %[temp3]         \n\t"                \
+  "addu   %[temp3],  %[temp4],    %[temp6]         \n\t"                \
+  "subu   %[temp4],  %[temp4],    %[temp6]         \n\t"                \
+  "addu   %[temp6],  %[temp5],    %[temp7]         \n\t"                \
+  "subu   %[temp5],  %[temp5],    %[temp7]         \n\t"                \
+  "addu   %[temp7],  %[temp8],    %[temp2]         \n\t"                \
+  "subu   %[temp2],  %[temp8],    %[temp2]         \n\t"                \
+  "addu   %[temp8],  %[temp0],    %[temp1]         \n\t"                \
+  "subu   %[temp0],  %[temp0],    %[temp1]         \n\t"                \
+  "addu   %[temp1],  %[temp3],    %[temp6]         \n\t"                \
+  "subu   %[temp3],  %[temp3],    %[temp6]         \n\t"                \
+  "addu   %[temp6],  %[temp4],    %[temp5]         \n\t"                \
+  "subu   %[temp4],  %[temp4],    %[temp5]         \n\t"                \
+  "sw     %[temp7],  " #E "(%[tmp])                \n\t"                \
+  "sw     %[temp2],  " #H "(%[tmp])                \n\t"                \
+  "sw     %[temp8],  " #F "(%[tmp])                \n\t"                \
+  "sw     %[temp0],  " #G "(%[tmp])                \n\t"                \
+  "sw     %[temp1],  " #E1 "(%[tmp])               \n\t"                \
+  "sw     %[temp3],  " #H1 "(%[tmp])               \n\t"                \
+  "sw     %[temp6],  " #F1 "(%[tmp])               \n\t"                \
+  "sw     %[temp4],  " #G1 "(%[tmp])               \n\t"
+
+// macro for one vertical pass in Disto4x4 (TTransform)
+// two calls of function TTransform are merged into single one
+// since only one accu is available in mips32r1 instruction set
+//   first is done second call of function TTransform and after
+//   that first one.
+//   const int sum1 = TTransform(a, w);
+//   const int sum2 = TTransform(b, w);
+//   return abs(sum2 - sum1) >> 5;
+//   (sum2 - sum1) is calculated with madds (sub2) and msubs (sub1)
+// A..D - offsets in bytes to load first results from tmp buffer
+// A1..D1 - offsets in bytes to load second results from tmp buffer
+// E..H - offsets in bytes to load from w buffer
+#define VERTICAL_PASS(A, B, C, D, A1, B1, C1, D1, E, F, G, H)     \
+  "lw     %[temp0],  " #A1 "(%[tmp])         \n\t"                \
+  "lw     %[temp1],  " #C1 "(%[tmp])         \n\t"                \
+  "lw     %[temp2],  " #B1 "(%[tmp])         \n\t"                \
+  "lw     %[temp3],  " #D1 "(%[tmp])         \n\t"                \
+  "addu   %[temp8],  %[temp0],    %[temp1]   \n\t"                \
+  "subu   %[temp0],  %[temp0],    %[temp1]   \n\t"                \
+  "addu   %[temp1],  %[temp2],    %[temp3]   \n\t"                \
+  "subu   %[temp2],  %[temp2],    %[temp3]   \n\t"                \
+  "addu   %[temp3],  %[temp8],    %[temp1]   \n\t"                \
+  "subu   %[temp8],  %[temp8],    %[temp1]   \n\t"                \
+  "addu   %[temp1],  %[temp0],    %[temp2]   \n\t"                \
+  "subu   %[temp0],  %[temp0],    %[temp2]   \n\t"                \
+  "sra    %[temp4],  %[temp3],    31         \n\t"                \
+  "sra    %[temp5],  %[temp1],    31         \n\t"                \
+  "sra    %[temp6],  %[temp0],    31         \n\t"                \
+  "sra    %[temp7],  %[temp8],    31         \n\t"                \
+  "xor    %[temp3],  %[temp3],    %[temp4]   \n\t"                \
+  "xor    %[temp1],  %[temp1],    %[temp5]   \n\t"                \
+  "xor    %[temp0],  %[temp0],    %[temp6]   \n\t"                \
+  "xor    %[temp8],  %[temp8],    %[temp7]   \n\t"                \
+  "subu   %[temp3],  %[temp3],    %[temp4]   \n\t"                \
+  "subu   %[temp1],  %[temp1],    %[temp5]   \n\t"                \
+  "subu   %[temp0],  %[temp0],    %[temp6]   \n\t"                \
+  "subu   %[temp8],  %[temp8],    %[temp7]   \n\t"                \
+  "lhu    %[temp4],  " #E "(%[w])            \n\t"                \
+  "lhu    %[temp5],  " #F "(%[w])            \n\t"                \
+  "lhu    %[temp6],  " #G "(%[w])            \n\t"                \
+  "lhu    %[temp7],  " #H "(%[w])            \n\t"                \
+  "madd   %[temp4],  %[temp3]                \n\t"                \
+  "madd   %[temp5],  %[temp1]                \n\t"                \
+  "madd   %[temp6],  %[temp0]                \n\t"                \
+  "madd   %[temp7],  %[temp8]                \n\t"                \
+  "lw     %[temp0],  " #A "(%[tmp])          \n\t"                \
+  "lw     %[temp1],  " #C "(%[tmp])          \n\t"                \
+  "lw     %[temp2],  " #B "(%[tmp])          \n\t"                \
+  "lw     %[temp3],  " #D "(%[tmp])          \n\t"                \
+  "addu   %[temp8],  %[temp0],    %[temp1]   \n\t"                \
+  "subu   %[temp0],  %[temp0],    %[temp1]   \n\t"                \
+  "addu   %[temp1],  %[temp2],    %[temp3]   \n\t"                \
+  "subu   %[temp2],  %[temp2],    %[temp3]   \n\t"                \
+  "addu   %[temp3],  %[temp8],    %[temp1]   \n\t"                \
+  "subu   %[temp1],  %[temp8],    %[temp1]   \n\t"                \
+  "addu   %[temp8],  %[temp0],    %[temp2]   \n\t"                \
+  "subu   %[temp0],  %[temp0],    %[temp2]   \n\t"                \
+  "sra    %[temp2],  %[temp3],    31         \n\t"                \
+  "xor    %[temp3],  %[temp3],    %[temp2]   \n\t"                \
+  "subu   %[temp3],  %[temp3],    %[temp2]   \n\t"                \
+  "msub   %[temp4],  %[temp3]                \n\t"                \
+  "sra    %[temp2],  %[temp8],    31         \n\t"                \
+  "sra    %[temp3],  %[temp0],    31         \n\t"                \
+  "sra    %[temp4],  %[temp1],    31         \n\t"                \
+  "xor    %[temp8],  %[temp8],    %[temp2]   \n\t"                \
+  "xor    %[temp0],  %[temp0],    %[temp3]   \n\t"                \
+  "xor    %[temp1],  %[temp1],    %[temp4]   \n\t"                \
+  "subu   %[temp8],  %[temp8],    %[temp2]   \n\t"                \
+  "subu   %[temp0],  %[temp0],    %[temp3]   \n\t"                \
+  "subu   %[temp1],  %[temp1],    %[temp4]   \n\t"                \
+  "msub   %[temp5],  %[temp8]                \n\t"                \
+  "msub   %[temp6],  %[temp0]                \n\t"                \
+  "msub   %[temp7],  %[temp1]                \n\t"
+
+static int Disto4x4_MIPS32(const uint8_t* const a, const uint8_t* const b,
+                           const uint16_t* const w) {
+  int tmp[32];
+  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
+
+  __asm__ volatile(
+    HORIZONTAL_PASS(0,   0,  4,  8, 12,    64,  68,  72,  76)
+    HORIZONTAL_PASS(1,  16, 20, 24, 28,    80,  84,  88,  92)
+    HORIZONTAL_PASS(2,  32, 36, 40, 44,    96, 100, 104, 108)
+    HORIZONTAL_PASS(3,  48, 52, 56, 60,   112, 116, 120, 124)
+    "mthi   $zero                             \n\t"
+    "mtlo   $zero                             \n\t"
+    VERTICAL_PASS( 0, 16, 32, 48,     64, 80,  96, 112,   0,  8, 16, 24)
+    VERTICAL_PASS( 4, 20, 36, 52,     68, 84, 100, 116,   2, 10, 18, 26)
+    VERTICAL_PASS( 8, 24, 40, 56,     72, 88, 104, 120,   4, 12, 20, 28)
+    VERTICAL_PASS(12, 28, 44, 60,     76, 92, 108, 124,   6, 14, 22, 30)
+    "mflo   %[temp0]                          \n\t"
+    "sra    %[temp1],  %[temp0],  31          \n\t"
+    "xor    %[temp0],  %[temp0],  %[temp1]    \n\t"
+    "subu   %[temp0],  %[temp0],  %[temp1]    \n\t"
+    "sra    %[temp0],  %[temp0],  5           \n\t"
+
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8)
+    : [a]"r"(a), [b]"r"(b), [w]"r"(w), [tmp]"r"(tmp)
+    : "memory", "hi", "lo"
+  );
+
+  return temp0;
+}
+
+#undef VERTICAL_PASS
+#undef HORIZONTAL_PASS
+
+static int Disto16x16_MIPS32(const uint8_t* const a, const uint8_t* const b,
+                             const uint16_t* const w) {
+  int D = 0;
+  int x, y;
+  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
+    for (x = 0; x < 16; x += 4) {
+      D += Disto4x4_MIPS32(a + x + y, b + x + y, w);
+    }
+  }
+  return D;
+}
+
+// macro for one horizontal pass in FTransform
+// temp0..temp15 holds tmp[0]..tmp[15]
+// A - offset in bytes to load from src and ref buffers
+// TEMP0..TEMP3 - registers for corresponding tmp elements
+#define HORIZONTAL_PASS(A, TEMP0, TEMP1, TEMP2, TEMP3)                  \
+  "lw     %[" #TEMP1 "],  0(%[args])                           \n\t"    \
+  "lw     %[" #TEMP2 "],  4(%[args])                           \n\t"    \
+  "lbu    %[temp16],    0+" XSTR(BPS) "*" #A "(%[" #TEMP1 "])  \n\t"    \
+  "lbu    %[temp17],    0+" XSTR(BPS) "*" #A "(%[" #TEMP2 "])  \n\t"    \
+  "lbu    %[temp18],    1+" XSTR(BPS) "*" #A "(%[" #TEMP1 "])  \n\t"    \
+  "lbu    %[temp19],    1+" XSTR(BPS) "*" #A "(%[" #TEMP2 "])  \n\t"    \
+  "subu   %[temp20],    %[temp16],    %[temp17]                \n\t"    \
+  "lbu    %[temp16],    2+" XSTR(BPS) "*" #A "(%[" #TEMP1 "])  \n\t"    \
+  "lbu    %[temp17],    2+" XSTR(BPS) "*" #A "(%[" #TEMP2 "])  \n\t"    \
+  "subu   %[" #TEMP0 "],  %[temp18],    %[temp19]              \n\t"    \
+  "lbu    %[temp18],    3+" XSTR(BPS) "*" #A "(%[" #TEMP1 "])  \n\t"    \
+  "lbu    %[temp19],    3+" XSTR(BPS) "*" #A "(%[" #TEMP2 "])  \n\t"    \
+  "subu   %[" #TEMP1 "],  %[temp16],    %[temp17]              \n\t"    \
+  "subu   %[" #TEMP2 "],  %[temp18],    %[temp19]              \n\t"    \
+  "addu   %[" #TEMP3 "],  %[temp20],    %[" #TEMP2 "]          \n\t"    \
+  "subu   %[" #TEMP2 "],  %[temp20],    %[" #TEMP2 "]          \n\t"    \
+  "addu   %[temp20],    %[" #TEMP0 "],  %[" #TEMP1 "]          \n\t"    \
+  "subu   %[" #TEMP0 "],  %[" #TEMP0 "],  %[" #TEMP1 "]        \n\t"    \
+  "mul    %[temp16],    %[" #TEMP2 "],  %[c5352]               \n\t"    \
+  "mul    %[temp17],    %[" #TEMP2 "],  %[c2217]               \n\t"    \
+  "mul    %[temp18],    %[" #TEMP0 "],  %[c5352]               \n\t"    \
+  "mul    %[temp19],    %[" #TEMP0 "],  %[c2217]               \n\t"    \
+  "addu   %[" #TEMP1 "],  %[" #TEMP3 "],  %[temp20]            \n\t"    \
+  "subu   %[temp20],    %[" #TEMP3 "],  %[temp20]              \n\t"    \
+  "sll    %[" #TEMP0 "],  %[" #TEMP1 "],  3                    \n\t"    \
+  "sll    %[" #TEMP2 "],  %[temp20],    3                      \n\t"    \
+  "addiu  %[temp16],    %[temp16],    1812                     \n\t"    \
+  "addiu  %[temp17],    %[temp17],    937                      \n\t"    \
+  "addu   %[temp16],    %[temp16],    %[temp19]                \n\t"    \
+  "subu   %[temp17],    %[temp17],    %[temp18]                \n\t"    \
+  "sra    %[" #TEMP1 "],  %[temp16],    9                      \n\t"    \
+  "sra    %[" #TEMP3 "],  %[temp17],    9                      \n\t"
+
+// macro for one vertical pass in FTransform
+// temp0..temp15 holds tmp[0]..tmp[15]
+// A..D - offsets in bytes to store to out buffer
+// TEMP0, TEMP4, TEMP8 and TEMP12 - registers for corresponding tmp elements
+#define VERTICAL_PASS(A, B, C, D, TEMP0, TEMP4, TEMP8, TEMP12)    \
+  "addu   %[temp16],    %[" #TEMP0 "],  %[" #TEMP12 "]   \n\t"    \
+  "subu   %[temp19],    %[" #TEMP0 "],  %[" #TEMP12 "]   \n\t"    \
+  "addu   %[temp17],    %[" #TEMP4 "],  %[" #TEMP8 "]    \n\t"    \
+  "subu   %[temp18],    %[" #TEMP4 "],  %[" #TEMP8 "]    \n\t"    \
+  "mul    %[" #TEMP8 "],  %[temp19],    %[c2217]         \n\t"    \
+  "mul    %[" #TEMP12 "], %[temp18],    %[c2217]         \n\t"    \
+  "mul    %[" #TEMP4 "],  %[temp19],    %[c5352]         \n\t"    \
+  "mul    %[temp18],    %[temp18],    %[c5352]           \n\t"    \
+  "addiu  %[temp16],    %[temp16],    7                  \n\t"    \
+  "addu   %[" #TEMP0 "],  %[temp16],    %[temp17]        \n\t"    \
+  "sra    %[" #TEMP0 "],  %[" #TEMP0 "],  4              \n\t"    \
+  "addu   %[" #TEMP12 "], %[" #TEMP12 "], %[" #TEMP4 "]  \n\t"    \
+  "subu   %[" #TEMP4 "],  %[temp16],    %[temp17]        \n\t"    \
+  "sra    %[" #TEMP4 "],  %[" #TEMP4 "],  4              \n\t"    \
+  "addiu  %[" #TEMP8 "],  %[" #TEMP8 "],  30000          \n\t"    \
+  "addiu  %[" #TEMP12 "], %[" #TEMP12 "], 12000          \n\t"    \
+  "addiu  %[" #TEMP8 "],  %[" #TEMP8 "],  21000          \n\t"    \
+  "subu   %[" #TEMP8 "],  %[" #TEMP8 "],  %[temp18]      \n\t"    \
+  "sra    %[" #TEMP12 "], %[" #TEMP12 "], 16             \n\t"    \
+  "sra    %[" #TEMP8 "],  %[" #TEMP8 "],  16             \n\t"    \
+  "addiu  %[temp16],    %[" #TEMP12 "], 1                \n\t"    \
+  "movn   %[" #TEMP12 "], %[temp16],    %[temp19]        \n\t"    \
+  "sh     %[" #TEMP0 "],  " #A "(%[temp20])              \n\t"    \
+  "sh     %[" #TEMP4 "],  " #C "(%[temp20])              \n\t"    \
+  "sh     %[" #TEMP8 "],  " #D "(%[temp20])              \n\t"    \
+  "sh     %[" #TEMP12 "], " #B "(%[temp20])              \n\t"
+
+static void FTransform_MIPS32(const uint8_t* src, const uint8_t* ref,
+                              int16_t* out) {
+  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
+  int temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16;
+  int temp17, temp18, temp19, temp20;
+  const int c2217 = 2217;
+  const int c5352 = 5352;
+  const int* const args[3] =
+      { (const int*)src, (const int*)ref, (const int*)out };
+
+  __asm__ volatile(
+    HORIZONTAL_PASS(0, temp0,  temp1,  temp2,  temp3)
+    HORIZONTAL_PASS(1, temp4,  temp5,  temp6,  temp7)
+    HORIZONTAL_PASS(2, temp8,  temp9,  temp10, temp11)
+    HORIZONTAL_PASS(3, temp12, temp13, temp14, temp15)
+    "lw   %[temp20],    8(%[args])                     \n\t"
+    VERTICAL_PASS(0,  8, 16, 24, temp0, temp4, temp8,  temp12)
+    VERTICAL_PASS(2, 10, 18, 26, temp1, temp5, temp9,  temp13)
+    VERTICAL_PASS(4, 12, 20, 28, temp2, temp6, temp10, temp14)
+    VERTICAL_PASS(6, 14, 22, 30, temp3, temp7, temp11, temp15)
+
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
+      [temp9]"=&r"(temp9), [temp10]"=&r"(temp10), [temp11]"=&r"(temp11),
+      [temp12]"=&r"(temp12), [temp13]"=&r"(temp13), [temp14]"=&r"(temp14),
+      [temp15]"=&r"(temp15), [temp16]"=&r"(temp16), [temp17]"=&r"(temp17),
+      [temp18]"=&r"(temp18), [temp19]"=&r"(temp19), [temp20]"=&r"(temp20)
+    : [args]"r"(args), [c2217]"r"(c2217), [c5352]"r"(c5352)
+    : "memory", "hi", "lo"
+  );
+}
+
+#undef VERTICAL_PASS
+#undef HORIZONTAL_PASS
+
+#if !defined(WORK_AROUND_GCC)
+
+#define GET_SSE_INNER(A, B, C, D)                               \
+  "lbu     %[temp0],    " #A "(%[a])                 \n\t"      \
+  "lbu     %[temp1],    " #A "(%[b])                 \n\t"      \
+  "lbu     %[temp2],    " #B "(%[a])                 \n\t"      \
+  "lbu     %[temp3],    " #B "(%[b])                 \n\t"      \
+  "lbu     %[temp4],    " #C "(%[a])                 \n\t"      \
+  "lbu     %[temp5],    " #C "(%[b])                 \n\t"      \
+  "lbu     %[temp6],    " #D "(%[a])                 \n\t"      \
+  "lbu     %[temp7],    " #D "(%[b])                 \n\t"      \
+  "subu    %[temp0],    %[temp0],     %[temp1]       \n\t"      \
+  "subu    %[temp2],    %[temp2],     %[temp3]       \n\t"      \
+  "subu    %[temp4],    %[temp4],     %[temp5]       \n\t"      \
+  "subu    %[temp6],    %[temp6],     %[temp7]       \n\t"      \
+  "madd    %[temp0],    %[temp0]                     \n\t"      \
+  "madd    %[temp2],    %[temp2]                     \n\t"      \
+  "madd    %[temp4],    %[temp4]                     \n\t"      \
+  "madd    %[temp6],    %[temp6]                     \n\t"
+
+#define GET_SSE(A, B, C, D)               \
+  GET_SSE_INNER(A, A + 1, A + 2, A + 3)   \
+  GET_SSE_INNER(B, B + 1, B + 2, B + 3)   \
+  GET_SSE_INNER(C, C + 1, C + 2, C + 3)   \
+  GET_SSE_INNER(D, D + 1, D + 2, D + 3)
+
+static int SSE16x16_MIPS32(const uint8_t* a, const uint8_t* b) {
+  int count;
+  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+
+  __asm__ volatile(
+     "mult   $zero,    $zero                            \n\t"
+
+     GET_SSE( 0 * BPS, 4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS)
+     GET_SSE( 1 * BPS, 4 +  1 * BPS, 8 +  1 * BPS, 12 +  1 * BPS)
+     GET_SSE( 2 * BPS, 4 +  2 * BPS, 8 +  2 * BPS, 12 +  2 * BPS)
+     GET_SSE( 3 * BPS, 4 +  3 * BPS, 8 +  3 * BPS, 12 +  3 * BPS)
+     GET_SSE( 4 * BPS, 4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS)
+     GET_SSE( 5 * BPS, 4 +  5 * BPS, 8 +  5 * BPS, 12 +  5 * BPS)
+     GET_SSE( 6 * BPS, 4 +  6 * BPS, 8 +  6 * BPS, 12 +  6 * BPS)
+     GET_SSE( 7 * BPS, 4 +  7 * BPS, 8 +  7 * BPS, 12 +  7 * BPS)
+     GET_SSE( 8 * BPS, 4 +  8 * BPS, 8 +  8 * BPS, 12 +  8 * BPS)
+     GET_SSE( 9 * BPS, 4 +  9 * BPS, 8 +  9 * BPS, 12 +  9 * BPS)
+     GET_SSE(10 * BPS, 4 + 10 * BPS, 8 + 10 * BPS, 12 + 10 * BPS)
+     GET_SSE(11 * BPS, 4 + 11 * BPS, 8 + 11 * BPS, 12 + 11 * BPS)
+     GET_SSE(12 * BPS, 4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS)
+     GET_SSE(13 * BPS, 4 + 13 * BPS, 8 + 13 * BPS, 12 + 13 * BPS)
+     GET_SSE(14 * BPS, 4 + 14 * BPS, 8 + 14 * BPS, 12 + 14 * BPS)
+     GET_SSE(15 * BPS, 4 + 15 * BPS, 8 + 15 * BPS, 12 + 15 * BPS)
+
+    "mflo    %[count]                                   \n\t"
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [count]"=&r"(count)
+    : [a]"r"(a), [b]"r"(b)
+    : "memory", "hi", "lo"
+  );
+  return count;
+}
+
+static int SSE16x8_MIPS32(const uint8_t* a, const uint8_t* b) {
+  int count;
+  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+
+  __asm__ volatile(
+     "mult   $zero,    $zero                            \n\t"
+
+     GET_SSE( 0 * BPS, 4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS)
+     GET_SSE( 1 * BPS, 4 +  1 * BPS, 8 +  1 * BPS, 12 +  1 * BPS)
+     GET_SSE( 2 * BPS, 4 +  2 * BPS, 8 +  2 * BPS, 12 +  2 * BPS)
+     GET_SSE( 3 * BPS, 4 +  3 * BPS, 8 +  3 * BPS, 12 +  3 * BPS)
+     GET_SSE( 4 * BPS, 4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS)
+     GET_SSE( 5 * BPS, 4 +  5 * BPS, 8 +  5 * BPS, 12 +  5 * BPS)
+     GET_SSE( 6 * BPS, 4 +  6 * BPS, 8 +  6 * BPS, 12 +  6 * BPS)
+     GET_SSE( 7 * BPS, 4 +  7 * BPS, 8 +  7 * BPS, 12 +  7 * BPS)
+
+    "mflo    %[count]                                   \n\t"
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [count]"=&r"(count)
+    : [a]"r"(a), [b]"r"(b)
+    : "memory", "hi", "lo"
+  );
+  return count;
+}
+
+static int SSE8x8_MIPS32(const uint8_t* a, const uint8_t* b) {
+  int count;
+  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+
+  __asm__ volatile(
+     "mult   $zero,    $zero                            \n\t"
+
+     GET_SSE(0 * BPS, 4 + 0 * BPS, 1 * BPS, 4 + 1 * BPS)
+     GET_SSE(2 * BPS, 4 + 2 * BPS, 3 * BPS, 4 + 3 * BPS)
+     GET_SSE(4 * BPS, 4 + 4 * BPS, 5 * BPS, 4 + 5 * BPS)
+     GET_SSE(6 * BPS, 4 + 6 * BPS, 7 * BPS, 4 + 7 * BPS)
+
+    "mflo    %[count]                                   \n\t"
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [count]"=&r"(count)
+    : [a]"r"(a), [b]"r"(b)
+    : "memory", "hi", "lo"
+  );
+  return count;
+}
+
+static int SSE4x4_MIPS32(const uint8_t* a, const uint8_t* b) {
+  int count;
+  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+
+  __asm__ volatile(
+     "mult   $zero,    $zero                            \n\t"
+
+     GET_SSE(0 * BPS, 1 * BPS, 2 * BPS, 3 * BPS)
+
+    "mflo    %[count]                                   \n\t"
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [count]"=&r"(count)
+    : [a]"r"(a), [b]"r"(b)
+    : "memory", "hi", "lo"
+  );
+  return count;
+}
+
+#undef GET_SSE
+#undef GET_SSE_INNER
+
+#endif  // !WORK_AROUND_GCC
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8EncDspInitMIPS32(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMIPS32(void) {
+  VP8ITransform = ITransform_MIPS32;
+  VP8FTransform = FTransform_MIPS32;
+
+  VP8EncQuantizeBlock = QuantizeBlock_MIPS32;
+  VP8EncQuantize2Blocks = Quantize2Blocks_MIPS32;
+
+  VP8TDisto4x4 = Disto4x4_MIPS32;
+  VP8TDisto16x16 = Disto16x16_MIPS32;
+
+#if !defined(WORK_AROUND_GCC)
+  VP8SSE16x16 = SSE16x16_MIPS32;
+  VP8SSE8x8 = SSE8x8_MIPS32;
+  VP8SSE16x8 = SSE16x8_MIPS32;
+  VP8SSE4x4 = SSE4x4_MIPS32;
+#endif
+}
+
+#else  // !WEBP_USE_MIPS32
+
+WEBP_DSP_INIT_STUB(VP8EncDspInitMIPS32)
+
+#endif  // WEBP_USE_MIPS32
diff --git a/media/libwebp/dsp/enc_mips_dsp_r2.c b/media/libwebp/dsp/enc_mips_dsp_r2.c
new file mode 100644
index 0000000000..cf4c85b59c
--- /dev/null
+++ b/media/libwebp/dsp/enc_mips_dsp_r2.c
@@ -0,0 +1,1517 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MIPS version of speed-critical encoding functions.
+//
+// Author(s): Darko Laus (darko.laus@imgtec.com)
+//            Mirko Raus (mirko.raus@imgtec.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_MIPS_DSP_R2)
+
+#include "../dsp/mips_macro.h"
+#include "../enc/cost_enc.h"
+#include "../enc/vp8i_enc.h"
+
+static const int kC1 = 20091 + (1 << 16);
+static const int kC2 = 35468;
+
+// O - output
+// I - input (macro doesn't change it)
+#define ADD_SUB_HALVES_X4(O0, O1, O2, O3, O4, O5, O6, O7,                      \
+                          I0, I1, I2, I3, I4, I5, I6, I7)                      \
+  "addq.ph          %[" #O0 "],   %[" #I0 "],  %[" #I1 "]     \n\t"            \
+  "subq.ph          %[" #O1 "],   %[" #I0 "],  %[" #I1 "]     \n\t"            \
+  "addq.ph          %[" #O2 "],   %[" #I2 "],  %[" #I3 "]     \n\t"            \
+  "subq.ph          %[" #O3 "],   %[" #I2 "],  %[" #I3 "]     \n\t"            \
+  "addq.ph          %[" #O4 "],   %[" #I4 "],  %[" #I5 "]     \n\t"            \
+  "subq.ph          %[" #O5 "],   %[" #I4 "],  %[" #I5 "]     \n\t"            \
+  "addq.ph          %[" #O6 "],   %[" #I6 "],  %[" #I7 "]     \n\t"            \
+  "subq.ph          %[" #O7 "],   %[" #I6 "],  %[" #I7 "]     \n\t"
+
+// IO - input/output
+#define ABS_X8(IO0, IO1, IO2, IO3, IO4, IO5, IO6, IO7)                         \
+  "absq_s.ph        %[" #IO0 "],   %[" #IO0 "]                \n\t"            \
+  "absq_s.ph        %[" #IO1 "],   %[" #IO1 "]                \n\t"            \
+  "absq_s.ph        %[" #IO2 "],   %[" #IO2 "]                \n\t"            \
+  "absq_s.ph        %[" #IO3 "],   %[" #IO3 "]                \n\t"            \
+  "absq_s.ph        %[" #IO4 "],   %[" #IO4 "]                \n\t"            \
+  "absq_s.ph        %[" #IO5 "],   %[" #IO5 "]                \n\t"            \
+  "absq_s.ph        %[" #IO6 "],   %[" #IO6 "]                \n\t"            \
+  "absq_s.ph        %[" #IO7 "],   %[" #IO7 "]                \n\t"
+
+// dpa.w.ph $ac0 temp0 ,temp1
+//  $ac += temp0[31..16] * temp1[31..16] + temp0[15..0] * temp1[15..0]
+// dpax.w.ph $ac0 temp0 ,temp1
+//  $ac += temp0[31..16] * temp1[15..0] + temp0[15..0] * temp1[31..16]
+// O - output
+// I - input (macro doesn't change it)
+#define MUL_HALF(O0, I0, I1, I2, I3, I4, I5, I6, I7,                           \
+                 I8, I9, I10, I11, I12, I13, I14, I15)                         \
+    "mult            $ac0,      $zero,     $zero              \n\t"            \
+    "dpa.w.ph        $ac0,      %[" #I2 "],  %[" #I0 "]       \n\t"            \
+    "dpax.w.ph       $ac0,      %[" #I5 "],  %[" #I6 "]       \n\t"            \
+    "dpa.w.ph        $ac0,      %[" #I8 "],  %[" #I9 "]       \n\t"            \
+    "dpax.w.ph       $ac0,      %[" #I11 "], %[" #I4 "]       \n\t"            \
+    "dpa.w.ph        $ac0,      %[" #I12 "], %[" #I7 "]       \n\t"            \
+    "dpax.w.ph       $ac0,      %[" #I13 "], %[" #I1 "]       \n\t"            \
+    "dpa.w.ph        $ac0,      %[" #I14 "], %[" #I3 "]       \n\t"            \
+    "dpax.w.ph       $ac0,      %[" #I15 "], %[" #I10 "]      \n\t"            \
+    "mflo            %[" #O0 "],  $ac0                        \n\t"
+
+#define OUTPUT_EARLY_CLOBBER_REGS_17()                                         \
+  OUTPUT_EARLY_CLOBBER_REGS_10(),                                              \
+  [temp11]"=&r"(temp11), [temp12]"=&r"(temp12), [temp13]"=&r"(temp13),         \
+  [temp14]"=&r"(temp14), [temp15]"=&r"(temp15), [temp16]"=&r"(temp16),         \
+  [temp17]"=&r"(temp17)
+
+// macro for one horizontal pass in FTransform
+// temp0..temp15 holds tmp[0]..tmp[15]
+// A - offset in bytes to load from src and ref buffers
+// TEMP0..TEMP3 - registers for corresponding tmp elements
+#define HORIZONTAL_PASS(A, TEMP0, TEMP1, TEMP2, TEMP3)                         \
+  "lw              %[" #TEMP0 "],   0(%[args])                          \n\t"  \
+  "lw              %[" #TEMP1 "],   4(%[args])                          \n\t"  \
+  "lw              %[" #TEMP2 "],   " XSTR(BPS) "*" #A "(%[" #TEMP0 "]) \n\t"  \
+  "lw              %[" #TEMP3 "],   " XSTR(BPS) "*" #A "(%[" #TEMP1 "]) \n\t"  \
+  "preceu.ph.qbl   %[" #TEMP0 "],   %[" #TEMP2 "]                       \n\t"  \
+  "preceu.ph.qbl   %[" #TEMP1 "],   %[" #TEMP3 "]                       \n\t"  \
+  "preceu.ph.qbr   %[" #TEMP2 "],   %[" #TEMP2 "]                       \n\t"  \
+  "preceu.ph.qbr   %[" #TEMP3 "],   %[" #TEMP3 "]                       \n\t"  \
+  "subq.ph         %[" #TEMP0 "],   %[" #TEMP0 "],   %[" #TEMP1 "]      \n\t"  \
+  "subq.ph         %[" #TEMP2 "],   %[" #TEMP2 "],   %[" #TEMP3 "]      \n\t"  \
+  "rotr            %[" #TEMP0 "],   %[" #TEMP0 "],   16                 \n\t"  \
+  "addq.ph         %[" #TEMP1 "],   %[" #TEMP2 "],   %[" #TEMP0 "]      \n\t"  \
+  "subq.ph         %[" #TEMP3 "],   %[" #TEMP2 "],   %[" #TEMP0 "]      \n\t"  \
+  "seh             %[" #TEMP0 "],   %[" #TEMP1 "]                       \n\t"  \
+  "sra             %[temp16],     %[" #TEMP1 "],   16                   \n\t"  \
+  "seh             %[temp19],     %[" #TEMP3 "]                         \n\t"  \
+  "sra             %[" #TEMP3 "],   %[" #TEMP3 "],   16                 \n\t"  \
+  "subu            %[" #TEMP2 "],   %[" #TEMP0 "],   %[temp16]          \n\t"  \
+  "addu            %[" #TEMP0 "],   %[" #TEMP0 "],   %[temp16]          \n\t"  \
+  "mul             %[temp17],     %[temp19],     %[c2217]               \n\t"  \
+  "mul             %[temp18],     %[" #TEMP3 "],   %[c5352]             \n\t"  \
+  "mul             %[" #TEMP1 "],   %[temp19],     %[c5352]             \n\t"  \
+  "mul             %[temp16],     %[" #TEMP3 "],   %[c2217]             \n\t"  \
+  "sll             %[" #TEMP2 "],   %[" #TEMP2 "],   3                  \n\t"  \
+  "sll             %[" #TEMP0 "],   %[" #TEMP0 "],   3                  \n\t"  \
+  "subu            %[" #TEMP3 "],   %[temp17],     %[temp18]            \n\t"  \
+  "addu            %[" #TEMP1 "],   %[temp16],     %[" #TEMP1 "]        \n\t"  \
+  "addiu           %[" #TEMP3 "],   %[" #TEMP3 "],   937                \n\t"  \
+  "addiu           %[" #TEMP1 "],   %[" #TEMP1 "],   1812               \n\t"  \
+  "sra             %[" #TEMP3 "],   %[" #TEMP3 "],   9                  \n\t"  \
+  "sra             %[" #TEMP1 "],   %[" #TEMP1 "],   9                  \n\t"
+
+// macro for one vertical pass in FTransform
+// temp0..temp15 holds tmp[0]..tmp[15]
+// A..D - offsets in bytes to store to out buffer
+// TEMP0, TEMP4, TEMP8 and TEMP12 - registers for corresponding tmp elements
+#define VERTICAL_PASS(A, B, C, D, TEMP0, TEMP4, TEMP8, TEMP12)                 \
+  "addu            %[temp16],     %[" #TEMP0 "],   %[" #TEMP12 "]   \n\t"      \
+  "subu            %[temp19],     %[" #TEMP0 "],   %[" #TEMP12 "]   \n\t"      \
+  "addu            %[temp17],     %[" #TEMP4 "],   %[" #TEMP8 "]    \n\t"      \
+  "subu            %[temp18],     %[" #TEMP4 "],   %[" #TEMP8 "]    \n\t"      \
+  "mul             %[" #TEMP8 "],   %[temp19],     %[c2217]         \n\t"      \
+  "mul             %[" #TEMP12 "],  %[temp18],     %[c2217]         \n\t"      \
+  "mul             %[" #TEMP4 "],   %[temp19],     %[c5352]         \n\t"      \
+  "mul             %[temp18],     %[temp18],     %[c5352]           \n\t"      \
+  "addiu           %[temp16],     %[temp16],     7                  \n\t"      \
+  "addu            %[" #TEMP0 "],   %[temp16],     %[temp17]        \n\t"      \
+  "sra             %[" #TEMP0 "],   %[" #TEMP0 "],   4              \n\t"      \
+  "addu            %[" #TEMP12 "],  %[" #TEMP12 "],  %[" #TEMP4 "]  \n\t"      \
+  "subu            %[" #TEMP4 "],   %[temp16],     %[temp17]        \n\t"      \
+  "sra             %[" #TEMP4 "],   %[" #TEMP4 "],   4              \n\t"      \
+  "addiu           %[" #TEMP8 "],   %[" #TEMP8 "],   30000          \n\t"      \
+  "addiu           %[" #TEMP12 "],  %[" #TEMP12 "],  12000          \n\t"      \
+  "addiu           %[" #TEMP8 "],   %[" #TEMP8 "],   21000          \n\t"      \
+  "subu            %[" #TEMP8 "],   %[" #TEMP8 "],   %[temp18]      \n\t"      \
+  "sra             %[" #TEMP12 "],  %[" #TEMP12 "],  16             \n\t"      \
+  "sra             %[" #TEMP8 "],   %[" #TEMP8 "],   16             \n\t"      \
+  "addiu           %[temp16],     %[" #TEMP12 "],  1                \n\t"      \
+  "movn            %[" #TEMP12 "],  %[temp16],     %[temp19]        \n\t"      \
+  "sh              %[" #TEMP0 "],   " #A "(%[temp20])               \n\t"      \
+  "sh              %[" #TEMP4 "],   " #C "(%[temp20])               \n\t"      \
+  "sh              %[" #TEMP8 "],   " #D "(%[temp20])               \n\t"      \
+  "sh              %[" #TEMP12 "],  " #B "(%[temp20])               \n\t"
+
+static void FTransform_MIPSdspR2(const uint8_t* src, const uint8_t* ref,
+                                 int16_t* out) {
+  const int c2217 = 2217;
+  const int c5352 = 5352;
+  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
+  int temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16;
+  int temp17, temp18, temp19, temp20;
+  const int* const args[3] =
+      { (const int*)src, (const int*)ref, (const int*)out };
+
+  __asm__ volatile (
+    HORIZONTAL_PASS(0, temp0,  temp1,  temp2,  temp3)
+    HORIZONTAL_PASS(1, temp4,  temp5,  temp6,  temp7)
+    HORIZONTAL_PASS(2, temp8,  temp9,  temp10, temp11)
+    HORIZONTAL_PASS(3, temp12, temp13, temp14, temp15)
+    "lw            %[temp20],     8(%[args])                  \n\t"
+    VERTICAL_PASS(0,  8, 16, 24, temp0, temp4, temp8,  temp12)
+    VERTICAL_PASS(2, 10, 18, 26, temp1, temp5, temp9,  temp13)
+    VERTICAL_PASS(4, 12, 20, 28, temp2, temp6, temp10, temp14)
+    VERTICAL_PASS(6, 14, 22, 30, temp3, temp7, temp11, temp15)
+    OUTPUT_EARLY_CLOBBER_REGS_18(),
+      [temp0]"=&r"(temp0), [temp19]"=&r"(temp19), [temp20]"=&r"(temp20)
+    : [args]"r"(args), [c2217]"r"(c2217), [c5352]"r"(c5352)
+    : "memory", "hi", "lo"
+  );
+}
+
+#undef VERTICAL_PASS
+#undef HORIZONTAL_PASS
+
+static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
+                                      uint8_t* dst) {
+  int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
+  int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18;
+
+  __asm__ volatile (
+    "ulw              %[temp1],   0(%[in])                 \n\t"
+    "ulw              %[temp2],   16(%[in])                \n\t"
+    LOAD_IN_X2(temp5, temp6, 24, 26)
+    ADD_SUB_HALVES(temp3, temp4, temp1, temp2)
+    LOAD_IN_X2(temp1, temp2, 8, 10)
+    MUL_SHIFT_SUM(temp7, temp8, temp9, temp10, temp11, temp12, temp13, temp14,
+                  temp10, temp8, temp9, temp7, temp1, temp2, temp5, temp6,
+                  temp13, temp11, temp14, temp12)
+    INSERT_HALF_X2(temp8, temp7, temp10, temp9)
+    "ulw              %[temp17],  4(%[in])                 \n\t"
+    "ulw              %[temp18],  20(%[in])                \n\t"
+    ADD_SUB_HALVES(temp1, temp2, temp3, temp8)
+    ADD_SUB_HALVES(temp5, temp6, temp4, temp7)
+    ADD_SUB_HALVES(temp7, temp8, temp17, temp18)
+    LOAD_IN_X2(temp17, temp18, 12, 14)
+    LOAD_IN_X2(temp9, temp10, 28, 30)
+    MUL_SHIFT_SUM(temp11, temp12, temp13, temp14, temp15, temp16, temp4, temp17,
+                  temp12, temp14, temp11, temp13, temp17, temp18, temp9, temp10,
+                  temp15, temp4, temp16, temp17)
+    INSERT_HALF_X2(temp11, temp12, temp13, temp14)
+    ADD_SUB_HALVES(temp17, temp8, temp8, temp11)
+    ADD_SUB_HALVES(temp3, temp4, temp7, temp12)
+
+    // horizontal
+    SRA_16(temp9, temp10, temp11, temp12, temp1, temp2, temp5, temp6)
+    INSERT_HALF_X2(temp1, temp6, temp5, temp2)
+    SRA_16(temp13, temp14, temp15, temp16, temp3, temp4, temp17, temp8)
+    "repl.ph          %[temp2],   0x4                      \n\t"
+    INSERT_HALF_X2(temp3, temp8, temp17, temp4)
+    "addq.ph          %[temp1],   %[temp1],  %[temp2]      \n\t"
+    "addq.ph          %[temp6],   %[temp6],  %[temp2]      \n\t"
+    ADD_SUB_HALVES(temp2, temp4, temp1, temp3)
+    ADD_SUB_HALVES(temp5, temp7, temp6, temp8)
+    MUL_SHIFT_SUM(temp1, temp3, temp6, temp8, temp9, temp13, temp17, temp18,
+                  temp3, temp13, temp1, temp9, temp9, temp13, temp11, temp15,
+                  temp6, temp17, temp8, temp18)
+    MUL_SHIFT_SUM(temp6, temp8, temp18, temp17, temp11, temp15, temp12, temp16,
+                  temp8, temp15, temp6, temp11, temp12, temp16, temp10, temp14,
+                  temp18, temp12, temp17, temp16)
+    INSERT_HALF_X2(temp1, temp3, temp9, temp13)
+    INSERT_HALF_X2(temp6, temp8, temp11, temp15)
+    SHIFT_R_SUM_X2(temp9, temp10, temp11, temp12, temp13, temp14, temp15,
+                   temp16, temp2, temp4, temp5, temp7, temp3, temp1, temp8,
+                   temp6)
+    PACK_2_HALVES_TO_WORD(temp1, temp2, temp3, temp4, temp9, temp12, temp13,
+                          temp16, temp11, temp10, temp15, temp14)
+    LOAD_WITH_OFFSET_X4(temp10, temp11, temp14, temp15, ref,
+                        0, 0, 0, 0,
+                        0, 1, 2, 3,
+                        BPS)
+    CONVERT_2_BYTES_TO_HALF(temp5, temp6, temp7, temp8, temp17, temp18, temp10,
+                            temp11, temp10, temp11, temp14, temp15)
+    STORE_SAT_SUM_X2(temp5, temp6, temp7, temp8, temp17, temp18, temp10, temp11,
+                     temp9, temp12, temp1, temp2, temp13, temp16, temp3, temp4,
+                     dst, 0, 1, 2, 3, BPS)
+
+    OUTPUT_EARLY_CLOBBER_REGS_18()
+    : [dst]"r"(dst), [in]"r"(in), [kC1]"r"(kC1), [kC2]"r"(kC2), [ref]"r"(ref)
+    : "memory", "hi", "lo"
+  );
+}
+
+static void ITransform_MIPSdspR2(const uint8_t* ref, const int16_t* in,
+                                 uint8_t* dst, int do_two) {
+  ITransformOne(ref, in, dst);
+  if (do_two) {
+    ITransformOne(ref + 4, in + 16, dst + 4);
+  }
+}
+
+static int Disto4x4_MIPSdspR2(const uint8_t* const a, const uint8_t* const b,
+                              const uint16_t* const w) {
+  int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
+  int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17;
+
+  __asm__ volatile (
+    LOAD_WITH_OFFSET_X4(temp1, temp2, temp3, temp4, a,
+                        0, 0, 0, 0,
+                        0, 1, 2, 3,
+                        BPS)
+    CONVERT_2_BYTES_TO_HALF(temp5, temp6, temp7, temp8, temp9,temp10, temp11,
+                            temp12, temp1, temp2, temp3, temp4)
+    ADD_SUB_HALVES_X4(temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8,
+                      temp5, temp6, temp7, temp8, temp9, temp10, temp11, temp12)
+    PACK_2_HALVES_TO_WORD(temp9, temp10, temp11, temp12, temp1, temp3, temp5,
+                          temp7, temp2, temp4, temp6, temp8)
+    ADD_SUB_HALVES_X4(temp2, temp4, temp6, temp8, temp9, temp1, temp3, temp10,
+                      temp1, temp9, temp3, temp10, temp5, temp11, temp7, temp12)
+    ADD_SUB_HALVES_X4(temp5, temp11, temp7, temp2, temp9, temp3, temp6, temp12,
+                      temp2, temp9, temp6, temp3, temp4, temp1, temp8, temp10)
+    ADD_SUB_HALVES_X4(temp1, temp4, temp10, temp8, temp7, temp11, temp5, temp2,
+                      temp5, temp7, temp11, temp2, temp9, temp6, temp3, temp12)
+    ABS_X8(temp1, temp4, temp10, temp8, temp7, temp11, temp5, temp2)
+    LOAD_WITH_OFFSET_X4(temp3, temp6, temp9, temp12, w,
+                        0, 4, 8, 12,
+                        0, 0, 0, 0,
+                        0)
+    LOAD_WITH_OFFSET_X4(temp13, temp14, temp15, temp16, w,
+                        0, 4, 8, 12,
+                        1, 1, 1, 1,
+                        16)
+    MUL_HALF(temp17, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8,
+             temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16)
+    LOAD_WITH_OFFSET_X4(temp1, temp2, temp3, temp4, b,
+                        0, 0, 0, 0,
+                        0, 1, 2, 3,
+                        BPS)
+    CONVERT_2_BYTES_TO_HALF(temp5,temp6, temp7, temp8, temp9,temp10, temp11,
+                            temp12, temp1, temp2, temp3, temp4)
+    ADD_SUB_HALVES_X4(temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8,
+                      temp5, temp6, temp7, temp8, temp9, temp10, temp11, temp12)
+    PACK_2_HALVES_TO_WORD(temp9, temp10, temp11, temp12, temp1, temp3, temp5,
+                          temp7, temp2, temp4, temp6, temp8)
+    ADD_SUB_HALVES_X4(temp2, temp4, temp6, temp8, temp9, temp1, temp3, temp10,
+                      temp1, temp9, temp3, temp10, temp5, temp11, temp7, temp12)
+    ADD_SUB_HALVES_X4(temp5, temp11, temp7, temp2, temp9, temp3, temp6, temp12,
+                      temp2, temp9, temp6, temp3, temp4, temp1, temp8, temp10)
+    ADD_SUB_HALVES_X4(temp1, temp4, temp10, temp8, temp7, temp11, temp5, temp2,
+                      temp5, temp7, temp11, temp2, temp9, temp6, temp3, temp12)
+    ABS_X8(temp1, temp4, temp10, temp8, temp7, temp11, temp5, temp2)
+    LOAD_WITH_OFFSET_X4(temp3, temp6, temp9, temp12, w,
+                        0, 4, 8, 12,
+                        0, 0, 0, 0,
+                        0)
+    LOAD_WITH_OFFSET_X4(temp13, temp14, temp15, temp16, w,
+                        0, 4, 8, 12,
+                        1, 1, 1, 1,
+                        16)
+    MUL_HALF(temp3, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8,
+             temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16)
+    OUTPUT_EARLY_CLOBBER_REGS_17()
+    : [a]"r"(a), [b]"r"(b), [w]"r"(w)
+    : "memory", "hi", "lo"
+  );
+  return abs(temp3 - temp17) >> 5;
+}
+
+static int Disto16x16_MIPSdspR2(const uint8_t* const a,
+                                const uint8_t* const b,
+                                const uint16_t* const w) {
+  int D = 0;
+  int x, y;
+  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
+    for (x = 0; x < 16; x += 4) {
+      D += Disto4x4_MIPSdspR2(a + x + y, b + x + y, w);
+    }
+  }
+  return D;
+}
+
+//------------------------------------------------------------------------------
+// Intra predictions
+
+#define FILL_PART(J, SIZE)                                            \
+    "usw        %[value],  0+" #J "*" XSTR(BPS) "(%[dst])  \n\t"      \
+    "usw        %[value],  4+" #J "*" XSTR(BPS) "(%[dst])  \n\t"      \
+  ".if " #SIZE " == 16                                     \n\t"      \
+    "usw        %[value],  8+" #J "*" XSTR(BPS) "(%[dst])  \n\t"      \
+    "usw        %[value], 12+" #J "*" XSTR(BPS) "(%[dst])  \n\t"      \
+  ".endif                                                  \n\t"
+
+#define FILL_8_OR_16(DST, VALUE, SIZE) do {                         \
+  int value = (VALUE);                                              \
+  __asm__ volatile (                                                \
+    "replv.qb   %[value],  %[value]                      \n\t"      \
+    FILL_PART( 0, SIZE)                                             \
+    FILL_PART( 1, SIZE)                                             \
+    FILL_PART( 2, SIZE)                                             \
+    FILL_PART( 3, SIZE)                                             \
+    FILL_PART( 4, SIZE)                                             \
+    FILL_PART( 5, SIZE)                                             \
+    FILL_PART( 6, SIZE)                                             \
+    FILL_PART( 7, SIZE)                                             \
+  ".if " #SIZE " == 16                                   \n\t"      \
+    FILL_PART( 8, 16)                                               \
+    FILL_PART( 9, 16)                                               \
+    FILL_PART(10, 16)                                               \
+    FILL_PART(11, 16)                                               \
+    FILL_PART(12, 16)                                               \
+    FILL_PART(13, 16)                                               \
+    FILL_PART(14, 16)                                               \
+    FILL_PART(15, 16)                                               \
+  ".endif                                                \n\t"      \
+    : [value]"+&r"(value)                                           \
+    : [dst]"r"((DST))                                               \
+    : "memory"                                                      \
+  );                                                                \
+} while (0)
+
+#define VERTICAL_PRED(DST, TOP, SIZE)                                          \
+static WEBP_INLINE void VerticalPred##SIZE(uint8_t* (DST),                     \
+                                           const uint8_t* (TOP)) {             \
+  int j;                                                                       \
+  if ((TOP)) {                                                                 \
+    for (j = 0; j < (SIZE); ++j) memcpy((DST) + j * BPS, (TOP), (SIZE));       \
+  } else {                                                                     \
+    FILL_8_OR_16((DST), 127, (SIZE));                                          \
+  }                                                                            \
+}
+
+VERTICAL_PRED(dst, top, 8)
+VERTICAL_PRED(dst, top, 16)
+
+#undef VERTICAL_PRED
+
+#define HORIZONTAL_PRED(DST, LEFT, SIZE)                                       \
+static WEBP_INLINE void HorizontalPred##SIZE(uint8_t* (DST),                   \
+                                             const uint8_t* (LEFT)) {          \
+  if (LEFT) {                                                                  \
+    int j;                                                                     \
+    for (j = 0; j < (SIZE); ++j) {                                             \
+      memset((DST) + j * BPS, (LEFT)[j], (SIZE));                              \
+    }                                                                          \
+  } else {                                                                     \
+    FILL_8_OR_16((DST), 129, (SIZE));                                          \
+  }                                                                            \
+}
+
+HORIZONTAL_PRED(dst, left, 8)
+HORIZONTAL_PRED(dst, left, 16)
+
+#undef HORIZONTAL_PRED
+
+#define CLIPPING()                                                             \
+  "preceu.ph.qbl   %[temp2],   %[temp0]                  \n\t"                 \
+  "preceu.ph.qbr   %[temp0],   %[temp0]                  \n\t"                 \
+  "preceu.ph.qbl   %[temp3],   %[temp1]                  \n\t"                 \
+  "preceu.ph.qbr   %[temp1],   %[temp1]                  \n\t"                 \
+  "addu.ph         %[temp2],   %[temp2],   %[leftY_1]    \n\t"                 \
+  "addu.ph         %[temp0],   %[temp0],   %[leftY_1]    \n\t"                 \
+  "addu.ph         %[temp3],   %[temp3],   %[leftY_1]    \n\t"                 \
+  "addu.ph         %[temp1],   %[temp1],   %[leftY_1]    \n\t"                 \
+  "shll_s.ph       %[temp2],   %[temp2],   7             \n\t"                 \
+  "shll_s.ph       %[temp0],   %[temp0],   7             \n\t"                 \
+  "shll_s.ph       %[temp3],   %[temp3],   7             \n\t"                 \
+  "shll_s.ph       %[temp1],   %[temp1],   7             \n\t"                 \
+  "precrqu_s.qb.ph %[temp0],   %[temp2],   %[temp0]      \n\t"                 \
+  "precrqu_s.qb.ph %[temp1],   %[temp3],   %[temp1]      \n\t"
+
+#define CLIP_8B_TO_DST(DST, LEFT, TOP, SIZE) do {                              \
+  int leftY_1 = ((int)(LEFT)[y] << 16) + (LEFT)[y];                            \
+  int temp0, temp1, temp2, temp3;                                              \
+  __asm__ volatile (                                                           \
+    "replv.ph        %[leftY_1], %[leftY_1]              \n\t"                 \
+    "ulw             %[temp0],   0(%[top])               \n\t"                 \
+    "ulw             %[temp1],   4(%[top])               \n\t"                 \
+    "subu.ph         %[leftY_1], %[leftY_1], %[left_1]   \n\t"                 \
+    CLIPPING()                                                                 \
+    "usw             %[temp0],   0(%[dst])               \n\t"                 \
+    "usw             %[temp1],   4(%[dst])               \n\t"                 \
+  ".if " #SIZE " == 16                                   \n\t"                 \
+    "ulw             %[temp0],   8(%[top])               \n\t"                 \
+    "ulw             %[temp1],   12(%[top])              \n\t"                 \
+    CLIPPING()                                                                 \
+    "usw             %[temp0],   8(%[dst])               \n\t"                 \
+    "usw             %[temp1],   12(%[dst])              \n\t"                 \
+  ".endif                                                \n\t"                 \
+    : [leftY_1]"+&r"(leftY_1), [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),       \
+      [temp2]"=&r"(temp2), [temp3]"=&r"(temp3)                                 \
+    : [left_1]"r"(left_1), [top]"r"((TOP)), [dst]"r"((DST))                    \
+    : "memory"                                                                 \
+  );                                                                           \
+} while (0)
+
+#define CLIP_TO_DST(DST, LEFT, TOP, SIZE) do {                                 \
+  int y;                                                                       \
+  const int left_1 = ((int)(LEFT)[-1] << 16) + (LEFT)[-1];                     \
+  for (y = 0; y < (SIZE); ++y) {                                               \
+    CLIP_8B_TO_DST((DST), (LEFT), (TOP), (SIZE));                              \
+    (DST) += BPS;                                                              \
+  }                                                                            \
+} while (0)
+
+#define TRUE_MOTION(DST, LEFT, TOP, SIZE)                                      \
+static WEBP_INLINE void TrueMotion##SIZE(uint8_t* (DST), const uint8_t* (LEFT),\
+                                         const uint8_t* (TOP)) {               \
+  if ((LEFT) != NULL) {                                                        \
+    if ((TOP) != NULL) {                                                       \
+      CLIP_TO_DST((DST), (LEFT), (TOP), (SIZE));                               \
+    } else {                                                                   \
+      HorizontalPred##SIZE((DST), (LEFT));                                     \
+    }                                                                          \
+  } else {                                                                     \
+    /* true motion without left samples (hence: with default 129 value)    */  \
+    /* is equivalent to VE prediction where you just copy the top samples. */  \
+    /* Note that if top samples are not available, the default value is    */  \
+    /* then 129, and not 127 as in the VerticalPred case.                  */  \
+    if ((TOP) != NULL) {                                                       \
+      VerticalPred##SIZE((DST), (TOP));                                        \
+    } else {                                                                   \
+      FILL_8_OR_16((DST), 129, (SIZE));                                        \
+    }                                                                          \
+  }                                                                            \
+}
+
+TRUE_MOTION(dst, left, top, 8)
+TRUE_MOTION(dst, left, top, 16)
+
+#undef TRUE_MOTION
+#undef CLIP_TO_DST
+#undef CLIP_8B_TO_DST
+#undef CLIPPING
+
+static WEBP_INLINE void DCMode16(uint8_t* dst, const uint8_t* left,
+                                 const uint8_t* top) {
+  int DC, DC1;
+  int temp0, temp1, temp2, temp3;
+
+  __asm__ volatile(
+    "beqz        %[top],   2f                  \n\t"
+    LOAD_WITH_OFFSET_X4(temp0, temp1, temp2, temp3, top,
+                        0, 4, 8, 12,
+                        0, 0, 0, 0,
+                        0)
+    "raddu.w.qb  %[temp0], %[temp0]            \n\t"
+    "raddu.w.qb  %[temp1], %[temp1]            \n\t"
+    "raddu.w.qb  %[temp2], %[temp2]            \n\t"
+    "raddu.w.qb  %[temp3], %[temp3]            \n\t"
+    "addu        %[temp0], %[temp0], %[temp1]  \n\t"
+    "addu        %[temp2], %[temp2], %[temp3]  \n\t"
+    "addu        %[DC],    %[temp0], %[temp2]  \n\t"
+    "move        %[DC1],   %[DC]               \n\t"
+    "beqz        %[left],  1f                  \n\t"
+    LOAD_WITH_OFFSET_X4(temp0, temp1, temp2, temp3, left,
+                        0, 4, 8, 12,
+                        0, 0, 0, 0,
+                        0)
+    "raddu.w.qb  %[temp0], %[temp0]            \n\t"
+    "raddu.w.qb  %[temp1], %[temp1]            \n\t"
+    "raddu.w.qb  %[temp2], %[temp2]            \n\t"
+    "raddu.w.qb  %[temp3], %[temp3]            \n\t"
+    "addu        %[temp0], %[temp0], %[temp1]  \n\t"
+    "addu        %[temp2], %[temp2], %[temp3]  \n\t"
+    "addu        %[DC1],   %[temp0], %[temp2]  \n\t"
+  "1:                                          \n\t"
+    "addu        %[DC],   %[DC],     %[DC1]    \n\t"
+    "j           3f                            \n\t"
+  "2:                                          \n\t"
+    "beqz        %[left],  4f                  \n\t"
+    LOAD_WITH_OFFSET_X4(temp0, temp1, temp2, temp3, left,
+                        0, 4, 8, 12,
+                        0, 0, 0, 0,
+                        0)
+    "raddu.w.qb  %[temp0], %[temp0]            \n\t"
+    "raddu.w.qb  %[temp1], %[temp1]            \n\t"
+    "raddu.w.qb  %[temp2], %[temp2]            \n\t"
+    "raddu.w.qb  %[temp3], %[temp3]            \n\t"
+    "addu        %[temp0], %[temp0], %[temp1]  \n\t"
+    "addu        %[temp2], %[temp2], %[temp3]  \n\t"
+    "addu        %[DC],    %[temp0], %[temp2]  \n\t"
+    "addu        %[DC],    %[DC],    %[DC]     \n\t"
+  "3:                                          \n\t"
+    "shra_r.w    %[DC],    %[DC],    5         \n\t"
+    "j           5f                            \n\t"
+  "4:                                          \n\t"
+    "li          %[DC],    0x80                \n\t"
+  "5:                                          \n\t"
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [DC]"=&r"(DC),
+      [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [DC1]"=&r"(DC1)
+    : [left]"r"(left), [top]"r"(top)
+    : "memory"
+  );
+
+  FILL_8_OR_16(dst, DC, 16);
+}
+
+static WEBP_INLINE void DCMode8(uint8_t* dst, const uint8_t* left,
+                                const uint8_t* top) {
+  int DC, DC1;
+  int temp0, temp1, temp2, temp3;
+
+  __asm__ volatile(
+    "beqz        %[top],   2f                  \n\t"
+    "ulw         %[temp0], 0(%[top])           \n\t"
+    "ulw         %[temp1], 4(%[top])           \n\t"
+    "raddu.w.qb  %[temp0], %[temp0]            \n\t"
+    "raddu.w.qb  %[temp1], %[temp1]            \n\t"
+    "addu        %[DC],    %[temp0], %[temp1]  \n\t"
+    "move        %[DC1],   %[DC]               \n\t"
+    "beqz        %[left],  1f                  \n\t"
+    "ulw         %[temp2], 0(%[left])          \n\t"
+    "ulw         %[temp3], 4(%[left])          \n\t"
+    "raddu.w.qb  %[temp2], %[temp2]            \n\t"
+    "raddu.w.qb  %[temp3], %[temp3]            \n\t"
+    "addu        %[DC1],   %[temp2], %[temp3]  \n\t"
+  "1:                                          \n\t"
+    "addu        %[DC],    %[DC],    %[DC1]    \n\t"
+    "j           3f                            \n\t"
+  "2:                                          \n\t"
+    "beqz        %[left],  4f                  \n\t"
+    "ulw         %[temp2], 0(%[left])          \n\t"
+    "ulw         %[temp3], 4(%[left])          \n\t"
+    "raddu.w.qb  %[temp2], %[temp2]            \n\t"
+    "raddu.w.qb  %[temp3], %[temp3]            \n\t"
+    "addu        %[DC],    %[temp2], %[temp3]  \n\t"
+    "addu        %[DC],    %[DC],    %[DC]     \n\t"
+  "3:                                          \n\t"
+    "shra_r.w    %[DC], %[DC], 4               \n\t"
+    "j           5f                            \n\t"
+  "4:                                          \n\t"
+    "li          %[DC], 0x80                   \n\t"
+  "5:                                          \n\t"
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [DC]"=&r"(DC),
+      [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [DC1]"=&r"(DC1)
+    : [left]"r"(left), [top]"r"(top)
+    : "memory"
+  );
+
+  FILL_8_OR_16(dst, DC, 8);
+}
+
+static void DC4(uint8_t* dst, const uint8_t* top) {
+  int temp0, temp1;
+  __asm__ volatile(
+    "ulw          %[temp0],   0(%[top])               \n\t"
+    "ulw          %[temp1],   -5(%[top])              \n\t"
+    "raddu.w.qb   %[temp0],   %[temp0]                \n\t"
+    "raddu.w.qb   %[temp1],   %[temp1]                \n\t"
+    "addu         %[temp0],   %[temp0],    %[temp1]   \n\t"
+    "addiu        %[temp0],   %[temp0],    4          \n\t"
+    "srl          %[temp0],   %[temp0],    3          \n\t"
+    "replv.qb     %[temp0],   %[temp0]                \n\t"
+    "usw          %[temp0],   0*" XSTR(BPS) "(%[dst]) \n\t"
+    "usw          %[temp0],   1*" XSTR(BPS) "(%[dst]) \n\t"
+    "usw          %[temp0],   2*" XSTR(BPS) "(%[dst]) \n\t"
+    "usw          %[temp0],   3*" XSTR(BPS) "(%[dst]) \n\t"
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1)
+    : [top]"r"(top), [dst]"r"(dst)
+    : "memory"
+  );
+}
+
+static void TM4(uint8_t* dst, const uint8_t* top) {
+  int a10, a32, temp0, temp1, temp2, temp3, temp4, temp5;
+  const int c35 = 0xff00ff;
+  __asm__ volatile (
+    "lbu              %[temp1],  0(%[top])                     \n\t"
+    "lbu              %[a10],    1(%[top])                     \n\t"
+    "lbu              %[temp2],  2(%[top])                     \n\t"
+    "lbu              %[a32],    3(%[top])                     \n\t"
+    "ulw              %[temp0],  -5(%[top])                    \n\t"
+    "lbu              %[temp4],  -1(%[top])                    \n\t"
+    "append           %[a10],    %[temp1],   16                \n\t"
+    "append           %[a32],    %[temp2],   16                \n\t"
+    "replv.ph         %[temp4],  %[temp4]                      \n\t"
+    "shrl.ph          %[temp1],  %[temp0],   8                 \n\t"
+    "and              %[temp0],  %[temp0],   %[c35]            \n\t"
+    "subu.ph          %[temp1],  %[temp1],   %[temp4]          \n\t"
+    "subu.ph          %[temp0],  %[temp0],   %[temp4]          \n\t"
+    "srl              %[temp2],  %[temp1],   16                \n\t"
+    "srl              %[temp3],  %[temp0],   16                \n\t"
+    "replv.ph         %[temp2],  %[temp2]                      \n\t"
+    "replv.ph         %[temp3],  %[temp3]                      \n\t"
+    "replv.ph         %[temp4],  %[temp1]                      \n\t"
+    "replv.ph         %[temp5],  %[temp0]                      \n\t"
+    "addu.ph          %[temp0],  %[temp3],   %[a10]            \n\t"
+    "addu.ph          %[temp1],  %[temp3],   %[a32]            \n\t"
+    "addu.ph          %[temp3],  %[temp2],   %[a10]            \n\t"
+    "addu.ph          %[temp2],  %[temp2],   %[a32]            \n\t"
+    "shll_s.ph        %[temp0],  %[temp0],   7                 \n\t"
+    "shll_s.ph        %[temp1],  %[temp1],   7                 \n\t"
+    "shll_s.ph        %[temp3],  %[temp3],   7                 \n\t"
+    "shll_s.ph        %[temp2],  %[temp2],   7                 \n\t"
+    "precrqu_s.qb.ph  %[temp0],  %[temp1],   %[temp0]          \n\t"
+    "precrqu_s.qb.ph  %[temp1],  %[temp2],   %[temp3]          \n\t"
+    "addu.ph          %[temp2],  %[temp5],   %[a10]            \n\t"
+    "addu.ph          %[temp3],  %[temp5],   %[a32]            \n\t"
+    "addu.ph          %[temp5],  %[temp4],   %[a10]            \n\t"
+    "addu.ph          %[temp4],  %[temp4],   %[a32]            \n\t"
+    "shll_s.ph        %[temp2],  %[temp2],   7                 \n\t"
+    "shll_s.ph        %[temp3],  %[temp3],   7                 \n\t"
+    "shll_s.ph        %[temp4],  %[temp4],   7                 \n\t"
+    "shll_s.ph        %[temp5],  %[temp5],   7                 \n\t"
+    "precrqu_s.qb.ph  %[temp2],  %[temp3],   %[temp2]          \n\t"
+    "precrqu_s.qb.ph  %[temp3],  %[temp4],   %[temp5]          \n\t"
+    "usw              %[temp1],  0*" XSTR(BPS) "(%[dst])       \n\t"
+    "usw              %[temp0],  1*" XSTR(BPS) "(%[dst])       \n\t"
+    "usw              %[temp3],  2*" XSTR(BPS) "(%[dst])       \n\t"
+    "usw              %[temp2],  3*" XSTR(BPS) "(%[dst])       \n\t"
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+      [a10]"=&r"(a10), [a32]"=&r"(a32)
+    : [c35]"r"(c35), [top]"r"(top), [dst]"r"(dst)
+    : "memory"
+  );
+}
+
+static void VE4(uint8_t* dst, const uint8_t* top) {
+  int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
+  __asm__ volatile(
+    "ulw             %[temp0],   -1(%[top])              \n\t"
+    "ulh             %[temp1],   3(%[top])               \n\t"
+    "preceu.ph.qbr   %[temp2],   %[temp0]                \n\t"
+    "preceu.ph.qbl   %[temp3],   %[temp0]                \n\t"
+    "preceu.ph.qbr   %[temp4],   %[temp1]                \n\t"
+    "packrl.ph       %[temp5],   %[temp3],    %[temp2]   \n\t"
+    "packrl.ph       %[temp6],   %[temp4],    %[temp3]   \n\t"
+    "shll.ph         %[temp5],   %[temp5],    1          \n\t"
+    "shll.ph         %[temp6],   %[temp6],    1          \n\t"
+    "addq.ph         %[temp2],   %[temp5],    %[temp2]   \n\t"
+    "addq.ph         %[temp6],   %[temp6],    %[temp4]   \n\t"
+    "addq.ph         %[temp2],   %[temp2],    %[temp3]   \n\t"
+    "addq.ph         %[temp6],   %[temp6],    %[temp3]   \n\t"
+    "shra_r.ph       %[temp2],   %[temp2],    2          \n\t"
+    "shra_r.ph       %[temp6],   %[temp6],    2          \n\t"
+    "precr.qb.ph     %[temp4],   %[temp6],    %[temp2]   \n\t"
+    "usw             %[temp4],   0*" XSTR(BPS) "(%[dst]) \n\t"
+    "usw             %[temp4],   1*" XSTR(BPS) "(%[dst]) \n\t"
+    "usw             %[temp4],   2*" XSTR(BPS) "(%[dst]) \n\t"
+    "usw             %[temp4],   3*" XSTR(BPS) "(%[dst]) \n\t"
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+      [temp6]"=&r"(temp6)
+    : [top]"r"(top), [dst]"r"(dst)
+    : "memory"
+  );
+}
+
+static void HE4(uint8_t* dst, const uint8_t* top) {
+  int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
+  __asm__ volatile(
+    "ulw             %[temp0],   -4(%[top])              \n\t"
+    "lbu             %[temp1],   -5(%[top])              \n\t"
+    "preceu.ph.qbr   %[temp2],   %[temp0]                \n\t"
+    "preceu.ph.qbl   %[temp3],   %[temp0]                \n\t"
+    "replv.ph        %[temp4],   %[temp1]                \n\t"
+    "packrl.ph       %[temp5],   %[temp3],    %[temp2]   \n\t"
+    "packrl.ph       %[temp6],   %[temp2],    %[temp4]   \n\t"
+    "shll.ph         %[temp5],   %[temp5],    1          \n\t"
+    "shll.ph         %[temp6],   %[temp6],    1          \n\t"
+    "addq.ph         %[temp3],   %[temp3],    %[temp5]   \n\t"
+    "addq.ph         %[temp3],   %[temp3],    %[temp2]   \n\t"
+    "addq.ph         %[temp2],   %[temp2],    %[temp6]   \n\t"
+    "addq.ph         %[temp2],   %[temp2],    %[temp4]   \n\t"
+    "shra_r.ph       %[temp3],   %[temp3],    2          \n\t"
+    "shra_r.ph       %[temp2],   %[temp2],    2          \n\t"
+    "replv.qb        %[temp0],   %[temp3]                \n\t"
+    "replv.qb        %[temp1],   %[temp2]                \n\t"
+    "srl             %[temp3],   %[temp3],    16         \n\t"
+    "srl             %[temp2],   %[temp2],    16         \n\t"
+    "replv.qb        %[temp3],   %[temp3]                \n\t"
+    "replv.qb        %[temp2],   %[temp2]                \n\t"
+    "usw             %[temp3],   0*" XSTR(BPS) "(%[dst]) \n\t"
+    "usw             %[temp0],   1*" XSTR(BPS) "(%[dst]) \n\t"
+    "usw             %[temp2],   2*" XSTR(BPS) "(%[dst]) \n\t"
+    "usw             %[temp1],   3*" XSTR(BPS) "(%[dst]) \n\t"
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+      [temp6]"=&r"(temp6)
+    : [top]"r"(top), [dst]"r"(dst)
+    : "memory"
+  );
+}
+
+static void RD4(uint8_t* dst, const uint8_t* top) {
+  int temp0, temp1, temp2, temp3, temp4, temp5;
+  int temp6, temp7, temp8, temp9, temp10, temp11;
+  __asm__ volatile(
+    "ulw             %[temp0],    -5(%[top])               \n\t"
+    "ulw             %[temp1],    -1(%[top])               \n\t"
+    "preceu.ph.qbl   %[temp2],    %[temp0]                 \n\t"
+    "preceu.ph.qbr   %[temp3],    %[temp0]                 \n\t"
+    "preceu.ph.qbr   %[temp4],    %[temp1]                 \n\t"
+    "preceu.ph.qbl   %[temp5],    %[temp1]                 \n\t"
+    "packrl.ph       %[temp6],    %[temp2],    %[temp3]    \n\t"
+    "packrl.ph       %[temp7],    %[temp4],    %[temp2]    \n\t"
+    "packrl.ph       %[temp8],    %[temp5],    %[temp4]    \n\t"
+    "shll.ph         %[temp6],    %[temp6],    1           \n\t"
+    "addq.ph         %[temp9],    %[temp2],    %[temp6]    \n\t"
+    "shll.ph         %[temp7],    %[temp7],    1           \n\t"
+    "addq.ph         %[temp9],    %[temp9],    %[temp3]    \n\t"
+    "shll.ph         %[temp8],    %[temp8],    1           \n\t"
+    "shra_r.ph       %[temp9],    %[temp9],    2           \n\t"
+    "addq.ph         %[temp10],   %[temp4],    %[temp7]    \n\t"
+    "addq.ph         %[temp11],   %[temp5],    %[temp8]    \n\t"
+    "addq.ph         %[temp10],   %[temp10],   %[temp2]    \n\t"
+    "addq.ph         %[temp11],   %[temp11],   %[temp4]    \n\t"
+    "shra_r.ph       %[temp10],   %[temp10],   2           \n\t"
+    "shra_r.ph       %[temp11],   %[temp11],   2           \n\t"
+    "lbu             %[temp0],    3(%[top])                \n\t"
+    "lbu             %[temp1],    2(%[top])                \n\t"
+    "lbu             %[temp2],    1(%[top])                \n\t"
+    "sll             %[temp1],    %[temp1],    1           \n\t"
+    "addu            %[temp0],    %[temp0],    %[temp1]    \n\t"
+    "addu            %[temp0],    %[temp0],    %[temp2]    \n\t"
+    "precr.qb.ph     %[temp9],    %[temp10],   %[temp9]    \n\t"
+    "shra_r.w        %[temp0],    %[temp0],    2           \n\t"
+    "precr.qb.ph     %[temp10],   %[temp11],   %[temp10]   \n\t"
+    "usw             %[temp9],    3*" XSTR(BPS) "(%[dst])  \n\t"
+    "usw             %[temp10],   1*" XSTR(BPS) "(%[dst])  \n\t"
+    "prepend         %[temp9],    %[temp11],   8           \n\t"
+    "prepend         %[temp10],   %[temp0],    8           \n\t"
+    "usw             %[temp9],    2*" XSTR(BPS) "(%[dst])  \n\t"
+    "usw             %[temp10],   0*" XSTR(BPS) "(%[dst])  \n\t"
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
+      [temp9]"=&r"(temp9), [temp10]"=&r"(temp10), [temp11]"=&r"(temp11)
+    : [top]"r"(top), [dst]"r"(dst)
+    : "memory"
+  );
+}
+
+static void VR4(uint8_t* dst, const uint8_t* top) {
+  int temp0, temp1, temp2, temp3, temp4;
+  int temp5, temp6, temp7, temp8, temp9;
+  __asm__ volatile (
+    "ulw              %[temp0],   -4(%[top])              \n\t"
+    "ulw              %[temp1],   0(%[top])               \n\t"
+    "preceu.ph.qbl    %[temp2],   %[temp0]                \n\t"
+    "preceu.ph.qbr    %[temp0],   %[temp0]                \n\t"
+    "preceu.ph.qbla   %[temp3],   %[temp1]                \n\t"
+    "preceu.ph.qbra   %[temp1],   %[temp1]                \n\t"
+    "packrl.ph        %[temp7],   %[temp3],    %[temp2]   \n\t"
+    "addqh_r.ph       %[temp4],   %[temp1],    %[temp3]   \n\t"
+    "move             %[temp6],   %[temp1]                \n\t"
+    "append           %[temp1],   %[temp2],    16         \n\t"
+    "shll.ph          %[temp9],   %[temp6],    1          \n\t"
+    "addqh_r.ph       %[temp5],   %[temp7],    %[temp6]   \n\t"
+    "shll.ph          %[temp8],   %[temp7],    1          \n\t"
+    "addu.ph          %[temp3],   %[temp7],    %[temp3]   \n\t"
+    "addu.ph          %[temp1],   %[temp1],    %[temp6]   \n\t"
+    "packrl.ph        %[temp7],   %[temp2],    %[temp0]   \n\t"
+    "addu.ph          %[temp6],   %[temp0],    %[temp2]   \n\t"
+    "addu.ph          %[temp3],   %[temp3],    %[temp9]   \n\t"
+    "addu.ph          %[temp1],   %[temp1],    %[temp8]   \n\t"
+    "shll.ph          %[temp7],   %[temp7],    1          \n\t"
+    "shra_r.ph        %[temp3],   %[temp3],    2          \n\t"
+    "shra_r.ph        %[temp1],   %[temp1],    2          \n\t"
+    "addu.ph          %[temp6],   %[temp6],    %[temp7]   \n\t"
+    "shra_r.ph        %[temp6],   %[temp6],    2          \n\t"
+    "precrq.ph.w      %[temp8],   %[temp4],    %[temp5]   \n\t"
+    "append           %[temp4],   %[temp5],    16         \n\t"
+    "precrq.ph.w      %[temp2],   %[temp3],    %[temp1]   \n\t"
+    "append           %[temp3],   %[temp1],    16         \n\t"
+    "precr.qb.ph      %[temp8],   %[temp8],    %[temp4]   \n\t"
+    "precr.qb.ph      %[temp3],   %[temp2],    %[temp3]   \n\t"
+    "usw              %[temp8],   0*" XSTR(BPS) "(%[dst]) \n\t"
+    "usw              %[temp3],   1*" XSTR(BPS) "(%[dst]) \n\t"
+    "append           %[temp3],   %[temp6],    8          \n\t"
+    "srl              %[temp6],   %[temp6],    16         \n\t"
+    "append           %[temp8],   %[temp6],    8          \n\t"
+    "usw              %[temp3],   3*" XSTR(BPS) "(%[dst]) \n\t"
+    "usw              %[temp8],   2*" XSTR(BPS) "(%[dst]) \n\t"
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
+      [temp9]"=&r"(temp9)
+    : [top]"r"(top), [dst]"r"(dst)
+    : "memory"
+  );
+}
+
+static void LD4(uint8_t* dst, const uint8_t* top) {
+  int temp0, temp1, temp2, temp3, temp4, temp5;
+  int temp6, temp7, temp8, temp9, temp10, temp11;
+  __asm__ volatile(
+    "ulw             %[temp0],    0(%[top])               \n\t"
+    "ulw             %[temp1],    4(%[top])               \n\t"
+    "preceu.ph.qbl   %[temp2],    %[temp0]                \n\t"
+    "preceu.ph.qbr   %[temp3],    %[temp0]                \n\t"
+    "preceu.ph.qbr   %[temp4],    %[temp1]                \n\t"
+    "preceu.ph.qbl   %[temp5],    %[temp1]                \n\t"
+    "packrl.ph       %[temp6],    %[temp2],    %[temp3]   \n\t"
+    "packrl.ph       %[temp7],    %[temp4],    %[temp2]   \n\t"
+    "packrl.ph       %[temp8],    %[temp5],    %[temp4]   \n\t"
+    "shll.ph         %[temp6],    %[temp6],    1          \n\t"
+    "addq.ph         %[temp9],    %[temp2],    %[temp6]   \n\t"
+    "shll.ph         %[temp7],    %[temp7],    1          \n\t"
+    "addq.ph         %[temp9],    %[temp9],    %[temp3]   \n\t"
+    "shll.ph         %[temp8],    %[temp8],    1          \n\t"
+    "shra_r.ph       %[temp9],    %[temp9],    2          \n\t"
+    "addq.ph         %[temp10],   %[temp4],    %[temp7]   \n\t"
+    "addq.ph         %[temp11],   %[temp5],    %[temp8]   \n\t"
+    "addq.ph         %[temp10],   %[temp10],   %[temp2]   \n\t"
+    "addq.ph         %[temp11],   %[temp11],   %[temp4]   \n\t"
+    "shra_r.ph       %[temp10],   %[temp10],   2          \n\t"
+    "shra_r.ph       %[temp11],   %[temp11],   2          \n\t"
+    "srl             %[temp1],    %[temp1],    24         \n\t"
+    "sll             %[temp1],    %[temp1],    1          \n\t"
+    "raddu.w.qb      %[temp5],    %[temp5]                \n\t"
+    "precr.qb.ph     %[temp9],    %[temp10],   %[temp9]   \n\t"
+    "precr.qb.ph     %[temp10],   %[temp11],   %[temp10]  \n\t"
+    "addu            %[temp1],    %[temp1],    %[temp5]   \n\t"
+    "shra_r.w        %[temp1],    %[temp1],    2          \n\t"
+    "usw             %[temp9],    0*" XSTR(BPS) "(%[dst]) \n\t"
+    "usw             %[temp10],   2*" XSTR(BPS) "(%[dst]) \n\t"
+    "prepend         %[temp9],    %[temp11],   8          \n\t"
+    "prepend         %[temp10],   %[temp1],    8          \n\t"
+    "usw             %[temp9],    1*" XSTR(BPS) "(%[dst]) \n\t"
+    "usw             %[temp10],   3*" XSTR(BPS) "(%[dst]) \n\t"
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
+      [temp9]"=&r"(temp9), [temp10]"=&r"(temp10), [temp11]"=&r"(temp11)
+    : [top]"r"(top), [dst]"r"(dst)
+    : "memory"
+  );
+}
+
+static void VL4(uint8_t* dst, const uint8_t* top) {
+  int temp0, temp1, temp2, temp3, temp4;
+  int temp5, temp6, temp7, temp8, temp9;
+  __asm__ volatile (
+    "ulw              %[temp0],   0(%[top])               \n\t"
+    "ulw              %[temp1],   4(%[top])               \n\t"
+    "preceu.ph.qbla   %[temp2],   %[temp0]                \n\t"
+    "preceu.ph.qbra   %[temp0],   %[temp0]                \n\t"
+    "preceu.ph.qbl    %[temp3],   %[temp1]                \n\t"
+    "preceu.ph.qbr    %[temp1],   %[temp1]                \n\t"
+    "addqh_r.ph       %[temp4],   %[temp0],    %[temp2]   \n\t"
+    "packrl.ph        %[temp7],   %[temp1],    %[temp0]   \n\t"
+    "precrq.ph.w      %[temp6],   %[temp1],    %[temp2]   \n\t"
+    "shll.ph          %[temp9],   %[temp2],    1          \n\t"
+    "addqh_r.ph       %[temp5],   %[temp7],    %[temp2]   \n\t"
+    "shll.ph          %[temp8],   %[temp7],    1          \n\t"
+    "addu.ph          %[temp2],   %[temp2],    %[temp6]   \n\t"
+    "addu.ph          %[temp0],   %[temp0],    %[temp7]   \n\t"
+    "packrl.ph        %[temp7],   %[temp3],    %[temp1]   \n\t"
+    "addu.ph          %[temp6],   %[temp1],    %[temp3]   \n\t"
+    "addu.ph          %[temp2],   %[temp2],    %[temp8]   \n\t"
+    "addu.ph          %[temp0],   %[temp0],    %[temp9]   \n\t"
+    "shll.ph          %[temp7],   %[temp7],    1          \n\t"
+    "shra_r.ph        %[temp2],   %[temp2],    2          \n\t"
+    "shra_r.ph        %[temp0],   %[temp0],    2          \n\t"
+    "addu.ph          %[temp6],   %[temp6],    %[temp7]   \n\t"
+    "shra_r.ph        %[temp6],   %[temp6],    2          \n\t"
+    "precrq.ph.w      %[temp8],   %[temp5],    %[temp4]   \n\t"
+    "append           %[temp5],   %[temp4],    16         \n\t"
+    "precrq.ph.w      %[temp3],   %[temp2],    %[temp0]   \n\t"
+    "append           %[temp2],   %[temp0],    16         \n\t"
+    "precr.qb.ph      %[temp8],   %[temp8],    %[temp5]   \n\t"
+    "precr.qb.ph      %[temp3],   %[temp3],    %[temp2]   \n\t"
+    "usw              %[temp8],   0*" XSTR(BPS) "(%[dst]) \n\t"
+    "prepend          %[temp8],   %[temp6],    8          \n\t"
+    "usw              %[temp3],   1*" XSTR(BPS) "(%[dst]) \n\t"
+    "srl              %[temp6],   %[temp6],    16         \n\t"
+    "prepend          %[temp3],   %[temp6],    8          \n\t"
+    "usw              %[temp8],   2*" XSTR(BPS) "(%[dst]) \n\t"
+    "usw              %[temp3],   3*" XSTR(BPS) "(%[dst]) \n\t"
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
+      [temp9]"=&r"(temp9)
+    : [top]"r"(top), [dst]"r"(dst)
+    : "memory"
+  );
+}
+
+static void HD4(uint8_t* dst, const uint8_t* top) {
+  int temp0, temp1, temp2, temp3, temp4;
+  int temp5, temp6, temp7, temp8, temp9;
+  __asm__ volatile (
+    "ulw              %[temp0],   -5(%[top])              \n\t"
+    "ulw              %[temp1],   -1(%[top])              \n\t"
+    "preceu.ph.qbla   %[temp2],   %[temp0]                \n\t"
+    "preceu.ph.qbra   %[temp0],   %[temp0]                \n\t"
+    "preceu.ph.qbl    %[temp3],   %[temp1]                \n\t"
+    "preceu.ph.qbr    %[temp1],   %[temp1]                \n\t"
+    "addqh_r.ph       %[temp4],   %[temp0],    %[temp2]   \n\t"
+    "packrl.ph        %[temp7],   %[temp1],    %[temp0]   \n\t"
+    "precrq.ph.w      %[temp6],   %[temp1],    %[temp2]   \n\t"
+    "shll.ph          %[temp9],   %[temp2],    1          \n\t"
+    "addqh_r.ph       %[temp5],   %[temp7],    %[temp2]   \n\t"
+    "shll.ph          %[temp8],   %[temp7],    1          \n\t"
+    "addu.ph          %[temp2],   %[temp2],    %[temp6]   \n\t"
+    "addu.ph          %[temp0],   %[temp0],    %[temp7]   \n\t"
+    "packrl.ph        %[temp7],   %[temp3],    %[temp1]   \n\t"
+    "addu.ph          %[temp6],   %[temp1],    %[temp3]   \n\t"
+    "addu.ph          %[temp2],   %[temp2],    %[temp8]   \n\t"
+    "addu.ph          %[temp0],   %[temp0],    %[temp9]   \n\t"
+    "shll.ph          %[temp7],   %[temp7],    1          \n\t"
+    "shra_r.ph        %[temp2],   %[temp2],    2          \n\t"
+    "shra_r.ph        %[temp0],   %[temp0],    2          \n\t"
+    "addu.ph          %[temp6],   %[temp6],    %[temp7]   \n\t"
+    "shra_r.ph        %[temp6],   %[temp6],    2          \n\t"
+    "precrq.ph.w      %[temp1],   %[temp2],    %[temp5]   \n\t"
+    "precrq.ph.w      %[temp3],   %[temp0],    %[temp4]   \n\t"
+    "precr.qb.ph      %[temp7],   %[temp6],    %[temp1]   \n\t"
+    "precr.qb.ph      %[temp6],   %[temp1],    %[temp3]   \n\t"
+    "usw              %[temp7],   0*" XSTR(BPS) "(%[dst]) \n\t"
+    "usw              %[temp6],   1*" XSTR(BPS) "(%[dst]) \n\t"
+    "append           %[temp2],   %[temp5],    16         \n\t"
+    "append           %[temp0],   %[temp4],    16         \n\t"
+    "precr.qb.ph      %[temp5],   %[temp3],    %[temp2]   \n\t"
+    "precr.qb.ph      %[temp4],   %[temp2],    %[temp0]   \n\t"
+    "usw              %[temp5],   2*" XSTR(BPS) "(%[dst]) \n\t"
+    "usw              %[temp4],   3*" XSTR(BPS) "(%[dst]) \n\t"
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
+      [temp9]"=&r"(temp9)
+    : [top]"r"(top), [dst]"r"(dst)
+    : "memory"
+  );
+}
+
+static void HU4(uint8_t* dst, const uint8_t* top) {
+  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+  __asm__ volatile (
+    "ulw             %[temp0],   -5(%[top])              \n\t"
+    "preceu.ph.qbl   %[temp1],   %[temp0]                \n\t"
+    "preceu.ph.qbr   %[temp2],   %[temp0]                \n\t"
+    "packrl.ph       %[temp3],   %[temp1],    %[temp2]   \n\t"
+    "replv.qb        %[temp7],   %[temp2]                \n\t"
+    "addqh_r.ph      %[temp4],   %[temp1],    %[temp3]   \n\t"
+    "addqh_r.ph      %[temp5],   %[temp3],    %[temp2]   \n\t"
+    "shll.ph         %[temp6],   %[temp3],    1          \n\t"
+    "addu.ph         %[temp3],   %[temp2],    %[temp3]   \n\t"
+    "addu.ph         %[temp6],   %[temp1],    %[temp6]   \n\t"
+    "shll.ph         %[temp0],   %[temp2],    1          \n\t"
+    "addu.ph         %[temp6],   %[temp6],    %[temp2]   \n\t"
+    "addu.ph         %[temp0],   %[temp3],    %[temp0]   \n\t"
+    "shra_r.ph       %[temp6],   %[temp6],    2          \n\t"
+    "shra_r.ph       %[temp0],   %[temp0],    2          \n\t"
+    "packrl.ph       %[temp3],   %[temp6],    %[temp5]   \n\t"
+    "precrq.ph.w     %[temp2],   %[temp6],    %[temp4]   \n\t"
+    "append          %[temp0],   %[temp5],    16         \n\t"
+    "precr.qb.ph     %[temp3],   %[temp3],    %[temp2]   \n\t"
+    "usw             %[temp3],   0*" XSTR(BPS) "(%[dst]) \n\t"
+    "precr.qb.ph     %[temp1],   %[temp7],    %[temp0]   \n\t"
+    "usw             %[temp7],   3*" XSTR(BPS) "(%[dst]) \n\t"
+    "packrl.ph       %[temp2],   %[temp1],    %[temp3]   \n\t"
+    "usw             %[temp1],   2*" XSTR(BPS) "(%[dst]) \n\t"
+    "usw             %[temp2],   1*" XSTR(BPS) "(%[dst]) \n\t"
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7)
+    : [top]"r"(top), [dst]"r"(dst)
+    : "memory"
+  );
+}
+
+//------------------------------------------------------------------------------
+// Chroma 8x8 prediction (paragraph 12.2)
+
+static void IntraChromaPreds_MIPSdspR2(uint8_t* dst, const uint8_t* left,
+                                       const uint8_t* top) {
+  // U block
+  DCMode8(C8DC8 + dst, left, top);
+  VerticalPred8(C8VE8 + dst, top);
+  HorizontalPred8(C8HE8 + dst, left);
+  TrueMotion8(C8TM8 + dst, left, top);
+  // V block
+  dst += 8;
+  if (top) top += 8;
+  if (left) left += 16;
+  DCMode8(C8DC8 + dst, left, top);
+  VerticalPred8(C8VE8 + dst, top);
+  HorizontalPred8(C8HE8 + dst, left);
+  TrueMotion8(C8TM8 + dst, left, top);
+}
+
+//------------------------------------------------------------------------------
+// luma 16x16 prediction (paragraph 12.3)
+
+static void Intra16Preds_MIPSdspR2(uint8_t* dst,
+                                   const uint8_t* left, const uint8_t* top) {
+  DCMode16(I16DC16 + dst, left, top);
+  VerticalPred16(I16VE16 + dst, top);
+  HorizontalPred16(I16HE16 + dst, left);
+  TrueMotion16(I16TM16 + dst, left, top);
+}
+
+// Left samples are top[-5 .. -2], top_left is top[-1], top are
+// located at top[0..3], and top right is top[4..7]
+static void Intra4Preds_MIPSdspR2(uint8_t* dst, const uint8_t* top) {
+  DC4(I4DC4 + dst, top);
+  TM4(I4TM4 + dst, top);
+  VE4(I4VE4 + dst, top);
+  HE4(I4HE4 + dst, top);
+  RD4(I4RD4 + dst, top);
+  VR4(I4VR4 + dst, top);
+  LD4(I4LD4 + dst, top);
+  VL4(I4VL4 + dst, top);
+  HD4(I4HD4 + dst, top);
+  HU4(I4HU4 + dst, top);
+}
+
+//------------------------------------------------------------------------------
+// Metric
+
+#if !defined(WORK_AROUND_GCC)
+
+#define GET_SSE_INNER(A)                                                  \
+  "lw               %[temp0],    " #A "(%[a])                  \n\t"      \
+  "lw               %[temp1],    " #A "(%[b])                  \n\t"      \
+  "preceu.ph.qbr    %[temp2],    %[temp0]                      \n\t"      \
+  "preceu.ph.qbl    %[temp0],    %[temp0]                      \n\t"      \
+  "preceu.ph.qbr    %[temp3],    %[temp1]                      \n\t"      \
+  "preceu.ph.qbl    %[temp1],    %[temp1]                      \n\t"      \
+  "subq.ph          %[temp2],    %[temp2],    %[temp3]         \n\t"      \
+  "subq.ph          %[temp0],    %[temp0],    %[temp1]         \n\t"      \
+  "dpa.w.ph         $ac0,        %[temp2],    %[temp2]         \n\t"      \
+  "dpa.w.ph         $ac0,        %[temp0],    %[temp0]         \n\t"
+
+#define GET_SSE(A, B, C, D)               \
+  GET_SSE_INNER(A)                        \
+  GET_SSE_INNER(B)                        \
+  GET_SSE_INNER(C)                        \
+  GET_SSE_INNER(D)
+
+static int SSE16x16_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
+  int count;
+  int temp0, temp1, temp2, temp3;
+  __asm__ volatile (
+    "mult   $zero,    $zero                            \n\t"
+    GET_SSE( 0 * BPS, 4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS)
+    GET_SSE( 1 * BPS, 4 +  1 * BPS, 8 +  1 * BPS, 12 +  1 * BPS)
+    GET_SSE( 2 * BPS, 4 +  2 * BPS, 8 +  2 * BPS, 12 +  2 * BPS)
+    GET_SSE( 3 * BPS, 4 +  3 * BPS, 8 +  3 * BPS, 12 +  3 * BPS)
+    GET_SSE( 4 * BPS, 4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS)
+    GET_SSE( 5 * BPS, 4 +  5 * BPS, 8 +  5 * BPS, 12 +  5 * BPS)
+    GET_SSE( 6 * BPS, 4 +  6 * BPS, 8 +  6 * BPS, 12 +  6 * BPS)
+    GET_SSE( 7 * BPS, 4 +  7 * BPS, 8 +  7 * BPS, 12 +  7 * BPS)
+    GET_SSE( 8 * BPS, 4 +  8 * BPS, 8 +  8 * BPS, 12 +  8 * BPS)
+    GET_SSE( 9 * BPS, 4 +  9 * BPS, 8 +  9 * BPS, 12 +  9 * BPS)
+    GET_SSE(10 * BPS, 4 + 10 * BPS, 8 + 10 * BPS, 12 + 10 * BPS)
+    GET_SSE(11 * BPS, 4 + 11 * BPS, 8 + 11 * BPS, 12 + 11 * BPS)
+    GET_SSE(12 * BPS, 4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS)
+    GET_SSE(13 * BPS, 4 + 13 * BPS, 8 + 13 * BPS, 12 + 13 * BPS)
+    GET_SSE(14 * BPS, 4 + 14 * BPS, 8 + 14 * BPS, 12 + 14 * BPS)
+    GET_SSE(15 * BPS, 4 + 15 * BPS, 8 + 15 * BPS, 12 + 15 * BPS)
+    "mflo   %[count]                                   \n\t"
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [temp3]"=&r"(temp3), [count]"=&r"(count)
+    : [a]"r"(a), [b]"r"(b)
+    : "memory", "hi", "lo"
+  );
+  return count;
+}
+
+static int SSE16x8_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
+  int count;
+  int temp0, temp1, temp2, temp3;
+  __asm__ volatile (
+    "mult   $zero,    $zero                            \n\t"
+    GET_SSE( 0 * BPS, 4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS)
+    GET_SSE( 1 * BPS, 4 +  1 * BPS, 8 +  1 * BPS, 12 +  1 * BPS)
+    GET_SSE( 2 * BPS, 4 +  2 * BPS, 8 +  2 * BPS, 12 +  2 * BPS)
+    GET_SSE( 3 * BPS, 4 +  3 * BPS, 8 +  3 * BPS, 12 +  3 * BPS)
+    GET_SSE( 4 * BPS, 4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS)
+    GET_SSE( 5 * BPS, 4 +  5 * BPS, 8 +  5 * BPS, 12 +  5 * BPS)
+    GET_SSE( 6 * BPS, 4 +  6 * BPS, 8 +  6 * BPS, 12 +  6 * BPS)
+    GET_SSE( 7 * BPS, 4 +  7 * BPS, 8 +  7 * BPS, 12 +  7 * BPS)
+    "mflo   %[count]                                   \n\t"
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [temp3]"=&r"(temp3), [count]"=&r"(count)
+    : [a]"r"(a), [b]"r"(b)
+    : "memory", "hi", "lo"
+  );
+  return count;
+}
+
+static int SSE8x8_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
+  int count;
+  int temp0, temp1, temp2, temp3;
+  __asm__ volatile (
+    "mult   $zero,    $zero                            \n\t"
+    GET_SSE(0 * BPS, 4 + 0 * BPS, 1 * BPS, 4 + 1 * BPS)
+    GET_SSE(2 * BPS, 4 + 2 * BPS, 3 * BPS, 4 + 3 * BPS)
+    GET_SSE(4 * BPS, 4 + 4 * BPS, 5 * BPS, 4 + 5 * BPS)
+    GET_SSE(6 * BPS, 4 + 6 * BPS, 7 * BPS, 4 + 7 * BPS)
+    "mflo   %[count]                                   \n\t"
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [temp3]"=&r"(temp3), [count]"=&r"(count)
+    : [a]"r"(a), [b]"r"(b)
+    : "memory", "hi", "lo"
+  );
+  return count;
+}
+
+static int SSE4x4_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
+  int count;
+  int temp0, temp1, temp2, temp3;
+  __asm__ volatile (
+    "mult   $zero,    $zero                            \n\t"
+    GET_SSE(0 * BPS, 1 * BPS, 2 * BPS, 3 * BPS)
+    "mflo   %[count]                                   \n\t"
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [temp3]"=&r"(temp3), [count]"=&r"(count)
+    : [a]"r"(a), [b]"r"(b)
+    : "memory", "hi", "lo"
+  );
+  return count;
+}
+
+#undef GET_SSE
+#undef GET_SSE_INNER
+
+#endif  // !WORK_AROUND_GCC
+
+#undef FILL_8_OR_16
+#undef FILL_PART
+#undef OUTPUT_EARLY_CLOBBER_REGS_17
+#undef MUL_HALF
+#undef ABS_X8
+#undef ADD_SUB_HALVES_X4
+
+//------------------------------------------------------------------------------
+// Quantization
+//
+
+// macro for one pass through for loop in QuantizeBlock reading 2 values at time
+// QUANTDIV macro inlined
+// J - offset in bytes (kZigzag[n] * 2)
+// K - offset in bytes (kZigzag[n] * 4)
+// N - offset in bytes (n * 2)
+// N1 - offset in bytes ((n + 1) * 2)
+#define QUANTIZE_ONE(J, K, N, N1)                                         \
+  "ulw         %[temp1],     " #J "(%[ppin])                 \n\t"        \
+  "ulw         %[temp2],     " #J "(%[ppsharpen])            \n\t"        \
+  "lhu         %[temp3],     " #K "(%[ppzthresh])            \n\t"        \
+  "lhu         %[temp6],     " #K "+4(%[ppzthresh])          \n\t"        \
+  "absq_s.ph   %[temp4],     %[temp1]                        \n\t"        \
+  "ins         %[temp3],     %[temp6],         16,       16  \n\t"        \
+  "addu.ph     %[coeff],     %[temp4],         %[temp2]      \n\t"        \
+  "shra.ph     %[sign],      %[temp1],         15            \n\t"        \
+  "li          %[level],     0x10001                         \n\t"        \
+  "cmp.lt.ph   %[temp3],     %[coeff]                        \n\t"        \
+  "lhu         %[temp1],     " #J "(%[ppiq])                 \n\t"        \
+  "pick.ph     %[temp5],     %[level],         $0            \n\t"        \
+  "lw          %[temp2],     " #K "(%[ppbias])               \n\t"        \
+  "beqz        %[temp5],     0f                              \n\t"        \
+  "lhu         %[temp3],     " #J "(%[ppq])                  \n\t"        \
+  "beq         %[temp5],     %[level],         1f            \n\t"        \
+  "andi        %[temp5],     %[temp5],         0x1           \n\t"        \
+  "andi        %[temp4],     %[coeff],         0xffff        \n\t"        \
+  "beqz        %[temp5],     2f                              \n\t"        \
+  "mul         %[level],     %[temp4],         %[temp1]      \n\t"        \
+  "sh          $0,           " #J "+2(%[ppin])               \n\t"        \
+  "sh          $0,           " #N1 "(%[pout])                \n\t"        \
+  "addu        %[level],     %[level],         %[temp2]      \n\t"        \
+  "sra         %[level],     %[level],         17            \n\t"        \
+  "slt         %[temp4],     %[max_level],     %[level]      \n\t"        \
+  "movn        %[level],     %[max_level],     %[temp4]      \n\t"        \
+  "andi        %[temp6],     %[sign],          0xffff        \n\t"        \
+  "xor         %[level],     %[level],         %[temp6]      \n\t"        \
+  "subu        %[level],     %[level],         %[temp6]      \n\t"        \
+  "mul         %[temp5],     %[level],         %[temp3]      \n\t"        \
+  "or          %[ret],       %[ret],           %[level]      \n\t"        \
+  "sh          %[level],     " #N "(%[pout])                 \n\t"        \
+  "sh          %[temp5],     " #J "(%[ppin])                 \n\t"        \
+  "j           3f                                            \n\t"        \
+"2:                                                          \n\t"        \
+  "lhu         %[temp1],     " #J "+2(%[ppiq])               \n\t"        \
+  "srl         %[temp5],     %[coeff],         16            \n\t"        \
+  "mul         %[level],     %[temp5],         %[temp1]      \n\t"        \
+  "lw          %[temp2],     " #K "+4(%[ppbias])             \n\t"        \
+  "lhu         %[temp3],     " #J "+2(%[ppq])                \n\t"        \
+  "addu        %[level],     %[level],         %[temp2]      \n\t"        \
+  "sra         %[level],     %[level],         17            \n\t"        \
+  "srl         %[temp6],     %[sign],          16            \n\t"        \
+  "slt         %[temp4],     %[max_level],     %[level]      \n\t"        \
+  "movn        %[level],     %[max_level],     %[temp4]      \n\t"        \
+  "xor         %[level],     %[level],         %[temp6]      \n\t"        \
+  "subu        %[level],     %[level],         %[temp6]      \n\t"        \
+  "mul         %[temp5],     %[level],         %[temp3]      \n\t"        \
+  "sh          $0,           " #J "(%[ppin])                 \n\t"        \
+  "sh          $0,           " #N "(%[pout])                 \n\t"        \
+  "or          %[ret],       %[ret],           %[level]      \n\t"        \
+  "sh          %[temp5],     " #J "+2(%[ppin])               \n\t"        \
+  "sh          %[level],     " #N1 "(%[pout])                \n\t"        \
+  "j           3f                                            \n\t"        \
+"1:                                                          \n\t"        \
+  "lhu         %[temp1],     " #J "(%[ppiq])                 \n\t"        \
+  "lw          %[temp2],     " #K "(%[ppbias])               \n\t"        \
+  "ulw         %[temp3],     " #J "(%[ppq])                  \n\t"        \
+  "andi        %[temp5],     %[coeff],         0xffff        \n\t"        \
+  "srl         %[temp0],     %[coeff],         16            \n\t"        \
+  "lhu         %[temp6],     " #J "+2(%[ppiq])               \n\t"        \
+  "lw          %[coeff],     " #K "+4(%[ppbias])             \n\t"        \
+  "mul         %[level],     %[temp5],         %[temp1]      \n\t"        \
+  "mul         %[temp4],     %[temp0],         %[temp6]      \n\t"        \
+  "addu        %[level],     %[level],         %[temp2]      \n\t"        \
+  "addu        %[temp4],     %[temp4],         %[coeff]      \n\t"        \
+  "precrq.ph.w %[level],     %[temp4],         %[level]      \n\t"        \
+  "shra.ph     %[level],     %[level],         1             \n\t"        \
+  "cmp.lt.ph   %[max_level1],%[level]                        \n\t"        \
+  "pick.ph     %[level],     %[max_level],     %[level]      \n\t"        \
+  "xor         %[level],     %[level],         %[sign]       \n\t"        \
+  "subu.ph     %[level],     %[level],         %[sign]       \n\t"        \
+  "mul.ph      %[temp3],     %[level],         %[temp3]      \n\t"        \
+  "or          %[ret],       %[ret],           %[level]      \n\t"        \
+  "sh          %[level],     " #N "(%[pout])                 \n\t"        \
+  "srl         %[level],     %[level],         16            \n\t"        \
+  "sh          %[level],     " #N1 "(%[pout])                \n\t"        \
+  "usw         %[temp3],     " #J "(%[ppin])                 \n\t"        \
+  "j           3f                                            \n\t"        \
+"0:                                                          \n\t"        \
+  "sh          $0,           " #N "(%[pout])                 \n\t"        \
+  "sh          $0,           " #N1 "(%[pout])                \n\t"        \
+  "usw         $0,           " #J "(%[ppin])                 \n\t"        \
+"3:                                                          \n\t"
+
+static int QuantizeBlock_MIPSdspR2(int16_t in[16], int16_t out[16],
+                                   const VP8Matrix* const mtx) {
+  int temp0, temp1, temp2, temp3, temp4, temp5,temp6;
+  int sign, coeff, level;
+  int max_level = MAX_LEVEL;
+  int max_level1 = max_level << 16 | max_level;
+  int ret = 0;
+
+  int16_t* ppin             = &in[0];
+  int16_t* pout             = &out[0];
+  const uint16_t* ppsharpen = &mtx->sharpen_[0];
+  const uint32_t* ppzthresh = &mtx->zthresh_[0];
+  const uint16_t* ppq       = &mtx->q_[0];
+  const uint16_t* ppiq      = &mtx->iq_[0];
+  const uint32_t* ppbias    = &mtx->bias_[0];
+
+  __asm__ volatile (
+    QUANTIZE_ONE( 0,  0,  0,  2)
+    QUANTIZE_ONE( 4,  8, 10, 12)
+    QUANTIZE_ONE( 8, 16,  4,  8)
+    QUANTIZE_ONE(12, 24, 14, 24)
+    QUANTIZE_ONE(16, 32,  6, 16)
+    QUANTIZE_ONE(20, 40, 22, 26)
+    QUANTIZE_ONE(24, 48, 18, 20)
+    QUANTIZE_ONE(28, 56, 28, 30)
+
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+      [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+      [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+      [sign]"=&r"(sign), [coeff]"=&r"(coeff),
+      [level]"=&r"(level), [temp6]"=&r"(temp6), [ret]"+&r"(ret)
+    : [ppin]"r"(ppin), [pout]"r"(pout), [max_level1]"r"(max_level1),
+      [ppiq]"r"(ppiq), [max_level]"r"(max_level),
+      [ppbias]"r"(ppbias), [ppzthresh]"r"(ppzthresh),
+      [ppsharpen]"r"(ppsharpen), [ppq]"r"(ppq)
+    : "memory", "hi", "lo"
+  );
+
+  return (ret != 0);
+}
+
+static int Quantize2Blocks_MIPSdspR2(int16_t in[32], int16_t out[32],
+                                     const VP8Matrix* const mtx) {
+  int nz;
+  nz  = QuantizeBlock_MIPSdspR2(in + 0 * 16, out + 0 * 16, mtx) << 0;
+  nz |= QuantizeBlock_MIPSdspR2(in + 1 * 16, out + 1 * 16, mtx) << 1;
+  return nz;
+}
+
+#undef QUANTIZE_ONE
+
+// macro for one horizontal pass in FTransformWHT
+// temp0..temp7 holds tmp[0]..tmp[15]
+// A, B, C, D - offset in bytes to load from in buffer
+// TEMP0, TEMP1 - registers for corresponding tmp elements
+#define HORIZONTAL_PASS_WHT(A, B, C, D, TEMP0, TEMP1)                          \
+  "lh              %[" #TEMP0 "],  " #A "(%[in])            \n\t"              \
+  "lh              %[" #TEMP1 "],  " #B "(%[in])            \n\t"              \
+  "lh              %[temp8],     " #C "(%[in])              \n\t"              \
+  "lh              %[temp9],     " #D "(%[in])              \n\t"              \
+  "ins             %[" #TEMP1 "],  %[" #TEMP0 "],  16,  16  \n\t"              \
+  "ins             %[temp9],     %[temp8],     16,  16      \n\t"              \
+  "subq.ph         %[temp8],     %[" #TEMP1 "],  %[temp9]   \n\t"              \
+  "addq.ph         %[temp9],     %[" #TEMP1 "],  %[temp9]   \n\t"              \
+  "precrq.ph.w     %[" #TEMP0 "],  %[temp8],     %[temp9]   \n\t"              \
+  "append          %[temp8],     %[temp9],     16           \n\t"              \
+  "subq.ph         %[" #TEMP1 "],  %[" #TEMP0 "],  %[temp8] \n\t"              \
+  "addq.ph         %[" #TEMP0 "],  %[" #TEMP0 "],  %[temp8] \n\t"              \
+  "rotr            %[" #TEMP1 "],  %[" #TEMP1 "],  16       \n\t"
+
+// macro for one vertical pass in FTransformWHT
+// temp0..temp7 holds tmp[0]..tmp[15]
+// A, B, C, D - offsets in bytes to store to out buffer
+// TEMP0, TEMP2, TEMP4 and TEMP6 - registers for corresponding tmp elements
+#define VERTICAL_PASS_WHT(A, B, C, D, TEMP0, TEMP2, TEMP4, TEMP6)              \
+  "addq.ph         %[temp8],     %[" #TEMP0 "],  %[" #TEMP4 "]    \n\t"        \
+  "addq.ph         %[temp9],     %[" #TEMP2 "],  %[" #TEMP6 "]    \n\t"        \
+  "subq.ph         %[" #TEMP2 "],  %[" #TEMP2 "],  %[" #TEMP6 "]  \n\t"        \
+  "subq.ph         %[" #TEMP6 "],  %[" #TEMP0 "],  %[" #TEMP4 "]  \n\t"        \
+  "addqh.ph        %[" #TEMP0 "],  %[temp8],     %[temp9]         \n\t"        \
+  "subqh.ph        %[" #TEMP4 "],  %[" #TEMP6 "],  %[" #TEMP2 "]  \n\t"        \
+  "addqh.ph        %[" #TEMP2 "],  %[" #TEMP2 "],  %[" #TEMP6 "]  \n\t"        \
+  "subqh.ph        %[" #TEMP6 "],  %[temp8],     %[temp9]         \n\t"        \
+  "usw             %[" #TEMP0 "],  " #A "(%[out])                 \n\t"        \
+  "usw             %[" #TEMP2 "],  " #B "(%[out])                 \n\t"        \
+  "usw             %[" #TEMP4 "],  " #C "(%[out])                 \n\t"        \
+  "usw             %[" #TEMP6 "],  " #D "(%[out])                 \n\t"
+
+static void FTransformWHT_MIPSdspR2(const int16_t* in, int16_t* out) {
+  int temp0, temp1, temp2, temp3, temp4;
+  int temp5, temp6, temp7, temp8, temp9;
+
+  __asm__ volatile (
+    HORIZONTAL_PASS_WHT(  0,  32,  64,  96, temp0, temp1)
+    HORIZONTAL_PASS_WHT(128, 160, 192, 224, temp2, temp3)
+    HORIZONTAL_PASS_WHT(256, 288, 320, 352, temp4, temp5)
+    HORIZONTAL_PASS_WHT(384, 416, 448, 480, temp6, temp7)
+    VERTICAL_PASS_WHT(0,  8, 16, 24, temp0, temp2, temp4, temp6)
+    VERTICAL_PASS_WHT(4, 12, 20, 28, temp1, temp3, temp5, temp7)
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
+      [temp9]"=&r"(temp9)
+    : [in]"r"(in), [out]"r"(out)
+    : "memory"
+  );
+}
+
+#undef VERTICAL_PASS_WHT
+#undef HORIZONTAL_PASS_WHT
+
+// macro for converting coefficients to bin
+// convert 8 coeffs at time
+// A, B, C, D - offsets in bytes to load from out buffer
+#define CONVERT_COEFFS_TO_BIN(A, B, C, D)                                      \
+  "ulw        %[temp0],  " #A "(%[out])                \n\t"                   \
+  "ulw        %[temp1],  " #B "(%[out])                \n\t"                   \
+  "ulw        %[temp2],  " #C "(%[out])                \n\t"                   \
+  "ulw        %[temp3],  " #D "(%[out])                \n\t"                   \
+  "absq_s.ph  %[temp0],  %[temp0]                      \n\t"                   \
+  "absq_s.ph  %[temp1],  %[temp1]                      \n\t"                   \
+  "absq_s.ph  %[temp2],  %[temp2]                      \n\t"                   \
+  "absq_s.ph  %[temp3],  %[temp3]                      \n\t"                   \
+  "shra.ph    %[temp0],  %[temp0],    3                \n\t"                   \
+  "shra.ph    %[temp1],  %[temp1],    3                \n\t"                   \
+  "shra.ph    %[temp2],  %[temp2],    3                \n\t"                   \
+  "shra.ph    %[temp3],  %[temp3],    3                \n\t"                   \
+  "shll_s.ph  %[temp0],  %[temp0],    10               \n\t"                   \
+  "shll_s.ph  %[temp1],  %[temp1],    10               \n\t"                   \
+  "shll_s.ph  %[temp2],  %[temp2],    10               \n\t"                   \
+  "shll_s.ph  %[temp3],  %[temp3],    10               \n\t"                   \
+  "shrl.ph    %[temp0],  %[temp0],    10               \n\t"                   \
+  "shrl.ph    %[temp1],  %[temp1],    10               \n\t"                   \
+  "shrl.ph    %[temp2],  %[temp2],    10               \n\t"                   \
+  "shrl.ph    %[temp3],  %[temp3],    10               \n\t"                   \
+  "shll.ph    %[temp0],  %[temp0],    2                \n\t"                   \
+  "shll.ph    %[temp1],  %[temp1],    2                \n\t"                   \
+  "shll.ph    %[temp2],  %[temp2],    2                \n\t"                   \
+  "shll.ph    %[temp3],  %[temp3],    2                \n\t"                   \
+  "ext        %[temp4],  %[temp0],    0,       16      \n\t"                   \
+  "ext        %[temp0],  %[temp0],    16,      16      \n\t"                   \
+  "addu       %[temp4],  %[temp4],    %[dist]          \n\t"                   \
+  "addu       %[temp0],  %[temp0],    %[dist]          \n\t"                   \
+  "ext        %[temp5],  %[temp1],    0,       16      \n\t"                   \
+  "lw         %[temp8],  0(%[temp4])                   \n\t"                   \
+  "ext        %[temp1],  %[temp1],    16,      16      \n\t"                   \
+  "addu       %[temp5],  %[temp5],    %[dist]          \n\t"                   \
+  "addiu      %[temp8],  %[temp8],    1                \n\t"                   \
+  "sw         %[temp8],  0(%[temp4])                   \n\t"                   \
+  "lw         %[temp8],  0(%[temp0])                   \n\t"                   \
+  "addu       %[temp1],  %[temp1],    %[dist]          \n\t"                   \
+  "ext        %[temp6],  %[temp2],    0,       16      \n\t"                   \
+  "addiu      %[temp8],  %[temp8],    1                \n\t"                   \
+  "sw         %[temp8],  0(%[temp0])                   \n\t"                   \
+  "lw         %[temp8],  0(%[temp5])                   \n\t"                   \
+  "ext        %[temp2],  %[temp2],    16,      16      \n\t"                   \
+  "addu       %[temp6],  %[temp6],    %[dist]          \n\t"                   \
+  "addiu      %[temp8],  %[temp8],    1                \n\t"                   \
+  "sw         %[temp8],  0(%[temp5])                   \n\t"                   \
+  "lw         %[temp8],  0(%[temp1])                   \n\t"                   \
+  "addu       %[temp2],  %[temp2],    %[dist]          \n\t"                   \
+  "ext        %[temp7],  %[temp3],    0,       16      \n\t"                   \
+  "addiu      %[temp8],  %[temp8],    1                \n\t"                   \
+  "sw         %[temp8],  0(%[temp1])                   \n\t"                   \
+  "lw         %[temp8],  0(%[temp6])                   \n\t"                   \
+  "ext        %[temp3],  %[temp3],    16,      16      \n\t"                   \
+  "addu       %[temp7],  %[temp7],    %[dist]          \n\t"                   \
+  "addiu      %[temp8],  %[temp8],    1                \n\t"                   \
+  "sw         %[temp8],  0(%[temp6])                   \n\t"                   \
+  "lw         %[temp8],  0(%[temp2])                   \n\t"                   \
+  "addu       %[temp3],  %[temp3],    %[dist]          \n\t"                   \
+  "addiu      %[temp8],  %[temp8],    1                \n\t"                   \
+  "sw         %[temp8],  0(%[temp2])                   \n\t"                   \
+  "lw         %[temp8],  0(%[temp7])                   \n\t"                   \
+  "addiu      %[temp8],  %[temp8],    1                \n\t"                   \
+  "sw         %[temp8],  0(%[temp7])                   \n\t"                   \
+  "lw         %[temp8],  0(%[temp3])                   \n\t"                   \
+  "addiu      %[temp8],  %[temp8],    1                \n\t"                   \
+  "sw         %[temp8],  0(%[temp3])                   \n\t"
+
+static void CollectHistogram_MIPSdspR2(const uint8_t* ref, const uint8_t* pred,
+                                       int start_block, int end_block,
+                                       VP8Histogram* const histo) {
+  int j;
+  int distribution[MAX_COEFF_THRESH + 1] = { 0 };
+  const int max_coeff = (MAX_COEFF_THRESH << 16) + MAX_COEFF_THRESH;
+  for (j = start_block; j < end_block; ++j) {
+    int16_t out[16];
+    int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
+
+    VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
+
+    // Convert coefficients to bin.
+    __asm__ volatile (
+      CONVERT_COEFFS_TO_BIN( 0,  4,  8, 12)
+      CONVERT_COEFFS_TO_BIN(16, 20, 24, 28)
+      : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+        [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+        [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8)
+      : [dist]"r"(distribution), [out]"r"(out), [max_coeff]"r"(max_coeff)
+      : "memory"
+    );
+  }
+  VP8SetHistogramData(distribution, histo);
+}
+
+#undef CONVERT_COEFFS_TO_BIN
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8EncDspInitMIPSdspR2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMIPSdspR2(void) {
+  VP8FTransform = FTransform_MIPSdspR2;
+  VP8FTransformWHT = FTransformWHT_MIPSdspR2;
+  VP8ITransform = ITransform_MIPSdspR2;
+
+  VP8TDisto4x4 = Disto4x4_MIPSdspR2;
+  VP8TDisto16x16 = Disto16x16_MIPSdspR2;
+
+  VP8EncPredLuma16 = Intra16Preds_MIPSdspR2;
+  VP8EncPredChroma8 = IntraChromaPreds_MIPSdspR2;
+  VP8EncPredLuma4 = Intra4Preds_MIPSdspR2;
+
+#if !defined(WORK_AROUND_GCC)
+  VP8SSE16x16 = SSE16x16_MIPSdspR2;
+  VP8SSE8x8 = SSE8x8_MIPSdspR2;
+  VP8SSE16x8 = SSE16x8_MIPSdspR2;
+  VP8SSE4x4 = SSE4x4_MIPSdspR2;
+#endif
+
+  VP8EncQuantizeBlock = QuantizeBlock_MIPSdspR2;
+  VP8EncQuantize2Blocks = Quantize2Blocks_MIPSdspR2;
+
+  VP8CollectHistogram = CollectHistogram_MIPSdspR2;
+}
+
+#else  // !WEBP_USE_MIPS_DSP_R2
+
+WEBP_DSP_INIT_STUB(VP8EncDspInitMIPSdspR2)
+
+#endif  // WEBP_USE_MIPS_DSP_R2
diff --git a/media/libwebp/dsp/enc_msa.c b/media/libwebp/dsp/enc_msa.c
new file mode 100644
index 0000000000..229582e4a6
--- /dev/null
+++ b/media/libwebp/dsp/enc_msa.c
@@ -0,0 +1,896 @@
+// Copyright 2016 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MSA version of encoder dsp functions.
+//
+// Author:  Prashant Patil   (prashant.patil@imgtec.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_MSA)
+
+#include <stdlib.h>
+#include "../dsp/msa_macro.h"
+#include "../enc/vp8i_enc.h"
+
+//------------------------------------------------------------------------------
+// Transforms
+
+#define IDCT_1D_W(in0, in1, in2, in3, out0, out1, out2, out3) do {  \
+  v4i32 a1_m, b1_m, c1_m, d1_m;                                     \
+  const v4i32 cospi8sqrt2minus1 = __msa_fill_w(20091);              \
+  const v4i32 sinpi8sqrt2 = __msa_fill_w(35468);                    \
+  v4i32 c_tmp1_m = in1 * sinpi8sqrt2;                               \
+  v4i32 c_tmp2_m = in3 * cospi8sqrt2minus1;                         \
+  v4i32 d_tmp1_m = in1 * cospi8sqrt2minus1;                         \
+  v4i32 d_tmp2_m = in3 * sinpi8sqrt2;                               \
+                                                                    \
+  ADDSUB2(in0, in2, a1_m, b1_m);                                    \
+  SRAI_W2_SW(c_tmp1_m, c_tmp2_m, 16);                               \
+  c_tmp2_m = c_tmp2_m + in3;                                        \
+  c1_m = c_tmp1_m - c_tmp2_m;                                       \
+  SRAI_W2_SW(d_tmp1_m, d_tmp2_m, 16);                               \
+  d_tmp1_m = d_tmp1_m + in1;                                        \
+  d1_m = d_tmp1_m + d_tmp2_m;                                       \
+  BUTTERFLY_4(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3);      \
+} while (0)
+
+static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
+                                      uint8_t* dst) {
+  v8i16 input0, input1;
+  v4i32 in0, in1, in2, in3, hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3;
+  v4i32 res0, res1, res2, res3;
+  v16i8 dest0, dest1, dest2, dest3;
+  const v16i8 zero = { 0 };
+
+  LD_SH2(in, 8, input0, input1);
+  UNPCK_SH_SW(input0, in0, in1);
+  UNPCK_SH_SW(input1, in2, in3);
+  IDCT_1D_W(in0, in1, in2, in3, hz0, hz1, hz2, hz3);
+  TRANSPOSE4x4_SW_SW(hz0, hz1, hz2, hz3, hz0, hz1, hz2, hz3);
+  IDCT_1D_W(hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3);
+  SRARI_W4_SW(vt0, vt1, vt2, vt3, 3);
+  TRANSPOSE4x4_SW_SW(vt0, vt1, vt2, vt3, vt0, vt1, vt2, vt3);
+  LD_SB4(ref, BPS, dest0, dest1, dest2, dest3);
+  ILVR_B4_SW(zero, dest0, zero, dest1, zero, dest2, zero, dest3,
+             res0, res1, res2, res3);
+  ILVR_H4_SW(zero, res0, zero, res1, zero, res2, zero, res3,
+             res0, res1, res2, res3);
+  ADD4(res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0, res1, res2, res3);
+  CLIP_SW4_0_255(res0, res1, res2, res3);
+  PCKEV_B2_SW(res0, res1, res2, res3, vt0, vt1);
+  res0 = (v4i32)__msa_pckev_b((v16i8)vt0, (v16i8)vt1);
+  ST4x4_UB(res0, res0, 3, 2, 1, 0, dst, BPS);
+}
+
+static void ITransform_MSA(const uint8_t* ref, const int16_t* in, uint8_t* dst,
+                           int do_two) {
+  ITransformOne(ref, in, dst);
+  if (do_two) {
+    ITransformOne(ref + 4, in + 16, dst + 4);
+  }
+}
+
+static void FTransform_MSA(const uint8_t* src, const uint8_t* ref,
+                           int16_t* out) {
+  uint64_t out0, out1, out2, out3;
+  uint32_t in0, in1, in2, in3;
+  v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+  v8i16 t0, t1, t2, t3;
+  v16u8 srcl0, srcl1, src0 = { 0 }, src1 = { 0 };
+  const v8i16 mask0 = { 0, 4, 8, 12, 1, 5, 9, 13 };
+  const v8i16 mask1 = { 3, 7, 11, 15, 2, 6, 10, 14 };
+  const v8i16 mask2 = { 4, 0, 5, 1, 6, 2, 7, 3 };
+  const v8i16 mask3 = { 0, 4, 1, 5, 2, 6, 3, 7 };
+  const v8i16 cnst0 = { 2217, -5352, 2217, -5352, 2217, -5352, 2217, -5352 };
+  const v8i16 cnst1 = { 5352, 2217, 5352, 2217, 5352, 2217, 5352, 2217 };
+
+  LW4(src, BPS, in0, in1, in2, in3);
+  INSERT_W4_UB(in0, in1, in2, in3, src0);
+  LW4(ref, BPS, in0, in1, in2, in3);
+  INSERT_W4_UB(in0, in1, in2, in3, src1);
+  ILVRL_B2_UB(src0, src1, srcl0, srcl1);
+  HSUB_UB2_SH(srcl0, srcl1, t0, t1);
+  VSHF_H2_SH(t0, t1, t0, t1, mask0, mask1, t2, t3);
+  ADDSUB2(t2, t3, t0, t1);
+  t0 = SRLI_H(t0, 3);
+  VSHF_H2_SH(t0, t0, t1, t1, mask2, mask3, t3, t2);
+  tmp0 = __msa_hadd_s_w(t3, t3);
+  tmp2 = __msa_hsub_s_w(t3, t3);
+  FILL_W2_SW(1812, 937, tmp1, tmp3);
+  DPADD_SH2_SW(t2, t2, cnst0, cnst1, tmp3, tmp1);
+  SRAI_W2_SW(tmp1, tmp3, 9);
+  PCKEV_H2_SH(tmp1, tmp0, tmp3, tmp2, t0, t1);
+  VSHF_H2_SH(t0, t1, t0, t1, mask0, mask1, t2, t3);
+  ADDSUB2(t2, t3, t0, t1);
+  VSHF_H2_SH(t0, t0, t1, t1, mask2, mask3, t3, t2);
+  tmp0 = __msa_hadd_s_w(t3, t3);
+  tmp2 = __msa_hsub_s_w(t3, t3);
+  ADDVI_W2_SW(tmp0, 7, tmp2, 7, tmp0, tmp2);
+  SRAI_W2_SW(tmp0, tmp2, 4);
+  FILL_W2_SW(12000, 51000, tmp1, tmp3);
+  DPADD_SH2_SW(t2, t2, cnst0, cnst1, tmp3, tmp1);
+  SRAI_W2_SW(tmp1, tmp3, 16);
+  UNPCK_R_SH_SW(t1, tmp4);
+  tmp5 = __msa_ceqi_w(tmp4, 0);
+  tmp4 = (v4i32)__msa_nor_v((v16u8)tmp5, (v16u8)tmp5);
+  tmp5 = __msa_fill_w(1);
+  tmp5 = (v4i32)__msa_and_v((v16u8)tmp5, (v16u8)tmp4);
+  tmp1 += tmp5;
+  PCKEV_H2_SH(tmp1, tmp0, tmp3, tmp2, t0, t1);
+  out0 = __msa_copy_s_d((v2i64)t0, 0);
+  out1 = __msa_copy_s_d((v2i64)t0, 1);
+  out2 = __msa_copy_s_d((v2i64)t1, 0);
+  out3 = __msa_copy_s_d((v2i64)t1, 1);
+  SD4(out0, out1, out2, out3, out, 8);
+}
+
+static void FTransformWHT_MSA(const int16_t* in, int16_t* out) {
+  v8i16 in0 = { 0 };
+  v8i16 in1 = { 0 };
+  v8i16 tmp0, tmp1, tmp2, tmp3;
+  v8i16 out0, out1;
+  const v8i16 mask0 = { 0, 1, 2, 3, 8, 9, 10, 11 };
+  const v8i16 mask1 = { 4, 5, 6, 7, 12, 13, 14, 15 };
+  const v8i16 mask2 = { 0, 4, 8, 12, 1, 5, 9, 13 };
+  const v8i16 mask3 = { 3, 7, 11, 15, 2, 6, 10, 14 };
+
+  in0 = __msa_insert_h(in0, 0, in[  0]);
+  in0 = __msa_insert_h(in0, 1, in[ 64]);
+  in0 = __msa_insert_h(in0, 2, in[128]);
+  in0 = __msa_insert_h(in0, 3, in[192]);
+  in0 = __msa_insert_h(in0, 4, in[ 16]);
+  in0 = __msa_insert_h(in0, 5, in[ 80]);
+  in0 = __msa_insert_h(in0, 6, in[144]);
+  in0 = __msa_insert_h(in0, 7, in[208]);
+  in1 = __msa_insert_h(in1, 0, in[ 48]);
+  in1 = __msa_insert_h(in1, 1, in[112]);
+  in1 = __msa_insert_h(in1, 2, in[176]);
+  in1 = __msa_insert_h(in1, 3, in[240]);
+  in1 = __msa_insert_h(in1, 4, in[ 32]);
+  in1 = __msa_insert_h(in1, 5, in[ 96]);
+  in1 = __msa_insert_h(in1, 6, in[160]);
+  in1 = __msa_insert_h(in1, 7, in[224]);
+  ADDSUB2(in0, in1, tmp0, tmp1);
+  VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
+  ADDSUB2(tmp2, tmp3, tmp0, tmp1);
+  VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask2, mask3, in0, in1);
+  ADDSUB2(in0, in1, tmp0, tmp1);
+  VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
+  ADDSUB2(tmp2, tmp3, out0, out1);
+  SRAI_H2_SH(out0, out1, 1);
+  ST_SH2(out0, out1, out, 8);
+}
+
+static int TTransform_MSA(const uint8_t* in, const uint16_t* w) {
+  int sum;
+  uint32_t in0_m, in1_m, in2_m, in3_m;
+  v16i8 src0 = { 0 };
+  v8i16 in0, in1, tmp0, tmp1, tmp2, tmp3;
+  v4i32 dst0, dst1;
+  const v16i8 zero = { 0 };
+  const v8i16 mask0 = { 0, 1, 2, 3, 8, 9, 10, 11 };
+  const v8i16 mask1 = { 4, 5, 6, 7, 12, 13, 14, 15 };
+  const v8i16 mask2 = { 0, 4, 8, 12, 1, 5, 9, 13 };
+  const v8i16 mask3 = { 3, 7, 11, 15, 2, 6, 10, 14 };
+
+  LW4(in, BPS, in0_m, in1_m, in2_m, in3_m);
+  INSERT_W4_SB(in0_m, in1_m, in2_m, in3_m, src0);
+  ILVRL_B2_SH(zero, src0, tmp0, tmp1);
+  VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask2, mask3, in0, in1);
+  ADDSUB2(in0, in1, tmp0, tmp1);
+  VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
+  ADDSUB2(tmp2, tmp3, tmp0, tmp1);
+  VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask2, mask3, in0, in1);
+  ADDSUB2(in0, in1, tmp0, tmp1);
+  VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
+  ADDSUB2(tmp2, tmp3, tmp0, tmp1);
+  tmp0 = __msa_add_a_h(tmp0, (v8i16)zero);
+  tmp1 = __msa_add_a_h(tmp1, (v8i16)zero);
+  LD_SH2(w, 8, tmp2, tmp3);
+  DOTP_SH2_SW(tmp0, tmp1, tmp2, tmp3, dst0, dst1);
+  dst0 = dst0 + dst1;
+  sum = HADD_SW_S32(dst0);
+  return sum;
+}
+
+static int Disto4x4_MSA(const uint8_t* const a, const uint8_t* const b,
+                        const uint16_t* const w) {
+  const int sum1 = TTransform_MSA(a, w);
+  const int sum2 = TTransform_MSA(b, w);
+  return abs(sum2 - sum1) >> 5;
+}
+
+static int Disto16x16_MSA(const uint8_t* const a, const uint8_t* const b,
+                          const uint16_t* const w) {
+  int D = 0;
+  int x, y;
+  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
+    for (x = 0; x < 16; x += 4) {
+      D += Disto4x4_MSA(a + x + y, b + x + y, w);
+    }
+  }
+  return D;
+}
+
+//------------------------------------------------------------------------------
+// Histogram
+
+static void CollectHistogram_MSA(const uint8_t* ref, const uint8_t* pred,
+                                 int start_block, int end_block,
+                                 VP8Histogram* const histo) {
+  int j;
+  int distribution[MAX_COEFF_THRESH + 1] = { 0 };
+  for (j = start_block; j < end_block; ++j) {
+    int16_t out[16];
+    VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
+    {
+      int k;
+      v8i16 coeff0, coeff1;
+      const v8i16 zero = { 0 };
+      const v8i16 max_coeff_thr = __msa_ldi_h(MAX_COEFF_THRESH);
+      LD_SH2(&out[0], 8, coeff0, coeff1);
+      coeff0 = __msa_add_a_h(coeff0, zero);
+      coeff1 = __msa_add_a_h(coeff1, zero);
+      SRAI_H2_SH(coeff0, coeff1, 3);
+      coeff0 = __msa_min_s_h(coeff0, max_coeff_thr);
+      coeff1 = __msa_min_s_h(coeff1, max_coeff_thr);
+      ST_SH2(coeff0, coeff1, &out[0], 8);
+      for (k = 0; k < 16; ++k) {
+        ++distribution[out[k]];
+      }
+    }
+  }
+  VP8SetHistogramData(distribution, histo);
+}
+
+//------------------------------------------------------------------------------
+// Intra predictions
+
+// luma 4x4 prediction
+
+#define DST(x, y) dst[(x) + (y) * BPS]
+#define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
+#define AVG2(a, b) (((a) + (b) + 1) >> 1)
+
+static WEBP_INLINE void VE4(uint8_t* dst, const uint8_t* top) {    // vertical
+  const v16u8 A1 = { 0 };
+  const uint64_t val_m = LD(top - 1);
+  const v16u8 A = (v16u8)__msa_insert_d((v2i64)A1, 0, val_m);
+  const v16u8 B = SLDI_UB(A, A, 1);
+  const v16u8 C = SLDI_UB(A, A, 2);
+  const v16u8 AC = __msa_ave_u_b(A, C);
+  const v16u8 B2 = __msa_ave_u_b(B, B);
+  const v16u8 R = __msa_aver_u_b(AC, B2);
+  const uint32_t out = __msa_copy_s_w((v4i32)R, 0);
+  SW4(out, out, out, out, dst, BPS);
+}
+
+static WEBP_INLINE void HE4(uint8_t* dst, const uint8_t* top) {    // horizontal
+  const int X = top[-1];
+  const int I = top[-2];
+  const int J = top[-3];
+  const int K = top[-4];
+  const int L = top[-5];
+  WebPUint32ToMem(dst + 0 * BPS, 0x01010101U * AVG3(X, I, J));
+  WebPUint32ToMem(dst + 1 * BPS, 0x01010101U * AVG3(I, J, K));
+  WebPUint32ToMem(dst + 2 * BPS, 0x01010101U * AVG3(J, K, L));
+  WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(K, L, L));
+}
+
+static WEBP_INLINE void DC4(uint8_t* dst, const uint8_t* top) {
+  uint32_t dc = 4;
+  int i;
+  for (i = 0; i < 4; ++i) dc += top[i] + top[-5 + i];
+  dc >>= 3;
+  dc = dc | (dc << 8) | (dc << 16) | (dc << 24);
+  SW4(dc, dc, dc, dc, dst, BPS);
+}
+
+static WEBP_INLINE void RD4(uint8_t* dst, const uint8_t* top) {
+  const v16u8 A2 = { 0 };
+  const uint64_t val_m = LD(top - 5);
+  const v16u8 A1 = (v16u8)__msa_insert_d((v2i64)A2, 0, val_m);
+  const v16u8 A = (v16u8)__msa_insert_b((v16i8)A1, 8, top[3]);
+  const v16u8 B = SLDI_UB(A, A, 1);
+  const v16u8 C = SLDI_UB(A, A, 2);
+  const v16u8 AC = __msa_ave_u_b(A, C);
+  const v16u8 B2 = __msa_ave_u_b(B, B);
+  const v16u8 R0 = __msa_aver_u_b(AC, B2);
+  const v16u8 R1 = SLDI_UB(R0, R0, 1);
+  const v16u8 R2 = SLDI_UB(R1, R1, 1);
+  const v16u8 R3 = SLDI_UB(R2, R2, 1);
+  const uint32_t val0 = __msa_copy_s_w((v4i32)R0, 0);
+  const uint32_t val1 = __msa_copy_s_w((v4i32)R1, 0);
+  const uint32_t val2 = __msa_copy_s_w((v4i32)R2, 0);
+  const uint32_t val3 = __msa_copy_s_w((v4i32)R3, 0);
+  SW4(val3, val2, val1, val0, dst, BPS);
+}
+
+static WEBP_INLINE void LD4(uint8_t* dst, const uint8_t* top) {
+  const v16u8 A1 = { 0 };
+  const uint64_t val_m = LD(top);
+  const v16u8 A = (v16u8)__msa_insert_d((v2i64)A1, 0, val_m);
+  const v16u8 B = SLDI_UB(A, A, 1);
+  const v16u8 C1 = SLDI_UB(A, A, 2);
+  const v16u8 C = (v16u8)__msa_insert_b((v16i8)C1, 6, top[7]);
+  const v16u8 AC = __msa_ave_u_b(A, C);
+  const v16u8 B2 = __msa_ave_u_b(B, B);
+  const v16u8 R0 = __msa_aver_u_b(AC, B2);
+  const v16u8 R1 = SLDI_UB(R0, R0, 1);
+  const v16u8 R2 = SLDI_UB(R1, R1, 1);
+  const v16u8 R3 = SLDI_UB(R2, R2, 1);
+  const uint32_t val0 = __msa_copy_s_w((v4i32)R0, 0);
+  const uint32_t val1 = __msa_copy_s_w((v4i32)R1, 0);
+  const uint32_t val2 = __msa_copy_s_w((v4i32)R2, 0);
+  const uint32_t val3 = __msa_copy_s_w((v4i32)R3, 0);
+  SW4(val0, val1, val2, val3, dst, BPS);
+}
+
+static WEBP_INLINE void VR4(uint8_t* dst, const uint8_t* top) {
+  const int X = top[-1];
+  const int I = top[-2];
+  const int J = top[-3];
+  const int K = top[-4];
+  const int A = top[0];
+  const int B = top[1];
+  const int C = top[2];
+  const int D = top[3];
+  DST(0, 0) = DST(1, 2) = AVG2(X, A);
+  DST(1, 0) = DST(2, 2) = AVG2(A, B);
+  DST(2, 0) = DST(3, 2) = AVG2(B, C);
+  DST(3, 0)             = AVG2(C, D);
+  DST(0, 3) =             AVG3(K, J, I);
+  DST(0, 2) =             AVG3(J, I, X);
+  DST(0, 1) = DST(1, 3) = AVG3(I, X, A);
+  DST(1, 1) = DST(2, 3) = AVG3(X, A, B);
+  DST(2, 1) = DST(3, 3) = AVG3(A, B, C);
+  DST(3, 1) =             AVG3(B, C, D);
+}
+
+static WEBP_INLINE void VL4(uint8_t* dst, const uint8_t* top) {
+  const int A = top[0];
+  const int B = top[1];
+  const int C = top[2];
+  const int D = top[3];
+  const int E = top[4];
+  const int F = top[5];
+  const int G = top[6];
+  const int H = top[7];
+  DST(0, 0) =             AVG2(A, B);
+  DST(1, 0) = DST(0, 2) = AVG2(B, C);
+  DST(2, 0) = DST(1, 2) = AVG2(C, D);
+  DST(3, 0) = DST(2, 2) = AVG2(D, E);
+  DST(0, 1) =             AVG3(A, B, C);
+  DST(1, 1) = DST(0, 3) = AVG3(B, C, D);
+  DST(2, 1) = DST(1, 3) = AVG3(C, D, E);
+  DST(3, 1) = DST(2, 3) = AVG3(D, E, F);
+              DST(3, 2) = AVG3(E, F, G);
+              DST(3, 3) = AVG3(F, G, H);
+}
+
+static WEBP_INLINE void HU4(uint8_t* dst, const uint8_t* top) {
+  const int I = top[-2];
+  const int J = top[-3];
+  const int K = top[-4];
+  const int L = top[-5];
+  DST(0, 0) =             AVG2(I, J);
+  DST(2, 0) = DST(0, 1) = AVG2(J, K);
+  DST(2, 1) = DST(0, 2) = AVG2(K, L);
+  DST(1, 0) =             AVG3(I, J, K);
+  DST(3, 0) = DST(1, 1) = AVG3(J, K, L);
+  DST(3, 1) = DST(1, 2) = AVG3(K, L, L);
+  DST(3, 2) = DST(2, 2) =
+  DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
+}
+
+static WEBP_INLINE void HD4(uint8_t* dst, const uint8_t* top) {
+  const int X = top[-1];
+  const int I = top[-2];
+  const int J = top[-3];
+  const int K = top[-4];
+  const int L = top[-5];
+  const int A = top[0];
+  const int B = top[1];
+  const int C = top[2];
+  DST(0, 0) = DST(2, 1) = AVG2(I, X);
+  DST(0, 1) = DST(2, 2) = AVG2(J, I);
+  DST(0, 2) = DST(2, 3) = AVG2(K, J);
+  DST(0, 3)             = AVG2(L, K);
+  DST(3, 0)             = AVG3(A, B, C);
+  DST(2, 0)             = AVG3(X, A, B);
+  DST(1, 0) = DST(3, 1) = AVG3(I, X, A);
+  DST(1, 1) = DST(3, 2) = AVG3(J, I, X);
+  DST(1, 2) = DST(3, 3) = AVG3(K, J, I);
+  DST(1, 3)             = AVG3(L, K, J);
+}
+
+static WEBP_INLINE void TM4(uint8_t* dst, const uint8_t* top) {
+  const v16i8 zero = { 0 };
+  const v8i16 TL = (v8i16)__msa_fill_h(top[-1]);
+  const v8i16 L0 = (v8i16)__msa_fill_h(top[-2]);
+  const v8i16 L1 = (v8i16)__msa_fill_h(top[-3]);
+  const v8i16 L2 = (v8i16)__msa_fill_h(top[-4]);
+  const v8i16 L3 = (v8i16)__msa_fill_h(top[-5]);
+  const v16u8 T1 = LD_UB(top);
+  const v8i16 T  = (v8i16)__msa_ilvr_b(zero, (v16i8)T1);
+  const v8i16 d = T - TL;
+  v8i16 r0, r1, r2, r3;
+  ADD4(d, L0, d, L1, d, L2, d, L3, r0, r1, r2, r3);
+  CLIP_SH4_0_255(r0, r1, r2, r3);
+  PCKEV_ST4x4_UB(r0, r1, r2, r3, dst, BPS);
+}
+
+#undef DST
+#undef AVG3
+#undef AVG2
+
+static void Intra4Preds_MSA(uint8_t* dst, const uint8_t* top) {
+  DC4(I4DC4 + dst, top);
+  TM4(I4TM4 + dst, top);
+  VE4(I4VE4 + dst, top);
+  HE4(I4HE4 + dst, top);
+  RD4(I4RD4 + dst, top);
+  VR4(I4VR4 + dst, top);
+  LD4(I4LD4 + dst, top);
+  VL4(I4VL4 + dst, top);
+  HD4(I4HD4 + dst, top);
+  HU4(I4HU4 + dst, top);
+}
+
+// luma 16x16 prediction
+
+#define STORE16x16(out, dst) do {                                        \
+    ST_UB8(out, out, out, out, out, out, out, out, dst + 0 * BPS, BPS);  \
+    ST_UB8(out, out, out, out, out, out, out, out, dst + 8 * BPS, BPS);  \
+} while (0)
+
+static WEBP_INLINE void VerticalPred16x16(uint8_t* dst, const uint8_t* top) {
+  if (top != NULL) {
+    const v16u8 out = LD_UB(top);
+    STORE16x16(out, dst);
+  } else {
+    const v16u8 out = (v16u8)__msa_fill_b(0x7f);
+    STORE16x16(out, dst);
+  }
+}
+
+static WEBP_INLINE void HorizontalPred16x16(uint8_t* dst,
+                                            const uint8_t* left) {
+  if (left != NULL) {
+    int j;
+    for (j = 0; j < 16; j += 4) {
+      const v16u8 L0 = (v16u8)__msa_fill_b(left[0]);
+      const v16u8 L1 = (v16u8)__msa_fill_b(left[1]);
+      const v16u8 L2 = (v16u8)__msa_fill_b(left[2]);
+      const v16u8 L3 = (v16u8)__msa_fill_b(left[3]);
+      ST_UB4(L0, L1, L2, L3, dst, BPS);
+      dst += 4 * BPS;
+      left += 4;
+    }
+  } else {
+    const v16u8 out = (v16u8)__msa_fill_b(0x81);
+    STORE16x16(out, dst);
+  }
+}
+
+static WEBP_INLINE void TrueMotion16x16(uint8_t* dst, const uint8_t* left,
+                                        const uint8_t* top) {
+  if (left != NULL) {
+    if (top != NULL) {
+      int j;
+      v8i16 d1, d2;
+      const v16i8 zero = { 0 };
+      const v8i16 TL = (v8i16)__msa_fill_h(left[-1]);
+      const v16u8 T = LD_UB(top);
+      ILVRL_B2_SH(zero, T, d1, d2);
+      SUB2(d1, TL, d2, TL, d1, d2);
+      for (j = 0; j < 16; j += 4) {
+        v16i8 t0, t1, t2, t3;
+        v8i16 r0, r1, r2, r3, r4, r5, r6, r7;
+        const v8i16 L0 = (v8i16)__msa_fill_h(left[j + 0]);
+        const v8i16 L1 = (v8i16)__msa_fill_h(left[j + 1]);
+        const v8i16 L2 = (v8i16)__msa_fill_h(left[j + 2]);
+        const v8i16 L3 = (v8i16)__msa_fill_h(left[j + 3]);
+        ADD4(d1, L0, d1, L1, d1, L2, d1, L3, r0, r1, r2, r3);
+        ADD4(d2, L0, d2, L1, d2, L2, d2, L3, r4, r5, r6, r7);
+        CLIP_SH4_0_255(r0, r1, r2, r3);
+        CLIP_SH4_0_255(r4, r5, r6, r7);
+        PCKEV_B4_SB(r4, r0, r5, r1, r6, r2, r7, r3, t0, t1, t2, t3);
+        ST_SB4(t0, t1, t2, t3, dst, BPS);
+        dst += 4 * BPS;
+      }
+    } else {
+      HorizontalPred16x16(dst, left);
+    }
+  } else {
+    if (top != NULL) {
+      VerticalPred16x16(dst, top);
+    } else {
+      const v16u8 out = (v16u8)__msa_fill_b(0x81);
+      STORE16x16(out, dst);
+    }
+  }
+}
+
+static WEBP_INLINE void DCMode16x16(uint8_t* dst, const uint8_t* left,
+                                    const uint8_t* top) {
+  int DC;
+  v16u8 out;
+  if (top != NULL && left != NULL) {
+    const v16u8 rtop = LD_UB(top);
+    const v8u16 dctop = __msa_hadd_u_h(rtop, rtop);
+    const v16u8 rleft = LD_UB(left);
+    const v8u16 dcleft = __msa_hadd_u_h(rleft, rleft);
+    const v8u16 dctemp = dctop + dcleft;
+    DC = HADD_UH_U32(dctemp);
+    DC = (DC + 16) >> 5;
+  } else if (left != NULL) {   // left but no top
+    const v16u8 rleft = LD_UB(left);
+    const v8u16 dcleft = __msa_hadd_u_h(rleft, rleft);
+    DC = HADD_UH_U32(dcleft);
+    DC = (DC + DC + 16) >> 5;
+  } else if (top != NULL) {   // top but no left
+    const v16u8 rtop = LD_UB(top);
+    const v8u16 dctop = __msa_hadd_u_h(rtop, rtop);
+    DC = HADD_UH_U32(dctop);
+    DC = (DC + DC + 16) >> 5;
+  } else {   // no top, no left, nothing.
+    DC = 0x80;
+  }
+  out = (v16u8)__msa_fill_b(DC);
+  STORE16x16(out, dst);
+}
+
+static void Intra16Preds_MSA(uint8_t* dst,
+                             const uint8_t* left, const uint8_t* top) {
+  DCMode16x16(I16DC16 + dst, left, top);
+  VerticalPred16x16(I16VE16 + dst, top);
+  HorizontalPred16x16(I16HE16 + dst, left);
+  TrueMotion16x16(I16TM16 + dst, left, top);
+}
+
+// Chroma 8x8 prediction
+
+#define CALC_DC8(in, out) do {                              \
+  const v8u16 temp0 = __msa_hadd_u_h(in, in);               \
+  const v4u32 temp1 = __msa_hadd_u_w(temp0, temp0);         \
+  const v2i64 temp2 = (v2i64)__msa_hadd_u_d(temp1, temp1);  \
+  const v2i64 temp3 = __msa_splati_d(temp2, 1);             \
+  const v2i64 temp4 = temp3 + temp2;                        \
+  const v16i8 temp5 = (v16i8)__msa_srari_d(temp4, 4);       \
+  const v2i64 temp6 = (v2i64)__msa_splati_b(temp5, 0);      \
+  out = __msa_copy_s_d(temp6, 0);                           \
+} while (0)
+
+#define STORE8x8(out, dst) do {                 \
+  SD4(out, out, out, out, dst + 0 * BPS, BPS);  \
+  SD4(out, out, out, out, dst + 4 * BPS, BPS);  \
+} while (0)
+
+static WEBP_INLINE void VerticalPred8x8(uint8_t* dst, const uint8_t* top) {
+  if (top != NULL) {
+    const uint64_t out = LD(top);
+    STORE8x8(out, dst);
+  } else {
+    const uint64_t out = 0x7f7f7f7f7f7f7f7fULL;
+    STORE8x8(out, dst);
+  }
+}
+
+static WEBP_INLINE void HorizontalPred8x8(uint8_t* dst, const uint8_t* left) {
+  if (left != NULL) {
+    int j;
+    for (j = 0; j < 8; j += 4) {
+      const v16u8 L0 = (v16u8)__msa_fill_b(left[0]);
+      const v16u8 L1 = (v16u8)__msa_fill_b(left[1]);
+      const v16u8 L2 = (v16u8)__msa_fill_b(left[2]);
+      const v16u8 L3 = (v16u8)__msa_fill_b(left[3]);
+      const uint64_t out0 = __msa_copy_s_d((v2i64)L0, 0);
+      const uint64_t out1 = __msa_copy_s_d((v2i64)L1, 0);
+      const uint64_t out2 = __msa_copy_s_d((v2i64)L2, 0);
+      const uint64_t out3 = __msa_copy_s_d((v2i64)L3, 0);
+      SD4(out0, out1, out2, out3, dst, BPS);
+      dst += 4 * BPS;
+      left += 4;
+    }
+  } else {
+    const uint64_t out = 0x8181818181818181ULL;
+    STORE8x8(out, dst);
+  }
+}
+
+static WEBP_INLINE void TrueMotion8x8(uint8_t* dst, const uint8_t* left,
+                                      const uint8_t* top) {
+  if (left != NULL) {
+    if (top != NULL) {
+      int j;
+      const v8i16 TL = (v8i16)__msa_fill_h(left[-1]);
+      const v16u8 T1 = LD_UB(top);
+      const v16i8 zero = { 0 };
+      const v8i16 T  = (v8i16)__msa_ilvr_b(zero, (v16i8)T1);
+      const v8i16 d = T - TL;
+      for (j = 0; j < 8; j += 4) {
+        uint64_t out0, out1, out2, out3;
+        v16i8 t0, t1;
+        v8i16 r0 = (v8i16)__msa_fill_h(left[j + 0]);
+        v8i16 r1 = (v8i16)__msa_fill_h(left[j + 1]);
+        v8i16 r2 = (v8i16)__msa_fill_h(left[j + 2]);
+        v8i16 r3 = (v8i16)__msa_fill_h(left[j + 3]);
+        ADD4(d, r0, d, r1, d, r2, d, r3, r0, r1, r2, r3);
+        CLIP_SH4_0_255(r0, r1, r2, r3);
+        PCKEV_B2_SB(r1, r0, r3, r2, t0, t1);
+        out0 = __msa_copy_s_d((v2i64)t0, 0);
+        out1 = __msa_copy_s_d((v2i64)t0, 1);
+        out2 = __msa_copy_s_d((v2i64)t1, 0);
+        out3 = __msa_copy_s_d((v2i64)t1, 1);
+        SD4(out0, out1, out2, out3, dst, BPS);
+        dst += 4 * BPS;
+      }
+    } else {
+      HorizontalPred8x8(dst, left);
+    }
+  } else {
+    if (top != NULL) {
+      VerticalPred8x8(dst, top);
+    } else {
+      const uint64_t out = 0x8181818181818181ULL;
+      STORE8x8(out, dst);
+    }
+  }
+}
+
+static WEBP_INLINE void DCMode8x8(uint8_t* dst, const uint8_t* left,
+                                  const uint8_t* top) {
+  uint64_t out;
+  v16u8 src = { 0 };
+  if (top != NULL && left != NULL) {
+    const uint64_t left_m = LD(left);
+    const uint64_t top_m = LD(top);
+    INSERT_D2_UB(left_m, top_m, src);
+    CALC_DC8(src, out);
+  } else if (left != NULL) {   // left but no top
+    const uint64_t left_m = LD(left);
+    INSERT_D2_UB(left_m, left_m, src);
+    CALC_DC8(src, out);
+  } else if (top != NULL) {   // top but no left
+    const uint64_t top_m = LD(top);
+    INSERT_D2_UB(top_m, top_m, src);
+    CALC_DC8(src, out);
+  } else {   // no top, no left, nothing.
+    src = (v16u8)__msa_fill_b(0x80);
+    out = __msa_copy_s_d((v2i64)src, 0);
+  }
+  STORE8x8(out, dst);
+}
+
+static void IntraChromaPreds_MSA(uint8_t* dst, const uint8_t* left,
+                                 const uint8_t* top) {
+  // U block
+  DCMode8x8(C8DC8 + dst, left, top);
+  VerticalPred8x8(C8VE8 + dst, top);
+  HorizontalPred8x8(C8HE8 + dst, left);
+  TrueMotion8x8(C8TM8 + dst, left, top);
+  // V block
+  dst += 8;
+  if (top != NULL) top += 8;
+  if (left != NULL) left += 16;
+  DCMode8x8(C8DC8 + dst, left, top);
+  VerticalPred8x8(C8VE8 + dst, top);
+  HorizontalPred8x8(C8HE8 + dst, left);
+  TrueMotion8x8(C8TM8 + dst, left, top);
+}
+
+//------------------------------------------------------------------------------
+// Metric
+
+#define PACK_DOTP_UB4_SW(in0, in1, in2, in3, out0, out1, out2, out3) do {  \
+  v16u8 tmp0, tmp1;                                                        \
+  v8i16 tmp2, tmp3;                                                        \
+  ILVRL_B2_UB(in0, in1, tmp0, tmp1);                                       \
+  HSUB_UB2_SH(tmp0, tmp1, tmp2, tmp3);                                     \
+  DOTP_SH2_SW(tmp2, tmp3, tmp2, tmp3, out0, out1);                         \
+  ILVRL_B2_UB(in2, in3, tmp0, tmp1);                                       \
+  HSUB_UB2_SH(tmp0, tmp1, tmp2, tmp3);                                     \
+  DOTP_SH2_SW(tmp2, tmp3, tmp2, tmp3, out2, out3);                         \
+} while (0)
+
+#define PACK_DPADD_UB4_SW(in0, in1, in2, in3, out0, out1, out2, out3) do {  \
+  v16u8 tmp0, tmp1;                                                         \
+  v8i16 tmp2, tmp3;                                                         \
+  ILVRL_B2_UB(in0, in1, tmp0, tmp1);                                        \
+  HSUB_UB2_SH(tmp0, tmp1, tmp2, tmp3);                                      \
+  DPADD_SH2_SW(tmp2, tmp3, tmp2, tmp3, out0, out1);                         \
+  ILVRL_B2_UB(in2, in3, tmp0, tmp1);                                        \
+  HSUB_UB2_SH(tmp0, tmp1, tmp2, tmp3);                                      \
+  DPADD_SH2_SW(tmp2, tmp3, tmp2, tmp3, out2, out3);                         \
+} while (0)
+
+static int SSE16x16_MSA(const uint8_t* a, const uint8_t* b) {
+  uint32_t sum;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+  v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
+  v4i32 out0, out1, out2, out3;
+
+  LD_UB8(a, BPS, src0, src1, src2, src3, src4, src5, src6, src7);
+  LD_UB8(b, BPS, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
+  PACK_DOTP_UB4_SW(src0, ref0, src1, ref1, out0, out1, out2, out3);
+  PACK_DPADD_UB4_SW(src2, ref2, src3, ref3, out0, out1, out2, out3);
+  PACK_DPADD_UB4_SW(src4, ref4, src5, ref5, out0, out1, out2, out3);
+  PACK_DPADD_UB4_SW(src6, ref6, src7, ref7, out0, out1, out2, out3);
+  a += 8 * BPS;
+  b += 8 * BPS;
+  LD_UB8(a, BPS, src0, src1, src2, src3, src4, src5, src6, src7);
+  LD_UB8(b, BPS, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
+  PACK_DPADD_UB4_SW(src0, ref0, src1, ref1, out0, out1, out2, out3);
+  PACK_DPADD_UB4_SW(src2, ref2, src3, ref3, out0, out1, out2, out3);
+  PACK_DPADD_UB4_SW(src4, ref4, src5, ref5, out0, out1, out2, out3);
+  PACK_DPADD_UB4_SW(src6, ref6, src7, ref7, out0, out1, out2, out3);
+  out0 += out1;
+  out2 += out3;
+  out0 += out2;
+  sum = HADD_SW_S32(out0);
+  return sum;
+}
+
+static int SSE16x8_MSA(const uint8_t* a, const uint8_t* b) {
+  uint32_t sum;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+  v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
+  v4i32 out0, out1, out2, out3;
+
+  LD_UB8(a, BPS, src0, src1, src2, src3, src4, src5, src6, src7);
+  LD_UB8(b, BPS, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
+  PACK_DOTP_UB4_SW(src0, ref0, src1, ref1, out0, out1, out2, out3);
+  PACK_DPADD_UB4_SW(src2, ref2, src3, ref3, out0, out1, out2, out3);
+  PACK_DPADD_UB4_SW(src4, ref4, src5, ref5, out0, out1, out2, out3);
+  PACK_DPADD_UB4_SW(src6, ref6, src7, ref7, out0, out1, out2, out3);
+  out0 += out1;
+  out2 += out3;
+  out0 += out2;
+  sum = HADD_SW_S32(out0);
+  return sum;
+}
+
+static int SSE8x8_MSA(const uint8_t* a, const uint8_t* b) {
+  uint32_t sum;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+  v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
+  v16u8 t0, t1, t2, t3;
+  v4i32 out0, out1, out2, out3;
+
+  LD_UB8(a, BPS, src0, src1, src2, src3, src4, src5, src6, src7);
+  LD_UB8(b, BPS, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
+  ILVR_B4_UB(src0, src1, src2, src3, ref0, ref1, ref2, ref3, t0, t1, t2, t3);
+  PACK_DOTP_UB4_SW(t0, t2, t1, t3, out0, out1, out2, out3);
+  ILVR_B4_UB(src4, src5, src6, src7, ref4, ref5, ref6, ref7, t0, t1, t2, t3);
+  PACK_DPADD_UB4_SW(t0, t2, t1, t3, out0, out1, out2, out3);
+  out0 += out1;
+  out2 += out3;
+  out0 += out2;
+  sum = HADD_SW_S32(out0);
+  return sum;
+}
+
+static int SSE4x4_MSA(const uint8_t* a, const uint8_t* b) {
+  uint32_t sum = 0;
+  uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
+  v16u8 src = { 0 }, ref = { 0 }, tmp0, tmp1;
+  v8i16 diff0, diff1;
+  v4i32 out0, out1;
+
+  LW4(a, BPS, src0, src1, src2, src3);
+  LW4(b, BPS, ref0, ref1, ref2, ref3);
+  INSERT_W4_UB(src0, src1, src2, src3, src);
+  INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+  ILVRL_B2_UB(src, ref, tmp0, tmp1);
+  HSUB_UB2_SH(tmp0, tmp1, diff0, diff1);
+  DOTP_SH2_SW(diff0, diff1, diff0, diff1, out0, out1);
+  out0 += out1;
+  sum = HADD_SW_S32(out0);
+  return sum;
+}
+
+//------------------------------------------------------------------------------
+// Quantization
+
+static int QuantizeBlock_MSA(int16_t in[16], int16_t out[16],
+                             const VP8Matrix* const mtx) {
+  int sum;
+  v8i16 in0, in1, sh0, sh1, out0, out1;
+  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, sign0, sign1;
+  v4i32 s0, s1, s2, s3, b0, b1, b2, b3, t0, t1, t2, t3;
+  const v8i16 zero = { 0 };
+  const v8i16 zigzag0 = { 0, 1, 4, 8, 5, 2, 3, 6 };
+  const v8i16 zigzag1 = { 9, 12, 13, 10, 7, 11, 14, 15 };
+  const v8i16 maxlevel = __msa_fill_h(MAX_LEVEL);
+
+  LD_SH2(&in[0], 8, in0, in1);
+  LD_SH2(&mtx->sharpen_[0], 8, sh0, sh1);
+  tmp4 = __msa_add_a_h(in0, zero);
+  tmp5 = __msa_add_a_h(in1, zero);
+  ILVRL_H2_SH(sh0, tmp4, tmp0, tmp1);
+  ILVRL_H2_SH(sh1, tmp5, tmp2, tmp3);
+  HADD_SH4_SW(tmp0, tmp1, tmp2, tmp3, s0, s1, s2, s3);
+  sign0 = (in0 < zero);
+  sign1 = (in1 < zero);                           // sign
+  LD_SH2(&mtx->iq_[0], 8, tmp0, tmp1);            // iq
+  ILVRL_H2_SW(zero, tmp0, t0, t1);
+  ILVRL_H2_SW(zero, tmp1, t2, t3);
+  LD_SW4(&mtx->bias_[0], 4, b0, b1, b2, b3);      // bias
+  MUL4(t0, s0, t1, s1, t2, s2, t3, s3, t0, t1, t2, t3);
+  ADD4(b0, t0, b1, t1, b2, t2, b3, t3, b0, b1, b2, b3);
+  SRAI_W4_SW(b0, b1, b2, b3, 17);
+  PCKEV_H2_SH(b1, b0, b3, b2, tmp2, tmp3);
+  tmp0 = (tmp2 > maxlevel);
+  tmp1 = (tmp3 > maxlevel);
+  tmp2 = (v8i16)__msa_bmnz_v((v16u8)tmp2, (v16u8)maxlevel, (v16u8)tmp0);
+  tmp3 = (v8i16)__msa_bmnz_v((v16u8)tmp3, (v16u8)maxlevel, (v16u8)tmp1);
+  SUB2(zero, tmp2, zero, tmp3, tmp0, tmp1);
+  tmp2 = (v8i16)__msa_bmnz_v((v16u8)tmp2, (v16u8)tmp0, (v16u8)sign0);
+  tmp3 = (v8i16)__msa_bmnz_v((v16u8)tmp3, (v16u8)tmp1, (v16u8)sign1);
+  LD_SW4(&mtx->zthresh_[0], 4, t0, t1, t2, t3);   // zthresh
+  t0 = (s0 > t0);
+  t1 = (s1 > t1);
+  t2 = (s2 > t2);
+  t3 = (s3 > t3);
+  PCKEV_H2_SH(t1, t0, t3, t2, tmp0, tmp1);
+  tmp4 = (v8i16)__msa_bmnz_v((v16u8)zero, (v16u8)tmp2, (v16u8)tmp0);
+  tmp5 = (v8i16)__msa_bmnz_v((v16u8)zero, (v16u8)tmp3, (v16u8)tmp1);
+  LD_SH2(&mtx->q_[0], 8, tmp0, tmp1);
+  MUL2(tmp4, tmp0, tmp5, tmp1, in0, in1);
+  VSHF_H2_SH(tmp4, tmp5, tmp4, tmp5, zigzag0, zigzag1, out0, out1);
+  ST_SH2(in0, in1, &in[0], 8);
+  ST_SH2(out0, out1, &out[0], 8);
+  out0 = __msa_add_a_h(out0, out1);
+  sum = HADD_SH_S32(out0);
+  return (sum > 0);
+}
+
+static int Quantize2Blocks_MSA(int16_t in[32], int16_t out[32],
+                               const VP8Matrix* const mtx) {
+  int nz;
+  nz  = VP8EncQuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
+  nz |= VP8EncQuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
+  return nz;
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8EncDspInitMSA(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMSA(void) {
+  VP8ITransform = ITransform_MSA;
+  VP8FTransform = FTransform_MSA;
+  VP8FTransformWHT = FTransformWHT_MSA;
+
+  VP8TDisto4x4 = Disto4x4_MSA;
+  VP8TDisto16x16 = Disto16x16_MSA;
+  VP8CollectHistogram = CollectHistogram_MSA;
+
+  VP8EncPredLuma4 = Intra4Preds_MSA;
+  VP8EncPredLuma16 = Intra16Preds_MSA;
+  VP8EncPredChroma8 = IntraChromaPreds_MSA;
+
+  VP8SSE16x16 = SSE16x16_MSA;
+  VP8SSE16x8 = SSE16x8_MSA;
+  VP8SSE8x8 = SSE8x8_MSA;
+  VP8SSE4x4 = SSE4x4_MSA;
+
+  VP8EncQuantizeBlock = QuantizeBlock_MSA;
+  VP8EncQuantize2Blocks = Quantize2Blocks_MSA;
+  VP8EncQuantizeBlockWHT = QuantizeBlock_MSA;
+}
+
+#else  // !WEBP_USE_MSA
+
+WEBP_DSP_INIT_STUB(VP8EncDspInitMSA)
+
+#endif  // WEBP_USE_MSA
diff --git a/media/libwebp/dsp/enc_neon.c b/media/libwebp/dsp/enc_neon.c
new file mode 100644
index 0000000000..657be9b21b
--- /dev/null
+++ b/media/libwebp/dsp/enc_neon.c
@@ -0,0 +1,938 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// ARM NEON version of speed-critical encoding functions.
+//
+// adapted from libvpx (https://www.webmproject.org/code/)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_NEON)
+
+#include <assert.h>
+
+#include "../dsp/neon.h"
+#include "../enc/vp8i_enc.h"
+
+//------------------------------------------------------------------------------
+// Transforms (Paragraph 14.4)
+
+// Inverse transform.
+// This code is pretty much the same as TransformOne in the dec_neon.c, except
+// for subtraction to *ref. See the comments there for algorithmic explanations.
+
+static const int16_t kC1 = 20091;
+static const int16_t kC2 = 17734;  // half of kC2, actually. See comment above.
+
+// This code works but is *slower* than the inlined-asm version below
+// (with gcc-4.6). So we disable it for now. Later, it'll be conditional to
+// WEBP_USE_INTRINSICS define.
+// With gcc-4.8, it's a little faster speed than inlined-assembly.
+#if defined(WEBP_USE_INTRINSICS)
+
+// Treats 'v' as an uint8x8_t and zero extends to an int16x8_t.
+static WEBP_INLINE int16x8_t ConvertU8ToS16_NEON(uint32x2_t v) {
+  return vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(v)));
+}
+
+// Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result
+// to the corresponding rows of 'dst'.
+static WEBP_INLINE void SaturateAndStore4x4_NEON(uint8_t* const dst,
+                                                 const int16x8_t dst01,
+                                                 const int16x8_t dst23) {
+  // Unsigned saturate to 8b.
+  const uint8x8_t dst01_u8 = vqmovun_s16(dst01);
+  const uint8x8_t dst23_u8 = vqmovun_s16(dst23);
+
+  // Store the results.
+  vst1_lane_u32((uint32_t*)(dst + 0 * BPS), vreinterpret_u32_u8(dst01_u8), 0);
+  vst1_lane_u32((uint32_t*)(dst + 1 * BPS), vreinterpret_u32_u8(dst01_u8), 1);
+  vst1_lane_u32((uint32_t*)(dst + 2 * BPS), vreinterpret_u32_u8(dst23_u8), 0);
+  vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1);
+}
+
+static WEBP_INLINE void Add4x4_NEON(const int16x8_t row01,
+                                    const int16x8_t row23,
+                                    const uint8_t* const ref,
+                                    uint8_t* const dst) {
+  uint32x2_t dst01 = vdup_n_u32(0);
+  uint32x2_t dst23 = vdup_n_u32(0);
+
+  // Load the source pixels.
+  dst01 = vld1_lane_u32((uint32_t*)(ref + 0 * BPS), dst01, 0);
+  dst23 = vld1_lane_u32((uint32_t*)(ref + 2 * BPS), dst23, 0);
+  dst01 = vld1_lane_u32((uint32_t*)(ref + 1 * BPS), dst01, 1);
+  dst23 = vld1_lane_u32((uint32_t*)(ref + 3 * BPS), dst23, 1);
+
+  {
+    // Convert to 16b.
+    const int16x8_t dst01_s16 = ConvertU8ToS16_NEON(dst01);
+    const int16x8_t dst23_s16 = ConvertU8ToS16_NEON(dst23);
+
+    // Descale with rounding.
+    const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
+    const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3);
+    // Add the inverse transform.
+    SaturateAndStore4x4_NEON(dst, out01, out23);
+  }
+}
+
+static WEBP_INLINE void Transpose8x2_NEON(const int16x8_t in0,
+                                          const int16x8_t in1,
+                                          int16x8x2_t* const out) {
+  // a0 a1 a2 a3 | b0 b1 b2 b3   => a0 b0 c0 d0 | a1 b1 c1 d1
+  // c0 c1 c2 c3 | d0 d1 d2 d3      a2 b2 c2 d2 | a3 b3 c3 d3
+  const int16x8x2_t tmp0 = vzipq_s16(in0, in1);   // a0 c0 a1 c1 a2 c2 ...
+                                                  // b0 d0 b1 d1 b2 d2 ...
+  *out = vzipq_s16(tmp0.val[0], tmp0.val[1]);
+}
+
+static WEBP_INLINE void TransformPass_NEON(int16x8x2_t* const rows) {
+  // {rows} = in0 | in4
+  //          in8 | in12
+  // B1 = in4 | in12
+  const int16x8_t B1 =
+      vcombine_s16(vget_high_s16(rows->val[0]), vget_high_s16(rows->val[1]));
+  // C0 = kC1 * in4 | kC1 * in12
+  // C1 = kC2 * in4 | kC2 * in12
+  const int16x8_t C0 = vsraq_n_s16(B1, vqdmulhq_n_s16(B1, kC1), 1);
+  const int16x8_t C1 = vqdmulhq_n_s16(B1, kC2);
+  const int16x4_t a = vqadd_s16(vget_low_s16(rows->val[0]),
+                                vget_low_s16(rows->val[1]));   // in0 + in8
+  const int16x4_t b = vqsub_s16(vget_low_s16(rows->val[0]),
+                                vget_low_s16(rows->val[1]));   // in0 - in8
+  // c = kC2 * in4 - kC1 * in12
+  // d = kC1 * in4 + kC2 * in12
+  const int16x4_t c = vqsub_s16(vget_low_s16(C1), vget_high_s16(C0));
+  const int16x4_t d = vqadd_s16(vget_low_s16(C0), vget_high_s16(C1));
+  const int16x8_t D0 = vcombine_s16(a, b);      // D0 = a | b
+  const int16x8_t D1 = vcombine_s16(d, c);      // D1 = d | c
+  const int16x8_t E0 = vqaddq_s16(D0, D1);      // a+d | b+c
+  const int16x8_t E_tmp = vqsubq_s16(D0, D1);   // a-d | b-c
+  const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));
+  Transpose8x2_NEON(E0, E1, rows);
+}
+
+static void ITransformOne_NEON(const uint8_t* ref,
+                               const int16_t* in, uint8_t* dst) {
+  int16x8x2_t rows;
+  INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
+  TransformPass_NEON(&rows);
+  TransformPass_NEON(&rows);
+  Add4x4_NEON(rows.val[0], rows.val[1], ref, dst);
+}
+
+#else
+
+static void ITransformOne_NEON(const uint8_t* ref,
+                               const int16_t* in, uint8_t* dst) {
+  const int kBPS = BPS;
+  const int16_t kC1C2[] = { kC1, kC2, 0, 0 };
+
+  __asm__ volatile (
+    "vld1.16         {q1, q2}, [%[in]]           \n"
+    "vld1.16         {d0}, [%[kC1C2]]            \n"
+
+    // d2: in[0]
+    // d3: in[8]
+    // d4: in[4]
+    // d5: in[12]
+    "vswp            d3, d4                      \n"
+
+    // q8 = {in[4], in[12]} * kC1 * 2 >> 16
+    // q9 = {in[4], in[12]} * kC2 >> 16
+    "vqdmulh.s16     q8, q2, d0[0]               \n"
+    "vqdmulh.s16     q9, q2, d0[1]               \n"
+
+    // d22 = a = in[0] + in[8]
+    // d23 = b = in[0] - in[8]
+    "vqadd.s16       d22, d2, d3                 \n"
+    "vqsub.s16       d23, d2, d3                 \n"
+
+    //  q8 = in[4]/[12] * kC1 >> 16
+    "vshr.s16        q8, q8, #1                  \n"
+
+    // Add {in[4], in[12]} back after the multiplication.
+    "vqadd.s16       q8, q2, q8                  \n"
+
+    // d20 = c = in[4]*kC2 - in[12]*kC1
+    // d21 = d = in[4]*kC1 + in[12]*kC2
+    "vqsub.s16       d20, d18, d17               \n"
+    "vqadd.s16       d21, d19, d16               \n"
+
+    // d2 = tmp[0] = a + d
+    // d3 = tmp[1] = b + c
+    // d4 = tmp[2] = b - c
+    // d5 = tmp[3] = a - d
+    "vqadd.s16       d2, d22, d21                \n"
+    "vqadd.s16       d3, d23, d20                \n"
+    "vqsub.s16       d4, d23, d20                \n"
+    "vqsub.s16       d5, d22, d21                \n"
+
+    "vzip.16         q1, q2                      \n"
+    "vzip.16         q1, q2                      \n"
+
+    "vswp            d3, d4                      \n"
+
+    // q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16
+    // q9 = {tmp[4], tmp[12]} * kC2 >> 16
+    "vqdmulh.s16     q8, q2, d0[0]               \n"
+    "vqdmulh.s16     q9, q2, d0[1]               \n"
+
+    // d22 = a = tmp[0] + tmp[8]
+    // d23 = b = tmp[0] - tmp[8]
+    "vqadd.s16       d22, d2, d3                 \n"
+    "vqsub.s16       d23, d2, d3                 \n"
+
+    "vshr.s16        q8, q8, #1                  \n"
+    "vqadd.s16       q8, q2, q8                  \n"
+
+    // d20 = c = in[4]*kC2 - in[12]*kC1
+    // d21 = d = in[4]*kC1 + in[12]*kC2
+    "vqsub.s16       d20, d18, d17               \n"
+    "vqadd.s16       d21, d19, d16               \n"
+
+    // d2 = tmp[0] = a + d
+    // d3 = tmp[1] = b + c
+    // d4 = tmp[2] = b - c
+    // d5 = tmp[3] = a - d
+    "vqadd.s16       d2, d22, d21                \n"
+    "vqadd.s16       d3, d23, d20                \n"
+    "vqsub.s16       d4, d23, d20                \n"
+    "vqsub.s16       d5, d22, d21                \n"
+
+    "vld1.32         d6[0], [%[ref]], %[kBPS]    \n"
+    "vld1.32         d6[1], [%[ref]], %[kBPS]    \n"
+    "vld1.32         d7[0], [%[ref]], %[kBPS]    \n"
+    "vld1.32         d7[1], [%[ref]], %[kBPS]    \n"
+
+    "sub         %[ref], %[ref], %[kBPS], lsl #2 \n"
+
+    // (val) + 4 >> 3
+    "vrshr.s16       d2, d2, #3                  \n"
+    "vrshr.s16       d3, d3, #3                  \n"
+    "vrshr.s16       d4, d4, #3                  \n"
+    "vrshr.s16       d5, d5, #3                  \n"
+
+    "vzip.16         q1, q2                      \n"
+    "vzip.16         q1, q2                      \n"
+
+    // Must accumulate before saturating
+    "vmovl.u8        q8, d6                      \n"
+    "vmovl.u8        q9, d7                      \n"
+
+    "vqadd.s16       q1, q1, q8                  \n"
+    "vqadd.s16       q2, q2, q9                  \n"
+
+    "vqmovun.s16     d0, q1                      \n"
+    "vqmovun.s16     d1, q2                      \n"
+
+    "vst1.32         d0[0], [%[dst]], %[kBPS]    \n"
+    "vst1.32         d0[1], [%[dst]], %[kBPS]    \n"
+    "vst1.32         d1[0], [%[dst]], %[kBPS]    \n"
+    "vst1.32         d1[1], [%[dst]]             \n"
+
+    : [in] "+r"(in), [dst] "+r"(dst)               // modified registers
+    : [kBPS] "r"(kBPS), [kC1C2] "r"(kC1C2), [ref] "r"(ref)  // constants
+    : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11"  // clobbered
+  );
+}
+
+#endif    // WEBP_USE_INTRINSICS
+
+static void ITransform_NEON(const uint8_t* ref,
+                            const int16_t* in, uint8_t* dst, int do_two) {
+  ITransformOne_NEON(ref, in, dst);
+  if (do_two) {
+    ITransformOne_NEON(ref + 4, in + 16, dst + 4);
+  }
+}
+
+// Load all 4x4 pixels into a single uint8x16_t variable.
+static uint8x16_t Load4x4_NEON(const uint8_t* src) {
+  uint32x4_t out = vdupq_n_u32(0);
+  out = vld1q_lane_u32((const uint32_t*)(src + 0 * BPS), out, 0);
+  out = vld1q_lane_u32((const uint32_t*)(src + 1 * BPS), out, 1);
+  out = vld1q_lane_u32((const uint32_t*)(src + 2 * BPS), out, 2);
+  out = vld1q_lane_u32((const uint32_t*)(src + 3 * BPS), out, 3);
+  return vreinterpretq_u8_u32(out);
+}
+
+// Forward transform.
+
+#if defined(WEBP_USE_INTRINSICS)
+
+static WEBP_INLINE void Transpose4x4_S16_NEON(const int16x4_t A,
+                                              const int16x4_t B,
+                                              const int16x4_t C,
+                                              const int16x4_t D,
+                                              int16x8_t* const out01,
+                                              int16x8_t* const out32) {
+  const int16x4x2_t AB = vtrn_s16(A, B);
+  const int16x4x2_t CD = vtrn_s16(C, D);
+  const int32x2x2_t tmp02 = vtrn_s32(vreinterpret_s32_s16(AB.val[0]),
+                                     vreinterpret_s32_s16(CD.val[0]));
+  const int32x2x2_t tmp13 = vtrn_s32(vreinterpret_s32_s16(AB.val[1]),
+                                     vreinterpret_s32_s16(CD.val[1]));
+  *out01 = vreinterpretq_s16_s64(
+      vcombine_s64(vreinterpret_s64_s32(tmp02.val[0]),
+                   vreinterpret_s64_s32(tmp13.val[0])));
+  *out32 = vreinterpretq_s16_s64(
+      vcombine_s64(vreinterpret_s64_s32(tmp13.val[1]),
+                   vreinterpret_s64_s32(tmp02.val[1])));
+}
+
+static WEBP_INLINE int16x8_t DiffU8ToS16_NEON(const uint8x8_t a,
+                                              const uint8x8_t b) {
+  return vreinterpretq_s16_u16(vsubl_u8(a, b));
+}
+
+static void FTransform_NEON(const uint8_t* src, const uint8_t* ref,
+                            int16_t* out) {
+  int16x8_t d0d1, d3d2;   // working 4x4 int16 variables
+  {
+    const uint8x16_t S0 = Load4x4_NEON(src);
+    const uint8x16_t R0 = Load4x4_NEON(ref);
+    const int16x8_t D0D1 = DiffU8ToS16_NEON(vget_low_u8(S0), vget_low_u8(R0));
+    const int16x8_t D2D3 = DiffU8ToS16_NEON(vget_high_u8(S0), vget_high_u8(R0));
+    const int16x4_t D0 = vget_low_s16(D0D1);
+    const int16x4_t D1 = vget_high_s16(D0D1);
+    const int16x4_t D2 = vget_low_s16(D2D3);
+    const int16x4_t D3 = vget_high_s16(D2D3);
+    Transpose4x4_S16_NEON(D0, D1, D2, D3, &d0d1, &d3d2);
+  }
+  {    // 1rst pass
+    const int32x4_t kCst937 = vdupq_n_s32(937);
+    const int32x4_t kCst1812 = vdupq_n_s32(1812);
+    const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2);   // d0+d3 | d1+d2   (=a0|a1)
+    const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2);   // d0-d3 | d1-d2   (=a3|a2)
+    const int16x8_t a0a1_2 = vshlq_n_s16(a0a1, 3);
+    const int16x4_t tmp0 = vadd_s16(vget_low_s16(a0a1_2),
+                                    vget_high_s16(a0a1_2));
+    const int16x4_t tmp2 = vsub_s16(vget_low_s16(a0a1_2),
+                                    vget_high_s16(a0a1_2));
+    const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217);
+    const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217);
+    const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352);
+    const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
+    const int16x4_t tmp1 = vshrn_n_s32(vaddq_s32(a2_p_a3, kCst1812), 9);
+    const int16x4_t tmp3 = vshrn_n_s32(vaddq_s32(a3_m_a2, kCst937), 9);
+    Transpose4x4_S16_NEON(tmp0, tmp1, tmp2, tmp3, &d0d1, &d3d2);
+  }
+  {    // 2nd pass
+    // the (1<<16) addition is for the replacement: a3!=0  <-> 1-(a3==0)
+    const int32x4_t kCst12000 = vdupq_n_s32(12000 + (1 << 16));
+    const int32x4_t kCst51000 = vdupq_n_s32(51000);
+    const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2);   // d0+d3 | d1+d2   (=a0|a1)
+    const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2);   // d0-d3 | d1-d2   (=a3|a2)
+    const int16x4_t a0_k7 = vadd_s16(vget_low_s16(a0a1), vdup_n_s16(7));
+    const int16x4_t out0 = vshr_n_s16(vadd_s16(a0_k7, vget_high_s16(a0a1)), 4);
+    const int16x4_t out2 = vshr_n_s16(vsub_s16(a0_k7, vget_high_s16(a0a1)), 4);
+    const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217);
+    const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217);
+    const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352);
+    const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
+    const int16x4_t tmp1 = vaddhn_s32(a2_p_a3, kCst12000);
+    const int16x4_t out3 = vaddhn_s32(a3_m_a2, kCst51000);
+    const int16x4_t a3_eq_0 =
+        vreinterpret_s16_u16(vceq_s16(vget_low_s16(a3a2), vdup_n_s16(0)));
+    const int16x4_t out1 = vadd_s16(tmp1, a3_eq_0);
+    vst1_s16(out +  0, out0);
+    vst1_s16(out +  4, out1);
+    vst1_s16(out +  8, out2);
+    vst1_s16(out + 12, out3);
+  }
+}
+
+#else
+
+// adapted from vp8/encoder/arm/neon/shortfdct_neon.asm
+static const int16_t kCoeff16[] = {
+  5352,  5352,  5352, 5352, 2217,  2217,  2217, 2217
+};
+static const int32_t kCoeff32[] = {
+   1812,  1812,  1812,  1812,
+    937,   937,   937,   937,
+  12000, 12000, 12000, 12000,
+  51000, 51000, 51000, 51000
+};
+
+static void FTransform_NEON(const uint8_t* src, const uint8_t* ref,
+                            int16_t* out) {
+  const int kBPS = BPS;
+  const uint8_t* src_ptr = src;
+  const uint8_t* ref_ptr = ref;
+  const int16_t* coeff16 = kCoeff16;
+  const int32_t* coeff32 = kCoeff32;
+
+  __asm__ volatile (
+    // load src into q4, q5 in high half
+    "vld1.8 {d8},  [%[src_ptr]], %[kBPS]      \n"
+    "vld1.8 {d10}, [%[src_ptr]], %[kBPS]      \n"
+    "vld1.8 {d9},  [%[src_ptr]], %[kBPS]      \n"
+    "vld1.8 {d11}, [%[src_ptr]]               \n"
+
+    // load ref into q6, q7 in high half
+    "vld1.8 {d12}, [%[ref_ptr]], %[kBPS]      \n"
+    "vld1.8 {d14}, [%[ref_ptr]], %[kBPS]      \n"
+    "vld1.8 {d13}, [%[ref_ptr]], %[kBPS]      \n"
+    "vld1.8 {d15}, [%[ref_ptr]]               \n"
+
+    // Pack the high values in to q4 and q6
+    "vtrn.32     q4, q5                       \n"
+    "vtrn.32     q6, q7                       \n"
+
+    // d[0-3] = src - ref
+    "vsubl.u8    q0, d8, d12                  \n"
+    "vsubl.u8    q1, d9, d13                  \n"
+
+    // load coeff16 into q8(d16=5352, d17=2217)
+    "vld1.16     {q8}, [%[coeff16]]           \n"
+
+    // load coeff32 high half into q9 = 1812, q10 = 937
+    "vld1.32     {q9, q10}, [%[coeff32]]!     \n"
+
+    // load coeff32 low half into q11=12000, q12=51000
+    "vld1.32     {q11,q12}, [%[coeff32]]      \n"
+
+    // part 1
+    // Transpose. Register dN is the same as dN in C
+    "vtrn.32         d0, d2                   \n"
+    "vtrn.32         d1, d3                   \n"
+    "vtrn.16         d0, d1                   \n"
+    "vtrn.16         d2, d3                   \n"
+
+    "vadd.s16        d4, d0, d3               \n" // a0 = d0 + d3
+    "vadd.s16        d5, d1, d2               \n" // a1 = d1 + d2
+    "vsub.s16        d6, d1, d2               \n" // a2 = d1 - d2
+    "vsub.s16        d7, d0, d3               \n" // a3 = d0 - d3
+
+    "vadd.s16        d0, d4, d5               \n" // a0 + a1
+    "vshl.s16        d0, d0, #3               \n" // temp[0+i*4] = (a0+a1) << 3
+    "vsub.s16        d2, d4, d5               \n" // a0 - a1
+    "vshl.s16        d2, d2, #3               \n" // (temp[2+i*4] = (a0-a1) << 3
+
+    "vmlal.s16       q9, d7, d16              \n" // a3*5352 + 1812
+    "vmlal.s16       q10, d7, d17             \n" // a3*2217 + 937
+    "vmlal.s16       q9, d6, d17              \n" // a2*2217 + a3*5352 + 1812
+    "vmlsl.s16       q10, d6, d16             \n" // a3*2217 + 937 - a2*5352
+
+    // temp[1+i*4] = (d2*2217 + d3*5352 + 1812) >> 9
+    // temp[3+i*4] = (d3*2217 + 937 - d2*5352) >> 9
+    "vshrn.s32       d1, q9, #9               \n"
+    "vshrn.s32       d3, q10, #9              \n"
+
+    // part 2
+    // transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
+    "vtrn.32         d0, d2                   \n"
+    "vtrn.32         d1, d3                   \n"
+    "vtrn.16         d0, d1                   \n"
+    "vtrn.16         d2, d3                   \n"
+
+    "vmov.s16        d26, #7                  \n"
+
+    "vadd.s16        d4, d0, d3               \n" // a1 = ip[0] + ip[12]
+    "vadd.s16        d5, d1, d2               \n" // b1 = ip[4] + ip[8]
+    "vsub.s16        d6, d1, d2               \n" // c1 = ip[4] - ip[8]
+    "vadd.s16        d4, d4, d26              \n" // a1 + 7
+    "vsub.s16        d7, d0, d3               \n" // d1 = ip[0] - ip[12]
+
+    "vadd.s16        d0, d4, d5               \n" // op[0] = a1 + b1 + 7
+    "vsub.s16        d2, d4, d5               \n" // op[8] = a1 - b1 + 7
+
+    "vmlal.s16       q11, d7, d16             \n" // d1*5352 + 12000
+    "vmlal.s16       q12, d7, d17             \n" // d1*2217 + 51000
+
+    "vceq.s16        d4, d7, #0               \n"
+
+    "vshr.s16        d0, d0, #4               \n"
+    "vshr.s16        d2, d2, #4               \n"
+
+    "vmlal.s16       q11, d6, d17             \n" // c1*2217 + d1*5352 + 12000
+    "vmlsl.s16       q12, d6, d16             \n" // d1*2217 - c1*5352 + 51000
+
+    "vmvn            d4, d4                   \n" // !(d1 == 0)
+    // op[4] = (c1*2217 + d1*5352 + 12000)>>16
+    "vshrn.s32       d1, q11, #16             \n"
+    // op[4] += (d1!=0)
+    "vsub.s16        d1, d1, d4               \n"
+    // op[12]= (d1*2217 - c1*5352 + 51000)>>16
+    "vshrn.s32       d3, q12, #16             \n"
+
+    // set result to out array
+    "vst1.16         {q0, q1}, [%[out]]   \n"
+    : [src_ptr] "+r"(src_ptr), [ref_ptr] "+r"(ref_ptr),
+      [coeff32] "+r"(coeff32)          // modified registers
+    : [kBPS] "r"(kBPS), [coeff16] "r"(coeff16),
+      [out] "r"(out)                   // constants
+    : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
+      "q10", "q11", "q12", "q13"       // clobbered
+  );
+}
+
+#endif
+
+#define LOAD_LANE_16b(VALUE, LANE) do {             \
+  (VALUE) = vld1_lane_s16(src, (VALUE), (LANE));    \
+  src += stride;                                    \
+} while (0)
+
+static void FTransformWHT_NEON(const int16_t* src, int16_t* out) {
+  const int stride = 16;
+  const int16x4_t zero = vdup_n_s16(0);
+  int32x4x4_t tmp0;
+  int16x4x4_t in;
+  INIT_VECTOR4(in, zero, zero, zero, zero);
+  LOAD_LANE_16b(in.val[0], 0);
+  LOAD_LANE_16b(in.val[1], 0);
+  LOAD_LANE_16b(in.val[2], 0);
+  LOAD_LANE_16b(in.val[3], 0);
+  LOAD_LANE_16b(in.val[0], 1);
+  LOAD_LANE_16b(in.val[1], 1);
+  LOAD_LANE_16b(in.val[2], 1);
+  LOAD_LANE_16b(in.val[3], 1);
+  LOAD_LANE_16b(in.val[0], 2);
+  LOAD_LANE_16b(in.val[1], 2);
+  LOAD_LANE_16b(in.val[2], 2);
+  LOAD_LANE_16b(in.val[3], 2);
+  LOAD_LANE_16b(in.val[0], 3);
+  LOAD_LANE_16b(in.val[1], 3);
+  LOAD_LANE_16b(in.val[2], 3);
+  LOAD_LANE_16b(in.val[3], 3);
+
+  {
+    // a0 = in[0 * 16] + in[2 * 16]
+    // a1 = in[1 * 16] + in[3 * 16]
+    // a2 = in[1 * 16] - in[3 * 16]
+    // a3 = in[0 * 16] - in[2 * 16]
+    const int32x4_t a0 = vaddl_s16(in.val[0], in.val[2]);
+    const int32x4_t a1 = vaddl_s16(in.val[1], in.val[3]);
+    const int32x4_t a2 = vsubl_s16(in.val[1], in.val[3]);
+    const int32x4_t a3 = vsubl_s16(in.val[0], in.val[2]);
+    tmp0.val[0] = vaddq_s32(a0, a1);
+    tmp0.val[1] = vaddq_s32(a3, a2);
+    tmp0.val[2] = vsubq_s32(a3, a2);
+    tmp0.val[3] = vsubq_s32(a0, a1);
+  }
+  {
+    const int32x4x4_t tmp1 = Transpose4x4_NEON(tmp0);
+    // a0 = tmp[0 + i] + tmp[ 8 + i]
+    // a1 = tmp[4 + i] + tmp[12 + i]
+    // a2 = tmp[4 + i] - tmp[12 + i]
+    // a3 = tmp[0 + i] - tmp[ 8 + i]
+    const int32x4_t a0 = vaddq_s32(tmp1.val[0], tmp1.val[2]);
+    const int32x4_t a1 = vaddq_s32(tmp1.val[1], tmp1.val[3]);
+    const int32x4_t a2 = vsubq_s32(tmp1.val[1], tmp1.val[3]);
+    const int32x4_t a3 = vsubq_s32(tmp1.val[0], tmp1.val[2]);
+    const int32x4_t b0 = vhaddq_s32(a0, a1);  // (a0 + a1) >> 1
+    const int32x4_t b1 = vhaddq_s32(a3, a2);  // (a3 + a2) >> 1
+    const int32x4_t b2 = vhsubq_s32(a3, a2);  // (a3 - a2) >> 1
+    const int32x4_t b3 = vhsubq_s32(a0, a1);  // (a0 - a1) >> 1
+    const int16x4_t out0 = vmovn_s32(b0);
+    const int16x4_t out1 = vmovn_s32(b1);
+    const int16x4_t out2 = vmovn_s32(b2);
+    const int16x4_t out3 = vmovn_s32(b3);
+
+    vst1_s16(out +  0, out0);
+    vst1_s16(out +  4, out1);
+    vst1_s16(out +  8, out2);
+    vst1_s16(out + 12, out3);
+  }
+}
+#undef LOAD_LANE_16b
+
+//------------------------------------------------------------------------------
+// Texture distortion
+//
+// We try to match the spectral content (weighted) between source and
+// reconstructed samples.
+
+// a 0123, b 0123
+// a 4567, b 4567
+// a 89ab, b 89ab
+// a cdef, b cdef
+//
+// transpose
+//
+// a 048c, b 048c
+// a 159d, b 159d
+// a 26ae, b 26ae
+// a 37bf, b 37bf
+//
+static WEBP_INLINE int16x8x4_t DistoTranspose4x4S16_NEON(int16x8x4_t q4_in) {
+  const int16x8x2_t q2_tmp0 = vtrnq_s16(q4_in.val[0], q4_in.val[1]);
+  const int16x8x2_t q2_tmp1 = vtrnq_s16(q4_in.val[2], q4_in.val[3]);
+  const int32x4x2_t q2_tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[0]),
+                                        vreinterpretq_s32_s16(q2_tmp1.val[0]));
+  const int32x4x2_t q2_tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[1]),
+                                        vreinterpretq_s32_s16(q2_tmp1.val[1]));
+  q4_in.val[0] = vreinterpretq_s16_s32(q2_tmp2.val[0]);
+  q4_in.val[2] = vreinterpretq_s16_s32(q2_tmp2.val[1]);
+  q4_in.val[1] = vreinterpretq_s16_s32(q2_tmp3.val[0]);
+  q4_in.val[3] = vreinterpretq_s16_s32(q2_tmp3.val[1]);
+  return q4_in;
+}
+
+static WEBP_INLINE int16x8x4_t DistoHorizontalPass_NEON(
+    const int16x8x4_t q4_in) {
+  // {a0, a1} = {in[0] + in[2], in[1] + in[3]}
+  // {a3, a2} = {in[0] - in[2], in[1] - in[3]}
+  const int16x8_t q_a0 = vaddq_s16(q4_in.val[0], q4_in.val[2]);
+  const int16x8_t q_a1 = vaddq_s16(q4_in.val[1], q4_in.val[3]);
+  const int16x8_t q_a3 = vsubq_s16(q4_in.val[0], q4_in.val[2]);
+  const int16x8_t q_a2 = vsubq_s16(q4_in.val[1], q4_in.val[3]);
+  int16x8x4_t q4_out;
+  // tmp[0] = a0 + a1
+  // tmp[1] = a3 + a2
+  // tmp[2] = a3 - a2
+  // tmp[3] = a0 - a1
+  INIT_VECTOR4(q4_out,
+               vabsq_s16(vaddq_s16(q_a0, q_a1)),
+               vabsq_s16(vaddq_s16(q_a3, q_a2)),
+               vabdq_s16(q_a3, q_a2), vabdq_s16(q_a0, q_a1));
+  return q4_out;
+}
+
+static WEBP_INLINE int16x8x4_t DistoVerticalPass_NEON(const uint8x8x4_t q4_in) {
+  const int16x8_t q_a0 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[0],
+                                                        q4_in.val[2]));
+  const int16x8_t q_a1 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[1],
+                                                        q4_in.val[3]));
+  const int16x8_t q_a2 = vreinterpretq_s16_u16(vsubl_u8(q4_in.val[1],
+                                                        q4_in.val[3]));
+  const int16x8_t q_a3 = vreinterpretq_s16_u16(vsubl_u8(q4_in.val[0],
+                                                        q4_in.val[2]));
+  int16x8x4_t q4_out;
+
+  INIT_VECTOR4(q4_out,
+               vaddq_s16(q_a0, q_a1), vaddq_s16(q_a3, q_a2),
+               vsubq_s16(q_a3, q_a2), vsubq_s16(q_a0, q_a1));
+  return q4_out;
+}
+
+static WEBP_INLINE int16x4x4_t DistoLoadW_NEON(const uint16_t* w) {
+  const uint16x8_t q_w07 = vld1q_u16(&w[0]);
+  const uint16x8_t q_w8f = vld1q_u16(&w[8]);
+  int16x4x4_t d4_w;
+  INIT_VECTOR4(d4_w,
+               vget_low_s16(vreinterpretq_s16_u16(q_w07)),
+               vget_high_s16(vreinterpretq_s16_u16(q_w07)),
+               vget_low_s16(vreinterpretq_s16_u16(q_w8f)),
+               vget_high_s16(vreinterpretq_s16_u16(q_w8f)));
+  return d4_w;
+}
+
+static WEBP_INLINE int32x2_t DistoSum_NEON(const int16x8x4_t q4_in,
+                                           const int16x4x4_t d4_w) {
+  int32x2_t d_sum;
+  // sum += w[ 0] * abs(b0);
+  // sum += w[ 4] * abs(b1);
+  // sum += w[ 8] * abs(b2);
+  // sum += w[12] * abs(b3);
+  int32x4_t q_sum0 = vmull_s16(d4_w.val[0], vget_low_s16(q4_in.val[0]));
+  int32x4_t q_sum1 = vmull_s16(d4_w.val[1], vget_low_s16(q4_in.val[1]));
+  int32x4_t q_sum2 = vmull_s16(d4_w.val[2], vget_low_s16(q4_in.val[2]));
+  int32x4_t q_sum3 = vmull_s16(d4_w.val[3], vget_low_s16(q4_in.val[3]));
+  q_sum0 = vmlsl_s16(q_sum0, d4_w.val[0], vget_high_s16(q4_in.val[0]));
+  q_sum1 = vmlsl_s16(q_sum1, d4_w.val[1], vget_high_s16(q4_in.val[1]));
+  q_sum2 = vmlsl_s16(q_sum2, d4_w.val[2], vget_high_s16(q4_in.val[2]));
+  q_sum3 = vmlsl_s16(q_sum3, d4_w.val[3], vget_high_s16(q4_in.val[3]));
+
+  q_sum0 = vaddq_s32(q_sum0, q_sum1);
+  q_sum2 = vaddq_s32(q_sum2, q_sum3);
+  q_sum2 = vaddq_s32(q_sum0, q_sum2);
+  d_sum = vpadd_s32(vget_low_s32(q_sum2), vget_high_s32(q_sum2));
+  d_sum = vpadd_s32(d_sum, d_sum);
+  return d_sum;
+}
+
+#define LOAD_LANE_32b(src, VALUE, LANE) \
+    (VALUE) = vld1_lane_u32((const uint32_t*)(src), (VALUE), (LANE))
+
+// Hadamard transform
+// Returns the weighted sum of the absolute value of transformed coefficients.
+// w[] contains a row-major 4 by 4 symmetric matrix.
+static int Disto4x4_NEON(const uint8_t* const a, const uint8_t* const b,
+                         const uint16_t* const w) {
+  uint32x2_t d_in_ab_0123 = vdup_n_u32(0);
+  uint32x2_t d_in_ab_4567 = vdup_n_u32(0);
+  uint32x2_t d_in_ab_89ab = vdup_n_u32(0);
+  uint32x2_t d_in_ab_cdef = vdup_n_u32(0);
+  uint8x8x4_t d4_in;
+
+  // load data a, b
+  LOAD_LANE_32b(a + 0 * BPS, d_in_ab_0123, 0);
+  LOAD_LANE_32b(a + 1 * BPS, d_in_ab_4567, 0);
+  LOAD_LANE_32b(a + 2 * BPS, d_in_ab_89ab, 0);
+  LOAD_LANE_32b(a + 3 * BPS, d_in_ab_cdef, 0);
+  LOAD_LANE_32b(b + 0 * BPS, d_in_ab_0123, 1);
+  LOAD_LANE_32b(b + 1 * BPS, d_in_ab_4567, 1);
+  LOAD_LANE_32b(b + 2 * BPS, d_in_ab_89ab, 1);
+  LOAD_LANE_32b(b + 3 * BPS, d_in_ab_cdef, 1);
+  INIT_VECTOR4(d4_in,
+               vreinterpret_u8_u32(d_in_ab_0123),
+               vreinterpret_u8_u32(d_in_ab_4567),
+               vreinterpret_u8_u32(d_in_ab_89ab),
+               vreinterpret_u8_u32(d_in_ab_cdef));
+
+  {
+    // Vertical pass first to avoid a transpose (vertical and horizontal passes
+    // are commutative because w/kWeightY is symmetric) and subsequent
+    // transpose.
+    const int16x8x4_t q4_v = DistoVerticalPass_NEON(d4_in);
+    const int16x4x4_t d4_w = DistoLoadW_NEON(w);
+    // horizontal pass
+    const int16x8x4_t q4_t = DistoTranspose4x4S16_NEON(q4_v);
+    const int16x8x4_t q4_h = DistoHorizontalPass_NEON(q4_t);
+    int32x2_t d_sum = DistoSum_NEON(q4_h, d4_w);
+
+    // abs(sum2 - sum1) >> 5
+    d_sum = vabs_s32(d_sum);
+    d_sum = vshr_n_s32(d_sum, 5);
+    return vget_lane_s32(d_sum, 0);
+  }
+}
+#undef LOAD_LANE_32b
+
+static int Disto16x16_NEON(const uint8_t* const a, const uint8_t* const b,
+                           const uint16_t* const w) {
+  int D = 0;
+  int x, y;
+  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
+    for (x = 0; x < 16; x += 4) {
+      D += Disto4x4_NEON(a + x + y, b + x + y, w);
+    }
+  }
+  return D;
+}
+
+//------------------------------------------------------------------------------
+
+static void CollectHistogram_NEON(const uint8_t* ref, const uint8_t* pred,
+                                  int start_block, int end_block,
+                                  VP8Histogram* const histo) {
+  const uint16x8_t max_coeff_thresh = vdupq_n_u16(MAX_COEFF_THRESH);
+  int j;
+  int distribution[MAX_COEFF_THRESH + 1] = { 0 };
+  for (j = start_block; j < end_block; ++j) {
+    int16_t out[16];
+    FTransform_NEON(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
+    {
+      int k;
+      const int16x8_t a0 = vld1q_s16(out + 0);
+      const int16x8_t b0 = vld1q_s16(out + 8);
+      const uint16x8_t a1 = vreinterpretq_u16_s16(vabsq_s16(a0));
+      const uint16x8_t b1 = vreinterpretq_u16_s16(vabsq_s16(b0));
+      const uint16x8_t a2 = vshrq_n_u16(a1, 3);
+      const uint16x8_t b2 = vshrq_n_u16(b1, 3);
+      const uint16x8_t a3 = vminq_u16(a2, max_coeff_thresh);
+      const uint16x8_t b3 = vminq_u16(b2, max_coeff_thresh);
+      vst1q_s16(out + 0, vreinterpretq_s16_u16(a3));
+      vst1q_s16(out + 8, vreinterpretq_s16_u16(b3));
+      // Convert coefficients to bin.
+      for (k = 0; k < 16; ++k) {
+        ++distribution[out[k]];
+      }
+    }
+  }
+  VP8SetHistogramData(distribution, histo);
+}
+
+//------------------------------------------------------------------------------
+
+static WEBP_INLINE void AccumulateSSE16_NEON(const uint8_t* const a,
+                                             const uint8_t* const b,
+                                             uint32x4_t* const sum) {
+  const uint8x16_t a0 = vld1q_u8(a);
+  const uint8x16_t b0 = vld1q_u8(b);
+  const uint8x16_t abs_diff = vabdq_u8(a0, b0);
+  const uint16x8_t prod1 = vmull_u8(vget_low_u8(abs_diff),
+                                    vget_low_u8(abs_diff));
+  const uint16x8_t prod2 = vmull_u8(vget_high_u8(abs_diff),
+                                    vget_high_u8(abs_diff));
+  /* pair-wise adds and widen */
+  const uint32x4_t sum1 = vpaddlq_u16(prod1);
+  const uint32x4_t sum2 = vpaddlq_u16(prod2);
+  *sum = vaddq_u32(*sum, vaddq_u32(sum1, sum2));
+}
+
+// Horizontal sum of all four uint32_t values in 'sum'.
+static int SumToInt_NEON(uint32x4_t sum) {
+  const uint64x2_t sum2 = vpaddlq_u32(sum);
+  const uint64_t sum3 = vgetq_lane_u64(sum2, 0) + vgetq_lane_u64(sum2, 1);
+  return (int)sum3;
+}
+
+static int SSE16x16_NEON(const uint8_t* a, const uint8_t* b) {
+  uint32x4_t sum = vdupq_n_u32(0);
+  int y;
+  for (y = 0; y < 16; ++y) {
+    AccumulateSSE16_NEON(a + y * BPS, b + y * BPS, &sum);
+  }
+  return SumToInt_NEON(sum);
+}
+
+static int SSE16x8_NEON(const uint8_t* a, const uint8_t* b) {
+  uint32x4_t sum = vdupq_n_u32(0);
+  int y;
+  for (y = 0; y < 8; ++y) {
+    AccumulateSSE16_NEON(a + y * BPS, b + y * BPS, &sum);
+  }
+  return SumToInt_NEON(sum);
+}
+
+static int SSE8x8_NEON(const uint8_t* a, const uint8_t* b) {
+  uint32x4_t sum = vdupq_n_u32(0);
+  int y;
+  for (y = 0; y < 8; ++y) {
+    const uint8x8_t a0 = vld1_u8(a + y * BPS);
+    const uint8x8_t b0 = vld1_u8(b + y * BPS);
+    const uint8x8_t abs_diff = vabd_u8(a0, b0);
+    const uint16x8_t prod = vmull_u8(abs_diff, abs_diff);
+    sum = vpadalq_u16(sum, prod);
+  }
+  return SumToInt_NEON(sum);
+}
+
+static int SSE4x4_NEON(const uint8_t* a, const uint8_t* b) {
+  const uint8x16_t a0 = Load4x4_NEON(a);
+  const uint8x16_t b0 = Load4x4_NEON(b);
+  const uint8x16_t abs_diff = vabdq_u8(a0, b0);
+  const uint16x8_t prod1 = vmull_u8(vget_low_u8(abs_diff),
+                                    vget_low_u8(abs_diff));
+  const uint16x8_t prod2 = vmull_u8(vget_high_u8(abs_diff),
+                                    vget_high_u8(abs_diff));
+  /* pair-wise adds and widen */
+  const uint32x4_t sum1 = vpaddlq_u16(prod1);
+  const uint32x4_t sum2 = vpaddlq_u16(prod2);
+  return SumToInt_NEON(vaddq_u32(sum1, sum2));
+}
+
+//------------------------------------------------------------------------------
+
+// Compilation with gcc-4.6.x is problematic for now.
+#if !defined(WORK_AROUND_GCC)
+
+static int16x8_t Quantize_NEON(int16_t* const in,
+                               const VP8Matrix* const mtx, int offset) {
+  const uint16x8_t sharp = vld1q_u16(&mtx->sharpen_[offset]);
+  const uint16x8_t q = vld1q_u16(&mtx->q_[offset]);
+  const uint16x8_t iq = vld1q_u16(&mtx->iq_[offset]);
+  const uint32x4_t bias0 = vld1q_u32(&mtx->bias_[offset + 0]);
+  const uint32x4_t bias1 = vld1q_u32(&mtx->bias_[offset + 4]);
+
+  const int16x8_t a = vld1q_s16(in + offset);                // in
+  const uint16x8_t b = vreinterpretq_u16_s16(vabsq_s16(a));  // coeff = abs(in)
+  const int16x8_t sign = vshrq_n_s16(a, 15);                 // sign
+  const uint16x8_t c = vaddq_u16(b, sharp);                  // + sharpen
+  const uint32x4_t m0 = vmull_u16(vget_low_u16(c), vget_low_u16(iq));
+  const uint32x4_t m1 = vmull_u16(vget_high_u16(c), vget_high_u16(iq));
+  const uint32x4_t m2 = vhaddq_u32(m0, bias0);
+  const uint32x4_t m3 = vhaddq_u32(m1, bias1);     // (coeff * iQ + bias) >> 1
+  const uint16x8_t c0 = vcombine_u16(vshrn_n_u32(m2, 16),
+                                     vshrn_n_u32(m3, 16));   // QFIX=17 = 16+1
+  const uint16x8_t c1 = vminq_u16(c0, vdupq_n_u16(MAX_LEVEL));
+  const int16x8_t c2 = veorq_s16(vreinterpretq_s16_u16(c1), sign);
+  const int16x8_t c3 = vsubq_s16(c2, sign);                  // restore sign
+  const int16x8_t c4 = vmulq_s16(c3, vreinterpretq_s16_u16(q));
+  vst1q_s16(in + offset, c4);
+  assert(QFIX == 17);  // this function can't work as is if QFIX != 16+1
+  return c3;
+}
+
+static const uint8_t kShuffles[4][8] = {
+  { 0,   1,  2,  3,  8,  9, 16, 17 },
+  { 10, 11,  4,  5,  6,  7, 12, 13 },
+  { 18, 19, 24, 25, 26, 27, 20, 21 },
+  { 14, 15, 22, 23, 28, 29, 30, 31 }
+};
+
+static int QuantizeBlock_NEON(int16_t in[16], int16_t out[16],
+                              const VP8Matrix* const mtx) {
+  const int16x8_t out0 = Quantize_NEON(in, mtx, 0);
+  const int16x8_t out1 = Quantize_NEON(in, mtx, 8);
+  uint8x8x4_t shuffles;
+  // vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use
+  // non-standard versions there.
+#if defined(__APPLE__) && defined(__aarch64__) && \
+    defined(__apple_build_version__) && (__apple_build_version__< 6020037)
+  uint8x16x2_t all_out;
+  INIT_VECTOR2(all_out, vreinterpretq_u8_s16(out0), vreinterpretq_u8_s16(out1));
+  INIT_VECTOR4(shuffles,
+               vtbl2q_u8(all_out, vld1_u8(kShuffles[0])),
+               vtbl2q_u8(all_out, vld1_u8(kShuffles[1])),
+               vtbl2q_u8(all_out, vld1_u8(kShuffles[2])),
+               vtbl2q_u8(all_out, vld1_u8(kShuffles[3])));
+#else
+  uint8x8x4_t all_out;
+  INIT_VECTOR4(all_out,
+               vreinterpret_u8_s16(vget_low_s16(out0)),
+               vreinterpret_u8_s16(vget_high_s16(out0)),
+               vreinterpret_u8_s16(vget_low_s16(out1)),
+               vreinterpret_u8_s16(vget_high_s16(out1)));
+  INIT_VECTOR4(shuffles,
+               vtbl4_u8(all_out, vld1_u8(kShuffles[0])),
+               vtbl4_u8(all_out, vld1_u8(kShuffles[1])),
+               vtbl4_u8(all_out, vld1_u8(kShuffles[2])),
+               vtbl4_u8(all_out, vld1_u8(kShuffles[3])));
+#endif
+  // Zigzag reordering
+  vst1_u8((uint8_t*)(out +  0), shuffles.val[0]);
+  vst1_u8((uint8_t*)(out +  4), shuffles.val[1]);
+  vst1_u8((uint8_t*)(out +  8), shuffles.val[2]);
+  vst1_u8((uint8_t*)(out + 12), shuffles.val[3]);
+  // test zeros
+  if (*(uint64_t*)(out +  0) != 0) return 1;
+  if (*(uint64_t*)(out +  4) != 0) return 1;
+  if (*(uint64_t*)(out +  8) != 0) return 1;
+  if (*(uint64_t*)(out + 12) != 0) return 1;
+  return 0;
+}
+
+static int Quantize2Blocks_NEON(int16_t in[32], int16_t out[32],
+                                const VP8Matrix* const mtx) {
+  int nz;
+  nz  = QuantizeBlock_NEON(in + 0 * 16, out + 0 * 16, mtx) << 0;
+  nz |= QuantizeBlock_NEON(in + 1 * 16, out + 1 * 16, mtx) << 1;
+  return nz;
+}
+
+#endif   // !WORK_AROUND_GCC
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8EncDspInitNEON(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitNEON(void) {
+  VP8ITransform = ITransform_NEON;
+  VP8FTransform = FTransform_NEON;
+
+  VP8FTransformWHT = FTransformWHT_NEON;
+
+  VP8TDisto4x4 = Disto4x4_NEON;
+  VP8TDisto16x16 = Disto16x16_NEON;
+  VP8CollectHistogram = CollectHistogram_NEON;
+
+  VP8SSE16x16 = SSE16x16_NEON;
+  VP8SSE16x8 = SSE16x8_NEON;
+  VP8SSE8x8 = SSE8x8_NEON;
+  VP8SSE4x4 = SSE4x4_NEON;
+
+#if !defined(WORK_AROUND_GCC)
+  VP8EncQuantizeBlock = QuantizeBlock_NEON;
+  VP8EncQuantize2Blocks = Quantize2Blocks_NEON;
+#endif
+}
+
+#else  // !WEBP_USE_NEON
+
+WEBP_DSP_INIT_STUB(VP8EncDspInitNEON)
+
+#endif  // WEBP_USE_NEON
diff --git a/media/libwebp/dsp/enc_sse2.c b/media/libwebp/dsp/enc_sse2.c
new file mode 100644
index 0000000000..ff78755111
--- /dev/null
+++ b/media/libwebp/dsp/enc_sse2.c
@@ -0,0 +1,1381 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// SSE2 version of speed-critical encoding functions.
+//
+// Author: Christian Duvivier (cduvivier@google.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_SSE2)
+#include <assert.h>
+#include <stdlib.h>  // for abs()
+#include <emmintrin.h>
+
+#include "../dsp/common_sse2.h"
+#include "../enc/cost_enc.h"
+#include "../enc/vp8i_enc.h"
+
+//------------------------------------------------------------------------------
+// Transforms (Paragraph 14.4)
+
+// Does one or two inverse transforms.
+static void ITransform_SSE2(const uint8_t* ref, const int16_t* in, uint8_t* dst,
+                            int do_two) {
+  // This implementation makes use of 16-bit fixed point versions of two
+  // multiply constants:
+  //    K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
+  //    K2 = sqrt(2) * sin (pi/8) ~= 35468 / 2^16
+  //
+  // To be able to use signed 16-bit integers, we use the following trick to
+  // have constants within range:
+  // - Associated constants are obtained by subtracting the 16-bit fixed point
+  //   version of one:
+  //      k = K - (1 << 16)  =>  K = k + (1 << 16)
+  //      K1 = 85267  =>  k1 =  20091
+  //      K2 = 35468  =>  k2 = -30068
+  // - The multiplication of a variable by a constant become the sum of the
+  //   variable and the multiplication of that variable by the associated
+  //   constant:
+  //      (x * K) >> 16 = (x * (k + (1 << 16))) >> 16 = ((x * k ) >> 16) + x
+  const __m128i k1 = _mm_set1_epi16(20091);
+  const __m128i k2 = _mm_set1_epi16(-30068);
+  __m128i T0, T1, T2, T3;
+
+  // Load and concatenate the transform coefficients (we'll do two inverse
+  // transforms in parallel). In the case of only one inverse transform, the
+  // second half of the vectors will just contain random value we'll never
+  // use nor store.
+  __m128i in0, in1, in2, in3;
+  {
+    in0 = _mm_loadl_epi64((const __m128i*)&in[0]);
+    in1 = _mm_loadl_epi64((const __m128i*)&in[4]);
+    in2 = _mm_loadl_epi64((const __m128i*)&in[8]);
+    in3 = _mm_loadl_epi64((const __m128i*)&in[12]);
+    // a00 a10 a20 a30   x x x x
+    // a01 a11 a21 a31   x x x x
+    // a02 a12 a22 a32   x x x x
+    // a03 a13 a23 a33   x x x x
+    if (do_two) {
+      const __m128i inB0 = _mm_loadl_epi64((const __m128i*)&in[16]);
+      const __m128i inB1 = _mm_loadl_epi64((const __m128i*)&in[20]);
+      const __m128i inB2 = _mm_loadl_epi64((const __m128i*)&in[24]);
+      const __m128i inB3 = _mm_loadl_epi64((const __m128i*)&in[28]);
+      in0 = _mm_unpacklo_epi64(in0, inB0);
+      in1 = _mm_unpacklo_epi64(in1, inB1);
+      in2 = _mm_unpacklo_epi64(in2, inB2);
+      in3 = _mm_unpacklo_epi64(in3, inB3);
+      // a00 a10 a20 a30   b00 b10 b20 b30
+      // a01 a11 a21 a31   b01 b11 b21 b31
+      // a02 a12 a22 a32   b02 b12 b22 b32
+      // a03 a13 a23 a33   b03 b13 b23 b33
+    }
+  }
+
+  // Vertical pass and subsequent transpose.
+  {
+    // First pass, c and d calculations are longer because of the "trick"
+    // multiplications.
+    const __m128i a = _mm_add_epi16(in0, in2);
+    const __m128i b = _mm_sub_epi16(in0, in2);
+    // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3
+    const __m128i c1 = _mm_mulhi_epi16(in1, k2);
+    const __m128i c2 = _mm_mulhi_epi16(in3, k1);
+    const __m128i c3 = _mm_sub_epi16(in1, in3);
+    const __m128i c4 = _mm_sub_epi16(c1, c2);
+    const __m128i c = _mm_add_epi16(c3, c4);
+    // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3
+    const __m128i d1 = _mm_mulhi_epi16(in1, k1);
+    const __m128i d2 = _mm_mulhi_epi16(in3, k2);
+    const __m128i d3 = _mm_add_epi16(in1, in3);
+    const __m128i d4 = _mm_add_epi16(d1, d2);
+    const __m128i d = _mm_add_epi16(d3, d4);
+
+    // Second pass.
+    const __m128i tmp0 = _mm_add_epi16(a, d);
+    const __m128i tmp1 = _mm_add_epi16(b, c);
+    const __m128i tmp2 = _mm_sub_epi16(b, c);
+    const __m128i tmp3 = _mm_sub_epi16(a, d);
+
+    // Transpose the two 4x4.
+    VP8Transpose_2_4x4_16b(&tmp0, &tmp1, &tmp2, &tmp3, &T0, &T1, &T2, &T3);
+  }
+
+  // Horizontal pass and subsequent transpose.
+  {
+    // First pass, c and d calculations are longer because of the "trick"
+    // multiplications.
+    const __m128i four = _mm_set1_epi16(4);
+    const __m128i dc = _mm_add_epi16(T0, four);
+    const __m128i a =  _mm_add_epi16(dc, T2);
+    const __m128i b =  _mm_sub_epi16(dc, T2);
+    // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3
+    const __m128i c1 = _mm_mulhi_epi16(T1, k2);
+    const __m128i c2 = _mm_mulhi_epi16(T3, k1);
+    const __m128i c3 = _mm_sub_epi16(T1, T3);
+    const __m128i c4 = _mm_sub_epi16(c1, c2);
+    const __m128i c = _mm_add_epi16(c3, c4);
+    // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3
+    const __m128i d1 = _mm_mulhi_epi16(T1, k1);
+    const __m128i d2 = _mm_mulhi_epi16(T3, k2);
+    const __m128i d3 = _mm_add_epi16(T1, T3);
+    const __m128i d4 = _mm_add_epi16(d1, d2);
+    const __m128i d = _mm_add_epi16(d3, d4);
+
+    // Second pass.
+    const __m128i tmp0 = _mm_add_epi16(a, d);
+    const __m128i tmp1 = _mm_add_epi16(b, c);
+    const __m128i tmp2 = _mm_sub_epi16(b, c);
+    const __m128i tmp3 = _mm_sub_epi16(a, d);
+    const __m128i shifted0 = _mm_srai_epi16(tmp0, 3);
+    const __m128i shifted1 = _mm_srai_epi16(tmp1, 3);
+    const __m128i shifted2 = _mm_srai_epi16(tmp2, 3);
+    const __m128i shifted3 = _mm_srai_epi16(tmp3, 3);
+
+    // Transpose the two 4x4.
+    VP8Transpose_2_4x4_16b(&shifted0, &shifted1, &shifted2, &shifted3, &T0, &T1,
+                           &T2, &T3);
+  }
+
+  // Add inverse transform to 'ref' and store.
+  {
+    const __m128i zero = _mm_setzero_si128();
+    // Load the reference(s).
+    __m128i ref0, ref1, ref2, ref3;
+    if (do_two) {
+      // Load eight bytes/pixels per line.
+      ref0 = _mm_loadl_epi64((const __m128i*)&ref[0 * BPS]);
+      ref1 = _mm_loadl_epi64((const __m128i*)&ref[1 * BPS]);
+      ref2 = _mm_loadl_epi64((const __m128i*)&ref[2 * BPS]);
+      ref3 = _mm_loadl_epi64((const __m128i*)&ref[3 * BPS]);
+    } else {
+      // Load four bytes/pixels per line.
+      ref0 = _mm_cvtsi32_si128(WebPMemToUint32(&ref[0 * BPS]));
+      ref1 = _mm_cvtsi32_si128(WebPMemToUint32(&ref[1 * BPS]));
+      ref2 = _mm_cvtsi32_si128(WebPMemToUint32(&ref[2 * BPS]));
+      ref3 = _mm_cvtsi32_si128(WebPMemToUint32(&ref[3 * BPS]));
+    }
+    // Convert to 16b.
+    ref0 = _mm_unpacklo_epi8(ref0, zero);
+    ref1 = _mm_unpacklo_epi8(ref1, zero);
+    ref2 = _mm_unpacklo_epi8(ref2, zero);
+    ref3 = _mm_unpacklo_epi8(ref3, zero);
+    // Add the inverse transform(s).
+    ref0 = _mm_add_epi16(ref0, T0);
+    ref1 = _mm_add_epi16(ref1, T1);
+    ref2 = _mm_add_epi16(ref2, T2);
+    ref3 = _mm_add_epi16(ref3, T3);
+    // Unsigned saturate to 8b.
+    ref0 = _mm_packus_epi16(ref0, ref0);
+    ref1 = _mm_packus_epi16(ref1, ref1);
+    ref2 = _mm_packus_epi16(ref2, ref2);
+    ref3 = _mm_packus_epi16(ref3, ref3);
+    // Store the results.
+    if (do_two) {
+      // Store eight bytes/pixels per line.
+      _mm_storel_epi64((__m128i*)&dst[0 * BPS], ref0);
+      _mm_storel_epi64((__m128i*)&dst[1 * BPS], ref1);
+      _mm_storel_epi64((__m128i*)&dst[2 * BPS], ref2);
+      _mm_storel_epi64((__m128i*)&dst[3 * BPS], ref3);
+    } else {
+      // Store four bytes/pixels per line.
+      WebPUint32ToMem(&dst[0 * BPS], _mm_cvtsi128_si32(ref0));
+      WebPUint32ToMem(&dst[1 * BPS], _mm_cvtsi128_si32(ref1));
+      WebPUint32ToMem(&dst[2 * BPS], _mm_cvtsi128_si32(ref2));
+      WebPUint32ToMem(&dst[3 * BPS], _mm_cvtsi128_si32(ref3));
+    }
+  }
+}
+
+static void FTransformPass1_SSE2(const __m128i* const in01,
+                                 const __m128i* const in23,
+                                 __m128i* const out01,
+                                 __m128i* const out32) {
+  const __m128i k937 = _mm_set1_epi32(937);
+  const __m128i k1812 = _mm_set1_epi32(1812);
+
+  const __m128i k88p = _mm_set_epi16(8, 8, 8, 8, 8, 8, 8, 8);
+  const __m128i k88m = _mm_set_epi16(-8, 8, -8, 8, -8, 8, -8, 8);
+  const __m128i k5352_2217p = _mm_set_epi16(2217, 5352, 2217, 5352,
+                                            2217, 5352, 2217, 5352);
+  const __m128i k5352_2217m = _mm_set_epi16(-5352, 2217, -5352, 2217,
+                                            -5352, 2217, -5352, 2217);
+
+  // *in01 = 00 01 10 11 02 03 12 13
+  // *in23 = 20 21 30 31 22 23 32 33
+  const __m128i shuf01_p = _mm_shufflehi_epi16(*in01, _MM_SHUFFLE(2, 3, 0, 1));
+  const __m128i shuf23_p = _mm_shufflehi_epi16(*in23, _MM_SHUFFLE(2, 3, 0, 1));
+  // 00 01 10 11 03 02 13 12
+  // 20 21 30 31 23 22 33 32
+  const __m128i s01 = _mm_unpacklo_epi64(shuf01_p, shuf23_p);
+  const __m128i s32 = _mm_unpackhi_epi64(shuf01_p, shuf23_p);
+  // 00 01 10 11 20 21 30 31
+  // 03 02 13 12 23 22 33 32
+  const __m128i a01 = _mm_add_epi16(s01, s32);
+  const __m128i a32 = _mm_sub_epi16(s01, s32);
+  // [d0 + d3 | d1 + d2 | ...] = [a0 a1 | a0' a1' | ... ]
+  // [d0 - d3 | d1 - d2 | ...] = [a3 a2 | a3' a2' | ... ]
+
+  const __m128i tmp0   = _mm_madd_epi16(a01, k88p);  // [ (a0 + a1) << 3, ... ]
+  const __m128i tmp2   = _mm_madd_epi16(a01, k88m);  // [ (a0 - a1) << 3, ... ]
+  const __m128i tmp1_1 = _mm_madd_epi16(a32, k5352_2217p);
+  const __m128i tmp3_1 = _mm_madd_epi16(a32, k5352_2217m);
+  const __m128i tmp1_2 = _mm_add_epi32(tmp1_1, k1812);
+  const __m128i tmp3_2 = _mm_add_epi32(tmp3_1, k937);
+  const __m128i tmp1   = _mm_srai_epi32(tmp1_2, 9);
+  const __m128i tmp3   = _mm_srai_epi32(tmp3_2, 9);
+  const __m128i s03    = _mm_packs_epi32(tmp0, tmp2);
+  const __m128i s12    = _mm_packs_epi32(tmp1, tmp3);
+  const __m128i s_lo   = _mm_unpacklo_epi16(s03, s12);   // 0 1 0 1 0 1...
+  const __m128i s_hi   = _mm_unpackhi_epi16(s03, s12);   // 2 3 2 3 2 3
+  const __m128i v23    = _mm_unpackhi_epi32(s_lo, s_hi);
+  *out01 = _mm_unpacklo_epi32(s_lo, s_hi);
+  *out32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2));  // 3 2 3 2 3 2..
+}
+
+static void FTransformPass2_SSE2(const __m128i* const v01,
+                                 const __m128i* const v32,
+                                 int16_t* out) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i seven = _mm_set1_epi16(7);
+  const __m128i k5352_2217 = _mm_set_epi16(5352,  2217, 5352,  2217,
+                                           5352,  2217, 5352,  2217);
+  const __m128i k2217_5352 = _mm_set_epi16(2217, -5352, 2217, -5352,
+                                           2217, -5352, 2217, -5352);
+  const __m128i k12000_plus_one = _mm_set1_epi32(12000 + (1 << 16));
+  const __m128i k51000 = _mm_set1_epi32(51000);
+
+  // Same operations are done on the (0,3) and (1,2) pairs.
+  // a3 = v0 - v3
+  // a2 = v1 - v2
+  const __m128i a32 = _mm_sub_epi16(*v01, *v32);
+  const __m128i a22 = _mm_unpackhi_epi64(a32, a32);
+
+  const __m128i b23 = _mm_unpacklo_epi16(a22, a32);
+  const __m128i c1 = _mm_madd_epi16(b23, k5352_2217);
+  const __m128i c3 = _mm_madd_epi16(b23, k2217_5352);
+  const __m128i d1 = _mm_add_epi32(c1, k12000_plus_one);
+  const __m128i d3 = _mm_add_epi32(c3, k51000);
+  const __m128i e1 = _mm_srai_epi32(d1, 16);
+  const __m128i e3 = _mm_srai_epi32(d3, 16);
+  // f1 = ((b3 * 5352 + b2 * 2217 + 12000) >> 16)
+  // f3 = ((b3 * 2217 - b2 * 5352 + 51000) >> 16)
+  const __m128i f1 = _mm_packs_epi32(e1, e1);
+  const __m128i f3 = _mm_packs_epi32(e3, e3);
+  // g1 = f1 + (a3 != 0);
+  // The compare will return (0xffff, 0) for (==0, !=0). To turn that into the
+  // desired (0, 1), we add one earlier through k12000_plus_one.
+  // -> g1 = f1 + 1 - (a3 == 0)
+  const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero));
+
+  // a0 = v0 + v3
+  // a1 = v1 + v2
+  const __m128i a01 = _mm_add_epi16(*v01, *v32);
+  const __m128i a01_plus_7 = _mm_add_epi16(a01, seven);
+  const __m128i a11 = _mm_unpackhi_epi64(a01, a01);
+  const __m128i c0 = _mm_add_epi16(a01_plus_7, a11);
+  const __m128i c2 = _mm_sub_epi16(a01_plus_7, a11);
+  // d0 = (a0 + a1 + 7) >> 4;
+  // d2 = (a0 - a1 + 7) >> 4;
+  const __m128i d0 = _mm_srai_epi16(c0, 4);
+  const __m128i d2 = _mm_srai_epi16(c2, 4);
+
+  const __m128i d0_g1 = _mm_unpacklo_epi64(d0, g1);
+  const __m128i d2_f3 = _mm_unpacklo_epi64(d2, f3);
+  _mm_storeu_si128((__m128i*)&out[0], d0_g1);
+  _mm_storeu_si128((__m128i*)&out[8], d2_f3);
+}
+
+static void FTransform_SSE2(const uint8_t* src, const uint8_t* ref,
+                            int16_t* out) {
+  const __m128i zero = _mm_setzero_si128();
+  // Load src.
+  const __m128i src0 = _mm_loadl_epi64((const __m128i*)&src[0 * BPS]);
+  const __m128i src1 = _mm_loadl_epi64((const __m128i*)&src[1 * BPS]);
+  const __m128i src2 = _mm_loadl_epi64((const __m128i*)&src[2 * BPS]);
+  const __m128i src3 = _mm_loadl_epi64((const __m128i*)&src[3 * BPS]);
+  // 00 01 02 03 *
+  // 10 11 12 13 *
+  // 20 21 22 23 *
+  // 30 31 32 33 *
+  // Shuffle.
+  const __m128i src_0 = _mm_unpacklo_epi16(src0, src1);
+  const __m128i src_1 = _mm_unpacklo_epi16(src2, src3);
+  // 00 01 10 11 02 03 12 13 * * ...
+  // 20 21 30 31 22 22 32 33 * * ...
+
+  // Load ref.
+  const __m128i ref0 = _mm_loadl_epi64((const __m128i*)&ref[0 * BPS]);
+  const __m128i ref1 = _mm_loadl_epi64((const __m128i*)&ref[1 * BPS]);
+  const __m128i ref2 = _mm_loadl_epi64((const __m128i*)&ref[2 * BPS]);
+  const __m128i ref3 = _mm_loadl_epi64((const __m128i*)&ref[3 * BPS]);
+  const __m128i ref_0 = _mm_unpacklo_epi16(ref0, ref1);
+  const __m128i ref_1 = _mm_unpacklo_epi16(ref2, ref3);
+
+  // Convert both to 16 bit.
+  const __m128i src_0_16b = _mm_unpacklo_epi8(src_0, zero);
+  const __m128i src_1_16b = _mm_unpacklo_epi8(src_1, zero);
+  const __m128i ref_0_16b = _mm_unpacklo_epi8(ref_0, zero);
+  const __m128i ref_1_16b = _mm_unpacklo_epi8(ref_1, zero);
+
+  // Compute the difference.
+  const __m128i row01 = _mm_sub_epi16(src_0_16b, ref_0_16b);
+  const __m128i row23 = _mm_sub_epi16(src_1_16b, ref_1_16b);
+  __m128i v01, v32;
+
+  // First pass
+  FTransformPass1_SSE2(&row01, &row23, &v01, &v32);
+
+  // Second pass
+  FTransformPass2_SSE2(&v01, &v32, out);
+}
+
+static void FTransform2_SSE2(const uint8_t* src, const uint8_t* ref,
+                             int16_t* out) {
+  const __m128i zero = _mm_setzero_si128();
+
+  // Load src and convert to 16b.
+  const __m128i src0 = _mm_loadl_epi64((const __m128i*)&src[0 * BPS]);
+  const __m128i src1 = _mm_loadl_epi64((const __m128i*)&src[1 * BPS]);
+  const __m128i src2 = _mm_loadl_epi64((const __m128i*)&src[2 * BPS]);
+  const __m128i src3 = _mm_loadl_epi64((const __m128i*)&src[3 * BPS]);
+  const __m128i src_0 = _mm_unpacklo_epi8(src0, zero);
+  const __m128i src_1 = _mm_unpacklo_epi8(src1, zero);
+  const __m128i src_2 = _mm_unpacklo_epi8(src2, zero);
+  const __m128i src_3 = _mm_unpacklo_epi8(src3, zero);
+  // Load ref and convert to 16b.
+  const __m128i ref0 = _mm_loadl_epi64((const __m128i*)&ref[0 * BPS]);
+  const __m128i ref1 = _mm_loadl_epi64((const __m128i*)&ref[1 * BPS]);
+  const __m128i ref2 = _mm_loadl_epi64((const __m128i*)&ref[2 * BPS]);
+  const __m128i ref3 = _mm_loadl_epi64((const __m128i*)&ref[3 * BPS]);
+  const __m128i ref_0 = _mm_unpacklo_epi8(ref0, zero);
+  const __m128i ref_1 = _mm_unpacklo_epi8(ref1, zero);
+  const __m128i ref_2 = _mm_unpacklo_epi8(ref2, zero);
+  const __m128i ref_3 = _mm_unpacklo_epi8(ref3, zero);
+  // Compute difference. -> 00 01 02 03  00' 01' 02' 03'
+  const __m128i diff0 = _mm_sub_epi16(src_0, ref_0);
+  const __m128i diff1 = _mm_sub_epi16(src_1, ref_1);
+  const __m128i diff2 = _mm_sub_epi16(src_2, ref_2);
+  const __m128i diff3 = _mm_sub_epi16(src_3, ref_3);
+
+  // Unpack and shuffle
+  // 00 01 02 03   0 0 0 0
+  // 10 11 12 13   0 0 0 0
+  // 20 21 22 23   0 0 0 0
+  // 30 31 32 33   0 0 0 0
+  const __m128i shuf01l = _mm_unpacklo_epi32(diff0, diff1);
+  const __m128i shuf23l = _mm_unpacklo_epi32(diff2, diff3);
+  const __m128i shuf01h = _mm_unpackhi_epi32(diff0, diff1);
+  const __m128i shuf23h = _mm_unpackhi_epi32(diff2, diff3);
+  __m128i v01l, v32l;
+  __m128i v01h, v32h;
+
+  // First pass
+  FTransformPass1_SSE2(&shuf01l, &shuf23l, &v01l, &v32l);
+  FTransformPass1_SSE2(&shuf01h, &shuf23h, &v01h, &v32h);
+
+  // Second pass
+  FTransformPass2_SSE2(&v01l, &v32l, out + 0);
+  FTransformPass2_SSE2(&v01h, &v32h, out + 16);
+}
+
+static void FTransformWHTRow_SSE2(const int16_t* const in, __m128i* const out) {
+  const __m128i kMult = _mm_set_epi16(-1, 1, -1, 1, 1, 1, 1, 1);
+  const __m128i src0 = _mm_loadl_epi64((__m128i*)&in[0 * 16]);
+  const __m128i src1 = _mm_loadl_epi64((__m128i*)&in[1 * 16]);
+  const __m128i src2 = _mm_loadl_epi64((__m128i*)&in[2 * 16]);
+  const __m128i src3 = _mm_loadl_epi64((__m128i*)&in[3 * 16]);
+  const __m128i A01 = _mm_unpacklo_epi16(src0, src1);  // A0 A1 | ...
+  const __m128i A23 = _mm_unpacklo_epi16(src2, src3);  // A2 A3 | ...
+  const __m128i B0 = _mm_adds_epi16(A01, A23);    // a0 | a1 | ...
+  const __m128i B1 = _mm_subs_epi16(A01, A23);    // a3 | a2 | ...
+  const __m128i C0 = _mm_unpacklo_epi32(B0, B1);  // a0 | a1 | a3 | a2 | ...
+  const __m128i C1 = _mm_unpacklo_epi32(B1, B0);  // a3 | a2 | a0 | a1 | ...
+  const __m128i D = _mm_unpacklo_epi64(C0, C1);   // a0 a1 a3 a2 a3 a2 a0 a1
+  *out = _mm_madd_epi16(D, kMult);
+}
+
+static void FTransformWHT_SSE2(const int16_t* in, int16_t* out) {
+  // Input is 12b signed.
+  __m128i row0, row1, row2, row3;
+  // Rows are 14b signed.
+  FTransformWHTRow_SSE2(in + 0 * 64, &row0);
+  FTransformWHTRow_SSE2(in + 1 * 64, &row1);
+  FTransformWHTRow_SSE2(in + 2 * 64, &row2);
+  FTransformWHTRow_SSE2(in + 3 * 64, &row3);
+
+  {
+    // The a* are 15b signed.
+    const __m128i a0 = _mm_add_epi32(row0, row2);
+    const __m128i a1 = _mm_add_epi32(row1, row3);
+    const __m128i a2 = _mm_sub_epi32(row1, row3);
+    const __m128i a3 = _mm_sub_epi32(row0, row2);
+    const __m128i a0a3 = _mm_packs_epi32(a0, a3);
+    const __m128i a1a2 = _mm_packs_epi32(a1, a2);
+
+    // The b* are 16b signed.
+    const __m128i b0b1 = _mm_add_epi16(a0a3, a1a2);
+    const __m128i b3b2 = _mm_sub_epi16(a0a3, a1a2);
+    const __m128i tmp_b2b3 = _mm_unpackhi_epi64(b3b2, b3b2);
+    const __m128i b2b3 = _mm_unpacklo_epi64(tmp_b2b3, b3b2);
+
+    _mm_storeu_si128((__m128i*)&out[0], _mm_srai_epi16(b0b1, 1));
+    _mm_storeu_si128((__m128i*)&out[8], _mm_srai_epi16(b2b3, 1));
+  }
+}
+
+//------------------------------------------------------------------------------
+// Compute susceptibility based on DCT-coeff histograms:
+// the higher, the "easier" the macroblock is to compress.
+
+static void CollectHistogram_SSE2(const uint8_t* ref, const uint8_t* pred,
+                                  int start_block, int end_block,
+                                  VP8Histogram* const histo) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);
+  int j;
+  int distribution[MAX_COEFF_THRESH + 1] = { 0 };
+  for (j = start_block; j < end_block; ++j) {
+    int16_t out[16];
+    int k;
+
+    FTransform_SSE2(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
+
+    // Convert coefficients to bin (within out[]).
+    {
+      // Load.
+      const __m128i out0 = _mm_loadu_si128((__m128i*)&out[0]);
+      const __m128i out1 = _mm_loadu_si128((__m128i*)&out[8]);
+      const __m128i d0 = _mm_sub_epi16(zero, out0);
+      const __m128i d1 = _mm_sub_epi16(zero, out1);
+      const __m128i abs0 = _mm_max_epi16(out0, d0);   // abs(v), 16b
+      const __m128i abs1 = _mm_max_epi16(out1, d1);
+      // v = abs(out) >> 3
+      const __m128i v0 = _mm_srai_epi16(abs0, 3);
+      const __m128i v1 = _mm_srai_epi16(abs1, 3);
+      // bin = min(v, MAX_COEFF_THRESH)
+      const __m128i bin0 = _mm_min_epi16(v0, max_coeff_thresh);
+      const __m128i bin1 = _mm_min_epi16(v1, max_coeff_thresh);
+      // Store.
+      _mm_storeu_si128((__m128i*)&out[0], bin0);
+      _mm_storeu_si128((__m128i*)&out[8], bin1);
+    }
+
+    // Convert coefficients to bin.
+    for (k = 0; k < 16; ++k) {
+      ++distribution[out[k]];
+    }
+  }
+  VP8SetHistogramData(distribution, histo);
+}
+
+//------------------------------------------------------------------------------
+// Intra predictions
+
+// helper for chroma-DC predictions
+static WEBP_INLINE void Put8x8uv_SSE2(uint8_t v, uint8_t* dst) {
+  int j;
+  const __m128i values = _mm_set1_epi8(v);
+  for (j = 0; j < 8; ++j) {
+    _mm_storel_epi64((__m128i*)(dst + j * BPS), values);
+  }
+}
+
+static WEBP_INLINE void Put16_SSE2(uint8_t v, uint8_t* dst) {
+  int j;
+  const __m128i values = _mm_set1_epi8(v);
+  for (j = 0; j < 16; ++j) {
+    _mm_store_si128((__m128i*)(dst + j * BPS), values);
+  }
+}
+
+static WEBP_INLINE void Fill_SSE2(uint8_t* dst, int value, int size) {
+  if (size == 4) {
+    int j;
+    for (j = 0; j < 4; ++j) {
+      memset(dst + j * BPS, value, 4);
+    }
+  } else if (size == 8) {
+    Put8x8uv_SSE2(value, dst);
+  } else {
+    Put16_SSE2(value, dst);
+  }
+}
+
+static WEBP_INLINE void VE8uv_SSE2(uint8_t* dst, const uint8_t* top) {
+  int j;
+  const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
+  for (j = 0; j < 8; ++j) {
+    _mm_storel_epi64((__m128i*)(dst + j * BPS), top_values);
+  }
+}
+
+static WEBP_INLINE void VE16_SSE2(uint8_t* dst, const uint8_t* top) {
+  const __m128i top_values = _mm_load_si128((const __m128i*)top);
+  int j;
+  for (j = 0; j < 16; ++j) {
+    _mm_store_si128((__m128i*)(dst + j * BPS), top_values);
+  }
+}
+
+static WEBP_INLINE void VerticalPred_SSE2(uint8_t* dst,
+                                          const uint8_t* top, int size) {
+  if (top != NULL) {
+    if (size == 8) {
+      VE8uv_SSE2(dst, top);
+    } else {
+      VE16_SSE2(dst, top);
+    }
+  } else {
+    Fill_SSE2(dst, 127, size);
+  }
+}
+
+static WEBP_INLINE void HE8uv_SSE2(uint8_t* dst, const uint8_t* left) {
+  int j;
+  for (j = 0; j < 8; ++j) {
+    const __m128i values = _mm_set1_epi8(left[j]);
+    _mm_storel_epi64((__m128i*)dst, values);
+    dst += BPS;
+  }
+}
+
+static WEBP_INLINE void HE16_SSE2(uint8_t* dst, const uint8_t* left) {
+  int j;
+  for (j = 0; j < 16; ++j) {
+    const __m128i values = _mm_set1_epi8(left[j]);
+    _mm_store_si128((__m128i*)dst, values);
+    dst += BPS;
+  }
+}
+
+static WEBP_INLINE void HorizontalPred_SSE2(uint8_t* dst,
+                                            const uint8_t* left, int size) {
+  if (left != NULL) {
+    if (size == 8) {
+      HE8uv_SSE2(dst, left);
+    } else {
+      HE16_SSE2(dst, left);
+    }
+  } else {
+    Fill_SSE2(dst, 129, size);
+  }
+}
+
+static WEBP_INLINE void TM_SSE2(uint8_t* dst, const uint8_t* left,
+                                const uint8_t* top, int size) {
+  const __m128i zero = _mm_setzero_si128();
+  int y;
+  if (size == 8) {
+    const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
+    const __m128i top_base = _mm_unpacklo_epi8(top_values, zero);
+    for (y = 0; y < 8; ++y, dst += BPS) {
+      const int val = left[y] - left[-1];
+      const __m128i base = _mm_set1_epi16(val);
+      const __m128i out = _mm_packus_epi16(_mm_add_epi16(base, top_base), zero);
+      _mm_storel_epi64((__m128i*)dst, out);
+    }
+  } else {
+    const __m128i top_values = _mm_load_si128((const __m128i*)top);
+    const __m128i top_base_0 = _mm_unpacklo_epi8(top_values, zero);
+    const __m128i top_base_1 = _mm_unpackhi_epi8(top_values, zero);
+    for (y = 0; y < 16; ++y, dst += BPS) {
+      const int val = left[y] - left[-1];
+      const __m128i base = _mm_set1_epi16(val);
+      const __m128i out_0 = _mm_add_epi16(base, top_base_0);
+      const __m128i out_1 = _mm_add_epi16(base, top_base_1);
+      const __m128i out = _mm_packus_epi16(out_0, out_1);
+      _mm_store_si128((__m128i*)dst, out);
+    }
+  }
+}
+
+static WEBP_INLINE void TrueMotion_SSE2(uint8_t* dst, const uint8_t* left,
+                                        const uint8_t* top, int size) {
+  if (left != NULL) {
+    if (top != NULL) {
+      TM_SSE2(dst, left, top, size);
+    } else {
+      HorizontalPred_SSE2(dst, left, size);
+    }
+  } else {
+    // true motion without left samples (hence: with default 129 value)
+    // is equivalent to VE prediction where you just copy the top samples.
+    // Note that if top samples are not available, the default value is
+    // then 129, and not 127 as in the VerticalPred case.
+    if (top != NULL) {
+      VerticalPred_SSE2(dst, top, size);
+    } else {
+      Fill_SSE2(dst, 129, size);
+    }
+  }
+}
+
+static WEBP_INLINE void DC8uv_SSE2(uint8_t* dst, const uint8_t* left,
+                                   const uint8_t* top) {
+  const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
+  const __m128i left_values = _mm_loadl_epi64((const __m128i*)left);
+  const __m128i combined = _mm_unpacklo_epi64(top_values, left_values);
+  const int DC = VP8HorizontalAdd8b(&combined) + 8;
+  Put8x8uv_SSE2(DC >> 4, dst);
+}
+
+static WEBP_INLINE void DC8uvNoLeft_SSE2(uint8_t* dst, const uint8_t* top) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
+  const __m128i sum = _mm_sad_epu8(top_values, zero);
+  const int DC = _mm_cvtsi128_si32(sum) + 4;
+  Put8x8uv_SSE2(DC >> 3, dst);
+}
+
+static WEBP_INLINE void DC8uvNoTop_SSE2(uint8_t* dst, const uint8_t* left) {
+  // 'left' is contiguous so we can reuse the top summation.
+  DC8uvNoLeft_SSE2(dst, left);
+}
+
+static WEBP_INLINE void DC8uvNoTopLeft_SSE2(uint8_t* dst) {
+  Put8x8uv_SSE2(0x80, dst);
+}
+
+static WEBP_INLINE void DC8uvMode_SSE2(uint8_t* dst, const uint8_t* left,
+                                       const uint8_t* top) {
+  if (top != NULL) {
+    if (left != NULL) {  // top and left present
+      DC8uv_SSE2(dst, left, top);
+    } else {  // top, but no left
+      DC8uvNoLeft_SSE2(dst, top);
+    }
+  } else if (left != NULL) {  // left but no top
+    DC8uvNoTop_SSE2(dst, left);
+  } else {  // no top, no left, nothing.
+    DC8uvNoTopLeft_SSE2(dst);
+  }
+}
+
+static WEBP_INLINE void DC16_SSE2(uint8_t* dst, const uint8_t* left,
+                                  const uint8_t* top) {
+  const __m128i top_row = _mm_load_si128((const __m128i*)top);
+  const __m128i left_row = _mm_load_si128((const __m128i*)left);
+  const int DC =
+      VP8HorizontalAdd8b(&top_row) + VP8HorizontalAdd8b(&left_row) + 16;
+  Put16_SSE2(DC >> 5, dst);
+}
+
+static WEBP_INLINE void DC16NoLeft_SSE2(uint8_t* dst, const uint8_t* top) {
+  const __m128i top_row = _mm_load_si128((const __m128i*)top);
+  const int DC = VP8HorizontalAdd8b(&top_row) + 8;
+  Put16_SSE2(DC >> 4, dst);
+}
+
+static WEBP_INLINE void DC16NoTop_SSE2(uint8_t* dst, const uint8_t* left) {
+  // 'left' is contiguous so we can reuse the top summation.
+  DC16NoLeft_SSE2(dst, left);
+}
+
+static WEBP_INLINE void DC16NoTopLeft_SSE2(uint8_t* dst) {
+  Put16_SSE2(0x80, dst);
+}
+
+static WEBP_INLINE void DC16Mode_SSE2(uint8_t* dst, const uint8_t* left,
+                                      const uint8_t* top) {
+  if (top != NULL) {
+    if (left != NULL) {  // top and left present
+      DC16_SSE2(dst, left, top);
+    } else {  // top, but no left
+      DC16NoLeft_SSE2(dst, top);
+    }
+  } else if (left != NULL) {  // left but no top
+    DC16NoTop_SSE2(dst, left);
+  } else {  // no top, no left, nothing.
+    DC16NoTopLeft_SSE2(dst);
+  }
+}
+
+//------------------------------------------------------------------------------
+// 4x4 predictions
+
+#define DST(x, y) dst[(x) + (y) * BPS]
+#define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
+#define AVG2(a, b) (((a) + (b) + 1) >> 1)
+
+// We use the following 8b-arithmetic tricks:
+//     (a + 2 * b + c + 2) >> 2 = (AC + b + 1) >> 1
+//   where: AC = (a + c) >> 1 = [(a + c + 1) >> 1] - [(a^c) & 1]
+// and:
+//     (a + 2 * b + c + 2) >> 2 = (AB + BC + 1) >> 1 - (ab|bc)&lsb
+//   where: AC = (a + b + 1) >> 1,   BC = (b + c + 1) >> 1
+//   and ab = a ^ b, bc = b ^ c, lsb = (AC^BC)&1
+
+static WEBP_INLINE void VE4_SSE2(uint8_t* dst,
+                                 const uint8_t* top) {  // vertical
+  const __m128i one = _mm_set1_epi8(1);
+  const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(top - 1));
+  const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1);
+  const __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 2);
+  const __m128i a = _mm_avg_epu8(ABCDEFGH, CDEFGH00);
+  const __m128i lsb = _mm_and_si128(_mm_xor_si128(ABCDEFGH, CDEFGH00), one);
+  const __m128i b = _mm_subs_epu8(a, lsb);
+  const __m128i avg = _mm_avg_epu8(b, BCDEFGH0);
+  const uint32_t vals = _mm_cvtsi128_si32(avg);
+  int i;
+  for (i = 0; i < 4; ++i) {
+    WebPUint32ToMem(dst + i * BPS, vals);
+  }
+}
+
+static WEBP_INLINE void HE4_SSE2(uint8_t* dst,
+                                 const uint8_t* top) {  // horizontal
+  const int X = top[-1];
+  const int I = top[-2];
+  const int J = top[-3];
+  const int K = top[-4];
+  const int L = top[-5];
+  WebPUint32ToMem(dst + 0 * BPS, 0x01010101U * AVG3(X, I, J));
+  WebPUint32ToMem(dst + 1 * BPS, 0x01010101U * AVG3(I, J, K));
+  WebPUint32ToMem(dst + 2 * BPS, 0x01010101U * AVG3(J, K, L));
+  WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(K, L, L));
+}
+
+static WEBP_INLINE void DC4_SSE2(uint8_t* dst, const uint8_t* top) {
+  uint32_t dc = 4;
+  int i;
+  for (i = 0; i < 4; ++i) dc += top[i] + top[-5 + i];
+  Fill_SSE2(dst, dc >> 3, 4);
+}
+
+static WEBP_INLINE void LD4_SSE2(uint8_t* dst,
+                                 const uint8_t* top) {  // Down-Left
+  const __m128i one = _mm_set1_epi8(1);
+  const __m128i ABCDEFGH = _mm_loadl_epi64((const __m128i*)top);
+  const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1);
+  const __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 2);
+  const __m128i CDEFGHH0 = _mm_insert_epi16(CDEFGH00, top[7], 3);
+  const __m128i avg1 = _mm_avg_epu8(ABCDEFGH, CDEFGHH0);
+  const __m128i lsb = _mm_and_si128(_mm_xor_si128(ABCDEFGH, CDEFGHH0), one);
+  const __m128i avg2 = _mm_subs_epu8(avg1, lsb);
+  const __m128i abcdefg = _mm_avg_epu8(avg2, BCDEFGH0);
+  WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(               abcdefg    ));
+  WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1)));
+  WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2)));
+  WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)));
+}
+
+static WEBP_INLINE void VR4_SSE2(uint8_t* dst,
+                                 const uint8_t* top) {  // Vertical-Right
+  const __m128i one = _mm_set1_epi8(1);
+  const int I = top[-2];
+  const int J = top[-3];
+  const int K = top[-4];
+  const int X = top[-1];
+  const __m128i XABCD = _mm_loadl_epi64((const __m128i*)(top - 1));
+  const __m128i ABCD0 = _mm_srli_si128(XABCD, 1);
+  const __m128i abcd = _mm_avg_epu8(XABCD, ABCD0);
+  const __m128i _XABCD = _mm_slli_si128(XABCD, 1);
+  const __m128i IXABCD = _mm_insert_epi16(_XABCD, (short)(I | (X << 8)), 0);
+  const __m128i avg1 = _mm_avg_epu8(IXABCD, ABCD0);
+  const __m128i lsb = _mm_and_si128(_mm_xor_si128(IXABCD, ABCD0), one);
+  const __m128i avg2 = _mm_subs_epu8(avg1, lsb);
+  const __m128i efgh = _mm_avg_epu8(avg2, XABCD);
+  WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(               abcd    ));
+  WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(               efgh    ));
+  WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_slli_si128(abcd, 1)));
+  WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_slli_si128(efgh, 1)));
+
+  // these two are hard to implement in SSE2, so we keep the C-version:
+  DST(0, 2) = AVG3(J, I, X);
+  DST(0, 3) = AVG3(K, J, I);
+}
+
+static WEBP_INLINE void VL4_SSE2(uint8_t* dst,
+                                 const uint8_t* top) {  // Vertical-Left
+  const __m128i one = _mm_set1_epi8(1);
+  const __m128i ABCDEFGH = _mm_loadl_epi64((const __m128i*)top);
+  const __m128i BCDEFGH_ = _mm_srli_si128(ABCDEFGH, 1);
+  const __m128i CDEFGH__ = _mm_srli_si128(ABCDEFGH, 2);
+  const __m128i avg1 = _mm_avg_epu8(ABCDEFGH, BCDEFGH_);
+  const __m128i avg2 = _mm_avg_epu8(CDEFGH__, BCDEFGH_);
+  const __m128i avg3 = _mm_avg_epu8(avg1, avg2);
+  const __m128i lsb1 = _mm_and_si128(_mm_xor_si128(avg1, avg2), one);
+  const __m128i ab = _mm_xor_si128(ABCDEFGH, BCDEFGH_);
+  const __m128i bc = _mm_xor_si128(CDEFGH__, BCDEFGH_);
+  const __m128i abbc = _mm_or_si128(ab, bc);
+  const __m128i lsb2 = _mm_and_si128(abbc, lsb1);
+  const __m128i avg4 = _mm_subs_epu8(avg3, lsb2);
+  const uint32_t extra_out = _mm_cvtsi128_si32(_mm_srli_si128(avg4, 4));
+  WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(               avg1    ));
+  WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(               avg4    ));
+  WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(avg1, 1)));
+  WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(avg4, 1)));
+
+  // these two are hard to get and irregular
+  DST(3, 2) = (extra_out >> 0) & 0xff;
+  DST(3, 3) = (extra_out >> 8) & 0xff;
+}
+
+static WEBP_INLINE void RD4_SSE2(uint8_t* dst,
+                                 const uint8_t* top) {  // Down-right
+  const __m128i one = _mm_set1_epi8(1);
+  const __m128i LKJIXABC = _mm_loadl_epi64((const __m128i*)(top - 5));
+  const __m128i LKJIXABCD = _mm_insert_epi16(LKJIXABC, top[3], 4);
+  const __m128i KJIXABCD_ = _mm_srli_si128(LKJIXABCD, 1);
+  const __m128i JIXABCD__ = _mm_srli_si128(LKJIXABCD, 2);
+  const __m128i avg1 = _mm_avg_epu8(JIXABCD__, LKJIXABCD);
+  const __m128i lsb = _mm_and_si128(_mm_xor_si128(JIXABCD__, LKJIXABCD), one);
+  const __m128i avg2 = _mm_subs_epu8(avg1, lsb);
+  const __m128i abcdefg = _mm_avg_epu8(avg2, KJIXABCD_);
+  WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(               abcdefg    ));
+  WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1)));
+  WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2)));
+  WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)));
+}
+
+static WEBP_INLINE void HU4_SSE2(uint8_t* dst, const uint8_t* top) {
+  const int I = top[-2];
+  const int J = top[-3];
+  const int K = top[-4];
+  const int L = top[-5];
+  DST(0, 0) =             AVG2(I, J);
+  DST(2, 0) = DST(0, 1) = AVG2(J, K);
+  DST(2, 1) = DST(0, 2) = AVG2(K, L);
+  DST(1, 0) =             AVG3(I, J, K);
+  DST(3, 0) = DST(1, 1) = AVG3(J, K, L);
+  DST(3, 1) = DST(1, 2) = AVG3(K, L, L);
+  DST(3, 2) = DST(2, 2) =
+  DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
+}
+
+static WEBP_INLINE void HD4_SSE2(uint8_t* dst, const uint8_t* top) {
+  const int X = top[-1];
+  const int I = top[-2];
+  const int J = top[-3];
+  const int K = top[-4];
+  const int L = top[-5];
+  const int A = top[0];
+  const int B = top[1];
+  const int C = top[2];
+
+  DST(0, 0) = DST(2, 1) = AVG2(I, X);
+  DST(0, 1) = DST(2, 2) = AVG2(J, I);
+  DST(0, 2) = DST(2, 3) = AVG2(K, J);
+  DST(0, 3)             = AVG2(L, K);
+
+  DST(3, 0)             = AVG3(A, B, C);
+  DST(2, 0)             = AVG3(X, A, B);
+  DST(1, 0) = DST(3, 1) = AVG3(I, X, A);
+  DST(1, 1) = DST(3, 2) = AVG3(J, I, X);
+  DST(1, 2) = DST(3, 3) = AVG3(K, J, I);
+  DST(1, 3)             = AVG3(L, K, J);
+}
+
+static WEBP_INLINE void TM4_SSE2(uint8_t* dst, const uint8_t* top) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i top_values = _mm_cvtsi32_si128(WebPMemToUint32(top));
+  const __m128i top_base = _mm_unpacklo_epi8(top_values, zero);
+  int y;
+  for (y = 0; y < 4; ++y, dst += BPS) {
+    const int val = top[-2 - y] - top[-1];
+    const __m128i base = _mm_set1_epi16(val);
+    const __m128i out = _mm_packus_epi16(_mm_add_epi16(base, top_base), zero);
+    WebPUint32ToMem(dst, _mm_cvtsi128_si32(out));
+  }
+}
+
+#undef DST
+#undef AVG3
+#undef AVG2
+
+//------------------------------------------------------------------------------
+// luma 4x4 prediction
+
+// Left samples are top[-5 .. -2], top_left is top[-1], top are
+// located at top[0..3], and top right is top[4..7]
+static void Intra4Preds_SSE2(uint8_t* dst, const uint8_t* top) {
+  DC4_SSE2(I4DC4 + dst, top);
+  TM4_SSE2(I4TM4 + dst, top);
+  VE4_SSE2(I4VE4 + dst, top);
+  HE4_SSE2(I4HE4 + dst, top);
+  RD4_SSE2(I4RD4 + dst, top);
+  VR4_SSE2(I4VR4 + dst, top);
+  LD4_SSE2(I4LD4 + dst, top);
+  VL4_SSE2(I4VL4 + dst, top);
+  HD4_SSE2(I4HD4 + dst, top);
+  HU4_SSE2(I4HU4 + dst, top);
+}
+
+//------------------------------------------------------------------------------
+// Chroma 8x8 prediction (paragraph 12.2)
+
+static void IntraChromaPreds_SSE2(uint8_t* dst, const uint8_t* left,
+                                  const uint8_t* top) {
+  // U block
+  DC8uvMode_SSE2(C8DC8 + dst, left, top);
+  VerticalPred_SSE2(C8VE8 + dst, top, 8);
+  HorizontalPred_SSE2(C8HE8 + dst, left, 8);
+  TrueMotion_SSE2(C8TM8 + dst, left, top, 8);
+  // V block
+  dst += 8;
+  if (top != NULL) top += 8;
+  if (left != NULL) left += 16;
+  DC8uvMode_SSE2(C8DC8 + dst, left, top);
+  VerticalPred_SSE2(C8VE8 + dst, top, 8);
+  HorizontalPred_SSE2(C8HE8 + dst, left, 8);
+  TrueMotion_SSE2(C8TM8 + dst, left, top, 8);
+}
+
+//------------------------------------------------------------------------------
+// luma 16x16 prediction (paragraph 12.3)
+
+static void Intra16Preds_SSE2(uint8_t* dst,
+                              const uint8_t* left, const uint8_t* top) {
+  DC16Mode_SSE2(I16DC16 + dst, left, top);
+  VerticalPred_SSE2(I16VE16 + dst, top, 16);
+  HorizontalPred_SSE2(I16HE16 + dst, left, 16);
+  TrueMotion_SSE2(I16TM16 + dst, left, top, 16);
+}
+
+//------------------------------------------------------------------------------
+// Metric
+
+static WEBP_INLINE void SubtractAndAccumulate_SSE2(const __m128i a,
+                                                   const __m128i b,
+                                                   __m128i* const sum) {
+  // take abs(a-b) in 8b
+  const __m128i a_b = _mm_subs_epu8(a, b);
+  const __m128i b_a = _mm_subs_epu8(b, a);
+  const __m128i abs_a_b = _mm_or_si128(a_b, b_a);
+  // zero-extend to 16b
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i C0 = _mm_unpacklo_epi8(abs_a_b, zero);
+  const __m128i C1 = _mm_unpackhi_epi8(abs_a_b, zero);
+  // multiply with self
+  const __m128i sum1 = _mm_madd_epi16(C0, C0);
+  const __m128i sum2 = _mm_madd_epi16(C1, C1);
+  *sum = _mm_add_epi32(sum1, sum2);
+}
+
+static WEBP_INLINE int SSE_16xN_SSE2(const uint8_t* a, const uint8_t* b,
+                                     int num_pairs) {
+  __m128i sum = _mm_setzero_si128();
+  int32_t tmp[4];
+  int i;
+
+  for (i = 0; i < num_pairs; ++i) {
+    const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[BPS * 0]);
+    const __m128i b0 = _mm_loadu_si128((const __m128i*)&b[BPS * 0]);
+    const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[BPS * 1]);
+    const __m128i b1 = _mm_loadu_si128((const __m128i*)&b[BPS * 1]);
+    __m128i sum1, sum2;
+    SubtractAndAccumulate_SSE2(a0, b0, &sum1);
+    SubtractAndAccumulate_SSE2(a1, b1, &sum2);
+    sum = _mm_add_epi32(sum, _mm_add_epi32(sum1, sum2));
+    a += 2 * BPS;
+    b += 2 * BPS;
+  }
+  _mm_storeu_si128((__m128i*)tmp, sum);
+  return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
+}
+
+static int SSE16x16_SSE2(const uint8_t* a, const uint8_t* b) {
+  return SSE_16xN_SSE2(a, b, 8);
+}
+
+static int SSE16x8_SSE2(const uint8_t* a, const uint8_t* b) {
+  return SSE_16xN_SSE2(a, b, 4);
+}
+
+#define LOAD_8x16b(ptr) \
+  _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(ptr)), zero)
+
+static int SSE8x8_SSE2(const uint8_t* a, const uint8_t* b) {
+  const __m128i zero = _mm_setzero_si128();
+  int num_pairs = 4;
+  __m128i sum = zero;
+  int32_t tmp[4];
+  while (num_pairs-- > 0) {
+    const __m128i a0 = LOAD_8x16b(&a[BPS * 0]);
+    const __m128i a1 = LOAD_8x16b(&a[BPS * 1]);
+    const __m128i b0 = LOAD_8x16b(&b[BPS * 0]);
+    const __m128i b1 = LOAD_8x16b(&b[BPS * 1]);
+    // subtract
+    const __m128i c0 = _mm_subs_epi16(a0, b0);
+    const __m128i c1 = _mm_subs_epi16(a1, b1);
+    // multiply/accumulate with self
+    const __m128i d0 = _mm_madd_epi16(c0, c0);
+    const __m128i d1 = _mm_madd_epi16(c1, c1);
+    // collect
+    const __m128i sum01 = _mm_add_epi32(d0, d1);
+    sum = _mm_add_epi32(sum, sum01);
+    a += 2 * BPS;
+    b += 2 * BPS;
+  }
+  _mm_storeu_si128((__m128i*)tmp, sum);
+  return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
+}
+#undef LOAD_8x16b
+
+static int SSE4x4_SSE2(const uint8_t* a, const uint8_t* b) {
+  const __m128i zero = _mm_setzero_si128();
+
+  // Load values. Note that we read 8 pixels instead of 4,
+  // but the a/b buffers are over-allocated to that effect.
+  const __m128i a0 = _mm_loadl_epi64((const __m128i*)&a[BPS * 0]);
+  const __m128i a1 = _mm_loadl_epi64((const __m128i*)&a[BPS * 1]);
+  const __m128i a2 = _mm_loadl_epi64((const __m128i*)&a[BPS * 2]);
+  const __m128i a3 = _mm_loadl_epi64((const __m128i*)&a[BPS * 3]);
+  const __m128i b0 = _mm_loadl_epi64((const __m128i*)&b[BPS * 0]);
+  const __m128i b1 = _mm_loadl_epi64((const __m128i*)&b[BPS * 1]);
+  const __m128i b2 = _mm_loadl_epi64((const __m128i*)&b[BPS * 2]);
+  const __m128i b3 = _mm_loadl_epi64((const __m128i*)&b[BPS * 3]);
+  // Combine pair of lines.
+  const __m128i a01 = _mm_unpacklo_epi32(a0, a1);
+  const __m128i a23 = _mm_unpacklo_epi32(a2, a3);
+  const __m128i b01 = _mm_unpacklo_epi32(b0, b1);
+  const __m128i b23 = _mm_unpacklo_epi32(b2, b3);
+  // Convert to 16b.
+  const __m128i a01s = _mm_unpacklo_epi8(a01, zero);
+  const __m128i a23s = _mm_unpacklo_epi8(a23, zero);
+  const __m128i b01s = _mm_unpacklo_epi8(b01, zero);
+  const __m128i b23s = _mm_unpacklo_epi8(b23, zero);
+  // subtract, square and accumulate
+  const __m128i d0 = _mm_subs_epi16(a01s, b01s);
+  const __m128i d1 = _mm_subs_epi16(a23s, b23s);
+  const __m128i e0 = _mm_madd_epi16(d0, d0);
+  const __m128i e1 = _mm_madd_epi16(d1, d1);
+  const __m128i sum = _mm_add_epi32(e0, e1);
+
+  int32_t tmp[4];
+  _mm_storeu_si128((__m128i*)tmp, sum);
+  return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
+}
+
+//------------------------------------------------------------------------------
+
+static void Mean16x4_SSE2(const uint8_t* ref, uint32_t dc[4]) {
+  const __m128i mask = _mm_set1_epi16(0x00ff);
+  const __m128i a0 = _mm_loadu_si128((const __m128i*)&ref[BPS * 0]);
+  const __m128i a1 = _mm_loadu_si128((const __m128i*)&ref[BPS * 1]);
+  const __m128i a2 = _mm_loadu_si128((const __m128i*)&ref[BPS * 2]);
+  const __m128i a3 = _mm_loadu_si128((const __m128i*)&ref[BPS * 3]);
+  const __m128i b0 = _mm_srli_epi16(a0, 8);     // hi byte
+  const __m128i b1 = _mm_srli_epi16(a1, 8);
+  const __m128i b2 = _mm_srli_epi16(a2, 8);
+  const __m128i b3 = _mm_srli_epi16(a3, 8);
+  const __m128i c0 = _mm_and_si128(a0, mask);   // lo byte
+  const __m128i c1 = _mm_and_si128(a1, mask);
+  const __m128i c2 = _mm_and_si128(a2, mask);
+  const __m128i c3 = _mm_and_si128(a3, mask);
+  const __m128i d0 = _mm_add_epi32(b0, c0);
+  const __m128i d1 = _mm_add_epi32(b1, c1);
+  const __m128i d2 = _mm_add_epi32(b2, c2);
+  const __m128i d3 = _mm_add_epi32(b3, c3);
+  const __m128i e0 = _mm_add_epi32(d0, d1);
+  const __m128i e1 = _mm_add_epi32(d2, d3);
+  const __m128i f0 = _mm_add_epi32(e0, e1);
+  uint16_t tmp[8];
+  _mm_storeu_si128((__m128i*)tmp, f0);
+  dc[0] = tmp[0] + tmp[1];
+  dc[1] = tmp[2] + tmp[3];
+  dc[2] = tmp[4] + tmp[5];
+  dc[3] = tmp[6] + tmp[7];
+}
+
+//------------------------------------------------------------------------------
+// Texture distortion
+//
+// We try to match the spectral content (weighted) between source and
+// reconstructed samples.
+
+// Hadamard transform
+// Returns the weighted sum of the absolute value of transformed coefficients.
+// w[] contains a row-major 4 by 4 symmetric matrix.
+static int TTransform_SSE2(const uint8_t* inA, const uint8_t* inB,
+                           const uint16_t* const w) {
+  int32_t sum[4];
+  __m128i tmp_0, tmp_1, tmp_2, tmp_3;
+  const __m128i zero = _mm_setzero_si128();
+
+  // Load and combine inputs.
+  {
+    const __m128i inA_0 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 0]);
+    const __m128i inA_1 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 1]);
+    const __m128i inA_2 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 2]);
+    const __m128i inA_3 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 3]);
+    const __m128i inB_0 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 0]);
+    const __m128i inB_1 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 1]);
+    const __m128i inB_2 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 2]);
+    const __m128i inB_3 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 3]);
+
+    // Combine inA and inB (we'll do two transforms in parallel).
+    const __m128i inAB_0 = _mm_unpacklo_epi32(inA_0, inB_0);
+    const __m128i inAB_1 = _mm_unpacklo_epi32(inA_1, inB_1);
+    const __m128i inAB_2 = _mm_unpacklo_epi32(inA_2, inB_2);
+    const __m128i inAB_3 = _mm_unpacklo_epi32(inA_3, inB_3);
+    tmp_0 = _mm_unpacklo_epi8(inAB_0, zero);
+    tmp_1 = _mm_unpacklo_epi8(inAB_1, zero);
+    tmp_2 = _mm_unpacklo_epi8(inAB_2, zero);
+    tmp_3 = _mm_unpacklo_epi8(inAB_3, zero);
+    // a00 a01 a02 a03   b00 b01 b02 b03
+    // a10 a11 a12 a13   b10 b11 b12 b13
+    // a20 a21 a22 a23   b20 b21 b22 b23
+    // a30 a31 a32 a33   b30 b31 b32 b33
+  }
+
+  // Vertical pass first to avoid a transpose (vertical and horizontal passes
+  // are commutative because w/kWeightY is symmetric) and subsequent transpose.
+  {
+    // Calculate a and b (two 4x4 at once).
+    const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
+    const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);
+    const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);
+    const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);
+    const __m128i b0 = _mm_add_epi16(a0, a1);
+    const __m128i b1 = _mm_add_epi16(a3, a2);
+    const __m128i b2 = _mm_sub_epi16(a3, a2);
+    const __m128i b3 = _mm_sub_epi16(a0, a1);
+    // a00 a01 a02 a03   b00 b01 b02 b03
+    // a10 a11 a12 a13   b10 b11 b12 b13
+    // a20 a21 a22 a23   b20 b21 b22 b23
+    // a30 a31 a32 a33   b30 b31 b32 b33
+
+    // Transpose the two 4x4.
+    VP8Transpose_2_4x4_16b(&b0, &b1, &b2, &b3, &tmp_0, &tmp_1, &tmp_2, &tmp_3);
+  }
+
+  // Horizontal pass and difference of weighted sums.
+  {
+    // Load all inputs.
+    const __m128i w_0 = _mm_loadu_si128((const __m128i*)&w[0]);
+    const __m128i w_8 = _mm_loadu_si128((const __m128i*)&w[8]);
+
+    // Calculate a and b (two 4x4 at once).
+    const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
+    const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);
+    const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);
+    const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);
+    const __m128i b0 = _mm_add_epi16(a0, a1);
+    const __m128i b1 = _mm_add_epi16(a3, a2);
+    const __m128i b2 = _mm_sub_epi16(a3, a2);
+    const __m128i b3 = _mm_sub_epi16(a0, a1);
+
+    // Separate the transforms of inA and inB.
+    __m128i A_b0 = _mm_unpacklo_epi64(b0, b1);
+    __m128i A_b2 = _mm_unpacklo_epi64(b2, b3);
+    __m128i B_b0 = _mm_unpackhi_epi64(b0, b1);
+    __m128i B_b2 = _mm_unpackhi_epi64(b2, b3);
+
+    {
+      const __m128i d0 = _mm_sub_epi16(zero, A_b0);
+      const __m128i d1 = _mm_sub_epi16(zero, A_b2);
+      const __m128i d2 = _mm_sub_epi16(zero, B_b0);
+      const __m128i d3 = _mm_sub_epi16(zero, B_b2);
+      A_b0 = _mm_max_epi16(A_b0, d0);   // abs(v), 16b
+      A_b2 = _mm_max_epi16(A_b2, d1);
+      B_b0 = _mm_max_epi16(B_b0, d2);
+      B_b2 = _mm_max_epi16(B_b2, d3);
+    }
+
+    // weighted sums
+    A_b0 = _mm_madd_epi16(A_b0, w_0);
+    A_b2 = _mm_madd_epi16(A_b2, w_8);
+    B_b0 = _mm_madd_epi16(B_b0, w_0);
+    B_b2 = _mm_madd_epi16(B_b2, w_8);
+    A_b0 = _mm_add_epi32(A_b0, A_b2);
+    B_b0 = _mm_add_epi32(B_b0, B_b2);
+
+    // difference of weighted sums
+    A_b0 = _mm_sub_epi32(A_b0, B_b0);
+    _mm_storeu_si128((__m128i*)&sum[0], A_b0);
+  }
+  return sum[0] + sum[1] + sum[2] + sum[3];
+}
+
+static int Disto4x4_SSE2(const uint8_t* const a, const uint8_t* const b,
+                         const uint16_t* const w) {
+  const int diff_sum = TTransform_SSE2(a, b, w);
+  return abs(diff_sum) >> 5;
+}
+
+static int Disto16x16_SSE2(const uint8_t* const a, const uint8_t* const b,
+                           const uint16_t* const w) {
+  int D = 0;
+  int x, y;
+  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
+    for (x = 0; x < 16; x += 4) {
+      D += Disto4x4_SSE2(a + x + y, b + x + y, w);
+    }
+  }
+  return D;
+}
+
+//------------------------------------------------------------------------------
+// Quantization
+//
+
+static WEBP_INLINE int DoQuantizeBlock_SSE2(int16_t in[16], int16_t out[16],
+                                            const uint16_t* const sharpen,
+                                            const VP8Matrix* const mtx) {
+  const __m128i max_coeff_2047 = _mm_set1_epi16(MAX_LEVEL);
+  const __m128i zero = _mm_setzero_si128();
+  __m128i coeff0, coeff8;
+  __m128i out0, out8;
+  __m128i packed_out;
+
+  // Load all inputs.
+  __m128i in0 = _mm_loadu_si128((__m128i*)&in[0]);
+  __m128i in8 = _mm_loadu_si128((__m128i*)&in[8]);
+  const __m128i iq0 = _mm_loadu_si128((const __m128i*)&mtx->iq_[0]);
+  const __m128i iq8 = _mm_loadu_si128((const __m128i*)&mtx->iq_[8]);
+  const __m128i q0 = _mm_loadu_si128((const __m128i*)&mtx->q_[0]);
+  const __m128i q8 = _mm_loadu_si128((const __m128i*)&mtx->q_[8]);
+
+  // extract sign(in)  (0x0000 if positive, 0xffff if negative)
+  const __m128i sign0 = _mm_cmpgt_epi16(zero, in0);
+  const __m128i sign8 = _mm_cmpgt_epi16(zero, in8);
+
+  // coeff = abs(in) = (in ^ sign) - sign
+  coeff0 = _mm_xor_si128(in0, sign0);
+  coeff8 = _mm_xor_si128(in8, sign8);
+  coeff0 = _mm_sub_epi16(coeff0, sign0);
+  coeff8 = _mm_sub_epi16(coeff8, sign8);
+
+  // coeff = abs(in) + sharpen
+  if (sharpen != NULL) {
+    const __m128i sharpen0 = _mm_loadu_si128((const __m128i*)&sharpen[0]);
+    const __m128i sharpen8 = _mm_loadu_si128((const __m128i*)&sharpen[8]);
+    coeff0 = _mm_add_epi16(coeff0, sharpen0);
+    coeff8 = _mm_add_epi16(coeff8, sharpen8);
+  }
+
+  // out = (coeff * iQ + B) >> QFIX
+  {
+    // doing calculations with 32b precision (QFIX=17)
+    // out = (coeff * iQ)
+    const __m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0);
+    const __m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0);
+    const __m128i coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8);
+    const __m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8);
+    __m128i out_00 = _mm_unpacklo_epi16(coeff_iQ0L, coeff_iQ0H);
+    __m128i out_04 = _mm_unpackhi_epi16(coeff_iQ0L, coeff_iQ0H);
+    __m128i out_08 = _mm_unpacklo_epi16(coeff_iQ8L, coeff_iQ8H);
+    __m128i out_12 = _mm_unpackhi_epi16(coeff_iQ8L, coeff_iQ8H);
+    // out = (coeff * iQ + B)
+    const __m128i bias_00 = _mm_loadu_si128((const __m128i*)&mtx->bias_[0]);
+    const __m128i bias_04 = _mm_loadu_si128((const __m128i*)&mtx->bias_[4]);
+    const __m128i bias_08 = _mm_loadu_si128((const __m128i*)&mtx->bias_[8]);
+    const __m128i bias_12 = _mm_loadu_si128((const __m128i*)&mtx->bias_[12]);
+    out_00 = _mm_add_epi32(out_00, bias_00);
+    out_04 = _mm_add_epi32(out_04, bias_04);
+    out_08 = _mm_add_epi32(out_08, bias_08);
+    out_12 = _mm_add_epi32(out_12, bias_12);
+    // out = QUANTDIV(coeff, iQ, B, QFIX)
+    out_00 = _mm_srai_epi32(out_00, QFIX);
+    out_04 = _mm_srai_epi32(out_04, QFIX);
+    out_08 = _mm_srai_epi32(out_08, QFIX);
+    out_12 = _mm_srai_epi32(out_12, QFIX);
+
+    // pack result as 16b
+    out0 = _mm_packs_epi32(out_00, out_04);
+    out8 = _mm_packs_epi32(out_08, out_12);
+
+    // if (coeff > 2047) coeff = 2047
+    out0 = _mm_min_epi16(out0, max_coeff_2047);
+    out8 = _mm_min_epi16(out8, max_coeff_2047);
+  }
+
+  // get sign back (if (sign[j]) out_n = -out_n)
+  out0 = _mm_xor_si128(out0, sign0);
+  out8 = _mm_xor_si128(out8, sign8);
+  out0 = _mm_sub_epi16(out0, sign0);
+  out8 = _mm_sub_epi16(out8, sign8);
+
+  // in = out * Q
+  in0 = _mm_mullo_epi16(out0, q0);
+  in8 = _mm_mullo_epi16(out8, q8);
+
+  _mm_storeu_si128((__m128i*)&in[0], in0);
+  _mm_storeu_si128((__m128i*)&in[8], in8);
+
+  // zigzag the output before storing it.
+  //
+  // The zigzag pattern can almost be reproduced with a small sequence of
+  // shuffles. After it, we only need to swap the 7th (ending up in third
+  // position instead of twelfth) and 8th values.
+  {
+    __m128i outZ0, outZ8;
+    outZ0 = _mm_shufflehi_epi16(out0,  _MM_SHUFFLE(2, 1, 3, 0));
+    outZ0 = _mm_shuffle_epi32  (outZ0, _MM_SHUFFLE(3, 1, 2, 0));
+    outZ0 = _mm_shufflehi_epi16(outZ0, _MM_SHUFFLE(3, 1, 0, 2));
+    outZ8 = _mm_shufflelo_epi16(out8,  _MM_SHUFFLE(3, 0, 2, 1));
+    outZ8 = _mm_shuffle_epi32  (outZ8, _MM_SHUFFLE(3, 1, 2, 0));
+    outZ8 = _mm_shufflelo_epi16(outZ8, _MM_SHUFFLE(1, 3, 2, 0));
+    _mm_storeu_si128((__m128i*)&out[0], outZ0);
+    _mm_storeu_si128((__m128i*)&out[8], outZ8);
+    packed_out = _mm_packs_epi16(outZ0, outZ8);
+  }
+  {
+    const int16_t outZ_12 = out[12];
+    const int16_t outZ_3 = out[3];
+    out[3] = outZ_12;
+    out[12] = outZ_3;
+  }
+
+  // detect if all 'out' values are zeroes or not
+  return (_mm_movemask_epi8(_mm_cmpeq_epi8(packed_out, zero)) != 0xffff);
+}
+
+static int QuantizeBlock_SSE2(int16_t in[16], int16_t out[16],
+                              const VP8Matrix* const mtx) {
+  return DoQuantizeBlock_SSE2(in, out, &mtx->sharpen_[0], mtx);
+}
+
+static int QuantizeBlockWHT_SSE2(int16_t in[16], int16_t out[16],
+                                 const VP8Matrix* const mtx) {
+  return DoQuantizeBlock_SSE2(in, out, NULL, mtx);
+}
+
+static int Quantize2Blocks_SSE2(int16_t in[32], int16_t out[32],
+                                const VP8Matrix* const mtx) {
+  int nz;
+  const uint16_t* const sharpen = &mtx->sharpen_[0];
+  nz  = DoQuantizeBlock_SSE2(in + 0 * 16, out + 0 * 16, sharpen, mtx) << 0;
+  nz |= DoQuantizeBlock_SSE2(in + 1 * 16, out + 1 * 16, sharpen, mtx) << 1;
+  return nz;
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8EncDspInitSSE2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitSSE2(void) {
+  VP8CollectHistogram = CollectHistogram_SSE2;
+  VP8EncPredLuma16 = Intra16Preds_SSE2;
+  VP8EncPredChroma8 = IntraChromaPreds_SSE2;
+  VP8EncPredLuma4 = Intra4Preds_SSE2;
+  VP8EncQuantizeBlock = QuantizeBlock_SSE2;
+  VP8EncQuantize2Blocks = Quantize2Blocks_SSE2;
+  VP8EncQuantizeBlockWHT = QuantizeBlockWHT_SSE2;
+  VP8ITransform = ITransform_SSE2;
+  VP8FTransform = FTransform_SSE2;
+  VP8FTransform2 = FTransform2_SSE2;
+  VP8FTransformWHT = FTransformWHT_SSE2;
+  VP8SSE16x16 = SSE16x16_SSE2;
+  VP8SSE16x8 = SSE16x8_SSE2;
+  VP8SSE8x8 = SSE8x8_SSE2;
+  VP8SSE4x4 = SSE4x4_SSE2;
+  VP8TDisto4x4 = Disto4x4_SSE2;
+  VP8TDisto16x16 = Disto16x16_SSE2;
+  VP8Mean16x4 = Mean16x4_SSE2;
+}
+
+#else  // !WEBP_USE_SSE2
+
+WEBP_DSP_INIT_STUB(VP8EncDspInitSSE2)
+
+#endif  // WEBP_USE_SSE2
diff --git a/media/libwebp/dsp/enc_sse41.c b/media/libwebp/dsp/enc_sse41.c
new file mode 100644
index 0000000000..09ea29361d
--- /dev/null
+++ b/media/libwebp/dsp/enc_sse41.c
@@ -0,0 +1,339 @@
+// Copyright 2015 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// SSE4 version of some encoding functions.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_SSE41)
+#include <smmintrin.h>
+#include <stdlib.h>  // for abs()
+
+#include "../dsp/common_sse2.h"
+#include "../enc/vp8i_enc.h"
+
+//------------------------------------------------------------------------------
+// Compute susceptibility based on DCT-coeff histograms.
+
+static void CollectHistogram_SSE41(const uint8_t* ref, const uint8_t* pred,
+                                   int start_block, int end_block,
+                                   VP8Histogram* const histo) {
+  const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);
+  int j;
+  int distribution[MAX_COEFF_THRESH + 1] = { 0 };
+  for (j = start_block; j < end_block; ++j) {
+    int16_t out[16];
+    int k;
+
+    VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
+
+    // Convert coefficients to bin (within out[]).
+    {
+      // Load.
+      const __m128i out0 = _mm_loadu_si128((__m128i*)&out[0]);
+      const __m128i out1 = _mm_loadu_si128((__m128i*)&out[8]);
+      // v = abs(out) >> 3
+      const __m128i abs0 = _mm_abs_epi16(out0);
+      const __m128i abs1 = _mm_abs_epi16(out1);
+      const __m128i v0 = _mm_srai_epi16(abs0, 3);
+      const __m128i v1 = _mm_srai_epi16(abs1, 3);
+      // bin = min(v, MAX_COEFF_THRESH)
+      const __m128i bin0 = _mm_min_epi16(v0, max_coeff_thresh);
+      const __m128i bin1 = _mm_min_epi16(v1, max_coeff_thresh);
+      // Store.
+      _mm_storeu_si128((__m128i*)&out[0], bin0);
+      _mm_storeu_si128((__m128i*)&out[8], bin1);
+    }
+
+    // Convert coefficients to bin.
+    for (k = 0; k < 16; ++k) {
+      ++distribution[out[k]];
+    }
+  }
+  VP8SetHistogramData(distribution, histo);
+}
+
+//------------------------------------------------------------------------------
+// Texture distortion
+//
+// We try to match the spectral content (weighted) between source and
+// reconstructed samples.
+
+// Hadamard transform
+// Returns the weighted sum of the absolute value of transformed coefficients.
+// w[] contains a row-major 4 by 4 symmetric matrix.
+static int TTransform_SSE41(const uint8_t* inA, const uint8_t* inB,
+                            const uint16_t* const w) {
+  int32_t sum[4];
+  __m128i tmp_0, tmp_1, tmp_2, tmp_3;
+
+  // Load and combine inputs.
+  {
+    const __m128i inA_0 = _mm_loadu_si128((const __m128i*)&inA[BPS * 0]);
+    const __m128i inA_1 = _mm_loadu_si128((const __m128i*)&inA[BPS * 1]);
+    const __m128i inA_2 = _mm_loadu_si128((const __m128i*)&inA[BPS * 2]);
+    // In SSE4.1, with gcc 4.8 at least (maybe other versions),
+    // _mm_loadu_si128 is faster than _mm_loadl_epi64. But for the last lump
+    // of inA and inB, _mm_loadl_epi64 is still used not to have an out of
+    // bound read.
+    const __m128i inA_3 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 3]);
+    const __m128i inB_0 = _mm_loadu_si128((const __m128i*)&inB[BPS * 0]);
+    const __m128i inB_1 = _mm_loadu_si128((const __m128i*)&inB[BPS * 1]);
+    const __m128i inB_2 = _mm_loadu_si128((const __m128i*)&inB[BPS * 2]);
+    const __m128i inB_3 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 3]);
+
+    // Combine inA and inB (we'll do two transforms in parallel).
+    const __m128i inAB_0 = _mm_unpacklo_epi32(inA_0, inB_0);
+    const __m128i inAB_1 = _mm_unpacklo_epi32(inA_1, inB_1);
+    const __m128i inAB_2 = _mm_unpacklo_epi32(inA_2, inB_2);
+    const __m128i inAB_3 = _mm_unpacklo_epi32(inA_3, inB_3);
+    tmp_0 = _mm_cvtepu8_epi16(inAB_0);
+    tmp_1 = _mm_cvtepu8_epi16(inAB_1);
+    tmp_2 = _mm_cvtepu8_epi16(inAB_2);
+    tmp_3 = _mm_cvtepu8_epi16(inAB_3);
+    // a00 a01 a02 a03   b00 b01 b02 b03
+    // a10 a11 a12 a13   b10 b11 b12 b13
+    // a20 a21 a22 a23   b20 b21 b22 b23
+    // a30 a31 a32 a33   b30 b31 b32 b33
+  }
+
+  // Vertical pass first to avoid a transpose (vertical and horizontal passes
+  // are commutative because w/kWeightY is symmetric) and subsequent transpose.
+  {
+    // Calculate a and b (two 4x4 at once).
+    const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
+    const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);
+    const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);
+    const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);
+    const __m128i b0 = _mm_add_epi16(a0, a1);
+    const __m128i b1 = _mm_add_epi16(a3, a2);
+    const __m128i b2 = _mm_sub_epi16(a3, a2);
+    const __m128i b3 = _mm_sub_epi16(a0, a1);
+    // a00 a01 a02 a03   b00 b01 b02 b03
+    // a10 a11 a12 a13   b10 b11 b12 b13
+    // a20 a21 a22 a23   b20 b21 b22 b23
+    // a30 a31 a32 a33   b30 b31 b32 b33
+
+    // Transpose the two 4x4.
+    VP8Transpose_2_4x4_16b(&b0, &b1, &b2, &b3, &tmp_0, &tmp_1, &tmp_2, &tmp_3);
+  }
+
+  // Horizontal pass and difference of weighted sums.
+  {
+    // Load all inputs.
+    const __m128i w_0 = _mm_loadu_si128((const __m128i*)&w[0]);
+    const __m128i w_8 = _mm_loadu_si128((const __m128i*)&w[8]);
+
+    // Calculate a and b (two 4x4 at once).
+    const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
+    const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);
+    const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);
+    const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);
+    const __m128i b0 = _mm_add_epi16(a0, a1);
+    const __m128i b1 = _mm_add_epi16(a3, a2);
+    const __m128i b2 = _mm_sub_epi16(a3, a2);
+    const __m128i b3 = _mm_sub_epi16(a0, a1);
+
+    // Separate the transforms of inA and inB.
+    __m128i A_b0 = _mm_unpacklo_epi64(b0, b1);
+    __m128i A_b2 = _mm_unpacklo_epi64(b2, b3);
+    __m128i B_b0 = _mm_unpackhi_epi64(b0, b1);
+    __m128i B_b2 = _mm_unpackhi_epi64(b2, b3);
+
+    A_b0 = _mm_abs_epi16(A_b0);
+    A_b2 = _mm_abs_epi16(A_b2);
+    B_b0 = _mm_abs_epi16(B_b0);
+    B_b2 = _mm_abs_epi16(B_b2);
+
+    // weighted sums
+    A_b0 = _mm_madd_epi16(A_b0, w_0);
+    A_b2 = _mm_madd_epi16(A_b2, w_8);
+    B_b0 = _mm_madd_epi16(B_b0, w_0);
+    B_b2 = _mm_madd_epi16(B_b2, w_8);
+    A_b0 = _mm_add_epi32(A_b0, A_b2);
+    B_b0 = _mm_add_epi32(B_b0, B_b2);
+
+    // difference of weighted sums
+    A_b2 = _mm_sub_epi32(A_b0, B_b0);
+    _mm_storeu_si128((__m128i*)&sum[0], A_b2);
+  }
+  return sum[0] + sum[1] + sum[2] + sum[3];
+}
+
+static int Disto4x4_SSE41(const uint8_t* const a, const uint8_t* const b,
+                          const uint16_t* const w) {
+  const int diff_sum = TTransform_SSE41(a, b, w);
+  return abs(diff_sum) >> 5;
+}
+
+static int Disto16x16_SSE41(const uint8_t* const a, const uint8_t* const b,
+                            const uint16_t* const w) {
+  int D = 0;
+  int x, y;
+  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
+    for (x = 0; x < 16; x += 4) {
+      D += Disto4x4_SSE41(a + x + y, b + x + y, w);
+    }
+  }
+  return D;
+}
+
+//------------------------------------------------------------------------------
+// Quantization
+//
+
+// Generates a pshufb constant for shuffling 16b words.
+#define PSHUFB_CST(A,B,C,D,E,F,G,H) \
+  _mm_set_epi8(2 * (H) + 1, 2 * (H) + 0, 2 * (G) + 1, 2 * (G) + 0, \
+               2 * (F) + 1, 2 * (F) + 0, 2 * (E) + 1, 2 * (E) + 0, \
+               2 * (D) + 1, 2 * (D) + 0, 2 * (C) + 1, 2 * (C) + 0, \
+               2 * (B) + 1, 2 * (B) + 0, 2 * (A) + 1, 2 * (A) + 0)
+
+static WEBP_INLINE int DoQuantizeBlock_SSE41(int16_t in[16], int16_t out[16],
+                                             const uint16_t* const sharpen,
+                                             const VP8Matrix* const mtx) {
+  const __m128i max_coeff_2047 = _mm_set1_epi16(MAX_LEVEL);
+  const __m128i zero = _mm_setzero_si128();
+  __m128i out0, out8;
+  __m128i packed_out;
+
+  // Load all inputs.
+  __m128i in0 = _mm_loadu_si128((__m128i*)&in[0]);
+  __m128i in8 = _mm_loadu_si128((__m128i*)&in[8]);
+  const __m128i iq0 = _mm_loadu_si128((const __m128i*)&mtx->iq_[0]);
+  const __m128i iq8 = _mm_loadu_si128((const __m128i*)&mtx->iq_[8]);
+  const __m128i q0 = _mm_loadu_si128((const __m128i*)&mtx->q_[0]);
+  const __m128i q8 = _mm_loadu_si128((const __m128i*)&mtx->q_[8]);
+
+  // coeff = abs(in)
+  __m128i coeff0 = _mm_abs_epi16(in0);
+  __m128i coeff8 = _mm_abs_epi16(in8);
+
+  // coeff = abs(in) + sharpen
+  if (sharpen != NULL) {
+    const __m128i sharpen0 = _mm_loadu_si128((const __m128i*)&sharpen[0]);
+    const __m128i sharpen8 = _mm_loadu_si128((const __m128i*)&sharpen[8]);
+    coeff0 = _mm_add_epi16(coeff0, sharpen0);
+    coeff8 = _mm_add_epi16(coeff8, sharpen8);
+  }
+
+  // out = (coeff * iQ + B) >> QFIX
+  {
+    // doing calculations with 32b precision (QFIX=17)
+    // out = (coeff * iQ)
+    const __m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0);
+    const __m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0);
+    const __m128i coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8);
+    const __m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8);
+    __m128i out_00 = _mm_unpacklo_epi16(coeff_iQ0L, coeff_iQ0H);
+    __m128i out_04 = _mm_unpackhi_epi16(coeff_iQ0L, coeff_iQ0H);
+    __m128i out_08 = _mm_unpacklo_epi16(coeff_iQ8L, coeff_iQ8H);
+    __m128i out_12 = _mm_unpackhi_epi16(coeff_iQ8L, coeff_iQ8H);
+    // out = (coeff * iQ + B)
+    const __m128i bias_00 = _mm_loadu_si128((const __m128i*)&mtx->bias_[0]);
+    const __m128i bias_04 = _mm_loadu_si128((const __m128i*)&mtx->bias_[4]);
+    const __m128i bias_08 = _mm_loadu_si128((const __m128i*)&mtx->bias_[8]);
+    const __m128i bias_12 = _mm_loadu_si128((const __m128i*)&mtx->bias_[12]);
+    out_00 = _mm_add_epi32(out_00, bias_00);
+    out_04 = _mm_add_epi32(out_04, bias_04);
+    out_08 = _mm_add_epi32(out_08, bias_08);
+    out_12 = _mm_add_epi32(out_12, bias_12);
+    // out = QUANTDIV(coeff, iQ, B, QFIX)
+    out_00 = _mm_srai_epi32(out_00, QFIX);
+    out_04 = _mm_srai_epi32(out_04, QFIX);
+    out_08 = _mm_srai_epi32(out_08, QFIX);
+    out_12 = _mm_srai_epi32(out_12, QFIX);
+
+    // pack result as 16b
+    out0 = _mm_packs_epi32(out_00, out_04);
+    out8 = _mm_packs_epi32(out_08, out_12);
+
+    // if (coeff > 2047) coeff = 2047
+    out0 = _mm_min_epi16(out0, max_coeff_2047);
+    out8 = _mm_min_epi16(out8, max_coeff_2047);
+  }
+
+  // put sign back
+  out0 = _mm_sign_epi16(out0, in0);
+  out8 = _mm_sign_epi16(out8, in8);
+
+  // in = out * Q
+  in0 = _mm_mullo_epi16(out0, q0);
+  in8 = _mm_mullo_epi16(out8, q8);
+
+  _mm_storeu_si128((__m128i*)&in[0], in0);
+  _mm_storeu_si128((__m128i*)&in[8], in8);
+
+  // zigzag the output before storing it. The re-ordering is:
+  //    0 1 2 3 4 5 6 7 | 8  9 10 11 12 13 14 15
+  // -> 0 1 4[8]5 2 3 6 | 9 12 13 10 [7]11 14 15
+  // There's only two misplaced entries ([8] and [7]) that are crossing the
+  // reg's boundaries.
+  // We use pshufb instead of pshuflo/pshufhi.
+  {
+    const __m128i kCst_lo = PSHUFB_CST(0, 1, 4, -1, 5, 2, 3, 6);
+    const __m128i kCst_7 = PSHUFB_CST(-1, -1, -1, -1, 7, -1, -1, -1);
+    const __m128i tmp_lo = _mm_shuffle_epi8(out0, kCst_lo);
+    const __m128i tmp_7 = _mm_shuffle_epi8(out0, kCst_7);  // extract #7
+    const __m128i kCst_hi = PSHUFB_CST(1, 4, 5, 2, -1, 3, 6, 7);
+    const __m128i kCst_8 = PSHUFB_CST(-1, -1, -1, 0, -1, -1, -1, -1);
+    const __m128i tmp_hi = _mm_shuffle_epi8(out8, kCst_hi);
+    const __m128i tmp_8 = _mm_shuffle_epi8(out8, kCst_8);  // extract #8
+    const __m128i out_z0 = _mm_or_si128(tmp_lo, tmp_8);
+    const __m128i out_z8 = _mm_or_si128(tmp_hi, tmp_7);
+    _mm_storeu_si128((__m128i*)&out[0], out_z0);
+    _mm_storeu_si128((__m128i*)&out[8], out_z8);
+    packed_out = _mm_packs_epi16(out_z0, out_z8);
+  }
+
+  // detect if all 'out' values are zeroes or not
+  return (_mm_movemask_epi8(_mm_cmpeq_epi8(packed_out, zero)) != 0xffff);
+}
+
+#undef PSHUFB_CST
+
+static int QuantizeBlock_SSE41(int16_t in[16], int16_t out[16],
+                               const VP8Matrix* const mtx) {
+  return DoQuantizeBlock_SSE41(in, out, &mtx->sharpen_[0], mtx);
+}
+
+static int QuantizeBlockWHT_SSE41(int16_t in[16], int16_t out[16],
+                                  const VP8Matrix* const mtx) {
+  return DoQuantizeBlock_SSE41(in, out, NULL, mtx);
+}
+
+static int Quantize2Blocks_SSE41(int16_t in[32], int16_t out[32],
+                                 const VP8Matrix* const mtx) {
+  int nz;
+  const uint16_t* const sharpen = &mtx->sharpen_[0];
+  nz  = DoQuantizeBlock_SSE41(in + 0 * 16, out + 0 * 16, sharpen, mtx) << 0;
+  nz |= DoQuantizeBlock_SSE41(in + 1 * 16, out + 1 * 16, sharpen, mtx) << 1;
+  return nz;
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8EncDspInitSSE41(void);
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitSSE41(void) {
+  VP8CollectHistogram = CollectHistogram_SSE41;
+  VP8EncQuantizeBlock = QuantizeBlock_SSE41;
+  VP8EncQuantize2Blocks = Quantize2Blocks_SSE41;
+  VP8EncQuantizeBlockWHT = QuantizeBlockWHT_SSE41;
+  VP8TDisto4x4 = Disto4x4_SSE41;
+  VP8TDisto16x16 = Disto16x16_SSE41;
+}
+
+#else  // !WEBP_USE_SSE41
+
+WEBP_DSP_INIT_STUB(VP8EncDspInitSSE41)
+
+#endif  // WEBP_USE_SSE41
diff --git a/media/libwebp/dsp/filters.c b/media/libwebp/dsp/filters.c
index dea3eb4101..b0c659478f 100644
--- a/media/libwebp/dsp/filters.c
+++ b/media/libwebp/dsp/filters.c
@@ -33,9 +33,9 @@ static WEBP_INLINE void PredictLine_C(const uint8_t* src, const uint8_t* pred,
                                       uint8_t* dst, int length, int inverse) {
   int i;
   if (inverse) {
-    for (i = 0; i < length; ++i) dst[i] = src[i] + pred[i];
+    for (i = 0; i < length; ++i) dst[i] = (uint8_t)(src[i] + pred[i]);
   } else {
-    for (i = 0; i < length; ++i) dst[i] = src[i] - pred[i];
+    for (i = 0; i < length; ++i) dst[i] = (uint8_t)(src[i] - pred[i]);
   }
 }
 
@@ -155,7 +155,7 @@ static WEBP_INLINE void DoGradientFilter_C(const uint8_t* in,
       const int pred = GradientPredictor_C(preds[w - 1],
                                            preds[w - stride],
                                            preds[w - stride - 1]);
-      out[w] = in[w] + (inverse ? pred : -pred);
+      out[w] = (uint8_t)(in[w] + (inverse ? pred : -pred));
     }
     ++row;
     preds += stride;
@@ -194,7 +194,7 @@ static void HorizontalUnfilter_C(const uint8_t* prev, const uint8_t* in,
   uint8_t pred = (prev == NULL) ? 0 : prev[0];
   int i;
   for (i = 0; i < width; ++i) {
-    out[i] = pred + in[i];
+    out[i] = (uint8_t)(pred + in[i]);
     pred = out[i];
   }
 }
@@ -206,7 +206,7 @@ static void VerticalUnfilter_C(const uint8_t* prev, const uint8_t* in,
     HorizontalUnfilter_C(NULL, in, out, width);
   } else {
     int i;
-    for (i = 0; i < width; ++i) out[i] = prev[i] + in[i];
+    for (i = 0; i < width; ++i) out[i] = (uint8_t)(prev[i] + in[i]);
   }
 }
 #endif  // !WEBP_NEON_OMIT_C_CODE
@@ -220,7 +220,7 @@ static void GradientUnfilter_C(const uint8_t* prev, const uint8_t* in,
     int i;
     for (i = 0; i < width; ++i) {
       top = prev[i];  // need to read this first, in case prev==out
-      left = in[i] + GradientPredictor_C(left, top, top_left);
+      left = (uint8_t)(in[i] + GradientPredictor_C(left, top, top_left));
       top_left = top;
       out[i] = left;
     }
@@ -254,7 +254,7 @@ WEBP_DSP_INIT_FUNC(VP8FiltersInit) {
 #endif
 
   if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
+#if defined(WEBP_HAVE_SSE2)
     if (VP8GetCPUInfo(kSSE2)) {
       VP8FiltersInitSSE2();
     }
@@ -271,7 +271,7 @@ WEBP_DSP_INIT_FUNC(VP8FiltersInit) {
 #endif
   }
 
-#if defined(WEBP_USE_NEON)
+#if defined(WEBP_HAVE_NEON)
   if (WEBP_NEON_OMIT_C_CODE ||
       (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
     VP8FiltersInitNEON();
diff --git a/media/libwebp/dsp/filters_mips_dsp_r2.c b/media/libwebp/dsp/filters_mips_dsp_r2.c
new file mode 100644
index 0000000000..edb1eaac26
--- /dev/null
+++ b/media/libwebp/dsp/filters_mips_dsp_r2.c
@@ -0,0 +1,402 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Spatial prediction using various filters
+//
+// Author(s): Branimir Vasic (branimir.vasic@imgtec.com)
+//            Djordje Pesut (djordje.pesut@imgtec.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_MIPS_DSP_R2)
+
+#include "../dsp/dsp.h"
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+//------------------------------------------------------------------------------
+// Helpful macro.
+
+# define SANITY_CHECK(in, out)                                                 \
+  assert(in != NULL);                                                          \
+  assert(out != NULL);                                                         \
+  assert(width > 0);                                                           \
+  assert(height > 0);                                                          \
+  assert(stride >= width);                                                     \
+  assert(row >= 0 && num_rows > 0 && row + num_rows <= height);                \
+  (void)height;  // Silence unused warning.
+
+#define DO_PREDICT_LINE(SRC, DST, LENGTH, INVERSE) do {                        \
+    const uint8_t* psrc = (uint8_t*)(SRC);                                     \
+    uint8_t* pdst = (uint8_t*)(DST);                                           \
+    const int ilength = (int)(LENGTH);                                         \
+    int temp0, temp1, temp2, temp3, temp4, temp5, temp6;                       \
+    __asm__ volatile (                                                         \
+      ".set      push                                   \n\t"                  \
+      ".set      noreorder                              \n\t"                  \
+      "srl       %[temp0],    %[length],    2           \n\t"                  \
+      "beqz      %[temp0],    4f                        \n\t"                  \
+      " andi     %[temp6],    %[length],    3           \n\t"                  \
+    ".if " #INVERSE "                                   \n\t"                  \
+    "1:                                                 \n\t"                  \
+      "lbu       %[temp1],    -1(%[dst])                \n\t"                  \
+      "lbu       %[temp2],    0(%[src])                 \n\t"                  \
+      "lbu       %[temp3],    1(%[src])                 \n\t"                  \
+      "lbu       %[temp4],    2(%[src])                 \n\t"                  \
+      "lbu       %[temp5],    3(%[src])                 \n\t"                  \
+      "addu      %[temp1],    %[temp1],     %[temp2]    \n\t"                  \
+      "addu      %[temp2],    %[temp1],     %[temp3]    \n\t"                  \
+      "addu      %[temp3],    %[temp2],     %[temp4]    \n\t"                  \
+      "addu      %[temp4],    %[temp3],     %[temp5]    \n\t"                  \
+      "sb        %[temp1],    0(%[dst])                 \n\t"                  \
+      "sb        %[temp2],    1(%[dst])                 \n\t"                  \
+      "sb        %[temp3],    2(%[dst])                 \n\t"                  \
+      "sb        %[temp4],    3(%[dst])                 \n\t"                  \
+      "addiu     %[src],      %[src],       4           \n\t"                  \
+      "addiu     %[temp0],    %[temp0],     -1          \n\t"                  \
+      "bnez      %[temp0],    1b                        \n\t"                  \
+      " addiu    %[dst],      %[dst],       4           \n\t"                  \
+    ".else                                              \n\t"                  \
+    "1:                                                 \n\t"                  \
+      "ulw       %[temp1],    -1(%[src])                \n\t"                  \
+      "ulw       %[temp2],    0(%[src])                 \n\t"                  \
+      "addiu     %[src],      %[src],       4           \n\t"                  \
+      "addiu     %[temp0],    %[temp0],     -1          \n\t"                  \
+      "subu.qb   %[temp3],    %[temp2],     %[temp1]    \n\t"                  \
+      "usw       %[temp3],    0(%[dst])                 \n\t"                  \
+      "bnez      %[temp0],    1b                        \n\t"                  \
+      " addiu    %[dst],      %[dst],       4           \n\t"                  \
+    ".endif                                             \n\t"                  \
+    "4:                                                 \n\t"                  \
+      "beqz      %[temp6],    3f                        \n\t"                  \
+      " nop                                             \n\t"                  \
+    "2:                                                 \n\t"                  \
+      "lbu       %[temp2],    0(%[src])                 \n\t"                  \
+    ".if " #INVERSE "                                   \n\t"                  \
+      "lbu       %[temp1],    -1(%[dst])                \n\t"                  \
+      "addu      %[temp3],    %[temp1],     %[temp2]    \n\t"                  \
+    ".else                                              \n\t"                  \
+      "lbu       %[temp1],    -1(%[src])                \n\t"                  \
+      "subu      %[temp3],    %[temp1],     %[temp2]    \n\t"                  \
+    ".endif                                             \n\t"                  \
+      "addiu     %[src],      %[src],       1           \n\t"                  \
+      "sb        %[temp3],    0(%[dst])                 \n\t"                  \
+      "addiu     %[temp6],    %[temp6],     -1          \n\t"                  \
+      "bnez      %[temp6],    2b                        \n\t"                  \
+      " addiu    %[dst],      %[dst],       1           \n\t"                  \
+    "3:                                                 \n\t"                  \
+      ".set      pop                                    \n\t"                  \
+      : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),         \
+        [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),         \
+        [temp6]"=&r"(temp6), [dst]"+&r"(pdst), [src]"+&r"(psrc)                \
+      : [length]"r"(ilength)                                                   \
+      : "memory"                                                               \
+    );                                                                         \
+  } while (0)
+
+static WEBP_INLINE void PredictLine_MIPSdspR2(const uint8_t* src, uint8_t* dst,
+                                              int length) {
+  DO_PREDICT_LINE(src, dst, length, 0);
+}
+
+#define DO_PREDICT_LINE_VERTICAL(SRC, PRED, DST, LENGTH, INVERSE) do {         \
+    const uint8_t* psrc = (uint8_t*)(SRC);                                     \
+    const uint8_t* ppred = (uint8_t*)(PRED);                                   \
+    uint8_t* pdst = (uint8_t*)(DST);                                           \
+    const int ilength = (int)(LENGTH);                                         \
+    int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;                \
+    __asm__ volatile (                                                         \
+      ".set      push                                   \n\t"                  \
+      ".set      noreorder                              \n\t"                  \
+      "srl       %[temp0],    %[length],    0x3         \n\t"                  \
+      "beqz      %[temp0],    4f                        \n\t"                  \
+      " andi     %[temp7],    %[length],    0x7         \n\t"                  \
+    "1:                                                 \n\t"                  \
+      "ulw       %[temp1],    0(%[src])                 \n\t"                  \
+      "ulw       %[temp2],    0(%[pred])                \n\t"                  \
+      "ulw       %[temp3],    4(%[src])                 \n\t"                  \
+      "ulw       %[temp4],    4(%[pred])                \n\t"                  \
+      "addiu     %[src],      %[src],       8           \n\t"                  \
+    ".if " #INVERSE "                                   \n\t"                  \
+      "addu.qb   %[temp5],    %[temp1],     %[temp2]    \n\t"                  \
+      "addu.qb   %[temp6],    %[temp3],     %[temp4]    \n\t"                  \
+    ".else                                              \n\t"                  \
+      "subu.qb   %[temp5],    %[temp1],     %[temp2]    \n\t"                  \
+      "subu.qb   %[temp6],    %[temp3],     %[temp4]    \n\t"                  \
+    ".endif                                             \n\t"                  \
+      "addiu     %[pred],     %[pred],      8           \n\t"                  \
+      "usw       %[temp5],    0(%[dst])                 \n\t"                  \
+      "usw       %[temp6],    4(%[dst])                 \n\t"                  \
+      "addiu     %[temp0],    %[temp0],     -1          \n\t"                  \
+      "bnez      %[temp0],    1b                        \n\t"                  \
+      " addiu    %[dst],      %[dst],       8           \n\t"                  \
+    "4:                                                 \n\t"                  \
+      "beqz      %[temp7],    3f                        \n\t"                  \
+      " nop                                             \n\t"                  \
+    "2:                                                 \n\t"                  \
+      "lbu       %[temp1],    0(%[src])                 \n\t"                  \
+      "lbu       %[temp2],    0(%[pred])                \n\t"                  \
+      "addiu     %[src],      %[src],       1           \n\t"                  \
+      "addiu     %[pred],     %[pred],      1           \n\t"                  \
+    ".if " #INVERSE "                                   \n\t"                  \
+      "addu      %[temp3],    %[temp1],     %[temp2]    \n\t"                  \
+    ".else                                              \n\t"                  \
+      "subu      %[temp3],    %[temp1],     %[temp2]    \n\t"                  \
+    ".endif                                             \n\t"                  \
+      "sb        %[temp3],    0(%[dst])                 \n\t"                  \
+      "addiu     %[temp7],    %[temp7],     -1          \n\t"                  \
+      "bnez      %[temp7],    2b                        \n\t"                  \
+      " addiu    %[dst],      %[dst],       1           \n\t"                  \
+    "3:                                                 \n\t"                  \
+      ".set      pop                                    \n\t"                  \
+      : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),         \
+        [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),         \
+        [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [pred]"+&r"(ppred),          \
+        [dst]"+&r"(pdst), [src]"+&r"(psrc)                                     \
+      : [length]"r"(ilength)                                                   \
+      : "memory"                                                               \
+    );                                                                         \
+  } while (0)
+
+#define PREDICT_LINE_ONE_PASS(SRC, PRED, DST) do {                             \
+    int temp1, temp2, temp3;                                                   \
+    __asm__ volatile (                                                         \
+      "lbu       %[temp1],   0(%[src])               \n\t"                     \
+      "lbu       %[temp2],   0(%[pred])              \n\t"                     \
+      "subu      %[temp3],   %[temp1],   %[temp2]    \n\t"                     \
+      "sb        %[temp3],   0(%[dst])               \n\t"                     \
+      : [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), [temp3]"=&r"(temp3)          \
+      : [pred]"r"((PRED)), [dst]"r"((DST)), [src]"r"((SRC))                    \
+      : "memory"                                                               \
+    );                                                                         \
+  } while (0)
+
+//------------------------------------------------------------------------------
+// Horizontal filter.
+
+#define FILTER_LINE_BY_LINE do {                                               \
+    while (row < last_row) {                                                   \
+      PREDICT_LINE_ONE_PASS(in, preds - stride, out);                          \
+      DO_PREDICT_LINE(in + 1, out + 1, width - 1, 0);                          \
+      ++row;                                                                   \
+      preds += stride;                                                         \
+      in += stride;                                                            \
+      out += stride;                                                           \
+    }                                                                          \
+  } while (0)
+
+static WEBP_INLINE void DoHorizontalFilter_MIPSdspR2(const uint8_t* in,
+                                                     int width, int height,
+                                                     int stride,
+                                                     int row, int num_rows,
+                                                     uint8_t* out) {
+  const uint8_t* preds;
+  const size_t start_offset = row * stride;
+  const int last_row = row + num_rows;
+  SANITY_CHECK(in, out);
+  in += start_offset;
+  out += start_offset;
+  preds = in;
+
+  if (row == 0) {
+    // Leftmost pixel is the same as input for topmost scanline.
+    out[0] = in[0];
+    PredictLine_MIPSdspR2(in + 1, out + 1, width - 1);
+    row = 1;
+    preds += stride;
+    in += stride;
+    out += stride;
+  }
+
+  // Filter line-by-line.
+  FILTER_LINE_BY_LINE;
+}
+#undef FILTER_LINE_BY_LINE
+
+static void HorizontalFilter_MIPSdspR2(const uint8_t* data,
+                                       int width, int height,
+                                       int stride, uint8_t* filtered_data) {
+  DoHorizontalFilter_MIPSdspR2(data, width, height, stride, 0, height,
+                               filtered_data);
+}
+
+//------------------------------------------------------------------------------
+// Vertical filter.
+
+#define FILTER_LINE_BY_LINE do {                                               \
+    while (row < last_row) {                                                   \
+      DO_PREDICT_LINE_VERTICAL(in, preds, out, width, 0);                      \
+      ++row;                                                                   \
+      preds += stride;                                                         \
+      in += stride;                                                            \
+      out += stride;                                                           \
+    }                                                                          \
+  } while (0)
+
+static WEBP_INLINE void DoVerticalFilter_MIPSdspR2(const uint8_t* in,
+                                                   int width, int height,
+                                                   int stride,
+                                                   int row, int num_rows,
+                                                   uint8_t* out) {
+  const uint8_t* preds;
+  const size_t start_offset = row * stride;
+  const int last_row = row + num_rows;
+  SANITY_CHECK(in, out);
+  in += start_offset;
+  out += start_offset;
+  preds = in;
+
+  if (row == 0) {
+    // Very first top-left pixel is copied.
+    out[0] = in[0];
+    // Rest of top scan-line is left-predicted.
+    PredictLine_MIPSdspR2(in + 1, out + 1, width - 1);
+    row = 1;
+    in += stride;
+    out += stride;
+  } else {
+    // We are starting from in-between. Make sure 'preds' points to prev row.
+    preds -= stride;
+  }
+
+  // Filter line-by-line.
+  FILTER_LINE_BY_LINE;
+}
+#undef FILTER_LINE_BY_LINE
+
+static void VerticalFilter_MIPSdspR2(const uint8_t* data, int width, int height,
+                                     int stride, uint8_t* filtered_data) {
+  DoVerticalFilter_MIPSdspR2(data, width, height, stride, 0, height,
+                             filtered_data);
+}
+
+//------------------------------------------------------------------------------
+// Gradient filter.
+
+static int GradientPredictor_MIPSdspR2(uint8_t a, uint8_t b, uint8_t c) {
+  int temp0;
+  __asm__ volatile (
+    "addu             %[temp0],   %[a],       %[b]        \n\t"
+    "subu             %[temp0],   %[temp0],   %[c]        \n\t"
+    "shll_s.w         %[temp0],   %[temp0],   23          \n\t"
+    "precrqu_s.qb.ph  %[temp0],   %[temp0],   $zero       \n\t"
+    "srl              %[temp0],   %[temp0],   24          \n\t"
+    : [temp0]"=&r"(temp0)
+    : [a]"r"(a),[b]"r"(b),[c]"r"(c)
+  );
+  return temp0;
+}
+
+#define FILTER_LINE_BY_LINE(PREDS, OPERATION) do {                             \
+    while (row < last_row) {                                                   \
+      int w;                                                                   \
+      PREDICT_LINE_ONE_PASS(in, PREDS - stride, out);                          \
+      for (w = 1; w < width; ++w) {                                            \
+        const int pred = GradientPredictor_MIPSdspR2(PREDS[w - 1],             \
+                                                     PREDS[w - stride],        \
+                                                     PREDS[w - stride - 1]);   \
+        out[w] = in[w] OPERATION pred;                                         \
+      }                                                                        \
+      ++row;                                                                   \
+      in += stride;                                                            \
+      out += stride;                                                           \
+    }                                                                          \
+  } while (0)
+
+static void DoGradientFilter_MIPSdspR2(const uint8_t* in,
+                                       int width, int height, int stride,
+                                       int row, int num_rows, uint8_t* out) {
+  const uint8_t* preds;
+  const size_t start_offset = row * stride;
+  const int last_row = row + num_rows;
+  SANITY_CHECK(in, out);
+  in += start_offset;
+  out += start_offset;
+  preds = in;
+
+  // left prediction for top scan-line
+  if (row == 0) {
+    out[0] = in[0];
+    PredictLine_MIPSdspR2(in + 1, out + 1, width - 1);
+    row = 1;
+    preds += stride;
+    in += stride;
+    out += stride;
+  }
+
+  // Filter line-by-line.
+  FILTER_LINE_BY_LINE(in, -);
+}
+#undef FILTER_LINE_BY_LINE
+
+static void GradientFilter_MIPSdspR2(const uint8_t* data, int width, int height,
+                                     int stride, uint8_t* filtered_data) {
+  DoGradientFilter_MIPSdspR2(data, width, height, stride, 0, height,
+                             filtered_data);
+}
+
+//------------------------------------------------------------------------------
+
+static void HorizontalUnfilter_MIPSdspR2(const uint8_t* prev, const uint8_t* in,
+                                         uint8_t* out, int width) {
+ out[0] = in[0] + (prev == NULL ? 0 : prev[0]);
+ DO_PREDICT_LINE(in + 1, out + 1, width - 1, 1);
+}
+
+static void VerticalUnfilter_MIPSdspR2(const uint8_t* prev, const uint8_t* in,
+                                       uint8_t* out, int width) {
+  if (prev == NULL) {
+    HorizontalUnfilter_MIPSdspR2(NULL, in, out, width);
+  } else {
+    DO_PREDICT_LINE_VERTICAL(in, prev, out, width, 1);
+  }
+}
+
+static void GradientUnfilter_MIPSdspR2(const uint8_t* prev, const uint8_t* in,
+                                       uint8_t* out, int width) {
+  if (prev == NULL) {
+    HorizontalUnfilter_MIPSdspR2(NULL, in, out, width);
+  } else {
+    uint8_t top = prev[0], top_left = top, left = top;
+    int i;
+    for (i = 0; i < width; ++i) {
+      top = prev[i];  // need to read this first, in case prev==dst
+      left = in[i] + GradientPredictor_MIPSdspR2(left, top, top_left);
+      top_left = top;
+      out[i] = left;
+    }
+  }
+}
+
+#undef DO_PREDICT_LINE_VERTICAL
+#undef PREDICT_LINE_ONE_PASS
+#undef DO_PREDICT_LINE
+#undef SANITY_CHECK
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8FiltersInitMIPSdspR2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitMIPSdspR2(void) {
+  WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter_MIPSdspR2;
+  WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter_MIPSdspR2;
+  WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter_MIPSdspR2;
+
+  WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter_MIPSdspR2;
+  WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter_MIPSdspR2;
+  WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter_MIPSdspR2;
+}
+
+#else  // !WEBP_USE_MIPS_DSP_R2
+
+WEBP_DSP_INIT_STUB(VP8FiltersInitMIPSdspR2)
+
+#endif  // WEBP_USE_MIPS_DSP_R2
diff --git a/media/libwebp/dsp/filters_msa.c b/media/libwebp/dsp/filters_msa.c
new file mode 100644
index 0000000000..cd32cdabaf
--- /dev/null
+++ b/media/libwebp/dsp/filters_msa.c
@@ -0,0 +1,202 @@
+// Copyright 2016 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MSA variant of alpha filters
+//
+// Author: Prashant Patil (prashant.patil@imgtec.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_MSA)
+
+#include "../dsp/msa_macro.h"
+
+#include <assert.h>
+
+static WEBP_INLINE void PredictLineInverse0(const uint8_t* src,
+                                            const uint8_t* pred,
+                                            uint8_t* dst, int length) {
+  v16u8 src0, pred0, dst0;
+  assert(length >= 0);
+  while (length >= 32) {
+    v16u8 src1, pred1, dst1;
+    LD_UB2(src, 16, src0, src1);
+    LD_UB2(pred, 16, pred0, pred1);
+    SUB2(src0, pred0, src1, pred1, dst0, dst1);
+    ST_UB2(dst0, dst1, dst, 16);
+    src += 32;
+    pred += 32;
+    dst += 32;
+    length -= 32;
+  }
+  if (length > 0) {
+    int i;
+    if (length >= 16) {
+      src0 = LD_UB(src);
+      pred0 = LD_UB(pred);
+      dst0 = src0 - pred0;
+      ST_UB(dst0, dst);
+      src += 16;
+      pred += 16;
+      dst += 16;
+      length -= 16;
+    }
+    for (i = 0; i < length; i++) {
+      dst[i] = src[i] - pred[i];
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// Helpful macro.
+
+#define SANITY_CHECK(in, out)  \
+  assert(in != NULL);          \
+  assert(out != NULL);         \
+  assert(width > 0);           \
+  assert(height > 0);          \
+  assert(stride >= width);
+
+//------------------------------------------------------------------------------
+// Horrizontal filter
+
+static void HorizontalFilter_MSA(const uint8_t* data, int width, int height,
+                                 int stride, uint8_t* filtered_data) {
+  const uint8_t* preds = data;
+  const uint8_t* in = data;
+  uint8_t* out = filtered_data;
+  int row = 1;
+  SANITY_CHECK(in, out);
+
+  // Leftmost pixel is the same as input for topmost scanline.
+  out[0] = in[0];
+  PredictLineInverse0(in + 1, preds, out + 1, width - 1);
+  preds += stride;
+  in += stride;
+  out += stride;
+  // Filter line-by-line.
+  while (row < height) {
+    // Leftmost pixel is predicted from above.
+    PredictLineInverse0(in, preds - stride, out, 1);
+    PredictLineInverse0(in + 1, preds, out + 1, width - 1);
+    ++row;
+    preds += stride;
+    in += stride;
+    out += stride;
+  }
+}
+
+//------------------------------------------------------------------------------
+// Gradient filter
+
+static WEBP_INLINE void PredictLineGradient(const uint8_t* pinput,
+                                            const uint8_t* ppred,
+                                            uint8_t* poutput, int stride,
+                                            int size) {
+  int w;
+  const v16i8 zero = { 0 };
+  while (size >= 16) {
+    v16u8 pred0, dst0;
+    v8i16 a0, a1, b0, b1, c0, c1;
+    const v16u8 tmp0 = LD_UB(ppred - 1);
+    const v16u8 tmp1 = LD_UB(ppred - stride);
+    const v16u8 tmp2 = LD_UB(ppred - stride - 1);
+    const v16u8 src0 = LD_UB(pinput);
+    ILVRL_B2_SH(zero, tmp0, a0, a1);
+    ILVRL_B2_SH(zero, tmp1, b0, b1);
+    ILVRL_B2_SH(zero, tmp2, c0, c1);
+    ADD2(a0, b0, a1, b1, a0, a1);
+    SUB2(a0, c0, a1, c1, a0, a1);
+    CLIP_SH2_0_255(a0, a1);
+    pred0 = (v16u8)__msa_pckev_b((v16i8)a1, (v16i8)a0);
+    dst0 = src0 - pred0;
+    ST_UB(dst0, poutput);
+    ppred += 16;
+    pinput += 16;
+    poutput += 16;
+    size -= 16;
+  }
+  for (w = 0; w < size; ++w) {
+    const int pred = ppred[w - 1] + ppred[w - stride] - ppred[w - stride - 1];
+    poutput[w] = pinput[w] - (pred < 0 ? 0 : pred > 255 ? 255 : pred);
+  }
+}
+
+
+static void GradientFilter_MSA(const uint8_t* data, int width, int height,
+                               int stride, uint8_t* filtered_data) {
+  const uint8_t* in = data;
+  const uint8_t* preds = data;
+  uint8_t* out = filtered_data;
+  int row = 1;
+  SANITY_CHECK(in, out);
+
+  // left prediction for top scan-line
+  out[0] = in[0];
+  PredictLineInverse0(in + 1, preds, out + 1, width - 1);
+  preds += stride;
+  in += stride;
+  out += stride;
+  // Filter line-by-line.
+  while (row < height) {
+    out[0] = in[0] - preds[- stride];
+    PredictLineGradient(preds + 1, in + 1, out + 1, stride, width - 1);
+    ++row;
+    preds += stride;
+    in += stride;
+    out += stride;
+  }
+}
+
+//------------------------------------------------------------------------------
+// Vertical filter
+
+static void VerticalFilter_MSA(const uint8_t* data, int width, int height,
+                               int stride, uint8_t* filtered_data) {
+  const uint8_t* in = data;
+  const uint8_t* preds = data;
+  uint8_t* out = filtered_data;
+  int row = 1;
+  SANITY_CHECK(in, out);
+
+  // Very first top-left pixel is copied.
+  out[0] = in[0];
+  // Rest of top scan-line is left-predicted.
+  PredictLineInverse0(in + 1, preds, out + 1, width - 1);
+  in += stride;
+  out += stride;
+
+  // Filter line-by-line.
+  while (row < height) {
+    PredictLineInverse0(in, preds, out, width);
+    ++row;
+    preds += stride;
+    in += stride;
+    out += stride;
+  }
+}
+
+#undef SANITY_CHECK
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8FiltersInitMSA(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitMSA(void) {
+  WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter_MSA;
+  WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter_MSA;
+  WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter_MSA;
+}
+
+#else  // !WEBP_USE_MSA
+
+WEBP_DSP_INIT_STUB(VP8FiltersInitMSA)
+
+#endif  // WEBP_USE_MSA
diff --git a/media/libwebp/dsp/filters_sse2.c b/media/libwebp/dsp/filters_sse2.c
index 2cc9bb9766..9b91ab680f 100644
--- a/media/libwebp/dsp/filters_sse2.c
+++ b/media/libwebp/dsp/filters_sse2.c
@@ -163,7 +163,8 @@ static void GradientPredictDirect_SSE2(const uint8_t* const row,
     _mm_storel_epi64((__m128i*)(out + i), H);
   }
   for (; i < length; ++i) {
-    out[i] = row[i] - GradientPredictor_SSE2(row[i - 1], top[i], top[i - 1]);
+    const int delta = GradientPredictor_SSE2(row[i - 1], top[i], top[i - 1]);
+    out[i] = (uint8_t)(row[i] - delta);
   }
 }
 
@@ -188,7 +189,7 @@ static WEBP_INLINE void DoGradientFilter_SSE2(const uint8_t* in,
 
   // Filter line-by-line.
   while (row < last_row) {
-    out[0] = in[0] - in[-stride];
+    out[0] = (uint8_t)(in[0] - in[-stride]);
     GradientPredictDirect_SSE2(in + 1, in + 1 - stride, out + 1, width - 1);
     ++row;
     in += stride;
@@ -223,7 +224,7 @@ static void HorizontalUnfilter_SSE2(const uint8_t* prev, const uint8_t* in,
                                     uint8_t* out, int width) {
   int i;
   __m128i last;
-  out[0] = in[0] + (prev == NULL ? 0 : prev[0]);
+  out[0] = (uint8_t)(in[0] + (prev == NULL ? 0 : prev[0]));
   if (width <= 1) return;
   last = _mm_set_epi32(0, 0, 0, out[0]);
   for (i = 1; i + 8 <= width; i += 8) {
@@ -238,7 +239,7 @@ static void HorizontalUnfilter_SSE2(const uint8_t* prev, const uint8_t* in,
     _mm_storel_epi64((__m128i*)(out + i), A7);
     last = _mm_srli_epi64(A7, 56);
   }
-  for (; i < width; ++i) out[i] = in[i] + out[i - 1];
+  for (; i < width; ++i) out[i] = (uint8_t)(in[i] + out[i - 1]);
 }
 
 static void VerticalUnfilter_SSE2(const uint8_t* prev, const uint8_t* in,
@@ -259,7 +260,7 @@ static void VerticalUnfilter_SSE2(const uint8_t* prev, const uint8_t* in,
       _mm_storeu_si128((__m128i*)&out[i +  0], C0);
       _mm_storeu_si128((__m128i*)&out[i + 16], C1);
     }
-    for (; i < width; ++i) out[i] = in[i] + prev[i];
+    for (; i < width; ++i) out[i] = (uint8_t)(in[i] + prev[i]);
   }
 }
 
@@ -296,7 +297,8 @@ static void GradientPredictInverse_SSE2(const uint8_t* const in,
       _mm_storel_epi64((__m128i*)&row[i], out);
     }
     for (; i < length; ++i) {
-      row[i] = in[i] + GradientPredictor_SSE2(row[i - 1], top[i], top[i - 1]);
+      const int delta = GradientPredictor_SSE2(row[i - 1], top[i], top[i - 1]);
+      row[i] = (uint8_t)(in[i] + delta);
     }
   }
 }
@@ -306,7 +308,7 @@ static void GradientUnfilter_SSE2(const uint8_t* prev, const uint8_t* in,
   if (prev == NULL) {
     HorizontalUnfilter_SSE2(NULL, in, out, width);
   } else {
-    out[0] = in[0] + prev[0];  // predict from above
+    out[0] = (uint8_t)(in[0] + prev[0]);  // predict from above
     GradientPredictInverse_SSE2(in + 1, prev + 1, out + 1, width - 1);
   }
 }
@@ -318,7 +320,12 @@ extern void VP8FiltersInitSSE2(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitSSE2(void) {
   WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter_SSE2;
+#if defined(CHROMIUM)
+  // TODO(crbug.com/654974)
+  (void)VerticalUnfilter_SSE2;
+#else
   WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter_SSE2;
+#endif
   WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter_SSE2;
 
   WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter_SSE2;
diff --git a/media/libwebp/dsp/lossless.c b/media/libwebp/dsp/lossless.c
index 1a1523d221..763c425ff8 100644
--- a/media/libwebp/dsp/lossless.c
+++ b/media/libwebp/dsp/lossless.c
@@ -81,7 +81,7 @@ static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1,
 
 // gcc <= 4.9 on ARM generates incorrect code in Select() when Sub3() is
 // inlined.
-#if defined(__arm__) && LOCAL_GCC_VERSION <= 0x409
+#if defined(__arm__) && defined(__GNUC__) && LOCAL_GCC_VERSION <= 0x409
 # define LOCAL_INLINE __attribute__ ((noinline))
 #else
 # define LOCAL_INLINE WEBP_INLINE
@@ -107,88 +107,107 @@ static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
 //------------------------------------------------------------------------------
 // Predictors
 
-static uint32_t Predictor0_C(uint32_t left, const uint32_t* const top) {
+uint32_t VP8LPredictor0_C(const uint32_t* const left,
+                          const uint32_t* const top) {
   (void)top;
   (void)left;
   return ARGB_BLACK;
 }
-static uint32_t Predictor1_C(uint32_t left, const uint32_t* const top) {
+uint32_t VP8LPredictor1_C(const uint32_t* const left,
+                          const uint32_t* const top) {
   (void)top;
-  return left;
+  return *left;
 }
-static uint32_t Predictor2_C(uint32_t left, const uint32_t* const top) {
+uint32_t VP8LPredictor2_C(const uint32_t* const left,
+                          const uint32_t* const top) {
   (void)left;
   return top[0];
 }
-static uint32_t Predictor3_C(uint32_t left, const uint32_t* const top) {
+uint32_t VP8LPredictor3_C(const uint32_t* const left,
+                          const uint32_t* const top) {
   (void)left;
   return top[1];
 }
-static uint32_t Predictor4_C(uint32_t left, const uint32_t* const top) {
+uint32_t VP8LPredictor4_C(const uint32_t* const left,
+                          const uint32_t* const top) {
   (void)left;
   return top[-1];
 }
-static uint32_t Predictor5_C(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Average3(left, top[0], top[1]);
+uint32_t VP8LPredictor5_C(const uint32_t* const left,
+                          const uint32_t* const top) {
+  const uint32_t pred = Average3(*left, top[0], top[1]);
   return pred;
 }
-static uint32_t Predictor6_C(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Average2(left, top[-1]);
+uint32_t VP8LPredictor6_C(const uint32_t* const left,
+                          const uint32_t* const top) {
+  const uint32_t pred = Average2(*left, top[-1]);
   return pred;
 }
-static uint32_t Predictor7_C(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Average2(left, top[0]);
+uint32_t VP8LPredictor7_C(const uint32_t* const left,
+                          const uint32_t* const top) {
+  const uint32_t pred = Average2(*left, top[0]);
   return pred;
 }
-static uint32_t Predictor8_C(uint32_t left, const uint32_t* const top) {
+uint32_t VP8LPredictor8_C(const uint32_t* const left,
+                          const uint32_t* const top) {
   const uint32_t pred = Average2(top[-1], top[0]);
   (void)left;
   return pred;
 }
-static uint32_t Predictor9_C(uint32_t left, const uint32_t* const top) {
+uint32_t VP8LPredictor9_C(const uint32_t* const left,
+                          const uint32_t* const top) {
   const uint32_t pred = Average2(top[0], top[1]);
   (void)left;
   return pred;
 }
-static uint32_t Predictor10_C(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Average4(left, top[-1], top[0], top[1]);
+uint32_t VP8LPredictor10_C(const uint32_t* const left,
+                           const uint32_t* const top) {
+  const uint32_t pred = Average4(*left, top[-1], top[0], top[1]);
   return pred;
 }
-static uint32_t Predictor11_C(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Select(top[0], left, top[-1]);
+uint32_t VP8LPredictor11_C(const uint32_t* const left,
+                           const uint32_t* const top) {
+  const uint32_t pred = Select(top[0], *left, top[-1]);
   return pred;
 }
-static uint32_t Predictor12_C(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = ClampedAddSubtractFull(left, top[0], top[-1]);
+uint32_t VP8LPredictor12_C(const uint32_t* const left,
+                           const uint32_t* const top) {
+  const uint32_t pred = ClampedAddSubtractFull(*left, top[0], top[-1]);
   return pred;
 }
-static uint32_t Predictor13_C(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = ClampedAddSubtractHalf(left, top[0], top[-1]);
+uint32_t VP8LPredictor13_C(const uint32_t* const left,
+                           const uint32_t* const top) {
+  const uint32_t pred = ClampedAddSubtractHalf(*left, top[0], top[-1]);
   return pred;
 }
 
-GENERATE_PREDICTOR_ADD(Predictor0_C, PredictorAdd0_C)
+static void PredictorAdd0_C(const uint32_t* in, const uint32_t* upper,
+                            int num_pixels, uint32_t* out) {
+  int x;
+  (void)upper;
+  for (x = 0; x < num_pixels; ++x) out[x] = VP8LAddPixels(in[x], ARGB_BLACK);
+}
 static void PredictorAdd1_C(const uint32_t* in, const uint32_t* upper,
                             int num_pixels, uint32_t* out) {
   int i;
   uint32_t left = out[-1];
+  (void)upper;
   for (i = 0; i < num_pixels; ++i) {
     out[i] = left = VP8LAddPixels(in[i], left);
   }
-  (void)upper;
 }
-GENERATE_PREDICTOR_ADD(Predictor2_C, PredictorAdd2_C)
-GENERATE_PREDICTOR_ADD(Predictor3_C, PredictorAdd3_C)
-GENERATE_PREDICTOR_ADD(Predictor4_C, PredictorAdd4_C)
-GENERATE_PREDICTOR_ADD(Predictor5_C, PredictorAdd5_C)
-GENERATE_PREDICTOR_ADD(Predictor6_C, PredictorAdd6_C)
-GENERATE_PREDICTOR_ADD(Predictor7_C, PredictorAdd7_C)
-GENERATE_PREDICTOR_ADD(Predictor8_C, PredictorAdd8_C)
-GENERATE_PREDICTOR_ADD(Predictor9_C, PredictorAdd9_C)
-GENERATE_PREDICTOR_ADD(Predictor10_C, PredictorAdd10_C)
-GENERATE_PREDICTOR_ADD(Predictor11_C, PredictorAdd11_C)
-GENERATE_PREDICTOR_ADD(Predictor12_C, PredictorAdd12_C)
-GENERATE_PREDICTOR_ADD(Predictor13_C, PredictorAdd13_C)
+GENERATE_PREDICTOR_ADD(VP8LPredictor2_C, PredictorAdd2_C)
+GENERATE_PREDICTOR_ADD(VP8LPredictor3_C, PredictorAdd3_C)
+GENERATE_PREDICTOR_ADD(VP8LPredictor4_C, PredictorAdd4_C)
+GENERATE_PREDICTOR_ADD(VP8LPredictor5_C, PredictorAdd5_C)
+GENERATE_PREDICTOR_ADD(VP8LPredictor6_C, PredictorAdd6_C)
+GENERATE_PREDICTOR_ADD(VP8LPredictor7_C, PredictorAdd7_C)
+GENERATE_PREDICTOR_ADD(VP8LPredictor8_C, PredictorAdd8_C)
+GENERATE_PREDICTOR_ADD(VP8LPredictor9_C, PredictorAdd9_C)
+GENERATE_PREDICTOR_ADD(VP8LPredictor10_C, PredictorAdd10_C)
+GENERATE_PREDICTOR_ADD(VP8LPredictor11_C, PredictorAdd11_C)
+GENERATE_PREDICTOR_ADD(VP8LPredictor12_C, PredictorAdd12_C)
+GENERATE_PREDICTOR_ADD(VP8LPredictor13_C, PredictorAdd13_C)
 
 //------------------------------------------------------------------------------
 
@@ -270,14 +289,14 @@ void VP8LTransformColorInverse_C(const VP8LMultipliers* const m,
   int i;
   for (i = 0; i < num_pixels; ++i) {
     const uint32_t argb = src[i];
-    const uint32_t green = argb >> 8;
+    const int8_t green = (int8_t)(argb >> 8);
     const uint32_t red = argb >> 16;
     int new_red = red & 0xff;
     int new_blue = argb & 0xff;
     new_red += ColorTransformDelta(m->green_to_red_, green);
     new_red &= 0xff;
     new_blue += ColorTransformDelta(m->green_to_blue_, green);
-    new_blue += ColorTransformDelta(m->red_to_blue_, new_red);
+    new_blue += ColorTransformDelta(m->red_to_blue_, (int8_t)new_red);
     new_blue &= 0xff;
     dst[i] = (argb & 0xff00ff00u) | (new_red << 16) | (new_blue);
   }
@@ -557,7 +576,6 @@ VP8LPredictorFunc VP8LPredictors[16];
 
 // exposed plain-C implementations
 VP8LPredictorAddSubFunc VP8LPredictorsAdd_C[16];
-VP8LPredictorFunc VP8LPredictors_C[16];
 
 VP8LTransformColorInverseFunc VP8LTransformColorInverse;
 
@@ -571,6 +589,7 @@ VP8LMapARGBFunc VP8LMapColor32b;
 VP8LMapAlphaFunc VP8LMapColor8b;
 
 extern void VP8LDspInitSSE2(void);
+extern void VP8LDspInitSSE41(void);
 extern void VP8LDspInitNEON(void);
 extern void VP8LDspInitMIPSdspR2(void);
 extern void VP8LDspInitMSA(void);
@@ -595,8 +614,7 @@ extern void VP8LDspInitMSA(void);
 } while (0);
 
 WEBP_DSP_INIT_FUNC(VP8LDspInit) {
-  COPY_PREDICTOR_ARRAY(Predictor, VP8LPredictors)
-  COPY_PREDICTOR_ARRAY(Predictor, VP8LPredictors_C)
+  COPY_PREDICTOR_ARRAY(VP8LPredictor, VP8LPredictors)
   COPY_PREDICTOR_ARRAY(PredictorAdd, VP8LPredictorsAdd)
   COPY_PREDICTOR_ARRAY(PredictorAdd, VP8LPredictorsAdd_C)
 
@@ -618,9 +636,14 @@ WEBP_DSP_INIT_FUNC(VP8LDspInit) {
 
   // If defined, use CPUInfo() to overwrite some pointers with faster versions.
   if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
+#if defined(WEBP_HAVE_SSE2)
     if (VP8GetCPUInfo(kSSE2)) {
       VP8LDspInitSSE2();
+#if defined(WEBP_HAVE_SSE41)
+      if (VP8GetCPUInfo(kSSE4_1)) {
+        VP8LDspInitSSE41();
+      }
+#endif
     }
 #endif
 #if defined(WEBP_USE_MIPS_DSP_R2)
@@ -635,7 +658,7 @@ WEBP_DSP_INIT_FUNC(VP8LDspInit) {
 #endif
   }
 
-#if defined(WEBP_USE_NEON)
+#if defined(WEBP_HAVE_NEON)
   if (WEBP_NEON_OMIT_C_CODE ||
       (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
     VP8LDspInitNEON();
diff --git a/media/libwebp/dsp/lossless.h b/media/libwebp/dsp/lossless.h
index 6db5fafc13..0c129d2860 100644
--- a/media/libwebp/dsp/lossless.h
+++ b/media/libwebp/dsp/lossless.h
@@ -28,9 +28,39 @@ extern "C" {
 //------------------------------------------------------------------------------
 // Decoding
 
-typedef uint32_t (*VP8LPredictorFunc)(uint32_t left, const uint32_t* const top);
+typedef uint32_t (*VP8LPredictorFunc)(const uint32_t* const left,
+                                      const uint32_t* const top);
 extern VP8LPredictorFunc VP8LPredictors[16];
-extern VP8LPredictorFunc VP8LPredictors_C[16];
+
+uint32_t VP8LPredictor0_C(const uint32_t* const left,
+                          const uint32_t* const top);
+uint32_t VP8LPredictor1_C(const uint32_t* const left,
+                          const uint32_t* const top);
+uint32_t VP8LPredictor2_C(const uint32_t* const left,
+                          const uint32_t* const top);
+uint32_t VP8LPredictor3_C(const uint32_t* const left,
+                          const uint32_t* const top);
+uint32_t VP8LPredictor4_C(const uint32_t* const left,
+                          const uint32_t* const top);
+uint32_t VP8LPredictor5_C(const uint32_t* const left,
+                          const uint32_t* const top);
+uint32_t VP8LPredictor6_C(const uint32_t* const left,
+                          const uint32_t* const top);
+uint32_t VP8LPredictor7_C(const uint32_t* const left,
+                          const uint32_t* const top);
+uint32_t VP8LPredictor8_C(const uint32_t* const left,
+                          const uint32_t* const top);
+uint32_t VP8LPredictor9_C(const uint32_t* const left,
+                          const uint32_t* const top);
+uint32_t VP8LPredictor10_C(const uint32_t* const left,
+                           const uint32_t* const top);
+uint32_t VP8LPredictor11_C(const uint32_t* const left,
+                           const uint32_t* const top);
+uint32_t VP8LPredictor12_C(const uint32_t* const left,
+                           const uint32_t* const top);
+uint32_t VP8LPredictor13_C(const uint32_t* const left,
+                           const uint32_t* const top);
+
 // These Add/Sub function expects upper[-1] and out[-1] to be readable.
 typedef void (*VP8LPredictorAddSubFunc)(const uint32_t* in,
                                         const uint32_t* upper, int num_pixels,
diff --git a/media/libwebp/dsp/lossless_common.h b/media/libwebp/dsp/lossless_common.h
index dd2e4f247e..2b20637a28 100644
--- a/media/libwebp/dsp/lossless_common.h
+++ b/media/libwebp/dsp/lossless_common.h
@@ -177,24 +177,13 @@ uint32_t VP8LSubPixels(uint32_t a, uint32_t b) {
 static void PREDICTOR_ADD(const uint32_t* in, const uint32_t* upper, \
                           int num_pixels, uint32_t* out) {           \
   int x;                                                             \
+  assert(upper != NULL);                                             \
   for (x = 0; x < num_pixels; ++x) {                                 \
-    const uint32_t pred = (PREDICTOR)(out[x - 1], upper + x);        \
+    const uint32_t pred = (PREDICTOR)(&out[x - 1], upper + x);       \
     out[x] = VP8LAddPixels(in[x], pred);                             \
   }                                                                  \
 }
 
-// It subtracts the prediction from the input pixel and stores the residual
-// in the output pixel.
-#define GENERATE_PREDICTOR_SUB(PREDICTOR, PREDICTOR_SUB)             \
-static void PREDICTOR_SUB(const uint32_t* in, const uint32_t* upper, \
-                          int num_pixels, uint32_t* out) {           \
-  int x;                                                             \
-  for (x = 0; x < num_pixels; ++x) {                                 \
-    const uint32_t pred = (PREDICTOR)(in[x - 1], upper + x);         \
-    out[x] = VP8LSubPixels(in[x], pred);                             \
-  }                                                                  \
-}
-
 #ifdef __cplusplus
 }    // extern "C"
 #endif
diff --git a/media/libwebp/dsp/lossless_enc.c b/media/libwebp/dsp/lossless_enc.c
new file mode 100644
index 0000000000..dca5e26be8
--- /dev/null
+++ b/media/libwebp/dsp/lossless_enc.c
@@ -0,0 +1,948 @@
+// Copyright 2015 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Image transform methods for lossless encoder.
+//
+// Authors: Vikas Arora (vikaas.arora@gmail.com)
+//          Jyrki Alakuijala (jyrki@google.com)
+//          Urvang Joshi (urvang@google.com)
+
+#include "../dsp/dsp.h"
+
+#include <assert.h>
+#include <math.h>
+#include <stdlib.h>
+#include "../dec/vp8li_dec.h"
+#include "../utils/endian_inl_utils.h"
+#include "../dsp/lossless.h"
+#include "../dsp/lossless_common.h"
+#include "../dsp/yuv.h"
+
+// lookup table for small values of log2(int)
+const float kLog2Table[LOG_LOOKUP_IDX_MAX] = {
+  0.0000000000000000f, 0.0000000000000000f,
+  1.0000000000000000f, 1.5849625007211560f,
+  2.0000000000000000f, 2.3219280948873621f,
+  2.5849625007211560f, 2.8073549220576041f,
+  3.0000000000000000f, 3.1699250014423121f,
+  3.3219280948873621f, 3.4594316186372973f,
+  3.5849625007211560f, 3.7004397181410921f,
+  3.8073549220576041f, 3.9068905956085187f,
+  4.0000000000000000f, 4.0874628412503390f,
+  4.1699250014423121f, 4.2479275134435852f,
+  4.3219280948873626f, 4.3923174227787606f,
+  4.4594316186372973f, 4.5235619560570130f,
+  4.5849625007211560f, 4.6438561897747243f,
+  4.7004397181410917f, 4.7548875021634682f,
+  4.8073549220576037f, 4.8579809951275718f,
+  4.9068905956085187f, 4.9541963103868749f,
+  5.0000000000000000f, 5.0443941193584533f,
+  5.0874628412503390f, 5.1292830169449663f,
+  5.1699250014423121f, 5.2094533656289501f,
+  5.2479275134435852f, 5.2854022188622487f,
+  5.3219280948873626f, 5.3575520046180837f,
+  5.3923174227787606f, 5.4262647547020979f,
+  5.4594316186372973f, 5.4918530963296747f,
+  5.5235619560570130f, 5.5545888516776376f,
+  5.5849625007211560f, 5.6147098441152083f,
+  5.6438561897747243f, 5.6724253419714951f,
+  5.7004397181410917f, 5.7279204545631987f,
+  5.7548875021634682f, 5.7813597135246599f,
+  5.8073549220576037f, 5.8328900141647412f,
+  5.8579809951275718f, 5.8826430493618415f,
+  5.9068905956085187f, 5.9307373375628866f,
+  5.9541963103868749f, 5.9772799234999167f,
+  6.0000000000000000f, 6.0223678130284543f,
+  6.0443941193584533f, 6.0660891904577720f,
+  6.0874628412503390f, 6.1085244567781691f,
+  6.1292830169449663f, 6.1497471195046822f,
+  6.1699250014423121f, 6.1898245588800175f,
+  6.2094533656289501f, 6.2288186904958804f,
+  6.2479275134435852f, 6.2667865406949010f,
+  6.2854022188622487f, 6.3037807481771030f,
+  6.3219280948873626f, 6.3398500028846243f,
+  6.3575520046180837f, 6.3750394313469245f,
+  6.3923174227787606f, 6.4093909361377017f,
+  6.4262647547020979f, 6.4429434958487279f,
+  6.4594316186372973f, 6.4757334309663976f,
+  6.4918530963296747f, 6.5077946401986963f,
+  6.5235619560570130f, 6.5391588111080309f,
+  6.5545888516776376f, 6.5698556083309478f,
+  6.5849625007211560f, 6.5999128421871278f,
+  6.6147098441152083f, 6.6293566200796094f,
+  6.6438561897747243f, 6.6582114827517946f,
+  6.6724253419714951f, 6.6865005271832185f,
+  6.7004397181410917f, 6.7142455176661224f,
+  6.7279204545631987f, 6.7414669864011464f,
+  6.7548875021634682f, 6.7681843247769259f,
+  6.7813597135246599f, 6.7944158663501061f,
+  6.8073549220576037f, 6.8201789624151878f,
+  6.8328900141647412f, 6.8454900509443747f,
+  6.8579809951275718f, 6.8703647195834047f,
+  6.8826430493618415f, 6.8948177633079437f,
+  6.9068905956085187f, 6.9188632372745946f,
+  6.9307373375628866f, 6.9425145053392398f,
+  6.9541963103868749f, 6.9657842846620869f,
+  6.9772799234999167f, 6.9886846867721654f,
+  7.0000000000000000f, 7.0112272554232539f,
+  7.0223678130284543f, 7.0334230015374501f,
+  7.0443941193584533f, 7.0552824355011898f,
+  7.0660891904577720f, 7.0768155970508308f,
+  7.0874628412503390f, 7.0980320829605263f,
+  7.1085244567781691f, 7.1189410727235076f,
+  7.1292830169449663f, 7.1395513523987936f,
+  7.1497471195046822f, 7.1598713367783890f,
+  7.1699250014423121f, 7.1799090900149344f,
+  7.1898245588800175f, 7.1996723448363644f,
+  7.2094533656289501f, 7.2191685204621611f,
+  7.2288186904958804f, 7.2384047393250785f,
+  7.2479275134435852f, 7.2573878426926521f,
+  7.2667865406949010f, 7.2761244052742375f,
+  7.2854022188622487f, 7.2946207488916270f,
+  7.3037807481771030f, 7.3128829552843557f,
+  7.3219280948873626f, 7.3309168781146167f,
+  7.3398500028846243f, 7.3487281542310771f,
+  7.3575520046180837f, 7.3663222142458160f,
+  7.3750394313469245f, 7.3837042924740519f,
+  7.3923174227787606f, 7.4008794362821843f,
+  7.4093909361377017f, 7.4178525148858982f,
+  7.4262647547020979f, 7.4346282276367245f,
+  7.4429434958487279f, 7.4512111118323289f,
+  7.4594316186372973f, 7.4676055500829976f,
+  7.4757334309663976f, 7.4838157772642563f,
+  7.4918530963296747f, 7.4998458870832056f,
+  7.5077946401986963f, 7.5156998382840427f,
+  7.5235619560570130f, 7.5313814605163118f,
+  7.5391588111080309f, 7.5468944598876364f,
+  7.5545888516776376f, 7.5622424242210728f,
+  7.5698556083309478f, 7.5774288280357486f,
+  7.5849625007211560f, 7.5924570372680806f,
+  7.5999128421871278f, 7.6073303137496104f,
+  7.6147098441152083f, 7.6220518194563764f,
+  7.6293566200796094f, 7.6366246205436487f,
+  7.6438561897747243f, 7.6510516911789281f,
+  7.6582114827517946f, 7.6653359171851764f,
+  7.6724253419714951f, 7.6794800995054464f,
+  7.6865005271832185f, 7.6934869574993252f,
+  7.7004397181410917f, 7.7073591320808825f,
+  7.7142455176661224f, 7.7210991887071855f,
+  7.7279204545631987f, 7.7347096202258383f,
+  7.7414669864011464f, 7.7481928495894605f,
+  7.7548875021634682f, 7.7615512324444795f,
+  7.7681843247769259f, 7.7747870596011736f,
+  7.7813597135246599f, 7.7879025593914317f,
+  7.7944158663501061f, 7.8008998999203047f,
+  7.8073549220576037f, 7.8137811912170374f,
+  7.8201789624151878f, 7.8265484872909150f,
+  7.8328900141647412f, 7.8392037880969436f,
+  7.8454900509443747f, 7.8517490414160571f,
+  7.8579809951275718f, 7.8641861446542797f,
+  7.8703647195834047f, 7.8765169465649993f,
+  7.8826430493618415f, 7.8887432488982591f,
+  7.8948177633079437f, 7.9008668079807486f,
+  7.9068905956085187f, 7.9128893362299619f,
+  7.9188632372745946f, 7.9248125036057812f,
+  7.9307373375628866f, 7.9366379390025709f,
+  7.9425145053392398f, 7.9483672315846778f,
+  7.9541963103868749f, 7.9600019320680805f,
+  7.9657842846620869f, 7.9715435539507719f,
+  7.9772799234999167f, 7.9829935746943103f,
+  7.9886846867721654f, 7.9943534368588577f
+};
+
+const float kSLog2Table[LOG_LOOKUP_IDX_MAX] = {
+  0.00000000f,    0.00000000f,  2.00000000f,   4.75488750f,
+  8.00000000f,   11.60964047f,  15.50977500f,  19.65148445f,
+  24.00000000f,  28.52932501f,  33.21928095f,  38.05374781f,
+  43.01955001f,  48.10571634f,  53.30296891f,  58.60335893f,
+  64.00000000f,  69.48686830f,  75.05865003f,  80.71062276f,
+  86.43856190f,  92.23866588f,  98.10749561f,  104.04192499f,
+  110.03910002f, 116.09640474f, 122.21143267f, 128.38196256f,
+  134.60593782f, 140.88144886f, 147.20671787f, 153.58008562f,
+  160.00000000f, 166.46500594f, 172.97373660f, 179.52490559f,
+  186.11730005f, 192.74977453f, 199.42124551f, 206.13068654f,
+  212.87712380f, 219.65963219f, 226.47733176f, 233.32938445f,
+  240.21499122f, 247.13338933f, 254.08384998f, 261.06567603f,
+  268.07820003f, 275.12078236f, 282.19280949f, 289.29369244f,
+  296.42286534f, 303.57978409f, 310.76392512f, 317.97478424f,
+  325.21187564f, 332.47473081f, 339.76289772f, 347.07593991f,
+  354.41343574f, 361.77497759f, 369.16017124f, 376.56863518f,
+  384.00000000f, 391.45390785f, 398.93001188f, 406.42797576f,
+  413.94747321f, 421.48818752f, 429.04981119f, 436.63204548f,
+  444.23460010f, 451.85719280f, 459.49954906f, 467.16140179f,
+  474.84249102f, 482.54256363f, 490.26137307f, 497.99867911f,
+  505.75424759f, 513.52785023f, 521.31926438f, 529.12827280f,
+  536.95466351f, 544.79822957f, 552.65876890f, 560.53608414f,
+  568.42998244f, 576.34027536f, 584.26677867f, 592.20931226f,
+  600.16769996f, 608.14176943f, 616.13135206f, 624.13628279f,
+  632.15640007f, 640.19154569f, 648.24156472f, 656.30630539f,
+  664.38561898f, 672.47935976f, 680.58738488f, 688.70955430f,
+  696.84573069f, 704.99577935f, 713.15956818f, 721.33696754f,
+  729.52785023f, 737.73209140f, 745.94956849f, 754.18016116f,
+  762.42375127f, 770.68022275f, 778.94946161f, 787.23135586f,
+  795.52579543f, 803.83267219f, 812.15187982f, 820.48331383f,
+  828.82687147f, 837.18245171f, 845.54995518f, 853.92928416f,
+  862.32034249f, 870.72303558f, 879.13727036f, 887.56295522f,
+  896.00000000f, 904.44831595f, 912.90781569f, 921.37841320f,
+  929.86002376f, 938.35256392f, 946.85595152f, 955.37010560f,
+  963.89494641f, 972.43039537f, 980.97637504f, 989.53280911f,
+  998.09962237f, 1006.67674069f, 1015.26409097f, 1023.86160116f,
+  1032.46920021f, 1041.08681805f, 1049.71438560f, 1058.35183469f,
+  1066.99909811f, 1075.65610955f, 1084.32280357f, 1092.99911564f,
+  1101.68498204f, 1110.38033993f, 1119.08512727f, 1127.79928282f,
+  1136.52274614f, 1145.25545758f, 1153.99735821f, 1162.74838989f,
+  1171.50849518f, 1180.27761738f, 1189.05570047f, 1197.84268914f,
+  1206.63852876f, 1215.44316535f, 1224.25654560f, 1233.07861684f,
+  1241.90932703f, 1250.74862473f, 1259.59645914f, 1268.45278005f,
+  1277.31753781f, 1286.19068338f, 1295.07216828f, 1303.96194457f,
+  1312.85996488f, 1321.76618236f, 1330.68055071f, 1339.60302413f,
+  1348.53355734f, 1357.47210556f, 1366.41862452f, 1375.37307041f,
+  1384.33539991f, 1393.30557020f, 1402.28353887f, 1411.26926400f,
+  1420.26270412f, 1429.26381818f, 1438.27256558f, 1447.28890615f,
+  1456.31280014f, 1465.34420819f, 1474.38309138f, 1483.42941118f,
+  1492.48312945f, 1501.54420843f, 1510.61261078f, 1519.68829949f,
+  1528.77123795f, 1537.86138993f, 1546.95871952f, 1556.06319119f,
+  1565.17476976f, 1574.29342040f, 1583.41910860f, 1592.55180020f,
+  1601.69146137f, 1610.83805860f, 1619.99155871f, 1629.15192882f,
+  1638.31913637f, 1647.49314911f, 1656.67393509f, 1665.86146266f,
+  1675.05570047f, 1684.25661744f, 1693.46418280f, 1702.67836605f,
+  1711.89913698f, 1721.12646563f, 1730.36032233f, 1739.60067768f,
+  1748.84750254f, 1758.10076802f, 1767.36044551f, 1776.62650662f,
+  1785.89892323f, 1795.17766747f, 1804.46271172f, 1813.75402857f,
+  1823.05159087f, 1832.35537170f, 1841.66534438f, 1850.98148244f,
+  1860.30375965f, 1869.63214999f, 1878.96662767f, 1888.30716711f,
+  1897.65374295f, 1907.00633003f, 1916.36490342f, 1925.72943838f,
+  1935.09991037f, 1944.47629506f, 1953.85856831f, 1963.24670620f,
+  1972.64068498f, 1982.04048108f, 1991.44607117f, 2000.85743204f,
+  2010.27454072f, 2019.69737440f, 2029.12591044f, 2038.56012640f
+};
+
+const VP8LPrefixCode kPrefixEncodeCode[PREFIX_LOOKUP_IDX_MAX] = {
+  { 0, 0}, { 0, 0}, { 1, 0}, { 2, 0}, { 3, 0}, { 4, 1}, { 4, 1}, { 5, 1},
+  { 5, 1}, { 6, 2}, { 6, 2}, { 6, 2}, { 6, 2}, { 7, 2}, { 7, 2}, { 7, 2},
+  { 7, 2}, { 8, 3}, { 8, 3}, { 8, 3}, { 8, 3}, { 8, 3}, { 8, 3}, { 8, 3},
+  { 8, 3}, { 9, 3}, { 9, 3}, { 9, 3}, { 9, 3}, { 9, 3}, { 9, 3}, { 9, 3},
+  { 9, 3}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4},
+  {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4},
+  {10, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4},
+  {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4},
+  {11, 4}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5},
+  {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5},
+  {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5},
+  {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5},
+  {12, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5},
+  {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5},
+  {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5},
+  {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5},
+  {13, 5}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
+  {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
+  {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
+  {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
+  {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
+  {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
+  {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
+  {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
+  {14, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
+  {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
+  {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
+  {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
+  {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
+  {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
+  {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
+  {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
+  {15, 6}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+  {16, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+};
+
+const uint8_t kPrefixEncodeExtraBitsValue[PREFIX_LOOKUP_IDX_MAX] = {
+   0,  0,  0,  0,  0,  0,  1,  0,  1,  0,  1,  2,  3,  0,  1,  2,  3,
+   0,  1,  2,  3,  4,  5,  6,  7,  0,  1,  2,  3,  4,  5,  6,  7,
+   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+  32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+  48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+  32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+  48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+  32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+  48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+  64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
+  80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
+  96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
+  112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126,
+  127,
+   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+  32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+  48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+  64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
+  80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
+  96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
+  112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126
+};
+
+static float FastSLog2Slow_C(uint32_t v) {
+  assert(v >= LOG_LOOKUP_IDX_MAX);
+  if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
+#if !defined(WEBP_HAVE_SLOW_CLZ_CTZ)
+    // use clz if available
+    const int log_cnt = BitsLog2Floor(v) - 7;
+    const uint32_t y = 1 << log_cnt;
+    int correction = 0;
+    const float v_f = (float)v;
+    const uint32_t orig_v = v;
+    v >>= log_cnt;
+#else
+    int log_cnt = 0;
+    uint32_t y = 1;
+    int correction = 0;
+    const float v_f = (float)v;
+    const uint32_t orig_v = v;
+    do {
+      ++log_cnt;
+      v = v >> 1;
+      y = y << 1;
+    } while (v >= LOG_LOOKUP_IDX_MAX);
+#endif
+    // vf = (2^log_cnt) * Xf; where y = 2^log_cnt and Xf < 256
+    // Xf = floor(Xf) * (1 + (v % y) / v)
+    // log2(Xf) = log2(floor(Xf)) + log2(1 + (v % y) / v)
+    // The correction factor: log(1 + d) ~ d; for very small d values, so
+    // log2(1 + (v % y) / v) ~ LOG_2_RECIPROCAL * (v % y)/v
+    // LOG_2_RECIPROCAL ~ 23/16
+    correction = (23 * (orig_v & (y - 1))) >> 4;
+    return v_f * (kLog2Table[v] + log_cnt) + correction;
+  } else {
+    return (float)(LOG_2_RECIPROCAL * v * log((double)v));
+  }
+}
+
+static float FastLog2Slow_C(uint32_t v) {
+  assert(v >= LOG_LOOKUP_IDX_MAX);
+  if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
+#if !defined(WEBP_HAVE_SLOW_CLZ_CTZ)
+    // use clz if available
+    const int log_cnt = BitsLog2Floor(v) - 7;
+    const uint32_t y = 1 << log_cnt;
+    const uint32_t orig_v = v;
+    double log_2;
+    v >>= log_cnt;
+#else
+    int log_cnt = 0;
+    uint32_t y = 1;
+    const uint32_t orig_v = v;
+    double log_2;
+    do {
+      ++log_cnt;
+      v = v >> 1;
+      y = y << 1;
+    } while (v >= LOG_LOOKUP_IDX_MAX);
+#endif
+    log_2 = kLog2Table[v] + log_cnt;
+    if (orig_v >= APPROX_LOG_MAX) {
+      // Since the division is still expensive, add this correction factor only
+      // for large values of 'v'.
+      const int correction = (23 * (orig_v & (y - 1))) >> 4;
+      log_2 += (double)correction / orig_v;
+    }
+    return (float)log_2;
+  } else {
+    return (float)(LOG_2_RECIPROCAL * log((double)v));
+  }
+}
+
+//------------------------------------------------------------------------------
+// Methods to calculate Entropy (Shannon).
+
+// Compute the combined Shanon's entropy for distribution {X} and {X+Y}
+static float CombinedShannonEntropy_C(const int X[256], const int Y[256]) {
+  int i;
+  double retval = 0.;
+  int sumX = 0, sumXY = 0;
+  for (i = 0; i < 256; ++i) {
+    const int x = X[i];
+    if (x != 0) {
+      const int xy = x + Y[i];
+      sumX += x;
+      retval -= VP8LFastSLog2(x);
+      sumXY += xy;
+      retval -= VP8LFastSLog2(xy);
+    } else if (Y[i] != 0) {
+      sumXY += Y[i];
+      retval -= VP8LFastSLog2(Y[i]);
+    }
+  }
+  retval += VP8LFastSLog2(sumX) + VP8LFastSLog2(sumXY);
+  return (float)retval;
+}
+
+void VP8LBitEntropyInit(VP8LBitEntropy* const entropy) {
+  entropy->entropy = 0.;
+  entropy->sum = 0;
+  entropy->nonzeros = 0;
+  entropy->max_val = 0;
+  entropy->nonzero_code = VP8L_NON_TRIVIAL_SYM;
+}
+
+void VP8LBitsEntropyUnrefined(const uint32_t* const array, int n,
+                              VP8LBitEntropy* const entropy) {
+  int i;
+
+  VP8LBitEntropyInit(entropy);
+
+  for (i = 0; i < n; ++i) {
+    if (array[i] != 0) {
+      entropy->sum += array[i];
+      entropy->nonzero_code = i;
+      ++entropy->nonzeros;
+      entropy->entropy -= VP8LFastSLog2(array[i]);
+      if (entropy->max_val < array[i]) {
+        entropy->max_val = array[i];
+      }
+    }
+  }
+  entropy->entropy += VP8LFastSLog2(entropy->sum);
+}
+
+static WEBP_INLINE void GetEntropyUnrefinedHelper(
+    uint32_t val, int i, uint32_t* const val_prev, int* const i_prev,
+    VP8LBitEntropy* const bit_entropy, VP8LStreaks* const stats) {
+  const int streak = i - *i_prev;
+
+  // Gather info for the bit entropy.
+  if (*val_prev != 0) {
+    bit_entropy->sum += (*val_prev) * streak;
+    bit_entropy->nonzeros += streak;
+    bit_entropy->nonzero_code = *i_prev;
+    bit_entropy->entropy -= VP8LFastSLog2(*val_prev) * streak;
+    if (bit_entropy->max_val < *val_prev) {
+      bit_entropy->max_val = *val_prev;
+    }
+  }
+
+  // Gather info for the Huffman cost.
+  stats->counts[*val_prev != 0] += (streak > 3);
+  stats->streaks[*val_prev != 0][(streak > 3)] += streak;
+
+  *val_prev = val;
+  *i_prev = i;
+}
+
+static void GetEntropyUnrefined_C(const uint32_t X[], int length,
+                                  VP8LBitEntropy* const bit_entropy,
+                                  VP8LStreaks* const stats) {
+  int i;
+  int i_prev = 0;
+  uint32_t x_prev = X[0];
+
+  memset(stats, 0, sizeof(*stats));
+  VP8LBitEntropyInit(bit_entropy);
+
+  for (i = 1; i < length; ++i) {
+    const uint32_t x = X[i];
+    if (x != x_prev) {
+      GetEntropyUnrefinedHelper(x, i, &x_prev, &i_prev, bit_entropy, stats);
+    }
+  }
+  GetEntropyUnrefinedHelper(0, i, &x_prev, &i_prev, bit_entropy, stats);
+
+  bit_entropy->entropy += VP8LFastSLog2(bit_entropy->sum);
+}
+
+static void GetCombinedEntropyUnrefined_C(const uint32_t X[],
+                                          const uint32_t Y[],
+                                          int length,
+                                          VP8LBitEntropy* const bit_entropy,
+                                          VP8LStreaks* const stats) {
+  int i = 1;
+  int i_prev = 0;
+  uint32_t xy_prev = X[0] + Y[0];
+
+  memset(stats, 0, sizeof(*stats));
+  VP8LBitEntropyInit(bit_entropy);
+
+  for (i = 1; i < length; ++i) {
+    const uint32_t xy = X[i] + Y[i];
+    if (xy != xy_prev) {
+      GetEntropyUnrefinedHelper(xy, i, &xy_prev, &i_prev, bit_entropy, stats);
+    }
+  }
+  GetEntropyUnrefinedHelper(0, i, &xy_prev, &i_prev, bit_entropy, stats);
+
+  bit_entropy->entropy += VP8LFastSLog2(bit_entropy->sum);
+}
+
+//------------------------------------------------------------------------------
+
+void VP8LSubtractGreenFromBlueAndRed_C(uint32_t* argb_data, int num_pixels) {
+  int i;
+  for (i = 0; i < num_pixels; ++i) {
+    const int argb = argb_data[i];
+    const int green = (argb >> 8) & 0xff;
+    const uint32_t new_r = (((argb >> 16) & 0xff) - green) & 0xff;
+    const uint32_t new_b = (((argb >>  0) & 0xff) - green) & 0xff;
+    argb_data[i] = (argb & 0xff00ff00u) | (new_r << 16) | new_b;
+  }
+}
+
+static WEBP_INLINE int ColorTransformDelta(int8_t color_pred, int8_t color) {
+  return ((int)color_pred * color) >> 5;
+}
+
+static WEBP_INLINE int8_t U32ToS8(uint32_t v) {
+  return (int8_t)(v & 0xff);
+}
+
+void VP8LTransformColor_C(const VP8LMultipliers* const m, uint32_t* data,
+                          int num_pixels) {
+  int i;
+  for (i = 0; i < num_pixels; ++i) {
+    const uint32_t argb = data[i];
+    const int8_t green = U32ToS8(argb >>  8);
+    const int8_t red   = U32ToS8(argb >> 16);
+    int new_red = red & 0xff;
+    int new_blue = argb & 0xff;
+    new_red -= ColorTransformDelta(m->green_to_red_, green);
+    new_red &= 0xff;
+    new_blue -= ColorTransformDelta(m->green_to_blue_, green);
+    new_blue -= ColorTransformDelta(m->red_to_blue_, red);
+    new_blue &= 0xff;
+    data[i] = (argb & 0xff00ff00u) | (new_red << 16) | (new_blue);
+  }
+}
+
+static WEBP_INLINE uint8_t TransformColorRed(uint8_t green_to_red,
+                                             uint32_t argb) {
+  const int8_t green = U32ToS8(argb >> 8);
+  int new_red = argb >> 16;
+  new_red -= ColorTransformDelta(green_to_red, green);
+  return (new_red & 0xff);
+}
+
+static WEBP_INLINE uint8_t TransformColorBlue(uint8_t green_to_blue,
+                                              uint8_t red_to_blue,
+                                              uint32_t argb) {
+  const int8_t green = U32ToS8(argb >>  8);
+  const int8_t red   = U32ToS8(argb >> 16);
+  uint8_t new_blue = argb & 0xff;
+  new_blue -= ColorTransformDelta(green_to_blue, green);
+  new_blue -= ColorTransformDelta(red_to_blue, red);
+  return (new_blue & 0xff);
+}
+
+void VP8LCollectColorRedTransforms_C(const uint32_t* argb, int stride,
+                                     int tile_width, int tile_height,
+                                     int green_to_red, int histo[]) {
+  while (tile_height-- > 0) {
+    int x;
+    for (x = 0; x < tile_width; ++x) {
+      ++histo[TransformColorRed((uint8_t)green_to_red, argb[x])];
+    }
+    argb += stride;
+  }
+}
+
+void VP8LCollectColorBlueTransforms_C(const uint32_t* argb, int stride,
+                                      int tile_width, int tile_height,
+                                      int green_to_blue, int red_to_blue,
+                                      int histo[]) {
+  while (tile_height-- > 0) {
+    int x;
+    for (x = 0; x < tile_width; ++x) {
+      ++histo[TransformColorBlue((uint8_t)green_to_blue, (uint8_t)red_to_blue,
+                                 argb[x])];
+    }
+    argb += stride;
+  }
+}
+
+//------------------------------------------------------------------------------
+
+static int VectorMismatch_C(const uint32_t* const array1,
+                            const uint32_t* const array2, int length) {
+  int match_len = 0;
+
+  while (match_len < length && array1[match_len] == array2[match_len]) {
+    ++match_len;
+  }
+  return match_len;
+}
+
+// Bundles multiple (1, 2, 4 or 8) pixels into a single pixel.
+void VP8LBundleColorMap_C(const uint8_t* const row, int width, int xbits,
+                          uint32_t* dst) {
+  int x;
+  if (xbits > 0) {
+    const int bit_depth = 1 << (3 - xbits);
+    const int mask = (1 << xbits) - 1;
+    uint32_t code = 0xff000000;
+    for (x = 0; x < width; ++x) {
+      const int xsub = x & mask;
+      if (xsub == 0) {
+        code = 0xff000000;
+      }
+      code |= row[x] << (8 + bit_depth * xsub);
+      dst[x >> xbits] = code;
+    }
+  } else {
+    for (x = 0; x < width; ++x) dst[x] = 0xff000000 | (row[x] << 8);
+  }
+}
+
+//------------------------------------------------------------------------------
+
+static double ExtraCost_C(const uint32_t* population, int length) {
+  int i;
+  double cost = 0.;
+  for (i = 2; i < length - 2; ++i) cost += (i >> 1) * population[i + 2];
+  return cost;
+}
+
+static double ExtraCostCombined_C(const uint32_t* X, const uint32_t* Y,
+                                  int length) {
+  int i;
+  double cost = 0.;
+  for (i = 2; i < length - 2; ++i) {
+    const int xy = X[i + 2] + Y[i + 2];
+    cost += (i >> 1) * xy;
+  }
+  return cost;
+}
+
+//------------------------------------------------------------------------------
+
+static void AddVector_C(const uint32_t* a, const uint32_t* b, uint32_t* out,
+                        int size) {
+  int i;
+  for (i = 0; i < size; ++i) out[i] = a[i] + b[i];
+}
+
+static void AddVectorEq_C(const uint32_t* a, uint32_t* out, int size) {
+  int i;
+  for (i = 0; i < size; ++i) out[i] += a[i];
+}
+
+#define ADD(X, ARG, LEN) do {                                                  \
+  if (a->is_used_[X]) {                                                        \
+    if (b->is_used_[X]) {                                                      \
+      VP8LAddVector(a->ARG, b->ARG, out->ARG, (LEN));                          \
+    } else {                                                                   \
+      memcpy(&out->ARG[0], &a->ARG[0], (LEN) * sizeof(out->ARG[0]));           \
+    }                                                                          \
+  } else if (b->is_used_[X]) {                                                 \
+    memcpy(&out->ARG[0], &b->ARG[0], (LEN) * sizeof(out->ARG[0]));             \
+  } else {                                                                     \
+    memset(&out->ARG[0], 0, (LEN) * sizeof(out->ARG[0]));                      \
+  }                                                                            \
+} while (0)
+
+#define ADD_EQ(X, ARG, LEN) do {                                               \
+  if (a->is_used_[X]) {                                                        \
+    if (out->is_used_[X]) {                                                    \
+      VP8LAddVectorEq(a->ARG, out->ARG, (LEN));                                \
+    } else {                                                                   \
+      memcpy(&out->ARG[0], &a->ARG[0], (LEN) * sizeof(out->ARG[0]));           \
+    }                                                                          \
+  }                                                                            \
+} while (0)
+
+void VP8LHistogramAdd(const VP8LHistogram* const a,
+                      const VP8LHistogram* const b, VP8LHistogram* const out) {
+  int i;
+  const int literal_size = VP8LHistogramNumCodes(a->palette_code_bits_);
+  assert(a->palette_code_bits_ == b->palette_code_bits_);
+
+  if (b != out) {
+    ADD(0, literal_, literal_size);
+    ADD(1, red_, NUM_LITERAL_CODES);
+    ADD(2, blue_, NUM_LITERAL_CODES);
+    ADD(3, alpha_, NUM_LITERAL_CODES);
+    ADD(4, distance_, NUM_DISTANCE_CODES);
+    for (i = 0; i < 5; ++i) {
+      out->is_used_[i] = (a->is_used_[i] | b->is_used_[i]);
+    }
+  } else {
+    ADD_EQ(0, literal_, literal_size);
+    ADD_EQ(1, red_, NUM_LITERAL_CODES);
+    ADD_EQ(2, blue_, NUM_LITERAL_CODES);
+    ADD_EQ(3, alpha_, NUM_LITERAL_CODES);
+    ADD_EQ(4, distance_, NUM_DISTANCE_CODES);
+    for (i = 0; i < 5; ++i) out->is_used_[i] |= a->is_used_[i];
+  }
+}
+#undef ADD
+#undef ADD_EQ
+
+//------------------------------------------------------------------------------
+// Image transforms.
+
+static void PredictorSub0_C(const uint32_t* in, const uint32_t* upper,
+                            int num_pixels, uint32_t* out) {
+  int i;
+  for (i = 0; i < num_pixels; ++i) out[i] = VP8LSubPixels(in[i], ARGB_BLACK);
+  (void)upper;
+}
+
+static void PredictorSub1_C(const uint32_t* in, const uint32_t* upper,
+                            int num_pixels, uint32_t* out) {
+  int i;
+  for (i = 0; i < num_pixels; ++i) out[i] = VP8LSubPixels(in[i], in[i - 1]);
+  (void)upper;
+}
+
+// It subtracts the prediction from the input pixel and stores the residual
+// in the output pixel.
+#define GENERATE_PREDICTOR_SUB(PREDICTOR_I)                                \
+static void PredictorSub##PREDICTOR_I##_C(const uint32_t* in,              \
+                                          const uint32_t* upper,           \
+                                          int num_pixels, uint32_t* out) { \
+  int x;                                                                   \
+  assert(upper != NULL);                                                   \
+  for (x = 0; x < num_pixels; ++x) {                                       \
+    const uint32_t pred =                                                  \
+        VP8LPredictor##PREDICTOR_I##_C(&in[x - 1], upper + x);             \
+    out[x] = VP8LSubPixels(in[x], pred);                                   \
+  }                                                                        \
+}
+
+GENERATE_PREDICTOR_SUB(2)
+GENERATE_PREDICTOR_SUB(3)
+GENERATE_PREDICTOR_SUB(4)
+GENERATE_PREDICTOR_SUB(5)
+GENERATE_PREDICTOR_SUB(6)
+GENERATE_PREDICTOR_SUB(7)
+GENERATE_PREDICTOR_SUB(8)
+GENERATE_PREDICTOR_SUB(9)
+GENERATE_PREDICTOR_SUB(10)
+GENERATE_PREDICTOR_SUB(11)
+GENERATE_PREDICTOR_SUB(12)
+GENERATE_PREDICTOR_SUB(13)
+
+//------------------------------------------------------------------------------
+
+VP8LProcessEncBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed;
+
+VP8LTransformColorFunc VP8LTransformColor;
+
+VP8LCollectColorBlueTransformsFunc VP8LCollectColorBlueTransforms;
+VP8LCollectColorRedTransformsFunc VP8LCollectColorRedTransforms;
+
+VP8LFastLog2SlowFunc VP8LFastLog2Slow;
+VP8LFastLog2SlowFunc VP8LFastSLog2Slow;
+
+VP8LCostFunc VP8LExtraCost;
+VP8LCostCombinedFunc VP8LExtraCostCombined;
+VP8LCombinedShannonEntropyFunc VP8LCombinedShannonEntropy;
+
+VP8LGetEntropyUnrefinedFunc VP8LGetEntropyUnrefined;
+VP8LGetCombinedEntropyUnrefinedFunc VP8LGetCombinedEntropyUnrefined;
+
+VP8LAddVectorFunc VP8LAddVector;
+VP8LAddVectorEqFunc VP8LAddVectorEq;
+
+VP8LVectorMismatchFunc VP8LVectorMismatch;
+VP8LBundleColorMapFunc VP8LBundleColorMap;
+
+VP8LPredictorAddSubFunc VP8LPredictorsSub[16];
+VP8LPredictorAddSubFunc VP8LPredictorsSub_C[16];
+
+extern void VP8LEncDspInitSSE2(void);
+extern void VP8LEncDspInitSSE41(void);
+extern void VP8LEncDspInitNEON(void);
+extern void VP8LEncDspInitMIPS32(void);
+extern void VP8LEncDspInitMIPSdspR2(void);
+extern void VP8LEncDspInitMSA(void);
+
+WEBP_DSP_INIT_FUNC(VP8LEncDspInit) {
+  VP8LDspInit();
+
+#if !WEBP_NEON_OMIT_C_CODE
+  VP8LSubtractGreenFromBlueAndRed = VP8LSubtractGreenFromBlueAndRed_C;
+
+  VP8LTransformColor = VP8LTransformColor_C;
+#endif
+
+  VP8LCollectColorBlueTransforms = VP8LCollectColorBlueTransforms_C;
+  VP8LCollectColorRedTransforms = VP8LCollectColorRedTransforms_C;
+
+  VP8LFastLog2Slow = FastLog2Slow_C;
+  VP8LFastSLog2Slow = FastSLog2Slow_C;
+
+  VP8LExtraCost = ExtraCost_C;
+  VP8LExtraCostCombined = ExtraCostCombined_C;
+  VP8LCombinedShannonEntropy = CombinedShannonEntropy_C;
+
+  VP8LGetEntropyUnrefined = GetEntropyUnrefined_C;
+  VP8LGetCombinedEntropyUnrefined = GetCombinedEntropyUnrefined_C;
+
+  VP8LAddVector = AddVector_C;
+  VP8LAddVectorEq = AddVectorEq_C;
+
+  VP8LVectorMismatch = VectorMismatch_C;
+  VP8LBundleColorMap = VP8LBundleColorMap_C;
+
+  VP8LPredictorsSub[0] = PredictorSub0_C;
+  VP8LPredictorsSub[1] = PredictorSub1_C;
+  VP8LPredictorsSub[2] = PredictorSub2_C;
+  VP8LPredictorsSub[3] = PredictorSub3_C;
+  VP8LPredictorsSub[4] = PredictorSub4_C;
+  VP8LPredictorsSub[5] = PredictorSub5_C;
+  VP8LPredictorsSub[6] = PredictorSub6_C;
+  VP8LPredictorsSub[7] = PredictorSub7_C;
+  VP8LPredictorsSub[8] = PredictorSub8_C;
+  VP8LPredictorsSub[9] = PredictorSub9_C;
+  VP8LPredictorsSub[10] = PredictorSub10_C;
+  VP8LPredictorsSub[11] = PredictorSub11_C;
+  VP8LPredictorsSub[12] = PredictorSub12_C;
+  VP8LPredictorsSub[13] = PredictorSub13_C;
+  VP8LPredictorsSub[14] = PredictorSub0_C;  // <- padding security sentinels
+  VP8LPredictorsSub[15] = PredictorSub0_C;
+
+  VP8LPredictorsSub_C[0] = PredictorSub0_C;
+  VP8LPredictorsSub_C[1] = PredictorSub1_C;
+  VP8LPredictorsSub_C[2] = PredictorSub2_C;
+  VP8LPredictorsSub_C[3] = PredictorSub3_C;
+  VP8LPredictorsSub_C[4] = PredictorSub4_C;
+  VP8LPredictorsSub_C[5] = PredictorSub5_C;
+  VP8LPredictorsSub_C[6] = PredictorSub6_C;
+  VP8LPredictorsSub_C[7] = PredictorSub7_C;
+  VP8LPredictorsSub_C[8] = PredictorSub8_C;
+  VP8LPredictorsSub_C[9] = PredictorSub9_C;
+  VP8LPredictorsSub_C[10] = PredictorSub10_C;
+  VP8LPredictorsSub_C[11] = PredictorSub11_C;
+  VP8LPredictorsSub_C[12] = PredictorSub12_C;
+  VP8LPredictorsSub_C[13] = PredictorSub13_C;
+  VP8LPredictorsSub_C[14] = PredictorSub0_C;  // <- padding security sentinels
+  VP8LPredictorsSub_C[15] = PredictorSub0_C;
+
+  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
+  if (VP8GetCPUInfo != NULL) {
+#if defined(WEBP_HAVE_SSE2)
+    if (VP8GetCPUInfo(kSSE2)) {
+      VP8LEncDspInitSSE2();
+#if defined(WEBP_HAVE_SSE41)
+      if (VP8GetCPUInfo(kSSE4_1)) {
+        VP8LEncDspInitSSE41();
+      }
+#endif
+    }
+#endif
+#if defined(WEBP_USE_MIPS32)
+    if (VP8GetCPUInfo(kMIPS32)) {
+      VP8LEncDspInitMIPS32();
+    }
+#endif
+#if defined(WEBP_USE_MIPS_DSP_R2)
+    if (VP8GetCPUInfo(kMIPSdspR2)) {
+      VP8LEncDspInitMIPSdspR2();
+    }
+#endif
+#if defined(WEBP_USE_MSA)
+    if (VP8GetCPUInfo(kMSA)) {
+      VP8LEncDspInitMSA();
+    }
+#endif
+  }
+
+#if defined(WEBP_HAVE_NEON)
+  if (WEBP_NEON_OMIT_C_CODE ||
+      (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
+    VP8LEncDspInitNEON();
+  }
+#endif
+
+  assert(VP8LSubtractGreenFromBlueAndRed != NULL);
+  assert(VP8LTransformColor != NULL);
+  assert(VP8LCollectColorBlueTransforms != NULL);
+  assert(VP8LCollectColorRedTransforms != NULL);
+  assert(VP8LFastLog2Slow != NULL);
+  assert(VP8LFastSLog2Slow != NULL);
+  assert(VP8LExtraCost != NULL);
+  assert(VP8LExtraCostCombined != NULL);
+  assert(VP8LCombinedShannonEntropy != NULL);
+  assert(VP8LGetEntropyUnrefined != NULL);
+  assert(VP8LGetCombinedEntropyUnrefined != NULL);
+  assert(VP8LAddVector != NULL);
+  assert(VP8LAddVectorEq != NULL);
+  assert(VP8LVectorMismatch != NULL);
+  assert(VP8LBundleColorMap != NULL);
+  assert(VP8LPredictorsSub[0] != NULL);
+  assert(VP8LPredictorsSub[1] != NULL);
+  assert(VP8LPredictorsSub[2] != NULL);
+  assert(VP8LPredictorsSub[3] != NULL);
+  assert(VP8LPredictorsSub[4] != NULL);
+  assert(VP8LPredictorsSub[5] != NULL);
+  assert(VP8LPredictorsSub[6] != NULL);
+  assert(VP8LPredictorsSub[7] != NULL);
+  assert(VP8LPredictorsSub[8] != NULL);
+  assert(VP8LPredictorsSub[9] != NULL);
+  assert(VP8LPredictorsSub[10] != NULL);
+  assert(VP8LPredictorsSub[11] != NULL);
+  assert(VP8LPredictorsSub[12] != NULL);
+  assert(VP8LPredictorsSub[13] != NULL);
+  assert(VP8LPredictorsSub[14] != NULL);
+  assert(VP8LPredictorsSub[15] != NULL);
+  assert(VP8LPredictorsSub_C[0] != NULL);
+  assert(VP8LPredictorsSub_C[1] != NULL);
+  assert(VP8LPredictorsSub_C[2] != NULL);
+  assert(VP8LPredictorsSub_C[3] != NULL);
+  assert(VP8LPredictorsSub_C[4] != NULL);
+  assert(VP8LPredictorsSub_C[5] != NULL);
+  assert(VP8LPredictorsSub_C[6] != NULL);
+  assert(VP8LPredictorsSub_C[7] != NULL);
+  assert(VP8LPredictorsSub_C[8] != NULL);
+  assert(VP8LPredictorsSub_C[9] != NULL);
+  assert(VP8LPredictorsSub_C[10] != NULL);
+  assert(VP8LPredictorsSub_C[11] != NULL);
+  assert(VP8LPredictorsSub_C[12] != NULL);
+  assert(VP8LPredictorsSub_C[13] != NULL);
+  assert(VP8LPredictorsSub_C[14] != NULL);
+  assert(VP8LPredictorsSub_C[15] != NULL);
+}
+
+//------------------------------------------------------------------------------
diff --git a/media/libwebp/dsp/lossless_enc_mips32.c b/media/libwebp/dsp/lossless_enc_mips32.c
new file mode 100644
index 0000000000..088e608b44
--- /dev/null
+++ b/media/libwebp/dsp/lossless_enc_mips32.c
@@ -0,0 +1,397 @@
+// Copyright 2015 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MIPS version of lossless functions
+//
+// Author(s):  Djordje Pesut    (djordje.pesut@imgtec.com)
+//             Jovan Zelincevic (jovan.zelincevic@imgtec.com)
+
+#include "../dsp/dsp.h"
+#include "../dsp/lossless.h"
+#include "../dsp/lossless_common.h"
+
+#if defined(WEBP_USE_MIPS32)
+
+#include <assert.h>
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+static float FastSLog2Slow_MIPS32(uint32_t v) {
+  assert(v >= LOG_LOOKUP_IDX_MAX);
+  if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
+    uint32_t log_cnt, y, correction;
+    const int c24 = 24;
+    const float v_f = (float)v;
+    uint32_t temp;
+
+    // Xf = 256 = 2^8
+    // log_cnt is index of leading one in upper 24 bits
+    __asm__ volatile(
+      "clz      %[log_cnt], %[v]                      \n\t"
+      "addiu    %[y],       $zero,        1           \n\t"
+      "subu     %[log_cnt], %[c24],       %[log_cnt]  \n\t"
+      "sllv     %[y],       %[y],         %[log_cnt]  \n\t"
+      "srlv     %[temp],    %[v],         %[log_cnt]  \n\t"
+      : [log_cnt]"=&r"(log_cnt), [y]"=&r"(y),
+        [temp]"=r"(temp)
+      : [c24]"r"(c24), [v]"r"(v)
+    );
+
+    // vf = (2^log_cnt) * Xf; where y = 2^log_cnt and Xf < 256
+    // Xf = floor(Xf) * (1 + (v % y) / v)
+    // log2(Xf) = log2(floor(Xf)) + log2(1 + (v % y) / v)
+    // The correction factor: log(1 + d) ~ d; for very small d values, so
+    // log2(1 + (v % y) / v) ~ LOG_2_RECIPROCAL * (v % y)/v
+    // LOG_2_RECIPROCAL ~ 23/16
+
+    // (v % y) = (v % 2^log_cnt) = v & (2^log_cnt - 1)
+    correction = (23 * (v & (y - 1))) >> 4;
+    return v_f * (kLog2Table[temp] + log_cnt) + correction;
+  } else {
+    return (float)(LOG_2_RECIPROCAL * v * log((double)v));
+  }
+}
+
+static float FastLog2Slow_MIPS32(uint32_t v) {
+  assert(v >= LOG_LOOKUP_IDX_MAX);
+  if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
+    uint32_t log_cnt, y;
+    const int c24 = 24;
+    double log_2;
+    uint32_t temp;
+
+    __asm__ volatile(
+      "clz      %[log_cnt], %[v]                      \n\t"
+      "addiu    %[y],       $zero,        1           \n\t"
+      "subu     %[log_cnt], %[c24],       %[log_cnt]  \n\t"
+      "sllv     %[y],       %[y],         %[log_cnt]  \n\t"
+      "srlv     %[temp],    %[v],         %[log_cnt]  \n\t"
+      : [log_cnt]"=&r"(log_cnt), [y]"=&r"(y),
+        [temp]"=r"(temp)
+      : [c24]"r"(c24), [v]"r"(v)
+    );
+
+    log_2 = kLog2Table[temp] + log_cnt;
+    if (v >= APPROX_LOG_MAX) {
+      // Since the division is still expensive, add this correction factor only
+      // for large values of 'v'.
+
+      const uint32_t correction = (23 * (v & (y - 1))) >> 4;
+      log_2 += (double)correction / v;
+    }
+    return (float)log_2;
+  } else {
+    return (float)(LOG_2_RECIPROCAL * log((double)v));
+  }
+}
+
+// C version of this function:
+//   int i = 0;
+//   int64_t cost = 0;
+//   const uint32_t* pop = &population[4];
+//   const uint32_t* LoopEnd = &population[length];
+//   while (pop != LoopEnd) {
+//     ++i;
+//     cost += i * *pop;
+//     cost += i * *(pop + 1);
+//     pop += 2;
+//   }
+//   return (double)cost;
+static double ExtraCost_MIPS32(const uint32_t* const population, int length) {
+  int i, temp0, temp1;
+  const uint32_t* pop = &population[4];
+  const uint32_t* const LoopEnd = &population[length];
+
+  __asm__ volatile(
+    "mult   $zero,    $zero                  \n\t"
+    "xor    %[i],     %[i],       %[i]       \n\t"
+    "beq    %[pop],   %[LoopEnd], 2f         \n\t"
+  "1:                                        \n\t"
+    "lw     %[temp0], 0(%[pop])              \n\t"
+    "lw     %[temp1], 4(%[pop])              \n\t"
+    "addiu  %[i],     %[i],       1          \n\t"
+    "addiu  %[pop],   %[pop],     8          \n\t"
+    "madd   %[i],     %[temp0]               \n\t"
+    "madd   %[i],     %[temp1]               \n\t"
+    "bne    %[pop],   %[LoopEnd], 1b         \n\t"
+  "2:                                        \n\t"
+    "mfhi   %[temp0]                         \n\t"
+    "mflo   %[temp1]                         \n\t"
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+      [i]"=&r"(i), [pop]"+r"(pop)
+    : [LoopEnd]"r"(LoopEnd)
+    : "memory", "hi", "lo"
+  );
+
+  return (double)((int64_t)temp0 << 32 | temp1);
+}
+
+// C version of this function:
+//   int i = 0;
+//   int64_t cost = 0;
+//   const uint32_t* pX = &X[4];
+//   const uint32_t* pY = &Y[4];
+//   const uint32_t* LoopEnd = &X[length];
+//   while (pX != LoopEnd) {
+//     const uint32_t xy0 = *pX + *pY;
+//     const uint32_t xy1 = *(pX + 1) + *(pY + 1);
+//     ++i;
+//     cost += i * xy0;
+//     cost += i * xy1;
+//     pX += 2;
+//     pY += 2;
+//   }
+//   return (double)cost;
+static double ExtraCostCombined_MIPS32(const uint32_t* const X,
+                                       const uint32_t* const Y, int length) {
+  int i, temp0, temp1, temp2, temp3;
+  const uint32_t* pX = &X[4];
+  const uint32_t* pY = &Y[4];
+  const uint32_t* const LoopEnd = &X[length];
+
+  __asm__ volatile(
+    "mult   $zero,    $zero                  \n\t"
+    "xor    %[i],     %[i],       %[i]       \n\t"
+    "beq    %[pX],    %[LoopEnd], 2f         \n\t"
+  "1:                                        \n\t"
+    "lw     %[temp0], 0(%[pX])               \n\t"
+    "lw     %[temp1], 0(%[pY])               \n\t"
+    "lw     %[temp2], 4(%[pX])               \n\t"
+    "lw     %[temp3], 4(%[pY])               \n\t"
+    "addiu  %[i],     %[i],       1          \n\t"
+    "addu   %[temp0], %[temp0],   %[temp1]   \n\t"
+    "addu   %[temp2], %[temp2],   %[temp3]   \n\t"
+    "addiu  %[pX],    %[pX],      8          \n\t"
+    "addiu  %[pY],    %[pY],      8          \n\t"
+    "madd   %[i],     %[temp0]               \n\t"
+    "madd   %[i],     %[temp2]               \n\t"
+    "bne    %[pX],    %[LoopEnd], 1b         \n\t"
+  "2:                                        \n\t"
+    "mfhi   %[temp0]                         \n\t"
+    "mflo   %[temp1]                         \n\t"
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+      [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+      [i]"=&r"(i), [pX]"+r"(pX), [pY]"+r"(pY)
+    : [LoopEnd]"r"(LoopEnd)
+    : "memory", "hi", "lo"
+  );
+
+  return (double)((int64_t)temp0 << 32 | temp1);
+}
+
+#define HUFFMAN_COST_PASS                                 \
+  __asm__ volatile(                                       \
+    "sll   %[temp1],  %[temp0],    3           \n\t"      \
+    "addiu %[temp3],  %[streak],   -3          \n\t"      \
+    "addu  %[temp2],  %[pstreaks], %[temp1]    \n\t"      \
+    "blez  %[temp3],  1f                       \n\t"      \
+    "srl   %[temp1],  %[temp1],    1           \n\t"      \
+    "addu  %[temp3],  %[pcnts],    %[temp1]    \n\t"      \
+    "lw    %[temp0],  4(%[temp2])              \n\t"      \
+    "lw    %[temp1],  0(%[temp3])              \n\t"      \
+    "addu  %[temp0],  %[temp0],    %[streak]   \n\t"      \
+    "addiu %[temp1],  %[temp1],    1           \n\t"      \
+    "sw    %[temp0],  4(%[temp2])              \n\t"      \
+    "sw    %[temp1],  0(%[temp3])              \n\t"      \
+    "b     2f                                  \n\t"      \
+  "1:                                          \n\t"      \
+    "lw    %[temp0],  0(%[temp2])              \n\t"      \
+    "addu  %[temp0],  %[temp0],    %[streak]   \n\t"      \
+    "sw    %[temp0],  0(%[temp2])              \n\t"      \
+  "2:                                          \n\t"      \
+    : [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),           \
+      [temp3]"=&r"(temp3), [temp0]"+r"(temp0)             \
+    : [pstreaks]"r"(pstreaks), [pcnts]"r"(pcnts),         \
+      [streak]"r"(streak)                                 \
+    : "memory"                                            \
+  );
+
+// Returns the various RLE counts
+static WEBP_INLINE void GetEntropyUnrefinedHelper(
+    uint32_t val, int i, uint32_t* const val_prev, int* const i_prev,
+    VP8LBitEntropy* const bit_entropy, VP8LStreaks* const stats) {
+  int* const pstreaks = &stats->streaks[0][0];
+  int* const pcnts = &stats->counts[0];
+  int temp0, temp1, temp2, temp3;
+  const int streak = i - *i_prev;
+
+  // Gather info for the bit entropy.
+  if (*val_prev != 0) {
+    bit_entropy->sum += (*val_prev) * streak;
+    bit_entropy->nonzeros += streak;
+    bit_entropy->nonzero_code = *i_prev;
+    bit_entropy->entropy -= VP8LFastSLog2(*val_prev) * streak;
+    if (bit_entropy->max_val < *val_prev) {
+      bit_entropy->max_val = *val_prev;
+    }
+  }
+
+  // Gather info for the Huffman cost.
+  temp0 = (*val_prev != 0);
+  HUFFMAN_COST_PASS
+
+  *val_prev = val;
+  *i_prev = i;
+}
+
+static void GetEntropyUnrefined_MIPS32(const uint32_t X[], int length,
+                                       VP8LBitEntropy* const bit_entropy,
+                                       VP8LStreaks* const stats) {
+  int i;
+  int i_prev = 0;
+  uint32_t x_prev = X[0];
+
+  memset(stats, 0, sizeof(*stats));
+  VP8LBitEntropyInit(bit_entropy);
+
+  for (i = 1; i < length; ++i) {
+    const uint32_t x = X[i];
+    if (x != x_prev) {
+      GetEntropyUnrefinedHelper(x, i, &x_prev, &i_prev, bit_entropy, stats);
+    }
+  }
+  GetEntropyUnrefinedHelper(0, i, &x_prev, &i_prev, bit_entropy, stats);
+
+  bit_entropy->entropy += VP8LFastSLog2(bit_entropy->sum);
+}
+
+static void GetCombinedEntropyUnrefined_MIPS32(const uint32_t X[],
+                                               const uint32_t Y[],
+                                               int length,
+                                               VP8LBitEntropy* const entropy,
+                                               VP8LStreaks* const stats) {
+  int i = 1;
+  int i_prev = 0;
+  uint32_t xy_prev = X[0] + Y[0];
+
+  memset(stats, 0, sizeof(*stats));
+  VP8LBitEntropyInit(entropy);
+
+  for (i = 1; i < length; ++i) {
+    const uint32_t xy = X[i] + Y[i];
+    if (xy != xy_prev) {
+      GetEntropyUnrefinedHelper(xy, i, &xy_prev, &i_prev, entropy, stats);
+    }
+  }
+  GetEntropyUnrefinedHelper(0, i, &xy_prev, &i_prev, entropy, stats);
+
+  entropy->entropy += VP8LFastSLog2(entropy->sum);
+}
+
+#define ASM_START                                       \
+  __asm__ volatile(                                     \
+    ".set   push                            \n\t"       \
+    ".set   at                              \n\t"       \
+    ".set   macro                           \n\t"       \
+  "1:                                       \n\t"
+
+// P2 = P0 + P1
+// A..D - offsets
+// E - temp variable to tell macro
+//     if pointer should be incremented
+// literal_ and successive histograms could be unaligned
+// so we must use ulw and usw
+#define ADD_TO_OUT(A, B, C, D, E, P0, P1, P2)           \
+    "ulw    %[temp0], " #A "(%[" #P0 "])    \n\t"       \
+    "ulw    %[temp1], " #B "(%[" #P0 "])    \n\t"       \
+    "ulw    %[temp2], " #C "(%[" #P0 "])    \n\t"       \
+    "ulw    %[temp3], " #D "(%[" #P0 "])    \n\t"       \
+    "ulw    %[temp4], " #A "(%[" #P1 "])    \n\t"       \
+    "ulw    %[temp5], " #B "(%[" #P1 "])    \n\t"       \
+    "ulw    %[temp6], " #C "(%[" #P1 "])    \n\t"       \
+    "ulw    %[temp7], " #D "(%[" #P1 "])    \n\t"       \
+    "addu   %[temp4], %[temp4],   %[temp0]  \n\t"       \
+    "addu   %[temp5], %[temp5],   %[temp1]  \n\t"       \
+    "addu   %[temp6], %[temp6],   %[temp2]  \n\t"       \
+    "addu   %[temp7], %[temp7],   %[temp3]  \n\t"       \
+    "addiu  %[" #P0 "],  %[" #P0 "],  16    \n\t"       \
+  ".if " #E " == 1                          \n\t"       \
+    "addiu  %[" #P1 "],  %[" #P1 "],  16    \n\t"       \
+  ".endif                                   \n\t"       \
+    "usw    %[temp4], " #A "(%[" #P2 "])    \n\t"       \
+    "usw    %[temp5], " #B "(%[" #P2 "])    \n\t"       \
+    "usw    %[temp6], " #C "(%[" #P2 "])    \n\t"       \
+    "usw    %[temp7], " #D "(%[" #P2 "])    \n\t"       \
+    "addiu  %[" #P2 "], %[" #P2 "],   16    \n\t"       \
+    "bne    %[" #P0 "], %[LoopEnd], 1b      \n\t"       \
+    ".set   pop                             \n\t"       \
+
+#define ASM_END_COMMON_0                                \
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),         \
+      [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),         \
+      [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),         \
+      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7),         \
+      [pa]"+r"(pa), [pout]"+r"(pout)
+
+#define ASM_END_COMMON_1                                \
+    : [LoopEnd]"r"(LoopEnd)                             \
+    : "memory", "at"                                    \
+  );
+
+#define ASM_END_0                                       \
+    ASM_END_COMMON_0                                    \
+      , [pb]"+r"(pb)                                    \
+    ASM_END_COMMON_1
+
+#define ASM_END_1                                       \
+    ASM_END_COMMON_0                                    \
+    ASM_END_COMMON_1
+
+static void AddVector_MIPS32(const uint32_t* pa, const uint32_t* pb,
+                             uint32_t* pout, int size) {
+  uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+  const uint32_t end = ((size) / 4) * 4;
+  const uint32_t* const LoopEnd = pa + end;
+  int i;
+  ASM_START
+  ADD_TO_OUT(0, 4, 8, 12, 1, pa, pb, pout)
+  ASM_END_0
+  for (i = end; i < size; ++i) pout[i] = pa[i] + pb[i];
+}
+
+static void AddVectorEq_MIPS32(const uint32_t* pa, uint32_t* pout, int size) {
+  uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+  const uint32_t end = ((size) / 4) * 4;
+  const uint32_t* const LoopEnd = pa + end;
+  int i;
+  ASM_START
+  ADD_TO_OUT(0, 4, 8, 12, 0, pa, pout, pout)
+  ASM_END_1
+  for (i = end; i < size; ++i) pout[i] += pa[i];
+}
+
+#undef ASM_END_1
+#undef ASM_END_0
+#undef ASM_END_COMMON_1
+#undef ASM_END_COMMON_0
+#undef ADD_TO_OUT
+#undef ASM_START
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8LEncDspInitMIPS32(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitMIPS32(void) {
+  VP8LFastSLog2Slow = FastSLog2Slow_MIPS32;
+  VP8LFastLog2Slow = FastLog2Slow_MIPS32;
+  VP8LExtraCost = ExtraCost_MIPS32;
+  VP8LExtraCostCombined = ExtraCostCombined_MIPS32;
+  VP8LGetEntropyUnrefined = GetEntropyUnrefined_MIPS32;
+  VP8LGetCombinedEntropyUnrefined = GetCombinedEntropyUnrefined_MIPS32;
+  VP8LAddVector = AddVector_MIPS32;
+  VP8LAddVectorEq = AddVectorEq_MIPS32;
+}
+
+#else  // !WEBP_USE_MIPS32
+
+WEBP_DSP_INIT_STUB(VP8LEncDspInitMIPS32)
+
+#endif  // WEBP_USE_MIPS32
diff --git a/media/libwebp/dsp/lossless_enc_mips_dsp_r2.c b/media/libwebp/dsp/lossless_enc_mips_dsp_r2.c
new file mode 100644
index 0000000000..157dfc2e01
--- /dev/null
+++ b/media/libwebp/dsp/lossless_enc_mips_dsp_r2.c
@@ -0,0 +1,281 @@
+// Copyright 2015 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Image transform methods for lossless encoder.
+//
+// Author(s):  Djordje Pesut    (djordje.pesut@imgtec.com)
+//             Jovan Zelincevic (jovan.zelincevic@imgtec.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_MIPS_DSP_R2)
+
+#include "../dsp/lossless.h"
+
+static void SubtractGreenFromBlueAndRed_MIPSdspR2(uint32_t* argb_data,
+                                                  int num_pixels) {
+  uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+  uint32_t* const p_loop1_end = argb_data + (num_pixels & ~3);
+  uint32_t* const p_loop2_end = p_loop1_end + (num_pixels & 3);
+  __asm__ volatile (
+    ".set       push                                          \n\t"
+    ".set       noreorder                                     \n\t"
+    "beq        %[argb_data],    %[p_loop1_end],     3f       \n\t"
+    " nop                                                     \n\t"
+  "0:                                                         \n\t"
+    "lw         %[temp0],        0(%[argb_data])              \n\t"
+    "lw         %[temp1],        4(%[argb_data])              \n\t"
+    "lw         %[temp2],        8(%[argb_data])              \n\t"
+    "lw         %[temp3],        12(%[argb_data])             \n\t"
+    "ext        %[temp4],        %[temp0],           8,    8  \n\t"
+    "ext        %[temp5],        %[temp1],           8,    8  \n\t"
+    "ext        %[temp6],        %[temp2],           8,    8  \n\t"
+    "ext        %[temp7],        %[temp3],           8,    8  \n\t"
+    "addiu      %[argb_data],    %[argb_data],       16       \n\t"
+    "replv.ph   %[temp4],        %[temp4]                     \n\t"
+    "replv.ph   %[temp5],        %[temp5]                     \n\t"
+    "replv.ph   %[temp6],        %[temp6]                     \n\t"
+    "replv.ph   %[temp7],        %[temp7]                     \n\t"
+    "subu.qb    %[temp0],        %[temp0],           %[temp4] \n\t"
+    "subu.qb    %[temp1],        %[temp1],           %[temp5] \n\t"
+    "subu.qb    %[temp2],        %[temp2],           %[temp6] \n\t"
+    "subu.qb    %[temp3],        %[temp3],           %[temp7] \n\t"
+    "sw         %[temp0],        -16(%[argb_data])            \n\t"
+    "sw         %[temp1],        -12(%[argb_data])            \n\t"
+    "sw         %[temp2],        -8(%[argb_data])             \n\t"
+    "bne        %[argb_data],    %[p_loop1_end],     0b       \n\t"
+    " sw        %[temp3],        -4(%[argb_data])             \n\t"
+  "3:                                                         \n\t"
+    "beq        %[argb_data],    %[p_loop2_end],     2f       \n\t"
+    " nop                                                     \n\t"
+  "1:                                                         \n\t"
+    "lw         %[temp0],        0(%[argb_data])              \n\t"
+    "addiu      %[argb_data],    %[argb_data],       4        \n\t"
+    "ext        %[temp4],        %[temp0],           8,    8  \n\t"
+    "replv.ph   %[temp4],        %[temp4]                     \n\t"
+    "subu.qb    %[temp0],        %[temp0],           %[temp4] \n\t"
+    "bne        %[argb_data],    %[p_loop2_end],     1b       \n\t"
+    " sw        %[temp0],        -4(%[argb_data])             \n\t"
+  "2:                                                         \n\t"
+    ".set       pop                                           \n\t"
+    : [argb_data]"+&r"(argb_data), [temp0]"=&r"(temp0),
+      [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+      [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [temp6]"=&r"(temp6),
+      [temp7]"=&r"(temp7)
+    : [p_loop1_end]"r"(p_loop1_end), [p_loop2_end]"r"(p_loop2_end)
+    : "memory"
+  );
+}
+
+static WEBP_INLINE uint32_t ColorTransformDelta(int8_t color_pred,
+                                                int8_t color) {
+  return (uint32_t)((int)(color_pred) * color) >> 5;
+}
+
+static void TransformColor_MIPSdspR2(const VP8LMultipliers* const m,
+                                     uint32_t* data, int num_pixels) {
+  int temp0, temp1, temp2, temp3, temp4, temp5;
+  uint32_t argb, argb1, new_red, new_red1;
+  const uint32_t G_to_R = m->green_to_red_;
+  const uint32_t G_to_B = m->green_to_blue_;
+  const uint32_t R_to_B = m->red_to_blue_;
+  uint32_t* const p_loop_end = data + (num_pixels & ~1);
+  __asm__ volatile (
+    ".set            push                                    \n\t"
+    ".set            noreorder                               \n\t"
+    "beq             %[data],      %[p_loop_end],  1f        \n\t"
+    " nop                                                    \n\t"
+    "replv.ph        %[temp0],     %[G_to_R]                 \n\t"
+    "replv.ph        %[temp1],     %[G_to_B]                 \n\t"
+    "replv.ph        %[temp2],     %[R_to_B]                 \n\t"
+    "shll.ph         %[temp0],     %[temp0],       8         \n\t"
+    "shll.ph         %[temp1],     %[temp1],       8         \n\t"
+    "shll.ph         %[temp2],     %[temp2],       8         \n\t"
+    "shra.ph         %[temp0],     %[temp0],       8         \n\t"
+    "shra.ph         %[temp1],     %[temp1],       8         \n\t"
+    "shra.ph         %[temp2],     %[temp2],       8         \n\t"
+  "0:                                                        \n\t"
+    "lw              %[argb],      0(%[data])                \n\t"
+    "lw              %[argb1],     4(%[data])                \n\t"
+    "lhu             %[new_red],   2(%[data])                \n\t"
+    "lhu             %[new_red1],  6(%[data])                \n\t"
+    "precrq.qb.ph    %[temp3],     %[argb],        %[argb1]  \n\t"
+    "precr.qb.ph     %[temp4],     %[argb],        %[argb1]  \n\t"
+    "preceu.ph.qbra  %[temp3],     %[temp3]                  \n\t"
+    "preceu.ph.qbla  %[temp4],     %[temp4]                  \n\t"
+    "shll.ph         %[temp3],     %[temp3],       8         \n\t"
+    "shll.ph         %[temp4],     %[temp4],       8         \n\t"
+    "shra.ph         %[temp3],     %[temp3],       8         \n\t"
+    "shra.ph         %[temp4],     %[temp4],       8         \n\t"
+    "mul.ph          %[temp5],     %[temp3],       %[temp0]  \n\t"
+    "mul.ph          %[temp3],     %[temp3],       %[temp1]  \n\t"
+    "mul.ph          %[temp4],     %[temp4],       %[temp2]  \n\t"
+    "addiu           %[data],      %[data],        8         \n\t"
+    "ins             %[new_red1],  %[new_red],     16,   16  \n\t"
+    "ins             %[argb1],     %[argb],        16,   16  \n\t"
+    "shra.ph         %[temp5],     %[temp5],       5         \n\t"
+    "shra.ph         %[temp3],     %[temp3],       5         \n\t"
+    "shra.ph         %[temp4],     %[temp4],       5         \n\t"
+    "subu.ph         %[new_red1],  %[new_red1],    %[temp5]  \n\t"
+    "subu.ph         %[argb1],     %[argb1],       %[temp3]  \n\t"
+    "preceu.ph.qbra  %[temp5],     %[new_red1]               \n\t"
+    "subu.ph         %[argb1],     %[argb1],       %[temp4]  \n\t"
+    "preceu.ph.qbra  %[temp3],     %[argb1]                  \n\t"
+    "sb              %[temp5],     -2(%[data])               \n\t"
+    "sb              %[temp3],     -4(%[data])               \n\t"
+    "sra             %[temp5],     %[temp5],       16        \n\t"
+    "sra             %[temp3],     %[temp3],       16        \n\t"
+    "sb              %[temp5],     -6(%[data])               \n\t"
+    "bne             %[data],      %[p_loop_end],  0b        \n\t"
+    " sb             %[temp3],     -8(%[data])               \n\t"
+  "1:                                                        \n\t"
+    ".set            pop                                     \n\t"
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+      [new_red1]"=&r"(new_red1), [new_red]"=&r"(new_red),
+      [argb]"=&r"(argb), [argb1]"=&r"(argb1), [data]"+&r"(data)
+    : [G_to_R]"r"(G_to_R), [R_to_B]"r"(R_to_B),
+      [G_to_B]"r"(G_to_B), [p_loop_end]"r"(p_loop_end)
+    : "memory", "hi", "lo"
+  );
+
+  if (num_pixels & 1) {
+    const uint32_t argb_ = data[0];
+    const uint32_t green = argb_ >> 8;
+    const uint32_t red = argb_ >> 16;
+    uint32_t new_blue = argb_;
+    new_red = red;
+    new_red -= ColorTransformDelta(m->green_to_red_, green);
+    new_red &= 0xff;
+    new_blue -= ColorTransformDelta(m->green_to_blue_, green);
+    new_blue -= ColorTransformDelta(m->red_to_blue_, red);
+    new_blue &= 0xff;
+    data[0] = (argb_ & 0xff00ff00u) | (new_red << 16) | (new_blue);
+  }
+}
+
+static WEBP_INLINE uint8_t TransformColorBlue(uint8_t green_to_blue,
+                                              uint8_t red_to_blue,
+                                              uint32_t argb) {
+  const uint32_t green = argb >> 8;
+  const uint32_t red = argb >> 16;
+  uint8_t new_blue = argb;
+  new_blue -= ColorTransformDelta(green_to_blue, green);
+  new_blue -= ColorTransformDelta(red_to_blue, red);
+  return (new_blue & 0xff);
+}
+
+static void CollectColorBlueTransforms_MIPSdspR2(const uint32_t* argb,
+                                                 int stride,
+                                                 int tile_width,
+                                                 int tile_height,
+                                                 int green_to_blue,
+                                                 int red_to_blue,
+                                                 int histo[]) {
+  const int rtb = (red_to_blue << 16) | (red_to_blue & 0xffff);
+  const int gtb = (green_to_blue << 16) | (green_to_blue & 0xffff);
+  const uint32_t mask = 0xff00ffu;
+  while (tile_height-- > 0) {
+    int x;
+    const uint32_t* p_argb = argb;
+    argb += stride;
+    for (x = 0; x < (tile_width >> 1); ++x) {
+      int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
+      __asm__ volatile (
+        "lw           %[temp0],  0(%[p_argb])             \n\t"
+        "lw           %[temp1],  4(%[p_argb])             \n\t"
+        "precr.qb.ph  %[temp2],  %[temp0],  %[temp1]      \n\t"
+        "ins          %[temp1],  %[temp0],  16,    16     \n\t"
+        "shra.ph      %[temp2],  %[temp2],  8             \n\t"
+        "shra.ph      %[temp3],  %[temp1],  8             \n\t"
+        "mul.ph       %[temp5],  %[temp2],  %[rtb]        \n\t"
+        "mul.ph       %[temp6],  %[temp3],  %[gtb]        \n\t"
+        "and          %[temp4],  %[temp1],  %[mask]       \n\t"
+        "addiu        %[p_argb], %[p_argb], 8             \n\t"
+        "shra.ph      %[temp5],  %[temp5],  5             \n\t"
+        "shra.ph      %[temp6],  %[temp6],  5             \n\t"
+        "subu.qb      %[temp2],  %[temp4],  %[temp5]      \n\t"
+        "subu.qb      %[temp2],  %[temp2],  %[temp6]      \n\t"
+        : [p_argb]"+&r"(p_argb), [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+          [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [temp4]"=&r"(temp4),
+          [temp5]"=&r"(temp5), [temp6]"=&r"(temp6)
+        : [rtb]"r"(rtb), [gtb]"r"(gtb), [mask]"r"(mask)
+        : "memory", "hi", "lo"
+      );
+      ++histo[(uint8_t)(temp2 >> 16)];
+      ++histo[(uint8_t)temp2];
+    }
+    if (tile_width & 1) {
+      ++histo[TransformColorBlue(green_to_blue, red_to_blue, *p_argb)];
+    }
+  }
+}
+
+static WEBP_INLINE uint8_t TransformColorRed(uint8_t green_to_red,
+                                             uint32_t argb) {
+  const uint32_t green = argb >> 8;
+  uint32_t new_red = argb >> 16;
+  new_red -= ColorTransformDelta(green_to_red, green);
+  return (new_red & 0xff);
+}
+
+static void CollectColorRedTransforms_MIPSdspR2(const uint32_t* argb,
+                                                int stride,
+                                                int tile_width,
+                                                int tile_height,
+                                                int green_to_red,
+                                                int histo[]) {
+  const int gtr = (green_to_red << 16) | (green_to_red & 0xffff);
+  while (tile_height-- > 0) {
+    int x;
+    const uint32_t* p_argb = argb;
+    argb += stride;
+    for (x = 0; x < (tile_width >> 1); ++x) {
+      int temp0, temp1, temp2, temp3, temp4;
+      __asm__ volatile (
+        "lw           %[temp0],  0(%[p_argb])             \n\t"
+        "lw           %[temp1],  4(%[p_argb])             \n\t"
+        "precrq.ph.w  %[temp4],  %[temp0],  %[temp1]      \n\t"
+        "ins          %[temp1],  %[temp0],  16,    16     \n\t"
+        "shra.ph      %[temp3],  %[temp1],  8             \n\t"
+        "mul.ph       %[temp2],  %[temp3],  %[gtr]        \n\t"
+        "addiu        %[p_argb], %[p_argb], 8             \n\t"
+        "shra.ph      %[temp2],  %[temp2],  5             \n\t"
+        "subu.qb      %[temp2],  %[temp4],  %[temp2]      \n\t"
+        : [p_argb]"+&r"(p_argb), [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+          [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [temp4]"=&r"(temp4)
+        : [gtr]"r"(gtr)
+        : "memory", "hi", "lo"
+      );
+      ++histo[(uint8_t)(temp2 >> 16)];
+      ++histo[(uint8_t)temp2];
+    }
+    if (tile_width & 1) {
+      ++histo[TransformColorRed(green_to_red, *p_argb)];
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8LEncDspInitMIPSdspR2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitMIPSdspR2(void) {
+  VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_MIPSdspR2;
+  VP8LTransformColor = TransformColor_MIPSdspR2;
+  VP8LCollectColorBlueTransforms = CollectColorBlueTransforms_MIPSdspR2;
+  VP8LCollectColorRedTransforms = CollectColorRedTransforms_MIPSdspR2;
+}
+
+#else  // !WEBP_USE_MIPS_DSP_R2
+
+WEBP_DSP_INIT_STUB(VP8LEncDspInitMIPSdspR2)
+
+#endif  // WEBP_USE_MIPS_DSP_R2
diff --git a/media/libwebp/dsp/lossless_enc_msa.c b/media/libwebp/dsp/lossless_enc_msa.c
new file mode 100644
index 0000000000..f8a5f8c56f
--- /dev/null
+++ b/media/libwebp/dsp/lossless_enc_msa.c
@@ -0,0 +1,148 @@
+// Copyright 2016 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MSA variant of Image transform methods for lossless encoder.
+//
+// Authors: Prashant Patil (Prashant.Patil@imgtec.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_MSA)
+
+#include "../dsp/lossless.h"
+#include "../dsp/msa_macro.h"
+
+#define TRANSFORM_COLOR_8(src0, src1, dst0, dst1, c0, c1, mask0, mask1) do {  \
+  v8i16 g0, g1, t0, t1, t2, t3;                                               \
+  v4i32 t4, t5;                                                               \
+  VSHF_B2_SH(src0, src0, src1, src1, mask0, mask0, g0, g1);                   \
+  DOTP_SB2_SH(g0, g1, c0, c0, t0, t1);                                        \
+  SRAI_H2_SH(t0, t1, 5);                                                      \
+  t0 = __msa_subv_h((v8i16)src0, t0);                                         \
+  t1 = __msa_subv_h((v8i16)src1, t1);                                         \
+  t4 = __msa_srli_w((v4i32)src0, 16);                                         \
+  t5 = __msa_srli_w((v4i32)src1, 16);                                         \
+  DOTP_SB2_SH(t4, t5, c1, c1, t2, t3);                                        \
+  SRAI_H2_SH(t2, t3, 5);                                                      \
+  SUB2(t0, t2, t1, t3, t0, t1);                                               \
+  VSHF_B2_UB(src0, t0, src1, t1, mask1, mask1, dst0, dst1);                   \
+} while (0)
+
+#define TRANSFORM_COLOR_4(src, dst, c0, c1, mask0, mask1) do {  \
+  const v16i8 g0 = VSHF_SB(src, src, mask0);                    \
+  v8i16 t0 = __msa_dotp_s_h(c0, g0);                            \
+  v8i16 t1;                                                     \
+  v4i32 t2;                                                     \
+  t0 = SRAI_H(t0, 5);                                           \
+  t0 = __msa_subv_h((v8i16)src, t0);                            \
+  t2 = __msa_srli_w((v4i32)src, 16);                            \
+  t1 = __msa_dotp_s_h(c1, (v16i8)t2);                           \
+  t1 = SRAI_H(t1, 5);                                           \
+  t0 = t0 - t1;                                                 \
+  dst = VSHF_UB(src, t0, mask1);                                \
+} while (0)
+
+static void TransformColor_MSA(const VP8LMultipliers* const m, uint32_t* data,
+                               int num_pixels) {
+  v16u8 src0, dst0;
+  const v16i8 g2br = (v16i8)__msa_fill_w(m->green_to_blue_ |
+                                         (m->green_to_red_ << 16));
+  const v16i8 r2b = (v16i8)__msa_fill_w(m->red_to_blue_);
+  const v16u8 mask0 = { 1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255,
+                        13, 255, 13, 255 };
+  const v16u8 mask1 = { 16, 1, 18, 3, 20, 5, 22, 7, 24, 9, 26, 11,
+                        28, 13, 30, 15 };
+
+  while (num_pixels >= 8) {
+    v16u8 src1, dst1;
+    LD_UB2(data, 4, src0, src1);
+    TRANSFORM_COLOR_8(src0, src1, dst0, dst1, g2br, r2b, mask0, mask1);
+    ST_UB2(dst0, dst1, data, 4);
+    data += 8;
+    num_pixels -= 8;
+  }
+  if (num_pixels > 0) {
+    if (num_pixels >= 4) {
+      src0 = LD_UB(data);
+      TRANSFORM_COLOR_4(src0, dst0, g2br, r2b, mask0, mask1);
+      ST_UB(dst0, data);
+      data += 4;
+      num_pixels -= 4;
+    }
+    if (num_pixels > 0) {
+      src0 = LD_UB(data);
+      TRANSFORM_COLOR_4(src0, dst0, g2br, r2b, mask0, mask1);
+      if (num_pixels == 3) {
+        const uint64_t pix_d = __msa_copy_s_d((v2i64)dst0, 0);
+        const uint32_t pix_w = __msa_copy_s_w((v4i32)dst0, 2);
+        SD(pix_d, data + 0);
+        SW(pix_w, data + 2);
+      } else if (num_pixels == 2) {
+        const uint64_t pix_d = __msa_copy_s_d((v2i64)dst0, 0);
+        SD(pix_d, data);
+      } else {
+        const uint32_t pix_w = __msa_copy_s_w((v4i32)dst0, 0);
+        SW(pix_w, data);
+      }
+    }
+  }
+}
+
+static void SubtractGreenFromBlueAndRed_MSA(uint32_t* argb_data,
+                                            int num_pixels) {
+  int i;
+  uint8_t* ptemp_data = (uint8_t*)argb_data;
+  v16u8 src0, dst0, tmp0;
+  const v16u8 mask = { 1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255,
+                       13, 255, 13, 255 };
+
+  while (num_pixels >= 8) {
+    v16u8 src1, dst1, tmp1;
+    LD_UB2(ptemp_data, 16, src0, src1);
+    VSHF_B2_UB(src0, src1, src1, src0, mask, mask, tmp0, tmp1);
+    SUB2(src0, tmp0, src1, tmp1, dst0, dst1);
+    ST_UB2(dst0, dst1, ptemp_data, 16);
+    ptemp_data += 8 * 4;
+    num_pixels -= 8;
+  }
+  if (num_pixels > 0) {
+    if (num_pixels >= 4) {
+      src0 = LD_UB(ptemp_data);
+      tmp0 = VSHF_UB(src0, src0, mask);
+      dst0 = src0 - tmp0;
+      ST_UB(dst0, ptemp_data);
+      ptemp_data += 4 * 4;
+      num_pixels -= 4;
+    }
+    for (i = 0; i < num_pixels; i++) {
+      const uint8_t b = ptemp_data[0];
+      const uint8_t g = ptemp_data[1];
+      const uint8_t r = ptemp_data[2];
+      ptemp_data[0] = (b - g) & 0xff;
+      ptemp_data[2] = (r - g) & 0xff;
+      ptemp_data += 4;
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8LEncDspInitMSA(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitMSA(void) {
+  VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_MSA;
+  VP8LTransformColor = TransformColor_MSA;
+}
+
+#else  // !WEBP_USE_MSA
+
+WEBP_DSP_INIT_STUB(VP8LEncDspInitMSA)
+
+#endif  // WEBP_USE_MSA
diff --git a/media/libwebp/dsp/lossless_enc_neon.c b/media/libwebp/dsp/lossless_enc_neon.c
new file mode 100644
index 0000000000..89d5439e49
--- /dev/null
+++ b/media/libwebp/dsp/lossless_enc_neon.c
@@ -0,0 +1,144 @@
+// Copyright 2015 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// NEON variant of methods for lossless encoder
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_NEON)
+
+#include <arm_neon.h>
+
+#include "../dsp/lossless.h"
+#include "../dsp/neon.h"
+
+//------------------------------------------------------------------------------
+// Subtract-Green Transform
+
+// vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use
+// non-standard versions there.
+#if defined(__APPLE__) && defined(__aarch64__) && \
+    defined(__apple_build_version__) && (__apple_build_version__< 6020037)
+#define USE_VTBLQ
+#endif
+
+#ifdef USE_VTBLQ
+// 255 = byte will be zeroed
+static const uint8_t kGreenShuffle[16] = {
+  1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255
+};
+
+static WEBP_INLINE uint8x16_t DoGreenShuffle_NEON(const uint8x16_t argb,
+                                                  const uint8x16_t shuffle) {
+  return vcombine_u8(vtbl1q_u8(argb, vget_low_u8(shuffle)),
+                     vtbl1q_u8(argb, vget_high_u8(shuffle)));
+}
+#else  // !USE_VTBLQ
+// 255 = byte will be zeroed
+static const uint8_t kGreenShuffle[8] = { 1, 255, 1, 255, 5, 255, 5, 255  };
+
+static WEBP_INLINE uint8x16_t DoGreenShuffle_NEON(const uint8x16_t argb,
+                                                  const uint8x8_t shuffle) {
+  return vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle),
+                     vtbl1_u8(vget_high_u8(argb), shuffle));
+}
+#endif  // USE_VTBLQ
+
+static void SubtractGreenFromBlueAndRed_NEON(uint32_t* argb_data,
+                                             int num_pixels) {
+  const uint32_t* const end = argb_data + (num_pixels & ~3);
+#ifdef USE_VTBLQ
+  const uint8x16_t shuffle = vld1q_u8(kGreenShuffle);
+#else
+  const uint8x8_t shuffle = vld1_u8(kGreenShuffle);
+#endif
+  for (; argb_data < end; argb_data += 4) {
+    const uint8x16_t argb = vld1q_u8((uint8_t*)argb_data);
+    const uint8x16_t greens = DoGreenShuffle_NEON(argb, shuffle);
+    vst1q_u8((uint8_t*)argb_data, vsubq_u8(argb, greens));
+  }
+  // fallthrough and finish off with plain-C
+  VP8LSubtractGreenFromBlueAndRed_C(argb_data, num_pixels & 3);
+}
+
+//------------------------------------------------------------------------------
+// Color Transform
+
+static void TransformColor_NEON(const VP8LMultipliers* const m,
+                                uint32_t* argb_data, int num_pixels) {
+  // sign-extended multiplying constants, pre-shifted by 6.
+#define CST(X)  (((int16_t)(m->X << 8)) >> 6)
+  const int16_t rb[8] = {
+    CST(green_to_blue_), CST(green_to_red_),
+    CST(green_to_blue_), CST(green_to_red_),
+    CST(green_to_blue_), CST(green_to_red_),
+    CST(green_to_blue_), CST(green_to_red_)
+  };
+  const int16x8_t mults_rb = vld1q_s16(rb);
+  const int16_t b2[8] = {
+    0, CST(red_to_blue_), 0, CST(red_to_blue_),
+    0, CST(red_to_blue_), 0, CST(red_to_blue_),
+  };
+  const int16x8_t mults_b2 = vld1q_s16(b2);
+#undef CST
+#ifdef USE_VTBLQ
+  static const uint8_t kg0g0[16] = {
+    255, 1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13
+  };
+  const uint8x16_t shuffle = vld1q_u8(kg0g0);
+#else
+  static const uint8_t k0g0g[8] = { 255, 1, 255, 1, 255, 5, 255, 5 };
+  const uint8x8_t shuffle = vld1_u8(k0g0g);
+#endif
+  const uint32x4_t mask_rb = vdupq_n_u32(0x00ff00ffu);  // red-blue masks
+  int i;
+  for (i = 0; i + 4 <= num_pixels; i += 4) {
+    const uint8x16_t in = vld1q_u8((uint8_t*)(argb_data + i));
+    // 0 g 0 g
+    const uint8x16_t greens = DoGreenShuffle_NEON(in, shuffle);
+    // x dr  x db1
+    const int16x8_t A = vqdmulhq_s16(vreinterpretq_s16_u8(greens), mults_rb);
+    // r 0   b   0
+    const int16x8_t B = vshlq_n_s16(vreinterpretq_s16_u8(in), 8);
+    // x db2 0   0
+    const int16x8_t C = vqdmulhq_s16(B, mults_b2);
+    // 0 0   x db2
+    const uint32x4_t D = vshrq_n_u32(vreinterpretq_u32_s16(C), 16);
+    // x dr  x  db
+    const int8x16_t E = vaddq_s8(vreinterpretq_s8_u32(D),
+                                 vreinterpretq_s8_s16(A));
+    // 0 dr  0  db
+    const uint32x4_t F = vandq_u32(vreinterpretq_u32_s8(E), mask_rb);
+    const int8x16_t out = vsubq_s8(vreinterpretq_s8_u8(in),
+                                   vreinterpretq_s8_u32(F));
+    vst1q_s8((int8_t*)(argb_data + i), out);
+  }
+  // fallthrough and finish off with plain-C
+  VP8LTransformColor_C(m, argb_data + i, num_pixels - i);
+}
+
+#undef USE_VTBLQ
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8LEncDspInitNEON(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitNEON(void) {
+  VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_NEON;
+  VP8LTransformColor = TransformColor_NEON;
+}
+
+#else  // !WEBP_USE_NEON
+
+WEBP_DSP_INIT_STUB(VP8LEncDspInitNEON)
+
+#endif  // WEBP_USE_NEON
diff --git a/media/libwebp/dsp/lossless_enc_sse2.c b/media/libwebp/dsp/lossless_enc_sse2.c
new file mode 100644
index 0000000000..665ceb669c
--- /dev/null
+++ b/media/libwebp/dsp/lossless_enc_sse2.c
@@ -0,0 +1,669 @@
+// Copyright 2015 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// SSE2 variant of methods for lossless encoder
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_SSE2)
+#include <assert.h>
+#include <emmintrin.h>
+#include "../dsp/lossless.h"
+#include "../dsp/common_sse2.h"
+#include "../dsp/lossless_common.h"
+
+// For sign-extended multiplying constants, pre-shifted by 5:
+#define CST_5b(X)  (((int16_t)((uint16_t)(X) << 8)) >> 5)
+
+//------------------------------------------------------------------------------
+// Subtract-Green Transform
+
+static void SubtractGreenFromBlueAndRed_SSE2(uint32_t* argb_data,
+                                             int num_pixels) {
+  int i;
+  for (i = 0; i + 4 <= num_pixels; i += 4) {
+    const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]); // argb
+    const __m128i A = _mm_srli_epi16(in, 8);     // 0 a 0 g
+    const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
+    const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0));  // 0g0g
+    const __m128i out = _mm_sub_epi8(in, C);
+    _mm_storeu_si128((__m128i*)&argb_data[i], out);
+  }
+  // fallthrough and finish off with plain-C
+  if (i != num_pixels) {
+    VP8LSubtractGreenFromBlueAndRed_C(argb_data + i, num_pixels - i);
+  }
+}
+
+//------------------------------------------------------------------------------
+// Color Transform
+
+#define MK_CST_16(HI, LO) \
+  _mm_set1_epi32((int)(((uint32_t)(HI) << 16) | ((LO) & 0xffff)))
+
+static void TransformColor_SSE2(const VP8LMultipliers* const m,
+                                uint32_t* argb_data, int num_pixels) {
+  const __m128i mults_rb = MK_CST_16(CST_5b(m->green_to_red_),
+                                     CST_5b(m->green_to_blue_));
+  const __m128i mults_b2 = MK_CST_16(CST_5b(m->red_to_blue_), 0);
+  const __m128i mask_ag = _mm_set1_epi32(0xff00ff00);  // alpha-green masks
+  const __m128i mask_rb = _mm_set1_epi32(0x00ff00ff);  // red-blue masks
+  int i;
+  for (i = 0; i + 4 <= num_pixels; i += 4) {
+    const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]); // argb
+    const __m128i A = _mm_and_si128(in, mask_ag);     // a   0   g   0
+    const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
+    const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0));  // g0g0
+    const __m128i D = _mm_mulhi_epi16(C, mults_rb);    // x dr  x db1
+    const __m128i E = _mm_slli_epi16(in, 8);           // r 0   b   0
+    const __m128i F = _mm_mulhi_epi16(E, mults_b2);    // x db2 0   0
+    const __m128i G = _mm_srli_epi32(F, 16);           // 0 0   x db2
+    const __m128i H = _mm_add_epi8(G, D);              // x dr  x  db
+    const __m128i I = _mm_and_si128(H, mask_rb);       // 0 dr  0  db
+    const __m128i out = _mm_sub_epi8(in, I);
+    _mm_storeu_si128((__m128i*)&argb_data[i], out);
+  }
+  // fallthrough and finish off with plain-C
+  if (i != num_pixels) {
+    VP8LTransformColor_C(m, argb_data + i, num_pixels - i);
+  }
+}
+
+//------------------------------------------------------------------------------
+#define SPAN 8
+static void CollectColorBlueTransforms_SSE2(const uint32_t* argb, int stride,
+                                            int tile_width, int tile_height,
+                                            int green_to_blue, int red_to_blue,
+                                            int histo[]) {
+  const __m128i mults_r = MK_CST_16(CST_5b(red_to_blue), 0);
+  const __m128i mults_g = MK_CST_16(0, CST_5b(green_to_blue));
+  const __m128i mask_g = _mm_set1_epi32(0x00ff00);  // green mask
+  const __m128i mask_b = _mm_set1_epi32(0x0000ff);  // blue mask
+  int y;
+  for (y = 0; y < tile_height; ++y) {
+    const uint32_t* const src = argb + y * stride;
+    int i, x;
+    for (x = 0; x + SPAN <= tile_width; x += SPAN) {
+      uint16_t values[SPAN];
+      const __m128i in0 = _mm_loadu_si128((__m128i*)&src[x +        0]);
+      const __m128i in1 = _mm_loadu_si128((__m128i*)&src[x + SPAN / 2]);
+      const __m128i A0 = _mm_slli_epi16(in0, 8);        // r 0  | b 0
+      const __m128i A1 = _mm_slli_epi16(in1, 8);
+      const __m128i B0 = _mm_and_si128(in0, mask_g);    // 0 0  | g 0
+      const __m128i B1 = _mm_and_si128(in1, mask_g);
+      const __m128i C0 = _mm_mulhi_epi16(A0, mults_r);  // x db | 0 0
+      const __m128i C1 = _mm_mulhi_epi16(A1, mults_r);
+      const __m128i D0 = _mm_mulhi_epi16(B0, mults_g);  // 0 0  | x db
+      const __m128i D1 = _mm_mulhi_epi16(B1, mults_g);
+      const __m128i E0 = _mm_sub_epi8(in0, D0);         // x x  | x b'
+      const __m128i E1 = _mm_sub_epi8(in1, D1);
+      const __m128i F0 = _mm_srli_epi32(C0, 16);        // 0 0  | x db
+      const __m128i F1 = _mm_srli_epi32(C1, 16);
+      const __m128i G0 = _mm_sub_epi8(E0, F0);          // 0 0  | x b'
+      const __m128i G1 = _mm_sub_epi8(E1, F1);
+      const __m128i H0 = _mm_and_si128(G0, mask_b);     // 0 0  | 0 b
+      const __m128i H1 = _mm_and_si128(G1, mask_b);
+      const __m128i I = _mm_packs_epi32(H0, H1);        // 0 b' | 0 b'
+      _mm_storeu_si128((__m128i*)values, I);
+      for (i = 0; i < SPAN; ++i) ++histo[values[i]];
+    }
+  }
+  {
+    const int left_over = tile_width & (SPAN - 1);
+    if (left_over > 0) {
+      VP8LCollectColorBlueTransforms_C(argb + tile_width - left_over, stride,
+                                       left_over, tile_height,
+                                       green_to_blue, red_to_blue, histo);
+    }
+  }
+}
+
+static void CollectColorRedTransforms_SSE2(const uint32_t* argb, int stride,
+                                           int tile_width, int tile_height,
+                                           int green_to_red, int histo[]) {
+  const __m128i mults_g = MK_CST_16(0, CST_5b(green_to_red));
+  const __m128i mask_g = _mm_set1_epi32(0x00ff00);  // green mask
+  const __m128i mask = _mm_set1_epi32(0xff);
+
+  int y;
+  for (y = 0; y < tile_height; ++y) {
+    const uint32_t* const src = argb + y * stride;
+    int i, x;
+    for (x = 0; x + SPAN <= tile_width; x += SPAN) {
+      uint16_t values[SPAN];
+      const __m128i in0 = _mm_loadu_si128((__m128i*)&src[x +        0]);
+      const __m128i in1 = _mm_loadu_si128((__m128i*)&src[x + SPAN / 2]);
+      const __m128i A0 = _mm_and_si128(in0, mask_g);    // 0 0  | g 0
+      const __m128i A1 = _mm_and_si128(in1, mask_g);
+      const __m128i B0 = _mm_srli_epi32(in0, 16);       // 0 0  | x r
+      const __m128i B1 = _mm_srli_epi32(in1, 16);
+      const __m128i C0 = _mm_mulhi_epi16(A0, mults_g);  // 0 0  | x dr
+      const __m128i C1 = _mm_mulhi_epi16(A1, mults_g);
+      const __m128i E0 = _mm_sub_epi8(B0, C0);          // x x  | x r'
+      const __m128i E1 = _mm_sub_epi8(B1, C1);
+      const __m128i F0 = _mm_and_si128(E0, mask);       // 0 0  | 0 r'
+      const __m128i F1 = _mm_and_si128(E1, mask);
+      const __m128i I = _mm_packs_epi32(F0, F1);
+      _mm_storeu_si128((__m128i*)values, I);
+      for (i = 0; i < SPAN; ++i) ++histo[values[i]];
+    }
+  }
+  {
+    const int left_over = tile_width & (SPAN - 1);
+    if (left_over > 0) {
+      VP8LCollectColorRedTransforms_C(argb + tile_width - left_over, stride,
+                                      left_over, tile_height,
+                                      green_to_red, histo);
+    }
+  }
+}
+#undef SPAN
+#undef MK_CST_16
+
+//------------------------------------------------------------------------------
+
+// Note we are adding uint32_t's as *signed* int32's (using _mm_add_epi32). But
+// that's ok since the histogram values are less than 1<<28 (max picture size).
+#define LINE_SIZE 16    // 8 or 16
+static void AddVector_SSE2(const uint32_t* a, const uint32_t* b, uint32_t* out,
+                           int size) {
+  int i;
+  for (i = 0; i + LINE_SIZE <= size; i += LINE_SIZE) {
+    const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i +  0]);
+    const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i +  4]);
+#if (LINE_SIZE == 16)
+    const __m128i a2 = _mm_loadu_si128((const __m128i*)&a[i +  8]);
+    const __m128i a3 = _mm_loadu_si128((const __m128i*)&a[i + 12]);
+#endif
+    const __m128i b0 = _mm_loadu_si128((const __m128i*)&b[i +  0]);
+    const __m128i b1 = _mm_loadu_si128((const __m128i*)&b[i +  4]);
+#if (LINE_SIZE == 16)
+    const __m128i b2 = _mm_loadu_si128((const __m128i*)&b[i +  8]);
+    const __m128i b3 = _mm_loadu_si128((const __m128i*)&b[i + 12]);
+#endif
+    _mm_storeu_si128((__m128i*)&out[i +  0], _mm_add_epi32(a0, b0));
+    _mm_storeu_si128((__m128i*)&out[i +  4], _mm_add_epi32(a1, b1));
+#if (LINE_SIZE == 16)
+    _mm_storeu_si128((__m128i*)&out[i +  8], _mm_add_epi32(a2, b2));
+    _mm_storeu_si128((__m128i*)&out[i + 12], _mm_add_epi32(a3, b3));
+#endif
+  }
+  for (; i < size; ++i) {
+    out[i] = a[i] + b[i];
+  }
+}
+
+static void AddVectorEq_SSE2(const uint32_t* a, uint32_t* out, int size) {
+  int i;
+  for (i = 0; i + LINE_SIZE <= size; i += LINE_SIZE) {
+    const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i +  0]);
+    const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i +  4]);
+#if (LINE_SIZE == 16)
+    const __m128i a2 = _mm_loadu_si128((const __m128i*)&a[i +  8]);
+    const __m128i a3 = _mm_loadu_si128((const __m128i*)&a[i + 12]);
+#endif
+    const __m128i b0 = _mm_loadu_si128((const __m128i*)&out[i +  0]);
+    const __m128i b1 = _mm_loadu_si128((const __m128i*)&out[i +  4]);
+#if (LINE_SIZE == 16)
+    const __m128i b2 = _mm_loadu_si128((const __m128i*)&out[i +  8]);
+    const __m128i b3 = _mm_loadu_si128((const __m128i*)&out[i + 12]);
+#endif
+    _mm_storeu_si128((__m128i*)&out[i +  0], _mm_add_epi32(a0, b0));
+    _mm_storeu_si128((__m128i*)&out[i +  4], _mm_add_epi32(a1, b1));
+#if (LINE_SIZE == 16)
+    _mm_storeu_si128((__m128i*)&out[i +  8], _mm_add_epi32(a2, b2));
+    _mm_storeu_si128((__m128i*)&out[i + 12], _mm_add_epi32(a3, b3));
+#endif
+  }
+  for (; i < size; ++i) {
+    out[i] += a[i];
+  }
+}
+#undef LINE_SIZE
+
+//------------------------------------------------------------------------------
+// Entropy
+
+// TODO(https://crbug.com/webp/499): this function produces different results
+// from the C code due to use of double/float resulting in output differences
+// when compared to -noasm.
+#if !(defined(WEBP_HAVE_SLOW_CLZ_CTZ) || defined(__i386__) || defined(_M_IX86))
+
+static float CombinedShannonEntropy_SSE2(const int X[256], const int Y[256]) {
+  int i;
+  double retval = 0.;
+  int sumX = 0, sumXY = 0;
+  const __m128i zero = _mm_setzero_si128();
+
+  for (i = 0; i < 256; i += 16) {
+    const __m128i x0 = _mm_loadu_si128((const __m128i*)(X + i +  0));
+    const __m128i y0 = _mm_loadu_si128((const __m128i*)(Y + i +  0));
+    const __m128i x1 = _mm_loadu_si128((const __m128i*)(X + i +  4));
+    const __m128i y1 = _mm_loadu_si128((const __m128i*)(Y + i +  4));
+    const __m128i x2 = _mm_loadu_si128((const __m128i*)(X + i +  8));
+    const __m128i y2 = _mm_loadu_si128((const __m128i*)(Y + i +  8));
+    const __m128i x3 = _mm_loadu_si128((const __m128i*)(X + i + 12));
+    const __m128i y3 = _mm_loadu_si128((const __m128i*)(Y + i + 12));
+    const __m128i x4 = _mm_packs_epi16(_mm_packs_epi32(x0, x1),
+                                       _mm_packs_epi32(x2, x3));
+    const __m128i y4 = _mm_packs_epi16(_mm_packs_epi32(y0, y1),
+                                       _mm_packs_epi32(y2, y3));
+    const int32_t mx = _mm_movemask_epi8(_mm_cmpgt_epi8(x4, zero));
+    int32_t my = _mm_movemask_epi8(_mm_cmpgt_epi8(y4, zero)) | mx;
+    while (my) {
+      const int32_t j = BitsCtz(my);
+      int xy;
+      if ((mx >> j) & 1) {
+        const int x = X[i + j];
+        sumXY += x;
+        retval -= VP8LFastSLog2(x);
+      }
+      xy = X[i + j] + Y[i + j];
+      sumX += xy;
+      retval -= VP8LFastSLog2(xy);
+      my &= my - 1;
+    }
+  }
+  retval += VP8LFastSLog2(sumX) + VP8LFastSLog2(sumXY);
+  return (float)retval;
+}
+
+#else
+
+#define DONT_USE_COMBINED_SHANNON_ENTROPY_SSE2_FUNC   // won't be faster
+
+#endif
+
+//------------------------------------------------------------------------------
+
+static int VectorMismatch_SSE2(const uint32_t* const array1,
+                               const uint32_t* const array2, int length) {
+  int match_len;
+
+  if (length >= 12) {
+    __m128i A0 = _mm_loadu_si128((const __m128i*)&array1[0]);
+    __m128i A1 = _mm_loadu_si128((const __m128i*)&array2[0]);
+    match_len = 0;
+    do {
+      // Loop unrolling and early load both provide a speedup of 10% for the
+      // current function. Also, max_limit can be MAX_LENGTH=4096 at most.
+      const __m128i cmpA = _mm_cmpeq_epi32(A0, A1);
+      const __m128i B0 =
+          _mm_loadu_si128((const __m128i*)&array1[match_len + 4]);
+      const __m128i B1 =
+          _mm_loadu_si128((const __m128i*)&array2[match_len + 4]);
+      if (_mm_movemask_epi8(cmpA) != 0xffff) break;
+      match_len += 4;
+
+      {
+        const __m128i cmpB = _mm_cmpeq_epi32(B0, B1);
+        A0 = _mm_loadu_si128((const __m128i*)&array1[match_len + 4]);
+        A1 = _mm_loadu_si128((const __m128i*)&array2[match_len + 4]);
+        if (_mm_movemask_epi8(cmpB) != 0xffff) break;
+        match_len += 4;
+      }
+    } while (match_len + 12 < length);
+  } else {
+    match_len = 0;
+    // Unroll the potential first two loops.
+    if (length >= 4 &&
+        _mm_movemask_epi8(_mm_cmpeq_epi32(
+            _mm_loadu_si128((const __m128i*)&array1[0]),
+            _mm_loadu_si128((const __m128i*)&array2[0]))) == 0xffff) {
+      match_len = 4;
+      if (length >= 8 &&
+          _mm_movemask_epi8(_mm_cmpeq_epi32(
+              _mm_loadu_si128((const __m128i*)&array1[4]),
+              _mm_loadu_si128((const __m128i*)&array2[4]))) == 0xffff) {
+        match_len = 8;
+      }
+    }
+  }
+
+  while (match_len < length && array1[match_len] == array2[match_len]) {
+    ++match_len;
+  }
+  return match_len;
+}
+
+// Bundles multiple (1, 2, 4 or 8) pixels into a single pixel.
+static void BundleColorMap_SSE2(const uint8_t* const row, int width, int xbits,
+                                uint32_t* dst) {
+  int x;
+  assert(xbits >= 0);
+  assert(xbits <= 3);
+  switch (xbits) {
+    case 0: {
+      const __m128i ff = _mm_set1_epi16((short)0xff00);
+      const __m128i zero = _mm_setzero_si128();
+      // Store 0xff000000 | (row[x] << 8).
+      for (x = 0; x + 16 <= width; x += 16, dst += 16) {
+        const __m128i in = _mm_loadu_si128((const __m128i*)&row[x]);
+        const __m128i in_lo = _mm_unpacklo_epi8(zero, in);
+        const __m128i dst0 = _mm_unpacklo_epi16(in_lo, ff);
+        const __m128i dst1 = _mm_unpackhi_epi16(in_lo, ff);
+        const __m128i in_hi = _mm_unpackhi_epi8(zero, in);
+        const __m128i dst2 = _mm_unpacklo_epi16(in_hi, ff);
+        const __m128i dst3 = _mm_unpackhi_epi16(in_hi, ff);
+        _mm_storeu_si128((__m128i*)&dst[0], dst0);
+        _mm_storeu_si128((__m128i*)&dst[4], dst1);
+        _mm_storeu_si128((__m128i*)&dst[8], dst2);
+        _mm_storeu_si128((__m128i*)&dst[12], dst3);
+      }
+      break;
+    }
+    case 1: {
+      const __m128i ff = _mm_set1_epi16((short)0xff00);
+      const __m128i mul = _mm_set1_epi16(0x110);
+      for (x = 0; x + 16 <= width; x += 16, dst += 8) {
+        // 0a0b | (where a/b are 4 bits).
+        const __m128i in = _mm_loadu_si128((const __m128i*)&row[x]);
+        const __m128i tmp = _mm_mullo_epi16(in, mul);  // aba0
+        const __m128i pack = _mm_and_si128(tmp, ff);   // ab00
+        const __m128i dst0 = _mm_unpacklo_epi16(pack, ff);
+        const __m128i dst1 = _mm_unpackhi_epi16(pack, ff);
+        _mm_storeu_si128((__m128i*)&dst[0], dst0);
+        _mm_storeu_si128((__m128i*)&dst[4], dst1);
+      }
+      break;
+    }
+    case 2: {
+      const __m128i mask_or = _mm_set1_epi32(0xff000000);
+      const __m128i mul_cst = _mm_set1_epi16(0x0104);
+      const __m128i mask_mul = _mm_set1_epi16(0x0f00);
+      for (x = 0; x + 16 <= width; x += 16, dst += 4) {
+        // 000a000b000c000d | (where a/b/c/d are 2 bits).
+        const __m128i in = _mm_loadu_si128((const __m128i*)&row[x]);
+        const __m128i mul = _mm_mullo_epi16(in, mul_cst);  // 00ab00b000cd00d0
+        const __m128i tmp = _mm_and_si128(mul, mask_mul);  // 00ab000000cd0000
+        const __m128i shift = _mm_srli_epi32(tmp, 12);     // 00000000ab000000
+        const __m128i pack = _mm_or_si128(shift, tmp);     // 00000000abcd0000
+        // Convert to 0xff00**00.
+        const __m128i res = _mm_or_si128(pack, mask_or);
+        _mm_storeu_si128((__m128i*)dst, res);
+      }
+      break;
+    }
+    default: {
+      assert(xbits == 3);
+      for (x = 0; x + 16 <= width; x += 16, dst += 2) {
+        // 0000000a00000000b... | (where a/b are 1 bit).
+        const __m128i in = _mm_loadu_si128((const __m128i*)&row[x]);
+        const __m128i shift = _mm_slli_epi64(in, 7);
+        const uint32_t move = _mm_movemask_epi8(shift);
+        dst[0] = 0xff000000 | ((move & 0xff) << 8);
+        dst[1] = 0xff000000 | (move & 0xff00);
+      }
+      break;
+    }
+  }
+  if (x != width) {
+    VP8LBundleColorMap_C(row + x, width - x, xbits, dst);
+  }
+}
+
+//------------------------------------------------------------------------------
+// Batch version of Predictor Transform subtraction
+
+static WEBP_INLINE void Average2_m128i(const __m128i* const a0,
+                                       const __m128i* const a1,
+                                       __m128i* const avg) {
+  // (a + b) >> 1 = ((a + b + 1) >> 1) - ((a ^ b) & 1)
+  const __m128i ones = _mm_set1_epi8(1);
+  const __m128i avg1 = _mm_avg_epu8(*a0, *a1);
+  const __m128i one = _mm_and_si128(_mm_xor_si128(*a0, *a1), ones);
+  *avg = _mm_sub_epi8(avg1, one);
+}
+
+// Predictor0: ARGB_BLACK.
+static void PredictorSub0_SSE2(const uint32_t* in, const uint32_t* upper,
+                               int num_pixels, uint32_t* out) {
+  int i;
+  const __m128i black = _mm_set1_epi32(ARGB_BLACK);
+  for (i = 0; i + 4 <= num_pixels; i += 4) {
+    const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
+    const __m128i res = _mm_sub_epi8(src, black);
+    _mm_storeu_si128((__m128i*)&out[i], res);
+  }
+  if (i != num_pixels) {
+    VP8LPredictorsSub_C[0](in + i, NULL, num_pixels - i, out + i);
+  }
+  (void)upper;
+}
+
+#define GENERATE_PREDICTOR_1(X, IN)                                         \
+  static void PredictorSub##X##_SSE2(const uint32_t* const in,              \
+                                     const uint32_t* const upper,           \
+                                     int num_pixels, uint32_t* const out) { \
+    int i;                                                                  \
+    for (i = 0; i + 4 <= num_pixels; i += 4) {                              \
+      const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);          \
+      const __m128i pred = _mm_loadu_si128((const __m128i*)&(IN));          \
+      const __m128i res = _mm_sub_epi8(src, pred);                          \
+      _mm_storeu_si128((__m128i*)&out[i], res);                             \
+    }                                                                       \
+    if (i != num_pixels) {                                                  \
+      VP8LPredictorsSub_C[(X)](in + i, WEBP_OFFSET_PTR(upper, i),           \
+                               num_pixels - i, out + i);                    \
+    }                                                                       \
+  }
+
+GENERATE_PREDICTOR_1(1, in[i - 1])       // Predictor1: L
+GENERATE_PREDICTOR_1(2, upper[i])        // Predictor2: T
+GENERATE_PREDICTOR_1(3, upper[i + 1])    // Predictor3: TR
+GENERATE_PREDICTOR_1(4, upper[i - 1])    // Predictor4: TL
+#undef GENERATE_PREDICTOR_1
+
+// Predictor5: avg2(avg2(L, TR), T)
+static void PredictorSub5_SSE2(const uint32_t* in, const uint32_t* upper,
+                               int num_pixels, uint32_t* out) {
+  int i;
+  for (i = 0; i + 4 <= num_pixels; i += 4) {
+    const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]);
+    const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
+    const __m128i TR = _mm_loadu_si128((const __m128i*)&upper[i + 1]);
+    const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
+    __m128i avg, pred, res;
+    Average2_m128i(&L, &TR, &avg);
+    Average2_m128i(&avg, &T, &pred);
+    res = _mm_sub_epi8(src, pred);
+    _mm_storeu_si128((__m128i*)&out[i], res);
+  }
+  if (i != num_pixels) {
+    VP8LPredictorsSub_C[5](in + i, upper + i, num_pixels - i, out + i);
+  }
+}
+
+#define GENERATE_PREDICTOR_2(X, A, B)                                         \
+static void PredictorSub##X##_SSE2(const uint32_t* in, const uint32_t* upper, \
+                                   int num_pixels, uint32_t* out) {           \
+  int i;                                                                      \
+  for (i = 0; i + 4 <= num_pixels; i += 4) {                                  \
+    const __m128i tA = _mm_loadu_si128((const __m128i*)&(A));                 \
+    const __m128i tB = _mm_loadu_si128((const __m128i*)&(B));                 \
+    const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);              \
+    __m128i pred, res;                                                        \
+    Average2_m128i(&tA, &tB, &pred);                                          \
+    res = _mm_sub_epi8(src, pred);                                            \
+    _mm_storeu_si128((__m128i*)&out[i], res);                                 \
+  }                                                                           \
+  if (i != num_pixels) {                                                      \
+    VP8LPredictorsSub_C[(X)](in + i, upper + i, num_pixels - i, out + i);     \
+  }                                                                           \
+}
+
+GENERATE_PREDICTOR_2(6, in[i - 1], upper[i - 1])   // Predictor6: avg(L, TL)
+GENERATE_PREDICTOR_2(7, in[i - 1], upper[i])       // Predictor7: avg(L, T)
+GENERATE_PREDICTOR_2(8, upper[i - 1], upper[i])    // Predictor8: avg(TL, T)
+GENERATE_PREDICTOR_2(9, upper[i], upper[i + 1])    // Predictor9: average(T, TR)
+#undef GENERATE_PREDICTOR_2
+
+// Predictor10: avg(avg(L,TL), avg(T, TR)).
+static void PredictorSub10_SSE2(const uint32_t* in, const uint32_t* upper,
+                                int num_pixels, uint32_t* out) {
+  int i;
+  for (i = 0; i + 4 <= num_pixels; i += 4) {
+    const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]);
+    const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
+    const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
+    const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
+    const __m128i TR = _mm_loadu_si128((const __m128i*)&upper[i + 1]);
+    __m128i avgTTR, avgLTL, avg, res;
+    Average2_m128i(&T, &TR, &avgTTR);
+    Average2_m128i(&L, &TL, &avgLTL);
+    Average2_m128i(&avgTTR, &avgLTL, &avg);
+    res = _mm_sub_epi8(src, avg);
+    _mm_storeu_si128((__m128i*)&out[i], res);
+  }
+  if (i != num_pixels) {
+    VP8LPredictorsSub_C[10](in + i, upper + i, num_pixels - i, out + i);
+  }
+}
+
+// Predictor11: select.
+static void GetSumAbsDiff32_SSE2(const __m128i* const A, const __m128i* const B,
+                                 __m128i* const out) {
+  // We can unpack with any value on the upper 32 bits, provided it's the same
+  // on both operands (to that their sum of abs diff is zero). Here we use *A.
+  const __m128i A_lo = _mm_unpacklo_epi32(*A, *A);
+  const __m128i B_lo = _mm_unpacklo_epi32(*B, *A);
+  const __m128i A_hi = _mm_unpackhi_epi32(*A, *A);
+  const __m128i B_hi = _mm_unpackhi_epi32(*B, *A);
+  const __m128i s_lo = _mm_sad_epu8(A_lo, B_lo);
+  const __m128i s_hi = _mm_sad_epu8(A_hi, B_hi);
+  *out = _mm_packs_epi32(s_lo, s_hi);
+}
+
+static void PredictorSub11_SSE2(const uint32_t* in, const uint32_t* upper,
+                                int num_pixels, uint32_t* out) {
+  int i;
+  for (i = 0; i + 4 <= num_pixels; i += 4) {
+    const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]);
+    const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
+    const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
+    const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
+    __m128i pa, pb;
+    GetSumAbsDiff32_SSE2(&T, &TL, &pa);   // pa = sum |T-TL|
+    GetSumAbsDiff32_SSE2(&L, &TL, &pb);   // pb = sum |L-TL|
+    {
+      const __m128i mask = _mm_cmpgt_epi32(pb, pa);
+      const __m128i A = _mm_and_si128(mask, L);
+      const __m128i B = _mm_andnot_si128(mask, T);
+      const __m128i pred = _mm_or_si128(A, B);    // pred = (L > T)? L : T
+      const __m128i res = _mm_sub_epi8(src, pred);
+      _mm_storeu_si128((__m128i*)&out[i], res);
+    }
+  }
+  if (i != num_pixels) {
+    VP8LPredictorsSub_C[11](in + i, upper + i, num_pixels - i, out + i);
+  }
+}
+
+// Predictor12: ClampedSubSubtractFull.
+static void PredictorSub12_SSE2(const uint32_t* in, const uint32_t* upper,
+                                int num_pixels, uint32_t* out) {
+  int i;
+  const __m128i zero = _mm_setzero_si128();
+  for (i = 0; i + 4 <= num_pixels; i += 4) {
+    const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
+    const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]);
+    const __m128i L_lo = _mm_unpacklo_epi8(L, zero);
+    const __m128i L_hi = _mm_unpackhi_epi8(L, zero);
+    const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
+    const __m128i T_lo = _mm_unpacklo_epi8(T, zero);
+    const __m128i T_hi = _mm_unpackhi_epi8(T, zero);
+    const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
+    const __m128i TL_lo = _mm_unpacklo_epi8(TL, zero);
+    const __m128i TL_hi = _mm_unpackhi_epi8(TL, zero);
+    const __m128i diff_lo = _mm_sub_epi16(T_lo, TL_lo);
+    const __m128i diff_hi = _mm_sub_epi16(T_hi, TL_hi);
+    const __m128i pred_lo = _mm_add_epi16(L_lo, diff_lo);
+    const __m128i pred_hi = _mm_add_epi16(L_hi, diff_hi);
+    const __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
+    const __m128i res = _mm_sub_epi8(src, pred);
+    _mm_storeu_si128((__m128i*)&out[i], res);
+  }
+  if (i != num_pixels) {
+    VP8LPredictorsSub_C[12](in + i, upper + i, num_pixels - i, out + i);
+  }
+}
+
+// Predictors13: ClampedAddSubtractHalf
+static void PredictorSub13_SSE2(const uint32_t* in, const uint32_t* upper,
+                                int num_pixels, uint32_t* out) {
+  int i;
+  const __m128i zero = _mm_setzero_si128();
+  for (i = 0; i + 2 <= num_pixels; i += 2) {
+    // we can only process two pixels at a time
+    const __m128i L = _mm_loadl_epi64((const __m128i*)&in[i - 1]);
+    const __m128i src = _mm_loadl_epi64((const __m128i*)&in[i]);
+    const __m128i T = _mm_loadl_epi64((const __m128i*)&upper[i]);
+    const __m128i TL = _mm_loadl_epi64((const __m128i*)&upper[i - 1]);
+    const __m128i L_lo = _mm_unpacklo_epi8(L, zero);
+    const __m128i T_lo = _mm_unpacklo_epi8(T, zero);
+    const __m128i TL_lo = _mm_unpacklo_epi8(TL, zero);
+    const __m128i sum = _mm_add_epi16(T_lo, L_lo);
+    const __m128i avg = _mm_srli_epi16(sum, 1);
+    const __m128i A1 = _mm_sub_epi16(avg, TL_lo);
+    const __m128i bit_fix = _mm_cmpgt_epi16(TL_lo, avg);
+    const __m128i A2 = _mm_sub_epi16(A1, bit_fix);
+    const __m128i A3 = _mm_srai_epi16(A2, 1);
+    const __m128i A4 = _mm_add_epi16(avg, A3);
+    const __m128i pred = _mm_packus_epi16(A4, A4);
+    const __m128i res = _mm_sub_epi8(src, pred);
+    _mm_storel_epi64((__m128i*)&out[i], res);
+  }
+  if (i != num_pixels) {
+    VP8LPredictorsSub_C[13](in + i, upper + i, num_pixels - i, out + i);
+  }
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8LEncDspInitSSE2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE2(void) {
+  VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_SSE2;
+  VP8LTransformColor = TransformColor_SSE2;
+  VP8LCollectColorBlueTransforms = CollectColorBlueTransforms_SSE2;
+  VP8LCollectColorRedTransforms = CollectColorRedTransforms_SSE2;
+  VP8LAddVector = AddVector_SSE2;
+  VP8LAddVectorEq = AddVectorEq_SSE2;
+#if !defined(DONT_USE_COMBINED_SHANNON_ENTROPY_SSE2_FUNC)
+  VP8LCombinedShannonEntropy = CombinedShannonEntropy_SSE2;
+#endif
+  VP8LVectorMismatch = VectorMismatch_SSE2;
+  VP8LBundleColorMap = BundleColorMap_SSE2;
+
+  VP8LPredictorsSub[0] = PredictorSub0_SSE2;
+  VP8LPredictorsSub[1] = PredictorSub1_SSE2;
+  VP8LPredictorsSub[2] = PredictorSub2_SSE2;
+  VP8LPredictorsSub[3] = PredictorSub3_SSE2;
+  VP8LPredictorsSub[4] = PredictorSub4_SSE2;
+  VP8LPredictorsSub[5] = PredictorSub5_SSE2;
+  VP8LPredictorsSub[6] = PredictorSub6_SSE2;
+  VP8LPredictorsSub[7] = PredictorSub7_SSE2;
+  VP8LPredictorsSub[8] = PredictorSub8_SSE2;
+  VP8LPredictorsSub[9] = PredictorSub9_SSE2;
+  VP8LPredictorsSub[10] = PredictorSub10_SSE2;
+  VP8LPredictorsSub[11] = PredictorSub11_SSE2;
+  VP8LPredictorsSub[12] = PredictorSub12_SSE2;
+  VP8LPredictorsSub[13] = PredictorSub13_SSE2;
+  VP8LPredictorsSub[14] = PredictorSub0_SSE2;  // <- padding security sentinels
+  VP8LPredictorsSub[15] = PredictorSub0_SSE2;
+}
+
+#else  // !WEBP_USE_SSE2
+
+WEBP_DSP_INIT_STUB(VP8LEncDspInitSSE2)
+
+#endif  // WEBP_USE_SSE2
diff --git a/media/libwebp/dsp/lossless_enc_sse41.c b/media/libwebp/dsp/lossless_enc_sse41.c
new file mode 100644
index 0000000000..2c6bc5bb00
--- /dev/null
+++ b/media/libwebp/dsp/lossless_enc_sse41.c
@@ -0,0 +1,155 @@
+// Copyright 2015 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// SSE4.1 variant of methods for lossless encoder
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_SSE41)
+#include <assert.h>
+#include <smmintrin.h>
+#include "../dsp/lossless.h"
+
+// For sign-extended multiplying constants, pre-shifted by 5:
+#define CST_5b(X)  (((int16_t)((uint16_t)(X) << 8)) >> 5)
+
+//------------------------------------------------------------------------------
+// Subtract-Green Transform
+
+static void SubtractGreenFromBlueAndRed_SSE41(uint32_t* argb_data,
+                                              int num_pixels) {
+  int i;
+  const __m128i kCstShuffle = _mm_set_epi8(-1, 13, -1, 13, -1, 9, -1, 9,
+                                           -1,  5, -1,  5, -1, 1, -1, 1);
+  for (i = 0; i + 4 <= num_pixels; i += 4) {
+    const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]);
+    const __m128i in_0g0g = _mm_shuffle_epi8(in, kCstShuffle);
+    const __m128i out = _mm_sub_epi8(in, in_0g0g);
+    _mm_storeu_si128((__m128i*)&argb_data[i], out);
+  }
+  // fallthrough and finish off with plain-C
+  if (i != num_pixels) {
+    VP8LSubtractGreenFromBlueAndRed_C(argb_data + i, num_pixels - i);
+  }
+}
+
+//------------------------------------------------------------------------------
+// Color Transform
+
+#define MK_CST_16(HI, LO) \
+  _mm_set1_epi32((int)(((uint32_t)(HI) << 16) | ((LO) & 0xffff)))
+
+static void CollectColorBlueTransforms_SSE41(const uint32_t* argb, int stride,
+                                             int tile_width, int tile_height,
+                                             int green_to_blue, int red_to_blue,
+                                             int histo[]) {
+  const __m128i mult =
+      MK_CST_16(CST_5b(red_to_blue) + 256,CST_5b(green_to_blue));
+  const __m128i perm =
+      _mm_setr_epi8(-1, 1, -1, 2, -1, 5, -1, 6, -1, 9, -1, 10, -1, 13, -1, 14);
+  if (tile_width >= 4) {
+    int y;
+    for (y = 0; y < tile_height; ++y) {
+      const uint32_t* const src = argb + y * stride;
+      const __m128i A1 = _mm_loadu_si128((const __m128i*)src);
+      const __m128i B1 = _mm_shuffle_epi8(A1, perm);
+      const __m128i C1 = _mm_mulhi_epi16(B1, mult);
+      const __m128i D1 = _mm_sub_epi16(A1, C1);
+      __m128i E = _mm_add_epi16(_mm_srli_epi32(D1, 16), D1);
+      int x;
+      for (x = 4; x + 4 <= tile_width; x += 4) {
+        const __m128i A2 = _mm_loadu_si128((const __m128i*)(src + x));
+        __m128i B2, C2, D2;
+        ++histo[_mm_extract_epi8(E,  0)];
+        B2 = _mm_shuffle_epi8(A2, perm);
+        ++histo[_mm_extract_epi8(E,  4)];
+        C2 = _mm_mulhi_epi16(B2, mult);
+        ++histo[_mm_extract_epi8(E,  8)];
+        D2 = _mm_sub_epi16(A2, C2);
+        ++histo[_mm_extract_epi8(E, 12)];
+        E = _mm_add_epi16(_mm_srli_epi32(D2, 16), D2);
+      }
+      ++histo[_mm_extract_epi8(E,  0)];
+      ++histo[_mm_extract_epi8(E,  4)];
+      ++histo[_mm_extract_epi8(E,  8)];
+      ++histo[_mm_extract_epi8(E, 12)];
+    }
+  }
+  {
+    const int left_over = tile_width & 3;
+    if (left_over > 0) {
+      VP8LCollectColorBlueTransforms_C(argb + tile_width - left_over, stride,
+                                       left_over, tile_height,
+                                       green_to_blue, red_to_blue, histo);
+    }
+  }
+}
+
+static void CollectColorRedTransforms_SSE41(const uint32_t* argb, int stride,
+                                            int tile_width, int tile_height,
+                                            int green_to_red, int histo[]) {
+
+  const __m128i mult = MK_CST_16(0, CST_5b(green_to_red));
+  const __m128i mask_g = _mm_set1_epi32(0x0000ff00);
+  if (tile_width >= 4) {
+    int y;
+    for (y = 0; y < tile_height; ++y) {
+      const uint32_t* const src = argb + y * stride;
+      const __m128i A1 = _mm_loadu_si128((const __m128i*)src);
+      const __m128i B1 = _mm_and_si128(A1, mask_g);
+      const __m128i C1 = _mm_madd_epi16(B1, mult);
+      __m128i D = _mm_sub_epi16(A1, C1);
+      int x;
+      for (x = 4; x + 4 <= tile_width; x += 4) {
+        const __m128i A2 = _mm_loadu_si128((const __m128i*)(src + x));
+        __m128i B2, C2;
+        ++histo[_mm_extract_epi8(D,  2)];
+        B2 = _mm_and_si128(A2, mask_g);
+        ++histo[_mm_extract_epi8(D,  6)];
+        C2 = _mm_madd_epi16(B2, mult);
+        ++histo[_mm_extract_epi8(D, 10)];
+        ++histo[_mm_extract_epi8(D, 14)];
+        D = _mm_sub_epi16(A2, C2);
+      }
+      ++histo[_mm_extract_epi8(D,  2)];
+      ++histo[_mm_extract_epi8(D,  6)];
+      ++histo[_mm_extract_epi8(D, 10)];
+      ++histo[_mm_extract_epi8(D, 14)];
+    }
+  }
+  {
+    const int left_over = tile_width & 3;
+    if (left_over > 0) {
+      VP8LCollectColorRedTransforms_C(argb + tile_width - left_over, stride,
+                                      left_over, tile_height, green_to_red,
+                                      histo);
+    }
+  }
+}
+
+#undef MK_CST_16
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8LEncDspInitSSE41(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE41(void) {
+  VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_SSE41;
+  VP8LCollectColorBlueTransforms = CollectColorBlueTransforms_SSE41;
+  VP8LCollectColorRedTransforms = CollectColorRedTransforms_SSE41;
+}
+
+#else  // !WEBP_USE_SSE41
+
+WEBP_DSP_INIT_STUB(VP8LEncDspInitSSE41)
+
+#endif  // WEBP_USE_SSE41
diff --git a/media/libwebp/dsp/lossless_mips_dsp_r2.c b/media/libwebp/dsp/lossless_mips_dsp_r2.c
new file mode 100644
index 0000000000..ec98834f84
--- /dev/null
+++ b/media/libwebp/dsp/lossless_mips_dsp_r2.c
@@ -0,0 +1,701 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Image transforms and color space conversion methods for lossless decoder.
+//
+// Author(s):  Djordje Pesut    (djordje.pesut@imgtec.com)
+//             Jovan Zelincevic (jovan.zelincevic@imgtec.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_MIPS_DSP_R2)
+
+#include "../dsp/lossless.h"
+#include "../dsp/lossless_common.h"
+
+#define MAP_COLOR_FUNCS(FUNC_NAME, TYPE, GET_INDEX, GET_VALUE)                 \
+static void FUNC_NAME(const TYPE* src,                                         \
+                      const uint32_t* const color_map,                         \
+                      TYPE* dst, int y_start, int y_end,                       \
+                      int width) {                                             \
+  int y;                                                                       \
+  for (y = y_start; y < y_end; ++y) {                                          \
+    int x;                                                                     \
+    for (x = 0; x < (width >> 2); ++x) {                                       \
+      int tmp1, tmp2, tmp3, tmp4;                                              \
+      __asm__ volatile (                                                       \
+      ".ifc        " #TYPE ",  uint8_t                  \n\t"                  \
+        "lbu       %[tmp1],  0(%[src])                  \n\t"                  \
+        "lbu       %[tmp2],  1(%[src])                  \n\t"                  \
+        "lbu       %[tmp3],  2(%[src])                  \n\t"                  \
+        "lbu       %[tmp4],  3(%[src])                  \n\t"                  \
+        "addiu     %[src],   %[src],      4             \n\t"                  \
+      ".endif                                           \n\t"                  \
+      ".ifc        " #TYPE ",  uint32_t                 \n\t"                  \
+        "lw        %[tmp1],  0(%[src])                  \n\t"                  \
+        "lw        %[tmp2],  4(%[src])                  \n\t"                  \
+        "lw        %[tmp3],  8(%[src])                  \n\t"                  \
+        "lw        %[tmp4],  12(%[src])                 \n\t"                  \
+        "ext       %[tmp1],  %[tmp1],     8,        8   \n\t"                  \
+        "ext       %[tmp2],  %[tmp2],     8,        8   \n\t"                  \
+        "ext       %[tmp3],  %[tmp3],     8,        8   \n\t"                  \
+        "ext       %[tmp4],  %[tmp4],     8,        8   \n\t"                  \
+        "addiu     %[src],   %[src],      16            \n\t"                  \
+      ".endif                                           \n\t"                  \
+        "sll       %[tmp1],  %[tmp1],     2             \n\t"                  \
+        "sll       %[tmp2],  %[tmp2],     2             \n\t"                  \
+        "sll       %[tmp3],  %[tmp3],     2             \n\t"                  \
+        "sll       %[tmp4],  %[tmp4],     2             \n\t"                  \
+        "lwx       %[tmp1],  %[tmp1](%[color_map])      \n\t"                  \
+        "lwx       %[tmp2],  %[tmp2](%[color_map])      \n\t"                  \
+        "lwx       %[tmp3],  %[tmp3](%[color_map])      \n\t"                  \
+        "lwx       %[tmp4],  %[tmp4](%[color_map])      \n\t"                  \
+      ".ifc        " #TYPE ",  uint8_t                  \n\t"                  \
+        "ext       %[tmp1],  %[tmp1],     8,        8   \n\t"                  \
+        "ext       %[tmp2],  %[tmp2],     8,        8   \n\t"                  \
+        "ext       %[tmp3],  %[tmp3],     8,        8   \n\t"                  \
+        "ext       %[tmp4],  %[tmp4],     8,        8   \n\t"                  \
+        "sb        %[tmp1],  0(%[dst])                  \n\t"                  \
+        "sb        %[tmp2],  1(%[dst])                  \n\t"                  \
+        "sb        %[tmp3],  2(%[dst])                  \n\t"                  \
+        "sb        %[tmp4],  3(%[dst])                  \n\t"                  \
+        "addiu     %[dst],   %[dst],      4             \n\t"                  \
+      ".endif                                           \n\t"                  \
+      ".ifc        " #TYPE ",  uint32_t                 \n\t"                  \
+        "sw        %[tmp1],  0(%[dst])                  \n\t"                  \
+        "sw        %[tmp2],  4(%[dst])                  \n\t"                  \
+        "sw        %[tmp3],  8(%[dst])                  \n\t"                  \
+        "sw        %[tmp4],  12(%[dst])                 \n\t"                  \
+        "addiu     %[dst],   %[dst],      16            \n\t"                  \
+      ".endif                                           \n\t"                  \
+        : [tmp1]"=&r"(tmp1), [tmp2]"=&r"(tmp2), [tmp3]"=&r"(tmp3),             \
+          [tmp4]"=&r"(tmp4), [src]"+&r"(src), [dst]"+r"(dst)                   \
+        : [color_map]"r"(color_map)                                            \
+        : "memory"                                                             \
+      );                                                                       \
+    }                                                                          \
+    for (x = 0; x < (width & 3); ++x) {                                        \
+      *dst++ = GET_VALUE(color_map[GET_INDEX(*src++)]);                        \
+    }                                                                          \
+  }                                                                            \
+}
+
+MAP_COLOR_FUNCS(MapARGB_MIPSdspR2, uint32_t, VP8GetARGBIndex, VP8GetARGBValue)
+MAP_COLOR_FUNCS(MapAlpha_MIPSdspR2, uint8_t, VP8GetAlphaIndex, VP8GetAlphaValue)
+
+#undef MAP_COLOR_FUNCS
+
+static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1,
+                                                   uint32_t c2) {
+  int temp0, temp1, temp2, temp3, temp4, temp5;
+  __asm__ volatile (
+    "preceu.ph.qbr   %[temp1],   %[c0]                 \n\t"
+    "preceu.ph.qbl   %[temp2],   %[c0]                 \n\t"
+    "preceu.ph.qbr   %[temp3],   %[c1]                 \n\t"
+    "preceu.ph.qbl   %[temp4],   %[c1]                 \n\t"
+    "preceu.ph.qbr   %[temp5],   %[c2]                 \n\t"
+    "preceu.ph.qbl   %[temp0],   %[c2]                 \n\t"
+    "subq.ph         %[temp3],   %[temp3],   %[temp5]  \n\t"
+    "subq.ph         %[temp4],   %[temp4],   %[temp0]  \n\t"
+    "addq.ph         %[temp1],   %[temp1],   %[temp3]  \n\t"
+    "addq.ph         %[temp2],   %[temp2],   %[temp4]  \n\t"
+    "shll_s.ph       %[temp1],   %[temp1],   7         \n\t"
+    "shll_s.ph       %[temp2],   %[temp2],   7         \n\t"
+    "precrqu_s.qb.ph %[temp2],   %[temp2],   %[temp1]  \n\t"
+    : [temp0]"=r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5)
+    : [c0]"r"(c0), [c1]"r"(c1), [c2]"r"(c2)
+    : "memory"
+  );
+  return temp2;
+}
+
+static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1,
+                                                   uint32_t c2) {
+  int temp0, temp1, temp2, temp3, temp4, temp5;
+  __asm__ volatile (
+    "adduh.qb         %[temp5],   %[c0],      %[c1]       \n\t"
+    "preceu.ph.qbr    %[temp3],   %[c2]                   \n\t"
+    "preceu.ph.qbr    %[temp1],   %[temp5]                \n\t"
+    "preceu.ph.qbl    %[temp2],   %[temp5]                \n\t"
+    "preceu.ph.qbl    %[temp4],   %[c2]                   \n\t"
+    "subq.ph          %[temp3],   %[temp1],   %[temp3]    \n\t"
+    "subq.ph          %[temp4],   %[temp2],   %[temp4]    \n\t"
+    "shrl.ph          %[temp5],   %[temp3],   15          \n\t"
+    "shrl.ph          %[temp0],   %[temp4],   15          \n\t"
+    "addq.ph          %[temp3],   %[temp3],   %[temp5]    \n\t"
+    "addq.ph          %[temp4],   %[temp0],   %[temp4]    \n\t"
+    "shra.ph          %[temp3],   %[temp3],   1           \n\t"
+    "shra.ph          %[temp4],   %[temp4],   1           \n\t"
+    "addq.ph          %[temp1],   %[temp1],   %[temp3]    \n\t"
+    "addq.ph          %[temp2],   %[temp2],   %[temp4]    \n\t"
+    "shll_s.ph        %[temp1],   %[temp1],   7           \n\t"
+    "shll_s.ph        %[temp2],   %[temp2],   7           \n\t"
+    "precrqu_s.qb.ph  %[temp1],   %[temp2],   %[temp1]    \n\t"
+    : [temp0]"=r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [temp3]"=&r"(temp3), [temp4]"=r"(temp4), [temp5]"=&r"(temp5)
+    : [c0]"r"(c0), [c1]"r"(c1), [c2]"r"(c2)
+    : "memory"
+  );
+  return temp1;
+}
+
+static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
+  int temp0, temp1, temp2, temp3, temp4, temp5;
+  __asm__ volatile (
+    "cmpgdu.lt.qb %[temp1], %[c],     %[b]             \n\t"
+    "pick.qb      %[temp1], %[b],     %[c]             \n\t"
+    "pick.qb      %[temp2], %[c],     %[b]             \n\t"
+    "cmpgdu.lt.qb %[temp4], %[c],     %[a]             \n\t"
+    "pick.qb      %[temp4], %[a],     %[c]             \n\t"
+    "pick.qb      %[temp5], %[c],     %[a]             \n\t"
+    "subu.qb      %[temp3], %[temp1], %[temp2]         \n\t"
+    "subu.qb      %[temp0], %[temp4], %[temp5]         \n\t"
+    "raddu.w.qb   %[temp3], %[temp3]                   \n\t"
+    "raddu.w.qb   %[temp0], %[temp0]                   \n\t"
+    "subu         %[temp3], %[temp3], %[temp0]         \n\t"
+    "slti         %[temp0], %[temp3], 0x1              \n\t"
+    "movz         %[a],     %[b],     %[temp0]         \n\t"
+    : [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+      [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [temp0]"=&r"(temp0),
+      [a]"+&r"(a)
+    : [b]"r"(b), [c]"r"(c)
+  );
+  return a;
+}
+
+static WEBP_INLINE uint32_t Average2(uint32_t a0, uint32_t a1) {
+  __asm__ volatile (
+    "adduh.qb    %[a0], %[a0], %[a1]       \n\t"
+    : [a0]"+r"(a0)
+    : [a1]"r"(a1)
+  );
+  return a0;
+}
+
+static WEBP_INLINE uint32_t Average3(uint32_t a0, uint32_t a1, uint32_t a2) {
+  return Average2(Average2(a0, a2), a1);
+}
+
+static WEBP_INLINE uint32_t Average4(uint32_t a0, uint32_t a1,
+                                     uint32_t a2, uint32_t a3) {
+  return Average2(Average2(a0, a1), Average2(a2, a3));
+}
+
+static uint32_t Predictor5_MIPSdspR2(const uint32_t* const left,
+                                     const uint32_t* const top) {
+  return Average3(*left, top[0], top[1]);
+}
+
+static uint32_t Predictor6_MIPSdspR2(const uint32_t* const left,
+                                     const uint32_t* const top) {
+  return Average2(*left, top[-1]);
+}
+
+static uint32_t Predictor7_MIPSdspR2(const uint32_t* const left,
+                                     const uint32_t* const top) {
+  return Average2(*left, top[0]);
+}
+
+static uint32_t Predictor8_MIPSdspR2(const uint32_t* const left,
+                                     const uint32_t* const top) {
+  (void)left;
+  return Average2(top[-1], top[0]);
+}
+
+static uint32_t Predictor9_MIPSdspR2(const uint32_t* const left,
+                                     const uint32_t* const top) {
+  (void)left;
+  return Average2(top[0], top[1]);
+}
+
+static uint32_t Predictor10_MIPSdspR2(const uint32_t* const left,
+                                      const uint32_t* const top) {
+  return Average4(*left, top[-1], top[0], top[1]);
+}
+
+static uint32_t Predictor11_MIPSdspR2(const uint32_t* const left,
+                                      const uint32_t* const top) {
+  return Select(top[0], *left, top[-1]);
+}
+
+static uint32_t Predictor12_MIPSdspR2(const uint32_t* const left,
+                                      const uint32_t* const top) {
+  return ClampedAddSubtractFull(*left, top[0], top[-1]);
+}
+
+static uint32_t Predictor13_MIPSdspR2(const uint32_t* const left,
+                                      const uint32_t* const top) {
+  return ClampedAddSubtractHalf(*left, top[0], top[-1]);
+}
+
+// Add green to blue and red channels (i.e. perform the inverse transform of
+// 'subtract green').
+static void AddGreenToBlueAndRed_MIPSdspR2(const uint32_t* src, int num_pixels,
+                                           uint32_t* dst) {
+  uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+  const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
+  const uint32_t* const p_loop2_end = src + num_pixels;
+  __asm__ volatile (
+    ".set       push                                          \n\t"
+    ".set       noreorder                                     \n\t"
+    "beq        %[src],          %[p_loop1_end],     3f       \n\t"
+    " nop                                                     \n\t"
+  "0:                                                         \n\t"
+    "lw         %[temp0],        0(%[src])                    \n\t"
+    "lw         %[temp1],        4(%[src])                    \n\t"
+    "lw         %[temp2],        8(%[src])                    \n\t"
+    "lw         %[temp3],        12(%[src])                   \n\t"
+    "ext        %[temp4],        %[temp0],           8,    8  \n\t"
+    "ext        %[temp5],        %[temp1],           8,    8  \n\t"
+    "ext        %[temp6],        %[temp2],           8,    8  \n\t"
+    "ext        %[temp7],        %[temp3],           8,    8  \n\t"
+    "addiu      %[src],          %[src],             16       \n\t"
+    "addiu      %[dst],          %[dst],             16       \n\t"
+    "replv.ph   %[temp4],        %[temp4]                     \n\t"
+    "replv.ph   %[temp5],        %[temp5]                     \n\t"
+    "replv.ph   %[temp6],        %[temp6]                     \n\t"
+    "replv.ph   %[temp7],        %[temp7]                     \n\t"
+    "addu.qb    %[temp0],        %[temp0],           %[temp4] \n\t"
+    "addu.qb    %[temp1],        %[temp1],           %[temp5] \n\t"
+    "addu.qb    %[temp2],        %[temp2],           %[temp6] \n\t"
+    "addu.qb    %[temp3],        %[temp3],           %[temp7] \n\t"
+    "sw         %[temp0],        -16(%[dst])                  \n\t"
+    "sw         %[temp1],        -12(%[dst])                  \n\t"
+    "sw         %[temp2],        -8(%[dst])                   \n\t"
+    "bne        %[src],          %[p_loop1_end],     0b       \n\t"
+    " sw        %[temp3],        -4(%[dst])                   \n\t"
+  "3:                                                         \n\t"
+    "beq        %[src],          %[p_loop2_end],     2f       \n\t"
+    " nop                                                     \n\t"
+  "1:                                                         \n\t"
+    "lw         %[temp0],        0(%[src])                    \n\t"
+    "addiu      %[src],          %[src],             4        \n\t"
+    "addiu      %[dst],          %[dst],             4        \n\t"
+    "ext        %[temp4],        %[temp0],           8,    8  \n\t"
+    "replv.ph   %[temp4],        %[temp4]                     \n\t"
+    "addu.qb    %[temp0],        %[temp0],           %[temp4] \n\t"
+    "bne        %[src],          %[p_loop2_end],     1b       \n\t"
+    " sw        %[temp0],        -4(%[dst])                   \n\t"
+  "2:                                                         \n\t"
+    ".set       pop                                           \n\t"
+    : [dst]"+&r"(dst), [src]"+&r"(src), [temp0]"=&r"(temp0),
+      [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+      [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [temp6]"=&r"(temp6),
+      [temp7]"=&r"(temp7)
+    : [p_loop1_end]"r"(p_loop1_end), [p_loop2_end]"r"(p_loop2_end)
+    : "memory"
+  );
+}
+
+static void TransformColorInverse_MIPSdspR2(const VP8LMultipliers* const m,
+                                            const uint32_t* src, int num_pixels,
+                                            uint32_t* dst) {
+  int temp0, temp1, temp2, temp3, temp4, temp5;
+  uint32_t argb, argb1, new_red;
+  const uint32_t G_to_R = m->green_to_red_;
+  const uint32_t G_to_B = m->green_to_blue_;
+  const uint32_t R_to_B = m->red_to_blue_;
+  const uint32_t* const p_loop_end = src + (num_pixels & ~1);
+  __asm__ volatile (
+    ".set            push                                    \n\t"
+    ".set            noreorder                               \n\t"
+    "beq             %[src],       %[p_loop_end],  1f        \n\t"
+    " nop                                                    \n\t"
+    "replv.ph        %[temp0],     %[G_to_R]                 \n\t"
+    "replv.ph        %[temp1],     %[G_to_B]                 \n\t"
+    "replv.ph        %[temp2],     %[R_to_B]                 \n\t"
+    "shll.ph         %[temp0],     %[temp0],       8         \n\t"
+    "shll.ph         %[temp1],     %[temp1],       8         \n\t"
+    "shll.ph         %[temp2],     %[temp2],       8         \n\t"
+    "shra.ph         %[temp0],     %[temp0],       8         \n\t"
+    "shra.ph         %[temp1],     %[temp1],       8         \n\t"
+    "shra.ph         %[temp2],     %[temp2],       8         \n\t"
+  "0:                                                        \n\t"
+    "lw              %[argb],      0(%[src])                 \n\t"
+    "lw              %[argb1],     4(%[src])                 \n\t"
+    "sw              %[argb],      0(%[dst])                 \n\t"
+    "sw              %[argb1],     4(%[dst])                 \n\t"
+    "addiu           %[src],       %[src],         8         \n\t"
+    "addiu           %[dst],       %[dst],         8         \n\t"
+    "precrq.qb.ph    %[temp3],     %[argb],        %[argb1]  \n\t"
+    "preceu.ph.qbra  %[temp3],     %[temp3]                  \n\t"
+    "shll.ph         %[temp3],     %[temp3],       8         \n\t"
+    "shra.ph         %[temp3],     %[temp3],       8         \n\t"
+    "mul.ph          %[temp5],     %[temp3],       %[temp0]  \n\t"
+    "mul.ph          %[temp3],     %[temp3],       %[temp1]  \n\t"
+    "precrq.ph.w     %[new_red],   %[argb],        %[argb1]  \n\t"
+    "ins             %[argb1],     %[argb],        16,   16  \n\t"
+    "shra.ph         %[temp5],     %[temp5],       5         \n\t"
+    "shra.ph         %[temp3],     %[temp3],       5         \n\t"
+    "addu.ph         %[new_red],   %[new_red],     %[temp5]  \n\t"
+    "addu.ph         %[argb1],     %[argb1],       %[temp3]  \n\t"
+    "preceu.ph.qbra  %[temp5],     %[new_red]                \n\t"
+    "shll.ph         %[temp4],     %[temp5],       8         \n\t"
+    "shra.ph         %[temp4],     %[temp4],       8         \n\t"
+    "mul.ph          %[temp4],     %[temp4],       %[temp2]  \n\t"
+    "sb              %[temp5],     -2(%[dst])                \n\t"
+    "sra             %[temp5],     %[temp5],       16        \n\t"
+    "shra.ph         %[temp4],     %[temp4],       5         \n\t"
+    "addu.ph         %[argb1],     %[argb1],       %[temp4]  \n\t"
+    "preceu.ph.qbra  %[temp3],     %[argb1]                  \n\t"
+    "sb              %[temp5],     -6(%[dst])                \n\t"
+    "sb              %[temp3],     -4(%[dst])                \n\t"
+    "sra             %[temp3],     %[temp3],       16        \n\t"
+    "bne             %[src],       %[p_loop_end],  0b        \n\t"
+    " sb             %[temp3],     -8(%[dst])                \n\t"
+  "1:                                                        \n\t"
+    ".set            pop                                     \n\t"
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+      [new_red]"=&r"(new_red), [argb]"=&r"(argb),
+      [argb1]"=&r"(argb1), [dst]"+&r"(dst), [src]"+&r"(src)
+    : [G_to_R]"r"(G_to_R), [R_to_B]"r"(R_to_B),
+      [G_to_B]"r"(G_to_B), [p_loop_end]"r"(p_loop_end)
+    : "memory", "hi", "lo"
+  );
+
+  // Fall-back to C-version for left-overs.
+  if (num_pixels & 1) VP8LTransformColorInverse_C(m, src, 1, dst);
+}
+
+static void ConvertBGRAToRGB_MIPSdspR2(const uint32_t* src,
+                                       int num_pixels, uint8_t* dst) {
+  int temp0, temp1, temp2, temp3;
+  const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
+  const uint32_t* const p_loop2_end = src + num_pixels;
+  __asm__ volatile (
+    ".set       push                                       \n\t"
+    ".set       noreorder                                  \n\t"
+    "beq        %[src],      %[p_loop1_end],    3f         \n\t"
+    " nop                                                  \n\t"
+  "0:                                                      \n\t"
+    "lw         %[temp3],    12(%[src])                    \n\t"
+    "lw         %[temp2],    8(%[src])                     \n\t"
+    "lw         %[temp1],    4(%[src])                     \n\t"
+    "lw         %[temp0],    0(%[src])                     \n\t"
+    "ins        %[temp3],    %[temp2],          24,   8    \n\t"
+    "sll        %[temp2],    %[temp2],          8          \n\t"
+    "rotr       %[temp3],    %[temp3],          16         \n\t"
+    "ins        %[temp2],    %[temp1],          0,    16   \n\t"
+    "sll        %[temp1],    %[temp1],          8          \n\t"
+    "wsbh       %[temp3],    %[temp3]                      \n\t"
+    "balign     %[temp0],    %[temp1],          1          \n\t"
+    "wsbh       %[temp2],    %[temp2]                      \n\t"
+    "wsbh       %[temp0],    %[temp0]                      \n\t"
+    "usw        %[temp3],    8(%[dst])                     \n\t"
+    "rotr       %[temp0],    %[temp0],          16         \n\t"
+    "usw        %[temp2],    4(%[dst])                     \n\t"
+    "addiu      %[src],      %[src],            16         \n\t"
+    "usw        %[temp0],    0(%[dst])                     \n\t"
+    "bne        %[src],      %[p_loop1_end],    0b         \n\t"
+    " addiu     %[dst],      %[dst],            12         \n\t"
+  "3:                                                      \n\t"
+    "beq        %[src],      %[p_loop2_end],    2f         \n\t"
+    " nop                                                  \n\t"
+  "1:                                                      \n\t"
+    "lw         %[temp0],    0(%[src])                     \n\t"
+    "addiu      %[src],      %[src],            4          \n\t"
+    "wsbh       %[temp1],    %[temp0]                      \n\t"
+    "addiu      %[dst],      %[dst],            3          \n\t"
+    "ush        %[temp1],    -2(%[dst])                    \n\t"
+    "sra        %[temp0],    %[temp0],          16         \n\t"
+    "bne        %[src],      %[p_loop2_end],    1b         \n\t"
+    " sb        %[temp0],    -3(%[dst])                    \n\t"
+  "2:                                                      \n\t"
+    ".set       pop                                        \n\t"
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [temp3]"=&r"(temp3), [dst]"+&r"(dst), [src]"+&r"(src)
+    : [p_loop1_end]"r"(p_loop1_end), [p_loop2_end]"r"(p_loop2_end)
+    : "memory"
+  );
+}
+
+static void ConvertBGRAToRGBA_MIPSdspR2(const uint32_t* src,
+                                        int num_pixels, uint8_t* dst) {
+  int temp0, temp1, temp2, temp3;
+  const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
+  const uint32_t* const p_loop2_end = src + num_pixels;
+  __asm__ volatile (
+    ".set       push                                       \n\t"
+    ".set       noreorder                                  \n\t"
+    "beq        %[src],      %[p_loop1_end],    3f         \n\t"
+    " nop                                                  \n\t"
+  "0:                                                      \n\t"
+    "lw         %[temp0],    0(%[src])                     \n\t"
+    "lw         %[temp1],    4(%[src])                     \n\t"
+    "lw         %[temp2],    8(%[src])                     \n\t"
+    "lw         %[temp3],    12(%[src])                    \n\t"
+    "wsbh       %[temp0],    %[temp0]                      \n\t"
+    "wsbh       %[temp1],    %[temp1]                      \n\t"
+    "wsbh       %[temp2],    %[temp2]                      \n\t"
+    "wsbh       %[temp3],    %[temp3]                      \n\t"
+    "addiu      %[src],      %[src],            16         \n\t"
+    "balign     %[temp0],    %[temp0],          1          \n\t"
+    "balign     %[temp1],    %[temp1],          1          \n\t"
+    "balign     %[temp2],    %[temp2],          1          \n\t"
+    "balign     %[temp3],    %[temp3],          1          \n\t"
+    "usw        %[temp0],    0(%[dst])                     \n\t"
+    "usw        %[temp1],    4(%[dst])                     \n\t"
+    "usw        %[temp2],    8(%[dst])                     \n\t"
+    "usw        %[temp3],    12(%[dst])                    \n\t"
+    "bne        %[src],      %[p_loop1_end],    0b         \n\t"
+    " addiu     %[dst],      %[dst],            16         \n\t"
+  "3:                                                      \n\t"
+    "beq        %[src],      %[p_loop2_end],    2f         \n\t"
+    " nop                                                  \n\t"
+  "1:                                                      \n\t"
+    "lw         %[temp0],    0(%[src])                     \n\t"
+    "wsbh       %[temp0],    %[temp0]                      \n\t"
+    "addiu      %[src],      %[src],            4          \n\t"
+    "balign     %[temp0],    %[temp0],          1          \n\t"
+    "usw        %[temp0],    0(%[dst])                     \n\t"
+    "bne        %[src],      %[p_loop2_end],    1b         \n\t"
+    " addiu     %[dst],      %[dst],            4          \n\t"
+  "2:                                                      \n\t"
+    ".set       pop                                        \n\t"
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [temp3]"=&r"(temp3), [dst]"+&r"(dst), [src]"+&r"(src)
+    : [p_loop1_end]"r"(p_loop1_end), [p_loop2_end]"r"(p_loop2_end)
+    : "memory"
+  );
+}
+
+static void ConvertBGRAToRGBA4444_MIPSdspR2(const uint32_t* src,
+                                            int num_pixels, uint8_t* dst) {
+  int temp0, temp1, temp2, temp3, temp4, temp5;
+  const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
+  const uint32_t* const p_loop2_end = src + num_pixels;
+  __asm__ volatile (
+    ".set           push                                       \n\t"
+    ".set           noreorder                                  \n\t"
+    "beq            %[src],      %[p_loop1_end],    3f         \n\t"
+    " nop                                                      \n\t"
+  "0:                                                          \n\t"
+    "lw             %[temp0],    0(%[src])                     \n\t"
+    "lw             %[temp1],    4(%[src])                     \n\t"
+    "lw             %[temp2],    8(%[src])                     \n\t"
+    "lw             %[temp3],    12(%[src])                    \n\t"
+    "ext            %[temp4],    %[temp0],          28,   4    \n\t"
+    "ext            %[temp5],    %[temp0],          12,   4    \n\t"
+    "ins            %[temp0],    %[temp4],          0,    4    \n\t"
+    "ext            %[temp4],    %[temp1],          28,   4    \n\t"
+    "ins            %[temp0],    %[temp5],          16,   4    \n\t"
+    "ext            %[temp5],    %[temp1],          12,   4    \n\t"
+    "ins            %[temp1],    %[temp4],          0,    4    \n\t"
+    "ext            %[temp4],    %[temp2],          28,   4    \n\t"
+    "ins            %[temp1],    %[temp5],          16,   4    \n\t"
+    "ext            %[temp5],    %[temp2],          12,   4    \n\t"
+    "ins            %[temp2],    %[temp4],          0,    4    \n\t"
+    "ext            %[temp4],    %[temp3],          28,   4    \n\t"
+    "ins            %[temp2],    %[temp5],          16,   4    \n\t"
+    "ext            %[temp5],    %[temp3],          12,   4    \n\t"
+    "ins            %[temp3],    %[temp4],          0,    4    \n\t"
+    "precr.qb.ph    %[temp1],    %[temp1],          %[temp0]   \n\t"
+    "ins            %[temp3],    %[temp5],          16,   4    \n\t"
+    "addiu          %[src],      %[src],            16         \n\t"
+    "precr.qb.ph    %[temp3],    %[temp3],          %[temp2]   \n\t"
+#if (WEBP_SWAP_16BIT_CSP == 1)
+    "usw            %[temp1],    0(%[dst])                     \n\t"
+    "usw            %[temp3],    4(%[dst])                     \n\t"
+#else
+    "wsbh           %[temp1],    %[temp1]                      \n\t"
+    "wsbh           %[temp3],    %[temp3]                      \n\t"
+    "usw            %[temp1],    0(%[dst])                     \n\t"
+    "usw            %[temp3],    4(%[dst])                     \n\t"
+#endif
+    "bne            %[src],      %[p_loop1_end],    0b         \n\t"
+    " addiu         %[dst],      %[dst],            8          \n\t"
+  "3:                                                          \n\t"
+    "beq            %[src],      %[p_loop2_end],    2f         \n\t"
+    " nop                                                      \n\t"
+  "1:                                                          \n\t"
+    "lw             %[temp0],    0(%[src])                     \n\t"
+    "ext            %[temp4],    %[temp0],          28,   4    \n\t"
+    "ext            %[temp5],    %[temp0],          12,   4    \n\t"
+    "ins            %[temp0],    %[temp4],          0,    4    \n\t"
+    "ins            %[temp0],    %[temp5],          16,   4    \n\t"
+    "addiu          %[src],      %[src],            4          \n\t"
+    "precr.qb.ph    %[temp0],    %[temp0],          %[temp0]   \n\t"
+#if (WEBP_SWAP_16BIT_CSP == 1)
+    "ush            %[temp0],    0(%[dst])                     \n\t"
+#else
+    "wsbh           %[temp0],    %[temp0]                      \n\t"
+    "ush            %[temp0],    0(%[dst])                     \n\t"
+#endif
+    "bne            %[src],      %[p_loop2_end],    1b         \n\t"
+    " addiu         %[dst],      %[dst],            2          \n\t"
+  "2:                                                          \n\t"
+    ".set           pop                                        \n\t"
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+      [dst]"+&r"(dst), [src]"+&r"(src)
+    : [p_loop1_end]"r"(p_loop1_end), [p_loop2_end]"r"(p_loop2_end)
+    : "memory"
+  );
+}
+
+static void ConvertBGRAToRGB565_MIPSdspR2(const uint32_t* src,
+                                          int num_pixels, uint8_t* dst) {
+  int temp0, temp1, temp2, temp3, temp4, temp5;
+  const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
+  const uint32_t* const p_loop2_end = src + num_pixels;
+  __asm__ volatile (
+    ".set           push                                       \n\t"
+    ".set           noreorder                                  \n\t"
+    "beq            %[src],      %[p_loop1_end],    3f         \n\t"
+    " nop                                                      \n\t"
+  "0:                                                          \n\t"
+    "lw             %[temp0],    0(%[src])                     \n\t"
+    "lw             %[temp1],    4(%[src])                     \n\t"
+    "lw             %[temp2],    8(%[src])                     \n\t"
+    "lw             %[temp3],    12(%[src])                    \n\t"
+    "ext            %[temp4],    %[temp0],          8,    16   \n\t"
+    "ext            %[temp5],    %[temp0],          5,    11   \n\t"
+    "ext            %[temp0],    %[temp0],          3,    5    \n\t"
+    "ins            %[temp4],    %[temp5],          0,    11   \n\t"
+    "ext            %[temp5],    %[temp1],          5,    11   \n\t"
+    "ins            %[temp4],    %[temp0],          0,    5    \n\t"
+    "ext            %[temp0],    %[temp1],          8,    16   \n\t"
+    "ext            %[temp1],    %[temp1],          3,    5    \n\t"
+    "ins            %[temp0],    %[temp5],          0,    11   \n\t"
+    "ext            %[temp5],    %[temp2],          5,    11   \n\t"
+    "ins            %[temp0],    %[temp1],          0,    5    \n\t"
+    "ext            %[temp1],    %[temp2],          8,    16   \n\t"
+    "ext            %[temp2],    %[temp2],          3,    5    \n\t"
+    "ins            %[temp1],    %[temp5],          0,    11   \n\t"
+    "ext            %[temp5],    %[temp3],          5,    11   \n\t"
+    "ins            %[temp1],    %[temp2],          0,    5    \n\t"
+    "ext            %[temp2],    %[temp3],          8,    16   \n\t"
+    "ext            %[temp3],    %[temp3],          3,    5    \n\t"
+    "ins            %[temp2],    %[temp5],          0,    11   \n\t"
+    "append         %[temp0],    %[temp4],          16         \n\t"
+    "ins            %[temp2],    %[temp3],          0,    5    \n\t"
+    "addiu          %[src],      %[src],            16         \n\t"
+    "append         %[temp2],    %[temp1],          16         \n\t"
+#if (WEBP_SWAP_16BIT_CSP == 1)
+    "usw            %[temp0],    0(%[dst])                     \n\t"
+    "usw            %[temp2],    4(%[dst])                     \n\t"
+#else
+    "wsbh           %[temp0],    %[temp0]                      \n\t"
+    "wsbh           %[temp2],    %[temp2]                      \n\t"
+    "usw            %[temp0],    0(%[dst])                     \n\t"
+    "usw            %[temp2],    4(%[dst])                     \n\t"
+#endif
+    "bne            %[src],      %[p_loop1_end],    0b         \n\t"
+    " addiu         %[dst],      %[dst],            8          \n\t"
+  "3:                                                          \n\t"
+    "beq            %[src],      %[p_loop2_end],    2f         \n\t"
+    " nop                                                      \n\t"
+  "1:                                                          \n\t"
+    "lw             %[temp0],    0(%[src])                     \n\t"
+    "ext            %[temp4],    %[temp0],          8,    16   \n\t"
+    "ext            %[temp5],    %[temp0],          5,    11   \n\t"
+    "ext            %[temp0],    %[temp0],          3,    5    \n\t"
+    "ins            %[temp4],    %[temp5],          0,    11   \n\t"
+    "addiu          %[src],      %[src],            4          \n\t"
+    "ins            %[temp4],    %[temp0],          0,    5    \n\t"
+#if (WEBP_SWAP_16BIT_CSP == 1)
+    "ush            %[temp4],    0(%[dst])                     \n\t"
+#else
+    "wsbh           %[temp4],    %[temp4]                      \n\t"
+    "ush            %[temp4],    0(%[dst])                     \n\t"
+#endif
+    "bne            %[src],      %[p_loop2_end],    1b         \n\t"
+    " addiu         %[dst],      %[dst],            2          \n\t"
+  "2:                                                          \n\t"
+    ".set           pop                                        \n\t"
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+      [dst]"+&r"(dst), [src]"+&r"(src)
+    : [p_loop1_end]"r"(p_loop1_end), [p_loop2_end]"r"(p_loop2_end)
+    : "memory"
+  );
+}
+
+static void ConvertBGRAToBGR_MIPSdspR2(const uint32_t* src,
+                                       int num_pixels, uint8_t* dst) {
+  int temp0, temp1, temp2, temp3;
+  const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
+  const uint32_t* const p_loop2_end = src + num_pixels;
+  __asm__ volatile (
+    ".set       push                                         \n\t"
+    ".set       noreorder                                    \n\t"
+    "beq        %[src],      %[p_loop1_end],    3f           \n\t"
+    " nop                                                    \n\t"
+  "0:                                                        \n\t"
+    "lw         %[temp0],    0(%[src])                       \n\t"
+    "lw         %[temp1],    4(%[src])                       \n\t"
+    "lw         %[temp2],    8(%[src])                       \n\t"
+    "lw         %[temp3],    12(%[src])                      \n\t"
+    "ins        %[temp0],    %[temp1],          24,    8     \n\t"
+    "sra        %[temp1],    %[temp1],          8            \n\t"
+    "ins        %[temp1],    %[temp2],          16,    16    \n\t"
+    "sll        %[temp2],    %[temp2],          8            \n\t"
+    "balign     %[temp3],    %[temp2],          1            \n\t"
+    "addiu      %[src],      %[src],            16           \n\t"
+    "usw        %[temp0],    0(%[dst])                       \n\t"
+    "usw        %[temp1],    4(%[dst])                       \n\t"
+    "usw        %[temp3],    8(%[dst])                       \n\t"
+    "bne        %[src],      %[p_loop1_end],    0b           \n\t"
+    " addiu     %[dst],      %[dst],            12           \n\t"
+  "3:                                                        \n\t"
+    "beq        %[src],      %[p_loop2_end],    2f           \n\t"
+    " nop                                                    \n\t"
+  "1:                                                        \n\t"
+    "lw         %[temp0],    0(%[src])                       \n\t"
+    "addiu      %[src],      %[src],            4            \n\t"
+    "addiu      %[dst],      %[dst],            3            \n\t"
+    "ush        %[temp0],    -3(%[dst])                      \n\t"
+    "sra        %[temp0],    %[temp0],          16           \n\t"
+    "bne        %[src],      %[p_loop2_end],    1b           \n\t"
+    " sb        %[temp0],    -1(%[dst])                      \n\t"
+  "2:                                                        \n\t"
+    ".set       pop                                          \n\t"
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [temp3]"=&r"(temp3), [dst]"+&r"(dst), [src]"+&r"(src)
+    : [p_loop1_end]"r"(p_loop1_end), [p_loop2_end]"r"(p_loop2_end)
+    : "memory"
+  );
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8LDspInitMIPSdspR2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitMIPSdspR2(void) {
+  VP8LMapColor32b = MapARGB_MIPSdspR2;
+  VP8LMapColor8b = MapAlpha_MIPSdspR2;
+
+  VP8LPredictors[5] = Predictor5_MIPSdspR2;
+  VP8LPredictors[6] = Predictor6_MIPSdspR2;
+  VP8LPredictors[7] = Predictor7_MIPSdspR2;
+  VP8LPredictors[8] = Predictor8_MIPSdspR2;
+  VP8LPredictors[9] = Predictor9_MIPSdspR2;
+  VP8LPredictors[10] = Predictor10_MIPSdspR2;
+  VP8LPredictors[11] = Predictor11_MIPSdspR2;
+  VP8LPredictors[12] = Predictor12_MIPSdspR2;
+  VP8LPredictors[13] = Predictor13_MIPSdspR2;
+
+  VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed_MIPSdspR2;
+  VP8LTransformColorInverse = TransformColorInverse_MIPSdspR2;
+
+  VP8LConvertBGRAToRGB = ConvertBGRAToRGB_MIPSdspR2;
+  VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA_MIPSdspR2;
+  VP8LConvertBGRAToRGBA4444 = ConvertBGRAToRGBA4444_MIPSdspR2;
+  VP8LConvertBGRAToRGB565 = ConvertBGRAToRGB565_MIPSdspR2;
+  VP8LConvertBGRAToBGR = ConvertBGRAToBGR_MIPSdspR2;
+}
+
+#else  // !WEBP_USE_MIPS_DSP_R2
+
+WEBP_DSP_INIT_STUB(VP8LDspInitMIPSdspR2)
+
+#endif  // WEBP_USE_MIPS_DSP_R2
diff --git a/media/libwebp/dsp/lossless_msa.c b/media/libwebp/dsp/lossless_msa.c
new file mode 100644
index 0000000000..16256ab57f
--- /dev/null
+++ b/media/libwebp/dsp/lossless_msa.c
@@ -0,0 +1,356 @@
+// Copyright 2016 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MSA variant of methods for lossless decoder
+//
+// Author: Prashant Patil (prashant.patil@imgtec.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_MSA)
+
+#include "../dsp/lossless.h"
+#include "../dsp/msa_macro.h"
+
+//------------------------------------------------------------------------------
+// Colorspace conversion functions
+
+#define CONVERT16_BGRA_XXX(psrc, pdst, m0, m1, m2) do {    \
+  v16u8 src0, src1, src2, src3, dst0, dst1, dst2;          \
+  LD_UB4(psrc, 16, src0, src1, src2, src3);                \
+  VSHF_B2_UB(src0, src1, src1, src2, m0, m1, dst0, dst1);  \
+  dst2 = VSHF_UB(src2, src3, m2);                          \
+  ST_UB2(dst0, dst1, pdst, 16);                            \
+  ST_UB(dst2, pdst + 32);                                  \
+} while (0)
+
+#define CONVERT12_BGRA_XXX(psrc, pdst, m0, m1, m2) do {    \
+  uint32_t pix_w;                                          \
+  v16u8 src0, src1, src2, dst0, dst1, dst2;                \
+  LD_UB3(psrc, 16, src0, src1, src2);                      \
+  VSHF_B2_UB(src0, src1, src1, src2, m0, m1, dst0, dst1);  \
+  dst2 = VSHF_UB(src2, src2, m2);                          \
+  ST_UB2(dst0, dst1, pdst, 16);                            \
+  pix_w = __msa_copy_s_w((v4i32)dst2, 0);                  \
+  SW(pix_w, pdst + 32);                                    \
+} while (0)
+
+#define CONVERT8_BGRA_XXX(psrc, pdst, m0, m1) do {         \
+  uint64_t pix_d;                                          \
+  v16u8 src0, src1, src2 = { 0 }, dst0, dst1;              \
+  LD_UB2(psrc, 16, src0, src1);                            \
+  VSHF_B2_UB(src0, src1, src1, src2, m0, m1, dst0, dst1);  \
+  ST_UB(dst0, pdst);                                       \
+  pix_d = __msa_copy_s_d((v2i64)dst1, 0);                  \
+  SD(pix_d, pdst + 16);                                    \
+} while (0)
+
+#define CONVERT4_BGRA_XXX(psrc, pdst, m) do {       \
+  const v16u8 src0 = LD_UB(psrc);                   \
+  const v16u8 dst0 = VSHF_UB(src0, src0, m);        \
+  uint64_t pix_d = __msa_copy_s_d((v2i64)dst0, 0);  \
+  uint32_t pix_w = __msa_copy_s_w((v4i32)dst0, 2);  \
+  SD(pix_d, pdst + 0);                              \
+  SW(pix_w, pdst + 8);                              \
+} while (0)
+
+#define CONVERT1_BGRA_BGR(psrc, pdst) do {  \
+  const int32_t b = (psrc)[0];              \
+  const int32_t g = (psrc)[1];              \
+  const int32_t r = (psrc)[2];              \
+  (pdst)[0] = b;                            \
+  (pdst)[1] = g;                            \
+  (pdst)[2] = r;                            \
+} while (0)
+
+#define CONVERT1_BGRA_RGB(psrc, pdst) do {  \
+  const int32_t b = (psrc)[0];              \
+  const int32_t g = (psrc)[1];              \
+  const int32_t r = (psrc)[2];              \
+  (pdst)[0] = r;                            \
+  (pdst)[1] = g;                            \
+  (pdst)[2] = b;                            \
+} while (0)
+
+#define TRANSFORM_COLOR_INVERSE_8(src0, src1, dst0, dst1,     \
+                                  c0, c1, mask0, mask1) do {  \
+  v8i16 g0, g1, t0, t1, t2, t3;                               \
+  v4i32 t4, t5;                                               \
+  VSHF_B2_SH(src0, src0, src1, src1, mask0, mask0, g0, g1);   \
+  DOTP_SB2_SH(g0, g1, c0, c0, t0, t1);                        \
+  SRAI_H2_SH(t0, t1, 5);                                      \
+  t0 = __msa_addv_h(t0, (v8i16)src0);                         \
+  t1 = __msa_addv_h(t1, (v8i16)src1);                         \
+  t4 = __msa_srli_w((v4i32)t0, 16);                           \
+  t5 = __msa_srli_w((v4i32)t1, 16);                           \
+  DOTP_SB2_SH(t4, t5, c1, c1, t2, t3);                        \
+  SRAI_H2_SH(t2, t3, 5);                                      \
+  ADD2(t0, t2, t1, t3, t0, t1);                               \
+  VSHF_B2_UB(src0, t0, src1, t1, mask1, mask1, dst0, dst1);   \
+} while (0)
+
+#define TRANSFORM_COLOR_INVERSE_4(src, dst, c0, c1, mask0, mask1) do {  \
+  const v16i8 g0 = VSHF_SB(src, src, mask0);                            \
+  v8i16 t0 = __msa_dotp_s_h(c0, g0);                                    \
+  v8i16 t1;                                                             \
+  v4i32 t2;                                                             \
+  t0 = SRAI_H(t0, 5);                                                   \
+  t0 = __msa_addv_h(t0, (v8i16)src);                                    \
+  t2 = __msa_srli_w((v4i32)t0, 16);                                     \
+  t1 = __msa_dotp_s_h(c1, (v16i8)t2);                                   \
+  t1 = SRAI_H(t1, 5);                                                   \
+  t0 = t0 + t1;                                                         \
+  dst = VSHF_UB(src, t0, mask1);                                        \
+} while (0)
+
+static void ConvertBGRAToRGBA_MSA(const uint32_t* src,
+                                  int num_pixels, uint8_t* dst) {
+  int i;
+  const uint8_t* ptemp_src = (const uint8_t*)src;
+  uint8_t* ptemp_dst = (uint8_t*)dst;
+  v16u8 src0, dst0;
+  const v16u8 mask = { 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15 };
+
+  while (num_pixels >= 8) {
+    v16u8 src1, dst1;
+    LD_UB2(ptemp_src, 16, src0, src1);
+    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, dst0, dst1);
+    ST_UB2(dst0, dst1, ptemp_dst, 16);
+    ptemp_src += 32;
+    ptemp_dst += 32;
+    num_pixels -= 8;
+  }
+  if (num_pixels > 0) {
+    if (num_pixels >= 4) {
+      src0 = LD_UB(ptemp_src);
+      dst0 = VSHF_UB(src0, src0, mask);
+      ST_UB(dst0, ptemp_dst);
+      ptemp_src += 16;
+      ptemp_dst += 16;
+      num_pixels -= 4;
+    }
+    for (i = 0; i < num_pixels; i++) {
+      const uint8_t b = ptemp_src[2];
+      const uint8_t g = ptemp_src[1];
+      const uint8_t r = ptemp_src[0];
+      const uint8_t a = ptemp_src[3];
+      ptemp_dst[0] = b;
+      ptemp_dst[1] = g;
+      ptemp_dst[2] = r;
+      ptemp_dst[3] = a;
+      ptemp_src += 4;
+      ptemp_dst += 4;
+    }
+  }
+}
+
+static void ConvertBGRAToBGR_MSA(const uint32_t* src,
+                                 int num_pixels, uint8_t* dst) {
+  const uint8_t* ptemp_src = (const uint8_t*)src;
+  uint8_t* ptemp_dst = (uint8_t*)dst;
+  const v16u8 mask0 = { 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14,
+                        16, 17, 18, 20 };
+  const v16u8 mask1 = { 5, 6, 8, 9, 10, 12, 13, 14, 16, 17, 18, 20,
+                        21, 22, 24, 25 };
+  const v16u8 mask2 = { 10, 12, 13, 14, 16, 17, 18, 20, 21, 22, 24, 25,
+                        26, 28, 29, 30 };
+
+  while (num_pixels >= 16) {
+    CONVERT16_BGRA_XXX(ptemp_src, ptemp_dst, mask0, mask1, mask2);
+    ptemp_src += 64;
+    ptemp_dst += 48;
+    num_pixels -= 16;
+  }
+  if (num_pixels > 0) {
+    if (num_pixels >= 12) {
+      CONVERT12_BGRA_XXX(ptemp_src, ptemp_dst, mask0, mask1, mask2);
+      ptemp_src += 48;
+      ptemp_dst += 36;
+      num_pixels -= 12;
+    } else if (num_pixels >= 8) {
+      CONVERT8_BGRA_XXX(ptemp_src, ptemp_dst, mask0, mask1);
+      ptemp_src += 32;
+      ptemp_dst += 24;
+      num_pixels -= 8;
+    } else if (num_pixels >= 4) {
+      CONVERT4_BGRA_XXX(ptemp_src, ptemp_dst, mask0);
+      ptemp_src += 16;
+      ptemp_dst += 12;
+      num_pixels -= 4;
+    }
+    if (num_pixels == 3) {
+      CONVERT1_BGRA_BGR(ptemp_src + 0, ptemp_dst + 0);
+      CONVERT1_BGRA_BGR(ptemp_src + 4, ptemp_dst + 3);
+      CONVERT1_BGRA_BGR(ptemp_src + 8, ptemp_dst + 6);
+    } else if (num_pixels == 2) {
+      CONVERT1_BGRA_BGR(ptemp_src + 0, ptemp_dst + 0);
+      CONVERT1_BGRA_BGR(ptemp_src + 4, ptemp_dst + 3);
+    } else if (num_pixels == 1) {
+      CONVERT1_BGRA_BGR(ptemp_src, ptemp_dst);
+    }
+  }
+}
+
+static void ConvertBGRAToRGB_MSA(const uint32_t* src,
+                                 int num_pixels, uint8_t* dst) {
+  const uint8_t* ptemp_src = (const uint8_t*)src;
+  uint8_t* ptemp_dst = (uint8_t*)dst;
+  const v16u8 mask0 = { 2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12,
+                        18, 17, 16, 22 };
+  const v16u8 mask1 = { 5, 4, 10, 9, 8, 14, 13, 12, 18, 17, 16, 22,
+                        21, 20, 26, 25 };
+  const v16u8 mask2 = { 8, 14, 13, 12, 18, 17, 16, 22, 21, 20, 26, 25,
+                        24, 30, 29, 28 };
+
+  while (num_pixels >= 16) {
+    CONVERT16_BGRA_XXX(ptemp_src, ptemp_dst, mask0, mask1, mask2);
+    ptemp_src += 64;
+    ptemp_dst += 48;
+    num_pixels -= 16;
+  }
+  if (num_pixels) {
+    if (num_pixels >= 12) {
+      CONVERT12_BGRA_XXX(ptemp_src, ptemp_dst, mask0, mask1, mask2);
+      ptemp_src += 48;
+      ptemp_dst += 36;
+      num_pixels -= 12;
+    } else if (num_pixels >= 8) {
+      CONVERT8_BGRA_XXX(ptemp_src, ptemp_dst, mask0, mask1);
+      ptemp_src += 32;
+      ptemp_dst += 24;
+      num_pixels -= 8;
+    } else if (num_pixels >= 4) {
+      CONVERT4_BGRA_XXX(ptemp_src, ptemp_dst, mask0);
+      ptemp_src += 16;
+      ptemp_dst += 12;
+      num_pixels -= 4;
+    }
+    if (num_pixels == 3) {
+      CONVERT1_BGRA_RGB(ptemp_src + 0, ptemp_dst + 0);
+      CONVERT1_BGRA_RGB(ptemp_src + 4, ptemp_dst + 3);
+      CONVERT1_BGRA_RGB(ptemp_src + 8, ptemp_dst + 6);
+    } else if (num_pixels == 2) {
+      CONVERT1_BGRA_RGB(ptemp_src + 0, ptemp_dst + 0);
+      CONVERT1_BGRA_RGB(ptemp_src + 4, ptemp_dst + 3);
+    } else if (num_pixels == 1) {
+      CONVERT1_BGRA_RGB(ptemp_src, ptemp_dst);
+    }
+  }
+}
+
+static void AddGreenToBlueAndRed_MSA(const uint32_t* const src, int num_pixels,
+                                     uint32_t* dst) {
+  int i;
+  const uint8_t* in = (const uint8_t*)src;
+  uint8_t* out = (uint8_t*)dst;
+  v16u8 src0, dst0, tmp0;
+  const v16u8 mask = { 1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255,
+                       13, 255, 13, 255 };
+
+  while (num_pixels >= 8) {
+    v16u8 src1, dst1, tmp1;
+    LD_UB2(in, 16, src0, src1);
+    VSHF_B2_UB(src0, src1, src1, src0, mask, mask, tmp0, tmp1);
+    ADD2(src0, tmp0, src1, tmp1, dst0, dst1);
+    ST_UB2(dst0, dst1, out, 16);
+    in += 32;
+    out += 32;
+    num_pixels -= 8;
+  }
+  if (num_pixels > 0) {
+    if (num_pixels >= 4) {
+      src0 = LD_UB(in);
+      tmp0 = VSHF_UB(src0, src0, mask);
+      dst0 = src0 + tmp0;
+      ST_UB(dst0, out);
+      in += 16;
+      out += 16;
+      num_pixels -= 4;
+    }
+    for (i = 0; i < num_pixels; i++) {
+      const uint8_t b = in[0];
+      const uint8_t g = in[1];
+      const uint8_t r = in[2];
+      out[0] = (b + g) & 0xff;
+      out[1] = g;
+      out[2] = (r + g) & 0xff;
+      out[4] = in[4];
+      out += 4;
+    }
+  }
+}
+
+static void TransformColorInverse_MSA(const VP8LMultipliers* const m,
+                                      const uint32_t* src, int num_pixels,
+                                      uint32_t* dst) {
+  v16u8 src0, dst0;
+  const v16i8 g2br = (v16i8)__msa_fill_w(m->green_to_blue_ |
+                                         (m->green_to_red_ << 16));
+  const v16i8 r2b = (v16i8)__msa_fill_w(m->red_to_blue_);
+  const v16u8 mask0 = { 1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255,
+                        13, 255, 13, 255 };
+  const v16u8 mask1 = { 16, 1, 18, 3, 20, 5, 22, 7, 24, 9, 26, 11,
+                        28, 13, 30, 15 };
+
+  while (num_pixels >= 8) {
+    v16u8 src1, dst1;
+    LD_UB2(src, 4, src0, src1);
+    TRANSFORM_COLOR_INVERSE_8(src0, src1, dst0, dst1, g2br, r2b, mask0, mask1);
+    ST_UB2(dst0, dst1, dst, 4);
+    src += 8;
+    dst += 8;
+    num_pixels -= 8;
+  }
+  if (num_pixels > 0) {
+    if (num_pixels >= 4) {
+      src0 = LD_UB(src);
+      TRANSFORM_COLOR_INVERSE_4(src0, dst0, g2br, r2b, mask0, mask1);
+      ST_UB(dst0, dst);
+      src += 4;
+      dst += 4;
+      num_pixels -= 4;
+    }
+    if (num_pixels > 0) {
+      src0 = LD_UB(src);
+      TRANSFORM_COLOR_INVERSE_4(src0, dst0, g2br, r2b, mask0, mask1);
+      if (num_pixels == 3) {
+        const uint64_t pix_d = __msa_copy_s_d((v2i64)dst0, 0);
+        const uint32_t pix_w = __msa_copy_s_w((v4i32)dst0, 2);
+        SD(pix_d, dst + 0);
+        SW(pix_w, dst + 2);
+      } else if (num_pixels == 2) {
+        const uint64_t pix_d = __msa_copy_s_d((v2i64)dst0, 0);
+        SD(pix_d, dst);
+      } else {
+        const uint32_t pix_w = __msa_copy_s_w((v4i32)dst0, 0);
+        SW(pix_w, dst);
+      }
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8LDspInitMSA(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitMSA(void) {
+  VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA_MSA;
+  VP8LConvertBGRAToBGR = ConvertBGRAToBGR_MSA;
+  VP8LConvertBGRAToRGB = ConvertBGRAToRGB_MSA;
+
+  VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed_MSA;
+  VP8LTransformColorInverse = TransformColorInverse_MSA;
+}
+
+#else  // !WEBP_USE_MSA
+
+WEBP_DSP_INIT_STUB(VP8LDspInitMSA)
+
+#endif  // WEBP_USE_MSA
diff --git a/media/libwebp/dsp/lossless_neon.c b/media/libwebp/dsp/lossless_neon.c
index a7bf47f3c4..2122e46f7a 100644
--- a/media/libwebp/dsp/lossless_neon.c
+++ b/media/libwebp/dsp/lossless_neon.c
@@ -188,17 +188,21 @@ static WEBP_INLINE uint32_t Average3_NEON(uint32_t a0, uint32_t a1,
   return avg;
 }
 
-static uint32_t Predictor5_NEON(uint32_t left, const uint32_t* const top) {
-  return Average3_NEON(left, top[0], top[1]);
+static uint32_t Predictor5_NEON(const uint32_t* const left,
+                                const uint32_t* const top) {
+  return Average3_NEON(*left, top[0], top[1]);
 }
-static uint32_t Predictor6_NEON(uint32_t left, const uint32_t* const top) {
-  return Average2_NEON(left, top[-1]);
+static uint32_t Predictor6_NEON(const uint32_t* const left,
+                                const uint32_t* const top) {
+  return Average2_NEON(*left, top[-1]);
 }
-static uint32_t Predictor7_NEON(uint32_t left, const uint32_t* const top) {
-  return Average2_NEON(left, top[0]);
+static uint32_t Predictor7_NEON(const uint32_t* const left,
+                                const uint32_t* const top) {
+  return Average2_NEON(*left, top[0]);
 }
-static uint32_t Predictor13_NEON(uint32_t left, const uint32_t* const top) {
-  return ClampedAddSubtractHalf_NEON(left, top[0], top[-1]);
+static uint32_t Predictor13_NEON(const uint32_t* const left,
+                                 const uint32_t* const top) {
+  return ClampedAddSubtractHalf_NEON(*left, top[0], top[-1]);
 }
 
 // Batch versions of those functions.
diff --git a/media/libwebp/dsp/lossless_sse2.c b/media/libwebp/dsp/lossless_sse2.c
index c40fcfb769..03796493de 100644
--- a/media/libwebp/dsp/lossless_sse2.c
+++ b/media/libwebp/dsp/lossless_sse2.c
@@ -18,7 +18,6 @@
 #include "../dsp/common_sse2.h"
 #include "../dsp/lossless.h"
 #include "../dsp/lossless_common.h"
-#include <assert.h>
 #include <emmintrin.h>
 
 //------------------------------------------------------------------------------
@@ -139,42 +138,51 @@ static WEBP_INLINE uint32_t Average4_SSE2(uint32_t a0, uint32_t a1,
   return output;
 }
 
-static uint32_t Predictor5_SSE2(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Average3_SSE2(left, top[0], top[1]);
+static uint32_t Predictor5_SSE2(const uint32_t* const left,
+                                const uint32_t* const top) {
+  const uint32_t pred = Average3_SSE2(*left, top[0], top[1]);
   return pred;
 }
-static uint32_t Predictor6_SSE2(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Average2_SSE2(left, top[-1]);
+static uint32_t Predictor6_SSE2(const uint32_t* const left,
+                                const uint32_t* const top) {
+  const uint32_t pred = Average2_SSE2(*left, top[-1]);
   return pred;
 }
-static uint32_t Predictor7_SSE2(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Average2_SSE2(left, top[0]);
+static uint32_t Predictor7_SSE2(const uint32_t* const left,
+                                const uint32_t* const top) {
+  const uint32_t pred = Average2_SSE2(*left, top[0]);
   return pred;
 }
-static uint32_t Predictor8_SSE2(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor8_SSE2(const uint32_t* const left,
+                                const uint32_t* const top) {
   const uint32_t pred = Average2_SSE2(top[-1], top[0]);
   (void)left;
   return pred;
 }
-static uint32_t Predictor9_SSE2(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor9_SSE2(const uint32_t* const left,
+                                const uint32_t* const top) {
   const uint32_t pred = Average2_SSE2(top[0], top[1]);
   (void)left;
   return pred;
 }
-static uint32_t Predictor10_SSE2(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Average4_SSE2(left, top[-1], top[0], top[1]);
+static uint32_t Predictor10_SSE2(const uint32_t* const left,
+                                 const uint32_t* const top) {
+  const uint32_t pred = Average4_SSE2(*left, top[-1], top[0], top[1]);
   return pred;
 }
-static uint32_t Predictor11_SSE2(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Select_SSE2(top[0], left, top[-1]);
+static uint32_t Predictor11_SSE2(const uint32_t* const left,
+                                 const uint32_t* const top) {
+  const uint32_t pred = Select_SSE2(top[0], *left, top[-1]);
   return pred;
 }
-static uint32_t Predictor12_SSE2(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = ClampedAddSubtractFull_SSE2(left, top[0], top[-1]);
+static uint32_t Predictor12_SSE2(const uint32_t* const left,
+                                 const uint32_t* const top) {
+  const uint32_t pred = ClampedAddSubtractFull_SSE2(*left, top[0], top[-1]);
   return pred;
 }
-static uint32_t Predictor13_SSE2(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = ClampedAddSubtractHalf_SSE2(left, top[0], top[-1]);
+static uint32_t Predictor13_SSE2(const uint32_t* const left,
+                                 const uint32_t* const top) {
+  const uint32_t pred = ClampedAddSubtractHalf_SSE2(*left, top[0], top[-1]);
   return pred;
 }
 
@@ -191,8 +199,9 @@ static void PredictorAdd0_SSE2(const uint32_t* in, const uint32_t* upper,
     _mm_storeu_si128((__m128i*)&out[i], res);
   }
   if (i != num_pixels) {
-    VP8LPredictorsAdd_C[0](in + i, upper + i, num_pixels - i, out + i);
+    VP8LPredictorsAdd_C[0](in + i, NULL, num_pixels - i, out + i);
   }
+  (void)upper;
 }
 
 // Predictor1: left.
diff --git a/media/libwebp/dsp/lossless_sse41.c b/media/libwebp/dsp/lossless_sse41.c
new file mode 100644
index 0000000000..3308ac31ee
--- /dev/null
+++ b/media/libwebp/dsp/lossless_sse41.c
@@ -0,0 +1,132 @@
+// Copyright 2021 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// SSE41 variant of methods for lossless decoder
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_SSE41)
+
+#include "../dsp/common_sse41.h"
+#include "../dsp/lossless.h"
+#include "../dsp/lossless_common.h"
+
+//------------------------------------------------------------------------------
+// Color-space conversion functions
+
+static void TransformColorInverse_SSE41(const VP8LMultipliers* const m,
+                                        const uint32_t* const src,
+                                        int num_pixels, uint32_t* dst) {
+// sign-extended multiplying constants, pre-shifted by 5.
+#define CST(X)  (((int16_t)(m->X << 8)) >> 5)   // sign-extend
+  const __m128i mults_rb = _mm_set1_epi32((uint32_t)CST(green_to_red_) << 16 |
+                                          (CST(green_to_blue_) & 0xffff));
+  const __m128i mults_b2 = _mm_set1_epi32(CST(red_to_blue_));
+#undef CST
+  const __m128i mask_ag = _mm_set1_epi32(0xff00ff00);
+  const __m128i perm1 = _mm_setr_epi8(-1, 1, -1, 1, -1, 5, -1, 5,
+                                      -1, 9, -1, 9, -1, 13, -1, 13);
+  const __m128i perm2 = _mm_setr_epi8(-1, 2, -1, -1, -1, 6, -1, -1,
+                                      -1, 10, -1, -1, -1, 14, -1, -1);
+  int i;
+  for (i = 0; i + 4 <= num_pixels; i += 4) {
+    const __m128i A = _mm_loadu_si128((const __m128i*)(src + i));
+    const __m128i B = _mm_shuffle_epi8(A, perm1); // argb -> g0g0
+    const __m128i C = _mm_mulhi_epi16(B, mults_rb);
+    const __m128i D = _mm_add_epi8(A, C);
+    const __m128i E = _mm_shuffle_epi8(D, perm2);
+    const __m128i F = _mm_mulhi_epi16(E, mults_b2);
+    const __m128i G = _mm_add_epi8(D, F);
+    const __m128i out = _mm_blendv_epi8(G, A, mask_ag);
+    _mm_storeu_si128((__m128i*)&dst[i], out);
+  }
+  // Fall-back to C-version for left-overs.
+  if (i != num_pixels) {
+    VP8LTransformColorInverse_C(m, src + i, num_pixels - i, dst + i);
+  }
+}
+
+//------------------------------------------------------------------------------
+
+#define ARGB_TO_RGB_SSE41 do {                        \
+  while (num_pixels >= 16) {                          \
+    const __m128i in0 = _mm_loadu_si128(in + 0);      \
+    const __m128i in1 = _mm_loadu_si128(in + 1);      \
+    const __m128i in2 = _mm_loadu_si128(in + 2);      \
+    const __m128i in3 = _mm_loadu_si128(in + 3);      \
+    const __m128i a0 = _mm_shuffle_epi8(in0, perm0);  \
+    const __m128i a1 = _mm_shuffle_epi8(in1, perm1);  \
+    const __m128i a2 = _mm_shuffle_epi8(in2, perm2);  \
+    const __m128i a3 = _mm_shuffle_epi8(in3, perm3);  \
+    const __m128i b0 = _mm_blend_epi16(a0, a1, 0xc0); \
+    const __m128i b1 = _mm_blend_epi16(a1, a2, 0xf0); \
+    const __m128i b2 = _mm_blend_epi16(a2, a3, 0xfc); \
+    _mm_storeu_si128(out + 0, b0);                    \
+    _mm_storeu_si128(out + 1, b1);                    \
+    _mm_storeu_si128(out + 2, b2);                    \
+    in += 4;                                          \
+    out += 3;                                         \
+    num_pixels -= 16;                                 \
+  }                                                   \
+} while (0)
+
+static void ConvertBGRAToRGB_SSE41(const uint32_t* src, int num_pixels,
+                                   uint8_t* dst) {
+  const __m128i* in = (const __m128i*)src;
+  __m128i* out = (__m128i*)dst;
+  const __m128i perm0 = _mm_setr_epi8(2, 1, 0, 6, 5, 4, 10, 9,
+                                      8, 14, 13, 12, -1, -1, -1, -1);
+  const __m128i perm1 = _mm_shuffle_epi32(perm0, 0x39);
+  const __m128i perm2 = _mm_shuffle_epi32(perm0, 0x4e);
+  const __m128i perm3 = _mm_shuffle_epi32(perm0, 0x93);
+
+  ARGB_TO_RGB_SSE41;
+
+  // left-overs
+  if (num_pixels > 0) {
+    VP8LConvertBGRAToRGB_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
+  }
+}
+
+static void ConvertBGRAToBGR_SSE41(const uint32_t* src,
+                                   int num_pixels, uint8_t* dst) {
+  const __m128i* in = (const __m128i*)src;
+  __m128i* out = (__m128i*)dst;
+  const __m128i perm0 = _mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 10,
+                                      12, 13, 14, -1, -1, -1, -1);
+  const __m128i perm1 = _mm_shuffle_epi32(perm0, 0x39);
+  const __m128i perm2 = _mm_shuffle_epi32(perm0, 0x4e);
+  const __m128i perm3 = _mm_shuffle_epi32(perm0, 0x93);
+
+  ARGB_TO_RGB_SSE41;
+
+  // left-overs
+  if (num_pixels > 0) {
+    VP8LConvertBGRAToBGR_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
+  }
+}
+
+#undef ARGB_TO_RGB_SSE41
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8LDspInitSSE41(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitSSE41(void) {
+  VP8LTransformColorInverse = TransformColorInverse_SSE41;
+  VP8LConvertBGRAToRGB = ConvertBGRAToRGB_SSE41;
+  VP8LConvertBGRAToBGR = ConvertBGRAToBGR_SSE41;
+}
+
+#else  // !WEBP_USE_SSE41
+
+WEBP_DSP_INIT_STUB(VP8LDspInitSSE41)
+
+#endif  // WEBP_USE_SSE41
diff --git a/media/libwebp/dsp/moz.build b/media/libwebp/dsp/moz.build
index c00d5e1401..6e2c9deb2c 100644
--- a/media/libwebp/dsp/moz.build
+++ b/media/libwebp/dsp/moz.build
@@ -1,4 +1,5 @@
 # -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*-
+# vim: set filetype=python:
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
@@ -8,53 +9,103 @@ with Files('**'):
 
 SOURCES += [
     'alpha_processing.c',
-    'alpha_processing_neon.c',
-    'alpha_processing_sse2.c',
-    'alpha_processing_sse41.c',
+    'cost.c',
     'dec.c',
     'dec_clip_tables.c',
-    'dec_neon.c',
-    'dec_sse2.c',
-    'dec_sse41.c',
+    'enc.c',
     'filters.c',
-    'filters_neon.c',
-    'filters_sse2.c',
     'lossless.c',
-    'lossless_neon.c',
-    'lossless_sse2.c',
+    'lossless_enc.c',
     'rescaler.c',
-    'rescaler_neon.c',
-    'rescaler_sse2.c',
+    'ssim.c',
     'upsampling.c',
-    'upsampling_neon.c',
-    'upsampling_sse2.c',
-    'upsampling_sse41.c',
     'yuv.c',
-    'yuv_neon.c',
-    'yuv_sse2.c',
-    'yuv_sse41.c',
 ]
 
 if CONFIG['CPU_ARCH'] == 'arm' and CONFIG['BUILD_ARM_NEON']:
-    SOURCES['alpha_processing_neon.c'].flags += CONFIG['NEON_FLAGS']
-    SOURCES['dec_neon.c'].flags += CONFIG['NEON_FLAGS']
-    SOURCES['filters_neon.c'].flags += CONFIG['NEON_FLAGS']
-    SOURCES['lossless_neon.c'].flags += CONFIG['NEON_FLAGS']
-    SOURCES['rescaler_neon.c'].flags += CONFIG['NEON_FLAGS']
-    SOURCES['upsampling_neon.c'].flags += CONFIG['NEON_FLAGS']
-    SOURCES['yuv_neon.c'].flags += CONFIG['NEON_FLAGS']
+    SOURCES += [
+        'alpha_processing_neon.c',
+        'cost_neon.c',
+        'dec_neon.c',
+        'enc_neon.c',
+        'filters_neon.c',
+        'lossless_enc_neon.c',
+        'lossless_neon.c',
+        'rescaler_neon.c',
+        'upsampling_neon.c',
+        'yuv_neon.c',
+    ]
+    DEFINES['WEBP_HAVE_NEON'] = 1;
+    for f in SOURCES:
+      if f.endswith('neon.c'):
+        SOURCES[f].flags += CONFIG['NEON_FLAGS']
+elif CONFIG['CPU_ARCH'] == 'aarch64':
+    SOURCES += [
+        'alpha_processing_neon.c',
+        'cost_neon.c',
+        'dec_neon.c',
+        'enc_neon.c',
+        'filters_neon.c',
+        'lossless_enc_neon.c',
+        'lossless_neon.c',
+        'rescaler_neon.c',
+        'upsampling_neon.c',
+        'yuv_neon.c',
+    ]
+    DEFINES['WEBP_HAVE_NEON'] = 1;
 elif CONFIG['INTEL_ARCHITECTURE']:
-    SOURCES['alpha_processing_sse2.c'].flags += CONFIG['SSE2_FLAGS']
-    SOURCES['alpha_processing_sse41.c'].flags += CONFIG['SSE2_FLAGS']
-    SOURCES['dec_sse2.c'].flags += CONFIG['SSE2_FLAGS']
-    SOURCES['dec_sse41.c'].flags += CONFIG['SSE2_FLAGS']
-    SOURCES['filters_sse2.c'].flags += CONFIG['SSE2_FLAGS']
-    SOURCES['lossless_sse2.c'].flags += CONFIG['SSE2_FLAGS']
-    SOURCES['rescaler_sse2.c'].flags += CONFIG['SSE2_FLAGS']
-    SOURCES['upsampling_sse2.c'].flags += CONFIG['SSE2_FLAGS']
-    SOURCES['upsampling_sse41.c'].flags += CONFIG['SSE2_FLAGS']
-    SOURCES['yuv_sse2.c'].flags += CONFIG['SSE2_FLAGS']
-    SOURCES['yuv_sse41.c'].flags += CONFIG['SSE2_FLAGS']
+    SOURCES += [
+        'alpha_processing_sse2.c',
+        'alpha_processing_sse41.c',
+        'cost_sse2.c',
+        'dec_sse2.c',
+        'dec_sse41.c',
+        'enc_sse2.c',
+        'enc_sse41.c',
+        'filters_sse2.c',
+        'lossless_enc_sse2.c',
+        'lossless_enc_sse41.c',
+        'lossless_sse2.c',
+        'lossless_sse41.c',
+        'rescaler_sse2.c',
+        'ssim_sse2.c',
+        'upsampling_sse2.c',
+        'upsampling_sse41.c',
+        'yuv_sse2.c',
+        'yuv_sse41.c',
+    ]
+    DEFINES['WEBP_HAVE_SSE2'] = 1;
+    DEFINES['WEBP_HAVE_SSE41'] = 1;
+    for f in SOURCES:
+      if f.endswith('sse2.c'):
+        SOURCES[f].flags += CONFIG['SSE2_FLAGS']
+      elif f.endswith('sse41.c'):
+        SOURCES[f].flags += ['-msse4.1']
+elif CONFIG['CPU_ARCH'].startswith('mips'):
+    SOURCES += [
+        'alpha_processing_mips_dsp_r2.c',
+        'cost_mips32.c',
+        'cost_mips_dsp_r2.c',
+        'dec_mips32.c',
+        'dec_mips_dsp_r2.c',
+        'enc_mips32.c',
+        'enc_mips_dsp_r2.c',
+        'filters_mips_dsp_r2.c',
+        'lossless_enc_mips32.c',
+        'lossless_enc_mips_dsp_r2.c',
+        'lossless_mips_dsp_r2.c',
+        'lossless_msa.c',
+        'rescaler_mips32.c',
+        'rescaler_mips_dsp_r2.c',
+        'rescaler_msa.c',
+        'upsampling_mips_dsp_r2.c',
+        'upsampling_msa.c',
+        'yuv_mips32.c',
+        'yuv_mips_dsp_r2.c',
+    ]
+
+if CONFIG['CC_TYPE'] in ('clang', 'clang-cl'):
+    CFLAGS += ['-Wno-unreachable-code']
 
 FINAL_LIBRARY = 'gkmedias'
 
diff --git a/media/libwebp/dsp/msa_macro.h b/media/libwebp/dsp/msa_macro.h
index de026a1d9e..717e3b7b9f 100644
--- a/media/libwebp/dsp/msa_macro.h
+++ b/media/libwebp/dsp/msa_macro.h
@@ -14,6 +14,10 @@
 #ifndef WEBP_DSP_MSA_MACRO_H_
 #define WEBP_DSP_MSA_MACRO_H_
 
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_MSA)
+
 #include <stdint.h>
 #include <msa.h>
 
@@ -1389,4 +1393,5 @@ static WEBP_INLINE uint32_t func_hadd_uh_u32(v8u16 in) {
 } while (0)
 #define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__)
 
+#endif  // WEBP_USE_MSA
 #endif  // WEBP_DSP_MSA_MACRO_H_
diff --git a/media/libwebp/dsp/neon.h b/media/libwebp/dsp/neon.h
index 63c27a2901..9a0f630de0 100644
--- a/media/libwebp/dsp/neon.h
+++ b/media/libwebp/dsp/neon.h
@@ -12,10 +12,12 @@
 #ifndef WEBP_DSP_NEON_H_
 #define WEBP_DSP_NEON_H_
 
-#include <arm_neon.h>
-
 #include "../dsp/dsp.h"
 
+#if defined(WEBP_USE_NEON)
+
+#include <arm_neon.h>
+
 // Right now, some intrinsics functions seem slower, so we disable them
 // everywhere except newer clang/gcc or aarch64 where the inline assembly is
 // incompatible.
@@ -98,4 +100,5 @@ static WEBP_INLINE int32x4x4_t Transpose4x4_NEON(const int32x4x4_t rows) {
 } while (0)
 #endif
 
+#endif  // WEBP_USE_NEON
 #endif  // WEBP_DSP_NEON_H_
diff --git a/media/libwebp/dsp/quant.h b/media/libwebp/dsp/quant.h
index b82e728a53..14d8613431 100644
--- a/media/libwebp/dsp/quant.h
+++ b/media/libwebp/dsp/quant.h
@@ -10,6 +10,8 @@
 #ifndef WEBP_DSP_QUANT_H_
 #define WEBP_DSP_QUANT_H_
 
+#include <string.h>
+
 #include "../dsp/dsp.h"
 #include "../webp/types.h"
 
@@ -67,4 +69,17 @@ static WEBP_INLINE int IsFlat(const int16_t* levels, int num_blocks,
 #endif  // defined(WEBP_USE_NEON) && !defined(WEBP_ANDROID_NEON) &&
         // !defined(WEBP_HAVE_NEON_RTCD)
 
+static WEBP_INLINE int IsFlatSource16(const uint8_t* src) {
+  const uint32_t v = src[0] * 0x01010101u;
+  int i;
+  for (i = 0; i < 16; ++i) {
+    if (memcmp(src + 0, &v, 4) || memcmp(src +  4, &v, 4) ||
+        memcmp(src + 8, &v, 4) || memcmp(src + 12, &v, 4)) {
+      return 0;
+    }
+    src += BPS;
+  }
+  return 1;
+}
+
 #endif  // WEBP_DSP_QUANT_H_
diff --git a/media/libwebp/dsp/rescaler.c b/media/libwebp/dsp/rescaler.c
index 6bf387f8e0..4bbd281b1c 100644
--- a/media/libwebp/dsp/rescaler.c
+++ b/media/libwebp/dsp/rescaler.c
@@ -38,8 +38,9 @@ void WebPRescalerImportRowExpand_C(WebPRescaler* const wrk,
     int x_out = channel;
     // simple bilinear interpolation
     int accum = wrk->x_add;
-    int left = src[x_in];
-    int right = (wrk->src_width > 1) ? src[x_in + x_stride] : left;
+    rescaler_t left = (rescaler_t)src[x_in];
+    rescaler_t right =
+        (wrk->src_width > 1) ? (rescaler_t)src[x_in + x_stride] : left;
     x_in += x_stride;
     while (1) {
       wrk->frow[x_out] = right * wrk->x_add + (left - right) * accum;
@@ -50,7 +51,7 @@ void WebPRescalerImportRowExpand_C(WebPRescaler* const wrk,
         left = right;
         x_in += x_stride;
         assert(x_in < wrk->src_width * x_stride);
-        right = src[x_in];
+        right = (rescaler_t)src[x_in];
         accum += wrk->x_add;
       }
     }
@@ -109,8 +110,7 @@ void WebPRescalerExportRowExpand_C(WebPRescaler* const wrk) {
     for (x_out = 0; x_out < x_out_max; ++x_out) {
       const uint32_t J = frow[x_out];
       const int v = (int)MULT_FIX(J, wrk->fy_scale);
-      assert(v >= 0 && v <= 255);
-      dst[x_out] = v;
+      dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
     }
   } else {
     const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub);
@@ -120,8 +120,7 @@ void WebPRescalerExportRowExpand_C(WebPRescaler* const wrk) {
                        + (uint64_t)B * irow[x_out];
       const uint32_t J = (uint32_t)((I + ROUNDER) >> WEBP_RESCALER_RFIX);
       const int v = (int)MULT_FIX(J, wrk->fy_scale);
-      assert(v >= 0 && v <= 255);
-      dst[x_out] = v;
+      dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
     }
   }
 }
@@ -138,17 +137,15 @@ void WebPRescalerExportRowShrink_C(WebPRescaler* const wrk) {
   assert(!wrk->y_expand);
   if (yscale) {
     for (x_out = 0; x_out < x_out_max; ++x_out) {
-      const uint32_t frac = (uint32_t)MULT_FIX(frow[x_out], yscale);
-      const int v = (int)MULT_FIX_FLOOR(irow[x_out] - frac, wrk->fxy_scale);
-      assert(v >= 0 && v <= 255);
-      dst[x_out] = v;
+      const uint32_t frac = (uint32_t)MULT_FIX_FLOOR(frow[x_out], yscale);
+      const int v = (int)MULT_FIX(irow[x_out] - frac, wrk->fxy_scale);
+      dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
       irow[x_out] = frac;   // new fractional start
     }
   } else {
     for (x_out = 0; x_out < x_out_max; ++x_out) {
       const int v = (int)MULT_FIX(irow[x_out], wrk->fxy_scale);
-      assert(v >= 0 && v <= 255);
-      dst[x_out] = v;
+      dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
       irow[x_out] = 0;
     }
   }
@@ -217,7 +214,7 @@ WEBP_DSP_INIT_FUNC(WebPRescalerDspInit) {
   WebPRescalerImportRowShrink = WebPRescalerImportRowShrink_C;
 
   if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
+#if defined(WEBP_HAVE_SSE2)
     if (VP8GetCPUInfo(kSSE2)) {
       WebPRescalerDspInitSSE2();
     }
@@ -239,7 +236,7 @@ WEBP_DSP_INIT_FUNC(WebPRescalerDspInit) {
 #endif
   }
 
-#if defined(WEBP_USE_NEON)
+#if defined(WEBP_HAVE_NEON)
   if (WEBP_NEON_OMIT_C_CODE ||
       (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
     WebPRescalerDspInitNEON();
diff --git a/media/libwebp/dsp/rescaler_mips32.c b/media/libwebp/dsp/rescaler_mips32.c
new file mode 100644
index 0000000000..44fad3fbe7
--- /dev/null
+++ b/media/libwebp/dsp/rescaler_mips32.c
@@ -0,0 +1,295 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MIPS version of rescaling functions
+//
+// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_MIPS32) && !defined(WEBP_REDUCE_SIZE)
+
+#include <assert.h>
+#include "../utils/rescaler_utils.h"
+
+//------------------------------------------------------------------------------
+// Row import
+
+static void ImportRowShrink_MIPS32(WebPRescaler* const wrk,
+                                   const uint8_t* src) {
+  const int x_stride = wrk->num_channels;
+  const int x_out_max = wrk->dst_width * wrk->num_channels;
+  const int fx_scale = wrk->fx_scale;
+  const int x_add = wrk->x_add;
+  const int x_sub = wrk->x_sub;
+  const int x_stride1 = x_stride << 2;
+  int channel;
+  assert(!wrk->x_expand);
+  assert(!WebPRescalerInputDone(wrk));
+
+  for (channel = 0; channel < x_stride; ++channel) {
+    const uint8_t* src1 = src + channel;
+    rescaler_t* frow = wrk->frow + channel;
+    int temp1, temp2, temp3;
+    int base, frac, sum;
+    int accum, accum1;
+    int loop_c = x_out_max - channel;
+
+    __asm__ volatile (
+      "li     %[temp1],   0x8000                    \n\t"
+      "li     %[temp2],   0x10000                   \n\t"
+      "li     %[sum],     0                         \n\t"
+      "li     %[accum],   0                         \n\t"
+    "1:                                             \n\t"
+      "addu   %[accum],   %[accum],   %[x_add]      \n\t"
+      "li     %[base],    0                         \n\t"
+      "blez   %[accum],   3f                        \n\t"
+    "2:                                             \n\t"
+      "lbu    %[base],    0(%[src1])                \n\t"
+      "subu   %[accum],   %[accum],   %[x_sub]      \n\t"
+      "addu   %[src1],    %[src1],    %[x_stride]   \n\t"
+      "addu   %[sum],     %[sum],     %[base]       \n\t"
+      "bgtz   %[accum],   2b                        \n\t"
+    "3:                                             \n\t"
+      "negu   %[accum1],  %[accum]                  \n\t"
+      "mul    %[frac],    %[base],    %[accum1]     \n\t"
+      "mul    %[temp3],   %[sum],     %[x_sub]      \n\t"
+      "subu   %[loop_c],  %[loop_c],  %[x_stride]   \n\t"
+      "mult   %[temp1],   %[temp2]                  \n\t"
+      "maddu  %[frac],    %[fx_scale]               \n\t"
+      "mfhi   %[sum]                                \n\t"
+      "subu   %[temp3],   %[temp3],   %[frac]       \n\t"
+      "sw     %[temp3],   0(%[frow])                \n\t"
+      "addu   %[frow],    %[frow],    %[x_stride1]  \n\t"
+      "bgtz   %[loop_c],  1b                        \n\t"
+      : [accum]"=&r"(accum), [src1]"+r"(src1), [temp3]"=&r"(temp3),
+        [sum]"=&r"(sum), [base]"=&r"(base), [frac]"=&r"(frac),
+        [frow]"+r"(frow), [accum1]"=&r"(accum1),
+        [temp2]"=&r"(temp2), [temp1]"=&r"(temp1)
+      : [x_stride]"r"(x_stride), [fx_scale]"r"(fx_scale),
+        [x_sub]"r"(x_sub), [x_add]"r"(x_add),
+        [loop_c]"r"(loop_c), [x_stride1]"r"(x_stride1)
+      : "memory", "hi", "lo"
+    );
+    assert(accum == 0);
+  }
+}
+
+static void ImportRowExpand_MIPS32(WebPRescaler* const wrk,
+                                   const uint8_t* src) {
+  const int x_stride = wrk->num_channels;
+  const int x_out_max = wrk->dst_width * wrk->num_channels;
+  const int x_add = wrk->x_add;
+  const int x_sub = wrk->x_sub;
+  const int src_width = wrk->src_width;
+  const int x_stride1 = x_stride << 2;
+  int channel;
+  assert(wrk->x_expand);
+  assert(!WebPRescalerInputDone(wrk));
+
+  for (channel = 0; channel < x_stride; ++channel) {
+    const uint8_t* src1 = src + channel;
+    rescaler_t* frow = wrk->frow + channel;
+    int temp1, temp2, temp3, temp4;
+    int frac;
+    int accum;
+    int x_out = channel;
+
+    __asm__ volatile (
+      "addiu  %[temp3],   %[src_width], -1            \n\t"
+      "lbu    %[temp2],   0(%[src1])                  \n\t"
+      "addu   %[src1],    %[src1],      %[x_stride]   \n\t"
+      "bgtz   %[temp3],   0f                          \n\t"
+      "addiu  %[temp1],   %[temp2],     0             \n\t"
+      "b      3f                                      \n\t"
+    "0:                                               \n\t"
+      "lbu    %[temp1],   0(%[src1])                  \n\t"
+    "3:                                               \n\t"
+      "addiu  %[accum],   %[x_add],     0             \n\t"
+    "1:                                               \n\t"
+      "subu   %[temp3],   %[temp2],     %[temp1]      \n\t"
+      "mul    %[temp3],   %[temp3],     %[accum]      \n\t"
+      "mul    %[temp4],   %[temp1],     %[x_add]      \n\t"
+      "addu   %[temp3],   %[temp4],     %[temp3]      \n\t"
+      "sw     %[temp3],   0(%[frow])                  \n\t"
+      "addu   %[frow],    %[frow],      %[x_stride1]  \n\t"
+      "addu   %[x_out],   %[x_out],     %[x_stride]   \n\t"
+      "subu   %[temp3],   %[x_out],     %[x_out_max]  \n\t"
+      "bgez   %[temp3],   2f                          \n\t"
+      "subu   %[accum],   %[accum],     %[x_sub]      \n\t"
+      "bgez   %[accum],   4f                          \n\t"
+      "addiu  %[temp2],   %[temp1],     0             \n\t"
+      "addu   %[src1],    %[src1],      %[x_stride]   \n\t"
+      "lbu    %[temp1],   0(%[src1])                  \n\t"
+      "addu   %[accum],   %[accum],     %[x_add]      \n\t"
+    "4:                                               \n\t"
+      "b      1b                                      \n\t"
+    "2:                                               \n\t"
+      : [src1]"+r"(src1), [accum]"=&r"(accum), [temp1]"=&r"(temp1),
+        [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [temp4]"=&r"(temp4),
+        [x_out]"+r"(x_out), [frac]"=&r"(frac), [frow]"+r"(frow)
+      : [x_stride]"r"(x_stride), [x_add]"r"(x_add), [x_sub]"r"(x_sub),
+        [x_stride1]"r"(x_stride1), [src_width]"r"(src_width),
+        [x_out_max]"r"(x_out_max)
+      : "memory", "hi", "lo"
+    );
+    assert(wrk->x_sub == 0 /* <- special case for src_width=1 */ || accum == 0);
+  }
+}
+
+//------------------------------------------------------------------------------
+// Row export
+
+static void ExportRowExpand_MIPS32(WebPRescaler* const wrk) {
+  uint8_t* dst = wrk->dst;
+  rescaler_t* irow = wrk->irow;
+  const int x_out_max = wrk->dst_width * wrk->num_channels;
+  const rescaler_t* frow = wrk->frow;
+  int temp0, temp1, temp3, temp4, temp5, loop_end;
+  const int temp2 = (int)wrk->fy_scale;
+  const int temp6 = x_out_max << 2;
+  assert(!WebPRescalerOutputDone(wrk));
+  assert(wrk->y_accum <= 0);
+  assert(wrk->y_expand);
+  assert(wrk->y_sub != 0);
+  if (wrk->y_accum == 0) {
+    __asm__ volatile (
+      "li       %[temp3],    0x10000                    \n\t"
+      "li       %[temp4],    0x8000                     \n\t"
+      "addu     %[loop_end], %[frow],     %[temp6]      \n\t"
+    "1:                                                 \n\t"
+      "lw       %[temp0],    0(%[frow])                 \n\t"
+      "addiu    %[dst],      %[dst],      1             \n\t"
+      "addiu    %[frow],     %[frow],     4             \n\t"
+      "mult     %[temp3],    %[temp4]                   \n\t"
+      "maddu    %[temp0],    %[temp2]                   \n\t"
+      "mfhi     %[temp5]                                \n\t"
+      "sb       %[temp5],    -1(%[dst])                 \n\t"
+      "bne      %[frow],     %[loop_end], 1b            \n\t"
+      : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
+        [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [frow]"+r"(frow),
+        [dst]"+r"(dst), [loop_end]"=&r"(loop_end)
+      : [temp2]"r"(temp2), [temp6]"r"(temp6)
+      : "memory", "hi", "lo"
+    );
+  } else {
+    const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub);
+    const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B);
+    __asm__ volatile (
+      "li       %[temp3],    0x10000                    \n\t"
+      "li       %[temp4],    0x8000                     \n\t"
+      "addu     %[loop_end], %[frow],     %[temp6]      \n\t"
+    "1:                                                 \n\t"
+      "lw       %[temp0],    0(%[frow])                 \n\t"
+      "lw       %[temp1],    0(%[irow])                 \n\t"
+      "addiu    %[dst],      %[dst],      1             \n\t"
+      "mult     %[temp3],    %[temp4]                   \n\t"
+      "maddu    %[A],        %[temp0]                   \n\t"
+      "maddu    %[B],        %[temp1]                   \n\t"
+      "addiu    %[frow],     %[frow],     4             \n\t"
+      "addiu    %[irow],     %[irow],     4             \n\t"
+      "mfhi     %[temp5]                                \n\t"
+      "mult     %[temp3],    %[temp4]                   \n\t"
+      "maddu    %[temp5],    %[temp2]                   \n\t"
+      "mfhi     %[temp5]                                \n\t"
+      "sb       %[temp5],    -1(%[dst])                 \n\t"
+      "bne      %[frow],     %[loop_end], 1b            \n\t"
+      : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
+        [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [frow]"+r"(frow),
+        [irow]"+r"(irow), [dst]"+r"(dst), [loop_end]"=&r"(loop_end)
+      : [temp2]"r"(temp2), [temp6]"r"(temp6), [A]"r"(A), [B]"r"(B)
+      : "memory", "hi", "lo"
+    );
+  }
+}
+
+#if 0  // disabled for now. TODO(skal): make match the C-code
+static void ExportRowShrink_MIPS32(WebPRescaler* const wrk) {
+  const int x_out_max = wrk->dst_width * wrk->num_channels;
+  uint8_t* dst = wrk->dst;
+  rescaler_t* irow = wrk->irow;
+  const rescaler_t* frow = wrk->frow;
+  const int yscale = wrk->fy_scale * (-wrk->y_accum);
+  int temp0, temp1, temp3, temp4, temp5, loop_end;
+  const int temp2 = (int)wrk->fxy_scale;
+  const int temp6 = x_out_max << 2;
+
+  assert(!WebPRescalerOutputDone(wrk));
+  assert(wrk->y_accum <= 0);
+  assert(!wrk->y_expand);
+  assert(wrk->fxy_scale != 0);
+  if (yscale) {
+    __asm__ volatile (
+      "li       %[temp3],    0x10000                    \n\t"
+      "li       %[temp4],    0x8000                     \n\t"
+      "addu     %[loop_end], %[frow],     %[temp6]      \n\t"
+    "1:                                                 \n\t"
+      "lw       %[temp0],    0(%[frow])                 \n\t"
+      "mult     %[temp3],    %[temp4]                   \n\t"
+      "addiu    %[frow],     %[frow],     4             \n\t"
+      "maddu    %[temp0],    %[yscale]                  \n\t"
+      "mfhi     %[temp1]                                \n\t"
+      "lw       %[temp0],    0(%[irow])                 \n\t"
+      "addiu    %[dst],      %[dst],      1             \n\t"
+      "addiu    %[irow],     %[irow],     4             \n\t"
+      "subu     %[temp0],    %[temp0],    %[temp1]      \n\t"
+      "mult     %[temp3],    %[temp4]                   \n\t"
+      "maddu    %[temp0],    %[temp2]                   \n\t"
+      "mfhi     %[temp5]                                \n\t"
+      "sw       %[temp1],    -4(%[irow])                \n\t"
+      "sb       %[temp5],    -1(%[dst])                 \n\t"
+      "bne      %[frow],     %[loop_end], 1b            \n\t"
+      : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
+        [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [frow]"+r"(frow),
+        [irow]"+r"(irow), [dst]"+r"(dst), [loop_end]"=&r"(loop_end)
+      : [temp2]"r"(temp2), [yscale]"r"(yscale), [temp6]"r"(temp6)
+      : "memory", "hi", "lo"
+    );
+  } else {
+    __asm__ volatile (
+      "li       %[temp3],    0x10000                    \n\t"
+      "li       %[temp4],    0x8000                     \n\t"
+      "addu     %[loop_end], %[irow],     %[temp6]      \n\t"
+    "1:                                                 \n\t"
+      "lw       %[temp0],    0(%[irow])                 \n\t"
+      "addiu    %[dst],      %[dst],      1             \n\t"
+      "addiu    %[irow],     %[irow],     4             \n\t"
+      "mult     %[temp3],    %[temp4]                   \n\t"
+      "maddu    %[temp0],    %[temp2]                   \n\t"
+      "mfhi     %[temp5]                                \n\t"
+      "sw       $zero,       -4(%[irow])                \n\t"
+      "sb       %[temp5],    -1(%[dst])                 \n\t"
+      "bne      %[irow],     %[loop_end], 1b            \n\t"
+      : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
+        [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [irow]"+r"(irow),
+        [dst]"+r"(dst), [loop_end]"=&r"(loop_end)
+      : [temp2]"r"(temp2), [temp6]"r"(temp6)
+      : "memory", "hi", "lo"
+    );
+  }
+}
+#endif  // 0
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void WebPRescalerDspInitMIPS32(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitMIPS32(void) {
+  WebPRescalerImportRowExpand = ImportRowExpand_MIPS32;
+  WebPRescalerImportRowShrink = ImportRowShrink_MIPS32;
+  WebPRescalerExportRowExpand = ExportRowExpand_MIPS32;
+//  WebPRescalerExportRowShrink = ExportRowShrink_MIPS32;
+}
+
+#else  // !WEBP_USE_MIPS32
+
+WEBP_DSP_INIT_STUB(WebPRescalerDspInitMIPS32)
+
+#endif  // WEBP_USE_MIPS32
diff --git a/media/libwebp/dsp/rescaler_mips_dsp_r2.c b/media/libwebp/dsp/rescaler_mips_dsp_r2.c
new file mode 100644
index 0000000000..d6f2996578
--- /dev/null
+++ b/media/libwebp/dsp/rescaler_mips_dsp_r2.c
@@ -0,0 +1,314 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MIPS version of rescaling functions
+//
+// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_MIPS_DSP_R2) && !defined(WEBP_REDUCE_SIZE)
+
+#include <assert.h>
+#include "../utils/rescaler_utils.h"
+
+#define ROUNDER (WEBP_RESCALER_ONE >> 1)
+#define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
+#define MULT_FIX_FLOOR(x, y) (((uint64_t)(x) * (y)) >> WEBP_RESCALER_RFIX)
+
+//------------------------------------------------------------------------------
+// Row export
+
+#if 0  // disabled for now. TODO(skal): make match the C-code
+static void ExportRowShrink_MIPSdspR2(WebPRescaler* const wrk) {
+  int i;
+  const int x_out_max = wrk->dst_width * wrk->num_channels;
+  uint8_t* dst = wrk->dst;
+  rescaler_t* irow = wrk->irow;
+  const rescaler_t* frow = wrk->frow;
+  const int yscale = wrk->fy_scale * (-wrk->y_accum);
+  int temp0, temp1, temp2, temp3, temp4, temp5, loop_end;
+  const int temp7 = (int)wrk->fxy_scale;
+  const int temp6 = (x_out_max & ~0x3) << 2;
+  assert(!WebPRescalerOutputDone(wrk));
+  assert(wrk->y_accum <= 0);
+  assert(!wrk->y_expand);
+  assert(wrk->fxy_scale != 0);
+  if (yscale) {
+    if (x_out_max >= 4) {
+      int temp8, temp9, temp10, temp11;
+      __asm__ volatile (
+        "li       %[temp3],    0x10000                    \n\t"
+        "li       %[temp4],    0x8000                     \n\t"
+        "addu     %[loop_end], %[frow],     %[temp6]      \n\t"
+      "1:                                                 \n\t"
+        "lw       %[temp0],    0(%[frow])                 \n\t"
+        "lw       %[temp1],    4(%[frow])                 \n\t"
+        "lw       %[temp2],    8(%[frow])                 \n\t"
+        "lw       %[temp5],    12(%[frow])                \n\t"
+        "mult     $ac0,        %[temp3],    %[temp4]      \n\t"
+        "maddu    $ac0,        %[temp0],    %[yscale]     \n\t"
+        "mult     $ac1,        %[temp3],    %[temp4]      \n\t"
+        "maddu    $ac1,        %[temp1],    %[yscale]     \n\t"
+        "mult     $ac2,        %[temp3],    %[temp4]      \n\t"
+        "maddu    $ac2,        %[temp2],    %[yscale]     \n\t"
+        "mult     $ac3,        %[temp3],    %[temp4]      \n\t"
+        "maddu    $ac3,        %[temp5],    %[yscale]     \n\t"
+        "addiu    %[frow],     %[frow],     16            \n\t"
+        "mfhi     %[temp0],    $ac0                       \n\t"
+        "mfhi     %[temp1],    $ac1                       \n\t"
+        "mfhi     %[temp2],    $ac2                       \n\t"
+        "mfhi     %[temp5],    $ac3                       \n\t"
+        "lw       %[temp8],    0(%[irow])                 \n\t"
+        "lw       %[temp9],    4(%[irow])                 \n\t"
+        "lw       %[temp10],   8(%[irow])                 \n\t"
+        "lw       %[temp11],   12(%[irow])                \n\t"
+        "addiu    %[dst],      %[dst],      4             \n\t"
+        "addiu    %[irow],     %[irow],     16            \n\t"
+        "subu     %[temp8],    %[temp8],    %[temp0]      \n\t"
+        "subu     %[temp9],    %[temp9],    %[temp1]      \n\t"
+        "subu     %[temp10],   %[temp10],   %[temp2]      \n\t"
+        "subu     %[temp11],   %[temp11],   %[temp5]      \n\t"
+        "mult     $ac0,        %[temp3],    %[temp4]      \n\t"
+        "maddu    $ac0,        %[temp8],    %[temp7]      \n\t"
+        "mult     $ac1,        %[temp3],    %[temp4]      \n\t"
+        "maddu    $ac1,        %[temp9],    %[temp7]      \n\t"
+        "mult     $ac2,        %[temp3],    %[temp4]      \n\t"
+        "maddu    $ac2,        %[temp10],   %[temp7]      \n\t"
+        "mult     $ac3,        %[temp3],    %[temp4]      \n\t"
+        "maddu    $ac3,        %[temp11],   %[temp7]      \n\t"
+        "mfhi     %[temp8],    $ac0                       \n\t"
+        "mfhi     %[temp9],    $ac1                       \n\t"
+        "mfhi     %[temp10],   $ac2                       \n\t"
+        "mfhi     %[temp11],   $ac3                       \n\t"
+        "sw       %[temp0],    -16(%[irow])               \n\t"
+        "sw       %[temp1],    -12(%[irow])               \n\t"
+        "sw       %[temp2],    -8(%[irow])                \n\t"
+        "sw       %[temp5],    -4(%[irow])                \n\t"
+        "sb       %[temp8],    -4(%[dst])                 \n\t"
+        "sb       %[temp9],    -3(%[dst])                 \n\t"
+        "sb       %[temp10],   -2(%[dst])                 \n\t"
+        "sb       %[temp11],   -1(%[dst])                 \n\t"
+        "bne      %[frow],     %[loop_end], 1b            \n\t"
+        : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
+          [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [frow]"+r"(frow),
+          [irow]"+r"(irow), [dst]"+r"(dst), [loop_end]"=&r"(loop_end),
+          [temp8]"=&r"(temp8), [temp9]"=&r"(temp9), [temp10]"=&r"(temp10),
+          [temp11]"=&r"(temp11), [temp2]"=&r"(temp2)
+        : [temp7]"r"(temp7), [yscale]"r"(yscale), [temp6]"r"(temp6)
+        : "memory", "hi", "lo", "$ac1hi", "$ac1lo",
+          "$ac2hi", "$ac2lo", "$ac3hi", "$ac3lo"
+      );
+    }
+    for (i = 0; i < (x_out_max & 0x3); ++i) {
+      const uint32_t frac = (uint32_t)MULT_FIX_FLOOR(*frow++, yscale);
+      const int v = (int)MULT_FIX(*irow - frac, wrk->fxy_scale);
+      *dst++ = (v > 255) ? 255u : (uint8_t)v;
+      *irow++ = frac;   // new fractional start
+    }
+  } else {
+    if (x_out_max >= 4) {
+      __asm__ volatile (
+        "li       %[temp3],    0x10000                    \n\t"
+        "li       %[temp4],    0x8000                     \n\t"
+        "addu     %[loop_end], %[irow],     %[temp6]      \n\t"
+      "1:                                                 \n\t"
+        "lw       %[temp0],    0(%[irow])                 \n\t"
+        "lw       %[temp1],    4(%[irow])                 \n\t"
+        "lw       %[temp2],    8(%[irow])                 \n\t"
+        "lw       %[temp5],    12(%[irow])                \n\t"
+        "addiu    %[dst],      %[dst],      4             \n\t"
+        "addiu    %[irow],     %[irow],     16            \n\t"
+        "mult     $ac0,        %[temp3],    %[temp4]      \n\t"
+        "maddu    $ac0,        %[temp0],    %[temp7]      \n\t"
+        "mult     $ac1,        %[temp3],    %[temp4]      \n\t"
+        "maddu    $ac1,        %[temp1],    %[temp7]      \n\t"
+        "mult     $ac2,        %[temp3],    %[temp4]      \n\t"
+        "maddu    $ac2,        %[temp2],    %[temp7]      \n\t"
+        "mult     $ac3,        %[temp3],    %[temp4]      \n\t"
+        "maddu    $ac3,        %[temp5],    %[temp7]      \n\t"
+        "mfhi     %[temp0],    $ac0                       \n\t"
+        "mfhi     %[temp1],    $ac1                       \n\t"
+        "mfhi     %[temp2],    $ac2                       \n\t"
+        "mfhi     %[temp5],    $ac3                       \n\t"
+        "sw       $zero,       -16(%[irow])               \n\t"
+        "sw       $zero,       -12(%[irow])               \n\t"
+        "sw       $zero,       -8(%[irow])                \n\t"
+        "sw       $zero,       -4(%[irow])                \n\t"
+        "sb       %[temp0],    -4(%[dst])                 \n\t"
+        "sb       %[temp1],    -3(%[dst])                 \n\t"
+        "sb       %[temp2],    -2(%[dst])                 \n\t"
+        "sb       %[temp5],    -1(%[dst])                 \n\t"
+        "bne      %[irow],     %[loop_end], 1b            \n\t"
+        : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
+          [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [irow]"+r"(irow),
+          [dst]"+r"(dst), [loop_end]"=&r"(loop_end), [temp2]"=&r"(temp2)
+        : [temp7]"r"(temp7), [temp6]"r"(temp6)
+        : "memory", "hi", "lo", "$ac1hi", "$ac1lo",
+          "$ac2hi", "$ac2lo", "$ac3hi", "$ac3lo"
+      );
+    }
+    for (i = 0; i < (x_out_max & 0x3); ++i) {
+      const int v = (int)MULT_FIX_FLOOR(*irow, wrk->fxy_scale);
+      *dst++ = (v > 255) ? 255u : (uint8_t)v;
+      *irow++ = 0;
+    }
+  }
+}
+#endif  // 0
+
+static void ExportRowExpand_MIPSdspR2(WebPRescaler* const wrk) {
+  int i;
+  uint8_t* dst = wrk->dst;
+  rescaler_t* irow = wrk->irow;
+  const int x_out_max = wrk->dst_width * wrk->num_channels;
+  const rescaler_t* frow = wrk->frow;
+  int temp0, temp1, temp2, temp3, temp4, temp5, loop_end;
+  const int temp6 = (x_out_max & ~0x3) << 2;
+  const int temp7 = (int)wrk->fy_scale;
+  assert(!WebPRescalerOutputDone(wrk));
+  assert(wrk->y_accum <= 0);
+  assert(wrk->y_expand);
+  assert(wrk->y_sub != 0);
+  if (wrk->y_accum == 0) {
+    if (x_out_max >= 4) {
+      __asm__ volatile (
+        "li       %[temp4],    0x10000                    \n\t"
+        "li       %[temp5],    0x8000                     \n\t"
+        "addu     %[loop_end], %[frow],     %[temp6]      \n\t"
+      "1:                                                 \n\t"
+        "lw       %[temp0],    0(%[frow])                 \n\t"
+        "lw       %[temp1],    4(%[frow])                 \n\t"
+        "lw       %[temp2],    8(%[frow])                 \n\t"
+        "lw       %[temp3],    12(%[frow])                \n\t"
+        "addiu    %[dst],      %[dst],      4             \n\t"
+        "addiu    %[frow],     %[frow],     16            \n\t"
+        "mult     $ac0,        %[temp4],    %[temp5]      \n\t"
+        "maddu    $ac0,        %[temp0],    %[temp7]      \n\t"
+        "mult     $ac1,        %[temp4],    %[temp5]      \n\t"
+        "maddu    $ac1,        %[temp1],    %[temp7]      \n\t"
+        "mult     $ac2,        %[temp4],    %[temp5]      \n\t"
+        "maddu    $ac2,        %[temp2],    %[temp7]      \n\t"
+        "mult     $ac3,        %[temp4],    %[temp5]      \n\t"
+        "maddu    $ac3,        %[temp3],    %[temp7]      \n\t"
+        "mfhi     %[temp0],    $ac0                       \n\t"
+        "mfhi     %[temp1],    $ac1                       \n\t"
+        "mfhi     %[temp2],    $ac2                       \n\t"
+        "mfhi     %[temp3],    $ac3                       \n\t"
+        "sb       %[temp0],    -4(%[dst])                 \n\t"
+        "sb       %[temp1],    -3(%[dst])                 \n\t"
+        "sb       %[temp2],    -2(%[dst])                 \n\t"
+        "sb       %[temp3],    -1(%[dst])                 \n\t"
+        "bne      %[frow],     %[loop_end], 1b            \n\t"
+        : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
+          [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [frow]"+r"(frow),
+          [dst]"+r"(dst), [loop_end]"=&r"(loop_end), [temp2]"=&r"(temp2)
+        : [temp7]"r"(temp7), [temp6]"r"(temp6)
+        : "memory", "hi", "lo", "$ac1hi", "$ac1lo",
+          "$ac2hi", "$ac2lo", "$ac3hi", "$ac3lo"
+      );
+    }
+    for (i = 0; i < (x_out_max & 0x3); ++i) {
+      const uint32_t J = *frow++;
+      const int v = (int)MULT_FIX(J, wrk->fy_scale);
+      *dst++ = (v > 255) ? 255u : (uint8_t)v;
+    }
+  } else {
+    const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub);
+    const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B);
+    if (x_out_max >= 4) {
+      int temp8, temp9, temp10, temp11;
+      __asm__ volatile (
+        "li       %[temp8],    0x10000                    \n\t"
+        "li       %[temp9],    0x8000                     \n\t"
+        "addu     %[loop_end], %[frow],     %[temp6]      \n\t"
+      "1:                                                 \n\t"
+        "lw       %[temp0],    0(%[frow])                 \n\t"
+        "lw       %[temp1],    4(%[frow])                 \n\t"
+        "lw       %[temp2],    8(%[frow])                 \n\t"
+        "lw       %[temp3],    12(%[frow])                \n\t"
+        "lw       %[temp4],    0(%[irow])                 \n\t"
+        "lw       %[temp5],    4(%[irow])                 \n\t"
+        "lw       %[temp10],   8(%[irow])                 \n\t"
+        "lw       %[temp11],   12(%[irow])                \n\t"
+        "addiu    %[dst],      %[dst],      4             \n\t"
+        "mult     $ac0,        %[temp8],    %[temp9]      \n\t"
+        "maddu    $ac0,        %[A],        %[temp0]      \n\t"
+        "maddu    $ac0,        %[B],        %[temp4]      \n\t"
+        "mult     $ac1,        %[temp8],    %[temp9]      \n\t"
+        "maddu    $ac1,        %[A],        %[temp1]      \n\t"
+        "maddu    $ac1,        %[B],        %[temp5]      \n\t"
+        "mult     $ac2,        %[temp8],    %[temp9]      \n\t"
+        "maddu    $ac2,        %[A],        %[temp2]      \n\t"
+        "maddu    $ac2,        %[B],        %[temp10]     \n\t"
+        "mult     $ac3,        %[temp8],    %[temp9]      \n\t"
+        "maddu    $ac3,        %[A],        %[temp3]      \n\t"
+        "maddu    $ac3,        %[B],        %[temp11]     \n\t"
+        "addiu    %[frow],     %[frow],     16            \n\t"
+        "addiu    %[irow],     %[irow],     16            \n\t"
+        "mfhi     %[temp0],    $ac0                       \n\t"
+        "mfhi     %[temp1],    $ac1                       \n\t"
+        "mfhi     %[temp2],    $ac2                       \n\t"
+        "mfhi     %[temp3],    $ac3                       \n\t"
+        "mult     $ac0,        %[temp8],    %[temp9]      \n\t"
+        "maddu    $ac0,        %[temp0],    %[temp7]      \n\t"
+        "mult     $ac1,        %[temp8],    %[temp9]      \n\t"
+        "maddu    $ac1,        %[temp1],    %[temp7]      \n\t"
+        "mult     $ac2,        %[temp8],    %[temp9]      \n\t"
+        "maddu    $ac2,        %[temp2],    %[temp7]      \n\t"
+        "mult     $ac3,        %[temp8],    %[temp9]      \n\t"
+        "maddu    $ac3,        %[temp3],    %[temp7]      \n\t"
+        "mfhi     %[temp0],    $ac0                       \n\t"
+        "mfhi     %[temp1],    $ac1                       \n\t"
+        "mfhi     %[temp2],    $ac2                       \n\t"
+        "mfhi     %[temp3],    $ac3                       \n\t"
+        "sb       %[temp0],    -4(%[dst])                 \n\t"
+        "sb       %[temp1],    -3(%[dst])                 \n\t"
+        "sb       %[temp2],    -2(%[dst])                 \n\t"
+        "sb       %[temp3],    -1(%[dst])                 \n\t"
+        "bne      %[frow],     %[loop_end], 1b            \n\t"
+        : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
+          [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [frow]"+r"(frow),
+          [irow]"+r"(irow), [dst]"+r"(dst), [loop_end]"=&r"(loop_end),
+          [temp8]"=&r"(temp8), [temp9]"=&r"(temp9), [temp10]"=&r"(temp10),
+          [temp11]"=&r"(temp11), [temp2]"=&r"(temp2)
+        : [temp7]"r"(temp7), [temp6]"r"(temp6), [A]"r"(A), [B]"r"(B)
+        : "memory", "hi", "lo", "$ac1hi", "$ac1lo",
+          "$ac2hi", "$ac2lo", "$ac3hi", "$ac3lo"
+      );
+    }
+    for (i = 0; i < (x_out_max & 0x3); ++i) {
+      const uint64_t I = (uint64_t)A * *frow++
+                       + (uint64_t)B * *irow++;
+      const uint32_t J = (uint32_t)((I + ROUNDER) >> WEBP_RESCALER_RFIX);
+      const int v = (int)MULT_FIX(J, wrk->fy_scale);
+      *dst++ = (v > 255) ? 255u : (uint8_t)v;
+    }
+  }
+}
+
+#undef MULT_FIX_FLOOR
+#undef MULT_FIX
+#undef ROUNDER
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void WebPRescalerDspInitMIPSdspR2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitMIPSdspR2(void) {
+  WebPRescalerExportRowExpand = ExportRowExpand_MIPSdspR2;
+//  WebPRescalerExportRowShrink = ExportRowShrink_MIPSdspR2;
+}
+
+#else  // !WEBP_USE_MIPS_DSP_R2
+
+WEBP_DSP_INIT_STUB(WebPRescalerDspInitMIPSdspR2)
+
+#endif  // WEBP_USE_MIPS_DSP_R2
diff --git a/media/libwebp/dsp/rescaler_msa.c b/media/libwebp/dsp/rescaler_msa.c
new file mode 100644
index 0000000000..3366b6d637
--- /dev/null
+++ b/media/libwebp/dsp/rescaler_msa.c
@@ -0,0 +1,443 @@
+// Copyright 2016 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MSA version of rescaling functions
+//
+// Author: Prashant Patil (prashant.patil@imgtec.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_MSA) && !defined(WEBP_REDUCE_SIZE)
+
+#include <assert.h>
+
+#include "../utils/rescaler_utils.h"
+#include "../dsp/msa_macro.h"
+
+#define ROUNDER (WEBP_RESCALER_ONE >> 1)
+#define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
+#define MULT_FIX_FLOOR(x, y) (((uint64_t)(x) * (y)) >> WEBP_RESCALER_RFIX)
+
+#define CALC_MULT_FIX_16(in0, in1, in2, in3, scale, shift, dst) do {  \
+  v4u32 tmp0, tmp1, tmp2, tmp3;                                       \
+  v16u8 t0, t1, t2, t3, t4, t5;                                       \
+  v2u64 out0, out1, out2, out3;                                       \
+  ILVRL_W2_UW(zero, in0, tmp0, tmp1);                                 \
+  ILVRL_W2_UW(zero, in1, tmp2, tmp3);                                 \
+  DOTP_UW2_UD(tmp0, tmp1, scale, scale, out0, out1);                  \
+  DOTP_UW2_UD(tmp2, tmp3, scale, scale, out2, out3);                  \
+  SRAR_D4_UD(out0, out1, out2, out3, shift);                          \
+  PCKEV_B2_UB(out1, out0, out3, out2, t0, t1);                        \
+  ILVRL_W2_UW(zero, in2, tmp0, tmp1);                                 \
+  ILVRL_W2_UW(zero, in3, tmp2, tmp3);                                 \
+  DOTP_UW2_UD(tmp0, tmp1, scale, scale, out0, out1);                  \
+  DOTP_UW2_UD(tmp2, tmp3, scale, scale, out2, out3);                  \
+  SRAR_D4_UD(out0, out1, out2, out3, shift);                          \
+  PCKEV_B2_UB(out1, out0, out3, out2, t2, t3);                        \
+  PCKEV_B2_UB(t1, t0, t3, t2, t4, t5);                                \
+  dst = (v16u8)__msa_pckev_b((v16i8)t5, (v16i8)t4);                   \
+} while (0)
+
+#define CALC_MULT_FIX_4(in0, scale, shift, dst) do {  \
+  v4u32 tmp0, tmp1;                                   \
+  v16i8 t0, t1;                                       \
+  v2u64 out0, out1;                                   \
+  ILVRL_W2_UW(zero, in0, tmp0, tmp1);                 \
+  DOTP_UW2_UD(tmp0, tmp1, scale, scale, out0, out1);  \
+  SRAR_D2_UD(out0, out1, shift);                      \
+  t0 = __msa_pckev_b((v16i8)out1, (v16i8)out0);       \
+  t1 = __msa_pckev_b(t0, t0);                         \
+  t0 = __msa_pckev_b(t1, t1);                         \
+  dst = __msa_copy_s_w((v4i32)t0, 0);                 \
+} while (0)
+
+#define CALC_MULT_FIX1_16(in0, in1, in2, in3, fyscale, shift,  \
+                          dst0, dst1, dst2, dst3) do {         \
+  v4u32 tmp0, tmp1, tmp2, tmp3;                                \
+  v2u64 out0, out1, out2, out3;                                \
+  ILVRL_W2_UW(zero, in0, tmp0, tmp1);                          \
+  ILVRL_W2_UW(zero, in1, tmp2, tmp3);                          \
+  DOTP_UW2_UD(tmp0, tmp1, fyscale, fyscale, out0, out1);       \
+  DOTP_UW2_UD(tmp2, tmp3, fyscale, fyscale, out2, out3);       \
+  SRAR_D4_UD(out0, out1, out2, out3, shift);                   \
+  PCKEV_W2_UW(out1, out0, out3, out2, dst0, dst1);             \
+  ILVRL_W2_UW(zero, in2, tmp0, tmp1);                          \
+  ILVRL_W2_UW(zero, in3, tmp2, tmp3);                          \
+  DOTP_UW2_UD(tmp0, tmp1, fyscale, fyscale, out0, out1);       \
+  DOTP_UW2_UD(tmp2, tmp3, fyscale, fyscale, out2, out3);       \
+  SRAR_D4_UD(out0, out1, out2, out3, shift);                   \
+  PCKEV_W2_UW(out1, out0, out3, out2, dst2, dst3);             \
+} while (0)
+
+#define CALC_MULT_FIX1_4(in0, scale, shift, dst) do {    \
+  v4u32 tmp0, tmp1;                                      \
+  v2u64 out0, out1;                                      \
+  ILVRL_W2_UW(zero, in0, tmp0, tmp1);                    \
+  DOTP_UW2_UD(tmp0, tmp1, scale, scale, out0, out1);     \
+  SRAR_D2_UD(out0, out1, shift);                         \
+  dst = (v4u32)__msa_pckev_w((v4i32)out1, (v4i32)out0);  \
+} while (0)
+
+#define CALC_MULT_FIX2_16(in0, in1, in2, in3, mult, scale, shift,  \
+                          dst0, dst1) do {                         \
+  v4u32 tmp0, tmp1, tmp2, tmp3;                                    \
+  v2u64 out0, out1, out2, out3;                                    \
+  ILVRL_W2_UW(in0, in2, tmp0, tmp1);                               \
+  ILVRL_W2_UW(in1, in3, tmp2, tmp3);                               \
+  DOTP_UW2_UD(tmp0, tmp1, mult, mult, out0, out1);                 \
+  DOTP_UW2_UD(tmp2, tmp3, mult, mult, out2, out3);                 \
+  SRAR_D4_UD(out0, out1, out2, out3, shift);                       \
+  DOTP_UW2_UD(out0, out1, scale, scale, out0, out1);               \
+  DOTP_UW2_UD(out2, out3, scale, scale, out2, out3);               \
+  SRAR_D4_UD(out0, out1, out2, out3, shift);                       \
+  PCKEV_B2_UB(out1, out0, out3, out2, dst0, dst1);                 \
+} while (0)
+
+#define CALC_MULT_FIX2_4(in0, in1, mult, scale, shift, dst) do {  \
+  v4u32 tmp0, tmp1;                                               \
+  v2u64 out0, out1;                                               \
+  v16i8 t0, t1;                                                   \
+  ILVRL_W2_UW(in0, in1, tmp0, tmp1);                              \
+  DOTP_UW2_UD(tmp0, tmp1, mult, mult, out0, out1);                \
+  SRAR_D2_UD(out0, out1, shift);                                  \
+  DOTP_UW2_UD(out0, out1, scale, scale, out0, out1);              \
+  SRAR_D2_UD(out0, out1, shift);                                  \
+  t0 = __msa_pckev_b((v16i8)out1, (v16i8)out0);                   \
+  t1 = __msa_pckev_b(t0, t0);                                     \
+  t0 = __msa_pckev_b(t1, t1);                                     \
+  dst = __msa_copy_s_w((v4i32)t0, 0);                             \
+} while (0)
+
+static WEBP_INLINE void ExportRowExpand_0(const uint32_t* frow, uint8_t* dst,
+                                          int length,
+                                          WebPRescaler* const wrk) {
+  const v4u32 scale = (v4u32)__msa_fill_w(wrk->fy_scale);
+  const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
+  const v4i32 zero = { 0 };
+
+  while (length >= 16) {
+    v4u32 src0, src1, src2, src3;
+    v16u8 out;
+    LD_UW4(frow, 4, src0, src1, src2, src3);
+    CALC_MULT_FIX_16(src0, src1, src2, src3, scale, shift, out);
+    ST_UB(out, dst);
+    length -= 16;
+    frow   += 16;
+    dst    += 16;
+  }
+  if (length > 0) {
+    int x_out;
+    if (length >= 12) {
+      uint32_t val0_m, val1_m, val2_m;
+      v4u32 src0, src1, src2;
+      LD_UW3(frow, 4, src0, src1, src2);
+      CALC_MULT_FIX_4(src0, scale, shift, val0_m);
+      CALC_MULT_FIX_4(src1, scale, shift, val1_m);
+      CALC_MULT_FIX_4(src2, scale, shift, val2_m);
+      SW3(val0_m, val1_m, val2_m, dst, 4);
+      length -= 12;
+      frow   += 12;
+      dst    += 12;
+    } else if (length >= 8) {
+      uint32_t val0_m, val1_m;
+      v4u32 src0, src1;
+      LD_UW2(frow, 4, src0, src1);
+      CALC_MULT_FIX_4(src0, scale, shift, val0_m);
+      CALC_MULT_FIX_4(src1, scale, shift, val1_m);
+      SW2(val0_m, val1_m, dst, 4);
+      length -= 8;
+      frow   += 8;
+      dst    += 8;
+    } else if (length >= 4) {
+      uint32_t val0_m;
+      const v4u32 src0 = LD_UW(frow);
+      CALC_MULT_FIX_4(src0, scale, shift, val0_m);
+      SW(val0_m, dst);
+      length -= 4;
+      frow   += 4;
+      dst    += 4;
+    }
+    for (x_out = 0; x_out < length; ++x_out) {
+      const uint32_t J = frow[x_out];
+      const int v = (int)MULT_FIX(J, wrk->fy_scale);
+      dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
+    }
+  }
+}
+
+static WEBP_INLINE void ExportRowExpand_1(const uint32_t* frow, uint32_t* irow,
+                                          uint8_t* dst, int length,
+                                          WebPRescaler* const wrk) {
+  const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub);
+  const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B);
+  const v4i32 B1 = __msa_fill_w(B);
+  const v4i32 A1 = __msa_fill_w(A);
+  const v4i32 AB = __msa_ilvr_w(A1, B1);
+  const v4u32 scale = (v4u32)__msa_fill_w(wrk->fy_scale);
+  const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
+
+  while (length >= 16) {
+    v4u32 frow0, frow1, frow2, frow3, irow0, irow1, irow2, irow3;
+    v16u8 t0, t1, t2, t3, t4, t5;
+    LD_UW4(frow, 4, frow0, frow1, frow2, frow3);
+    LD_UW4(irow, 4, irow0, irow1, irow2, irow3);
+    CALC_MULT_FIX2_16(frow0, frow1, irow0, irow1, AB, scale, shift, t0, t1);
+    CALC_MULT_FIX2_16(frow2, frow3, irow2, irow3, AB, scale, shift, t2, t3);
+    PCKEV_B2_UB(t1, t0, t3, t2, t4, t5);
+    t0 = (v16u8)__msa_pckev_b((v16i8)t5, (v16i8)t4);
+    ST_UB(t0, dst);
+    frow   += 16;
+    irow   += 16;
+    dst    += 16;
+    length -= 16;
+  }
+  if (length > 0) {
+    int x_out;
+    if (length >= 12) {
+      uint32_t val0_m, val1_m, val2_m;
+      v4u32 frow0, frow1, frow2, irow0, irow1, irow2;
+      LD_UW3(frow, 4, frow0, frow1, frow2);
+      LD_UW3(irow, 4, irow0, irow1, irow2);
+      CALC_MULT_FIX2_4(frow0, irow0, AB, scale, shift, val0_m);
+      CALC_MULT_FIX2_4(frow1, irow1, AB, scale, shift, val1_m);
+      CALC_MULT_FIX2_4(frow2, irow2, AB, scale, shift, val2_m);
+      SW3(val0_m, val1_m, val2_m, dst, 4);
+      frow   += 12;
+      irow   += 12;
+      dst    += 12;
+      length -= 12;
+    } else if (length >= 8) {
+      uint32_t val0_m, val1_m;
+      v4u32 frow0, frow1, irow0, irow1;
+      LD_UW2(frow, 4, frow0, frow1);
+      LD_UW2(irow, 4, irow0, irow1);
+      CALC_MULT_FIX2_4(frow0, irow0, AB, scale, shift, val0_m);
+      CALC_MULT_FIX2_4(frow1, irow1, AB, scale, shift, val1_m);
+      SW2(val0_m, val1_m, dst, 4);
+      frow   += 4;
+      irow   += 4;
+      dst    += 4;
+      length -= 4;
+    } else if (length >= 4) {
+      uint32_t val0_m;
+      const v4u32 frow0 = LD_UW(frow + 0);
+      const v4u32 irow0 = LD_UW(irow + 0);
+      CALC_MULT_FIX2_4(frow0, irow0, AB, scale, shift, val0_m);
+      SW(val0_m, dst);
+      frow   += 4;
+      irow   += 4;
+      dst    += 4;
+      length -= 4;
+    }
+    for (x_out = 0; x_out < length; ++x_out) {
+      const uint64_t I = (uint64_t)A * frow[x_out]
+                       + (uint64_t)B * irow[x_out];
+      const uint32_t J = (uint32_t)((I + ROUNDER) >> WEBP_RESCALER_RFIX);
+      const int v = (int)MULT_FIX(J, wrk->fy_scale);
+      dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
+    }
+  }
+}
+
+static void RescalerExportRowExpand_MIPSdspR2(WebPRescaler* const wrk) {
+  uint8_t* dst = wrk->dst;
+  rescaler_t* irow = wrk->irow;
+  const int x_out_max = wrk->dst_width * wrk->num_channels;
+  const rescaler_t* frow = wrk->frow;
+  assert(!WebPRescalerOutputDone(wrk));
+  assert(wrk->y_accum <= 0);
+  assert(wrk->y_expand);
+  assert(wrk->y_sub != 0);
+  if (wrk->y_accum == 0) {
+    ExportRowExpand_0(frow, dst, x_out_max, wrk);
+  } else {
+    ExportRowExpand_1(frow, irow, dst, x_out_max, wrk);
+  }
+}
+
+#if 0  // disabled for now. TODO(skal): make match the C-code
+static WEBP_INLINE void ExportRowShrink_0(const uint32_t* frow, uint32_t* irow,
+                                          uint8_t* dst, int length,
+                                          const uint32_t yscale,
+                                          WebPRescaler* const wrk) {
+  const v4u32 y_scale = (v4u32)__msa_fill_w(yscale);
+  const v4u32 fxyscale = (v4u32)__msa_fill_w(wrk->fxy_scale);
+  const v4u32 shiftval = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
+  const v4i32 zero = { 0 };
+
+  while (length >= 16) {
+    v4u32 src0, src1, src2, src3, frac0, frac1, frac2, frac3;
+    v16u8 out;
+    LD_UW4(frow, 4, src0, src1, src2, src3);
+    CALC_MULT_FIX1_16(src0, src1, src2, src3, y_scale, shiftval,
+                      frac0, frac1, frac2, frac3);
+    LD_UW4(irow, 4, src0, src1, src2, src3);
+    SUB4(src0, frac0, src1, frac1, src2, frac2, src3, frac3,
+         src0, src1, src2, src3);
+    CALC_MULT_FIX_16(src0, src1, src2, src3, fxyscale, shiftval, out);
+    ST_UB(out, dst);
+    ST_UW4(frac0, frac1, frac2, frac3, irow, 4);
+    frow   += 16;
+    irow   += 16;
+    dst    += 16;
+    length -= 16;
+  }
+  if (length > 0) {
+    int x_out;
+    if (length >= 12) {
+      uint32_t val0_m, val1_m, val2_m;
+      v4u32 src0, src1, src2, frac0, frac1, frac2;
+      LD_UW3(frow, 4, src0, src1, src2);
+      CALC_MULT_FIX1_4(src0, y_scale, shiftval, frac0);
+      CALC_MULT_FIX1_4(src1, y_scale, shiftval, frac1);
+      CALC_MULT_FIX1_4(src2, y_scale, shiftval, frac2);
+      LD_UW3(irow, 4, src0, src1, src2);
+      SUB3(src0, frac0, src1, frac1, src2, frac2, src0, src1, src2);
+      CALC_MULT_FIX_4(src0, fxyscale, shiftval, val0_m);
+      CALC_MULT_FIX_4(src1, fxyscale, shiftval, val1_m);
+      CALC_MULT_FIX_4(src2, fxyscale, shiftval, val2_m);
+      SW3(val0_m, val1_m, val2_m, dst, 4);
+      ST_UW3(frac0, frac1, frac2, irow, 4);
+      frow   += 12;
+      irow   += 12;
+      dst    += 12;
+      length -= 12;
+    } else if (length >= 8) {
+      uint32_t val0_m, val1_m;
+      v4u32 src0, src1, frac0, frac1;
+      LD_UW2(frow, 4, src0, src1);
+      CALC_MULT_FIX1_4(src0, y_scale, shiftval, frac0);
+      CALC_MULT_FIX1_4(src1, y_scale, shiftval, frac1);
+      LD_UW2(irow, 4, src0, src1);
+      SUB2(src0, frac0, src1, frac1, src0, src1);
+      CALC_MULT_FIX_4(src0, fxyscale, shiftval, val0_m);
+      CALC_MULT_FIX_4(src1, fxyscale, shiftval, val1_m);
+      SW2(val0_m, val1_m, dst, 4);
+      ST_UW2(frac0, frac1, irow, 4);
+      frow   += 8;
+      irow   += 8;
+      dst    += 8;
+      length -= 8;
+    } else if (length >= 4) {
+      uint32_t val0_m;
+      v4u32 frac0;
+      v4u32 src0 = LD_UW(frow);
+      CALC_MULT_FIX1_4(src0, y_scale, shiftval, frac0);
+      src0 = LD_UW(irow);
+      src0 = src0 - frac0;
+      CALC_MULT_FIX_4(src0, fxyscale, shiftval, val0_m);
+      SW(val0_m, dst);
+      ST_UW(frac0, irow);
+      frow   += 4;
+      irow   += 4;
+      dst    += 4;
+      length -= 4;
+    }
+    for (x_out = 0; x_out < length; ++x_out) {
+      const uint32_t frac = (uint32_t)MULT_FIX_FLOOR(frow[x_out], yscale);
+      const int v = (int)MULT_FIX(irow[x_out] - frac, wrk->fxy_scale);
+      dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
+      irow[x_out] = frac;
+    }
+  }
+}
+
+static WEBP_INLINE void ExportRowShrink_1(uint32_t* irow, uint8_t* dst,
+                                          int length,
+                                          WebPRescaler* const wrk) {
+  const v4u32 scale = (v4u32)__msa_fill_w(wrk->fxy_scale);
+  const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
+  const v4i32 zero = { 0 };
+
+  while (length >= 16) {
+    v4u32 src0, src1, src2, src3;
+    v16u8 dst0;
+    LD_UW4(irow, 4, src0, src1, src2, src3);
+    CALC_MULT_FIX_16(src0, src1, src2, src3, scale, shift, dst0);
+    ST_UB(dst0, dst);
+    ST_SW4(zero, zero, zero, zero, irow, 4);
+    length -= 16;
+    irow   += 16;
+    dst    += 16;
+  }
+  if (length > 0) {
+    int x_out;
+    if (length >= 12) {
+      uint32_t val0_m, val1_m, val2_m;
+      v4u32 src0, src1, src2;
+      LD_UW3(irow, 4, src0, src1, src2);
+      CALC_MULT_FIX_4(src0, scale, shift, val0_m);
+      CALC_MULT_FIX_4(src1, scale, shift, val1_m);
+      CALC_MULT_FIX_4(src2, scale, shift, val2_m);
+      SW3(val0_m, val1_m, val2_m, dst, 4);
+      ST_SW3(zero, zero, zero, irow, 4);
+      length -= 12;
+      irow   += 12;
+      dst    += 12;
+    } else if (length >= 8) {
+      uint32_t val0_m, val1_m;
+      v4u32 src0, src1;
+      LD_UW2(irow, 4, src0, src1);
+      CALC_MULT_FIX_4(src0, scale, shift, val0_m);
+      CALC_MULT_FIX_4(src1, scale, shift, val1_m);
+      SW2(val0_m, val1_m, dst, 4);
+      ST_SW2(zero, zero, irow, 4);
+      length -= 8;
+      irow   += 8;
+      dst    += 8;
+    } else if (length >= 4) {
+      uint32_t val0_m;
+      const v4u32 src0 = LD_UW(irow + 0);
+      CALC_MULT_FIX_4(src0, scale, shift, val0_m);
+      SW(val0_m, dst);
+      ST_SW(zero, irow);
+      length -= 4;
+      irow   += 4;
+      dst    += 4;
+    }
+    for (x_out = 0; x_out < length; ++x_out) {
+      const int v = (int)MULT_FIX(irow[x_out], wrk->fxy_scale);
+      dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
+      irow[x_out] = 0;
+    }
+  }
+}
+
+static void RescalerExportRowShrink_MIPSdspR2(WebPRescaler* const wrk) {
+  uint8_t* dst = wrk->dst;
+  rescaler_t* irow = wrk->irow;
+  const int x_out_max = wrk->dst_width * wrk->num_channels;
+  const rescaler_t* frow = wrk->frow;
+  const uint32_t yscale = wrk->fy_scale * (-wrk->y_accum);
+  assert(!WebPRescalerOutputDone(wrk));
+  assert(wrk->y_accum <= 0);
+  assert(!wrk->y_expand);
+  if (yscale) {
+    ExportRowShrink_0(frow, irow, dst, x_out_max, yscale, wrk);
+  } else {
+    ExportRowShrink_1(irow, dst, x_out_max, wrk);
+  }
+}
+#endif  // 0
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void WebPRescalerDspInitMSA(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitMSA(void) {
+  WebPRescalerExportRowExpand = RescalerExportRowExpand_MIPSdspR2;
+//  WebPRescalerExportRowShrink = RescalerExportRowShrink_MIPSdspR2;
+}
+
+#else     // !WEBP_USE_MSA
+
+WEBP_DSP_INIT_STUB(WebPRescalerDspInitMSA)
+
+#endif    // WEBP_USE_MSA
diff --git a/media/libwebp/dsp/rescaler_neon.c b/media/libwebp/dsp/rescaler_neon.c
index b560d0cdcc..62bef72113 100644
--- a/media/libwebp/dsp/rescaler_neon.c
+++ b/media/libwebp/dsp/rescaler_neon.c
@@ -81,14 +81,13 @@ static void RescalerExportRowExpand_NEON(WebPRescaler* const wrk) {
       const uint32x4_t B1 = MULT_FIX(A1, fy_scale_half);
       const uint16x4_t C0 = vmovn_u32(B0);
       const uint16x4_t C1 = vmovn_u32(B1);
-      const uint8x8_t D = vmovn_u16(vcombine_u16(C0, C1));
+      const uint8x8_t D = vqmovn_u16(vcombine_u16(C0, C1));
       vst1_u8(dst + x_out, D);
     }
     for (; x_out < x_out_max; ++x_out) {
       const uint32_t J = frow[x_out];
       const int v = (int)MULT_FIX_C(J, fy_scale);
-      assert(v >= 0 && v <= 255);
-      dst[x_out] = v;
+      dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
     }
   } else {
     const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub);
@@ -102,7 +101,7 @@ static void RescalerExportRowExpand_NEON(WebPRescaler* const wrk) {
       const uint32x4_t D1 = MULT_FIX(C1, fy_scale_half);
       const uint16x4_t E0 = vmovn_u32(D0);
       const uint16x4_t E1 = vmovn_u32(D1);
-      const uint8x8_t F = vmovn_u16(vcombine_u16(E0, E1));
+      const uint8x8_t F = vqmovn_u16(vcombine_u16(E0, E1));
       vst1_u8(dst + x_out, F);
     }
     for (; x_out < x_out_max; ++x_out) {
@@ -110,8 +109,7 @@ static void RescalerExportRowExpand_NEON(WebPRescaler* const wrk) {
                        + (uint64_t)B * irow[x_out];
       const uint32_t J = (uint32_t)((I + ROUNDER) >> WEBP_RESCALER_RFIX);
       const int v = (int)MULT_FIX_C(J, fy_scale);
-      assert(v >= 0 && v <= 255);
-      dst[x_out] = v;
+      dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
     }
   }
 }
@@ -135,23 +133,22 @@ static void RescalerExportRowShrink_NEON(WebPRescaler* const wrk) {
     for (x_out = 0; x_out < max_span; x_out += 8) {
       LOAD_32x8(frow + x_out, in0, in1);
       LOAD_32x8(irow + x_out, in2, in3);
-      const uint32x4_t A0 = MULT_FIX(in0, yscale_half);
-      const uint32x4_t A1 = MULT_FIX(in1, yscale_half);
+      const uint32x4_t A0 = MULT_FIX_FLOOR(in0, yscale_half);
+      const uint32x4_t A1 = MULT_FIX_FLOOR(in1, yscale_half);
       const uint32x4_t B0 = vqsubq_u32(in2, A0);
       const uint32x4_t B1 = vqsubq_u32(in3, A1);
-      const uint32x4_t C0 = MULT_FIX_FLOOR(B0, fxy_scale_half);
-      const uint32x4_t C1 = MULT_FIX_FLOOR(B1, fxy_scale_half);
+      const uint32x4_t C0 = MULT_FIX(B0, fxy_scale_half);
+      const uint32x4_t C1 = MULT_FIX(B1, fxy_scale_half);
       const uint16x4_t D0 = vmovn_u32(C0);
       const uint16x4_t D1 = vmovn_u32(C1);
-      const uint8x8_t E = vmovn_u16(vcombine_u16(D0, D1));
+      const uint8x8_t E = vqmovn_u16(vcombine_u16(D0, D1));
       vst1_u8(dst + x_out, E);
       STORE_32x8(A0, A1, irow + x_out);
     }
     for (; x_out < x_out_max; ++x_out) {
-      const uint32_t frac = (uint32_t)MULT_FIX_C(frow[x_out], yscale);
-      const int v = (int)MULT_FIX_FLOOR_C(irow[x_out] - frac, fxy_scale);
-      assert(v >= 0 && v <= 255);
-      dst[x_out] = v;
+      const uint32_t frac = (uint32_t)MULT_FIX_FLOOR_C(frow[x_out], yscale);
+      const int v = (int)MULT_FIX_C(irow[x_out] - frac, fxy_scale);
+      dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
       irow[x_out] = frac;   // new fractional start
     }
   } else {
@@ -161,14 +158,13 @@ static void RescalerExportRowShrink_NEON(WebPRescaler* const wrk) {
       const uint32x4_t A1 = MULT_FIX(in1, fxy_scale_half);
       const uint16x4_t B0 = vmovn_u32(A0);
       const uint16x4_t B1 = vmovn_u32(A1);
-      const uint8x8_t C = vmovn_u16(vcombine_u16(B0, B1));
+      const uint8x8_t C = vqmovn_u16(vcombine_u16(B0, B1));
       vst1_u8(dst + x_out, C);
       STORE_32x8(zero, zero, irow + x_out);
     }
     for (; x_out < x_out_max; ++x_out) {
       const int v = (int)MULT_FIX_C(irow[x_out], fxy_scale);
-      assert(v >= 0 && v <= 255);
-      dst[x_out] = v;
+      dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
       irow[x_out] = 0;
     }
   }
diff --git a/media/libwebp/dsp/rescaler_sse2.c b/media/libwebp/dsp/rescaler_sse2.c
index 2d35f76ab0..237997c808 100644
--- a/media/libwebp/dsp/rescaler_sse2.c
+++ b/media/libwebp/dsp/rescaler_sse2.c
@@ -225,35 +225,6 @@ static WEBP_INLINE void ProcessRow_SSE2(const __m128i* const A0,
   _mm_storel_epi64((__m128i*)dst, G);
 }
 
-static WEBP_INLINE void ProcessRow_Floor_SSE2(const __m128i* const A0,
-                                              const __m128i* const A1,
-                                              const __m128i* const A2,
-                                              const __m128i* const A3,
-                                              const __m128i* const mult,
-                                              uint8_t* const dst) {
-  const __m128i mask = _mm_set_epi32(0xffffffffu, 0, 0xffffffffu, 0);
-  const __m128i B0 = _mm_mul_epu32(*A0, *mult);
-  const __m128i B1 = _mm_mul_epu32(*A1, *mult);
-  const __m128i B2 = _mm_mul_epu32(*A2, *mult);
-  const __m128i B3 = _mm_mul_epu32(*A3, *mult);
-  const __m128i D0 = _mm_srli_epi64(B0, WEBP_RESCALER_RFIX);
-  const __m128i D1 = _mm_srli_epi64(B1, WEBP_RESCALER_RFIX);
-#if (WEBP_RESCALER_RFIX < 32)
-  const __m128i D2 =
-      _mm_and_si128(_mm_slli_epi64(B2, 32 - WEBP_RESCALER_RFIX), mask);
-  const __m128i D3 =
-      _mm_and_si128(_mm_slli_epi64(B3, 32 - WEBP_RESCALER_RFIX), mask);
-#else
-  const __m128i D2 = _mm_and_si128(B2, mask);
-  const __m128i D3 = _mm_and_si128(B3, mask);
-#endif
-  const __m128i E0 = _mm_or_si128(D0, D2);
-  const __m128i E1 = _mm_or_si128(D1, D3);
-  const __m128i F = _mm_packs_epi32(E0, E1);
-  const __m128i G = _mm_packus_epi16(F, F);
-  _mm_storel_epi64((__m128i*)dst, G);
-}
-
 static void RescalerExportRowExpand_SSE2(WebPRescaler* const wrk) {
   int x_out;
   uint8_t* const dst = wrk->dst;
@@ -274,8 +245,7 @@ static void RescalerExportRowExpand_SSE2(WebPRescaler* const wrk) {
     for (; x_out < x_out_max; ++x_out) {
       const uint32_t J = frow[x_out];
       const int v = (int)MULT_FIX(J, wrk->fy_scale);
-      assert(v >= 0 && v <= 255);
-      dst[x_out] = v;
+      dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
     }
   } else {
     const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub);
@@ -308,8 +278,7 @@ static void RescalerExportRowExpand_SSE2(WebPRescaler* const wrk) {
                        + (uint64_t)B * irow[x_out];
       const uint32_t J = (uint32_t)((I + ROUNDER) >> WEBP_RESCALER_RFIX);
       const int v = (int)MULT_FIX(J, wrk->fy_scale);
-      assert(v >= 0 && v <= 255);
-      dst[x_out] = v;
+      dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
     }
   }
 }
@@ -328,20 +297,15 @@ static void RescalerExportRowShrink_SSE2(WebPRescaler* const wrk) {
     const int scale_xy = wrk->fxy_scale;
     const __m128i mult_xy = _mm_set_epi32(0, scale_xy, 0, scale_xy);
     const __m128i mult_y = _mm_set_epi32(0, yscale, 0, yscale);
-    const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER);
     for (x_out = 0; x_out + 8 <= x_out_max; x_out += 8) {
       __m128i A0, A1, A2, A3, B0, B1, B2, B3;
       LoadDispatchAndMult_SSE2(irow + x_out, NULL, &A0, &A1, &A2, &A3);
       LoadDispatchAndMult_SSE2(frow + x_out, &mult_y, &B0, &B1, &B2, &B3);
       {
-        const __m128i C0 = _mm_add_epi64(B0, rounder);
-        const __m128i C1 = _mm_add_epi64(B1, rounder);
-        const __m128i C2 = _mm_add_epi64(B2, rounder);
-        const __m128i C3 = _mm_add_epi64(B3, rounder);
-        const __m128i D0 = _mm_srli_epi64(C0, WEBP_RESCALER_RFIX);   // = frac
-        const __m128i D1 = _mm_srli_epi64(C1, WEBP_RESCALER_RFIX);
-        const __m128i D2 = _mm_srli_epi64(C2, WEBP_RESCALER_RFIX);
-        const __m128i D3 = _mm_srli_epi64(C3, WEBP_RESCALER_RFIX);
+        const __m128i D0 = _mm_srli_epi64(B0, WEBP_RESCALER_RFIX);   // = frac
+        const __m128i D1 = _mm_srli_epi64(B1, WEBP_RESCALER_RFIX);
+        const __m128i D2 = _mm_srli_epi64(B2, WEBP_RESCALER_RFIX);
+        const __m128i D3 = _mm_srli_epi64(B3, WEBP_RESCALER_RFIX);
         const __m128i E0 = _mm_sub_epi64(A0, D0);   // irow[x] - frac
         const __m128i E1 = _mm_sub_epi64(A1, D1);
         const __m128i E2 = _mm_sub_epi64(A2, D2);
@@ -352,14 +316,13 @@ static void RescalerExportRowShrink_SSE2(WebPRescaler* const wrk) {
         const __m128i G1 = _mm_or_si128(D1, F3);
         _mm_storeu_si128((__m128i*)(irow + x_out + 0), G0);
         _mm_storeu_si128((__m128i*)(irow + x_out + 4), G1);
-        ProcessRow_Floor_SSE2(&E0, &E1, &E2, &E3, &mult_xy, dst + x_out);
+        ProcessRow_SSE2(&E0, &E1, &E2, &E3, &mult_xy, dst + x_out);
       }
     }
     for (; x_out < x_out_max; ++x_out) {
-      const uint32_t frac = (int)MULT_FIX(frow[x_out], yscale);
-      const int v = (int)MULT_FIX_FLOOR(irow[x_out] - frac, wrk->fxy_scale);
-      assert(v >= 0 && v <= 255);
-      dst[x_out] = v;
+      const uint32_t frac = (int)MULT_FIX_FLOOR(frow[x_out], yscale);
+      const int v = (int)MULT_FIX(irow[x_out] - frac, wrk->fxy_scale);
+      dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
       irow[x_out] = frac;   // new fractional start
     }
   } else {
@@ -375,8 +338,7 @@ static void RescalerExportRowShrink_SSE2(WebPRescaler* const wrk) {
     }
     for (; x_out < x_out_max; ++x_out) {
       const int v = (int)MULT_FIX(irow[x_out], scale);
-      assert(v >= 0 && v <= 255);
-      dst[x_out] = v;
+      dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
       irow[x_out] = 0;
     }
   }
diff --git a/media/libwebp/dsp/ssim.c b/media/libwebp/dsp/ssim.c
new file mode 100644
index 0000000000..4141e32fd8
--- /dev/null
+++ b/media/libwebp/dsp/ssim.c
@@ -0,0 +1,159 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// distortion calculation
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <assert.h>
+#include <stdlib.h>  // for abs()
+
+#include "../dsp/dsp.h"
+
+#if !defined(WEBP_REDUCE_SIZE)
+
+//------------------------------------------------------------------------------
+// SSIM / PSNR
+
+// hat-shaped filter. Sum of coefficients is equal to 16.
+static const uint32_t kWeight[2 * VP8_SSIM_KERNEL + 1] = {
+  1, 2, 3, 4, 3, 2, 1
+};
+static const uint32_t kWeightSum = 16 * 16;   // sum{kWeight}^2
+
+static WEBP_INLINE double SSIMCalculation(
+    const VP8DistoStats* const stats, uint32_t N  /*num samples*/) {
+  const uint32_t w2 =  N * N;
+  const uint32_t C1 = 20 * w2;
+  const uint32_t C2 = 60 * w2;
+  const uint32_t C3 = 8 * 8 * w2;   // 'dark' limit ~= 6
+  const uint64_t xmxm = (uint64_t)stats->xm * stats->xm;
+  const uint64_t ymym = (uint64_t)stats->ym * stats->ym;
+  if (xmxm + ymym >= C3) {
+    const int64_t xmym = (int64_t)stats->xm * stats->ym;
+    const int64_t sxy = (int64_t)stats->xym * N - xmym;    // can be negative
+    const uint64_t sxx = (uint64_t)stats->xxm * N - xmxm;
+    const uint64_t syy = (uint64_t)stats->yym * N - ymym;
+    // we descale by 8 to prevent overflow during the fnum/fden multiply.
+    const uint64_t num_S = (2 * (uint64_t)(sxy < 0 ? 0 : sxy) + C2) >> 8;
+    const uint64_t den_S = (sxx + syy + C2) >> 8;
+    const uint64_t fnum = (2 * xmym + C1) * num_S;
+    const uint64_t fden = (xmxm + ymym + C1) * den_S;
+    const double r = (double)fnum / fden;
+    assert(r >= 0. && r <= 1.0);
+    return r;
+  }
+  return 1.;   // area is too dark to contribute meaningfully
+}
+
+double VP8SSIMFromStats(const VP8DistoStats* const stats) {
+  return SSIMCalculation(stats, kWeightSum);
+}
+
+double VP8SSIMFromStatsClipped(const VP8DistoStats* const stats) {
+  return SSIMCalculation(stats, stats->w);
+}
+
+static double SSIMGetClipped_C(const uint8_t* src1, int stride1,
+                               const uint8_t* src2, int stride2,
+                               int xo, int yo, int W, int H) {
+  VP8DistoStats stats = { 0, 0, 0, 0, 0, 0 };
+  const int ymin = (yo - VP8_SSIM_KERNEL < 0) ? 0 : yo - VP8_SSIM_KERNEL;
+  const int ymax = (yo + VP8_SSIM_KERNEL > H - 1) ? H - 1
+                                                  : yo + VP8_SSIM_KERNEL;
+  const int xmin = (xo - VP8_SSIM_KERNEL < 0) ? 0 : xo - VP8_SSIM_KERNEL;
+  const int xmax = (xo + VP8_SSIM_KERNEL > W - 1) ? W - 1
+                                                  : xo + VP8_SSIM_KERNEL;
+  int x, y;
+  src1 += ymin * stride1;
+  src2 += ymin * stride2;
+  for (y = ymin; y <= ymax; ++y, src1 += stride1, src2 += stride2) {
+    for (x = xmin; x <= xmax; ++x) {
+      const uint32_t w = kWeight[VP8_SSIM_KERNEL + x - xo]
+                       * kWeight[VP8_SSIM_KERNEL + y - yo];
+      const uint32_t s1 = src1[x];
+      const uint32_t s2 = src2[x];
+      stats.w   += w;
+      stats.xm  += w * s1;
+      stats.ym  += w * s2;
+      stats.xxm += w * s1 * s1;
+      stats.xym += w * s1 * s2;
+      stats.yym += w * s2 * s2;
+    }
+  }
+  return VP8SSIMFromStatsClipped(&stats);
+}
+
+static double SSIMGet_C(const uint8_t* src1, int stride1,
+                        const uint8_t* src2, int stride2) {
+  VP8DistoStats stats = { 0, 0, 0, 0, 0, 0 };
+  int x, y;
+  for (y = 0; y <= 2 * VP8_SSIM_KERNEL; ++y, src1 += stride1, src2 += stride2) {
+    for (x = 0; x <= 2 * VP8_SSIM_KERNEL; ++x) {
+      const uint32_t w = kWeight[x] * kWeight[y];
+      const uint32_t s1 = src1[x];
+      const uint32_t s2 = src2[x];
+      stats.xm  += w * s1;
+      stats.ym  += w * s2;
+      stats.xxm += w * s1 * s1;
+      stats.xym += w * s1 * s2;
+      stats.yym += w * s2 * s2;
+    }
+  }
+  return VP8SSIMFromStats(&stats);
+}
+
+#endif  // !defined(WEBP_REDUCE_SIZE)
+
+//------------------------------------------------------------------------------
+
+#if !defined(WEBP_DISABLE_STATS)
+static uint32_t AccumulateSSE_C(const uint8_t* src1,
+                                const uint8_t* src2, int len) {
+  int i;
+  uint32_t sse2 = 0;
+  assert(len <= 65535);  // to ensure that accumulation fits within uint32_t
+  for (i = 0; i < len; ++i) {
+    const int32_t diff = src1[i] - src2[i];
+    sse2 += diff * diff;
+  }
+  return sse2;
+}
+#endif
+
+//------------------------------------------------------------------------------
+
+#if !defined(WEBP_REDUCE_SIZE)
+VP8SSIMGetFunc VP8SSIMGet;
+VP8SSIMGetClippedFunc VP8SSIMGetClipped;
+#endif
+#if !defined(WEBP_DISABLE_STATS)
+VP8AccumulateSSEFunc VP8AccumulateSSE;
+#endif
+
+extern void VP8SSIMDspInitSSE2(void);
+
+WEBP_DSP_INIT_FUNC(VP8SSIMDspInit) {
+#if !defined(WEBP_REDUCE_SIZE)
+  VP8SSIMGetClipped = SSIMGetClipped_C;
+  VP8SSIMGet = SSIMGet_C;
+#endif
+
+#if !defined(WEBP_DISABLE_STATS)
+  VP8AccumulateSSE = AccumulateSSE_C;
+#endif
+
+  if (VP8GetCPUInfo != NULL) {
+#if defined(WEBP_HAVE_SSE2)
+    if (VP8GetCPUInfo(kSSE2)) {
+      VP8SSIMDspInitSSE2();
+    }
+#endif
+  }
+}
diff --git a/media/libwebp/dsp/ssim_sse2.c b/media/libwebp/dsp/ssim_sse2.c
new file mode 100644
index 0000000000..89810a77c8
--- /dev/null
+++ b/media/libwebp/dsp/ssim_sse2.c
@@ -0,0 +1,165 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// SSE2 version of distortion calculation
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_SSE2)
+
+#include <assert.h>
+#include <emmintrin.h>
+
+#include "../dsp/common_sse2.h"
+
+#if !defined(WEBP_DISABLE_STATS)
+
+// Helper function
+static WEBP_INLINE void SubtractAndSquare_SSE2(const __m128i a, const __m128i b,
+                                               __m128i* const sum) {
+  // take abs(a-b) in 8b
+  const __m128i a_b = _mm_subs_epu8(a, b);
+  const __m128i b_a = _mm_subs_epu8(b, a);
+  const __m128i abs_a_b = _mm_or_si128(a_b, b_a);
+  // zero-extend to 16b
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i C0 = _mm_unpacklo_epi8(abs_a_b, zero);
+  const __m128i C1 = _mm_unpackhi_epi8(abs_a_b, zero);
+  // multiply with self
+  const __m128i sum1 = _mm_madd_epi16(C0, C0);
+  const __m128i sum2 = _mm_madd_epi16(C1, C1);
+  *sum = _mm_add_epi32(sum1, sum2);
+}
+
+//------------------------------------------------------------------------------
+// SSIM / PSNR entry point
+
+static uint32_t AccumulateSSE_SSE2(const uint8_t* src1,
+                                   const uint8_t* src2, int len) {
+  int i = 0;
+  uint32_t sse2 = 0;
+  if (len >= 16) {
+    const int limit = len - 32;
+    int32_t tmp[4];
+    __m128i sum1;
+    __m128i sum = _mm_setzero_si128();
+    __m128i a0 = _mm_loadu_si128((const __m128i*)&src1[i]);
+    __m128i b0 = _mm_loadu_si128((const __m128i*)&src2[i]);
+    i += 16;
+    while (i <= limit) {
+      const __m128i a1 = _mm_loadu_si128((const __m128i*)&src1[i]);
+      const __m128i b1 = _mm_loadu_si128((const __m128i*)&src2[i]);
+      __m128i sum2;
+      i += 16;
+      SubtractAndSquare_SSE2(a0, b0, &sum1);
+      sum = _mm_add_epi32(sum, sum1);
+      a0 = _mm_loadu_si128((const __m128i*)&src1[i]);
+      b0 = _mm_loadu_si128((const __m128i*)&src2[i]);
+      i += 16;
+      SubtractAndSquare_SSE2(a1, b1, &sum2);
+      sum = _mm_add_epi32(sum, sum2);
+    }
+    SubtractAndSquare_SSE2(a0, b0, &sum1);
+    sum = _mm_add_epi32(sum, sum1);
+    _mm_storeu_si128((__m128i*)tmp, sum);
+    sse2 += (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
+  }
+
+  for (; i < len; ++i) {
+    const int32_t diff = src1[i] - src2[i];
+    sse2 += diff * diff;
+  }
+  return sse2;
+}
+#endif  // !defined(WEBP_DISABLE_STATS)
+
+#if !defined(WEBP_REDUCE_SIZE)
+
+static uint32_t HorizontalAdd16b_SSE2(const __m128i* const m) {
+  uint16_t tmp[8];
+  const __m128i a = _mm_srli_si128(*m, 8);
+  const __m128i b = _mm_add_epi16(*m, a);
+  _mm_storeu_si128((__m128i*)tmp, b);
+  return (uint32_t)tmp[3] + tmp[2] + tmp[1] + tmp[0];
+}
+
+static uint32_t HorizontalAdd32b_SSE2(const __m128i* const m) {
+  const __m128i a = _mm_srli_si128(*m, 8);
+  const __m128i b = _mm_add_epi32(*m, a);
+  const __m128i c = _mm_add_epi32(b, _mm_srli_si128(b, 4));
+  return (uint32_t)_mm_cvtsi128_si32(c);
+}
+
+static const uint16_t kWeight[] = { 1, 2, 3, 4, 3, 2, 1, 0 };
+
+#define ACCUMULATE_ROW(WEIGHT) do {                         \
+  /* compute row weight (Wx * Wy) */                        \
+  const __m128i Wy = _mm_set1_epi16((WEIGHT));              \
+  const __m128i W = _mm_mullo_epi16(Wx, Wy);                \
+  /* process 8 bytes at a time (7 bytes, actually) */       \
+  const __m128i a0 = _mm_loadl_epi64((const __m128i*)src1); \
+  const __m128i b0 = _mm_loadl_epi64((const __m128i*)src2); \
+  /* convert to 16b and multiply by weight */               \
+  const __m128i a1 = _mm_unpacklo_epi8(a0, zero);           \
+  const __m128i b1 = _mm_unpacklo_epi8(b0, zero);           \
+  const __m128i wa1 = _mm_mullo_epi16(a1, W);               \
+  const __m128i wb1 = _mm_mullo_epi16(b1, W);               \
+  /* accumulate */                                          \
+  xm  = _mm_add_epi16(xm, wa1);                             \
+  ym  = _mm_add_epi16(ym, wb1);                             \
+  xxm = _mm_add_epi32(xxm, _mm_madd_epi16(a1, wa1));        \
+  xym = _mm_add_epi32(xym, _mm_madd_epi16(a1, wb1));        \
+  yym = _mm_add_epi32(yym, _mm_madd_epi16(b1, wb1));        \
+  src1 += stride1;                                          \
+  src2 += stride2;                                          \
+} while (0)
+
+static double SSIMGet_SSE2(const uint8_t* src1, int stride1,
+                           const uint8_t* src2, int stride2) {
+  VP8DistoStats stats;
+  const __m128i zero = _mm_setzero_si128();
+  __m128i xm = zero, ym = zero;                // 16b accums
+  __m128i xxm = zero, yym = zero, xym = zero;  // 32b accum
+  const __m128i Wx = _mm_loadu_si128((const __m128i*)kWeight);
+  assert(2 * VP8_SSIM_KERNEL + 1 == 7);
+  ACCUMULATE_ROW(1);
+  ACCUMULATE_ROW(2);
+  ACCUMULATE_ROW(3);
+  ACCUMULATE_ROW(4);
+  ACCUMULATE_ROW(3);
+  ACCUMULATE_ROW(2);
+  ACCUMULATE_ROW(1);
+  stats.xm  = HorizontalAdd16b_SSE2(&xm);
+  stats.ym  = HorizontalAdd16b_SSE2(&ym);
+  stats.xxm = HorizontalAdd32b_SSE2(&xxm);
+  stats.xym = HorizontalAdd32b_SSE2(&xym);
+  stats.yym = HorizontalAdd32b_SSE2(&yym);
+  return VP8SSIMFromStats(&stats);
+}
+
+#endif  // !defined(WEBP_REDUCE_SIZE)
+
+extern void VP8SSIMDspInitSSE2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInitSSE2(void) {
+#if !defined(WEBP_DISABLE_STATS)
+  VP8AccumulateSSE = AccumulateSSE_SSE2;
+#endif
+#if !defined(WEBP_REDUCE_SIZE)
+  VP8SSIMGet = SSIMGet_SSE2;
+#endif
+}
+
+#else  // !WEBP_USE_SSE2
+
+WEBP_DSP_INIT_STUB(VP8SSIMDspInitSSE2)
+
+#endif  // WEBP_USE_SSE2
diff --git a/media/libwebp/dsp/upsampling.c b/media/libwebp/dsp/upsampling.c
index b76483a3a6..aa1c807e89 100644
--- a/media/libwebp/dsp/upsampling.c
+++ b/media/libwebp/dsp/upsampling.c
@@ -233,12 +233,12 @@ WEBP_DSP_INIT_FUNC(WebPInitYUV444Converters) {
   WebPYUV444Converters[MODE_rgbA_4444] = WebPYuv444ToRgba4444_C;
 
   if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
+#if defined(WEBP_HAVE_SSE2)
     if (VP8GetCPUInfo(kSSE2)) {
       WebPInitYUV444ConvertersSSE2();
     }
 #endif
-#if defined(WEBP_USE_SSE41)
+#if defined(WEBP_HAVE_SSE41)
     if (VP8GetCPUInfo(kSSE4_1)) {
       WebPInitYUV444ConvertersSSE41();
     }
@@ -278,12 +278,12 @@ WEBP_DSP_INIT_FUNC(WebPInitUpsamplers) {
 
   // If defined, use CPUInfo() to overwrite some pointers with faster versions.
   if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
+#if defined(WEBP_HAVE_SSE2)
     if (VP8GetCPUInfo(kSSE2)) {
       WebPInitUpsamplersSSE2();
     }
 #endif
-#if defined(WEBP_USE_SSE41)
+#if defined(WEBP_HAVE_SSE41)
     if (VP8GetCPUInfo(kSSE4_1)) {
       WebPInitUpsamplersSSE41();
     }
@@ -300,7 +300,7 @@ WEBP_DSP_INIT_FUNC(WebPInitUpsamplers) {
 #endif
   }
 
-#if defined(WEBP_USE_NEON)
+#if defined(WEBP_HAVE_NEON)
   if (WEBP_NEON_OMIT_C_CODE ||
       (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
     WebPInitUpsamplersNEON();
diff --git a/media/libwebp/dsp/upsampling_mips_dsp_r2.c b/media/libwebp/dsp/upsampling_mips_dsp_r2.c
new file mode 100644
index 0000000000..2789c29e02
--- /dev/null
+++ b/media/libwebp/dsp/upsampling_mips_dsp_r2.c
@@ -0,0 +1,291 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// YUV to RGB upsampling functions.
+//
+// Author(s): Branimir Vasic (branimir.vasic@imgtec.com)
+//            Djordje Pesut  (djordje.pesut@imgtec.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_MIPS_DSP_R2)
+
+#include <assert.h>
+#include "../dsp/yuv.h"
+
+#define YUV_TO_RGB(Y, U, V, R, G, B) do {                                      \
+    const int t1 = MultHi(Y, 19077);                                           \
+    const int t2 = MultHi(V, 13320);                                           \
+    R = MultHi(V, 26149);                                                      \
+    G = MultHi(U, 6419);                                                       \
+    B = MultHi(U, 33050);                                                      \
+    R = t1 + R;                                                                \
+    G = t1 - G;                                                                \
+    B = t1 + B;                                                                \
+    R = R - 14234;                                                             \
+    G = G - t2 + 8708;                                                         \
+    B = B - 17685;                                                             \
+    __asm__ volatile (                                                         \
+      "shll_s.w         %[" #R "],      %[" #R "],        17         \n\t"     \
+      "shll_s.w         %[" #G "],      %[" #G "],        17         \n\t"     \
+      "shll_s.w         %[" #B "],      %[" #B "],        17         \n\t"     \
+      "precrqu_s.qb.ph  %[" #R "],      %[" #R "],        $zero      \n\t"     \
+      "precrqu_s.qb.ph  %[" #G "],      %[" #G "],        $zero      \n\t"     \
+      "precrqu_s.qb.ph  %[" #B "],      %[" #B "],        $zero      \n\t"     \
+      "srl              %[" #R "],      %[" #R "],        24         \n\t"     \
+      "srl              %[" #G "],      %[" #G "],        24         \n\t"     \
+      "srl              %[" #B "],      %[" #B "],        24         \n\t"     \
+      : [R]"+r"(R), [G]"+r"(G), [B]"+r"(B)                                     \
+      :                                                                        \
+    );                                                                         \
+  } while (0)
+
+#if !defined(WEBP_REDUCE_CSP)
+static WEBP_INLINE void YuvToRgb(int y, int u, int v, uint8_t* const rgb) {
+  int r, g, b;
+  YUV_TO_RGB(y, u, v, r, g, b);
+  rgb[0] = r;
+  rgb[1] = g;
+  rgb[2] = b;
+}
+static WEBP_INLINE void YuvToBgr(int y, int u, int v, uint8_t* const bgr) {
+  int r, g, b;
+  YUV_TO_RGB(y, u, v, r, g, b);
+  bgr[0] = b;
+  bgr[1] = g;
+  bgr[2] = r;
+}
+static WEBP_INLINE void YuvToRgb565(int y, int u, int v, uint8_t* const rgb) {
+  int r, g, b;
+  YUV_TO_RGB(y, u, v, r, g, b);
+  {
+    const int rg = (r & 0xf8) | (g >> 5);
+    const int gb = ((g << 3) & 0xe0) | (b >> 3);
+#if (WEBP_SWAP_16BIT_CSP == 1)
+    rgb[0] = gb;
+    rgb[1] = rg;
+#else
+    rgb[0] = rg;
+    rgb[1] = gb;
+#endif
+  }
+}
+static WEBP_INLINE void YuvToRgba4444(int y, int u, int v,
+                                      uint8_t* const argb) {
+  int r, g, b;
+  YUV_TO_RGB(y, u, v, r, g, b);
+  {
+    const int rg = (r & 0xf0) | (g >> 4);
+    const int ba = (b & 0xf0) | 0x0f;     // overwrite the lower 4 bits
+#if (WEBP_SWAP_16BIT_CSP == 1)
+    argb[0] = ba;
+    argb[1] = rg;
+#else
+    argb[0] = rg;
+    argb[1] = ba;
+#endif
+   }
+}
+#endif   // WEBP_REDUCE_CSP
+
+//-----------------------------------------------------------------------------
+// Alpha handling variants
+
+#if !defined(WEBP_REDUCE_CSP)
+static WEBP_INLINE void YuvToArgb(uint8_t y, uint8_t u, uint8_t v,
+                                  uint8_t* const argb) {
+  int r, g, b;
+  YUV_TO_RGB(y, u, v, r, g, b);
+  argb[0] = 0xff;
+  argb[1] = r;
+  argb[2] = g;
+  argb[3] = b;
+}
+#endif   // WEBP_REDUCE_CSP
+static WEBP_INLINE void YuvToBgra(uint8_t y, uint8_t u, uint8_t v,
+                                  uint8_t* const bgra) {
+  int r, g, b;
+  YUV_TO_RGB(y, u, v, r, g, b);
+  bgra[0] = b;
+  bgra[1] = g;
+  bgra[2] = r;
+  bgra[3] = 0xff;
+}
+static WEBP_INLINE void YuvToRgba(uint8_t y, uint8_t u, uint8_t v,
+                                  uint8_t* const rgba) {
+  int r, g, b;
+  YUV_TO_RGB(y, u, v, r, g, b);
+  rgba[0] = r;
+  rgba[1] = g;
+  rgba[2] = b;
+  rgba[3] = 0xff;
+}
+
+//------------------------------------------------------------------------------
+// Fancy upsampler
+
+#ifdef FANCY_UPSAMPLING
+
+// Given samples laid out in a square as:
+//  [a b]
+//  [c d]
+// we interpolate u/v as:
+//  ([9*a + 3*b + 3*c +   d    3*a + 9*b + 3*c +   d] + [8 8]) / 16
+//  ([3*a +   b + 9*c + 3*d      a + 3*b + 3*c + 9*d]   [8 8]) / 16
+
+// We process u and v together stashed into 32bit (16bit each).
+#define LOAD_UV(u, v) ((u) | ((v) << 16))
+
+#define UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP)                                  \
+static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,           \
+                      const uint8_t* top_u, const uint8_t* top_v,              \
+                      const uint8_t* cur_u, const uint8_t* cur_v,              \
+                      uint8_t* top_dst, uint8_t* bottom_dst, int len) {        \
+  int x;                                                                       \
+  const int last_pixel_pair = (len - 1) >> 1;                                  \
+  uint32_t tl_uv = LOAD_UV(top_u[0], top_v[0]);   /* top-left sample */        \
+  uint32_t l_uv  = LOAD_UV(cur_u[0], cur_v[0]);   /* left-sample */            \
+  assert(top_y != NULL);                                                       \
+  {                                                                            \
+    const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2;                \
+    FUNC(top_y[0], uv0 & 0xff, (uv0 >> 16), top_dst);                          \
+  }                                                                            \
+  if (bottom_y != NULL) {                                                      \
+    const uint32_t uv0 = (3 * l_uv + tl_uv + 0x00020002u) >> 2;                \
+    FUNC(bottom_y[0], uv0 & 0xff, (uv0 >> 16), bottom_dst);                    \
+  }                                                                            \
+  for (x = 1; x <= last_pixel_pair; ++x) {                                     \
+    const uint32_t t_uv = LOAD_UV(top_u[x], top_v[x]);  /* top sample */       \
+    const uint32_t uv   = LOAD_UV(cur_u[x], cur_v[x]);  /* sample */           \
+    /* precompute invariant values associated with first and second diagonals*/\
+    const uint32_t avg = tl_uv + t_uv + l_uv + uv + 0x00080008u;               \
+    const uint32_t diag_12 = (avg + 2 * (t_uv + l_uv)) >> 3;                   \
+    const uint32_t diag_03 = (avg + 2 * (tl_uv + uv)) >> 3;                    \
+    {                                                                          \
+      const uint32_t uv0 = (diag_12 + tl_uv) >> 1;                             \
+      const uint32_t uv1 = (diag_03 + t_uv) >> 1;                              \
+      FUNC(top_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16),                          \
+           top_dst + (2 * x - 1) * XSTEP);                                     \
+      FUNC(top_y[2 * x - 0], uv1 & 0xff, (uv1 >> 16),                          \
+           top_dst + (2 * x - 0) * XSTEP);                                     \
+    }                                                                          \
+    if (bottom_y != NULL) {                                                    \
+      const uint32_t uv0 = (diag_03 + l_uv) >> 1;                              \
+      const uint32_t uv1 = (diag_12 + uv) >> 1;                                \
+      FUNC(bottom_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16),                       \
+           bottom_dst + (2 * x - 1) * XSTEP);                                  \
+      FUNC(bottom_y[2 * x + 0], uv1 & 0xff, (uv1 >> 16),                       \
+           bottom_dst + (2 * x + 0) * XSTEP);                                  \
+    }                                                                          \
+    tl_uv = t_uv;                                                              \
+    l_uv = uv;                                                                 \
+  }                                                                            \
+  if (!(len & 1)) {                                                            \
+    {                                                                          \
+      const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2;              \
+      FUNC(top_y[len - 1], uv0 & 0xff, (uv0 >> 16),                            \
+           top_dst + (len - 1) * XSTEP);                                       \
+    }                                                                          \
+    if (bottom_y != NULL) {                                                    \
+      const uint32_t uv0 = (3 * l_uv + tl_uv + 0x00020002u) >> 2;              \
+      FUNC(bottom_y[len - 1], uv0 & 0xff, (uv0 >> 16),                         \
+           bottom_dst + (len - 1) * XSTEP);                                    \
+    }                                                                          \
+  }                                                                            \
+}
+
+// All variants implemented.
+UPSAMPLE_FUNC(UpsampleRgbaLinePair,     YuvToRgba,     4)
+UPSAMPLE_FUNC(UpsampleBgraLinePair,     YuvToBgra,     4)
+#if !defined(WEBP_REDUCE_CSP)
+UPSAMPLE_FUNC(UpsampleRgbLinePair,      YuvToRgb,      3)
+UPSAMPLE_FUNC(UpsampleBgrLinePair,      YuvToBgr,      3)
+UPSAMPLE_FUNC(UpsampleArgbLinePair,     YuvToArgb,     4)
+UPSAMPLE_FUNC(UpsampleRgba4444LinePair, YuvToRgba4444, 2)
+UPSAMPLE_FUNC(UpsampleRgb565LinePair,   YuvToRgb565,   2)
+#endif   // WEBP_REDUCE_CSP
+
+#undef LOAD_UV
+#undef UPSAMPLE_FUNC
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void WebPInitUpsamplersMIPSdspR2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersMIPSdspR2(void) {
+  WebPUpsamplers[MODE_RGBA]      = UpsampleRgbaLinePair;
+  WebPUpsamplers[MODE_BGRA]      = UpsampleBgraLinePair;
+  WebPUpsamplers[MODE_rgbA]      = UpsampleRgbaLinePair;
+  WebPUpsamplers[MODE_bgrA]      = UpsampleBgraLinePair;
+#if !defined(WEBP_REDUCE_CSP)
+  WebPUpsamplers[MODE_RGB]       = UpsampleRgbLinePair;
+  WebPUpsamplers[MODE_BGR]       = UpsampleBgrLinePair;
+  WebPUpsamplers[MODE_ARGB]      = UpsampleArgbLinePair;
+  WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair;
+  WebPUpsamplers[MODE_RGB_565]   = UpsampleRgb565LinePair;
+  WebPUpsamplers[MODE_Argb]      = UpsampleArgbLinePair;
+  WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair;
+#endif   // WEBP_REDUCE_CSP
+}
+
+#endif  // FANCY_UPSAMPLING
+
+//------------------------------------------------------------------------------
+// YUV444 converter
+
+#define YUV444_FUNC(FUNC_NAME, FUNC, XSTEP)                                    \
+static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v,    \
+                      uint8_t* dst, int len) {                                 \
+  int i;                                                                       \
+  for (i = 0; i < len; ++i) FUNC(y[i], u[i], v[i], &dst[i * XSTEP]);           \
+}
+
+YUV444_FUNC(Yuv444ToRgba,     YuvToRgba,     4)
+YUV444_FUNC(Yuv444ToBgra,     YuvToBgra,     4)
+#if !defined(WEBP_REDUCE_CSP)
+YUV444_FUNC(Yuv444ToRgb,      YuvToRgb,      3)
+YUV444_FUNC(Yuv444ToBgr,      YuvToBgr,      3)
+YUV444_FUNC(Yuv444ToArgb,     YuvToArgb,     4)
+YUV444_FUNC(Yuv444ToRgba4444, YuvToRgba4444, 2)
+YUV444_FUNC(Yuv444ToRgb565,   YuvToRgb565,   2)
+#endif   // WEBP_REDUCE_CSP
+
+#undef YUV444_FUNC
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void WebPInitYUV444ConvertersMIPSdspR2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitYUV444ConvertersMIPSdspR2(void) {
+  WebPYUV444Converters[MODE_RGBA]      = Yuv444ToRgba;
+  WebPYUV444Converters[MODE_BGRA]      = Yuv444ToBgra;
+  WebPYUV444Converters[MODE_rgbA]      = Yuv444ToRgba;
+  WebPYUV444Converters[MODE_bgrA]      = Yuv444ToBgra;
+#if !defined(WEBP_REDUCE_CSP)
+  WebPYUV444Converters[MODE_RGB]       = Yuv444ToRgb;
+  WebPYUV444Converters[MODE_BGR]       = Yuv444ToBgr;
+  WebPYUV444Converters[MODE_ARGB]      = Yuv444ToArgb;
+  WebPYUV444Converters[MODE_RGBA_4444] = Yuv444ToRgba4444;
+  WebPYUV444Converters[MODE_RGB_565]   = Yuv444ToRgb565;
+  WebPYUV444Converters[MODE_Argb]      = Yuv444ToArgb;
+  WebPYUV444Converters[MODE_rgbA_4444] = Yuv444ToRgba4444;
+#endif   // WEBP_REDUCE_CSP
+}
+
+#else  // !WEBP_USE_MIPS_DSP_R2
+
+WEBP_DSP_INIT_STUB(WebPInitYUV444ConvertersMIPSdspR2)
+
+#endif  // WEBP_USE_MIPS_DSP_R2
+
+#if !(defined(FANCY_UPSAMPLING) && defined(WEBP_USE_MIPS_DSP_R2))
+WEBP_DSP_INIT_STUB(WebPInitUpsamplersMIPSdspR2)
+#endif
diff --git a/media/libwebp/dsp/upsampling_msa.c b/media/libwebp/dsp/upsampling_msa.c
new file mode 100644
index 0000000000..d8ef6feb58
--- /dev/null
+++ b/media/libwebp/dsp/upsampling_msa.c
@@ -0,0 +1,688 @@
+// Copyright 2016 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MSA version of YUV to RGB upsampling functions.
+//
+// Author: Prashant Patil (prashant.patil@imgtec.com)
+
+#include <string.h>
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_MSA)
+
+#include "../dsp/msa_macro.h"
+#include "../dsp/yuv.h"
+
+#ifdef FANCY_UPSAMPLING
+
+#define ILVR_UW2(in, out0, out1) do {                            \
+  const v8i16 t0 = (v8i16)__msa_ilvr_b((v16i8)zero, (v16i8)in);  \
+  out0 = (v4u32)__msa_ilvr_h((v8i16)zero, t0);                   \
+  out1 = (v4u32)__msa_ilvl_h((v8i16)zero, t0);                   \
+} while (0)
+
+#define ILVRL_UW4(in, out0, out1, out2, out3) do {  \
+  v16u8 t0, t1;                                     \
+  ILVRL_B2_UB(zero, in, t0, t1);                    \
+  ILVRL_H2_UW(zero, t0, out0, out1);                \
+  ILVRL_H2_UW(zero, t1, out2, out3);                \
+} while (0)
+
+#define MULTHI_16(in0, in1, in2, in3, cnst, out0, out1) do {   \
+  const v4i32 const0 = (v4i32)__msa_fill_w(cnst * 256);        \
+  v4u32 temp0, temp1, temp2, temp3;                            \
+  MUL4(in0, const0, in1, const0, in2, const0, in3, const0,     \
+       temp0, temp1, temp2, temp3);                            \
+  PCKOD_H2_UH(temp1, temp0, temp3, temp2, out0, out1);         \
+} while (0)
+
+#define MULTHI_8(in0, in1, cnst, out0) do {                 \
+  const v4i32 const0 = (v4i32)__msa_fill_w(cnst * 256);     \
+  v4u32 temp0, temp1;                                       \
+  MUL2(in0, const0, in1, const0, temp0, temp1);             \
+  out0 = (v8u16)__msa_pckod_h((v8i16)temp1, (v8i16)temp0);  \
+} while (0)
+
+#define CALC_R16(y0, y1, v0, v1, dst) do {                \
+  const v8i16 const_a = (v8i16)__msa_fill_h(14234);       \
+  const v8i16 a0 = __msa_adds_s_h((v8i16)y0, (v8i16)v0);  \
+  const v8i16 a1 = __msa_adds_s_h((v8i16)y1, (v8i16)v1);  \
+  v8i16 b0 = __msa_subs_s_h(a0, const_a);                 \
+  v8i16 b1 = __msa_subs_s_h(a1, const_a);                 \
+  SRAI_H2_SH(b0, b1, 6);                                  \
+  CLIP_SH2_0_255(b0, b1);                                 \
+  dst = (v16u8)__msa_pckev_b((v16i8)b1, (v16i8)b0);       \
+} while (0)
+
+#define CALC_R8(y0, v0, dst) do {                         \
+  const v8i16 const_a = (v8i16)__msa_fill_h(14234);       \
+  const v8i16 a0 = __msa_adds_s_h((v8i16)y0, (v8i16)v0);  \
+  v8i16 b0 = __msa_subs_s_h(a0, const_a);                 \
+  b0 = SRAI_H(b0, 6);                                     \
+  CLIP_SH_0_255(b0);                                      \
+  dst = (v16u8)__msa_pckev_b((v16i8)b0, (v16i8)b0);       \
+} while (0)
+
+#define CALC_G16(y0, y1, u0, u1, v0, v1, dst) do {   \
+  const v8i16 const_a = (v8i16)__msa_fill_h(8708);   \
+  v8i16 a0 = __msa_subs_s_h((v8i16)y0, (v8i16)u0);   \
+  v8i16 a1 = __msa_subs_s_h((v8i16)y1, (v8i16)u1);   \
+  const v8i16 b0 = __msa_subs_s_h(a0, (v8i16)v0);    \
+  const v8i16 b1 = __msa_subs_s_h(a1, (v8i16)v1);    \
+  a0 = __msa_adds_s_h(b0, const_a);                  \
+  a1 = __msa_adds_s_h(b1, const_a);                  \
+  SRAI_H2_SH(a0, a1, 6);                             \
+  CLIP_SH2_0_255(a0, a1);                            \
+  dst = (v16u8)__msa_pckev_b((v16i8)a1, (v16i8)a0);  \
+} while (0)
+
+#define CALC_G8(y0, u0, v0, dst) do {                \
+  const v8i16 const_a = (v8i16)__msa_fill_h(8708);   \
+  v8i16 a0 = __msa_subs_s_h((v8i16)y0, (v8i16)u0);   \
+  const v8i16 b0 = __msa_subs_s_h(a0, (v8i16)v0);    \
+  a0 = __msa_adds_s_h(b0, const_a);                  \
+  a0 = SRAI_H(a0, 6);                                \
+  CLIP_SH_0_255(a0);                                 \
+  dst = (v16u8)__msa_pckev_b((v16i8)a0, (v16i8)a0);  \
+} while (0)
+
+#define CALC_B16(y0, y1, u0, u1, dst) do {           \
+  const v8u16 const_a = (v8u16)__msa_fill_h(17685);  \
+  const v8u16 a0 = __msa_adds_u_h((v8u16)y0, u0);    \
+  const v8u16 a1 = __msa_adds_u_h((v8u16)y1, u1);    \
+  v8u16 b0 = __msa_subs_u_h(a0, const_a);            \
+  v8u16 b1 = __msa_subs_u_h(a1, const_a);            \
+  SRAI_H2_UH(b0, b1, 6);                             \
+  CLIP_UH2_0_255(b0, b1);                            \
+  dst = (v16u8)__msa_pckev_b((v16i8)b1, (v16i8)b0);  \
+} while (0)
+
+#define CALC_B8(y0, u0, dst) do {                    \
+  const v8u16 const_a = (v8u16)__msa_fill_h(17685);  \
+  const v8u16 a0 = __msa_adds_u_h((v8u16)y0, u0);    \
+  v8u16 b0 = __msa_subs_u_h(a0, const_a);            \
+  b0 = SRAI_H(b0, 6);                                \
+  CLIP_UH_0_255(b0);                                 \
+  dst = (v16u8)__msa_pckev_b((v16i8)b0, (v16i8)b0);  \
+} while (0)
+
+#define CALC_RGB16(y, u, v, R, G, B) do {    \
+  const v16u8 zero = { 0 };                  \
+  v8u16 y0, y1, u0, u1, v0, v1;              \
+  v4u32 p0, p1, p2, p3;                      \
+  const v16u8 in_y = LD_UB(y);               \
+  const v16u8 in_u = LD_UB(u);               \
+  const v16u8 in_v = LD_UB(v);               \
+  ILVRL_UW4(in_y, p0, p1, p2, p3);           \
+  MULTHI_16(p0, p1, p2, p3, 19077, y0, y1);  \
+  ILVRL_UW4(in_v, p0, p1, p2, p3);           \
+  MULTHI_16(p0, p1, p2, p3, 26149, v0, v1);  \
+  CALC_R16(y0, y1, v0, v1, R);               \
+  MULTHI_16(p0, p1, p2, p3, 13320, v0, v1);  \
+  ILVRL_UW4(in_u, p0, p1, p2, p3);           \
+  MULTHI_16(p0, p1, p2, p3, 6419, u0, u1);   \
+  CALC_G16(y0, y1, u0, u1, v0, v1, G);       \
+  MULTHI_16(p0, p1, p2, p3, 33050, u0, u1);  \
+  CALC_B16(y0, y1, u0, u1, B);               \
+} while (0)
+
+#define CALC_RGB8(y, u, v, R, G, B) do {  \
+  const v16u8 zero = { 0 };               \
+  v8u16 y0, u0, v0;                       \
+  v4u32 p0, p1;                           \
+  const v16u8 in_y = LD_UB(y);            \
+  const v16u8 in_u = LD_UB(u);            \
+  const v16u8 in_v = LD_UB(v);            \
+  ILVR_UW2(in_y, p0, p1);                 \
+  MULTHI_8(p0, p1, 19077, y0);            \
+  ILVR_UW2(in_v, p0, p1);                 \
+  MULTHI_8(p0, p1, 26149, v0);            \
+  CALC_R8(y0, v0, R);                     \
+  MULTHI_8(p0, p1, 13320, v0);            \
+  ILVR_UW2(in_u, p0, p1);                 \
+  MULTHI_8(p0, p1, 6419, u0);             \
+  CALC_G8(y0, u0, v0, G);                 \
+  MULTHI_8(p0, p1, 33050, u0);            \
+  CALC_B8(y0, u0, B);                     \
+} while (0)
+
+#define STORE16_3(a0, a1, a2, dst) do {                          \
+  const v16u8 mask0 = { 0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19,  \
+                        8, 9, 20, 10 };                          \
+  const v16u8 mask1 = { 0, 21, 1, 2, 22, 3, 4, 23, 5, 6, 24, 7,  \
+                        8, 25, 9, 10 };                          \
+  const v16u8 mask2 = { 26, 0, 1, 27, 2, 3, 28, 4, 5, 29, 6, 7,  \
+                        30, 8, 9, 31 };                          \
+  v16u8 out0, out1, out2, tmp0, tmp1, tmp2;                      \
+  ILVRL_B2_UB(a1, a0, tmp0, tmp1);                               \
+  out0 = VSHF_UB(tmp0, a2, mask0);                               \
+  tmp2 = SLDI_UB(tmp1, tmp0, 11);                                \
+  out1 = VSHF_UB(tmp2, a2, mask1);                               \
+  tmp2 = SLDI_UB(tmp1, tmp1, 6);                                 \
+  out2 = VSHF_UB(tmp2, a2, mask2);                               \
+  ST_UB(out0, dst +  0);                                         \
+  ST_UB(out1, dst + 16);                                         \
+  ST_UB(out2, dst + 32);                                         \
+} while (0)
+
+#define STORE8_3(a0, a1, a2, dst) do {                             \
+  int64_t out_m;                                                   \
+  const v16u8 mask0 = { 0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19,    \
+                        8, 9, 20, 10 };                            \
+  const v16u8 mask1 = { 11, 21, 12, 13, 22, 14, 15, 23,            \
+                        255, 255, 255, 255, 255, 255, 255, 255 };  \
+  const v16u8 tmp0 = (v16u8)__msa_ilvr_b((v16i8)a1, (v16i8)a0);    \
+  v16u8 out0, out1;                                                \
+  VSHF_B2_UB(tmp0, a2, tmp0, a2, mask0, mask1, out0, out1);        \
+  ST_UB(out0, dst);                                                \
+  out_m = __msa_copy_s_d((v2i64)out1, 0);                          \
+  SD(out_m, dst + 16);                                             \
+} while (0)
+
+#define STORE16_4(a0, a1, a2, a3, dst) do {  \
+  v16u8 tmp0, tmp1, tmp2, tmp3;              \
+  v16u8 out0, out1, out2, out3;              \
+  ILVRL_B2_UB(a1, a0, tmp0, tmp1);           \
+  ILVRL_B2_UB(a3, a2, tmp2, tmp3);           \
+  ILVRL_H2_UB(tmp2, tmp0, out0, out1);       \
+  ILVRL_H2_UB(tmp3, tmp1, out2, out3);       \
+  ST_UB(out0, dst +  0);                     \
+  ST_UB(out1, dst + 16);                     \
+  ST_UB(out2, dst + 32);                     \
+  ST_UB(out3, dst + 48);                     \
+} while (0)
+
+#define STORE8_4(a0, a1, a2, a3, dst) do {  \
+  v16u8 tmp0, tmp1, tmp2, tmp3;             \
+  ILVR_B2_UB(a1, a0, a3, a2, tmp0, tmp1);   \
+  ILVRL_H2_UB(tmp1, tmp0, tmp2, tmp3);      \
+  ST_UB(tmp2, dst +  0);                    \
+  ST_UB(tmp3, dst + 16);                    \
+} while (0)
+
+#define STORE2_16(a0, a1, dst) do {  \
+  v16u8 out0, out1;                  \
+  ILVRL_B2_UB(a1, a0, out0, out1);   \
+  ST_UB(out0, dst +  0);             \
+  ST_UB(out1, dst + 16);             \
+} while (0)
+
+#define STORE2_8(a0, a1, dst) do {                               \
+  const v16u8 out0 = (v16u8)__msa_ilvr_b((v16i8)a1, (v16i8)a0);  \
+  ST_UB(out0, dst);                                              \
+} while (0)
+
+#define CALC_RGBA4444(y, u, v, out0, out1, N, dst) do {  \
+  CALC_RGB##N(y, u, v, R, G, B);                         \
+  tmp0 = ANDI_B(R, 0xf0);                                \
+  tmp1 = SRAI_B(G, 4);                                   \
+  RG = tmp0 | tmp1;                                      \
+  tmp0 = ANDI_B(B, 0xf0);                                \
+  BA = ORI_B(tmp0, 0x0f);                                \
+  STORE2_##N(out0, out1, dst);                           \
+} while (0)
+
+#define CALC_RGB565(y, u, v, out0, out1, N, dst) do {  \
+  CALC_RGB##N(y, u, v, R, G, B);                       \
+  tmp0 = ANDI_B(R, 0xf8);                              \
+  tmp1 = SRAI_B(G, 5);                                 \
+  RG = tmp0 | tmp1;                                    \
+  tmp0 = SLLI_B(G, 3);                                 \
+  tmp1 = ANDI_B(tmp0, 0xe0);                           \
+  tmp0 = SRAI_B(B, 3);                                 \
+  GB = tmp0 | tmp1;                                    \
+  STORE2_##N(out0, out1, dst);                         \
+} while (0)
+
+static WEBP_INLINE int Clip8(int v) {
+  return v < 0 ? 0 : v > 255 ? 255 : v;
+}
+
+static void YuvToRgb(int y, int u, int v, uint8_t* const rgb) {
+  const int y1 = MultHi(y, 19077);
+  const int r1 = y1 + MultHi(v, 26149) - 14234;
+  const int g1 = y1 - MultHi(u, 6419) - MultHi(v, 13320) + 8708;
+  const int b1 = y1 + MultHi(u, 33050) - 17685;
+  rgb[0] = Clip8(r1 >> 6);
+  rgb[1] = Clip8(g1 >> 6);
+  rgb[2] = Clip8(b1 >> 6);
+}
+
+static void YuvToBgr(int y, int u, int v, uint8_t* const bgr) {
+  const int y1 = MultHi(y, 19077);
+  const int r1 = y1 + MultHi(v, 26149) - 14234;
+  const int g1 = y1 - MultHi(u, 6419) - MultHi(v, 13320) + 8708;
+  const int b1 = y1 + MultHi(u, 33050) - 17685;
+  bgr[0] = Clip8(b1 >> 6);
+  bgr[1] = Clip8(g1 >> 6);
+  bgr[2] = Clip8(r1 >> 6);
+}
+
+#if !defined(WEBP_REDUCE_CSP)
+static void YuvToRgb565(int y, int u, int v, uint8_t* const rgb) {
+  const int y1 = MultHi(y, 19077);
+  const int r1 = y1 + MultHi(v, 26149) - 14234;
+  const int g1 = y1 - MultHi(u, 6419) - MultHi(v, 13320) + 8708;
+  const int b1 = y1 + MultHi(u, 33050) - 17685;
+  const int r = Clip8(r1 >> 6);
+  const int g = Clip8(g1 >> 6);
+  const int b = Clip8(b1 >> 6);
+  const int rg = (r & 0xf8) | (g >> 5);
+  const int gb = ((g << 3) & 0xe0) | (b >> 3);
+#if (WEBP_SWAP_16BIT_CSP == 1)
+  rgb[0] = gb;
+  rgb[1] = rg;
+#else
+  rgb[0] = rg;
+  rgb[1] = gb;
+#endif
+}
+
+static void YuvToRgba4444(int y, int u, int v, uint8_t* const argb) {
+  const int y1 = MultHi(y, 19077);
+  const int r1 = y1 + MultHi(v, 26149) - 14234;
+  const int g1 = y1 - MultHi(u, 6419) - MultHi(v, 13320) + 8708;
+  const int b1 = y1 + MultHi(u, 33050) - 17685;
+  const int r = Clip8(r1 >> 6);
+  const int g = Clip8(g1 >> 6);
+  const int b = Clip8(b1 >> 6);
+  const int rg = (r & 0xf0) | (g >> 4);
+  const int ba = (b & 0xf0) | 0x0f;     // overwrite the lower 4 bits
+#if (WEBP_SWAP_16BIT_CSP == 1)
+  argb[0] = ba;
+  argb[1] = rg;
+#else
+  argb[0] = rg;
+  argb[1] = ba;
+#endif
+}
+
+static void YuvToArgb(uint8_t y, uint8_t u, uint8_t v, uint8_t* const argb) {
+  argb[0] = 0xff;
+  YuvToRgb(y, u, v, argb + 1);
+}
+#endif  // WEBP_REDUCE_CSP
+
+static void YuvToBgra(uint8_t y, uint8_t u, uint8_t v, uint8_t* const bgra) {
+  YuvToBgr(y, u, v, bgra);
+  bgra[3] = 0xff;
+}
+
+static void YuvToRgba(uint8_t y, uint8_t u, uint8_t v, uint8_t* const rgba) {
+  YuvToRgb(y, u, v, rgba);
+  rgba[3] = 0xff;
+}
+
+#if !defined(WEBP_REDUCE_CSP)
+static void YuvToRgbLine(const uint8_t* y, const uint8_t* u,
+                         const uint8_t* v, uint8_t* dst, int length) {
+  v16u8 R, G, B;
+  while (length >= 16) {
+    CALC_RGB16(y, u, v, R, G, B);
+    STORE16_3(R, G, B, dst);
+    y      += 16;
+    u      += 16;
+    v      += 16;
+    dst    += 16 * 3;
+    length -= 16;
+  }
+  if (length > 8) {
+    uint8_t temp[3 * 16] = { 0 };
+    memcpy(temp, y, length * sizeof(*temp));
+    CALC_RGB16(temp, u, v, R, G, B);
+    STORE16_3(R, G, B, temp);
+    memcpy(dst, temp, length * 3 * sizeof(*dst));
+  } else if (length > 0) {
+    uint8_t temp[3 * 8] = { 0 };
+    memcpy(temp, y, length * sizeof(*temp));
+    CALC_RGB8(temp, u, v, R, G, B);
+    STORE8_3(R, G, B, temp);
+    memcpy(dst, temp, length * 3 * sizeof(*dst));
+  }
+}
+
+static void YuvToBgrLine(const uint8_t* y, const uint8_t* u,
+                         const uint8_t* v, uint8_t* dst, int length) {
+  v16u8 R, G, B;
+  while (length >= 16) {
+    CALC_RGB16(y, u, v, R, G, B);
+    STORE16_3(B, G, R, dst);
+    y      += 16;
+    u      += 16;
+    v      += 16;
+    dst    += 16 * 3;
+    length -= 16;
+  }
+  if (length > 8) {
+    uint8_t temp[3 * 16] = { 0 };
+    memcpy(temp, y, length * sizeof(*temp));
+    CALC_RGB16(temp, u, v, R, G, B);
+    STORE16_3(B, G, R, temp);
+    memcpy(dst, temp, length * 3 * sizeof(*dst));
+  } else if (length > 0) {
+    uint8_t temp[3 * 8] = { 0 };
+    memcpy(temp, y, length * sizeof(*temp));
+    CALC_RGB8(temp, u, v, R, G, B);
+    STORE8_3(B, G, R, temp);
+    memcpy(dst, temp, length * 3 * sizeof(*dst));
+  }
+}
+#endif  // WEBP_REDUCE_CSP
+
+static void YuvToRgbaLine(const uint8_t* y, const uint8_t* u,
+                          const uint8_t* v, uint8_t* dst, int length) {
+  v16u8 R, G, B;
+  const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL);
+  while (length >= 16) {
+    CALC_RGB16(y, u, v, R, G, B);
+    STORE16_4(R, G, B, A, dst);
+    y      += 16;
+    u      += 16;
+    v      += 16;
+    dst    += 16 * 4;
+    length -= 16;
+  }
+  if (length > 8) {
+    uint8_t temp[4 * 16] = { 0 };
+    memcpy(temp, y, length * sizeof(*temp));
+    CALC_RGB16(&temp[0], u, v, R, G, B);
+    STORE16_4(R, G, B, A, temp);
+    memcpy(dst, temp, length * 4 * sizeof(*dst));
+  } else if (length > 0) {
+    uint8_t temp[4 * 8] = { 0 };
+    memcpy(temp, y, length * sizeof(*temp));
+    CALC_RGB8(temp, u, v, R, G, B);
+    STORE8_4(R, G, B, A, temp);
+    memcpy(dst, temp, length * 4 * sizeof(*dst));
+  }
+}
+
+static void YuvToBgraLine(const uint8_t* y, const uint8_t* u,
+                          const uint8_t* v, uint8_t* dst, int length) {
+  v16u8 R, G, B;
+  const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL);
+  while (length >= 16) {
+    CALC_RGB16(y, u, v, R, G, B);
+    STORE16_4(B, G, R, A, dst);
+    y      += 16;
+    u      += 16;
+    v      += 16;
+    dst    += 16 * 4;
+    length -= 16;
+  }
+  if (length > 8) {
+    uint8_t temp[4 * 16] = { 0 };
+    memcpy(temp, y, length * sizeof(*temp));
+    CALC_RGB16(temp, u, v, R, G, B);
+    STORE16_4(B, G, R, A, temp);
+    memcpy(dst, temp, length * 4 * sizeof(*dst));
+  } else if (length > 0) {
+    uint8_t temp[4 * 8] = { 0 };
+    memcpy(temp, y, length * sizeof(*temp));
+    CALC_RGB8(temp, u, v, R, G, B);
+    STORE8_4(B, G, R, A, temp);
+    memcpy(dst, temp, length * 4 * sizeof(*dst));
+  }
+}
+
+#if !defined(WEBP_REDUCE_CSP)
+static void YuvToArgbLine(const uint8_t* y, const uint8_t* u,
+                          const uint8_t* v, uint8_t* dst, int length) {
+  v16u8 R, G, B;
+  const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL);
+  while (length >= 16) {
+    CALC_RGB16(y, u, v, R, G, B);
+    STORE16_4(A, R, G, B, dst);
+    y      += 16;
+    u      += 16;
+    v      += 16;
+    dst    += 16 * 4;
+    length -= 16;
+  }
+  if (length > 8) {
+    uint8_t temp[4 * 16] = { 0 };
+    memcpy(temp, y, length * sizeof(*temp));
+    CALC_RGB16(temp, u, v, R, G, B);
+    STORE16_4(A, R, G, B, temp);
+    memcpy(dst, temp, length * 4 * sizeof(*dst));
+  } else if (length > 0) {
+    uint8_t temp[4 * 8] = { 0 };
+    memcpy(temp, y, length * sizeof(*temp));
+    CALC_RGB8(temp, u, v, R, G, B);
+    STORE8_4(A, R, G, B, temp);
+    memcpy(dst, temp, length * 4 * sizeof(*dst));
+  }
+}
+
+static void YuvToRgba4444Line(const uint8_t* y, const uint8_t* u,
+                              const uint8_t* v, uint8_t* dst, int length) {
+  v16u8 R, G, B, RG, BA, tmp0, tmp1;
+  while (length >= 16) {
+#if (WEBP_SWAP_16BIT_CSP == 1)
+    CALC_RGBA4444(y, u, v, BA, RG, 16, dst);
+#else
+    CALC_RGBA4444(y, u, v, RG, BA, 16, dst);
+#endif
+    y      += 16;
+    u      += 16;
+    v      += 16;
+    dst    += 16 * 2;
+    length -= 16;
+  }
+  if (length > 8) {
+    uint8_t temp[2 * 16] = { 0 };
+    memcpy(temp, y, length * sizeof(*temp));
+#if (WEBP_SWAP_16BIT_CSP == 1)
+    CALC_RGBA4444(temp, u, v, BA, RG, 16, temp);
+#else
+    CALC_RGBA4444(temp, u, v, RG, BA, 16, temp);
+#endif
+    memcpy(dst, temp, length * 2 * sizeof(*dst));
+  } else if (length > 0) {
+    uint8_t temp[2 * 8] = { 0 };
+    memcpy(temp, y, length * sizeof(*temp));
+#if (WEBP_SWAP_16BIT_CSP == 1)
+    CALC_RGBA4444(temp, u, v, BA, RG, 8, temp);
+#else
+    CALC_RGBA4444(temp, u, v, RG, BA, 8, temp);
+#endif
+    memcpy(dst, temp, length * 2 * sizeof(*dst));
+  }
+}
+
+static void YuvToRgb565Line(const uint8_t* y, const uint8_t* u,
+                            const uint8_t* v, uint8_t* dst, int length) {
+  v16u8 R, G, B, RG, GB, tmp0, tmp1;
+  while (length >= 16) {
+#if (WEBP_SWAP_16BIT_CSP == 1)
+    CALC_RGB565(y, u, v, GB, RG, 16, dst);
+#else
+    CALC_RGB565(y, u, v, RG, GB, 16, dst);
+#endif
+    y      += 16;
+    u      += 16;
+    v      += 16;
+    dst    += 16 * 2;
+    length -= 16;
+  }
+  if (length > 8) {
+    uint8_t temp[2 * 16] = { 0 };
+    memcpy(temp, y, length * sizeof(*temp));
+#if (WEBP_SWAP_16BIT_CSP == 1)
+    CALC_RGB565(temp, u, v, GB, RG, 16, temp);
+#else
+    CALC_RGB565(temp, u, v, RG, GB, 16, temp);
+#endif
+    memcpy(dst, temp, length * 2 * sizeof(*dst));
+  } else if (length > 0) {
+    uint8_t temp[2 * 8] = { 0 };
+    memcpy(temp, y, length * sizeof(*temp));
+#if (WEBP_SWAP_16BIT_CSP == 1)
+    CALC_RGB565(temp, u, v, GB, RG, 8, temp);
+#else
+    CALC_RGB565(temp, u, v, RG, GB, 8, temp);
+#endif
+    memcpy(dst, temp, length * 2 * sizeof(*dst));
+  }
+}
+#endif  // WEBP_REDUCE_CSP
+
+#define UPSAMPLE_32PIXELS(a, b, c, d) do {    \
+  v16u8 s = __msa_aver_u_b(a, d);             \
+  v16u8 t = __msa_aver_u_b(b, c);             \
+  const v16u8 st = s ^ t;                     \
+  v16u8 ad = a ^ d;                           \
+  v16u8 bc = b ^ c;                           \
+  v16u8 t0 = ad | bc;                         \
+  v16u8 t1 = t0 | st;                         \
+  v16u8 t2 = ANDI_B(t1, 1);                   \
+  v16u8 t3 = __msa_aver_u_b(s, t);            \
+  const v16u8 k = t3 - t2;                    \
+  v16u8 diag1, diag2;                         \
+  AVER_UB2_UB(t, k, s, k, t0, t1);            \
+  bc = bc & st;                               \
+  ad = ad & st;                               \
+  t = t ^ k;                                  \
+  s = s ^ k;                                  \
+  t2 = bc | t;                                \
+  t3 = ad | s;                                \
+  t2 = ANDI_B(t2, 1);                         \
+  t3 = ANDI_B(t3, 1);                         \
+  SUB2(t0, t2, t1, t3, diag1, diag2);         \
+  AVER_UB2_UB(a, diag1, b, diag2, t0, t1);    \
+  ILVRL_B2_UB(t1, t0, a, b);                  \
+  if (pbot_y != NULL) {                       \
+    AVER_UB2_UB(c, diag2, d, diag1, t0, t1);  \
+    ILVRL_B2_UB(t1, t0, c, d);                \
+  }                                           \
+} while (0)
+
+#define UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP)                            \
+static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bot_y,        \
+                      const uint8_t* top_u, const uint8_t* top_v,        \
+                      const uint8_t* cur_u, const uint8_t* cur_v,        \
+                      uint8_t* top_dst, uint8_t* bot_dst, int len)       \
+{                                                                        \
+  int size = (len - 1) >> 1;                                             \
+  uint8_t temp_u[64];                                                    \
+  uint8_t temp_v[64];                                                    \
+  const uint32_t tl_uv = ((top_u[0]) | ((top_v[0]) << 16));              \
+  const uint32_t l_uv = ((cur_u[0]) | ((cur_v[0]) << 16));               \
+  const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2;            \
+  const uint8_t* ptop_y = &top_y[1];                                     \
+  uint8_t* ptop_dst = top_dst + XSTEP;                                   \
+  const uint8_t* pbot_y = &bot_y[1];                                     \
+  uint8_t* pbot_dst = bot_dst + XSTEP;                                   \
+                                                                         \
+  FUNC(top_y[0], uv0 & 0xff, (uv0 >> 16), top_dst);                      \
+  if (bot_y != NULL) {                                                   \
+    const uint32_t uv1 = (3 * l_uv + tl_uv + 0x00020002u) >> 2;          \
+    FUNC(bot_y[0], uv1 & 0xff, (uv1 >> 16), bot_dst);                    \
+  }                                                                      \
+  while (size >= 16) {                                                   \
+    v16u8 tu0, tu1, tv0, tv1, cu0, cu1, cv0, cv1;                        \
+    LD_UB2(top_u, 1, tu0, tu1);                                          \
+    LD_UB2(cur_u, 1, cu0, cu1);                                          \
+    LD_UB2(top_v, 1, tv0, tv1);                                          \
+    LD_UB2(cur_v, 1, cv0, cv1);                                          \
+    UPSAMPLE_32PIXELS(tu0, tu1, cu0, cu1);                               \
+    UPSAMPLE_32PIXELS(tv0, tv1, cv0, cv1);                               \
+    ST_UB4(tu0, tu1, cu0, cu1, &temp_u[0], 16);                          \
+    ST_UB4(tv0, tv1, cv0, cv1, &temp_v[0], 16);                          \
+    FUNC##Line(ptop_y, &temp_u[ 0], &temp_v[0], ptop_dst, 32);           \
+    if (bot_y != NULL) {                                                 \
+      FUNC##Line(pbot_y, &temp_u[32], &temp_v[32], pbot_dst, 32);        \
+    }                                                                    \
+    ptop_y   += 32;                                                      \
+    pbot_y   += 32;                                                      \
+    ptop_dst += XSTEP * 32;                                              \
+    pbot_dst += XSTEP * 32;                                              \
+    top_u    += 16;                                                      \
+    top_v    += 16;                                                      \
+    cur_u    += 16;                                                      \
+    cur_v    += 16;                                                      \
+    size     -= 16;                                                      \
+  }                                                                      \
+  if (size > 0) {                                                        \
+    v16u8 tu0, tu1, tv0, tv1, cu0, cu1, cv0, cv1;                        \
+    memcpy(&temp_u[ 0], top_u, 17 * sizeof(uint8_t));                    \
+    memcpy(&temp_u[32], cur_u, 17 * sizeof(uint8_t));                    \
+    memcpy(&temp_v[ 0], top_v, 17 * sizeof(uint8_t));                    \
+    memcpy(&temp_v[32], cur_v, 17 * sizeof(uint8_t));                    \
+    LD_UB2(&temp_u[ 0], 1, tu0, tu1);                                    \
+    LD_UB2(&temp_u[32], 1, cu0, cu1);                                    \
+    LD_UB2(&temp_v[ 0], 1, tv0, tv1);                                    \
+    LD_UB2(&temp_v[32], 1, cv0, cv1);                                    \
+    UPSAMPLE_32PIXELS(tu0, tu1, cu0, cu1);                               \
+    UPSAMPLE_32PIXELS(tv0, tv1, cv0, cv1);                               \
+    ST_UB4(tu0, tu1, cu0, cu1, &temp_u[0], 16);                          \
+    ST_UB4(tv0, tv1, cv0, cv1, &temp_v[0], 16);                          \
+    FUNC##Line(ptop_y, &temp_u[ 0], &temp_v[0], ptop_dst, size * 2);     \
+    if (bot_y != NULL) {                                                 \
+      FUNC##Line(pbot_y, &temp_u[32], &temp_v[32], pbot_dst, size * 2);  \
+    }                                                                    \
+    top_u += size;                                                       \
+    top_v += size;                                                       \
+    cur_u += size;                                                       \
+    cur_v += size;                                                       \
+  }                                                                      \
+  if (!(len & 1)) {                                                      \
+    const uint32_t t0 = ((top_u[0]) | ((top_v[0]) << 16));               \
+    const uint32_t c0  = ((cur_u[0]) | ((cur_v[0]) << 16));              \
+    const uint32_t tmp0 = (3 * t0 + c0 + 0x00020002u) >> 2;              \
+    FUNC(top_y[len - 1], tmp0 & 0xff, (tmp0 >> 16),                      \
+                top_dst + (len - 1) * XSTEP);                            \
+    if (bot_y != NULL) {                                                 \
+      const uint32_t tmp1 = (3 * c0 + t0 + 0x00020002u) >> 2;            \
+      FUNC(bot_y[len - 1], tmp1 & 0xff, (tmp1 >> 16),                    \
+           bot_dst + (len - 1) * XSTEP);                                 \
+    }                                                                    \
+  }                                                                      \
+}
+
+UPSAMPLE_FUNC(UpsampleRgbaLinePair,     YuvToRgba,     4)
+UPSAMPLE_FUNC(UpsampleBgraLinePair,     YuvToBgra,     4)
+#if !defined(WEBP_REDUCE_CSP)
+UPSAMPLE_FUNC(UpsampleRgbLinePair,      YuvToRgb,      3)
+UPSAMPLE_FUNC(UpsampleBgrLinePair,      YuvToBgr,      3)
+UPSAMPLE_FUNC(UpsampleArgbLinePair,     YuvToArgb,     4)
+UPSAMPLE_FUNC(UpsampleRgba4444LinePair, YuvToRgba4444, 2)
+UPSAMPLE_FUNC(UpsampleRgb565LinePair,   YuvToRgb565,   2)
+#endif   // WEBP_REDUCE_CSP
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
+
+extern void WebPInitUpsamplersMSA(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersMSA(void) {
+  WebPUpsamplers[MODE_RGBA]      = UpsampleRgbaLinePair;
+  WebPUpsamplers[MODE_BGRA]      = UpsampleBgraLinePair;
+  WebPUpsamplers[MODE_rgbA]      = UpsampleRgbaLinePair;
+  WebPUpsamplers[MODE_bgrA]      = UpsampleBgraLinePair;
+#if !defined(WEBP_REDUCE_CSP)
+  WebPUpsamplers[MODE_RGB]       = UpsampleRgbLinePair;
+  WebPUpsamplers[MODE_BGR]       = UpsampleBgrLinePair;
+  WebPUpsamplers[MODE_ARGB]      = UpsampleArgbLinePair;
+  WebPUpsamplers[MODE_Argb]      = UpsampleArgbLinePair;
+  WebPUpsamplers[MODE_RGB_565]   = UpsampleRgb565LinePair;
+  WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair;
+  WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair;
+#endif   // WEBP_REDUCE_CSP
+}
+
+#endif  // FANCY_UPSAMPLING
+
+#endif  // WEBP_USE_MSA
+
+#if !(defined(FANCY_UPSAMPLING) && defined(WEBP_USE_MSA))
+WEBP_DSP_INIT_STUB(WebPInitUpsamplersMSA)
+#endif
diff --git a/media/libwebp/dsp/upsampling_neon.c b/media/libwebp/dsp/upsampling_neon.c
index c847d70d46..41cb44b03f 100644
--- a/media/libwebp/dsp/upsampling_neon.c
+++ b/media/libwebp/dsp/upsampling_neon.c
@@ -58,8 +58,8 @@
 } while (0)
 
 // Turn the macro into a function for reducing code-size when non-critical
-static void Upsample16Pixels_NEON(const uint8_t *r1, const uint8_t *r2,
-                                  uint8_t *out) {
+static void Upsample16Pixels_NEON(const uint8_t* r1, const uint8_t* r2,
+                                  uint8_t* out) {
   UPSAMPLE_16PIXELS(r1, r2, out);
 }
 
@@ -190,14 +190,14 @@ static const int16_t kCoeffs1[4] = { 19077, 26149, 6419, 13320 };
 }
 
 #define NEON_UPSAMPLE_FUNC(FUNC_NAME, FMT, XSTEP)                       \
-static void FUNC_NAME(const uint8_t *top_y, const uint8_t *bottom_y,    \
-                      const uint8_t *top_u, const uint8_t *top_v,       \
-                      const uint8_t *cur_u, const uint8_t *cur_v,       \
-                      uint8_t *top_dst, uint8_t *bottom_dst, int len) { \
+static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,    \
+                      const uint8_t* top_u, const uint8_t* top_v,       \
+                      const uint8_t* cur_u, const uint8_t* cur_v,       \
+                      uint8_t* top_dst, uint8_t* bottom_dst, int len) { \
   int block;                                                            \
   /* 16 byte aligned array to cache reconstructed u and v */            \
   uint8_t uv_buf[2 * 32 + 15];                                          \
-  uint8_t *const r_uv = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15);     \
+  uint8_t* const r_uv = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15);     \
   const int uv_len = (len + 1) >> 1;                                    \
   /* 9 pixels must be read-able for each block */                       \
   const int num_blocks = (uv_len - 1) >> 3;                             \
diff --git a/media/libwebp/dsp/yuv.c b/media/libwebp/dsp/yuv.c
index 12c04ca426..bd9db04149 100644
--- a/media/libwebp/dsp/yuv.c
+++ b/media/libwebp/dsp/yuv.c
@@ -90,16 +90,16 @@ WEBP_DSP_INIT_FUNC(WebPInitSamplers) {
 
   // If defined, use CPUInfo() to overwrite some pointers with faster versions.
   if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
+#if defined(WEBP_HAVE_SSE2)
     if (VP8GetCPUInfo(kSSE2)) {
       WebPInitSamplersSSE2();
     }
-#endif  // WEBP_USE_SSE2
-#if defined(WEBP_USE_SSE41)
+#endif  // WEBP_HAVE_SSE2
+#if defined(WEBP_HAVE_SSE41)
     if (VP8GetCPUInfo(kSSE4_1)) {
       WebPInitSamplersSSE41();
     }
-#endif  // WEBP_USE_SSE41
+#endif  // WEBP_HAVE_SSE41
 #if defined(WEBP_USE_MIPS32)
     if (VP8GetCPUInfo(kMIPS32)) {
       WebPInitSamplersMIPS32();
@@ -276,26 +276,26 @@ WEBP_DSP_INIT_FUNC(WebPInitConvertARGBToYUV) {
 #endif
 
   if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
+#if defined(WEBP_HAVE_SSE2)
     if (VP8GetCPUInfo(kSSE2)) {
       WebPInitConvertARGBToYUVSSE2();
       WebPInitSharpYUVSSE2();
     }
-#endif  // WEBP_USE_SSE2
-#if defined(WEBP_USE_SSE41)
+#endif  // WEBP_HAVE_SSE2
+#if defined(WEBP_HAVE_SSE41)
     if (VP8GetCPUInfo(kSSE4_1)) {
       WebPInitConvertARGBToYUVSSE41();
     }
-#endif  // WEBP_USE_SSE41
+#endif  // WEBP_HAVE_SSE41
   }
 
-#if defined(WEBP_USE_NEON)
+#if defined(WEBP_HAVE_NEON)
   if (WEBP_NEON_OMIT_C_CODE ||
       (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
     WebPInitConvertARGBToYUVNEON();
     WebPInitSharpYUVNEON();
   }
-#endif  // WEBP_USE_NEON
+#endif  // WEBP_HAVE_NEON
 
   assert(WebPConvertARGBToY != NULL);
   assert(WebPConvertARGBToUV != NULL);
diff --git a/media/libwebp/dsp/yuv.h b/media/libwebp/dsp/yuv.h
index 947b89e13c..28524ec422 100644
--- a/media/libwebp/dsp/yuv.h
+++ b/media/libwebp/dsp/yuv.h
@@ -10,7 +10,7 @@
 // inline YUV<->RGB conversion function
 //
 // The exact naming is Y'CbCr, following the ITU-R BT.601 standard.
-// More information at: http://en.wikipedia.org/wiki/YCbCr
+// More information at: https://en.wikipedia.org/wiki/YCbCr
 // Y = 0.2569 * R + 0.5044 * G + 0.0979 * B + 16
 // U = -0.1483 * R - 0.2911 * G + 0.4394 * B + 128
 // V = 0.4394 * R - 0.3679 * G - 0.0715 * B + 128
diff --git a/media/libwebp/dsp/yuv_mips32.c b/media/libwebp/dsp/yuv_mips32.c
new file mode 100644
index 0000000000..fc7c2cda0e
--- /dev/null
+++ b/media/libwebp/dsp/yuv_mips32.c
@@ -0,0 +1,103 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MIPS version of YUV to RGB upsampling functions.
+//
+// Author(s):  Djordje Pesut    (djordje.pesut@imgtec.com)
+//             Jovan Zelincevic (jovan.zelincevic@imgtec.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_MIPS32)
+
+#include "../dsp/yuv.h"
+
+//------------------------------------------------------------------------------
+// simple point-sampling
+
+#define ROW_FUNC(FUNC_NAME, XSTEP, R, G, B, A)                                 \
+static void FUNC_NAME(const uint8_t* y,                                        \
+                      const uint8_t* u, const uint8_t* v,                      \
+                      uint8_t* dst, int len) {                                 \
+  int i, r, g, b;                                                              \
+  int temp0, temp1, temp2, temp3, temp4;                                       \
+  for (i = 0; i < (len >> 1); i++) {                                           \
+    temp1 = MultHi(v[0], 26149);                                               \
+    temp3 = MultHi(v[0], 13320);                                               \
+    temp2 = MultHi(u[0], 6419);                                                \
+    temp4 = MultHi(u[0], 33050);                                               \
+    temp0 = MultHi(y[0], 19077);                                               \
+    temp1 -= 14234;                                                            \
+    temp3 -= 8708;                                                             \
+    temp2 += temp3;                                                            \
+    temp4 -= 17685;                                                            \
+    r = VP8Clip8(temp0 + temp1);                                               \
+    g = VP8Clip8(temp0 - temp2);                                               \
+    b = VP8Clip8(temp0 + temp4);                                               \
+    temp0 = MultHi(y[1], 19077);                                               \
+    dst[R] = r;                                                                \
+    dst[G] = g;                                                                \
+    dst[B] = b;                                                                \
+    if (A) dst[A] = 0xff;                                                      \
+    r = VP8Clip8(temp0 + temp1);                                               \
+    g = VP8Clip8(temp0 - temp2);                                               \
+    b = VP8Clip8(temp0 + temp4);                                               \
+    dst[R + XSTEP] = r;                                                        \
+    dst[G + XSTEP] = g;                                                        \
+    dst[B + XSTEP] = b;                                                        \
+    if (A) dst[A + XSTEP] = 0xff;                                              \
+    y += 2;                                                                    \
+    ++u;                                                                       \
+    ++v;                                                                       \
+    dst += 2 * XSTEP;                                                          \
+  }                                                                            \
+  if (len & 1) {                                                               \
+    temp1 = MultHi(v[0], 26149);                                               \
+    temp3 = MultHi(v[0], 13320);                                               \
+    temp2 = MultHi(u[0], 6419);                                                \
+    temp4 = MultHi(u[0], 33050);                                               \
+    temp0 = MultHi(y[0], 19077);                                               \
+    temp1 -= 14234;                                                            \
+    temp3 -= 8708;                                                             \
+    temp2 += temp3;                                                            \
+    temp4 -= 17685;                                                            \
+    r = VP8Clip8(temp0 + temp1);                                               \
+    g = VP8Clip8(temp0 - temp2);                                               \
+    b = VP8Clip8(temp0 + temp4);                                               \
+    dst[R] = r;                                                                \
+    dst[G] = g;                                                                \
+    dst[B] = b;                                                                \
+    if (A) dst[A] = 0xff;                                                      \
+  }                                                                            \
+}
+
+ROW_FUNC(YuvToRgbRow_MIPS32,      3, 0, 1, 2, 0)
+ROW_FUNC(YuvToRgbaRow_MIPS32,     4, 0, 1, 2, 3)
+ROW_FUNC(YuvToBgrRow_MIPS32,      3, 2, 1, 0, 0)
+ROW_FUNC(YuvToBgraRow_MIPS32,     4, 2, 1, 0, 3)
+
+#undef ROW_FUNC
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void WebPInitSamplersMIPS32(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersMIPS32(void) {
+  WebPSamplers[MODE_RGB]  = YuvToRgbRow_MIPS32;
+  WebPSamplers[MODE_RGBA] = YuvToRgbaRow_MIPS32;
+  WebPSamplers[MODE_BGR]  = YuvToBgrRow_MIPS32;
+  WebPSamplers[MODE_BGRA] = YuvToBgraRow_MIPS32;
+}
+
+#else  // !WEBP_USE_MIPS32
+
+WEBP_DSP_INIT_STUB(WebPInitSamplersMIPS32)
+
+#endif  // WEBP_USE_MIPS32
diff --git a/media/libwebp/dsp/yuv_mips_dsp_r2.c b/media/libwebp/dsp/yuv_mips_dsp_r2.c
new file mode 100644
index 0000000000..1418a9fba1
--- /dev/null
+++ b/media/libwebp/dsp/yuv_mips_dsp_r2.c
@@ -0,0 +1,134 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MIPS DSPr2 version of YUV to RGB upsampling functions.
+//
+// Author(s):  Branimir Vasic (branimir.vasic@imgtec.com)
+//             Djordje Pesut  (djordje.pesut@imgtec.com)
+
+#include "../dsp/dsp.h"
+
+#if defined(WEBP_USE_MIPS_DSP_R2)
+
+#include "../dsp/yuv.h"
+
+//------------------------------------------------------------------------------
+// simple point-sampling
+
+#define ROW_FUNC_PART_1()                                                      \
+  "lbu              %[temp3],   0(%[v])                         \n\t"          \
+  "lbu              %[temp4],   0(%[u])                         \n\t"          \
+  "lbu              %[temp0],   0(%[y])                         \n\t"          \
+  "mul              %[temp1],   %[t_con_1],     %[temp3]        \n\t"          \
+  "mul              %[temp3],   %[t_con_2],     %[temp3]        \n\t"          \
+  "mul              %[temp2],   %[t_con_3],     %[temp4]        \n\t"          \
+  "mul              %[temp4],   %[t_con_4],     %[temp4]        \n\t"          \
+  "mul              %[temp0],   %[t_con_5],     %[temp0]        \n\t"          \
+  "subu             %[temp1],   %[temp1],       %[t_con_6]      \n\t"          \
+  "subu             %[temp3],   %[temp3],       %[t_con_7]      \n\t"          \
+  "addu             %[temp2],   %[temp2],       %[temp3]        \n\t"          \
+  "subu             %[temp4],   %[temp4],       %[t_con_8]      \n\t"          \
+
+#define ROW_FUNC_PART_2(R, G, B, K)                                            \
+  "addu             %[temp5],   %[temp0],       %[temp1]        \n\t"          \
+  "subu             %[temp6],   %[temp0],       %[temp2]        \n\t"          \
+  "addu             %[temp7],   %[temp0],       %[temp4]        \n\t"          \
+".if " #K "                                                     \n\t"          \
+  "lbu              %[temp0],   1(%[y])                         \n\t"          \
+".endif                                                         \n\t"          \
+  "shll_s.w         %[temp5],   %[temp5],       17              \n\t"          \
+  "shll_s.w         %[temp6],   %[temp6],       17              \n\t"          \
+".if " #K "                                                     \n\t"          \
+  "mul              %[temp0],   %[t_con_5],     %[temp0]        \n\t"          \
+".endif                                                         \n\t"          \
+  "shll_s.w         %[temp7],   %[temp7],       17              \n\t"          \
+  "precrqu_s.qb.ph  %[temp5],   %[temp5],       $zero           \n\t"          \
+  "precrqu_s.qb.ph  %[temp6],   %[temp6],       $zero           \n\t"          \
+  "precrqu_s.qb.ph  %[temp7],   %[temp7],       $zero           \n\t"          \
+  "srl              %[temp5],   %[temp5],       24              \n\t"          \
+  "srl              %[temp6],   %[temp6],       24              \n\t"          \
+  "srl              %[temp7],   %[temp7],       24              \n\t"          \
+  "sb               %[temp5],   " #R "(%[dst])                  \n\t"          \
+  "sb               %[temp6],   " #G "(%[dst])                  \n\t"          \
+  "sb               %[temp7],   " #B "(%[dst])                  \n\t"          \
+
+#define ASM_CLOBBER_LIST()                                                     \
+  : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),             \
+    [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),             \
+    [temp6]"=&r"(temp6), [temp7]"=&r"(temp7)                                   \
+  : [t_con_1]"r"(t_con_1), [t_con_2]"r"(t_con_2), [t_con_3]"r"(t_con_3),       \
+    [t_con_4]"r"(t_con_4), [t_con_5]"r"(t_con_5), [t_con_6]"r"(t_con_6),       \
+    [u]"r"(u), [v]"r"(v), [y]"r"(y), [dst]"r"(dst),                            \
+    [t_con_7]"r"(t_con_7), [t_con_8]"r"(t_con_8)                               \
+  : "memory", "hi", "lo"                                                       \
+
+#define ROW_FUNC(FUNC_NAME, XSTEP, R, G, B, A)                                 \
+static void FUNC_NAME(const uint8_t* y,                                        \
+                      const uint8_t* u, const uint8_t* v,                      \
+                      uint8_t* dst, int len) {                                 \
+  int i;                                                                       \
+  uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;             \
+  const int t_con_1 = 26149;                                                   \
+  const int t_con_2 = 13320;                                                   \
+  const int t_con_3 = 6419;                                                    \
+  const int t_con_4 = 33050;                                                   \
+  const int t_con_5 = 19077;                                                   \
+  const int t_con_6 = 14234;                                                   \
+  const int t_con_7 = 8708;                                                    \
+  const int t_con_8 = 17685;                                                   \
+  for (i = 0; i < (len >> 1); i++) {                                           \
+    __asm__ volatile (                                                         \
+      ROW_FUNC_PART_1()                                                        \
+      ROW_FUNC_PART_2(R, G, B, 1)                                              \
+      ROW_FUNC_PART_2(R + XSTEP, G + XSTEP, B + XSTEP, 0)                      \
+      ASM_CLOBBER_LIST()                                                       \
+    );                                                                         \
+    if (A) dst[A] = dst[A + XSTEP] = 0xff;                                     \
+    y += 2;                                                                    \
+    ++u;                                                                       \
+    ++v;                                                                       \
+    dst += 2 * XSTEP;                                                          \
+  }                                                                            \
+  if (len & 1) {                                                               \
+    __asm__ volatile (                                                         \
+      ROW_FUNC_PART_1()                                                        \
+      ROW_FUNC_PART_2(R, G, B, 0)                                              \
+      ASM_CLOBBER_LIST()                                                       \
+    );                                                                         \
+    if (A) dst[A] = 0xff;                                                      \
+  }                                                                            \
+}
+
+ROW_FUNC(YuvToRgbRow_MIPSdspR2,      3, 0, 1, 2, 0)
+ROW_FUNC(YuvToRgbaRow_MIPSdspR2,     4, 0, 1, 2, 3)
+ROW_FUNC(YuvToBgrRow_MIPSdspR2,      3, 2, 1, 0, 0)
+ROW_FUNC(YuvToBgraRow_MIPSdspR2,     4, 2, 1, 0, 3)
+
+#undef ROW_FUNC
+#undef ASM_CLOBBER_LIST
+#undef ROW_FUNC_PART_2
+#undef ROW_FUNC_PART_1
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void WebPInitSamplersMIPSdspR2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersMIPSdspR2(void) {
+  WebPSamplers[MODE_RGB]  = YuvToRgbRow_MIPSdspR2;
+  WebPSamplers[MODE_RGBA] = YuvToRgbaRow_MIPSdspR2;
+  WebPSamplers[MODE_BGR]  = YuvToBgrRow_MIPSdspR2;
+  WebPSamplers[MODE_BGRA] = YuvToBgraRow_MIPSdspR2;
+}
+
+#else  // !WEBP_USE_MIPS_DSP_R2
+
+WEBP_DSP_INIT_STUB(WebPInitSamplersMIPSdspR2)
+
+#endif  // WEBP_USE_MIPS_DSP_R2
diff --git a/media/libwebp/enc/alpha_enc.c b/media/libwebp/enc/alpha_enc.c
new file mode 100644
index 0000000000..edfe95ec94
--- /dev/null
+++ b/media/libwebp/enc/alpha_enc.c
@@ -0,0 +1,443 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Alpha-plane compression.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <assert.h>
+#include <stdlib.h>
+
+#include "../enc/vp8i_enc.h"
+#include "../dsp/dsp.h"
+#include "../utils/filters_utils.h"
+#include "../utils/quant_levels_utils.h"
+#include "../utils/utils.h"
+#include "../webp/format_constants.h"
+
+// -----------------------------------------------------------------------------
+// Encodes the given alpha data via specified compression method 'method'.
+// The pre-processing (quantization) is performed if 'quality' is less than 100.
+// For such cases, the encoding is lossy. The valid range is [0, 100] for
+// 'quality' and [0, 1] for 'method':
+//   'method = 0' - No compression;
+//   'method = 1' - Use lossless coder on the alpha plane only
+// 'filter' values [0, 4] correspond to prediction modes none, horizontal,
+// vertical & gradient filters. The prediction mode 4 will try all the
+// prediction modes 0 to 3 and pick the best one.
+// 'effort_level': specifies how much effort must be spent to try and reduce
+//  the compressed output size. In range 0 (quick) to 6 (slow).
+//
+// 'output' corresponds to the buffer containing compressed alpha data.
+//          This buffer is allocated by this method and caller should call
+//          WebPSafeFree(*output) when done.
+// 'output_size' corresponds to size of this compressed alpha buffer.
+//
+// Returns 1 on successfully encoding the alpha and
+//         0 if either:
+//           invalid quality or method, or
+//           memory allocation for the compressed data fails.
+
+#include "../enc/vp8li_enc.h"
+
+static int EncodeLossless(const uint8_t* const data, int width, int height,
+                          int effort_level,  // in [0..6] range
+                          int use_quality_100, VP8LBitWriter* const bw,
+                          WebPAuxStats* const stats) {
+  int ok = 0;
+  WebPConfig config;
+  WebPPicture picture;
+
+  WebPPictureInit(&picture);
+  picture.width = width;
+  picture.height = height;
+  picture.use_argb = 1;
+  picture.stats = stats;
+  if (!WebPPictureAlloc(&picture)) return 0;
+
+  // Transfer the alpha values to the green channel.
+  WebPDispatchAlphaToGreen(data, width, picture.width, picture.height,
+                           picture.argb, picture.argb_stride);
+
+  WebPConfigInit(&config);
+  config.lossless = 1;
+  // Enable exact, or it would alter RGB values of transparent alpha, which is
+  // normally OK but not here since we are not encoding the input image but  an
+  // internal encoding-related image containing necessary exact information in
+  // RGB channels.
+  config.exact = 1;
+  config.method = effort_level;  // impact is very small
+  // Set a low default quality for encoding alpha. Ensure that Alpha quality at
+  // lower methods (3 and below) is less than the threshold for triggering
+  // costly 'BackwardReferencesTraceBackwards'.
+  // If the alpha quality is set to 100 and the method to 6, allow for a high
+  // lossless quality to trigger the cruncher.
+  config.quality =
+      (use_quality_100 && effort_level == 6) ? 100 : 8.f * effort_level;
+  assert(config.quality >= 0 && config.quality <= 100.f);
+
+  // TODO(urvang): Temporary fix to avoid generating images that trigger
+  // a decoder bug related to alpha with color cache.
+  // See: https://code.google.com/p/webp/issues/detail?id=239
+  // Need to re-enable this later.
+  ok = (VP8LEncodeStream(&config, &picture, bw, 0 /*use_cache*/) == VP8_ENC_OK);
+  WebPPictureFree(&picture);
+  ok = ok && !bw->error_;
+  if (!ok) {
+    VP8LBitWriterWipeOut(bw);
+    return 0;
+  }
+  return 1;
+}
+
+// -----------------------------------------------------------------------------
+
+// Small struct to hold the result of a filter mode compression attempt.
+typedef struct {
+  size_t score;
+  VP8BitWriter bw;
+  WebPAuxStats stats;
+} FilterTrial;
+
+// This function always returns an initialized 'bw' object, even upon error.
+static int EncodeAlphaInternal(const uint8_t* const data, int width, int height,
+                               int method, int filter, int reduce_levels,
+                               int effort_level,  // in [0..6] range
+                               uint8_t* const tmp_alpha,
+                               FilterTrial* result) {
+  int ok = 0;
+  const uint8_t* alpha_src;
+  WebPFilterFunc filter_func;
+  uint8_t header;
+  const size_t data_size = width * height;
+  const uint8_t* output = NULL;
+  size_t output_size = 0;
+  VP8LBitWriter tmp_bw;
+
+  assert((uint64_t)data_size == (uint64_t)width * height);  // as per spec
+  assert(filter >= 0 && filter < WEBP_FILTER_LAST);
+  assert(method >= ALPHA_NO_COMPRESSION);
+  assert(method <= ALPHA_LOSSLESS_COMPRESSION);
+  assert(sizeof(header) == ALPHA_HEADER_LEN);
+
+  filter_func = WebPFilters[filter];
+  if (filter_func != NULL) {
+    filter_func(data, width, height, width, tmp_alpha);
+    alpha_src = tmp_alpha;
+  }  else {
+    alpha_src = data;
+  }
+
+  if (method != ALPHA_NO_COMPRESSION) {
+    ok = VP8LBitWriterInit(&tmp_bw, data_size >> 3);
+    ok = ok && EncodeLossless(alpha_src, width, height, effort_level,
+                              !reduce_levels, &tmp_bw, &result->stats);
+    if (ok) {
+      output = VP8LBitWriterFinish(&tmp_bw);
+      output_size = VP8LBitWriterNumBytes(&tmp_bw);
+      if (output_size > data_size) {
+        // compressed size is larger than source! Revert to uncompressed mode.
+        method = ALPHA_NO_COMPRESSION;
+        VP8LBitWriterWipeOut(&tmp_bw);
+      }
+    } else {
+      VP8LBitWriterWipeOut(&tmp_bw);
+      return 0;
+    }
+  }
+
+  if (method == ALPHA_NO_COMPRESSION) {
+    output = alpha_src;
+    output_size = data_size;
+    ok = 1;
+  }
+
+  // Emit final result.
+  header = method | (filter << 2);
+  if (reduce_levels) header |= ALPHA_PREPROCESSED_LEVELS << 4;
+
+  VP8BitWriterInit(&result->bw, ALPHA_HEADER_LEN + output_size);
+  ok = ok && VP8BitWriterAppend(&result->bw, &header, ALPHA_HEADER_LEN);
+  ok = ok && VP8BitWriterAppend(&result->bw, output, output_size);
+
+  if (method != ALPHA_NO_COMPRESSION) {
+    VP8LBitWriterWipeOut(&tmp_bw);
+  }
+  ok = ok && !result->bw.error_;
+  result->score = VP8BitWriterSize(&result->bw);
+  return ok;
+}
+
+// -----------------------------------------------------------------------------
+
+static int GetNumColors(const uint8_t* data, int width, int height,
+                        int stride) {
+  int j;
+  int colors = 0;
+  uint8_t color[256] = { 0 };
+
+  for (j = 0; j < height; ++j) {
+    int i;
+    const uint8_t* const p = data + j * stride;
+    for (i = 0; i < width; ++i) {
+      color[p[i]] = 1;
+    }
+  }
+  for (j = 0; j < 256; ++j) {
+    if (color[j] > 0) ++colors;
+  }
+  return colors;
+}
+
+#define FILTER_TRY_NONE (1 << WEBP_FILTER_NONE)
+#define FILTER_TRY_ALL ((1 << WEBP_FILTER_LAST) - 1)
+
+// Given the input 'filter' option, return an OR'd bit-set of filters to try.
+static uint32_t GetFilterMap(const uint8_t* alpha, int width, int height,
+                             int filter, int effort_level) {
+  uint32_t bit_map = 0U;
+  if (filter == WEBP_FILTER_FAST) {
+    // Quick estimate of the best candidate.
+    int try_filter_none = (effort_level > 3);
+    const int kMinColorsForFilterNone = 16;
+    const int kMaxColorsForFilterNone = 192;
+    const int num_colors = GetNumColors(alpha, width, height, width);
+    // For low number of colors, NONE yields better compression.
+    filter = (num_colors <= kMinColorsForFilterNone)
+        ? WEBP_FILTER_NONE
+        : WebPEstimateBestFilter(alpha, width, height, width);
+    bit_map |= 1 << filter;
+    // For large number of colors, try FILTER_NONE in addition to the best
+    // filter as well.
+    if (try_filter_none || num_colors > kMaxColorsForFilterNone) {
+      bit_map |= FILTER_TRY_NONE;
+    }
+  } else if (filter == WEBP_FILTER_NONE) {
+    bit_map = FILTER_TRY_NONE;
+  } else {  // WEBP_FILTER_BEST -> try all
+    bit_map = FILTER_TRY_ALL;
+  }
+  return bit_map;
+}
+
+static void InitFilterTrial(FilterTrial* const score) {
+  score->score = (size_t)~0U;
+  VP8BitWriterInit(&score->bw, 0);
+}
+
+static int ApplyFiltersAndEncode(const uint8_t* alpha, int width, int height,
+                                 size_t data_size, int method, int filter,
+                                 int reduce_levels, int effort_level,
+                                 uint8_t** const output,
+                                 size_t* const output_size,
+                                 WebPAuxStats* const stats) {
+  int ok = 1;
+  FilterTrial best;
+  uint32_t try_map =
+      GetFilterMap(alpha, width, height, filter, effort_level);
+  InitFilterTrial(&best);
+
+  if (try_map != FILTER_TRY_NONE) {
+    uint8_t* filtered_alpha =  (uint8_t*)WebPSafeMalloc(1ULL, data_size);
+    if (filtered_alpha == NULL) return 0;
+
+    for (filter = WEBP_FILTER_NONE; ok && try_map; ++filter, try_map >>= 1) {
+      if (try_map & 1) {
+        FilterTrial trial;
+        ok = EncodeAlphaInternal(alpha, width, height, method, filter,
+                                 reduce_levels, effort_level, filtered_alpha,
+                                 &trial);
+        if (ok && trial.score < best.score) {
+          VP8BitWriterWipeOut(&best.bw);
+          best = trial;
+        } else {
+          VP8BitWriterWipeOut(&trial.bw);
+        }
+      }
+    }
+    WebPSafeFree(filtered_alpha);
+  } else {
+    ok = EncodeAlphaInternal(alpha, width, height, method, WEBP_FILTER_NONE,
+                             reduce_levels, effort_level, NULL, &best);
+  }
+  if (ok) {
+#if !defined(WEBP_DISABLE_STATS)
+    if (stats != NULL) {
+      stats->lossless_features = best.stats.lossless_features;
+      stats->histogram_bits = best.stats.histogram_bits;
+      stats->transform_bits = best.stats.transform_bits;
+      stats->cache_bits = best.stats.cache_bits;
+      stats->palette_size = best.stats.palette_size;
+      stats->lossless_size = best.stats.lossless_size;
+      stats->lossless_hdr_size = best.stats.lossless_hdr_size;
+      stats->lossless_data_size = best.stats.lossless_data_size;
+    }
+#else
+    (void)stats;
+#endif
+    *output_size = VP8BitWriterSize(&best.bw);
+    *output = VP8BitWriterBuf(&best.bw);
+  } else {
+    VP8BitWriterWipeOut(&best.bw);
+  }
+  return ok;
+}
+
+static int EncodeAlpha(VP8Encoder* const enc,
+                       int quality, int method, int filter,
+                       int effort_level,
+                       uint8_t** const output, size_t* const output_size) {
+  const WebPPicture* const pic = enc->pic_;
+  const int width = pic->width;
+  const int height = pic->height;
+
+  uint8_t* quant_alpha = NULL;
+  const size_t data_size = width * height;
+  uint64_t sse = 0;
+  int ok = 1;
+  const int reduce_levels = (quality < 100);
+
+  // quick correctness checks
+  assert((uint64_t)data_size == (uint64_t)width * height);  // as per spec
+  assert(enc != NULL && pic != NULL && pic->a != NULL);
+  assert(output != NULL && output_size != NULL);
+  assert(width > 0 && height > 0);
+  assert(pic->a_stride >= width);
+  assert(filter >= WEBP_FILTER_NONE && filter <= WEBP_FILTER_FAST);
+
+  if (quality < 0 || quality > 100) {
+    return 0;
+  }
+
+  if (method < ALPHA_NO_COMPRESSION || method > ALPHA_LOSSLESS_COMPRESSION) {
+    return 0;
+  }
+
+  if (method == ALPHA_NO_COMPRESSION) {
+    // Don't filter, as filtering will make no impact on compressed size.
+    filter = WEBP_FILTER_NONE;
+  }
+
+  quant_alpha = (uint8_t*)WebPSafeMalloc(1ULL, data_size);
+  if (quant_alpha == NULL) {
+    return 0;
+  }
+
+  // Extract alpha data (width x height) from raw_data (stride x height).
+  WebPCopyPlane(pic->a, pic->a_stride, quant_alpha, width, width, height);
+
+  if (reduce_levels) {  // No Quantization required for 'quality = 100'.
+    // 16 alpha levels gives quite a low MSE w.r.t original alpha plane hence
+    // mapped to moderate quality 70. Hence Quality:[0, 70] -> Levels:[2, 16]
+    // and Quality:]70, 100] -> Levels:]16, 256].
+    const int alpha_levels = (quality <= 70) ? (2 + quality / 5)
+                                             : (16 + (quality - 70) * 8);
+    ok = QuantizeLevels(quant_alpha, width, height, alpha_levels, &sse);
+  }
+
+  if (ok) {
+    VP8FiltersInit();
+    ok = ApplyFiltersAndEncode(quant_alpha, width, height, data_size, method,
+                               filter, reduce_levels, effort_level, output,
+                               output_size, pic->stats);
+#if !defined(WEBP_DISABLE_STATS)
+    if (pic->stats != NULL) {  // need stats?
+      pic->stats->coded_size += (int)(*output_size);
+      enc->sse_[3] = sse;
+    }
+#endif
+  }
+
+  WebPSafeFree(quant_alpha);
+  return ok;
+}
+
+//------------------------------------------------------------------------------
+// Main calls
+
+static int CompressAlphaJob(void* arg1, void* unused) {
+  VP8Encoder* const enc = (VP8Encoder*)arg1;
+  const WebPConfig* config = enc->config_;
+  uint8_t* alpha_data = NULL;
+  size_t alpha_size = 0;
+  const int effort_level = config->method;  // maps to [0..6]
+  const WEBP_FILTER_TYPE filter =
+      (config->alpha_filtering == 0) ? WEBP_FILTER_NONE :
+      (config->alpha_filtering == 1) ? WEBP_FILTER_FAST :
+                                       WEBP_FILTER_BEST;
+  if (!EncodeAlpha(enc, config->alpha_quality, config->alpha_compression,
+                   filter, effort_level, &alpha_data, &alpha_size)) {
+    return 0;
+  }
+  if (alpha_size != (uint32_t)alpha_size) {  // Soundness check.
+    WebPSafeFree(alpha_data);
+    return 0;
+  }
+  enc->alpha_data_size_ = (uint32_t)alpha_size;
+  enc->alpha_data_ = alpha_data;
+  (void)unused;
+  return 1;
+}
+
+void VP8EncInitAlpha(VP8Encoder* const enc) {
+  WebPInitAlphaProcessing();
+  enc->has_alpha_ = WebPPictureHasTransparency(enc->pic_);
+  enc->alpha_data_ = NULL;
+  enc->alpha_data_size_ = 0;
+  if (enc->thread_level_ > 0) {
+    WebPWorker* const worker = &enc->alpha_worker_;
+    WebPGetWorkerInterface()->Init(worker);
+    worker->data1 = enc;
+    worker->data2 = NULL;
+    worker->hook = CompressAlphaJob;
+  }
+}
+
+int VP8EncStartAlpha(VP8Encoder* const enc) {
+  if (enc->has_alpha_) {
+    if (enc->thread_level_ > 0) {
+      WebPWorker* const worker = &enc->alpha_worker_;
+      // Makes sure worker is good to go.
+      if (!WebPGetWorkerInterface()->Reset(worker)) {
+        return 0;
+      }
+      WebPGetWorkerInterface()->Launch(worker);
+      return 1;
+    } else {
+      return CompressAlphaJob(enc, NULL);   // just do the job right away
+    }
+  }
+  return 1;
+}
+
+int VP8EncFinishAlpha(VP8Encoder* const enc) {
+  if (enc->has_alpha_) {
+    if (enc->thread_level_ > 0) {
+      WebPWorker* const worker = &enc->alpha_worker_;
+      if (!WebPGetWorkerInterface()->Sync(worker)) return 0;  // error
+    }
+  }
+  return WebPReportProgress(enc->pic_, enc->percent_ + 20, &enc->percent_);
+}
+
+int VP8EncDeleteAlpha(VP8Encoder* const enc) {
+  int ok = 1;
+  if (enc->thread_level_ > 0) {
+    WebPWorker* const worker = &enc->alpha_worker_;
+    // finish anything left in flight
+    ok = WebPGetWorkerInterface()->Sync(worker);
+    // still need to end the worker, even if !ok
+    WebPGetWorkerInterface()->End(worker);
+  }
+  WebPSafeFree(enc->alpha_data_);
+  enc->alpha_data_ = NULL;
+  enc->alpha_data_size_ = 0;
+  enc->has_alpha_ = 0;
+  return ok;
+}
diff --git a/media/libwebp/enc/analysis_enc.c b/media/libwebp/enc/analysis_enc.c
new file mode 100644
index 0000000000..489434b3d1
--- /dev/null
+++ b/media/libwebp/enc/analysis_enc.c
@@ -0,0 +1,475 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Macroblock analysis
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include "../enc/vp8i_enc.h"
+#include "../enc/cost_enc.h"
+#include "../utils/utils.h"
+
+#define MAX_ITERS_K_MEANS  6
+
+//------------------------------------------------------------------------------
+// Smooth the segment map by replacing isolated block by the majority of its
+// neighbours.
+
+static void SmoothSegmentMap(VP8Encoder* const enc) {
+  int n, x, y;
+  const int w = enc->mb_w_;
+  const int h = enc->mb_h_;
+  const int majority_cnt_3_x_3_grid = 5;
+  uint8_t* const tmp = (uint8_t*)WebPSafeMalloc(w * h, sizeof(*tmp));
+  assert((uint64_t)(w * h) == (uint64_t)w * h);   // no overflow, as per spec
+
+  if (tmp == NULL) return;
+  for (y = 1; y < h - 1; ++y) {
+    for (x = 1; x < w - 1; ++x) {
+      int cnt[NUM_MB_SEGMENTS] = { 0 };
+      const VP8MBInfo* const mb = &enc->mb_info_[x + w * y];
+      int majority_seg = mb->segment_;
+      // Check the 8 neighbouring segment values.
+      cnt[mb[-w - 1].segment_]++;  // top-left
+      cnt[mb[-w + 0].segment_]++;  // top
+      cnt[mb[-w + 1].segment_]++;  // top-right
+      cnt[mb[   - 1].segment_]++;  // left
+      cnt[mb[   + 1].segment_]++;  // right
+      cnt[mb[ w - 1].segment_]++;  // bottom-left
+      cnt[mb[ w + 0].segment_]++;  // bottom
+      cnt[mb[ w + 1].segment_]++;  // bottom-right
+      for (n = 0; n < NUM_MB_SEGMENTS; ++n) {
+        if (cnt[n] >= majority_cnt_3_x_3_grid) {
+          majority_seg = n;
+          break;
+        }
+      }
+      tmp[x + y * w] = majority_seg;
+    }
+  }
+  for (y = 1; y < h - 1; ++y) {
+    for (x = 1; x < w - 1; ++x) {
+      VP8MBInfo* const mb = &enc->mb_info_[x + w * y];
+      mb->segment_ = tmp[x + y * w];
+    }
+  }
+  WebPSafeFree(tmp);
+}
+
+//------------------------------------------------------------------------------
+// set segment susceptibility alpha_ / beta_
+
+static WEBP_INLINE int clip(int v, int m, int M) {
+  return (v < m) ? m : (v > M) ? M : v;
+}
+
+static void SetSegmentAlphas(VP8Encoder* const enc,
+                             const int centers[NUM_MB_SEGMENTS],
+                             int mid) {
+  const int nb = enc->segment_hdr_.num_segments_;
+  int min = centers[0], max = centers[0];
+  int n;
+
+  if (nb > 1) {
+    for (n = 0; n < nb; ++n) {
+      if (min > centers[n]) min = centers[n];
+      if (max < centers[n]) max = centers[n];
+    }
+  }
+  if (max == min) max = min + 1;
+  assert(mid <= max && mid >= min);
+  for (n = 0; n < nb; ++n) {
+    const int alpha = 255 * (centers[n] - mid) / (max - min);
+    const int beta = 255 * (centers[n] - min) / (max - min);
+    enc->dqm_[n].alpha_ = clip(alpha, -127, 127);
+    enc->dqm_[n].beta_ = clip(beta, 0, 255);
+  }
+}
+
+//------------------------------------------------------------------------------
+// Compute susceptibility based on DCT-coeff histograms:
+// the higher, the "easier" the macroblock is to compress.
+
+#define MAX_ALPHA 255                // 8b of precision for susceptibilities.
+#define ALPHA_SCALE (2 * MAX_ALPHA)  // scaling factor for alpha.
+#define DEFAULT_ALPHA (-1)
+#define IS_BETTER_ALPHA(alpha, best_alpha) ((alpha) > (best_alpha))
+
+static int FinalAlphaValue(int alpha) {
+  alpha = MAX_ALPHA - alpha;
+  return clip(alpha, 0, MAX_ALPHA);
+}
+
+static int GetAlpha(const VP8Histogram* const histo) {
+  // 'alpha' will later be clipped to [0..MAX_ALPHA] range, clamping outer
+  // values which happen to be mostly noise. This leaves the maximum precision
+  // for handling the useful small values which contribute most.
+  const int max_value = histo->max_value;
+  const int last_non_zero = histo->last_non_zero;
+  const int alpha =
+      (max_value > 1) ? ALPHA_SCALE * last_non_zero / max_value : 0;
+  return alpha;
+}
+
+static void InitHistogram(VP8Histogram* const histo) {
+  histo->max_value = 0;
+  histo->last_non_zero = 1;
+}
+
+//------------------------------------------------------------------------------
+// Simplified k-Means, to assign Nb segments based on alpha-histogram
+
+static void AssignSegments(VP8Encoder* const enc,
+                           const int alphas[MAX_ALPHA + 1]) {
+  // 'num_segments_' is previously validated and <= NUM_MB_SEGMENTS, but an
+  // explicit check is needed to avoid spurious warning about 'n + 1' exceeding
+  // array bounds of 'centers' with some compilers (noticed with gcc-4.9).
+  const int nb = (enc->segment_hdr_.num_segments_ < NUM_MB_SEGMENTS) ?
+                 enc->segment_hdr_.num_segments_ : NUM_MB_SEGMENTS;
+  int centers[NUM_MB_SEGMENTS];
+  int weighted_average = 0;
+  int map[MAX_ALPHA + 1];
+  int a, n, k;
+  int min_a = 0, max_a = MAX_ALPHA, range_a;
+  // 'int' type is ok for histo, and won't overflow
+  int accum[NUM_MB_SEGMENTS], dist_accum[NUM_MB_SEGMENTS];
+
+  assert(nb >= 1);
+  assert(nb <= NUM_MB_SEGMENTS);
+
+  // bracket the input
+  for (n = 0; n <= MAX_ALPHA && alphas[n] == 0; ++n) {}
+  min_a = n;
+  for (n = MAX_ALPHA; n > min_a && alphas[n] == 0; --n) {}
+  max_a = n;
+  range_a = max_a - min_a;
+
+  // Spread initial centers evenly
+  for (k = 0, n = 1; k < nb; ++k, n += 2) {
+    assert(n < 2 * nb);
+    centers[k] = min_a + (n * range_a) / (2 * nb);
+  }
+
+  for (k = 0; k < MAX_ITERS_K_MEANS; ++k) {     // few iters are enough
+    int total_weight;
+    int displaced;
+    // Reset stats
+    for (n = 0; n < nb; ++n) {
+      accum[n] = 0;
+      dist_accum[n] = 0;
+    }
+    // Assign nearest center for each 'a'
+    n = 0;    // track the nearest center for current 'a'
+    for (a = min_a; a <= max_a; ++a) {
+      if (alphas[a]) {
+        while (n + 1 < nb && abs(a - centers[n + 1]) < abs(a - centers[n])) {
+          n++;
+        }
+        map[a] = n;
+        // accumulate contribution into best centroid
+        dist_accum[n] += a * alphas[a];
+        accum[n] += alphas[a];
+      }
+    }
+    // All point are classified. Move the centroids to the
+    // center of their respective cloud.
+    displaced = 0;
+    weighted_average = 0;
+    total_weight = 0;
+    for (n = 0; n < nb; ++n) {
+      if (accum[n]) {
+        const int new_center = (dist_accum[n] + accum[n] / 2) / accum[n];
+        displaced += abs(centers[n] - new_center);
+        centers[n] = new_center;
+        weighted_average += new_center * accum[n];
+        total_weight += accum[n];
+      }
+    }
+    weighted_average = (weighted_average + total_weight / 2) / total_weight;
+    if (displaced < 5) break;   // no need to keep on looping...
+  }
+
+  // Map each original value to the closest centroid
+  for (n = 0; n < enc->mb_w_ * enc->mb_h_; ++n) {
+    VP8MBInfo* const mb = &enc->mb_info_[n];
+    const int alpha = mb->alpha_;
+    mb->segment_ = map[alpha];
+    mb->alpha_ = centers[map[alpha]];  // for the record.
+  }
+
+  if (nb > 1) {
+    const int smooth = (enc->config_->preprocessing & 1);
+    if (smooth) SmoothSegmentMap(enc);
+  }
+
+  SetSegmentAlphas(enc, centers, weighted_average);  // pick some alphas.
+}
+
+//------------------------------------------------------------------------------
+// Macroblock analysis: collect histogram for each mode, deduce the maximal
+// susceptibility and set best modes for this macroblock.
+// Segment assignment is done later.
+
+// Number of modes to inspect for alpha_ evaluation. We don't need to test all
+// the possible modes during the analysis phase: we risk falling into a local
+// optimum, or be subject to boundary effect
+#define MAX_INTRA16_MODE 2
+#define MAX_INTRA4_MODE  2
+#define MAX_UV_MODE      2
+
+static int MBAnalyzeBestIntra16Mode(VP8EncIterator* const it) {
+  const int max_mode = MAX_INTRA16_MODE;
+  int mode;
+  int best_alpha = DEFAULT_ALPHA;
+  int best_mode = 0;
+
+  VP8MakeLuma16Preds(it);
+  for (mode = 0; mode < max_mode; ++mode) {
+    VP8Histogram histo;
+    int alpha;
+
+    InitHistogram(&histo);
+    VP8CollectHistogram(it->yuv_in_ + Y_OFF_ENC,
+                        it->yuv_p_ + VP8I16ModeOffsets[mode],
+                        0, 16, &histo);
+    alpha = GetAlpha(&histo);
+    if (IS_BETTER_ALPHA(alpha, best_alpha)) {
+      best_alpha = alpha;
+      best_mode = mode;
+    }
+  }
+  VP8SetIntra16Mode(it, best_mode);
+  return best_alpha;
+}
+
+static int FastMBAnalyze(VP8EncIterator* const it) {
+  // Empirical cut-off value, should be around 16 (~=block size). We use the
+  // [8-17] range and favor intra4 at high quality, intra16 for low quality.
+  const int q = (int)it->enc_->config_->quality;
+  const uint32_t kThreshold = 8 + (17 - 8) * q / 100;
+  int k;
+  uint32_t dc[16], m, m2;
+  for (k = 0; k < 16; k += 4) {
+    VP8Mean16x4(it->yuv_in_ + Y_OFF_ENC + k * BPS, &dc[k]);
+  }
+  for (m = 0, m2 = 0, k = 0; k < 16; ++k) {
+    m += dc[k];
+    m2 += dc[k] * dc[k];
+  }
+  if (kThreshold * m2 < m * m) {
+    VP8SetIntra16Mode(it, 0);   // DC16
+  } else {
+    const uint8_t modes[16] = { 0 };  // DC4
+    VP8SetIntra4Mode(it, modes);
+  }
+  return 0;
+}
+
+static int MBAnalyzeBestUVMode(VP8EncIterator* const it) {
+  int best_alpha = DEFAULT_ALPHA;
+  int smallest_alpha = 0;
+  int best_mode = 0;
+  const int max_mode = MAX_UV_MODE;
+  int mode;
+
+  VP8MakeChroma8Preds(it);
+  for (mode = 0; mode < max_mode; ++mode) {
+    VP8Histogram histo;
+    int alpha;
+    InitHistogram(&histo);
+    VP8CollectHistogram(it->yuv_in_ + U_OFF_ENC,
+                        it->yuv_p_ + VP8UVModeOffsets[mode],
+                        16, 16 + 4 + 4, &histo);
+    alpha = GetAlpha(&histo);
+    if (IS_BETTER_ALPHA(alpha, best_alpha)) {
+      best_alpha = alpha;
+    }
+    // The best prediction mode tends to be the one with the smallest alpha.
+    if (mode == 0 || alpha < smallest_alpha) {
+      smallest_alpha = alpha;
+      best_mode = mode;
+    }
+  }
+  VP8SetIntraUVMode(it, best_mode);
+  return best_alpha;
+}
+
+static void MBAnalyze(VP8EncIterator* const it,
+                      int alphas[MAX_ALPHA + 1],
+                      int* const alpha, int* const uv_alpha) {
+  const VP8Encoder* const enc = it->enc_;
+  int best_alpha, best_uv_alpha;
+
+  VP8SetIntra16Mode(it, 0);  // default: Intra16, DC_PRED
+  VP8SetSkip(it, 0);         // not skipped
+  VP8SetSegment(it, 0);      // default segment, spec-wise.
+
+  if (enc->method_ <= 1) {
+    best_alpha = FastMBAnalyze(it);
+  } else {
+    best_alpha = MBAnalyzeBestIntra16Mode(it);
+  }
+  best_uv_alpha = MBAnalyzeBestUVMode(it);
+
+  // Final susceptibility mix
+  best_alpha = (3 * best_alpha + best_uv_alpha + 2) >> 2;
+  best_alpha = FinalAlphaValue(best_alpha);
+  alphas[best_alpha]++;
+  it->mb_->alpha_ = best_alpha;   // for later remapping.
+
+  // Accumulate for later complexity analysis.
+  *alpha += best_alpha;   // mixed susceptibility (not just luma)
+  *uv_alpha += best_uv_alpha;
+}
+
+static void DefaultMBInfo(VP8MBInfo* const mb) {
+  mb->type_ = 1;     // I16x16
+  mb->uv_mode_ = 0;
+  mb->skip_ = 0;     // not skipped
+  mb->segment_ = 0;  // default segment
+  mb->alpha_ = 0;
+}
+
+//------------------------------------------------------------------------------
+// Main analysis loop:
+// Collect all susceptibilities for each macroblock and record their
+// distribution in alphas[]. Segments is assigned a-posteriori, based on
+// this histogram.
+// We also pick an intra16 prediction mode, which shouldn't be considered
+// final except for fast-encode settings. We can also pick some intra4 modes
+// and decide intra4/intra16, but that's usually almost always a bad choice at
+// this stage.
+
+static void ResetAllMBInfo(VP8Encoder* const enc) {
+  int n;
+  for (n = 0; n < enc->mb_w_ * enc->mb_h_; ++n) {
+    DefaultMBInfo(&enc->mb_info_[n]);
+  }
+  // Default susceptibilities.
+  enc->dqm_[0].alpha_ = 0;
+  enc->dqm_[0].beta_ = 0;
+  // Note: we can't compute this alpha_ / uv_alpha_ -> set to default value.
+  enc->alpha_ = 0;
+  enc->uv_alpha_ = 0;
+  WebPReportProgress(enc->pic_, enc->percent_ + 20, &enc->percent_);
+}
+
+// struct used to collect job result
+typedef struct {
+  WebPWorker worker;
+  int alphas[MAX_ALPHA + 1];
+  int alpha, uv_alpha;
+  VP8EncIterator it;
+  int delta_progress;
+} SegmentJob;
+
+// main work call
+static int DoSegmentsJob(void* arg1, void* arg2) {
+  SegmentJob* const job = (SegmentJob*)arg1;
+  VP8EncIterator* const it = (VP8EncIterator*)arg2;
+  int ok = 1;
+  if (!VP8IteratorIsDone(it)) {
+    uint8_t tmp[32 + WEBP_ALIGN_CST];
+    uint8_t* const scratch = (uint8_t*)WEBP_ALIGN(tmp);
+    do {
+      // Let's pretend we have perfect lossless reconstruction.
+      VP8IteratorImport(it, scratch);
+      MBAnalyze(it, job->alphas, &job->alpha, &job->uv_alpha);
+      ok = VP8IteratorProgress(it, job->delta_progress);
+    } while (ok && VP8IteratorNext(it));
+  }
+  return ok;
+}
+
+static void MergeJobs(const SegmentJob* const src, SegmentJob* const dst) {
+  int i;
+  for (i = 0; i <= MAX_ALPHA; ++i) dst->alphas[i] += src->alphas[i];
+  dst->alpha += src->alpha;
+  dst->uv_alpha += src->uv_alpha;
+}
+
+// initialize the job struct with some tasks to perform
+static void InitSegmentJob(VP8Encoder* const enc, SegmentJob* const job,
+                           int start_row, int end_row) {
+  WebPGetWorkerInterface()->Init(&job->worker);
+  job->worker.data1 = job;
+  job->worker.data2 = &job->it;
+  job->worker.hook = DoSegmentsJob;
+  VP8IteratorInit(enc, &job->it);
+  VP8IteratorSetRow(&job->it, start_row);
+  VP8IteratorSetCountDown(&job->it, (end_row - start_row) * enc->mb_w_);
+  memset(job->alphas, 0, sizeof(job->alphas));
+  job->alpha = 0;
+  job->uv_alpha = 0;
+  // only one of both jobs can record the progress, since we don't
+  // expect the user's hook to be multi-thread safe
+  job->delta_progress = (start_row == 0) ? 20 : 0;
+}
+
+// main entry point
+int VP8EncAnalyze(VP8Encoder* const enc) {
+  int ok = 1;
+  const int do_segments =
+      enc->config_->emulate_jpeg_size ||   // We need the complexity evaluation.
+      (enc->segment_hdr_.num_segments_ > 1) ||
+      (enc->method_ <= 1);  // for method 0 - 1, we need preds_[] to be filled.
+  if (do_segments) {
+    const int last_row = enc->mb_h_;
+    // We give a little more than a half work to the main thread.
+    const int split_row = (9 * last_row + 15) >> 4;
+    const int total_mb = last_row * enc->mb_w_;
+#ifdef WEBP_USE_THREAD
+    const int kMinSplitRow = 2;  // minimal rows needed for mt to be worth it
+    const int do_mt = (enc->thread_level_ > 0) && (split_row >= kMinSplitRow);
+#else
+    const int do_mt = 0;
+#endif
+    const WebPWorkerInterface* const worker_interface =
+        WebPGetWorkerInterface();
+    SegmentJob main_job;
+    if (do_mt) {
+      SegmentJob side_job;
+      // Note the use of '&' instead of '&&' because we must call the functions
+      // no matter what.
+      InitSegmentJob(enc, &main_job, 0, split_row);
+      InitSegmentJob(enc, &side_job, split_row, last_row);
+      // we don't need to call Reset() on main_job.worker, since we're calling
+      // WebPWorkerExecute() on it
+      ok &= worker_interface->Reset(&side_job.worker);
+      // launch the two jobs in parallel
+      if (ok) {
+        worker_interface->Launch(&side_job.worker);
+        worker_interface->Execute(&main_job.worker);
+        ok &= worker_interface->Sync(&side_job.worker);
+        ok &= worker_interface->Sync(&main_job.worker);
+      }
+      worker_interface->End(&side_job.worker);
+      if (ok) MergeJobs(&side_job, &main_job);  // merge results together
+    } else {
+      // Even for single-thread case, we use the generic Worker tools.
+      InitSegmentJob(enc, &main_job, 0, last_row);
+      worker_interface->Execute(&main_job.worker);
+      ok &= worker_interface->Sync(&main_job.worker);
+    }
+    worker_interface->End(&main_job.worker);
+    if (ok) {
+      enc->alpha_ = main_job.alpha / total_mb;
+      enc->uv_alpha_ = main_job.uv_alpha / total_mb;
+      AssignSegments(enc, main_job.alphas);
+    }
+  } else {   // Use only one default segment.
+    ResetAllMBInfo(enc);
+  }
+  return ok;
+}
+
diff --git a/media/libwebp/enc/backward_references_cost_enc.c b/media/libwebp/enc/backward_references_cost_enc.c
new file mode 100644
index 0000000000..59e2c0f611
--- /dev/null
+++ b/media/libwebp/enc/backward_references_cost_enc.c
@@ -0,0 +1,790 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Improves a given set of backward references by analyzing its bit cost.
+// The algorithm is similar to the Zopfli compression algorithm but tailored to
+// images.
+//
+// Author: Vincent Rabaud (vrabaud@google.com)
+//
+
+#include <assert.h>
+
+#include "../enc/backward_references_enc.h"
+#include "../enc/histogram_enc.h"
+#include "../dsp/lossless_common.h"
+#include "../utils/color_cache_utils.h"
+#include "../utils/utils.h"
+
+#define VALUES_IN_BYTE 256
+
+extern void VP8LClearBackwardRefs(VP8LBackwardRefs* const refs);
+extern int VP8LDistanceToPlaneCode(int xsize, int dist);
+extern void VP8LBackwardRefsCursorAdd(VP8LBackwardRefs* const refs,
+                                      const PixOrCopy v);
+
+typedef struct {
+  double alpha_[VALUES_IN_BYTE];
+  double red_[VALUES_IN_BYTE];
+  double blue_[VALUES_IN_BYTE];
+  double distance_[NUM_DISTANCE_CODES];
+  double* literal_;
+} CostModel;
+
+static void ConvertPopulationCountTableToBitEstimates(
+    int num_symbols, const uint32_t population_counts[], double output[]) {
+  uint32_t sum = 0;
+  int nonzeros = 0;
+  int i;
+  for (i = 0; i < num_symbols; ++i) {
+    sum += population_counts[i];
+    if (population_counts[i] > 0) {
+      ++nonzeros;
+    }
+  }
+  if (nonzeros <= 1) {
+    memset(output, 0, num_symbols * sizeof(*output));
+  } else {
+    const double logsum = VP8LFastLog2(sum);
+    for (i = 0; i < num_symbols; ++i) {
+      output[i] = logsum - VP8LFastLog2(population_counts[i]);
+    }
+  }
+}
+
+static int CostModelBuild(CostModel* const m, int xsize, int cache_bits,
+                          const VP8LBackwardRefs* const refs) {
+  int ok = 0;
+  VP8LRefsCursor c = VP8LRefsCursorInit(refs);
+  VP8LHistogram* const histo = VP8LAllocateHistogram(cache_bits);
+  if (histo == NULL) goto Error;
+
+  // The following code is similar to VP8LHistogramCreate but converts the
+  // distance to plane code.
+  VP8LHistogramInit(histo, cache_bits, /*init_arrays=*/ 1);
+  while (VP8LRefsCursorOk(&c)) {
+    VP8LHistogramAddSinglePixOrCopy(histo, c.cur_pos, VP8LDistanceToPlaneCode,
+                                    xsize);
+    VP8LRefsCursorNext(&c);
+  }
+
+  ConvertPopulationCountTableToBitEstimates(
+      VP8LHistogramNumCodes(histo->palette_code_bits_),
+      histo->literal_, m->literal_);
+  ConvertPopulationCountTableToBitEstimates(
+      VALUES_IN_BYTE, histo->red_, m->red_);
+  ConvertPopulationCountTableToBitEstimates(
+      VALUES_IN_BYTE, histo->blue_, m->blue_);
+  ConvertPopulationCountTableToBitEstimates(
+      VALUES_IN_BYTE, histo->alpha_, m->alpha_);
+  ConvertPopulationCountTableToBitEstimates(
+      NUM_DISTANCE_CODES, histo->distance_, m->distance_);
+  ok = 1;
+
+ Error:
+  VP8LFreeHistogram(histo);
+  return ok;
+}
+
+static WEBP_INLINE double GetLiteralCost(const CostModel* const m, uint32_t v) {
+  return m->alpha_[v >> 24] +
+         m->red_[(v >> 16) & 0xff] +
+         m->literal_[(v >> 8) & 0xff] +
+         m->blue_[v & 0xff];
+}
+
+static WEBP_INLINE double GetCacheCost(const CostModel* const m, uint32_t idx) {
+  const int literal_idx = VALUES_IN_BYTE + NUM_LENGTH_CODES + idx;
+  return m->literal_[literal_idx];
+}
+
+static WEBP_INLINE double GetLengthCost(const CostModel* const m,
+                                        uint32_t length) {
+  int code, extra_bits;
+  VP8LPrefixEncodeBits(length, &code, &extra_bits);
+  return m->literal_[VALUES_IN_BYTE + code] + extra_bits;
+}
+
+static WEBP_INLINE double GetDistanceCost(const CostModel* const m,
+                                          uint32_t distance) {
+  int code, extra_bits;
+  VP8LPrefixEncodeBits(distance, &code, &extra_bits);
+  return m->distance_[code] + extra_bits;
+}
+
+static WEBP_INLINE void AddSingleLiteralWithCostModel(
+    const uint32_t* const argb, VP8LColorCache* const hashers,
+    const CostModel* const cost_model, int idx, int use_color_cache,
+    float prev_cost, float* const cost, uint16_t* const dist_array) {
+  double cost_val = prev_cost;
+  const uint32_t color = argb[idx];
+  const int ix = use_color_cache ? VP8LColorCacheContains(hashers, color) : -1;
+  if (ix >= 0) {
+    // use_color_cache is true and hashers contains color
+    const double mul0 = 0.68;
+    cost_val += GetCacheCost(cost_model, ix) * mul0;
+  } else {
+    const double mul1 = 0.82;
+    if (use_color_cache) VP8LColorCacheInsert(hashers, color);
+    cost_val += GetLiteralCost(cost_model, color) * mul1;
+  }
+  if (cost[idx] > cost_val) {
+    cost[idx] = (float)cost_val;
+    dist_array[idx] = 1;  // only one is inserted.
+  }
+}
+
+// -----------------------------------------------------------------------------
+// CostManager and interval handling
+
+// Empirical value to avoid high memory consumption but good for performance.
+#define COST_CACHE_INTERVAL_SIZE_MAX 500
+
+// To perform backward reference every pixel at index index_ is considered and
+// the cost for the MAX_LENGTH following pixels computed. Those following pixels
+// at index index_ + k (k from 0 to MAX_LENGTH) have a cost of:
+//     cost_ = distance cost at index + GetLengthCost(cost_model, k)
+// and the minimum value is kept. GetLengthCost(cost_model, k) is cached in an
+// array of size MAX_LENGTH.
+// Instead of performing MAX_LENGTH comparisons per pixel, we keep track of the
+// minimal values using intervals of constant cost.
+// An interval is defined by the index_ of the pixel that generated it and
+// is only useful in a range of indices from start_ to end_ (exclusive), i.e.
+// it contains the minimum value for pixels between start_ and end_.
+// Intervals are stored in a linked list and ordered by start_. When a new
+// interval has a better value, old intervals are split or removed. There are
+// therefore no overlapping intervals.
+typedef struct CostInterval CostInterval;
+struct CostInterval {
+  float cost_;
+  int start_;
+  int end_;
+  int index_;
+  CostInterval* previous_;
+  CostInterval* next_;
+};
+
+// The GetLengthCost(cost_model, k) are cached in a CostCacheInterval.
+typedef struct {
+  double cost_;
+  int start_;
+  int end_;       // Exclusive.
+} CostCacheInterval;
+
+// This structure is in charge of managing intervals and costs.
+// It caches the different CostCacheInterval, caches the different
+// GetLengthCost(cost_model, k) in cost_cache_ and the CostInterval's (whose
+// count_ is limited by COST_CACHE_INTERVAL_SIZE_MAX).
+#define COST_MANAGER_MAX_FREE_LIST 10
+typedef struct {
+  CostInterval* head_;
+  int count_;  // The number of stored intervals.
+  CostCacheInterval* cache_intervals_;
+  size_t cache_intervals_size_;
+  double cost_cache_[MAX_LENGTH];  // Contains the GetLengthCost(cost_model, k).
+  float* costs_;
+  uint16_t* dist_array_;
+  // Most of the time, we only need few intervals -> use a free-list, to avoid
+  // fragmentation with small allocs in most common cases.
+  CostInterval intervals_[COST_MANAGER_MAX_FREE_LIST];
+  CostInterval* free_intervals_;
+  // These are regularly malloc'd remains. This list can't grow larger than than
+  // size COST_CACHE_INTERVAL_SIZE_MAX - COST_MANAGER_MAX_FREE_LIST, note.
+  CostInterval* recycled_intervals_;
+} CostManager;
+
+static void CostIntervalAddToFreeList(CostManager* const manager,
+                                      CostInterval* const interval) {
+  interval->next_ = manager->free_intervals_;
+  manager->free_intervals_ = interval;
+}
+
+static int CostIntervalIsInFreeList(const CostManager* const manager,
+                                    const CostInterval* const interval) {
+  return (interval >= &manager->intervals_[0] &&
+          interval <= &manager->intervals_[COST_MANAGER_MAX_FREE_LIST - 1]);
+}
+
+static void CostManagerInitFreeList(CostManager* const manager) {
+  int i;
+  manager->free_intervals_ = NULL;
+  for (i = 0; i < COST_MANAGER_MAX_FREE_LIST; ++i) {
+    CostIntervalAddToFreeList(manager, &manager->intervals_[i]);
+  }
+}
+
+static void DeleteIntervalList(CostManager* const manager,
+                               const CostInterval* interval) {
+  while (interval != NULL) {
+    const CostInterval* const next = interval->next_;
+    if (!CostIntervalIsInFreeList(manager, interval)) {
+      WebPSafeFree((void*)interval);
+    }  // else: do nothing
+    interval = next;
+  }
+}
+
+static void CostManagerClear(CostManager* const manager) {
+  if (manager == NULL) return;
+
+  WebPSafeFree(manager->costs_);
+  WebPSafeFree(manager->cache_intervals_);
+
+  // Clear the interval lists.
+  DeleteIntervalList(manager, manager->head_);
+  manager->head_ = NULL;
+  DeleteIntervalList(manager, manager->recycled_intervals_);
+  manager->recycled_intervals_ = NULL;
+
+  // Reset pointers, count_ and cache_intervals_size_.
+  memset(manager, 0, sizeof(*manager));
+  CostManagerInitFreeList(manager);
+}
+
+static int CostManagerInit(CostManager* const manager,
+                           uint16_t* const dist_array, int pix_count,
+                           const CostModel* const cost_model) {
+  int i;
+  const int cost_cache_size = (pix_count > MAX_LENGTH) ? MAX_LENGTH : pix_count;
+
+  manager->costs_ = NULL;
+  manager->cache_intervals_ = NULL;
+  manager->head_ = NULL;
+  manager->recycled_intervals_ = NULL;
+  manager->count_ = 0;
+  manager->dist_array_ = dist_array;
+  CostManagerInitFreeList(manager);
+
+  // Fill in the cost_cache_.
+  manager->cache_intervals_size_ = 1;
+  manager->cost_cache_[0] = GetLengthCost(cost_model, 0);
+  for (i = 1; i < cost_cache_size; ++i) {
+    manager->cost_cache_[i] = GetLengthCost(cost_model, i);
+    // Get the number of bound intervals.
+    if (manager->cost_cache_[i] != manager->cost_cache_[i - 1]) {
+      ++manager->cache_intervals_size_;
+    }
+  }
+
+  // With the current cost model, we usually have below 20 intervals.
+  // The worst case scenario with a cost model would be if every length has a
+  // different cost, hence MAX_LENGTH but that is impossible with the current
+  // implementation that spirals around a pixel.
+  assert(manager->cache_intervals_size_ <= MAX_LENGTH);
+  manager->cache_intervals_ = (CostCacheInterval*)WebPSafeMalloc(
+      manager->cache_intervals_size_, sizeof(*manager->cache_intervals_));
+  if (manager->cache_intervals_ == NULL) {
+    CostManagerClear(manager);
+    return 0;
+  }
+
+  // Fill in the cache_intervals_.
+  {
+    CostCacheInterval* cur = manager->cache_intervals_;
+
+    // Consecutive values in cost_cache_ are compared and if a big enough
+    // difference is found, a new interval is created and bounded.
+    cur->start_ = 0;
+    cur->end_ = 1;
+    cur->cost_ = manager->cost_cache_[0];
+    for (i = 1; i < cost_cache_size; ++i) {
+      const double cost_val = manager->cost_cache_[i];
+      if (cost_val != cur->cost_) {
+        ++cur;
+        // Initialize an interval.
+        cur->start_ = i;
+        cur->cost_ = cost_val;
+      }
+      cur->end_ = i + 1;
+    }
+  }
+
+  manager->costs_ = (float*)WebPSafeMalloc(pix_count, sizeof(*manager->costs_));
+  if (manager->costs_ == NULL) {
+    CostManagerClear(manager);
+    return 0;
+  }
+  // Set the initial costs_ high for every pixel as we will keep the minimum.
+  for (i = 0; i < pix_count; ++i) manager->costs_[i] = 1e38f;
+
+  return 1;
+}
+
+// Given the cost and the position that define an interval, update the cost at
+// pixel 'i' if it is smaller than the previously computed value.
+static WEBP_INLINE void UpdateCost(CostManager* const manager, int i,
+                                   int position, float cost) {
+  const int k = i - position;
+  assert(k >= 0 && k < MAX_LENGTH);
+
+  if (manager->costs_[i] > cost) {
+    manager->costs_[i] = cost;
+    manager->dist_array_[i] = k + 1;
+  }
+}
+
+// Given the cost and the position that define an interval, update the cost for
+// all the pixels between 'start' and 'end' excluded.
+static WEBP_INLINE void UpdateCostPerInterval(CostManager* const manager,
+                                              int start, int end, int position,
+                                              float cost) {
+  int i;
+  for (i = start; i < end; ++i) UpdateCost(manager, i, position, cost);
+}
+
+// Given two intervals, make 'prev' be the previous one of 'next' in 'manager'.
+static WEBP_INLINE void ConnectIntervals(CostManager* const manager,
+                                         CostInterval* const prev,
+                                         CostInterval* const next) {
+  if (prev != NULL) {
+    prev->next_ = next;
+  } else {
+    manager->head_ = next;
+  }
+
+  if (next != NULL) next->previous_ = prev;
+}
+
+// Pop an interval in the manager.
+static WEBP_INLINE void PopInterval(CostManager* const manager,
+                                    CostInterval* const interval) {
+  if (interval == NULL) return;
+
+  ConnectIntervals(manager, interval->previous_, interval->next_);
+  if (CostIntervalIsInFreeList(manager, interval)) {
+    CostIntervalAddToFreeList(manager, interval);
+  } else {  // recycle regularly malloc'd intervals too
+    interval->next_ = manager->recycled_intervals_;
+    manager->recycled_intervals_ = interval;
+  }
+  --manager->count_;
+  assert(manager->count_ >= 0);
+}
+
+// Update the cost at index i by going over all the stored intervals that
+// overlap with i.
+// If 'do_clean_intervals' is set to something different than 0, intervals that
+// end before 'i' will be popped.
+static WEBP_INLINE void UpdateCostAtIndex(CostManager* const manager, int i,
+                                          int do_clean_intervals) {
+  CostInterval* current = manager->head_;
+
+  while (current != NULL && current->start_ <= i) {
+    CostInterval* const next = current->next_;
+    if (current->end_ <= i) {
+      if (do_clean_intervals) {
+        // We have an outdated interval, remove it.
+        PopInterval(manager, current);
+      }
+    } else {
+      UpdateCost(manager, i, current->index_, current->cost_);
+    }
+    current = next;
+  }
+}
+
+// Given a current orphan interval and its previous interval, before
+// it was orphaned (which can be NULL), set it at the right place in the list
+// of intervals using the start_ ordering and the previous interval as a hint.
+static WEBP_INLINE void PositionOrphanInterval(CostManager* const manager,
+                                               CostInterval* const current,
+                                               CostInterval* previous) {
+  assert(current != NULL);
+
+  if (previous == NULL) previous = manager->head_;
+  while (previous != NULL && current->start_ < previous->start_) {
+    previous = previous->previous_;
+  }
+  while (previous != NULL && previous->next_ != NULL &&
+         previous->next_->start_ < current->start_) {
+    previous = previous->next_;
+  }
+
+  if (previous != NULL) {
+    ConnectIntervals(manager, current, previous->next_);
+  } else {
+    ConnectIntervals(manager, current, manager->head_);
+  }
+  ConnectIntervals(manager, previous, current);
+}
+
+// Insert an interval in the list contained in the manager by starting at
+// interval_in as a hint. The intervals are sorted by start_ value.
+static WEBP_INLINE void InsertInterval(CostManager* const manager,
+                                       CostInterval* const interval_in,
+                                       float cost, int position, int start,
+                                       int end) {
+  CostInterval* interval_new;
+
+  if (start >= end) return;
+  if (manager->count_ >= COST_CACHE_INTERVAL_SIZE_MAX) {
+    // Serialize the interval if we cannot store it.
+    UpdateCostPerInterval(manager, start, end, position, cost);
+    return;
+  }
+  if (manager->free_intervals_ != NULL) {
+    interval_new = manager->free_intervals_;
+    manager->free_intervals_ = interval_new->next_;
+  } else if (manager->recycled_intervals_ != NULL) {
+    interval_new = manager->recycled_intervals_;
+    manager->recycled_intervals_ = interval_new->next_;
+  } else {  // malloc for good
+    interval_new = (CostInterval*)WebPSafeMalloc(1, sizeof(*interval_new));
+    if (interval_new == NULL) {
+      // Write down the interval if we cannot create it.
+      UpdateCostPerInterval(manager, start, end, position, cost);
+      return;
+    }
+  }
+
+  interval_new->cost_ = cost;
+  interval_new->index_ = position;
+  interval_new->start_ = start;
+  interval_new->end_ = end;
+  PositionOrphanInterval(manager, interval_new, interval_in);
+
+  ++manager->count_;
+}
+
+// Given a new cost interval defined by its start at position, its length value
+// and distance_cost, add its contributions to the previous intervals and costs.
+// If handling the interval or one of its subintervals becomes to heavy, its
+// contribution is added to the costs right away.
+static WEBP_INLINE void PushInterval(CostManager* const manager,
+                                     double distance_cost, int position,
+                                     int len) {
+  size_t i;
+  CostInterval* interval = manager->head_;
+  CostInterval* interval_next;
+  const CostCacheInterval* const cost_cache_intervals =
+      manager->cache_intervals_;
+  // If the interval is small enough, no need to deal with the heavy
+  // interval logic, just serialize it right away. This constant is empirical.
+  const int kSkipDistance = 10;
+
+  if (len < kSkipDistance) {
+    int j;
+    for (j = position; j < position + len; ++j) {
+      const int k = j - position;
+      float cost_tmp;
+      assert(k >= 0 && k < MAX_LENGTH);
+      cost_tmp = (float)(distance_cost + manager->cost_cache_[k]);
+
+      if (manager->costs_[j] > cost_tmp) {
+        manager->costs_[j] = cost_tmp;
+        manager->dist_array_[j] = k + 1;
+      }
+    }
+    return;
+  }
+
+  for (i = 0; i < manager->cache_intervals_size_ &&
+              cost_cache_intervals[i].start_ < len;
+       ++i) {
+    // Define the intersection of the ith interval with the new one.
+    int start = position + cost_cache_intervals[i].start_;
+    const int end = position + (cost_cache_intervals[i].end_ > len
+                                 ? len
+                                 : cost_cache_intervals[i].end_);
+    const float cost = (float)(distance_cost + cost_cache_intervals[i].cost_);
+
+    for (; interval != NULL && interval->start_ < end;
+         interval = interval_next) {
+      interval_next = interval->next_;
+
+      // Make sure we have some overlap
+      if (start >= interval->end_) continue;
+
+      if (cost >= interval->cost_) {
+        // When intervals are represented, the lower, the better.
+        // [**********************************************************[
+        // start                                                    end
+        //                   [----------------------------------[
+        //                   interval->start_       interval->end_
+        // If we are worse than what we already have, add whatever we have so
+        // far up to interval.
+        const int start_new = interval->end_;
+        InsertInterval(manager, interval, cost, position, start,
+                       interval->start_);
+        start = start_new;
+        if (start >= end) break;
+        continue;
+      }
+
+      if (start <= interval->start_) {
+        if (interval->end_ <= end) {
+          //                   [----------------------------------[
+          //                   interval->start_       interval->end_
+          // [**************************************************************[
+          // start                                                        end
+          // We can safely remove the old interval as it is fully included.
+          PopInterval(manager, interval);
+        } else {
+          //              [------------------------------------[
+          //              interval->start_        interval->end_
+          // [*****************************[
+          // start                       end
+          interval->start_ = end;
+          break;
+        }
+      } else {
+        if (end < interval->end_) {
+          // [--------------------------------------------------------------[
+          // interval->start_                                  interval->end_
+          //                     [*****************************[
+          //                     start                       end
+          // We have to split the old interval as it fully contains the new one.
+          const int end_original = interval->end_;
+          interval->end_ = start;
+          InsertInterval(manager, interval, interval->cost_, interval->index_,
+                         end, end_original);
+          interval = interval->next_;
+          break;
+        } else {
+          // [------------------------------------[
+          // interval->start_        interval->end_
+          //                     [*****************************[
+          //                     start                       end
+          interval->end_ = start;
+        }
+      }
+    }
+    // Insert the remaining interval from start to end.
+    InsertInterval(manager, interval, cost, position, start, end);
+  }
+}
+
+static int BackwardReferencesHashChainDistanceOnly(
+    int xsize, int ysize, const uint32_t* const argb, int cache_bits,
+    const VP8LHashChain* const hash_chain, const VP8LBackwardRefs* const refs,
+    uint16_t* const dist_array) {
+  int i;
+  int ok = 0;
+  int cc_init = 0;
+  const int pix_count = xsize * ysize;
+  const int use_color_cache = (cache_bits > 0);
+  const size_t literal_array_size =
+      sizeof(double) * (NUM_LITERAL_CODES + NUM_LENGTH_CODES +
+                        ((cache_bits > 0) ? (1 << cache_bits) : 0));
+  const size_t cost_model_size = sizeof(CostModel) + literal_array_size;
+  CostModel* const cost_model =
+      (CostModel*)WebPSafeCalloc(1ULL, cost_model_size);
+  VP8LColorCache hashers;
+  CostManager* cost_manager =
+      (CostManager*)WebPSafeMalloc(1ULL, sizeof(*cost_manager));
+  int offset_prev = -1, len_prev = -1;
+  double offset_cost = -1;
+  int first_offset_is_constant = -1;  // initialized with 'impossible' value
+  int reach = 0;
+
+  if (cost_model == NULL || cost_manager == NULL) goto Error;
+
+  cost_model->literal_ = (double*)(cost_model + 1);
+  if (use_color_cache) {
+    cc_init = VP8LColorCacheInit(&hashers, cache_bits);
+    if (!cc_init) goto Error;
+  }
+
+  if (!CostModelBuild(cost_model, xsize, cache_bits, refs)) {
+    goto Error;
+  }
+
+  if (!CostManagerInit(cost_manager, dist_array, pix_count, cost_model)) {
+    goto Error;
+  }
+
+  // We loop one pixel at a time, but store all currently best points to
+  // non-processed locations from this point.
+  dist_array[0] = 0;
+  // Add first pixel as literal.
+  AddSingleLiteralWithCostModel(argb, &hashers, cost_model, 0, use_color_cache,
+                                0.f, cost_manager->costs_, dist_array);
+
+  for (i = 1; i < pix_count; ++i) {
+    const float prev_cost = cost_manager->costs_[i - 1];
+    int offset, len;
+    VP8LHashChainFindCopy(hash_chain, i, &offset, &len);
+
+    // Try adding the pixel as a literal.
+    AddSingleLiteralWithCostModel(argb, &hashers, cost_model, i,
+                                  use_color_cache, prev_cost,
+                                  cost_manager->costs_, dist_array);
+
+    // If we are dealing with a non-literal.
+    if (len >= 2) {
+      if (offset != offset_prev) {
+        const int code = VP8LDistanceToPlaneCode(xsize, offset);
+        offset_cost = GetDistanceCost(cost_model, code);
+        first_offset_is_constant = 1;
+        PushInterval(cost_manager, prev_cost + offset_cost, i, len);
+      } else {
+        assert(offset_cost >= 0);
+        assert(len_prev >= 0);
+        assert(first_offset_is_constant == 0 || first_offset_is_constant == 1);
+        // Instead of considering all contributions from a pixel i by calling:
+        //         PushInterval(cost_manager, prev_cost + offset_cost, i, len);
+        // we optimize these contributions in case offset_cost stays the same
+        // for consecutive pixels. This describes a set of pixels similar to a
+        // previous set (e.g. constant color regions).
+        if (first_offset_is_constant) {
+          reach = i - 1 + len_prev - 1;
+          first_offset_is_constant = 0;
+        }
+
+        if (i + len - 1 > reach) {
+          // We can only be go further with the same offset if the previous
+          // length was maxed, hence len_prev == len == MAX_LENGTH.
+          // TODO(vrabaud), bump i to the end right away (insert cache and
+          // update cost).
+          // TODO(vrabaud), check if one of the points in between does not have
+          // a lower cost.
+          // Already consider the pixel at "reach" to add intervals that are
+          // better than whatever we add.
+          int offset_j, len_j = 0;
+          int j;
+          assert(len == MAX_LENGTH || len == pix_count - i);
+          // Figure out the last consecutive pixel within [i, reach + 1] with
+          // the same offset.
+          for (j = i; j <= reach; ++j) {
+            VP8LHashChainFindCopy(hash_chain, j + 1, &offset_j, &len_j);
+            if (offset_j != offset) {
+              VP8LHashChainFindCopy(hash_chain, j, &offset_j, &len_j);
+              break;
+            }
+          }
+          // Update the cost at j - 1 and j.
+          UpdateCostAtIndex(cost_manager, j - 1, 0);
+          UpdateCostAtIndex(cost_manager, j, 0);
+
+          PushInterval(cost_manager, cost_manager->costs_[j - 1] + offset_cost,
+                       j, len_j);
+          reach = j + len_j - 1;
+        }
+      }
+    }
+
+    UpdateCostAtIndex(cost_manager, i, 1);
+    offset_prev = offset;
+    len_prev = len;
+  }
+
+  ok = !refs->error_;
+Error:
+  if (cc_init) VP8LColorCacheClear(&hashers);
+  CostManagerClear(cost_manager);
+  WebPSafeFree(cost_model);
+  WebPSafeFree(cost_manager);
+  return ok;
+}
+
+// We pack the path at the end of *dist_array and return
+// a pointer to this part of the array. Example:
+// dist_array = [1x2xx3x2] => packed [1x2x1232], chosen_path = [1232]
+static void TraceBackwards(uint16_t* const dist_array,
+                           int dist_array_size,
+                           uint16_t** const chosen_path,
+                           int* const chosen_path_size) {
+  uint16_t* path = dist_array + dist_array_size;
+  uint16_t* cur = dist_array + dist_array_size - 1;
+  while (cur >= dist_array) {
+    const int k = *cur;
+    --path;
+    *path = k;
+    cur -= k;
+  }
+  *chosen_path = path;
+  *chosen_path_size = (int)(dist_array + dist_array_size - path);
+}
+
+static int BackwardReferencesHashChainFollowChosenPath(
+    const uint32_t* const argb, int cache_bits,
+    const uint16_t* const chosen_path, int chosen_path_size,
+    const VP8LHashChain* const hash_chain, VP8LBackwardRefs* const refs) {
+  const int use_color_cache = (cache_bits > 0);
+  int ix;
+  int i = 0;
+  int ok = 0;
+  int cc_init = 0;
+  VP8LColorCache hashers;
+
+  if (use_color_cache) {
+    cc_init = VP8LColorCacheInit(&hashers, cache_bits);
+    if (!cc_init) goto Error;
+  }
+
+  VP8LClearBackwardRefs(refs);
+  for (ix = 0; ix < chosen_path_size; ++ix) {
+    const int len = chosen_path[ix];
+    if (len != 1) {
+      int k;
+      const int offset = VP8LHashChainFindOffset(hash_chain, i);
+      VP8LBackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(offset, len));
+      if (use_color_cache) {
+        for (k = 0; k < len; ++k) {
+          VP8LColorCacheInsert(&hashers, argb[i + k]);
+        }
+      }
+      i += len;
+    } else {
+      PixOrCopy v;
+      const int idx =
+          use_color_cache ? VP8LColorCacheContains(&hashers, argb[i]) : -1;
+      if (idx >= 0) {
+        // use_color_cache is true and hashers contains argb[i]
+        // push pixel as a color cache index
+        v = PixOrCopyCreateCacheIdx(idx);
+      } else {
+        if (use_color_cache) VP8LColorCacheInsert(&hashers, argb[i]);
+        v = PixOrCopyCreateLiteral(argb[i]);
+      }
+      VP8LBackwardRefsCursorAdd(refs, v);
+      ++i;
+    }
+  }
+  ok = !refs->error_;
+ Error:
+  if (cc_init) VP8LColorCacheClear(&hashers);
+  return ok;
+}
+
+// Returns 1 on success.
+extern int VP8LBackwardReferencesTraceBackwards(
+    int xsize, int ysize, const uint32_t* const argb, int cache_bits,
+    const VP8LHashChain* const hash_chain,
+    const VP8LBackwardRefs* const refs_src, VP8LBackwardRefs* const refs_dst);
+int VP8LBackwardReferencesTraceBackwards(int xsize, int ysize,
+                                         const uint32_t* const argb,
+                                         int cache_bits,
+                                         const VP8LHashChain* const hash_chain,
+                                         const VP8LBackwardRefs* const refs_src,
+                                         VP8LBackwardRefs* const refs_dst) {
+  int ok = 0;
+  const int dist_array_size = xsize * ysize;
+  uint16_t* chosen_path = NULL;
+  int chosen_path_size = 0;
+  uint16_t* dist_array =
+      (uint16_t*)WebPSafeMalloc(dist_array_size, sizeof(*dist_array));
+
+  if (dist_array == NULL) goto Error;
+
+  if (!BackwardReferencesHashChainDistanceOnly(
+          xsize, ysize, argb, cache_bits, hash_chain, refs_src, dist_array)) {
+    goto Error;
+  }
+  TraceBackwards(dist_array, dist_array_size, &chosen_path, &chosen_path_size);
+  if (!BackwardReferencesHashChainFollowChosenPath(
+          argb, cache_bits, chosen_path, chosen_path_size, hash_chain,
+          refs_dst)) {
+    goto Error;
+  }
+  ok = 1;
+ Error:
+  WebPSafeFree(dist_array);
+  return ok;
+}
diff --git a/media/libwebp/enc/backward_references_enc.c b/media/libwebp/enc/backward_references_enc.c
new file mode 100644
index 0000000000..b78610565a
--- /dev/null
+++ b/media/libwebp/enc/backward_references_enc.c
@@ -0,0 +1,1030 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Author: Jyrki Alakuijala (jyrki@google.com)
+//
+
+#include <assert.h>
+#include <float.h>
+#include <math.h>
+
+#include "../dsp/dsp.h"
+#include "../dsp/lossless.h"
+#include "../dsp/lossless_common.h"
+#include "../enc/backward_references_enc.h"
+#include "../enc/histogram_enc.h"
+#include "../utils/color_cache_utils.h"
+#include "../utils/utils.h"
+
+#define MIN_BLOCK_SIZE 256  // minimum block size for backward references
+
+#define MAX_ENTROPY    (1e30f)
+
+// 1M window (4M bytes) minus 120 special codes for short distances.
+#define WINDOW_SIZE ((1 << WINDOW_SIZE_BITS) - 120)
+
+// Minimum number of pixels for which it is cheaper to encode a
+// distance + length instead of each pixel as a literal.
+#define MIN_LENGTH 4
+
+// -----------------------------------------------------------------------------
+
+static const uint8_t plane_to_code_lut[128] = {
+ 96,   73,  55,  39,  23,  13,   5,  1,  255, 255, 255, 255, 255, 255, 255, 255,
+ 101,  78,  58,  42,  26,  16,   8,  2,    0,   3,  9,   17,  27,  43,  59,  79,
+ 102,  86,  62,  46,  32,  20,  10,  6,    4,   7,  11,  21,  33,  47,  63,  87,
+ 105,  90,  70,  52,  37,  28,  18,  14,  12,  15,  19,  29,  38,  53,  71,  91,
+ 110,  99,  82,  66,  48,  35,  30,  24,  22,  25,  31,  36,  49,  67,  83, 100,
+ 115, 108,  94,  76,  64,  50,  44,  40,  34,  41,  45,  51,  65,  77,  95, 109,
+ 118, 113, 103,  92,  80,  68,  60,  56,  54,  57,  61,  69,  81,  93, 104, 114,
+ 119, 116, 111, 106,  97,  88,  84,  74,  72,  75,  85,  89,  98, 107, 112, 117
+};
+
+extern int VP8LDistanceToPlaneCode(int xsize, int dist);
+int VP8LDistanceToPlaneCode(int xsize, int dist) {
+  const int yoffset = dist / xsize;
+  const int xoffset = dist - yoffset * xsize;
+  if (xoffset <= 8 && yoffset < 8) {
+    return plane_to_code_lut[yoffset * 16 + 8 - xoffset] + 1;
+  } else if (xoffset > xsize - 8 && yoffset < 7) {
+    return plane_to_code_lut[(yoffset + 1) * 16 + 8 + (xsize - xoffset)] + 1;
+  }
+  return dist + 120;
+}
+
+// Returns the exact index where array1 and array2 are different. For an index
+// inferior or equal to best_len_match, the return value just has to be strictly
+// inferior to best_len_match. The current behavior is to return 0 if this index
+// is best_len_match, and the index itself otherwise.
+// If no two elements are the same, it returns max_limit.
+static WEBP_INLINE int FindMatchLength(const uint32_t* const array1,
+                                       const uint32_t* const array2,
+                                       int best_len_match, int max_limit) {
+  // Before 'expensive' linear match, check if the two arrays match at the
+  // current best length index.
+  if (array1[best_len_match] != array2[best_len_match]) return 0;
+
+  return VP8LVectorMismatch(array1, array2, max_limit);
+}
+
+// -----------------------------------------------------------------------------
+//  VP8LBackwardRefs
+
+struct PixOrCopyBlock {
+  PixOrCopyBlock* next_;   // next block (or NULL)
+  PixOrCopy* start_;       // data start
+  int size_;               // currently used size
+};
+
+extern void VP8LClearBackwardRefs(VP8LBackwardRefs* const refs);
+void VP8LClearBackwardRefs(VP8LBackwardRefs* const refs) {
+  assert(refs != NULL);
+  if (refs->tail_ != NULL) {
+    *refs->tail_ = refs->free_blocks_;  // recycle all blocks at once
+  }
+  refs->free_blocks_ = refs->refs_;
+  refs->tail_ = &refs->refs_;
+  refs->last_block_ = NULL;
+  refs->refs_ = NULL;
+}
+
+void VP8LBackwardRefsClear(VP8LBackwardRefs* const refs) {
+  assert(refs != NULL);
+  VP8LClearBackwardRefs(refs);
+  while (refs->free_blocks_ != NULL) {
+    PixOrCopyBlock* const next = refs->free_blocks_->next_;
+    WebPSafeFree(refs->free_blocks_);
+    refs->free_blocks_ = next;
+  }
+}
+
+// Swaps the content of two VP8LBackwardRefs.
+static void BackwardRefsSwap(VP8LBackwardRefs* const refs1,
+                             VP8LBackwardRefs* const refs2) {
+  const int point_to_refs1 =
+      (refs1->tail_ != NULL && refs1->tail_ == &refs1->refs_);
+  const int point_to_refs2 =
+      (refs2->tail_ != NULL && refs2->tail_ == &refs2->refs_);
+  const VP8LBackwardRefs tmp = *refs1;
+  *refs1 = *refs2;
+  *refs2 = tmp;
+  if (point_to_refs2) refs1->tail_ = &refs1->refs_;
+  if (point_to_refs1) refs2->tail_ = &refs2->refs_;
+}
+
+void VP8LBackwardRefsInit(VP8LBackwardRefs* const refs, int block_size) {
+  assert(refs != NULL);
+  memset(refs, 0, sizeof(*refs));
+  refs->tail_ = &refs->refs_;
+  refs->block_size_ =
+      (block_size < MIN_BLOCK_SIZE) ? MIN_BLOCK_SIZE : block_size;
+}
+
+VP8LRefsCursor VP8LRefsCursorInit(const VP8LBackwardRefs* const refs) {
+  VP8LRefsCursor c;
+  c.cur_block_ = refs->refs_;
+  if (refs->refs_ != NULL) {
+    c.cur_pos = c.cur_block_->start_;
+    c.last_pos_ = c.cur_pos + c.cur_block_->size_;
+  } else {
+    c.cur_pos = NULL;
+    c.last_pos_ = NULL;
+  }
+  return c;
+}
+
+void VP8LRefsCursorNextBlock(VP8LRefsCursor* const c) {
+  PixOrCopyBlock* const b = c->cur_block_->next_;
+  c->cur_pos = (b == NULL) ? NULL : b->start_;
+  c->last_pos_ = (b == NULL) ? NULL : b->start_ + b->size_;
+  c->cur_block_ = b;
+}
+
+// Create a new block, either from the free list or allocated
+static PixOrCopyBlock* BackwardRefsNewBlock(VP8LBackwardRefs* const refs) {
+  PixOrCopyBlock* b = refs->free_blocks_;
+  if (b == NULL) {   // allocate new memory chunk
+    const size_t total_size =
+        sizeof(*b) + refs->block_size_ * sizeof(*b->start_);
+    b = (PixOrCopyBlock*)WebPSafeMalloc(1ULL, total_size);
+    if (b == NULL) {
+      refs->error_ |= 1;
+      return NULL;
+    }
+    b->start_ = (PixOrCopy*)((uint8_t*)b + sizeof(*b));  // not always aligned
+  } else {  // recycle from free-list
+    refs->free_blocks_ = b->next_;
+  }
+  *refs->tail_ = b;
+  refs->tail_ = &b->next_;
+  refs->last_block_ = b;
+  b->next_ = NULL;
+  b->size_ = 0;
+  return b;
+}
+
+// Return 1 on success, 0 on error.
+static int BackwardRefsClone(const VP8LBackwardRefs* const from,
+                             VP8LBackwardRefs* const to) {
+  const PixOrCopyBlock* block_from = from->refs_;
+  VP8LClearBackwardRefs(to);
+  while (block_from != NULL) {
+    PixOrCopyBlock* const block_to = BackwardRefsNewBlock(to);
+    if (block_to == NULL) return 0;
+    memcpy(block_to->start_, block_from->start_,
+           block_from->size_ * sizeof(PixOrCopy));
+    block_to->size_ = block_from->size_;
+    block_from = block_from->next_;
+  }
+  return 1;
+}
+
+extern void VP8LBackwardRefsCursorAdd(VP8LBackwardRefs* const refs,
+                                      const PixOrCopy v);
+void VP8LBackwardRefsCursorAdd(VP8LBackwardRefs* const refs,
+                               const PixOrCopy v) {
+  PixOrCopyBlock* b = refs->last_block_;
+  if (b == NULL || b->size_ == refs->block_size_) {
+    b = BackwardRefsNewBlock(refs);
+    if (b == NULL) return;   // refs->error_ is set
+  }
+  b->start_[b->size_++] = v;
+}
+
+// -----------------------------------------------------------------------------
+// Hash chains
+
+int VP8LHashChainInit(VP8LHashChain* const p, int size) {
+  assert(p->size_ == 0);
+  assert(p->offset_length_ == NULL);
+  assert(size > 0);
+  p->offset_length_ =
+      (uint32_t*)WebPSafeMalloc(size, sizeof(*p->offset_length_));
+  if (p->offset_length_ == NULL) return 0;
+  p->size_ = size;
+
+  return 1;
+}
+
+void VP8LHashChainClear(VP8LHashChain* const p) {
+  assert(p != NULL);
+  WebPSafeFree(p->offset_length_);
+
+  p->size_ = 0;
+  p->offset_length_ = NULL;
+}
+
+// -----------------------------------------------------------------------------
+
+static const uint32_t kHashMultiplierHi = 0xc6a4a793u;
+static const uint32_t kHashMultiplierLo = 0x5bd1e996u;
+
+static WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW WEBP_INLINE
+uint32_t GetPixPairHash64(const uint32_t* const argb) {
+  uint32_t key;
+  key  = argb[1] * kHashMultiplierHi;
+  key += argb[0] * kHashMultiplierLo;
+  key = key >> (32 - HASH_BITS);
+  return key;
+}
+
+// Returns the maximum number of hash chain lookups to do for a
+// given compression quality. Return value in range [8, 86].
+static int GetMaxItersForQuality(int quality) {
+  return 8 + (quality * quality) / 128;
+}
+
+static int GetWindowSizeForHashChain(int quality, int xsize) {
+  const int max_window_size = (quality > 75) ? WINDOW_SIZE
+                            : (quality > 50) ? (xsize << 8)
+                            : (quality > 25) ? (xsize << 6)
+                            : (xsize << 4);
+  assert(xsize > 0);
+  return (max_window_size > WINDOW_SIZE) ? WINDOW_SIZE : max_window_size;
+}
+
+static WEBP_INLINE int MaxFindCopyLength(int len) {
+  return (len < MAX_LENGTH) ? len : MAX_LENGTH;
+}
+
+int VP8LHashChainFill(VP8LHashChain* const p, int quality,
+                      const uint32_t* const argb, int xsize, int ysize,
+                      int low_effort) {
+  const int size = xsize * ysize;
+  const int iter_max = GetMaxItersForQuality(quality);
+  const uint32_t window_size = GetWindowSizeForHashChain(quality, xsize);
+  int pos;
+  int argb_comp;
+  uint32_t base_position;
+  int32_t* hash_to_first_index;
+  // Temporarily use the p->offset_length_ as a hash chain.
+  int32_t* chain = (int32_t*)p->offset_length_;
+  assert(size > 0);
+  assert(p->size_ != 0);
+  assert(p->offset_length_ != NULL);
+
+  if (size <= 2) {
+    p->offset_length_[0] = p->offset_length_[size - 1] = 0;
+    return 1;
+  }
+
+  hash_to_first_index =
+      (int32_t*)WebPSafeMalloc(HASH_SIZE, sizeof(*hash_to_first_index));
+  if (hash_to_first_index == NULL) return 0;
+
+  // Set the int32_t array to -1.
+  memset(hash_to_first_index, 0xff, HASH_SIZE * sizeof(*hash_to_first_index));
+  // Fill the chain linking pixels with the same hash.
+  argb_comp = (argb[0] == argb[1]);
+  for (pos = 0; pos < size - 2;) {
+    uint32_t hash_code;
+    const int argb_comp_next = (argb[pos + 1] == argb[pos + 2]);
+    if (argb_comp && argb_comp_next) {
+      // Consecutive pixels with the same color will share the same hash.
+      // We therefore use a different hash: the color and its repetition
+      // length.
+      uint32_t tmp[2];
+      uint32_t len = 1;
+      tmp[0] = argb[pos];
+      // Figure out how far the pixels are the same.
+      // The last pixel has a different 64 bit hash, as its next pixel does
+      // not have the same color, so we just need to get to the last pixel equal
+      // to its follower.
+      while (pos + (int)len + 2 < size && argb[pos + len + 2] == argb[pos]) {
+        ++len;
+      }
+      if (len > MAX_LENGTH) {
+        // Skip the pixels that match for distance=1 and length>MAX_LENGTH
+        // because they are linked to their predecessor and we automatically
+        // check that in the main for loop below. Skipping means setting no
+        // predecessor in the chain, hence -1.
+        memset(chain + pos, 0xff, (len - MAX_LENGTH) * sizeof(*chain));
+        pos += len - MAX_LENGTH;
+        len = MAX_LENGTH;
+      }
+      // Process the rest of the hash chain.
+      while (len) {
+        tmp[1] = len--;
+        hash_code = GetPixPairHash64(tmp);
+        chain[pos] = hash_to_first_index[hash_code];
+        hash_to_first_index[hash_code] = pos++;
+      }
+      argb_comp = 0;
+    } else {
+      // Just move one pixel forward.
+      hash_code = GetPixPairHash64(argb + pos);
+      chain[pos] = hash_to_first_index[hash_code];
+      hash_to_first_index[hash_code] = pos++;
+      argb_comp = argb_comp_next;
+    }
+  }
+  // Process the penultimate pixel.
+  chain[pos] = hash_to_first_index[GetPixPairHash64(argb + pos)];
+
+  WebPSafeFree(hash_to_first_index);
+
+  // Find the best match interval at each pixel, defined by an offset to the
+  // pixel and a length. The right-most pixel cannot match anything to the right
+  // (hence a best length of 0) and the left-most pixel nothing to the left
+  // (hence an offset of 0).
+  assert(size > 2);
+  p->offset_length_[0] = p->offset_length_[size - 1] = 0;
+  for (base_position = size - 2; base_position > 0;) {
+    const int max_len = MaxFindCopyLength(size - 1 - base_position);
+    const uint32_t* const argb_start = argb + base_position;
+    int iter = iter_max;
+    int best_length = 0;
+    uint32_t best_distance = 0;
+    uint32_t best_argb;
+    const int min_pos =
+        (base_position > window_size) ? base_position - window_size : 0;
+    const int length_max = (max_len < 256) ? max_len : 256;
+    uint32_t max_base_position;
+
+    pos = chain[base_position];
+    if (!low_effort) {
+      int curr_length;
+      // Heuristic: use the comparison with the above line as an initialization.
+      if (base_position >= (uint32_t)xsize) {
+        curr_length = FindMatchLength(argb_start - xsize, argb_start,
+                                      best_length, max_len);
+        if (curr_length > best_length) {
+          best_length = curr_length;
+          best_distance = xsize;
+        }
+        --iter;
+      }
+      // Heuristic: compare to the previous pixel.
+      curr_length =
+          FindMatchLength(argb_start - 1, argb_start, best_length, max_len);
+      if (curr_length > best_length) {
+        best_length = curr_length;
+        best_distance = 1;
+      }
+      --iter;
+      // Skip the for loop if we already have the maximum.
+      if (best_length == MAX_LENGTH) pos = min_pos - 1;
+    }
+    best_argb = argb_start[best_length];
+
+    for (; pos >= min_pos && --iter; pos = chain[pos]) {
+      int curr_length;
+      assert(base_position > (uint32_t)pos);
+
+      if (argb[pos + best_length] != best_argb) continue;
+
+      curr_length = VP8LVectorMismatch(argb + pos, argb_start, max_len);
+      if (best_length < curr_length) {
+        best_length = curr_length;
+        best_distance = base_position - pos;
+        best_argb = argb_start[best_length];
+        // Stop if we have reached a good enough length.
+        if (best_length >= length_max) break;
+      }
+    }
+    // We have the best match but in case the two intervals continue matching
+    // to the left, we have the best matches for the left-extended pixels.
+    max_base_position = base_position;
+    while (1) {
+      assert(best_length <= MAX_LENGTH);
+      assert(best_distance <= WINDOW_SIZE);
+      p->offset_length_[base_position] =
+          (best_distance << MAX_LENGTH_BITS) | (uint32_t)best_length;
+      --base_position;
+      // Stop if we don't have a match or if we are out of bounds.
+      if (best_distance == 0 || base_position == 0) break;
+      // Stop if we cannot extend the matching intervals to the left.
+      if (base_position < best_distance ||
+          argb[base_position - best_distance] != argb[base_position]) {
+        break;
+      }
+      // Stop if we are matching at its limit because there could be a closer
+      // matching interval with the same maximum length. Then again, if the
+      // matching interval is as close as possible (best_distance == 1), we will
+      // never find anything better so let's continue.
+      if (best_length == MAX_LENGTH && best_distance != 1 &&
+          base_position + MAX_LENGTH < max_base_position) {
+        break;
+      }
+      if (best_length < MAX_LENGTH) {
+        ++best_length;
+        max_base_position = base_position;
+      }
+    }
+  }
+  return 1;
+}
+
+static WEBP_INLINE void AddSingleLiteral(uint32_t pixel, int use_color_cache,
+                                         VP8LColorCache* const hashers,
+                                         VP8LBackwardRefs* const refs) {
+  PixOrCopy v;
+  if (use_color_cache) {
+    const uint32_t key = VP8LColorCacheGetIndex(hashers, pixel);
+    if (VP8LColorCacheLookup(hashers, key) == pixel) {
+      v = PixOrCopyCreateCacheIdx(key);
+    } else {
+      v = PixOrCopyCreateLiteral(pixel);
+      VP8LColorCacheSet(hashers, key, pixel);
+    }
+  } else {
+    v = PixOrCopyCreateLiteral(pixel);
+  }
+  VP8LBackwardRefsCursorAdd(refs, v);
+}
+
+static int BackwardReferencesRle(int xsize, int ysize,
+                                 const uint32_t* const argb,
+                                 int cache_bits, VP8LBackwardRefs* const refs) {
+  const int pix_count = xsize * ysize;
+  int i, k;
+  const int use_color_cache = (cache_bits > 0);
+  VP8LColorCache hashers;
+
+  if (use_color_cache && !VP8LColorCacheInit(&hashers, cache_bits)) {
+    return 0;
+  }
+  VP8LClearBackwardRefs(refs);
+  // Add first pixel as literal.
+  AddSingleLiteral(argb[0], use_color_cache, &hashers, refs);
+  i = 1;
+  while (i < pix_count) {
+    const int max_len = MaxFindCopyLength(pix_count - i);
+    const int rle_len = FindMatchLength(argb + i, argb + i - 1, 0, max_len);
+    const int prev_row_len = (i < xsize) ? 0 :
+        FindMatchLength(argb + i, argb + i - xsize, 0, max_len);
+    if (rle_len >= prev_row_len && rle_len >= MIN_LENGTH) {
+      VP8LBackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(1, rle_len));
+      // We don't need to update the color cache here since it is always the
+      // same pixel being copied, and that does not change the color cache
+      // state.
+      i += rle_len;
+    } else if (prev_row_len >= MIN_LENGTH) {
+      VP8LBackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(xsize, prev_row_len));
+      if (use_color_cache) {
+        for (k = 0; k < prev_row_len; ++k) {
+          VP8LColorCacheInsert(&hashers, argb[i + k]);
+        }
+      }
+      i += prev_row_len;
+    } else {
+      AddSingleLiteral(argb[i], use_color_cache, &hashers, refs);
+      i++;
+    }
+  }
+  if (use_color_cache) VP8LColorCacheClear(&hashers);
+  return !refs->error_;
+}
+
+static int BackwardReferencesLz77(int xsize, int ysize,
+                                  const uint32_t* const argb, int cache_bits,
+                                  const VP8LHashChain* const hash_chain,
+                                  VP8LBackwardRefs* const refs) {
+  int i;
+  int i_last_check = -1;
+  int ok = 0;
+  int cc_init = 0;
+  const int use_color_cache = (cache_bits > 0);
+  const int pix_count = xsize * ysize;
+  VP8LColorCache hashers;
+
+  if (use_color_cache) {
+    cc_init = VP8LColorCacheInit(&hashers, cache_bits);
+    if (!cc_init) goto Error;
+  }
+  VP8LClearBackwardRefs(refs);
+  for (i = 0; i < pix_count;) {
+    // Alternative#1: Code the pixels starting at 'i' using backward reference.
+    int offset = 0;
+    int len = 0;
+    int j;
+    VP8LHashChainFindCopy(hash_chain, i, &offset, &len);
+    if (len >= MIN_LENGTH) {
+      const int len_ini = len;
+      int max_reach = 0;
+      const int j_max =
+          (i + len_ini >= pix_count) ? pix_count - 1 : i + len_ini;
+      // Only start from what we have not checked already.
+      i_last_check = (i > i_last_check) ? i : i_last_check;
+      // We know the best match for the current pixel but we try to find the
+      // best matches for the current pixel AND the next one combined.
+      // The naive method would use the intervals:
+      // [i,i+len) + [i+len, length of best match at i+len)
+      // while we check if we can use:
+      // [i,j) (where j<=i+len) + [j, length of best match at j)
+      for (j = i_last_check + 1; j <= j_max; ++j) {
+        const int len_j = VP8LHashChainFindLength(hash_chain, j);
+        const int reach =
+            j + (len_j >= MIN_LENGTH ? len_j : 1);  // 1 for single literal.
+        if (reach > max_reach) {
+          len = j - i;
+          max_reach = reach;
+          if (max_reach >= pix_count) break;
+        }
+      }
+    } else {
+      len = 1;
+    }
+    // Go with literal or backward reference.
+    assert(len > 0);
+    if (len == 1) {
+      AddSingleLiteral(argb[i], use_color_cache, &hashers, refs);
+    } else {
+      VP8LBackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(offset, len));
+      if (use_color_cache) {
+        for (j = i; j < i + len; ++j) VP8LColorCacheInsert(&hashers, argb[j]);
+      }
+    }
+    i += len;
+  }
+
+  ok = !refs->error_;
+ Error:
+  if (cc_init) VP8LColorCacheClear(&hashers);
+  return ok;
+}
+
+// Compute an LZ77 by forcing matches to happen within a given distance cost.
+// We therefore limit the algorithm to the lowest 32 values in the PlaneCode
+// definition.
+#define WINDOW_OFFSETS_SIZE_MAX 32
+static int BackwardReferencesLz77Box(int xsize, int ysize,
+                                     const uint32_t* const argb, int cache_bits,
+                                     const VP8LHashChain* const hash_chain_best,
+                                     VP8LHashChain* hash_chain,
+                                     VP8LBackwardRefs* const refs) {
+  int i;
+  const int pix_count = xsize * ysize;
+  uint16_t* counts;
+  int window_offsets[WINDOW_OFFSETS_SIZE_MAX] = {0};
+  int window_offsets_new[WINDOW_OFFSETS_SIZE_MAX] = {0};
+  int window_offsets_size = 0;
+  int window_offsets_new_size = 0;
+  uint16_t* const counts_ini =
+      (uint16_t*)WebPSafeMalloc(xsize * ysize, sizeof(*counts_ini));
+  int best_offset_prev = -1, best_length_prev = -1;
+  if (counts_ini == NULL) return 0;
+
+  // counts[i] counts how many times a pixel is repeated starting at position i.
+  i = pix_count - 2;
+  counts = counts_ini + i;
+  counts[1] = 1;
+  for (; i >= 0; --i, --counts) {
+    if (argb[i] == argb[i + 1]) {
+      // Max out the counts to MAX_LENGTH.
+      counts[0] = counts[1] + (counts[1] != MAX_LENGTH);
+    } else {
+      counts[0] = 1;
+    }
+  }
+
+  // Figure out the window offsets around a pixel. They are stored in a
+  // spiraling order around the pixel as defined by VP8LDistanceToPlaneCode.
+  {
+    int x, y;
+    for (y = 0; y <= 6; ++y) {
+      for (x = -6; x <= 6; ++x) {
+        const int offset = y * xsize + x;
+        int plane_code;
+        // Ignore offsets that bring us after the pixel.
+        if (offset <= 0) continue;
+        plane_code = VP8LDistanceToPlaneCode(xsize, offset) - 1;
+        if (plane_code >= WINDOW_OFFSETS_SIZE_MAX) continue;
+        window_offsets[plane_code] = offset;
+      }
+    }
+    // For narrow images, not all plane codes are reached, so remove those.
+    for (i = 0; i < WINDOW_OFFSETS_SIZE_MAX; ++i) {
+      if (window_offsets[i] == 0) continue;
+      window_offsets[window_offsets_size++] = window_offsets[i];
+    }
+    // Given a pixel P, find the offsets that reach pixels unreachable from P-1
+    // with any of the offsets in window_offsets[].
+    for (i = 0; i < window_offsets_size; ++i) {
+      int j;
+      int is_reachable = 0;
+      for (j = 0; j < window_offsets_size && !is_reachable; ++j) {
+        is_reachable |= (window_offsets[i] == window_offsets[j] + 1);
+      }
+      if (!is_reachable) {
+        window_offsets_new[window_offsets_new_size] = window_offsets[i];
+        ++window_offsets_new_size;
+      }
+    }
+  }
+
+  hash_chain->offset_length_[0] = 0;
+  for (i = 1; i < pix_count; ++i) {
+    int ind;
+    int best_length = VP8LHashChainFindLength(hash_chain_best, i);
+    int best_offset;
+    int do_compute = 1;
+
+    if (best_length >= MAX_LENGTH) {
+      // Do not recompute the best match if we already have a maximal one in the
+      // window.
+      best_offset = VP8LHashChainFindOffset(hash_chain_best, i);
+      for (ind = 0; ind < window_offsets_size; ++ind) {
+        if (best_offset == window_offsets[ind]) {
+          do_compute = 0;
+          break;
+        }
+      }
+    }
+    if (do_compute) {
+      // Figure out if we should use the offset/length from the previous pixel
+      // as an initial guess and therefore only inspect the offsets in
+      // window_offsets_new[].
+      const int use_prev =
+          (best_length_prev > 1) && (best_length_prev < MAX_LENGTH);
+      const int num_ind =
+          use_prev ? window_offsets_new_size : window_offsets_size;
+      best_length = use_prev ? best_length_prev - 1 : 0;
+      best_offset = use_prev ? best_offset_prev : 0;
+      // Find the longest match in a window around the pixel.
+      for (ind = 0; ind < num_ind; ++ind) {
+        int curr_length = 0;
+        int j = i;
+        int j_offset =
+            use_prev ? i - window_offsets_new[ind] : i - window_offsets[ind];
+        if (j_offset < 0 || argb[j_offset] != argb[i]) continue;
+        // The longest match is the sum of how many times each pixel is
+        // repeated.
+        do {
+          const int counts_j_offset = counts_ini[j_offset];
+          const int counts_j = counts_ini[j];
+          if (counts_j_offset != counts_j) {
+            curr_length +=
+                (counts_j_offset < counts_j) ? counts_j_offset : counts_j;
+            break;
+          }
+          // The same color is repeated counts_pos times at j_offset and j.
+          curr_length += counts_j_offset;
+          j_offset += counts_j_offset;
+          j += counts_j_offset;
+        } while (curr_length <= MAX_LENGTH && j < pix_count &&
+                 argb[j_offset] == argb[j]);
+        if (best_length < curr_length) {
+          best_offset =
+              use_prev ? window_offsets_new[ind] : window_offsets[ind];
+          if (curr_length >= MAX_LENGTH) {
+            best_length = MAX_LENGTH;
+            break;
+          } else {
+            best_length = curr_length;
+          }
+        }
+      }
+    }
+
+    assert(i + best_length <= pix_count);
+    assert(best_length <= MAX_LENGTH);
+    if (best_length <= MIN_LENGTH) {
+      hash_chain->offset_length_[i] = 0;
+      best_offset_prev = 0;
+      best_length_prev = 0;
+    } else {
+      hash_chain->offset_length_[i] =
+          (best_offset << MAX_LENGTH_BITS) | (uint32_t)best_length;
+      best_offset_prev = best_offset;
+      best_length_prev = best_length;
+    }
+  }
+  hash_chain->offset_length_[0] = 0;
+  WebPSafeFree(counts_ini);
+
+  return BackwardReferencesLz77(xsize, ysize, argb, cache_bits, hash_chain,
+                                refs);
+}
+
+// -----------------------------------------------------------------------------
+
+static void BackwardReferences2DLocality(int xsize,
+                                         const VP8LBackwardRefs* const refs) {
+  VP8LRefsCursor c = VP8LRefsCursorInit(refs);
+  while (VP8LRefsCursorOk(&c)) {
+    if (PixOrCopyIsCopy(c.cur_pos)) {
+      const int dist = c.cur_pos->argb_or_distance;
+      const int transformed_dist = VP8LDistanceToPlaneCode(xsize, dist);
+      c.cur_pos->argb_or_distance = transformed_dist;
+    }
+    VP8LRefsCursorNext(&c);
+  }
+}
+
+// Evaluate optimal cache bits for the local color cache.
+// The input *best_cache_bits sets the maximum cache bits to use (passing 0
+// implies disabling the local color cache). The local color cache is also
+// disabled for the lower (<= 25) quality.
+// Returns 0 in case of memory error.
+static int CalculateBestCacheSize(const uint32_t* argb, int quality,
+                                  const VP8LBackwardRefs* const refs,
+                                  int* const best_cache_bits) {
+  int i;
+  const int cache_bits_max = (quality <= 25) ? 0 : *best_cache_bits;
+  double entropy_min = MAX_ENTROPY;
+  int cc_init[MAX_COLOR_CACHE_BITS + 1] = { 0 };
+  VP8LColorCache hashers[MAX_COLOR_CACHE_BITS + 1];
+  VP8LRefsCursor c = VP8LRefsCursorInit(refs);
+  VP8LHistogram* histos[MAX_COLOR_CACHE_BITS + 1] = { NULL };
+  int ok = 0;
+
+  assert(cache_bits_max >= 0 && cache_bits_max <= MAX_COLOR_CACHE_BITS);
+
+  if (cache_bits_max == 0) {
+    *best_cache_bits = 0;
+    // Local color cache is disabled.
+    return 1;
+  }
+
+  // Allocate data.
+  for (i = 0; i <= cache_bits_max; ++i) {
+    histos[i] = VP8LAllocateHistogram(i);
+    if (histos[i] == NULL) goto Error;
+    VP8LHistogramInit(histos[i], i, /*init_arrays=*/ 1);
+    if (i == 0) continue;
+    cc_init[i] = VP8LColorCacheInit(&hashers[i], i);
+    if (!cc_init[i]) goto Error;
+  }
+
+  // Find the cache_bits giving the lowest entropy. The search is done in a
+  // brute-force way as the function (entropy w.r.t cache_bits) can be
+  // anything in practice.
+  while (VP8LRefsCursorOk(&c)) {
+    const PixOrCopy* const v = c.cur_pos;
+    if (PixOrCopyIsLiteral(v)) {
+      const uint32_t pix = *argb++;
+      const uint32_t a = (pix >> 24) & 0xff;
+      const uint32_t r = (pix >> 16) & 0xff;
+      const uint32_t g = (pix >>  8) & 0xff;
+      const uint32_t b = (pix >>  0) & 0xff;
+      // The keys of the caches can be derived from the longest one.
+      int key = VP8LHashPix(pix, 32 - cache_bits_max);
+      // Do not use the color cache for cache_bits = 0.
+      ++histos[0]->blue_[b];
+      ++histos[0]->literal_[g];
+      ++histos[0]->red_[r];
+      ++histos[0]->alpha_[a];
+      // Deal with cache_bits > 0.
+      for (i = cache_bits_max; i >= 1; --i, key >>= 1) {
+        if (VP8LColorCacheLookup(&hashers[i], key) == pix) {
+          ++histos[i]->literal_[NUM_LITERAL_CODES + NUM_LENGTH_CODES + key];
+        } else {
+          VP8LColorCacheSet(&hashers[i], key, pix);
+          ++histos[i]->blue_[b];
+          ++histos[i]->literal_[g];
+          ++histos[i]->red_[r];
+          ++histos[i]->alpha_[a];
+        }
+      }
+    } else {
+      int code, extra_bits, extra_bits_value;
+      // We should compute the contribution of the (distance,length)
+      // histograms but those are the same independently from the cache size.
+      // As those constant contributions are in the end added to the other
+      // histogram contributions, we can ignore them, except for the length
+      // prefix that is part of the literal_ histogram.
+      int len = PixOrCopyLength(v);
+      uint32_t argb_prev = *argb ^ 0xffffffffu;
+      VP8LPrefixEncode(len, &code, &extra_bits, &extra_bits_value);
+      for (i = 0; i <= cache_bits_max; ++i) {
+        ++histos[i]->literal_[NUM_LITERAL_CODES + code];
+      }
+      // Update the color caches.
+      do {
+        if (*argb != argb_prev) {
+          // Efficiency: insert only if the color changes.
+          int key = VP8LHashPix(*argb, 32 - cache_bits_max);
+          for (i = cache_bits_max; i >= 1; --i, key >>= 1) {
+            hashers[i].colors_[key] = *argb;
+          }
+          argb_prev = *argb;
+        }
+        argb++;
+      } while (--len != 0);
+    }
+    VP8LRefsCursorNext(&c);
+  }
+
+  for (i = 0; i <= cache_bits_max; ++i) {
+    const double entropy = VP8LHistogramEstimateBits(histos[i]);
+    if (i == 0 || entropy < entropy_min) {
+      entropy_min = entropy;
+      *best_cache_bits = i;
+    }
+  }
+  ok = 1;
+Error:
+  for (i = 0; i <= cache_bits_max; ++i) {
+    if (cc_init[i]) VP8LColorCacheClear(&hashers[i]);
+    VP8LFreeHistogram(histos[i]);
+  }
+  return ok;
+}
+
+// Update (in-place) backward references for specified cache_bits.
+static int BackwardRefsWithLocalCache(const uint32_t* const argb,
+                                      int cache_bits,
+                                      VP8LBackwardRefs* const refs) {
+  int pixel_index = 0;
+  VP8LColorCache hashers;
+  VP8LRefsCursor c = VP8LRefsCursorInit(refs);
+  if (!VP8LColorCacheInit(&hashers, cache_bits)) return 0;
+
+  while (VP8LRefsCursorOk(&c)) {
+    PixOrCopy* const v = c.cur_pos;
+    if (PixOrCopyIsLiteral(v)) {
+      const uint32_t argb_literal = v->argb_or_distance;
+      const int ix = VP8LColorCacheContains(&hashers, argb_literal);
+      if (ix >= 0) {
+        // hashers contains argb_literal
+        *v = PixOrCopyCreateCacheIdx(ix);
+      } else {
+        VP8LColorCacheInsert(&hashers, argb_literal);
+      }
+      ++pixel_index;
+    } else {
+      // refs was created without local cache, so it can not have cache indexes.
+      int k;
+      assert(PixOrCopyIsCopy(v));
+      for (k = 0; k < v->len; ++k) {
+        VP8LColorCacheInsert(&hashers, argb[pixel_index++]);
+      }
+    }
+    VP8LRefsCursorNext(&c);
+  }
+  VP8LColorCacheClear(&hashers);
+  return 1;
+}
+
+static VP8LBackwardRefs* GetBackwardReferencesLowEffort(
+    int width, int height, const uint32_t* const argb,
+    int* const cache_bits, const VP8LHashChain* const hash_chain,
+    VP8LBackwardRefs* const refs_lz77) {
+  *cache_bits = 0;
+  if (!BackwardReferencesLz77(width, height, argb, 0, hash_chain, refs_lz77)) {
+    return NULL;
+  }
+  BackwardReferences2DLocality(width, refs_lz77);
+  return refs_lz77;
+}
+
+extern int VP8LBackwardReferencesTraceBackwards(
+    int xsize, int ysize, const uint32_t* const argb, int cache_bits,
+    const VP8LHashChain* const hash_chain,
+    const VP8LBackwardRefs* const refs_src, VP8LBackwardRefs* const refs_dst);
+static int GetBackwardReferences(int width, int height,
+                                 const uint32_t* const argb, int quality,
+                                 int lz77_types_to_try, int cache_bits_max,
+                                 int do_no_cache,
+                                 const VP8LHashChain* const hash_chain,
+                                 VP8LBackwardRefs* const refs,
+                                 int* const cache_bits_best) {
+  VP8LHistogram* histo = NULL;
+  int i, lz77_type;
+  // Index 0 is for a color cache, index 1 for no cache (if needed).
+  int lz77_types_best[2] = {0, 0};
+  double bit_costs_best[2] = {DBL_MAX, DBL_MAX};
+  VP8LHashChain hash_chain_box;
+  VP8LBackwardRefs* const refs_tmp = &refs[do_no_cache ? 2 : 1];
+  int status = 0;
+  memset(&hash_chain_box, 0, sizeof(hash_chain_box));
+
+  histo = VP8LAllocateHistogram(MAX_COLOR_CACHE_BITS);
+  if (histo == NULL) goto Error;
+
+  for (lz77_type = 1; lz77_types_to_try;
+       lz77_types_to_try &= ~lz77_type, lz77_type <<= 1) {
+    int res = 0;
+    double bit_cost = 0.;
+    if ((lz77_types_to_try & lz77_type) == 0) continue;
+    switch (lz77_type) {
+      case kLZ77RLE:
+        res = BackwardReferencesRle(width, height, argb, 0, refs_tmp);
+        break;
+      case kLZ77Standard:
+        // Compute LZ77 with no cache (0 bits), as the ideal LZ77 with a color
+        // cache is not that different in practice.
+        res = BackwardReferencesLz77(width, height, argb, 0, hash_chain,
+                                     refs_tmp);
+        break;
+      case kLZ77Box:
+        if (!VP8LHashChainInit(&hash_chain_box, width * height)) goto Error;
+        res = BackwardReferencesLz77Box(width, height, argb, 0, hash_chain,
+                                        &hash_chain_box, refs_tmp);
+        break;
+      default:
+        assert(0);
+    }
+    if (!res) goto Error;
+
+    // Start with the no color cache case.
+    for (i = 1; i >= 0; --i) {
+      int cache_bits = (i == 1) ? 0 : cache_bits_max;
+
+      if (i == 1 && !do_no_cache) continue;
+
+      if (i == 0) {
+        // Try with a color cache.
+        if (!CalculateBestCacheSize(argb, quality, refs_tmp, &cache_bits)) {
+          goto Error;
+        }
+        if (cache_bits > 0) {
+          if (!BackwardRefsWithLocalCache(argb, cache_bits, refs_tmp)) {
+            goto Error;
+          }
+        }
+      }
+
+      if (i == 0 && do_no_cache && cache_bits == 0) {
+        // No need to re-compute bit_cost as it was computed at i == 1.
+      } else {
+        VP8LHistogramCreate(histo, refs_tmp, cache_bits);
+        bit_cost = VP8LHistogramEstimateBits(histo);
+      }
+
+      if (bit_cost < bit_costs_best[i]) {
+        if (i == 1) {
+          // Do not swap as the full cache analysis would have the wrong
+          // VP8LBackwardRefs to start with.
+          if (!BackwardRefsClone(refs_tmp, &refs[1])) goto Error;
+        } else {
+          BackwardRefsSwap(refs_tmp, &refs[0]);
+        }
+        bit_costs_best[i] = bit_cost;
+        lz77_types_best[i] = lz77_type;
+        if (i == 0) *cache_bits_best = cache_bits;
+      }
+    }
+  }
+  assert(lz77_types_best[0] > 0);
+  assert(!do_no_cache || lz77_types_best[1] > 0);
+
+  // Improve on simple LZ77 but only for high quality (TraceBackwards is
+  // costly).
+  for (i = 1; i >= 0; --i) {
+    if (i == 1 && !do_no_cache) continue;
+    if ((lz77_types_best[i] == kLZ77Standard ||
+         lz77_types_best[i] == kLZ77Box) &&
+        quality >= 25) {
+      const VP8LHashChain* const hash_chain_tmp =
+          (lz77_types_best[i] == kLZ77Standard) ? hash_chain : &hash_chain_box;
+      const int cache_bits = (i == 1) ? 0 : *cache_bits_best;
+      if (VP8LBackwardReferencesTraceBackwards(width, height, argb, cache_bits,
+                                               hash_chain_tmp, &refs[i],
+                                               refs_tmp)) {
+        double bit_cost_trace;
+        VP8LHistogramCreate(histo, refs_tmp, cache_bits);
+        bit_cost_trace = VP8LHistogramEstimateBits(histo);
+        if (bit_cost_trace < bit_costs_best[i]) {
+          BackwardRefsSwap(refs_tmp, &refs[i]);
+        }
+      }
+    }
+
+    BackwardReferences2DLocality(width, &refs[i]);
+
+    if (i == 1 && lz77_types_best[0] == lz77_types_best[1] &&
+        *cache_bits_best == 0) {
+      // If the best cache size is 0 and we have the same best LZ77, just copy
+      // the data over and stop here.
+      if (!BackwardRefsClone(&refs[1], &refs[0])) goto Error;
+      break;
+    }
+  }
+  status = 1;
+
+Error:
+  VP8LHashChainClear(&hash_chain_box);
+  VP8LFreeHistogram(histo);
+  return status;
+}
+
+WebPEncodingError VP8LGetBackwardReferences(
+    int width, int height, const uint32_t* const argb, int quality,
+    int low_effort, int lz77_types_to_try, int cache_bits_max, int do_no_cache,
+    const VP8LHashChain* const hash_chain, VP8LBackwardRefs* const refs,
+    int* const cache_bits_best) {
+  if (low_effort) {
+    VP8LBackwardRefs* refs_best;
+    *cache_bits_best = cache_bits_max;
+    refs_best = GetBackwardReferencesLowEffort(
+        width, height, argb, cache_bits_best, hash_chain, refs);
+    if (refs_best == NULL) return VP8_ENC_ERROR_OUT_OF_MEMORY;
+    // Set it in first position.
+    BackwardRefsSwap(refs_best, &refs[0]);
+  } else {
+    if (!GetBackwardReferences(width, height, argb, quality, lz77_types_to_try,
+                               cache_bits_max, do_no_cache, hash_chain, refs,
+                               cache_bits_best)) {
+      return VP8_ENC_ERROR_OUT_OF_MEMORY;
+    }
+  }
+  return VP8_ENC_OK;
+}
diff --git a/media/libwebp/enc/backward_references_enc.h b/media/libwebp/enc/backward_references_enc.h
index 539e991cfc..292c630e5e 100644
--- a/media/libwebp/enc/backward_references_enc.h
+++ b/media/libwebp/enc/backward_references_enc.h
@@ -16,6 +16,7 @@
 #include <assert.h>
 #include <stdlib.h>
 #include "../webp/types.h"
+#include "../webp/encode.h"
 #include "../webp/format_constants.h"
 
 #ifdef __cplusplus
@@ -218,14 +219,19 @@ enum VP8LLZ77Type {
 // Evaluates best possible backward references for specified quality.
 // The input cache_bits to 'VP8LGetBackwardReferences' sets the maximum cache
 // bits to use (passing 0 implies disabling the local color cache).
-// The optimal cache bits is evaluated and set for the *cache_bits parameter.
-// The return value is the pointer to the best of the two backward refs viz,
-// refs[0] or refs[1].
-VP8LBackwardRefs* VP8LGetBackwardReferences(
+// The optimal cache bits is evaluated and set for the *cache_bits_best
+// parameter with the matching refs_best.
+// If do_no_cache == 0, refs is an array of 2 values and the best
+// VP8LBackwardRefs is put in the first element.
+// If do_no_cache != 0, refs is an array of 3 values and the best
+// VP8LBackwardRefs is put in the first element, the best value with no-cache in
+// the second element.
+// In both cases, the last element is used as temporary internally.
+WebPEncodingError VP8LGetBackwardReferences(
     int width, int height, const uint32_t* const argb, int quality,
-    int low_effort, int lz77_types_to_try, int* const cache_bits,
-    const VP8LHashChain* const hash_chain, VP8LBackwardRefs* const refs_tmp1,
-    VP8LBackwardRefs* const refs_tmp2);
+    int low_effort, int lz77_types_to_try, int cache_bits_max, int do_no_cache,
+    const VP8LHashChain* const hash_chain, VP8LBackwardRefs* const refs,
+    int* const cache_bits_best);
 
 #ifdef __cplusplus
 }
diff --git a/media/libwebp/enc/config_enc.c b/media/libwebp/enc/config_enc.c
new file mode 100644
index 0000000000..97df32d0d4
--- /dev/null
+++ b/media/libwebp/enc/config_enc.c
@@ -0,0 +1,157 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Coding tools configuration
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#ifdef HAVE_CONFIG_H
+#include "../webp/config.h"
+#endif
+
+#include "../webp/encode.h"
+
+//------------------------------------------------------------------------------
+// WebPConfig
+//------------------------------------------------------------------------------
+
+int WebPConfigInitInternal(WebPConfig* config,
+                           WebPPreset preset, float quality, int version) {
+  if (WEBP_ABI_IS_INCOMPATIBLE(version, WEBP_ENCODER_ABI_VERSION)) {
+    return 0;   // caller/system version mismatch!
+  }
+  if (config == NULL) return 0;
+
+  config->quality = quality;
+  config->target_size = 0;
+  config->target_PSNR = 0.;
+  config->method = 4;
+  config->sns_strength = 50;
+  config->filter_strength = 60;   // mid-filtering
+  config->filter_sharpness = 0;
+  config->filter_type = 1;        // default: strong (so U/V is filtered too)
+  config->partitions = 0;
+  config->segments = 4;
+  config->pass = 1;
+  config->qmin = 0;
+  config->qmax = 100;
+  config->show_compressed = 0;
+  config->preprocessing = 0;
+  config->autofilter = 0;
+  config->partition_limit = 0;
+  config->alpha_compression = 1;
+  config->alpha_filtering = 1;
+  config->alpha_quality = 100;
+  config->lossless = 0;
+  config->exact = 0;
+  config->image_hint = WEBP_HINT_DEFAULT;
+  config->emulate_jpeg_size = 0;
+  config->thread_level = 0;
+  config->low_memory = 0;
+  config->near_lossless = 100;
+  config->use_delta_palette = 0;
+  config->use_sharp_yuv = 0;
+
+  // TODO(skal): tune.
+  switch (preset) {
+    case WEBP_PRESET_PICTURE:
+      config->sns_strength = 80;
+      config->filter_sharpness = 4;
+      config->filter_strength = 35;
+      config->preprocessing &= ~2;   // no dithering
+      break;
+    case WEBP_PRESET_PHOTO:
+      config->sns_strength = 80;
+      config->filter_sharpness = 3;
+      config->filter_strength = 30;
+      config->preprocessing |= 2;
+      break;
+    case WEBP_PRESET_DRAWING:
+      config->sns_strength = 25;
+      config->filter_sharpness = 6;
+      config->filter_strength = 10;
+      break;
+    case WEBP_PRESET_ICON:
+      config->sns_strength = 0;
+      config->filter_strength = 0;   // disable filtering to retain sharpness
+      config->preprocessing &= ~2;   // no dithering
+      break;
+    case WEBP_PRESET_TEXT:
+      config->sns_strength = 0;
+      config->filter_strength = 0;   // disable filtering to retain sharpness
+      config->preprocessing &= ~2;   // no dithering
+      config->segments = 2;
+      break;
+    case WEBP_PRESET_DEFAULT:
+    default:
+      break;
+  }
+  return WebPValidateConfig(config);
+}
+
+int WebPValidateConfig(const WebPConfig* config) {
+  if (config == NULL) return 0;
+  if (config->quality < 0 || config->quality > 100) return 0;
+  if (config->target_size < 0) return 0;
+  if (config->target_PSNR < 0) return 0;
+  if (config->method < 0 || config->method > 6) return 0;
+  if (config->segments < 1 || config->segments > 4) return 0;
+  if (config->sns_strength < 0 || config->sns_strength > 100) return 0;
+  if (config->filter_strength < 0 || config->filter_strength > 100) return 0;
+  if (config->filter_sharpness < 0 || config->filter_sharpness > 7) return 0;
+  if (config->filter_type < 0 || config->filter_type > 1) return 0;
+  if (config->autofilter < 0 || config->autofilter > 1) return 0;
+  if (config->pass < 1 || config->pass > 10) return 0;
+  if (config->qmin < 0 || config->qmax > 100 || config->qmin > config->qmax) {
+    return 0;
+  }
+  if (config->show_compressed < 0 || config->show_compressed > 1) return 0;
+  if (config->preprocessing < 0 || config->preprocessing > 7) return 0;
+  if (config->partitions < 0 || config->partitions > 3) return 0;
+  if (config->partition_limit < 0 || config->partition_limit > 100) return 0;
+  if (config->alpha_compression < 0) return 0;
+  if (config->alpha_filtering < 0) return 0;
+  if (config->alpha_quality < 0 || config->alpha_quality > 100) return 0;
+  if (config->lossless < 0 || config->lossless > 1) return 0;
+  if (config->near_lossless < 0 || config->near_lossless > 100) return 0;
+  if (config->image_hint >= WEBP_HINT_LAST) return 0;
+  if (config->emulate_jpeg_size < 0 || config->emulate_jpeg_size > 1) return 0;
+  if (config->thread_level < 0 || config->thread_level > 1) return 0;
+  if (config->low_memory < 0 || config->low_memory > 1) return 0;
+  if (config->exact < 0 || config->exact > 1) return 0;
+  if (config->use_delta_palette < 0 || config->use_delta_palette > 1) {
+    return 0;
+  }
+  if (config->use_sharp_yuv < 0 || config->use_sharp_yuv > 1) return 0;
+
+  return 1;
+}
+
+//------------------------------------------------------------------------------
+
+#define MAX_LEVEL 9
+
+// Mapping between -z level and -m / -q parameter settings.
+static const struct {
+  uint8_t method_;
+  uint8_t quality_;
+} kLosslessPresets[MAX_LEVEL + 1] = {
+  { 0,  0 }, { 1, 20 }, { 2, 25 }, { 3, 30 }, { 3, 50 },
+  { 4, 50 }, { 4, 75 }, { 4, 90 }, { 5, 90 }, { 6, 100 }
+};
+
+int WebPConfigLosslessPreset(WebPConfig* config, int level) {
+  if (config == NULL || level < 0 || level > MAX_LEVEL) return 0;
+  config->lossless = 1;
+  config->method = kLosslessPresets[level].method_;
+  config->quality = kLosslessPresets[level].quality_;
+  return 1;
+}
+
+//------------------------------------------------------------------------------
diff --git a/media/libwebp/enc/cost_enc.c b/media/libwebp/enc/cost_enc.c
new file mode 100644
index 0000000000..bb7fe64fa2
--- /dev/null
+++ b/media/libwebp/enc/cost_enc.c
@@ -0,0 +1,342 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Cost tables for level and modes
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "../enc/cost_enc.h"
+
+//------------------------------------------------------------------------------
+// Level cost tables
+
+// For each given level, the following table gives the pattern of contexts to
+// use for coding it (in [][0]) as well as the bit value to use for each
+// context (in [][1]).
+const uint16_t VP8LevelCodes[MAX_VARIABLE_LEVEL][2] = {
+                  {0x001, 0x000}, {0x007, 0x001}, {0x00f, 0x005},
+  {0x00f, 0x00d}, {0x033, 0x003}, {0x033, 0x003}, {0x033, 0x023},
+  {0x033, 0x023}, {0x033, 0x023}, {0x033, 0x023}, {0x0d3, 0x013},
+  {0x0d3, 0x013}, {0x0d3, 0x013}, {0x0d3, 0x013}, {0x0d3, 0x013},
+  {0x0d3, 0x013}, {0x0d3, 0x013}, {0x0d3, 0x013}, {0x0d3, 0x093},
+  {0x0d3, 0x093}, {0x0d3, 0x093}, {0x0d3, 0x093}, {0x0d3, 0x093},
+  {0x0d3, 0x093}, {0x0d3, 0x093}, {0x0d3, 0x093}, {0x0d3, 0x093},
+  {0x0d3, 0x093}, {0x0d3, 0x093}, {0x0d3, 0x093}, {0x0d3, 0x093},
+  {0x0d3, 0x093}, {0x0d3, 0x093}, {0x0d3, 0x093}, {0x153, 0x053},
+  {0x153, 0x053}, {0x153, 0x053}, {0x153, 0x053}, {0x153, 0x053},
+  {0x153, 0x053}, {0x153, 0x053}, {0x153, 0x053}, {0x153, 0x053},
+  {0x153, 0x053}, {0x153, 0x053}, {0x153, 0x053}, {0x153, 0x053},
+  {0x153, 0x053}, {0x153, 0x053}, {0x153, 0x053}, {0x153, 0x053},
+  {0x153, 0x053}, {0x153, 0x053}, {0x153, 0x053}, {0x153, 0x053},
+  {0x153, 0x053}, {0x153, 0x053}, {0x153, 0x053}, {0x153, 0x053},
+  {0x153, 0x053}, {0x153, 0x053}, {0x153, 0x053}, {0x153, 0x053},
+  {0x153, 0x053}, {0x153, 0x053}, {0x153, 0x053}, {0x153, 0x153}
+};
+
+static int VariableLevelCost(int level, const uint8_t probas[NUM_PROBAS]) {
+  int pattern = VP8LevelCodes[level - 1][0];
+  int bits = VP8LevelCodes[level - 1][1];
+  int cost = 0;
+  int i;
+  for (i = 2; pattern; ++i) {
+    if (pattern & 1) {
+      cost += VP8BitCost(bits & 1, probas[i]);
+    }
+    bits >>= 1;
+    pattern >>= 1;
+  }
+  return cost;
+}
+
+//------------------------------------------------------------------------------
+// Pre-calc level costs once for all
+
+void VP8CalculateLevelCosts(VP8EncProba* const proba) {
+  int ctype, band, ctx;
+
+  if (!proba->dirty_) return;  // nothing to do.
+
+  for (ctype = 0; ctype < NUM_TYPES; ++ctype) {
+    int n;
+    for (band = 0; band < NUM_BANDS; ++band) {
+      for (ctx = 0; ctx < NUM_CTX; ++ctx) {
+        const uint8_t* const p = proba->coeffs_[ctype][band][ctx];
+        uint16_t* const table = proba->level_cost_[ctype][band][ctx];
+        const int cost0 = (ctx > 0) ? VP8BitCost(1, p[0]) : 0;
+        const int cost_base = VP8BitCost(1, p[1]) + cost0;
+        int v;
+        table[0] = VP8BitCost(0, p[1]) + cost0;
+        for (v = 1; v <= MAX_VARIABLE_LEVEL; ++v) {
+          table[v] = cost_base + VariableLevelCost(v, p);
+        }
+        // Starting at level 67 and up, the variable part of the cost is
+        // actually constant.
+      }
+    }
+    for (n = 0; n < 16; ++n) {    // replicate bands. We don't need to sentinel.
+      for (ctx = 0; ctx < NUM_CTX; ++ctx) {
+        proba->remapped_costs_[ctype][n][ctx] =
+            proba->level_cost_[ctype][VP8EncBands[n]][ctx];
+      }
+    }
+  }
+  proba->dirty_ = 0;
+}
+
+//------------------------------------------------------------------------------
+// Mode cost tables.
+
+// These are the fixed probabilities (in the coding trees) turned into bit-cost
+// by calling VP8BitCost().
+const uint16_t VP8FixedCostsUV[4] = { 302, 984, 439, 642 };
+// note: these values include the fixed VP8BitCost(1, 145) mode selection cost.
+const uint16_t VP8FixedCostsI16[4] = { 663, 919, 872, 919 };
+const uint16_t VP8FixedCostsI4[NUM_BMODES][NUM_BMODES][NUM_BMODES] = {
+  { {   40, 1151, 1723, 1874, 2103, 2019, 1628, 1777, 2226, 2137 },
+    {  192,  469, 1296, 1308, 1849, 1794, 1781, 1703, 1713, 1522 },
+    {  142,  910,  762, 1684, 1849, 1576, 1460, 1305, 1801, 1657 },
+    {  559,  641, 1370,  421, 1182, 1569, 1612, 1725,  863, 1007 },
+    {  299, 1059, 1256, 1108,  636, 1068, 1581, 1883,  869, 1142 },
+    {  277, 1111,  707, 1362, 1089,  672, 1603, 1541, 1545, 1291 },
+    {  214,  781, 1609, 1303, 1632, 2229,  726, 1560, 1713,  918 },
+    {  152, 1037, 1046, 1759, 1983, 2174, 1358,  742, 1740, 1390 },
+    {  512, 1046, 1420,  753,  752, 1297, 1486, 1613,  460, 1207 },
+    {  424,  827, 1362,  719, 1462, 1202, 1199, 1476, 1199,  538 } },
+  { {  240,  402, 1134, 1491, 1659, 1505, 1517, 1555, 1979, 2099 },
+    {  467,  242,  960, 1232, 1714, 1620, 1834, 1570, 1676, 1391 },
+    {  500,  455,  463, 1507, 1699, 1282, 1564,  982, 2114, 2114 },
+    {  672,  643, 1372,  331, 1589, 1667, 1453, 1938,  996,  876 },
+    {  458,  783, 1037,  911,  738,  968, 1165, 1518,  859, 1033 },
+    {  504,  815,  504, 1139, 1219,  719, 1506, 1085, 1268, 1268 },
+    {  333,  630, 1445, 1239, 1883, 3672,  799, 1548, 1865,  598 },
+    {  399,  644,  746, 1342, 1856, 1350, 1493,  613, 1855, 1015 },
+    {  622,  749, 1205,  608, 1066, 1408, 1290, 1406,  546,  971 },
+    {  500,  753, 1041,  668, 1230, 1617, 1297, 1425, 1383,  523 } },
+  { {  394,  553,  523, 1502, 1536,  981, 1608, 1142, 1666, 2181 },
+    {  655,  430,  375, 1411, 1861, 1220, 1677, 1135, 1978, 1553 },
+    {  690,  640,  245, 1954, 2070, 1194, 1528,  982, 1972, 2232 },
+    {  559,  834,  741,  867, 1131,  980, 1225,  852, 1092,  784 },
+    {  690,  875,  516,  959,  673,  894, 1056, 1190, 1528, 1126 },
+    {  740,  951,  384, 1277, 1177,  492, 1579, 1155, 1846, 1513 },
+    {  323,  775, 1062, 1776, 3062, 1274,  813, 1188, 1372,  655 },
+    {  488,  971,  484, 1767, 1515, 1775, 1115,  503, 1539, 1461 },
+    {  740, 1006,  998,  709,  851, 1230, 1337,  788,  741,  721 },
+    {  522, 1073,  573, 1045, 1346,  887, 1046, 1146, 1203,  697 } },
+  { {  105,  864, 1442, 1009, 1934, 1840, 1519, 1920, 1673, 1579 },
+    {  534,  305, 1193,  683, 1388, 2164, 1802, 1894, 1264, 1170 },
+    {  305,  518,  877, 1108, 1426, 3215, 1425, 1064, 1320, 1242 },
+    {  683,  732, 1927,  257, 1493, 2048, 1858, 1552, 1055,  947 },
+    {  394,  814, 1024,  660,  959, 1556, 1282, 1289,  893, 1047 },
+    {  528,  615,  996,  940, 1201,  635, 1094, 2515,  803, 1358 },
+    {  347,  614, 1609, 1187, 3133, 1345, 1007, 1339, 1017,  667 },
+    {  218,  740,  878, 1605, 3650, 3650, 1345,  758, 1357, 1617 },
+    {  672,  750, 1541,  558, 1257, 1599, 1870, 2135,  402, 1087 },
+    {  592,  684, 1161,  430, 1092, 1497, 1475, 1489, 1095,  822 } },
+  { {  228, 1056, 1059, 1368,  752,  982, 1512, 1518,  987, 1782 },
+    {  494,  514,  818,  942,  965,  892, 1610, 1356, 1048, 1363 },
+    {  512,  648,  591, 1042,  761,  991, 1196, 1454, 1309, 1463 },
+    {  683,  749, 1043,  676,  841, 1396, 1133, 1138,  654,  939 },
+    {  622, 1101, 1126,  994,  361, 1077, 1203, 1318,  877, 1219 },
+    {  631, 1068,  857, 1650,  651,  477, 1650, 1419,  828, 1170 },
+    {  555,  727, 1068, 1335, 3127, 1339,  820, 1331, 1077,  429 },
+    {  504,  879,  624, 1398,  889,  889, 1392,  808,  891, 1406 },
+    {  683, 1602, 1289,  977,  578,  983, 1280, 1708,  406, 1122 },
+    {  399,  865, 1433, 1070, 1072,  764,  968, 1477, 1223,  678 } },
+  { {  333,  760,  935, 1638, 1010,  529, 1646, 1410, 1472, 2219 },
+    {  512,  494,  750, 1160, 1215,  610, 1870, 1868, 1628, 1169 },
+    {  572,  646,  492, 1934, 1208,  603, 1580, 1099, 1398, 1995 },
+    {  786,  789,  942,  581, 1018,  951, 1599, 1207,  731,  768 },
+    {  690, 1015,  672, 1078,  582,  504, 1693, 1438, 1108, 2897 },
+    {  768, 1267,  571, 2005, 1243,  244, 2881, 1380, 1786, 1453 },
+    {  452,  899, 1293,  903, 1311, 3100,  465, 1311, 1319,  813 },
+    {  394,  927,  942, 1103, 1358, 1104,  946,  593, 1363, 1109 },
+    {  559, 1005, 1007, 1016,  658, 1173, 1021, 1164,  623, 1028 },
+    {  564,  796,  632, 1005, 1014,  863, 2316, 1268,  938,  764 } },
+  { {  266,  606, 1098, 1228, 1497, 1243,  948, 1030, 1734, 1461 },
+    {  366,  585,  901, 1060, 1407, 1247,  876, 1134, 1620, 1054 },
+    {  452,  565,  542, 1729, 1479, 1479, 1016,  886, 2938, 1150 },
+    {  555, 1088, 1533,  950, 1354,  895,  834, 1019, 1021,  496 },
+    {  704,  815, 1193,  971,  973,  640, 1217, 2214,  832,  578 },
+    {  672, 1245,  579,  871,  875,  774,  872, 1273, 1027,  949 },
+    {  296, 1134, 2050, 1784, 1636, 3425,  442, 1550, 2076,  722 },
+    {  342,  982, 1259, 1846, 1848, 1848,  622,  568, 1847, 1052 },
+    {  555, 1064, 1304,  828,  746, 1343, 1075, 1329, 1078,  494 },
+    {  288, 1167, 1285, 1174, 1639, 1639,  833, 2254, 1304,  509 } },
+  { {  342,  719,  767, 1866, 1757, 1270, 1246,  550, 1746, 2151 },
+    {  483,  653,  694, 1509, 1459, 1410, 1218,  507, 1914, 1266 },
+    {  488,  757,  447, 2979, 1813, 1268, 1654,  539, 1849, 2109 },
+    {  522, 1097, 1085,  851, 1365, 1111,  851,  901,  961,  605 },
+    {  709,  716,  841,  728,  736,  945,  941,  862, 2845, 1057 },
+    {  512, 1323,  500, 1336, 1083,  681, 1342,  717, 1604, 1350 },
+    {  452, 1155, 1372, 1900, 1501, 3290,  311,  944, 1919,  922 },
+    {  403, 1520,  977, 2132, 1733, 3522, 1076,  276, 3335, 1547 },
+    {  559, 1374, 1101,  615,  673, 2462,  974,  795,  984,  984 },
+    {  547, 1122, 1062,  812, 1410,  951, 1140,  622, 1268,  651 } },
+  { {  165,  982, 1235,  938, 1334, 1366, 1659, 1578,  964, 1612 },
+    {  592,  422,  925,  847, 1139, 1112, 1387, 2036,  861, 1041 },
+    {  403,  837,  732,  770,  941, 1658, 1250,  809, 1407, 1407 },
+    {  896,  874, 1071,  381, 1568, 1722, 1437, 2192,  480, 1035 },
+    {  640, 1098, 1012, 1032,  684, 1382, 1581, 2106,  416,  865 },
+    {  559, 1005,  819,  914,  710,  770, 1418,  920,  838, 1435 },
+    {  415, 1258, 1245,  870, 1278, 3067,  770, 1021, 1287,  522 },
+    {  406,  990,  601, 1009, 1265, 1265, 1267,  759, 1017, 1277 },
+    {  968, 1182, 1329,  788, 1032, 1292, 1705, 1714,  203, 1403 },
+    {  732,  877, 1279,  471,  901, 1161, 1545, 1294,  755,  755 } },
+  { {  111,  931, 1378, 1185, 1933, 1648, 1148, 1714, 1873, 1307 },
+    {  406,  414, 1030, 1023, 1910, 1404, 1313, 1647, 1509,  793 },
+    {  342,  640,  575, 1088, 1241, 1349, 1161, 1350, 1756, 1502 },
+    {  559,  766, 1185,  357, 1682, 1428, 1329, 1897, 1219,  802 },
+    {  473,  909, 1164,  771,  719, 2508, 1427, 1432,  722,  782 },
+    {  342,  892,  785, 1145, 1150,  794, 1296, 1550,  973, 1057 },
+    {  208, 1036, 1326, 1343, 1606, 3395,  815, 1455, 1618,  712 },
+    {  228,  928,  890, 1046, 3499, 1711,  994,  829, 1720, 1318 },
+    {  768,  724, 1058,  636,  991, 1075, 1319, 1324,  616,  825 },
+    {  305, 1167, 1358,  899, 1587, 1587,  987, 1988, 1332,  501 } }
+};
+
+//------------------------------------------------------------------------------
+// helper functions for residuals struct VP8Residual.
+
+void VP8InitResidual(int first, int coeff_type,
+                     VP8Encoder* const enc, VP8Residual* const res) {
+  res->coeff_type = coeff_type;
+  res->prob  = enc->proba_.coeffs_[coeff_type];
+  res->stats = enc->proba_.stats_[coeff_type];
+  res->costs = enc->proba_.remapped_costs_[coeff_type];
+  res->first = first;
+}
+
+//------------------------------------------------------------------------------
+// Mode costs
+
+int VP8GetCostLuma4(VP8EncIterator* const it, const int16_t levels[16]) {
+  const int x = (it->i4_ & 3), y = (it->i4_ >> 2);
+  VP8Residual res;
+  VP8Encoder* const enc = it->enc_;
+  int R = 0;
+  int ctx;
+
+  VP8InitResidual(0, 3, enc, &res);
+  ctx = it->top_nz_[x] + it->left_nz_[y];
+  VP8SetResidualCoeffs(levels, &res);
+  R += VP8GetResidualCost(ctx, &res);
+  return R;
+}
+
+int VP8GetCostLuma16(VP8EncIterator* const it, const VP8ModeScore* const rd) {
+  VP8Residual res;
+  VP8Encoder* const enc = it->enc_;
+  int x, y;
+  int R = 0;
+
+  VP8IteratorNzToBytes(it);   // re-import the non-zero context
+
+  // DC
+  VP8InitResidual(0, 1, enc, &res);
+  VP8SetResidualCoeffs(rd->y_dc_levels, &res);
+  R += VP8GetResidualCost(it->top_nz_[8] + it->left_nz_[8], &res);
+
+  // AC
+  VP8InitResidual(1, 0, enc, &res);
+  for (y = 0; y < 4; ++y) {
+    for (x = 0; x < 4; ++x) {
+      const int ctx = it->top_nz_[x] + it->left_nz_[y];
+      VP8SetResidualCoeffs(rd->y_ac_levels[x + y * 4], &res);
+      R += VP8GetResidualCost(ctx, &res);
+      it->top_nz_[x] = it->left_nz_[y] = (res.last >= 0);
+    }
+  }
+  return R;
+}
+
+int VP8GetCostUV(VP8EncIterator* const it, const VP8ModeScore* const rd) {
+  VP8Residual res;
+  VP8Encoder* const enc = it->enc_;
+  int ch, x, y;
+  int R = 0;
+
+  VP8IteratorNzToBytes(it);  // re-import the non-zero context
+
+  VP8InitResidual(0, 2, enc, &res);
+  for (ch = 0; ch <= 2; ch += 2) {
+    for (y = 0; y < 2; ++y) {
+      for (x = 0; x < 2; ++x) {
+        const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
+        VP8SetResidualCoeffs(rd->uv_levels[ch * 2 + x + y * 2], &res);
+        R += VP8GetResidualCost(ctx, &res);
+        it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] = (res.last >= 0);
+      }
+    }
+  }
+  return R;
+}
+
+
+//------------------------------------------------------------------------------
+// Recording of token probabilities.
+
+// We keep the table-free variant around for reference, in case.
+#define USE_LEVEL_CODE_TABLE
+
+// Simulate block coding, but only record statistics.
+// Note: no need to record the fixed probas.
+int VP8RecordCoeffs(int ctx, const VP8Residual* const res) {
+  int n = res->first;
+  // should be stats[VP8EncBands[n]], but it's equivalent for n=0 or 1
+  proba_t* s = res->stats[n][ctx];
+  if (res->last  < 0) {
+    VP8RecordStats(0, s + 0);
+    return 0;
+  }
+  while (n <= res->last) {
+    int v;
+    VP8RecordStats(1, s + 0);  // order of record doesn't matter
+    while ((v = res->coeffs[n++]) == 0) {
+      VP8RecordStats(0, s + 1);
+      s = res->stats[VP8EncBands[n]][0];
+    }
+    VP8RecordStats(1, s + 1);
+    if (!VP8RecordStats(2u < (unsigned int)(v + 1), s + 2)) {  // v = -1 or 1
+      s = res->stats[VP8EncBands[n]][1];
+    } else {
+      v = abs(v);
+#if !defined(USE_LEVEL_CODE_TABLE)
+      if (!VP8RecordStats(v > 4, s + 3)) {
+        if (VP8RecordStats(v != 2, s + 4))
+          VP8RecordStats(v == 4, s + 5);
+      } else if (!VP8RecordStats(v > 10, s + 6)) {
+        VP8RecordStats(v > 6, s + 7);
+      } else if (!VP8RecordStats((v >= 3 + (8 << 2)), s + 8)) {
+        VP8RecordStats((v >= 3 + (8 << 1)), s + 9);
+      } else {
+        VP8RecordStats((v >= 3 + (8 << 3)), s + 10);
+      }
+#else
+      if (v > MAX_VARIABLE_LEVEL) {
+        v = MAX_VARIABLE_LEVEL;
+      }
+
+      {
+        const int bits = VP8LevelCodes[v - 1][1];
+        int pattern = VP8LevelCodes[v - 1][0];
+        int i;
+        for (i = 0; (pattern >>= 1) != 0; ++i) {
+          const int mask = 2 << i;
+          if (pattern & 1) VP8RecordStats(!!(bits & mask), s + 3 + i);
+        }
+      }
+#endif
+      s = res->stats[VP8EncBands[n]][2];
+    }
+  }
+  if (n < 16) VP8RecordStats(0, s + 0);
+  return 1;
+}
+
+//------------------------------------------------------------------------------
diff --git a/media/libwebp/enc/delta_palettization_enc.h b/media/libwebp/enc/delta_palettization_enc.h
deleted file mode 100644
index 63048ec6e8..0000000000
--- a/media/libwebp/enc/delta_palettization_enc.h
+++ /dev/null
@@ -1,25 +0,0 @@
-// Copyright 2015 Google Inc. All Rights Reserved.
-//
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
-// -----------------------------------------------------------------------------
-//
-// Author: Mislav Bradac (mislavm@google.com)
-//
-
-#ifndef WEBP_ENC_DELTA_PALETTIZATION_H_
-#define WEBP_ENC_DELTA_PALETTIZATION_H_
-
-#include "../webp/encode.h"
-#include "../enc/vp8li_enc.h"
-
-// Replaces enc->argb_[] input by a palettizable approximation of it,
-// and generates optimal enc->palette_[].
-// This function can revert enc->use_palette_ / enc->use_predict_ flag
-// if delta-palettization is not producing expected saving.
-WebPEncodingError WebPSearchOptimalDeltaPalette(VP8LEncoder* const enc);
-
-#endif  // WEBP_ENC_DELTA_PALETTIZATION_H_
diff --git a/media/libwebp/enc/filter_enc.c b/media/libwebp/enc/filter_enc.c
new file mode 100644
index 0000000000..5ffc232626
--- /dev/null
+++ b/media/libwebp/enc/filter_enc.c
@@ -0,0 +1,235 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Selecting filter level
+//
+// Author: somnath@google.com (Somnath Banerjee)
+
+#include <assert.h>
+#include "../enc/vp8i_enc.h"
+#include "../dsp/dsp.h"
+
+// This table gives, for a given sharpness, the filtering strength to be
+// used (at least) in order to filter a given edge step delta.
+// This is constructed by brute force inspection: for all delta, we iterate
+// over all possible filtering strength / thresh until needs_filter() returns
+// true.
+#define MAX_DELTA_SIZE 64
+static const uint8_t kLevelsFromDelta[8][MAX_DELTA_SIZE] = {
+  { 0,   1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63 },
+  { 0,  1,  2,  3,  5,  6,  7,  8,  9, 11, 12, 13, 14, 15, 17, 18,
+    20, 21, 23, 24, 26, 27, 29, 30, 32, 33, 35, 36, 38, 39, 41, 42,
+    44, 45, 47, 48, 50, 51, 53, 54, 56, 57, 59, 60, 62, 63, 63, 63,
+    63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63 },
+  {  0,  1,  2,  3,  5,  6,  7,  8,  9, 11, 12, 13, 14, 16, 17, 19,
+    20, 22, 23, 25, 26, 28, 29, 31, 32, 34, 35, 37, 38, 40, 41, 43,
+    44, 46, 47, 49, 50, 52, 53, 55, 56, 58, 59, 61, 62, 63, 63, 63,
+    63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63 },
+  {  0,  1,  2,  3,  5,  6,  7,  8,  9, 11, 12, 13, 15, 16, 18, 19,
+    21, 22, 24, 25, 27, 28, 30, 31, 33, 34, 36, 37, 39, 40, 42, 43,
+    45, 46, 48, 49, 51, 52, 54, 55, 57, 58, 60, 61, 63, 63, 63, 63,
+    63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63 },
+  {  0,  1,  2,  3,  5,  6,  7,  8,  9, 11, 12, 14, 15, 17, 18, 20,
+    21, 23, 24, 26, 27, 29, 30, 32, 33, 35, 36, 38, 39, 41, 42, 44,
+    45, 47, 48, 50, 51, 53, 54, 56, 57, 59, 60, 62, 63, 63, 63, 63,
+    63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63 },
+  {  0,  1,  2,  4,  5,  7,  8,  9, 11, 12, 13, 15, 16, 17, 19, 20,
+    22, 23, 25, 26, 28, 29, 31, 32, 34, 35, 37, 38, 40, 41, 43, 44,
+    46, 47, 49, 50, 52, 53, 55, 56, 58, 59, 61, 62, 63, 63, 63, 63,
+    63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63 },
+  {  0,  1,  2,  4,  5,  7,  8,  9, 11, 12, 13, 15, 16, 18, 19, 21,
+    22, 24, 25, 27, 28, 30, 31, 33, 34, 36, 37, 39, 40, 42, 43, 45,
+    46, 48, 49, 51, 52, 54, 55, 57, 58, 60, 61, 63, 63, 63, 63, 63,
+    63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63 },
+  {  0,  1,  2,  4,  5,  7,  8,  9, 11, 12, 14, 15, 17, 18, 20, 21,
+    23, 24, 26, 27, 29, 30, 32, 33, 35, 36, 38, 39, 41, 42, 44, 45,
+    47, 48, 50, 51, 53, 54, 56, 57, 59, 60, 62, 63, 63, 63, 63, 63,
+    63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63 }
+};
+
+int VP8FilterStrengthFromDelta(int sharpness, int delta) {
+  const int pos = (delta < MAX_DELTA_SIZE) ? delta : MAX_DELTA_SIZE - 1;
+  assert(sharpness >= 0 && sharpness <= 7);
+  return kLevelsFromDelta[sharpness][pos];
+}
+
+//------------------------------------------------------------------------------
+// Paragraph 15.4: compute the inner-edge filtering strength
+
+#if !defined(WEBP_REDUCE_SIZE)
+
+static int GetILevel(int sharpness, int level) {
+  if (sharpness > 0) {
+    if (sharpness > 4) {
+      level >>= 2;
+    } else {
+      level >>= 1;
+    }
+    if (level > 9 - sharpness) {
+      level = 9 - sharpness;
+    }
+  }
+  if (level < 1) level = 1;
+  return level;
+}
+
+static void DoFilter(const VP8EncIterator* const it, int level) {
+  const VP8Encoder* const enc = it->enc_;
+  const int ilevel = GetILevel(enc->config_->filter_sharpness, level);
+  const int limit = 2 * level + ilevel;
+
+  uint8_t* const y_dst = it->yuv_out2_ + Y_OFF_ENC;
+  uint8_t* const u_dst = it->yuv_out2_ + U_OFF_ENC;
+  uint8_t* const v_dst = it->yuv_out2_ + V_OFF_ENC;
+
+  // copy current block to yuv_out2_
+  memcpy(y_dst, it->yuv_out_, YUV_SIZE_ENC * sizeof(uint8_t));
+
+  if (enc->filter_hdr_.simple_ == 1) {   // simple
+    VP8SimpleHFilter16i(y_dst, BPS, limit);
+    VP8SimpleVFilter16i(y_dst, BPS, limit);
+  } else {    // complex
+    const int hev_thresh = (level >= 40) ? 2 : (level >= 15) ? 1 : 0;
+    VP8HFilter16i(y_dst, BPS, limit, ilevel, hev_thresh);
+    VP8HFilter8i(u_dst, v_dst, BPS, limit, ilevel, hev_thresh);
+    VP8VFilter16i(y_dst, BPS, limit, ilevel, hev_thresh);
+    VP8VFilter8i(u_dst, v_dst, BPS, limit, ilevel, hev_thresh);
+  }
+}
+
+//------------------------------------------------------------------------------
+// SSIM metric for one macroblock
+
+static double GetMBSSIM(const uint8_t* yuv1, const uint8_t* yuv2) {
+  int x, y;
+  double sum = 0.;
+
+  // compute SSIM in a 10 x 10 window
+  for (y = VP8_SSIM_KERNEL; y < 16 - VP8_SSIM_KERNEL; y++) {
+    for (x = VP8_SSIM_KERNEL; x < 16 - VP8_SSIM_KERNEL; x++) {
+      sum += VP8SSIMGetClipped(yuv1 + Y_OFF_ENC, BPS, yuv2 + Y_OFF_ENC, BPS,
+                               x, y, 16, 16);
+    }
+  }
+  for (x = 1; x < 7; x++) {
+    for (y = 1; y < 7; y++) {
+      sum += VP8SSIMGetClipped(yuv1 + U_OFF_ENC, BPS, yuv2 + U_OFF_ENC, BPS,
+                               x, y, 8, 8);
+      sum += VP8SSIMGetClipped(yuv1 + V_OFF_ENC, BPS, yuv2 + V_OFF_ENC, BPS,
+                               x, y, 8, 8);
+    }
+  }
+  return sum;
+}
+
+#endif  // !defined(WEBP_REDUCE_SIZE)
+
+//------------------------------------------------------------------------------
+// Exposed APIs: Encoder should call the following 3 functions to adjust
+// loop filter strength
+
+void VP8InitFilter(VP8EncIterator* const it) {
+#if !defined(WEBP_REDUCE_SIZE)
+  if (it->lf_stats_ != NULL) {
+    int s, i;
+    for (s = 0; s < NUM_MB_SEGMENTS; s++) {
+      for (i = 0; i < MAX_LF_LEVELS; i++) {
+        (*it->lf_stats_)[s][i] = 0;
+      }
+    }
+    VP8SSIMDspInit();
+  }
+#else
+  (void)it;
+#endif
+}
+
+void VP8StoreFilterStats(VP8EncIterator* const it) {
+#if !defined(WEBP_REDUCE_SIZE)
+  int d;
+  VP8Encoder* const enc = it->enc_;
+  const int s = it->mb_->segment_;
+  const int level0 = enc->dqm_[s].fstrength_;
+
+  // explore +/-quant range of values around level0
+  const int delta_min = -enc->dqm_[s].quant_;
+  const int delta_max = enc->dqm_[s].quant_;
+  const int step_size = (delta_max - delta_min >= 4) ? 4 : 1;
+
+  if (it->lf_stats_ == NULL) return;
+
+  // NOTE: Currently we are applying filter only across the sublock edges
+  // There are two reasons for that.
+  // 1. Applying filter on macro block edges will change the pixels in
+  // the left and top macro blocks. That will be hard to restore
+  // 2. Macro Blocks on the bottom and right are not yet compressed. So we
+  // cannot apply filter on the right and bottom macro block edges.
+  if (it->mb_->type_ == 1 && it->mb_->skip_) return;
+
+  // Always try filter level  zero
+  (*it->lf_stats_)[s][0] += GetMBSSIM(it->yuv_in_, it->yuv_out_);
+
+  for (d = delta_min; d <= delta_max; d += step_size) {
+    const int level = level0 + d;
+    if (level <= 0 || level >= MAX_LF_LEVELS) {
+      continue;
+    }
+    DoFilter(it, level);
+    (*it->lf_stats_)[s][level] += GetMBSSIM(it->yuv_in_, it->yuv_out2_);
+  }
+#else  // defined(WEBP_REDUCE_SIZE)
+  (void)it;
+#endif  // !defined(WEBP_REDUCE_SIZE)
+}
+
+void VP8AdjustFilterStrength(VP8EncIterator* const it) {
+  VP8Encoder* const enc = it->enc_;
+#if !defined(WEBP_REDUCE_SIZE)
+  if (it->lf_stats_ != NULL) {
+    int s;
+    for (s = 0; s < NUM_MB_SEGMENTS; s++) {
+      int i, best_level = 0;
+      // Improvement over filter level 0 should be at least 1e-5 (relatively)
+      double best_v = 1.00001 * (*it->lf_stats_)[s][0];
+      for (i = 1; i < MAX_LF_LEVELS; i++) {
+        const double v = (*it->lf_stats_)[s][i];
+        if (v > best_v) {
+          best_v = v;
+          best_level = i;
+        }
+      }
+      enc->dqm_[s].fstrength_ = best_level;
+    }
+    return;
+  }
+#endif  // !defined(WEBP_REDUCE_SIZE)
+  if (enc->config_->filter_strength > 0) {
+    int max_level = 0;
+    int s;
+    for (s = 0; s < NUM_MB_SEGMENTS; s++) {
+      VP8SegmentInfo* const dqm = &enc->dqm_[s];
+      // this '>> 3' accounts for some inverse WHT scaling
+      const int delta = (dqm->max_edge_ * dqm->y2_.q_[1]) >> 3;
+      const int level =
+          VP8FilterStrengthFromDelta(enc->filter_hdr_.sharpness_, delta);
+      if (level > dqm->fstrength_) {
+        dqm->fstrength_ = level;
+      }
+      if (max_level < dqm->fstrength_) {
+        max_level = dqm->fstrength_;
+      }
+    }
+    enc->filter_hdr_.level_ = max_level;
+  }
+}
+
+// -----------------------------------------------------------------------------
diff --git a/media/libwebp/enc/frame_enc.c b/media/libwebp/enc/frame_enc.c
new file mode 100644
index 0000000000..c8698ca5b5
--- /dev/null
+++ b/media/libwebp/enc/frame_enc.c
@@ -0,0 +1,899 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+//   frame coding and analysis
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <string.h>
+#include <math.h>
+
+#include "../enc/cost_enc.h"
+#include "../enc/vp8i_enc.h"
+#include "../dsp/dsp.h"
+#include "../webp/format_constants.h"  // RIFF constants
+
+#define SEGMENT_VISU 0
+#define DEBUG_SEARCH 0    // useful to track search convergence
+
+//------------------------------------------------------------------------------
+// multi-pass convergence
+
+#define HEADER_SIZE_ESTIMATE (RIFF_HEADER_SIZE + CHUNK_HEADER_SIZE +  \
+                              VP8_FRAME_HEADER_SIZE)
+#define DQ_LIMIT 0.4  // convergence is considered reached if dq < DQ_LIMIT
+// we allow 2k of extra head-room in PARTITION0 limit.
+#define PARTITION0_SIZE_LIMIT ((VP8_MAX_PARTITION0_SIZE - 2048ULL) << 11)
+
+static float Clamp(float v, float min, float max) {
+  return (v < min) ? min : (v > max) ? max : v;
+}
+
+typedef struct {  // struct for organizing convergence in either size or PSNR
+  int is_first;
+  float dq;
+  float q, last_q;
+  float qmin, qmax;
+  double value, last_value;   // PSNR or size
+  double target;
+  int do_size_search;
+} PassStats;
+
+static int InitPassStats(const VP8Encoder* const enc, PassStats* const s) {
+  const uint64_t target_size = (uint64_t)enc->config_->target_size;
+  const int do_size_search = (target_size != 0);
+  const float target_PSNR = enc->config_->target_PSNR;
+
+  s->is_first = 1;
+  s->dq = 10.f;
+  s->qmin = 1.f * enc->config_->qmin;
+  s->qmax = 1.f * enc->config_->qmax;
+  s->q = s->last_q = Clamp(enc->config_->quality, s->qmin, s->qmax);
+  s->target = do_size_search ? (double)target_size
+            : (target_PSNR > 0.) ? target_PSNR
+            : 40.;   // default, just in case
+  s->value = s->last_value = 0.;
+  s->do_size_search = do_size_search;
+  return do_size_search;
+}
+
+static float ComputeNextQ(PassStats* const s) {
+  float dq;
+  if (s->is_first) {
+    dq = (s->value > s->target) ? -s->dq : s->dq;
+    s->is_first = 0;
+  } else if (s->value != s->last_value) {
+    const double slope = (s->target - s->value) / (s->last_value - s->value);
+    dq = (float)(slope * (s->last_q - s->q));
+  } else {
+    dq = 0.;  // we're done?!
+  }
+  // Limit variable to avoid large swings.
+  s->dq = Clamp(dq, -30.f, 30.f);
+  s->last_q = s->q;
+  s->last_value = s->value;
+  s->q = Clamp(s->q + s->dq, s->qmin, s->qmax);
+  return s->q;
+}
+
+//------------------------------------------------------------------------------
+// Tables for level coding
+
+const uint8_t VP8Cat3[] = { 173, 148, 140 };
+const uint8_t VP8Cat4[] = { 176, 155, 140, 135 };
+const uint8_t VP8Cat5[] = { 180, 157, 141, 134, 130 };
+const uint8_t VP8Cat6[] =
+    { 254, 254, 243, 230, 196, 177, 153, 140, 133, 130, 129 };
+
+//------------------------------------------------------------------------------
+// Reset the statistics about: number of skips, token proba, level cost,...
+
+static void ResetStats(VP8Encoder* const enc) {
+  VP8EncProba* const proba = &enc->proba_;
+  VP8CalculateLevelCosts(proba);
+  proba->nb_skip_ = 0;
+}
+
+//------------------------------------------------------------------------------
+// Skip decision probability
+
+#define SKIP_PROBA_THRESHOLD 250  // value below which using skip_proba is OK.
+
+static int CalcSkipProba(uint64_t nb, uint64_t total) {
+  return (int)(total ? (total - nb) * 255 / total : 255);
+}
+
+// Returns the bit-cost for coding the skip probability.
+static int FinalizeSkipProba(VP8Encoder* const enc) {
+  VP8EncProba* const proba = &enc->proba_;
+  const int nb_mbs = enc->mb_w_ * enc->mb_h_;
+  const int nb_events = proba->nb_skip_;
+  int size;
+  proba->skip_proba_ = CalcSkipProba(nb_events, nb_mbs);
+  proba->use_skip_proba_ = (proba->skip_proba_ < SKIP_PROBA_THRESHOLD);
+  size = 256;   // 'use_skip_proba' bit
+  if (proba->use_skip_proba_) {
+    size +=  nb_events * VP8BitCost(1, proba->skip_proba_)
+         + (nb_mbs - nb_events) * VP8BitCost(0, proba->skip_proba_);
+    size += 8 * 256;   // cost of signaling the skip_proba_ itself.
+  }
+  return size;
+}
+
+// Collect statistics and deduce probabilities for next coding pass.
+// Return the total bit-cost for coding the probability updates.
+static int CalcTokenProba(int nb, int total) {
+  assert(nb <= total);
+  return nb ? (255 - nb * 255 / total) : 255;
+}
+
+// Cost of coding 'nb' 1's and 'total-nb' 0's using 'proba' probability.
+static int BranchCost(int nb, int total, int proba) {
+  return nb * VP8BitCost(1, proba) + (total - nb) * VP8BitCost(0, proba);
+}
+
+static void ResetTokenStats(VP8Encoder* const enc) {
+  VP8EncProba* const proba = &enc->proba_;
+  memset(proba->stats_, 0, sizeof(proba->stats_));
+}
+
+static int FinalizeTokenProbas(VP8EncProba* const proba) {
+  int has_changed = 0;
+  int size = 0;
+  int t, b, c, p;
+  for (t = 0; t < NUM_TYPES; ++t) {
+    for (b = 0; b < NUM_BANDS; ++b) {
+      for (c = 0; c < NUM_CTX; ++c) {
+        for (p = 0; p < NUM_PROBAS; ++p) {
+          const proba_t stats = proba->stats_[t][b][c][p];
+          const int nb = (stats >> 0) & 0xffff;
+          const int total = (stats >> 16) & 0xffff;
+          const int update_proba = VP8CoeffsUpdateProba[t][b][c][p];
+          const int old_p = VP8CoeffsProba0[t][b][c][p];
+          const int new_p = CalcTokenProba(nb, total);
+          const int old_cost = BranchCost(nb, total, old_p)
+                             + VP8BitCost(0, update_proba);
+          const int new_cost = BranchCost(nb, total, new_p)
+                             + VP8BitCost(1, update_proba)
+                             + 8 * 256;
+          const int use_new_p = (old_cost > new_cost);
+          size += VP8BitCost(use_new_p, update_proba);
+          if (use_new_p) {  // only use proba that seem meaningful enough.
+            proba->coeffs_[t][b][c][p] = new_p;
+            has_changed |= (new_p != old_p);
+            size += 8 * 256;
+          } else {
+            proba->coeffs_[t][b][c][p] = old_p;
+          }
+        }
+      }
+    }
+  }
+  proba->dirty_ = has_changed;
+  return size;
+}
+
+//------------------------------------------------------------------------------
+// Finalize Segment probability based on the coding tree
+
+static int GetProba(int a, int b) {
+  const int total = a + b;
+  return (total == 0) ? 255     // that's the default probability.
+                      : (255 * a + total / 2) / total;  // rounded proba
+}
+
+static void ResetSegments(VP8Encoder* const enc) {
+  int n;
+  for (n = 0; n < enc->mb_w_ * enc->mb_h_; ++n) {
+    enc->mb_info_[n].segment_ = 0;
+  }
+}
+
+static void SetSegmentProbas(VP8Encoder* const enc) {
+  int p[NUM_MB_SEGMENTS] = { 0 };
+  int n;
+
+  for (n = 0; n < enc->mb_w_ * enc->mb_h_; ++n) {
+    const VP8MBInfo* const mb = &enc->mb_info_[n];
+    ++p[mb->segment_];
+  }
+#if !defined(WEBP_DISABLE_STATS)
+  if (enc->pic_->stats != NULL) {
+    for (n = 0; n < NUM_MB_SEGMENTS; ++n) {
+      enc->pic_->stats->segment_size[n] = p[n];
+    }
+  }
+#endif
+  if (enc->segment_hdr_.num_segments_ > 1) {
+    uint8_t* const probas = enc->proba_.segments_;
+    probas[0] = GetProba(p[0] + p[1], p[2] + p[3]);
+    probas[1] = GetProba(p[0], p[1]);
+    probas[2] = GetProba(p[2], p[3]);
+
+    enc->segment_hdr_.update_map_ =
+        (probas[0] != 255) || (probas[1] != 255) || (probas[2] != 255);
+    if (!enc->segment_hdr_.update_map_) ResetSegments(enc);
+    enc->segment_hdr_.size_ =
+        p[0] * (VP8BitCost(0, probas[0]) + VP8BitCost(0, probas[1])) +
+        p[1] * (VP8BitCost(0, probas[0]) + VP8BitCost(1, probas[1])) +
+        p[2] * (VP8BitCost(1, probas[0]) + VP8BitCost(0, probas[2])) +
+        p[3] * (VP8BitCost(1, probas[0]) + VP8BitCost(1, probas[2]));
+  } else {
+    enc->segment_hdr_.update_map_ = 0;
+    enc->segment_hdr_.size_ = 0;
+  }
+}
+
+//------------------------------------------------------------------------------
+// Coefficient coding
+
+static int PutCoeffs(VP8BitWriter* const bw, int ctx, const VP8Residual* res) {
+  int n = res->first;
+  // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
+  const uint8_t* p = res->prob[n][ctx];
+  if (!VP8PutBit(bw, res->last >= 0, p[0])) {
+    return 0;
+  }
+
+  while (n < 16) {
+    const int c = res->coeffs[n++];
+    const int sign = c < 0;
+    int v = sign ? -c : c;
+    if (!VP8PutBit(bw, v != 0, p[1])) {
+      p = res->prob[VP8EncBands[n]][0];
+      continue;
+    }
+    if (!VP8PutBit(bw, v > 1, p[2])) {
+      p = res->prob[VP8EncBands[n]][1];
+    } else {
+      if (!VP8PutBit(bw, v > 4, p[3])) {
+        if (VP8PutBit(bw, v != 2, p[4])) {
+          VP8PutBit(bw, v == 4, p[5]);
+        }
+      } else if (!VP8PutBit(bw, v > 10, p[6])) {
+        if (!VP8PutBit(bw, v > 6, p[7])) {
+          VP8PutBit(bw, v == 6, 159);
+        } else {
+          VP8PutBit(bw, v >= 9, 165);
+          VP8PutBit(bw, !(v & 1), 145);
+        }
+      } else {
+        int mask;
+        const uint8_t* tab;
+        if (v < 3 + (8 << 1)) {          // VP8Cat3  (3b)
+          VP8PutBit(bw, 0, p[8]);
+          VP8PutBit(bw, 0, p[9]);
+          v -= 3 + (8 << 0);
+          mask = 1 << 2;
+          tab = VP8Cat3;
+        } else if (v < 3 + (8 << 2)) {   // VP8Cat4  (4b)
+          VP8PutBit(bw, 0, p[8]);
+          VP8PutBit(bw, 1, p[9]);
+          v -= 3 + (8 << 1);
+          mask = 1 << 3;
+          tab = VP8Cat4;
+        } else if (v < 3 + (8 << 3)) {   // VP8Cat5  (5b)
+          VP8PutBit(bw, 1, p[8]);
+          VP8PutBit(bw, 0, p[10]);
+          v -= 3 + (8 << 2);
+          mask = 1 << 4;
+          tab = VP8Cat5;
+        } else {                         // VP8Cat6 (11b)
+          VP8PutBit(bw, 1, p[8]);
+          VP8PutBit(bw, 1, p[10]);
+          v -= 3 + (8 << 3);
+          mask = 1 << 10;
+          tab = VP8Cat6;
+        }
+        while (mask) {
+          VP8PutBit(bw, !!(v & mask), *tab++);
+          mask >>= 1;
+        }
+      }
+      p = res->prob[VP8EncBands[n]][2];
+    }
+    VP8PutBitUniform(bw, sign);
+    if (n == 16 || !VP8PutBit(bw, n <= res->last, p[0])) {
+      return 1;   // EOB
+    }
+  }
+  return 1;
+}
+
+static void CodeResiduals(VP8BitWriter* const bw, VP8EncIterator* const it,
+                          const VP8ModeScore* const rd) {
+  int x, y, ch;
+  VP8Residual res;
+  uint64_t pos1, pos2, pos3;
+  const int i16 = (it->mb_->type_ == 1);
+  const int segment = it->mb_->segment_;
+  VP8Encoder* const enc = it->enc_;
+
+  VP8IteratorNzToBytes(it);
+
+  pos1 = VP8BitWriterPos(bw);
+  if (i16) {
+    VP8InitResidual(0, 1, enc, &res);
+    VP8SetResidualCoeffs(rd->y_dc_levels, &res);
+    it->top_nz_[8] = it->left_nz_[8] =
+      PutCoeffs(bw, it->top_nz_[8] + it->left_nz_[8], &res);
+    VP8InitResidual(1, 0, enc, &res);
+  } else {
+    VP8InitResidual(0, 3, enc, &res);
+  }
+
+  // luma-AC
+  for (y = 0; y < 4; ++y) {
+    for (x = 0; x < 4; ++x) {
+      const int ctx = it->top_nz_[x] + it->left_nz_[y];
+      VP8SetResidualCoeffs(rd->y_ac_levels[x + y * 4], &res);
+      it->top_nz_[x] = it->left_nz_[y] = PutCoeffs(bw, ctx, &res);
+    }
+  }
+  pos2 = VP8BitWriterPos(bw);
+
+  // U/V
+  VP8InitResidual(0, 2, enc, &res);
+  for (ch = 0; ch <= 2; ch += 2) {
+    for (y = 0; y < 2; ++y) {
+      for (x = 0; x < 2; ++x) {
+        const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
+        VP8SetResidualCoeffs(rd->uv_levels[ch * 2 + x + y * 2], &res);
+        it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] =
+            PutCoeffs(bw, ctx, &res);
+      }
+    }
+  }
+  pos3 = VP8BitWriterPos(bw);
+  it->luma_bits_ = pos2 - pos1;
+  it->uv_bits_ = pos3 - pos2;
+  it->bit_count_[segment][i16] += it->luma_bits_;
+  it->bit_count_[segment][2] += it->uv_bits_;
+  VP8IteratorBytesToNz(it);
+}
+
+// Same as CodeResiduals, but doesn't actually write anything.
+// Instead, it just records the event distribution.
+static void RecordResiduals(VP8EncIterator* const it,
+                            const VP8ModeScore* const rd) {
+  int x, y, ch;
+  VP8Residual res;
+  VP8Encoder* const enc = it->enc_;
+
+  VP8IteratorNzToBytes(it);
+
+  if (it->mb_->type_ == 1) {   // i16x16
+    VP8InitResidual(0, 1, enc, &res);
+    VP8SetResidualCoeffs(rd->y_dc_levels, &res);
+    it->top_nz_[8] = it->left_nz_[8] =
+      VP8RecordCoeffs(it->top_nz_[8] + it->left_nz_[8], &res);
+    VP8InitResidual(1, 0, enc, &res);
+  } else {
+    VP8InitResidual(0, 3, enc, &res);
+  }
+
+  // luma-AC
+  for (y = 0; y < 4; ++y) {
+    for (x = 0; x < 4; ++x) {
+      const int ctx = it->top_nz_[x] + it->left_nz_[y];
+      VP8SetResidualCoeffs(rd->y_ac_levels[x + y * 4], &res);
+      it->top_nz_[x] = it->left_nz_[y] = VP8RecordCoeffs(ctx, &res);
+    }
+  }
+
+  // U/V
+  VP8InitResidual(0, 2, enc, &res);
+  for (ch = 0; ch <= 2; ch += 2) {
+    for (y = 0; y < 2; ++y) {
+      for (x = 0; x < 2; ++x) {
+        const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
+        VP8SetResidualCoeffs(rd->uv_levels[ch * 2 + x + y * 2], &res);
+        it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] =
+            VP8RecordCoeffs(ctx, &res);
+      }
+    }
+  }
+
+  VP8IteratorBytesToNz(it);
+}
+
+//------------------------------------------------------------------------------
+// Token buffer
+
+#if !defined(DISABLE_TOKEN_BUFFER)
+
+static int RecordTokens(VP8EncIterator* const it, const VP8ModeScore* const rd,
+                        VP8TBuffer* const tokens) {
+  int x, y, ch;
+  VP8Residual res;
+  VP8Encoder* const enc = it->enc_;
+
+  VP8IteratorNzToBytes(it);
+  if (it->mb_->type_ == 1) {   // i16x16
+    const int ctx = it->top_nz_[8] + it->left_nz_[8];
+    VP8InitResidual(0, 1, enc, &res);
+    VP8SetResidualCoeffs(rd->y_dc_levels, &res);
+    it->top_nz_[8] = it->left_nz_[8] =
+        VP8RecordCoeffTokens(ctx, &res, tokens);
+    VP8InitResidual(1, 0, enc, &res);
+  } else {
+    VP8InitResidual(0, 3, enc, &res);
+  }
+
+  // luma-AC
+  for (y = 0; y < 4; ++y) {
+    for (x = 0; x < 4; ++x) {
+      const int ctx = it->top_nz_[x] + it->left_nz_[y];
+      VP8SetResidualCoeffs(rd->y_ac_levels[x + y * 4], &res);
+      it->top_nz_[x] = it->left_nz_[y] =
+          VP8RecordCoeffTokens(ctx, &res, tokens);
+    }
+  }
+
+  // U/V
+  VP8InitResidual(0, 2, enc, &res);
+  for (ch = 0; ch <= 2; ch += 2) {
+    for (y = 0; y < 2; ++y) {
+      for (x = 0; x < 2; ++x) {
+        const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
+        VP8SetResidualCoeffs(rd->uv_levels[ch * 2 + x + y * 2], &res);
+        it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] =
+            VP8RecordCoeffTokens(ctx, &res, tokens);
+      }
+    }
+  }
+  VP8IteratorBytesToNz(it);
+  return !tokens->error_;
+}
+
+#endif    // !DISABLE_TOKEN_BUFFER
+
+//------------------------------------------------------------------------------
+// ExtraInfo map / Debug function
+
+#if !defined(WEBP_DISABLE_STATS)
+
+#if SEGMENT_VISU
+static void SetBlock(uint8_t* p, int value, int size) {
+  int y;
+  for (y = 0; y < size; ++y) {
+    memset(p, value, size);
+    p += BPS;
+  }
+}
+#endif
+
+static void ResetSSE(VP8Encoder* const enc) {
+  enc->sse_[0] = 0;
+  enc->sse_[1] = 0;
+  enc->sse_[2] = 0;
+  // Note: enc->sse_[3] is managed by alpha.c
+  enc->sse_count_ = 0;
+}
+
+static void StoreSSE(const VP8EncIterator* const it) {
+  VP8Encoder* const enc = it->enc_;
+  const uint8_t* const in = it->yuv_in_;
+  const uint8_t* const out = it->yuv_out_;
+  // Note: not totally accurate at boundary. And doesn't include in-loop filter.
+  enc->sse_[0] += VP8SSE16x16(in + Y_OFF_ENC, out + Y_OFF_ENC);
+  enc->sse_[1] += VP8SSE8x8(in + U_OFF_ENC, out + U_OFF_ENC);
+  enc->sse_[2] += VP8SSE8x8(in + V_OFF_ENC, out + V_OFF_ENC);
+  enc->sse_count_ += 16 * 16;
+}
+
+static void StoreSideInfo(const VP8EncIterator* const it) {
+  VP8Encoder* const enc = it->enc_;
+  const VP8MBInfo* const mb = it->mb_;
+  WebPPicture* const pic = enc->pic_;
+
+  if (pic->stats != NULL) {
+    StoreSSE(it);
+    enc->block_count_[0] += (mb->type_ == 0);
+    enc->block_count_[1] += (mb->type_ == 1);
+    enc->block_count_[2] += (mb->skip_ != 0);
+  }
+
+  if (pic->extra_info != NULL) {
+    uint8_t* const info = &pic->extra_info[it->x_ + it->y_ * enc->mb_w_];
+    switch (pic->extra_info_type) {
+      case 1: *info = mb->type_; break;
+      case 2: *info = mb->segment_; break;
+      case 3: *info = enc->dqm_[mb->segment_].quant_; break;
+      case 4: *info = (mb->type_ == 1) ? it->preds_[0] : 0xff; break;
+      case 5: *info = mb->uv_mode_; break;
+      case 6: {
+        const int b = (int)((it->luma_bits_ + it->uv_bits_ + 7) >> 3);
+        *info = (b > 255) ? 255 : b; break;
+      }
+      case 7: *info = mb->alpha_; break;
+      default: *info = 0; break;
+    }
+  }
+#if SEGMENT_VISU  // visualize segments and prediction modes
+  SetBlock(it->yuv_out_ + Y_OFF_ENC, mb->segment_ * 64, 16);
+  SetBlock(it->yuv_out_ + U_OFF_ENC, it->preds_[0] * 64, 8);
+  SetBlock(it->yuv_out_ + V_OFF_ENC, mb->uv_mode_ * 64, 8);
+#endif
+}
+
+static void ResetSideInfo(const VP8EncIterator* const it) {
+  VP8Encoder* const enc = it->enc_;
+  WebPPicture* const pic = enc->pic_;
+  if (pic->stats != NULL) {
+    memset(enc->block_count_, 0, sizeof(enc->block_count_));
+  }
+  ResetSSE(enc);
+}
+#else  // defined(WEBP_DISABLE_STATS)
+static void ResetSSE(VP8Encoder* const enc) {
+  (void)enc;
+}
+static void StoreSideInfo(const VP8EncIterator* const it) {
+  VP8Encoder* const enc = it->enc_;
+  WebPPicture* const pic = enc->pic_;
+  if (pic->extra_info != NULL) {
+    if (it->x_ == 0 && it->y_ == 0) {   // only do it once, at start
+      memset(pic->extra_info, 0,
+             enc->mb_w_ * enc->mb_h_ * sizeof(*pic->extra_info));
+    }
+  }
+}
+
+static void ResetSideInfo(const VP8EncIterator* const it) {
+  (void)it;
+}
+#endif  // !defined(WEBP_DISABLE_STATS)
+
+static double GetPSNR(uint64_t mse, uint64_t size) {
+  return (mse > 0 && size > 0) ? 10. * log10(255. * 255. * size / mse) : 99;
+}
+
+//------------------------------------------------------------------------------
+//  StatLoop(): only collect statistics (number of skips, token usage, ...).
+//  This is used for deciding optimal probabilities. It also modifies the
+//  quantizer value if some target (size, PSNR) was specified.
+
+static void SetLoopParams(VP8Encoder* const enc, float q) {
+  // Make sure the quality parameter is inside valid bounds
+  q = Clamp(q, 0.f, 100.f);
+
+  VP8SetSegmentParams(enc, q);      // setup segment quantizations and filters
+  SetSegmentProbas(enc);            // compute segment probabilities
+
+  ResetStats(enc);
+  ResetSSE(enc);
+}
+
+static uint64_t OneStatPass(VP8Encoder* const enc, VP8RDLevel rd_opt,
+                            int nb_mbs, int percent_delta,
+                            PassStats* const s) {
+  VP8EncIterator it;
+  uint64_t size = 0;
+  uint64_t size_p0 = 0;
+  uint64_t distortion = 0;
+  const uint64_t pixel_count = nb_mbs * 384;
+
+  VP8IteratorInit(enc, &it);
+  SetLoopParams(enc, s->q);
+  do {
+    VP8ModeScore info;
+    VP8IteratorImport(&it, NULL);
+    if (VP8Decimate(&it, &info, rd_opt)) {
+      // Just record the number of skips and act like skip_proba is not used.
+      ++enc->proba_.nb_skip_;
+    }
+    RecordResiduals(&it, &info);
+    size += info.R + info.H;
+    size_p0 += info.H;
+    distortion += info.D;
+    if (percent_delta && !VP8IteratorProgress(&it, percent_delta)) {
+      return 0;
+    }
+    VP8IteratorSaveBoundary(&it);
+  } while (VP8IteratorNext(&it) && --nb_mbs > 0);
+
+  size_p0 += enc->segment_hdr_.size_;
+  if (s->do_size_search) {
+    size += FinalizeSkipProba(enc);
+    size += FinalizeTokenProbas(&enc->proba_);
+    size = ((size + size_p0 + 1024) >> 11) + HEADER_SIZE_ESTIMATE;
+    s->value = (double)size;
+  } else {
+    s->value = GetPSNR(distortion, pixel_count);
+  }
+  return size_p0;
+}
+
+static int StatLoop(VP8Encoder* const enc) {
+  const int method = enc->method_;
+  const int do_search = enc->do_search_;
+  const int fast_probe = ((method == 0 || method == 3) && !do_search);
+  int num_pass_left = enc->config_->pass;
+  const int task_percent = 20;
+  const int percent_per_pass =
+      (task_percent + num_pass_left / 2) / num_pass_left;
+  const int final_percent = enc->percent_ + task_percent;
+  const VP8RDLevel rd_opt =
+      (method >= 3 || do_search) ? RD_OPT_BASIC : RD_OPT_NONE;
+  int nb_mbs = enc->mb_w_ * enc->mb_h_;
+  PassStats stats;
+
+  InitPassStats(enc, &stats);
+  ResetTokenStats(enc);
+
+  // Fast mode: quick analysis pass over few mbs. Better than nothing.
+  if (fast_probe) {
+    if (method == 3) {  // we need more stats for method 3 to be reliable.
+      nb_mbs = (nb_mbs > 200) ? nb_mbs >> 1 : 100;
+    } else {
+      nb_mbs = (nb_mbs > 200) ? nb_mbs >> 2 : 50;
+    }
+  }
+
+  while (num_pass_left-- > 0) {
+    const int is_last_pass = (fabs(stats.dq) <= DQ_LIMIT) ||
+                             (num_pass_left == 0) ||
+                             (enc->max_i4_header_bits_ == 0);
+    const uint64_t size_p0 =
+        OneStatPass(enc, rd_opt, nb_mbs, percent_per_pass, &stats);
+    if (size_p0 == 0) return 0;
+#if (DEBUG_SEARCH > 0)
+    printf("#%d value:%.1lf -> %.1lf   q:%.2f -> %.2f\n",
+           num_pass_left, stats.last_value, stats.value, stats.last_q, stats.q);
+#endif
+    if (enc->max_i4_header_bits_ > 0 && size_p0 > PARTITION0_SIZE_LIMIT) {
+      ++num_pass_left;
+      enc->max_i4_header_bits_ >>= 1;  // strengthen header bit limitation...
+      continue;                        // ...and start over
+    }
+    if (is_last_pass) {
+      break;
+    }
+    // If no target size: just do several pass without changing 'q'
+    if (do_search) {
+      ComputeNextQ(&stats);
+      if (fabs(stats.dq) <= DQ_LIMIT) break;
+    }
+  }
+  if (!do_search || !stats.do_size_search) {
+    // Need to finalize probas now, since it wasn't done during the search.
+    FinalizeSkipProba(enc);
+    FinalizeTokenProbas(&enc->proba_);
+  }
+  VP8CalculateLevelCosts(&enc->proba_);  // finalize costs
+  return WebPReportProgress(enc->pic_, final_percent, &enc->percent_);
+}
+
+//------------------------------------------------------------------------------
+// Main loops
+//
+
+static const uint8_t kAverageBytesPerMB[8] = { 50, 24, 16, 9, 7, 5, 3, 2 };
+
+static int PreLoopInitialize(VP8Encoder* const enc) {
+  int p;
+  int ok = 1;
+  const int average_bytes_per_MB = kAverageBytesPerMB[enc->base_quant_ >> 4];
+  const int bytes_per_parts =
+      enc->mb_w_ * enc->mb_h_ * average_bytes_per_MB / enc->num_parts_;
+  // Initialize the bit-writers
+  for (p = 0; ok && p < enc->num_parts_; ++p) {
+    ok = VP8BitWriterInit(enc->parts_ + p, bytes_per_parts);
+  }
+  if (!ok) {
+    VP8EncFreeBitWriters(enc);  // malloc error occurred
+    WebPEncodingSetError(enc->pic_, VP8_ENC_ERROR_OUT_OF_MEMORY);
+  }
+  return ok;
+}
+
+static int PostLoopFinalize(VP8EncIterator* const it, int ok) {
+  VP8Encoder* const enc = it->enc_;
+  if (ok) {      // Finalize the partitions, check for extra errors.
+    int p;
+    for (p = 0; p < enc->num_parts_; ++p) {
+      VP8BitWriterFinish(enc->parts_ + p);
+      ok &= !enc->parts_[p].error_;
+    }
+  }
+
+  if (ok) {      // All good. Finish up.
+#if !defined(WEBP_DISABLE_STATS)
+    if (enc->pic_->stats != NULL) {  // finalize byte counters...
+      int i, s;
+      for (i = 0; i <= 2; ++i) {
+        for (s = 0; s < NUM_MB_SEGMENTS; ++s) {
+          enc->residual_bytes_[i][s] = (int)((it->bit_count_[s][i] + 7) >> 3);
+        }
+      }
+    }
+#endif
+    VP8AdjustFilterStrength(it);     // ...and store filter stats.
+  } else {
+    // Something bad happened -> need to do some memory cleanup.
+    VP8EncFreeBitWriters(enc);
+  }
+  return ok;
+}
+
+//------------------------------------------------------------------------------
+//  VP8EncLoop(): does the final bitstream coding.
+
+static void ResetAfterSkip(VP8EncIterator* const it) {
+  if (it->mb_->type_ == 1) {
+    *it->nz_ = 0;  // reset all predictors
+    it->left_nz_[8] = 0;
+  } else {
+    *it->nz_ &= (1 << 24);  // preserve the dc_nz bit
+  }
+}
+
+int VP8EncLoop(VP8Encoder* const enc) {
+  VP8EncIterator it;
+  int ok = PreLoopInitialize(enc);
+  if (!ok) return 0;
+
+  StatLoop(enc);  // stats-collection loop
+
+  VP8IteratorInit(enc, &it);
+  VP8InitFilter(&it);
+  do {
+    VP8ModeScore info;
+    const int dont_use_skip = !enc->proba_.use_skip_proba_;
+    const VP8RDLevel rd_opt = enc->rd_opt_level_;
+
+    VP8IteratorImport(&it, NULL);
+    // Warning! order is important: first call VP8Decimate() and
+    // *then* decide how to code the skip decision if there's one.
+    if (!VP8Decimate(&it, &info, rd_opt) || dont_use_skip) {
+      CodeResiduals(it.bw_, &it, &info);
+    } else {   // reset predictors after a skip
+      ResetAfterSkip(&it);
+    }
+    StoreSideInfo(&it);
+    VP8StoreFilterStats(&it);
+    VP8IteratorExport(&it);
+    ok = VP8IteratorProgress(&it, 20);
+    VP8IteratorSaveBoundary(&it);
+  } while (ok && VP8IteratorNext(&it));
+
+  return PostLoopFinalize(&it, ok);
+}
+
+//------------------------------------------------------------------------------
+// Single pass using Token Buffer.
+
+#if !defined(DISABLE_TOKEN_BUFFER)
+
+#define MIN_COUNT 96  // minimum number of macroblocks before updating stats
+
+int VP8EncTokenLoop(VP8Encoder* const enc) {
+  // Roughly refresh the proba eight times per pass
+  int max_count = (enc->mb_w_ * enc->mb_h_) >> 3;
+  int num_pass_left = enc->config_->pass;
+  int remaining_progress = 40;  // percents
+  const int do_search = enc->do_search_;
+  VP8EncIterator it;
+  VP8EncProba* const proba = &enc->proba_;
+  const VP8RDLevel rd_opt = enc->rd_opt_level_;
+  const uint64_t pixel_count = enc->mb_w_ * enc->mb_h_ * 384;
+  PassStats stats;
+  int ok;
+
+  InitPassStats(enc, &stats);
+  ok = PreLoopInitialize(enc);
+  if (!ok) return 0;
+
+  if (max_count < MIN_COUNT) max_count = MIN_COUNT;
+
+  assert(enc->num_parts_ == 1);
+  assert(enc->use_tokens_);
+  assert(proba->use_skip_proba_ == 0);
+  assert(rd_opt >= RD_OPT_BASIC);   // otherwise, token-buffer won't be useful
+  assert(num_pass_left > 0);
+
+  while (ok && num_pass_left-- > 0) {
+    const int is_last_pass = (fabs(stats.dq) <= DQ_LIMIT) ||
+                             (num_pass_left == 0) ||
+                             (enc->max_i4_header_bits_ == 0);
+    uint64_t size_p0 = 0;
+    uint64_t distortion = 0;
+    int cnt = max_count;
+    // The final number of passes is not trivial to know in advance.
+    const int pass_progress = remaining_progress / (2 + num_pass_left);
+    remaining_progress -= pass_progress;
+    VP8IteratorInit(enc, &it);
+    SetLoopParams(enc, stats.q);
+    if (is_last_pass) {
+      ResetTokenStats(enc);
+      VP8InitFilter(&it);  // don't collect stats until last pass (too costly)
+    }
+    VP8TBufferClear(&enc->tokens_);
+    do {
+      VP8ModeScore info;
+      VP8IteratorImport(&it, NULL);
+      if (--cnt < 0) {
+        FinalizeTokenProbas(proba);
+        VP8CalculateLevelCosts(proba);  // refresh cost tables for rd-opt
+        cnt = max_count;
+      }
+      VP8Decimate(&it, &info, rd_opt);
+      ok = RecordTokens(&it, &info, &enc->tokens_);
+      if (!ok) {
+        WebPEncodingSetError(enc->pic_, VP8_ENC_ERROR_OUT_OF_MEMORY);
+        break;
+      }
+      size_p0 += info.H;
+      distortion += info.D;
+      if (is_last_pass) {
+        StoreSideInfo(&it);
+        VP8StoreFilterStats(&it);
+        VP8IteratorExport(&it);
+        ok = VP8IteratorProgress(&it, pass_progress);
+      }
+      VP8IteratorSaveBoundary(&it);
+    } while (ok && VP8IteratorNext(&it));
+    if (!ok) break;
+
+    size_p0 += enc->segment_hdr_.size_;
+    if (stats.do_size_search) {
+      uint64_t size = FinalizeTokenProbas(&enc->proba_);
+      size += VP8EstimateTokenSize(&enc->tokens_,
+                                   (const uint8_t*)proba->coeffs_);
+      size = (size + size_p0 + 1024) >> 11;  // -> size in bytes
+      size += HEADER_SIZE_ESTIMATE;
+      stats.value = (double)size;
+    } else {  // compute and store PSNR
+      stats.value = GetPSNR(distortion, pixel_count);
+    }
+
+#if (DEBUG_SEARCH > 0)
+    printf("#%2d metric:%.1lf -> %.1lf   last_q=%.2lf q=%.2lf dq=%.2lf "
+           " range:[%.1f, %.1f]\n",
+           num_pass_left, stats.last_value, stats.value,
+           stats.last_q, stats.q, stats.dq, stats.qmin, stats.qmax);
+#endif
+    if (enc->max_i4_header_bits_ > 0 && size_p0 > PARTITION0_SIZE_LIMIT) {
+      ++num_pass_left;
+      enc->max_i4_header_bits_ >>= 1;  // strengthen header bit limitation...
+      if (is_last_pass) {
+        ResetSideInfo(&it);
+      }
+      continue;                        // ...and start over
+    }
+    if (is_last_pass) {
+      break;   // done
+    }
+    if (do_search) {
+      ComputeNextQ(&stats);  // Adjust q
+    }
+  }
+  if (ok) {
+    if (!stats.do_size_search) {
+      FinalizeTokenProbas(&enc->proba_);
+    }
+    ok = VP8EmitTokens(&enc->tokens_, enc->parts_ + 0,
+                       (const uint8_t*)proba->coeffs_, 1);
+  }
+  ok = ok && WebPReportProgress(enc->pic_, enc->percent_ + remaining_progress,
+                                &enc->percent_);
+  return PostLoopFinalize(&it, ok);
+}
+
+#else
+
+int VP8EncTokenLoop(VP8Encoder* const enc) {
+  (void)enc;
+  return 0;   // we shouldn't be here.
+}
+
+#endif    // DISABLE_TOKEN_BUFFER
+
+//------------------------------------------------------------------------------
diff --git a/media/libwebp/enc/histogram_enc.c b/media/libwebp/enc/histogram_enc.c
new file mode 100644
index 0000000000..83f218bcb4
--- /dev/null
+++ b/media/libwebp/enc/histogram_enc.c
@@ -0,0 +1,1252 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Author: Jyrki Alakuijala (jyrki@google.com)
+//
+#ifdef HAVE_CONFIG_H
+#include "../webp/config.h"
+#endif
+
+#include <math.h>
+
+#include "../enc/backward_references_enc.h"
+#include "../enc/histogram_enc.h"
+#include "../dsp/lossless.h"
+#include "../dsp/lossless_common.h"
+#include "../utils/utils.h"
+
+#define MAX_COST 1.e38
+
+// Number of partitions for the three dominant (literal, red and blue) symbol
+// costs.
+#define NUM_PARTITIONS 4
+// The size of the bin-hash corresponding to the three dominant costs.
+#define BIN_SIZE (NUM_PARTITIONS * NUM_PARTITIONS * NUM_PARTITIONS)
+// Maximum number of histograms allowed in greedy combining algorithm.
+#define MAX_HISTO_GREEDY 100
+
+static void HistogramClear(VP8LHistogram* const p) {
+  uint32_t* const literal = p->literal_;
+  const int cache_bits = p->palette_code_bits_;
+  const int histo_size = VP8LGetHistogramSize(cache_bits);
+  memset(p, 0, histo_size);
+  p->palette_code_bits_ = cache_bits;
+  p->literal_ = literal;
+}
+
+// Swap two histogram pointers.
+static void HistogramSwap(VP8LHistogram** const A, VP8LHistogram** const B) {
+  VP8LHistogram* const tmp = *A;
+  *A = *B;
+  *B = tmp;
+}
+
+static void HistogramCopy(const VP8LHistogram* const src,
+                          VP8LHistogram* const dst) {
+  uint32_t* const dst_literal = dst->literal_;
+  const int dst_cache_bits = dst->palette_code_bits_;
+  const int literal_size = VP8LHistogramNumCodes(dst_cache_bits);
+  const int histo_size = VP8LGetHistogramSize(dst_cache_bits);
+  assert(src->palette_code_bits_ == dst_cache_bits);
+  memcpy(dst, src, histo_size);
+  dst->literal_ = dst_literal;
+  memcpy(dst->literal_, src->literal_, literal_size * sizeof(*dst->literal_));
+}
+
+int VP8LGetHistogramSize(int cache_bits) {
+  const int literal_size = VP8LHistogramNumCodes(cache_bits);
+  const size_t total_size = sizeof(VP8LHistogram) + sizeof(int) * literal_size;
+  assert(total_size <= (size_t)0x7fffffff);
+  return (int)total_size;
+}
+
+void VP8LFreeHistogram(VP8LHistogram* const histo) {
+  WebPSafeFree(histo);
+}
+
+void VP8LFreeHistogramSet(VP8LHistogramSet* const histo) {
+  WebPSafeFree(histo);
+}
+
+void VP8LHistogramStoreRefs(const VP8LBackwardRefs* const refs,
+                            VP8LHistogram* const histo) {
+  VP8LRefsCursor c = VP8LRefsCursorInit(refs);
+  while (VP8LRefsCursorOk(&c)) {
+    VP8LHistogramAddSinglePixOrCopy(histo, c.cur_pos, NULL, 0);
+    VP8LRefsCursorNext(&c);
+  }
+}
+
+void VP8LHistogramCreate(VP8LHistogram* const p,
+                         const VP8LBackwardRefs* const refs,
+                         int palette_code_bits) {
+  if (palette_code_bits >= 0) {
+    p->palette_code_bits_ = palette_code_bits;
+  }
+  HistogramClear(p);
+  VP8LHistogramStoreRefs(refs, p);
+}
+
+void VP8LHistogramInit(VP8LHistogram* const p, int palette_code_bits,
+                       int init_arrays) {
+  p->palette_code_bits_ = palette_code_bits;
+  if (init_arrays) {
+    HistogramClear(p);
+  } else {
+    p->trivial_symbol_ = 0;
+    p->bit_cost_ = 0.;
+    p->literal_cost_ = 0.;
+    p->red_cost_ = 0.;
+    p->blue_cost_ = 0.;
+    memset(p->is_used_, 0, sizeof(p->is_used_));
+  }
+}
+
+VP8LHistogram* VP8LAllocateHistogram(int cache_bits) {
+  VP8LHistogram* histo = NULL;
+  const int total_size = VP8LGetHistogramSize(cache_bits);
+  uint8_t* const memory = (uint8_t*)WebPSafeMalloc(total_size, sizeof(*memory));
+  if (memory == NULL) return NULL;
+  histo = (VP8LHistogram*)memory;
+  // literal_ won't necessary be aligned.
+  histo->literal_ = (uint32_t*)(memory + sizeof(VP8LHistogram));
+  VP8LHistogramInit(histo, cache_bits, /*init_arrays=*/ 0);
+  return histo;
+}
+
+// Resets the pointers of the histograms to point to the bit buffer in the set.
+static void HistogramSetResetPointers(VP8LHistogramSet* const set,
+                                      int cache_bits) {
+  int i;
+  const int histo_size = VP8LGetHistogramSize(cache_bits);
+  uint8_t* memory = (uint8_t*) (set->histograms);
+  memory += set->max_size * sizeof(*set->histograms);
+  for (i = 0; i < set->max_size; ++i) {
+    memory = (uint8_t*) WEBP_ALIGN(memory);
+    set->histograms[i] = (VP8LHistogram*) memory;
+    // literal_ won't necessary be aligned.
+    set->histograms[i]->literal_ = (uint32_t*)(memory + sizeof(VP8LHistogram));
+    memory += histo_size;
+  }
+}
+
+// Returns the total size of the VP8LHistogramSet.
+static size_t HistogramSetTotalSize(int size, int cache_bits) {
+  const int histo_size = VP8LGetHistogramSize(cache_bits);
+  return (sizeof(VP8LHistogramSet) + size * (sizeof(VP8LHistogram*) +
+          histo_size + WEBP_ALIGN_CST));
+}
+
+VP8LHistogramSet* VP8LAllocateHistogramSet(int size, int cache_bits) {
+  int i;
+  VP8LHistogramSet* set;
+  const size_t total_size = HistogramSetTotalSize(size, cache_bits);
+  uint8_t* memory = (uint8_t*)WebPSafeMalloc(total_size, sizeof(*memory));
+  if (memory == NULL) return NULL;
+
+  set = (VP8LHistogramSet*)memory;
+  memory += sizeof(*set);
+  set->histograms = (VP8LHistogram**)memory;
+  set->max_size = size;
+  set->size = size;
+  HistogramSetResetPointers(set, cache_bits);
+  for (i = 0; i < size; ++i) {
+    VP8LHistogramInit(set->histograms[i], cache_bits, /*init_arrays=*/ 0);
+  }
+  return set;
+}
+
+void VP8LHistogramSetClear(VP8LHistogramSet* const set) {
+  int i;
+  const int cache_bits = set->histograms[0]->palette_code_bits_;
+  const int size = set->max_size;
+  const size_t total_size = HistogramSetTotalSize(size, cache_bits);
+  uint8_t* memory = (uint8_t*)set;
+
+  memset(memory, 0, total_size);
+  memory += sizeof(*set);
+  set->histograms = (VP8LHistogram**)memory;
+  set->max_size = size;
+  set->size = size;
+  HistogramSetResetPointers(set, cache_bits);
+  for (i = 0; i < size; ++i) {
+    set->histograms[i]->palette_code_bits_ = cache_bits;
+  }
+}
+
+// Removes the histogram 'i' from 'set' by setting it to NULL.
+static void HistogramSetRemoveHistogram(VP8LHistogramSet* const set, int i,
+                                        int* const num_used) {
+  assert(set->histograms[i] != NULL);
+  set->histograms[i] = NULL;
+  --*num_used;
+  // If we remove the last valid one, shrink until the next valid one.
+  if (i == set->size - 1) {
+    while (set->size >= 1 && set->histograms[set->size - 1] == NULL) {
+      --set->size;
+    }
+  }
+}
+
+// -----------------------------------------------------------------------------
+
+void VP8LHistogramAddSinglePixOrCopy(VP8LHistogram* const histo,
+                                     const PixOrCopy* const v,
+                                     int (*const distance_modifier)(int, int),
+                                     int distance_modifier_arg0) {
+  if (PixOrCopyIsLiteral(v)) {
+    ++histo->alpha_[PixOrCopyLiteral(v, 3)];
+    ++histo->red_[PixOrCopyLiteral(v, 2)];
+    ++histo->literal_[PixOrCopyLiteral(v, 1)];
+    ++histo->blue_[PixOrCopyLiteral(v, 0)];
+  } else if (PixOrCopyIsCacheIdx(v)) {
+    const int literal_ix =
+        NUM_LITERAL_CODES + NUM_LENGTH_CODES + PixOrCopyCacheIdx(v);
+    assert(histo->palette_code_bits_ != 0);
+    ++histo->literal_[literal_ix];
+  } else {
+    int code, extra_bits;
+    VP8LPrefixEncodeBits(PixOrCopyLength(v), &code, &extra_bits);
+    ++histo->literal_[NUM_LITERAL_CODES + code];
+    if (distance_modifier == NULL) {
+      VP8LPrefixEncodeBits(PixOrCopyDistance(v), &code, &extra_bits);
+    } else {
+      VP8LPrefixEncodeBits(
+          distance_modifier(distance_modifier_arg0, PixOrCopyDistance(v)),
+          &code, &extra_bits);
+    }
+    ++histo->distance_[code];
+  }
+}
+
+// -----------------------------------------------------------------------------
+// Entropy-related functions.
+
+static WEBP_INLINE double BitsEntropyRefine(const VP8LBitEntropy* entropy) {
+  double mix;
+  if (entropy->nonzeros < 5) {
+    if (entropy->nonzeros <= 1) {
+      return 0;
+    }
+    // Two symbols, they will be 0 and 1 in a Huffman code.
+    // Let's mix in a bit of entropy to favor good clustering when
+    // distributions of these are combined.
+    if (entropy->nonzeros == 2) {
+      return 0.99 * entropy->sum + 0.01 * entropy->entropy;
+    }
+    // No matter what the entropy says, we cannot be better than min_limit
+    // with Huffman coding. I am mixing a bit of entropy into the
+    // min_limit since it produces much better (~0.5 %) compression results
+    // perhaps because of better entropy clustering.
+    if (entropy->nonzeros == 3) {
+      mix = 0.95;
+    } else {
+      mix = 0.7;  // nonzeros == 4.
+    }
+  } else {
+    mix = 0.627;
+  }
+
+  {
+    double min_limit = 2 * entropy->sum - entropy->max_val;
+    min_limit = mix * min_limit + (1.0 - mix) * entropy->entropy;
+    return (entropy->entropy < min_limit) ? min_limit : entropy->entropy;
+  }
+}
+
+double VP8LBitsEntropy(const uint32_t* const array, int n) {
+  VP8LBitEntropy entropy;
+  VP8LBitsEntropyUnrefined(array, n, &entropy);
+
+  return BitsEntropyRefine(&entropy);
+}
+
+static double InitialHuffmanCost(void) {
+  // Small bias because Huffman code length is typically not stored in
+  // full length.
+  static const int kHuffmanCodeOfHuffmanCodeSize = CODE_LENGTH_CODES * 3;
+  static const double kSmallBias = 9.1;
+  return kHuffmanCodeOfHuffmanCodeSize - kSmallBias;
+}
+
+// Finalize the Huffman cost based on streak numbers and length type (<3 or >=3)
+static double FinalHuffmanCost(const VP8LStreaks* const stats) {
+  // The constants in this function are experimental and got rounded from
+  // their original values in 1/8 when switched to 1/1024.
+  double retval = InitialHuffmanCost();
+  // Second coefficient: Many zeros in the histogram are covered efficiently
+  // by a run-length encode. Originally 2/8.
+  retval += stats->counts[0] * 1.5625 + 0.234375 * stats->streaks[0][1];
+  // Second coefficient: Constant values are encoded less efficiently, but still
+  // RLE'ed. Originally 6/8.
+  retval += stats->counts[1] * 2.578125 + 0.703125 * stats->streaks[1][1];
+  // 0s are usually encoded more efficiently than non-0s.
+  // Originally 15/8.
+  retval += 1.796875 * stats->streaks[0][0];
+  // Originally 26/8.
+  retval += 3.28125 * stats->streaks[1][0];
+  return retval;
+}
+
+// Get the symbol entropy for the distribution 'population'.
+// Set 'trivial_sym', if there's only one symbol present in the distribution.
+static double PopulationCost(const uint32_t* const population, int length,
+                             uint32_t* const trivial_sym,
+                             uint8_t* const is_used) {
+  VP8LBitEntropy bit_entropy;
+  VP8LStreaks stats;
+  VP8LGetEntropyUnrefined(population, length, &bit_entropy, &stats);
+  if (trivial_sym != NULL) {
+    *trivial_sym = (bit_entropy.nonzeros == 1) ? bit_entropy.nonzero_code
+                                               : VP8L_NON_TRIVIAL_SYM;
+  }
+  // The histogram is used if there is at least one non-zero streak.
+  *is_used = (stats.streaks[1][0] != 0 || stats.streaks[1][1] != 0);
+
+  return BitsEntropyRefine(&bit_entropy) + FinalHuffmanCost(&stats);
+}
+
+// trivial_at_end is 1 if the two histograms only have one element that is
+// non-zero: both the zero-th one, or both the last one.
+static WEBP_INLINE double GetCombinedEntropy(const uint32_t* const X,
+                                             const uint32_t* const Y,
+                                             int length, int is_X_used,
+                                             int is_Y_used,
+                                             int trivial_at_end) {
+  VP8LStreaks stats;
+  if (trivial_at_end) {
+    // This configuration is due to palettization that transforms an indexed
+    // pixel into 0xff000000 | (pixel << 8) in VP8LBundleColorMap.
+    // BitsEntropyRefine is 0 for histograms with only one non-zero value.
+    // Only FinalHuffmanCost needs to be evaluated.
+    memset(&stats, 0, sizeof(stats));
+    // Deal with the non-zero value at index 0 or length-1.
+    stats.streaks[1][0] = 1;
+    // Deal with the following/previous zero streak.
+    stats.counts[0] = 1;
+    stats.streaks[0][1] = length - 1;
+    return FinalHuffmanCost(&stats);
+  } else {
+    VP8LBitEntropy bit_entropy;
+    if (is_X_used) {
+      if (is_Y_used) {
+        VP8LGetCombinedEntropyUnrefined(X, Y, length, &bit_entropy, &stats);
+      } else {
+        VP8LGetEntropyUnrefined(X, length, &bit_entropy, &stats);
+      }
+    } else {
+      if (is_Y_used) {
+        VP8LGetEntropyUnrefined(Y, length, &bit_entropy, &stats);
+      } else {
+        memset(&stats, 0, sizeof(stats));
+        stats.counts[0] = 1;
+        stats.streaks[0][length > 3] = length;
+        VP8LBitEntropyInit(&bit_entropy);
+      }
+    }
+
+    return BitsEntropyRefine(&bit_entropy) + FinalHuffmanCost(&stats);
+  }
+}
+
+// Estimates the Entropy + Huffman + other block overhead size cost.
+double VP8LHistogramEstimateBits(VP8LHistogram* const p) {
+  return
+      PopulationCost(p->literal_, VP8LHistogramNumCodes(p->palette_code_bits_),
+                     NULL, &p->is_used_[0])
+      + PopulationCost(p->red_, NUM_LITERAL_CODES, NULL, &p->is_used_[1])
+      + PopulationCost(p->blue_, NUM_LITERAL_CODES, NULL, &p->is_used_[2])
+      + PopulationCost(p->alpha_, NUM_LITERAL_CODES, NULL, &p->is_used_[3])
+      + PopulationCost(p->distance_, NUM_DISTANCE_CODES, NULL, &p->is_used_[4])
+      + VP8LExtraCost(p->literal_ + NUM_LITERAL_CODES, NUM_LENGTH_CODES)
+      + VP8LExtraCost(p->distance_, NUM_DISTANCE_CODES);
+}
+
+// -----------------------------------------------------------------------------
+// Various histogram combine/cost-eval functions
+
+static int GetCombinedHistogramEntropy(const VP8LHistogram* const a,
+                                       const VP8LHistogram* const b,
+                                       double cost_threshold,
+                                       double* cost) {
+  const int palette_code_bits = a->palette_code_bits_;
+  int trivial_at_end = 0;
+  assert(a->palette_code_bits_ == b->palette_code_bits_);
+  *cost += GetCombinedEntropy(a->literal_, b->literal_,
+                              VP8LHistogramNumCodes(palette_code_bits),
+                              a->is_used_[0], b->is_used_[0], 0);
+  *cost += VP8LExtraCostCombined(a->literal_ + NUM_LITERAL_CODES,
+                                 b->literal_ + NUM_LITERAL_CODES,
+                                 NUM_LENGTH_CODES);
+  if (*cost > cost_threshold) return 0;
+
+  if (a->trivial_symbol_ != VP8L_NON_TRIVIAL_SYM &&
+      a->trivial_symbol_ == b->trivial_symbol_) {
+    // A, R and B are all 0 or 0xff.
+    const uint32_t color_a = (a->trivial_symbol_ >> 24) & 0xff;
+    const uint32_t color_r = (a->trivial_symbol_ >> 16) & 0xff;
+    const uint32_t color_b = (a->trivial_symbol_ >> 0) & 0xff;
+    if ((color_a == 0 || color_a == 0xff) &&
+        (color_r == 0 || color_r == 0xff) &&
+        (color_b == 0 || color_b == 0xff)) {
+      trivial_at_end = 1;
+    }
+  }
+
+  *cost +=
+      GetCombinedEntropy(a->red_, b->red_, NUM_LITERAL_CODES, a->is_used_[1],
+                         b->is_used_[1], trivial_at_end);
+  if (*cost > cost_threshold) return 0;
+
+  *cost +=
+      GetCombinedEntropy(a->blue_, b->blue_, NUM_LITERAL_CODES, a->is_used_[2],
+                         b->is_used_[2], trivial_at_end);
+  if (*cost > cost_threshold) return 0;
+
+  *cost +=
+      GetCombinedEntropy(a->alpha_, b->alpha_, NUM_LITERAL_CODES,
+                         a->is_used_[3], b->is_used_[3], trivial_at_end);
+  if (*cost > cost_threshold) return 0;
+
+  *cost +=
+      GetCombinedEntropy(a->distance_, b->distance_, NUM_DISTANCE_CODES,
+                         a->is_used_[4], b->is_used_[4], 0);
+  *cost +=
+      VP8LExtraCostCombined(a->distance_, b->distance_, NUM_DISTANCE_CODES);
+  if (*cost > cost_threshold) return 0;
+
+  return 1;
+}
+
+static WEBP_INLINE void HistogramAdd(const VP8LHistogram* const a,
+                                     const VP8LHistogram* const b,
+                                     VP8LHistogram* const out) {
+  VP8LHistogramAdd(a, b, out);
+  out->trivial_symbol_ = (a->trivial_symbol_ == b->trivial_symbol_)
+                       ? a->trivial_symbol_
+                       : VP8L_NON_TRIVIAL_SYM;
+}
+
+// Performs out = a + b, computing the cost C(a+b) - C(a) - C(b) while comparing
+// to the threshold value 'cost_threshold'. The score returned is
+//  Score = C(a+b) - C(a) - C(b), where C(a) + C(b) is known and fixed.
+// Since the previous score passed is 'cost_threshold', we only need to compare
+// the partial cost against 'cost_threshold + C(a) + C(b)' to possibly bail-out
+// early.
+static double HistogramAddEval(const VP8LHistogram* const a,
+                               const VP8LHistogram* const b,
+                               VP8LHistogram* const out,
+                               double cost_threshold) {
+  double cost = 0;
+  const double sum_cost = a->bit_cost_ + b->bit_cost_;
+  cost_threshold += sum_cost;
+
+  if (GetCombinedHistogramEntropy(a, b, cost_threshold, &cost)) {
+    HistogramAdd(a, b, out);
+    out->bit_cost_ = cost;
+    out->palette_code_bits_ = a->palette_code_bits_;
+  }
+
+  return cost - sum_cost;
+}
+
+// Same as HistogramAddEval(), except that the resulting histogram
+// is not stored. Only the cost C(a+b) - C(a) is evaluated. We omit
+// the term C(b) which is constant over all the evaluations.
+static double HistogramAddThresh(const VP8LHistogram* const a,
+                                 const VP8LHistogram* const b,
+                                 double cost_threshold) {
+  double cost;
+  assert(a != NULL && b != NULL);
+  cost = -a->bit_cost_;
+  GetCombinedHistogramEntropy(a, b, cost_threshold, &cost);
+  return cost;
+}
+
+// -----------------------------------------------------------------------------
+
+// The structure to keep track of cost range for the three dominant entropy
+// symbols.
+// TODO(skal): Evaluate if float can be used here instead of double for
+// representing the entropy costs.
+typedef struct {
+  double literal_max_;
+  double literal_min_;
+  double red_max_;
+  double red_min_;
+  double blue_max_;
+  double blue_min_;
+} DominantCostRange;
+
+static void DominantCostRangeInit(DominantCostRange* const c) {
+  c->literal_max_ = 0.;
+  c->literal_min_ = MAX_COST;
+  c->red_max_ = 0.;
+  c->red_min_ = MAX_COST;
+  c->blue_max_ = 0.;
+  c->blue_min_ = MAX_COST;
+}
+
+static void UpdateDominantCostRange(
+    const VP8LHistogram* const h, DominantCostRange* const c) {
+  if (c->literal_max_ < h->literal_cost_) c->literal_max_ = h->literal_cost_;
+  if (c->literal_min_ > h->literal_cost_) c->literal_min_ = h->literal_cost_;
+  if (c->red_max_ < h->red_cost_) c->red_max_ = h->red_cost_;
+  if (c->red_min_ > h->red_cost_) c->red_min_ = h->red_cost_;
+  if (c->blue_max_ < h->blue_cost_) c->blue_max_ = h->blue_cost_;
+  if (c->blue_min_ > h->blue_cost_) c->blue_min_ = h->blue_cost_;
+}
+
+static void UpdateHistogramCost(VP8LHistogram* const h) {
+  uint32_t alpha_sym, red_sym, blue_sym;
+  const double alpha_cost =
+      PopulationCost(h->alpha_, NUM_LITERAL_CODES, &alpha_sym,
+                     &h->is_used_[3]);
+  const double distance_cost =
+      PopulationCost(h->distance_, NUM_DISTANCE_CODES, NULL, &h->is_used_[4]) +
+      VP8LExtraCost(h->distance_, NUM_DISTANCE_CODES);
+  const int num_codes = VP8LHistogramNumCodes(h->palette_code_bits_);
+  h->literal_cost_ =
+      PopulationCost(h->literal_, num_codes, NULL, &h->is_used_[0]) +
+          VP8LExtraCost(h->literal_ + NUM_LITERAL_CODES, NUM_LENGTH_CODES);
+  h->red_cost_ =
+      PopulationCost(h->red_, NUM_LITERAL_CODES, &red_sym, &h->is_used_[1]);
+  h->blue_cost_ =
+      PopulationCost(h->blue_, NUM_LITERAL_CODES, &blue_sym, &h->is_used_[2]);
+  h->bit_cost_ = h->literal_cost_ + h->red_cost_ + h->blue_cost_ +
+                 alpha_cost + distance_cost;
+  if ((alpha_sym | red_sym | blue_sym) == VP8L_NON_TRIVIAL_SYM) {
+    h->trivial_symbol_ = VP8L_NON_TRIVIAL_SYM;
+  } else {
+    h->trivial_symbol_ =
+        ((uint32_t)alpha_sym << 24) | (red_sym << 16) | (blue_sym << 0);
+  }
+}
+
+static int GetBinIdForEntropy(double min, double max, double val) {
+  const double range = max - min;
+  if (range > 0.) {
+    const double delta = val - min;
+    return (int)((NUM_PARTITIONS - 1e-6) * delta / range);
+  } else {
+    return 0;
+  }
+}
+
+static int GetHistoBinIndex(const VP8LHistogram* const h,
+                            const DominantCostRange* const c, int low_effort) {
+  int bin_id = GetBinIdForEntropy(c->literal_min_, c->literal_max_,
+                                  h->literal_cost_);
+  assert(bin_id < NUM_PARTITIONS);
+  if (!low_effort) {
+    bin_id = bin_id * NUM_PARTITIONS
+           + GetBinIdForEntropy(c->red_min_, c->red_max_, h->red_cost_);
+    bin_id = bin_id * NUM_PARTITIONS
+           + GetBinIdForEntropy(c->blue_min_, c->blue_max_, h->blue_cost_);
+    assert(bin_id < BIN_SIZE);
+  }
+  return bin_id;
+}
+
+// Construct the histograms from backward references.
+static void HistogramBuild(
+    int xsize, int histo_bits, const VP8LBackwardRefs* const backward_refs,
+    VP8LHistogramSet* const image_histo) {
+  int x = 0, y = 0;
+  const int histo_xsize = VP8LSubSampleSize(xsize, histo_bits);
+  VP8LHistogram** const histograms = image_histo->histograms;
+  VP8LRefsCursor c = VP8LRefsCursorInit(backward_refs);
+  assert(histo_bits > 0);
+  VP8LHistogramSetClear(image_histo);
+  while (VP8LRefsCursorOk(&c)) {
+    const PixOrCopy* const v = c.cur_pos;
+    const int ix = (y >> histo_bits) * histo_xsize + (x >> histo_bits);
+    VP8LHistogramAddSinglePixOrCopy(histograms[ix], v, NULL, 0);
+    x += PixOrCopyLength(v);
+    while (x >= xsize) {
+      x -= xsize;
+      ++y;
+    }
+    VP8LRefsCursorNext(&c);
+  }
+}
+
+// Copies the histograms and computes its bit_cost.
+static const uint16_t kInvalidHistogramSymbol = (uint16_t)(-1);
+static void HistogramCopyAndAnalyze(VP8LHistogramSet* const orig_histo,
+                                    VP8LHistogramSet* const image_histo,
+                                    int* const num_used,
+                                    uint16_t* const histogram_symbols) {
+  int i, cluster_id;
+  int num_used_orig = *num_used;
+  VP8LHistogram** const orig_histograms = orig_histo->histograms;
+  VP8LHistogram** const histograms = image_histo->histograms;
+  assert(image_histo->max_size == orig_histo->max_size);
+  for (cluster_id = 0, i = 0; i < orig_histo->max_size; ++i) {
+    VP8LHistogram* const histo = orig_histograms[i];
+    UpdateHistogramCost(histo);
+
+    // Skip the histogram if it is completely empty, which can happen for tiles
+    // with no information (when they are skipped because of LZ77).
+    if (!histo->is_used_[0] && !histo->is_used_[1] && !histo->is_used_[2]
+        && !histo->is_used_[3] && !histo->is_used_[4]) {
+      // The first histogram is always used. If an histogram is empty, we set
+      // its id to be the same as the previous one: this will improve
+      // compressibility for later LZ77.
+      assert(i > 0);
+      HistogramSetRemoveHistogram(image_histo, i, num_used);
+      HistogramSetRemoveHistogram(orig_histo, i, &num_used_orig);
+      histogram_symbols[i] = kInvalidHistogramSymbol;
+    } else {
+      // Copy histograms from orig_histo[] to image_histo[].
+      HistogramCopy(histo, histograms[i]);
+      histogram_symbols[i] = cluster_id++;
+      assert(cluster_id <= image_histo->max_size);
+    }
+  }
+}
+
+// Partition histograms to different entropy bins for three dominant (literal,
+// red and blue) symbol costs and compute the histogram aggregate bit_cost.
+static void HistogramAnalyzeEntropyBin(VP8LHistogramSet* const image_histo,
+                                       uint16_t* const bin_map,
+                                       int low_effort) {
+  int i;
+  VP8LHistogram** const histograms = image_histo->histograms;
+  const int histo_size = image_histo->size;
+  DominantCostRange cost_range;
+  DominantCostRangeInit(&cost_range);
+
+  // Analyze the dominant (literal, red and blue) entropy costs.
+  for (i = 0; i < histo_size; ++i) {
+    if (histograms[i] == NULL) continue;
+    UpdateDominantCostRange(histograms[i], &cost_range);
+  }
+
+  // bin-hash histograms on three of the dominant (literal, red and blue)
+  // symbol costs and store the resulting bin_id for each histogram.
+  for (i = 0; i < histo_size; ++i) {
+    // bin_map[i] is not set to a special value as its use will later be guarded
+    // by another (histograms[i] == NULL).
+    if (histograms[i] == NULL) continue;
+    bin_map[i] = GetHistoBinIndex(histograms[i], &cost_range, low_effort);
+  }
+}
+
+// Merges some histograms with same bin_id together if it's advantageous.
+// Sets the remaining histograms to NULL.
+static void HistogramCombineEntropyBin(VP8LHistogramSet* const image_histo,
+                                       int* num_used,
+                                       const uint16_t* const clusters,
+                                       uint16_t* const cluster_mappings,
+                                       VP8LHistogram* cur_combo,
+                                       const uint16_t* const bin_map,
+                                       int num_bins,
+                                       double combine_cost_factor,
+                                       int low_effort) {
+  VP8LHistogram** const histograms = image_histo->histograms;
+  int idx;
+  struct {
+    int16_t first;    // position of the histogram that accumulates all
+                      // histograms with the same bin_id
+    uint16_t num_combine_failures;   // number of combine failures per bin_id
+  } bin_info[BIN_SIZE];
+
+  assert(num_bins <= BIN_SIZE);
+  for (idx = 0; idx < num_bins; ++idx) {
+    bin_info[idx].first = -1;
+    bin_info[idx].num_combine_failures = 0;
+  }
+
+  // By default, a cluster matches itself.
+  for (idx = 0; idx < *num_used; ++idx) cluster_mappings[idx] = idx;
+  for (idx = 0; idx < image_histo->size; ++idx) {
+    int bin_id, first;
+    if (histograms[idx] == NULL) continue;
+    bin_id = bin_map[idx];
+    first = bin_info[bin_id].first;
+    if (first == -1) {
+      bin_info[bin_id].first = idx;
+    } else if (low_effort) {
+      HistogramAdd(histograms[idx], histograms[first], histograms[first]);
+      HistogramSetRemoveHistogram(image_histo, idx, num_used);
+      cluster_mappings[clusters[idx]] = clusters[first];
+    } else {
+      // try to merge #idx into #first (both share the same bin_id)
+      const double bit_cost = histograms[idx]->bit_cost_;
+      const double bit_cost_thresh = -bit_cost * combine_cost_factor;
+      const double curr_cost_diff =
+          HistogramAddEval(histograms[first], histograms[idx],
+                           cur_combo, bit_cost_thresh);
+      if (curr_cost_diff < bit_cost_thresh) {
+        // Try to merge two histograms only if the combo is a trivial one or
+        // the two candidate histograms are already non-trivial.
+        // For some images, 'try_combine' turns out to be false for a lot of
+        // histogram pairs. In that case, we fallback to combining
+        // histograms as usual to avoid increasing the header size.
+        const int try_combine =
+            (cur_combo->trivial_symbol_ != VP8L_NON_TRIVIAL_SYM) ||
+            ((histograms[idx]->trivial_symbol_ == VP8L_NON_TRIVIAL_SYM) &&
+             (histograms[first]->trivial_symbol_ == VP8L_NON_TRIVIAL_SYM));
+        const int max_combine_failures = 32;
+        if (try_combine ||
+            bin_info[bin_id].num_combine_failures >= max_combine_failures) {
+          // move the (better) merged histogram to its final slot
+          HistogramSwap(&cur_combo, &histograms[first]);
+          HistogramSetRemoveHistogram(image_histo, idx, num_used);
+          cluster_mappings[clusters[idx]] = clusters[first];
+        } else {
+          ++bin_info[bin_id].num_combine_failures;
+        }
+      }
+    }
+  }
+  if (low_effort) {
+    // for low_effort case, update the final cost when everything is merged
+    for (idx = 0; idx < image_histo->size; ++idx) {
+      if (histograms[idx] == NULL) continue;
+      UpdateHistogramCost(histograms[idx]);
+    }
+  }
+}
+
+// Implement a Lehmer random number generator with a multiplicative constant of
+// 48271 and a modulo constant of 2^31 - 1.
+static uint32_t MyRand(uint32_t* const seed) {
+  *seed = (uint32_t)(((uint64_t)(*seed) * 48271u) % 2147483647u);
+  assert(*seed > 0);
+  return *seed;
+}
+
+// -----------------------------------------------------------------------------
+// Histogram pairs priority queue
+
+// Pair of histograms. Negative idx1 value means that pair is out-of-date.
+typedef struct {
+  int idx1;
+  int idx2;
+  double cost_diff;
+  double cost_combo;
+} HistogramPair;
+
+typedef struct {
+  HistogramPair* queue;
+  int size;
+  int max_size;
+} HistoQueue;
+
+static int HistoQueueInit(HistoQueue* const histo_queue, const int max_size) {
+  histo_queue->size = 0;
+  histo_queue->max_size = max_size;
+  // We allocate max_size + 1 because the last element at index "size" is
+  // used as temporary data (and it could be up to max_size).
+  histo_queue->queue = (HistogramPair*)WebPSafeMalloc(
+      histo_queue->max_size + 1, sizeof(*histo_queue->queue));
+  return histo_queue->queue != NULL;
+}
+
+static void HistoQueueClear(HistoQueue* const histo_queue) {
+  assert(histo_queue != NULL);
+  WebPSafeFree(histo_queue->queue);
+  histo_queue->size = 0;
+  histo_queue->max_size = 0;
+}
+
+// Pop a specific pair in the queue by replacing it with the last one
+// and shrinking the queue.
+static void HistoQueuePopPair(HistoQueue* const histo_queue,
+                              HistogramPair* const pair) {
+  assert(pair >= histo_queue->queue &&
+         pair < (histo_queue->queue + histo_queue->size));
+  assert(histo_queue->size > 0);
+  *pair = histo_queue->queue[histo_queue->size - 1];
+  --histo_queue->size;
+}
+
+// Check whether a pair in the queue should be updated as head or not.
+static void HistoQueueUpdateHead(HistoQueue* const histo_queue,
+                                 HistogramPair* const pair) {
+  assert(pair->cost_diff < 0.);
+  assert(pair >= histo_queue->queue &&
+         pair < (histo_queue->queue + histo_queue->size));
+  assert(histo_queue->size > 0);
+  if (pair->cost_diff < histo_queue->queue[0].cost_diff) {
+    // Replace the best pair.
+    const HistogramPair tmp = histo_queue->queue[0];
+    histo_queue->queue[0] = *pair;
+    *pair = tmp;
+  }
+}
+
+// Update the cost diff and combo of a pair of histograms. This needs to be
+// called when the the histograms have been merged with a third one.
+static void HistoQueueUpdatePair(const VP8LHistogram* const h1,
+                                 const VP8LHistogram* const h2,
+                                 double threshold,
+                                 HistogramPair* const pair) {
+  const double sum_cost = h1->bit_cost_ + h2->bit_cost_;
+  pair->cost_combo = 0.;
+  GetCombinedHistogramEntropy(h1, h2, sum_cost + threshold, &pair->cost_combo);
+  pair->cost_diff = pair->cost_combo - sum_cost;
+}
+
+// Create a pair from indices "idx1" and "idx2" provided its cost
+// is inferior to "threshold", a negative entropy.
+// It returns the cost of the pair, or 0. if it superior to threshold.
+static double HistoQueuePush(HistoQueue* const histo_queue,
+                             VP8LHistogram** const histograms, int idx1,
+                             int idx2, double threshold) {
+  const VP8LHistogram* h1;
+  const VP8LHistogram* h2;
+  HistogramPair pair;
+
+  // Stop here if the queue is full.
+  if (histo_queue->size == histo_queue->max_size) return 0.;
+  assert(threshold <= 0.);
+  if (idx1 > idx2) {
+    const int tmp = idx2;
+    idx2 = idx1;
+    idx1 = tmp;
+  }
+  pair.idx1 = idx1;
+  pair.idx2 = idx2;
+  h1 = histograms[idx1];
+  h2 = histograms[idx2];
+
+  HistoQueueUpdatePair(h1, h2, threshold, &pair);
+
+  // Do not even consider the pair if it does not improve the entropy.
+  if (pair.cost_diff >= threshold) return 0.;
+
+  histo_queue->queue[histo_queue->size++] = pair;
+  HistoQueueUpdateHead(histo_queue, &histo_queue->queue[histo_queue->size - 1]);
+
+  return pair.cost_diff;
+}
+
+// -----------------------------------------------------------------------------
+
+// Combines histograms by continuously choosing the one with the highest cost
+// reduction.
+static int HistogramCombineGreedy(VP8LHistogramSet* const image_histo,
+                                  int* const num_used) {
+  int ok = 0;
+  const int image_histo_size = image_histo->size;
+  int i, j;
+  VP8LHistogram** const histograms = image_histo->histograms;
+  // Priority queue of histogram pairs.
+  HistoQueue histo_queue;
+
+  // image_histo_size^2 for the queue size is safe. If you look at
+  // HistogramCombineGreedy, and imagine that UpdateQueueFront always pushes
+  // data to the queue, you insert at most:
+  // - image_histo_size*(image_histo_size-1)/2 (the first two for loops)
+  // - image_histo_size - 1 in the last for loop at the first iteration of
+  //   the while loop, image_histo_size - 2 at the second iteration ...
+  //   therefore image_histo_size*(image_histo_size-1)/2 overall too
+  if (!HistoQueueInit(&histo_queue, image_histo_size * image_histo_size)) {
+    goto End;
+  }
+
+  for (i = 0; i < image_histo_size; ++i) {
+    if (image_histo->histograms[i] == NULL) continue;
+    for (j = i + 1; j < image_histo_size; ++j) {
+      // Initialize queue.
+      if (image_histo->histograms[j] == NULL) continue;
+      HistoQueuePush(&histo_queue, histograms, i, j, 0.);
+    }
+  }
+
+  while (histo_queue.size > 0) {
+    const int idx1 = histo_queue.queue[0].idx1;
+    const int idx2 = histo_queue.queue[0].idx2;
+    HistogramAdd(histograms[idx2], histograms[idx1], histograms[idx1]);
+    histograms[idx1]->bit_cost_ = histo_queue.queue[0].cost_combo;
+
+    // Remove merged histogram.
+    HistogramSetRemoveHistogram(image_histo, idx2, num_used);
+
+    // Remove pairs intersecting the just combined best pair.
+    for (i = 0; i < histo_queue.size;) {
+      HistogramPair* const p = histo_queue.queue + i;
+      if (p->idx1 == idx1 || p->idx2 == idx1 ||
+          p->idx1 == idx2 || p->idx2 == idx2) {
+        HistoQueuePopPair(&histo_queue, p);
+      } else {
+        HistoQueueUpdateHead(&histo_queue, p);
+        ++i;
+      }
+    }
+
+    // Push new pairs formed with combined histogram to the queue.
+    for (i = 0; i < image_histo->size; ++i) {
+      if (i == idx1 || image_histo->histograms[i] == NULL) continue;
+      HistoQueuePush(&histo_queue, image_histo->histograms, idx1, i, 0.);
+    }
+  }
+
+  ok = 1;
+
+ End:
+  HistoQueueClear(&histo_queue);
+  return ok;
+}
+
+// Perform histogram aggregation using a stochastic approach.
+// 'do_greedy' is set to 1 if a greedy approach needs to be performed
+// afterwards, 0 otherwise.
+static int PairComparison(const void* idx1, const void* idx2) {
+  // To be used with bsearch: <0 when *idx1<*idx2, >0 if >, 0 when ==.
+  return (*(int*) idx1 - *(int*) idx2);
+}
+static int HistogramCombineStochastic(VP8LHistogramSet* const image_histo,
+                                      int* const num_used, int min_cluster_size,
+                                      int* const do_greedy) {
+  int j, iter;
+  uint32_t seed = 1;
+  int tries_with_no_success = 0;
+  const int outer_iters = *num_used;
+  const int num_tries_no_success = outer_iters / 2;
+  VP8LHistogram** const histograms = image_histo->histograms;
+  // Priority queue of histogram pairs. Its size of 'kHistoQueueSize'
+  // impacts the quality of the compression and the speed: the smaller the
+  // faster but the worse for the compression.
+  HistoQueue histo_queue;
+  const int kHistoQueueSize = 9;
+  int ok = 0;
+  // mapping from an index in image_histo with no NULL histogram to the full
+  // blown image_histo.
+  int* mappings;
+
+  if (*num_used < min_cluster_size) {
+    *do_greedy = 1;
+    return 1;
+  }
+
+  mappings = (int*) WebPSafeMalloc(*num_used, sizeof(*mappings));
+  if (mappings == NULL) return 0;
+  if (!HistoQueueInit(&histo_queue, kHistoQueueSize)) goto End;
+  // Fill the initial mapping.
+  for (j = 0, iter = 0; iter < image_histo->size; ++iter) {
+    if (histograms[iter] == NULL) continue;
+    mappings[j++] = iter;
+  }
+  assert(j == *num_used);
+
+  // Collapse similar histograms in 'image_histo'.
+  for (iter = 0;
+       iter < outer_iters && *num_used >= min_cluster_size &&
+           ++tries_with_no_success < num_tries_no_success;
+       ++iter) {
+    int* mapping_index;
+    double best_cost =
+        (histo_queue.size == 0) ? 0. : histo_queue.queue[0].cost_diff;
+    int best_idx1 = -1, best_idx2 = 1;
+    const uint32_t rand_range = (*num_used - 1) * (*num_used);
+    // (*num_used) / 2 was chosen empirically. Less means faster but worse
+    // compression.
+    const int num_tries = (*num_used) / 2;
+
+    // Pick random samples.
+    for (j = 0; *num_used >= 2 && j < num_tries; ++j) {
+      double curr_cost;
+      // Choose two different histograms at random and try to combine them.
+      const uint32_t tmp = MyRand(&seed) % rand_range;
+      uint32_t idx1 = tmp / (*num_used - 1);
+      uint32_t idx2 = tmp % (*num_used - 1);
+      if (idx2 >= idx1) ++idx2;
+      idx1 = mappings[idx1];
+      idx2 = mappings[idx2];
+
+      // Calculate cost reduction on combination.
+      curr_cost =
+          HistoQueuePush(&histo_queue, histograms, idx1, idx2, best_cost);
+      if (curr_cost < 0) {  // found a better pair?
+        best_cost = curr_cost;
+        // Empty the queue if we reached full capacity.
+        if (histo_queue.size == histo_queue.max_size) break;
+      }
+    }
+    if (histo_queue.size == 0) continue;
+
+    // Get the best histograms.
+    best_idx1 = histo_queue.queue[0].idx1;
+    best_idx2 = histo_queue.queue[0].idx2;
+    assert(best_idx1 < best_idx2);
+    // Pop best_idx2 from mappings.
+    mapping_index = (int*) bsearch(&best_idx2, mappings, *num_used,
+                                   sizeof(best_idx2), &PairComparison);
+    assert(mapping_index != NULL);
+    memmove(mapping_index, mapping_index + 1, sizeof(*mapping_index) *
+        ((*num_used) - (mapping_index - mappings) - 1));
+    // Merge the histograms and remove best_idx2 from the queue.
+    HistogramAdd(histograms[best_idx2], histograms[best_idx1],
+                 histograms[best_idx1]);
+    histograms[best_idx1]->bit_cost_ = histo_queue.queue[0].cost_combo;
+    HistogramSetRemoveHistogram(image_histo, best_idx2, num_used);
+    // Parse the queue and update each pair that deals with best_idx1,
+    // best_idx2 or image_histo_size.
+    for (j = 0; j < histo_queue.size;) {
+      HistogramPair* const p = histo_queue.queue + j;
+      const int is_idx1_best = p->idx1 == best_idx1 || p->idx1 == best_idx2;
+      const int is_idx2_best = p->idx2 == best_idx1 || p->idx2 == best_idx2;
+      int do_eval = 0;
+      // The front pair could have been duplicated by a random pick so
+      // check for it all the time nevertheless.
+      if (is_idx1_best && is_idx2_best) {
+        HistoQueuePopPair(&histo_queue, p);
+        continue;
+      }
+      // Any pair containing one of the two best indices should only refer to
+      // best_idx1. Its cost should also be updated.
+      if (is_idx1_best) {
+        p->idx1 = best_idx1;
+        do_eval = 1;
+      } else if (is_idx2_best) {
+        p->idx2 = best_idx1;
+        do_eval = 1;
+      }
+      // Make sure the index order is respected.
+      if (p->idx1 > p->idx2) {
+        const int tmp = p->idx2;
+        p->idx2 = p->idx1;
+        p->idx1 = tmp;
+      }
+      if (do_eval) {
+        // Re-evaluate the cost of an updated pair.
+        HistoQueueUpdatePair(histograms[p->idx1], histograms[p->idx2], 0., p);
+        if (p->cost_diff >= 0.) {
+          HistoQueuePopPair(&histo_queue, p);
+          continue;
+        }
+      }
+      HistoQueueUpdateHead(&histo_queue, p);
+      ++j;
+    }
+    tries_with_no_success = 0;
+  }
+  *do_greedy = (*num_used <= min_cluster_size);
+  ok = 1;
+
+End:
+  HistoQueueClear(&histo_queue);
+  WebPSafeFree(mappings);
+  return ok;
+}
+
+// -----------------------------------------------------------------------------
+// Histogram refinement
+
+// Find the best 'out' histogram for each of the 'in' histograms.
+// At call-time, 'out' contains the histograms of the clusters.
+// Note: we assume that out[]->bit_cost_ is already up-to-date.
+static void HistogramRemap(const VP8LHistogramSet* const in,
+                           VP8LHistogramSet* const out,
+                           uint16_t* const symbols) {
+  int i;
+  VP8LHistogram** const in_histo = in->histograms;
+  VP8LHistogram** const out_histo = out->histograms;
+  const int in_size = out->max_size;
+  const int out_size = out->size;
+  if (out_size > 1) {
+    for (i = 0; i < in_size; ++i) {
+      int best_out = 0;
+      double best_bits = MAX_COST;
+      int k;
+      if (in_histo[i] == NULL) {
+        // Arbitrarily set to the previous value if unused to help future LZ77.
+        symbols[i] = symbols[i - 1];
+        continue;
+      }
+      for (k = 0; k < out_size; ++k) {
+        double cur_bits;
+        cur_bits = HistogramAddThresh(out_histo[k], in_histo[i], best_bits);
+        if (k == 0 || cur_bits < best_bits) {
+          best_bits = cur_bits;
+          best_out = k;
+        }
+      }
+      symbols[i] = best_out;
+    }
+  } else {
+    assert(out_size == 1);
+    for (i = 0; i < in_size; ++i) {
+      symbols[i] = 0;
+    }
+  }
+
+  // Recompute each out based on raw and symbols.
+  VP8LHistogramSetClear(out);
+  out->size = out_size;
+
+  for (i = 0; i < in_size; ++i) {
+    int idx;
+    if (in_histo[i] == NULL) continue;
+    idx = symbols[i];
+    HistogramAdd(in_histo[i], out_histo[idx], out_histo[idx]);
+  }
+}
+
+static double GetCombineCostFactor(int histo_size, int quality) {
+  double combine_cost_factor = 0.16;
+  if (quality < 90) {
+    if (histo_size > 256) combine_cost_factor /= 2.;
+    if (histo_size > 512) combine_cost_factor /= 2.;
+    if (histo_size > 1024) combine_cost_factor /= 2.;
+    if (quality <= 50) combine_cost_factor /= 2.;
+  }
+  return combine_cost_factor;
+}
+
+// Given a HistogramSet 'set', the mapping of clusters 'cluster_mapping' and the
+// current assignment of the cells in 'symbols', merge the clusters and
+// assign the smallest possible clusters values.
+static void OptimizeHistogramSymbols(const VP8LHistogramSet* const set,
+                                     uint16_t* const cluster_mappings,
+                                     int num_clusters,
+                                     uint16_t* const cluster_mappings_tmp,
+                                     uint16_t* const symbols) {
+  int i, cluster_max;
+  int do_continue = 1;
+  // First, assign the lowest cluster to each pixel.
+  while (do_continue) {
+    do_continue = 0;
+    for (i = 0; i < num_clusters; ++i) {
+      int k;
+      k = cluster_mappings[i];
+      while (k != cluster_mappings[k]) {
+        cluster_mappings[k] = cluster_mappings[cluster_mappings[k]];
+        k = cluster_mappings[k];
+      }
+      if (k != cluster_mappings[i]) {
+        do_continue = 1;
+        cluster_mappings[i] = k;
+      }
+    }
+  }
+  // Create a mapping from a cluster id to its minimal version.
+  cluster_max = 0;
+  memset(cluster_mappings_tmp, 0,
+         set->max_size * sizeof(*cluster_mappings_tmp));
+  assert(cluster_mappings[0] == 0);
+  // Re-map the ids.
+  for (i = 0; i < set->max_size; ++i) {
+    int cluster;
+    if (symbols[i] == kInvalidHistogramSymbol) continue;
+    cluster = cluster_mappings[symbols[i]];
+    assert(symbols[i] < num_clusters);
+    if (cluster > 0 && cluster_mappings_tmp[cluster] == 0) {
+      ++cluster_max;
+      cluster_mappings_tmp[cluster] = cluster_max;
+    }
+    symbols[i] = cluster_mappings_tmp[cluster];
+  }
+
+  // Make sure all cluster values are used.
+  cluster_max = 0;
+  for (i = 0; i < set->max_size; ++i) {
+    if (symbols[i] == kInvalidHistogramSymbol) continue;
+    if (symbols[i] <= cluster_max) continue;
+    ++cluster_max;
+    assert(symbols[i] == cluster_max);
+  }
+}
+
+static void RemoveEmptyHistograms(VP8LHistogramSet* const image_histo) {
+  uint32_t size;
+  int i;
+  for (i = 0, size = 0; i < image_histo->size; ++i) {
+    if (image_histo->histograms[i] == NULL) continue;
+    image_histo->histograms[size++] = image_histo->histograms[i];
+  }
+  image_histo->size = size;
+}
+
+int VP8LGetHistoImageSymbols(int xsize, int ysize,
+                             const VP8LBackwardRefs* const refs,
+                             int quality, int low_effort,
+                             int histogram_bits, int cache_bits,
+                             VP8LHistogramSet* const image_histo,
+                             VP8LHistogram* const tmp_histo,
+                             uint16_t* const histogram_symbols) {
+  int ok = 0;
+  const int histo_xsize =
+      histogram_bits ? VP8LSubSampleSize(xsize, histogram_bits) : 1;
+  const int histo_ysize =
+      histogram_bits ? VP8LSubSampleSize(ysize, histogram_bits) : 1;
+  const int image_histo_raw_size = histo_xsize * histo_ysize;
+  VP8LHistogramSet* const orig_histo =
+      VP8LAllocateHistogramSet(image_histo_raw_size, cache_bits);
+  // Don't attempt linear bin-partition heuristic for
+  // histograms of small sizes (as bin_map will be very sparse) and
+  // maximum quality q==100 (to preserve the compression gains at that level).
+  const int entropy_combine_num_bins = low_effort ? NUM_PARTITIONS : BIN_SIZE;
+  int entropy_combine;
+  uint16_t* const map_tmp =
+      WebPSafeMalloc(2 * image_histo_raw_size, sizeof(map_tmp));
+  uint16_t* const cluster_mappings = map_tmp + image_histo_raw_size;
+  int num_used = image_histo_raw_size;
+  if (orig_histo == NULL || map_tmp == NULL) goto Error;
+
+  // Construct the histograms from backward references.
+  HistogramBuild(xsize, histogram_bits, refs, orig_histo);
+  // Copies the histograms and computes its bit_cost.
+  // histogram_symbols is optimized
+  HistogramCopyAndAnalyze(orig_histo, image_histo, &num_used,
+                          histogram_symbols);
+
+  entropy_combine =
+      (num_used > entropy_combine_num_bins * 2) && (quality < 100);
+
+  if (entropy_combine) {
+    uint16_t* const bin_map = map_tmp;
+    const double combine_cost_factor =
+        GetCombineCostFactor(image_histo_raw_size, quality);
+    const uint32_t num_clusters = num_used;
+
+    HistogramAnalyzeEntropyBin(image_histo, bin_map, low_effort);
+    // Collapse histograms with similar entropy.
+    HistogramCombineEntropyBin(image_histo, &num_used, histogram_symbols,
+                               cluster_mappings, tmp_histo, bin_map,
+                               entropy_combine_num_bins, combine_cost_factor,
+                               low_effort);
+    OptimizeHistogramSymbols(image_histo, cluster_mappings, num_clusters,
+                             map_tmp, histogram_symbols);
+  }
+
+  // Don't combine the histograms using stochastic and greedy heuristics for
+  // low-effort compression mode.
+  if (!low_effort || !entropy_combine) {
+    const float x = quality / 100.f;
+    // cubic ramp between 1 and MAX_HISTO_GREEDY:
+    const int threshold_size = (int)(1 + (x * x * x) * (MAX_HISTO_GREEDY - 1));
+    int do_greedy;
+    if (!HistogramCombineStochastic(image_histo, &num_used, threshold_size,
+                                    &do_greedy)) {
+      goto Error;
+    }
+    if (do_greedy) {
+      RemoveEmptyHistograms(image_histo);
+      if (!HistogramCombineGreedy(image_histo, &num_used)) {
+        goto Error;
+      }
+    }
+  }
+
+  // Find the optimal map from original histograms to the final ones.
+  RemoveEmptyHistograms(image_histo);
+  HistogramRemap(orig_histo, image_histo, histogram_symbols);
+
+  ok = 1;
+
+ Error:
+  VP8LFreeHistogramSet(orig_histo);
+  WebPSafeFree(map_tmp);
+  return ok;
+}
diff --git a/media/libwebp/enc/histogram_enc.h b/media/libwebp/enc/histogram_enc.h
index ef39b7c6db..bf93ce62b3 100644
--- a/media/libwebp/enc/histogram_enc.h
+++ b/media/libwebp/enc/histogram_enc.h
@@ -64,8 +64,8 @@ void VP8LHistogramCreate(VP8LHistogram* const p,
                          const VP8LBackwardRefs* const refs,
                          int palette_code_bits);
 
-// Return the size of the histogram for a given palette_code_bits.
-int VP8LGetHistogramSize(int palette_code_bits);
+// Return the size of the histogram for a given cache_bits.
+int VP8LGetHistogramSize(int cache_bits);
 
 // Set the palette_code_bits and reset the stats.
 // If init_arrays is true, the arrays are also filled with 0's.
@@ -110,7 +110,7 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
                              const VP8LBackwardRefs* const refs,
                              int quality, int low_effort,
                              int histogram_bits, int cache_bits,
-                             VP8LHistogramSet* const image_in,
+                             VP8LHistogramSet* const image_histo,
                              VP8LHistogram* const tmp_histo,
                              uint16_t* const histogram_symbols);
 
diff --git a/media/libwebp/enc/iterator_enc.c b/media/libwebp/enc/iterator_enc.c
new file mode 100644
index 0000000000..c2b137c124
--- /dev/null
+++ b/media/libwebp/enc/iterator_enc.c
@@ -0,0 +1,459 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// VP8Iterator: block iterator
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <string.h>
+
+#include "../enc/vp8i_enc.h"
+
+//------------------------------------------------------------------------------
+// VP8Iterator
+//------------------------------------------------------------------------------
+
+static void InitLeft(VP8EncIterator* const it) {
+  it->y_left_[-1] = it->u_left_[-1] = it->v_left_[-1] =
+      (it->y_ > 0) ? 129 : 127;
+  memset(it->y_left_, 129, 16);
+  memset(it->u_left_, 129, 8);
+  memset(it->v_left_, 129, 8);
+  it->left_nz_[8] = 0;
+  if (it->top_derr_ != NULL) {
+    memset(&it->left_derr_, 0, sizeof(it->left_derr_));
+  }
+}
+
+static void InitTop(VP8EncIterator* const it) {
+  const VP8Encoder* const enc = it->enc_;
+  const size_t top_size = enc->mb_w_ * 16;
+  memset(enc->y_top_, 127, 2 * top_size);
+  memset(enc->nz_, 0, enc->mb_w_ * sizeof(*enc->nz_));
+  if (enc->top_derr_ != NULL) {
+    memset(enc->top_derr_, 0, enc->mb_w_ * sizeof(*enc->top_derr_));
+  }
+}
+
+void VP8IteratorSetRow(VP8EncIterator* const it, int y) {
+  VP8Encoder* const enc = it->enc_;
+  it->x_ = 0;
+  it->y_ = y;
+  it->bw_ = &enc->parts_[y & (enc->num_parts_ - 1)];
+  it->preds_ = enc->preds_ + y * 4 * enc->preds_w_;
+  it->nz_ = enc->nz_;
+  it->mb_ = enc->mb_info_ + y * enc->mb_w_;
+  it->y_top_ = enc->y_top_;
+  it->uv_top_ = enc->uv_top_;
+  InitLeft(it);
+}
+
+void VP8IteratorReset(VP8EncIterator* const it) {
+  VP8Encoder* const enc = it->enc_;
+  VP8IteratorSetRow(it, 0);
+  VP8IteratorSetCountDown(it, enc->mb_w_ * enc->mb_h_);  // default
+  InitTop(it);
+  memset(it->bit_count_, 0, sizeof(it->bit_count_));
+  it->do_trellis_ = 0;
+}
+
+void VP8IteratorSetCountDown(VP8EncIterator* const it, int count_down) {
+  it->count_down_ = it->count_down0_ = count_down;
+}
+
+int VP8IteratorIsDone(const VP8EncIterator* const it) {
+  return (it->count_down_ <= 0);
+}
+
+void VP8IteratorInit(VP8Encoder* const enc, VP8EncIterator* const it) {
+  it->enc_ = enc;
+  it->yuv_in_   = (uint8_t*)WEBP_ALIGN(it->yuv_mem_);
+  it->yuv_out_  = it->yuv_in_ + YUV_SIZE_ENC;
+  it->yuv_out2_ = it->yuv_out_ + YUV_SIZE_ENC;
+  it->yuv_p_    = it->yuv_out2_ + YUV_SIZE_ENC;
+  it->lf_stats_ = enc->lf_stats_;
+  it->percent0_ = enc->percent_;
+  it->y_left_ = (uint8_t*)WEBP_ALIGN(it->yuv_left_mem_ + 1);
+  it->u_left_ = it->y_left_ + 16 + 16;
+  it->v_left_ = it->u_left_ + 16;
+  it->top_derr_ = enc->top_derr_;
+  VP8IteratorReset(it);
+}
+
+int VP8IteratorProgress(const VP8EncIterator* const it, int delta) {
+  VP8Encoder* const enc = it->enc_;
+  if (delta && enc->pic_->progress_hook != NULL) {
+    const int done = it->count_down0_ - it->count_down_;
+    const int percent = (it->count_down0_ <= 0)
+                      ? it->percent0_
+                      : it->percent0_ + delta * done / it->count_down0_;
+    return WebPReportProgress(enc->pic_, percent, &enc->percent_);
+  }
+  return 1;
+}
+
+//------------------------------------------------------------------------------
+// Import the source samples into the cache. Takes care of replicating
+// boundary pixels if necessary.
+
+static WEBP_INLINE int MinSize(int a, int b) { return (a < b) ? a : b; }
+
+static void ImportBlock(const uint8_t* src, int src_stride,
+                        uint8_t* dst, int w, int h, int size) {
+  int i;
+  for (i = 0; i < h; ++i) {
+    memcpy(dst, src, w);
+    if (w < size) {
+      memset(dst + w, dst[w - 1], size - w);
+    }
+    dst += BPS;
+    src += src_stride;
+  }
+  for (i = h; i < size; ++i) {
+    memcpy(dst, dst - BPS, size);
+    dst += BPS;
+  }
+}
+
+static void ImportLine(const uint8_t* src, int src_stride,
+                       uint8_t* dst, int len, int total_len) {
+  int i;
+  for (i = 0; i < len; ++i, src += src_stride) dst[i] = *src;
+  for (; i < total_len; ++i) dst[i] = dst[len - 1];
+}
+
+void VP8IteratorImport(VP8EncIterator* const it, uint8_t* const tmp_32) {
+  const VP8Encoder* const enc = it->enc_;
+  const int x = it->x_, y = it->y_;
+  const WebPPicture* const pic = enc->pic_;
+  const uint8_t* const ysrc = pic->y + (y * pic->y_stride  + x) * 16;
+  const uint8_t* const usrc = pic->u + (y * pic->uv_stride + x) * 8;
+  const uint8_t* const vsrc = pic->v + (y * pic->uv_stride + x) * 8;
+  const int w = MinSize(pic->width - x * 16, 16);
+  const int h = MinSize(pic->height - y * 16, 16);
+  const int uv_w = (w + 1) >> 1;
+  const int uv_h = (h + 1) >> 1;
+
+  ImportBlock(ysrc, pic->y_stride,  it->yuv_in_ + Y_OFF_ENC, w, h, 16);
+  ImportBlock(usrc, pic->uv_stride, it->yuv_in_ + U_OFF_ENC, uv_w, uv_h, 8);
+  ImportBlock(vsrc, pic->uv_stride, it->yuv_in_ + V_OFF_ENC, uv_w, uv_h, 8);
+
+  if (tmp_32 == NULL) return;
+
+  // Import source (uncompressed) samples into boundary.
+  if (x == 0) {
+    InitLeft(it);
+  } else {
+    if (y == 0) {
+      it->y_left_[-1] = it->u_left_[-1] = it->v_left_[-1] = 127;
+    } else {
+      it->y_left_[-1] = ysrc[- 1 - pic->y_stride];
+      it->u_left_[-1] = usrc[- 1 - pic->uv_stride];
+      it->v_left_[-1] = vsrc[- 1 - pic->uv_stride];
+    }
+    ImportLine(ysrc - 1, pic->y_stride,  it->y_left_, h,   16);
+    ImportLine(usrc - 1, pic->uv_stride, it->u_left_, uv_h, 8);
+    ImportLine(vsrc - 1, pic->uv_stride, it->v_left_, uv_h, 8);
+  }
+
+  it->y_top_  = tmp_32 + 0;
+  it->uv_top_ = tmp_32 + 16;
+  if (y == 0) {
+    memset(tmp_32, 127, 32 * sizeof(*tmp_32));
+  } else {
+    ImportLine(ysrc - pic->y_stride,  1, tmp_32,          w,   16);
+    ImportLine(usrc - pic->uv_stride, 1, tmp_32 + 16,     uv_w, 8);
+    ImportLine(vsrc - pic->uv_stride, 1, tmp_32 + 16 + 8, uv_w, 8);
+  }
+}
+
+//------------------------------------------------------------------------------
+// Copy back the compressed samples into user space if requested.
+
+static void ExportBlock(const uint8_t* src, uint8_t* dst, int dst_stride,
+                        int w, int h) {
+  while (h-- > 0) {
+    memcpy(dst, src, w);
+    dst += dst_stride;
+    src += BPS;
+  }
+}
+
+void VP8IteratorExport(const VP8EncIterator* const it) {
+  const VP8Encoder* const enc = it->enc_;
+  if (enc->config_->show_compressed) {
+    const int x = it->x_, y = it->y_;
+    const uint8_t* const ysrc = it->yuv_out_ + Y_OFF_ENC;
+    const uint8_t* const usrc = it->yuv_out_ + U_OFF_ENC;
+    const uint8_t* const vsrc = it->yuv_out_ + V_OFF_ENC;
+    const WebPPicture* const pic = enc->pic_;
+    uint8_t* const ydst = pic->y + (y * pic->y_stride + x) * 16;
+    uint8_t* const udst = pic->u + (y * pic->uv_stride + x) * 8;
+    uint8_t* const vdst = pic->v + (y * pic->uv_stride + x) * 8;
+    int w = (pic->width - x * 16);
+    int h = (pic->height - y * 16);
+
+    if (w > 16) w = 16;
+    if (h > 16) h = 16;
+
+    // Luma plane
+    ExportBlock(ysrc, ydst, pic->y_stride, w, h);
+
+    {   // U/V planes
+      const int uv_w = (w + 1) >> 1;
+      const int uv_h = (h + 1) >> 1;
+      ExportBlock(usrc, udst, pic->uv_stride, uv_w, uv_h);
+      ExportBlock(vsrc, vdst, pic->uv_stride, uv_w, uv_h);
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// Non-zero contexts setup/teardown
+
+// Nz bits:
+//  0  1  2  3  Y
+//  4  5  6  7
+//  8  9 10 11
+// 12 13 14 15
+// 16 17        U
+// 18 19
+// 20 21        V
+// 22 23
+// 24           DC-intra16
+
+// Convert packed context to byte array
+#define BIT(nz, n) (!!((nz) & (1 << (n))))
+
+void VP8IteratorNzToBytes(VP8EncIterator* const it) {
+  const int tnz = it->nz_[0], lnz = it->nz_[-1];
+  int* const top_nz = it->top_nz_;
+  int* const left_nz = it->left_nz_;
+
+  // Top-Y
+  top_nz[0] = BIT(tnz, 12);
+  top_nz[1] = BIT(tnz, 13);
+  top_nz[2] = BIT(tnz, 14);
+  top_nz[3] = BIT(tnz, 15);
+  // Top-U
+  top_nz[4] = BIT(tnz, 18);
+  top_nz[5] = BIT(tnz, 19);
+  // Top-V
+  top_nz[6] = BIT(tnz, 22);
+  top_nz[7] = BIT(tnz, 23);
+  // DC
+  top_nz[8] = BIT(tnz, 24);
+
+  // left-Y
+  left_nz[0] = BIT(lnz,  3);
+  left_nz[1] = BIT(lnz,  7);
+  left_nz[2] = BIT(lnz, 11);
+  left_nz[3] = BIT(lnz, 15);
+  // left-U
+  left_nz[4] = BIT(lnz, 17);
+  left_nz[5] = BIT(lnz, 19);
+  // left-V
+  left_nz[6] = BIT(lnz, 21);
+  left_nz[7] = BIT(lnz, 23);
+  // left-DC is special, iterated separately
+}
+
+void VP8IteratorBytesToNz(VP8EncIterator* const it) {
+  uint32_t nz = 0;
+  const int* const top_nz = it->top_nz_;
+  const int* const left_nz = it->left_nz_;
+  // top
+  nz |= (top_nz[0] << 12) | (top_nz[1] << 13);
+  nz |= (top_nz[2] << 14) | (top_nz[3] << 15);
+  nz |= (top_nz[4] << 18) | (top_nz[5] << 19);
+  nz |= (top_nz[6] << 22) | (top_nz[7] << 23);
+  nz |= (top_nz[8] << 24);  // we propagate the _top_ bit, esp. for intra4
+  // left
+  nz |= (left_nz[0] << 3) | (left_nz[1] << 7);
+  nz |= (left_nz[2] << 11);
+  nz |= (left_nz[4] << 17) | (left_nz[6] << 21);
+
+  *it->nz_ = nz;
+}
+
+#undef BIT
+
+//------------------------------------------------------------------------------
+// Advance to the next position, doing the bookkeeping.
+
+void VP8IteratorSaveBoundary(VP8EncIterator* const it) {
+  VP8Encoder* const enc = it->enc_;
+  const int x = it->x_, y = it->y_;
+  const uint8_t* const ysrc = it->yuv_out_ + Y_OFF_ENC;
+  const uint8_t* const uvsrc = it->yuv_out_ + U_OFF_ENC;
+  if (x < enc->mb_w_ - 1) {   // left
+    int i;
+    for (i = 0; i < 16; ++i) {
+      it->y_left_[i] = ysrc[15 + i * BPS];
+    }
+    for (i = 0; i < 8; ++i) {
+      it->u_left_[i] = uvsrc[7 + i * BPS];
+      it->v_left_[i] = uvsrc[15 + i * BPS];
+    }
+    // top-left (before 'top'!)
+    it->y_left_[-1] = it->y_top_[15];
+    it->u_left_[-1] = it->uv_top_[0 + 7];
+    it->v_left_[-1] = it->uv_top_[8 + 7];
+  }
+  if (y < enc->mb_h_ - 1) {  // top
+    memcpy(it->y_top_, ysrc + 15 * BPS, 16);
+    memcpy(it->uv_top_, uvsrc + 7 * BPS, 8 + 8);
+  }
+}
+
+int VP8IteratorNext(VP8EncIterator* const it) {
+  if (++it->x_ == it->enc_->mb_w_) {
+    VP8IteratorSetRow(it, ++it->y_);
+  } else {
+    it->preds_ += 4;
+    it->mb_ += 1;
+    it->nz_ += 1;
+    it->y_top_ += 16;
+    it->uv_top_ += 16;
+  }
+  return (0 < --it->count_down_);
+}
+
+//------------------------------------------------------------------------------
+// Helper function to set mode properties
+
+void VP8SetIntra16Mode(const VP8EncIterator* const it, int mode) {
+  uint8_t* preds = it->preds_;
+  int y;
+  for (y = 0; y < 4; ++y) {
+    memset(preds, mode, 4);
+    preds += it->enc_->preds_w_;
+  }
+  it->mb_->type_ = 1;
+}
+
+void VP8SetIntra4Mode(const VP8EncIterator* const it, const uint8_t* modes) {
+  uint8_t* preds = it->preds_;
+  int y;
+  for (y = 4; y > 0; --y) {
+    memcpy(preds, modes, 4 * sizeof(*modes));
+    preds += it->enc_->preds_w_;
+    modes += 4;
+  }
+  it->mb_->type_ = 0;
+}
+
+void VP8SetIntraUVMode(const VP8EncIterator* const it, int mode) {
+  it->mb_->uv_mode_ = mode;
+}
+
+void VP8SetSkip(const VP8EncIterator* const it, int skip) {
+  it->mb_->skip_ = skip;
+}
+
+void VP8SetSegment(const VP8EncIterator* const it, int segment) {
+  it->mb_->segment_ = segment;
+}
+
+//------------------------------------------------------------------------------
+// Intra4x4 sub-blocks iteration
+//
+//  We store and update the boundary samples into an array of 37 pixels. They
+//  are updated as we iterate and reconstructs each intra4x4 blocks in turn.
+//  The position of the samples has the following snake pattern:
+//
+// 16|17 18 19 20|21 22 23 24|25 26 27 28|29 30 31 32|33 34 35 36  <- Top-right
+// --+-----------+-----------+-----------+-----------+
+// 15|         19|         23|         27|         31|
+// 14|         18|         22|         26|         30|
+// 13|         17|         21|         25|         29|
+// 12|13 14 15 16|17 18 19 20|21 22 23 24|25 26 27 28|
+// --+-----------+-----------+-----------+-----------+
+// 11|         15|         19|         23|         27|
+// 10|         14|         18|         22|         26|
+//  9|         13|         17|         21|         25|
+//  8| 9 10 11 12|13 14 15 16|17 18 19 20|21 22 23 24|
+// --+-----------+-----------+-----------+-----------+
+//  7|         11|         15|         19|         23|
+//  6|         10|         14|         18|         22|
+//  5|          9|         13|         17|         21|
+//  4| 5  6  7  8| 9 10 11 12|13 14 15 16|17 18 19 20|
+// --+-----------+-----------+-----------+-----------+
+//  3|          7|         11|         15|         19|
+//  2|          6|         10|         14|         18|
+//  1|          5|          9|         13|         17|
+//  0| 1  2  3  4| 5  6  7  8| 9 10 11 12|13 14 15 16|
+// --+-----------+-----------+-----------+-----------+
+
+// Array to record the position of the top sample to pass to the prediction
+// functions in dsp.c.
+static const uint8_t VP8TopLeftI4[16] = {
+  17, 21, 25, 29,
+  13, 17, 21, 25,
+  9,  13, 17, 21,
+  5,   9, 13, 17
+};
+
+void VP8IteratorStartI4(VP8EncIterator* const it) {
+  const VP8Encoder* const enc = it->enc_;
+  int i;
+
+  it->i4_ = 0;    // first 4x4 sub-block
+  it->i4_top_ = it->i4_boundary_ + VP8TopLeftI4[0];
+
+  // Import the boundary samples
+  for (i = 0; i < 17; ++i) {    // left
+    it->i4_boundary_[i] = it->y_left_[15 - i];
+  }
+  for (i = 0; i < 16; ++i) {    // top
+    it->i4_boundary_[17 + i] = it->y_top_[i];
+  }
+  // top-right samples have a special case on the far right of the picture
+  if (it->x_ < enc->mb_w_ - 1) {
+    for (i = 16; i < 16 + 4; ++i) {
+      it->i4_boundary_[17 + i] = it->y_top_[i];
+    }
+  } else {    // else, replicate the last valid pixel four times
+    for (i = 16; i < 16 + 4; ++i) {
+      it->i4_boundary_[17 + i] = it->i4_boundary_[17 + 15];
+    }
+  }
+  VP8IteratorNzToBytes(it);  // import the non-zero context
+}
+
+int VP8IteratorRotateI4(VP8EncIterator* const it,
+                        const uint8_t* const yuv_out) {
+  const uint8_t* const blk = yuv_out + VP8Scan[it->i4_];
+  uint8_t* const top = it->i4_top_;
+  int i;
+
+  // Update the cache with 7 fresh samples
+  for (i = 0; i <= 3; ++i) {
+    top[-4 + i] = blk[i + 3 * BPS];   // store future top samples
+  }
+  if ((it->i4_ & 3) != 3) {  // if not on the right sub-blocks #3, #7, #11, #15
+    for (i = 0; i <= 2; ++i) {        // store future left samples
+      top[i] = blk[3 + (2 - i) * BPS];
+    }
+  } else {  // else replicate top-right samples, as says the specs.
+    for (i = 0; i <= 3; ++i) {
+      top[i] = top[i + 4];
+    }
+  }
+  // move pointers to next sub-block
+  ++it->i4_;
+  if (it->i4_ == 16) {    // we're done
+    return 0;
+  }
+
+  it->i4_top_ = it->i4_boundary_ + VP8TopLeftI4[it->i4_];
+  return 1;
+}
+
+//------------------------------------------------------------------------------
diff --git a/media/libwebp/enc/moz.build b/media/libwebp/enc/moz.build
new file mode 100644
index 0000000000..12eaf5a5ed
--- /dev/null
+++ b/media/libwebp/enc/moz.build
@@ -0,0 +1,39 @@
+# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*-
+# vim: set filetype=python:
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+with Files('**'):
+    BUG_COMPONENT = ('Core', 'ImageLib')
+
+SOURCES += [
+    'alpha_enc.c',
+    'analysis_enc.c',
+    'backward_references_cost_enc.c',
+    'backward_references_enc.c',
+    'config_enc.c',
+    'cost_enc.c',
+    'filter_enc.c',
+    'frame_enc.c',
+    'histogram_enc.c',
+    'iterator_enc.c',
+    'near_lossless_enc.c',
+    'picture_csp_enc.c',
+    'picture_enc.c',
+    'picture_psnr_enc.c',
+    'picture_rescale_enc.c',
+    'picture_tools_enc.c',
+    'predictor_enc.c',
+    'quant_enc.c',
+    'syntax_enc.c',
+    'token_enc.c',
+    'tree_enc.c',
+    'vp8l_enc.c',
+    'webp_enc.c',
+]
+
+FINAL_LIBRARY = 'gkmedias'
+
+# We allow warnings for third-party code that can be updated from upstream.
+ALLOW_COMPILER_WARNINGS = True
diff --git a/media/libwebp/enc/near_lossless_enc.c b/media/libwebp/enc/near_lossless_enc.c
new file mode 100644
index 0000000000..1fd12a4364
--- /dev/null
+++ b/media/libwebp/enc/near_lossless_enc.c
@@ -0,0 +1,151 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Near-lossless image preprocessing adjusts pixel values to help
+// compressibility with a guarantee of maximum deviation between original and
+// resulting pixel values.
+//
+// Author: Jyrki Alakuijala (jyrki@google.com)
+// Converted to C by Aleksander Kramarz (akramarz@google.com)
+
+#include <assert.h>
+#include <stdlib.h>
+
+#include "../dsp/lossless_common.h"
+#include "../utils/utils.h"
+#include "../enc/vp8li_enc.h"
+
+#if (WEBP_NEAR_LOSSLESS == 1)
+
+#define MIN_DIM_FOR_NEAR_LOSSLESS 64
+#define MAX_LIMIT_BITS             5
+
+// Quantizes the value up or down to a multiple of 1<<bits (or to 255),
+// choosing the closer one, resolving ties using bankers' rounding.
+static uint32_t FindClosestDiscretized(uint32_t a, int bits) {
+  const uint32_t mask = (1u << bits) - 1;
+  const uint32_t biased = a + (mask >> 1) + ((a >> bits) & 1);
+  assert(bits > 0);
+  if (biased > 0xff) return 0xff;
+  return biased & ~mask;
+}
+
+// Applies FindClosestDiscretized to all channels of pixel.
+static uint32_t ClosestDiscretizedArgb(uint32_t a, int bits) {
+  return
+      (FindClosestDiscretized(a >> 24, bits) << 24) |
+      (FindClosestDiscretized((a >> 16) & 0xff, bits) << 16) |
+      (FindClosestDiscretized((a >> 8) & 0xff, bits) << 8) |
+      (FindClosestDiscretized(a & 0xff, bits));
+}
+
+// Checks if distance between corresponding channel values of pixels a and b
+// is within the given limit.
+static int IsNear(uint32_t a, uint32_t b, int limit) {
+  int k;
+  for (k = 0; k < 4; ++k) {
+    const int delta =
+        (int)((a >> (k * 8)) & 0xff) - (int)((b >> (k * 8)) & 0xff);
+    if (delta >= limit || delta <= -limit) {
+      return 0;
+    }
+  }
+  return 1;
+}
+
+static int IsSmooth(const uint32_t* const prev_row,
+                    const uint32_t* const curr_row,
+                    const uint32_t* const next_row,
+                    int ix, int limit) {
+  // Check that all pixels in 4-connected neighborhood are smooth.
+  return (IsNear(curr_row[ix], curr_row[ix - 1], limit) &&
+          IsNear(curr_row[ix], curr_row[ix + 1], limit) &&
+          IsNear(curr_row[ix], prev_row[ix], limit) &&
+          IsNear(curr_row[ix], next_row[ix], limit));
+}
+
+// Adjusts pixel values of image with given maximum error.
+static void NearLossless(int xsize, int ysize, const uint32_t* argb_src,
+                         int stride, int limit_bits, uint32_t* copy_buffer,
+                         uint32_t* argb_dst) {
+  int x, y;
+  const int limit = 1 << limit_bits;
+  uint32_t* prev_row = copy_buffer;
+  uint32_t* curr_row = prev_row + xsize;
+  uint32_t* next_row = curr_row + xsize;
+  memcpy(curr_row, argb_src, xsize * sizeof(argb_src[0]));
+  memcpy(next_row, argb_src + stride, xsize * sizeof(argb_src[0]));
+
+  for (y = 0; y < ysize; ++y, argb_src += stride, argb_dst += xsize) {
+    if (y == 0 || y == ysize - 1) {
+      memcpy(argb_dst, argb_src, xsize * sizeof(argb_src[0]));
+    } else {
+      memcpy(next_row, argb_src + stride, xsize * sizeof(argb_src[0]));
+      argb_dst[0] = argb_src[0];
+      argb_dst[xsize - 1] = argb_src[xsize - 1];
+      for (x = 1; x < xsize - 1; ++x) {
+        if (IsSmooth(prev_row, curr_row, next_row, x, limit)) {
+          argb_dst[x] = curr_row[x];
+        } else {
+          argb_dst[x] = ClosestDiscretizedArgb(curr_row[x], limit_bits);
+        }
+      }
+    }
+    {
+      // Three-way swap.
+      uint32_t* const temp = prev_row;
+      prev_row = curr_row;
+      curr_row = next_row;
+      next_row = temp;
+    }
+  }
+}
+
+int VP8ApplyNearLossless(const WebPPicture* const picture, int quality,
+                         uint32_t* const argb_dst) {
+  int i;
+  const int xsize = picture->width;
+  const int ysize = picture->height;
+  const int stride = picture->argb_stride;
+  uint32_t* const copy_buffer =
+      (uint32_t*)WebPSafeMalloc(xsize * 3, sizeof(*copy_buffer));
+  const int limit_bits = VP8LNearLosslessBits(quality);
+  assert(argb_dst != NULL);
+  assert(limit_bits > 0);
+  assert(limit_bits <= MAX_LIMIT_BITS);
+  if (copy_buffer == NULL) {
+    return 0;
+  }
+  // For small icon images, don't attempt to apply near-lossless compression.
+  if ((xsize < MIN_DIM_FOR_NEAR_LOSSLESS &&
+       ysize < MIN_DIM_FOR_NEAR_LOSSLESS) ||
+      ysize < 3) {
+    for (i = 0; i < ysize; ++i) {
+      memcpy(argb_dst + i * xsize, picture->argb + i * picture->argb_stride,
+             xsize * sizeof(*argb_dst));
+    }
+    WebPSafeFree(copy_buffer);
+    return 1;
+  }
+
+  NearLossless(xsize, ysize, picture->argb, stride, limit_bits, copy_buffer,
+               argb_dst);
+  for (i = limit_bits - 1; i != 0; --i) {
+    NearLossless(xsize, ysize, argb_dst, xsize, i, copy_buffer, argb_dst);
+  }
+  WebPSafeFree(copy_buffer);
+  return 1;
+}
+#else  // (WEBP_NEAR_LOSSLESS == 1)
+
+// Define a stub to suppress compiler warnings.
+extern void VP8LNearLosslessStub(void);
+void VP8LNearLosslessStub(void) {}
+
+#endif  // (WEBP_NEAR_LOSSLESS == 1)
diff --git a/media/libwebp/enc/picture_csp_enc.c b/media/libwebp/enc/picture_csp_enc.c
new file mode 100644
index 0000000000..3dd5d380e8
--- /dev/null
+++ b/media/libwebp/enc/picture_csp_enc.c
@@ -0,0 +1,1210 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// WebPPicture utils for colorspace conversion
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <assert.h>
+#include <stdlib.h>
+#include <math.h>
+
+#include "../enc/vp8i_enc.h"
+#include "../utils/random_utils.h"
+#include "../utils/utils.h"
+#include "../dsp/dsp.h"
+#include "../dsp/lossless.h"
+#include "../dsp/yuv.h"
+
+// Uncomment to disable gamma-compression during RGB->U/V averaging
+#define USE_GAMMA_COMPRESSION
+
+// If defined, use table to compute x / alpha.
+#define USE_INVERSE_ALPHA_TABLE
+
+#ifdef WORDS_BIGENDIAN
+// uint32_t 0xff000000 is 0xff,00,00,00 in memory
+#define CHANNEL_OFFSET(i) (i)
+#else
+// uint32_t 0xff000000 is 0x00,00,00,ff in memory
+#define CHANNEL_OFFSET(i) (3-(i))
+#endif
+
+#define ALPHA_OFFSET CHANNEL_OFFSET(0)
+
+//------------------------------------------------------------------------------
+// Detection of non-trivial transparency
+
+// Returns true if alpha[] has non-0xff values.
+static int CheckNonOpaque(const uint8_t* alpha, int width, int height,
+                          int x_step, int y_step) {
+  if (alpha == NULL) return 0;
+  WebPInitAlphaProcessing();
+  if (x_step == 1) {
+    for (; height-- > 0; alpha += y_step) {
+      if (WebPHasAlpha8b(alpha, width)) return 1;
+    }
+  } else {
+    for (; height-- > 0; alpha += y_step) {
+      if (WebPHasAlpha32b(alpha, width)) return 1;
+    }
+  }
+  return 0;
+}
+
+// Checking for the presence of non-opaque alpha.
+int WebPPictureHasTransparency(const WebPPicture* picture) {
+  if (picture == NULL) return 0;
+  if (picture->use_argb) {
+    const int alpha_offset = ALPHA_OFFSET;
+    return CheckNonOpaque((const uint8_t*)picture->argb + alpha_offset,
+                          picture->width, picture->height,
+                          4, picture->argb_stride * sizeof(*picture->argb));
+  }
+  return CheckNonOpaque(picture->a, picture->width, picture->height,
+                        1, picture->a_stride);
+}
+
+//------------------------------------------------------------------------------
+// Code for gamma correction
+
+#if defined(USE_GAMMA_COMPRESSION)
+
+// gamma-compensates loss of resolution during chroma subsampling
+#define kGamma 0.80      // for now we use a different gamma value than kGammaF
+#define kGammaFix 12     // fixed-point precision for linear values
+#define kGammaScale ((1 << kGammaFix) - 1)
+#define kGammaTabFix 7   // fixed-point fractional bits precision
+#define kGammaTabScale (1 << kGammaTabFix)
+#define kGammaTabRounder (kGammaTabScale >> 1)
+#define kGammaTabSize (1 << (kGammaFix - kGammaTabFix))
+
+static int kLinearToGammaTab[kGammaTabSize + 1];
+static uint16_t kGammaToLinearTab[256];
+static volatile int kGammaTablesOk = 0;
+static void InitGammaTables(void);
+
+WEBP_DSP_INIT_FUNC(InitGammaTables) {
+  if (!kGammaTablesOk) {
+    int v;
+    const double scale = (double)(1 << kGammaTabFix) / kGammaScale;
+    const double norm = 1. / 255.;
+    for (v = 0; v <= 255; ++v) {
+      kGammaToLinearTab[v] =
+          (uint16_t)(pow(norm * v, kGamma) * kGammaScale + .5);
+    }
+    for (v = 0; v <= kGammaTabSize; ++v) {
+      kLinearToGammaTab[v] = (int)(255. * pow(scale * v, 1. / kGamma) + .5);
+    }
+    kGammaTablesOk = 1;
+  }
+}
+
+static WEBP_INLINE uint32_t GammaToLinear(uint8_t v) {
+  return kGammaToLinearTab[v];
+}
+
+static WEBP_INLINE int Interpolate(int v) {
+  const int tab_pos = v >> (kGammaTabFix + 2);    // integer part
+  const int x = v & ((kGammaTabScale << 2) - 1);  // fractional part
+  const int v0 = kLinearToGammaTab[tab_pos];
+  const int v1 = kLinearToGammaTab[tab_pos + 1];
+  const int y = v1 * x + v0 * ((kGammaTabScale << 2) - x);   // interpolate
+  assert(tab_pos + 1 < kGammaTabSize + 1);
+  return y;
+}
+
+// Convert a linear value 'v' to YUV_FIX+2 fixed-point precision
+// U/V value, suitable for RGBToU/V calls.
+static WEBP_INLINE int LinearToGamma(uint32_t base_value, int shift) {
+  const int y = Interpolate(base_value << shift);   // final uplifted value
+  return (y + kGammaTabRounder) >> kGammaTabFix;    // descale
+}
+
+#else
+
+static void InitGammaTables(void) {}
+static WEBP_INLINE uint32_t GammaToLinear(uint8_t v) { return v; }
+static WEBP_INLINE int LinearToGamma(uint32_t base_value, int shift) {
+  return (int)(base_value << shift);
+}
+
+#endif    // USE_GAMMA_COMPRESSION
+
+//------------------------------------------------------------------------------
+// RGB -> YUV conversion
+
+static int RGBToY(int r, int g, int b, VP8Random* const rg) {
+  return (rg == NULL) ? VP8RGBToY(r, g, b, YUV_HALF)
+                      : VP8RGBToY(r, g, b, VP8RandomBits(rg, YUV_FIX));
+}
+
+static int RGBToU(int r, int g, int b, VP8Random* const rg) {
+  return (rg == NULL) ? VP8RGBToU(r, g, b, YUV_HALF << 2)
+                      : VP8RGBToU(r, g, b, VP8RandomBits(rg, YUV_FIX + 2));
+}
+
+static int RGBToV(int r, int g, int b, VP8Random* const rg) {
+  return (rg == NULL) ? VP8RGBToV(r, g, b, YUV_HALF << 2)
+                      : VP8RGBToV(r, g, b, VP8RandomBits(rg, YUV_FIX + 2));
+}
+
+//------------------------------------------------------------------------------
+// Sharp RGB->YUV conversion
+
+static const int kNumIterations = 4;
+static const int kMinDimensionIterativeConversion = 4;
+
+// We could use SFIX=0 and only uint8_t for fixed_y_t, but it produces some
+// banding sometimes. Better use extra precision.
+#define SFIX 2                // fixed-point precision of RGB and Y/W
+typedef int16_t fixed_t;      // signed type with extra SFIX precision for UV
+typedef uint16_t fixed_y_t;   // unsigned type with extra SFIX precision for W
+
+#define SHALF (1 << SFIX >> 1)
+#define MAX_Y_T ((256 << SFIX) - 1)
+#define SROUNDER (1 << (YUV_FIX + SFIX - 1))
+
+#if defined(USE_GAMMA_COMPRESSION)
+
+// We use tables of different size and precision for the Rec709 / BT2020
+// transfer function.
+#define kGammaF (1./0.45)
+static uint32_t kLinearToGammaTabS[kGammaTabSize + 2];
+#define GAMMA_TO_LINEAR_BITS 14
+static uint32_t kGammaToLinearTabS[MAX_Y_T + 1];   // size scales with Y_FIX
+static volatile int kGammaTablesSOk = 0;
+static void InitGammaTablesS(void);
+
+WEBP_DSP_INIT_FUNC(InitGammaTablesS) {
+  assert(2 * GAMMA_TO_LINEAR_BITS < 32);  // we use uint32_t intermediate values
+  if (!kGammaTablesSOk) {
+    int v;
+    const double norm = 1. / MAX_Y_T;
+    const double scale = 1. / kGammaTabSize;
+    const double a = 0.09929682680944;
+    const double thresh = 0.018053968510807;
+    const double final_scale = 1 << GAMMA_TO_LINEAR_BITS;
+    for (v = 0; v <= MAX_Y_T; ++v) {
+      const double g = norm * v;
+      double value;
+      if (g <= thresh * 4.5) {
+        value = g / 4.5;
+      } else {
+        const double a_rec = 1. / (1. + a);
+        value = pow(a_rec * (g + a), kGammaF);
+      }
+      kGammaToLinearTabS[v] = (uint32_t)(value * final_scale + .5);
+    }
+    for (v = 0; v <= kGammaTabSize; ++v) {
+      const double g = scale * v;
+      double value;
+      if (g <= thresh) {
+        value = 4.5 * g;
+      } else {
+        value = (1. + a) * pow(g, 1. / kGammaF) - a;
+      }
+      // we already incorporate the 1/2 rounding constant here
+      kLinearToGammaTabS[v] =
+          (uint32_t)(MAX_Y_T * value) + (1 << GAMMA_TO_LINEAR_BITS >> 1);
+    }
+    // to prevent small rounding errors to cause read-overflow:
+    kLinearToGammaTabS[kGammaTabSize + 1] = kLinearToGammaTabS[kGammaTabSize];
+    kGammaTablesSOk = 1;
+  }
+}
+
+// return value has a fixed-point precision of GAMMA_TO_LINEAR_BITS
+static WEBP_INLINE uint32_t GammaToLinearS(int v) {
+  return kGammaToLinearTabS[v];
+}
+
+static WEBP_INLINE uint32_t LinearToGammaS(uint32_t value) {
+  // 'value' is in GAMMA_TO_LINEAR_BITS fractional precision
+  const uint32_t v = value * kGammaTabSize;
+  const uint32_t tab_pos = v >> GAMMA_TO_LINEAR_BITS;
+  // fractional part, in GAMMA_TO_LINEAR_BITS fixed-point precision
+  const uint32_t x = v - (tab_pos << GAMMA_TO_LINEAR_BITS);  // fractional part
+  // v0 / v1 are in GAMMA_TO_LINEAR_BITS fixed-point precision (range [0..1])
+  const uint32_t v0 = kLinearToGammaTabS[tab_pos + 0];
+  const uint32_t v1 = kLinearToGammaTabS[tab_pos + 1];
+  // Final interpolation. Note that rounding is already included.
+  const uint32_t v2 = (v1 - v0) * x;    // note: v1 >= v0.
+  const uint32_t result = v0 + (v2 >> GAMMA_TO_LINEAR_BITS);
+  return result;
+}
+
+#else
+
+static void InitGammaTablesS(void) {}
+static WEBP_INLINE uint32_t GammaToLinearS(int v) {
+  return (v << GAMMA_TO_LINEAR_BITS) / MAX_Y_T;
+}
+static WEBP_INLINE uint32_t LinearToGammaS(uint32_t value) {
+  return (MAX_Y_T * value) >> GAMMA_TO_LINEAR_BITS;
+}
+
+#endif    // USE_GAMMA_COMPRESSION
+
+//------------------------------------------------------------------------------
+
+static uint8_t clip_8b(fixed_t v) {
+  return (!(v & ~0xff)) ? (uint8_t)v : (v < 0) ? 0u : 255u;
+}
+
+static fixed_y_t clip_y(int y) {
+  return (!(y & ~MAX_Y_T)) ? (fixed_y_t)y : (y < 0) ? 0 : MAX_Y_T;
+}
+
+//------------------------------------------------------------------------------
+
+static int RGBToGray(int r, int g, int b) {
+  const int luma = 13933 * r + 46871 * g + 4732 * b + YUV_HALF;
+  return (luma >> YUV_FIX);
+}
+
+static uint32_t ScaleDown(int a, int b, int c, int d) {
+  const uint32_t A = GammaToLinearS(a);
+  const uint32_t B = GammaToLinearS(b);
+  const uint32_t C = GammaToLinearS(c);
+  const uint32_t D = GammaToLinearS(d);
+  return LinearToGammaS((A + B + C + D + 2) >> 2);
+}
+
+static WEBP_INLINE void UpdateW(const fixed_y_t* src, fixed_y_t* dst, int w) {
+  int i;
+  for (i = 0; i < w; ++i) {
+    const uint32_t R = GammaToLinearS(src[0 * w + i]);
+    const uint32_t G = GammaToLinearS(src[1 * w + i]);
+    const uint32_t B = GammaToLinearS(src[2 * w + i]);
+    const uint32_t Y = RGBToGray(R, G, B);
+    dst[i] = (fixed_y_t)LinearToGammaS(Y);
+  }
+}
+
+static void UpdateChroma(const fixed_y_t* src1, const fixed_y_t* src2,
+                         fixed_t* dst, int uv_w) {
+  int i;
+  for (i = 0; i < uv_w; ++i) {
+    const int r = ScaleDown(src1[0 * uv_w + 0], src1[0 * uv_w + 1],
+                            src2[0 * uv_w + 0], src2[0 * uv_w + 1]);
+    const int g = ScaleDown(src1[2 * uv_w + 0], src1[2 * uv_w + 1],
+                            src2[2 * uv_w + 0], src2[2 * uv_w + 1]);
+    const int b = ScaleDown(src1[4 * uv_w + 0], src1[4 * uv_w + 1],
+                            src2[4 * uv_w + 0], src2[4 * uv_w + 1]);
+    const int W = RGBToGray(r, g, b);
+    dst[0 * uv_w] = (fixed_t)(r - W);
+    dst[1 * uv_w] = (fixed_t)(g - W);
+    dst[2 * uv_w] = (fixed_t)(b - W);
+    dst  += 1;
+    src1 += 2;
+    src2 += 2;
+  }
+}
+
+static void StoreGray(const fixed_y_t* rgb, fixed_y_t* y, int w) {
+  int i;
+  for (i = 0; i < w; ++i) {
+    y[i] = RGBToGray(rgb[0 * w + i], rgb[1 * w + i], rgb[2 * w + i]);
+  }
+}
+
+//------------------------------------------------------------------------------
+
+static WEBP_INLINE fixed_y_t Filter2(int A, int B, int W0) {
+  const int v0 = (A * 3 + B + 2) >> 2;
+  return clip_y(v0 + W0);
+}
+
+//------------------------------------------------------------------------------
+
+static WEBP_INLINE fixed_y_t UpLift(uint8_t a) {  // 8bit -> SFIX
+  return ((fixed_y_t)a << SFIX) | SHALF;
+}
+
+static void ImportOneRow(const uint8_t* const r_ptr,
+                         const uint8_t* const g_ptr,
+                         const uint8_t* const b_ptr,
+                         int step,
+                         int pic_width,
+                         fixed_y_t* const dst) {
+  int i;
+  const int w = (pic_width + 1) & ~1;
+  for (i = 0; i < pic_width; ++i) {
+    const int off = i * step;
+    dst[i + 0 * w] = UpLift(r_ptr[off]);
+    dst[i + 1 * w] = UpLift(g_ptr[off]);
+    dst[i + 2 * w] = UpLift(b_ptr[off]);
+  }
+  if (pic_width & 1) {  // replicate rightmost pixel
+    dst[pic_width + 0 * w] = dst[pic_width + 0 * w - 1];
+    dst[pic_width + 1 * w] = dst[pic_width + 1 * w - 1];
+    dst[pic_width + 2 * w] = dst[pic_width + 2 * w - 1];
+  }
+}
+
+static void InterpolateTwoRows(const fixed_y_t* const best_y,
+                               const fixed_t* prev_uv,
+                               const fixed_t* cur_uv,
+                               const fixed_t* next_uv,
+                               int w,
+                               fixed_y_t* out1,
+                               fixed_y_t* out2) {
+  const int uv_w = w >> 1;
+  const int len = (w - 1) >> 1;   // length to filter
+  int k = 3;
+  while (k-- > 0) {   // process each R/G/B segments in turn
+    // special boundary case for i==0
+    out1[0] = Filter2(cur_uv[0], prev_uv[0], best_y[0]);
+    out2[0] = Filter2(cur_uv[0], next_uv[0], best_y[w]);
+
+    WebPSharpYUVFilterRow(cur_uv, prev_uv, len, best_y + 0 + 1, out1 + 1);
+    WebPSharpYUVFilterRow(cur_uv, next_uv, len, best_y + w + 1, out2 + 1);
+
+    // special boundary case for i == w - 1 when w is even
+    if (!(w & 1)) {
+      out1[w - 1] = Filter2(cur_uv[uv_w - 1], prev_uv[uv_w - 1],
+                            best_y[w - 1 + 0]);
+      out2[w - 1] = Filter2(cur_uv[uv_w - 1], next_uv[uv_w - 1],
+                            best_y[w - 1 + w]);
+    }
+    out1 += w;
+    out2 += w;
+    prev_uv += uv_w;
+    cur_uv  += uv_w;
+    next_uv += uv_w;
+  }
+}
+
+static WEBP_INLINE uint8_t ConvertRGBToY(int r, int g, int b) {
+  const int luma = 16839 * r + 33059 * g + 6420 * b + SROUNDER;
+  return clip_8b(16 + (luma >> (YUV_FIX + SFIX)));
+}
+
+static WEBP_INLINE uint8_t ConvertRGBToU(int r, int g, int b) {
+  const int u =  -9719 * r - 19081 * g + 28800 * b + SROUNDER;
+  return clip_8b(128 + (u >> (YUV_FIX + SFIX)));
+}
+
+static WEBP_INLINE uint8_t ConvertRGBToV(int r, int g, int b) {
+  const int v = +28800 * r - 24116 * g -  4684 * b + SROUNDER;
+  return clip_8b(128 + (v >> (YUV_FIX + SFIX)));
+}
+
+static int ConvertWRGBToYUV(const fixed_y_t* best_y, const fixed_t* best_uv,
+                            WebPPicture* const picture) {
+  int i, j;
+  uint8_t* dst_y = picture->y;
+  uint8_t* dst_u = picture->u;
+  uint8_t* dst_v = picture->v;
+  const fixed_t* const best_uv_base = best_uv;
+  const int w = (picture->width + 1) & ~1;
+  const int h = (picture->height + 1) & ~1;
+  const int uv_w = w >> 1;
+  const int uv_h = h >> 1;
+  for (best_uv = best_uv_base, j = 0; j < picture->height; ++j) {
+    for (i = 0; i < picture->width; ++i) {
+      const int off = (i >> 1);
+      const int W = best_y[i];
+      const int r = best_uv[off + 0 * uv_w] + W;
+      const int g = best_uv[off + 1 * uv_w] + W;
+      const int b = best_uv[off + 2 * uv_w] + W;
+      dst_y[i] = ConvertRGBToY(r, g, b);
+    }
+    best_y += w;
+    best_uv += (j & 1) * 3 * uv_w;
+    dst_y += picture->y_stride;
+  }
+  for (best_uv = best_uv_base, j = 0; j < uv_h; ++j) {
+    for (i = 0; i < uv_w; ++i) {
+      const int off = i;
+      const int r = best_uv[off + 0 * uv_w];
+      const int g = best_uv[off + 1 * uv_w];
+      const int b = best_uv[off + 2 * uv_w];
+      dst_u[i] = ConvertRGBToU(r, g, b);
+      dst_v[i] = ConvertRGBToV(r, g, b);
+    }
+    best_uv += 3 * uv_w;
+    dst_u += picture->uv_stride;
+    dst_v += picture->uv_stride;
+  }
+  return 1;
+}
+
+//------------------------------------------------------------------------------
+// Main function
+
+#define SAFE_ALLOC(W, H, T) ((T*)WebPSafeMalloc((W) * (H), sizeof(T)))
+
+static int PreprocessARGB(const uint8_t* r_ptr,
+                          const uint8_t* g_ptr,
+                          const uint8_t* b_ptr,
+                          int step, int rgb_stride,
+                          WebPPicture* const picture) {
+  // we expand the right/bottom border if needed
+  const int w = (picture->width + 1) & ~1;
+  const int h = (picture->height + 1) & ~1;
+  const int uv_w = w >> 1;
+  const int uv_h = h >> 1;
+  uint64_t prev_diff_y_sum = ~0;
+  int j, iter;
+
+  // TODO(skal): allocate one big memory chunk. But for now, it's easier
+  // for valgrind debugging to have several chunks.
+  fixed_y_t* const tmp_buffer = SAFE_ALLOC(w * 3, 2, fixed_y_t);   // scratch
+  fixed_y_t* const best_y_base = SAFE_ALLOC(w, h, fixed_y_t);
+  fixed_y_t* const target_y_base = SAFE_ALLOC(w, h, fixed_y_t);
+  fixed_y_t* const best_rgb_y = SAFE_ALLOC(w, 2, fixed_y_t);
+  fixed_t* const best_uv_base = SAFE_ALLOC(uv_w * 3, uv_h, fixed_t);
+  fixed_t* const target_uv_base = SAFE_ALLOC(uv_w * 3, uv_h, fixed_t);
+  fixed_t* const best_rgb_uv = SAFE_ALLOC(uv_w * 3, 1, fixed_t);
+  fixed_y_t* best_y = best_y_base;
+  fixed_y_t* target_y = target_y_base;
+  fixed_t* best_uv = best_uv_base;
+  fixed_t* target_uv = target_uv_base;
+  const uint64_t diff_y_threshold = (uint64_t)(3.0 * w * h);
+  int ok;
+
+  if (best_y_base == NULL || best_uv_base == NULL ||
+      target_y_base == NULL || target_uv_base == NULL ||
+      best_rgb_y == NULL || best_rgb_uv == NULL ||
+      tmp_buffer == NULL) {
+    ok = WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
+    goto End;
+  }
+  assert(picture->width >= kMinDimensionIterativeConversion);
+  assert(picture->height >= kMinDimensionIterativeConversion);
+
+  WebPInitConvertARGBToYUV();
+
+  // Import RGB samples to W/RGB representation.
+  for (j = 0; j < picture->height; j += 2) {
+    const int is_last_row = (j == picture->height - 1);
+    fixed_y_t* const src1 = tmp_buffer + 0 * w;
+    fixed_y_t* const src2 = tmp_buffer + 3 * w;
+
+    // prepare two rows of input
+    ImportOneRow(r_ptr, g_ptr, b_ptr, step, picture->width, src1);
+    if (!is_last_row) {
+      ImportOneRow(r_ptr + rgb_stride, g_ptr + rgb_stride, b_ptr + rgb_stride,
+                   step, picture->width, src2);
+    } else {
+      memcpy(src2, src1, 3 * w * sizeof(*src2));
+    }
+    StoreGray(src1, best_y + 0, w);
+    StoreGray(src2, best_y + w, w);
+
+    UpdateW(src1, target_y, w);
+    UpdateW(src2, target_y + w, w);
+    UpdateChroma(src1, src2, target_uv, uv_w);
+    memcpy(best_uv, target_uv, 3 * uv_w * sizeof(*best_uv));
+    best_y += 2 * w;
+    best_uv += 3 * uv_w;
+    target_y += 2 * w;
+    target_uv += 3 * uv_w;
+    r_ptr += 2 * rgb_stride;
+    g_ptr += 2 * rgb_stride;
+    b_ptr += 2 * rgb_stride;
+  }
+
+  // Iterate and resolve clipping conflicts.
+  for (iter = 0; iter < kNumIterations; ++iter) {
+    const fixed_t* cur_uv = best_uv_base;
+    const fixed_t* prev_uv = best_uv_base;
+    uint64_t diff_y_sum = 0;
+
+    best_y = best_y_base;
+    best_uv = best_uv_base;
+    target_y = target_y_base;
+    target_uv = target_uv_base;
+    for (j = 0; j < h; j += 2) {
+      fixed_y_t* const src1 = tmp_buffer + 0 * w;
+      fixed_y_t* const src2 = tmp_buffer + 3 * w;
+      {
+        const fixed_t* const next_uv = cur_uv + ((j < h - 2) ? 3 * uv_w : 0);
+        InterpolateTwoRows(best_y, prev_uv, cur_uv, next_uv, w, src1, src2);
+        prev_uv = cur_uv;
+        cur_uv = next_uv;
+      }
+
+      UpdateW(src1, best_rgb_y + 0 * w, w);
+      UpdateW(src2, best_rgb_y + 1 * w, w);
+      UpdateChroma(src1, src2, best_rgb_uv, uv_w);
+
+      // update two rows of Y and one row of RGB
+      diff_y_sum += WebPSharpYUVUpdateY(target_y, best_rgb_y, best_y, 2 * w);
+      WebPSharpYUVUpdateRGB(target_uv, best_rgb_uv, best_uv, 3 * uv_w);
+
+      best_y += 2 * w;
+      best_uv += 3 * uv_w;
+      target_y += 2 * w;
+      target_uv += 3 * uv_w;
+    }
+    // test exit condition
+    if (iter > 0) {
+      if (diff_y_sum < diff_y_threshold) break;
+      if (diff_y_sum > prev_diff_y_sum) break;
+    }
+    prev_diff_y_sum = diff_y_sum;
+  }
+  // final reconstruction
+  ok = ConvertWRGBToYUV(best_y_base, best_uv_base, picture);
+
+ End:
+  WebPSafeFree(best_y_base);
+  WebPSafeFree(best_uv_base);
+  WebPSafeFree(target_y_base);
+  WebPSafeFree(target_uv_base);
+  WebPSafeFree(best_rgb_y);
+  WebPSafeFree(best_rgb_uv);
+  WebPSafeFree(tmp_buffer);
+  return ok;
+}
+#undef SAFE_ALLOC
+
+//------------------------------------------------------------------------------
+// "Fast" regular RGB->YUV
+
+#define SUM4(ptr, step) LinearToGamma(                     \
+    GammaToLinear((ptr)[0]) +                              \
+    GammaToLinear((ptr)[(step)]) +                         \
+    GammaToLinear((ptr)[rgb_stride]) +                     \
+    GammaToLinear((ptr)[rgb_stride + (step)]), 0)          \
+
+#define SUM2(ptr) \
+    LinearToGamma(GammaToLinear((ptr)[0]) + GammaToLinear((ptr)[rgb_stride]), 1)
+
+#define SUM2ALPHA(ptr) ((ptr)[0] + (ptr)[rgb_stride])
+#define SUM4ALPHA(ptr) (SUM2ALPHA(ptr) + SUM2ALPHA((ptr) + 4))
+
+#if defined(USE_INVERSE_ALPHA_TABLE)
+
+static const int kAlphaFix = 19;
+// Following table is (1 << kAlphaFix) / a. The (v * kInvAlpha[a]) >> kAlphaFix
+// formula is then equal to v / a in most (99.6%) cases. Note that this table
+// and constant are adjusted very tightly to fit 32b arithmetic.
+// In particular, they use the fact that the operands for 'v / a' are actually
+// derived as v = (a0.p0 + a1.p1 + a2.p2 + a3.p3) and a = a0 + a1 + a2 + a3
+// with ai in [0..255] and pi in [0..1<<kGammaFix). The constraint to avoid
+// overflow is: kGammaFix + kAlphaFix <= 31.
+static const uint32_t kInvAlpha[4 * 0xff + 1] = {
+  0,  /* alpha = 0 */
+  524288, 262144, 174762, 131072, 104857, 87381, 74898, 65536,
+  58254, 52428, 47662, 43690, 40329, 37449, 34952, 32768,
+  30840, 29127, 27594, 26214, 24966, 23831, 22795, 21845,
+  20971, 20164, 19418, 18724, 18078, 17476, 16912, 16384,
+  15887, 15420, 14979, 14563, 14169, 13797, 13443, 13107,
+  12787, 12483, 12192, 11915, 11650, 11397, 11155, 10922,
+  10699, 10485, 10280, 10082, 9892, 9709, 9532, 9362,
+  9198, 9039, 8886, 8738, 8594, 8456, 8322, 8192,
+  8065, 7943, 7825, 7710, 7598, 7489, 7384, 7281,
+  7182, 7084, 6990, 6898, 6808, 6721, 6636, 6553,
+  6472, 6393, 6316, 6241, 6168, 6096, 6026, 5957,
+  5890, 5825, 5761, 5698, 5637, 5577, 5518, 5461,
+  5405, 5349, 5295, 5242, 5190, 5140, 5090, 5041,
+  4993, 4946, 4899, 4854, 4809, 4766, 4723, 4681,
+  4639, 4599, 4559, 4519, 4481, 4443, 4405, 4369,
+  4332, 4297, 4262, 4228, 4194, 4161, 4128, 4096,
+  4064, 4032, 4002, 3971, 3942, 3912, 3883, 3855,
+  3826, 3799, 3771, 3744, 3718, 3692, 3666, 3640,
+  3615, 3591, 3566, 3542, 3518, 3495, 3472, 3449,
+  3426, 3404, 3382, 3360, 3339, 3318, 3297, 3276,
+  3256, 3236, 3216, 3196, 3177, 3158, 3139, 3120,
+  3102, 3084, 3066, 3048, 3030, 3013, 2995, 2978,
+  2962, 2945, 2928, 2912, 2896, 2880, 2864, 2849,
+  2833, 2818, 2803, 2788, 2774, 2759, 2744, 2730,
+  2716, 2702, 2688, 2674, 2661, 2647, 2634, 2621,
+  2608, 2595, 2582, 2570, 2557, 2545, 2532, 2520,
+  2508, 2496, 2484, 2473, 2461, 2449, 2438, 2427,
+  2416, 2404, 2394, 2383, 2372, 2361, 2351, 2340,
+  2330, 2319, 2309, 2299, 2289, 2279, 2269, 2259,
+  2250, 2240, 2231, 2221, 2212, 2202, 2193, 2184,
+  2175, 2166, 2157, 2148, 2139, 2131, 2122, 2114,
+  2105, 2097, 2088, 2080, 2072, 2064, 2056, 2048,
+  2040, 2032, 2024, 2016, 2008, 2001, 1993, 1985,
+  1978, 1971, 1963, 1956, 1949, 1941, 1934, 1927,
+  1920, 1913, 1906, 1899, 1892, 1885, 1879, 1872,
+  1865, 1859, 1852, 1846, 1839, 1833, 1826, 1820,
+  1814, 1807, 1801, 1795, 1789, 1783, 1777, 1771,
+  1765, 1759, 1753, 1747, 1741, 1736, 1730, 1724,
+  1718, 1713, 1707, 1702, 1696, 1691, 1685, 1680,
+  1675, 1669, 1664, 1659, 1653, 1648, 1643, 1638,
+  1633, 1628, 1623, 1618, 1613, 1608, 1603, 1598,
+  1593, 1588, 1583, 1579, 1574, 1569, 1565, 1560,
+  1555, 1551, 1546, 1542, 1537, 1533, 1528, 1524,
+  1519, 1515, 1510, 1506, 1502, 1497, 1493, 1489,
+  1485, 1481, 1476, 1472, 1468, 1464, 1460, 1456,
+  1452, 1448, 1444, 1440, 1436, 1432, 1428, 1424,
+  1420, 1416, 1413, 1409, 1405, 1401, 1398, 1394,
+  1390, 1387, 1383, 1379, 1376, 1372, 1368, 1365,
+  1361, 1358, 1354, 1351, 1347, 1344, 1340, 1337,
+  1334, 1330, 1327, 1323, 1320, 1317, 1314, 1310,
+  1307, 1304, 1300, 1297, 1294, 1291, 1288, 1285,
+  1281, 1278, 1275, 1272, 1269, 1266, 1263, 1260,
+  1257, 1254, 1251, 1248, 1245, 1242, 1239, 1236,
+  1233, 1230, 1227, 1224, 1222, 1219, 1216, 1213,
+  1210, 1208, 1205, 1202, 1199, 1197, 1194, 1191,
+  1188, 1186, 1183, 1180, 1178, 1175, 1172, 1170,
+  1167, 1165, 1162, 1159, 1157, 1154, 1152, 1149,
+  1147, 1144, 1142, 1139, 1137, 1134, 1132, 1129,
+  1127, 1125, 1122, 1120, 1117, 1115, 1113, 1110,
+  1108, 1106, 1103, 1101, 1099, 1096, 1094, 1092,
+  1089, 1087, 1085, 1083, 1081, 1078, 1076, 1074,
+  1072, 1069, 1067, 1065, 1063, 1061, 1059, 1057,
+  1054, 1052, 1050, 1048, 1046, 1044, 1042, 1040,
+  1038, 1036, 1034, 1032, 1030, 1028, 1026, 1024,
+  1022, 1020, 1018, 1016, 1014, 1012, 1010, 1008,
+  1006, 1004, 1002, 1000, 998, 996, 994, 992,
+  991, 989, 987, 985, 983, 981, 979, 978,
+  976, 974, 972, 970, 969, 967, 965, 963,
+  961, 960, 958, 956, 954, 953, 951, 949,
+  948, 946, 944, 942, 941, 939, 937, 936,
+  934, 932, 931, 929, 927, 926, 924, 923,
+  921, 919, 918, 916, 914, 913, 911, 910,
+  908, 907, 905, 903, 902, 900, 899, 897,
+  896, 894, 893, 891, 890, 888, 887, 885,
+  884, 882, 881, 879, 878, 876, 875, 873,
+  872, 870, 869, 868, 866, 865, 863, 862,
+  860, 859, 858, 856, 855, 853, 852, 851,
+  849, 848, 846, 845, 844, 842, 841, 840,
+  838, 837, 836, 834, 833, 832, 830, 829,
+  828, 826, 825, 824, 823, 821, 820, 819,
+  817, 816, 815, 814, 812, 811, 810, 809,
+  807, 806, 805, 804, 802, 801, 800, 799,
+  798, 796, 795, 794, 793, 791, 790, 789,
+  788, 787, 786, 784, 783, 782, 781, 780,
+  779, 777, 776, 775, 774, 773, 772, 771,
+  769, 768, 767, 766, 765, 764, 763, 762,
+  760, 759, 758, 757, 756, 755, 754, 753,
+  752, 751, 750, 748, 747, 746, 745, 744,
+  743, 742, 741, 740, 739, 738, 737, 736,
+  735, 734, 733, 732, 731, 730, 729, 728,
+  727, 726, 725, 724, 723, 722, 721, 720,
+  719, 718, 717, 716, 715, 714, 713, 712,
+  711, 710, 709, 708, 707, 706, 705, 704,
+  703, 702, 701, 700, 699, 699, 698, 697,
+  696, 695, 694, 693, 692, 691, 690, 689,
+  688, 688, 687, 686, 685, 684, 683, 682,
+  681, 680, 680, 679, 678, 677, 676, 675,
+  674, 673, 673, 672, 671, 670, 669, 668,
+  667, 667, 666, 665, 664, 663, 662, 661,
+  661, 660, 659, 658, 657, 657, 656, 655,
+  654, 653, 652, 652, 651, 650, 649, 648,
+  648, 647, 646, 645, 644, 644, 643, 642,
+  641, 640, 640, 639, 638, 637, 637, 636,
+  635, 634, 633, 633, 632, 631, 630, 630,
+  629, 628, 627, 627, 626, 625, 624, 624,
+  623, 622, 621, 621, 620, 619, 618, 618,
+  617, 616, 616, 615, 614, 613, 613, 612,
+  611, 611, 610, 609, 608, 608, 607, 606,
+  606, 605, 604, 604, 603, 602, 601, 601,
+  600, 599, 599, 598, 597, 597, 596, 595,
+  595, 594, 593, 593, 592, 591, 591, 590,
+  589, 589, 588, 587, 587, 586, 585, 585,
+  584, 583, 583, 582, 581, 581, 580, 579,
+  579, 578, 578, 577, 576, 576, 575, 574,
+  574, 573, 572, 572, 571, 571, 570, 569,
+  569, 568, 568, 567, 566, 566, 565, 564,
+  564, 563, 563, 562, 561, 561, 560, 560,
+  559, 558, 558, 557, 557, 556, 555, 555,
+  554, 554, 553, 553, 552, 551, 551, 550,
+  550, 549, 548, 548, 547, 547, 546, 546,
+  545, 544, 544, 543, 543, 542, 542, 541,
+  541, 540, 539, 539, 538, 538, 537, 537,
+  536, 536, 535, 534, 534, 533, 533, 532,
+  532, 531, 531, 530, 530, 529, 529, 528,
+  527, 527, 526, 526, 525, 525, 524, 524,
+  523, 523, 522, 522, 521, 521, 520, 520,
+  519, 519, 518, 518, 517, 517, 516, 516,
+  515, 515, 514, 514
+};
+
+// Note that LinearToGamma() expects the values to be premultiplied by 4,
+// so we incorporate this factor 4 inside the DIVIDE_BY_ALPHA macro directly.
+#define DIVIDE_BY_ALPHA(sum, a)  (((sum) * kInvAlpha[(a)]) >> (kAlphaFix - 2))
+
+#else
+
+#define DIVIDE_BY_ALPHA(sum, a) (4 * (sum) / (a))
+
+#endif  // USE_INVERSE_ALPHA_TABLE
+
+static WEBP_INLINE int LinearToGammaWeighted(const uint8_t* src,
+                                             const uint8_t* a_ptr,
+                                             uint32_t total_a, int step,
+                                             int rgb_stride) {
+  const uint32_t sum =
+      a_ptr[0] * GammaToLinear(src[0]) +
+      a_ptr[step] * GammaToLinear(src[step]) +
+      a_ptr[rgb_stride] * GammaToLinear(src[rgb_stride]) +
+      a_ptr[rgb_stride + step] * GammaToLinear(src[rgb_stride + step]);
+  assert(total_a > 0 && total_a <= 4 * 0xff);
+#if defined(USE_INVERSE_ALPHA_TABLE)
+  assert((uint64_t)sum * kInvAlpha[total_a] < ((uint64_t)1 << 32));
+#endif
+  return LinearToGamma(DIVIDE_BY_ALPHA(sum, total_a), 0);
+}
+
+static WEBP_INLINE void ConvertRowToY(const uint8_t* const r_ptr,
+                                      const uint8_t* const g_ptr,
+                                      const uint8_t* const b_ptr,
+                                      int step,
+                                      uint8_t* const dst_y,
+                                      int width,
+                                      VP8Random* const rg) {
+  int i, j;
+  for (i = 0, j = 0; i < width; i += 1, j += step) {
+    dst_y[i] = RGBToY(r_ptr[j], g_ptr[j], b_ptr[j], rg);
+  }
+}
+
+static WEBP_INLINE void AccumulateRGBA(const uint8_t* const r_ptr,
+                                       const uint8_t* const g_ptr,
+                                       const uint8_t* const b_ptr,
+                                       const uint8_t* const a_ptr,
+                                       int rgb_stride,
+                                       uint16_t* dst, int width) {
+  int i, j;
+  // we loop over 2x2 blocks and produce one R/G/B/A value for each.
+  for (i = 0, j = 0; i < (width >> 1); i += 1, j += 2 * 4, dst += 4) {
+    const uint32_t a = SUM4ALPHA(a_ptr + j);
+    int r, g, b;
+    if (a == 4 * 0xff || a == 0) {
+      r = SUM4(r_ptr + j, 4);
+      g = SUM4(g_ptr + j, 4);
+      b = SUM4(b_ptr + j, 4);
+    } else {
+      r = LinearToGammaWeighted(r_ptr + j, a_ptr + j, a, 4, rgb_stride);
+      g = LinearToGammaWeighted(g_ptr + j, a_ptr + j, a, 4, rgb_stride);
+      b = LinearToGammaWeighted(b_ptr + j, a_ptr + j, a, 4, rgb_stride);
+    }
+    dst[0] = r;
+    dst[1] = g;
+    dst[2] = b;
+    dst[3] = a;
+  }
+  if (width & 1) {
+    const uint32_t a = 2u * SUM2ALPHA(a_ptr + j);
+    int r, g, b;
+    if (a == 4 * 0xff || a == 0) {
+      r = SUM2(r_ptr + j);
+      g = SUM2(g_ptr + j);
+      b = SUM2(b_ptr + j);
+    } else {
+      r = LinearToGammaWeighted(r_ptr + j, a_ptr + j, a, 0, rgb_stride);
+      g = LinearToGammaWeighted(g_ptr + j, a_ptr + j, a, 0, rgb_stride);
+      b = LinearToGammaWeighted(b_ptr + j, a_ptr + j, a, 0, rgb_stride);
+    }
+    dst[0] = r;
+    dst[1] = g;
+    dst[2] = b;
+    dst[3] = a;
+  }
+}
+
+static WEBP_INLINE void AccumulateRGB(const uint8_t* const r_ptr,
+                                      const uint8_t* const g_ptr,
+                                      const uint8_t* const b_ptr,
+                                      int step, int rgb_stride,
+                                      uint16_t* dst, int width) {
+  int i, j;
+  for (i = 0, j = 0; i < (width >> 1); i += 1, j += 2 * step, dst += 4) {
+    dst[0] = SUM4(r_ptr + j, step);
+    dst[1] = SUM4(g_ptr + j, step);
+    dst[2] = SUM4(b_ptr + j, step);
+  }
+  if (width & 1) {
+    dst[0] = SUM2(r_ptr + j);
+    dst[1] = SUM2(g_ptr + j);
+    dst[2] = SUM2(b_ptr + j);
+  }
+}
+
+static WEBP_INLINE void ConvertRowsToUV(const uint16_t* rgb,
+                                        uint8_t* const dst_u,
+                                        uint8_t* const dst_v,
+                                        int width,
+                                        VP8Random* const rg) {
+  int i;
+  for (i = 0; i < width; i += 1, rgb += 4) {
+    const int r = rgb[0], g = rgb[1], b = rgb[2];
+    dst_u[i] = RGBToU(r, g, b, rg);
+    dst_v[i] = RGBToV(r, g, b, rg);
+  }
+}
+
+static int ImportYUVAFromRGBA(const uint8_t* r_ptr,
+                              const uint8_t* g_ptr,
+                              const uint8_t* b_ptr,
+                              const uint8_t* a_ptr,
+                              int step,         // bytes per pixel
+                              int rgb_stride,   // bytes per scanline
+                              float dithering,
+                              int use_iterative_conversion,
+                              WebPPicture* const picture) {
+  int y;
+  const int width = picture->width;
+  const int height = picture->height;
+  const int has_alpha = CheckNonOpaque(a_ptr, width, height, step, rgb_stride);
+  const int is_rgb = (r_ptr < b_ptr);  // otherwise it's bgr
+
+  picture->colorspace = has_alpha ? WEBP_YUV420A : WEBP_YUV420;
+  picture->use_argb = 0;
+
+  // disable smart conversion if source is too small (overkill).
+  if (width < kMinDimensionIterativeConversion ||
+      height < kMinDimensionIterativeConversion) {
+    use_iterative_conversion = 0;
+  }
+
+  if (!WebPPictureAllocYUVA(picture, width, height)) {
+    return 0;
+  }
+  if (has_alpha) {
+    assert(step == 4);
+#if defined(USE_GAMMA_COMPRESSION) && defined(USE_INVERSE_ALPHA_TABLE)
+    assert(kAlphaFix + kGammaFix <= 31);
+#endif
+  }
+
+  if (use_iterative_conversion) {
+    InitGammaTablesS();
+    if (!PreprocessARGB(r_ptr, g_ptr, b_ptr, step, rgb_stride, picture)) {
+      return 0;
+    }
+    if (has_alpha) {
+      WebPExtractAlpha(a_ptr, rgb_stride, width, height,
+                       picture->a, picture->a_stride);
+    }
+  } else {
+    const int uv_width = (width + 1) >> 1;
+    int use_dsp = (step == 3);  // use special function in this case
+    // temporary storage for accumulated R/G/B values during conversion to U/V
+    uint16_t* const tmp_rgb =
+        (uint16_t*)WebPSafeMalloc(4 * uv_width, sizeof(*tmp_rgb));
+    uint8_t* dst_y = picture->y;
+    uint8_t* dst_u = picture->u;
+    uint8_t* dst_v = picture->v;
+    uint8_t* dst_a = picture->a;
+
+    VP8Random base_rg;
+    VP8Random* rg = NULL;
+    if (dithering > 0.) {
+      VP8InitRandom(&base_rg, dithering);
+      rg = &base_rg;
+      use_dsp = 0;   // can't use dsp in this case
+    }
+    WebPInitConvertARGBToYUV();
+    InitGammaTables();
+
+    if (tmp_rgb == NULL) return 0;  // malloc error
+
+    // Downsample Y/U/V planes, two rows at a time
+    for (y = 0; y < (height >> 1); ++y) {
+      int rows_have_alpha = has_alpha;
+      if (use_dsp) {
+        if (is_rgb) {
+          WebPConvertRGB24ToY(r_ptr, dst_y, width);
+          WebPConvertRGB24ToY(r_ptr + rgb_stride,
+                              dst_y + picture->y_stride, width);
+        } else {
+          WebPConvertBGR24ToY(b_ptr, dst_y, width);
+          WebPConvertBGR24ToY(b_ptr + rgb_stride,
+                              dst_y + picture->y_stride, width);
+        }
+      } else {
+        ConvertRowToY(r_ptr, g_ptr, b_ptr, step, dst_y, width, rg);
+        ConvertRowToY(r_ptr + rgb_stride,
+                      g_ptr + rgb_stride,
+                      b_ptr + rgb_stride, step,
+                      dst_y + picture->y_stride, width, rg);
+      }
+      dst_y += 2 * picture->y_stride;
+      if (has_alpha) {
+        rows_have_alpha &= !WebPExtractAlpha(a_ptr, rgb_stride, width, 2,
+                                             dst_a, picture->a_stride);
+        dst_a += 2 * picture->a_stride;
+      }
+      // Collect averaged R/G/B(/A)
+      if (!rows_have_alpha) {
+        AccumulateRGB(r_ptr, g_ptr, b_ptr, step, rgb_stride, tmp_rgb, width);
+      } else {
+        AccumulateRGBA(r_ptr, g_ptr, b_ptr, a_ptr, rgb_stride, tmp_rgb, width);
+      }
+      // Convert to U/V
+      if (rg == NULL) {
+        WebPConvertRGBA32ToUV(tmp_rgb, dst_u, dst_v, uv_width);
+      } else {
+        ConvertRowsToUV(tmp_rgb, dst_u, dst_v, uv_width, rg);
+      }
+      dst_u += picture->uv_stride;
+      dst_v += picture->uv_stride;
+      r_ptr += 2 * rgb_stride;
+      b_ptr += 2 * rgb_stride;
+      g_ptr += 2 * rgb_stride;
+      if (has_alpha) a_ptr += 2 * rgb_stride;
+    }
+    if (height & 1) {    // extra last row
+      int row_has_alpha = has_alpha;
+      if (use_dsp) {
+        if (r_ptr < b_ptr) {
+          WebPConvertRGB24ToY(r_ptr, dst_y, width);
+        } else {
+          WebPConvertBGR24ToY(b_ptr, dst_y, width);
+        }
+      } else {
+        ConvertRowToY(r_ptr, g_ptr, b_ptr, step, dst_y, width, rg);
+      }
+      if (row_has_alpha) {
+        row_has_alpha &= !WebPExtractAlpha(a_ptr, 0, width, 1, dst_a, 0);
+      }
+      // Collect averaged R/G/B(/A)
+      if (!row_has_alpha) {
+        // Collect averaged R/G/B
+        AccumulateRGB(r_ptr, g_ptr, b_ptr, step, /* rgb_stride = */ 0,
+                      tmp_rgb, width);
+      } else {
+        AccumulateRGBA(r_ptr, g_ptr, b_ptr, a_ptr, /* rgb_stride = */ 0,
+                       tmp_rgb, width);
+      }
+      if (rg == NULL) {
+        WebPConvertRGBA32ToUV(tmp_rgb, dst_u, dst_v, uv_width);
+      } else {
+        ConvertRowsToUV(tmp_rgb, dst_u, dst_v, uv_width, rg);
+      }
+    }
+    WebPSafeFree(tmp_rgb);
+  }
+  return 1;
+}
+
+#undef SUM4
+#undef SUM2
+#undef SUM4ALPHA
+#undef SUM2ALPHA
+
+//------------------------------------------------------------------------------
+// call for ARGB->YUVA conversion
+
+static int PictureARGBToYUVA(WebPPicture* picture, WebPEncCSP colorspace,
+                             float dithering, int use_iterative_conversion) {
+  if (picture == NULL) return 0;
+  if (picture->argb == NULL) {
+    return WebPEncodingSetError(picture, VP8_ENC_ERROR_NULL_PARAMETER);
+  } else if ((colorspace & WEBP_CSP_UV_MASK) != WEBP_YUV420) {
+    return WebPEncodingSetError(picture, VP8_ENC_ERROR_INVALID_CONFIGURATION);
+  } else {
+    const uint8_t* const argb = (const uint8_t*)picture->argb;
+    const uint8_t* const a = argb + CHANNEL_OFFSET(0);
+    const uint8_t* const r = argb + CHANNEL_OFFSET(1);
+    const uint8_t* const g = argb + CHANNEL_OFFSET(2);
+    const uint8_t* const b = argb + CHANNEL_OFFSET(3);
+
+    picture->colorspace = WEBP_YUV420;
+    return ImportYUVAFromRGBA(r, g, b, a, 4, 4 * picture->argb_stride,
+                              dithering, use_iterative_conversion, picture);
+  }
+}
+
+int WebPPictureARGBToYUVADithered(WebPPicture* picture, WebPEncCSP colorspace,
+                                  float dithering) {
+  return PictureARGBToYUVA(picture, colorspace, dithering, 0);
+}
+
+int WebPPictureARGBToYUVA(WebPPicture* picture, WebPEncCSP colorspace) {
+  return PictureARGBToYUVA(picture, colorspace, 0.f, 0);
+}
+
+int WebPPictureSharpARGBToYUVA(WebPPicture* picture) {
+  return PictureARGBToYUVA(picture, WEBP_YUV420, 0.f, 1);
+}
+// for backward compatibility
+int WebPPictureSmartARGBToYUVA(WebPPicture* picture) {
+  return WebPPictureSharpARGBToYUVA(picture);
+}
+
+//------------------------------------------------------------------------------
+// call for YUVA -> ARGB conversion
+
+int WebPPictureYUVAToARGB(WebPPicture* picture) {
+  if (picture == NULL) return 0;
+  if (picture->y == NULL || picture->u == NULL || picture->v == NULL) {
+    return WebPEncodingSetError(picture, VP8_ENC_ERROR_NULL_PARAMETER);
+  }
+  if ((picture->colorspace & WEBP_CSP_ALPHA_BIT) && picture->a == NULL) {
+    return WebPEncodingSetError(picture, VP8_ENC_ERROR_NULL_PARAMETER);
+  }
+  if ((picture->colorspace & WEBP_CSP_UV_MASK) != WEBP_YUV420) {
+    return WebPEncodingSetError(picture, VP8_ENC_ERROR_INVALID_CONFIGURATION);
+  }
+  // Allocate a new argb buffer (discarding the previous one).
+  if (!WebPPictureAllocARGB(picture, picture->width, picture->height)) return 0;
+  picture->use_argb = 1;
+
+  // Convert
+  {
+    int y;
+    const int width = picture->width;
+    const int height = picture->height;
+    const int argb_stride = 4 * picture->argb_stride;
+    uint8_t* dst = (uint8_t*)picture->argb;
+    const uint8_t* cur_u = picture->u, *cur_v = picture->v, *cur_y = picture->y;
+    WebPUpsampleLinePairFunc upsample =
+        WebPGetLinePairConverter(ALPHA_OFFSET > 0);
+
+    // First row, with replicated top samples.
+    upsample(cur_y, NULL, cur_u, cur_v, cur_u, cur_v, dst, NULL, width);
+    cur_y += picture->y_stride;
+    dst += argb_stride;
+    // Center rows.
+    for (y = 1; y + 1 < height; y += 2) {
+      const uint8_t* const top_u = cur_u;
+      const uint8_t* const top_v = cur_v;
+      cur_u += picture->uv_stride;
+      cur_v += picture->uv_stride;
+      upsample(cur_y, cur_y + picture->y_stride, top_u, top_v, cur_u, cur_v,
+               dst, dst + argb_stride, width);
+      cur_y += 2 * picture->y_stride;
+      dst += 2 * argb_stride;
+    }
+    // Last row (if needed), with replicated bottom samples.
+    if (height > 1 && !(height & 1)) {
+      upsample(cur_y, NULL, cur_u, cur_v, cur_u, cur_v, dst, NULL, width);
+    }
+    // Insert alpha values if needed, in replacement for the default 0xff ones.
+    if (picture->colorspace & WEBP_CSP_ALPHA_BIT) {
+      for (y = 0; y < height; ++y) {
+        uint32_t* const argb_dst = picture->argb + y * picture->argb_stride;
+        const uint8_t* const src = picture->a + y * picture->a_stride;
+        int x;
+        for (x = 0; x < width; ++x) {
+          argb_dst[x] = (argb_dst[x] & 0x00ffffffu) | ((uint32_t)src[x] << 24);
+        }
+      }
+    }
+  }
+  return 1;
+}
+
+//------------------------------------------------------------------------------
+// automatic import / conversion
+
+static int Import(WebPPicture* const picture,
+                  const uint8_t* rgb, int rgb_stride,
+                  int step, int swap_rb, int import_alpha) {
+  int y;
+  // swap_rb -> b,g,r,a , !swap_rb -> r,g,b,a
+  const uint8_t* r_ptr = rgb + (swap_rb ? 2 : 0);
+  const uint8_t* g_ptr = rgb + 1;
+  const uint8_t* b_ptr = rgb + (swap_rb ? 0 : 2);
+  const int width = picture->width;
+  const int height = picture->height;
+
+  if (!picture->use_argb) {
+    const uint8_t* a_ptr = import_alpha ? rgb + 3 : NULL;
+    return ImportYUVAFromRGBA(r_ptr, g_ptr, b_ptr, a_ptr, step, rgb_stride,
+                              0.f /* no dithering */, 0, picture);
+  }
+  if (!WebPPictureAlloc(picture)) return 0;
+
+  VP8LDspInit();
+  WebPInitAlphaProcessing();
+
+  if (import_alpha) {
+    // dst[] byte order is {a,r,g,b} for big-endian, {b,g,r,a} for little endian
+    uint32_t* dst = picture->argb;
+    const int do_copy = (ALPHA_OFFSET == 3) && swap_rb;
+    assert(step == 4);
+    if (do_copy) {
+      for (y = 0; y < height; ++y) {
+        memcpy(dst, rgb, width * 4);
+        rgb += rgb_stride;
+        dst += picture->argb_stride;
+      }
+    } else {
+      for (y = 0; y < height; ++y) {
+#ifdef WORDS_BIGENDIAN
+        // BGRA or RGBA input order.
+        const uint8_t* a_ptr = rgb + 3;
+        WebPPackARGB(a_ptr, r_ptr, g_ptr, b_ptr, width, dst);
+        r_ptr += rgb_stride;
+        g_ptr += rgb_stride;
+        b_ptr += rgb_stride;
+#else
+        // RGBA input order. Need to swap R and B.
+        VP8LConvertBGRAToRGBA((const uint32_t*)rgb, width, (uint8_t*)dst);
+#endif
+        rgb += rgb_stride;
+        dst += picture->argb_stride;
+      }
+    }
+  } else {
+    uint32_t* dst = picture->argb;
+    assert(step >= 3);
+    for (y = 0; y < height; ++y) {
+      WebPPackRGB(r_ptr, g_ptr, b_ptr, width, step, dst);
+      r_ptr += rgb_stride;
+      g_ptr += rgb_stride;
+      b_ptr += rgb_stride;
+      dst += picture->argb_stride;
+    }
+  }
+  return 1;
+}
+
+// Public API
+
+#if !defined(WEBP_REDUCE_CSP)
+
+int WebPPictureImportBGR(WebPPicture* picture,
+                         const uint8_t* rgb, int rgb_stride) {
+  return (picture != NULL && rgb != NULL)
+             ? Import(picture, rgb, rgb_stride, 3, 1, 0)
+             : 0;
+}
+
+int WebPPictureImportBGRA(WebPPicture* picture,
+                          const uint8_t* rgba, int rgba_stride) {
+  return (picture != NULL && rgba != NULL)
+             ? Import(picture, rgba, rgba_stride, 4, 1, 1)
+             : 0;
+}
+
+
+int WebPPictureImportBGRX(WebPPicture* picture,
+                          const uint8_t* rgba, int rgba_stride) {
+  return (picture != NULL && rgba != NULL)
+             ? Import(picture, rgba, rgba_stride, 4, 1, 0)
+             : 0;
+}
+
+#endif   // WEBP_REDUCE_CSP
+
+int WebPPictureImportRGB(WebPPicture* picture,
+                         const uint8_t* rgb, int rgb_stride) {
+  return (picture != NULL && rgb != NULL)
+             ? Import(picture, rgb, rgb_stride, 3, 0, 0)
+             : 0;
+}
+
+int WebPPictureImportRGBA(WebPPicture* picture,
+                          const uint8_t* rgba, int rgba_stride) {
+  return (picture != NULL && rgba != NULL)
+             ? Import(picture, rgba, rgba_stride, 4, 0, 1)
+             : 0;
+}
+
+int WebPPictureImportRGBX(WebPPicture* picture,
+                          const uint8_t* rgba, int rgba_stride) {
+  return (picture != NULL && rgba != NULL)
+             ? Import(picture, rgba, rgba_stride, 4, 0, 0)
+             : 0;
+}
+
+//------------------------------------------------------------------------------
diff --git a/media/libwebp/enc/picture_enc.c b/media/libwebp/enc/picture_enc.c
new file mode 100644
index 0000000000..5275ba9ed2
--- /dev/null
+++ b/media/libwebp/enc/picture_enc.c
@@ -0,0 +1,296 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// WebPPicture class basis
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <assert.h>
+#include <stdlib.h>
+
+#include "../enc/vp8i_enc.h"
+#include "../dsp/dsp.h"
+#include "../utils/utils.h"
+
+//------------------------------------------------------------------------------
+// WebPPicture
+//------------------------------------------------------------------------------
+
+static int DummyWriter(const uint8_t* data, size_t data_size,
+                       const WebPPicture* const picture) {
+  // The following are to prevent 'unused variable' error message.
+  (void)data;
+  (void)data_size;
+  (void)picture;
+  return 1;
+}
+
+int WebPPictureInitInternal(WebPPicture* picture, int version) {
+  if (WEBP_ABI_IS_INCOMPATIBLE(version, WEBP_ENCODER_ABI_VERSION)) {
+    return 0;   // caller/system version mismatch!
+  }
+  if (picture != NULL) {
+    memset(picture, 0, sizeof(*picture));
+    picture->writer = DummyWriter;
+    WebPEncodingSetError(picture, VP8_ENC_OK);
+  }
+  return 1;
+}
+
+//------------------------------------------------------------------------------
+
+static void WebPPictureResetBufferARGB(WebPPicture* const picture) {
+  picture->memory_argb_ = NULL;
+  picture->argb = NULL;
+  picture->argb_stride = 0;
+}
+
+static void WebPPictureResetBufferYUVA(WebPPicture* const picture) {
+  picture->memory_ = NULL;
+  picture->y = picture->u = picture->v = picture->a = NULL;
+  picture->y_stride = picture->uv_stride = 0;
+  picture->a_stride = 0;
+}
+
+void WebPPictureResetBuffers(WebPPicture* const picture) {
+  WebPPictureResetBufferARGB(picture);
+  WebPPictureResetBufferYUVA(picture);
+}
+
+int WebPPictureAllocARGB(WebPPicture* const picture, int width, int height) {
+  void* memory;
+  const uint64_t argb_size = (uint64_t)width * height;
+
+  assert(picture != NULL);
+
+  WebPSafeFree(picture->memory_argb_);
+  WebPPictureResetBufferARGB(picture);
+
+  if (width <= 0 || height <= 0) {
+    return WebPEncodingSetError(picture, VP8_ENC_ERROR_BAD_DIMENSION);
+  }
+  // allocate a new buffer.
+  memory = WebPSafeMalloc(argb_size + WEBP_ALIGN_CST, sizeof(*picture->argb));
+  if (memory == NULL) {
+    return WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
+  }
+  picture->memory_argb_ = memory;
+  picture->argb = (uint32_t*)WEBP_ALIGN(memory);
+  picture->argb_stride = width;
+  return 1;
+}
+
+int WebPPictureAllocYUVA(WebPPicture* const picture, int width, int height) {
+  const WebPEncCSP uv_csp =
+      (WebPEncCSP)((int)picture->colorspace & WEBP_CSP_UV_MASK);
+  const int has_alpha = (int)picture->colorspace & WEBP_CSP_ALPHA_BIT;
+  const int y_stride = width;
+  const int uv_width = (int)(((int64_t)width + 1) >> 1);
+  const int uv_height = (int)(((int64_t)height + 1) >> 1);
+  const int uv_stride = uv_width;
+  int a_width, a_stride;
+  uint64_t y_size, uv_size, a_size, total_size;
+  uint8_t* mem;
+
+  assert(picture != NULL);
+
+  WebPSafeFree(picture->memory_);
+  WebPPictureResetBufferYUVA(picture);
+
+  if (uv_csp != WEBP_YUV420) {
+    return WebPEncodingSetError(picture, VP8_ENC_ERROR_INVALID_CONFIGURATION);
+  }
+
+  // alpha
+  a_width = has_alpha ? width : 0;
+  a_stride = a_width;
+  y_size = (uint64_t)y_stride * height;
+  uv_size = (uint64_t)uv_stride * uv_height;
+  a_size =  (uint64_t)a_stride * height;
+
+  total_size = y_size + a_size + 2 * uv_size;
+
+  // Security and validation checks
+  if (width <= 0 || height <= 0 ||           // luma/alpha param error
+      uv_width <= 0 || uv_height <= 0) {     // u/v param error
+    return WebPEncodingSetError(picture, VP8_ENC_ERROR_BAD_DIMENSION);
+  }
+  // allocate a new buffer.
+  mem = (uint8_t*)WebPSafeMalloc(total_size, sizeof(*mem));
+  if (mem == NULL) {
+    return WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
+  }
+
+  // From now on, we're in the clear, we can no longer fail...
+  picture->memory_ = (void*)mem;
+  picture->y_stride  = y_stride;
+  picture->uv_stride = uv_stride;
+  picture->a_stride  = a_stride;
+
+  // TODO(skal): we could align the y/u/v planes and adjust stride.
+  picture->y = mem;
+  mem += y_size;
+
+  picture->u = mem;
+  mem += uv_size;
+  picture->v = mem;
+  mem += uv_size;
+
+  if (a_size > 0) {
+    picture->a = mem;
+    mem += a_size;
+  }
+  (void)mem;  // makes the static analyzer happy
+  return 1;
+}
+
+int WebPPictureAlloc(WebPPicture* picture) {
+  if (picture != NULL) {
+    const int width = picture->width;
+    const int height = picture->height;
+
+    WebPPictureFree(picture);   // erase previous buffer
+
+    if (!picture->use_argb) {
+      return WebPPictureAllocYUVA(picture, width, height);
+    } else {
+      return WebPPictureAllocARGB(picture, width, height);
+    }
+  }
+  return 1;
+}
+
+void WebPPictureFree(WebPPicture* picture) {
+  if (picture != NULL) {
+    WebPSafeFree(picture->memory_);
+    WebPSafeFree(picture->memory_argb_);
+    WebPPictureResetBuffers(picture);
+  }
+}
+
+//------------------------------------------------------------------------------
+// WebPMemoryWriter: Write-to-memory
+
+void WebPMemoryWriterInit(WebPMemoryWriter* writer) {
+  writer->mem = NULL;
+  writer->size = 0;
+  writer->max_size = 0;
+}
+
+int WebPMemoryWrite(const uint8_t* data, size_t data_size,
+                    const WebPPicture* picture) {
+  WebPMemoryWriter* const w = (WebPMemoryWriter*)picture->custom_ptr;
+  uint64_t next_size;
+  if (w == NULL) {
+    return 1;
+  }
+  next_size = (uint64_t)w->size + data_size;
+  if (next_size > w->max_size) {
+    uint8_t* new_mem;
+    uint64_t next_max_size = 2ULL * w->max_size;
+    if (next_max_size < next_size) next_max_size = next_size;
+    if (next_max_size < 8192ULL) next_max_size = 8192ULL;
+    new_mem = (uint8_t*)WebPSafeMalloc(next_max_size, 1);
+    if (new_mem == NULL) {
+      return 0;
+    }
+    if (w->size > 0) {
+      memcpy(new_mem, w->mem, w->size);
+    }
+    WebPSafeFree(w->mem);
+    w->mem = new_mem;
+    // down-cast is ok, thanks to WebPSafeMalloc
+    w->max_size = (size_t)next_max_size;
+  }
+  if (data_size > 0) {
+    memcpy(w->mem + w->size, data, data_size);
+    w->size += data_size;
+  }
+  return 1;
+}
+
+void WebPMemoryWriterClear(WebPMemoryWriter* writer) {
+  if (writer != NULL) {
+    WebPSafeFree(writer->mem);
+    writer->mem = NULL;
+    writer->size = 0;
+    writer->max_size = 0;
+  }
+}
+
+//------------------------------------------------------------------------------
+// Simplest high-level calls:
+
+typedef int (*Importer)(WebPPicture* const, const uint8_t* const, int);
+
+static size_t Encode(const uint8_t* rgba, int width, int height, int stride,
+                     Importer import, float quality_factor, int lossless,
+                     uint8_t** output) {
+  WebPPicture pic;
+  WebPConfig config;
+  WebPMemoryWriter wrt;
+  int ok;
+
+  if (output == NULL) return 0;
+
+  if (!WebPConfigPreset(&config, WEBP_PRESET_DEFAULT, quality_factor) ||
+      !WebPPictureInit(&pic)) {
+    return 0;  // shouldn't happen, except if system installation is broken
+  }
+
+  config.lossless = !!lossless;
+  pic.use_argb = !!lossless;
+  pic.width = width;
+  pic.height = height;
+  pic.writer = WebPMemoryWrite;
+  pic.custom_ptr = &wrt;
+  WebPMemoryWriterInit(&wrt);
+
+  ok = import(&pic, rgba, stride) && WebPEncode(&config, &pic);
+  WebPPictureFree(&pic);
+  if (!ok) {
+    WebPMemoryWriterClear(&wrt);
+    *output = NULL;
+    return 0;
+  }
+  *output = wrt.mem;
+  return wrt.size;
+}
+
+#define ENCODE_FUNC(NAME, IMPORTER)                                     \
+size_t NAME(const uint8_t* in, int w, int h, int bps, float q,          \
+            uint8_t** out) {                                            \
+  return Encode(in, w, h, bps, IMPORTER, q, 0, out);                    \
+}
+
+ENCODE_FUNC(WebPEncodeRGB, WebPPictureImportRGB)
+ENCODE_FUNC(WebPEncodeRGBA, WebPPictureImportRGBA)
+#if !defined(WEBP_REDUCE_CSP)
+ENCODE_FUNC(WebPEncodeBGR, WebPPictureImportBGR)
+ENCODE_FUNC(WebPEncodeBGRA, WebPPictureImportBGRA)
+#endif  // WEBP_REDUCE_CSP
+
+#undef ENCODE_FUNC
+
+#define LOSSLESS_DEFAULT_QUALITY 70.
+#define LOSSLESS_ENCODE_FUNC(NAME, IMPORTER)                                 \
+size_t NAME(const uint8_t* in, int w, int h, int bps, uint8_t** out) {       \
+  return Encode(in, w, h, bps, IMPORTER, LOSSLESS_DEFAULT_QUALITY, 1, out);  \
+}
+
+LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessRGB, WebPPictureImportRGB)
+LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessRGBA, WebPPictureImportRGBA)
+#if !defined(WEBP_REDUCE_CSP)
+LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessBGR, WebPPictureImportBGR)
+LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessBGRA, WebPPictureImportBGRA)
+#endif  // WEBP_REDUCE_CSP
+
+#undef LOSSLESS_ENCODE_FUNC
+
+//------------------------------------------------------------------------------
diff --git a/media/libwebp/enc/picture_psnr_enc.c b/media/libwebp/enc/picture_psnr_enc.c
new file mode 100644
index 0000000000..bbd32854c9
--- /dev/null
+++ b/media/libwebp/enc/picture_psnr_enc.c
@@ -0,0 +1,258 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// WebPPicture tools for measuring distortion
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "../webp/encode.h"
+
+#if !(defined(WEBP_DISABLE_STATS) || defined(WEBP_REDUCE_SIZE))
+
+#include <math.h>
+#include <stdlib.h>
+
+#include "../dsp/dsp.h"
+#include "../enc/vp8i_enc.h"
+#include "../utils/utils.h"
+
+typedef double (*AccumulateFunc)(const uint8_t* src, int src_stride,
+                                 const uint8_t* ref, int ref_stride,
+                                 int w, int h);
+
+//------------------------------------------------------------------------------
+// local-min distortion
+//
+// For every pixel in the *reference* picture, we search for the local best
+// match in the compressed image. This is not a symmetrical measure.
+
+#define RADIUS 2  // search radius. Shouldn't be too large.
+
+static double AccumulateLSIM(const uint8_t* src, int src_stride,
+                             const uint8_t* ref, int ref_stride,
+                             int w, int h) {
+  int x, y;
+  double total_sse = 0.;
+  for (y = 0; y < h; ++y) {
+    const int y_0 = (y - RADIUS < 0) ? 0 : y - RADIUS;
+    const int y_1 = (y + RADIUS + 1 >= h) ? h : y + RADIUS + 1;
+    for (x = 0; x < w; ++x) {
+      const int x_0 = (x - RADIUS < 0) ? 0 : x - RADIUS;
+      const int x_1 = (x + RADIUS + 1 >= w) ? w : x + RADIUS + 1;
+      double best_sse = 255. * 255.;
+      const double value = (double)ref[y * ref_stride + x];
+      int i, j;
+      for (j = y_0; j < y_1; ++j) {
+        const uint8_t* const s = src + j * src_stride;
+        for (i = x_0; i < x_1; ++i) {
+          const double diff = s[i] - value;
+          const double sse = diff * diff;
+          if (sse < best_sse) best_sse = sse;
+        }
+      }
+      total_sse += best_sse;
+    }
+  }
+  return total_sse;
+}
+#undef RADIUS
+
+static double AccumulateSSE(const uint8_t* src, int src_stride,
+                            const uint8_t* ref, int ref_stride,
+                            int w, int h) {
+  int y;
+  double total_sse = 0.;
+  for (y = 0; y < h; ++y) {
+    total_sse += VP8AccumulateSSE(src, ref, w);
+    src += src_stride;
+    ref += ref_stride;
+  }
+  return total_sse;
+}
+
+//------------------------------------------------------------------------------
+
+static double AccumulateSSIM(const uint8_t* src, int src_stride,
+                             const uint8_t* ref, int ref_stride,
+                             int w, int h) {
+  const int w0 = (w < VP8_SSIM_KERNEL) ? w : VP8_SSIM_KERNEL;
+  const int w1 = w - VP8_SSIM_KERNEL - 1;
+  const int h0 = (h < VP8_SSIM_KERNEL) ? h : VP8_SSIM_KERNEL;
+  const int h1 = h - VP8_SSIM_KERNEL - 1;
+  int x, y;
+  double sum = 0.;
+  for (y = 0; y < h0; ++y) {
+    for (x = 0; x < w; ++x) {
+      sum += VP8SSIMGetClipped(src, src_stride, ref, ref_stride, x, y, w, h);
+    }
+  }
+  for (; y < h1; ++y) {
+    for (x = 0; x < w0; ++x) {
+      sum += VP8SSIMGetClipped(src, src_stride, ref, ref_stride, x, y, w, h);
+    }
+    for (; x < w1; ++x) {
+      const int off1 = x - VP8_SSIM_KERNEL + (y - VP8_SSIM_KERNEL) * src_stride;
+      const int off2 = x - VP8_SSIM_KERNEL + (y - VP8_SSIM_KERNEL) * ref_stride;
+      sum += VP8SSIMGet(src + off1, src_stride, ref + off2, ref_stride);
+    }
+    for (; x < w; ++x) {
+      sum += VP8SSIMGetClipped(src, src_stride, ref, ref_stride, x, y, w, h);
+    }
+  }
+  for (; y < h; ++y) {
+    for (x = 0; x < w; ++x) {
+      sum += VP8SSIMGetClipped(src, src_stride, ref, ref_stride, x, y, w, h);
+    }
+  }
+  return sum;
+}
+
+//------------------------------------------------------------------------------
+// Distortion
+
+// Max value returned in case of exact similarity.
+static const double kMinDistortion_dB = 99.;
+
+static double GetPSNR(double v, double size) {
+  return (v > 0. && size > 0.) ? -4.3429448 * log(v / (size * 255 * 255.))
+                               : kMinDistortion_dB;
+}
+
+static double GetLogSSIM(double v, double size) {
+  v = (size > 0.) ? v / size : 1.;
+  return (v < 1.) ? -10.0 * log10(1. - v) : kMinDistortion_dB;
+}
+
+int WebPPlaneDistortion(const uint8_t* src, size_t src_stride,
+                        const uint8_t* ref, size_t ref_stride,
+                        int width, int height, size_t x_step,
+                        int type, float* distortion, float* result) {
+  uint8_t* allocated = NULL;
+  const AccumulateFunc metric = (type == 0) ? AccumulateSSE :
+                                (type == 1) ? AccumulateSSIM :
+                                              AccumulateLSIM;
+  if (src == NULL || ref == NULL ||
+      src_stride < x_step * width || ref_stride < x_step * width ||
+      result == NULL || distortion == NULL) {
+    return 0;
+  }
+
+  VP8SSIMDspInit();
+  if (x_step != 1) {   // extract a packed plane if needed
+    int x, y;
+    uint8_t* tmp1;
+    uint8_t* tmp2;
+    allocated =
+        (uint8_t*)WebPSafeMalloc(2ULL * width * height, sizeof(*allocated));
+    if (allocated == NULL) return 0;
+    tmp1 = allocated;
+    tmp2 = tmp1 + (size_t)width * height;
+    for (y = 0; y < height; ++y) {
+      for (x = 0; x < width; ++x) {
+        tmp1[x + y * width] = src[x * x_step + y * src_stride];
+        tmp2[x + y * width] = ref[x * x_step + y * ref_stride];
+      }
+    }
+    src = tmp1;
+    ref = tmp2;
+  }
+  *distortion = (float)metric(src, width, ref, width, width, height);
+  WebPSafeFree(allocated);
+
+  *result = (type == 1) ? (float)GetLogSSIM(*distortion, (double)width * height)
+                        : (float)GetPSNR(*distortion, (double)width * height);
+  return 1;
+}
+
+#ifdef WORDS_BIGENDIAN
+#define BLUE_OFFSET 3   // uint32_t 0x000000ff is 0x00,00,00,ff in memory
+#else
+#define BLUE_OFFSET 0   // uint32_t 0x000000ff is 0xff,00,00,00 in memory
+#endif
+
+int WebPPictureDistortion(const WebPPicture* src, const WebPPicture* ref,
+                          int type, float results[5]) {
+  int w, h, c;
+  int ok = 0;
+  WebPPicture p0, p1;
+  double total_size = 0., total_distortion = 0.;
+  if (src == NULL || ref == NULL ||
+      src->width != ref->width || src->height != ref->height ||
+      results == NULL) {
+    return 0;
+  }
+
+  VP8SSIMDspInit();
+  if (!WebPPictureInit(&p0) || !WebPPictureInit(&p1)) return 0;
+  w = src->width;
+  h = src->height;
+  if (!WebPPictureView(src, 0, 0, w, h, &p0)) goto Error;
+  if (!WebPPictureView(ref, 0, 0, w, h, &p1)) goto Error;
+
+  // We always measure distortion in ARGB space.
+  if (p0.use_argb == 0 && !WebPPictureYUVAToARGB(&p0)) goto Error;
+  if (p1.use_argb == 0 && !WebPPictureYUVAToARGB(&p1)) goto Error;
+  for (c = 0; c < 4; ++c) {
+    float distortion;
+    const size_t stride0 = 4 * (size_t)p0.argb_stride;
+    const size_t stride1 = 4 * (size_t)p1.argb_stride;
+    // results are reported as BGRA
+    const int offset = c ^ BLUE_OFFSET;
+    if (!WebPPlaneDistortion((const uint8_t*)p0.argb + offset, stride0,
+                             (const uint8_t*)p1.argb + offset, stride1,
+                             w, h, 4, type, &distortion, results + c)) {
+      goto Error;
+    }
+    total_distortion += distortion;
+    total_size += w * h;
+  }
+
+  results[4] = (type == 1) ? (float)GetLogSSIM(total_distortion, total_size)
+                           : (float)GetPSNR(total_distortion, total_size);
+  ok = 1;
+
+ Error:
+  WebPPictureFree(&p0);
+  WebPPictureFree(&p1);
+  return ok;
+}
+
+#undef BLUE_OFFSET
+
+#else  // defined(WEBP_DISABLE_STATS)
+int WebPPlaneDistortion(const uint8_t* src, size_t src_stride,
+                        const uint8_t* ref, size_t ref_stride,
+                        int width, int height, size_t x_step,
+                        int type, float* distortion, float* result) {
+  (void)src;
+  (void)src_stride;
+  (void)ref;
+  (void)ref_stride;
+  (void)width;
+  (void)height;
+  (void)x_step;
+  (void)type;
+  if (distortion == NULL || result == NULL) return 0;
+  *distortion = 0.f;
+  *result = 0.f;
+  return 1;
+}
+
+int WebPPictureDistortion(const WebPPicture* src, const WebPPicture* ref,
+                          int type, float results[5]) {
+  int i;
+  (void)src;
+  (void)ref;
+  (void)type;
+  if (results == NULL) return 0;
+  for (i = 0; i < 5; ++i) results[i] = 0.f;
+  return 1;
+}
+
+#endif  // !defined(WEBP_DISABLE_STATS)
diff --git a/media/libwebp/enc/picture_rescale_enc.c b/media/libwebp/enc/picture_rescale_enc.c
new file mode 100644
index 0000000000..22d31363f0
--- /dev/null
+++ b/media/libwebp/enc/picture_rescale_enc.c
@@ -0,0 +1,316 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// WebPPicture tools: copy, crop, rescaling and view.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "../webp/encode.h"
+
+#if !defined(WEBP_REDUCE_SIZE)
+
+#include <assert.h>
+#include <stdlib.h>
+
+#include "../enc/vp8i_enc.h"
+#include "../utils/rescaler_utils.h"
+#include "../utils/utils.h"
+
+#define HALVE(x) (((x) + 1) >> 1)
+
+// Grab the 'specs' (writer, *opaque, width, height...) from 'src' and copy them
+// into 'dst'. Mark 'dst' as not owning any memory.
+static void PictureGrabSpecs(const WebPPicture* const src,
+                             WebPPicture* const dst) {
+  assert(src != NULL && dst != NULL);
+  *dst = *src;
+  WebPPictureResetBuffers(dst);
+}
+
+//------------------------------------------------------------------------------
+
+// Adjust top-left corner to chroma sample position.
+static void SnapTopLeftPosition(const WebPPicture* const pic,
+                                int* const left, int* const top) {
+  if (!pic->use_argb) {
+    *left &= ~1;
+    *top &= ~1;
+  }
+}
+
+// Adjust top-left corner and verify that the sub-rectangle is valid.
+static int AdjustAndCheckRectangle(const WebPPicture* const pic,
+                                   int* const left, int* const top,
+                                   int width, int height) {
+  SnapTopLeftPosition(pic, left, top);
+  if ((*left) < 0 || (*top) < 0) return 0;
+  if (width <= 0 || height <= 0) return 0;
+  if ((*left) + width > pic->width) return 0;
+  if ((*top) + height > pic->height) return 0;
+  return 1;
+}
+
+int WebPPictureCopy(const WebPPicture* src, WebPPicture* dst) {
+  if (src == NULL || dst == NULL) return 0;
+  if (src == dst) return 1;
+
+  PictureGrabSpecs(src, dst);
+  if (!WebPPictureAlloc(dst)) return 0;
+
+  if (!src->use_argb) {
+    WebPCopyPlane(src->y, src->y_stride,
+                  dst->y, dst->y_stride, dst->width, dst->height);
+    WebPCopyPlane(src->u, src->uv_stride, dst->u, dst->uv_stride,
+                  HALVE(dst->width), HALVE(dst->height));
+    WebPCopyPlane(src->v, src->uv_stride, dst->v, dst->uv_stride,
+                  HALVE(dst->width), HALVE(dst->height));
+    if (dst->a != NULL)  {
+      WebPCopyPlane(src->a, src->a_stride,
+                    dst->a, dst->a_stride, dst->width, dst->height);
+    }
+  } else {
+    WebPCopyPlane((const uint8_t*)src->argb, 4 * src->argb_stride,
+                  (uint8_t*)dst->argb, 4 * dst->argb_stride,
+                  4 * dst->width, dst->height);
+  }
+  return 1;
+}
+
+int WebPPictureIsView(const WebPPicture* picture) {
+  if (picture == NULL) return 0;
+  if (picture->use_argb) {
+    return (picture->memory_argb_ == NULL);
+  }
+  return (picture->memory_ == NULL);
+}
+
+int WebPPictureView(const WebPPicture* src,
+                    int left, int top, int width, int height,
+                    WebPPicture* dst) {
+  if (src == NULL || dst == NULL) return 0;
+
+  // verify rectangle position.
+  if (!AdjustAndCheckRectangle(src, &left, &top, width, height)) return 0;
+
+  if (src != dst) {  // beware of aliasing! We don't want to leak 'memory_'.
+    PictureGrabSpecs(src, dst);
+  }
+  dst->width = width;
+  dst->height = height;
+  if (!src->use_argb) {
+    dst->y = src->y + top * src->y_stride + left;
+    dst->u = src->u + (top >> 1) * src->uv_stride + (left >> 1);
+    dst->v = src->v + (top >> 1) * src->uv_stride + (left >> 1);
+    dst->y_stride = src->y_stride;
+    dst->uv_stride = src->uv_stride;
+    if (src->a != NULL) {
+      dst->a = src->a + top * src->a_stride + left;
+      dst->a_stride = src->a_stride;
+    }
+  } else {
+    dst->argb = src->argb + top * src->argb_stride + left;
+    dst->argb_stride = src->argb_stride;
+  }
+  return 1;
+}
+
+//------------------------------------------------------------------------------
+// Picture cropping
+
+int WebPPictureCrop(WebPPicture* pic,
+                    int left, int top, int width, int height) {
+  WebPPicture tmp;
+
+  if (pic == NULL) return 0;
+  if (!AdjustAndCheckRectangle(pic, &left, &top, width, height)) return 0;
+
+  PictureGrabSpecs(pic, &tmp);
+  tmp.width = width;
+  tmp.height = height;
+  if (!WebPPictureAlloc(&tmp)) return 0;
+
+  if (!pic->use_argb) {
+    const int y_offset = top * pic->y_stride + left;
+    const int uv_offset = (top / 2) * pic->uv_stride + left / 2;
+    WebPCopyPlane(pic->y + y_offset, pic->y_stride,
+                  tmp.y, tmp.y_stride, width, height);
+    WebPCopyPlane(pic->u + uv_offset, pic->uv_stride,
+                  tmp.u, tmp.uv_stride, HALVE(width), HALVE(height));
+    WebPCopyPlane(pic->v + uv_offset, pic->uv_stride,
+                  tmp.v, tmp.uv_stride, HALVE(width), HALVE(height));
+
+    if (tmp.a != NULL) {
+      const int a_offset = top * pic->a_stride + left;
+      WebPCopyPlane(pic->a + a_offset, pic->a_stride,
+                    tmp.a, tmp.a_stride, width, height);
+    }
+  } else {
+    const uint8_t* const src =
+        (const uint8_t*)(pic->argb + top * pic->argb_stride + left);
+    WebPCopyPlane(src, pic->argb_stride * 4, (uint8_t*)tmp.argb,
+                  tmp.argb_stride * 4, width * 4, height);
+  }
+  WebPPictureFree(pic);
+  *pic = tmp;
+  return 1;
+}
+
+//------------------------------------------------------------------------------
+// Simple picture rescaler
+
+static int RescalePlane(const uint8_t* src,
+                        int src_width, int src_height, int src_stride,
+                        uint8_t* dst,
+                        int dst_width, int dst_height, int dst_stride,
+                        rescaler_t* const work,
+                        int num_channels) {
+  WebPRescaler rescaler;
+  int y = 0;
+  if (!WebPRescalerInit(&rescaler, src_width, src_height,
+                        dst, dst_width, dst_height, dst_stride,
+                        num_channels, work)) {
+    return 0;
+  }
+  while (y < src_height) {
+    y += WebPRescalerImport(&rescaler, src_height - y,
+                            src + y * src_stride, src_stride);
+    WebPRescalerExport(&rescaler);
+  }
+  return 1;
+}
+
+static void AlphaMultiplyARGB(WebPPicture* const pic, int inverse) {
+  assert(pic->argb != NULL);
+  WebPMultARGBRows((uint8_t*)pic->argb, pic->argb_stride * sizeof(*pic->argb),
+                   pic->width, pic->height, inverse);
+}
+
+static void AlphaMultiplyY(WebPPicture* const pic, int inverse) {
+  if (pic->a != NULL) {
+    WebPMultRows(pic->y, pic->y_stride, pic->a, pic->a_stride,
+                 pic->width, pic->height, inverse);
+  }
+}
+
+int WebPPictureRescale(WebPPicture* pic, int width, int height) {
+  WebPPicture tmp;
+  int prev_width, prev_height;
+  rescaler_t* work;
+
+  if (pic == NULL) return 0;
+  prev_width = pic->width;
+  prev_height = pic->height;
+  if (!WebPRescalerGetScaledDimensions(
+          prev_width, prev_height, &width, &height)) {
+    return 0;
+  }
+
+  PictureGrabSpecs(pic, &tmp);
+  tmp.width = width;
+  tmp.height = height;
+  if (!WebPPictureAlloc(&tmp)) return 0;
+
+  if (!pic->use_argb) {
+    work = (rescaler_t*)WebPSafeMalloc(2ULL * width, sizeof(*work));
+    if (work == NULL) {
+      WebPPictureFree(&tmp);
+      return 0;
+    }
+    // If present, we need to rescale alpha first (for AlphaMultiplyY).
+    if (pic->a != NULL) {
+      WebPInitAlphaProcessing();
+      if (!RescalePlane(pic->a, prev_width, prev_height, pic->a_stride,
+                        tmp.a, width, height, tmp.a_stride, work, 1)) {
+        return 0;
+      }
+    }
+
+    // We take transparency into account on the luma plane only. That's not
+    // totally exact blending, but still is a good approximation.
+    AlphaMultiplyY(pic, 0);
+    if (!RescalePlane(pic->y, prev_width, prev_height, pic->y_stride,
+                      tmp.y, width, height, tmp.y_stride, work, 1) ||
+        !RescalePlane(pic->u,
+                      HALVE(prev_width), HALVE(prev_height), pic->uv_stride,
+                      tmp.u,
+                      HALVE(width), HALVE(height), tmp.uv_stride, work, 1) ||
+        !RescalePlane(pic->v,
+                      HALVE(prev_width), HALVE(prev_height), pic->uv_stride,
+                      tmp.v,
+                      HALVE(width), HALVE(height), tmp.uv_stride, work, 1)) {
+      return 0;
+    }
+    AlphaMultiplyY(&tmp, 1);
+  } else {
+    work = (rescaler_t*)WebPSafeMalloc(2ULL * width * 4, sizeof(*work));
+    if (work == NULL) {
+      WebPPictureFree(&tmp);
+      return 0;
+    }
+    // In order to correctly interpolate colors, we need to apply the alpha
+    // weighting first (black-matting), scale the RGB values, and remove
+    // the premultiplication afterward (while preserving the alpha channel).
+    WebPInitAlphaProcessing();
+    AlphaMultiplyARGB(pic, 0);
+    if (!RescalePlane((const uint8_t*)pic->argb, prev_width, prev_height,
+                      pic->argb_stride * 4,
+                      (uint8_t*)tmp.argb, width, height,
+                      tmp.argb_stride * 4, work, 4)) {
+      return 0;
+    }
+    AlphaMultiplyARGB(&tmp, 1);
+  }
+  WebPPictureFree(pic);
+  WebPSafeFree(work);
+  *pic = tmp;
+  return 1;
+}
+
+#else  // defined(WEBP_REDUCE_SIZE)
+
+int WebPPictureCopy(const WebPPicture* src, WebPPicture* dst) {
+  (void)src;
+  (void)dst;
+  return 0;
+}
+
+int WebPPictureIsView(const WebPPicture* picture) {
+  (void)picture;
+  return 0;
+}
+
+int WebPPictureView(const WebPPicture* src,
+                    int left, int top, int width, int height,
+                    WebPPicture* dst) {
+  (void)src;
+  (void)left;
+  (void)top;
+  (void)width;
+  (void)height;
+  (void)dst;
+  return 0;
+}
+
+int WebPPictureCrop(WebPPicture* pic,
+                    int left, int top, int width, int height) {
+  (void)pic;
+  (void)left;
+  (void)top;
+  (void)width;
+  (void)height;
+  return 0;
+}
+
+int WebPPictureRescale(WebPPicture* pic, int width, int height) {
+  (void)pic;
+  (void)width;
+  (void)height;
+  return 0;
+}
+#endif  // !defined(WEBP_REDUCE_SIZE)
diff --git a/media/libwebp/enc/picture_tools_enc.c b/media/libwebp/enc/picture_tools_enc.c
new file mode 100644
index 0000000000..02d48c5223
--- /dev/null
+++ b/media/libwebp/enc/picture_tools_enc.c
@@ -0,0 +1,273 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// WebPPicture tools: alpha handling, etc.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <assert.h>
+
+#include "../enc/vp8i_enc.h"
+#include "../dsp/yuv.h"
+
+//------------------------------------------------------------------------------
+// Helper: clean up fully transparent area to help compressibility.
+
+#define SIZE 8
+#define SIZE2 (SIZE / 2)
+static int IsTransparentARGBArea(const uint32_t* ptr, int stride, int size) {
+  int y, x;
+  for (y = 0; y < size; ++y) {
+    for (x = 0; x < size; ++x) {
+      if (ptr[x] & 0xff000000u) {
+        return 0;
+      }
+    }
+    ptr += stride;
+  }
+  return 1;
+}
+
+static void Flatten(uint8_t* ptr, int v, int stride, int size) {
+  int y;
+  for (y = 0; y < size; ++y) {
+    memset(ptr, v, size);
+    ptr += stride;
+  }
+}
+
+static void FlattenARGB(uint32_t* ptr, uint32_t v, int stride, int size) {
+  int x, y;
+  for (y = 0; y < size; ++y) {
+    for (x = 0; x < size; ++x) ptr[x] = v;
+    ptr += stride;
+  }
+}
+
+// Smoothen the luma components of transparent pixels. Return true if the whole
+// block is transparent.
+static int SmoothenBlock(const uint8_t* a_ptr, int a_stride, uint8_t* y_ptr,
+                         int y_stride, int width, int height) {
+  int sum = 0, count = 0;
+  int x, y;
+  const uint8_t* alpha_ptr = a_ptr;
+  uint8_t* luma_ptr = y_ptr;
+  for (y = 0; y < height; ++y) {
+    for (x = 0; x < width; ++x) {
+      if (alpha_ptr[x] != 0) {
+        ++count;
+        sum += luma_ptr[x];
+      }
+    }
+    alpha_ptr += a_stride;
+    luma_ptr += y_stride;
+  }
+  if (count > 0 && count < width * height) {
+    const uint8_t avg_u8 = (uint8_t)(sum / count);
+    alpha_ptr = a_ptr;
+    luma_ptr = y_ptr;
+    for (y = 0; y < height; ++y) {
+      for (x = 0; x < width; ++x) {
+        if (alpha_ptr[x] == 0) luma_ptr[x] = avg_u8;
+      }
+      alpha_ptr += a_stride;
+      luma_ptr += y_stride;
+    }
+  }
+  return (count == 0);
+}
+
+void WebPReplaceTransparentPixels(WebPPicture* const pic, uint32_t color) {
+  if (pic != NULL && pic->use_argb) {
+    int y = pic->height;
+    uint32_t* argb = pic->argb;
+    color &= 0xffffffu;   // force alpha=0
+    WebPInitAlphaProcessing();
+    while (y-- > 0) {
+      WebPAlphaReplace(argb, pic->width, color);
+      argb += pic->argb_stride;
+    }
+  }
+}
+
+void WebPCleanupTransparentArea(WebPPicture* pic) {
+  int x, y, w, h;
+  if (pic == NULL) return;
+  w = pic->width / SIZE;
+  h = pic->height / SIZE;
+
+  // note: we ignore the left-overs on right/bottom, except for SmoothenBlock().
+  if (pic->use_argb) {
+    uint32_t argb_value = 0;
+    for (y = 0; y < h; ++y) {
+      int need_reset = 1;
+      for (x = 0; x < w; ++x) {
+        const int off = (y * pic->argb_stride + x) * SIZE;
+        if (IsTransparentARGBArea(pic->argb + off, pic->argb_stride, SIZE)) {
+          if (need_reset) {
+            argb_value = pic->argb[off];
+            need_reset = 0;
+          }
+          FlattenARGB(pic->argb + off, argb_value, pic->argb_stride, SIZE);
+        } else {
+          need_reset = 1;
+        }
+      }
+    }
+  } else {
+    const int width = pic->width;
+    const int height = pic->height;
+    const int y_stride = pic->y_stride;
+    const int uv_stride = pic->uv_stride;
+    const int a_stride = pic->a_stride;
+    uint8_t* y_ptr = pic->y;
+    uint8_t* u_ptr = pic->u;
+    uint8_t* v_ptr = pic->v;
+    const uint8_t* a_ptr = pic->a;
+    int values[3] = { 0 };
+    if (a_ptr == NULL || y_ptr == NULL || u_ptr == NULL || v_ptr == NULL) {
+      return;
+    }
+    for (y = 0; y + SIZE <= height; y += SIZE) {
+      int need_reset = 1;
+      for (x = 0; x + SIZE <= width; x += SIZE) {
+        if (SmoothenBlock(a_ptr + x, a_stride, y_ptr + x, y_stride,
+                          SIZE, SIZE)) {
+          if (need_reset) {
+            values[0] = y_ptr[x];
+            values[1] = u_ptr[x >> 1];
+            values[2] = v_ptr[x >> 1];
+            need_reset = 0;
+          }
+          Flatten(y_ptr + x,        values[0], y_stride,  SIZE);
+          Flatten(u_ptr + (x >> 1), values[1], uv_stride, SIZE2);
+          Flatten(v_ptr + (x >> 1), values[2], uv_stride, SIZE2);
+        } else {
+          need_reset = 1;
+        }
+      }
+      if (x < width) {
+        SmoothenBlock(a_ptr + x, a_stride, y_ptr + x, y_stride,
+                      width - x, SIZE);
+      }
+      a_ptr += SIZE * a_stride;
+      y_ptr += SIZE * y_stride;
+      u_ptr += SIZE2 * uv_stride;
+      v_ptr += SIZE2 * uv_stride;
+    }
+    if (y < height) {
+      const int sub_height = height - y;
+      for (x = 0; x + SIZE <= width; x += SIZE) {
+        SmoothenBlock(a_ptr + x, a_stride, y_ptr + x, y_stride,
+                      SIZE, sub_height);
+      }
+      if (x < width) {
+        SmoothenBlock(a_ptr + x, a_stride, y_ptr + x, y_stride,
+                      width - x, sub_height);
+      }
+    }
+  }
+}
+
+#undef SIZE
+#undef SIZE2
+
+//------------------------------------------------------------------------------
+// Blend color and remove transparency info
+
+#define BLEND(V0, V1, ALPHA) \
+    ((((V0) * (255 - (ALPHA)) + (V1) * (ALPHA)) * 0x101 + 256) >> 16)
+#define BLEND_10BIT(V0, V1, ALPHA) \
+    ((((V0) * (1020 - (ALPHA)) + (V1) * (ALPHA)) * 0x101 + 1024) >> 18)
+
+static WEBP_INLINE uint32_t MakeARGB32(int r, int g, int b) {
+  return (0xff000000u | (r << 16) | (g << 8) | b);
+}
+
+void WebPBlendAlpha(WebPPicture* pic, uint32_t background_rgb) {
+  const int red = (background_rgb >> 16) & 0xff;
+  const int green = (background_rgb >> 8) & 0xff;
+  const int blue = (background_rgb >> 0) & 0xff;
+  int x, y;
+  if (pic == NULL) return;
+  if (!pic->use_argb) {
+    const int uv_width = (pic->width >> 1);  // omit last pixel during u/v loop
+    const int Y0 = VP8RGBToY(red, green, blue, YUV_HALF);
+    // VP8RGBToU/V expects the u/v values summed over four pixels
+    const int U0 = VP8RGBToU(4 * red, 4 * green, 4 * blue, 4 * YUV_HALF);
+    const int V0 = VP8RGBToV(4 * red, 4 * green, 4 * blue, 4 * YUV_HALF);
+    const int has_alpha = pic->colorspace & WEBP_CSP_ALPHA_BIT;
+    uint8_t* y_ptr = pic->y;
+    uint8_t* u_ptr = pic->u;
+    uint8_t* v_ptr = pic->v;
+    uint8_t* a_ptr = pic->a;
+    if (!has_alpha || a_ptr == NULL) return;    // nothing to do
+    for (y = 0; y < pic->height; ++y) {
+      // Luma blending
+      for (x = 0; x < pic->width; ++x) {
+        const uint8_t alpha = a_ptr[x];
+        if (alpha < 0xff) {
+          y_ptr[x] = BLEND(Y0, y_ptr[x], alpha);
+        }
+      }
+      // Chroma blending every even line
+      if ((y & 1) == 0) {
+        uint8_t* const a_ptr2 =
+            (y + 1 == pic->height) ? a_ptr : a_ptr + pic->a_stride;
+        for (x = 0; x < uv_width; ++x) {
+          // Average four alpha values into a single blending weight.
+          // TODO(skal): might lead to visible contouring. Can we do better?
+          const uint32_t alpha =
+              a_ptr[2 * x + 0] + a_ptr[2 * x + 1] +
+              a_ptr2[2 * x + 0] + a_ptr2[2 * x + 1];
+          u_ptr[x] = BLEND_10BIT(U0, u_ptr[x], alpha);
+          v_ptr[x] = BLEND_10BIT(V0, v_ptr[x], alpha);
+        }
+        if (pic->width & 1) {   // rightmost pixel
+          const uint32_t alpha = 2 * (a_ptr[2 * x + 0] + a_ptr2[2 * x + 0]);
+          u_ptr[x] = BLEND_10BIT(U0, u_ptr[x], alpha);
+          v_ptr[x] = BLEND_10BIT(V0, v_ptr[x], alpha);
+        }
+      } else {
+        u_ptr += pic->uv_stride;
+        v_ptr += pic->uv_stride;
+      }
+      memset(a_ptr, 0xff, pic->width);  // reset alpha value to opaque
+      a_ptr += pic->a_stride;
+      y_ptr += pic->y_stride;
+    }
+  } else {
+    uint32_t* argb = pic->argb;
+    const uint32_t background = MakeARGB32(red, green, blue);
+    for (y = 0; y < pic->height; ++y) {
+      for (x = 0; x < pic->width; ++x) {
+        const int alpha = (argb[x] >> 24) & 0xff;
+        if (alpha != 0xff) {
+          if (alpha > 0) {
+            int r = (argb[x] >> 16) & 0xff;
+            int g = (argb[x] >>  8) & 0xff;
+            int b = (argb[x] >>  0) & 0xff;
+            r = BLEND(red, r, alpha);
+            g = BLEND(green, g, alpha);
+            b = BLEND(blue, b, alpha);
+            argb[x] = MakeARGB32(r, g, b);
+          } else {
+            argb[x] = background;
+          }
+        }
+      }
+      argb += pic->argb_stride;
+    }
+  }
+}
+
+#undef BLEND
+#undef BLEND_10BIT
+
+//------------------------------------------------------------------------------
diff --git a/media/libwebp/enc/predictor_enc.c b/media/libwebp/enc/predictor_enc.c
new file mode 100644
index 0000000000..794c45cde6
--- /dev/null
+++ b/media/libwebp/enc/predictor_enc.c
@@ -0,0 +1,772 @@
+// Copyright 2016 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Image transform methods for lossless encoder.
+//
+// Authors: Vikas Arora (vikaas.arora@gmail.com)
+//          Jyrki Alakuijala (jyrki@google.com)
+//          Urvang Joshi (urvang@google.com)
+//          Vincent Rabaud (vrabaud@google.com)
+
+#include "../dsp/lossless.h"
+#include "../dsp/lossless_common.h"
+#include "../enc/vp8li_enc.h"
+
+#define MAX_DIFF_COST (1e30f)
+
+static const float kSpatialPredictorBias = 15.f;
+static const int kPredLowEffort = 11;
+static const uint32_t kMaskAlpha = 0xff000000;
+
+// Mostly used to reduce code size + readability
+static WEBP_INLINE int GetMin(int a, int b) { return (a > b) ? b : a; }
+
+//------------------------------------------------------------------------------
+// Methods to calculate Entropy (Shannon).
+
+static float PredictionCostSpatial(const int counts[256], int weight_0,
+                                   double exp_val) {
+  const int significant_symbols = 256 >> 4;
+  const double exp_decay_factor = 0.6;
+  double bits = weight_0 * counts[0];
+  int i;
+  for (i = 1; i < significant_symbols; ++i) {
+    bits += exp_val * (counts[i] + counts[256 - i]);
+    exp_val *= exp_decay_factor;
+  }
+  return (float)(-0.1 * bits);
+}
+
+static float PredictionCostSpatialHistogram(const int accumulated[4][256],
+                                            const int tile[4][256]) {
+  int i;
+  double retval = 0;
+  for (i = 0; i < 4; ++i) {
+    const double kExpValue = 0.94;
+    retval += PredictionCostSpatial(tile[i], 1, kExpValue);
+    retval += VP8LCombinedShannonEntropy(tile[i], accumulated[i]);
+  }
+  return (float)retval;
+}
+
+static WEBP_INLINE void UpdateHisto(int histo_argb[4][256], uint32_t argb) {
+  ++histo_argb[0][argb >> 24];
+  ++histo_argb[1][(argb >> 16) & 0xff];
+  ++histo_argb[2][(argb >> 8) & 0xff];
+  ++histo_argb[3][argb & 0xff];
+}
+
+//------------------------------------------------------------------------------
+// Spatial transform functions.
+
+static WEBP_INLINE void PredictBatch(int mode, int x_start, int y,
+                                     int num_pixels, const uint32_t* current,
+                                     const uint32_t* upper, uint32_t* out) {
+  if (x_start == 0) {
+    if (y == 0) {
+      // ARGB_BLACK.
+      VP8LPredictorsSub[0](current, NULL, 1, out);
+    } else {
+      // Top one.
+      VP8LPredictorsSub[2](current, upper, 1, out);
+    }
+    ++x_start;
+    ++out;
+    --num_pixels;
+  }
+  if (y == 0) {
+    // Left one.
+    VP8LPredictorsSub[1](current + x_start, NULL, num_pixels, out);
+  } else {
+    VP8LPredictorsSub[mode](current + x_start, upper + x_start, num_pixels,
+                            out);
+  }
+}
+
+#if (WEBP_NEAR_LOSSLESS == 1)
+static WEBP_INLINE int GetMax(int a, int b) { return (a < b) ? b : a; }
+
+static int MaxDiffBetweenPixels(uint32_t p1, uint32_t p2) {
+  const int diff_a = abs((int)(p1 >> 24) - (int)(p2 >> 24));
+  const int diff_r = abs((int)((p1 >> 16) & 0xff) - (int)((p2 >> 16) & 0xff));
+  const int diff_g = abs((int)((p1 >> 8) & 0xff) - (int)((p2 >> 8) & 0xff));
+  const int diff_b = abs((int)(p1 & 0xff) - (int)(p2 & 0xff));
+  return GetMax(GetMax(diff_a, diff_r), GetMax(diff_g, diff_b));
+}
+
+static int MaxDiffAroundPixel(uint32_t current, uint32_t up, uint32_t down,
+                              uint32_t left, uint32_t right) {
+  const int diff_up = MaxDiffBetweenPixels(current, up);
+  const int diff_down = MaxDiffBetweenPixels(current, down);
+  const int diff_left = MaxDiffBetweenPixels(current, left);
+  const int diff_right = MaxDiffBetweenPixels(current, right);
+  return GetMax(GetMax(diff_up, diff_down), GetMax(diff_left, diff_right));
+}
+
+static uint32_t AddGreenToBlueAndRed(uint32_t argb) {
+  const uint32_t green = (argb >> 8) & 0xff;
+  uint32_t red_blue = argb & 0x00ff00ffu;
+  red_blue += (green << 16) | green;
+  red_blue &= 0x00ff00ffu;
+  return (argb & 0xff00ff00u) | red_blue;
+}
+
+static void MaxDiffsForRow(int width, int stride, const uint32_t* const argb,
+                           uint8_t* const max_diffs, int used_subtract_green) {
+  uint32_t current, up, down, left, right;
+  int x;
+  if (width <= 2) return;
+  current = argb[0];
+  right = argb[1];
+  if (used_subtract_green) {
+    current = AddGreenToBlueAndRed(current);
+    right = AddGreenToBlueAndRed(right);
+  }
+  // max_diffs[0] and max_diffs[width - 1] are never used.
+  for (x = 1; x < width - 1; ++x) {
+    up = argb[-stride + x];
+    down = argb[stride + x];
+    left = current;
+    current = right;
+    right = argb[x + 1];
+    if (used_subtract_green) {
+      up = AddGreenToBlueAndRed(up);
+      down = AddGreenToBlueAndRed(down);
+      right = AddGreenToBlueAndRed(right);
+    }
+    max_diffs[x] = MaxDiffAroundPixel(current, up, down, left, right);
+  }
+}
+
+// Quantize the difference between the actual component value and its prediction
+// to a multiple of quantization, working modulo 256, taking care not to cross
+// a boundary (inclusive upper limit).
+static uint8_t NearLosslessComponent(uint8_t value, uint8_t predict,
+                                     uint8_t boundary, int quantization) {
+  const int residual = (value - predict) & 0xff;
+  const int boundary_residual = (boundary - predict) & 0xff;
+  const int lower = residual & ~(quantization - 1);
+  const int upper = lower + quantization;
+  // Resolve ties towards a value closer to the prediction (i.e. towards lower
+  // if value comes after prediction and towards upper otherwise).
+  const int bias = ((boundary - value) & 0xff) < boundary_residual;
+  if (residual - lower < upper - residual + bias) {
+    // lower is closer to residual than upper.
+    if (residual > boundary_residual && lower <= boundary_residual) {
+      // Halve quantization step to avoid crossing boundary. This midpoint is
+      // on the same side of boundary as residual because midpoint >= residual
+      // (since lower is closer than upper) and residual is above the boundary.
+      return lower + (quantization >> 1);
+    }
+    return lower;
+  } else {
+    // upper is closer to residual than lower.
+    if (residual <= boundary_residual && upper > boundary_residual) {
+      // Halve quantization step to avoid crossing boundary. This midpoint is
+      // on the same side of boundary as residual because midpoint <= residual
+      // (since upper is closer than lower) and residual is below the boundary.
+      return lower + (quantization >> 1);
+    }
+    return upper & 0xff;
+  }
+}
+
+static WEBP_INLINE uint8_t NearLosslessDiff(uint8_t a, uint8_t b) {
+  return (uint8_t)((((int)(a) - (int)(b))) & 0xff);
+}
+
+// Quantize every component of the difference between the actual pixel value and
+// its prediction to a multiple of a quantization (a power of 2, not larger than
+// max_quantization which is a power of 2, smaller than max_diff). Take care if
+// value and predict have undergone subtract green, which means that red and
+// blue are represented as offsets from green.
+static uint32_t NearLossless(uint32_t value, uint32_t predict,
+                             int max_quantization, int max_diff,
+                             int used_subtract_green) {
+  int quantization;
+  uint8_t new_green = 0;
+  uint8_t green_diff = 0;
+  uint8_t a, r, g, b;
+  if (max_diff <= 2) {
+    return VP8LSubPixels(value, predict);
+  }
+  quantization = max_quantization;
+  while (quantization >= max_diff) {
+    quantization >>= 1;
+  }
+  if ((value >> 24) == 0 || (value >> 24) == 0xff) {
+    // Preserve transparency of fully transparent or fully opaque pixels.
+    a = NearLosslessDiff((value >> 24) & 0xff, (predict >> 24) & 0xff);
+  } else {
+    a = NearLosslessComponent(value >> 24, predict >> 24, 0xff, quantization);
+  }
+  g = NearLosslessComponent((value >> 8) & 0xff, (predict >> 8) & 0xff, 0xff,
+                            quantization);
+  if (used_subtract_green) {
+    // The green offset will be added to red and blue components during decoding
+    // to obtain the actual red and blue values.
+    new_green = ((predict >> 8) + g) & 0xff;
+    // The amount by which green has been adjusted during quantization. It is
+    // subtracted from red and blue for compensation, to avoid accumulating two
+    // quantization errors in them.
+    green_diff = NearLosslessDiff(new_green, (value >> 8) & 0xff);
+  }
+  r = NearLosslessComponent(NearLosslessDiff((value >> 16) & 0xff, green_diff),
+                            (predict >> 16) & 0xff, 0xff - new_green,
+                            quantization);
+  b = NearLosslessComponent(NearLosslessDiff(value & 0xff, green_diff),
+                            predict & 0xff, 0xff - new_green, quantization);
+  return ((uint32_t)a << 24) | ((uint32_t)r << 16) | ((uint32_t)g << 8) | b;
+}
+#endif  // (WEBP_NEAR_LOSSLESS == 1)
+
+// Stores the difference between the pixel and its prediction in "out".
+// In case of a lossy encoding, updates the source image to avoid propagating
+// the deviation further to pixels which depend on the current pixel for their
+// predictions.
+static WEBP_INLINE void GetResidual(
+    int width, int height, uint32_t* const upper_row,
+    uint32_t* const current_row, const uint8_t* const max_diffs, int mode,
+    int x_start, int x_end, int y, int max_quantization, int exact,
+    int used_subtract_green, uint32_t* const out) {
+  if (exact) {
+    PredictBatch(mode, x_start, y, x_end - x_start, current_row, upper_row,
+                 out);
+  } else {
+    const VP8LPredictorFunc pred_func = VP8LPredictors[mode];
+    int x;
+    for (x = x_start; x < x_end; ++x) {
+      uint32_t predict;
+      uint32_t residual;
+      if (y == 0) {
+        predict = (x == 0) ? ARGB_BLACK : current_row[x - 1];  // Left.
+      } else if (x == 0) {
+        predict = upper_row[x];  // Top.
+      } else {
+        predict = pred_func(&current_row[x - 1], upper_row + x);
+      }
+#if (WEBP_NEAR_LOSSLESS == 1)
+      if (max_quantization == 1 || mode == 0 || y == 0 || y == height - 1 ||
+          x == 0 || x == width - 1) {
+        residual = VP8LSubPixels(current_row[x], predict);
+      } else {
+        residual = NearLossless(current_row[x], predict, max_quantization,
+                                max_diffs[x], used_subtract_green);
+        // Update the source image.
+        current_row[x] = VP8LAddPixels(predict, residual);
+        // x is never 0 here so we do not need to update upper_row like below.
+      }
+#else
+      (void)max_diffs;
+      (void)height;
+      (void)max_quantization;
+      (void)used_subtract_green;
+      residual = VP8LSubPixels(current_row[x], predict);
+#endif
+      if ((current_row[x] & kMaskAlpha) == 0) {
+        // If alpha is 0, cleanup RGB. We can choose the RGB values of the
+        // residual for best compression. The prediction of alpha itself can be
+        // non-zero and must be kept though. We choose RGB of the residual to be
+        // 0.
+        residual &= kMaskAlpha;
+        // Update the source image.
+        current_row[x] = predict & ~kMaskAlpha;
+        // The prediction for the rightmost pixel in a row uses the leftmost
+        // pixel
+        // in that row as its top-right context pixel. Hence if we change the
+        // leftmost pixel of current_row, the corresponding change must be
+        // applied
+        // to upper_row as well where top-right context is being read from.
+        if (x == 0 && y != 0) upper_row[width] = current_row[0];
+      }
+      out[x - x_start] = residual;
+    }
+  }
+}
+
+// Returns best predictor and updates the accumulated histogram.
+// If max_quantization > 1, assumes that near lossless processing will be
+// applied, quantizing residuals to multiples of quantization levels up to
+// max_quantization (the actual quantization level depends on smoothness near
+// the given pixel).
+static int GetBestPredictorForTile(int width, int height,
+                                   int tile_x, int tile_y, int bits,
+                                   int accumulated[4][256],
+                                   uint32_t* const argb_scratch,
+                                   const uint32_t* const argb,
+                                   int max_quantization,
+                                   int exact, int used_subtract_green,
+                                   const uint32_t* const modes) {
+  const int kNumPredModes = 14;
+  const int start_x = tile_x << bits;
+  const int start_y = tile_y << bits;
+  const int tile_size = 1 << bits;
+  const int max_y = GetMin(tile_size, height - start_y);
+  const int max_x = GetMin(tile_size, width - start_x);
+  // Whether there exist columns just outside the tile.
+  const int have_left = (start_x > 0);
+  // Position and size of the strip covering the tile and adjacent columns if
+  // they exist.
+  const int context_start_x = start_x - have_left;
+#if (WEBP_NEAR_LOSSLESS == 1)
+  const int context_width = max_x + have_left + (max_x < width - start_x);
+#endif
+  const int tiles_per_row = VP8LSubSampleSize(width, bits);
+  // Prediction modes of the left and above neighbor tiles.
+  const int left_mode = (tile_x > 0) ?
+      (modes[tile_y * tiles_per_row + tile_x - 1] >> 8) & 0xff : 0xff;
+  const int above_mode = (tile_y > 0) ?
+      (modes[(tile_y - 1) * tiles_per_row + tile_x] >> 8) & 0xff : 0xff;
+  // The width of upper_row and current_row is one pixel larger than image width
+  // to allow the top right pixel to point to the leftmost pixel of the next row
+  // when at the right edge.
+  uint32_t* upper_row = argb_scratch;
+  uint32_t* current_row = upper_row + width + 1;
+  uint8_t* const max_diffs = (uint8_t*)(current_row + width + 1);
+  float best_diff = MAX_DIFF_COST;
+  int best_mode = 0;
+  int mode;
+  int histo_stack_1[4][256];
+  int histo_stack_2[4][256];
+  // Need pointers to be able to swap arrays.
+  int (*histo_argb)[256] = histo_stack_1;
+  int (*best_histo)[256] = histo_stack_2;
+  int i, j;
+  uint32_t residuals[1 << MAX_TRANSFORM_BITS];
+  assert(bits <= MAX_TRANSFORM_BITS);
+  assert(max_x <= (1 << MAX_TRANSFORM_BITS));
+
+  for (mode = 0; mode < kNumPredModes; ++mode) {
+    float cur_diff;
+    int relative_y;
+    memset(histo_argb, 0, sizeof(histo_stack_1));
+    if (start_y > 0) {
+      // Read the row above the tile which will become the first upper_row.
+      // Include a pixel to the left if it exists; include a pixel to the right
+      // in all cases (wrapping to the leftmost pixel of the next row if it does
+      // not exist).
+      memcpy(current_row + context_start_x,
+             argb + (start_y - 1) * width + context_start_x,
+             sizeof(*argb) * (max_x + have_left + 1));
+    }
+    for (relative_y = 0; relative_y < max_y; ++relative_y) {
+      const int y = start_y + relative_y;
+      int relative_x;
+      uint32_t* tmp = upper_row;
+      upper_row = current_row;
+      current_row = tmp;
+      // Read current_row. Include a pixel to the left if it exists; include a
+      // pixel to the right in all cases except at the bottom right corner of
+      // the image (wrapping to the leftmost pixel of the next row if it does
+      // not exist in the current row).
+      memcpy(current_row + context_start_x,
+             argb + y * width + context_start_x,
+             sizeof(*argb) * (max_x + have_left + (y + 1 < height)));
+#if (WEBP_NEAR_LOSSLESS == 1)
+      if (max_quantization > 1 && y >= 1 && y + 1 < height) {
+        MaxDiffsForRow(context_width, width, argb + y * width + context_start_x,
+                       max_diffs + context_start_x, used_subtract_green);
+      }
+#endif
+
+      GetResidual(width, height, upper_row, current_row, max_diffs, mode,
+                  start_x, start_x + max_x, y, max_quantization, exact,
+                  used_subtract_green, residuals);
+      for (relative_x = 0; relative_x < max_x; ++relative_x) {
+        UpdateHisto(histo_argb, residuals[relative_x]);
+      }
+    }
+    cur_diff = PredictionCostSpatialHistogram(
+        (const int (*)[256])accumulated, (const int (*)[256])histo_argb);
+    // Favor keeping the areas locally similar.
+    if (mode == left_mode) cur_diff -= kSpatialPredictorBias;
+    if (mode == above_mode) cur_diff -= kSpatialPredictorBias;
+
+    if (cur_diff < best_diff) {
+      int (*tmp)[256] = histo_argb;
+      histo_argb = best_histo;
+      best_histo = tmp;
+      best_diff = cur_diff;
+      best_mode = mode;
+    }
+  }
+
+  for (i = 0; i < 4; i++) {
+    for (j = 0; j < 256; j++) {
+      accumulated[i][j] += best_histo[i][j];
+    }
+  }
+
+  return best_mode;
+}
+
+// Converts pixels of the image to residuals with respect to predictions.
+// If max_quantization > 1, applies near lossless processing, quantizing
+// residuals to multiples of quantization levels up to max_quantization
+// (the actual quantization level depends on smoothness near the given pixel).
+static void CopyImageWithPrediction(int width, int height,
+                                    int bits, uint32_t* const modes,
+                                    uint32_t* const argb_scratch,
+                                    uint32_t* const argb,
+                                    int low_effort, int max_quantization,
+                                    int exact, int used_subtract_green) {
+  const int tiles_per_row = VP8LSubSampleSize(width, bits);
+  // The width of upper_row and current_row is one pixel larger than image width
+  // to allow the top right pixel to point to the leftmost pixel of the next row
+  // when at the right edge.
+  uint32_t* upper_row = argb_scratch;
+  uint32_t* current_row = upper_row + width + 1;
+  uint8_t* current_max_diffs = (uint8_t*)(current_row + width + 1);
+#if (WEBP_NEAR_LOSSLESS == 1)
+  uint8_t* lower_max_diffs = current_max_diffs + width;
+#endif
+  int y;
+
+  for (y = 0; y < height; ++y) {
+    int x;
+    uint32_t* const tmp32 = upper_row;
+    upper_row = current_row;
+    current_row = tmp32;
+    memcpy(current_row, argb + y * width,
+           sizeof(*argb) * (width + (y + 1 < height)));
+
+    if (low_effort) {
+      PredictBatch(kPredLowEffort, 0, y, width, current_row, upper_row,
+                   argb + y * width);
+    } else {
+#if (WEBP_NEAR_LOSSLESS == 1)
+      if (max_quantization > 1) {
+        // Compute max_diffs for the lower row now, because that needs the
+        // contents of argb for the current row, which we will overwrite with
+        // residuals before proceeding with the next row.
+        uint8_t* const tmp8 = current_max_diffs;
+        current_max_diffs = lower_max_diffs;
+        lower_max_diffs = tmp8;
+        if (y + 2 < height) {
+          MaxDiffsForRow(width, width, argb + (y + 1) * width, lower_max_diffs,
+                         used_subtract_green);
+        }
+      }
+#endif
+      for (x = 0; x < width;) {
+        const int mode =
+            (modes[(y >> bits) * tiles_per_row + (x >> bits)] >> 8) & 0xff;
+        int x_end = x + (1 << bits);
+        if (x_end > width) x_end = width;
+        GetResidual(width, height, upper_row, current_row, current_max_diffs,
+                    mode, x, x_end, y, max_quantization, exact,
+                    used_subtract_green, argb + y * width + x);
+        x = x_end;
+      }
+    }
+  }
+}
+
+// Finds the best predictor for each tile, and converts the image to residuals
+// with respect to predictions. If near_lossless_quality < 100, applies
+// near lossless processing, shaving off more bits of residuals for lower
+// qualities.
+void VP8LResidualImage(int width, int height, int bits, int low_effort,
+                       uint32_t* const argb, uint32_t* const argb_scratch,
+                       uint32_t* const image, int near_lossless_quality,
+                       int exact, int used_subtract_green) {
+  const int tiles_per_row = VP8LSubSampleSize(width, bits);
+  const int tiles_per_col = VP8LSubSampleSize(height, bits);
+  int tile_y;
+  int histo[4][256];
+  const int max_quantization = 1 << VP8LNearLosslessBits(near_lossless_quality);
+  if (low_effort) {
+    int i;
+    for (i = 0; i < tiles_per_row * tiles_per_col; ++i) {
+      image[i] = ARGB_BLACK | (kPredLowEffort << 8);
+    }
+  } else {
+    memset(histo, 0, sizeof(histo));
+    for (tile_y = 0; tile_y < tiles_per_col; ++tile_y) {
+      int tile_x;
+      for (tile_x = 0; tile_x < tiles_per_row; ++tile_x) {
+        const int pred = GetBestPredictorForTile(width, height, tile_x, tile_y,
+            bits, histo, argb_scratch, argb, max_quantization, exact,
+            used_subtract_green, image);
+        image[tile_y * tiles_per_row + tile_x] = ARGB_BLACK | (pred << 8);
+      }
+    }
+  }
+
+  CopyImageWithPrediction(width, height, bits, image, argb_scratch, argb,
+                          low_effort, max_quantization, exact,
+                          used_subtract_green);
+}
+
+//------------------------------------------------------------------------------
+// Color transform functions.
+
+static WEBP_INLINE void MultipliersClear(VP8LMultipliers* const m) {
+  m->green_to_red_ = 0;
+  m->green_to_blue_ = 0;
+  m->red_to_blue_ = 0;
+}
+
+static WEBP_INLINE void ColorCodeToMultipliers(uint32_t color_code,
+                                               VP8LMultipliers* const m) {
+  m->green_to_red_  = (color_code >>  0) & 0xff;
+  m->green_to_blue_ = (color_code >>  8) & 0xff;
+  m->red_to_blue_   = (color_code >> 16) & 0xff;
+}
+
+static WEBP_INLINE uint32_t MultipliersToColorCode(
+    const VP8LMultipliers* const m) {
+  return 0xff000000u |
+         ((uint32_t)(m->red_to_blue_) << 16) |
+         ((uint32_t)(m->green_to_blue_) << 8) |
+         m->green_to_red_;
+}
+
+static float PredictionCostCrossColor(const int accumulated[256],
+                                      const int counts[256]) {
+  // Favor low entropy, locally and globally.
+  // Favor small absolute values for PredictionCostSpatial
+  static const double kExpValue = 2.4;
+  return VP8LCombinedShannonEntropy(counts, accumulated) +
+         PredictionCostSpatial(counts, 3, kExpValue);
+}
+
+static float GetPredictionCostCrossColorRed(
+    const uint32_t* argb, int stride, int tile_width, int tile_height,
+    VP8LMultipliers prev_x, VP8LMultipliers prev_y, int green_to_red,
+    const int accumulated_red_histo[256]) {
+  int histo[256] = { 0 };
+  float cur_diff;
+
+  VP8LCollectColorRedTransforms(argb, stride, tile_width, tile_height,
+                                green_to_red, histo);
+
+  cur_diff = PredictionCostCrossColor(accumulated_red_histo, histo);
+  if ((uint8_t)green_to_red == prev_x.green_to_red_) {
+    cur_diff -= 3;  // favor keeping the areas locally similar
+  }
+  if ((uint8_t)green_to_red == prev_y.green_to_red_) {
+    cur_diff -= 3;  // favor keeping the areas locally similar
+  }
+  if (green_to_red == 0) {
+    cur_diff -= 3;
+  }
+  return cur_diff;
+}
+
+static void GetBestGreenToRed(
+    const uint32_t* argb, int stride, int tile_width, int tile_height,
+    VP8LMultipliers prev_x, VP8LMultipliers prev_y, int quality,
+    const int accumulated_red_histo[256], VP8LMultipliers* const best_tx) {
+  const int kMaxIters = 4 + ((7 * quality) >> 8);  // in range [4..6]
+  int green_to_red_best = 0;
+  int iter, offset;
+  float best_diff = GetPredictionCostCrossColorRed(
+      argb, stride, tile_width, tile_height, prev_x, prev_y,
+      green_to_red_best, accumulated_red_histo);
+  for (iter = 0; iter < kMaxIters; ++iter) {
+    // ColorTransformDelta is a 3.5 bit fixed point, so 32 is equal to
+    // one in color computation. Having initial delta here as 1 is sufficient
+    // to explore the range of (-2, 2).
+    const int delta = 32 >> iter;
+    // Try a negative and a positive delta from the best known value.
+    for (offset = -delta; offset <= delta; offset += 2 * delta) {
+      const int green_to_red_cur = offset + green_to_red_best;
+      const float cur_diff = GetPredictionCostCrossColorRed(
+          argb, stride, tile_width, tile_height, prev_x, prev_y,
+          green_to_red_cur, accumulated_red_histo);
+      if (cur_diff < best_diff) {
+        best_diff = cur_diff;
+        green_to_red_best = green_to_red_cur;
+      }
+    }
+  }
+  best_tx->green_to_red_ = (green_to_red_best & 0xff);
+}
+
+static float GetPredictionCostCrossColorBlue(
+    const uint32_t* argb, int stride, int tile_width, int tile_height,
+    VP8LMultipliers prev_x, VP8LMultipliers prev_y,
+    int green_to_blue, int red_to_blue, const int accumulated_blue_histo[256]) {
+  int histo[256] = { 0 };
+  float cur_diff;
+
+  VP8LCollectColorBlueTransforms(argb, stride, tile_width, tile_height,
+                                 green_to_blue, red_to_blue, histo);
+
+  cur_diff = PredictionCostCrossColor(accumulated_blue_histo, histo);
+  if ((uint8_t)green_to_blue == prev_x.green_to_blue_) {
+    cur_diff -= 3;  // favor keeping the areas locally similar
+  }
+  if ((uint8_t)green_to_blue == prev_y.green_to_blue_) {
+    cur_diff -= 3;  // favor keeping the areas locally similar
+  }
+  if ((uint8_t)red_to_blue == prev_x.red_to_blue_) {
+    cur_diff -= 3;  // favor keeping the areas locally similar
+  }
+  if ((uint8_t)red_to_blue == prev_y.red_to_blue_) {
+    cur_diff -= 3;  // favor keeping the areas locally similar
+  }
+  if (green_to_blue == 0) {
+    cur_diff -= 3;
+  }
+  if (red_to_blue == 0) {
+    cur_diff -= 3;
+  }
+  return cur_diff;
+}
+
+#define kGreenRedToBlueNumAxis 8
+#define kGreenRedToBlueMaxIters 7
+static void GetBestGreenRedToBlue(
+    const uint32_t* argb, int stride, int tile_width, int tile_height,
+    VP8LMultipliers prev_x, VP8LMultipliers prev_y, int quality,
+    const int accumulated_blue_histo[256],
+    VP8LMultipliers* const best_tx) {
+  const int8_t offset[kGreenRedToBlueNumAxis][2] =
+      {{0, -1}, {0, 1}, {-1, 0}, {1, 0}, {-1, -1}, {-1, 1}, {1, -1}, {1, 1}};
+  const int8_t delta_lut[kGreenRedToBlueMaxIters] = { 16, 16, 8, 4, 2, 2, 2 };
+  const int iters =
+      (quality < 25) ? 1 : (quality > 50) ? kGreenRedToBlueMaxIters : 4;
+  int green_to_blue_best = 0;
+  int red_to_blue_best = 0;
+  int iter;
+  // Initial value at origin:
+  float best_diff = GetPredictionCostCrossColorBlue(
+      argb, stride, tile_width, tile_height, prev_x, prev_y,
+      green_to_blue_best, red_to_blue_best, accumulated_blue_histo);
+  for (iter = 0; iter < iters; ++iter) {
+    const int delta = delta_lut[iter];
+    int axis;
+    for (axis = 0; axis < kGreenRedToBlueNumAxis; ++axis) {
+      const int green_to_blue_cur =
+          offset[axis][0] * delta + green_to_blue_best;
+      const int red_to_blue_cur = offset[axis][1] * delta + red_to_blue_best;
+      const float cur_diff = GetPredictionCostCrossColorBlue(
+          argb, stride, tile_width, tile_height, prev_x, prev_y,
+          green_to_blue_cur, red_to_blue_cur, accumulated_blue_histo);
+      if (cur_diff < best_diff) {
+        best_diff = cur_diff;
+        green_to_blue_best = green_to_blue_cur;
+        red_to_blue_best = red_to_blue_cur;
+      }
+      if (quality < 25 && iter == 4) {
+        // Only axis aligned diffs for lower quality.
+        break;  // next iter.
+      }
+    }
+    if (delta == 2 && green_to_blue_best == 0 && red_to_blue_best == 0) {
+      // Further iterations would not help.
+      break;  // out of iter-loop.
+    }
+  }
+  best_tx->green_to_blue_ = green_to_blue_best & 0xff;
+  best_tx->red_to_blue_ = red_to_blue_best & 0xff;
+}
+#undef kGreenRedToBlueMaxIters
+#undef kGreenRedToBlueNumAxis
+
+static VP8LMultipliers GetBestColorTransformForTile(
+    int tile_x, int tile_y, int bits,
+    VP8LMultipliers prev_x,
+    VP8LMultipliers prev_y,
+    int quality, int xsize, int ysize,
+    const int accumulated_red_histo[256],
+    const int accumulated_blue_histo[256],
+    const uint32_t* const argb) {
+  const int max_tile_size = 1 << bits;
+  const int tile_y_offset = tile_y * max_tile_size;
+  const int tile_x_offset = tile_x * max_tile_size;
+  const int all_x_max = GetMin(tile_x_offset + max_tile_size, xsize);
+  const int all_y_max = GetMin(tile_y_offset + max_tile_size, ysize);
+  const int tile_width = all_x_max - tile_x_offset;
+  const int tile_height = all_y_max - tile_y_offset;
+  const uint32_t* const tile_argb = argb + tile_y_offset * xsize
+                                  + tile_x_offset;
+  VP8LMultipliers best_tx;
+  MultipliersClear(&best_tx);
+
+  GetBestGreenToRed(tile_argb, xsize, tile_width, tile_height,
+                    prev_x, prev_y, quality, accumulated_red_histo, &best_tx);
+  GetBestGreenRedToBlue(tile_argb, xsize, tile_width, tile_height,
+                        prev_x, prev_y, quality, accumulated_blue_histo,
+                        &best_tx);
+  return best_tx;
+}
+
+static void CopyTileWithColorTransform(int xsize, int ysize,
+                                       int tile_x, int tile_y,
+                                       int max_tile_size,
+                                       VP8LMultipliers color_transform,
+                                       uint32_t* argb) {
+  const int xscan = GetMin(max_tile_size, xsize - tile_x);
+  int yscan = GetMin(max_tile_size, ysize - tile_y);
+  argb += tile_y * xsize + tile_x;
+  while (yscan-- > 0) {
+    VP8LTransformColor(&color_transform, argb, xscan);
+    argb += xsize;
+  }
+}
+
+void VP8LColorSpaceTransform(int width, int height, int bits, int quality,
+                             uint32_t* const argb, uint32_t* image) {
+  const int max_tile_size = 1 << bits;
+  const int tile_xsize = VP8LSubSampleSize(width, bits);
+  const int tile_ysize = VP8LSubSampleSize(height, bits);
+  int accumulated_red_histo[256] = { 0 };
+  int accumulated_blue_histo[256] = { 0 };
+  int tile_x, tile_y;
+  VP8LMultipliers prev_x, prev_y;
+  MultipliersClear(&prev_y);
+  MultipliersClear(&prev_x);
+  for (tile_y = 0; tile_y < tile_ysize; ++tile_y) {
+    for (tile_x = 0; tile_x < tile_xsize; ++tile_x) {
+      int y;
+      const int tile_x_offset = tile_x * max_tile_size;
+      const int tile_y_offset = tile_y * max_tile_size;
+      const int all_x_max = GetMin(tile_x_offset + max_tile_size, width);
+      const int all_y_max = GetMin(tile_y_offset + max_tile_size, height);
+      const int offset = tile_y * tile_xsize + tile_x;
+      if (tile_y != 0) {
+        ColorCodeToMultipliers(image[offset - tile_xsize], &prev_y);
+      }
+      prev_x = GetBestColorTransformForTile(tile_x, tile_y, bits,
+                                            prev_x, prev_y,
+                                            quality, width, height,
+                                            accumulated_red_histo,
+                                            accumulated_blue_histo,
+                                            argb);
+      image[offset] = MultipliersToColorCode(&prev_x);
+      CopyTileWithColorTransform(width, height, tile_x_offset, tile_y_offset,
+                                 max_tile_size, prev_x, argb);
+
+      // Gather accumulated histogram data.
+      for (y = tile_y_offset; y < all_y_max; ++y) {
+        int ix = y * width + tile_x_offset;
+        const int ix_end = ix + all_x_max - tile_x_offset;
+        for (; ix < ix_end; ++ix) {
+          const uint32_t pix = argb[ix];
+          if (ix >= 2 &&
+              pix == argb[ix - 2] &&
+              pix == argb[ix - 1]) {
+            continue;  // repeated pixels are handled by backward references
+          }
+          if (ix >= width + 2 &&
+              argb[ix - 2] == argb[ix - width - 2] &&
+              argb[ix - 1] == argb[ix - width - 1] &&
+              pix == argb[ix - width]) {
+            continue;  // repeated pixels are handled by backward references
+          }
+          ++accumulated_red_histo[(pix >> 16) & 0xff];
+          ++accumulated_blue_histo[(pix >> 0) & 0xff];
+        }
+      }
+    }
+  }
+}
diff --git a/media/libwebp/enc/quant_enc.c b/media/libwebp/enc/quant_enc.c
new file mode 100644
index 0000000000..029d62ca05
--- /dev/null
+++ b/media/libwebp/enc/quant_enc.c
@@ -0,0 +1,1388 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+//   Quantization
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <assert.h>
+#include <math.h>
+#include <stdlib.h>  // for abs()
+
+#include "../dsp/quant.h"
+#include "../enc/vp8i_enc.h"
+#include "../enc/cost_enc.h"
+
+#define DO_TRELLIS_I4  1
+#define DO_TRELLIS_I16 1   // not a huge gain, but ok at low bitrate.
+#define DO_TRELLIS_UV  0   // disable trellis for UV. Risky. Not worth.
+#define USE_TDISTO 1
+
+#define MID_ALPHA 64      // neutral value for susceptibility
+#define MIN_ALPHA 30      // lowest usable value for susceptibility
+#define MAX_ALPHA 100     // higher meaningful value for susceptibility
+
+#define SNS_TO_DQ 0.9     // Scaling constant between the sns value and the QP
+                          // power-law modulation. Must be strictly less than 1.
+
+// number of non-zero coeffs below which we consider the block very flat
+// (and apply a penalty to complex predictions)
+#define FLATNESS_LIMIT_I16 0       // I16 mode (special case)
+#define FLATNESS_LIMIT_I4  3       // I4 mode
+#define FLATNESS_LIMIT_UV  2       // UV mode
+#define FLATNESS_PENALTY   140     // roughly ~1bit per block
+
+#define MULT_8B(a, b) (((a) * (b) + 128) >> 8)
+
+#define RD_DISTO_MULT      256  // distortion multiplier (equivalent of lambda)
+
+// #define DEBUG_BLOCK
+
+//------------------------------------------------------------------------------
+
+#if defined(DEBUG_BLOCK)
+
+#include <stdio.h>
+#include <stdlib.h>
+
+static void PrintBlockInfo(const VP8EncIterator* const it,
+                           const VP8ModeScore* const rd) {
+  int i, j;
+  const int is_i16 = (it->mb_->type_ == 1);
+  const uint8_t* const y_in = it->yuv_in_ + Y_OFF_ENC;
+  const uint8_t* const y_out = it->yuv_out_ + Y_OFF_ENC;
+  const uint8_t* const uv_in = it->yuv_in_ + U_OFF_ENC;
+  const uint8_t* const uv_out = it->yuv_out_ + U_OFF_ENC;
+  printf("SOURCE / OUTPUT / ABS DELTA\n");
+  for (j = 0; j < 16; ++j) {
+    for (i = 0; i < 16; ++i) printf("%3d ", y_in[i + j * BPS]);
+    printf("     ");
+    for (i = 0; i < 16; ++i) printf("%3d ", y_out[i + j * BPS]);
+    printf("     ");
+    for (i = 0; i < 16; ++i) {
+      printf("%1d ", abs(y_in[i + j * BPS] - y_out[i + j * BPS]));
+    }
+    printf("\n");
+  }
+  printf("\n");   // newline before the U/V block
+  for (j = 0; j < 8; ++j) {
+    for (i = 0; i < 8; ++i) printf("%3d ", uv_in[i + j * BPS]);
+    printf(" ");
+    for (i = 8; i < 16; ++i) printf("%3d ", uv_in[i + j * BPS]);
+    printf("    ");
+    for (i = 0; i < 8; ++i) printf("%3d ", uv_out[i + j * BPS]);
+    printf(" ");
+    for (i = 8; i < 16; ++i) printf("%3d ", uv_out[i + j * BPS]);
+    printf("   ");
+    for (i = 0; i < 8; ++i) {
+      printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
+    }
+    printf(" ");
+    for (i = 8; i < 16; ++i) {
+      printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
+    }
+    printf("\n");
+  }
+  printf("\nD:%d SD:%d R:%d H:%d nz:0x%x score:%d\n",
+    (int)rd->D, (int)rd->SD, (int)rd->R, (int)rd->H, (int)rd->nz,
+    (int)rd->score);
+  if (is_i16) {
+    printf("Mode: %d\n", rd->mode_i16);
+    printf("y_dc_levels:");
+    for (i = 0; i < 16; ++i) printf("%3d ", rd->y_dc_levels[i]);
+    printf("\n");
+  } else {
+    printf("Modes[16]: ");
+    for (i = 0; i < 16; ++i) printf("%d ", rd->modes_i4[i]);
+    printf("\n");
+  }
+  printf("y_ac_levels:\n");
+  for (j = 0; j < 16; ++j) {
+    for (i = is_i16 ? 1 : 0; i < 16; ++i) {
+      printf("%4d ", rd->y_ac_levels[j][i]);
+    }
+    printf("\n");
+  }
+  printf("\n");
+  printf("uv_levels (mode=%d):\n", rd->mode_uv);
+  for (j = 0; j < 8; ++j) {
+    for (i = 0; i < 16; ++i) {
+      printf("%4d ", rd->uv_levels[j][i]);
+    }
+    printf("\n");
+  }
+}
+
+#endif   // DEBUG_BLOCK
+
+//------------------------------------------------------------------------------
+
+static WEBP_INLINE int clip(int v, int m, int M) {
+  return v < m ? m : v > M ? M : v;
+}
+
+static const uint8_t kZigzag[16] = {
+  0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
+};
+
+static const uint8_t kDcTable[128] = {
+  4,     5,   6,   7,   8,   9,  10,  10,
+  11,   12,  13,  14,  15,  16,  17,  17,
+  18,   19,  20,  20,  21,  21,  22,  22,
+  23,   23,  24,  25,  25,  26,  27,  28,
+  29,   30,  31,  32,  33,  34,  35,  36,
+  37,   37,  38,  39,  40,  41,  42,  43,
+  44,   45,  46,  46,  47,  48,  49,  50,
+  51,   52,  53,  54,  55,  56,  57,  58,
+  59,   60,  61,  62,  63,  64,  65,  66,
+  67,   68,  69,  70,  71,  72,  73,  74,
+  75,   76,  76,  77,  78,  79,  80,  81,
+  82,   83,  84,  85,  86,  87,  88,  89,
+  91,   93,  95,  96,  98, 100, 101, 102,
+  104, 106, 108, 110, 112, 114, 116, 118,
+  122, 124, 126, 128, 130, 132, 134, 136,
+  138, 140, 143, 145, 148, 151, 154, 157
+};
+
+static const uint16_t kAcTable[128] = {
+  4,     5,   6,   7,   8,   9,  10,  11,
+  12,   13,  14,  15,  16,  17,  18,  19,
+  20,   21,  22,  23,  24,  25,  26,  27,
+  28,   29,  30,  31,  32,  33,  34,  35,
+  36,   37,  38,  39,  40,  41,  42,  43,
+  44,   45,  46,  47,  48,  49,  50,  51,
+  52,   53,  54,  55,  56,  57,  58,  60,
+  62,   64,  66,  68,  70,  72,  74,  76,
+  78,   80,  82,  84,  86,  88,  90,  92,
+  94,   96,  98, 100, 102, 104, 106, 108,
+  110, 112, 114, 116, 119, 122, 125, 128,
+  131, 134, 137, 140, 143, 146, 149, 152,
+  155, 158, 161, 164, 167, 170, 173, 177,
+  181, 185, 189, 193, 197, 201, 205, 209,
+  213, 217, 221, 225, 229, 234, 239, 245,
+  249, 254, 259, 264, 269, 274, 279, 284
+};
+
+static const uint16_t kAcTable2[128] = {
+  8,     8,   9,  10,  12,  13,  15,  17,
+  18,   20,  21,  23,  24,  26,  27,  29,
+  31,   32,  34,  35,  37,  38,  40,  41,
+  43,   44,  46,  48,  49,  51,  52,  54,
+  55,   57,  58,  60,  62,  63,  65,  66,
+  68,   69,  71,  72,  74,  75,  77,  79,
+  80,   82,  83,  85,  86,  88,  89,  93,
+  96,   99, 102, 105, 108, 111, 114, 117,
+  120, 124, 127, 130, 133, 136, 139, 142,
+  145, 148, 151, 155, 158, 161, 164, 167,
+  170, 173, 176, 179, 184, 189, 193, 198,
+  203, 207, 212, 217, 221, 226, 230, 235,
+  240, 244, 249, 254, 258, 263, 268, 274,
+  280, 286, 292, 299, 305, 311, 317, 323,
+  330, 336, 342, 348, 354, 362, 370, 379,
+  385, 393, 401, 409, 416, 424, 432, 440
+};
+
+static const uint8_t kBiasMatrices[3][2] = {  // [luma-ac,luma-dc,chroma][dc,ac]
+  { 96, 110 }, { 96, 108 }, { 110, 115 }
+};
+
+// Sharpening by (slightly) raising the hi-frequency coeffs.
+// Hack-ish but helpful for mid-bitrate range. Use with care.
+#define SHARPEN_BITS 11  // number of descaling bits for sharpening bias
+static const uint8_t kFreqSharpening[16] = {
+  0,  30, 60, 90,
+  30, 60, 90, 90,
+  60, 90, 90, 90,
+  90, 90, 90, 90
+};
+
+//------------------------------------------------------------------------------
+// Initialize quantization parameters in VP8Matrix
+
+// Returns the average quantizer
+static int ExpandMatrix(VP8Matrix* const m, int type) {
+  int i, sum;
+  for (i = 0; i < 2; ++i) {
+    const int is_ac_coeff = (i > 0);
+    const int bias = kBiasMatrices[type][is_ac_coeff];
+    m->iq_[i] = (1 << QFIX) / m->q_[i];
+    m->bias_[i] = BIAS(bias);
+    // zthresh_ is the exact value such that QUANTDIV(coeff, iQ, B) is:
+    //   * zero if coeff <= zthresh
+    //   * non-zero if coeff > zthresh
+    m->zthresh_[i] = ((1 << QFIX) - 1 - m->bias_[i]) / m->iq_[i];
+  }
+  for (i = 2; i < 16; ++i) {
+    m->q_[i] = m->q_[1];
+    m->iq_[i] = m->iq_[1];
+    m->bias_[i] = m->bias_[1];
+    m->zthresh_[i] = m->zthresh_[1];
+  }
+  for (sum = 0, i = 0; i < 16; ++i) {
+    if (type == 0) {  // we only use sharpening for AC luma coeffs
+      m->sharpen_[i] = (kFreqSharpening[i] * m->q_[i]) >> SHARPEN_BITS;
+    } else {
+      m->sharpen_[i] = 0;
+    }
+    sum += m->q_[i];
+  }
+  return (sum + 8) >> 4;
+}
+
+static void CheckLambdaValue(int* const v) { if (*v < 1) *v = 1; }
+
+static void SetupMatrices(VP8Encoder* enc) {
+  int i;
+  const int tlambda_scale =
+    (enc->method_ >= 4) ? enc->config_->sns_strength
+                        : 0;
+  const int num_segments = enc->segment_hdr_.num_segments_;
+  for (i = 0; i < num_segments; ++i) {
+    VP8SegmentInfo* const m = &enc->dqm_[i];
+    const int q = m->quant_;
+    int q_i4, q_i16, q_uv;
+    m->y1_.q_[0] = kDcTable[clip(q + enc->dq_y1_dc_, 0, 127)];
+    m->y1_.q_[1] = kAcTable[clip(q,                  0, 127)];
+
+    m->y2_.q_[0] = kDcTable[ clip(q + enc->dq_y2_dc_, 0, 127)] * 2;
+    m->y2_.q_[1] = kAcTable2[clip(q + enc->dq_y2_ac_, 0, 127)];
+
+    m->uv_.q_[0] = kDcTable[clip(q + enc->dq_uv_dc_, 0, 117)];
+    m->uv_.q_[1] = kAcTable[clip(q + enc->dq_uv_ac_, 0, 127)];
+
+    q_i4  = ExpandMatrix(&m->y1_, 0);
+    q_i16 = ExpandMatrix(&m->y2_, 1);
+    q_uv  = ExpandMatrix(&m->uv_, 2);
+
+    m->lambda_i4_          = (3 * q_i4 * q_i4) >> 7;
+    m->lambda_i16_         = (3 * q_i16 * q_i16);
+    m->lambda_uv_          = (3 * q_uv * q_uv) >> 6;
+    m->lambda_mode_        = (1 * q_i4 * q_i4) >> 7;
+    m->lambda_trellis_i4_  = (7 * q_i4 * q_i4) >> 3;
+    m->lambda_trellis_i16_ = (q_i16 * q_i16) >> 2;
+    m->lambda_trellis_uv_  = (q_uv * q_uv) << 1;
+    m->tlambda_            = (tlambda_scale * q_i4) >> 5;
+
+    // none of these constants should be < 1
+    CheckLambdaValue(&m->lambda_i4_);
+    CheckLambdaValue(&m->lambda_i16_);
+    CheckLambdaValue(&m->lambda_uv_);
+    CheckLambdaValue(&m->lambda_mode_);
+    CheckLambdaValue(&m->lambda_trellis_i4_);
+    CheckLambdaValue(&m->lambda_trellis_i16_);
+    CheckLambdaValue(&m->lambda_trellis_uv_);
+    CheckLambdaValue(&m->tlambda_);
+
+    m->min_disto_ = 20 * m->y1_.q_[0];   // quantization-aware min disto
+    m->max_edge_  = 0;
+
+    m->i4_penalty_ = 1000 * q_i4 * q_i4;
+  }
+}
+
+//------------------------------------------------------------------------------
+// Initialize filtering parameters
+
+// Very small filter-strength values have close to no visual effect. So we can
+// save a little decoding-CPU by turning filtering off for these.
+#define FSTRENGTH_CUTOFF 2
+
+static void SetupFilterStrength(VP8Encoder* const enc) {
+  int i;
+  // level0 is in [0..500]. Using '-f 50' as filter_strength is mid-filtering.
+  const int level0 = 5 * enc->config_->filter_strength;
+  for (i = 0; i < NUM_MB_SEGMENTS; ++i) {
+    VP8SegmentInfo* const m = &enc->dqm_[i];
+    // We focus on the quantization of AC coeffs.
+    const int qstep = kAcTable[clip(m->quant_, 0, 127)] >> 2;
+    const int base_strength =
+        VP8FilterStrengthFromDelta(enc->filter_hdr_.sharpness_, qstep);
+    // Segments with lower complexity ('beta') will be less filtered.
+    const int f = base_strength * level0 / (256 + m->beta_);
+    m->fstrength_ = (f < FSTRENGTH_CUTOFF) ? 0 : (f > 63) ? 63 : f;
+  }
+  // We record the initial strength (mainly for the case of 1-segment only).
+  enc->filter_hdr_.level_ = enc->dqm_[0].fstrength_;
+  enc->filter_hdr_.simple_ = (enc->config_->filter_type == 0);
+  enc->filter_hdr_.sharpness_ = enc->config_->filter_sharpness;
+}
+
+//------------------------------------------------------------------------------
+
+// Note: if you change the values below, remember that the max range
+// allowed by the syntax for DQ_UV is [-16,16].
+#define MAX_DQ_UV (6)
+#define MIN_DQ_UV (-4)
+
+// We want to emulate jpeg-like behaviour where the expected "good" quality
+// is around q=75. Internally, our "good" middle is around c=50. So we
+// map accordingly using linear piece-wise function
+static double QualityToCompression(double c) {
+  const double linear_c = (c < 0.75) ? c * (2. / 3.) : 2. * c - 1.;
+  // The file size roughly scales as pow(quantizer, 3.). Actually, the
+  // exponent is somewhere between 2.8 and 3.2, but we're mostly interested
+  // in the mid-quant range. So we scale the compressibility inversely to
+  // this power-law: quant ~= compression ^ 1/3. This law holds well for
+  // low quant. Finer modeling for high-quant would make use of kAcTable[]
+  // more explicitly.
+  const double v = pow(linear_c, 1 / 3.);
+  return v;
+}
+
+static double QualityToJPEGCompression(double c, double alpha) {
+  // We map the complexity 'alpha' and quality setting 'c' to a compression
+  // exponent empirically matched to the compression curve of libjpeg6b.
+  // On average, the WebP output size will be roughly similar to that of a
+  // JPEG file compressed with same quality factor.
+  const double amin = 0.30;
+  const double amax = 0.85;
+  const double exp_min = 0.4;
+  const double exp_max = 0.9;
+  const double slope = (exp_min - exp_max) / (amax - amin);
+  // Linearly interpolate 'expn' from exp_min to exp_max
+  // in the [amin, amax] range.
+  const double expn = (alpha > amax) ? exp_min
+                    : (alpha < amin) ? exp_max
+                    : exp_max + slope * (alpha - amin);
+  const double v = pow(c, expn);
+  return v;
+}
+
+static int SegmentsAreEquivalent(const VP8SegmentInfo* const S1,
+                                 const VP8SegmentInfo* const S2) {
+  return (S1->quant_ == S2->quant_) && (S1->fstrength_ == S2->fstrength_);
+}
+
+static void SimplifySegments(VP8Encoder* const enc) {
+  int map[NUM_MB_SEGMENTS] = { 0, 1, 2, 3 };
+  // 'num_segments_' is previously validated and <= NUM_MB_SEGMENTS, but an
+  // explicit check is needed to avoid a spurious warning about 'i' exceeding
+  // array bounds of 'dqm_' with some compilers (noticed with gcc-4.9).
+  const int num_segments = (enc->segment_hdr_.num_segments_ < NUM_MB_SEGMENTS)
+                               ? enc->segment_hdr_.num_segments_
+                               : NUM_MB_SEGMENTS;
+  int num_final_segments = 1;
+  int s1, s2;
+  for (s1 = 1; s1 < num_segments; ++s1) {    // find similar segments
+    const VP8SegmentInfo* const S1 = &enc->dqm_[s1];
+    int found = 0;
+    // check if we already have similar segment
+    for (s2 = 0; s2 < num_final_segments; ++s2) {
+      const VP8SegmentInfo* const S2 = &enc->dqm_[s2];
+      if (SegmentsAreEquivalent(S1, S2)) {
+        found = 1;
+        break;
+      }
+    }
+    map[s1] = s2;
+    if (!found) {
+      if (num_final_segments != s1) {
+        enc->dqm_[num_final_segments] = enc->dqm_[s1];
+      }
+      ++num_final_segments;
+    }
+  }
+  if (num_final_segments < num_segments) {  // Remap
+    int i = enc->mb_w_ * enc->mb_h_;
+    while (i-- > 0) enc->mb_info_[i].segment_ = map[enc->mb_info_[i].segment_];
+    enc->segment_hdr_.num_segments_ = num_final_segments;
+    // Replicate the trailing segment infos (it's mostly cosmetics)
+    for (i = num_final_segments; i < num_segments; ++i) {
+      enc->dqm_[i] = enc->dqm_[num_final_segments - 1];
+    }
+  }
+}
+
+void VP8SetSegmentParams(VP8Encoder* const enc, float quality) {
+  int i;
+  int dq_uv_ac, dq_uv_dc;
+  const int num_segments = enc->segment_hdr_.num_segments_;
+  const double amp = SNS_TO_DQ * enc->config_->sns_strength / 100. / 128.;
+  const double Q = quality / 100.;
+  const double c_base = enc->config_->emulate_jpeg_size ?
+      QualityToJPEGCompression(Q, enc->alpha_ / 255.) :
+      QualityToCompression(Q);
+  for (i = 0; i < num_segments; ++i) {
+    // We modulate the base coefficient to accommodate for the quantization
+    // susceptibility and allow denser segments to be quantized more.
+    const double expn = 1. - amp * enc->dqm_[i].alpha_;
+    const double c = pow(c_base, expn);
+    const int q = (int)(127. * (1. - c));
+    assert(expn > 0.);
+    enc->dqm_[i].quant_ = clip(q, 0, 127);
+  }
+
+  // purely indicative in the bitstream (except for the 1-segment case)
+  enc->base_quant_ = enc->dqm_[0].quant_;
+
+  // fill-in values for the unused segments (required by the syntax)
+  for (i = num_segments; i < NUM_MB_SEGMENTS; ++i) {
+    enc->dqm_[i].quant_ = enc->base_quant_;
+  }
+
+  // uv_alpha_ is normally spread around ~60. The useful range is
+  // typically ~30 (quite bad) to ~100 (ok to decimate UV more).
+  // We map it to the safe maximal range of MAX/MIN_DQ_UV for dq_uv.
+  dq_uv_ac = (enc->uv_alpha_ - MID_ALPHA) * (MAX_DQ_UV - MIN_DQ_UV)
+                                          / (MAX_ALPHA - MIN_ALPHA);
+  // we rescale by the user-defined strength of adaptation
+  dq_uv_ac = dq_uv_ac * enc->config_->sns_strength / 100;
+  // and make it safe.
+  dq_uv_ac = clip(dq_uv_ac, MIN_DQ_UV, MAX_DQ_UV);
+  // We also boost the dc-uv-quant a little, based on sns-strength, since
+  // U/V channels are quite more reactive to high quants (flat DC-blocks
+  // tend to appear, and are unpleasant).
+  dq_uv_dc = -4 * enc->config_->sns_strength / 100;
+  dq_uv_dc = clip(dq_uv_dc, -15, 15);   // 4bit-signed max allowed
+
+  enc->dq_y1_dc_ = 0;       // TODO(skal): dq-lum
+  enc->dq_y2_dc_ = 0;
+  enc->dq_y2_ac_ = 0;
+  enc->dq_uv_dc_ = dq_uv_dc;
+  enc->dq_uv_ac_ = dq_uv_ac;
+
+  SetupFilterStrength(enc);   // initialize segments' filtering, eventually
+
+  if (num_segments > 1) SimplifySegments(enc);
+
+  SetupMatrices(enc);         // finalize quantization matrices
+}
+
+//------------------------------------------------------------------------------
+// Form the predictions in cache
+
+// Must be ordered using {DC_PRED, TM_PRED, V_PRED, H_PRED} as index
+const uint16_t VP8I16ModeOffsets[4] = { I16DC16, I16TM16, I16VE16, I16HE16 };
+const uint16_t VP8UVModeOffsets[4] = { C8DC8, C8TM8, C8VE8, C8HE8 };
+
+// Must be indexed using {B_DC_PRED -> B_HU_PRED} as index
+const uint16_t VP8I4ModeOffsets[NUM_BMODES] = {
+  I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4
+};
+
+void VP8MakeLuma16Preds(const VP8EncIterator* const it) {
+  const uint8_t* const left = it->x_ ? it->y_left_ : NULL;
+  const uint8_t* const top = it->y_ ? it->y_top_ : NULL;
+  VP8EncPredLuma16(it->yuv_p_, left, top);
+}
+
+void VP8MakeChroma8Preds(const VP8EncIterator* const it) {
+  const uint8_t* const left = it->x_ ? it->u_left_ : NULL;
+  const uint8_t* const top = it->y_ ? it->uv_top_ : NULL;
+  VP8EncPredChroma8(it->yuv_p_, left, top);
+}
+
+void VP8MakeIntra4Preds(const VP8EncIterator* const it) {
+  VP8EncPredLuma4(it->yuv_p_, it->i4_top_);
+}
+
+//------------------------------------------------------------------------------
+// Quantize
+
+// Layout:
+// +----+----+
+// |YYYY|UUVV| 0
+// |YYYY|UUVV| 4
+// |YYYY|....| 8
+// |YYYY|....| 12
+// +----+----+
+
+const uint16_t VP8Scan[16] = {  // Luma
+  0 +  0 * BPS,  4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS,
+  0 +  4 * BPS,  4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS,
+  0 +  8 * BPS,  4 +  8 * BPS, 8 +  8 * BPS, 12 +  8 * BPS,
+  0 + 12 * BPS,  4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS,
+};
+
+static const uint16_t VP8ScanUV[4 + 4] = {
+  0 + 0 * BPS,   4 + 0 * BPS, 0 + 4 * BPS,  4 + 4 * BPS,    // U
+  8 + 0 * BPS,  12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS     // V
+};
+
+//------------------------------------------------------------------------------
+// Distortion measurement
+
+static const uint16_t kWeightY[16] = {
+  38, 32, 20, 9, 32, 28, 17, 7, 20, 17, 10, 4, 9, 7, 4, 2
+};
+
+static const uint16_t kWeightTrellis[16] = {
+#if USE_TDISTO == 0
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
+#else
+  30, 27, 19, 11,
+  27, 24, 17, 10,
+  19, 17, 12,  8,
+  11, 10,  8,  6
+#endif
+};
+
+// Init/Copy the common fields in score.
+static void InitScore(VP8ModeScore* const rd) {
+  rd->D  = 0;
+  rd->SD = 0;
+  rd->R  = 0;
+  rd->H  = 0;
+  rd->nz = 0;
+  rd->score = MAX_COST;
+}
+
+static void CopyScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
+  dst->D  = src->D;
+  dst->SD = src->SD;
+  dst->R  = src->R;
+  dst->H  = src->H;
+  dst->nz = src->nz;      // note that nz is not accumulated, but just copied.
+  dst->score = src->score;
+}
+
+static void AddScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
+  dst->D  += src->D;
+  dst->SD += src->SD;
+  dst->R  += src->R;
+  dst->H  += src->H;
+  dst->nz |= src->nz;     // here, new nz bits are accumulated.
+  dst->score += src->score;
+}
+
+//------------------------------------------------------------------------------
+// Performs trellis-optimized quantization.
+
+// Trellis node
+typedef struct {
+  int8_t prev;            // best previous node
+  int8_t sign;            // sign of coeff_i
+  int16_t level;          // level
+} Node;
+
+// Score state
+typedef struct {
+  score_t score;          // partial RD score
+  const uint16_t* costs;  // shortcut to cost tables
+} ScoreState;
+
+// If a coefficient was quantized to a value Q (using a neutral bias),
+// we test all alternate possibilities between [Q-MIN_DELTA, Q+MAX_DELTA]
+// We don't test negative values though.
+#define MIN_DELTA 0   // how much lower level to try
+#define MAX_DELTA 1   // how much higher
+#define NUM_NODES (MIN_DELTA + 1 + MAX_DELTA)
+#define NODE(n, l) (nodes[(n)][(l) + MIN_DELTA])
+#define SCORE_STATE(n, l) (score_states[n][(l) + MIN_DELTA])
+
+static WEBP_INLINE void SetRDScore(int lambda, VP8ModeScore* const rd) {
+  rd->score = (rd->R + rd->H) * lambda + RD_DISTO_MULT * (rd->D + rd->SD);
+}
+
+static WEBP_INLINE score_t RDScoreTrellis(int lambda, score_t rate,
+                                          score_t distortion) {
+  return rate * lambda + RD_DISTO_MULT * distortion;
+}
+
+// Coefficient type.
+enum { TYPE_I16_AC = 0, TYPE_I16_DC = 1, TYPE_CHROMA_A = 2, TYPE_I4_AC = 3 };
+
+static int TrellisQuantizeBlock(const VP8Encoder* const enc,
+                                int16_t in[16], int16_t out[16],
+                                int ctx0, int coeff_type,
+                                const VP8Matrix* const mtx,
+                                int lambda) {
+  const ProbaArray* const probas = enc->proba_.coeffs_[coeff_type];
+  CostArrayPtr const costs =
+      (CostArrayPtr)enc->proba_.remapped_costs_[coeff_type];
+  const int first = (coeff_type == TYPE_I16_AC) ? 1 : 0;
+  Node nodes[16][NUM_NODES];
+  ScoreState score_states[2][NUM_NODES];
+  ScoreState* ss_cur = &SCORE_STATE(0, MIN_DELTA);
+  ScoreState* ss_prev = &SCORE_STATE(1, MIN_DELTA);
+  int best_path[3] = {-1, -1, -1};   // store best-last/best-level/best-previous
+  score_t best_score;
+  int n, m, p, last;
+
+  {
+    score_t cost;
+    const int thresh = mtx->q_[1] * mtx->q_[1] / 4;
+    const int last_proba = probas[VP8EncBands[first]][ctx0][0];
+
+    // compute the position of the last interesting coefficient
+    last = first - 1;
+    for (n = 15; n >= first; --n) {
+      const int j = kZigzag[n];
+      const int err = in[j] * in[j];
+      if (err > thresh) {
+        last = n;
+        break;
+      }
+    }
+    // we don't need to go inspect up to n = 16 coeffs. We can just go up
+    // to last + 1 (inclusive) without losing much.
+    if (last < 15) ++last;
+
+    // compute 'skip' score. This is the max score one can do.
+    cost = VP8BitCost(0, last_proba);
+    best_score = RDScoreTrellis(lambda, cost, 0);
+
+    // initialize source node.
+    for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
+      const score_t rate = (ctx0 == 0) ? VP8BitCost(1, last_proba) : 0;
+      ss_cur[m].score = RDScoreTrellis(lambda, rate, 0);
+      ss_cur[m].costs = costs[first][ctx0];
+    }
+  }
+
+  // traverse trellis.
+  for (n = first; n <= last; ++n) {
+    const int j = kZigzag[n];
+    const uint32_t Q  = mtx->q_[j];
+    const uint32_t iQ = mtx->iq_[j];
+    const uint32_t B = BIAS(0x00);     // neutral bias
+    // note: it's important to take sign of the _original_ coeff,
+    // so we don't have to consider level < 0 afterward.
+    const int sign = (in[j] < 0);
+    const uint32_t coeff0 = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
+    int level0 = QUANTDIV(coeff0, iQ, B);
+    int thresh_level = QUANTDIV(coeff0, iQ, BIAS(0x80));
+    if (thresh_level > MAX_LEVEL) thresh_level = MAX_LEVEL;
+    if (level0 > MAX_LEVEL) level0 = MAX_LEVEL;
+
+    {   // Swap current and previous score states
+      ScoreState* const tmp = ss_cur;
+      ss_cur = ss_prev;
+      ss_prev = tmp;
+    }
+
+    // test all alternate level values around level0.
+    for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
+      Node* const cur = &NODE(n, m);
+      const int level = level0 + m;
+      const int ctx = (level > 2) ? 2 : level;
+      const int band = VP8EncBands[n + 1];
+      score_t base_score;
+      score_t best_cur_score;
+      int best_prev;
+      score_t cost, score;
+
+      ss_cur[m].costs = costs[n + 1][ctx];
+      if (level < 0 || level > thresh_level) {
+        ss_cur[m].score = MAX_COST;
+        // Node is dead.
+        continue;
+      }
+
+      {
+        // Compute delta_error = how much coding this level will
+        // subtract to max_error as distortion.
+        // Here, distortion = sum of (|coeff_i| - level_i * Q_i)^2
+        const int new_error = coeff0 - level * Q;
+        const int delta_error =
+            kWeightTrellis[j] * (new_error * new_error - coeff0 * coeff0);
+        base_score = RDScoreTrellis(lambda, 0, delta_error);
+      }
+
+      // Inspect all possible non-dead predecessors. Retain only the best one.
+      // The base_score is added to all scores so it is only added for the final
+      // value after the loop.
+      cost = VP8LevelCost(ss_prev[-MIN_DELTA].costs, level);
+      best_cur_score =
+          ss_prev[-MIN_DELTA].score + RDScoreTrellis(lambda, cost, 0);
+      best_prev = -MIN_DELTA;
+      for (p = -MIN_DELTA + 1; p <= MAX_DELTA; ++p) {
+        // Dead nodes (with ss_prev[p].score >= MAX_COST) are automatically
+        // eliminated since their score can't be better than the current best.
+        cost = VP8LevelCost(ss_prev[p].costs, level);
+        // Examine node assuming it's a non-terminal one.
+        score = ss_prev[p].score + RDScoreTrellis(lambda, cost, 0);
+        if (score < best_cur_score) {
+          best_cur_score = score;
+          best_prev = p;
+        }
+      }
+      best_cur_score += base_score;
+      // Store best finding in current node.
+      cur->sign = sign;
+      cur->level = level;
+      cur->prev = best_prev;
+      ss_cur[m].score = best_cur_score;
+
+      // Now, record best terminal node (and thus best entry in the graph).
+      if (level != 0 && best_cur_score < best_score) {
+        const score_t last_pos_cost =
+            (n < 15) ? VP8BitCost(0, probas[band][ctx][0]) : 0;
+        const score_t last_pos_score = RDScoreTrellis(lambda, last_pos_cost, 0);
+        score = best_cur_score + last_pos_score;
+        if (score < best_score) {
+          best_score = score;
+          best_path[0] = n;                     // best eob position
+          best_path[1] = m;                     // best node index
+          best_path[2] = best_prev;             // best predecessor
+        }
+      }
+    }
+  }
+
+  // Fresh start
+  // Beware! We must preserve in[0]/out[0] value for TYPE_I16_AC case.
+  if (coeff_type == TYPE_I16_AC) {
+    memset(in + 1, 0, 15 * sizeof(*in));
+    memset(out + 1, 0, 15 * sizeof(*out));
+  } else {
+    memset(in, 0, 16 * sizeof(*in));
+    memset(out, 0, 16 * sizeof(*out));
+  }
+  if (best_path[0] == -1) {
+    return 0;  // skip!
+  }
+
+  {
+    // Unwind the best path.
+    // Note: best-prev on terminal node is not necessarily equal to the
+    // best_prev for non-terminal. So we patch best_path[2] in.
+    int nz = 0;
+    int best_node = best_path[1];
+    n = best_path[0];
+    NODE(n, best_node).prev = best_path[2];   // force best-prev for terminal
+
+    for (; n >= first; --n) {
+      const Node* const node = &NODE(n, best_node);
+      const int j = kZigzag[n];
+      out[n] = node->sign ? -node->level : node->level;
+      nz |= node->level;
+      in[j] = out[n] * mtx->q_[j];
+      best_node = node->prev;
+    }
+    return (nz != 0);
+  }
+}
+
+#undef NODE
+
+//------------------------------------------------------------------------------
+// Performs: difference, transform, quantize, back-transform, add
+// all at once. Output is the reconstructed block in *yuv_out, and the
+// quantized levels in *levels.
+
+static int ReconstructIntra16(VP8EncIterator* const it,
+                              VP8ModeScore* const rd,
+                              uint8_t* const yuv_out,
+                              int mode) {
+  const VP8Encoder* const enc = it->enc_;
+  const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
+  const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
+  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
+  int nz = 0;
+  int n;
+  int16_t tmp[16][16], dc_tmp[16];
+
+  for (n = 0; n < 16; n += 2) {
+    VP8FTransform2(src + VP8Scan[n], ref + VP8Scan[n], tmp[n]);
+  }
+  VP8FTransformWHT(tmp[0], dc_tmp);
+  nz |= VP8EncQuantizeBlockWHT(dc_tmp, rd->y_dc_levels, &dqm->y2_) << 24;
+
+  if (DO_TRELLIS_I16 && it->do_trellis_) {
+    int x, y;
+    VP8IteratorNzToBytes(it);
+    for (y = 0, n = 0; y < 4; ++y) {
+      for (x = 0; x < 4; ++x, ++n) {
+        const int ctx = it->top_nz_[x] + it->left_nz_[y];
+        const int non_zero = TrellisQuantizeBlock(
+            enc, tmp[n], rd->y_ac_levels[n], ctx, TYPE_I16_AC, &dqm->y1_,
+            dqm->lambda_trellis_i16_);
+        it->top_nz_[x] = it->left_nz_[y] = non_zero;
+        rd->y_ac_levels[n][0] = 0;
+        nz |= non_zero << n;
+      }
+    }
+  } else {
+    for (n = 0; n < 16; n += 2) {
+      // Zero-out the first coeff, so that: a) nz is correct below, and
+      // b) finding 'last' non-zero coeffs in SetResidualCoeffs() is simplified.
+      tmp[n][0] = tmp[n + 1][0] = 0;
+      nz |= VP8EncQuantize2Blocks(tmp[n], rd->y_ac_levels[n], &dqm->y1_) << n;
+      assert(rd->y_ac_levels[n + 0][0] == 0);
+      assert(rd->y_ac_levels[n + 1][0] == 0);
+    }
+  }
+
+  // Transform back
+  VP8TransformWHT(dc_tmp, tmp[0]);
+  for (n = 0; n < 16; n += 2) {
+    VP8ITransform(ref + VP8Scan[n], tmp[n], yuv_out + VP8Scan[n], 1);
+  }
+
+  return nz;
+}
+
+static int ReconstructIntra4(VP8EncIterator* const it,
+                             int16_t levels[16],
+                             const uint8_t* const src,
+                             uint8_t* const yuv_out,
+                             int mode) {
+  const VP8Encoder* const enc = it->enc_;
+  const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
+  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
+  int nz = 0;
+  int16_t tmp[16];
+
+  VP8FTransform(src, ref, tmp);
+  if (DO_TRELLIS_I4 && it->do_trellis_) {
+    const int x = it->i4_ & 3, y = it->i4_ >> 2;
+    const int ctx = it->top_nz_[x] + it->left_nz_[y];
+    nz = TrellisQuantizeBlock(enc, tmp, levels, ctx, TYPE_I4_AC, &dqm->y1_,
+                              dqm->lambda_trellis_i4_);
+  } else {
+    nz = VP8EncQuantizeBlock(tmp, levels, &dqm->y1_);
+  }
+  VP8ITransform(ref, tmp, yuv_out, 0);
+  return nz;
+}
+
+//------------------------------------------------------------------------------
+// DC-error diffusion
+
+// Diffusion weights. We under-correct a bit (15/16th of the error is actually
+// diffused) to avoid 'rainbow' chessboard pattern of blocks at q~=0.
+#define C1 7    // fraction of error sent to the 4x4 block below
+#define C2 8    // fraction of error sent to the 4x4 block on the right
+#define DSHIFT 4
+#define DSCALE 1   // storage descaling, needed to make the error fit int8_t
+
+// Quantize as usual, but also compute and return the quantization error.
+// Error is already divided by DSHIFT.
+static int QuantizeSingle(int16_t* const v, const VP8Matrix* const mtx) {
+  int V = *v;
+  const int sign = (V < 0);
+  if (sign) V = -V;
+  if (V > (int)mtx->zthresh_[0]) {
+    const int qV = QUANTDIV(V, mtx->iq_[0], mtx->bias_[0]) * mtx->q_[0];
+    const int err = (V - qV);
+    *v = sign ? -qV : qV;
+    return (sign ? -err : err) >> DSCALE;
+  }
+  *v = 0;
+  return (sign ? -V : V) >> DSCALE;
+}
+
+static void CorrectDCValues(const VP8EncIterator* const it,
+                            const VP8Matrix* const mtx,
+                            int16_t tmp[][16], VP8ModeScore* const rd) {
+  //         | top[0] | top[1]
+  // --------+--------+---------
+  // left[0] | tmp[0]   tmp[1]  <->   err0 err1
+  // left[1] | tmp[2]   tmp[3]        err2 err3
+  //
+  // Final errors {err1,err2,err3} are preserved and later restored
+  // as top[]/left[] on the next block.
+  int ch;
+  for (ch = 0; ch <= 1; ++ch) {
+    const int8_t* const top = it->top_derr_[it->x_][ch];
+    const int8_t* const left = it->left_derr_[ch];
+    int16_t (* const c)[16] = &tmp[ch * 4];
+    int err0, err1, err2, err3;
+    c[0][0] += (C1 * top[0] + C2 * left[0]) >> (DSHIFT - DSCALE);
+    err0 = QuantizeSingle(&c[0][0], mtx);
+    c[1][0] += (C1 * top[1] + C2 * err0) >> (DSHIFT - DSCALE);
+    err1 = QuantizeSingle(&c[1][0], mtx);
+    c[2][0] += (C1 * err0 + C2 * left[1]) >> (DSHIFT - DSCALE);
+    err2 = QuantizeSingle(&c[2][0], mtx);
+    c[3][0] += (C1 * err1 + C2 * err2) >> (DSHIFT - DSCALE);
+    err3 = QuantizeSingle(&c[3][0], mtx);
+    // error 'err' is bounded by mtx->q_[0] which is 132 at max. Hence
+    // err >> DSCALE will fit in an int8_t type if DSCALE>=1.
+    assert(abs(err1) <= 127 && abs(err2) <= 127 && abs(err3) <= 127);
+    rd->derr[ch][0] = (int8_t)err1;
+    rd->derr[ch][1] = (int8_t)err2;
+    rd->derr[ch][2] = (int8_t)err3;
+  }
+}
+
+static void StoreDiffusionErrors(VP8EncIterator* const it,
+                                 const VP8ModeScore* const rd) {
+  int ch;
+  for (ch = 0; ch <= 1; ++ch) {
+    int8_t* const top = it->top_derr_[it->x_][ch];
+    int8_t* const left = it->left_derr_[ch];
+    left[0] = rd->derr[ch][0];            // restore err1
+    left[1] = 3 * rd->derr[ch][2] >> 2;   //     ... 3/4th of err3
+    top[0]  = rd->derr[ch][1];            //     ... err2
+    top[1]  = rd->derr[ch][2] - left[1];  //     ... 1/4th of err3.
+  }
+}
+
+#undef C1
+#undef C2
+#undef DSHIFT
+#undef DSCALE
+
+//------------------------------------------------------------------------------
+
+static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
+                         uint8_t* const yuv_out, int mode) {
+  const VP8Encoder* const enc = it->enc_;
+  const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
+  const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
+  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
+  int nz = 0;
+  int n;
+  int16_t tmp[8][16];
+
+  for (n = 0; n < 8; n += 2) {
+    VP8FTransform2(src + VP8ScanUV[n], ref + VP8ScanUV[n], tmp[n]);
+  }
+  if (it->top_derr_ != NULL) CorrectDCValues(it, &dqm->uv_, tmp, rd);
+
+  if (DO_TRELLIS_UV && it->do_trellis_) {
+    int ch, x, y;
+    for (ch = 0, n = 0; ch <= 2; ch += 2) {
+      for (y = 0; y < 2; ++y) {
+        for (x = 0; x < 2; ++x, ++n) {
+          const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
+          const int non_zero = TrellisQuantizeBlock(
+              enc, tmp[n], rd->uv_levels[n], ctx, TYPE_CHROMA_A, &dqm->uv_,
+              dqm->lambda_trellis_uv_);
+          it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] = non_zero;
+          nz |= non_zero << n;
+        }
+      }
+    }
+  } else {
+    for (n = 0; n < 8; n += 2) {
+      nz |= VP8EncQuantize2Blocks(tmp[n], rd->uv_levels[n], &dqm->uv_) << n;
+    }
+  }
+
+  for (n = 0; n < 8; n += 2) {
+    VP8ITransform(ref + VP8ScanUV[n], tmp[n], yuv_out + VP8ScanUV[n], 1);
+  }
+  return (nz << 16);
+}
+
+//------------------------------------------------------------------------------
+// RD-opt decision. Reconstruct each modes, evalue distortion and bit-cost.
+// Pick the mode is lower RD-cost = Rate + lambda * Distortion.
+
+static void StoreMaxDelta(VP8SegmentInfo* const dqm, const int16_t DCs[16]) {
+  // We look at the first three AC coefficients to determine what is the average
+  // delta between each sub-4x4 block.
+  const int v0 = abs(DCs[1]);
+  const int v1 = abs(DCs[2]);
+  const int v2 = abs(DCs[4]);
+  int max_v = (v1 > v0) ? v1 : v0;
+  max_v = (v2 > max_v) ? v2 : max_v;
+  if (max_v > dqm->max_edge_) dqm->max_edge_ = max_v;
+}
+
+static void SwapModeScore(VP8ModeScore** a, VP8ModeScore** b) {
+  VP8ModeScore* const tmp = *a;
+  *a = *b;
+  *b = tmp;
+}
+
+static void SwapPtr(uint8_t** a, uint8_t** b) {
+  uint8_t* const tmp = *a;
+  *a = *b;
+  *b = tmp;
+}
+
+static void SwapOut(VP8EncIterator* const it) {
+  SwapPtr(&it->yuv_out_, &it->yuv_out2_);
+}
+
+static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* rd) {
+  const int kNumBlocks = 16;
+  VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
+  const int lambda = dqm->lambda_i16_;
+  const int tlambda = dqm->tlambda_;
+  const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
+  VP8ModeScore rd_tmp;
+  VP8ModeScore* rd_cur = &rd_tmp;
+  VP8ModeScore* rd_best = rd;
+  int mode;
+  int is_flat = IsFlatSource16(it->yuv_in_ + Y_OFF_ENC);
+
+  rd->mode_i16 = -1;
+  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
+    uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF_ENC;  // scratch buffer
+    rd_cur->mode_i16 = mode;
+
+    // Reconstruct
+    rd_cur->nz = ReconstructIntra16(it, rd_cur, tmp_dst, mode);
+
+    // Measure RD-score
+    rd_cur->D = VP8SSE16x16(src, tmp_dst);
+    rd_cur->SD =
+        tlambda ? MULT_8B(tlambda, VP8TDisto16x16(src, tmp_dst, kWeightY)) : 0;
+    rd_cur->H = VP8FixedCostsI16[mode];
+    rd_cur->R = VP8GetCostLuma16(it, rd_cur);
+    if (is_flat) {
+      // refine the first impression (which was in pixel space)
+      is_flat = IsFlat(rd_cur->y_ac_levels[0], kNumBlocks, FLATNESS_LIMIT_I16);
+      if (is_flat) {
+        // Block is very flat. We put emphasis on the distortion being very low!
+        rd_cur->D *= 2;
+        rd_cur->SD *= 2;
+      }
+    }
+
+    // Since we always examine Intra16 first, we can overwrite *rd directly.
+    SetRDScore(lambda, rd_cur);
+    if (mode == 0 || rd_cur->score < rd_best->score) {
+      SwapModeScore(&rd_cur, &rd_best);
+      SwapOut(it);
+    }
+  }
+  if (rd_best != rd) {
+    memcpy(rd, rd_best, sizeof(*rd));
+  }
+  SetRDScore(dqm->lambda_mode_, rd);   // finalize score for mode decision.
+  VP8SetIntra16Mode(it, rd->mode_i16);
+
+  // we have a blocky macroblock (only DCs are non-zero) with fairly high
+  // distortion, record max delta so we can later adjust the minimal filtering
+  // strength needed to smooth these blocks out.
+  if ((rd->nz & 0x100ffff) == 0x1000000 && rd->D > dqm->min_disto_) {
+    StoreMaxDelta(dqm, rd->y_dc_levels);
+  }
+}
+
+//------------------------------------------------------------------------------
+
+// return the cost array corresponding to the surrounding prediction modes.
+static const uint16_t* GetCostModeI4(VP8EncIterator* const it,
+                                     const uint8_t modes[16]) {
+  const int preds_w = it->enc_->preds_w_;
+  const int x = (it->i4_ & 3), y = it->i4_ >> 2;
+  const int left = (x == 0) ? it->preds_[y * preds_w - 1] : modes[it->i4_ - 1];
+  const int top = (y == 0) ? it->preds_[-preds_w + x] : modes[it->i4_ - 4];
+  return VP8FixedCostsI4[top][left];
+}
+
+static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
+  const VP8Encoder* const enc = it->enc_;
+  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
+  const int lambda = dqm->lambda_i4_;
+  const int tlambda = dqm->tlambda_;
+  const uint8_t* const src0 = it->yuv_in_ + Y_OFF_ENC;
+  uint8_t* const best_blocks = it->yuv_out2_ + Y_OFF_ENC;
+  int total_header_bits = 0;
+  VP8ModeScore rd_best;
+
+  if (enc->max_i4_header_bits_ == 0) {
+    return 0;
+  }
+
+  InitScore(&rd_best);
+  rd_best.H = 211;  // '211' is the value of VP8BitCost(0, 145)
+  SetRDScore(dqm->lambda_mode_, &rd_best);
+  VP8IteratorStartI4(it);
+  do {
+    const int kNumBlocks = 1;
+    VP8ModeScore rd_i4;
+    int mode;
+    int best_mode = -1;
+    const uint8_t* const src = src0 + VP8Scan[it->i4_];
+    const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);
+    uint8_t* best_block = best_blocks + VP8Scan[it->i4_];
+    uint8_t* tmp_dst = it->yuv_p_ + I4TMP;    // scratch buffer.
+
+    InitScore(&rd_i4);
+    VP8MakeIntra4Preds(it);
+    for (mode = 0; mode < NUM_BMODES; ++mode) {
+      VP8ModeScore rd_tmp;
+      int16_t tmp_levels[16];
+
+      // Reconstruct
+      rd_tmp.nz =
+          ReconstructIntra4(it, tmp_levels, src, tmp_dst, mode) << it->i4_;
+
+      // Compute RD-score
+      rd_tmp.D = VP8SSE4x4(src, tmp_dst);
+      rd_tmp.SD =
+          tlambda ? MULT_8B(tlambda, VP8TDisto4x4(src, tmp_dst, kWeightY))
+                  : 0;
+      rd_tmp.H = mode_costs[mode];
+
+      // Add flatness penalty, to avoid flat area to be mispredicted
+      // by a complex mode.
+      if (mode > 0 && IsFlat(tmp_levels, kNumBlocks, FLATNESS_LIMIT_I4)) {
+        rd_tmp.R = FLATNESS_PENALTY * kNumBlocks;
+      } else {
+        rd_tmp.R = 0;
+      }
+
+      // early-out check
+      SetRDScore(lambda, &rd_tmp);
+      if (best_mode >= 0 && rd_tmp.score >= rd_i4.score) continue;
+
+      // finish computing score
+      rd_tmp.R += VP8GetCostLuma4(it, tmp_levels);
+      SetRDScore(lambda, &rd_tmp);
+
+      if (best_mode < 0 || rd_tmp.score < rd_i4.score) {
+        CopyScore(&rd_i4, &rd_tmp);
+        best_mode = mode;
+        SwapPtr(&tmp_dst, &best_block);
+        memcpy(rd_best.y_ac_levels[it->i4_], tmp_levels,
+               sizeof(rd_best.y_ac_levels[it->i4_]));
+      }
+    }
+    SetRDScore(dqm->lambda_mode_, &rd_i4);
+    AddScore(&rd_best, &rd_i4);
+    if (rd_best.score >= rd->score) {
+      return 0;
+    }
+    total_header_bits += (int)rd_i4.H;   // <- equal to mode_costs[best_mode];
+    if (total_header_bits > enc->max_i4_header_bits_) {
+      return 0;
+    }
+    // Copy selected samples if not in the right place already.
+    if (best_block != best_blocks + VP8Scan[it->i4_]) {
+      VP8Copy4x4(best_block, best_blocks + VP8Scan[it->i4_]);
+    }
+    rd->modes_i4[it->i4_] = best_mode;
+    it->top_nz_[it->i4_ & 3] = it->left_nz_[it->i4_ >> 2] = (rd_i4.nz ? 1 : 0);
+  } while (VP8IteratorRotateI4(it, best_blocks));
+
+  // finalize state
+  CopyScore(rd, &rd_best);
+  VP8SetIntra4Mode(it, rd->modes_i4);
+  SwapOut(it);
+  memcpy(rd->y_ac_levels, rd_best.y_ac_levels, sizeof(rd->y_ac_levels));
+  return 1;   // select intra4x4 over intra16x16
+}
+
+//------------------------------------------------------------------------------
+
+static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
+  const int kNumBlocks = 8;
+  const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
+  const int lambda = dqm->lambda_uv_;
+  const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
+  uint8_t* tmp_dst = it->yuv_out2_ + U_OFF_ENC;  // scratch buffer
+  uint8_t* dst0 = it->yuv_out_ + U_OFF_ENC;
+  uint8_t* dst = dst0;
+  VP8ModeScore rd_best;
+  int mode;
+
+  rd->mode_uv = -1;
+  InitScore(&rd_best);
+  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
+    VP8ModeScore rd_uv;
+
+    // Reconstruct
+    rd_uv.nz = ReconstructUV(it, &rd_uv, tmp_dst, mode);
+
+    // Compute RD-score
+    rd_uv.D  = VP8SSE16x8(src, tmp_dst);
+    rd_uv.SD = 0;    // not calling TDisto here: it tends to flatten areas.
+    rd_uv.H  = VP8FixedCostsUV[mode];
+    rd_uv.R  = VP8GetCostUV(it, &rd_uv);
+    if (mode > 0 && IsFlat(rd_uv.uv_levels[0], kNumBlocks, FLATNESS_LIMIT_UV)) {
+      rd_uv.R += FLATNESS_PENALTY * kNumBlocks;
+    }
+
+    SetRDScore(lambda, &rd_uv);
+    if (mode == 0 || rd_uv.score < rd_best.score) {
+      CopyScore(&rd_best, &rd_uv);
+      rd->mode_uv = mode;
+      memcpy(rd->uv_levels, rd_uv.uv_levels, sizeof(rd->uv_levels));
+      if (it->top_derr_ != NULL) {
+        memcpy(rd->derr, rd_uv.derr, sizeof(rd_uv.derr));
+      }
+      SwapPtr(&dst, &tmp_dst);
+    }
+  }
+  VP8SetIntraUVMode(it, rd->mode_uv);
+  AddScore(rd, &rd_best);
+  if (dst != dst0) {   // copy 16x8 block if needed
+    VP8Copy16x8(dst, dst0);
+  }
+  if (it->top_derr_ != NULL) {  // store diffusion errors for next block
+    StoreDiffusionErrors(it, rd);
+  }
+}
+
+//------------------------------------------------------------------------------
+// Final reconstruction and quantization.
+
+static void SimpleQuantize(VP8EncIterator* const it, VP8ModeScore* const rd) {
+  const VP8Encoder* const enc = it->enc_;
+  const int is_i16 = (it->mb_->type_ == 1);
+  int nz = 0;
+
+  if (is_i16) {
+    nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF_ENC, it->preds_[0]);
+  } else {
+    VP8IteratorStartI4(it);
+    do {
+      const int mode =
+          it->preds_[(it->i4_ & 3) + (it->i4_ >> 2) * enc->preds_w_];
+      const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
+      uint8_t* const dst = it->yuv_out_ + Y_OFF_ENC + VP8Scan[it->i4_];
+      VP8MakeIntra4Preds(it);
+      nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
+                              src, dst, mode) << it->i4_;
+    } while (VP8IteratorRotateI4(it, it->yuv_out_ + Y_OFF_ENC));
+  }
+
+  nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF_ENC, it->mb_->uv_mode_);
+  rd->nz = nz;
+}
+
+// Refine intra16/intra4 sub-modes based on distortion only (not rate).
+static void RefineUsingDistortion(VP8EncIterator* const it,
+                                  int try_both_modes, int refine_uv_mode,
+                                  VP8ModeScore* const rd) {
+  score_t best_score = MAX_COST;
+  int nz = 0;
+  int mode;
+  int is_i16 = try_both_modes || (it->mb_->type_ == 1);
+
+  const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
+  // Some empiric constants, of approximate order of magnitude.
+  const int lambda_d_i16 = 106;
+  const int lambda_d_i4 = 11;
+  const int lambda_d_uv = 120;
+  score_t score_i4 = dqm->i4_penalty_;
+  score_t i4_bit_sum = 0;
+  const score_t bit_limit = try_both_modes ? it->enc_->mb_header_limit_
+                                           : MAX_COST;  // no early-out allowed
+
+  if (is_i16) {   // First, evaluate Intra16 distortion
+    int best_mode = -1;
+    const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
+    for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
+      const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
+      const score_t score = (score_t)VP8SSE16x16(src, ref) * RD_DISTO_MULT
+                          + VP8FixedCostsI16[mode] * lambda_d_i16;
+      if (mode > 0 && VP8FixedCostsI16[mode] > bit_limit) {
+        continue;
+      }
+
+      if (score < best_score) {
+        best_mode = mode;
+        best_score = score;
+      }
+    }
+    if (it->x_ == 0 || it->y_ == 0) {
+      // avoid starting a checkerboard resonance from the border. See bug #432.
+      if (IsFlatSource16(src)) {
+        best_mode = (it->x_ == 0) ? 0 : 2;
+        try_both_modes = 0;  // stick to i16
+      }
+    }
+    VP8SetIntra16Mode(it, best_mode);
+    // we'll reconstruct later, if i16 mode actually gets selected
+  }
+
+  // Next, evaluate Intra4
+  if (try_both_modes || !is_i16) {
+    // We don't evaluate the rate here, but just account for it through a
+    // constant penalty (i4 mode usually needs more bits compared to i16).
+    is_i16 = 0;
+    VP8IteratorStartI4(it);
+    do {
+      int best_i4_mode = -1;
+      score_t best_i4_score = MAX_COST;
+      const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
+      const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);
+
+      VP8MakeIntra4Preds(it);
+      for (mode = 0; mode < NUM_BMODES; ++mode) {
+        const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
+        const score_t score = VP8SSE4x4(src, ref) * RD_DISTO_MULT
+                            + mode_costs[mode] * lambda_d_i4;
+        if (score < best_i4_score) {
+          best_i4_mode = mode;
+          best_i4_score = score;
+        }
+      }
+      i4_bit_sum += mode_costs[best_i4_mode];
+      rd->modes_i4[it->i4_] = best_i4_mode;
+      score_i4 += best_i4_score;
+      if (score_i4 >= best_score || i4_bit_sum > bit_limit) {
+        // Intra4 won't be better than Intra16. Bail out and pick Intra16.
+        is_i16 = 1;
+        break;
+      } else {  // reconstruct partial block inside yuv_out2_ buffer
+        uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF_ENC + VP8Scan[it->i4_];
+        nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
+                                src, tmp_dst, best_i4_mode) << it->i4_;
+      }
+    } while (VP8IteratorRotateI4(it, it->yuv_out2_ + Y_OFF_ENC));
+  }
+
+  // Final reconstruction, depending on which mode is selected.
+  if (!is_i16) {
+    VP8SetIntra4Mode(it, rd->modes_i4);
+    SwapOut(it);
+    best_score = score_i4;
+  } else {
+    nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF_ENC, it->preds_[0]);
+  }
+
+  // ... and UV!
+  if (refine_uv_mode) {
+    int best_mode = -1;
+    score_t best_uv_score = MAX_COST;
+    const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
+    for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
+      const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
+      const score_t score = VP8SSE16x8(src, ref) * RD_DISTO_MULT
+                          + VP8FixedCostsUV[mode] * lambda_d_uv;
+      if (score < best_uv_score) {
+        best_mode = mode;
+        best_uv_score = score;
+      }
+    }
+    VP8SetIntraUVMode(it, best_mode);
+  }
+  nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF_ENC, it->mb_->uv_mode_);
+
+  rd->nz = nz;
+  rd->score = best_score;
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd,
+                VP8RDLevel rd_opt) {
+  int is_skipped;
+  const int method = it->enc_->method_;
+
+  InitScore(rd);
+
+  // We can perform predictions for Luma16x16 and Chroma8x8 already.
+  // Luma4x4 predictions needs to be done as-we-go.
+  VP8MakeLuma16Preds(it);
+  VP8MakeChroma8Preds(it);
+
+  if (rd_opt > RD_OPT_NONE) {
+    it->do_trellis_ = (rd_opt >= RD_OPT_TRELLIS_ALL);
+    PickBestIntra16(it, rd);
+    if (method >= 2) {
+      PickBestIntra4(it, rd);
+    }
+    PickBestUV(it, rd);
+    if (rd_opt == RD_OPT_TRELLIS) {   // finish off with trellis-optim now
+      it->do_trellis_ = 1;
+      SimpleQuantize(it, rd);
+    }
+  } else {
+    // At this point we have heuristically decided intra16 / intra4.
+    // For method >= 2, pick the best intra4/intra16 based on SSE (~tad slower).
+    // For method <= 1, we don't re-examine the decision but just go ahead with
+    // quantization/reconstruction.
+    RefineUsingDistortion(it, (method >= 2), (method >= 1), rd);
+  }
+  is_skipped = (rd->nz == 0);
+  VP8SetSkip(it, is_skipped);
+  return is_skipped;
+}
diff --git a/media/libwebp/enc/syntax_enc.c b/media/libwebp/enc/syntax_enc.c
new file mode 100644
index 0000000000..28fd1f1ee0
--- /dev/null
+++ b/media/libwebp/enc/syntax_enc.c
@@ -0,0 +1,388 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Header syntax writing
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <assert.h>
+
+#include "../utils/utils.h"
+#include "../webp/format_constants.h"  // RIFF constants
+#include "../webp/mux_types.h"         // ALPHA_FLAG
+#include "../enc/vp8i_enc.h"
+
+//------------------------------------------------------------------------------
+// Helper functions
+
+static int IsVP8XNeeded(const VP8Encoder* const enc) {
+  return !!enc->has_alpha_;  // Currently the only case when VP8X is needed.
+                             // This could change in the future.
+}
+
+static int PutPaddingByte(const WebPPicture* const pic) {
+  const uint8_t pad_byte[1] = { 0 };
+  return !!pic->writer(pad_byte, 1, pic);
+}
+
+//------------------------------------------------------------------------------
+// Writers for header's various pieces (in order of appearance)
+
+static WebPEncodingError PutRIFFHeader(const VP8Encoder* const enc,
+                                       size_t riff_size) {
+  const WebPPicture* const pic = enc->pic_;
+  uint8_t riff[RIFF_HEADER_SIZE] = {
+    'R', 'I', 'F', 'F', 0, 0, 0, 0, 'W', 'E', 'B', 'P'
+  };
+  assert(riff_size == (uint32_t)riff_size);
+  PutLE32(riff + TAG_SIZE, (uint32_t)riff_size);
+  if (!pic->writer(riff, sizeof(riff), pic)) {
+    return VP8_ENC_ERROR_BAD_WRITE;
+  }
+  return VP8_ENC_OK;
+}
+
+static WebPEncodingError PutVP8XHeader(const VP8Encoder* const enc) {
+  const WebPPicture* const pic = enc->pic_;
+  uint8_t vp8x[CHUNK_HEADER_SIZE + VP8X_CHUNK_SIZE] = {
+    'V', 'P', '8', 'X'
+  };
+  uint32_t flags = 0;
+
+  assert(IsVP8XNeeded(enc));
+  assert(pic->width >= 1 && pic->height >= 1);
+  assert(pic->width <= MAX_CANVAS_SIZE && pic->height <= MAX_CANVAS_SIZE);
+
+  if (enc->has_alpha_) {
+    flags |= ALPHA_FLAG;
+  }
+
+  PutLE32(vp8x + TAG_SIZE,              VP8X_CHUNK_SIZE);
+  PutLE32(vp8x + CHUNK_HEADER_SIZE,     flags);
+  PutLE24(vp8x + CHUNK_HEADER_SIZE + 4, pic->width - 1);
+  PutLE24(vp8x + CHUNK_HEADER_SIZE + 7, pic->height - 1);
+  if (!pic->writer(vp8x, sizeof(vp8x), pic)) {
+    return VP8_ENC_ERROR_BAD_WRITE;
+  }
+  return VP8_ENC_OK;
+}
+
+static WebPEncodingError PutAlphaChunk(const VP8Encoder* const enc) {
+  const WebPPicture* const pic = enc->pic_;
+  uint8_t alpha_chunk_hdr[CHUNK_HEADER_SIZE] = {
+    'A', 'L', 'P', 'H'
+  };
+
+  assert(enc->has_alpha_);
+
+  // Alpha chunk header.
+  PutLE32(alpha_chunk_hdr + TAG_SIZE, enc->alpha_data_size_);
+  if (!pic->writer(alpha_chunk_hdr, sizeof(alpha_chunk_hdr), pic)) {
+    return VP8_ENC_ERROR_BAD_WRITE;
+  }
+
+  // Alpha chunk data.
+  if (!pic->writer(enc->alpha_data_, enc->alpha_data_size_, pic)) {
+    return VP8_ENC_ERROR_BAD_WRITE;
+  }
+
+  // Padding.
+  if ((enc->alpha_data_size_ & 1) && !PutPaddingByte(pic)) {
+    return VP8_ENC_ERROR_BAD_WRITE;
+  }
+  return VP8_ENC_OK;
+}
+
+static WebPEncodingError PutVP8Header(const WebPPicture* const pic,
+                                      size_t vp8_size) {
+  uint8_t vp8_chunk_hdr[CHUNK_HEADER_SIZE] = {
+    'V', 'P', '8', ' '
+  };
+  assert(vp8_size == (uint32_t)vp8_size);
+  PutLE32(vp8_chunk_hdr + TAG_SIZE, (uint32_t)vp8_size);
+  if (!pic->writer(vp8_chunk_hdr, sizeof(vp8_chunk_hdr), pic)) {
+    return VP8_ENC_ERROR_BAD_WRITE;
+  }
+  return VP8_ENC_OK;
+}
+
+static WebPEncodingError PutVP8FrameHeader(const WebPPicture* const pic,
+                                           int profile, size_t size0) {
+  uint8_t vp8_frm_hdr[VP8_FRAME_HEADER_SIZE];
+  uint32_t bits;
+
+  if (size0 >= VP8_MAX_PARTITION0_SIZE) {  // partition #0 is too big to fit
+    return VP8_ENC_ERROR_PARTITION0_OVERFLOW;
+  }
+
+  // Paragraph 9.1.
+  bits = 0                         // keyframe (1b)
+       | (profile << 1)            // profile (3b)
+       | (1 << 4)                  // visible (1b)
+       | ((uint32_t)size0 << 5);   // partition length (19b)
+  vp8_frm_hdr[0] = (bits >>  0) & 0xff;
+  vp8_frm_hdr[1] = (bits >>  8) & 0xff;
+  vp8_frm_hdr[2] = (bits >> 16) & 0xff;
+  // signature
+  vp8_frm_hdr[3] = (VP8_SIGNATURE >> 16) & 0xff;
+  vp8_frm_hdr[4] = (VP8_SIGNATURE >>  8) & 0xff;
+  vp8_frm_hdr[5] = (VP8_SIGNATURE >>  0) & 0xff;
+  // dimensions
+  vp8_frm_hdr[6] = pic->width & 0xff;
+  vp8_frm_hdr[7] = pic->width >> 8;
+  vp8_frm_hdr[8] = pic->height & 0xff;
+  vp8_frm_hdr[9] = pic->height >> 8;
+
+  if (!pic->writer(vp8_frm_hdr, sizeof(vp8_frm_hdr), pic)) {
+    return VP8_ENC_ERROR_BAD_WRITE;
+  }
+  return VP8_ENC_OK;
+}
+
+// WebP Headers.
+static int PutWebPHeaders(const VP8Encoder* const enc, size_t size0,
+                          size_t vp8_size, size_t riff_size) {
+  WebPPicture* const pic = enc->pic_;
+  WebPEncodingError err = VP8_ENC_OK;
+
+  // RIFF header.
+  err = PutRIFFHeader(enc, riff_size);
+  if (err != VP8_ENC_OK) goto Error;
+
+  // VP8X.
+  if (IsVP8XNeeded(enc)) {
+    err = PutVP8XHeader(enc);
+    if (err != VP8_ENC_OK) goto Error;
+  }
+
+  // Alpha.
+  if (enc->has_alpha_) {
+    err = PutAlphaChunk(enc);
+    if (err != VP8_ENC_OK) goto Error;
+  }
+
+  // VP8 header.
+  err = PutVP8Header(pic, vp8_size);
+  if (err != VP8_ENC_OK) goto Error;
+
+  // VP8 frame header.
+  err = PutVP8FrameHeader(pic, enc->profile_, size0);
+  if (err != VP8_ENC_OK) goto Error;
+
+  // All OK.
+  return 1;
+
+  // Error.
+ Error:
+  return WebPEncodingSetError(pic, err);
+}
+
+// Segmentation header
+static void PutSegmentHeader(VP8BitWriter* const bw,
+                             const VP8Encoder* const enc) {
+  const VP8EncSegmentHeader* const hdr = &enc->segment_hdr_;
+  const VP8EncProba* const proba = &enc->proba_;
+  if (VP8PutBitUniform(bw, (hdr->num_segments_ > 1))) {
+    // We always 'update' the quant and filter strength values
+    const int update_data = 1;
+    int s;
+    VP8PutBitUniform(bw, hdr->update_map_);
+    if (VP8PutBitUniform(bw, update_data)) {
+      // we always use absolute values, not relative ones
+      VP8PutBitUniform(bw, 1);   // (segment_feature_mode = 1. Paragraph 9.3.)
+      for (s = 0; s < NUM_MB_SEGMENTS; ++s) {
+        VP8PutSignedBits(bw, enc->dqm_[s].quant_, 7);
+      }
+      for (s = 0; s < NUM_MB_SEGMENTS; ++s) {
+        VP8PutSignedBits(bw, enc->dqm_[s].fstrength_, 6);
+      }
+    }
+    if (hdr->update_map_) {
+      for (s = 0; s < 3; ++s) {
+        if (VP8PutBitUniform(bw, (proba->segments_[s] != 255u))) {
+          VP8PutBits(bw, proba->segments_[s], 8);
+        }
+      }
+    }
+  }
+}
+
+// Filtering parameters header
+static void PutFilterHeader(VP8BitWriter* const bw,
+                            const VP8EncFilterHeader* const hdr) {
+  const int use_lf_delta = (hdr->i4x4_lf_delta_ != 0);
+  VP8PutBitUniform(bw, hdr->simple_);
+  VP8PutBits(bw, hdr->level_, 6);
+  VP8PutBits(bw, hdr->sharpness_, 3);
+  if (VP8PutBitUniform(bw, use_lf_delta)) {
+    // '0' is the default value for i4x4_lf_delta_ at frame #0.
+    const int need_update = (hdr->i4x4_lf_delta_ != 0);
+    if (VP8PutBitUniform(bw, need_update)) {
+      // we don't use ref_lf_delta => emit four 0 bits
+      VP8PutBits(bw, 0, 4);
+      // we use mode_lf_delta for i4x4
+      VP8PutSignedBits(bw, hdr->i4x4_lf_delta_, 6);
+      VP8PutBits(bw, 0, 3);    // all others unused
+    }
+  }
+}
+
+// Nominal quantization parameters
+static void PutQuant(VP8BitWriter* const bw,
+                     const VP8Encoder* const enc) {
+  VP8PutBits(bw, enc->base_quant_, 7);
+  VP8PutSignedBits(bw, enc->dq_y1_dc_, 4);
+  VP8PutSignedBits(bw, enc->dq_y2_dc_, 4);
+  VP8PutSignedBits(bw, enc->dq_y2_ac_, 4);
+  VP8PutSignedBits(bw, enc->dq_uv_dc_, 4);
+  VP8PutSignedBits(bw, enc->dq_uv_ac_, 4);
+}
+
+// Partition sizes
+static int EmitPartitionsSize(const VP8Encoder* const enc,
+                              WebPPicture* const pic) {
+  uint8_t buf[3 * (MAX_NUM_PARTITIONS - 1)];
+  int p;
+  for (p = 0; p < enc->num_parts_ - 1; ++p) {
+    const size_t part_size = VP8BitWriterSize(enc->parts_ + p);
+    if (part_size >= VP8_MAX_PARTITION_SIZE) {
+      return WebPEncodingSetError(pic, VP8_ENC_ERROR_PARTITION_OVERFLOW);
+    }
+    buf[3 * p + 0] = (part_size >>  0) & 0xff;
+    buf[3 * p + 1] = (part_size >>  8) & 0xff;
+    buf[3 * p + 2] = (part_size >> 16) & 0xff;
+  }
+  return p ? pic->writer(buf, 3 * p, pic) : 1;
+}
+
+//------------------------------------------------------------------------------
+
+static int GeneratePartition0(VP8Encoder* const enc) {
+  VP8BitWriter* const bw = &enc->bw_;
+  const int mb_size = enc->mb_w_ * enc->mb_h_;
+  uint64_t pos1, pos2, pos3;
+
+  pos1 = VP8BitWriterPos(bw);
+  if (!VP8BitWriterInit(bw, mb_size * 7 / 8)) {        // ~7 bits per macroblock
+    return WebPEncodingSetError(enc->pic_, VP8_ENC_ERROR_OUT_OF_MEMORY);
+  }
+  VP8PutBitUniform(bw, 0);   // colorspace
+  VP8PutBitUniform(bw, 0);   // clamp type
+
+  PutSegmentHeader(bw, enc);
+  PutFilterHeader(bw, &enc->filter_hdr_);
+  VP8PutBits(bw, enc->num_parts_ == 8 ? 3 :
+                 enc->num_parts_ == 4 ? 2 :
+                 enc->num_parts_ == 2 ? 1 : 0, 2);
+  PutQuant(bw, enc);
+  VP8PutBitUniform(bw, 0);   // no proba update
+  VP8WriteProbas(bw, &enc->proba_);
+  pos2 = VP8BitWriterPos(bw);
+  VP8CodeIntraModes(enc);
+  VP8BitWriterFinish(bw);
+
+  pos3 = VP8BitWriterPos(bw);
+
+#if !defined(WEBP_DISABLE_STATS)
+  if (enc->pic_->stats) {
+    enc->pic_->stats->header_bytes[0] = (int)((pos2 - pos1 + 7) >> 3);
+    enc->pic_->stats->header_bytes[1] = (int)((pos3 - pos2 + 7) >> 3);
+    enc->pic_->stats->alpha_data_size = (int)enc->alpha_data_size_;
+  }
+#else
+  (void)pos1;
+  (void)pos2;
+  (void)pos3;
+#endif
+  if (bw->error_) {
+    return WebPEncodingSetError(enc->pic_, VP8_ENC_ERROR_OUT_OF_MEMORY);
+  }
+  return 1;
+}
+
+void VP8EncFreeBitWriters(VP8Encoder* const enc) {
+  int p;
+  VP8BitWriterWipeOut(&enc->bw_);
+  for (p = 0; p < enc->num_parts_; ++p) {
+    VP8BitWriterWipeOut(enc->parts_ + p);
+  }
+}
+
+int VP8EncWrite(VP8Encoder* const enc) {
+  WebPPicture* const pic = enc->pic_;
+  VP8BitWriter* const bw = &enc->bw_;
+  const int task_percent = 19;
+  const int percent_per_part = task_percent / enc->num_parts_;
+  const int final_percent = enc->percent_ + task_percent;
+  int ok = 0;
+  size_t vp8_size, pad, riff_size;
+  int p;
+
+  // Partition #0 with header and partition sizes
+  ok = GeneratePartition0(enc);
+  if (!ok) return 0;
+
+  // Compute VP8 size
+  vp8_size = VP8_FRAME_HEADER_SIZE +
+             VP8BitWriterSize(bw) +
+             3 * (enc->num_parts_ - 1);
+  for (p = 0; p < enc->num_parts_; ++p) {
+    vp8_size += VP8BitWriterSize(enc->parts_ + p);
+  }
+  pad = vp8_size & 1;
+  vp8_size += pad;
+
+  // Compute RIFF size
+  // At the minimum it is: "WEBPVP8 nnnn" + VP8 data size.
+  riff_size = TAG_SIZE + CHUNK_HEADER_SIZE + vp8_size;
+  if (IsVP8XNeeded(enc)) {  // Add size for: VP8X header + data.
+    riff_size += CHUNK_HEADER_SIZE + VP8X_CHUNK_SIZE;
+  }
+  if (enc->has_alpha_) {  // Add size for: ALPH header + data.
+    const uint32_t padded_alpha_size = enc->alpha_data_size_ +
+                                       (enc->alpha_data_size_ & 1);
+    riff_size += CHUNK_HEADER_SIZE + padded_alpha_size;
+  }
+  // RIFF size should fit in 32-bits.
+  if (riff_size > 0xfffffffeU) {
+    return WebPEncodingSetError(pic, VP8_ENC_ERROR_FILE_TOO_BIG);
+  }
+
+  // Emit headers and partition #0
+  {
+    const uint8_t* const part0 = VP8BitWriterBuf(bw);
+    const size_t size0 = VP8BitWriterSize(bw);
+    ok = ok && PutWebPHeaders(enc, size0, vp8_size, riff_size)
+            && pic->writer(part0, size0, pic)
+            && EmitPartitionsSize(enc, pic);
+    VP8BitWriterWipeOut(bw);    // will free the internal buffer.
+  }
+
+  // Token partitions
+  for (p = 0; p < enc->num_parts_; ++p) {
+    const uint8_t* const buf = VP8BitWriterBuf(enc->parts_ + p);
+    const size_t size = VP8BitWriterSize(enc->parts_ + p);
+    if (size) ok = ok && pic->writer(buf, size, pic);
+    VP8BitWriterWipeOut(enc->parts_ + p);    // will free the internal buffer.
+    ok = ok && WebPReportProgress(pic, enc->percent_ + percent_per_part,
+                                  &enc->percent_);
+  }
+
+  // Padding byte
+  if (ok && pad) {
+    ok = PutPaddingByte(pic);
+  }
+
+  enc->coded_size_ = (int)(CHUNK_HEADER_SIZE + riff_size);
+  ok = ok && WebPReportProgress(pic, final_percent, &enc->percent_);
+  return ok;
+}
+
+//------------------------------------------------------------------------------
+
diff --git a/media/libwebp/enc/token_enc.c b/media/libwebp/enc/token_enc.c
new file mode 100644
index 0000000000..52711eb782
--- /dev/null
+++ b/media/libwebp/enc/token_enc.c
@@ -0,0 +1,262 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Paginated token buffer
+//
+//  A 'token' is a bit value associated with a probability, either fixed
+// or a later-to-be-determined after statistics have been collected.
+// For dynamic probability, we just record the slot id (idx) for the probability
+// value in the final probability array (uint8_t* probas in VP8EmitTokens).
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "../enc/cost_enc.h"
+#include "../enc/vp8i_enc.h"
+#include "../utils/utils.h"
+
+#if !defined(DISABLE_TOKEN_BUFFER)
+
+// we use pages to reduce the number of memcpy()
+#define MIN_PAGE_SIZE 8192          // minimum number of token per page
+#define FIXED_PROBA_BIT (1u << 14)
+
+typedef uint16_t token_t;  // bit #15: bit value
+                           // bit #14: flags for constant proba or idx
+                           // bits #0..13: slot or constant proba
+struct VP8Tokens {
+  VP8Tokens* next_;        // pointer to next page
+};
+// Token data is located in memory just after the next_ field.
+// This macro is used to return their address and hide the trick.
+#define TOKEN_DATA(p) ((const token_t*)&(p)[1])
+
+//------------------------------------------------------------------------------
+
+void VP8TBufferInit(VP8TBuffer* const b, int page_size) {
+  b->tokens_ = NULL;
+  b->pages_ = NULL;
+  b->last_page_ = &b->pages_;
+  b->left_ = 0;
+  b->page_size_ = (page_size < MIN_PAGE_SIZE) ? MIN_PAGE_SIZE : page_size;
+  b->error_ = 0;
+}
+
+void VP8TBufferClear(VP8TBuffer* const b) {
+  if (b != NULL) {
+    VP8Tokens* p = b->pages_;
+    while (p != NULL) {
+      VP8Tokens* const next = p->next_;
+      WebPSafeFree(p);
+      p = next;
+    }
+    VP8TBufferInit(b, b->page_size_);
+  }
+}
+
+static int TBufferNewPage(VP8TBuffer* const b) {
+  VP8Tokens* page = NULL;
+  if (!b->error_) {
+    const size_t size = sizeof(*page) + b->page_size_ * sizeof(token_t);
+    page = (VP8Tokens*)WebPSafeMalloc(1ULL, size);
+  }
+  if (page == NULL) {
+    b->error_ = 1;
+    return 0;
+  }
+  page->next_ = NULL;
+
+  *b->last_page_ = page;
+  b->last_page_ = &page->next_;
+  b->left_ = b->page_size_;
+  b->tokens_ = (token_t*)TOKEN_DATA(page);
+  return 1;
+}
+
+//------------------------------------------------------------------------------
+
+#define TOKEN_ID(t, b, ctx) \
+    (NUM_PROBAS * ((ctx) + NUM_CTX * ((b) + NUM_BANDS * (t))))
+
+static WEBP_INLINE uint32_t AddToken(VP8TBuffer* const b, uint32_t bit,
+                                     uint32_t proba_idx,
+                                     proba_t* const stats) {
+  assert(proba_idx < FIXED_PROBA_BIT);
+  assert(bit <= 1);
+  if (b->left_ > 0 || TBufferNewPage(b)) {
+    const int slot = --b->left_;
+    b->tokens_[slot] = (bit << 15) | proba_idx;
+  }
+  VP8RecordStats(bit, stats);
+  return bit;
+}
+
+static WEBP_INLINE void AddConstantToken(VP8TBuffer* const b,
+                                         uint32_t bit, uint32_t proba) {
+  assert(proba < 256);
+  assert(bit <= 1);
+  if (b->left_ > 0 || TBufferNewPage(b)) {
+    const int slot = --b->left_;
+    b->tokens_[slot] = (bit << 15) | FIXED_PROBA_BIT | proba;
+  }
+}
+
+int VP8RecordCoeffTokens(int ctx, const struct VP8Residual* const res,
+                         VP8TBuffer* const tokens) {
+  const int16_t* const coeffs = res->coeffs;
+  const int coeff_type = res->coeff_type;
+  const int last = res->last;
+  int n = res->first;
+  uint32_t base_id = TOKEN_ID(coeff_type, n, ctx);
+  // should be stats[VP8EncBands[n]], but it's equivalent for n=0 or 1
+  proba_t* s = res->stats[n][ctx];
+  if (!AddToken(tokens, last >= 0, base_id + 0, s + 0)) {
+    return 0;
+  }
+
+  while (n < 16) {
+    const int c = coeffs[n++];
+    const int sign = c < 0;
+    const uint32_t v = sign ? -c : c;
+    if (!AddToken(tokens, v != 0, base_id + 1, s + 1)) {
+      base_id = TOKEN_ID(coeff_type, VP8EncBands[n], 0);  // ctx=0
+      s = res->stats[VP8EncBands[n]][0];
+      continue;
+    }
+    if (!AddToken(tokens, v > 1, base_id + 2, s + 2)) {
+      base_id = TOKEN_ID(coeff_type, VP8EncBands[n], 1);  // ctx=1
+      s = res->stats[VP8EncBands[n]][1];
+    } else {
+      if (!AddToken(tokens, v > 4, base_id + 3, s + 3)) {
+        if (AddToken(tokens, v != 2, base_id + 4, s + 4)) {
+          AddToken(tokens, v == 4, base_id + 5, s + 5);
+        }
+      } else if (!AddToken(tokens, v > 10, base_id + 6, s + 6)) {
+        if (!AddToken(tokens, v > 6, base_id + 7, s + 7)) {
+          AddConstantToken(tokens, v == 6, 159);
+        } else {
+          AddConstantToken(tokens, v >= 9, 165);
+          AddConstantToken(tokens, !(v & 1), 145);
+        }
+      } else {
+        int mask;
+        const uint8_t* tab;
+        uint32_t residue = v - 3;
+        if (residue < (8 << 1)) {          // VP8Cat3  (3b)
+          AddToken(tokens, 0, base_id + 8, s + 8);
+          AddToken(tokens, 0, base_id + 9, s + 9);
+          residue -= (8 << 0);
+          mask = 1 << 2;
+          tab = VP8Cat3;
+        } else if (residue < (8 << 2)) {   // VP8Cat4  (4b)
+          AddToken(tokens, 0, base_id + 8, s + 8);
+          AddToken(tokens, 1, base_id + 9, s + 9);
+          residue -= (8 << 1);
+          mask = 1 << 3;
+          tab = VP8Cat4;
+        } else if (residue < (8 << 3)) {   // VP8Cat5  (5b)
+          AddToken(tokens, 1, base_id + 8, s + 8);
+          AddToken(tokens, 0, base_id + 10, s + 9);
+          residue -= (8 << 2);
+          mask = 1 << 4;
+          tab = VP8Cat5;
+        } else {                         // VP8Cat6 (11b)
+          AddToken(tokens, 1, base_id + 8, s + 8);
+          AddToken(tokens, 1, base_id + 10, s + 9);
+          residue -= (8 << 3);
+          mask = 1 << 10;
+          tab = VP8Cat6;
+        }
+        while (mask) {
+          AddConstantToken(tokens, !!(residue & mask), *tab++);
+          mask >>= 1;
+        }
+      }
+      base_id = TOKEN_ID(coeff_type, VP8EncBands[n], 2);  // ctx=2
+      s = res->stats[VP8EncBands[n]][2];
+    }
+    AddConstantToken(tokens, sign, 128);
+    if (n == 16 || !AddToken(tokens, n <= last, base_id + 0, s + 0)) {
+      return 1;   // EOB
+    }
+  }
+  return 1;
+}
+
+#undef TOKEN_ID
+
+//------------------------------------------------------------------------------
+// Final coding pass, with known probabilities
+
+int VP8EmitTokens(VP8TBuffer* const b, VP8BitWriter* const bw,
+                  const uint8_t* const probas, int final_pass) {
+  const VP8Tokens* p = b->pages_;
+  assert(!b->error_);
+  while (p != NULL) {
+    const VP8Tokens* const next = p->next_;
+    const int N = (next == NULL) ? b->left_ : 0;
+    int n = b->page_size_;
+    const token_t* const tokens = TOKEN_DATA(p);
+    while (n-- > N) {
+      const token_t token = tokens[n];
+      const int bit = (token >> 15) & 1;
+      if (token & FIXED_PROBA_BIT) {
+        VP8PutBit(bw, bit, token & 0xffu);  // constant proba
+      } else {
+        VP8PutBit(bw, bit, probas[token & 0x3fffu]);
+      }
+    }
+    if (final_pass) WebPSafeFree((void*)p);
+    p = next;
+  }
+  if (final_pass) b->pages_ = NULL;
+  return 1;
+}
+
+// Size estimation
+size_t VP8EstimateTokenSize(VP8TBuffer* const b, const uint8_t* const probas) {
+  size_t size = 0;
+  const VP8Tokens* p = b->pages_;
+  assert(!b->error_);
+  while (p != NULL) {
+    const VP8Tokens* const next = p->next_;
+    const int N = (next == NULL) ? b->left_ : 0;
+    int n = b->page_size_;
+    const token_t* const tokens = TOKEN_DATA(p);
+    while (n-- > N) {
+      const token_t token = tokens[n];
+      const int bit = token & (1 << 15);
+      if (token & FIXED_PROBA_BIT) {
+        size += VP8BitCost(bit, token & 0xffu);
+      } else {
+        size += VP8BitCost(bit, probas[token & 0x3fffu]);
+      }
+    }
+    p = next;
+  }
+  return size;
+}
+
+//------------------------------------------------------------------------------
+
+#else     // DISABLE_TOKEN_BUFFER
+
+void VP8TBufferInit(VP8TBuffer* const b, int page_size) {
+  (void)b;
+  (void)page_size;
+}
+void VP8TBufferClear(VP8TBuffer* const b) {
+  (void)b;
+}
+
+#endif    // !DISABLE_TOKEN_BUFFER
+
diff --git a/media/libwebp/enc/tree_enc.c b/media/libwebp/enc/tree_enc.c
new file mode 100644
index 0000000000..7bf9b47d08
--- /dev/null
+++ b/media/libwebp/enc/tree_enc.c
@@ -0,0 +1,504 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Coding of token probabilities, intra modes and segments.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "../enc/vp8i_enc.h"
+
+//------------------------------------------------------------------------------
+// Default probabilities
+
+// Paragraph 13.5
+const uint8_t
+  VP8CoeffsProba0[NUM_TYPES][NUM_BANDS][NUM_CTX][NUM_PROBAS] = {
+  { { { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    },
+    { { 253, 136, 254, 255, 228, 219, 128, 128, 128, 128, 128 },
+      { 189, 129, 242, 255, 227, 213, 255, 219, 128, 128, 128 },
+      { 106, 126, 227, 252, 214, 209, 255, 255, 128, 128, 128 }
+    },
+    { { 1, 98, 248, 255, 236, 226, 255, 255, 128, 128, 128 },
+      { 181, 133, 238, 254, 221, 234, 255, 154, 128, 128, 128 },
+      { 78, 134, 202, 247, 198, 180, 255, 219, 128, 128, 128 },
+    },
+    { { 1, 185, 249, 255, 243, 255, 128, 128, 128, 128, 128 },
+      { 184, 150, 247, 255, 236, 224, 128, 128, 128, 128, 128 },
+      { 77, 110, 216, 255, 236, 230, 128, 128, 128, 128, 128 },
+    },
+    { { 1, 101, 251, 255, 241, 255, 128, 128, 128, 128, 128 },
+      { 170, 139, 241, 252, 236, 209, 255, 255, 128, 128, 128 },
+      { 37, 116, 196, 243, 228, 255, 255, 255, 128, 128, 128 }
+    },
+    { { 1, 204, 254, 255, 245, 255, 128, 128, 128, 128, 128 },
+      { 207, 160, 250, 255, 238, 128, 128, 128, 128, 128, 128 },
+      { 102, 103, 231, 255, 211, 171, 128, 128, 128, 128, 128 }
+    },
+    { { 1, 152, 252, 255, 240, 255, 128, 128, 128, 128, 128 },
+      { 177, 135, 243, 255, 234, 225, 128, 128, 128, 128, 128 },
+      { 80, 129, 211, 255, 194, 224, 128, 128, 128, 128, 128 }
+    },
+    { { 1, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 246, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 255, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }
+  },
+  { { { 198, 35, 237, 223, 193, 187, 162, 160, 145, 155, 62 },
+      { 131, 45, 198, 221, 172, 176, 220, 157, 252, 221, 1 },
+      { 68, 47, 146, 208, 149, 167, 221, 162, 255, 223, 128 }
+    },
+    { { 1, 149, 241, 255, 221, 224, 255, 255, 128, 128, 128 },
+      { 184, 141, 234, 253, 222, 220, 255, 199, 128, 128, 128 },
+      { 81, 99, 181, 242, 176, 190, 249, 202, 255, 255, 128 }
+    },
+    { { 1, 129, 232, 253, 214, 197, 242, 196, 255, 255, 128 },
+      { 99, 121, 210, 250, 201, 198, 255, 202, 128, 128, 128 },
+      { 23, 91, 163, 242, 170, 187, 247, 210, 255, 255, 128 }
+    },
+    { { 1, 200, 246, 255, 234, 255, 128, 128, 128, 128, 128 },
+      { 109, 178, 241, 255, 231, 245, 255, 255, 128, 128, 128 },
+      { 44, 130, 201, 253, 205, 192, 255, 255, 128, 128, 128 }
+    },
+    { { 1, 132, 239, 251, 219, 209, 255, 165, 128, 128, 128 },
+      { 94, 136, 225, 251, 218, 190, 255, 255, 128, 128, 128 },
+      { 22, 100, 174, 245, 186, 161, 255, 199, 128, 128, 128 }
+    },
+    { { 1, 182, 249, 255, 232, 235, 128, 128, 128, 128, 128 },
+      { 124, 143, 241, 255, 227, 234, 128, 128, 128, 128, 128 },
+      { 35, 77, 181, 251, 193, 211, 255, 205, 128, 128, 128 }
+    },
+    { { 1, 157, 247, 255, 236, 231, 255, 255, 128, 128, 128 },
+      { 121, 141, 235, 255, 225, 227, 255, 255, 128, 128, 128 },
+      { 45, 99, 188, 251, 195, 217, 255, 224, 128, 128, 128 }
+    },
+    { { 1, 1, 251, 255, 213, 255, 128, 128, 128, 128, 128 },
+      { 203, 1, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
+      { 137, 1, 177, 255, 224, 255, 128, 128, 128, 128, 128 }
+    }
+  },
+  { { { 253, 9, 248, 251, 207, 208, 255, 192, 128, 128, 128 },
+      { 175, 13, 224, 243, 193, 185, 249, 198, 255, 255, 128 },
+      { 73, 17, 171, 221, 161, 179, 236, 167, 255, 234, 128 }
+    },
+    { { 1, 95, 247, 253, 212, 183, 255, 255, 128, 128, 128 },
+      { 239, 90, 244, 250, 211, 209, 255, 255, 128, 128, 128 },
+      { 155, 77, 195, 248, 188, 195, 255, 255, 128, 128, 128 }
+    },
+    { { 1, 24, 239, 251, 218, 219, 255, 205, 128, 128, 128 },
+      { 201, 51, 219, 255, 196, 186, 128, 128, 128, 128, 128 },
+      { 69, 46, 190, 239, 201, 218, 255, 228, 128, 128, 128 }
+    },
+    { { 1, 191, 251, 255, 255, 128, 128, 128, 128, 128, 128 },
+      { 223, 165, 249, 255, 213, 255, 128, 128, 128, 128, 128 },
+      { 141, 124, 248, 255, 255, 128, 128, 128, 128, 128, 128 }
+    },
+    { { 1, 16, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
+      { 190, 36, 230, 255, 236, 255, 128, 128, 128, 128, 128 },
+      { 149, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }
+    },
+    { { 1, 226, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 247, 192, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 240, 128, 255, 128, 128, 128, 128, 128, 128, 128, 128 }
+    },
+    { { 1, 134, 252, 255, 255, 128, 128, 128, 128, 128, 128 },
+      { 213, 62, 250, 255, 255, 128, 128, 128, 128, 128, 128 },
+      { 55, 93, 255, 128, 128, 128, 128, 128, 128, 128, 128 }
+    },
+    { { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }
+  },
+  { { { 202, 24, 213, 235, 186, 191, 220, 160, 240, 175, 255 },
+      { 126, 38, 182, 232, 169, 184, 228, 174, 255, 187, 128 },
+      { 61, 46, 138, 219, 151, 178, 240, 170, 255, 216, 128 }
+    },
+    { { 1, 112, 230, 250, 199, 191, 247, 159, 255, 255, 128 },
+      { 166, 109, 228, 252, 211, 215, 255, 174, 128, 128, 128 },
+      { 39, 77, 162, 232, 172, 180, 245, 178, 255, 255, 128 }
+    },
+    { { 1, 52, 220, 246, 198, 199, 249, 220, 255, 255, 128 },
+      { 124, 74, 191, 243, 183, 193, 250, 221, 255, 255, 128 },
+      { 24, 71, 130, 219, 154, 170, 243, 182, 255, 255, 128 }
+    },
+    { { 1, 182, 225, 249, 219, 240, 255, 224, 128, 128, 128 },
+      { 149, 150, 226, 252, 216, 205, 255, 171, 128, 128, 128 },
+      { 28, 108, 170, 242, 183, 194, 254, 223, 255, 255, 128 }
+    },
+    { { 1, 81, 230, 252, 204, 203, 255, 192, 128, 128, 128 },
+      { 123, 102, 209, 247, 188, 196, 255, 233, 128, 128, 128 },
+      { 20, 95, 153, 243, 164, 173, 255, 203, 128, 128, 128 }
+    },
+    { { 1, 222, 248, 255, 216, 213, 128, 128, 128, 128, 128 },
+      { 168, 175, 246, 252, 235, 205, 255, 255, 128, 128, 128 },
+      { 47, 116, 215, 255, 211, 212, 255, 255, 128, 128, 128 }
+    },
+    { { 1, 121, 236, 253, 212, 214, 255, 255, 128, 128, 128 },
+      { 141, 84, 213, 252, 201, 202, 255, 219, 128, 128, 128 },
+      { 42, 80, 160, 240, 162, 185, 255, 205, 128, 128, 128 }
+    },
+    { { 1, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 244, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 238, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }
+  }
+};
+
+void VP8DefaultProbas(VP8Encoder* const enc) {
+  VP8EncProba* const probas = &enc->proba_;
+  probas->use_skip_proba_ = 0;
+  memset(probas->segments_, 255u, sizeof(probas->segments_));
+  memcpy(probas->coeffs_, VP8CoeffsProba0, sizeof(VP8CoeffsProba0));
+  // Note: we could hard-code the level_costs_ corresponding to VP8CoeffsProba0,
+  // but that's ~11k of static data. Better call VP8CalculateLevelCosts() later.
+  probas->dirty_ = 1;
+}
+
+// Paragraph 11.5.  900bytes.
+static const uint8_t kBModesProba[NUM_BMODES][NUM_BMODES][NUM_BMODES - 1] = {
+  { { 231, 120, 48, 89, 115, 113, 120, 152, 112 },
+    { 152, 179, 64, 126, 170, 118, 46, 70, 95 },
+    { 175, 69, 143, 80, 85, 82, 72, 155, 103 },
+    { 56, 58, 10, 171, 218, 189, 17, 13, 152 },
+    { 114, 26, 17, 163, 44, 195, 21, 10, 173 },
+    { 121, 24, 80, 195, 26, 62, 44, 64, 85 },
+    { 144, 71, 10, 38, 171, 213, 144, 34, 26 },
+    { 170, 46, 55, 19, 136, 160, 33, 206, 71 },
+    { 63, 20, 8, 114, 114, 208, 12, 9, 226 },
+    { 81, 40, 11, 96, 182, 84, 29, 16, 36 } },
+  { { 134, 183, 89, 137, 98, 101, 106, 165, 148 },
+    { 72, 187, 100, 130, 157, 111, 32, 75, 80 },
+    { 66, 102, 167, 99, 74, 62, 40, 234, 128 },
+    { 41, 53, 9, 178, 241, 141, 26, 8, 107 },
+    { 74, 43, 26, 146, 73, 166, 49, 23, 157 },
+    { 65, 38, 105, 160, 51, 52, 31, 115, 128 },
+    { 104, 79, 12, 27, 217, 255, 87, 17, 7 },
+    { 87, 68, 71, 44, 114, 51, 15, 186, 23 },
+    { 47, 41, 14, 110, 182, 183, 21, 17, 194 },
+    { 66, 45, 25, 102, 197, 189, 23, 18, 22 } },
+  { { 88, 88, 147, 150, 42, 46, 45, 196, 205 },
+    { 43, 97, 183, 117, 85, 38, 35, 179, 61 },
+    { 39, 53, 200, 87, 26, 21, 43, 232, 171 },
+    { 56, 34, 51, 104, 114, 102, 29, 93, 77 },
+    { 39, 28, 85, 171, 58, 165, 90, 98, 64 },
+    { 34, 22, 116, 206, 23, 34, 43, 166, 73 },
+    { 107, 54, 32, 26, 51, 1, 81, 43, 31 },
+    { 68, 25, 106, 22, 64, 171, 36, 225, 114 },
+    { 34, 19, 21, 102, 132, 188, 16, 76, 124 },
+    { 62, 18, 78, 95, 85, 57, 50, 48, 51 } },
+  { { 193, 101, 35, 159, 215, 111, 89, 46, 111 },
+    { 60, 148, 31, 172, 219, 228, 21, 18, 111 },
+    { 112, 113, 77, 85, 179, 255, 38, 120, 114 },
+    { 40, 42, 1, 196, 245, 209, 10, 25, 109 },
+    { 88, 43, 29, 140, 166, 213, 37, 43, 154 },
+    { 61, 63, 30, 155, 67, 45, 68, 1, 209 },
+    { 100, 80, 8, 43, 154, 1, 51, 26, 71 },
+    { 142, 78, 78, 16, 255, 128, 34, 197, 171 },
+    { 41, 40, 5, 102, 211, 183, 4, 1, 221 },
+    { 51, 50, 17, 168, 209, 192, 23, 25, 82 } },
+  { { 138, 31, 36, 171, 27, 166, 38, 44, 229 },
+    { 67, 87, 58, 169, 82, 115, 26, 59, 179 },
+    { 63, 59, 90, 180, 59, 166, 93, 73, 154 },
+    { 40, 40, 21, 116, 143, 209, 34, 39, 175 },
+    { 47, 15, 16, 183, 34, 223, 49, 45, 183 },
+    { 46, 17, 33, 183, 6, 98, 15, 32, 183 },
+    { 57, 46, 22, 24, 128, 1, 54, 17, 37 },
+    { 65, 32, 73, 115, 28, 128, 23, 128, 205 },
+    { 40, 3, 9, 115, 51, 192, 18, 6, 223 },
+    { 87, 37, 9, 115, 59, 77, 64, 21, 47 } },
+  { { 104, 55, 44, 218, 9, 54, 53, 130, 226 },
+    { 64, 90, 70, 205, 40, 41, 23, 26, 57 },
+    { 54, 57, 112, 184, 5, 41, 38, 166, 213 },
+    { 30, 34, 26, 133, 152, 116, 10, 32, 134 },
+    { 39, 19, 53, 221, 26, 114, 32, 73, 255 },
+    { 31, 9, 65, 234, 2, 15, 1, 118, 73 },
+    { 75, 32, 12, 51, 192, 255, 160, 43, 51 },
+    { 88, 31, 35, 67, 102, 85, 55, 186, 85 },
+    { 56, 21, 23, 111, 59, 205, 45, 37, 192 },
+    { 55, 38, 70, 124, 73, 102, 1, 34, 98 } },
+  { { 125, 98, 42, 88, 104, 85, 117, 175, 82 },
+    { 95, 84, 53, 89, 128, 100, 113, 101, 45 },
+    { 75, 79, 123, 47, 51, 128, 81, 171, 1 },
+    { 57, 17, 5, 71, 102, 57, 53, 41, 49 },
+    { 38, 33, 13, 121, 57, 73, 26, 1, 85 },
+    { 41, 10, 67, 138, 77, 110, 90, 47, 114 },
+    { 115, 21, 2, 10, 102, 255, 166, 23, 6 },
+    { 101, 29, 16, 10, 85, 128, 101, 196, 26 },
+    { 57, 18, 10, 102, 102, 213, 34, 20, 43 },
+    { 117, 20, 15, 36, 163, 128, 68, 1, 26 } },
+  { { 102, 61, 71, 37, 34, 53, 31, 243, 192 },
+    { 69, 60, 71, 38, 73, 119, 28, 222, 37 },
+    { 68, 45, 128, 34, 1, 47, 11, 245, 171 },
+    { 62, 17, 19, 70, 146, 85, 55, 62, 70 },
+    { 37, 43, 37, 154, 100, 163, 85, 160, 1 },
+    { 63, 9, 92, 136, 28, 64, 32, 201, 85 },
+    { 75, 15, 9, 9, 64, 255, 184, 119, 16 },
+    { 86, 6, 28, 5, 64, 255, 25, 248, 1 },
+    { 56, 8, 17, 132, 137, 255, 55, 116, 128 },
+    { 58, 15, 20, 82, 135, 57, 26, 121, 40 } },
+  { { 164, 50, 31, 137, 154, 133, 25, 35, 218 },
+    { 51, 103, 44, 131, 131, 123, 31, 6, 158 },
+    { 86, 40, 64, 135, 148, 224, 45, 183, 128 },
+    { 22, 26, 17, 131, 240, 154, 14, 1, 209 },
+    { 45, 16, 21, 91, 64, 222, 7, 1, 197 },
+    { 56, 21, 39, 155, 60, 138, 23, 102, 213 },
+    { 83, 12, 13, 54, 192, 255, 68, 47, 28 },
+    { 85, 26, 85, 85, 128, 128, 32, 146, 171 },
+    { 18, 11, 7, 63, 144, 171, 4, 4, 246 },
+    { 35, 27, 10, 146, 174, 171, 12, 26, 128 } },
+  { { 190, 80, 35, 99, 180, 80, 126, 54, 45 },
+    { 85, 126, 47, 87, 176, 51, 41, 20, 32 },
+    { 101, 75, 128, 139, 118, 146, 116, 128, 85 },
+    { 56, 41, 15, 176, 236, 85, 37, 9, 62 },
+    { 71, 30, 17, 119, 118, 255, 17, 18, 138 },
+    { 101, 38, 60, 138, 55, 70, 43, 26, 142 },
+    { 146, 36, 19, 30, 171, 255, 97, 27, 20 },
+    { 138, 45, 61, 62, 219, 1, 81, 188, 64 },
+    { 32, 41, 20, 117, 151, 142, 20, 21, 163 },
+    { 112, 19, 12, 61, 195, 128, 48, 4, 24 } }
+};
+
+static int PutI4Mode(VP8BitWriter* const bw, int mode,
+                     const uint8_t* const prob) {
+  if (VP8PutBit(bw, mode != B_DC_PRED, prob[0])) {
+    if (VP8PutBit(bw, mode != B_TM_PRED, prob[1])) {
+      if (VP8PutBit(bw, mode != B_VE_PRED, prob[2])) {
+        if (!VP8PutBit(bw, mode >= B_LD_PRED, prob[3])) {
+          if (VP8PutBit(bw, mode != B_HE_PRED, prob[4])) {
+            VP8PutBit(bw, mode != B_RD_PRED, prob[5]);
+          }
+        } else {
+          if (VP8PutBit(bw, mode != B_LD_PRED, prob[6])) {
+            if (VP8PutBit(bw, mode != B_VL_PRED, prob[7])) {
+              VP8PutBit(bw, mode != B_HD_PRED, prob[8]);
+            }
+          }
+        }
+      }
+    }
+  }
+  return mode;
+}
+
+static void PutI16Mode(VP8BitWriter* const bw, int mode) {
+  if (VP8PutBit(bw, (mode == TM_PRED || mode == H_PRED), 156)) {
+    VP8PutBit(bw, mode == TM_PRED, 128);    // TM or HE
+  } else {
+    VP8PutBit(bw, mode == V_PRED, 163);     // VE or DC
+  }
+}
+
+static void PutUVMode(VP8BitWriter* const bw, int uv_mode) {
+  if (VP8PutBit(bw, uv_mode != DC_PRED, 142)) {
+    if (VP8PutBit(bw, uv_mode != V_PRED, 114)) {
+      VP8PutBit(bw, uv_mode != H_PRED, 183);    // else: TM_PRED
+    }
+  }
+}
+
+static void PutSegment(VP8BitWriter* const bw, int s, const uint8_t* p) {
+  if (VP8PutBit(bw, s >= 2, p[0])) p += 1;
+  VP8PutBit(bw, s & 1, p[1]);
+}
+
+void VP8CodeIntraModes(VP8Encoder* const enc) {
+  VP8BitWriter* const bw = &enc->bw_;
+  VP8EncIterator it;
+  VP8IteratorInit(enc, &it);
+  do {
+    const VP8MBInfo* const mb = it.mb_;
+    const uint8_t* preds = it.preds_;
+    if (enc->segment_hdr_.update_map_) {
+      PutSegment(bw, mb->segment_, enc->proba_.segments_);
+    }
+    if (enc->proba_.use_skip_proba_) {
+      VP8PutBit(bw, mb->skip_, enc->proba_.skip_proba_);
+    }
+    if (VP8PutBit(bw, (mb->type_ != 0), 145)) {  // i16x16
+      PutI16Mode(bw, preds[0]);
+    } else {
+      const int preds_w = enc->preds_w_;
+      const uint8_t* top_pred = preds - preds_w;
+      int x, y;
+      for (y = 0; y < 4; ++y) {
+        int left = preds[-1];
+        for (x = 0; x < 4; ++x) {
+          const uint8_t* const probas = kBModesProba[top_pred[x]][left];
+          left = PutI4Mode(bw, preds[x], probas);
+        }
+        top_pred = preds;
+        preds += preds_w;
+      }
+    }
+    PutUVMode(bw, mb->uv_mode_);
+  } while (VP8IteratorNext(&it));
+}
+
+//------------------------------------------------------------------------------
+// Paragraph 13
+
+const uint8_t
+    VP8CoeffsUpdateProba[NUM_TYPES][NUM_BANDS][NUM_CTX][NUM_PROBAS] = {
+  { { { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 176, 246, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 223, 241, 252, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 249, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 244, 252, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 234, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 246, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 239, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 251, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 251, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 254, 253, 255, 254, 255, 255, 255, 255, 255, 255 },
+      { 250, 255, 254, 255, 254, 255, 255, 255, 255, 255, 255 },
+      { 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    }
+  },
+  { { { 217, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 225, 252, 241, 253, 255, 255, 254, 255, 255, 255, 255 },
+      { 234, 250, 241, 250, 253, 255, 253, 254, 255, 255, 255 }
+    },
+    { { 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 223, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 238, 253, 254, 254, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 249, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 253, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 247, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 252, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    }
+  },
+  { { { 186, 251, 250, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 234, 251, 244, 254, 255, 255, 255, 255, 255, 255, 255 },
+      { 251, 251, 243, 253, 254, 255, 254, 255, 255, 255, 255 }
+    },
+    { { 255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 236, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 251, 253, 253, 254, 254, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    }
+  },
+  { { { 248, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 250, 254, 252, 254, 255, 255, 255, 255, 255, 255, 255 },
+      { 248, 254, 249, 253, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 246, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 252, 254, 251, 254, 254, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 254, 252, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 248, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 253, 255, 254, 254, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 245, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 253, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 251, 253, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 252, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 252, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 249, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 255, 253, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    }
+  }
+};
+
+void VP8WriteProbas(VP8BitWriter* const bw, const VP8EncProba* const probas) {
+  int t, b, c, p;
+  for (t = 0; t < NUM_TYPES; ++t) {
+    for (b = 0; b < NUM_BANDS; ++b) {
+      for (c = 0; c < NUM_CTX; ++c) {
+        for (p = 0; p < NUM_PROBAS; ++p) {
+          const uint8_t p0 = probas->coeffs_[t][b][c][p];
+          const int update = (p0 != VP8CoeffsProba0[t][b][c][p]);
+          if (VP8PutBit(bw, update, VP8CoeffsUpdateProba[t][b][c][p])) {
+            VP8PutBits(bw, p0, 8);
+          }
+        }
+      }
+    }
+  }
+  if (VP8PutBitUniform(bw, probas->use_skip_proba_)) {
+    VP8PutBits(bw, probas->skip_proba_, 8);
+  }
+}
+
diff --git a/media/libwebp/enc/vp8i_enc.h b/media/libwebp/enc/vp8i_enc.h
index 009ccf2239..9bca205e01 100644
--- a/media/libwebp/enc/vp8i_enc.h
+++ b/media/libwebp/enc/vp8i_enc.h
@@ -31,7 +31,7 @@ extern "C" {
 
 // version numbers
 #define ENC_MAJ_VERSION 1
-#define ENC_MIN_VERSION 0
+#define ENC_MIN_VERSION 2
 #define ENC_REV_VERSION 2
 
 enum { MAX_LF_LEVELS = 64,       // Maximum loop filter level
@@ -249,7 +249,7 @@ typedef struct {
   int           percent0_;         // saved initial progress percent
 
   DError        left_derr_;        // left error diffusion (u/v)
-  DError       *top_derr_;         // top diffusion error - NULL if disabled
+  DError*       top_derr_;         // top diffusion error - NULL if disabled
 
   uint8_t* y_left_;    // left luma samples (addressable from index -1 to 15).
   uint8_t* u_left_;    // left u samples (addressable from index -1 to 7)
@@ -286,8 +286,7 @@ int VP8IteratorNext(VP8EncIterator* const it);
 // save the yuv_out_ boundary values to top_/left_ arrays for next iterations.
 void VP8IteratorSaveBoundary(VP8EncIterator* const it);
 // Report progression based on macroblock rows. Return 0 for user-abort request.
-int VP8IteratorProgress(const VP8EncIterator* const it,
-                        int final_delta_percent);
+int VP8IteratorProgress(const VP8EncIterator* const it, int delta);
 // Intra4x4 iterations
 void VP8IteratorStartI4(VP8EncIterator* const it);
 // returns true if not done.
@@ -505,9 +504,9 @@ int WebPPictureAllocARGB(WebPPicture* const picture, int width, int height);
 // Returns false in case of error (invalid param, out-of-memory).
 int WebPPictureAllocYUVA(WebPPicture* const picture, int width, int height);
 
-// Clean-up the RGB samples under fully transparent area, to help lossless
-// compressibility (no guarantee, though). Assumes that pic->use_argb is true.
-void WebPCleanupTransparentAreaLossless(WebPPicture* const pic);
+// Replace samples that are fully transparent by 'color' to help compressibility
+// (no guarantee, though). Assumes pic->use_argb is true.
+void WebPReplaceTransparentPixels(WebPPicture* const pic, uint32_t color);
 
 //------------------------------------------------------------------------------
 
diff --git a/media/libwebp/enc/vp8l_enc.c b/media/libwebp/enc/vp8l_enc.c
new file mode 100644
index 0000000000..4aed5d8e32
--- /dev/null
+++ b/media/libwebp/enc/vp8l_enc.c
@@ -0,0 +1,2138 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// main entry for the lossless encoder.
+//
+// Author: Vikas Arora (vikaas.arora@gmail.com)
+//
+
+#include <assert.h>
+#include <stdlib.h>
+
+#include "../enc/backward_references_enc.h"
+#include "../enc/histogram_enc.h"
+#include "../enc/vp8i_enc.h"
+#include "../enc/vp8li_enc.h"
+#include "../dsp/lossless.h"
+#include "../dsp/lossless_common.h"
+#include "../utils/bit_writer_utils.h"
+#include "../utils/huffman_encode_utils.h"
+#include "../utils/utils.h"
+#include "../webp/format_constants.h"
+
+// Maximum number of histogram images (sub-blocks).
+#define MAX_HUFF_IMAGE_SIZE       2600
+
+// Palette reordering for smaller sum of deltas (and for smaller storage).
+
+static int PaletteCompareColorsForQsort(const void* p1, const void* p2) {
+  const uint32_t a = WebPMemToUint32((uint8_t*)p1);
+  const uint32_t b = WebPMemToUint32((uint8_t*)p2);
+  assert(a != b);
+  return (a < b) ? -1 : 1;
+}
+
+static WEBP_INLINE uint32_t PaletteComponentDistance(uint32_t v) {
+  return (v <= 128) ? v : (256 - v);
+}
+
+// Computes a value that is related to the entropy created by the
+// palette entry diff.
+//
+// Note that the last & 0xff is a no-operation in the next statement, but
+// removed by most compilers and is here only for regularity of the code.
+static WEBP_INLINE uint32_t PaletteColorDistance(uint32_t col1, uint32_t col2) {
+  const uint32_t diff = VP8LSubPixels(col1, col2);
+  const int kMoreWeightForRGBThanForAlpha = 9;
+  uint32_t score;
+  score =  PaletteComponentDistance((diff >>  0) & 0xff);
+  score += PaletteComponentDistance((diff >>  8) & 0xff);
+  score += PaletteComponentDistance((diff >> 16) & 0xff);
+  score *= kMoreWeightForRGBThanForAlpha;
+  score += PaletteComponentDistance((diff >> 24) & 0xff);
+  return score;
+}
+
+static WEBP_INLINE void SwapColor(uint32_t* const col1, uint32_t* const col2) {
+  const uint32_t tmp = *col1;
+  *col1 = *col2;
+  *col2 = tmp;
+}
+
+static WEBP_INLINE int SearchColorNoIdx(const uint32_t sorted[], uint32_t color,
+                                        int num_colors) {
+  int low = 0, hi = num_colors;
+  if (sorted[low] == color) return low;  // loop invariant: sorted[low] != color
+  while (1) {
+    const int mid = (low + hi) >> 1;
+    if (sorted[mid] == color) {
+      return mid;
+    } else if (sorted[mid] < color) {
+      low = mid;
+    } else {
+      hi = mid;
+    }
+  }
+  assert(0);
+  return 0;
+}
+
+// The palette has been sorted by alpha. This function checks if the other
+// components of the palette have a monotonic development with regards to
+// position in the palette. If all have monotonic development, there is
+// no benefit to re-organize them greedily. A monotonic development
+// would be spotted in green-only situations (like lossy alpha) or gray-scale
+// images.
+static int PaletteHasNonMonotonousDeltas(const uint32_t* const palette,
+                                         int num_colors) {
+  uint32_t predict = 0x000000;
+  int i;
+  uint8_t sign_found = 0x00;
+  for (i = 0; i < num_colors; ++i) {
+    const uint32_t diff = VP8LSubPixels(palette[i], predict);
+    const uint8_t rd = (diff >> 16) & 0xff;
+    const uint8_t gd = (diff >>  8) & 0xff;
+    const uint8_t bd = (diff >>  0) & 0xff;
+    if (rd != 0x00) {
+      sign_found |= (rd < 0x80) ? 1 : 2;
+    }
+    if (gd != 0x00) {
+      sign_found |= (gd < 0x80) ? 8 : 16;
+    }
+    if (bd != 0x00) {
+      sign_found |= (bd < 0x80) ? 64 : 128;
+    }
+    predict = palette[i];
+  }
+  return (sign_found & (sign_found << 1)) != 0;  // two consequent signs.
+}
+
+static void PaletteSortMinimizeDeltas(const uint32_t* const palette_sorted,
+                                      int num_colors, uint32_t* const palette) {
+  uint32_t predict = 0x00000000;
+  int i, k;
+  memcpy(palette, palette_sorted, num_colors * sizeof(*palette));
+  if (!PaletteHasNonMonotonousDeltas(palette_sorted, num_colors)) return;
+  // Find greedily always the closest color of the predicted color to minimize
+  // deltas in the palette. This reduces storage needs since the
+  // palette is stored with delta encoding.
+  for (i = 0; i < num_colors; ++i) {
+    int best_ix = i;
+    uint32_t best_score = ~0U;
+    for (k = i; k < num_colors; ++k) {
+      const uint32_t cur_score = PaletteColorDistance(palette[k], predict);
+      if (best_score > cur_score) {
+        best_score = cur_score;
+        best_ix = k;
+      }
+    }
+    SwapColor(&palette[best_ix], &palette[i]);
+    predict = palette[i];
+  }
+}
+
+// Sort palette in increasing order and prepare an inverse mapping array.
+static void PrepareMapToPalette(const uint32_t palette[], uint32_t num_colors,
+                                uint32_t sorted[], uint32_t idx_map[]) {
+  uint32_t i;
+  memcpy(sorted, palette, num_colors * sizeof(*sorted));
+  qsort(sorted, num_colors, sizeof(*sorted), PaletteCompareColorsForQsort);
+  for (i = 0; i < num_colors; ++i) {
+    idx_map[SearchColorNoIdx(sorted, palette[i], num_colors)] = i;
+  }
+}
+
+// -----------------------------------------------------------------------------
+// Modified Zeng method from "A Survey on Palette Reordering
+// Methods for Improving the Compression of Color-Indexed Images" by Armando J.
+// Pinho and Antonio J. R. Neves.
+
+// Finds the biggest cooccurrence in the matrix.
+static void CoOccurrenceFindMax(const uint32_t* const cooccurrence,
+                                uint32_t num_colors, uint8_t* const c1,
+                                uint8_t* const c2) {
+  // Find the index that is most frequently located adjacent to other
+  // (different) indexes.
+  uint32_t best_sum = 0u;
+  uint32_t i, j, best_cooccurrence;
+  *c1 = 0u;
+  for (i = 0; i < num_colors; ++i) {
+    uint32_t sum = 0;
+    for (j = 0; j < num_colors; ++j) sum += cooccurrence[i * num_colors + j];
+    if (sum > best_sum) {
+      best_sum = sum;
+      *c1 = i;
+    }
+  }
+  // Find the index that is most frequently found adjacent to *c1.
+  *c2 = 0u;
+  best_cooccurrence = 0u;
+  for (i = 0; i < num_colors; ++i) {
+    if (cooccurrence[*c1 * num_colors + i] > best_cooccurrence) {
+      best_cooccurrence = cooccurrence[*c1 * num_colors + i];
+      *c2 = i;
+    }
+  }
+  assert(*c1 != *c2);
+}
+
+// Builds the cooccurrence matrix
+static WebPEncodingError CoOccurrenceBuild(const WebPPicture* const pic,
+                                           const uint32_t* const palette,
+                                           uint32_t num_colors,
+                                           uint32_t* cooccurrence) {
+  uint32_t *lines, *line_top, *line_current, *line_tmp;
+  int x, y;
+  const uint32_t* src = pic->argb;
+  uint32_t prev_pix = ~src[0];
+  uint32_t prev_idx = 0u;
+  uint32_t idx_map[MAX_PALETTE_SIZE] = {0};
+  uint32_t palette_sorted[MAX_PALETTE_SIZE];
+  lines = (uint32_t*)WebPSafeMalloc(2 * pic->width, sizeof(*lines));
+  if (lines == NULL) return VP8_ENC_ERROR_OUT_OF_MEMORY;
+  line_top = &lines[0];
+  line_current = &lines[pic->width];
+  PrepareMapToPalette(palette, num_colors, palette_sorted, idx_map);
+  for (y = 0; y < pic->height; ++y) {
+    for (x = 0; x < pic->width; ++x) {
+      const uint32_t pix = src[x];
+      if (pix != prev_pix) {
+        prev_idx = idx_map[SearchColorNoIdx(palette_sorted, pix, num_colors)];
+        prev_pix = pix;
+      }
+      line_current[x] = prev_idx;
+      // 4-connectivity is what works best as mentioned in "On the relation
+      // between Memon's and the modified Zeng's palette reordering methods".
+      if (x > 0 && prev_idx != line_current[x - 1]) {
+        const uint32_t left_idx = line_current[x - 1];
+        ++cooccurrence[prev_idx * num_colors + left_idx];
+        ++cooccurrence[left_idx * num_colors + prev_idx];
+      }
+      if (y > 0 && prev_idx != line_top[x]) {
+        const uint32_t top_idx = line_top[x];
+        ++cooccurrence[prev_idx * num_colors + top_idx];
+        ++cooccurrence[top_idx * num_colors + prev_idx];
+      }
+    }
+    line_tmp = line_top;
+    line_top = line_current;
+    line_current = line_tmp;
+    src += pic->argb_stride;
+  }
+  WebPSafeFree(lines);
+  return VP8_ENC_OK;
+}
+
+struct Sum {
+  uint8_t index;
+  uint32_t sum;
+};
+
+// Implements the modified Zeng method from "A Survey on Palette Reordering
+// Methods for Improving the Compression of Color-Indexed Images" by Armando J.
+// Pinho and Antonio J. R. Neves.
+static WebPEncodingError PaletteSortModifiedZeng(
+    const WebPPicture* const pic, const uint32_t* const palette_sorted,
+    uint32_t num_colors, uint32_t* const palette) {
+  uint32_t i, j, ind;
+  uint8_t remapping[MAX_PALETTE_SIZE];
+  uint32_t* cooccurrence;
+  struct Sum sums[MAX_PALETTE_SIZE];
+  uint32_t first, last;
+  uint32_t num_sums;
+  // TODO(vrabaud) check whether one color images should use palette or not.
+  if (num_colors <= 1) return VP8_ENC_OK;
+  // Build the co-occurrence matrix.
+  cooccurrence =
+      (uint32_t*)WebPSafeCalloc(num_colors * num_colors, sizeof(*cooccurrence));
+  if (cooccurrence == NULL) return VP8_ENC_ERROR_OUT_OF_MEMORY;
+  if (CoOccurrenceBuild(pic, palette_sorted, num_colors, cooccurrence) !=
+      VP8_ENC_OK) {
+    WebPSafeFree(cooccurrence);
+    return VP8_ENC_ERROR_OUT_OF_MEMORY;
+  }
+
+  // Initialize the mapping list with the two best indices.
+  CoOccurrenceFindMax(cooccurrence, num_colors, &remapping[0], &remapping[1]);
+
+  // We need to append and prepend to the list of remapping. To this end, we
+  // actually define the next start/end of the list as indices in a vector (with
+  // a wrap around when the end is reached).
+  first = 0;
+  last = 1;
+  num_sums = num_colors - 2;  // -2 because we know the first two values
+  if (num_sums > 0) {
+    // Initialize the sums with the first two remappings and find the best one
+    struct Sum* best_sum = &sums[0];
+    best_sum->index = 0u;
+    best_sum->sum = 0u;
+    for (i = 0, j = 0; i < num_colors; ++i) {
+      if (i == remapping[0] || i == remapping[1]) continue;
+      sums[j].index = i;
+      sums[j].sum = cooccurrence[i * num_colors + remapping[0]] +
+                    cooccurrence[i * num_colors + remapping[1]];
+      if (sums[j].sum > best_sum->sum) best_sum = &sums[j];
+      ++j;
+    }
+
+    while (num_sums > 0) {
+      const uint8_t best_index = best_sum->index;
+      // Compute delta to know if we need to prepend or append the best index.
+      int32_t delta = 0;
+      const int32_t n = num_colors - num_sums;
+      for (ind = first, j = 0; (ind + j) % num_colors != last + 1; ++j) {
+        const uint16_t l_j = remapping[(ind + j) % num_colors];
+        delta += (n - 1 - 2 * (int32_t)j) *
+                 (int32_t)cooccurrence[best_index * num_colors + l_j];
+      }
+      if (delta > 0) {
+        first = (first == 0) ? num_colors - 1 : first - 1;
+        remapping[first] = best_index;
+      } else {
+        ++last;
+        remapping[last] = best_index;
+      }
+      // Remove best_sum from sums.
+      *best_sum = sums[num_sums - 1];
+      --num_sums;
+      // Update all the sums and find the best one.
+      best_sum = &sums[0];
+      for (i = 0; i < num_sums; ++i) {
+        sums[i].sum += cooccurrence[best_index * num_colors + sums[i].index];
+        if (sums[i].sum > best_sum->sum) best_sum = &sums[i];
+      }
+    }
+  }
+  assert((last + 1) % num_colors == first);
+  WebPSafeFree(cooccurrence);
+
+  // Re-map the palette.
+  for (i = 0; i < num_colors; ++i) {
+    palette[i] = palette_sorted[remapping[(first + i) % num_colors]];
+  }
+  return VP8_ENC_OK;
+}
+
+// -----------------------------------------------------------------------------
+// Palette
+
+// These five modes are evaluated and their respective entropy is computed.
+typedef enum {
+  kDirect = 0,
+  kSpatial = 1,
+  kSubGreen = 2,
+  kSpatialSubGreen = 3,
+  kPalette = 4,
+  kPaletteAndSpatial = 5,
+  kNumEntropyIx = 6
+} EntropyIx;
+
+typedef enum {
+  kSortedDefault = 0,
+  kMinimizeDelta = 1,
+  kModifiedZeng = 2,
+  kUnusedPalette = 3,
+} PaletteSorting;
+
+typedef enum {
+  kHistoAlpha = 0,
+  kHistoAlphaPred,
+  kHistoGreen,
+  kHistoGreenPred,
+  kHistoRed,
+  kHistoRedPred,
+  kHistoBlue,
+  kHistoBluePred,
+  kHistoRedSubGreen,
+  kHistoRedPredSubGreen,
+  kHistoBlueSubGreen,
+  kHistoBluePredSubGreen,
+  kHistoPalette,
+  kHistoTotal  // Must be last.
+} HistoIx;
+
+static void AddSingleSubGreen(int p, uint32_t* const r, uint32_t* const b) {
+  const int green = p >> 8;  // The upper bits are masked away later.
+  ++r[((p >> 16) - green) & 0xff];
+  ++b[((p >>  0) - green) & 0xff];
+}
+
+static void AddSingle(uint32_t p,
+                      uint32_t* const a, uint32_t* const r,
+                      uint32_t* const g, uint32_t* const b) {
+  ++a[(p >> 24) & 0xff];
+  ++r[(p >> 16) & 0xff];
+  ++g[(p >>  8) & 0xff];
+  ++b[(p >>  0) & 0xff];
+}
+
+static WEBP_INLINE uint32_t HashPix(uint32_t pix) {
+  // Note that masking with 0xffffffffu is for preventing an
+  // 'unsigned int overflow' warning. Doesn't impact the compiled code.
+  return ((((uint64_t)pix + (pix >> 19)) * 0x39c5fba7ull) & 0xffffffffu) >> 24;
+}
+
+static int AnalyzeEntropy(const uint32_t* argb,
+                          int width, int height, int argb_stride,
+                          int use_palette,
+                          int palette_size, int transform_bits,
+                          EntropyIx* const min_entropy_ix,
+                          int* const red_and_blue_always_zero) {
+  // Allocate histogram set with cache_bits = 0.
+  uint32_t* histo;
+
+  if (use_palette && palette_size <= 16) {
+    // In the case of small palettes, we pack 2, 4 or 8 pixels together. In
+    // practice, small palettes are better than any other transform.
+    *min_entropy_ix = kPalette;
+    *red_and_blue_always_zero = 1;
+    return 1;
+  }
+  histo = (uint32_t*)WebPSafeCalloc(kHistoTotal, sizeof(*histo) * 256);
+  if (histo != NULL) {
+    int i, x, y;
+    const uint32_t* prev_row = NULL;
+    const uint32_t* curr_row = argb;
+    uint32_t pix_prev = argb[0];  // Skip the first pixel.
+    for (y = 0; y < height; ++y) {
+      for (x = 0; x < width; ++x) {
+        const uint32_t pix = curr_row[x];
+        const uint32_t pix_diff = VP8LSubPixels(pix, pix_prev);
+        pix_prev = pix;
+        if ((pix_diff == 0) || (prev_row != NULL && pix == prev_row[x])) {
+          continue;
+        }
+        AddSingle(pix,
+                  &histo[kHistoAlpha * 256],
+                  &histo[kHistoRed * 256],
+                  &histo[kHistoGreen * 256],
+                  &histo[kHistoBlue * 256]);
+        AddSingle(pix_diff,
+                  &histo[kHistoAlphaPred * 256],
+                  &histo[kHistoRedPred * 256],
+                  &histo[kHistoGreenPred * 256],
+                  &histo[kHistoBluePred * 256]);
+        AddSingleSubGreen(pix,
+                          &histo[kHistoRedSubGreen * 256],
+                          &histo[kHistoBlueSubGreen * 256]);
+        AddSingleSubGreen(pix_diff,
+                          &histo[kHistoRedPredSubGreen * 256],
+                          &histo[kHistoBluePredSubGreen * 256]);
+        {
+          // Approximate the palette by the entropy of the multiplicative hash.
+          const uint32_t hash = HashPix(pix);
+          ++histo[kHistoPalette * 256 + hash];
+        }
+      }
+      prev_row = curr_row;
+      curr_row += argb_stride;
+    }
+    {
+      double entropy_comp[kHistoTotal];
+      double entropy[kNumEntropyIx];
+      int k;
+      int last_mode_to_analyze = use_palette ? kPalette : kSpatialSubGreen;
+      int j;
+      // Let's add one zero to the predicted histograms. The zeros are removed
+      // too efficiently by the pix_diff == 0 comparison, at least one of the
+      // zeros is likely to exist.
+      ++histo[kHistoRedPredSubGreen * 256];
+      ++histo[kHistoBluePredSubGreen * 256];
+      ++histo[kHistoRedPred * 256];
+      ++histo[kHistoGreenPred * 256];
+      ++histo[kHistoBluePred * 256];
+      ++histo[kHistoAlphaPred * 256];
+
+      for (j = 0; j < kHistoTotal; ++j) {
+        entropy_comp[j] = VP8LBitsEntropy(&histo[j * 256], 256);
+      }
+      entropy[kDirect] = entropy_comp[kHistoAlpha] +
+          entropy_comp[kHistoRed] +
+          entropy_comp[kHistoGreen] +
+          entropy_comp[kHistoBlue];
+      entropy[kSpatial] = entropy_comp[kHistoAlphaPred] +
+          entropy_comp[kHistoRedPred] +
+          entropy_comp[kHistoGreenPred] +
+          entropy_comp[kHistoBluePred];
+      entropy[kSubGreen] = entropy_comp[kHistoAlpha] +
+          entropy_comp[kHistoRedSubGreen] +
+          entropy_comp[kHistoGreen] +
+          entropy_comp[kHistoBlueSubGreen];
+      entropy[kSpatialSubGreen] = entropy_comp[kHistoAlphaPred] +
+          entropy_comp[kHistoRedPredSubGreen] +
+          entropy_comp[kHistoGreenPred] +
+          entropy_comp[kHistoBluePredSubGreen];
+      entropy[kPalette] = entropy_comp[kHistoPalette];
+
+      // When including transforms, there is an overhead in bits from
+      // storing them. This overhead is small but matters for small images.
+      // For spatial, there are 14 transformations.
+      entropy[kSpatial] += VP8LSubSampleSize(width, transform_bits) *
+                           VP8LSubSampleSize(height, transform_bits) *
+                           VP8LFastLog2(14);
+      // For color transforms: 24 as only 3 channels are considered in a
+      // ColorTransformElement.
+      entropy[kSpatialSubGreen] += VP8LSubSampleSize(width, transform_bits) *
+                                   VP8LSubSampleSize(height, transform_bits) *
+                                   VP8LFastLog2(24);
+      // For palettes, add the cost of storing the palette.
+      // We empirically estimate the cost of a compressed entry as 8 bits.
+      // The palette is differential-coded when compressed hence a much
+      // lower cost than sizeof(uint32_t)*8.
+      entropy[kPalette] += palette_size * 8;
+
+      *min_entropy_ix = kDirect;
+      for (k = kDirect + 1; k <= last_mode_to_analyze; ++k) {
+        if (entropy[*min_entropy_ix] > entropy[k]) {
+          *min_entropy_ix = (EntropyIx)k;
+        }
+      }
+      assert((int)*min_entropy_ix <= last_mode_to_analyze);
+      *red_and_blue_always_zero = 1;
+      // Let's check if the histogram of the chosen entropy mode has
+      // non-zero red and blue values. If all are zero, we can later skip
+      // the cross color optimization.
+      {
+        static const uint8_t kHistoPairs[5][2] = {
+          { kHistoRed, kHistoBlue },
+          { kHistoRedPred, kHistoBluePred },
+          { kHistoRedSubGreen, kHistoBlueSubGreen },
+          { kHistoRedPredSubGreen, kHistoBluePredSubGreen },
+          { kHistoRed, kHistoBlue }
+        };
+        const uint32_t* const red_histo =
+            &histo[256 * kHistoPairs[*min_entropy_ix][0]];
+        const uint32_t* const blue_histo =
+            &histo[256 * kHistoPairs[*min_entropy_ix][1]];
+        for (i = 1; i < 256; ++i) {
+          if ((red_histo[i] | blue_histo[i]) != 0) {
+            *red_and_blue_always_zero = 0;
+            break;
+          }
+        }
+      }
+    }
+    WebPSafeFree(histo);
+    return 1;
+  } else {
+    return 0;
+  }
+}
+
+static int GetHistoBits(int method, int use_palette, int width, int height) {
+  // Make tile size a function of encoding method (Range: 0 to 6).
+  int histo_bits = (use_palette ? 9 : 7) - method;
+  while (1) {
+    const int huff_image_size = VP8LSubSampleSize(width, histo_bits) *
+                                VP8LSubSampleSize(height, histo_bits);
+    if (huff_image_size <= MAX_HUFF_IMAGE_SIZE) break;
+    ++histo_bits;
+  }
+  return (histo_bits < MIN_HUFFMAN_BITS) ? MIN_HUFFMAN_BITS :
+         (histo_bits > MAX_HUFFMAN_BITS) ? MAX_HUFFMAN_BITS : histo_bits;
+}
+
+static int GetTransformBits(int method, int histo_bits) {
+  const int max_transform_bits = (method < 4) ? 6 : (method > 4) ? 4 : 5;
+  const int res =
+      (histo_bits > max_transform_bits) ? max_transform_bits : histo_bits;
+  assert(res <= MAX_TRANSFORM_BITS);
+  return res;
+}
+
+// Set of parameters to be used in each iteration of the cruncher.
+#define CRUNCH_SUBCONFIGS_MAX 2
+typedef struct {
+  int lz77_;
+  int do_no_cache_;
+} CrunchSubConfig;
+typedef struct {
+  int entropy_idx_;
+  PaletteSorting palette_sorting_type_;
+  CrunchSubConfig sub_configs_[CRUNCH_SUBCONFIGS_MAX];
+  int sub_configs_size_;
+} CrunchConfig;
+
+// +2 because we add a palette sorting configuration for kPalette and
+// kPaletteAndSpatial.
+#define CRUNCH_CONFIGS_MAX (kNumEntropyIx + 2)
+
+static int EncoderAnalyze(VP8LEncoder* const enc,
+                          CrunchConfig crunch_configs[CRUNCH_CONFIGS_MAX],
+                          int* const crunch_configs_size,
+                          int* const red_and_blue_always_zero) {
+  const WebPPicture* const pic = enc->pic_;
+  const int width = pic->width;
+  const int height = pic->height;
+  const WebPConfig* const config = enc->config_;
+  const int method = config->method;
+  const int low_effort = (config->method == 0);
+  int i;
+  int use_palette;
+  int n_lz77s;
+  // If set to 0, analyze the cache with the computed cache value. If 1, also
+  // analyze with no-cache.
+  int do_no_cache = 0;
+  assert(pic != NULL && pic->argb != NULL);
+
+  // Check whether a palette is possible.
+  enc->palette_size_ = WebPGetColorPalette(pic, enc->palette_sorted_);
+  use_palette = (enc->palette_size_ <= MAX_PALETTE_SIZE);
+  if (!use_palette) {
+    enc->palette_size_ = 0;
+  } else {
+    qsort(enc->palette_sorted_, enc->palette_size_,
+          sizeof(*enc->palette_sorted_), PaletteCompareColorsForQsort);
+  }
+
+  // Empirical bit sizes.
+  enc->histo_bits_ = GetHistoBits(method, use_palette,
+                                  pic->width, pic->height);
+  enc->transform_bits_ = GetTransformBits(method, enc->histo_bits_);
+
+  if (low_effort) {
+    // AnalyzeEntropy is somewhat slow.
+    crunch_configs[0].entropy_idx_ = use_palette ? kPalette : kSpatialSubGreen;
+    crunch_configs[0].palette_sorting_type_ =
+        use_palette ? kSortedDefault : kUnusedPalette;
+    n_lz77s = 1;
+    *crunch_configs_size = 1;
+  } else {
+    EntropyIx min_entropy_ix;
+    // Try out multiple LZ77 on images with few colors.
+    n_lz77s = (enc->palette_size_ > 0 && enc->palette_size_ <= 16) ? 2 : 1;
+    if (!AnalyzeEntropy(pic->argb, width, height, pic->argb_stride, use_palette,
+                        enc->palette_size_, enc->transform_bits_,
+                        &min_entropy_ix, red_and_blue_always_zero)) {
+      return 0;
+    }
+    if (method == 6 && config->quality == 100) {
+      do_no_cache = 1;
+      // Go brute force on all transforms.
+      *crunch_configs_size = 0;
+      for (i = 0; i < kNumEntropyIx; ++i) {
+        // We can only apply kPalette or kPaletteAndSpatial if we can indeed use
+        // a palette.
+        if ((i != kPalette && i != kPaletteAndSpatial) || use_palette) {
+          assert(*crunch_configs_size < CRUNCH_CONFIGS_MAX);
+          crunch_configs[(*crunch_configs_size)].entropy_idx_ = i;
+          if (use_palette && (i == kPalette || i == kPaletteAndSpatial)) {
+            crunch_configs[(*crunch_configs_size)].palette_sorting_type_ =
+                kMinimizeDelta;
+            ++*crunch_configs_size;
+            // Also add modified Zeng's method.
+            crunch_configs[(*crunch_configs_size)].entropy_idx_ = i;
+            crunch_configs[(*crunch_configs_size)].palette_sorting_type_ =
+                kModifiedZeng;
+          } else {
+            crunch_configs[(*crunch_configs_size)].palette_sorting_type_ =
+                kUnusedPalette;
+          }
+          ++*crunch_configs_size;
+        }
+      }
+    } else {
+      // Only choose the guessed best transform.
+      *crunch_configs_size = 1;
+      crunch_configs[0].entropy_idx_ = min_entropy_ix;
+      crunch_configs[0].palette_sorting_type_ =
+          use_palette ? kMinimizeDelta : kUnusedPalette;
+      if (config->quality >= 75 && method == 5) {
+        // Test with and without color cache.
+        do_no_cache = 1;
+        // If we have a palette, also check in combination with spatial.
+        if (min_entropy_ix == kPalette) {
+          *crunch_configs_size = 2;
+          crunch_configs[1].entropy_idx_ = kPaletteAndSpatial;
+          crunch_configs[1].palette_sorting_type_ = kMinimizeDelta;
+        }
+      }
+    }
+  }
+  // Fill in the different LZ77s.
+  assert(n_lz77s <= CRUNCH_SUBCONFIGS_MAX);
+  for (i = 0; i < *crunch_configs_size; ++i) {
+    int j;
+    for (j = 0; j < n_lz77s; ++j) {
+      assert(j < CRUNCH_SUBCONFIGS_MAX);
+      crunch_configs[i].sub_configs_[j].lz77_ =
+          (j == 0) ? kLZ77Standard | kLZ77RLE : kLZ77Box;
+      crunch_configs[i].sub_configs_[j].do_no_cache_ = do_no_cache;
+    }
+    crunch_configs[i].sub_configs_size_ = n_lz77s;
+  }
+  return 1;
+}
+
+static int EncoderInit(VP8LEncoder* const enc) {
+  const WebPPicture* const pic = enc->pic_;
+  const int width = pic->width;
+  const int height = pic->height;
+  const int pix_cnt = width * height;
+  // we round the block size up, so we're guaranteed to have
+  // at most MAX_REFS_BLOCK_PER_IMAGE blocks used:
+  const int refs_block_size = (pix_cnt - 1) / MAX_REFS_BLOCK_PER_IMAGE + 1;
+  int i;
+  if (!VP8LHashChainInit(&enc->hash_chain_, pix_cnt)) return 0;
+
+  for (i = 0; i < 4; ++i) VP8LBackwardRefsInit(&enc->refs_[i], refs_block_size);
+
+  return 1;
+}
+
+// Returns false in case of memory error.
+static int GetHuffBitLengthsAndCodes(
+    const VP8LHistogramSet* const histogram_image,
+    HuffmanTreeCode* const huffman_codes) {
+  int i, k;
+  int ok = 0;
+  uint64_t total_length_size = 0;
+  uint8_t* mem_buf = NULL;
+  const int histogram_image_size = histogram_image->size;
+  int max_num_symbols = 0;
+  uint8_t* buf_rle = NULL;
+  HuffmanTree* huff_tree = NULL;
+
+  // Iterate over all histograms and get the aggregate number of codes used.
+  for (i = 0; i < histogram_image_size; ++i) {
+    const VP8LHistogram* const histo = histogram_image->histograms[i];
+    HuffmanTreeCode* const codes = &huffman_codes[5 * i];
+    assert(histo != NULL);
+    for (k = 0; k < 5; ++k) {
+      const int num_symbols =
+          (k == 0) ? VP8LHistogramNumCodes(histo->palette_code_bits_) :
+          (k == 4) ? NUM_DISTANCE_CODES : 256;
+      codes[k].num_symbols = num_symbols;
+      total_length_size += num_symbols;
+    }
+  }
+
+  // Allocate and Set Huffman codes.
+  {
+    uint16_t* codes;
+    uint8_t* lengths;
+    mem_buf = (uint8_t*)WebPSafeCalloc(total_length_size,
+                                       sizeof(*lengths) + sizeof(*codes));
+    if (mem_buf == NULL) goto End;
+
+    codes = (uint16_t*)mem_buf;
+    lengths = (uint8_t*)&codes[total_length_size];
+    for (i = 0; i < 5 * histogram_image_size; ++i) {
+      const int bit_length = huffman_codes[i].num_symbols;
+      huffman_codes[i].codes = codes;
+      huffman_codes[i].code_lengths = lengths;
+      codes += bit_length;
+      lengths += bit_length;
+      if (max_num_symbols < bit_length) {
+        max_num_symbols = bit_length;
+      }
+    }
+  }
+
+  buf_rle = (uint8_t*)WebPSafeMalloc(1ULL, max_num_symbols);
+  huff_tree = (HuffmanTree*)WebPSafeMalloc(3ULL * max_num_symbols,
+                                           sizeof(*huff_tree));
+  if (buf_rle == NULL || huff_tree == NULL) goto End;
+
+  // Create Huffman trees.
+  for (i = 0; i < histogram_image_size; ++i) {
+    HuffmanTreeCode* const codes = &huffman_codes[5 * i];
+    VP8LHistogram* const histo = histogram_image->histograms[i];
+    VP8LCreateHuffmanTree(histo->literal_, 15, buf_rle, huff_tree, codes + 0);
+    VP8LCreateHuffmanTree(histo->red_, 15, buf_rle, huff_tree, codes + 1);
+    VP8LCreateHuffmanTree(histo->blue_, 15, buf_rle, huff_tree, codes + 2);
+    VP8LCreateHuffmanTree(histo->alpha_, 15, buf_rle, huff_tree, codes + 3);
+    VP8LCreateHuffmanTree(histo->distance_, 15, buf_rle, huff_tree, codes + 4);
+  }
+  ok = 1;
+ End:
+  WebPSafeFree(huff_tree);
+  WebPSafeFree(buf_rle);
+  if (!ok) {
+    WebPSafeFree(mem_buf);
+    memset(huffman_codes, 0, 5 * histogram_image_size * sizeof(*huffman_codes));
+  }
+  return ok;
+}
+
+static void StoreHuffmanTreeOfHuffmanTreeToBitMask(
+    VP8LBitWriter* const bw, const uint8_t* code_length_bitdepth) {
+  // RFC 1951 will calm you down if you are worried about this funny sequence.
+  // This sequence is tuned from that, but more weighted for lower symbol count,
+  // and more spiking histograms.
+  static const uint8_t kStorageOrder[CODE_LENGTH_CODES] = {
+    17, 18, 0, 1, 2, 3, 4, 5, 16, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+  };
+  int i;
+  // Throw away trailing zeros:
+  int codes_to_store = CODE_LENGTH_CODES;
+  for (; codes_to_store > 4; --codes_to_store) {
+    if (code_length_bitdepth[kStorageOrder[codes_to_store - 1]] != 0) {
+      break;
+    }
+  }
+  VP8LPutBits(bw, codes_to_store - 4, 4);
+  for (i = 0; i < codes_to_store; ++i) {
+    VP8LPutBits(bw, code_length_bitdepth[kStorageOrder[i]], 3);
+  }
+}
+
+static void ClearHuffmanTreeIfOnlyOneSymbol(
+    HuffmanTreeCode* const huffman_code) {
+  int k;
+  int count = 0;
+  for (k = 0; k < huffman_code->num_symbols; ++k) {
+    if (huffman_code->code_lengths[k] != 0) {
+      ++count;
+      if (count > 1) return;
+    }
+  }
+  for (k = 0; k < huffman_code->num_symbols; ++k) {
+    huffman_code->code_lengths[k] = 0;
+    huffman_code->codes[k] = 0;
+  }
+}
+
+static void StoreHuffmanTreeToBitMask(
+    VP8LBitWriter* const bw,
+    const HuffmanTreeToken* const tokens, const int num_tokens,
+    const HuffmanTreeCode* const huffman_code) {
+  int i;
+  for (i = 0; i < num_tokens; ++i) {
+    const int ix = tokens[i].code;
+    const int extra_bits = tokens[i].extra_bits;
+    VP8LPutBits(bw, huffman_code->codes[ix], huffman_code->code_lengths[ix]);
+    switch (ix) {
+      case 16:
+        VP8LPutBits(bw, extra_bits, 2);
+        break;
+      case 17:
+        VP8LPutBits(bw, extra_bits, 3);
+        break;
+      case 18:
+        VP8LPutBits(bw, extra_bits, 7);
+        break;
+    }
+  }
+}
+
+// 'huff_tree' and 'tokens' are pre-alloacted buffers.
+static void StoreFullHuffmanCode(VP8LBitWriter* const bw,
+                                 HuffmanTree* const huff_tree,
+                                 HuffmanTreeToken* const tokens,
+                                 const HuffmanTreeCode* const tree) {
+  uint8_t code_length_bitdepth[CODE_LENGTH_CODES] = { 0 };
+  uint16_t code_length_bitdepth_symbols[CODE_LENGTH_CODES] = { 0 };
+  const int max_tokens = tree->num_symbols;
+  int num_tokens;
+  HuffmanTreeCode huffman_code;
+  huffman_code.num_symbols = CODE_LENGTH_CODES;
+  huffman_code.code_lengths = code_length_bitdepth;
+  huffman_code.codes = code_length_bitdepth_symbols;
+
+  VP8LPutBits(bw, 0, 1);
+  num_tokens = VP8LCreateCompressedHuffmanTree(tree, tokens, max_tokens);
+  {
+    uint32_t histogram[CODE_LENGTH_CODES] = { 0 };
+    uint8_t buf_rle[CODE_LENGTH_CODES] = { 0 };
+    int i;
+    for (i = 0; i < num_tokens; ++i) {
+      ++histogram[tokens[i].code];
+    }
+
+    VP8LCreateHuffmanTree(histogram, 7, buf_rle, huff_tree, &huffman_code);
+  }
+
+  StoreHuffmanTreeOfHuffmanTreeToBitMask(bw, code_length_bitdepth);
+  ClearHuffmanTreeIfOnlyOneSymbol(&huffman_code);
+  {
+    int trailing_zero_bits = 0;
+    int trimmed_length = num_tokens;
+    int write_trimmed_length;
+    int length;
+    int i = num_tokens;
+    while (i-- > 0) {
+      const int ix = tokens[i].code;
+      if (ix == 0 || ix == 17 || ix == 18) {
+        --trimmed_length;   // discount trailing zeros
+        trailing_zero_bits += code_length_bitdepth[ix];
+        if (ix == 17) {
+          trailing_zero_bits += 3;
+        } else if (ix == 18) {
+          trailing_zero_bits += 7;
+        }
+      } else {
+        break;
+      }
+    }
+    write_trimmed_length = (trimmed_length > 1 && trailing_zero_bits > 12);
+    length = write_trimmed_length ? trimmed_length : num_tokens;
+    VP8LPutBits(bw, write_trimmed_length, 1);
+    if (write_trimmed_length) {
+      if (trimmed_length == 2) {
+        VP8LPutBits(bw, 0, 3 + 2);     // nbitpairs=1, trimmed_length=2
+      } else {
+        const int nbits = BitsLog2Floor(trimmed_length - 2);
+        const int nbitpairs = nbits / 2 + 1;
+        assert(trimmed_length > 2);
+        assert(nbitpairs - 1 < 8);
+        VP8LPutBits(bw, nbitpairs - 1, 3);
+        VP8LPutBits(bw, trimmed_length - 2, nbitpairs * 2);
+      }
+    }
+    StoreHuffmanTreeToBitMask(bw, tokens, length, &huffman_code);
+  }
+}
+
+// 'huff_tree' and 'tokens' are pre-alloacted buffers.
+static void StoreHuffmanCode(VP8LBitWriter* const bw,
+                             HuffmanTree* const huff_tree,
+                             HuffmanTreeToken* const tokens,
+                             const HuffmanTreeCode* const huffman_code) {
+  int i;
+  int count = 0;
+  int symbols[2] = { 0, 0 };
+  const int kMaxBits = 8;
+  const int kMaxSymbol = 1 << kMaxBits;
+
+  // Check whether it's a small tree.
+  for (i = 0; i < huffman_code->num_symbols && count < 3; ++i) {
+    if (huffman_code->code_lengths[i] != 0) {
+      if (count < 2) symbols[count] = i;
+      ++count;
+    }
+  }
+
+  if (count == 0) {   // emit minimal tree for empty cases
+    // bits: small tree marker: 1, count-1: 0, large 8-bit code: 0, code: 0
+    VP8LPutBits(bw, 0x01, 4);
+  } else if (count <= 2 && symbols[0] < kMaxSymbol && symbols[1] < kMaxSymbol) {
+    VP8LPutBits(bw, 1, 1);  // Small tree marker to encode 1 or 2 symbols.
+    VP8LPutBits(bw, count - 1, 1);
+    if (symbols[0] <= 1) {
+      VP8LPutBits(bw, 0, 1);  // Code bit for small (1 bit) symbol value.
+      VP8LPutBits(bw, symbols[0], 1);
+    } else {
+      VP8LPutBits(bw, 1, 1);
+      VP8LPutBits(bw, symbols[0], 8);
+    }
+    if (count == 2) {
+      VP8LPutBits(bw, symbols[1], 8);
+    }
+  } else {
+    StoreFullHuffmanCode(bw, huff_tree, tokens, huffman_code);
+  }
+}
+
+static WEBP_INLINE void WriteHuffmanCode(VP8LBitWriter* const bw,
+                             const HuffmanTreeCode* const code,
+                             int code_index) {
+  const int depth = code->code_lengths[code_index];
+  const int symbol = code->codes[code_index];
+  VP8LPutBits(bw, symbol, depth);
+}
+
+static WEBP_INLINE void WriteHuffmanCodeWithExtraBits(
+    VP8LBitWriter* const bw,
+    const HuffmanTreeCode* const code,
+    int code_index,
+    int bits,
+    int n_bits) {
+  const int depth = code->code_lengths[code_index];
+  const int symbol = code->codes[code_index];
+  VP8LPutBits(bw, (bits << depth) | symbol, depth + n_bits);
+}
+
+static WebPEncodingError StoreImageToBitMask(
+    VP8LBitWriter* const bw, int width, int histo_bits,
+    const VP8LBackwardRefs* const refs,
+    const uint16_t* histogram_symbols,
+    const HuffmanTreeCode* const huffman_codes) {
+  const int histo_xsize = histo_bits ? VP8LSubSampleSize(width, histo_bits) : 1;
+  const int tile_mask = (histo_bits == 0) ? 0 : -(1 << histo_bits);
+  // x and y trace the position in the image.
+  int x = 0;
+  int y = 0;
+  int tile_x = x & tile_mask;
+  int tile_y = y & tile_mask;
+  int histogram_ix = histogram_symbols[0];
+  const HuffmanTreeCode* codes = huffman_codes + 5 * histogram_ix;
+  VP8LRefsCursor c = VP8LRefsCursorInit(refs);
+  while (VP8LRefsCursorOk(&c)) {
+    const PixOrCopy* const v = c.cur_pos;
+    if ((tile_x != (x & tile_mask)) || (tile_y != (y & tile_mask))) {
+      tile_x = x & tile_mask;
+      tile_y = y & tile_mask;
+      histogram_ix = histogram_symbols[(y >> histo_bits) * histo_xsize +
+                                       (x >> histo_bits)];
+      codes = huffman_codes + 5 * histogram_ix;
+    }
+    if (PixOrCopyIsLiteral(v)) {
+      static const uint8_t order[] = { 1, 2, 0, 3 };
+      int k;
+      for (k = 0; k < 4; ++k) {
+        const int code = PixOrCopyLiteral(v, order[k]);
+        WriteHuffmanCode(bw, codes + k, code);
+      }
+    } else if (PixOrCopyIsCacheIdx(v)) {
+      const int code = PixOrCopyCacheIdx(v);
+      const int literal_ix = 256 + NUM_LENGTH_CODES + code;
+      WriteHuffmanCode(bw, codes, literal_ix);
+    } else {
+      int bits, n_bits;
+      int code;
+
+      const int distance = PixOrCopyDistance(v);
+      VP8LPrefixEncode(v->len, &code, &n_bits, &bits);
+      WriteHuffmanCodeWithExtraBits(bw, codes, 256 + code, bits, n_bits);
+
+      // Don't write the distance with the extra bits code since
+      // the distance can be up to 18 bits of extra bits, and the prefix
+      // 15 bits, totaling to 33, and our PutBits only supports up to 32 bits.
+      VP8LPrefixEncode(distance, &code, &n_bits, &bits);
+      WriteHuffmanCode(bw, codes + 4, code);
+      VP8LPutBits(bw, bits, n_bits);
+    }
+    x += PixOrCopyLength(v);
+    while (x >= width) {
+      x -= width;
+      ++y;
+    }
+    VP8LRefsCursorNext(&c);
+  }
+  return bw->error_ ? VP8_ENC_ERROR_OUT_OF_MEMORY : VP8_ENC_OK;
+}
+
+// Special case of EncodeImageInternal() for cache-bits=0, histo_bits=31
+static WebPEncodingError EncodeImageNoHuffman(
+    VP8LBitWriter* const bw, const uint32_t* const argb,
+    VP8LHashChain* const hash_chain, VP8LBackwardRefs* const refs_array,
+    int width, int height, int quality, int low_effort) {
+  int i;
+  int max_tokens = 0;
+  WebPEncodingError err = VP8_ENC_OK;
+  VP8LBackwardRefs* refs;
+  HuffmanTreeToken* tokens = NULL;
+  HuffmanTreeCode huffman_codes[5] = { { 0, NULL, NULL } };
+  const uint16_t histogram_symbols[1] = { 0 };    // only one tree, one symbol
+  int cache_bits = 0;
+  VP8LHistogramSet* histogram_image = NULL;
+  HuffmanTree* const huff_tree = (HuffmanTree*)WebPSafeMalloc(
+        3ULL * CODE_LENGTH_CODES, sizeof(*huff_tree));
+  if (huff_tree == NULL) {
+    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+    goto Error;
+  }
+
+  // Calculate backward references from ARGB image.
+  if (!VP8LHashChainFill(hash_chain, quality, argb, width, height,
+                         low_effort)) {
+    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+    goto Error;
+  }
+  err = VP8LGetBackwardReferences(
+      width, height, argb, quality, /*low_effort=*/0, kLZ77Standard | kLZ77RLE,
+      cache_bits, /*do_no_cache=*/0, hash_chain, refs_array, &cache_bits);
+  if (err != VP8_ENC_OK) goto Error;
+  refs = &refs_array[0];
+  histogram_image = VP8LAllocateHistogramSet(1, cache_bits);
+  if (histogram_image == NULL) {
+    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+    goto Error;
+  }
+  VP8LHistogramSetClear(histogram_image);
+
+  // Build histogram image and symbols from backward references.
+  VP8LHistogramStoreRefs(refs, histogram_image->histograms[0]);
+
+  // Create Huffman bit lengths and codes for each histogram image.
+  assert(histogram_image->size == 1);
+  if (!GetHuffBitLengthsAndCodes(histogram_image, huffman_codes)) {
+    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+    goto Error;
+  }
+
+  // No color cache, no Huffman image.
+  VP8LPutBits(bw, 0, 1);
+
+  // Find maximum number of symbols for the huffman tree-set.
+  for (i = 0; i < 5; ++i) {
+    HuffmanTreeCode* const codes = &huffman_codes[i];
+    if (max_tokens < codes->num_symbols) {
+      max_tokens = codes->num_symbols;
+    }
+  }
+
+  tokens = (HuffmanTreeToken*)WebPSafeMalloc(max_tokens, sizeof(*tokens));
+  if (tokens == NULL) {
+    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+    goto Error;
+  }
+
+  // Store Huffman codes.
+  for (i = 0; i < 5; ++i) {
+    HuffmanTreeCode* const codes = &huffman_codes[i];
+    StoreHuffmanCode(bw, huff_tree, tokens, codes);
+    ClearHuffmanTreeIfOnlyOneSymbol(codes);
+  }
+
+  // Store actual literals.
+  err = StoreImageToBitMask(bw, width, 0, refs, histogram_symbols,
+                            huffman_codes);
+
+ Error:
+  WebPSafeFree(tokens);
+  WebPSafeFree(huff_tree);
+  VP8LFreeHistogramSet(histogram_image);
+  WebPSafeFree(huffman_codes[0].codes);
+  return err;
+}
+
+static WebPEncodingError EncodeImageInternal(
+    VP8LBitWriter* const bw, const uint32_t* const argb,
+    VP8LHashChain* const hash_chain, VP8LBackwardRefs refs_array[4], int width,
+    int height, int quality, int low_effort, int use_cache,
+    const CrunchConfig* const config, int* cache_bits, int histogram_bits,
+    size_t init_byte_position, int* const hdr_size, int* const data_size) {
+  WebPEncodingError err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+  const uint32_t histogram_image_xysize =
+      VP8LSubSampleSize(width, histogram_bits) *
+      VP8LSubSampleSize(height, histogram_bits);
+  VP8LHistogramSet* histogram_image = NULL;
+  VP8LHistogram* tmp_histo = NULL;
+  int histogram_image_size = 0;
+  size_t bit_array_size = 0;
+  HuffmanTree* const huff_tree = (HuffmanTree*)WebPSafeMalloc(
+      3ULL * CODE_LENGTH_CODES, sizeof(*huff_tree));
+  HuffmanTreeToken* tokens = NULL;
+  HuffmanTreeCode* huffman_codes = NULL;
+  uint16_t* const histogram_symbols =
+      (uint16_t*)WebPSafeMalloc(histogram_image_xysize,
+                                sizeof(*histogram_symbols));
+  int sub_configs_idx;
+  int cache_bits_init, write_histogram_image;
+  VP8LBitWriter bw_init = *bw, bw_best;
+  int hdr_size_tmp;
+  VP8LHashChain hash_chain_histogram;  // histogram image hash chain
+  size_t bw_size_best = ~(size_t)0;
+  assert(histogram_bits >= MIN_HUFFMAN_BITS);
+  assert(histogram_bits <= MAX_HUFFMAN_BITS);
+  assert(hdr_size != NULL);
+  assert(data_size != NULL);
+
+  // Make sure we can allocate the different objects.
+  memset(&hash_chain_histogram, 0, sizeof(hash_chain_histogram));
+  if (huff_tree == NULL || histogram_symbols == NULL ||
+      !VP8LHashChainInit(&hash_chain_histogram, histogram_image_xysize) ||
+      !VP8LHashChainFill(hash_chain, quality, argb, width, height,
+                         low_effort)) {
+    goto Error;
+  }
+  if (use_cache) {
+    // If the value is different from zero, it has been set during the
+    // palette analysis.
+    cache_bits_init = (*cache_bits == 0) ? MAX_COLOR_CACHE_BITS : *cache_bits;
+  } else {
+    cache_bits_init = 0;
+  }
+  // If several iterations will happen, clone into bw_best.
+  if (!VP8LBitWriterInit(&bw_best, 0) ||
+      ((config->sub_configs_size_ > 1 ||
+        config->sub_configs_[0].do_no_cache_) &&
+       !VP8LBitWriterClone(bw, &bw_best))) {
+    goto Error;
+  }
+  for (sub_configs_idx = 0; sub_configs_idx < config->sub_configs_size_;
+       ++sub_configs_idx) {
+    const CrunchSubConfig* const sub_config =
+        &config->sub_configs_[sub_configs_idx];
+    int cache_bits_best, i_cache;
+    err = VP8LGetBackwardReferences(width, height, argb, quality, low_effort,
+                                    sub_config->lz77_, cache_bits_init,
+                                    sub_config->do_no_cache_, hash_chain,
+                                    &refs_array[0], &cache_bits_best);
+    if (err != VP8_ENC_OK) goto Error;
+
+    for (i_cache = 0; i_cache < (sub_config->do_no_cache_ ? 2 : 1); ++i_cache) {
+      const int cache_bits_tmp = (i_cache == 0) ? cache_bits_best : 0;
+      // Speed-up: no need to study the no-cache case if it was already studied
+      // in i_cache == 0.
+      if (i_cache == 1 && cache_bits_best == 0) break;
+
+      // Reset the bit writer for this iteration.
+      VP8LBitWriterReset(&bw_init, bw);
+
+      // Build histogram image and symbols from backward references.
+      histogram_image =
+          VP8LAllocateHistogramSet(histogram_image_xysize, cache_bits_tmp);
+      tmp_histo = VP8LAllocateHistogram(cache_bits_tmp);
+      if (histogram_image == NULL || tmp_histo == NULL ||
+          !VP8LGetHistoImageSymbols(width, height, &refs_array[i_cache],
+                                    quality, low_effort, histogram_bits,
+                                    cache_bits_tmp, histogram_image, tmp_histo,
+                                    histogram_symbols)) {
+        goto Error;
+      }
+      // Create Huffman bit lengths and codes for each histogram image.
+      histogram_image_size = histogram_image->size;
+      bit_array_size = 5 * histogram_image_size;
+      huffman_codes = (HuffmanTreeCode*)WebPSafeCalloc(bit_array_size,
+                                                       sizeof(*huffman_codes));
+      // Note: some histogram_image entries may point to tmp_histos[], so the
+      // latter need to outlive the following call to
+      // GetHuffBitLengthsAndCodes().
+      if (huffman_codes == NULL ||
+          !GetHuffBitLengthsAndCodes(histogram_image, huffman_codes)) {
+        goto Error;
+      }
+      // Free combined histograms.
+      VP8LFreeHistogramSet(histogram_image);
+      histogram_image = NULL;
+
+      // Free scratch histograms.
+      VP8LFreeHistogram(tmp_histo);
+      tmp_histo = NULL;
+
+      // Color Cache parameters.
+      if (cache_bits_tmp > 0) {
+        VP8LPutBits(bw, 1, 1);
+        VP8LPutBits(bw, cache_bits_tmp, 4);
+      } else {
+        VP8LPutBits(bw, 0, 1);
+      }
+
+      // Huffman image + meta huffman.
+      write_histogram_image = (histogram_image_size > 1);
+      VP8LPutBits(bw, write_histogram_image, 1);
+      if (write_histogram_image) {
+        uint32_t* const histogram_argb =
+            (uint32_t*)WebPSafeMalloc(histogram_image_xysize,
+                                      sizeof(*histogram_argb));
+        int max_index = 0;
+        uint32_t i;
+        if (histogram_argb == NULL) goto Error;
+        for (i = 0; i < histogram_image_xysize; ++i) {
+          const int symbol_index = histogram_symbols[i] & 0xffff;
+          histogram_argb[i] = (symbol_index << 8);
+          if (symbol_index >= max_index) {
+            max_index = symbol_index + 1;
+          }
+        }
+        histogram_image_size = max_index;
+
+        VP8LPutBits(bw, histogram_bits - 2, 3);
+        err = EncodeImageNoHuffman(
+            bw, histogram_argb, &hash_chain_histogram, &refs_array[2],
+            VP8LSubSampleSize(width, histogram_bits),
+            VP8LSubSampleSize(height, histogram_bits), quality, low_effort);
+        WebPSafeFree(histogram_argb);
+        if (err != VP8_ENC_OK) goto Error;
+      }
+
+      // Store Huffman codes.
+      {
+        int i;
+        int max_tokens = 0;
+        // Find maximum number of symbols for the huffman tree-set.
+        for (i = 0; i < 5 * histogram_image_size; ++i) {
+          HuffmanTreeCode* const codes = &huffman_codes[i];
+          if (max_tokens < codes->num_symbols) {
+            max_tokens = codes->num_symbols;
+          }
+        }
+        tokens = (HuffmanTreeToken*)WebPSafeMalloc(max_tokens, sizeof(*tokens));
+        if (tokens == NULL) goto Error;
+        for (i = 0; i < 5 * histogram_image_size; ++i) {
+          HuffmanTreeCode* const codes = &huffman_codes[i];
+          StoreHuffmanCode(bw, huff_tree, tokens, codes);
+          ClearHuffmanTreeIfOnlyOneSymbol(codes);
+        }
+      }
+      // Store actual literals.
+      hdr_size_tmp = (int)(VP8LBitWriterNumBytes(bw) - init_byte_position);
+      err = StoreImageToBitMask(bw, width, histogram_bits, &refs_array[i_cache],
+                                histogram_symbols, huffman_codes);
+      if (err != VP8_ENC_OK) goto Error;
+      // Keep track of the smallest image so far.
+      if (VP8LBitWriterNumBytes(bw) < bw_size_best) {
+        bw_size_best = VP8LBitWriterNumBytes(bw);
+        *cache_bits = cache_bits_tmp;
+        *hdr_size = hdr_size_tmp;
+        *data_size =
+            (int)(VP8LBitWriterNumBytes(bw) - init_byte_position - *hdr_size);
+        VP8LBitWriterSwap(bw, &bw_best);
+      }
+      WebPSafeFree(tokens);
+      tokens = NULL;
+      if (huffman_codes != NULL) {
+        WebPSafeFree(huffman_codes->codes);
+        WebPSafeFree(huffman_codes);
+        huffman_codes = NULL;
+      }
+    }
+  }
+  VP8LBitWriterSwap(bw, &bw_best);
+  err = VP8_ENC_OK;
+
+ Error:
+  WebPSafeFree(tokens);
+  WebPSafeFree(huff_tree);
+  VP8LFreeHistogramSet(histogram_image);
+  VP8LFreeHistogram(tmp_histo);
+  VP8LHashChainClear(&hash_chain_histogram);
+  if (huffman_codes != NULL) {
+    WebPSafeFree(huffman_codes->codes);
+    WebPSafeFree(huffman_codes);
+  }
+  WebPSafeFree(histogram_symbols);
+  VP8LBitWriterWipeOut(&bw_best);
+  return err;
+}
+
+// -----------------------------------------------------------------------------
+// Transforms
+
+static void ApplySubtractGreen(VP8LEncoder* const enc, int width, int height,
+                               VP8LBitWriter* const bw) {
+  VP8LPutBits(bw, TRANSFORM_PRESENT, 1);
+  VP8LPutBits(bw, SUBTRACT_GREEN, 2);
+  VP8LSubtractGreenFromBlueAndRed(enc->argb_, width * height);
+}
+
+static WebPEncodingError ApplyPredictFilter(const VP8LEncoder* const enc,
+                                            int width, int height,
+                                            int quality, int low_effort,
+                                            int used_subtract_green,
+                                            VP8LBitWriter* const bw) {
+  const int pred_bits = enc->transform_bits_;
+  const int transform_width = VP8LSubSampleSize(width, pred_bits);
+  const int transform_height = VP8LSubSampleSize(height, pred_bits);
+  // we disable near-lossless quantization if palette is used.
+  const int near_lossless_strength = enc->use_palette_ ? 100
+                                   : enc->config_->near_lossless;
+
+  VP8LResidualImage(width, height, pred_bits, low_effort, enc->argb_,
+                    enc->argb_scratch_, enc->transform_data_,
+                    near_lossless_strength, enc->config_->exact,
+                    used_subtract_green);
+  VP8LPutBits(bw, TRANSFORM_PRESENT, 1);
+  VP8LPutBits(bw, PREDICTOR_TRANSFORM, 2);
+  assert(pred_bits >= 2);
+  VP8LPutBits(bw, pred_bits - 2, 3);
+  return EncodeImageNoHuffman(
+      bw, enc->transform_data_, (VP8LHashChain*)&enc->hash_chain_,
+      (VP8LBackwardRefs*)&enc->refs_[0], transform_width, transform_height,
+      quality, low_effort);
+}
+
+static WebPEncodingError ApplyCrossColorFilter(const VP8LEncoder* const enc,
+                                               int width, int height,
+                                               int quality, int low_effort,
+                                               VP8LBitWriter* const bw) {
+  const int ccolor_transform_bits = enc->transform_bits_;
+  const int transform_width = VP8LSubSampleSize(width, ccolor_transform_bits);
+  const int transform_height = VP8LSubSampleSize(height, ccolor_transform_bits);
+
+  VP8LColorSpaceTransform(width, height, ccolor_transform_bits, quality,
+                          enc->argb_, enc->transform_data_);
+  VP8LPutBits(bw, TRANSFORM_PRESENT, 1);
+  VP8LPutBits(bw, CROSS_COLOR_TRANSFORM, 2);
+  assert(ccolor_transform_bits >= 2);
+  VP8LPutBits(bw, ccolor_transform_bits - 2, 3);
+  return EncodeImageNoHuffman(
+      bw, enc->transform_data_, (VP8LHashChain*)&enc->hash_chain_,
+      (VP8LBackwardRefs*)&enc->refs_[0], transform_width, transform_height,
+      quality, low_effort);
+}
+
+// -----------------------------------------------------------------------------
+
+static WebPEncodingError WriteRiffHeader(const WebPPicture* const pic,
+                                         size_t riff_size, size_t vp8l_size) {
+  uint8_t riff[RIFF_HEADER_SIZE + CHUNK_HEADER_SIZE + VP8L_SIGNATURE_SIZE] = {
+    'R', 'I', 'F', 'F', 0, 0, 0, 0, 'W', 'E', 'B', 'P',
+    'V', 'P', '8', 'L', 0, 0, 0, 0, VP8L_MAGIC_BYTE,
+  };
+  PutLE32(riff + TAG_SIZE, (uint32_t)riff_size);
+  PutLE32(riff + RIFF_HEADER_SIZE + TAG_SIZE, (uint32_t)vp8l_size);
+  if (!pic->writer(riff, sizeof(riff), pic)) {
+    return VP8_ENC_ERROR_BAD_WRITE;
+  }
+  return VP8_ENC_OK;
+}
+
+static int WriteImageSize(const WebPPicture* const pic,
+                          VP8LBitWriter* const bw) {
+  const int width = pic->width - 1;
+  const int height = pic->height - 1;
+  assert(width < WEBP_MAX_DIMENSION && height < WEBP_MAX_DIMENSION);
+
+  VP8LPutBits(bw, width, VP8L_IMAGE_SIZE_BITS);
+  VP8LPutBits(bw, height, VP8L_IMAGE_SIZE_BITS);
+  return !bw->error_;
+}
+
+static int WriteRealAlphaAndVersion(VP8LBitWriter* const bw, int has_alpha) {
+  VP8LPutBits(bw, has_alpha, 1);
+  VP8LPutBits(bw, VP8L_VERSION, VP8L_VERSION_BITS);
+  return !bw->error_;
+}
+
+static WebPEncodingError WriteImage(const WebPPicture* const pic,
+                                    VP8LBitWriter* const bw,
+                                    size_t* const coded_size) {
+  WebPEncodingError err = VP8_ENC_OK;
+  const uint8_t* const webpll_data = VP8LBitWriterFinish(bw);
+  const size_t webpll_size = VP8LBitWriterNumBytes(bw);
+  const size_t vp8l_size = VP8L_SIGNATURE_SIZE + webpll_size;
+  const size_t pad = vp8l_size & 1;
+  const size_t riff_size = TAG_SIZE + CHUNK_HEADER_SIZE + vp8l_size + pad;
+
+  err = WriteRiffHeader(pic, riff_size, vp8l_size);
+  if (err != VP8_ENC_OK) goto Error;
+
+  if (!pic->writer(webpll_data, webpll_size, pic)) {
+    err = VP8_ENC_ERROR_BAD_WRITE;
+    goto Error;
+  }
+
+  if (pad) {
+    const uint8_t pad_byte[1] = { 0 };
+    if (!pic->writer(pad_byte, 1, pic)) {
+      err = VP8_ENC_ERROR_BAD_WRITE;
+      goto Error;
+    }
+  }
+  *coded_size = CHUNK_HEADER_SIZE + riff_size;
+  return VP8_ENC_OK;
+
+ Error:
+  return err;
+}
+
+// -----------------------------------------------------------------------------
+
+static void ClearTransformBuffer(VP8LEncoder* const enc) {
+  WebPSafeFree(enc->transform_mem_);
+  enc->transform_mem_ = NULL;
+  enc->transform_mem_size_ = 0;
+}
+
+// Allocates the memory for argb (W x H) buffer, 2 rows of context for
+// prediction and transform data.
+// Flags influencing the memory allocated:
+//  enc->transform_bits_
+//  enc->use_predict_, enc->use_cross_color_
+static WebPEncodingError AllocateTransformBuffer(VP8LEncoder* const enc,
+                                                 int width, int height) {
+  WebPEncodingError err = VP8_ENC_OK;
+  const uint64_t image_size = width * height;
+  // VP8LResidualImage needs room for 2 scanlines of uint32 pixels with an extra
+  // pixel in each, plus 2 regular scanlines of bytes.
+  // TODO(skal): Clean up by using arithmetic in bytes instead of words.
+  const uint64_t argb_scratch_size =
+      enc->use_predict_
+          ? (width + 1) * 2 +
+            (width * 2 + sizeof(uint32_t) - 1) / sizeof(uint32_t)
+          : 0;
+  const uint64_t transform_data_size =
+      (enc->use_predict_ || enc->use_cross_color_)
+          ? VP8LSubSampleSize(width, enc->transform_bits_) *
+                VP8LSubSampleSize(height, enc->transform_bits_)
+          : 0;
+  const uint64_t max_alignment_in_words =
+      (WEBP_ALIGN_CST + sizeof(uint32_t) - 1) / sizeof(uint32_t);
+  const uint64_t mem_size =
+      image_size + max_alignment_in_words +
+      argb_scratch_size + max_alignment_in_words +
+      transform_data_size;
+  uint32_t* mem = enc->transform_mem_;
+  if (mem == NULL || mem_size > enc->transform_mem_size_) {
+    ClearTransformBuffer(enc);
+    mem = (uint32_t*)WebPSafeMalloc(mem_size, sizeof(*mem));
+    if (mem == NULL) {
+      err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+      goto Error;
+    }
+    enc->transform_mem_ = mem;
+    enc->transform_mem_size_ = (size_t)mem_size;
+    enc->argb_content_ = kEncoderNone;
+  }
+  enc->argb_ = mem;
+  mem = (uint32_t*)WEBP_ALIGN(mem + image_size);
+  enc->argb_scratch_ = mem;
+  mem = (uint32_t*)WEBP_ALIGN(mem + argb_scratch_size);
+  enc->transform_data_ = mem;
+
+  enc->current_width_ = width;
+ Error:
+  return err;
+}
+
+static WebPEncodingError MakeInputImageCopy(VP8LEncoder* const enc) {
+  WebPEncodingError err = VP8_ENC_OK;
+  const WebPPicture* const picture = enc->pic_;
+  const int width = picture->width;
+  const int height = picture->height;
+
+  err = AllocateTransformBuffer(enc, width, height);
+  if (err != VP8_ENC_OK) return err;
+  if (enc->argb_content_ == kEncoderARGB) return VP8_ENC_OK;
+
+  {
+    uint32_t* dst = enc->argb_;
+    const uint32_t* src = picture->argb;
+    int y;
+    for (y = 0; y < height; ++y) {
+      memcpy(dst, src, width * sizeof(*dst));
+      dst += width;
+      src += picture->argb_stride;
+    }
+  }
+  enc->argb_content_ = kEncoderARGB;
+  assert(enc->current_width_ == width);
+  return VP8_ENC_OK;
+}
+
+// -----------------------------------------------------------------------------
+
+#define APPLY_PALETTE_GREEDY_MAX 4
+
+static WEBP_INLINE uint32_t SearchColorGreedy(const uint32_t palette[],
+                                              int palette_size,
+                                              uint32_t color) {
+  (void)palette_size;
+  assert(palette_size < APPLY_PALETTE_GREEDY_MAX);
+  assert(3 == APPLY_PALETTE_GREEDY_MAX - 1);
+  if (color == palette[0]) return 0;
+  if (color == palette[1]) return 1;
+  if (color == palette[2]) return 2;
+  return 3;
+}
+
+static WEBP_INLINE uint32_t ApplyPaletteHash0(uint32_t color) {
+  // Focus on the green color.
+  return (color >> 8) & 0xff;
+}
+
+#define PALETTE_INV_SIZE_BITS 11
+#define PALETTE_INV_SIZE (1 << PALETTE_INV_SIZE_BITS)
+
+static WEBP_INLINE uint32_t ApplyPaletteHash1(uint32_t color) {
+  // Forget about alpha.
+  return ((uint32_t)((color & 0x00ffffffu) * 4222244071ull)) >>
+         (32 - PALETTE_INV_SIZE_BITS);
+}
+
+static WEBP_INLINE uint32_t ApplyPaletteHash2(uint32_t color) {
+  // Forget about alpha.
+  return ((uint32_t)((color & 0x00ffffffu) * ((1ull << 31) - 1))) >>
+         (32 - PALETTE_INV_SIZE_BITS);
+}
+
+// Use 1 pixel cache for ARGB pixels.
+#define APPLY_PALETTE_FOR(COLOR_INDEX) do {         \
+  uint32_t prev_pix = palette[0];                   \
+  uint32_t prev_idx = 0;                            \
+  for (y = 0; y < height; ++y) {                    \
+    for (x = 0; x < width; ++x) {                   \
+      const uint32_t pix = src[x];                  \
+      if (pix != prev_pix) {                        \
+        prev_idx = COLOR_INDEX;                     \
+        prev_pix = pix;                             \
+      }                                             \
+      tmp_row[x] = prev_idx;                        \
+    }                                               \
+    VP8LBundleColorMap(tmp_row, width, xbits, dst); \
+    src += src_stride;                              \
+    dst += dst_stride;                              \
+  }                                                 \
+} while (0)
+
+// Remap argb values in src[] to packed palettes entries in dst[]
+// using 'row' as a temporary buffer of size 'width'.
+// We assume that all src[] values have a corresponding entry in the palette.
+// Note: src[] can be the same as dst[]
+static WebPEncodingError ApplyPalette(const uint32_t* src, uint32_t src_stride,
+                                      uint32_t* dst, uint32_t dst_stride,
+                                      const uint32_t* palette, int palette_size,
+                                      int width, int height, int xbits) {
+  // TODO(skal): this tmp buffer is not needed if VP8LBundleColorMap() can be
+  // made to work in-place.
+  uint8_t* const tmp_row = (uint8_t*)WebPSafeMalloc(width, sizeof(*tmp_row));
+  int x, y;
+
+  if (tmp_row == NULL) return VP8_ENC_ERROR_OUT_OF_MEMORY;
+
+  if (palette_size < APPLY_PALETTE_GREEDY_MAX) {
+    APPLY_PALETTE_FOR(SearchColorGreedy(palette, palette_size, pix));
+  } else {
+    int i, j;
+    uint16_t buffer[PALETTE_INV_SIZE];
+    uint32_t (*const hash_functions[])(uint32_t) = {
+        ApplyPaletteHash0, ApplyPaletteHash1, ApplyPaletteHash2
+    };
+
+    // Try to find a perfect hash function able to go from a color to an index
+    // within 1 << PALETTE_INV_SIZE_BITS in order to build a hash map to go
+    // from color to index in palette.
+    for (i = 0; i < 3; ++i) {
+      int use_LUT = 1;
+      // Set each element in buffer to max uint16_t.
+      memset(buffer, 0xff, sizeof(buffer));
+      for (j = 0; j < palette_size; ++j) {
+        const uint32_t ind = hash_functions[i](palette[j]);
+        if (buffer[ind] != 0xffffu) {
+          use_LUT = 0;
+          break;
+        } else {
+          buffer[ind] = j;
+        }
+      }
+      if (use_LUT) break;
+    }
+
+    if (i == 0) {
+      APPLY_PALETTE_FOR(buffer[ApplyPaletteHash0(pix)]);
+    } else if (i == 1) {
+      APPLY_PALETTE_FOR(buffer[ApplyPaletteHash1(pix)]);
+    } else if (i == 2) {
+      APPLY_PALETTE_FOR(buffer[ApplyPaletteHash2(pix)]);
+    } else {
+      uint32_t idx_map[MAX_PALETTE_SIZE];
+      uint32_t palette_sorted[MAX_PALETTE_SIZE];
+      PrepareMapToPalette(palette, palette_size, palette_sorted, idx_map);
+      APPLY_PALETTE_FOR(
+          idx_map[SearchColorNoIdx(palette_sorted, pix, palette_size)]);
+    }
+  }
+  WebPSafeFree(tmp_row);
+  return VP8_ENC_OK;
+}
+#undef APPLY_PALETTE_FOR
+#undef PALETTE_INV_SIZE_BITS
+#undef PALETTE_INV_SIZE
+#undef APPLY_PALETTE_GREEDY_MAX
+
+// Note: Expects "enc->palette_" to be set properly.
+static WebPEncodingError MapImageFromPalette(VP8LEncoder* const enc,
+                                             int in_place) {
+  WebPEncodingError err = VP8_ENC_OK;
+  const WebPPicture* const pic = enc->pic_;
+  const int width = pic->width;
+  const int height = pic->height;
+  const uint32_t* const palette = enc->palette_;
+  const uint32_t* src = in_place ? enc->argb_ : pic->argb;
+  const int src_stride = in_place ? enc->current_width_ : pic->argb_stride;
+  const int palette_size = enc->palette_size_;
+  int xbits;
+
+  // Replace each input pixel by corresponding palette index.
+  // This is done line by line.
+  if (palette_size <= 4) {
+    xbits = (palette_size <= 2) ? 3 : 2;
+  } else {
+    xbits = (palette_size <= 16) ? 1 : 0;
+  }
+
+  err = AllocateTransformBuffer(enc, VP8LSubSampleSize(width, xbits), height);
+  if (err != VP8_ENC_OK) return err;
+
+  err = ApplyPalette(src, src_stride,
+                     enc->argb_, enc->current_width_,
+                     palette, palette_size, width, height, xbits);
+  enc->argb_content_ = kEncoderPalette;
+  return err;
+}
+
+// Save palette_[] to bitstream.
+static WebPEncodingError EncodePalette(VP8LBitWriter* const bw, int low_effort,
+                                       VP8LEncoder* const enc) {
+  int i;
+  uint32_t tmp_palette[MAX_PALETTE_SIZE];
+  const int palette_size = enc->palette_size_;
+  const uint32_t* const palette = enc->palette_;
+  VP8LPutBits(bw, TRANSFORM_PRESENT, 1);
+  VP8LPutBits(bw, COLOR_INDEXING_TRANSFORM, 2);
+  assert(palette_size >= 1 && palette_size <= MAX_PALETTE_SIZE);
+  VP8LPutBits(bw, palette_size - 1, 8);
+  for (i = palette_size - 1; i >= 1; --i) {
+    tmp_palette[i] = VP8LSubPixels(palette[i], palette[i - 1]);
+  }
+  tmp_palette[0] = palette[0];
+  return EncodeImageNoHuffman(bw, tmp_palette, &enc->hash_chain_,
+                              &enc->refs_[0], palette_size, 1, /*quality=*/20,
+                              low_effort);
+}
+
+// -----------------------------------------------------------------------------
+// VP8LEncoder
+
+static VP8LEncoder* VP8LEncoderNew(const WebPConfig* const config,
+                                   const WebPPicture* const picture) {
+  VP8LEncoder* const enc = (VP8LEncoder*)WebPSafeCalloc(1ULL, sizeof(*enc));
+  if (enc == NULL) {
+    WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
+    return NULL;
+  }
+  enc->config_ = config;
+  enc->pic_ = picture;
+  enc->argb_content_ = kEncoderNone;
+
+  VP8LEncDspInit();
+
+  return enc;
+}
+
+static void VP8LEncoderDelete(VP8LEncoder* enc) {
+  if (enc != NULL) {
+    int i;
+    VP8LHashChainClear(&enc->hash_chain_);
+    for (i = 0; i < 4; ++i) VP8LBackwardRefsClear(&enc->refs_[i]);
+    ClearTransformBuffer(enc);
+    WebPSafeFree(enc);
+  }
+}
+
+// -----------------------------------------------------------------------------
+// Main call
+
+typedef struct {
+  const WebPConfig* config_;
+  const WebPPicture* picture_;
+  VP8LBitWriter* bw_;
+  VP8LEncoder* enc_;
+  int use_cache_;
+  CrunchConfig crunch_configs_[CRUNCH_CONFIGS_MAX];
+  int num_crunch_configs_;
+  int red_and_blue_always_zero_;
+  WebPEncodingError err_;
+  WebPAuxStats* stats_;
+} StreamEncodeContext;
+
+static int EncodeStreamHook(void* input, void* data2) {
+  StreamEncodeContext* const params = (StreamEncodeContext*)input;
+  const WebPConfig* const config = params->config_;
+  const WebPPicture* const picture = params->picture_;
+  VP8LBitWriter* const bw = params->bw_;
+  VP8LEncoder* const enc = params->enc_;
+  const int use_cache = params->use_cache_;
+  const CrunchConfig* const crunch_configs = params->crunch_configs_;
+  const int num_crunch_configs = params->num_crunch_configs_;
+  const int red_and_blue_always_zero = params->red_and_blue_always_zero_;
+#if !defined(WEBP_DISABLE_STATS)
+  WebPAuxStats* const stats = params->stats_;
+#endif
+  WebPEncodingError err = VP8_ENC_OK;
+  const int quality = (int)config->quality;
+  const int low_effort = (config->method == 0);
+#if (WEBP_NEAR_LOSSLESS == 1)
+  const int width = picture->width;
+#endif
+  const int height = picture->height;
+  const size_t byte_position = VP8LBitWriterNumBytes(bw);
+#if (WEBP_NEAR_LOSSLESS == 1)
+  int use_near_lossless = 0;
+#endif
+  int hdr_size = 0;
+  int data_size = 0;
+  int use_delta_palette = 0;
+  int idx;
+  size_t best_size = ~(size_t)0;
+  VP8LBitWriter bw_init = *bw, bw_best;
+  (void)data2;
+
+  if (!VP8LBitWriterInit(&bw_best, 0) ||
+      (num_crunch_configs > 1 && !VP8LBitWriterClone(bw, &bw_best))) {
+    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+    goto Error;
+  }
+
+  for (idx = 0; idx < num_crunch_configs; ++idx) {
+    const int entropy_idx = crunch_configs[idx].entropy_idx_;
+    enc->use_palette_ =
+        (entropy_idx == kPalette) || (entropy_idx == kPaletteAndSpatial);
+    enc->use_subtract_green_ =
+        (entropy_idx == kSubGreen) || (entropy_idx == kSpatialSubGreen);
+    enc->use_predict_ = (entropy_idx == kSpatial) ||
+                        (entropy_idx == kSpatialSubGreen) ||
+                        (entropy_idx == kPaletteAndSpatial);
+    // When using a palette, R/B==0, hence no need to test for cross-color.
+    if (low_effort || enc->use_palette_) {
+      enc->use_cross_color_ = 0;
+    } else {
+      enc->use_cross_color_ = red_and_blue_always_zero ? 0 : enc->use_predict_;
+    }
+    // Reset any parameter in the encoder that is set in the previous iteration.
+    enc->cache_bits_ = 0;
+    VP8LBackwardRefsClear(&enc->refs_[0]);
+    VP8LBackwardRefsClear(&enc->refs_[1]);
+
+#if (WEBP_NEAR_LOSSLESS == 1)
+    // Apply near-lossless preprocessing.
+    use_near_lossless = (config->near_lossless < 100) && !enc->use_palette_ &&
+                        !enc->use_predict_;
+    if (use_near_lossless) {
+      err = AllocateTransformBuffer(enc, width, height);
+      if (err != VP8_ENC_OK) goto Error;
+      if ((enc->argb_content_ != kEncoderNearLossless) &&
+          !VP8ApplyNearLossless(picture, config->near_lossless, enc->argb_)) {
+        err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+        goto Error;
+      }
+      enc->argb_content_ = kEncoderNearLossless;
+    } else {
+      enc->argb_content_ = kEncoderNone;
+    }
+#else
+    enc->argb_content_ = kEncoderNone;
+#endif
+
+    // Encode palette
+    if (enc->use_palette_) {
+      if (crunch_configs[idx].palette_sorting_type_ == kSortedDefault) {
+        // Nothing to do, we have already sorted the palette.
+        memcpy(enc->palette_, enc->palette_sorted_,
+               enc->palette_size_ * sizeof(*enc->palette_));
+      } else if (crunch_configs[idx].palette_sorting_type_ == kMinimizeDelta) {
+        PaletteSortMinimizeDeltas(enc->palette_sorted_, enc->palette_size_,
+                                  enc->palette_);
+      } else {
+        assert(crunch_configs[idx].palette_sorting_type_ == kModifiedZeng);
+        err = PaletteSortModifiedZeng(enc->pic_, enc->palette_sorted_,
+                                      enc->palette_size_, enc->palette_);
+        if (err != VP8_ENC_OK) goto Error;
+      }
+      err = EncodePalette(bw, low_effort, enc);
+      if (err != VP8_ENC_OK) goto Error;
+      err = MapImageFromPalette(enc, use_delta_palette);
+      if (err != VP8_ENC_OK) goto Error;
+      // If using a color cache, do not have it bigger than the number of
+      // colors.
+      if (use_cache && enc->palette_size_ < (1 << MAX_COLOR_CACHE_BITS)) {
+        enc->cache_bits_ = BitsLog2Floor(enc->palette_size_) + 1;
+      }
+    }
+    if (!use_delta_palette) {
+      // In case image is not packed.
+      if (enc->argb_content_ != kEncoderNearLossless &&
+          enc->argb_content_ != kEncoderPalette) {
+        err = MakeInputImageCopy(enc);
+        if (err != VP8_ENC_OK) goto Error;
+      }
+
+      // -----------------------------------------------------------------------
+      // Apply transforms and write transform data.
+
+      if (enc->use_subtract_green_) {
+        ApplySubtractGreen(enc, enc->current_width_, height, bw);
+      }
+
+      if (enc->use_predict_) {
+        err = ApplyPredictFilter(enc, enc->current_width_, height, quality,
+                                 low_effort, enc->use_subtract_green_, bw);
+        if (err != VP8_ENC_OK) goto Error;
+      }
+
+      if (enc->use_cross_color_) {
+        err = ApplyCrossColorFilter(enc, enc->current_width_, height, quality,
+                                    low_effort, bw);
+        if (err != VP8_ENC_OK) goto Error;
+      }
+    }
+
+    VP8LPutBits(bw, !TRANSFORM_PRESENT, 1);  // No more transforms.
+
+    // -------------------------------------------------------------------------
+    // Encode and write the transformed image.
+    err = EncodeImageInternal(bw, enc->argb_, &enc->hash_chain_, enc->refs_,
+                              enc->current_width_, height, quality, low_effort,
+                              use_cache, &crunch_configs[idx],
+                              &enc->cache_bits_, enc->histo_bits_,
+                              byte_position, &hdr_size, &data_size);
+    if (err != VP8_ENC_OK) goto Error;
+
+    // If we are better than what we already have.
+    if (VP8LBitWriterNumBytes(bw) < best_size) {
+      best_size = VP8LBitWriterNumBytes(bw);
+      // Store the BitWriter.
+      VP8LBitWriterSwap(bw, &bw_best);
+#if !defined(WEBP_DISABLE_STATS)
+      // Update the stats.
+      if (stats != NULL) {
+        stats->lossless_features = 0;
+        if (enc->use_predict_) stats->lossless_features |= 1;
+        if (enc->use_cross_color_) stats->lossless_features |= 2;
+        if (enc->use_subtract_green_) stats->lossless_features |= 4;
+        if (enc->use_palette_) stats->lossless_features |= 8;
+        stats->histogram_bits = enc->histo_bits_;
+        stats->transform_bits = enc->transform_bits_;
+        stats->cache_bits = enc->cache_bits_;
+        stats->palette_size = enc->palette_size_;
+        stats->lossless_size = (int)(best_size - byte_position);
+        stats->lossless_hdr_size = hdr_size;
+        stats->lossless_data_size = data_size;
+      }
+#endif
+    }
+    // Reset the bit writer for the following iteration if any.
+    if (num_crunch_configs > 1) VP8LBitWriterReset(&bw_init, bw);
+  }
+  VP8LBitWriterSwap(&bw_best, bw);
+
+Error:
+  VP8LBitWriterWipeOut(&bw_best);
+  params->err_ = err;
+  // The hook should return false in case of error.
+  return (err == VP8_ENC_OK);
+}
+
+WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
+                                   const WebPPicture* const picture,
+                                   VP8LBitWriter* const bw_main,
+                                   int use_cache) {
+  WebPEncodingError err = VP8_ENC_OK;
+  VP8LEncoder* const enc_main = VP8LEncoderNew(config, picture);
+  VP8LEncoder* enc_side = NULL;
+  CrunchConfig crunch_configs[CRUNCH_CONFIGS_MAX];
+  int num_crunch_configs_main, num_crunch_configs_side = 0;
+  int idx;
+  int red_and_blue_always_zero = 0;
+  WebPWorker worker_main, worker_side;
+  StreamEncodeContext params_main, params_side;
+  // The main thread uses picture->stats, the side thread uses stats_side.
+  WebPAuxStats stats_side;
+  VP8LBitWriter bw_side;
+  const WebPWorkerInterface* const worker_interface = WebPGetWorkerInterface();
+  int ok_main;
+
+  // Analyze image (entropy, num_palettes etc)
+  if (enc_main == NULL ||
+      !EncoderAnalyze(enc_main, crunch_configs, &num_crunch_configs_main,
+                      &red_and_blue_always_zero) ||
+      !EncoderInit(enc_main) || !VP8LBitWriterInit(&bw_side, 0)) {
+    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+    goto Error;
+  }
+
+  // Split the configs between the main and side threads (if any).
+  if (config->thread_level > 0) {
+    num_crunch_configs_side = num_crunch_configs_main / 2;
+    for (idx = 0; idx < num_crunch_configs_side; ++idx) {
+      params_side.crunch_configs_[idx] =
+          crunch_configs[num_crunch_configs_main - num_crunch_configs_side +
+                         idx];
+    }
+    params_side.num_crunch_configs_ = num_crunch_configs_side;
+  }
+  num_crunch_configs_main -= num_crunch_configs_side;
+  for (idx = 0; idx < num_crunch_configs_main; ++idx) {
+    params_main.crunch_configs_[idx] = crunch_configs[idx];
+  }
+  params_main.num_crunch_configs_ = num_crunch_configs_main;
+
+  // Fill in the parameters for the thread workers.
+  {
+    const int params_size = (num_crunch_configs_side > 0) ? 2 : 1;
+    for (idx = 0; idx < params_size; ++idx) {
+      // Create the parameters for each worker.
+      WebPWorker* const worker = (idx == 0) ? &worker_main : &worker_side;
+      StreamEncodeContext* const param =
+          (idx == 0) ? &params_main : &params_side;
+      param->config_ = config;
+      param->picture_ = picture;
+      param->use_cache_ = use_cache;
+      param->red_and_blue_always_zero_ = red_and_blue_always_zero;
+      if (idx == 0) {
+        param->stats_ = picture->stats;
+        param->bw_ = bw_main;
+        param->enc_ = enc_main;
+      } else {
+        param->stats_ = (picture->stats == NULL) ? NULL : &stats_side;
+        // Create a side bit writer.
+        if (!VP8LBitWriterClone(bw_main, &bw_side)) {
+          err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+          goto Error;
+        }
+        param->bw_ = &bw_side;
+        // Create a side encoder.
+        enc_side = VP8LEncoderNew(config, picture);
+        if (enc_side == NULL || !EncoderInit(enc_side)) {
+          err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+          goto Error;
+        }
+        // Copy the values that were computed for the main encoder.
+        enc_side->histo_bits_ = enc_main->histo_bits_;
+        enc_side->transform_bits_ = enc_main->transform_bits_;
+        enc_side->palette_size_ = enc_main->palette_size_;
+        memcpy(enc_side->palette_, enc_main->palette_,
+               sizeof(enc_main->palette_));
+        memcpy(enc_side->palette_sorted_, enc_main->palette_sorted_,
+               sizeof(enc_main->palette_sorted_));
+        param->enc_ = enc_side;
+      }
+      // Create the workers.
+      worker_interface->Init(worker);
+      worker->data1 = param;
+      worker->data2 = NULL;
+      worker->hook = EncodeStreamHook;
+    }
+  }
+
+  // Start the second thread if needed.
+  if (num_crunch_configs_side != 0) {
+    if (!worker_interface->Reset(&worker_side)) {
+      err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+      goto Error;
+    }
+#if !defined(WEBP_DISABLE_STATS)
+    // This line is here and not in the param initialization above to remove a
+    // Clang static analyzer warning.
+    if (picture->stats != NULL) {
+      memcpy(&stats_side, picture->stats, sizeof(stats_side));
+    }
+#endif
+    // This line is only useful to remove a Clang static analyzer warning.
+    params_side.err_ = VP8_ENC_OK;
+    worker_interface->Launch(&worker_side);
+  }
+  // Execute the main thread.
+  worker_interface->Execute(&worker_main);
+  ok_main = worker_interface->Sync(&worker_main);
+  worker_interface->End(&worker_main);
+  if (num_crunch_configs_side != 0) {
+    // Wait for the second thread.
+    const int ok_side = worker_interface->Sync(&worker_side);
+    worker_interface->End(&worker_side);
+    if (!ok_main || !ok_side) {
+      err = ok_main ? params_side.err_ : params_main.err_;
+      goto Error;
+    }
+    if (VP8LBitWriterNumBytes(&bw_side) < VP8LBitWriterNumBytes(bw_main)) {
+      VP8LBitWriterSwap(bw_main, &bw_side);
+#if !defined(WEBP_DISABLE_STATS)
+      if (picture->stats != NULL) {
+        memcpy(picture->stats, &stats_side, sizeof(*picture->stats));
+      }
+#endif
+    }
+  } else {
+    if (!ok_main) {
+      err = params_main.err_;
+      goto Error;
+    }
+  }
+
+Error:
+  VP8LBitWriterWipeOut(&bw_side);
+  VP8LEncoderDelete(enc_main);
+  VP8LEncoderDelete(enc_side);
+  return err;
+}
+
+#undef CRUNCH_CONFIGS_MAX
+#undef CRUNCH_SUBCONFIGS_MAX
+
+int VP8LEncodeImage(const WebPConfig* const config,
+                    const WebPPicture* const picture) {
+  int width, height;
+  int has_alpha;
+  size_t coded_size;
+  int percent = 0;
+  int initial_size;
+  WebPEncodingError err = VP8_ENC_OK;
+  VP8LBitWriter bw;
+
+  if (picture == NULL) return 0;
+
+  if (config == NULL || picture->argb == NULL) {
+    err = VP8_ENC_ERROR_NULL_PARAMETER;
+    WebPEncodingSetError(picture, err);
+    return 0;
+  }
+
+  width = picture->width;
+  height = picture->height;
+  // Initialize BitWriter with size corresponding to 16 bpp to photo images and
+  // 8 bpp for graphical images.
+  initial_size = (config->image_hint == WEBP_HINT_GRAPH) ?
+      width * height : width * height * 2;
+  if (!VP8LBitWriterInit(&bw, initial_size)) {
+    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+    goto Error;
+  }
+
+  if (!WebPReportProgress(picture, 1, &percent)) {
+ UserAbort:
+    err = VP8_ENC_ERROR_USER_ABORT;
+    goto Error;
+  }
+  // Reset stats (for pure lossless coding)
+  if (picture->stats != NULL) {
+    WebPAuxStats* const stats = picture->stats;
+    memset(stats, 0, sizeof(*stats));
+    stats->PSNR[0] = 99.f;
+    stats->PSNR[1] = 99.f;
+    stats->PSNR[2] = 99.f;
+    stats->PSNR[3] = 99.f;
+    stats->PSNR[4] = 99.f;
+  }
+
+  // Write image size.
+  if (!WriteImageSize(picture, &bw)) {
+    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+    goto Error;
+  }
+
+  has_alpha = WebPPictureHasTransparency(picture);
+  // Write the non-trivial Alpha flag and lossless version.
+  if (!WriteRealAlphaAndVersion(&bw, has_alpha)) {
+    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+    goto Error;
+  }
+
+  if (!WebPReportProgress(picture, 5, &percent)) goto UserAbort;
+
+  // Encode main image stream.
+  err = VP8LEncodeStream(config, picture, &bw, 1 /*use_cache*/);
+  if (err != VP8_ENC_OK) goto Error;
+
+  if (!WebPReportProgress(picture, 90, &percent)) goto UserAbort;
+
+  // Finish the RIFF chunk.
+  err = WriteImage(picture, &bw, &coded_size);
+  if (err != VP8_ENC_OK) goto Error;
+
+  if (!WebPReportProgress(picture, 100, &percent)) goto UserAbort;
+
+#if !defined(WEBP_DISABLE_STATS)
+  // Save size.
+  if (picture->stats != NULL) {
+    picture->stats->coded_size += (int)coded_size;
+    picture->stats->lossless_size = (int)coded_size;
+  }
+#endif
+
+  if (picture->extra_info != NULL) {
+    const int mb_w = (width + 15) >> 4;
+    const int mb_h = (height + 15) >> 4;
+    memset(picture->extra_info, 0, mb_w * mb_h * sizeof(*picture->extra_info));
+  }
+
+ Error:
+  if (bw.error_) err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+  VP8LBitWriterWipeOut(&bw);
+  if (err != VP8_ENC_OK) {
+    WebPEncodingSetError(picture, err);
+    return 0;
+  }
+  return 1;
+}
+
+//------------------------------------------------------------------------------
diff --git a/media/libwebp/enc/vp8li_enc.h b/media/libwebp/enc/vp8li_enc.h
index 1e259eda77..f6f8cf6403 100644
--- a/media/libwebp/enc/vp8li_enc.h
+++ b/media/libwebp/enc/vp8li_enc.h
@@ -69,9 +69,11 @@ typedef struct {
   int use_palette_;
   int palette_size_;
   uint32_t palette_[MAX_PALETTE_SIZE];
+  // Sorted version of palette_ for cache purposes.
+  uint32_t palette_sorted_[MAX_PALETTE_SIZE];
 
   // Some 'scratch' (potentially large) objects.
-  struct VP8LBackwardRefs refs_[3];  // Backward Refs array for temporaries.
+  struct VP8LBackwardRefs refs_[4];  // Backward Refs array for temporaries.
   VP8LHashChain hash_chain_;         // HashChain data for constructing
                                      // backward references.
 } VP8LEncoder;
diff --git a/media/libwebp/enc/webp_enc.c b/media/libwebp/enc/webp_enc.c
new file mode 100644
index 0000000000..47ba405f5d
--- /dev/null
+++ b/media/libwebp/enc/webp_enc.c
@@ -0,0 +1,410 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// WebP encoder: main entry point
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+
+#include "../enc/cost_enc.h"
+#include "../enc/vp8i_enc.h"
+#include "../enc/vp8li_enc.h"
+#include "../utils/utils.h"
+
+// #define PRINT_MEMORY_INFO
+
+#ifdef PRINT_MEMORY_INFO
+#include <stdio.h>
+#endif
+
+//------------------------------------------------------------------------------
+
+int WebPGetEncoderVersion(void) {
+  return (ENC_MAJ_VERSION << 16) | (ENC_MIN_VERSION << 8) | ENC_REV_VERSION;
+}
+
+//------------------------------------------------------------------------------
+// VP8Encoder
+//------------------------------------------------------------------------------
+
+static void ResetSegmentHeader(VP8Encoder* const enc) {
+  VP8EncSegmentHeader* const hdr = &enc->segment_hdr_;
+  hdr->num_segments_ = enc->config_->segments;
+  hdr->update_map_  = (hdr->num_segments_ > 1);
+  hdr->size_ = 0;
+}
+
+static void ResetFilterHeader(VP8Encoder* const enc) {
+  VP8EncFilterHeader* const hdr = &enc->filter_hdr_;
+  hdr->simple_ = 1;
+  hdr->level_ = 0;
+  hdr->sharpness_ = 0;
+  hdr->i4x4_lf_delta_ = 0;
+}
+
+static void ResetBoundaryPredictions(VP8Encoder* const enc) {
+  // init boundary values once for all
+  // Note: actually, initializing the preds_[] is only needed for intra4.
+  int i;
+  uint8_t* const top = enc->preds_ - enc->preds_w_;
+  uint8_t* const left = enc->preds_ - 1;
+  for (i = -1; i < 4 * enc->mb_w_; ++i) {
+    top[i] = B_DC_PRED;
+  }
+  for (i = 0; i < 4 * enc->mb_h_; ++i) {
+    left[i * enc->preds_w_] = B_DC_PRED;
+  }
+  enc->nz_[-1] = 0;   // constant
+}
+
+// Mapping from config->method_ to coding tools used.
+//-------------------+---+---+---+---+---+---+---+
+//   Method          | 0 | 1 | 2 | 3 |(4)| 5 | 6 |
+//-------------------+---+---+---+---+---+---+---+
+// fast probe        | x |   |   | x |   |   |   |
+//-------------------+---+---+---+---+---+---+---+
+// dynamic proba     | ~ | x | x | x | x | x | x |
+//-------------------+---+---+---+---+---+---+---+
+// fast mode analysis|[x]|[x]|   |   | x | x | x |
+//-------------------+---+---+---+---+---+---+---+
+// basic rd-opt      |   |   |   | x | x | x | x |
+//-------------------+---+---+---+---+---+---+---+
+// disto-refine i4/16| x | x | x |   |   |   |   |
+//-------------------+---+---+---+---+---+---+---+
+// disto-refine uv   |   | x | x |   |   |   |   |
+//-------------------+---+---+---+---+---+---+---+
+// rd-opt i4/16      |   |   | ~ | x | x | x | x |
+//-------------------+---+---+---+---+---+---+---+
+// token buffer (opt)|   |   |   | x | x | x | x |
+//-------------------+---+---+---+---+---+---+---+
+// Trellis           |   |   |   |   |   | x |Ful|
+//-------------------+---+---+---+---+---+---+---+
+// full-SNS          |   |   |   |   | x | x | x |
+//-------------------+---+---+---+---+---+---+---+
+
+static void MapConfigToTools(VP8Encoder* const enc) {
+  const WebPConfig* const config = enc->config_;
+  const int method = config->method;
+  const int limit = 100 - config->partition_limit;
+  enc->method_ = method;
+  enc->rd_opt_level_ = (method >= 6) ? RD_OPT_TRELLIS_ALL
+                     : (method >= 5) ? RD_OPT_TRELLIS
+                     : (method >= 3) ? RD_OPT_BASIC
+                     : RD_OPT_NONE;
+  enc->max_i4_header_bits_ =
+      256 * 16 * 16 *                 // upper bound: up to 16bit per 4x4 block
+      (limit * limit) / (100 * 100);  // ... modulated with a quadratic curve.
+
+  // partition0 = 512k max.
+  enc->mb_header_limit_ =
+      (score_t)256 * 510 * 8 * 1024 / (enc->mb_w_ * enc->mb_h_);
+
+  enc->thread_level_ = config->thread_level;
+
+  enc->do_search_ = (config->target_size > 0 || config->target_PSNR > 0);
+  if (!config->low_memory) {
+#if !defined(DISABLE_TOKEN_BUFFER)
+    enc->use_tokens_ = (enc->rd_opt_level_ >= RD_OPT_BASIC);  // need rd stats
+#endif
+    if (enc->use_tokens_) {
+      enc->num_parts_ = 1;   // doesn't work with multi-partition
+    }
+  }
+}
+
+// Memory scaling with dimensions:
+//  memory (bytes) ~= 2.25 * w + 0.0625 * w * h
+//
+// Typical memory footprint (614x440 picture)
+//              encoder: 22111
+//                 info: 4368
+//                preds: 17741
+//          top samples: 1263
+//             non-zero: 175
+//             lf-stats: 0
+//                total: 45658
+// Transient object sizes:
+//       VP8EncIterator: 3360
+//         VP8ModeScore: 872
+//       VP8SegmentInfo: 732
+//          VP8EncProba: 18352
+//              LFStats: 2048
+// Picture size (yuv): 419328
+
+static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
+                                  WebPPicture* const picture) {
+  VP8Encoder* enc;
+  const int use_filter =
+      (config->filter_strength > 0) || (config->autofilter > 0);
+  const int mb_w = (picture->width + 15) >> 4;
+  const int mb_h = (picture->height + 15) >> 4;
+  const int preds_w = 4 * mb_w + 1;
+  const int preds_h = 4 * mb_h + 1;
+  const size_t preds_size = preds_w * preds_h * sizeof(*enc->preds_);
+  const int top_stride = mb_w * 16;
+  const size_t nz_size = (mb_w + 1) * sizeof(*enc->nz_) + WEBP_ALIGN_CST;
+  const size_t info_size = mb_w * mb_h * sizeof(*enc->mb_info_);
+  const size_t samples_size =
+      2 * top_stride * sizeof(*enc->y_top_)  // top-luma/u/v
+      + WEBP_ALIGN_CST;                      // align all
+  const size_t lf_stats_size =
+      config->autofilter ? sizeof(*enc->lf_stats_) + WEBP_ALIGN_CST : 0;
+  const size_t top_derr_size =
+      (config->quality <= ERROR_DIFFUSION_QUALITY || config->pass > 1) ?
+          mb_w * sizeof(*enc->top_derr_) : 0;
+  uint8_t* mem;
+  const uint64_t size = (uint64_t)sizeof(*enc)   // main struct
+                      + WEBP_ALIGN_CST           // cache alignment
+                      + info_size                // modes info
+                      + preds_size               // prediction modes
+                      + samples_size             // top/left samples
+                      + top_derr_size            // top diffusion error
+                      + nz_size                  // coeff context bits
+                      + lf_stats_size;           // autofilter stats
+
+#ifdef PRINT_MEMORY_INFO
+  printf("===================================\n");
+  printf("Memory used:\n"
+         "             encoder: %ld\n"
+         "                info: %ld\n"
+         "               preds: %ld\n"
+         "         top samples: %ld\n"
+         "       top diffusion: %ld\n"
+         "            non-zero: %ld\n"
+         "            lf-stats: %ld\n"
+         "               total: %ld\n",
+         sizeof(*enc) + WEBP_ALIGN_CST, info_size,
+         preds_size, samples_size, top_derr_size, nz_size, lf_stats_size, size);
+  printf("Transient object sizes:\n"
+         "      VP8EncIterator: %ld\n"
+         "        VP8ModeScore: %ld\n"
+         "      VP8SegmentInfo: %ld\n"
+         "         VP8EncProba: %ld\n"
+         "             LFStats: %ld\n",
+         sizeof(VP8EncIterator), sizeof(VP8ModeScore),
+         sizeof(VP8SegmentInfo), sizeof(VP8EncProba),
+         sizeof(LFStats));
+  printf("Picture size (yuv): %ld\n",
+         mb_w * mb_h * 384 * sizeof(uint8_t));
+  printf("===================================\n");
+#endif
+  mem = (uint8_t*)WebPSafeMalloc(size, sizeof(*mem));
+  if (mem == NULL) {
+    WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
+    return NULL;
+  }
+  enc = (VP8Encoder*)mem;
+  mem = (uint8_t*)WEBP_ALIGN(mem + sizeof(*enc));
+  memset(enc, 0, sizeof(*enc));
+  enc->num_parts_ = 1 << config->partitions;
+  enc->mb_w_ = mb_w;
+  enc->mb_h_ = mb_h;
+  enc->preds_w_ = preds_w;
+  enc->mb_info_ = (VP8MBInfo*)mem;
+  mem += info_size;
+  enc->preds_ = mem + 1 + enc->preds_w_;
+  mem += preds_size;
+  enc->nz_ = 1 + (uint32_t*)WEBP_ALIGN(mem);
+  mem += nz_size;
+  enc->lf_stats_ = lf_stats_size ? (LFStats*)WEBP_ALIGN(mem) : NULL;
+  mem += lf_stats_size;
+
+  // top samples (all 16-aligned)
+  mem = (uint8_t*)WEBP_ALIGN(mem);
+  enc->y_top_ = mem;
+  enc->uv_top_ = enc->y_top_ + top_stride;
+  mem += 2 * top_stride;
+  enc->top_derr_ = top_derr_size ? (DError*)mem : NULL;
+  mem += top_derr_size;
+  assert(mem <= (uint8_t*)enc + size);
+
+  enc->config_ = config;
+  enc->profile_ = use_filter ? ((config->filter_type == 1) ? 0 : 1) : 2;
+  enc->pic_ = picture;
+  enc->percent_ = 0;
+
+  MapConfigToTools(enc);
+  VP8EncDspInit();
+  VP8DefaultProbas(enc);
+  ResetSegmentHeader(enc);
+  ResetFilterHeader(enc);
+  ResetBoundaryPredictions(enc);
+  VP8EncDspCostInit();
+  VP8EncInitAlpha(enc);
+
+  // lower quality means smaller output -> we modulate a little the page
+  // size based on quality. This is just a crude 1rst-order prediction.
+  {
+    const float scale = 1.f + config->quality * 5.f / 100.f;  // in [1,6]
+    VP8TBufferInit(&enc->tokens_, (int)(mb_w * mb_h * 4 * scale));
+  }
+  return enc;
+}
+
+static int DeleteVP8Encoder(VP8Encoder* enc) {
+  int ok = 1;
+  if (enc != NULL) {
+    ok = VP8EncDeleteAlpha(enc);
+    VP8TBufferClear(&enc->tokens_);
+    WebPSafeFree(enc);
+  }
+  return ok;
+}
+
+//------------------------------------------------------------------------------
+
+#if !defined(WEBP_DISABLE_STATS)
+static double GetPSNR(uint64_t err, uint64_t size) {
+  return (err > 0 && size > 0) ? 10. * log10(255. * 255. * size / err) : 99.;
+}
+
+static void FinalizePSNR(const VP8Encoder* const enc) {
+  WebPAuxStats* stats = enc->pic_->stats;
+  const uint64_t size = enc->sse_count_;
+  const uint64_t* const sse = enc->sse_;
+  stats->PSNR[0] = (float)GetPSNR(sse[0], size);
+  stats->PSNR[1] = (float)GetPSNR(sse[1], size / 4);
+  stats->PSNR[2] = (float)GetPSNR(sse[2], size / 4);
+  stats->PSNR[3] = (float)GetPSNR(sse[0] + sse[1] + sse[2], size * 3 / 2);
+  stats->PSNR[4] = (float)GetPSNR(sse[3], size);
+}
+#endif  // !defined(WEBP_DISABLE_STATS)
+
+static void StoreStats(VP8Encoder* const enc) {
+#if !defined(WEBP_DISABLE_STATS)
+  WebPAuxStats* const stats = enc->pic_->stats;
+  if (stats != NULL) {
+    int i, s;
+    for (i = 0; i < NUM_MB_SEGMENTS; ++i) {
+      stats->segment_level[i] = enc->dqm_[i].fstrength_;
+      stats->segment_quant[i] = enc->dqm_[i].quant_;
+      for (s = 0; s <= 2; ++s) {
+        stats->residual_bytes[s][i] = enc->residual_bytes_[s][i];
+      }
+    }
+    FinalizePSNR(enc);
+    stats->coded_size = enc->coded_size_;
+    for (i = 0; i < 3; ++i) {
+      stats->block_count[i] = enc->block_count_[i];
+    }
+  }
+#else  // defined(WEBP_DISABLE_STATS)
+  WebPReportProgress(enc->pic_, 100, &enc->percent_);  // done!
+#endif  // !defined(WEBP_DISABLE_STATS)
+}
+
+int WebPEncodingSetError(const WebPPicture* const pic,
+                         WebPEncodingError error) {
+  assert((int)error < VP8_ENC_ERROR_LAST);
+  assert((int)error >= VP8_ENC_OK);
+  ((WebPPicture*)pic)->error_code = error;
+  return 0;
+}
+
+int WebPReportProgress(const WebPPicture* const pic,
+                       int percent, int* const percent_store) {
+  if (percent_store != NULL && percent != *percent_store) {
+    *percent_store = percent;
+    if (pic->progress_hook && !pic->progress_hook(percent, pic)) {
+      // user abort requested
+      WebPEncodingSetError(pic, VP8_ENC_ERROR_USER_ABORT);
+      return 0;
+    }
+  }
+  return 1;  // ok
+}
+//------------------------------------------------------------------------------
+
+int WebPEncode(const WebPConfig* config, WebPPicture* pic) {
+  int ok = 0;
+  if (pic == NULL) return 0;
+
+  WebPEncodingSetError(pic, VP8_ENC_OK);  // all ok so far
+  if (config == NULL) {  // bad params
+    return WebPEncodingSetError(pic, VP8_ENC_ERROR_NULL_PARAMETER);
+  }
+  if (!WebPValidateConfig(config)) {
+    return WebPEncodingSetError(pic, VP8_ENC_ERROR_INVALID_CONFIGURATION);
+  }
+  if (pic->width <= 0 || pic->height <= 0) {
+    return WebPEncodingSetError(pic, VP8_ENC_ERROR_BAD_DIMENSION);
+  }
+  if (pic->width > WEBP_MAX_DIMENSION || pic->height > WEBP_MAX_DIMENSION) {
+    return WebPEncodingSetError(pic, VP8_ENC_ERROR_BAD_DIMENSION);
+  }
+
+  if (pic->stats != NULL) memset(pic->stats, 0, sizeof(*pic->stats));
+
+  if (!config->lossless) {
+    VP8Encoder* enc = NULL;
+
+    if (pic->use_argb || pic->y == NULL || pic->u == NULL || pic->v == NULL) {
+      // Make sure we have YUVA samples.
+      if (config->use_sharp_yuv || (config->preprocessing & 4)) {
+        if (!WebPPictureSharpARGBToYUVA(pic)) {
+          return 0;
+        }
+      } else {
+        float dithering = 0.f;
+        if (config->preprocessing & 2) {
+          const float x = config->quality / 100.f;
+          const float x2 = x * x;
+          // slowly decreasing from max dithering at low quality (q->0)
+          // to 0.5 dithering amplitude at high quality (q->100)
+          dithering = 1.0f + (0.5f - 1.0f) * x2 * x2;
+        }
+        if (!WebPPictureARGBToYUVADithered(pic, WEBP_YUV420, dithering)) {
+          return 0;
+        }
+      }
+    }
+
+    if (!config->exact) {
+      WebPCleanupTransparentArea(pic);
+    }
+
+    enc = InitVP8Encoder(config, pic);
+    if (enc == NULL) return 0;  // pic->error is already set.
+    // Note: each of the tasks below account for 20% in the progress report.
+    ok = VP8EncAnalyze(enc);
+
+    // Analysis is done, proceed to actual coding.
+    ok = ok && VP8EncStartAlpha(enc);   // possibly done in parallel
+    if (!enc->use_tokens_) {
+      ok = ok && VP8EncLoop(enc);
+    } else {
+      ok = ok && VP8EncTokenLoop(enc);
+    }
+    ok = ok && VP8EncFinishAlpha(enc);
+
+    ok = ok && VP8EncWrite(enc);
+    StoreStats(enc);
+    if (!ok) {
+      VP8EncFreeBitWriters(enc);
+    }
+    ok &= DeleteVP8Encoder(enc);  // must always be called, even if !ok
+  } else {
+    // Make sure we have ARGB samples.
+    if (pic->argb == NULL && !WebPPictureYUVAToARGB(pic)) {
+      return 0;
+    }
+
+    if (!config->exact) {
+      WebPReplaceTransparentPixels(pic, 0x000000);
+    }
+
+    ok = VP8LEncodeImage(config, pic);  // Sets pic->error in case of problem.
+  }
+
+  return ok;
+}
diff --git a/media/libwebp/moz.build b/media/libwebp/moz.build
index 5450e2b47c..5580b9a3dc 100644
--- a/media/libwebp/moz.build
+++ b/media/libwebp/moz.build
@@ -9,6 +9,8 @@ with Files('**'):
 EXPORTS.webp += [
     'webp/decode.h',
     'webp/demux.h',
+    'webp/encode.h',
+    'webp/format_constants.h',
     'webp/mux_types.h',
     'webp/types.h',
 ]
@@ -17,6 +19,7 @@ DIRS += [
     'dec',
     'demux',
     'dsp',
+    'enc',
     'moz',
     'utils',
 ]
diff --git a/media/libwebp/moz/cpu.cpp b/media/libwebp/moz/cpu.cpp
index 39b9f3500e..5def5c2b25 100644
--- a/media/libwebp/moz/cpu.cpp
+++ b/media/libwebp/moz/cpu.cpp
@@ -4,7 +4,7 @@
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
 /* This file replaces the CPU info methods originally implemented in
- * src/dsp/cpu.c, due to missing dependencies for Andriod builds. It
+ * src/dsp/cpu.c, due to missing dependencies for Android builds. It
  * controls if NEON/SSE/etc is used. */
 
 #include "../dsp/dsp.h"
diff --git a/media/libwebp/update.sh b/media/libwebp/update.sh
index 4fff43d694..9201add982 100644..100755
--- a/media/libwebp/update.sh
+++ b/media/libwebp/update.sh
@@ -21,63 +21,21 @@ cp $1/src/webp/*.h webp
 
 mkdir -p dec
 cp $1/src/dec/*.h dec
-cp $1/src/dec/alpha_dec.c dec
-cp $1/src/dec/buffer_dec.c dec
-cp $1/src/dec/frame_dec.c dec
-cp $1/src/dec/idec_dec.c dec
-cp $1/src/dec/io_dec.c dec
-cp $1/src/dec/quant_dec.c dec
-cp $1/src/dec/tree_dec.c dec
-cp $1/src/dec/vp8_dec.c dec
-cp $1/src/dec/vp8l_dec.c dec
-cp $1/src/dec/webp_dec.c dec
+cp $1/src/dec/*.c dec
 
 mkdir -p demux
 cp $1/src/demux/demux.c demux
 
 mkdir -p dsp
 cp $1/src/dsp/*.h dsp
-cp $1/src/dsp/alpha_processing.c dsp
-cp $1/src/dsp/alpha_processing_neon.c dsp
-cp $1/src/dsp/alpha_processing_sse2.c dsp
-cp $1/src/dsp/alpha_processing_sse41.c dsp
-cp $1/src/dsp/dec.c dsp
-cp $1/src/dsp/dec_clip_tables.c dsp
-cp $1/src/dsp/dec_neon.c dsp
-cp $1/src/dsp/dec_sse2.c dsp
-cp $1/src/dsp/dec_sse41.c dsp
-cp $1/src/dsp/filters.c dsp
-cp $1/src/dsp/filters_neon.c dsp
-cp $1/src/dsp/filters_sse2.c dsp
-cp $1/src/dsp/lossless.c dsp
-cp $1/src/dsp/lossless_neon.c dsp
-cp $1/src/dsp/lossless_sse2.c dsp
-cp $1/src/dsp/rescaler.c dsp
-cp $1/src/dsp/rescaler_neon.c dsp
-cp $1/src/dsp/rescaler_sse2.c dsp
-cp $1/src/dsp/upsampling.c dsp
-cp $1/src/dsp/upsampling_neon.c dsp
-cp $1/src/dsp/upsampling_sse2.c dsp
-cp $1/src/dsp/upsampling_sse41.c dsp
-cp $1/src/dsp/yuv.c dsp
-cp $1/src/dsp/yuv_neon.c dsp
-cp $1/src/dsp/yuv_sse2.c dsp
-cp $1/src/dsp/yuv_sse41.c dsp
+cp $1/src/dsp/*.c dsp
 
 mkdir -p enc
 cp $1/src/enc/*.h enc
+cp $1/src/enc/*.c enc
 
 mkdir -p utils
 cp $1/src/utils/*.h utils
-cp $1/src/utils/bit_reader_utils.c utils
-cp $1/src/utils/color_cache_utils.c utils
-cp $1/src/utils/filters_utils.c utils
-cp $1/src/utils/huffman_utils.c utils
-cp $1/src/utils/quant_levels_dec_utils.c utils
-cp $1/src/utils/quant_levels_utils.c utils
-cp $1/src/utils/random_utils.c utils
-cp $1/src/utils/rescaler_utils.c utils
-cp $1/src/utils/thread_utils.c utils
-cp $1/src/utils/utils.c utils
+cp $1/src/utils/*.c utils
 
 find . \( -name "*.c" -o -name "*.h" \) -exec sed -i 's/#include "src\//#include "..\//g' {} \;
diff --git a/media/libwebp/utils/bit_reader_inl_utils.h b/media/libwebp/utils/bit_reader_inl_utils.h
index 8d1249ef97..78804ed8d2 100644
--- a/media/libwebp/utils/bit_reader_inl_utils.h
+++ b/media/libwebp/utils/bit_reader_inl_utils.h
@@ -55,7 +55,7 @@ void VP8LoadFinalBytes(VP8BitReader* const br);
 
 // makes sure br->value_ has at least BITS bits worth of data
 static WEBP_UBSAN_IGNORE_UNDEF WEBP_INLINE
-void VP8LoadNewBytes(VP8BitReader* const br) {
+void VP8LoadNewBytes(VP8BitReader* WEBP_RESTRICT const br) {
   assert(br != NULL && br->buf_ != NULL);
   // Read 'BITS' bits at a time if possible.
   if (br->buf_ < br->buf_max_) {
@@ -104,7 +104,8 @@ void VP8LoadNewBytes(VP8BitReader* const br) {
 }
 
 // Read a bit with proba 'prob'. Speed-critical function!
-static WEBP_INLINE int VP8GetBit(VP8BitReader* const br, int prob) {
+static WEBP_INLINE int VP8GetBit(VP8BitReader* WEBP_RESTRICT const br,
+                                 int prob, const char label[]) {
   // Don't move this declaration! It makes a big speed difference to store
   // 'range' *before* calling VP8LoadNewBytes(), even if this function doesn't
   // alter br->range_ value.
@@ -129,13 +130,15 @@ static WEBP_INLINE int VP8GetBit(VP8BitReader* const br, int prob) {
       br->bits_ -= shift;
     }
     br->range_ = range - 1;
+    BT_TRACK(br);
     return bit;
   }
 }
 
 // simplified version of VP8GetBit() for prob=0x80 (note shift is always 1 here)
 static WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW WEBP_INLINE
-int VP8GetSigned(VP8BitReader* const br, int v) {
+int VP8GetSigned(VP8BitReader* WEBP_RESTRICT const br, int v,
+                 const char label[]) {
   if (br->bits_ < 0) {
     VP8LoadNewBytes(br);
   }
@@ -148,11 +151,13 @@ int VP8GetSigned(VP8BitReader* const br, int v) {
     br->range_ += mask;
     br->range_ |= 1;
     br->value_ -= (bit_t)((split + 1) & mask) << pos;
+    BT_TRACK(br);
     return (v ^ mask) - mask;
   }
 }
 
-static WEBP_INLINE int VP8GetBitAlt(VP8BitReader* const br, int prob) {
+static WEBP_INLINE int VP8GetBitAlt(VP8BitReader* WEBP_RESTRICT const br,
+                                    int prob, const char label[]) {
   // Don't move this declaration! It makes a big speed difference to store
   // 'range' *before* calling VP8LoadNewBytes(), even if this function doesn't
   // alter br->range_ value.
@@ -179,6 +184,7 @@ static WEBP_INLINE int VP8GetBitAlt(VP8BitReader* const br, int prob) {
       br->bits_ -= shift;
     }
     br->range_ = range;
+    BT_TRACK(br);
     return bit;
   }
 }
diff --git a/media/libwebp/utils/bit_reader_utils.c b/media/libwebp/utils/bit_reader_utils.c
index a7cb193bde..1001ec445d 100644
--- a/media/libwebp/utils/bit_reader_utils.c
+++ b/media/libwebp/utils/bit_reader_utils.c
@@ -102,17 +102,18 @@ void VP8LoadFinalBytes(VP8BitReader* const br) {
 //------------------------------------------------------------------------------
 // Higher-level calls
 
-uint32_t VP8GetValue(VP8BitReader* const br, int bits) {
+uint32_t VP8GetValue(VP8BitReader* const br, int bits, const char label[]) {
   uint32_t v = 0;
   while (bits-- > 0) {
-    v |= VP8GetBit(br, 0x80) << bits;
+    v |= VP8GetBit(br, 0x80, label) << bits;
   }
   return v;
 }
 
-int32_t VP8GetSignedValue(VP8BitReader* const br, int bits) {
-  const int value = VP8GetValue(br, bits);
-  return VP8Get(br) ? -value : value;
+int32_t VP8GetSignedValue(VP8BitReader* const br, int bits,
+                          const char label[]) {
+  const int value = VP8GetValue(br, bits, label);
+  return VP8Get(br, label) ? -value : value;
 }
 
 //------------------------------------------------------------------------------
@@ -220,3 +221,78 @@ uint32_t VP8LReadBits(VP8LBitReader* const br, int n_bits) {
 }
 
 //------------------------------------------------------------------------------
+// Bit-tracing tool
+
+#if (BITTRACE > 0)
+
+#include <stdlib.h>   // for atexit()
+#include <stdio.h>
+#include <string.h>
+
+#define MAX_NUM_LABELS 32
+static struct {
+  const char* label;
+  int size;
+  int count;
+} kLabels[MAX_NUM_LABELS];
+
+static int last_label = 0;
+static int last_pos = 0;
+static const uint8_t* buf_start = NULL;
+static int init_done = 0;
+
+static void PrintBitTraces(void) {
+  int i;
+  int scale = 1;
+  int total = 0;
+  const char* units = "bits";
+#if (BITTRACE == 2)
+  scale = 8;
+  units = "bytes";
+#endif
+  for (i = 0; i < last_label; ++i) total += kLabels[i].size;
+  if (total < 1) total = 1;   // avoid rounding errors
+  printf("=== Bit traces ===\n");
+  for (i = 0; i < last_label; ++i) {
+    const int skip = 16 - (int)strlen(kLabels[i].label);
+    const int value = (kLabels[i].size + scale - 1) / scale;
+    assert(skip > 0);
+    printf("%s \%*s: %6d %s   \t[%5.2f%%] [count: %7d]\n",
+           kLabels[i].label, skip, "", value, units,
+           100.f * kLabels[i].size / total,
+           kLabels[i].count);
+  }
+  total = (total + scale - 1) / scale;
+  printf("Total: %d %s\n", total, units);
+}
+
+void BitTrace(const struct VP8BitReader* const br, const char label[]) {
+  int i, pos;
+  if (!init_done) {
+    memset(kLabels, 0, sizeof(kLabels));
+    atexit(PrintBitTraces);
+    buf_start = br->buf_;
+    init_done = 1;
+  }
+  pos = (int)(br->buf_ - buf_start) * 8 - br->bits_;
+  // if there's a too large jump, we've changed partition -> reset counter
+  if (abs(pos - last_pos) > 32) {
+    buf_start = br->buf_;
+    pos = 0;
+    last_pos = 0;
+  }
+  if (br->range_ >= 0x7f) pos += kVP8Log2Range[br->range_ - 0x7f];
+  for (i = 0; i < last_label; ++i) {
+    if (!strcmp(label, kLabels[i].label)) break;
+  }
+  if (i == MAX_NUM_LABELS) abort();   // overflow!
+  kLabels[i].label = label;
+  kLabels[i].size += pos - last_pos;
+  kLabels[i].count += 1;
+  if (i == last_label) ++last_label;
+  last_pos = pos;
+}
+
+#endif  // BITTRACE > 0
+
+//------------------------------------------------------------------------------
diff --git a/media/libwebp/utils/bit_reader_utils.h b/media/libwebp/utils/bit_reader_utils.h
index 377a7821ad..2df9c417cf 100644
--- a/media/libwebp/utils/bit_reader_utils.h
+++ b/media/libwebp/utils/bit_reader_utils.h
@@ -21,6 +21,27 @@
 #endif
 #include "../webp/types.h"
 
+// Warning! This macro triggers quite some MACRO wizardry around func signature!
+#if !defined(BITTRACE)
+#define BITTRACE 0    // 0 = off, 1 = print bits, 2 = print bytes
+#endif
+
+#if (BITTRACE > 0)
+struct VP8BitReader;
+extern void BitTrace(const struct VP8BitReader* const br, const char label[]);
+#define BT_TRACK(br) BitTrace(br, label)
+#define VP8Get(BR, L) VP8GetValue(BR, 1, L)
+#else
+#define BT_TRACK(br)
+// We'll REMOVE the 'const char label[]' from all signatures and calls (!!):
+#define VP8GetValue(BR, N, L) VP8GetValue(BR, N)
+#define VP8Get(BR, L) VP8GetValue(BR, 1, L)
+#define VP8GetSignedValue(BR, N, L) VP8GetSignedValue(BR, N)
+#define VP8GetBit(BR, P, L) VP8GetBit(BR, P)
+#define VP8GetBitAlt(BR, P, L) VP8GetBitAlt(BR, P)
+#define VP8GetSigned(BR, V, L) VP8GetSigned(BR, V)
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -92,17 +113,15 @@ void VP8BitReaderSetBuffer(VP8BitReader* const br,
 void VP8RemapBitReader(VP8BitReader* const br, ptrdiff_t offset);
 
 // return the next value made of 'num_bits' bits
-uint32_t VP8GetValue(VP8BitReader* const br, int num_bits);
-static WEBP_INLINE uint32_t VP8Get(VP8BitReader* const br) {
-  return VP8GetValue(br, 1);
-}
+uint32_t VP8GetValue(VP8BitReader* const br, int num_bits, const char label[]);
 
 // return the next value with sign-extension.
-int32_t VP8GetSignedValue(VP8BitReader* const br, int num_bits);
+int32_t VP8GetSignedValue(VP8BitReader* const br, int num_bits,
+                          const char label[]);
 
 // bit_reader_inl.h will implement the following methods:
-//   static WEBP_INLINE int VP8GetBit(VP8BitReader* const br, int prob)
-//   static WEBP_INLINE int VP8GetSigned(VP8BitReader* const br, int v)
+//   static WEBP_INLINE int VP8GetBit(VP8BitReader* const br, int prob, ...)
+//   static WEBP_INLINE int VP8GetSigned(VP8BitReader* const br, int v, ...)
 // and should be included by the .c files that actually need them.
 // This is to avoid recompiling the whole library whenever this file is touched,
 // and also allowing platform-specific ad-hoc hacks.
diff --git a/media/libwebp/utils/bit_writer_utils.c b/media/libwebp/utils/bit_writer_utils.c
new file mode 100644
index 0000000000..37a63946c4
--- /dev/null
+++ b/media/libwebp/utils/bit_writer_utils.c
@@ -0,0 +1,347 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Bit writing and boolean coder
+//
+// Author: Skal (pascal.massimino@gmail.com)
+//         Vikas Arora (vikaas.arora@gmail.com)
+
+#include <assert.h>
+#include <string.h>   // for memcpy()
+#include <stdlib.h>
+
+#include "../utils/bit_writer_utils.h"
+#include "../utils/endian_inl_utils.h"
+#include "../utils/utils.h"
+
+//------------------------------------------------------------------------------
+// VP8BitWriter
+
+static int BitWriterResize(VP8BitWriter* const bw, size_t extra_size) {
+  uint8_t* new_buf;
+  size_t new_size;
+  const uint64_t needed_size_64b = (uint64_t)bw->pos_ + extra_size;
+  const size_t needed_size = (size_t)needed_size_64b;
+  if (needed_size_64b != needed_size) {
+    bw->error_ = 1;
+    return 0;
+  }
+  if (needed_size <= bw->max_pos_) return 1;
+  // If the following line wraps over 32bit, the test just after will catch it.
+  new_size = 2 * bw->max_pos_;
+  if (new_size < needed_size) new_size = needed_size;
+  if (new_size < 1024) new_size = 1024;
+  new_buf = (uint8_t*)WebPSafeMalloc(1ULL, new_size);
+  if (new_buf == NULL) {
+    bw->error_ = 1;
+    return 0;
+  }
+  if (bw->pos_ > 0) {
+    assert(bw->buf_ != NULL);
+    memcpy(new_buf, bw->buf_, bw->pos_);
+  }
+  WebPSafeFree(bw->buf_);
+  bw->buf_ = new_buf;
+  bw->max_pos_ = new_size;
+  return 1;
+}
+
+static void Flush(VP8BitWriter* const bw) {
+  const int s = 8 + bw->nb_bits_;
+  const int32_t bits = bw->value_ >> s;
+  assert(bw->nb_bits_ >= 0);
+  bw->value_ -= bits << s;
+  bw->nb_bits_ -= 8;
+  if ((bits & 0xff) != 0xff) {
+    size_t pos = bw->pos_;
+    if (!BitWriterResize(bw, bw->run_ + 1)) {
+      return;
+    }
+    if (bits & 0x100) {  // overflow -> propagate carry over pending 0xff's
+      if (pos > 0) bw->buf_[pos - 1]++;
+    }
+    if (bw->run_ > 0) {
+      const int value = (bits & 0x100) ? 0x00 : 0xff;
+      for (; bw->run_ > 0; --bw->run_) bw->buf_[pos++] = value;
+    }
+    bw->buf_[pos++] = bits & 0xff;
+    bw->pos_ = pos;
+  } else {
+    bw->run_++;   // delay writing of bytes 0xff, pending eventual carry.
+  }
+}
+
+//------------------------------------------------------------------------------
+// renormalization
+
+static const uint8_t kNorm[128] = {  // renorm_sizes[i] = 8 - log2(i)
+     7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
+  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  0
+};
+
+// range = ((range + 1) << kVP8Log2Range[range]) - 1
+static const uint8_t kNewRange[128] = {
+  127, 127, 191, 127, 159, 191, 223, 127, 143, 159, 175, 191, 207, 223, 239,
+  127, 135, 143, 151, 159, 167, 175, 183, 191, 199, 207, 215, 223, 231, 239,
+  247, 127, 131, 135, 139, 143, 147, 151, 155, 159, 163, 167, 171, 175, 179,
+  183, 187, 191, 195, 199, 203, 207, 211, 215, 219, 223, 227, 231, 235, 239,
+  243, 247, 251, 127, 129, 131, 133, 135, 137, 139, 141, 143, 145, 147, 149,
+  151, 153, 155, 157, 159, 161, 163, 165, 167, 169, 171, 173, 175, 177, 179,
+  181, 183, 185, 187, 189, 191, 193, 195, 197, 199, 201, 203, 205, 207, 209,
+  211, 213, 215, 217, 219, 221, 223, 225, 227, 229, 231, 233, 235, 237, 239,
+  241, 243, 245, 247, 249, 251, 253, 127
+};
+
+int VP8PutBit(VP8BitWriter* const bw, int bit, int prob) {
+  const int split = (bw->range_ * prob) >> 8;
+  if (bit) {
+    bw->value_ += split + 1;
+    bw->range_ -= split + 1;
+  } else {
+    bw->range_ = split;
+  }
+  if (bw->range_ < 127) {   // emit 'shift' bits out and renormalize
+    const int shift = kNorm[bw->range_];
+    bw->range_ = kNewRange[bw->range_];
+    bw->value_ <<= shift;
+    bw->nb_bits_ += shift;
+    if (bw->nb_bits_ > 0) Flush(bw);
+  }
+  return bit;
+}
+
+int VP8PutBitUniform(VP8BitWriter* const bw, int bit) {
+  const int split = bw->range_ >> 1;
+  if (bit) {
+    bw->value_ += split + 1;
+    bw->range_ -= split + 1;
+  } else {
+    bw->range_ = split;
+  }
+  if (bw->range_ < 127) {
+    bw->range_ = kNewRange[bw->range_];
+    bw->value_ <<= 1;
+    bw->nb_bits_ += 1;
+    if (bw->nb_bits_ > 0) Flush(bw);
+  }
+  return bit;
+}
+
+void VP8PutBits(VP8BitWriter* const bw, uint32_t value, int nb_bits) {
+  uint32_t mask;
+  assert(nb_bits > 0 && nb_bits < 32);
+  for (mask = 1u << (nb_bits - 1); mask; mask >>= 1) {
+    VP8PutBitUniform(bw, value & mask);
+  }
+}
+
+void VP8PutSignedBits(VP8BitWriter* const bw, int value, int nb_bits) {
+  if (!VP8PutBitUniform(bw, value != 0)) return;
+  if (value < 0) {
+    VP8PutBits(bw, ((-value) << 1) | 1, nb_bits + 1);
+  } else {
+    VP8PutBits(bw, value << 1, nb_bits + 1);
+  }
+}
+
+//------------------------------------------------------------------------------
+
+int VP8BitWriterInit(VP8BitWriter* const bw, size_t expected_size) {
+  bw->range_   = 255 - 1;
+  bw->value_   = 0;
+  bw->run_     = 0;
+  bw->nb_bits_ = -8;
+  bw->pos_     = 0;
+  bw->max_pos_ = 0;
+  bw->error_   = 0;
+  bw->buf_     = NULL;
+  return (expected_size > 0) ? BitWriterResize(bw, expected_size) : 1;
+}
+
+uint8_t* VP8BitWriterFinish(VP8BitWriter* const bw) {
+  VP8PutBits(bw, 0, 9 - bw->nb_bits_);
+  bw->nb_bits_ = 0;   // pad with zeroes
+  Flush(bw);
+  return bw->buf_;
+}
+
+int VP8BitWriterAppend(VP8BitWriter* const bw,
+                       const uint8_t* data, size_t size) {
+  assert(data != NULL);
+  if (bw->nb_bits_ != -8) return 0;   // Flush() must have been called
+  if (!BitWriterResize(bw, size)) return 0;
+  memcpy(bw->buf_ + bw->pos_, data, size);
+  bw->pos_ += size;
+  return 1;
+}
+
+void VP8BitWriterWipeOut(VP8BitWriter* const bw) {
+  if (bw != NULL) {
+    WebPSafeFree(bw->buf_);
+    memset(bw, 0, sizeof(*bw));
+  }
+}
+
+//------------------------------------------------------------------------------
+// VP8LBitWriter
+
+// This is the minimum amount of size the memory buffer is guaranteed to grow
+// when extra space is needed.
+#define MIN_EXTRA_SIZE  (32768ULL)
+
+// Returns 1 on success.
+static int VP8LBitWriterResize(VP8LBitWriter* const bw, size_t extra_size) {
+  uint8_t* allocated_buf;
+  size_t allocated_size;
+  const size_t max_bytes = bw->end_ - bw->buf_;
+  const size_t current_size = bw->cur_ - bw->buf_;
+  const uint64_t size_required_64b = (uint64_t)current_size + extra_size;
+  const size_t size_required = (size_t)size_required_64b;
+  if (size_required != size_required_64b) {
+    bw->error_ = 1;
+    return 0;
+  }
+  if (max_bytes > 0 && size_required <= max_bytes) return 1;
+  allocated_size = (3 * max_bytes) >> 1;
+  if (allocated_size < size_required) allocated_size = size_required;
+  // make allocated size multiple of 1k
+  allocated_size = (((allocated_size >> 10) + 1) << 10);
+  allocated_buf = (uint8_t*)WebPSafeMalloc(1ULL, allocated_size);
+  if (allocated_buf == NULL) {
+    bw->error_ = 1;
+    return 0;
+  }
+  if (current_size > 0) {
+    memcpy(allocated_buf, bw->buf_, current_size);
+  }
+  WebPSafeFree(bw->buf_);
+  bw->buf_ = allocated_buf;
+  bw->cur_ = bw->buf_ + current_size;
+  bw->end_ = bw->buf_ + allocated_size;
+  return 1;
+}
+
+int VP8LBitWriterInit(VP8LBitWriter* const bw, size_t expected_size) {
+  memset(bw, 0, sizeof(*bw));
+  return VP8LBitWriterResize(bw, expected_size);
+}
+
+int VP8LBitWriterClone(const VP8LBitWriter* const src,
+                       VP8LBitWriter* const dst) {
+  const size_t current_size = src->cur_ - src->buf_;
+  assert(src->cur_ >= src->buf_ && src->cur_ <= src->end_);
+  if (!VP8LBitWriterResize(dst, current_size)) return 0;
+  memcpy(dst->buf_, src->buf_, current_size);
+  dst->bits_ = src->bits_;
+  dst->used_ = src->used_;
+  dst->error_ = src->error_;
+  dst->cur_ = dst->buf_ + current_size;
+  return 1;
+}
+
+void VP8LBitWriterWipeOut(VP8LBitWriter* const bw) {
+  if (bw != NULL) {
+    WebPSafeFree(bw->buf_);
+    memset(bw, 0, sizeof(*bw));
+  }
+}
+
+void VP8LBitWriterReset(const VP8LBitWriter* const bw_init,
+                        VP8LBitWriter* const bw) {
+  bw->bits_ = bw_init->bits_;
+  bw->used_ = bw_init->used_;
+  bw->cur_ = bw->buf_ + (bw_init->cur_ - bw_init->buf_);
+  assert(bw->cur_ <= bw->end_);
+  bw->error_ = bw_init->error_;
+}
+
+void VP8LBitWriterSwap(VP8LBitWriter* const src, VP8LBitWriter* const dst) {
+  const VP8LBitWriter tmp = *src;
+  *src = *dst;
+  *dst = tmp;
+}
+
+void VP8LPutBitsFlushBits(VP8LBitWriter* const bw) {
+  // If needed, make some room by flushing some bits out.
+  if (bw->cur_ + VP8L_WRITER_BYTES > bw->end_) {
+    const uint64_t extra_size = (bw->end_ - bw->buf_) + MIN_EXTRA_SIZE;
+    if (!CheckSizeOverflow(extra_size) ||
+        !VP8LBitWriterResize(bw, (size_t)extra_size)) {
+      bw->cur_ = bw->buf_;
+      bw->error_ = 1;
+      return;
+    }
+  }
+  *(vp8l_wtype_t*)bw->cur_ = (vp8l_wtype_t)WSWAP((vp8l_wtype_t)bw->bits_);
+  bw->cur_ += VP8L_WRITER_BYTES;
+  bw->bits_ >>= VP8L_WRITER_BITS;
+  bw->used_ -= VP8L_WRITER_BITS;
+}
+
+void VP8LPutBitsInternal(VP8LBitWriter* const bw, uint32_t bits, int n_bits) {
+  assert(n_bits <= 32);
+  // That's the max we can handle:
+  assert(sizeof(vp8l_wtype_t) == 2);
+  if (n_bits > 0) {
+    vp8l_atype_t lbits = bw->bits_;
+    int used = bw->used_;
+    // Special case of overflow handling for 32bit accumulator (2-steps flush).
+#if VP8L_WRITER_BITS == 16
+    if (used + n_bits >= VP8L_WRITER_MAX_BITS) {
+      // Fill up all the VP8L_WRITER_MAX_BITS so it can be flushed out below.
+      const int shift = VP8L_WRITER_MAX_BITS - used;
+      lbits |= (vp8l_atype_t)bits << used;
+      used = VP8L_WRITER_MAX_BITS;
+      n_bits -= shift;
+      bits >>= shift;
+      assert(n_bits <= VP8L_WRITER_MAX_BITS);
+    }
+#endif
+    // If needed, make some room by flushing some bits out.
+    while (used >= VP8L_WRITER_BITS) {
+      if (bw->cur_ + VP8L_WRITER_BYTES > bw->end_) {
+        const uint64_t extra_size = (bw->end_ - bw->buf_) + MIN_EXTRA_SIZE;
+        if (!CheckSizeOverflow(extra_size) ||
+            !VP8LBitWriterResize(bw, (size_t)extra_size)) {
+          bw->cur_ = bw->buf_;
+          bw->error_ = 1;
+          return;
+        }
+      }
+      *(vp8l_wtype_t*)bw->cur_ = (vp8l_wtype_t)WSWAP((vp8l_wtype_t)lbits);
+      bw->cur_ += VP8L_WRITER_BYTES;
+      lbits >>= VP8L_WRITER_BITS;
+      used -= VP8L_WRITER_BITS;
+    }
+    bw->bits_ = lbits | ((vp8l_atype_t)bits << used);
+    bw->used_ = used + n_bits;
+  }
+}
+
+uint8_t* VP8LBitWriterFinish(VP8LBitWriter* const bw) {
+  // flush leftover bits
+  if (VP8LBitWriterResize(bw, (bw->used_ + 7) >> 3)) {
+    while (bw->used_ > 0) {
+      *bw->cur_++ = (uint8_t)bw->bits_;
+      bw->bits_ >>= 8;
+      bw->used_ -= 8;
+    }
+    bw->used_ = 0;
+  }
+  return bw->buf_;
+}
+
+//------------------------------------------------------------------------------
diff --git a/media/libwebp/utils/color_cache_utils.c b/media/libwebp/utils/color_cache_utils.c
index c5eb0d8a90..c9e212df53 100644
--- a/media/libwebp/utils/color_cache_utils.c
+++ b/media/libwebp/utils/color_cache_utils.c
@@ -20,22 +20,22 @@
 //------------------------------------------------------------------------------
 // VP8LColorCache.
 
-int VP8LColorCacheInit(VP8LColorCache* const cc, int hash_bits) {
+int VP8LColorCacheInit(VP8LColorCache* const color_cache, int hash_bits) {
   const int hash_size = 1 << hash_bits;
-  assert(cc != NULL);
+  assert(color_cache != NULL);
   assert(hash_bits > 0);
-  cc->colors_ = (uint32_t*)WebPSafeCalloc((uint64_t)hash_size,
-                                          sizeof(*cc->colors_));
-  if (cc->colors_ == NULL) return 0;
-  cc->hash_shift_ = 32 - hash_bits;
-  cc->hash_bits_ = hash_bits;
+  color_cache->colors_ = (uint32_t*)WebPSafeCalloc(
+      (uint64_t)hash_size, sizeof(*color_cache->colors_));
+  if (color_cache->colors_ == NULL) return 0;
+  color_cache->hash_shift_ = 32 - hash_bits;
+  color_cache->hash_bits_ = hash_bits;
   return 1;
 }
 
-void VP8LColorCacheClear(VP8LColorCache* const cc) {
-  if (cc != NULL) {
-    WebPSafeFree(cc->colors_);
-    cc->colors_ = NULL;
+void VP8LColorCacheClear(VP8LColorCache* const color_cache) {
+  if (color_cache != NULL) {
+    WebPSafeFree(color_cache->colors_);
+    color_cache->colors_ = NULL;
   }
 }
 
diff --git a/media/libwebp/utils/color_cache_utils.h b/media/libwebp/utils/color_cache_utils.h
index c46131277d..34e9e68795 100644
--- a/media/libwebp/utils/color_cache_utils.h
+++ b/media/libwebp/utils/color_cache_utils.h
@@ -17,6 +17,7 @@
 
 #include <assert.h>
 
+#include "../dsp/dsp.h"
 #include "../webp/types.h"
 
 #ifdef __cplusplus
@@ -25,15 +26,16 @@ extern "C" {
 
 // Main color cache struct.
 typedef struct {
-  uint32_t *colors_;  // color entries
+  uint32_t* colors_;  // color entries
   int hash_shift_;    // Hash shift: 32 - hash_bits_.
   int hash_bits_;
 } VP8LColorCache;
 
-static const uint64_t kHashMul = 0x1e35a7bdull;
+static const uint32_t kHashMul = 0x1e35a7bdu;
 
-static WEBP_INLINE int VP8LHashPix(uint32_t argb, int shift) {
-  return (int)(((argb * kHashMul) & 0xffffffffu) >> shift);
+static WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW WEBP_INLINE
+int VP8LHashPix(uint32_t argb, int shift) {
+  return (int)((argb * kHashMul) >> shift);
 }
 
 static WEBP_INLINE uint32_t VP8LColorCacheLookup(
diff --git a/media/libwebp/utils/huffman_encode_utils.c b/media/libwebp/utils/huffman_encode_utils.c
new file mode 100644
index 0000000000..8219cfc168
--- /dev/null
+++ b/media/libwebp/utils/huffman_encode_utils.c
@@ -0,0 +1,416 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Author: Jyrki Alakuijala (jyrki@google.com)
+//
+// Entropy encoding (Huffman) for webp lossless.
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+#include "../utils/huffman_encode_utils.h"
+#include "../utils/utils.h"
+#include "../webp/format_constants.h"
+
+// -----------------------------------------------------------------------------
+// Util function to optimize the symbol map for RLE coding
+
+// Heuristics for selecting the stride ranges to collapse.
+static int ValuesShouldBeCollapsedToStrideAverage(int a, int b) {
+  return abs(a - b) < 4;
+}
+
+// Change the population counts in a way that the consequent
+// Huffman tree compression, especially its RLE-part, give smaller output.
+static void OptimizeHuffmanForRle(int length, uint8_t* const good_for_rle,
+                                  uint32_t* const counts) {
+  // 1) Let's make the Huffman code more compatible with rle encoding.
+  int i;
+  for (; length >= 0; --length) {
+    if (length == 0) {
+      return;  // All zeros.
+    }
+    if (counts[length - 1] != 0) {
+      // Now counts[0..length - 1] does not have trailing zeros.
+      break;
+    }
+  }
+  // 2) Let's mark all population counts that already can be encoded
+  // with an rle code.
+  {
+    // Let's not spoil any of the existing good rle codes.
+    // Mark any seq of 0's that is longer as 5 as a good_for_rle.
+    // Mark any seq of non-0's that is longer as 7 as a good_for_rle.
+    uint32_t symbol = counts[0];
+    int stride = 0;
+    for (i = 0; i < length + 1; ++i) {
+      if (i == length || counts[i] != symbol) {
+        if ((symbol == 0 && stride >= 5) ||
+            (symbol != 0 && stride >= 7)) {
+          int k;
+          for (k = 0; k < stride; ++k) {
+            good_for_rle[i - k - 1] = 1;
+          }
+        }
+        stride = 1;
+        if (i != length) {
+          symbol = counts[i];
+        }
+      } else {
+        ++stride;
+      }
+    }
+  }
+  // 3) Let's replace those population counts that lead to more rle codes.
+  {
+    uint32_t stride = 0;
+    uint32_t limit = counts[0];
+    uint32_t sum = 0;
+    for (i = 0; i < length + 1; ++i) {
+      if (i == length || good_for_rle[i] ||
+          (i != 0 && good_for_rle[i - 1]) ||
+          !ValuesShouldBeCollapsedToStrideAverage(counts[i], limit)) {
+        if (stride >= 4 || (stride >= 3 && sum == 0)) {
+          uint32_t k;
+          // The stride must end, collapse what we have, if we have enough (4).
+          uint32_t count = (sum + stride / 2) / stride;
+          if (count < 1) {
+            count = 1;
+          }
+          if (sum == 0) {
+            // Don't make an all zeros stride to be upgraded to ones.
+            count = 0;
+          }
+          for (k = 0; k < stride; ++k) {
+            // We don't want to change value at counts[i],
+            // that is already belonging to the next stride. Thus - 1.
+            counts[i - k - 1] = count;
+          }
+        }
+        stride = 0;
+        sum = 0;
+        if (i < length - 3) {
+          // All interesting strides have a count of at least 4,
+          // at least when non-zeros.
+          limit = (counts[i] + counts[i + 1] +
+                   counts[i + 2] + counts[i + 3] + 2) / 4;
+        } else if (i < length) {
+          limit = counts[i];
+        } else {
+          limit = 0;
+        }
+      }
+      ++stride;
+      if (i != length) {
+        sum += counts[i];
+        if (stride >= 4) {
+          limit = (sum + stride / 2) / stride;
+        }
+      }
+    }
+  }
+}
+
+// A comparer function for two Huffman trees: sorts first by 'total count'
+// (more comes first), and then by 'value' (more comes first).
+static int CompareHuffmanTrees(const void* ptr1, const void* ptr2) {
+  const HuffmanTree* const t1 = (const HuffmanTree*)ptr1;
+  const HuffmanTree* const t2 = (const HuffmanTree*)ptr2;
+  if (t1->total_count_ > t2->total_count_) {
+    return -1;
+  } else if (t1->total_count_ < t2->total_count_) {
+    return 1;
+  } else {
+    assert(t1->value_ != t2->value_);
+    return (t1->value_ < t2->value_) ? -1 : 1;
+  }
+}
+
+static void SetBitDepths(const HuffmanTree* const tree,
+                         const HuffmanTree* const pool,
+                         uint8_t* const bit_depths, int level) {
+  if (tree->pool_index_left_ >= 0) {
+    SetBitDepths(&pool[tree->pool_index_left_], pool, bit_depths, level + 1);
+    SetBitDepths(&pool[tree->pool_index_right_], pool, bit_depths, level + 1);
+  } else {
+    bit_depths[tree->value_] = level;
+  }
+}
+
+// Create an optimal Huffman tree.
+//
+// (data,length): population counts.
+// tree_limit: maximum bit depth (inclusive) of the codes.
+// bit_depths[]: how many bits are used for the symbol.
+//
+// Returns 0 when an error has occurred.
+//
+// The catch here is that the tree cannot be arbitrarily deep
+//
+// count_limit is the value that is to be faked as the minimum value
+// and this minimum value is raised until the tree matches the
+// maximum length requirement.
+//
+// This algorithm is not of excellent performance for very long data blocks,
+// especially when population counts are longer than 2**tree_limit, but
+// we are not planning to use this with extremely long blocks.
+//
+// See https://en.wikipedia.org/wiki/Huffman_coding
+static void GenerateOptimalTree(const uint32_t* const histogram,
+                                int histogram_size,
+                                HuffmanTree* tree, int tree_depth_limit,
+                                uint8_t* const bit_depths) {
+  uint32_t count_min;
+  HuffmanTree* tree_pool;
+  int tree_size_orig = 0;
+  int i;
+
+  for (i = 0; i < histogram_size; ++i) {
+    if (histogram[i] != 0) {
+      ++tree_size_orig;
+    }
+  }
+
+  if (tree_size_orig == 0) {   // pretty optimal already!
+    return;
+  }
+
+  tree_pool = tree + tree_size_orig;
+
+  // For block sizes with less than 64k symbols we never need to do a
+  // second iteration of this loop.
+  // If we actually start running inside this loop a lot, we would perhaps
+  // be better off with the Katajainen algorithm.
+  assert(tree_size_orig <= (1 << (tree_depth_limit - 1)));
+  for (count_min = 1; ; count_min *= 2) {
+    int tree_size = tree_size_orig;
+    // We need to pack the Huffman tree in tree_depth_limit bits.
+    // So, we try by faking histogram entries to be at least 'count_min'.
+    int idx = 0;
+    int j;
+    for (j = 0; j < histogram_size; ++j) {
+      if (histogram[j] != 0) {
+        const uint32_t count =
+            (histogram[j] < count_min) ? count_min : histogram[j];
+        tree[idx].total_count_ = count;
+        tree[idx].value_ = j;
+        tree[idx].pool_index_left_ = -1;
+        tree[idx].pool_index_right_ = -1;
+        ++idx;
+      }
+    }
+
+    // Build the Huffman tree.
+    qsort(tree, tree_size, sizeof(*tree), CompareHuffmanTrees);
+
+    if (tree_size > 1) {  // Normal case.
+      int tree_pool_size = 0;
+      while (tree_size > 1) {  // Finish when we have only one root.
+        uint32_t count;
+        tree_pool[tree_pool_size++] = tree[tree_size - 1];
+        tree_pool[tree_pool_size++] = tree[tree_size - 2];
+        count = tree_pool[tree_pool_size - 1].total_count_ +
+                tree_pool[tree_pool_size - 2].total_count_;
+        tree_size -= 2;
+        {
+          // Search for the insertion point.
+          int k;
+          for (k = 0; k < tree_size; ++k) {
+            if (tree[k].total_count_ <= count) {
+              break;
+            }
+          }
+          memmove(tree + (k + 1), tree + k, (tree_size - k) * sizeof(*tree));
+          tree[k].total_count_ = count;
+          tree[k].value_ = -1;
+
+          tree[k].pool_index_left_ = tree_pool_size - 1;
+          tree[k].pool_index_right_ = tree_pool_size - 2;
+          tree_size = tree_size + 1;
+        }
+      }
+      SetBitDepths(&tree[0], tree_pool, bit_depths, 0);
+    } else if (tree_size == 1) {  // Trivial case: only one element.
+      bit_depths[tree[0].value_] = 1;
+    }
+
+    {
+      // Test if this Huffman tree satisfies our 'tree_depth_limit' criteria.
+      int max_depth = bit_depths[0];
+      for (j = 1; j < histogram_size; ++j) {
+        if (max_depth < bit_depths[j]) {
+          max_depth = bit_depths[j];
+        }
+      }
+      if (max_depth <= tree_depth_limit) {
+        break;
+      }
+    }
+  }
+}
+
+// -----------------------------------------------------------------------------
+// Coding of the Huffman tree values
+
+static HuffmanTreeToken* CodeRepeatedValues(int repetitions,
+                                            HuffmanTreeToken* tokens,
+                                            int value, int prev_value) {
+  assert(value <= MAX_ALLOWED_CODE_LENGTH);
+  if (value != prev_value) {
+    tokens->code = value;
+    tokens->extra_bits = 0;
+    ++tokens;
+    --repetitions;
+  }
+  while (repetitions >= 1) {
+    if (repetitions < 3) {
+      int i;
+      for (i = 0; i < repetitions; ++i) {
+        tokens->code = value;
+        tokens->extra_bits = 0;
+        ++tokens;
+      }
+      break;
+    } else if (repetitions < 7) {
+      tokens->code = 16;
+      tokens->extra_bits = repetitions - 3;
+      ++tokens;
+      break;
+    } else {
+      tokens->code = 16;
+      tokens->extra_bits = 3;
+      ++tokens;
+      repetitions -= 6;
+    }
+  }
+  return tokens;
+}
+
+static HuffmanTreeToken* CodeRepeatedZeros(int repetitions,
+                                           HuffmanTreeToken* tokens) {
+  while (repetitions >= 1) {
+    if (repetitions < 3) {
+      int i;
+      for (i = 0; i < repetitions; ++i) {
+        tokens->code = 0;   // 0-value
+        tokens->extra_bits = 0;
+        ++tokens;
+      }
+      break;
+    } else if (repetitions < 11) {
+      tokens->code = 17;
+      tokens->extra_bits = repetitions - 3;
+      ++tokens;
+      break;
+    } else if (repetitions < 139) {
+      tokens->code = 18;
+      tokens->extra_bits = repetitions - 11;
+      ++tokens;
+      break;
+    } else {
+      tokens->code = 18;
+      tokens->extra_bits = 0x7f;  // 138 repeated 0s
+      ++tokens;
+      repetitions -= 138;
+    }
+  }
+  return tokens;
+}
+
+int VP8LCreateCompressedHuffmanTree(const HuffmanTreeCode* const tree,
+                                    HuffmanTreeToken* tokens, int max_tokens) {
+  HuffmanTreeToken* const starting_token = tokens;
+  HuffmanTreeToken* const ending_token = tokens + max_tokens;
+  const int depth_size = tree->num_symbols;
+  int prev_value = 8;  // 8 is the initial value for rle.
+  int i = 0;
+  assert(tokens != NULL);
+  while (i < depth_size) {
+    const int value = tree->code_lengths[i];
+    int k = i + 1;
+    int runs;
+    while (k < depth_size && tree->code_lengths[k] == value) ++k;
+    runs = k - i;
+    if (value == 0) {
+      tokens = CodeRepeatedZeros(runs, tokens);
+    } else {
+      tokens = CodeRepeatedValues(runs, tokens, value, prev_value);
+      prev_value = value;
+    }
+    i += runs;
+    assert(tokens <= ending_token);
+  }
+  (void)ending_token;    // suppress 'unused variable' warning
+  return (int)(tokens - starting_token);
+}
+
+// -----------------------------------------------------------------------------
+
+// Pre-reversed 4-bit values.
+static const uint8_t kReversedBits[16] = {
+  0x0, 0x8, 0x4, 0xc, 0x2, 0xa, 0x6, 0xe,
+  0x1, 0x9, 0x5, 0xd, 0x3, 0xb, 0x7, 0xf
+};
+
+static uint32_t ReverseBits(int num_bits, uint32_t bits) {
+  uint32_t retval = 0;
+  int i = 0;
+  while (i < num_bits) {
+    i += 4;
+    retval |= kReversedBits[bits & 0xf] << (MAX_ALLOWED_CODE_LENGTH + 1 - i);
+    bits >>= 4;
+  }
+  retval >>= (MAX_ALLOWED_CODE_LENGTH + 1 - num_bits);
+  return retval;
+}
+
+// Get the actual bit values for a tree of bit depths.
+static void ConvertBitDepthsToSymbols(HuffmanTreeCode* const tree) {
+  // 0 bit-depth means that the symbol does not exist.
+  int i;
+  int len;
+  uint32_t next_code[MAX_ALLOWED_CODE_LENGTH + 1];
+  int depth_count[MAX_ALLOWED_CODE_LENGTH + 1] = { 0 };
+
+  assert(tree != NULL);
+  len = tree->num_symbols;
+  for (i = 0; i < len; ++i) {
+    const int code_length = tree->code_lengths[i];
+    assert(code_length <= MAX_ALLOWED_CODE_LENGTH);
+    ++depth_count[code_length];
+  }
+  depth_count[0] = 0;  // ignore unused symbol
+  next_code[0] = 0;
+  {
+    uint32_t code = 0;
+    for (i = 1; i <= MAX_ALLOWED_CODE_LENGTH; ++i) {
+      code = (code + depth_count[i - 1]) << 1;
+      next_code[i] = code;
+    }
+  }
+  for (i = 0; i < len; ++i) {
+    const int code_length = tree->code_lengths[i];
+    tree->codes[i] = ReverseBits(code_length, next_code[code_length]++);
+  }
+}
+
+// -----------------------------------------------------------------------------
+// Main entry point
+
+void VP8LCreateHuffmanTree(uint32_t* const histogram, int tree_depth_limit,
+                           uint8_t* const buf_rle, HuffmanTree* const huff_tree,
+                           HuffmanTreeCode* const huff_code) {
+  const int num_symbols = huff_code->num_symbols;
+  memset(buf_rle, 0, num_symbols * sizeof(*buf_rle));
+  OptimizeHuffmanForRle(num_symbols, buf_rle, histogram);
+  GenerateOptimalTree(histogram, num_symbols, huff_tree, tree_depth_limit,
+                      huff_code->code_lengths);
+  // Create the actual bit codes for the bit lengths.
+  ConvertBitDepthsToSymbols(huff_code);
+}
diff --git a/media/libwebp/utils/huffman_encode_utils.h b/media/libwebp/utils/huffman_encode_utils.h
index 236f266e4d..892d514751 100644
--- a/media/libwebp/utils/huffman_encode_utils.h
+++ b/media/libwebp/utils/huffman_encode_utils.h
@@ -51,7 +51,7 @@ int VP8LCreateCompressedHuffmanTree(const HuffmanTreeCode* const tree,
 // huffman code tree.
 void VP8LCreateHuffmanTree(uint32_t* const histogram, int tree_depth_limit,
                            uint8_t* const buf_rle, HuffmanTree* const huff_tree,
-                           HuffmanTreeCode* const tree);
+                           HuffmanTreeCode* const huff_code);
 
 #ifdef __cplusplus
 }
diff --git a/media/libwebp/utils/huffman_utils.c b/media/libwebp/utils/huffman_utils.c
index a2b4b3f897..8e6954f196 100644
--- a/media/libwebp/utils/huffman_utils.c
+++ b/media/libwebp/utils/huffman_utils.c
@@ -91,7 +91,8 @@ static int BuildHuffmanTable(HuffmanCode* const root_table, int root_bits,
 
   assert(code_lengths_size != 0);
   assert(code_lengths != NULL);
-  assert(root_table != NULL);
+  assert((root_table != NULL && sorted != NULL) ||
+         (root_table == NULL && sorted == NULL));
   assert(root_bits > 0);
 
   // Build histogram of code lengths.
@@ -120,16 +121,22 @@ static int BuildHuffmanTable(HuffmanCode* const root_table, int root_bits,
   for (symbol = 0; symbol < code_lengths_size; ++symbol) {
     const int symbol_code_length = code_lengths[symbol];
     if (code_lengths[symbol] > 0) {
-      sorted[offset[symbol_code_length]++] = symbol;
+      if (sorted != NULL) {
+        sorted[offset[symbol_code_length]++] = symbol;
+      } else {
+        offset[symbol_code_length]++;
+      }
     }
   }
 
   // Special case code with only one value.
   if (offset[MAX_ALLOWED_CODE_LENGTH] == 1) {
-    HuffmanCode code;
-    code.bits = 0;
-    code.value = (uint16_t)sorted[0];
-    ReplicateValue(table, 1, total_size, code);
+    if (sorted != NULL) {
+      HuffmanCode code;
+      code.bits = 0;
+      code.value = (uint16_t)sorted[0];
+      ReplicateValue(table, 1, total_size, code);
+    }
     return total_size;
   }
 
@@ -151,6 +158,7 @@ static int BuildHuffmanTable(HuffmanCode* const root_table, int root_bits,
       if (num_open < 0) {
         return 0;
       }
+      if (root_table == NULL) continue;
       for (; count[len] > 0; --count[len]) {
         HuffmanCode code;
         code.bits = (uint8_t)len;
@@ -169,6 +177,7 @@ static int BuildHuffmanTable(HuffmanCode* const root_table, int root_bits,
       if (num_open < 0) {
         return 0;
       }
+      if (root_table == NULL) continue;
       for (; count[len] > 0; --count[len]) {
         HuffmanCode code;
         if ((key & mask) != low) {
@@ -206,7 +215,10 @@ int VP8LBuildHuffmanTable(HuffmanCode* const root_table, int root_bits,
                           const int code_lengths[], int code_lengths_size) {
   int total_size;
   assert(code_lengths_size <= MAX_CODE_LENGTHS_SIZE);
-  if (code_lengths_size <= SORTED_SIZE_CUTOFF) {
+  if (root_table == NULL) {
+    total_size = BuildHuffmanTable(NULL, root_bits,
+                                   code_lengths, code_lengths_size, NULL);
+  } else if (code_lengths_size <= SORTED_SIZE_CUTOFF) {
     // use local stack-allocated array.
     uint16_t sorted[SORTED_SIZE_CUTOFF];
     total_size = BuildHuffmanTable(root_table, root_bits,
diff --git a/media/libwebp/utils/huffman_utils.h b/media/libwebp/utils/huffman_utils.h
index 7f241aab97..4f54691d20 100644
--- a/media/libwebp/utils/huffman_utils.h
+++ b/media/libwebp/utils/huffman_utils.h
@@ -78,6 +78,8 @@ void VP8LHtreeGroupsFree(HTreeGroup* const htree_groups);
 // the huffman table.
 // Returns built table size or 0 in case of error (invalid tree or
 // memory error).
+// If root_table is NULL, it returns 0 if a lookup cannot be built, something
+// > 0 otherwise (but not the table size).
 int VP8LBuildHuffmanTable(HuffmanCode* const root_table, int root_bits,
                           const int code_lengths[], int code_lengths_size);
 
diff --git a/media/libwebp/utils/moz.build b/media/libwebp/utils/moz.build
index 619eaee6df..32431a9f3b 100644
--- a/media/libwebp/utils/moz.build
+++ b/media/libwebp/utils/moz.build
@@ -8,8 +8,10 @@ with Files('**'):
 
 SOURCES += [
     'bit_reader_utils.c',
+    'bit_writer_utils.c',
     'color_cache_utils.c',
     'filters_utils.c',
+    'huffman_encode_utils.c',
     'huffman_utils.c',
     'quant_levels_dec_utils.c',
     'quant_levels_utils.c',
diff --git a/media/libwebp/utils/quant_levels_dec_utils.c b/media/libwebp/utils/quant_levels_dec_utils.c
index a60de3444e..f960a8aa83 100644
--- a/media/libwebp/utils/quant_levels_dec_utils.c
+++ b/media/libwebp/utils/quant_levels_dec_utils.c
@@ -30,7 +30,7 @@
 
 #define DFIX 4           // extra precision for ordered dithering
 #define DSIZE 4          // dithering size (must be a power of two)
-// cf. http://en.wikipedia.org/wiki/Ordered_dithering
+// cf. https://en.wikipedia.org/wiki/Ordered_dithering
 static const uint8_t kOrderedDither[DSIZE][DSIZE] = {
   {  0,  8,  2, 10 },     // coefficients are in DFIX fixed-point precision
   { 12,  4, 14,  6 },
diff --git a/media/libwebp/utils/rescaler_utils.c b/media/libwebp/utils/rescaler_utils.c
index 6e384f5078..3b64e27525 100644
--- a/media/libwebp/utils/rescaler_utils.c
+++ b/media/libwebp/utils/rescaler_utils.c
@@ -12,66 +12,74 @@
 // Author: Skal (pascal.massimino@gmail.com)
 
 #include <assert.h>
+#include <limits.h>
 #include <stdlib.h>
 #include <string.h>
 #include "../dsp/dsp.h"
 #include "../utils/rescaler_utils.h"
+#include "../utils/utils.h"
 
 //------------------------------------------------------------------------------
 
-void WebPRescalerInit(WebPRescaler* const wrk, int src_width, int src_height,
-                      uint8_t* const dst,
-                      int dst_width, int dst_height, int dst_stride,
-                      int num_channels, rescaler_t* const work) {
+int WebPRescalerInit(WebPRescaler* const rescaler,
+                     int src_width, int src_height,
+                     uint8_t* const dst,
+                     int dst_width, int dst_height, int dst_stride,
+                     int num_channels, rescaler_t* const work) {
   const int x_add = src_width, x_sub = dst_width;
   const int y_add = src_height, y_sub = dst_height;
-  wrk->x_expand = (src_width < dst_width);
-  wrk->y_expand = (src_height < dst_height);
-  wrk->src_width = src_width;
-  wrk->src_height = src_height;
-  wrk->dst_width = dst_width;
-  wrk->dst_height = dst_height;
-  wrk->src_y = 0;
-  wrk->dst_y = 0;
-  wrk->dst = dst;
-  wrk->dst_stride = dst_stride;
-  wrk->num_channels = num_channels;
+  const uint64_t total_size = 2ull * dst_width * num_channels * sizeof(*work);
+  if (!CheckSizeOverflow(total_size)) return 0;
+
+  rescaler->x_expand = (src_width < dst_width);
+  rescaler->y_expand = (src_height < dst_height);
+  rescaler->src_width = src_width;
+  rescaler->src_height = src_height;
+  rescaler->dst_width = dst_width;
+  rescaler->dst_height = dst_height;
+  rescaler->src_y = 0;
+  rescaler->dst_y = 0;
+  rescaler->dst = dst;
+  rescaler->dst_stride = dst_stride;
+  rescaler->num_channels = num_channels;
 
   // for 'x_expand', we use bilinear interpolation
-  wrk->x_add = wrk->x_expand ? (x_sub - 1) : x_add;
-  wrk->x_sub = wrk->x_expand ? (x_add - 1) : x_sub;
-  if (!wrk->x_expand) {  // fx_scale is not used otherwise
-    wrk->fx_scale = WEBP_RESCALER_FRAC(1, wrk->x_sub);
+  rescaler->x_add = rescaler->x_expand ? (x_sub - 1) : x_add;
+  rescaler->x_sub = rescaler->x_expand ? (x_add - 1) : x_sub;
+  if (!rescaler->x_expand) {  // fx_scale is not used otherwise
+    rescaler->fx_scale = WEBP_RESCALER_FRAC(1, rescaler->x_sub);
   }
   // vertical scaling parameters
-  wrk->y_add = wrk->y_expand ? y_add - 1 : y_add;
-  wrk->y_sub = wrk->y_expand ? y_sub - 1 : y_sub;
-  wrk->y_accum = wrk->y_expand ? wrk->y_sub : wrk->y_add;
-  if (!wrk->y_expand) {
+  rescaler->y_add = rescaler->y_expand ? y_add - 1 : y_add;
+  rescaler->y_sub = rescaler->y_expand ? y_sub - 1 : y_sub;
+  rescaler->y_accum = rescaler->y_expand ? rescaler->y_sub : rescaler->y_add;
+  if (!rescaler->y_expand) {
     // This is WEBP_RESCALER_FRAC(dst_height, x_add * y_add) without the cast.
-    // Its value is <= WEBP_RESCALER_ONE, because dst_height <= wrk->y_add, and
-    // wrk->x_add >= 1;
-    const uint64_t ratio =
-        (uint64_t)dst_height * WEBP_RESCALER_ONE / (wrk->x_add * wrk->y_add);
+    // Its value is <= WEBP_RESCALER_ONE, because dst_height <= rescaler->y_add
+    // and rescaler->x_add >= 1;
+    const uint64_t num = (uint64_t)dst_height * WEBP_RESCALER_ONE;
+    const uint64_t den = (uint64_t)rescaler->x_add * rescaler->y_add;
+    const uint64_t ratio = num / den;
     if (ratio != (uint32_t)ratio) {
       // When ratio == WEBP_RESCALER_ONE, we can't represent the ratio with the
       // current fixed-point precision. This happens when src_height ==
-      // wrk->y_add (which == src_height), and wrk->x_add == 1.
+      // rescaler->y_add (which == src_height), and rescaler->x_add == 1.
       // => We special-case fxy_scale = 0, in WebPRescalerExportRow().
-      wrk->fxy_scale = 0;
+      rescaler->fxy_scale = 0;
     } else {
-      wrk->fxy_scale = (uint32_t)ratio;
+      rescaler->fxy_scale = (uint32_t)ratio;
     }
-    wrk->fy_scale = WEBP_RESCALER_FRAC(1, wrk->y_sub);
+    rescaler->fy_scale = WEBP_RESCALER_FRAC(1, rescaler->y_sub);
   } else {
-    wrk->fy_scale = WEBP_RESCALER_FRAC(1, wrk->x_add);
-    // wrk->fxy_scale is unused here.
+    rescaler->fy_scale = WEBP_RESCALER_FRAC(1, rescaler->x_add);
+    // rescaler->fxy_scale is unused here.
   }
-  wrk->irow = work;
-  wrk->frow = work + num_channels * dst_width;
-  memset(work, 0, 2 * dst_width * num_channels * sizeof(*work));
+  rescaler->irow = work;
+  rescaler->frow = work + num_channels * dst_width;
+  memset(work, 0, (size_t)total_size);
 
   WebPRescalerDspInit();
+  return 1;
 }
 
 int WebPRescalerGetScaledDimensions(int src_width, int src_height,
@@ -82,19 +90,20 @@ int WebPRescalerGetScaledDimensions(int src_width, int src_height,
   {
     int width = *scaled_width;
     int height = *scaled_height;
+    const int max_size = INT_MAX / 2;
 
     // if width is unspecified, scale original proportionally to height ratio.
-    if (width == 0) {
+    if (width == 0 && src_height > 0) {
       width =
-          (int)(((uint64_t)src_width * height + src_height / 2) / src_height);
+          (int)(((uint64_t)src_width * height + src_height - 1) / src_height);
     }
     // if height is unspecified, scale original proportionally to width ratio.
-    if (height == 0) {
+    if (height == 0 && src_width > 0) {
       height =
-          (int)(((uint64_t)src_height * width + src_width / 2) / src_width);
+          (int)(((uint64_t)src_height * width + src_width - 1) / src_width);
     }
     // Check if the overall dimensions still make sense.
-    if (width <= 0 || height <= 0) {
+    if (width <= 0 || height <= 0 || width > max_size || height > max_size) {
       return 0;
     }
 
@@ -107,31 +116,34 @@ int WebPRescalerGetScaledDimensions(int src_width, int src_height,
 //------------------------------------------------------------------------------
 // all-in-one calls
 
-int WebPRescaleNeededLines(const WebPRescaler* const wrk, int max_num_lines) {
-  const int num_lines = (wrk->y_accum + wrk->y_sub - 1) / wrk->y_sub;
+int WebPRescaleNeededLines(const WebPRescaler* const rescaler,
+                           int max_num_lines) {
+  const int num_lines =
+      (rescaler->y_accum + rescaler->y_sub - 1) / rescaler->y_sub;
   return (num_lines > max_num_lines) ? max_num_lines : num_lines;
 }
 
-int WebPRescalerImport(WebPRescaler* const wrk, int num_lines,
+int WebPRescalerImport(WebPRescaler* const rescaler, int num_lines,
                        const uint8_t* src, int src_stride) {
   int total_imported = 0;
-  while (total_imported < num_lines && !WebPRescalerHasPendingOutput(wrk)) {
-    if (wrk->y_expand) {
-      rescaler_t* const tmp = wrk->irow;
-      wrk->irow = wrk->frow;
-      wrk->frow = tmp;
+  while (total_imported < num_lines &&
+         !WebPRescalerHasPendingOutput(rescaler)) {
+    if (rescaler->y_expand) {
+      rescaler_t* const tmp = rescaler->irow;
+      rescaler->irow = rescaler->frow;
+      rescaler->frow = tmp;
     }
-    WebPRescalerImportRow(wrk, src);
-    if (!wrk->y_expand) {     // Accumulate the contribution of the new row.
+    WebPRescalerImportRow(rescaler, src);
+    if (!rescaler->y_expand) {    // Accumulate the contribution of the new row.
       int x;
-      for (x = 0; x < wrk->num_channels * wrk->dst_width; ++x) {
-        wrk->irow[x] += wrk->frow[x];
+      for (x = 0; x < rescaler->num_channels * rescaler->dst_width; ++x) {
+        rescaler->irow[x] += rescaler->frow[x];
       }
     }
-    ++wrk->src_y;
+    ++rescaler->src_y;
     src += src_stride;
     ++total_imported;
-    wrk->y_accum -= wrk->y_sub;
+    rescaler->y_accum -= rescaler->y_sub;
   }
   return total_imported;
 }
diff --git a/media/libwebp/utils/rescaler_utils.h b/media/libwebp/utils/rescaler_utils.h
index b5d176ecf2..1c1942ddf5 100644
--- a/media/libwebp/utils/rescaler_utils.h
+++ b/media/libwebp/utils/rescaler_utils.h
@@ -47,12 +47,13 @@ struct WebPRescaler {
 };
 
 // Initialize a rescaler given scratch area 'work' and dimensions of src & dst.
-void WebPRescalerInit(WebPRescaler* const rescaler,
-                      int src_width, int src_height,
-                      uint8_t* const dst,
-                      int dst_width, int dst_height, int dst_stride,
-                      int num_channels,
-                      rescaler_t* const work);
+// Returns false in case of error.
+int WebPRescalerInit(WebPRescaler* const rescaler,
+                     int src_width, int src_height,
+                     uint8_t* const dst,
+                     int dst_width, int dst_height, int dst_stride,
+                     int num_channels,
+                     rescaler_t* const work);
 
 // If either 'scaled_width' or 'scaled_height' (but not both) is 0 the value
 // will be calculated preserving the aspect ratio, otherwise the values are
diff --git a/media/libwebp/utils/thread_utils.c b/media/libwebp/utils/thread_utils.c
index e87ffbeac4..d61e89d59d 100644
--- a/media/libwebp/utils/thread_utils.c
+++ b/media/libwebp/utils/thread_utils.c
@@ -73,7 +73,7 @@ typedef struct {
 #endif
 
 static int pthread_create(pthread_t* const thread, const void* attr,
-                          unsigned int (__stdcall *start)(void*), void* arg) {
+                          unsigned int (__stdcall* start)(void*), void* arg) {
   (void)attr;
 #ifdef USE_CREATE_THREAD
   *thread = CreateThread(NULL,   /* lpThreadAttributes */
@@ -217,8 +217,12 @@ static THREADFN ThreadLoop(void* ptr) {
       done = 1;
     }
     // signal to the main thread that we're done (for Sync())
-    pthread_cond_signal(&impl->condition_);
+    // Note the associated mutex does not need to be held when signaling the
+    // condition. Unlocking the mutex first may improve performance in some
+    // implementations, avoiding the case where the waiting thread can't
+    // reacquire the mutex when woken.
     pthread_mutex_unlock(&impl->mutex_);
+    pthread_cond_signal(&impl->condition_);
   }
   return THREAD_RETURN(NULL);    // Thread is finished
 }
@@ -240,7 +244,13 @@ static void ChangeState(WebPWorker* const worker, WebPWorkerStatus new_status) {
     // assign new status and release the working thread if needed
     if (new_status != OK) {
       worker->status_ = new_status;
+      // Note the associated mutex does not need to be held when signaling the
+      // condition. Unlocking the mutex first may improve performance in some
+      // implementations, avoiding the case where the waiting thread can't
+      // reacquire the mutex when woken.
+      pthread_mutex_unlock(&impl->mutex_);
       pthread_cond_signal(&impl->condition_);
+      return;
     }
   }
   pthread_mutex_unlock(&impl->mutex_);
diff --git a/media/libwebp/utils/utils.c b/media/libwebp/utils/utils.c
index 9bda6a7169..97713d2832 100644
--- a/media/libwebp/utils/utils.c
+++ b/media/libwebp/utils/utils.c
@@ -23,7 +23,7 @@
 // alloc/free etc) is printed. For debugging/tuning purpose only (it's slow,
 // and not multi-thread safe!).
 // An interesting alternative is valgrind's 'massif' tool:
-//    http://valgrind.org/docs/manual/ms-manual.html
+//    https://valgrind.org/docs/manual/ms-manual.html
 // Here is an example command line:
 /*    valgrind --tool=massif --massif-out-file=massif.out \
                --stacks=yes --alloc-fn=WebPSafeMalloc --alloc-fn=WebPSafeCalloc
@@ -101,6 +101,9 @@ static void Increment(int* const v) {
 #if defined(MALLOC_LIMIT)
     {
       const char* const malloc_limit_str = getenv("MALLOC_LIMIT");
+#if MALLOC_LIMIT > 1
+      mem_limit = (size_t)MALLOC_LIMIT;
+#endif
       if (malloc_limit_str != NULL) {
         mem_limit = atoi(malloc_limit_str);
       }
@@ -169,16 +172,16 @@ static int CheckSizeArgumentsOverflow(uint64_t nmemb, size_t size) {
   const uint64_t total_size = nmemb * size;
   if (nmemb == 0) return 1;
   if ((uint64_t)size > WEBP_MAX_ALLOCABLE_MEMORY / nmemb) return 0;
-  if (total_size != (size_t)total_size) return 0;
+  if (!CheckSizeOverflow(total_size)) return 0;
 #if defined(PRINT_MEM_INFO) && defined(MALLOC_FAIL_AT)
   if (countdown_to_fail > 0 && --countdown_to_fail == 0) {
     return 0;    // fake fail!
   }
 #endif
-#if defined(MALLOC_LIMIT)
+#if defined(PRINT_MEM_INFO) && defined(MALLOC_LIMIT)
   if (mem_limit > 0) {
     const uint64_t new_total_mem = (uint64_t)total_mem + total_size;
-    if (new_total_mem != (size_t)new_total_mem ||
+    if (!CheckSizeOverflow(new_total_mem) ||
         new_total_mem > mem_limit) {
       return 0;   // fake fail!
     }
@@ -216,9 +219,14 @@ void WebPSafeFree(void* const ptr) {
   free(ptr);
 }
 
-// Public API function.
+// Public API functions.
+
+void* WebPMalloc(size_t size) {
+  return WebPSafeMalloc(1, size);
+}
+
 void WebPFree(void* ptr) {
-  free(ptr);
+  WebPSafeFree(ptr);
 }
 
 //------------------------------------------------------------------------------
@@ -226,7 +234,7 @@ void WebPFree(void* ptr) {
 void WebPCopyPlane(const uint8_t* src, int src_stride,
                    uint8_t* dst, int dst_stride, int width, int height) {
   assert(src != NULL && dst != NULL);
-  assert(src_stride >= width && dst_stride >= width);
+  assert(abs(src_stride) >= width && abs(dst_stride) >= width);
   while (height-- > 0) {
     memcpy(dst, src, width);
     src += src_stride;
diff --git a/media/libwebp/utils/utils.h b/media/libwebp/utils/utils.h
index d22151b0fc..20abf03c69 100644
--- a/media/libwebp/utils/utils.h
+++ b/media/libwebp/utils/utils.h
@@ -42,6 +42,10 @@ extern "C" {
 #endif
 #endif  // WEBP_MAX_ALLOCABLE_MEMORY
 
+static WEBP_INLINE int CheckSizeOverflow(uint64_t size) {
+  return size == (size_t)size;
+}
+
 // size-checking safe malloc/calloc: verify that the requested size is not too
 // large, or return NULL. You don't need to call these for constructs like
 // malloc(sizeof(foo)), but only if there's picture-dependent size involved
@@ -92,14 +96,14 @@ static WEBP_INLINE uint32_t GetLE32(const uint8_t* const data) {
 // Store 16, 24 or 32 bits in little-endian order.
 static WEBP_INLINE void PutLE16(uint8_t* const data, int val) {
   assert(val < (1 << 16));
-  data[0] = (val >> 0);
-  data[1] = (val >> 8);
+  data[0] = (val >> 0) & 0xff;
+  data[1] = (val >> 8) & 0xff;
 }
 
 static WEBP_INLINE void PutLE24(uint8_t* const data, int val) {
   assert(val < (1 << 24));
   PutLE16(data, val & 0xffff);
-  data[2] = (val >> 16);
+  data[2] = (val >> 16) & 0xff;
 }
 
 static WEBP_INLINE void PutLE32(uint8_t* const data, uint32_t val) {
@@ -107,24 +111,33 @@ static WEBP_INLINE void PutLE32(uint8_t* const data, uint32_t val) {
   PutLE16(data + 2, (int)(val >> 16));
 }
 
-// Returns (int)floor(log2(n)). n must be > 0.
 // use GNU builtins where available.
 #if defined(__GNUC__) && \
     ((__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || __GNUC__ >= 4)
+// Returns (int)floor(log2(n)). n must be > 0.
 static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
   return 31 ^ __builtin_clz(n);
 }
+// counts the number of trailing zero
+static WEBP_INLINE int BitsCtz(uint32_t n) { return __builtin_ctz(n); }
 #elif defined(_MSC_VER) && _MSC_VER > 1310 && \
       (defined(_M_X64) || defined(_M_IX86))
 #include <intrin.h>
 #pragma intrinsic(_BitScanReverse)
+#pragma intrinsic(_BitScanForward)
 
 static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
-  unsigned long first_set_bit;
+  unsigned long first_set_bit;  // NOLINT (runtime/int)
   _BitScanReverse(&first_set_bit, n);
   return first_set_bit;
 }
-#else   // default: use the C-version.
+static WEBP_INLINE int BitsCtz(uint32_t n) {
+  unsigned long first_set_bit;  // NOLINT (runtime/int)
+  _BitScanForward(&first_set_bit, n);
+  return first_set_bit;
+}
+#else   // default: use the (slow) C-version.
+#define WEBP_HAVE_SLOW_CLZ_CTZ   // signal that the Clz/Ctz function are slow
 // Returns 31 ^ clz(n) = log2(n). This is the default C-implementation, either
 // based on table or not. Can be used as fallback if clz() is not available.
 #define WEBP_NEED_LOG_TABLE_8BIT
@@ -139,6 +152,15 @@ static WEBP_INLINE int WebPLog2FloorC(uint32_t n) {
 }
 
 static WEBP_INLINE int BitsLog2Floor(uint32_t n) { return WebPLog2FloorC(n); }
+
+static WEBP_INLINE int BitsCtz(uint32_t n) {
+  int i;
+  for (i = 0; i < 32; ++i, n >>= 1) {
+    if (n & 1) return i;
+  }
+  return 32;
+}
+
 #endif
 
 //------------------------------------------------------------------------------
diff --git a/media/libwebp/webp/config.h b/media/libwebp/webp/config.h
index dd31c3cfaa..3496bc2c49 100644
--- a/media/libwebp/webp/config.h
+++ b/media/libwebp/webp/config.h
@@ -13,6 +13,9 @@
 /* Set to 1 if __builtin_bswap64 is available */
 #define HAVE_BUILTIN_BSWAP64 1
 
+/* Define to 1 if you have the <cpu-features.h> header file. */
+/* #undef HAVE_CPU_FEATURES_H */
+
 /* Define to 1 if you have the <dlfcn.h> header file. */
 #define HAVE_DLFCN_H 1
 
@@ -20,14 +23,11 @@
 /* #undef HAVE_GLUT_GLUT_H */
 
 /* Define to 1 if you have the <GL/glut.h> header file. */
-/* #undef HAVE_GL_GLUT_H */
+#define HAVE_GL_GLUT_H 1
 
 /* Define to 1 if you have the <inttypes.h> header file. */
 #define HAVE_INTTYPES_H 1
 
-/* Define to 1 if you have the <memory.h> header file. */
-#define HAVE_MEMORY_H 1
-
 /* Define to 1 if you have the <OpenGL/glut.h> header file. */
 /* #undef HAVE_OPENGL_GLUT_H */
 
@@ -40,6 +40,9 @@
 /* Define to 1 if you have the <stdint.h> header file. */
 #define HAVE_STDINT_H 1
 
+/* Define to 1 if you have the <stdio.h> header file. */
+#define HAVE_STDIO_H 1
+
 /* Define to 1 if you have the <stdlib.h> header file. */
 #define HAVE_STDLIB_H 1
 
@@ -77,38 +80,34 @@
 #define PACKAGE_NAME "libwebp"
 
 /* Define to the full name and version of this package. */
-#define PACKAGE_STRING "libwebp 0.5.1"
+#define PACKAGE_STRING "libwebp 1.2.2"
 
 /* Define to the one symbol short name of this package. */
 #define PACKAGE_TARNAME "libwebp"
 
 /* Define to the home page for this package. */
-#define PACKAGE_URL "http://developers.google.com/speed/webp"
+#define PACKAGE_URL "https://developers.google.com/speed/webp"
 
 /* Define to the version of this package. */
-#define PACKAGE_VERSION "0.5.1"
+#define PACKAGE_VERSION "1.2.2"
 
 /* Define to necessary symbol if this constant uses a non-standard name on
    your system. */
 /* #undef PTHREAD_CREATE_JOINABLE */
 
-/* Define to 1 if you have the ANSI C header files. */
+/* Define to 1 if all of the C90 standard headers exist (not just the ones
+   required in a freestanding environment). This macro is provided for
+   backward compatibility; new code need not use it. */
 #define STDC_HEADERS 1
 
 /* Version number of package */
-#define VERSION "0.5.1"
-
-/* Enable experimental code */
-/* #undef WEBP_EXPERIMENTAL_FEATURES */
-
-/* Set to 1 if AVX2 is supported */
-#define WEBP_HAVE_AVX2 1
+#define VERSION "1.2.2"
 
 /* Set to 1 if GIF library is installed */
 #define WEBP_HAVE_GIF 1
 
 /* Set to 1 if OpenGL is supported */
-/* #undef WEBP_HAVE_GL */
+#define WEBP_HAVE_GL 1
 
 /* Set to 1 if JPEG library is installed */
 #define WEBP_HAVE_JPEG 1
@@ -122,6 +121,9 @@
 /* Set to 1 if PNG library is installed */
 #define WEBP_HAVE_PNG 1
 
+/* Set to 1 if SDL library is installed */
+#define WEBP_HAVE_SDL 1
+
 /* Set to 1 if SSE2 is supported */
 #define WEBP_HAVE_SSE2 1
 
@@ -131,6 +133,9 @@
 /* Set to 1 if TIFF library is installed */
 #define WEBP_HAVE_TIFF 1
 
+/* Enable near lossless encoding */
+#define WEBP_NEAR_LOSSLESS 1
+
 /* Undefine this to disable thread support. */
 #define WEBP_USE_THREAD 1
 
diff --git a/media/libwebp/webp/decode.h b/media/libwebp/webp/decode.h
index ae8bfe840e..d98247509a 100644
--- a/media/libwebp/webp/decode.h
+++ b/media/libwebp/webp/decode.h
@@ -20,7 +20,7 @@
 extern "C" {
 #endif
 
-#define WEBP_DECODER_ABI_VERSION 0x0208    // MAJOR(8b) + MINOR(8b)
+#define WEBP_DECODER_ABI_VERSION 0x0209    // MAJOR(8b) + MINOR(8b)
 
 // Note: forward declaring enumerations is not allowed in (strict) C and C++,
 // the types are left here for reference.
@@ -85,15 +85,12 @@ WEBP_EXTERN uint8_t* WebPDecodeBGR(const uint8_t* data, size_t data_size,
 // Upon return, the Y buffer has a stride returned as '*stride', while U and V
 // have a common stride returned as '*uv_stride'.
 // Return NULL in case of error.
-// (*) Also named Y'CbCr. See: http://en.wikipedia.org/wiki/YCbCr
+// (*) Also named Y'CbCr. See: https://en.wikipedia.org/wiki/YCbCr
 WEBP_EXTERN uint8_t* WebPDecodeYUV(const uint8_t* data, size_t data_size,
                                    int* width, int* height,
                                    uint8_t** u, uint8_t** v,
                                    int* stride, int* uv_stride);
 
-// Releases memory returned by the WebPDecode*() functions above.
-WEBP_EXTERN void WebPFree(void* ptr);
-
 // These five functions are variants of the above ones, that decode the image
 // directly into a pre-allocated buffer 'output_buffer'. The maximum storage
 // available in this buffer is indicated by 'output_buffer_size'. If this
@@ -456,7 +453,7 @@ struct WebPDecoderOptions {
   int scaled_width, scaled_height;    // final resolution
   int use_threads;                    // if true, use multi-threaded decoding
   int dithering_strength;             // dithering strength (0=Off, 100=full)
-  int flip;                           // flip output vertically
+  int flip;                           // if true, flip output vertically
   int alpha_dithering_strength;       // alpha dithering strength in [0..100]
 
   uint32_t pad[5];                    // padding for later use
diff --git a/media/libwebp/webp/encode.h b/media/libwebp/webp/encode.h
index 549cf07730..b4c599df87 100644
--- a/media/libwebp/webp/encode.h
+++ b/media/libwebp/webp/encode.h
@@ -20,7 +20,7 @@
 extern "C" {
 #endif
 
-#define WEBP_ENCODER_ABI_VERSION 0x020e    // MAJOR(8b) + MINOR(8b)
+#define WEBP_ENCODER_ABI_VERSION 0x020f    // MAJOR(8b) + MINOR(8b)
 
 // Note: forward declaring enumerations is not allowed in (strict) C and C++,
 // the types are left here for reference.
@@ -62,6 +62,10 @@ WEBP_EXTERN size_t WebPEncodeBGRA(const uint8_t* bgra,
 // These functions are the equivalent of the above, but compressing in a
 // lossless manner. Files are usually larger than lossy format, but will
 // not suffer any compression loss.
+// Note these functions, like the lossy versions, use the library's default
+// settings. For lossless this means 'exact' is disabled. RGB values in
+// transparent areas will be modified to improve compression. To avoid this,
+// use WebPEncode() and set WebPConfig::exact to 1.
 WEBP_EXTERN size_t WebPEncodeLosslessRGB(const uint8_t* rgb,
                                          int width, int height, int stride,
                                          uint8_t** output);
@@ -75,9 +79,6 @@ WEBP_EXTERN size_t WebPEncodeLosslessBGRA(const uint8_t* bgra,
                                           int width, int height, int stride,
                                           uint8_t** output);
 
-// Releases memory returned by the WebPEncode*() functions above.
-WEBP_EXTERN void WebPFree(void* ptr);
-
 //------------------------------------------------------------------------------
 // Coding parameters
 
@@ -147,7 +148,8 @@ struct WebPConfig {
   int use_delta_palette;  // reserved for future lossless feature
   int use_sharp_yuv;      // if needed, use sharp (and slow) RGB->YUV conversion
 
-  uint32_t pad[2];        // padding for later use
+  int qmin;               // minimum permissible quality factor
+  int qmax;               // maximum permissible quality factor
 };
 
 // Enumerate some predefined settings for WebPConfig, depending on the type
@@ -290,6 +292,11 @@ typedef enum WebPEncodingError {
 #define WEBP_MAX_DIMENSION 16383
 
 // Main exchange structure (input samples, output bytes, statistics)
+//
+// Once WebPPictureInit() has been called, it's ok to make all the INPUT fields
+// (use_argb, y/u/v, argb, ...) point to user-owned data, even if
+// WebPPictureAlloc() has been called. Depending on the value use_argb,
+// it's guaranteed that either *argb or *y/*u/*v content will be kept untouched.
 struct WebPPicture {
   //   INPUT
   //////////////
@@ -302,7 +309,7 @@ struct WebPPicture {
   // YUV input (mostly used for input to lossy compression)
   WebPEncCSP colorspace;     // colorspace: should be YUV420 for now (=Y'CbCr).
   int width, height;         // dimensions (less or equal to WEBP_MAX_DIMENSION)
-  uint8_t *y, *u, *v;        // pointers to luma/chroma planes.
+  uint8_t* y, *u, *v;        // pointers to luma/chroma planes.
   int y_stride, uv_stride;   // luma/chroma strides.
   uint8_t* a;                // pointer to the alpha plane
   int a_stride;              // stride of the alpha plane
@@ -346,7 +353,7 @@ struct WebPPicture {
   uint32_t pad3[3];       // padding for later use
 
   // Unused for now
-  uint8_t *pad4, *pad5;
+  uint8_t* pad4, *pad5;
   uint32_t pad6[8];       // padding for later use
 
   // PRIVATE FIELDS
diff --git a/media/libwebp/webp/mux.h b/media/libwebp/webp/mux.h
index 66096a92e0..7d27489a40 100644
--- a/media/libwebp/webp/mux.h
+++ b/media/libwebp/webp/mux.h
@@ -57,7 +57,7 @@ extern "C" {
   WebPMuxGetChunk(mux, "ICCP", &icc_profile);
   // ... (Consume icc_data).
   WebPMuxDelete(mux);
-  free(data);
+  WebPFree(data);
 */
 
 // Note: forward declaring enumerations is not allowed in (strict) C and C++,
@@ -245,7 +245,7 @@ WEBP_EXTERN WebPMuxError WebPMuxPushFrame(
     WebPMux* mux, const WebPMuxFrameInfo* frame, int copy_data);
 
 // Gets the nth frame from the mux object.
-// The content of 'frame->bitstream' is allocated using malloc(), and NOT
+// The content of 'frame->bitstream' is allocated using WebPMalloc(), and NOT
 // owned by the 'mux' object. It MUST be deallocated by the caller by calling
 // WebPDataClear().
 // nth=0 has a special meaning - last position.
@@ -376,10 +376,10 @@ WEBP_EXTERN WebPMuxError WebPMuxNumChunks(const WebPMux* mux,
 // Assembles all chunks in WebP RIFF format and returns in 'assembled_data'.
 // This function also validates the mux object.
 // Note: The content of 'assembled_data' will be ignored and overwritten.
-// Also, the content of 'assembled_data' is allocated using malloc(), and NOT
-// owned by the 'mux' object. It MUST be deallocated by the caller by calling
-// WebPDataClear(). It's always safe to call WebPDataClear() upon return,
-// even in case of error.
+// Also, the content of 'assembled_data' is allocated using WebPMalloc(), and
+// NOT owned by the 'mux' object. It MUST be deallocated by the caller by
+// calling WebPDataClear(). It's always safe to call WebPDataClear() upon
+// return, even in case of error.
 // Parameters:
 //   mux - (in/out) object whose chunks are to be assembled
 //   assembled_data - (out) assembled WebP data
diff --git a/media/libwebp/webp/mux_types.h b/media/libwebp/webp/mux_types.h
index ceea77dfc6..2fe8195839 100644
--- a/media/libwebp/webp/mux_types.h
+++ b/media/libwebp/webp/mux_types.h
@@ -14,7 +14,6 @@
 #ifndef WEBP_WEBP_MUX_TYPES_H_
 #define WEBP_WEBP_MUX_TYPES_H_
 
-#include <stdlib.h>  // free()
 #include <string.h>  // memset()
 #include "./types.h"
 
@@ -56,6 +55,7 @@ typedef enum WebPMuxAnimBlend {
 
 // Data type used to describe 'raw' data, e.g., chunk data
 // (ICC profile, metadata) and WebP compressed image data.
+// 'bytes' memory must be allocated using WebPMalloc() and such.
 struct WebPData {
   const uint8_t* bytes;
   size_t size;
@@ -68,11 +68,11 @@ static WEBP_INLINE void WebPDataInit(WebPData* webp_data) {
   }
 }
 
-// Clears the contents of the 'webp_data' object by calling free(). Does not
-// deallocate the object itself.
+// Clears the contents of the 'webp_data' object by calling WebPFree().
+// Does not deallocate the object itself.
 static WEBP_INLINE void WebPDataClear(WebPData* webp_data) {
   if (webp_data != NULL) {
-    free((void*)webp_data->bytes);
+    WebPFree((void*)webp_data->bytes);
     WebPDataInit(webp_data);
   }
 }
@@ -83,7 +83,7 @@ static WEBP_INLINE int WebPDataCopy(const WebPData* src, WebPData* dst) {
   if (src == NULL || dst == NULL) return 0;
   WebPDataInit(dst);
   if (src->bytes != NULL && src->size != 0) {
-    dst->bytes = (uint8_t*)malloc(src->size);
+    dst->bytes = (uint8_t*)WebPMalloc(src->size);
     if (dst->bytes == NULL) return 0;
     memcpy((void*)dst->bytes, src->bytes, src->size);
     dst->size = src->size;
diff --git a/media/libwebp/webp/types.h b/media/libwebp/webp/types.h
index 0ce2622e41..47f7f2b007 100644
--- a/media/libwebp/webp/types.h
+++ b/media/libwebp/webp/types.h
@@ -7,7 +7,7 @@
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
-//  Common types
+//  Common types + memory wrappers
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
@@ -49,4 +49,20 @@ typedef long long int int64_t;
 // Macro to check ABI compatibility (same major revision number)
 #define WEBP_ABI_IS_INCOMPATIBLE(a, b) (((a) >> 8) != ((b) >> 8))
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Allocates 'size' bytes of memory. Returns NULL upon error. Memory
+// must be deallocated by calling WebPFree(). This function is made available
+// by the core 'libwebp' library.
+WEBP_EXTERN void* WebPMalloc(size_t size);
+
+// Releases memory returned by the WebPDecode*() functions (from decode.h).
+WEBP_EXTERN void WebPFree(void* ptr);
+
+#ifdef __cplusplus
+}    // extern "C"
+#endif
+
 #endif  // WEBP_WEBP_TYPES_H_
author	Moonchild <moonchild@palemoon.org>	2022-06-26 11:47:44 +0000
committer	Moonchild <moonchild@palemoon.org>	2022-06-26 11:47:44 +0000
commit	a08f15245412d0b95eba82c7e9be7b7c64286967 (patch)
tree	fa594344bc140371bc050a7d84f2d1f66c724bf3
parent	20c80bcbc174c975fe16b1d562879489924b218c (diff)
parent	76f825276ddc527f86d0a17e803d820ef67fd355 (diff)
download	uxp-a08f15245412d0b95eba82c7e9be7b7c64286967.tar.gz